From c715d2e816748eac6325e7830854d9be18bf0bfb Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sat, 23 May 2026 23:18:42 +0300 Subject: [PATCH] data(sources): add OPenn, NYPL, LoC leads; reject BL and Cambridge New high-priority candidates from Gemini research (2026-05-23): - openn__katz_center_judaica (CC0, bulk rsync/FTP, high priority) - nypl__hebrew_manuscripts_digital_collections (PD, API, high priority) - loc__hebrew_manuscripts_collection (PD/US govt, API, high priority) Rejected with rights-restriction evidence: - bl__hebrew_collection (commercial reuse restricted) - cambridge__digital_library_hebrew_genizah (commercial reuse restricted) Research note: docs/sources/gemini_summary_2.md Co-Authored-By: Claude Sonnet 4.6 --- data/index/sources.jsonl | 5 +++ docs/sources/gemini_summary_2.md | 65 ++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 docs/sources/gemini_summary_2.md diff --git a/data/index/sources.jsonl b/data/index/sources.jsonl index de5b718..6c3b0ac 100644 --- a/data/index/sources.jsonl +++ b/data/index/sources.jsonl @@ -1,3 +1,5 @@ +{"source_id": "bl__hebrew_collection", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "British Library", "title": "British Library — Hebrew Manuscripts and Cairo Genizah Collection", "description": "Extensive Hebrew manuscripts and Cairo Genizah fragments. Terms frequently restrict commercial reuse or require paid permissions for publication.", "urls": {"canonical": "https://www.bl.uk/hebrew-manuscripts", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "Terms frequently restrict commercial reuse or require paid permissions for publication. Source: docs/sources/gemini_summary_2.md.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "medieval-modern", "languages": ["he"], "document_types": ["manuscript", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Rights must be verified per item before any ingestion.", "agent_notes": "Flagged as avoid unless specific items with confirmed permissive rights are identified.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_2.md", "quote": "Terms of use frequently restrict commercial reuse or require paid permissions for publication."}]} +{"source_id": "cambridge__digital_library_hebrew_genizah", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "Cambridge Digital Library", "title": "Cambridge Digital Library — Hebrew Manuscripts and Cairo Genizah (Taylor-Schechter)", "description": "Taylor-Schechter Genizah Collection and other Hebrew manuscripts. Terms frequently restrict commercial reuse or require paid permissions.", "urls": {"canonical": "https://cudl.lib.cam.ac.uk/", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "Terms frequently restrict commercial reuse or require paid permissions for publication. Source: docs/sources/gemini_summary_2.md.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "medieval-modern", "languages": ["he", "ar", "other"], "document_types": ["manuscript", "fragment", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Rights must be verified per item before any ingestion. Note: we already have 3 Cambridge entries sourced via Wikimedia Commons (CC BY-SA).", "agent_notes": "Already have Cambridge T-S entries via Wikimedia Commons where rights are confirmed. Do not bulk-ingest from CUDL directly.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_2.md", "quote": "Terms of use frequently restrict commercial reuse or require paid permissions for publication."}]} {"source_id": "commons__begani_netatikha", "record_type": "item", "status": "verified", "priority": "seed", "provider": "Wikimedia Commons", "title": "BeGani Netatikha handwritten manuscript", "description": "One-page handwritten Hebrew manuscript scan of Rachel Bluwstein's poem BeGani Netatikha, dated 1930 and hosted on Wikimedia Commons from the National Library of Israel.", "urls": {"canonical": "https://commons.wikimedia.org/wiki/File:BeGani_Netatikha.jpg", "landing": "https://web.nli.org.il/sites/NLI/Hebrew/digitallibrary/pages/viewer.aspx?docid=NNL03_EDU700291626&presentorid=NLI_EDU", "api": "https://commons.wikimedia.org/w/api.php?action=query&titles=File:BeGani_Netatikha.jpg&prop=imageinfo&iiprop=url%7Csize%7Cmime&format=json", "download": "https://upload.wikimedia.org/wikipedia/commons/6/6e/BeGani_Netatikha.jpg", "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "PDM-1.0", "commercial_use_allowed": true, "derivatives_allowed": true, "scan_redistribution_allowed": true, "attribution_required": false, "evidence_text": "Commons marks the scan and original as public domain and identifies the file as free of known copyright restrictions.", "terms_url": "https://creativecommons.org/publicdomain/mark/1.0/", "verification_status": "primary_page_checked", "verified_at": "2026-05-09"}, "scope": {"date_range": "1930", "languages": ["he"], "document_types": ["poem"], "creator_names": ["Rachel Bluwstein"], "expected_handwriting": "yes", "estimated_scan_count": 1}, "ingest": {"method": "manual_download", "access_notes": "Downloaded the Commons original JPEG and recorded its checksum, size, and dimensions at entry level.", "agent_notes": "This is a small but clean first verified Hebrew-language handwritten scan entry.", "blocked_reason": null}, "evidence": [{"kind": "primary_url", "citation": "https://commons.wikimedia.org/wiki/File:BeGani_Netatikha.jpg", "quote": "Description identifies the handwritten Hebrew poem scan, dated 1930, by Rachel Bluwstein."}, {"kind": "primary_url", "citation": "https://commons.wikimedia.org/wiki/File:BeGani_Netatikha.jpg#Licensing", "quote": "Commons marks the scan and original as public domain and free of known restrictions."}]} {"source_id": "commons__bialik_el_hazippor", "record_type": "item", "status": "verified", "priority": "high", "provider": "Wikimedia Commons", "title": "Bialik handwritten manuscript: 'El Ha-Tzippor' (1892)", "description": "Handwritten Hebrew manuscript of Hayyim Nahman Bialik's early poem 'El Ha-Tzippor' (To the Bird), dated 1892. Pre-dates the project's preferred post-1929 scope but is cleanly public domain (Bialik d. 1934).", "urls": {"canonical": "https://commons.wikimedia.org/wiki/File:El_hazippor.jpg", "landing": null, "api": "https://commons.wikimedia.org/w/api.php?action=query&titles=File:El_hazippor.jpg&prop=imageinfo&iiprop=url%7Csize%7Cmime&format=json", "download": "https://upload.wikimedia.org/wikipedia/commons/9/9a/El_hazippor.jpg", "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "PDM-1.0", "commercial_use_allowed": true, "derivatives_allowed": true, "scan_redistribution_allowed": true, "attribution_required": false, "evidence_text": "This work is in the public domain in its country of origin and other countries and areas where the copyright term is the author's life plus 70 years or fewer.", "terms_url": "https://creativecommons.org/publicdomain/mark/1.0/", "verification_status": "primary_page_checked", "verified_at": "2026-05-09"}, "scope": {"date_range": "1892", "languages": ["he"], "document_types": ["poem"], "creator_names": ["Hayyim Nahman Bialik"], "expected_handwriting": "yes", "estimated_scan_count": 1}, "ingest": {"method": "manual_download", "access_notes": "Downloaded original Commons JPEG.", "agent_notes": "19th-century Hebrew handwriting style; differs from modern Israeli cursive — flag in quality.notes.", "blocked_reason": null}, "evidence": [{"kind": "primary_url", "citation": "https://commons.wikimedia.org/wiki/File:El_hazippor.jpg", "quote": "This work is in the public domain in its country of origin and other countries and areas where the copyright term is the author's life plus 70 years or fewer."}]} {"source_id": "commons__bialik_letter_safed_1927", "record_type": "item", "status": "verified", "priority": "seed", "provider": "Wikimedia Commons", "title": "Bialik letter to Jerusalem institutions about Safed initiative (1927)", "description": "Multi-page handwritten Hebrew letter by Hayyim Nahman Bialik to Jerusalem institutions following his stay in Safed and his initiative to build in Safed, dated 1927. Hosted on Wikimedia Commons via the Bitmuna Archive (collaboration with Wikimedia Israel).", "urls": {"canonical": "https://commons.wikimedia.org/wiki/File:%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3552.jpeg", "landing": null, "api": "https://commons.wikimedia.org/w/api.php?action=query&titles=File:%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3552.jpeg&prop=imageinfo&iiprop=url%7Csize%7Cmime&format=json", "download": "https://upload.wikimedia.org/wikipedia/commons/7/77/%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3552.jpeg", "related": ["https://commons.wikimedia.org/wiki/File:%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3553.jpeg", "https://commons.wikimedia.org/wiki/File:%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3554.jpeg"]}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": true, "derivatives_allowed": true, "scan_redistribution_allowed": true, "attribution_required": false, "evidence_text": "This work or image is now in the public domain because its term of copyright has expired in Israel.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-09"}, "scope": {"date_range": "1927", "languages": ["he"], "document_types": ["letter"], "creator_names": ["Hayyim Nahman Bialik"], "expected_handwriting": "yes", "estimated_scan_count": 3}, "ingest": {"method": "manual_download", "access_notes": "Downloaded the original Commons JPEGs for all 3 pages of the letter (btm3552, btm3553, btm3554).", "agent_notes": "All three pages of the letter ingested as p0001/p0002/p0003 of this source.", "blocked_reason": null}, "evidence": [{"kind": "primary_url", "citation": "https://commons.wikimedia.org/wiki/File:%D7%9E%D7%9B%D7%AA%D7%91_%D7%91%D7%99%D7%90%D7%9C%D7%99%D7%A7_%D7%9C%D7%9E%D7%95%D7%A1%D7%93%D7%95%D7%AA_%D7%91%D7%99%D7%A8%D7%95%D7%A9%D7%9C%D7%99%D7%9D_%D7%9C%D7%90%D7%97%D7%A8_%D7%A9%D7%94%D7%95%D7%AA%D7%95_%D7%91%D7%A9%D7%A4%D7%99%D7%94_%D7%95%D7%A9%D7%99%D7%A8%D7%95_%D7%9C%D7%91%D7%A0%D7%95%D7%AA_%D7%A9%D7%A4%D7%99%D7%94_1927_-_btm3552.jpeg", "quote": "This work or image is now in the public domain because its term of copyright has expired in Israel."}]} @@ -49,6 +51,7 @@ {"source_id": "hhd__gender_zenodo", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "Zenodo / HHD", "title": "HHD_gender", "description": "Hebrew Handwritten Dataset gender subset; source notes identify research-only or non-commercial restrictions.", "urls": {"canonical": "https://zenodo.org/records/4729908", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "Seed notes report non-commercial academic/research-only constraints.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": 819}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless relicensed.", "agent_notes": "Keep as excluded lead to prevent accidental ingestion.", "blocked_reason": "Research-only or non-commercial use conflicts with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:165-176", "quote": "HHD_gender; non-commercial academic restriction"}]} {"source_id": "hhd__v0_tc11", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "TC11 / HHD", "title": "HHD_v0 isolated characters", "description": "Isolated Hebrew character dataset; licensing notes include no-derivatives or conflicting terms.", "urls": {"canonical": "https://tc11.cvc.uab.es/datasets/HHD_v0_1", "landing": "https://huggingface.co/datasets/sivan22/hebrew-handwritten-dataset", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": "CC-BY-ND-3.0", "commercial_use_allowed": null, "derivatives_allowed": false, "scan_redistribution_allowed": null, "attribution_required": true, "evidence_text": "Seed notes report CC BY-ND 3.0 / conflicting mirrors; no-derivatives conflicts with substantial transformation.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless primary authors grant compatible terms.", "agent_notes": "Useful only as external reference for HTR, not as corpus content.", "blocked_reason": "No-derivatives/conflicting license terms conflict with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:117-123", "quote": "HHD_v0 sources"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:174-176", "quote": "CC BY-ND 3.0 / CC BY 3.0 conflict"}]} {"source_id": "jabotinsky__zeev_jabotinsky_archive", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "Jabotinsky Institute Archive", "title": "Ze'ev Jabotinsky handwritten archive items", "description": "Archive leads for handwritten notes and drafts by Ze'ev Jabotinsky, including Hebrew Accent and Population Exchange notes.", "urls": {"canonical": "https://en.jabotinsky.org/archive/search-archive/item/?itemId=115024", "landing": "https://en.jabotinsky.org/archive/catalog-of-files/?section=A&arc=9704&page=78", "api": null, "download": null, "related": ["https://en.jabotinsky.org/archive/search-archive/item/?itemId=115421"]}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": false, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "Terms of Use (en.jabotinsky.org/about-us/terms-of-use/, verified 2026-05-23): content restricted to personal/educational/non-commercial use only; commercial publication and exploitation explicitly prohibited; no modifications permitted; commercial use requires prior written permission and possible usage fee. ML/HTR dataset use is out of scope.", "terms_url": "https://en.jabotinsky.org/about-us/terms-of-use/", "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930-1939", "languages": ["he", "yi", "de"], "document_types": ["draft", "speech", "letter", "other"], "creator_names": ["Ze'ev Jabotinsky"], "expected_handwriting": "mixed", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify downloadable PDFs and terms before copying scans into repo.", "agent_notes": "1928 speech notes are out of post-1929 scope but may inform handwriting style; do not include as entry unless scope changes.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:51-65", "quote": "The Hebrew Accent; Population Exchange Handwritten Notes"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:71-81", "quote": "direct PDF downloads of scanned original documents"}]} +{"source_id": "loc__hebrew_manuscripts_collection", "record_type": "collection", "status": "candidate", "priority": "high", "provider": "Library of Congress", "title": "Library of Congress — Hebrew Manuscripts Collection", "description": "Handwritten texts, religious commentaries, and drafts spanning centuries. US government entity; items lacking known copyright restrictions are free for general use.", "urls": {"canonical": "https://www.loc.gov/collections/hebrew-manuscripts/", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "PDM-1.0", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "LoC is a US government entity; items lacking known copyright restrictions are free for general use. Source: docs/sources/gemini_summary_2.md.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "medieval-modern", "languages": ["he", "yi", "other"], "document_types": ["manuscript", "draft", "commentary", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "api", "access_notes": "LoC JSON API available; query Hebrew Manuscripts collection and download JPEG/TIFF programmatically.", "agent_notes": "Need to verify per-item rights, as some items may have added restrictions from donor institutions.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_2.md", "quote": "LoC provides a robust JSON API. You can query their Hebrew Manuscript collection and programmatically download the raw JPEG/TIFF files."}]} {"source_id": "nli__hannah_senesh_archive", "record_type": "collection", "status": "verified", "priority": "seed", "provider": "National Library of Israel", "title": "Hannah Senesh Archive", "description": "Collection-level lead for Hannah Senesh diaries, manuscripts, correspondence, and related handwritten materials.", "urls": {"canonical": "https://www.nli.org.il/en/archives/nnl_archive_al997009165988705171/NLI", "landing": "https://www.nli.org.il/en/at-your-service/announcements/hannah-szenes-archive", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed notes report item pages marked Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1936-1944", "languages": ["he", "hu"], "document_types": ["diary", "notebook", "draft", "poem", "letter", "other"], "creator_names": ["Hannah Senesh"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Prioritize item-level NLI archive records with explicit download access and rights labels.", "agent_notes": "Promote only item pages with primary-page rights evidence; mixed Hebrew/Hungarian pages need page-level language tagging.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:3-15", "quote": "clearest source family; several specific item pages explicitly handwritten, Hebrew, post-1929, Any Use Permitted"}, {"kind": "repo_note", "citation": "docs/sources/notebooklm_summary_1.md:3-7", "quote": "Hannah Senesh Archive features public domain scans"}]} {"source_id": "nli__nnl_aleph990025684880205171", "record_type": "item", "status": "candidate", "priority": "medium", "provider": "National Library of Israel", "title": "יומן מהשואה by Elimelech Bash", "description": "Hebrew-script manuscript diary lead with public-domain rights claim; creation date needs verification.", "urls": {"canonical": "https://www.nli.org.il/en/manuscripts/NNL_ALEPH990025684880205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed note says Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": null, "languages": ["he"], "document_types": ["diary"], "creator_names": ["Elimelech Bash"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify date written is after 1929 before inclusion.", "agent_notes": "Promising non-Senesh NLI seed.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:93-106", "quote": "promising candidate that still needs date verification before inclusion"}]} {"source_id": "nli__nnl_archive_al990035403420205171", "record_type": "item", "status": "rejected", "priority": "exclude", "provider": "National Library of Israel", "title": "Hybrid Notebook (מחברת-שעטנז), Shaul Tchernichovsky", "description": "A mixture of notes and observations in handwritten Hebrew.", "urls": {"canonical": "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035403420205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "NLI item page rights checked 2026-05-23: 3 of 4 items marked \"permitted for research and study purposes only\"; 1 item does not permit redistribution. None meet the dataset requirement of free redistribution and transformation for ML/downstream use.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930s-1943", "languages": ["he"], "document_types": ["notebook"], "creator_names": ["Shaul Tchernichovsky"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "NLI Cloudflare blocks automated access; requires manual browser download.", "agent_notes": "Download as nli_tchern_notebook.zip via NLI download button.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md", "quote": "Hybrid Notebook (מחברת-שעטנז): A mixture of notes and observations."}]} @@ -61,4 +64,6 @@ {"source_id": "nli__nnl_archive_al997009912248505171", "record_type": "item", "status": "verified", "priority": "seed", "provider": "National Library of Israel", "title": "Handwritten Diary of Hannah Szenes in Hebrew and Draft of The Violin", "description": "Strong item-level seed: handwritten Hebrew diary plus play draft, dated 1941-1944.", "urls": {"canonical": "https://www.nli.org.il/en/archives/NNL_ARCHIVE_AL997009912248505171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed note says the item page says Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1941-1944", "languages": ["he"], "document_types": ["diary", "draft"], "creator_names": ["Hannah Senesh"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify direct image/PDF access and split multi-page item into entries.", "agent_notes": "Good first page-level ingestion candidate.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:16-31", "quote": "explicitly handwritten, Hebrew, post-1929; Any Use Permitted; Public Domain in Israel"}]} {"source_id": "nli__nnl_archive_al997009912248705171", "record_type": "item", "status": "verified", "priority": "high", "provider": "National Library of Israel", "title": "Handwritten Diary of Hannah Szenes in Hebrew and Hungarian", "description": "Mixed Hebrew/Hungarian handwritten diary dated 1938-1941.", "urls": {"canonical": "https://www.nli.org.il/en/archives/NNL_ARCHIVE_AL997009912248705171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed note says Any Use Permitted and Public Domain in Israel, but remote access may need confirmation.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1938-1941", "languages": ["he", "hu"], "document_types": ["diary"], "creator_names": ["Hannah Senesh"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Confirm whether online access is available outside the NLI building.", "agent_notes": "Tag Hebrew pages separately from Hungarian-heavy pages.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:62-78", "quote": "Any Use Permitted; Public Domain in Israel; Online access from NLI building caveat"}]} {"source_id": "nli__shaul_tchernichovsky_archive_items", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "National Library of Israel", "title": "Shaul Tchernichovsky handwritten archive items", "description": "NLI leads for receipts, literary drafts, notebooks, and memorandum drafts by Shaul Tchernichovsky.", "urls": {"canonical": "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035912210205171/NLI", "landing": null, "api": null, "download": null, "related": ["https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035912380205171/NLI", "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035403420205171/NLI", "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035912230205171/NLI"]}, "rights": {"rights_basis": "unknown", "license_expression": null, "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "All 4 known NLI items checked 2026-05-23: rights are either \"research and study only\" or no redistribution allowed. Entire cluster out of scope for this dataset.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930s-1943", "languages": ["he"], "document_types": ["receipt", "draft", "notebook", "other"], "creator_names": ["Shaul Tchernichovsky"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Expand each NLI item into its own source row before harvesting scans.", "agent_notes": "Useful for handwriting diversity beyond Senesh.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:5-34", "quote": "Seven handwritten receipts; The Bandage; Hybrid Notebook; Draft of a Memorandum"}]} +{"source_id": "nypl__hebrew_manuscripts_digital_collections", "record_type": "collection", "status": "candidate", "priority": "high", "provider": "New York Public Library", "title": "NYPL Digital Collections — Hebrew Manuscripts, Ketubbot, and Letters", "description": "Hebrew Illuminated Manuscripts, historical Ketubbot (handwritten marriage contracts), and early modern letters. Out-of-copyright materials are completely free for any use including commercial.", "urls": {"canonical": "https://digitalcollections.nypl.org/", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "PDM-1.0", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "NYPL policy: out-of-copyright digital materials are completely free for any use including commercial, no permission required. Source: docs/sources/gemini_summary_2.md.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "medieval-modern", "languages": ["he", "yi", "other"], "document_types": ["manuscript", "ketubbah", "letter", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "api", "access_notes": "Filter by public domain on portal; public API available for programmatic download. Need to identify specific in-scope Hebrew handwriting items.", "agent_notes": "Ketubbot (marriage contracts) are particularly promising — high volume, post-1929 dates possible, standardized form but varied handwriting.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_2.md", "quote": "NYPL has a policy of making all of its out-of-copyright digital materials completely free for any use, including commercial, with no permission required."}]} +{"source_id": "openn__katz_center_judaica", "record_type": "collection", "status": "candidate", "priority": "high", "provider": "OPenn, University of Pennsylvania Libraries", "title": "Katz Center for Advanced Judaic Studies — Hebrew Manuscripts (OPenn)", "description": "Hundreds of digitized handwritten Hebrew manuscripts, codices, and historical documents. All OPenn content is CC0 1.0 Universal.", "urls": {"canonical": "https://openn.library.upenn.edu/", "landing": "https://openn.library.upenn.edu/ReadMe.html", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "CC0-1.0", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "OPenn platform policy: all content released under CC0 1.0 Universal (Public Domain Dedication). Source: docs/sources/gemini_summary_2.md.", "terms_url": "https://openn.library.upenn.edu/ReadMe.html", "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "medieval-modern", "languages": ["he", "yi", "ar", "other"], "document_types": ["manuscript", "codex", "letter", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "api", "access_notes": "Bulk rsync or FTP access (openn.library.upenn.edu). Need to identify specific Judaica sub-collections with Hebrew handwriting in scope, then filter by date range.", "agent_notes": "High-value lead: CC0 license, bulk access, and OPenn is explicitly designed for computational use.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_2.md", "quote": "All of it is released under a CC0 1.0 Universal (Public Domain Dedication). You can pull entire directories of high-res TIFFs/JPEGs and XML metadata via Rsync or direct FTP."}]} {"source_id": "wikimedia__handwritten_hebrew_letters", "record_type": "category", "status": "verified", "priority": "medium", "provider": "Wikimedia Commons", "title": "Category: Handwritten Hebrew letters", "description": "Commons category lead for freely licensed or public-domain handwritten Hebrew letter images and related media.", "urls": {"canonical": "https://commons.wikimedia.org/wiki/Category:Handwritten_Hebrew_letters", "landing": null, "api": null, "download": null, "related": ["https://commons.wikimedia.org/wiki/Category:Hebrew_handwriting_scripts"]}, "rights": {"rights_basis": "mixed", "license_expression": "PDM-1.0", "commercial_use_allowed": true, "derivatives_allowed": true, "scan_redistribution_allowed": true, "attribution_required": false, "evidence_text": "Per-file verification: 2 qualifying files ingested. File:Delacroix letter.png uses PD-old-100; File:Solitreo contract.jpg uses PD-Art|PD-old-70. Most other category files are SVG teaching samples, character-level crops, or CC-BY-SA 3.0 (excluded). Mixed rights overall; ingested items are all PDM-1.0.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-15"}, "scope": {"date_range": "mixed", "languages": ["he", "lad"], "document_types": ["letter", "other"], "creator_names": [], "expected_handwriting": "mixed", "estimated_scan_count": 2}, "ingest": {"method": "api", "access_notes": "Use MediaWiki API and file pages; exclude SVG teaching samples unless the dataset explicitly wants vector handwriting examples.", "agent_notes": "Ingested 2 qualifying handwritten scans from Category:Handwritten_Hebrew_letters and subcategory Category:Solitreo_script (under Category:Hebrew_handwriting_scripts). Most files in the main category were excluded: SVG teaching samples, tiny character crops (<55px), group photographs, and CC-BY-SA 3.0 licensed files. Files with {{Wrong license}} template were excluded. Two public-domain Solitreo script documents qualified.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:127-141", "quote": "community-moderated repository containing original photographic scans and SVG reproductions"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:204-210", "quote": "files generally mandated to be freely usable media"}]} diff --git a/docs/sources/gemini_summary_2.md b/docs/sources/gemini_summary_2.md new file mode 100644 index 0000000..e82e1c9 --- /dev/null +++ b/docs/sources/gemini_summary_2.md @@ -0,0 +1,65 @@ +# Gemini Research Summary 2 — Commercial-Use Hebrew Handwriting Sources + +*Ingested 2026-05-23. Prompt: best repositories for redistribution-friendly handwritten Hebrew.* + +## Key finding + +Almost all pre-packaged, ML-ready Hebrew handwriting datasets (e.g. HHD) are licensed +"Non-Commercial Research Purpose Only." To build a commercially usable dataset, source +high-resolution scans directly from digital library archives under CC0, PD, or CC BY. + +--- + +## Recommended sources + +### 1. OPenn (University of Pennsylvania Libraries) + +All content released under **CC0 1.0 Universal** (Public Domain Dedication). + +- **Collection:** Katz Center for Advanced Judaic Studies — hundreds of digitized + handwritten Hebrew manuscripts, codices, and historical documents. +- **Access:** Designed for bulk downloading. Pull entire directories of high-res + TIFFs/JPEGs and XML metadata via **rsync or direct FTP**. +- URL: https://openn.library.upenn.edu/ + +### 2. Wikimedia Commons + +Every file must be PD, CC0, CC BY, or CC BY-SA — always redistribution + commercial safe. + +- **Collections:** `Category:Hebrew manuscripts`, `Category:Hebrew handwriting`. + Thousands of individual pages, fragments, letters. +- **Access:** MediaWiki API or Wikimedia Toolforge for bulk category scraping. +- URL: https://commons.wikimedia.org/ + +### 3. New York Public Library (NYPL) Digital Collections + +Out-of-copyright digital materials: completely free for any use including commercial, +no permission required. + +- **Collections:** Hebrew Illuminated Manuscripts, historical Ketubbot (handwritten + marriage contracts), early modern letters. +- **Access:** Filter "Search only public domain materials" on the portal; download + high-res files directly or use the public API. +- URL: https://digitalcollections.nypl.org/ + +### 4. Library of Congress (LoC) + +US government entity; items lacking known copyright restrictions are free for general use. + +- **Collections:** "Hebrew Manuscripts" — handwritten texts, religious commentaries, + drafts spanning centuries. +- **Access:** Robust JSON API; query Hebrew Manuscript collection and download + JPEG/TIFF files programmatically. +- URL: https://www.loc.gov/collections/hebrew-manuscripts/ + +--- + +## Sources to avoid or check carefully + +- **HHD (Hebrew Handwritten Dataset):** Strictly non-commercial. +- **British Library & Cambridge Digital Library:** Hold incredible Cairo Genizah / + Hebrew manuscript collections but terms frequently restrict commercial reuse or + require paid permissions. +- **Ktiv (National Library of Israel aggregator):** Aggregates from hundreds of + libraries worldwide. The actual medieval texts are out of copyright, but holding + institutions often place restrictive terms on their digital photographs.