diff --git a/data/index/sources.jsonl b/data/index/sources.jsonl index 440110b..de5b718 100644 --- a/data/index/sources.jsonl +++ b/data/index/sources.jsonl @@ -48,7 +48,7 @@ {"source_id": "hhd__age_kaggle", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "Kaggle / HHD", "title": "HHD_age", "description": "Hebrew Handwritten Dataset age subset; useful reference but not compatible with remix-friendly corpus goals.", "urls": {"canonical": "https://www.kaggle.com/datasets/liorabergel/hhd-age", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": "CC-BY-NC-SA-4.0", "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": true, "evidence_text": "Seed notes report non-commercial/share-alike terms and research-purpose restrictions.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": 850}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless relicensed.", "agent_notes": "May remain as a negative/restricted source record for search completeness.", "blocked_reason": "Non-commercial restriction conflicts with downstream remix and commercial use."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:101-107", "quote": "licensed under CC BY-NC-SA 4.0"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:169-182", "quote": "do not strictly qualify as public domain"}]} {"source_id": "hhd__gender_zenodo", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "Zenodo / HHD", "title": "HHD_gender", "description": "Hebrew Handwritten Dataset gender subset; source notes identify 
research-only or non-commercial restrictions.", "urls": {"canonical": "https://zenodo.org/records/4729908", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "Seed notes report non-commercial academic/research-only constraints.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": 819}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless relicensed.", "agent_notes": "Keep as excluded lead to prevent accidental ingestion.", "blocked_reason": "Research-only or non-commercial use conflicts with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:165-176", "quote": "HHD_gender; non-commercial academic restriction"}]} {"source_id": "hhd__v0_tc11", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "TC11 / HHD", "title": "HHD_v0 isolated characters", "description": "Isolated Hebrew character dataset; licensing notes include no-derivatives or conflicting terms.", "urls": {"canonical": "https://tc11.cvc.uab.es/datasets/HHD_v0_1", "landing": "https://huggingface.co/datasets/sivan22/hebrew-handwritten-dataset", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": "CC-BY-ND-3.0", "commercial_use_allowed": null, "derivatives_allowed": false, "scan_redistribution_allowed": null, "attribution_required": true, "evidence_text": "Seed notes report CC BY-ND 3.0 / conflicting mirrors; no-derivatives conflicts with substantial transformation.", "terms_url": null, 
"verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless primary authors grant compatible terms.", "agent_notes": "Useful only as external reference for HTR, not as corpus content.", "blocked_reason": "No-derivatives/conflicting license terms conflict with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:117-123", "quote": "HHD_v0 sources"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:174-176", "quote": "CC BY-ND 3.0 / CC BY 3.0 conflict"}]} -{"source_id": "jabotinsky__zeev_jabotinsky_archive", "record_type": "collection", "status": "candidate", "priority": "high", "provider": "Jabotinsky Institute Archive", "title": "Ze'ev Jabotinsky handwritten archive items", "description": "Archive leads for handwritten notes and drafts by Ze'ev Jabotinsky, including Hebrew Accent and Population Exchange notes.", "urls": {"canonical": "https://en.jabotinsky.org/archive/search-archive/item/?itemId=115024", "landing": "https://en.jabotinsky.org/archive/catalog-of-files/?section=A&arc=9704&page=78", "api": null, "download": null, "related": ["https://en.jabotinsky.org/archive/search-archive/item/?itemId=115421"]}, "rights": {"rights_basis": "unknown", "license_expression": null, "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "Seed notes infer public domain from Jabotinsky death year, but repository terms and scan redistribution need primary verification.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1930-1939", "languages": ["he", "yi", "de"], 
"document_types": ["draft", "speech", "letter", "other"], "creator_names": ["Ze'ev Jabotinsky"], "expected_handwriting": "mixed", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify downloadable PDFs and terms before copying scans into repo.", "agent_notes": "1928 speech notes are out of post-1929 scope but may inform handwriting style; do not include as entry unless scope changes.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:51-65", "quote": "The Hebrew Accent; Population Exchange Handwritten Notes"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:71-81", "quote": "direct PDF downloads of scanned original documents"}]} +{"source_id": "jabotinsky__zeev_jabotinsky_archive", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "Jabotinsky Institute Archive", "title": "Ze'ev Jabotinsky handwritten archive items", "description": "Archive leads for handwritten notes and drafts by Ze'ev Jabotinsky, including Hebrew Accent and Population Exchange notes.", "urls": {"canonical": "https://en.jabotinsky.org/archive/search-archive/item/?itemId=115024", "landing": "https://en.jabotinsky.org/archive/catalog-of-files/?section=A&arc=9704&page=78", "api": null, "download": null, "related": ["https://en.jabotinsky.org/archive/search-archive/item/?itemId=115421"]}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": false, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "Terms of Use (en.jabotinsky.org/about-us/terms-of-use/, verified 2026-05-23): content restricted to personal/educational/non-commercial use only; commercial publication and exploitation explicitly prohibited; no modifications permitted; commercial use requires prior written permission and possible usage fee. 
ML/HTR dataset use is out of scope.", "terms_url": "https://en.jabotinsky.org/about-us/terms-of-use/", "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930-1939", "languages": ["he", "yi", "de"], "document_types": ["draft", "speech", "letter", "other"], "creator_names": ["Ze'ev Jabotinsky"], "expected_handwriting": "mixed", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Do not copy scans into repo or include in remix-friendly release bundles unless the Jabotinsky Institute grants compatible terms in writing.", "agent_notes": "1928 speech notes are out of post-1929 scope but may inform handwriting style; do not include as entry unless scope changes.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:51-65", "quote": "The Hebrew Accent; Population Exchange Handwritten Notes"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:71-81", "quote": "direct PDF downloads of scanned original documents"}]} {"source_id": "nli__hannah_senesh_archive", "record_type": "collection", "status": "verified", "priority": "seed", "provider": "National Library of Israel", "title": "Hannah Senesh Archive", "description": "Collection-level lead for Hannah Senesh diaries, manuscripts, correspondence, and related handwritten materials.", "urls": {"canonical": "https://www.nli.org.il/en/archives/nnl_archive_al997009165988705171/NLI", "landing": "https://www.nli.org.il/en/at-your-service/announcements/hannah-szenes-archive", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed notes report item pages marked Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, 
"scope": {"date_range": "1936-1944", "languages": ["he", "hu"], "document_types": ["diary", "notebook", "draft", "poem", "letter", "other"], "creator_names": ["Hannah Senesh"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Prioritize item-level NLI archive records with explicit download access and rights labels.", "agent_notes": "Promote only item pages with primary-page rights evidence; mixed Hebrew/Hungarian pages need page-level language tagging.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:3-15", "quote": "clearest source family; several specific item pages explicitly handwritten, Hebrew, post-1929, Any Use Permitted"}, {"kind": "repo_note", "citation": "docs/sources/notebooklm_summary_1.md:3-7", "quote": "Hannah Senesh Archive features public domain scans"}]} {"source_id": "nli__nnl_aleph990025684880205171", "record_type": "item", "status": "candidate", "priority": "medium", "provider": "National Library of Israel", "title": "יומן מהשואה by Elimelech Bash", "description": "Hebrew-script manuscript diary lead with public-domain rights claim; creation date needs verification.", "urls": {"canonical": "https://www.nli.org.il/en/manuscripts/NNL_ALEPH990025684880205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed note says Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": null, "languages": ["he"], "document_types": ["diary"], "creator_names": ["Elimelech Bash"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": 
"manual_download", "access_notes": "Verify date written is after 1929 before inclusion.", "agent_notes": "Promising non-Senesh NLI seed.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:93-106", "quote": "promising candidate that still needs date verification before inclusion"}]} {"source_id": "nli__nnl_archive_al990035403420205171", "record_type": "item", "status": "rejected", "priority": "exclude", "provider": "National Library of Israel", "title": "Hybrid Notebook (מחברת-שעטנז), Shaul Tchernichovsky", "description": "A mixture of notes and observations in handwritten Hebrew.", "urls": {"canonical": "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035403420205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "NLI item page rights checked 2026-05-23: 3 of 4 items marked \"permitted for research and study purposes only\"; 1 item does not permit redistribution. None meet the dataset requirement of free redistribution and transformation for ML/downstream use.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930s-1943", "languages": ["he"], "document_types": ["notebook"], "creator_names": ["Shaul Tchernichovsky"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "NLI Cloudflare blocks automated access; requires manual browser download.", "agent_notes": "Download as nli_tchern_notebook.zip via NLI download button.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md", "quote": "Hybrid Notebook (מחברת-שעטנז): A mixture of notes and observations."}]}