Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/index/sources.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
{"source_id": "hhd__age_kaggle", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "Kaggle / HHD", "title": "HHD_age", "description": "Hebrew Handwritten Dataset age subset; useful reference but not compatible with remix-friendly corpus goals.", "urls": {"canonical": "https://www.kaggle.com/datasets/liorabergel/hhd-age", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": "CC-BY-NC-SA-4.0", "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": true, "evidence_text": "Seed notes report non-commercial/share-alike terms and research-purpose restrictions.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": 850}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless relicensed.", "agent_notes": "May remain as a negative/restricted source record for search completeness.", "blocked_reason": "Non-commercial restriction conflicts with downstream remix and commercial use."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:101-107", "quote": "licensed under CC BY-NC-SA 4.0"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:169-182", "quote": "do not strictly qualify as public domain"}]}
{"source_id": "hhd__gender_zenodo", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "Zenodo / HHD", "title": "HHD_gender", "description": "Hebrew Handwritten Dataset gender subset; source notes identify research-only or non-commercial restrictions.", "urls": {"canonical": "https://zenodo.org/records/4729908", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "Seed notes report non-commercial academic/research-only constraints.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": 819}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless relicensed.", "agent_notes": "Keep as excluded lead to prevent accidental ingestion.", "blocked_reason": "Research-only or non-commercial use conflicts with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:165-176", "quote": "HHD_gender; non-commercial academic restriction"}]}
{"source_id": "hhd__v0_tc11", "record_type": "dataset", "status": "rejected", "priority": "exclude", "provider": "TC11 / HHD", "title": "HHD_v0 isolated characters", "description": "Isolated Hebrew character dataset; licensing notes include no-derivatives or conflicting terms.", "urls": {"canonical": "https://tc11.cvc.uab.es/datasets/HHD_v0_1", "landing": "https://huggingface.co/datasets/sivan22/hebrew-handwritten-dataset", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": "CC-BY-ND-3.0", "commercial_use_allowed": null, "derivatives_allowed": false, "scan_redistribution_allowed": null, "attribution_required": true, "evidence_text": "Seed notes report CC BY-ND 3.0 / conflicting mirrors; no-derivatives conflicts with substantial transformation.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "modern", "languages": ["he"], "document_types": ["form", "other"], "creator_names": [], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "dataset_download", "access_notes": "Do not include in remix-friendly release bundles unless primary authors grant compatible terms.", "agent_notes": "Useful only as external reference for HTR, not as corpus content.", "blocked_reason": "No-derivatives/conflicting license terms conflict with dataset goals."}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:117-123", "quote": "HHD_v0 sources"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:174-176", "quote": "CC BY-ND 3.0 / CC BY 3.0 conflict"}]}
{"source_id": "jabotinsky__zeev_jabotinsky_archive", "record_type": "collection", "status": "candidate", "priority": "high", "provider": "Jabotinsky Institute Archive", "title": "Ze'ev Jabotinsky handwritten archive items", "description": "Archive leads for handwritten notes and drafts by Ze'ev Jabotinsky, including Hebrew Accent and Population Exchange notes.", "urls": {"canonical": "https://en.jabotinsky.org/archive/search-archive/item/?itemId=115024", "landing": "https://en.jabotinsky.org/archive/catalog-of-files/?section=A&arc=9704&page=78", "api": null, "download": null, "related": ["https://en.jabotinsky.org/archive/search-archive/item/?itemId=115421"]}, "rights": {"rights_basis": "unknown", "license_expression": null, "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": null, "evidence_text": "Seed notes infer public domain from Jabotinsky death year, but repository terms and scan redistribution need primary verification.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1930-1939", "languages": ["he", "yi", "de"], "document_types": ["draft", "speech", "letter", "other"], "creator_names": ["Ze'ev Jabotinsky"], "expected_handwriting": "mixed", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify downloadable PDFs and terms before copying scans into repo.", "agent_notes": "1928 speech notes are out of post-1929 scope but may inform handwriting style; do not include as entry unless scope changes.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:51-65", "quote": "The Hebrew Accent; Population Exchange Handwritten Notes"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:71-81", "quote": "direct PDF downloads of scanned original documents"}]}
{"source_id": "jabotinsky__zeev_jabotinsky_archive", "record_type": "collection", "status": "rejected", "priority": "exclude", "provider": "Jabotinsky Institute Archive", "title": "Ze'ev Jabotinsky handwritten archive items", "description": "Archive leads for handwritten notes and drafts by Ze'ev Jabotinsky, including Hebrew Accent and Population Exchange notes.", "urls": {"canonical": "https://en.jabotinsky.org/archive/search-archive/item/?itemId=115024", "landing": "https://en.jabotinsky.org/archive/catalog-of-files/?section=A&arc=9704&page=78", "api": null, "download": null, "related": ["https://en.jabotinsky.org/archive/search-archive/item/?itemId=115421"]}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": false, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "Terms of Use (en.jabotinsky.org/about-us/terms-of-use/, verified 2026-05-23): content restricted to personal/educational/non-commercial use only; commercial publication and exploitation explicitly prohibited; no modifications permitted; commercial use requires prior written permission and possible usage fee. ML/HTR dataset use is out of scope.", "terms_url": "https://en.jabotinsky.org/about-us/terms-of-use/", "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930-1939", "languages": ["he", "yi", "de"], "document_types": ["draft", "speech", "letter", "other"], "creator_names": ["Ze'ev Jabotinsky"], "expected_handwriting": "mixed", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify downloadable PDFs and terms before copying scans into repo.", "agent_notes": "1928 speech notes are out of post-1929 scope but may inform handwriting style; do not include as entry unless scope changes.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md:51-65", "quote": "The Hebrew Accent; Population Exchange Handwritten Notes"}, {"kind": "repo_note", "citation": "docs/sources/gemini_report_1.md:71-81", "quote": "direct PDF downloads of scanned original documents"}]}
{"source_id": "nli__hannah_senesh_archive", "record_type": "collection", "status": "verified", "priority": "seed", "provider": "National Library of Israel", "title": "Hannah Senesh Archive", "description": "Collection-level lead for Hannah Senesh diaries, manuscripts, correspondence, and related handwritten materials.", "urls": {"canonical": "https://www.nli.org.il/en/archives/nnl_archive_al997009165988705171/NLI", "landing": "https://www.nli.org.il/en/at-your-service/announcements/hannah-szenes-archive", "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed notes report item pages marked Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": "1936-1944", "languages": ["he", "hu"], "document_types": ["diary", "notebook", "draft", "poem", "letter", "other"], "creator_names": ["Hannah Senesh"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Prioritize item-level NLI archive records with explicit download access and rights labels.", "agent_notes": "Promote only item pages with primary-page rights evidence; mixed Hebrew/Hungarian pages need page-level language tagging.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:3-15", "quote": "clearest source family; several specific item pages explicitly handwritten, Hebrew, post-1929, Any Use Permitted"}, {"kind": "repo_note", "citation": "docs/sources/notebooklm_summary_1.md:3-7", "quote": "Hannah Senesh Archive features public domain scans"}]}
{"source_id": "nli__nnl_aleph990025684880205171", "record_type": "item", "status": "candidate", "priority": "medium", "provider": "National Library of Israel", "title": "יומן מהשואה by Elimelech Bash", "description": "Hebrew-script manuscript diary lead with public-domain rights claim; creation date needs verification.", "urls": {"canonical": "https://www.nli.org.il/en/manuscripts/NNL_ALEPH990025684880205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "public_domain", "license_expression": "LicenseRef-Public-Domain-Israel", "commercial_use_allowed": null, "derivatives_allowed": null, "scan_redistribution_allowed": null, "attribution_required": false, "evidence_text": "Seed note says Any Use Permitted and Public Domain in Israel.", "terms_url": null, "verification_status": "source_note_only", "verified_at": null}, "scope": {"date_range": null, "languages": ["he"], "document_types": ["diary"], "creator_names": ["Elimelech Bash"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "Verify date written is after 1929 before inclusion.", "agent_notes": "Promising non-Senesh NLI seed.", "blocked_reason": null}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/chatgpt_summary_1.md:93-106", "quote": "promising candidate that still needs date verification before inclusion"}]}
{"source_id": "nli__nnl_archive_al990035403420205171", "record_type": "item", "status": "rejected", "priority": "exclude", "provider": "National Library of Israel", "title": "Hybrid Notebook (מחברת-שעטנז), Shaul Tchernichovsky", "description": "A mixture of notes and observations in handwritten Hebrew.", "urls": {"canonical": "https://www.nli.org.il/he/archives/NNL_ARCHIVE_AL990035403420205171/NLI", "landing": null, "api": null, "download": null, "related": []}, "rights": {"rights_basis": "restricted", "license_expression": null, "commercial_use_allowed": false, "derivatives_allowed": null, "scan_redistribution_allowed": false, "attribution_required": null, "evidence_text": "NLI item page rights checked 2026-05-23: 3 of 4 items marked \"permitted for research and study purposes only\"; 1 item does not permit redistribution. None meet the dataset requirement of free redistribution and transformation for ML/downstream use.", "terms_url": null, "verification_status": "primary_page_checked", "verified_at": "2026-05-23"}, "scope": {"date_range": "1930s-1943", "languages": ["he"], "document_types": ["notebook"], "creator_names": ["Shaul Tchernichovsky"], "expected_handwriting": "yes", "estimated_scan_count": null}, "ingest": {"method": "manual_download", "access_notes": "NLI Cloudflare blocks automated access; requires manual browser download.", "agent_notes": "Download as nli_tchern_notebook.zip via NLI download button.", "blocked_reason": "rights_restriction"}, "evidence": [{"kind": "repo_note", "citation": "docs/sources/gemini_summary_1.md", "quote": "Hybrid Notebook (מחברת-שעטנז): A mixture of notes and observations."}]}
Expand Down
Loading