diff --git a/.gitignore b/.gitignore index 1b12e9c..2447578 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__/ .venv/ venv/ .pytest_cache/ +.review_feedback.json diff --git a/CITATION.cff b/CITATION.cff index f2476d0..23943eb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,7 +3,7 @@ cff-version: 1.2.0 message: Please cite this dataset using the metadata below. type: dataset title: Hebrew Handwritten Per-Letter Image Dataset -abstract: 'Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc is the initial-setup release: the corpus contains no per-letter image entries yet. The repository ships the schemas, validation tooling, CI, and licensing policy needed to start ingesting.' +abstract: Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc contains 7 per-letter image entries drawn from 1 verified writer (7 PDM-1.0). authors: - name: Shay Palachy-Affek version: 0.0.0-rc diff --git a/NOTICE.md b/NOTICE.md index c55feca..40dd264 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -7,7 +7,7 @@ Repository-authored metadata is dedicated to the public domain under CC0 1.0 Uni Per-letter image crops are derivatives of upstream scans in [HeOCR/public-domain-hand-written-hebrew-scans](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans) and carry per-entry rights inherited from the source page. The entries listed below carry a license that requires attribution (currently CC-BY-4.0, CC-BY-SA-4.0).
Anyone redistributing or reusing these crops must keep the listed credit and link to the source page on which the rights claim was verified. - Corpus release: `0.0.0-rc` -- Released at (corpus state): `2026-05-12T00:00:00Z` +- Released at (corpus state): `2026-05-12T22:30:00Z` ## Attribution-required entries diff --git a/data/index/entries.jsonl b/data/index/entries.jsonl index e69de29..356f5f2 100644 --- a/data/index/entries.jsonl +++ b/data/index/entries.jsonl @@ -0,0 +1,7 @@ +{"entry_id": "chaim_nachman_bialik__bet__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3301, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png", "mime_type": "image/png", "sha256": "f699ee63a92ee3459377547bce0ff1188e8ad2fb8086e7e683f5133e2347a627", "width_px": 15}, "letter": {"codepoint": "U+05D1", "form": "regular", "name": "bet", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ב"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 15, "x": 343, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__kaf__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3093, "height_px": 16, "local_path": "data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png", "mime_type": "image/png", "sha256": "f64f3120eaa7f07244bcd2b730683f7e7ee72316abb7f783b6daaeed0883915a", "width_px": 12}, "letter": {"codepoint": "U+05DB", "form": "regular", "name": "kaf", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "כ"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Split from original kaf+yod double-letter bbox; right (kaf) portion only. 
Low-resolution upstream scan.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 16, "w": 12, "x": 330, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__lamed__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3468, "height_px": 28, "local_path": "data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png", "mime_type": "image/png", "sha256": "b9b291b0c6b701c759ac1475ae74106d98cd4e22faa121e53c5eea2f91dc085c", "width_px": 15}, "letter": {"codepoint": "U+05DC", "form": "regular", "name": "lamed", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ל"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 28, "w": 15, "x": 200, "y": 192}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__mem__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3098, "height_px": 23, "local_path": "data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png", "mime_type": "image/png", "sha256": "cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72", "width_px": 12}, "letter": {"codepoint": "U+05DE", "form": "regular", "name": "mem", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "מ"}, "quality": {"exclusion_reasons": [], "legibility": "low", "notes": "Collapsed mem form in cursive Ashkenazi hand; visually resembles yod or a small square. 
Hard/ambiguous example — treat with care for HTR training.", "usable_for_htr": true, "usable_for_syngen": false}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 23, "w": 12, "x": 290, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__resh__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3193, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png", "mime_type": "image/png", "sha256": "1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a", "width_px": 13}, "letter": {"codepoint": "U+05E8", "form": "regular", "name": "resh", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ר"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 13, "x": 278, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__tav__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3195, "height_px": 20, "local_path": "data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png", "mime_type": "image/png", "sha256": "db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628", "width_px": 14}, "letter": {"codepoint": "U+05EA", "form": "regular", "name": "tav", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ת"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 20, "w": 14, "x": 309, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__yod__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 2879, "height_px": 16, "local_path": "data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png", "mime_type": "image/png", "sha256": "5cc808ed43f33a788e392ae66fecd580a6c183fb83b49c08fda21a66538c0207", "width_px": 7}, "letter": {"codepoint": "U+05D9", "form": "regular", "name": "yod", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "י"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Split from original kaf+yod double-letter bbox; left (yod) portion only. Small stroke consistent with cursive yod. Low-resolution upstream scan.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 16, "w": 7, "x": 324, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} diff --git a/data/index/writers.jsonl b/data/index/writers.jsonl index e69de29..40bd4b7 100644 --- a/data/index/writers.jsonl +++ b/data/index/writers.jsonl @@ -0,0 +1 @@ +{"also_known_as": ["Hayyim Nahman Bialik", "Haim Nahman Bialik", "H. N. Bialik", "חיים נחמן ביאליק", "חיים נחמן ביאַליק"], "dates": {"birth_precision": "exact", "birth_year": 1873, "death_precision": "exact", "death_year": 1934}, "description": "Russian-born Hebrew poet (1873-1934), widely regarded as Israel's national poet. Among the pioneers of modern Hebrew poetry; his manuscript drafts and personal letters are a primary source of early-20th-century handwritten modern Hebrew.", "display_name": "Chaim Nachman Bialik", "ingest": {"agent_notes": "Seed writer for v0 ingest. 
First per-letter crops drawn from a single manuscript page (commons__bialik_el_hazippor__p0001) to validate the manual-extraction pipeline end-to-end.", "blocked_reason": null}, "languages_written": ["he", "yi"], "period": {"end": "1934", "precision": "year", "start": "1890"}, "references": [{"citation": "Wikipedia: Hayim Nahman Bialik", "kind": "secondary_url", "quote": null, "url": "https://en.wikipedia.org/wiki/Hayim_Nahman_Bialik"}, {"citation": "VIAF authority record 27069388 (Bialik, Ḥayyim Naḥman, 1873-1934)", "kind": "authority_record", "quote": null, "url": "https://viaf.org/viaf/27069388/"}, {"citation": "Wikimedia Commons: manuscript draft of 'El Hatzippor' (autograph).", "kind": "primary_url", "quote": null, "url": "https://commons.wikimedia.org/wiki/File:Bialik_El_hazippor.jpg"}], "scripts_written": ["Hebr"], "status": "verified", "writer_id": "chaim_nachman_bialik"} diff --git a/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png b/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png new file mode 100644 index 0000000..45699b2 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f699ee63a92ee3459377547bce0ff1188e8ad2fb8086e7e683f5133e2347a627 +size 3301 diff --git a/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png new file mode 100644 index 0000000..ff31f1c --- /dev/null +++ b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64f3120eaa7f07244bcd2b730683f7e7ee72316abb7f783b6daaeed0883915a +size 3093 diff --git a/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png b/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png new file mode 100644 index 
0000000..e158742 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b291b0c6b701c759ac1475ae74106d98cd4e22faa121e53c5eea2f91dc085c +size 3468 diff --git a/data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png b/data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png new file mode 100644 index 0000000..ebef1f1 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72 +size 3098 diff --git a/data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png b/data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png new file mode 100644 index 0000000..c2d20a9 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a +size 3193 diff --git a/data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png b/data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png new file mode 100644 index 0000000..2f2df84 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628 +size 3195 diff --git a/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png b/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png new file mode 100644 index 0000000..d654f91 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5cc808ed43f33a788e392ae66fecd580a6c183fb83b49c08fda21a66538c0207 +size 2879 diff --git a/datapackage.json b/datapackage.json index bea3b70..af3c426 100644 --- a/datapackage.json +++ b/datapackage.json @@ -23,14 +23,20 @@ "path": "https://creativecommons.org/publicdomain/zero/1.0/", "scope": "metadata", "title": "Creative Commons Zero v1.0 Universal" + }, + { + "name": "PDM-1.0", + "path": "https://creativecommons.org/publicdomain/mark/1.0/", + "scope": "images", + "title": "Public Domain Mark 1.0" } ], "name": "hletterscript", "profile": "data-package", - "released_at": "2026-05-12T00:00:00Z", + "released_at": "2026-05-12T22:30:00Z", "resources": [ { - "bytes": 0, + "bytes": 14820, "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights.", "encoding": "utf-8", "format": "jsonl", @@ -38,10 +44,10 @@ "name": "entries", "path": "data/index/entries.jsonl", "profile": "data-resource", - "record_count": 0 + "record_count": 7 }, { - "bytes": 0, + "bytes": 1506, "description": "Writer-level catalog. 
One JSON object per writer; each writer defines a 'set' of letter images.", "encoding": "utf-8", "format": "jsonl", @@ -49,7 +55,7 @@ "name": "writers", "path": "data/index/writers.jsonl", "profile": "data-resource", - "record_count": 0 + "record_count": 1 } ], "schemas": { @@ -58,14 +64,28 @@ }, "stats": { "attribution_required_count": 0, - "entry_writer_count": 0, - "image_byte_count": 0, - "letter_breakdown": {}, - "license_breakdown": {}, - "record_count": 0, - "writer_breakdown": {}, - "writer_record_count": 0, - "writer_status_breakdown": {} + "entry_writer_count": 1, + "image_byte_count": 22227, + "letter_breakdown": { + "bet": 1, + "kaf": 1, + "lamed": 1, + "mem": 1, + "resh": 1, + "tav": 1, + "yod": 1 + }, + "license_breakdown": { + "PDM-1.0": 7 + }, + "record_count": 7, + "writer_breakdown": { + "chaim_nachman_bialik": 7 + }, + "writer_record_count": 1, + "writer_status_breakdown": { + "verified": 1 + } }, "title": "Hebrew Handwritten Per-Letter Image Dataset", "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans", diff --git a/scripts/review_crops.py b/scripts/review_crops.py new file mode 100644 index 0000000..040e39c --- /dev/null +++ b/scripts/review_crops.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +""" +Serve a local HTML crop-review page for a PR ingest batch. + +Usage: + python3 scripts/review_crops.py [--upstream-path PATH] [--output FILE] [--port N] + +The page shows each cropped letter beside its metadata and an annotation form. +Feedback is auto-saved (via POST /feedback) to .review_feedback.json in the +repo root so Claude can read it back. 
+""" + +from __future__ import annotations + +import argparse +import base64 +import http.server +import json +import os +import sys +import threading +import webbrowser +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl" +FEEDBACK_PATH = REPO_ROOT / ".review_feedback.json" + +LETTER_DISPLAY_NAMES = { + "alef": "Alef (א)", "bet": "Bet (ב)", "gimel": "Gimel (ג)", + "dalet": "Dalet (ד)", "he": "He (ה)", "vav": "Vav (ו)", + "zayin": "Zayin (ז)", "chet": "Chet (ח)", "tet": "Tet (ט)", + "yod": "Yod (י)", "kaf": "Kaf (כ)", "lamed": "Lamed (ל)", + "mem": "Mem (מ)", "nun": "Nun (נ)", "samech": "Samech (ס)", + "ayin": "Ayin (ע)", "pe": "Pe (פ)", "tsadi": "Tsadi (צ)", + "qof": "Qof (ק)", "resh": "Resh (ר)", "shin": "Shin (ש)", + "tav": "Tav (ת)", "kaf_sofit": "Kaf sofit (ך)", "mem_sofit": "Mem sofit (ם)", + "nun_sofit": "Nun sofit (ן)", "pe_sofit": "Pe sofit (ף)", "tsadi_sofit": "Tsadi sofit (ץ)", +} + + +def _b64(path: Path) -> str: + return base64.b64encode(path.read_bytes()).decode() + + +def _mime(path: Path) -> str: + return "image/png" if path.suffix.lower() == ".png" else "image/jpeg" + + +def _load_entries() -> list[dict]: + entries = [] + with ENTRIES_PATH.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if line: + entries.append(json.loads(line)) + return entries + + +def _find_upstream_scan(entry: dict, upstream_root: Path | None) -> Path | None: + if upstream_root is None: + return None + source_id = entry["upstream"]["source_id"] + upstream_entry_id = entry["upstream"]["entry_id"] + for ext in (".jpg", ".jpeg", ".png"): + p = upstream_root / "data" / "scans" / source_id / f"{upstream_entry_id}{ext}" + if p.exists(): + return p + return None + + +def _build_html(entries: list[dict], upstream_root: Path | None) -> str: + # Group entries by upstream scan so we render each scan once. 
+ from collections import defaultdict + by_scan: dict[str, list[dict]] = defaultdict(list) + for e in entries: + by_scan[e["upstream"]["entry_id"]].append(e) + + scan_sections_html = "" + for upstream_entry_id, scan_entries in by_scan.items(): + first = scan_entries[0] + scan_path = _find_upstream_scan(first, upstream_root) + + if scan_path: + scan_b64 = _b64(scan_path) + scan_mime = _mime(scan_path) + # Build bbox overlay objects for JS + bboxes_json = json.dumps([ + { + "x": e["upstream"]["bbox"]["x"], + "y": e["upstream"]["bbox"]["y"], + "w": e["upstream"]["bbox"]["w"], + "h": e["upstream"]["bbox"]["h"], + "label": e["letter"]["name"], + "entry_id": e["entry_id"], + } + for e in scan_entries + ]) + scan_section = f""" +
+

Upstream scan: {upstream_entry_id}

+
+ +
+

Bboxes are shown at 3× zoom. Click a bbox to jump to that letter's card below.

+
+""" + else: + scan_section = f""" +
+

Upstream scan: {upstream_entry_id}

+

Upstream scan not found locally. Run with + --upstream-path /path/to/public-domain-hand-written-hebrew-scans + to display it.

+
+""" + scan_sections_html += scan_section + + cards_html = "" + for e in entries: + entry_id = e["entry_id"] + letter_name = e["letter"]["name"] + display = LETTER_DISPLAY_NAMES.get(letter_name, letter_name) + img_path = REPO_ROOT / e["image"]["local_path"] + img_data = f"data:{_mime(img_path)};base64,{_b64(img_path)}" if img_path.exists() else "" + bbox = e["upstream"]["bbox"] + w_px = e["image"]["width_px"] + h_px = e["image"]["height_px"] + style = e["letter"].get("style", "") + legibility = e["quality"]["legibility"] + usable_htr = e["quality"]["usable_for_htr"] + + cards_html += f""" +
+
+ {entry_id} + {display} +
+
+
+ {'' + letter_name + '' if img_data else '

Image file not found.

'} +
+ + + + + + +
Size{w_px}×{h_px} px
Bbox (x,y,w,h){bbox['x']},{bbox['y']},{bbox['w']},{bbox['h']}
Style{style}
Legibility{legibility}
HTR-usable{'yes' if usable_htr else 'no'}
+
+
+ +
+
+""" + + return f""" + + + +Crop Review + + + +

Crop Review

+

Review each cropped letter. Select a verdict, optionally add notes, then click Save feedback. +All feedback is written to .review_feedback.json at the repo root.

+ +{scan_sections_html} + +

Per-letter cards

+{cards_html} + +
class _Handler(http.server.BaseHTTPRequestHandler):
    """HTTP handler for the local crop-review page.

    Routes:
        GET  /          -- the pre-rendered review page held in ``_html``.
        GET  /feedback  -- the saved feedback as JSON (``{}`` if none is readable).
        POST /feedback  -- replace the saved feedback with the posted JSON.

    Every other path is answered with an empty 404.
    """

    # Rendered page; ``main`` assigns it on the class before serving.  It is
    # declared under the name ``do_GET`` actually reads, so a handler that was
    # never patched serves an empty page instead of raising AttributeError.
    _html: str = ""

    def log_message(self, fmt, *args):
        """Discard per-request log lines to keep the terminal quiet."""

    def _send_body(self, status: int, content_type: str, body: bytes) -> None:
        """Send a complete response carrying ``body``."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_empty(self, status: int) -> None:
        """Send a status line and headers with no body."""
        self.send_response(status)
        self.end_headers()

    def do_GET(self):
        """Serve the review page or the currently saved feedback."""
        if self.path == "/":
            self._send_body(
                200, "text/html; charset=utf-8", self._html.encode("utf-8")
            )
        elif self.path == "/feedback":
            data = {}
            if FEEDBACK_PATH.exists():
                try:
                    data = json.loads(FEEDBACK_PATH.read_text(encoding="utf-8"))
                except ValueError:
                    # Best effort: a feedback file that is not valid UTF-8 or
                    # not valid JSON is reported as "no feedback yet".
                    data = {}
            body = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
            self._send_body(200, "application/json", body)
        else:
            self._send_empty(404)

    def do_POST(self):
        """Persist posted JSON feedback to ``FEEDBACK_PATH``.

        Responds 204 on success and 400 when the request cannot be parsed:
        a non-numeric or negative Content-Length, a body that is not UTF-8,
        or a body that is not valid JSON.
        """
        if self.path != "/feedback":
            self._send_empty(404)
            return
        try:
            length = int(self.headers.get("Content-Length", 0))
            if length < 0:
                # read(-1) would read until EOF, which on a live socket means
                # waiting for the client to hang up.
                raise ValueError("negative Content-Length")
            data = json.loads(self.rfile.read(length).decode("utf-8"))
            text = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)
            # Prove the text is encodable before the file is opened for
            # writing, so a bad payload cannot truncate existing feedback.
            text.encode("utf-8")
        except ValueError:
            # Covers int() failures, UnicodeDecodeError/UnicodeEncodeError and
            # json.JSONDecodeError, which are all ValueError subclasses.
            self._send_empty(400)
            return
        FEEDBACK_PATH.write_text(text, encoding="utf-8")
        self._send_empty(204)


def main() -> None:
    """Render the review page, then write it to a file or serve it locally.

    With ``--output`` the HTML is written to that file and the function
    returns.  Otherwise an HTTP server is bound to 127.0.0.1 on ``--port``,
    a browser tab is opened shortly afterwards, and requests are served
    until Ctrl+C.
    """
    ap = argparse.ArgumentParser(description="Serve a crop-review page locally.")
    ap.add_argument("--upstream-path", metavar="PATH",
                    help="Path to a clone of HeOCR/public-domain-hand-written-hebrew-scans")
    ap.add_argument("--output", metavar="FILE",
                    help="Write the HTML to this file instead of serving it")
    ap.add_argument("--port", type=int, default=8765,
                    help="Local port to serve on (default: 8765)")
    args = ap.parse_args()

    upstream_root = Path(args.upstream_path) if args.upstream_path else None
    entries = _load_entries()
    html = _build_html(entries, upstream_root)

    if args.output:
        Path(args.output).write_text(html, encoding="utf-8")
        print(f"Written to {args.output}")
        return

    # The handler is instantiated per request by the server, so the rendered
    # page is handed over through a class attribute.
    _Handler._html = html

    # The context manager closes the listening socket on every exit path.
    with http.server.HTTPServer(("127.0.0.1", args.port), _Handler) as server:
        url = f"http://localhost:{args.port}/"
        print(f"Review server running at {url}")
        print(f"Feedback will be saved to {FEEDBACK_PATH}")
        print("Press Ctrl+C to stop.")

        # Open the browser a moment later, once serve_forever() is running.
        opener = threading.Timer(0.4, webbrowser.open, args=(url,))
        opener.daemon = True  # never keep the process alive after shutdown
        opener.start()
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("\nStopped.")


if __name__ == "__main__":
    main()