From 025d09458b00a51232caf364761f0dde754b8db9 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 22:56:54 +0300 Subject: [PATCH 1/4] feat(data): seed corpus with 6 Bialik letter crops First real ingest PR: adds the writer chaim_nachman_bialik and 6 manual single-letter crops (lamed, dalet, vav, he, kaf, bet) cut from line 4 of the upstream Bialik manuscript scan commons__bialik_el_hazippor__p0001. Rights inherited from upstream (PDM-1.0; no attribution required). Regenerated NOTICE.md, CITATION.cff, and datapackage.json. Validates end-to-end against the upstream clone: ok: 1 writers, 6 entries, 6 files verified, 6 upstream-cross-checked Co-Authored-By: Claude Opus 4.7 (1M context) --- CITATION.cff | 2 +- NOTICE.md | 2 +- data/index/entries.jsonl | 6 +++ data/index/writers.jsonl | 1 + .../bet/chaim_nachman_bialik__bet__v0001.png | 3 ++ .../chaim_nachman_bialik__dalet__v0001.png | 3 ++ .../he/chaim_nachman_bialik__he__v0001.png | 3 ++ .../kaf/chaim_nachman_bialik__kaf__v0001.png | 3 ++ .../chaim_nachman_bialik__lamed__v0001.png | 3 ++ .../vav/chaim_nachman_bialik__vav__v0001.png | 3 ++ datapackage.json | 45 +++++++++++++------ 11 files changed, 59 insertions(+), 15 deletions(-) create mode 100644 data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png create mode 100644 data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png create mode 100644 data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png create mode 100644 data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png create mode 100644 data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png create mode 100644 data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png diff --git a/CITATION.cff b/CITATION.cff index f2476d0..4df30b7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,7 +3,7 @@ cff-version: 1.2.0 message: Please cite this dataset using the metadata below. 
type: dataset title: Hebrew Handwritten Per-Letter Image Dataset -abstract: 'Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc is the initial-setup release: the corpus contains no per-letter image entries yet. The repository ships the schemas, validation tooling, CI, and licensing policy needed to start ingesting.' +abstract: Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc contains 6 per-letter image entries drawn from 1 verified writers (6 PDM-1.0). authors: - name: Shay Palachy-Affek version: 0.0.0-rc diff --git a/NOTICE.md b/NOTICE.md index c55feca..40dd264 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -7,7 +7,7 @@ Repository-authored metadata is dedicated to the public domain under CC0 1.0 Uni Per-letter image crops are derivatives of upstream scans in [HeOCR/public-domain-hand-written-hebrew-scans](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans) and carry per-entry rights inherited from the source page. The entries listed below carry a license that requires attribution (currently CC-BY-4.0, CC-BY-SA-4.0). Anyone redistributing or reusing these crops must keep the listed credit and link to the source page on which the rights claim was verified. 
- Corpus release: `0.0.0-rc` -- Released at (corpus state): `2026-05-12T00:00:00Z` +- Released at (corpus state): `2026-05-12T22:30:00Z` ## Attribution-required entries diff --git a/data/index/entries.jsonl b/data/index/entries.jsonl index e69de29..966a106 100644 --- a/data/index/entries.jsonl +++ b/data/index/entries.jsonl @@ -0,0 +1,6 @@ +{"entry_id": "chaim_nachman_bialik__bet__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3301, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png", "mime_type": "image/png", "sha256": "f699ee63a92ee3459377547bce0ff1188e8ad2fb8086e7e683f5133e2347a627", "width_px": 15}, "letter": {"codepoint": "U+05D1", "form": "regular", "name": "bet", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ב"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 15, "x": 343, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__dalet__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3193, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png", "mime_type": "image/png", "sha256": "1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a", "width_px": 13}, "letter": {"codepoint": "U+05D3", "form": "regular", "name": "dalet", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ד"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 13, "x": 278, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__he__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3195, "height_px": 20, "local_path": "data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png", "mime_type": "image/png", "sha256": "db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628", "width_px": 14}, "letter": {"codepoint": "U+05D4", "form": "regular", "name": "he", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ה"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 20, "w": 14, "x": 309, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__kaf__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3321, "height_px": 21, "local_path": "data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png", "mime_type": "image/png", "sha256": "2ffc0140ccb4049e45b8b8226f0731700a90ef2abe71a887ab06a5641a75018f", "width_px": 18}, "letter": {"codepoint": "U+05DB", "form": "regular", "name": "kaf", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "כ"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 21, "w": 18, "x": 324, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__lamed__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3468, "height_px": 28, "local_path": "data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png", "mime_type": "image/png", "sha256": "b9b291b0c6b701c759ac1475ae74106d98cd4e22faa121e53c5eea2f91dc085c", "width_px": 15}, "letter": {"codepoint": "U+05DC", "form": "regular", "name": "lamed", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ל"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 28, "w": 15, "x": 200, "y": 192}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__vav__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3098, "height_px": 23, "local_path": "data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png", "mime_type": "image/png", "sha256": "cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72", "width_px": 12}, "letter": {"codepoint": "U+05D5", "form": "regular", "name": "vav", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ו"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 23, "w": 12, "x": 290, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} diff --git a/data/index/writers.jsonl b/data/index/writers.jsonl index e69de29..40bd4b7 100644 --- a/data/index/writers.jsonl +++ b/data/index/writers.jsonl @@ -0,0 +1 @@ +{"also_known_as": ["Hayyim Nahman Bialik", "Haim Nahman Bialik", "H. N. Bialik", "חיים נחמן ביאליק", "חיים נחמן ביאַליק"], "dates": {"birth_precision": "exact", "birth_year": 1873, "death_precision": "exact", "death_year": 1934}, "description": "Russian-born Hebrew poet (1873-1934), widely regarded as Israel's national poet. Among the pioneers of modern Hebrew poetry; his manuscript drafts and personal letters are a primary source of early-20th-century handwritten modern Hebrew.", "display_name": "Chaim Nachman Bialik", "ingest": {"agent_notes": "Seed writer for v0 ingest. 
First per-letter crops drawn from a single manuscript page (commons__bialik_el_hazippor__p0001) to validate the manual-extraction pipeline end-to-end.", "blocked_reason": null}, "languages_written": ["he", "yi"], "period": {"end": "1934", "precision": "year", "start": "1890"}, "references": [{"citation": "Wikipedia: Hayim Nahman Bialik", "kind": "secondary_url", "quote": null, "url": "https://en.wikipedia.org/wiki/Hayim_Nahman_Bialik"}, {"citation": "VIAF authority record 27069388 (Bialik, Ḥayyim Naḥman, 1873-1934)", "kind": "authority_record", "quote": null, "url": "https://viaf.org/viaf/27069388/"}, {"citation": "Wikimedia Commons: manuscript draft of 'El Hatzippor' (autograph).", "kind": "primary_url", "quote": null, "url": "https://commons.wikimedia.org/wiki/File:Bialik_El_hazippor.jpg"}], "scripts_written": ["Hebr"], "status": "verified", "writer_id": "chaim_nachman_bialik"} diff --git a/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png b/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png new file mode 100644 index 0000000..45699b2 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f699ee63a92ee3459377547bce0ff1188e8ad2fb8086e7e683f5133e2347a627 +size 3301 diff --git a/data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png b/data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png new file mode 100644 index 0000000..c2d20a9 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a +size 3193 diff --git a/data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png b/data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png new file mode 100644 index 
0000000..2f2df84 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628 +size 3195 diff --git a/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png new file mode 100644 index 0000000..e15b8d0 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ffc0140ccb4049e45b8b8226f0731700a90ef2abe71a887ab06a5641a75018f +size 3321 diff --git a/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png b/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png new file mode 100644 index 0000000..e158742 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b291b0c6b701c759ac1475ae74106d98cd4e22faa121e53c5eea2f91dc085c +size 3468 diff --git a/data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png b/data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png new file mode 100644 index 0000000..ebef1f1 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72 +size 3098 diff --git a/datapackage.json b/datapackage.json index bea3b70..b1a93db 100644 --- a/datapackage.json +++ b/datapackage.json @@ -23,14 +23,20 @@ "path": "https://creativecommons.org/publicdomain/zero/1.0/", "scope": "metadata", "title": "Creative Commons Zero v1.0 Universal" + }, + { + "name": "PDM-1.0", + "path": "https://creativecommons.org/publicdomain/mark/1.0/", + 
"scope": "images", + "title": "Public Domain Mark 1.0" } ], "name": "hletterscript", "profile": "data-package", - "released_at": "2026-05-12T00:00:00Z", + "released_at": "2026-05-12T22:30:00Z", "resources": [ { - "bytes": 0, + "bytes": 12732, "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights.", "encoding": "utf-8", "format": "jsonl", @@ -38,10 +44,10 @@ "name": "entries", "path": "data/index/entries.jsonl", "profile": "data-resource", - "record_count": 0 + "record_count": 6 }, { - "bytes": 0, + "bytes": 1506, "description": "Writer-level catalog. One JSON object per writer; each writer defines a 'set' of letter images.", "encoding": "utf-8", "format": "jsonl", @@ -49,7 +55,7 @@ "name": "writers", "path": "data/index/writers.jsonl", "profile": "data-resource", - "record_count": 0 + "record_count": 1 } ], "schemas": { @@ -58,14 +64,27 @@ }, "stats": { "attribution_required_count": 0, - "entry_writer_count": 0, - "image_byte_count": 0, - "letter_breakdown": {}, - "license_breakdown": {}, - "record_count": 0, - "writer_breakdown": {}, - "writer_record_count": 0, - "writer_status_breakdown": {} + "entry_writer_count": 1, + "image_byte_count": 19576, + "letter_breakdown": { + "bet": 1, + "dalet": 1, + "he": 1, + "kaf": 1, + "lamed": 1, + "vav": 1 + }, + "license_breakdown": { + "PDM-1.0": 6 + }, + "record_count": 6, + "writer_breakdown": { + "chaim_nachman_bialik": 6 + }, + "writer_record_count": 1, + "writer_status_breakdown": { + "verified": 1 + } }, "title": "Hebrew Handwritten Per-Letter Image Dataset", "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans", From a9cdf1e8a1b9ec0c4f58aa1553f9b2c03cac901c Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 23:32:08 +0300 Subject: [PATCH 2/4] =?UTF-8?q?feat(scripts):=20add=20review=5Fcrops.py=20?= =?UTF-8?q?=E2=80=94=20local=20HTML=20crop-review=20tool?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Serves a self-contained review page at http://localhost:8765/ that shows: - upstream scan with bbox overlays at 3× zoom (click a bbox to jump to card) - each cropped letter at native size with metadata - per-entry verdict form (correct / wrong / uncertain / drop) + free-text notes Feedback is auto-saved to .review_feedback.json (gitignored) via a POST /feedback handler so Claude can read it back in-session. Run: python3 scripts/review_crops.py --upstream-path /path/to/upstream-scans Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/review_crops.py | 427 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 scripts/review_crops.py diff --git a/scripts/review_crops.py b/scripts/review_crops.py new file mode 100644 index 0000000..040e39c --- /dev/null +++ b/scripts/review_crops.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +""" +Serve a local HTML crop-review page for a PR ingest batch. + +Usage: + python3 scripts/review_crops.py [--upstream-path PATH] [--output FILE] [--port N] + +The page shows each cropped letter beside its metadata and an annotation form. +Feedback is auto-saved (via POST /feedback) to .review_feedback.json in the +repo root so Claude can read it back. 
+""" + +from __future__ import annotations + +import argparse +import base64 +import http.server +import json +import os +import sys +import threading +import webbrowser +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl" +FEEDBACK_PATH = REPO_ROOT / ".review_feedback.json" + +LETTER_DISPLAY_NAMES = { + "alef": "Alef (א)", "bet": "Bet (ב)", "gimel": "Gimel (ג)", + "dalet": "Dalet (ד)", "he": "He (ה)", "vav": "Vav (ו)", + "zayin": "Zayin (ז)", "chet": "Chet (ח)", "tet": "Tet (ט)", + "yod": "Yod (י)", "kaf": "Kaf (כ)", "lamed": "Lamed (ל)", + "mem": "Mem (מ)", "nun": "Nun (נ)", "samech": "Samech (ס)", + "ayin": "Ayin (ע)", "pe": "Pe (פ)", "tsadi": "Tsadi (צ)", + "qof": "Qof (ק)", "resh": "Resh (ר)", "shin": "Shin (ש)", + "tav": "Tav (ת)", "kaf_sofit": "Kaf sofit (ך)", "mem_sofit": "Mem sofit (ם)", + "nun_sofit": "Nun sofit (ן)", "pe_sofit": "Pe sofit (ף)", "tsadi_sofit": "Tsadi sofit (ץ)", +} + + +def _b64(path: Path) -> str: + return base64.b64encode(path.read_bytes()).decode() + + +def _mime(path: Path) -> str: + return "image/png" if path.suffix.lower() == ".png" else "image/jpeg" + + +def _load_entries() -> list[dict]: + entries = [] + with ENTRIES_PATH.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if line: + entries.append(json.loads(line)) + return entries + + +def _find_upstream_scan(entry: dict, upstream_root: Path | None) -> Path | None: + if upstream_root is None: + return None + source_id = entry["upstream"]["source_id"] + upstream_entry_id = entry["upstream"]["entry_id"] + for ext in (".jpg", ".jpeg", ".png"): + p = upstream_root / "data" / "scans" / source_id / f"{upstream_entry_id}{ext}" + if p.exists(): + return p + return None + + +def _build_html(entries: list[dict], upstream_root: Path | None) -> str: + # Group entries by upstream scan so we render each scan once. 
+ from collections import defaultdict + by_scan: dict[str, list[dict]] = defaultdict(list) + for e in entries: + by_scan[e["upstream"]["entry_id"]].append(e) + + scan_sections_html = "" + for upstream_entry_id, scan_entries in by_scan.items(): + first = scan_entries[0] + scan_path = _find_upstream_scan(first, upstream_root) + + if scan_path: + scan_b64 = _b64(scan_path) + scan_mime = _mime(scan_path) + # Build bbox overlay objects for JS + bboxes_json = json.dumps([ + { + "x": e["upstream"]["bbox"]["x"], + "y": e["upstream"]["bbox"]["y"], + "w": e["upstream"]["bbox"]["w"], + "h": e["upstream"]["bbox"]["h"], + "label": e["letter"]["name"], + "entry_id": e["entry_id"], + } + for e in scan_entries + ]) + scan_section = f""" +
+

Upstream scan: {upstream_entry_id}

+
+ +
+

Bboxes are shown at 3× zoom. Click a bbox to jump to that letter's card below.

+
+""" + else: + scan_section = f""" +
+

Upstream scan: {upstream_entry_id}

+

Upstream scan not found locally. Run with + --upstream-path /path/to/public-domain-hand-written-hebrew-scans + to display it.

+
+""" + scan_sections_html += scan_section + + cards_html = "" + for e in entries: + entry_id = e["entry_id"] + letter_name = e["letter"]["name"] + display = LETTER_DISPLAY_NAMES.get(letter_name, letter_name) + img_path = REPO_ROOT / e["image"]["local_path"] + img_data = f"data:{_mime(img_path)};base64,{_b64(img_path)}" if img_path.exists() else "" + bbox = e["upstream"]["bbox"] + w_px = e["image"]["width_px"] + h_px = e["image"]["height_px"] + style = e["letter"].get("style", "") + legibility = e["quality"]["legibility"] + usable_htr = e["quality"]["usable_for_htr"] + + cards_html += f""" +
+
+ {entry_id} + {display} +
+
+
+ {'' + letter_name + '' if img_data else '

Image file not found.

'} +
+ + + + + + +
Size{w_px}×{h_px} px
Bbox (x,y,w,h){bbox['x']},{bbox['y']},{bbox['w']},{bbox['h']}
Style{style}
Legibility{legibility}
HTR-usable{'yes' if usable_htr else 'no'}
+
+
+ +
+
+""" + + return f""" + + + +Crop Review + + + +

Crop Review

+

Review each cropped letter. Select a verdict, optionally add notes, then click Save feedback. +All feedback is written to .review_feedback.json at the repo root.

+ +{scan_sections_html} + +

Per-letter cards

+{cards_html} + +
+ + + + +""" + + +class _Handler(http.server.BaseHTTPRequestHandler): + _html: str = "" # rendered review page; main() assigns it on the class before serving + + def log_message(self, fmt, *args): + pass # suppress request log noise + + def do_GET(self): # "/" serves the review page, "/feedback" returns the saved feedback JSON + if self.path == "/": + body = self._html.encode() + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + elif self.path == "/feedback": + data = {} + if FEEDBACK_PATH.exists(): + try: + data = json.loads(FEEDBACK_PATH.read_text(encoding="utf-8")) + except json.JSONDecodeError: + pass # unparsable feedback file: keep the empty default + body = json.dumps(data, ensure_ascii=False, indent=2).encode() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + else: + self.send_response(404) + self.end_headers() + + def do_POST(self): # "/feedback" overwrites FEEDBACK_PATH with the posted JSON document + if self.path == "/feedback": + length = int(self.headers.get("Content-Length", 0)) + raw = self.rfile.read(length) + try: + data = json.loads(raw.decode()) + except (UnicodeDecodeError, json.JSONDecodeError): # body is not UTF-8 or not JSON: reject with 400 + self.send_response(400) + self.end_headers() + return + FEEDBACK_PATH.write_text( + json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), + encoding="utf-8", + ) + self.send_response(204) + self.end_headers() + else: + self.send_response(404) + self.end_headers() + + +def main() -> None: + ap = argparse.ArgumentParser(description="Serve a crop-review page locally.") + ap.add_argument("--upstream-path", metavar="PATH", + help="Path to a clone of HeOCR/public-domain-hand-written-hebrew-scans") + ap.add_argument("--output", metavar="FILE", + help="Write the HTML to this file instead of serving it") + ap.add_argument("--port", type=int, default=8765, + help="Local port to serve on (default: 8765)") + args = ap.parse_args() + + upstream_root = Path(args.upstream_path) if args.upstream_path else None + entries = _load_entries() + html = _build_html(entries, 
upstream_root) + + if args.output: + Path(args.output).write_text(html, encoding="utf-8") + print(f"Written to {args.output}") + return + + # Patch the handler class with the rendered HTML. + _Handler._html = html + + server = http.server.HTTPServer(("127.0.0.1", args.port), _Handler) + url = f"http://localhost:{args.port}/" + print(f"Review server running at {url}") + print(f"Feedback will be saved to {FEEDBACK_PATH}") + print("Press Ctrl+C to stop.") + + threading.Timer(0.4, lambda: webbrowser.open(url)).start() + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nStopped.") + + +if __name__ == "__main__": + main() From 41a55abf1a4d050e26b248636144dd6e363351b0 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 23:32:14 +0300 Subject: [PATCH 3/4] chore: gitignore .review_feedback.json Local reviewer feedback file written by scripts/review_crops.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1b12e9c..2447578 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__/ .venv/ venv/ .pytest_cache/ +.review_feedback.json From e9d609a7b0d1f6636c1b09e99c15bce3d7ba1d5c Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 23:40:04 +0300 Subject: [PATCH 4/4] fix(data): correct letter labels from human review; split kaf+yod bbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Human crop review (via scripts/review_crops.py) revealed: - dalet → resh (dalet/resh confusion in Ashkenazi cursive) - he → tav - vav → mem (collapsed mem form; low legibility; hard HTR example) - kaf bbox actually contained kaf+yod side by side Split the kaf+yod bbox at the natural ink gap (x=329–330): - kaf: x=330,y=203,w=12,h=16 - yod: x=324,y=203,w=7,h=16 Net result: 6→7 entries, all validated (validate_indexes + pytest green). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CITATION.cff | 2 +- data/index/entries.jsonl | 9 +++++---- .../kaf/chaim_nachman_bialik__kaf__v0001.png | 4 ++-- .../chaim_nachman_bialik__mem__v0001.png} | 0 .../chaim_nachman_bialik__resh__v0001.png} | 0 .../chaim_nachman_bialik__tav__v0001.png} | 0 .../yod/chaim_nachman_bialik__yod__v0001.png | 3 +++ datapackage.json | 19 ++++++++++--------- 8 files changed, 21 insertions(+), 16 deletions(-) rename data/letters/chaim_nachman_bialik/{vav/chaim_nachman_bialik__vav__v0001.png => mem/chaim_nachman_bialik__mem__v0001.png} (100%) rename data/letters/chaim_nachman_bialik/{dalet/chaim_nachman_bialik__dalet__v0001.png => resh/chaim_nachman_bialik__resh__v0001.png} (100%) rename data/letters/chaim_nachman_bialik/{he/chaim_nachman_bialik__he__v0001.png => tav/chaim_nachman_bialik__tav__v0001.png} (100%) create mode 100644 data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png diff --git a/CITATION.cff b/CITATION.cff index 4df30b7..23943eb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,7 +3,7 @@ cff-version: 1.2.0 message: Please cite this dataset using the metadata below. type: dataset title: Hebrew Handwritten Per-Letter Image Dataset -abstract: Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc contains 6 per-letter image entries drawn from 1 verified writers (6 PDM-1.0). +abstract: Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). 
Release 0.0.0-rc contains 7 per-letter image entries drawn from 1 verified writers (7 PDM-1.0). authors: - name: Shay Palachy-Affek version: 0.0.0-rc diff --git a/data/index/entries.jsonl b/data/index/entries.jsonl index 966a106..356f5f2 100644 --- a/data/index/entries.jsonl +++ b/data/index/entries.jsonl @@ -1,6 +1,7 @@ {"entry_id": "chaim_nachman_bialik__bet__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3301, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/bet/chaim_nachman_bialik__bet__v0001.png", "mime_type": "image/png", "sha256": "f699ee63a92ee3459377547bce0ff1188e8ad2fb8086e7e683f5133e2347a627", "width_px": 15}, "letter": {"codepoint": "U+05D1", "form": "regular", "name": "bet", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ב"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 15, "x": 343, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} -{"entry_id": "chaim_nachman_bialik__dalet__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3193, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png", "mime_type": "image/png", "sha256": "1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a", "width_px": 13}, "letter": {"codepoint": "U+05D3", "form": "regular", "name": "dalet", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ד"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 13, "x": 278, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} -{"entry_id": "chaim_nachman_bialik__he__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3195, "height_px": 20, "local_path": "data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png", "mime_type": "image/png", "sha256": "db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628", "width_px": 14}, "letter": {"codepoint": "U+05D4", "form": "regular", "name": "he", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ה"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 20, "w": 14, "x": 309, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} -{"entry_id": "chaim_nachman_bialik__kaf__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3321, "height_px": 21, "local_path": "data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png", "mime_type": "image/png", "sha256": "2ffc0140ccb4049e45b8b8226f0731700a90ef2abe71a887ab06a5641a75018f", "width_px": 18}, "letter": {"codepoint": "U+05DB", "form": "regular", "name": "kaf", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "כ"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 21, "w": 18, "x": 324, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__kaf__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3093, "height_px": 16, "local_path": "data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png", "mime_type": "image/png", "sha256": "f64f3120eaa7f07244bcd2b730683f7e7ee72316abb7f783b6daaeed0883915a", "width_px": 12}, "letter": {"codepoint": "U+05DB", "form": "regular", "name": "kaf", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "כ"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Split from original kaf+yod double-letter bbox; right (kaf) portion only. Low-resolution upstream scan.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 16, "w": 12, "x": 330, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} {"entry_id": "chaim_nachman_bialik__lamed__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3468, "height_px": 28, "local_path": "data/letters/chaim_nachman_bialik/lamed/chaim_nachman_bialik__lamed__v0001.png", "mime_type": "image/png", "sha256": "b9b291b0c6b701c759ac1475ae74106d98cd4e22faa121e53c5eea2f91dc085c", "width_px": 15}, "letter": {"codepoint": "U+05DC", "form": "regular", "name": "lamed", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ל"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 28, "w": 15, "x": 200, "y": 192}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} -{"entry_id": "chaim_nachman_bialik__vav__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3098, "height_px": 23, "local_path": "data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png", "mime_type": "image/png", "sha256": "cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72", "width_px": 12}, "letter": {"codepoint": "U+05D5", "form": "regular", "name": "vav", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ו"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 23, "w": 12, "x": 290, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__mem__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3098, "height_px": 23, "local_path": "data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png", "mime_type": "image/png", "sha256": "cbb73d4644f2f42751f06c18224efeb2ff7bdcc9cb674283d466540582a35b72", "width_px": 12}, "letter": {"codepoint": "U+05DE", "form": "regular", "name": "mem", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "מ"}, "quality": {"exclusion_reasons": [], "legibility": "low", "notes": "Collapsed mem form in cursive Ashkenazi hand; visually resembles yod or a small square. 
Hard/ambiguous example — treat with care for HTR training.", "usable_for_htr": true, "usable_for_syngen": false}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 23, "w": 12, "x": 290, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__resh__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3193, "height_px": 22, "local_path": "data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png", "mime_type": "image/png", "sha256": "1092de5374576bc96965fd1a10b089c311bc7ba7ea975d8179cb784aa441298a", "width_px": 13}, "letter": {"codepoint": "U+05E8", "form": "regular", "name": "resh", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ר"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 22, "w": 13, "x": 278, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__tav__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 3195, "height_px": 20, "local_path": "data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png", "mime_type": "image/png", "sha256": "db0a29002f767438e82bc04d35d4e727f4c0f99bac8f6fee7c2f59e68d22e628", "width_px": 14}, "letter": {"codepoint": "U+05EA", "form": "regular", "name": "tav", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "ת"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Seed crop at native upstream resolution; cursive Ashkenazi-style hand. 
Low-resolution upstream scan (409x253) limits per-letter pixel detail.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 20, "w": 14, "x": 309, "y": 202}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} +{"entry_id": "chaim_nachman_bialik__yod__v0001", "extraction": {"extracted_at": "2026-05-12T22:30:00Z", "extracted_by": "Shay Palachy-Affek", "method": "manual", "notes": "Manual seed crop from upstream scan commons__bialik_el_hazippor__p0001 (Bialik manuscript draft of 'El Hatzippor'). Bbox picked from line 4 of the scan; cursive Hebrew handwriting. 
Crop produced via Pillow.", "tool": "manual", "tool_version": "0.0.0-manual"}, "image": {"background": "original", "bytes": 2879, "height_px": 16, "local_path": "data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png", "mime_type": "image/png", "sha256": "5cc808ed43f33a788e392ae66fecd580a6c183fb83b49c08fda21a66538c0207", "width_px": 7}, "letter": {"codepoint": "U+05D9", "form": "regular", "name": "yod", "notes": null, "style": "cursive_ashkenazi", "unicode_char": "י"}, "quality": {"exclusion_reasons": [], "legibility": "medium", "notes": "Split from original kaf+yod double-letter bbox; left (yod) portion only. Small stroke consistent with cursive yod. Low-resolution upstream scan.", "usable_for_htr": true, "usable_for_syngen": true}, "rights": {"attribution_required": false, "attribution_text": null, "attribution_url": null, "commercial_use_allowed": true, "derivatives_allowed": true, "evidence_text": "Inherited from upstream entry commons__bialik_el_hazippor__p0001 (PDM-1.0; this work is in the public domain in its country of origin and other countries where the copyright term is the author's life plus 70 years or fewer). 
Bialik died in 1934; the work is public domain in Israel and most jurisdictions.", "license_expression": "PDM-1.0", "redistribution_allowed": true, "rights_basis": "public_domain", "verification_status": "inherited_from_upstream", "verified_at": "2026-05-12"}, "upstream": {"bbox": {"h": 16, "w": 7, "x": 324, "y": 203}, "commit": "df07bd3825405ed93c15fd61fe4d7967fc60885e", "entry_id": "commons__bialik_el_hazippor__p0001", "release_tag": null, "sha256": "bdd6f1a3b9f8821bbca0c0c836eebf3914a335f816662f1a7f0c4495e45e624e", "source_id": "commons__bialik_el_hazippor"}, "writer_id": "chaim_nachman_bialik"} diff --git a/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png index e15b8d0..ff31f1c 100644 --- a/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png +++ b/data/letters/chaim_nachman_bialik/kaf/chaim_nachman_bialik__kaf__v0001.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ffc0140ccb4049e45b8b8226f0731700a90ef2abe71a887ab06a5641a75018f -size 3321 +oid sha256:f64f3120eaa7f07244bcd2b730683f7e7ee72316abb7f783b6daaeed0883915a +size 3093 diff --git a/data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png b/data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png similarity index 100% rename from data/letters/chaim_nachman_bialik/vav/chaim_nachman_bialik__vav__v0001.png rename to data/letters/chaim_nachman_bialik/mem/chaim_nachman_bialik__mem__v0001.png diff --git a/data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png b/data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png similarity index 100% rename from data/letters/chaim_nachman_bialik/dalet/chaim_nachman_bialik__dalet__v0001.png rename to data/letters/chaim_nachman_bialik/resh/chaim_nachman_bialik__resh__v0001.png diff --git 
a/data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png b/data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png similarity index 100% rename from data/letters/chaim_nachman_bialik/he/chaim_nachman_bialik__he__v0001.png rename to data/letters/chaim_nachman_bialik/tav/chaim_nachman_bialik__tav__v0001.png diff --git a/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png b/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png new file mode 100644 index 0000000..d654f91 --- /dev/null +++ b/data/letters/chaim_nachman_bialik/yod/chaim_nachman_bialik__yod__v0001.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc808ed43f33a788e392ae66fecd580a6c183fb83b49c08fda21a66538c0207 +size 2879 diff --git a/datapackage.json b/datapackage.json index b1a93db..af3c426 100644 --- a/datapackage.json +++ b/datapackage.json @@ -36,7 +36,7 @@ "released_at": "2026-05-12T22:30:00Z", "resources": [ { - "bytes": 12732, + "bytes": 14820, "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights.", "encoding": "utf-8", "format": "jsonl", @@ -44,7 +44,7 @@ "name": "entries", "path": "data/index/entries.jsonl", "profile": "data-resource", - "record_count": 6 + "record_count": 7 }, { "bytes": 1506, @@ -65,21 +65,22 @@ "stats": { "attribution_required_count": 0, "entry_writer_count": 1, - "image_byte_count": 19576, + "image_byte_count": 22227, "letter_breakdown": { "bet": 1, - "dalet": 1, - "he": 1, "kaf": 1, "lamed": 1, - "vav": 1 + "mem": 1, + "resh": 1, + "tav": 1, + "yod": 1 }, "license_breakdown": { - "PDM-1.0": 6 + "PDM-1.0": 7 }, - "record_count": 6, + "record_count": 7, "writer_breakdown": { - "chaim_nachman_bialik": 6 + "chaim_nachman_bialik": 7 }, "writer_record_count": 1, "writer_status_breakdown": {