diff --git a/AGENTS.md b/AGENTS.md index bb6207d..cf2d1ad 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,7 @@ hletterscriptgen validate examples/letter_set/writer_example.json --format json scans into per-writer letter-glyph image sets. - `hletterscript` (separate repo) owns the **published letter-set datasets**. Do not commit generated glyph images to this repo. -- `public-domain-hand-written-hebrew-scans` (separate repo) owns +- `hash` (separate repo) owns **upstream scans** and their rights records. - `hocrsyngen`, `hocrgen`, `HeOCR`, `HeOCRsynth` are downstream consumers. Do not import them from `hletterscriptgen` and do not build their @@ -44,7 +44,7 @@ hletterscriptgen validate examples/letter_set/writer_example.json --format json ## Rights-carryover rules - Every variant must carry a `source.scan_entry_id` that resolves against - the upstream `public-domain-hand-written-hebrew-scans` index, plus a + the upstream `hash` index, plus a `source.license` matching the upstream record. The generator never invents, broadens, or relicenses upstream rights. - `license_summary.licenses` must include every distinct license that diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07b80ef..d1fd624 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Thanks for considering a contribution to `hletterscriptgen`. This repo holds the **code** that produces per-writer Hebrew letter-glyph image sets. It does **not** host the letter-set images themselves (those live in `HeOCR/hletterscript`), and it does **not** ingest upstream scans -(those live in `HeOCR/public-domain-hand-written-hebrew-scans`). Please +(those live in `HeOCR/hash`). Please keep PRs aligned with that boundary; cross-repo concerns belong upstream or downstream. diff --git a/LICENSE-POLICY.md b/LICENSE-POLICY.md index d12413b..bc9a186 100644 --- a/LICENSE-POLICY.md +++ b/LICENSE-POLICY.md @@ -16,7 +16,7 @@ rules apply to each layer. ## 2. 
Generated letter-set datasets The generator processes scans from -[`HeOCR/public-domain-hand-written-hebrew-scans`](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans). +[`HeOCR/hash`](https://github.com/HeOCR/hash). That upstream repository uses a compound licensing model with rights recorded **per scan**. `hletterscriptgen` follows the same posture: diff --git a/README.md b/README.md index b7dccfb..6f76298 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ on rights-clean upstream scans of handwritten Hebrew documents. `hletterscriptgen` is part of the [HeOCR](https://github.com/HeOCR) project. It consumes scan-level records from -[`HeOCR/public-domain-hand-written-hebrew-scans`](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans) +[`HeOCR/hash`](https://github.com/HeOCR/hash) (HASH — Hebrew Archive of Scanned Handwriting) and produces letter-set datasets that land in [`HeOCR/hletterscript`](https://github.com/HeOCR/hletterscript). Downstream, [`HeOCR/hocrsyngen`](https://github.com/HeOCR/hocrsyngen) composes those @@ -34,7 +34,7 @@ What does **not** live here: - Actual extracted glyph images (→ `HeOCR/hletterscript`). - Page-scan ingestion or rights curation (→ - `HeOCR/public-domain-hand-written-hebrew-scans`). + `HeOCR/hash`). - Document composition (→ `HeOCR/hocrsyngen`). - Dataset orchestration, governance, release assembly, or publication (→ `HeOCR/hocrgen` / `HeOCR/HeOCR` / `HeOCR/HeOCRsynth`). @@ -42,7 +42,7 @@ What does **not** live here: ## Position in the HeOCR system `hletterscriptgen` reads rights-clean scans from -`public-domain-hand-written-hebrew-scans`, produces per-writer letter +HASH (`HeOCR/hash`), produces per-writer letter sets that land in `hletterscript`, and ultimately feeds `hocrsyngen` / `hocrgen` / `HeOCR` / `HeOCRsynth`. 
See [`docs/repository_scope.md`](docs/repository_scope.md) for the full diff --git a/SECURITY.md b/SECURITY.md index 6530e3b..07ae56d 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -33,7 +33,7 @@ please report it. Two paths, in preference order: privacy. For takedown of an upstream scan, report directly in -[`HeOCR/public-domain-hand-written-hebrew-scans`](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans); +[`HeOCR/hash`](https://github.com/HeOCR/hash); once an upstream scan is removed or relicensed, regenerated letter sets must drop or update the affected variants. @@ -52,7 +52,7 @@ In scope: Out of scope here (report to the relevant upstream / downstream repo): -- Rights records on upstream scans — `HeOCR/public-domain-hand-written-hebrew-scans`. +- Rights records on upstream scans — `HeOCR/hash`. - Published letter-set datasets — `HeOCR/hletterscript`. - Composed synthetic pages — `HeOCR/hocrsyngen`. - Release-level governance — `HeOCR/hocrgen`. diff --git a/docs/README.md b/docs/README.md index 58f64e7..69e3ae8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,7 +4,7 @@ - [Architecture](architecture.md) — code layout and the validation pipeline. - [`letter_set.v1` contract](letter_set_v1.md) — output schema, field-by-field. - [Upstream integration](upstream_integration.md) — how scans from - `public-domain-hand-written-hebrew-scans` feed in. + `HeOCR/hash` (HASH) feed in. - [Downstream handoff](downstream_handoff.md) — how outputs land in `hletterscript` and onward. - [Roadmap](roadmap.md) — staged milestones beyond the scaffolding. 
diff --git a/docs/design/letter_extraction.md b/docs/design/letter_extraction.md index 90db963..44687eb 100644 --- a/docs/design/letter_extraction.md +++ b/docs/design/letter_extraction.md @@ -10,7 +10,7 @@ ## Goal Turn rights-clean handwritten Hebrew page scans (upstream: -`HeOCR/public-domain-hand-written-hebrew-scans`) into per-writer +`HeOCR/hash`) into per-writer `letter_set.v1` documents plus their referenced glyph image assets. ## Sketch diff --git a/docs/design/segmentation-approach.md b/docs/design/segmentation-approach.md index 3ceb0f0..fd0c104 100644 --- a/docs/design/segmentation-approach.md +++ b/docs/design/segmentation-approach.md @@ -19,16 +19,17 @@ path from Option A to Option B/C. ## Evidence from the upstream corpus -Investigation target: `HeOCR/public-domain-hand-written-hebrew-scans` (GitHub, inspected via -`gh api` — local clone not present at time of spike). +Investigation target: `HeOCR/hash` — HASH, the Hebrew Archive of Scanned Handwriting (GitHub, +inspected via `gh api` at spike time; the corpus has grown considerably since). 
| Finding | Detail | |---------|--------| -| Total entries in `data/index/entries.jsonl` | 60 | -| `transcription.status` distribution | `"none"`: 60 / 60 | -| Non-null `alto_path` | 0 / 60 | -| Non-null `hocr_path` | 0 / 60 | -| Non-null `text_path` | 0 / 60 | +| Total entries in `data/index/entries.jsonl` (spike) | 60 | +| Total entries (as of 2026-05) | 373 (111 sources, 48 unique creators) | +| `transcription.status` distribution | `"none"`: all entries at spike time | +| Non-null `alto_path` | 0 at spike time | +| Non-null `hocr_path` | 0 at spike time | +| Non-null `text_path` | 0 at spike time | | Unique `files[].role` values across all entries | `"original"` only | | Scan directories inspected (`data/scans/`) | `commons__begani_netatikha` (representative sample) — contains only the JPEG scan, no sidecars | @@ -38,9 +39,9 @@ The file-role enum (`original`, `normalized`, `thumbnail`, `transcription`, `met `transcription` role, but zero entries exercise it. Source files consulted: -- `HeOCR/public-domain-hand-written-hebrew-scans/schemas/entry.schema.json` -- `HeOCR/public-domain-hand-written-hebrew-scans/data/index/entries.jsonl` -- `HeOCR/public-domain-hand-written-hebrew-scans/data/scans/commons__begani_netatikha/` +- `HeOCR/hash/schemas/entry.schema.json` +- `HeOCR/hash/data/index/entries.jsonl` +- `HeOCR/hash/data/scans/commons__begani_netatikha/` --- diff --git a/docs/design/writer_attribution.md b/docs/design/writer_attribution.md index f64df20..fa391d1 100644 --- a/docs/design/writer_attribution.md +++ b/docs/design/writer_attribution.md @@ -19,7 +19,7 @@ the local upstream checkout and declares one or more writer blocks. 
```json { - "upstream_path": "../public-domain-hand-written-hebrew-scans", + "upstream_path": "../hash", "writers": [ { "writer_id": "writer_bialik", diff --git a/docs/letter_set_v1.md b/docs/letter_set_v1.md index 0c82bc4..6a8b240 100644 --- a/docs/letter_set_v1.md +++ b/docs/letter_set_v1.md @@ -24,7 +24,7 @@ is exercised by CI and must remain valid. }, "generated_at": "2026-05-12T00:00:00Z", "upstream": { - "repo": "HeOCR/public-domain-hand-written-hebrew-scans", + "repo": "HeOCR/hash", "revision": "" }, "letters": { @@ -58,7 +58,7 @@ labels only. If in doubt, omit. **Required.** Records how the writer identity was established and which upstream scan entries are attributed to them. `source_repo` is normally -`HeOCR/public-domain-hand-written-hebrew-scans`; `source_entry_ids` are +`HeOCR/hash`; `source_entry_ids` are the upstream `entries.jsonl` ids. `attribution_method` is a short tag (e.g. `collection_metadata`, `manual_review`, `fixture`). @@ -115,7 +115,7 @@ A mapping from a single Hebrew letter character (base or final form, | `asset_path` | POSIX path relative to the letter-set root. No leading `/` (schema-enforced); no `..` segment (cross-field-enforced). | | `checksum_sha256` | Lowercase SHA-256 hex digest of the asset bytes. Real letter sets must use real checksums; the example fixture's all-zero/all-one digests are intentional placeholders. | | `image.{width_px,height_px,format}` | Image metadata. `format` ∈ `png`, `webp`, `tiff`. | -| `source.scan_entry_id` | Upstream entry id (resolves in `public-domain-hand-written-hebrew-scans`). Cross-field validator checks it appears in `writer_provenance.source_entry_ids`. | +| `source.scan_entry_id` | Upstream entry id (resolves in the `HeOCR/hash` index). Cross-field validator checks it appears in `writer_provenance.source_entry_ids`. | | `source.scan_url` | Optional URL pointer to the source scan. RFC 3986 URI; checked when format-checking is enabled. 
| | `source.license` | One of the accepted SPDX / `LicenseRef-*` identifiers (see `$defs.license_id` in the schema). Extending the allow-list requires a schema change. | | `source.rights_evidence` | Optional free-form note or URL with rights evidence. | diff --git a/docs/repository_scope.md b/docs/repository_scope.md index 978d0e1..a6de1b2 100644 --- a/docs/repository_scope.md +++ b/docs/repository_scope.md @@ -7,7 +7,7 @@ intentionally narrow. ## Position in the HeOCR system (canonical) ``` -public-domain-hand-written-hebrew-scans (full-page scans, PD / CC / CC-BY) +hash (full-page scans, PD / CC / CC-BY) │ ▼ hletterscriptgen (code/framework — this repo) @@ -39,7 +39,7 @@ copy it — only one diagram should ever rot. | Concern | Where it lives | | --- | --- | -| Hosting page scans and rights records | `HeOCR/public-domain-hand-written-hebrew-scans` | +| Hosting page scans and rights records | `HeOCR/hash` | | Hosting per-writer letter-glyph datasets | `HeOCR/hletterscript` | | Composing synthetic Hebrew handwritten pages | `HeOCR/hocrsyngen` | | Dataset orchestration, governance, release assembly, publication | `HeOCR/hocrgen` | diff --git a/docs/upstream_integration.md b/docs/upstream_integration.md index b99c802..a371a61 100644 --- a/docs/upstream_integration.md +++ b/docs/upstream_integration.md @@ -1,7 +1,7 @@ # Upstream integration `hletterscriptgen` consumes scans from -[`HeOCR/public-domain-hand-written-hebrew-scans`](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans). +[`HeOCR/hash`](https://github.com/HeOCR/hash) — HASH (Hebrew Archive of Scanned Handwriting). That upstream repo holds the authoritative rights records; this repo defers to them. 
diff --git a/examples/letter_set/writer_example.json b/examples/letter_set/writer_example.json index 58ca41f..ae5fed2 100644 --- a/examples/letter_set/writer_example.json +++ b/examples/letter_set/writer_example.json @@ -3,7 +3,7 @@ "writer_id": "example-writer-0001", "writer_label": "Example Writer (fixture only — not a real person)", "writer_provenance": { - "source_repo": "HeOCR/public-domain-hand-written-hebrew-scans", + "source_repo": "HeOCR/hash", "source_entry_ids": ["example-scan-0001", "example-scan-0002"], "attribution_method": "fixture", "notes": "Fixture document used to validate the letter_set.v1 schema in CI. No real glyph images are referenced." @@ -15,7 +15,7 @@ }, "generated_at": "2026-05-12T00:00:00Z", "upstream": { - "repo": "HeOCR/public-domain-hand-written-hebrew-scans", + "repo": "HeOCR/hash", "revision": "0000000000000000000000000000000000000000" }, "letters": { diff --git a/src/hletterscriptgen/generate_profile.py b/src/hletterscriptgen/generate_profile.py index 7b5ad60..cefffd6 100644 --- a/src/hletterscriptgen/generate_profile.py +++ b/src/hletterscriptgen/generate_profile.py @@ -19,7 +19,7 @@ Profile JSON shape:: { - "upstream_checkout": "../public-domain-hand-written-hebrew-scans", + "upstream_checkout": "../hash", "writers": [ { "writer_id": "writer_bialik", diff --git a/src/hletterscriptgen/schemas/letter_set.schema.json b/src/hletterscriptgen/schemas/letter_set.schema.json index 14d8950..31f0d14 100644 --- a/src/hletterscriptgen/schemas/letter_set.schema.json +++ b/src/hletterscriptgen/schemas/letter_set.schema.json @@ -37,7 +37,7 @@ "source_repo": { "type": "string", "minLength": 1, - "description": "Upstream repository identifier, e.g. 'HeOCR/public-domain-hand-written-hebrew-scans'." + "description": "Upstream repository identifier, e.g. 'HeOCR/hash'." }, "source_entry_ids": { "type": "array", @@ -81,7 +81,7 @@ "repo": { "type": "string", "minLength": 1, - "description": "Upstream repository identifier in 'owner/name' form, e.g. 
'HeOCR/public-domain-hand-written-hebrew-scans'." + "description": "Upstream repository identifier in 'owner/name' form, e.g. 'HeOCR/hash'." }, "revision": { "type": "string", diff --git a/src/hletterscriptgen/upstream.py b/src/hletterscriptgen/upstream.py index c50b406..85e2e38 100644 --- a/src/hletterscriptgen/upstream.py +++ b/src/hletterscriptgen/upstream.py @@ -1,7 +1,7 @@ -"""Upstream integration: read and filter ``public-domain-hand-written-hebrew-scans``. +"""Upstream integration: read and filter ``HeOCR/hash`` (HASH). This module is read-only: it consumes a local checkout of the upstream -scan corpus (``HeOCR/public-domain-hand-written-hebrew-scans``) and +scan corpus (``HeOCR/hash``) and exposes the records the generator pipeline actually needs. The full upstream contract is broader than what is modelled here; see ``schemas/entry.schema.json`` in the upstream repo. @@ -166,7 +166,7 @@ class UpstreamPin: """The ``(repo, revision)`` pair written to ``letter_set.v1.upstream``. ``repo`` is the ``owner/name`` form of the upstream remote (e.g. - ``"HeOCR/public-domain-hand-written-hebrew-scans"``). ``revision`` + ``"HeOCR/hash"``). ``revision`` is the full SHA of the pinned ``HEAD`` commit. 
""" diff --git a/tests/fixtures/attribution/writer_profile.json b/tests/fixtures/attribution/writer_profile.json index 3f9960d..c17cd46 100644 --- a/tests/fixtures/attribution/writer_profile.json +++ b/tests/fixtures/attribution/writer_profile.json @@ -1,5 +1,5 @@ { - "upstream_path": "../public-domain-hand-written-hebrew-scans", + "upstream_path": "../hash", "writers": [ { "writer_id": "writer_bialik", diff --git a/tests/test_attribution.py b/tests/test_attribution.py index 4b18283..a30cc35 100644 --- a/tests/test_attribution.py +++ b/tests/test_attribution.py @@ -71,7 +71,7 @@ def test_roundtrip_parse_fixture() -> None: profile = load_attribution(PROFILE_PATH) assert isinstance(profile, WriterProfile) - assert profile.upstream_path == Path("../public-domain-hand-written-hebrew-scans") + assert profile.upstream_path == Path("../hash") writers_by_id = {w.writer_id: w for w in profile.writers} assert set(writers_by_id) == {"writer_bialik", "writer_herzl"} diff --git a/tests/test_generator.py b/tests/test_generator.py index ba2b2da..9b060a6 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -64,7 +64,7 @@ def _make_upstream_checkout(tmp_path: Path) -> Path: _git(repo, "config", "user.email", "test@example.com") _git(repo, "config", "user.name", "Test") _git(repo, "config", "commit.gpgsign", "false") - _git(repo, "remote", "add", "origin", "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans.git") + _git(repo, "remote", "add", "origin", "https://github.com/HeOCR/hash.git") # entries.jsonl index_dir = repo / "data" / "index" @@ -156,7 +156,7 @@ def test_generate_letter_set_content(tmp_path: Path) -> None: doc = json.loads(paths[0].read_text(encoding="utf-8")) assert doc["schema_version"] == "letter_set.v1" assert doc["writer_id"] == "writer_test_a" - assert doc["upstream"]["repo"] == "HeOCR/public-domain-hand-written-hebrew-scans" + assert doc["upstream"]["repo"] == "HeOCR/hash" assert doc["generator"]["name"] == "hletterscriptgen" # 
Both annotated letters must appear assert "א" in doc["letters"] @@ -324,7 +324,7 @@ def _make_upstream_checkout_no_cv2(tmp_path: Path) -> Path: _git(repo, "config", "user.email", "test@example.com") _git(repo, "config", "user.name", "Test") _git(repo, "config", "commit.gpgsign", "false") - _git(repo, "remote", "add", "origin", "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans.git") + _git(repo, "remote", "add", "origin", "https://github.com/HeOCR/hash.git") index_dir = repo / "data" / "index" index_dir.mkdir(parents=True) diff --git a/tests/test_upstream.py b/tests/test_upstream.py index 66db8a4..2087338 100644 --- a/tests/test_upstream.py +++ b/tests/test_upstream.py @@ -196,16 +196,16 @@ def _init_repo(path: Path, remote_url: str) -> str: def test_pin_returns_upstream_pin(tmp_path: Path) -> None: repo = tmp_path / "upstream" - rev = _init_repo(repo, "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans.git") + rev = _init_repo(repo, "https://github.com/HeOCR/hash.git") assert upstream_pin_from_checkout(repo) == UpstreamPin( - repo="HeOCR/public-domain-hand-written-hebrew-scans", + repo="HeOCR/hash", revision=rev, ) def test_pin_refuses_dirty_checkout(tmp_path: Path) -> None: repo = tmp_path / "upstream" - _init_repo(repo, "git@github.com:HeOCR/public-domain-hand-written-hebrew-scans.git") + _init_repo(repo, "git@github.com:HeOCR/hash.git") (repo / "README.md").write_text("dirty\n", encoding="utf-8") with pytest.raises(UpstreamCheckoutDirtyError): upstream_pin_from_checkout(repo)