diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..e75f0d6
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,20 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+indent_style = space
+indent_size = 2
+
+[*.py]
+indent_size = 4
+
+[*.md]
+# Trailing whitespace can be meaningful in Markdown (two-space hard
+# line breaks). Don't strip it automatically.
+trim_trailing_whitespace = false
+
+[Makefile]
+indent_style = tab
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..ccf0191
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,28 @@
+# Per-letter image crops are tracked with Git LFS. This keeps regular
+# git operations fast and the repository clone size sane as the corpus
+# grows (per-writer × 27 letter forms × multiple variants accumulates
+# quickly even at 10–50 KB per crop).
+#
+# After cloning, run `git lfs install` once, then `git lfs pull` to
+# fetch the actual image bytes. CI does this automatically before the
+# validator runs (see .github/workflows/ci.yml).
+data/letters/**/*.png  filter=lfs diff=lfs merge=lfs -text
+data/letters/**/*.jpg  filter=lfs diff=lfs merge=lfs -text
+data/letters/**/*.jpeg filter=lfs diff=lfs merge=lfs -text
+data/letters/**/*.webp filter=lfs diff=lfs merge=lfs -text
+data/letters/**/*.tif  filter=lfs diff=lfs merge=lfs -text
+data/letters/**/*.tiff filter=lfs diff=lfs merge=lfs -text
+
+# Force LF line endings on text files so checksums and diffs are stable
+# across macOS/Linux/Windows contributors.
+*.md         text eol=lf
+*.json       text eol=lf
+*.jsonl      text eol=lf
+*.yml        text eol=lf
+*.yaml       text eol=lf
+*.py         text eol=lf
+*.cff        text eol=lf
+*.txt        text eol=lf
+.gitignore   text eol=lf
+.gitattributes text eol=lf
+LICENSE      text eol=lf
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..9cb9e33
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,43 @@
+<!--
+  hletterscript PR template.
+  - Delete sections that don't apply.
+  - All checkboxes are mandatory unless you explicitly note why one
+    does not apply (e.g. "docs-only change, no entries touched").
+-->
+
+## Summary
+
+<!-- 1-3 bullets: what changed and why. -->
+
+## Type of change
+
+- [ ] New writer(s) / new per-letter image entries (ingest)
+- [ ] Schema or validator change
+- [ ] Release tooling / CI change
+- [ ] Documentation / policy
+- [ ] Refactor / chore (no behaviour change)
+
+## Pre-merge checklist
+
+- [ ] `python3 scripts/validate_indexes.py` passes locally.
+- [ ] `python3 scripts/generate_release_artifacts.py` was re-run after
+      any change to `data/index/*.jsonl` or `scripts/release_recipe.json`,
+      and the regenerated `NOTICE.md` / `CITATION.cff` / `datapackage.json`
+      are staged in this PR.
+- [ ] `python3 -m pytest` passes locally.
+- [ ] `git diff --check` shows no whitespace issues.
+- [ ] If image files were added/changed, they are tracked via Git LFS
+      (see `.gitattributes`).
+
+## Rights / licensing
+
+<!-- Only required for PRs that add entries. -->
+<!-- Confirm: every new entry's rights block matches the inheritance -->
+<!-- table in LICENSE.md (specifically, attribution_required and -->
+<!-- license_expression are aligned with the upstream scan). -->
+
+## Notes for reviewers
+
+<!-- Anything load-bearing the reviewer should focus on. Paste the -->
+<!-- validator output and pytest summary if this PR touches data or -->
+<!-- tooling. -->
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..0f2e072
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,40 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # `lfs: true` already downloads the LFS objects at checkout; the
+          # explicit `git lfs pull` in the next step is a redundant safety net.
+          lfs: true
+      - name: Install Git LFS
+        run: |
+          git lfs install
+          git lfs pull
+      - uses: actions/setup-python@v5
+        with:
+          # Python 3.11+ is required (validate_indexes.py uses
+          # hashlib.file_digest). Pin 3.12 for stable CI; update both
+          # this line and requirements-dev.txt's header when bumping.
+          python-version: "3.12"
+      - name: Check out upstream scans repo for cross-validation
+        uses: actions/checkout@v4
+        with:
+          repository: HeOCR/public-domain-hand-written-hebrew-scans
+          path: .upstream
+          lfs: false
+      - name: Install dev dependencies
+        run: python -m pip install -r requirements-dev.txt
+      - name: Validate JSONL indexes (with upstream cross-check)
+        run: python scripts/validate_indexes.py --upstream-path .upstream
+      - name: Check release artefacts are up to date (run `python3 scripts/generate_release_artifacts.py` to refresh)
+        run: python scripts/generate_release_artifacts.py --check
+      - name: Run pytest
+        run: python -m pytest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1b12e9c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.DS_Store
+__pycache__/
+*.py[cod]
+.claude/
+.venv/
+venv/
+.pytest_cache/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..1cb9394
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,244 @@
+# AGENTS.md
+
+Operational rules for agents and humans contributing per-letter image
+crops, writer records, or tooling to this repository. If anything below
+conflicts with `docs/dataset_structure.md` or `LICENSE.md`, those
+documents win — this file is a working summary, not a re-derivation of
+policy.
+
+## What this repo is
+
+A dataset of **sets of per-letter images of handwritten Hebrew letters**,
+grouped by writer. Each set = one person/scribe. Each per-letter image
+is a **crop** of a permissively-licensed upstream scan from
+[HeOCR/public-domain-hand-written-hebrew-scans][upstream], with rights
+inherited and recorded per image. Canonical layout, schema motivation,
+and ingestion model live in
+[`docs/dataset_structure.md`](docs/dataset_structure.md). The Hebrew
+letter enumeration is in [`docs/letters.md`](docs/letters.md). Compound
+licensing (CC0 metadata, per-image rights inheritance) is described in
+[`LICENSE.md`](LICENSE.md). The machine-readable contracts are
+[`schemas/writer.schema.json`](schemas/writer.schema.json) and
+[`schemas/entry.schema.json`](schemas/entry.schema.json). The release
+runbook is [`docs/release_process.md`](docs/release_process.md).
+
+[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans
+
+## First-time setup
+
+Run once per clone:
+
+```bash
+git lfs install
+git lfs pull
+python3 -m pip install -r requirements-dev.txt
+```
+
+`data/letters/**` image files are tracked via Git LFS (see
+`.gitattributes`). Without `git lfs pull` you have pointer files, not
+images, and the validator's file-integrity check will fail.
+
+**Python 3.11+ is required** — the validator uses `hashlib.file_digest`.
+CI pins 3.12.
+
+## Mandatory pre-PR commands
+
+Run these from the repo root before opening or updating a PR. The first
+three are also run in CI (`.github/workflows/ci.yml`) on every push to
+`main` and every PR — they must stay green.
+
+```bash
+python3 scripts/validate_indexes.py
+python3 scripts/generate_release_artifacts.py
+python3 -m pytest
+git diff --check
+```
+
+`validate_indexes.py` must end with
+`ok: N writers, M entries, K files verified`.
+`generate_release_artifacts.py` must leave `NOTICE.md`, `CITATION.cff`,
+and `datapackage.json` unchanged in the diff — re-run it after any edit
+to `data/index/*.jsonl` or `scripts/release_recipe.json` and stage the
+regenerated artefacts.
+`python3 scripts/generate_release_artifacts.py --check` is the
+non-mutating equivalent (CI runs the `--check` form). `pytest` must
+report all tests passing. `git diff --check` must produce no output.
+
+### Optional upstream cross-validation
+
+If you have a local clone of the upstream scans repo, pass
+`--upstream-path` to validate `upstream.sha256` and `upstream.bbox`
+against the live upstream entry records:
+
+```bash
+python3 scripts/validate_indexes.py \
+  --upstream-path ../public-domain-hand-written-hebrew-scans
+```
+
+CI checks out the upstream repo into `.upstream/` and runs the validator
+with `--upstream-path .upstream` automatically, so any mismatch (upstream
+re-encode, bbox-out-of-bounds) blocks the PR.
+
+### Tests-only flag
+
+`--repo-root PATH` overrides the file-integrity check's repo root. It
+exists for the pytest fixtures and is not part of the ingest workflow.
+
+## Release artefacts
+
+`NOTICE.md`, `CITATION.cff`, and `datapackage.json` at the repo root are
+generated deterministically from `data/index/*.jsonl` and
+`scripts/release_recipe.json`. Do not edit them by hand.
+
+Two timestamps with deliberately different semantics:
+
+- `datapackage.json::released_at` = `max(extraction.extracted_at)` —
+  the corpus-state timestamp. Bumps automatically on every ingest PR.
+  When the corpus is empty it falls back to
+  `release_recipe.json::initial_release_date`.
+- `CITATION.cff::date-released` = `release_recipe.json::version_released_date`
+  — stable per version. Only changes when a human bumps `version`
+  (see [`docs/release_process.md`](docs/release_process.md)).
+
+This means an ingest PR will bump `released_at` but not `date-released`.
+That is intentional: citations stay reproducible while
+corpus-freshness metadata moves with reality.
+
+Regenerate by running `python3 scripts/generate_release_artifacts.py`
+from the repo root.
+
+## GitHub workflow
+
+- One PR per coherent change. Batching is fine when tightly coupled
+  (tooling change + the docs that describe it); avoid batching
+  unrelated work.
+- Open PRs non-draft. The PR template's checkboxes are required.
+- Use the `git` and `gh` CLIs. Do not push to `main` directly.
+- Standard commit hygiene: conventional `type(scope): subject`, real
+  `Co-Authored-By` trailer when collaborating, no `--no-verify`, no
+  force-push to `main`.
+
+## Ingest rules
+
+### In scope
+
+- Cropped images of **single** Hebrew letters from handwriting attested
+  to a specific writer.
+- Both `regular` and `final` forms are first-class — they are never
+  merged into a single base letter. See [`docs/letters.md`](docs/letters.md)
+  for the canonical 27-form enumeration.
+- The crop must come from a scan that exists as a row in the upstream
+  repo's `data/index/entries.jsonl`. If the page is not yet in upstream,
+  add it there first.
+
+### Out of scope
+
+- Printed or typeset letters.
+- Composite glyphs (digraphs, niqqud-only marks, pointed shin/sin
+  variants `שׁ`/`שׂ`).
+- Crops from scans whose license does not permit redistribution,
+  commercial use, and derivatives.
+
+### Per-image metadata (mandatory)
+
+Every entry must include:
+
+- `upstream.source_id`, `upstream.entry_id`, `upstream.sha256`,
+  `upstream.commit` (40-char SHA — tag refs go in `upstream.release_tag`
+  instead), `upstream.bbox`.
+- `image.local_path` matching
+  `data/letters/<writer_id>/<letter.name>/<entry_id>.<ext>`.
+- `image.sha256` — full file SHA-256 (lowercase hex).
+- `image.bytes` — file size in bytes.
+- `image.mime_type` — `image/png`, `image/jpeg`, `image/webp`, or
+  `image/tiff`. Extension on `local_path` must match.
+- `image.width_px` and `image.height_px`.
+- `image.background` — `original`, `white`, `black`, `gray`,
+  `binarized`, or `transparent`. (`transparent` requires an
+  alpha-capable mime type; the schema rejects `transparent` + JPEG.)
+- `extraction.tool`, `extraction.tool_version` (SemVer or `git describe`
+  output), `extraction.method`, `extraction.extracted_at`,
+  `extraction.extracted_by`.
+- `rights.*` — inherited from the upstream entry per the table in
+  `LICENSE.md`. `rights.rights_basis` must match
+  `rights.license_expression` per the validator's `LICENSE_BASIS_MAP`.
+
+Helpers — macOS:
+
+```bash
+shasum -a 256 FILE
+stat -f%z FILE
+file --mime-type -b FILE
+sips -g pixelWidth -g pixelHeight FILE
+```
+
+Helpers — Linux (CI runs on Ubuntu, so these are also the commands to
+use when debugging CI):
+
+```bash
+sha256sum FILE
+stat -c%s FILE
+file --mime-type -b FILE
+identify -format "%w %h\n" FILE   # ImageMagick; or use Pillow from Python.
+```
+
+`scripts/validate_indexes.py` re-checks file integrity against the
+recorded metadata on every run. Mismatches block CI.
+
+### Accepted licenses
+
+- `PDM-1.0` → `rights_basis: public_domain`
+- `CC0-1.0` → `rights_basis: cc0`
+- `CC-BY-4.0` → `rights_basis: cc_by` (attribution required)
+- `CC-BY-SA-4.0` → `rights_basis: cc_by_sa` (attribution required;
+  ShareAlike applies to the crop, since the crop is an adaptation)
+- `LicenseRef-Public-Domain-Israel` → `rights_basis: public_domain`
+- `LicenseRef-Public-Domain-Ukraine` → `rights_basis: public_domain`
+
+The validator's `LICENSE_BASIS_MAP` is the single source of truth for
+this mapping. Adding a license means updating that map AND this list
+AND `scripts/release_recipe.json::license_names`/`license_urls`.
+
+### Rejected licenses
+
+- `CC-BY-NC`, `CC-BY-NC-SA`, `CC-BY-ND`.
+- "Research only", "permission required", "educational use only".
+- Anything unknown, ambiguous, or where the upstream entry's
+  `rights.verification_status` is not at least `primary_page_checked`.
+
+## Naming
+
+```text
+writer_id = <slug_of_writers_canonical_name>     # e.g. chaim_nachman_bialik
+entry_id  = <writer_id>__<letter.name>__v<NNNN>  # zero-padded variant
+```
+
+`<letter.name>` is the canonical slug from `docs/letters.md`. `<NNNN>`
+is a zero-padded 4-digit counter that increases monotonically per
+`(writer_id, letter.name)` pair.
+
+### Writer disambiguation
+
+On Latin-name collision (e.g. two writers named "Yosef Haim"), append
+the birth year to disambiguate: `yosef_haim_1834`, `yosef_haim_1902`.
+Fallbacks when birth year is unknown:
+
+1. Death year: `yosef_haim_d1942`.
+2. Period start year: `yosef_haim_p1880`.
+3. Provider authority ID: `yosef_haim_viaf12345678` — last resort, only
+   when none of the above are knowable.
+
+Always record the rationale in the writer's `ingest.agent_notes`.
+
+## What NOT to commit
+
+The following are already in `.gitignore` and should never appear in a
+diff:
+
+- `.claude/` — local agent session state.
+- `.DS_Store` — macOS Finder metadata.
+- `__pycache__/`, `*.pyc`, `*.pyo`, `*.pyd` — Python bytecode caches.
+- `.venv/`, `venv/`, `.pytest_cache/`.
+
+If `git status` shows any of these as untracked, leave them untracked.
+Do not `git add -f` to override the ignore.
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..64f9cc2
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,51 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
+for the dataset version recorded in `scripts/release_recipe.json::version`.
+
+## [Unreleased]
+
+(no in-progress changes)
+
+## [0.0.0-rc] - 2026-05-12
+
+Initial scaffolding release. Per-letter image corpus is empty; the
+repository ships the schemas, validators, release tooling, CI, and
+licensing policy needed to start ingesting.
+
+### Added
+
+- Writer-level (`schemas/writer.schema.json`) and entry-level
+  (`schemas/entry.schema.json`) record contracts. Each entry references
+  an upstream scan in `HeOCR/public-domain-hand-written-hebrew-scans`
+  by `source_id`, `entry_id`, `sha256` (mutable-tag-free), `commit`
+  (40-char SHA), and `bbox`.
+- `scripts/validate_indexes.py`: schema validation, referential
+  integrity, Hebrew letter codepoint/name/form consistency,
+  `rights_basis` ↔ `license_expression` cross-check, file-integrity
+  re-hashing, and optional `--upstream-path` cross-validation of
+  upstream `sha256` and `bbox` bounds.
+- `scripts/generate_release_artifacts.py` + `scripts/release_recipe.json`:
+  deterministic generation of `NOTICE.md`, `CITATION.cff`, and
+  `datapackage.json`. Citation `date-released` is stable per version
+  (`version_released_date` in the recipe); datapackage `released_at`
+  tracks the corpus state (`max(extraction.extracted_at)`).
+- `.gitattributes` configures Git LFS for `data/letters/**` image
+  files. CI fetches LFS bytes before validation.
+- `LICENSE` (CC0 1.0) and `LICENSE.md` compound-licensing policy with
+  per-license inheritance table and CC-BY-SA-4.0 ShareAlike handling.
+- `AGENTS.md`, `README.md`, `docs/dataset_structure.md`,
+  `docs/letters.md`, and `docs/release_process.md`.
+- `.github/workflows/ci.yml`, `.github/pull_request_template.md`,
+  `.editorconfig`.
+- Pytest test suite covering schema rejection, referential integrity,
+  letter consistency, rights cross-check, attribution gating,
+  file-integrity checks, empty-corpus fallbacks, non-empty corpus
+  NOTICE/CITATION/datapackage rendering, upstream cross-validation,
+  and Frictionless Data Package conformance.
+
+[Unreleased]: https://github.com/HeOCR/hletterscript/compare/v0.0.0-rc...HEAD
+[0.0.0-rc]: https://github.com/HeOCR/hletterscript/releases/tag/v0.0.0-rc
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..f2476d0
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,22 @@
+# Generated by scripts/generate_release_artifacts.py. Do not edit by hand.
+cff-version: 1.2.0
+message: Please cite this dataset using the metadata below.
+type: dataset
+title: Hebrew Handwritten Per-Letter Image Dataset
+abstract: 'Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). Release 0.0.0-rc is the initial-setup release: the corpus contains no per-letter image entries yet. The repository ships the schemas, validation tooling, CI, and licensing policy needed to start ingesting.'
+authors:
+- name: Shay Palachy-Affek
+version: 0.0.0-rc
+date-released: '2026-05-12'
+repository-code: https://github.com/HeOCR/hletterscript
+url: https://github.com/HeOCR/hletterscript
+license: CC0-1.0
+keywords:
+- Hebrew
+- dataset
+- glyphs
+- handwriting
+- handwritten-text-recognition
+- letters
+- public-domain
+- synthetic-generation
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0e259d4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..cb8e99e
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,101 @@
+# Licensing Policy
+
+This repository is structured for compound licensing — the same model used
+by [HeOCR/public-domain-hand-written-hebrew-scans][upstream].
+
+[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans
+
+## Repository-authored metadata
+
+Metadata authored directly in this repository is dedicated to the public
+domain under CC0 1.0 Universal (`CC0-1.0`):
+
+https://creativecommons.org/publicdomain/zero/1.0/
+
+To the extent possible under law, the repository contributors waive
+copyright and related rights in this repository-authored metadata. The
+canonical legal text is in [`LICENSE`](LICENSE).
+
+This dedication includes:
+
+- dataset structure documentation,
+- writer and entry index metadata authored here,
+- the JSON Schemas in `schemas/`,
+- validation and release scripts in `scripts/`,
+- generated metadata exports derived only from repository-authored
+  metadata (e.g. `datapackage.json`, `CITATION.cff`, `NOTICE.md`).
+
+The CC0 dedication does **not** extend to third-party image bytes,
+upstream-owned descriptive text, or transcription bytes unless that
+material is separately released under compatible terms.
+
+## Per-letter image crops
+
+Per-letter image bytes are **derivatives** of upstream scans hosted in
+[HeOCR/public-domain-hand-written-hebrew-scans][upstream]. They are not
+automatically covered by the metadata license. Each crop carries its own
+entry-level rights record in `data/index/entries.jsonl`:
+
+- `rights.license_expression` (SPDX expression or `LicenseRef-*`),
+- `rights.commercial_use_allowed`,
+- `rights.derivatives_allowed`,
+- `rights.redistribution_allowed`,
+- `rights.attribution_required`,
+- `rights.attribution_text`,
+- `rights.attribution_url`.
+
+Consumers must use a crop according to the rights expressed in its own
+entry record, not the repository-level metadata license.
+
+### License inheritance
+
+Because a per-letter crop is by definition a *derivative* of an upstream
+scanned page, the crop's license is inherited from the upstream scan:
+
+| Upstream scan license            | Per-letter crop license          | Attribution required?         |
+| -------------------------------- | -------------------------------- | ----------------------------- |
+| `CC0-1.0`                        | `CC0-1.0`                        | no                            |
+| `PDM-1.0`                        | `PDM-1.0`                        | no                            |
+| `LicenseRef-Public-Domain-*`     | same `LicenseRef-Public-Domain-*`| no                            |
+| `CC-BY-4.0`                      | `CC-BY-4.0`                      | yes (text + url required)     |
+| `CC-BY-SA-4.0`                   | `CC-BY-SA-4.0`                   | yes (text + url required)     |
+| Anything else (NC, ND, unknown)  | **not ingestable**               | n/a                           |
+
+The ShareAlike obligation propagates: anyone who redistributes a further
+adaptation of a `CC-BY-SA-4.0` crop must release the adaptation under
+`CC-BY-SA-4.0` or a compatible later version. Mere aggregation of
+`CC-BY-SA-4.0` crops alongside public-domain or CC-BY crops in a release
+bundle is not an adaptation, so the bundle itself does not need to be
+relicensed.
+
+## Release bundles
+
+Remix-friendly public release bundles published from this repository
+should include only entries where:
+
+- redistribution is allowed,
+- commercial use is allowed,
+- derivatives are allowed,
+- both upstream scan rights and inherited crop rights have been verified.
+
+If a release bundle contains a mixture of public-domain, CC0, CC-BY, and
+CC-BY-SA crops, the release must keep per-entry license metadata and
+include attribution where required. Do not describe such a bundle as
+having a single uniform crop license unless every included crop has the
+same license.
+
+## Exclusions
+
+Do not include per-letter image crops with any of the following terms in
+release bundles, and do not ingest upstream scans carrying these terms:
+
+- non-commercial only,
+- no derivatives,
+- research-only,
+- permission required,
+- unknown rights,
+- inaccessible source evidence.
+
+This is stricter than the upstream scans repo's exclusion list because
+this repository's deliverable is **only useful if downstream synthetic
+document generators can redistribute and remix it**.
diff --git a/NOTICE.md b/NOTICE.md
new file mode 100644
index 0000000..c55feca
--- /dev/null
+++ b/NOTICE.md
@@ -0,0 +1,18 @@
+# NOTICE
+
+This file is generated by `scripts/generate_release_artifacts.py` from `data/index/entries.jsonl`. Do not edit by hand.
+
+Repository-authored metadata is dedicated to the public domain under CC0 1.0 Universal. See [`LICENSE`](LICENSE) and [`LICENSE.md`](LICENSE.md) for the full compound-licensing policy.
+
+Per-letter image crops are derivatives of upstream scans in [HeOCR/public-domain-hand-written-hebrew-scans](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans) and carry per-entry rights inherited from the source page. The entries listed below carry a license that requires attribution (currently CC-BY-4.0, CC-BY-SA-4.0). Anyone redistributing or reusing these crops must keep the listed credit and link to the source page on which the rights claim was verified.
+
+- Corpus release: `0.0.0-rc`
+- Released at (corpus state): `2026-05-12T00:00:00Z`
+
+## Attribution-required entries
+
+_No entries in this release require attribution._
+
+## Full per-entry rights
+
+Every entry, attribution-required or not, ships with its rights record in [`data/index/entries.jsonl`](data/index/entries.jsonl). Consumers that need machine-readable rights metadata should read that file directly; the manifest at [`datapackage.json`](datapackage.json) summarises the license breakdown.
diff --git a/README.md b/README.md
index b003b90..2d97347 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,110 @@
-# hletterscript
\ No newline at end of file
+# hletterscript
+
+A dataset of **sets of per-letter images of handwritten Hebrew letters**.
+Each set groups crops produced from documents written by the *same
+writer*; each set typically contains several variants of the same letter
+cut from different scans by that writer.
+
+This repository is the downstream of:
+
+- [HeOCR/public-domain-hand-written-hebrew-scans][upstream] — the
+  canonical, permissively-licensed source of page-level scans. Every
+  entry here cites its upstream scan.
+- [HeOCR/hletterscriptgen][gen] — the framework that turns page scans
+  into per-letter crops. Each entry records which version of that
+  framework produced it.
+
+The intended downstream consumers are synthetic-document generators
+([HeOCR/hocrsyngen][syngen]) and the synthetic / real Hebrew handwriting
+corpora they feed into ([HeOCR/HeOCRsynth][heocrsynth],
+[HeOCR/HeOCR][heocr]).
+
+[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans
+[gen]: https://github.com/HeOCR/hletterscriptgen
+[syngen]: https://github.com/HeOCR/hocrsyngen
+[heocrsynth]: https://github.com/HeOCR/HeOCRsynth
+[heocr]: https://github.com/HeOCR/HeOCR
+
+## Dataset Layout
+
+- `docs/dataset_structure.md` defines the repository layout and
+  ingestion model.
+- `docs/letters.md` is the canonical Hebrew-letter enumeration
+  (27 forms — 22 base letters plus the 5 finals).
+- `data/index/writers.jsonl` is the set-level catalog: one JSON object
+  per writer/scribe.
+- `data/index/entries.jsonl` is the image-level catalog: one JSON
+  object per cropped letter image, with upstream provenance,
+  extraction provenance, file checksums, and inherited rights.
+- `data/letters/<writer_id>/<letter_name>/` stores the image bytes.
+- `schemas/writer.schema.json` and `schemas/entry.schema.json` define
+  the record contracts.
+- `scripts/validate_indexes.py` validates JSONL records against the
+  schemas, enforces referential integrity, checks Hebrew-letter
+  codepoint/name/form consistency, pins the upstream repo URL, and
+  re-verifies image file checksums and sizes on disk.
+- `scripts/generate_release_artifacts.py` regenerates `NOTICE.md`,
+  `CITATION.cff`, and `datapackage.json` deterministically from the
+  indexes.
+- `LICENSE.md` documents the compound licensing policy for
+  metadata and per-image inherited rights.
+
+## Serialization Decision
+
+The canonical editable indexes are newline-delimited JSON (`.jsonl`),
+matching the upstream scans repo's convention.
+
+JSONL is deliberately used instead of CSV because these records need
+nested upstream references, bounding boxes, rights inheritance,
+extraction provenance, and quality measurements. CSV/Parquet/SQLite
+exports can be generated later as derived artefacts; the source of
+truth stays line-oriented, diffable, streamable JSON.
+
+## Requirements
+
+- **Python ≥ 3.11** (the validator uses `hashlib.file_digest`).
+  CI pins 3.12.
+- **Git LFS** — image bytes under `data/letters/**` are tracked via
+  LFS (see `.gitattributes`). After cloning, run `git lfs install`
+  once, then `git lfs pull` to fetch the actual image bytes.
+
+Run the current validation check with:
+
+```bash
+git lfs install && git lfs pull
+python3 -m pip install -r requirements-dev.txt
+python3 scripts/validate_indexes.py
+python3 scripts/generate_release_artifacts.py --check
+python3 -m pytest
+```
+
+## Current Status
+
+`v0.0.0-rc` — **initial setup**. The repository ships with the
+schemas, validation tooling, release-artifact generator, CI workflow,
+and licensing policy in place. The per-letter image indexes
+(`writers.jsonl`, `entries.jsonl`) are empty: actual letter-image
+ingestion happens in subsequent PRs, produced by
+[HeOCR/hletterscriptgen][gen] from scans in the upstream repo.
+
+The repository uses a compound licensing model: repository-authored
+metadata is dedicated to the public domain under CC0 1.0 (see
+[`LICENSE`](LICENSE)), while per-image rights are recorded individually
+and inherited from each crop's upstream scan. See
+[`LICENSE.md`](LICENSE.md) for the full policy, including the CC BY-SA ShareAlike
+caveat and the rules for remix-friendly release bundles.
+
+## How to use this repo
+
+- [`data/index/entries.jsonl`](data/index/entries.jsonl) is the source
+  of truth for the per-letter image corpus — one JSON object per crop,
+  with upstream citation, file checksums, and inherited rights.
+- [`data/index/writers.jsonl`](data/index/writers.jsonl) catalogs the
+  writers, including candidate leads and rejected records.
+- [`schemas/entry.schema.json`](schemas/entry.schema.json) and
+  [`schemas/writer.schema.json`](schemas/writer.schema.json) define the
+  record contracts;
+  [`scripts/validate_indexes.py`](scripts/validate_indexes.py) enforces them in CI.
+- Contributors adding new entries should start with
+  [`AGENTS.md`](AGENTS.md) for ingest rules, naming, and the pre-PR
+  checklist.
diff --git a/data/index/entries.jsonl b/data/index/entries.jsonl
new file mode 100644
index 0000000..e69de29
diff --git a/data/index/writers.jsonl b/data/index/writers.jsonl
new file mode 100644
index 0000000..e69de29
diff --git a/data/letters/.gitkeep b/data/letters/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/datapackage.json b/datapackage.json
new file mode 100644
index 0000000..bea3b70
--- /dev/null
+++ b/datapackage.json
@@ -0,0 +1,74 @@
+{
+  "contributors": [
+    {
+      "role": "maintainer",
+      "title": "Shay Palachy-Affek"
+    }
+  ],
+  "description": "Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL).",
+  "homepage": "https://github.com/HeOCR/hletterscript",
+  "keywords": [
+    "Hebrew",
+    "dataset",
+    "glyphs",
+    "handwriting",
+    "handwritten-text-recognition",
+    "letters",
+    "public-domain",
+    "synthetic-generation"
+  ],
+  "licenses": [
+    {
+      "name": "CC0-1.0",
+      "path": "https://creativecommons.org/publicdomain/zero/1.0/",
+      "scope": "metadata",
+      "title": "Creative Commons Zero v1.0 Universal"
+    }
+  ],
+  "name": "hletterscript",
+  "profile": "data-package",
+  "released_at": "2026-05-12T00:00:00Z",
+  "resources": [
+    {
+      "bytes": 0,
+      "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights.",
+      "encoding": "utf-8",
+      "format": "jsonl",
+      "mediatype": "application/x-ndjson",
+      "name": "entries",
+      "path": "data/index/entries.jsonl",
+      "profile": "data-resource",
+      "record_count": 0
+    },
+    {
+      "bytes": 0,
+      "description": "Writer-level catalog. One JSON object per writer; each writer defines a 'set' of letter images.",
+      "encoding": "utf-8",
+      "format": "jsonl",
+      "mediatype": "application/x-ndjson",
+      "name": "writers",
+      "path": "data/index/writers.jsonl",
+      "profile": "data-resource",
+      "record_count": 0
+    }
+  ],
+  "schemas": {
+    "entry": "https://github.com/HeOCR/hletterscript/blob/main/schemas/entry.schema.json",
+    "writer": "https://github.com/HeOCR/hletterscript/blob/main/schemas/writer.schema.json"
+  },
+  "stats": {
+    "attribution_required_count": 0,
+    "entry_writer_count": 0,
+    "image_byte_count": 0,
+    "letter_breakdown": {},
+    "license_breakdown": {},
+    "record_count": 0,
+    "writer_breakdown": {},
+    "writer_record_count": 0,
+    "writer_status_breakdown": {}
+  },
+  "title": "Hebrew Handwritten Per-Letter Image Dataset",
+  "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans",
+  "version": "0.0.0-rc",
+  "version_released_date": "2026-05-12"
+}
diff --git a/docs/dataset_structure.md b/docs/dataset_structure.md
new file mode 100644
index 0000000..345fea5
--- /dev/null
+++ b/docs/dataset_structure.md
@@ -0,0 +1,254 @@
+# Dataset Structure and Index Model
+
+## Scope
+
+This dataset hosts **sets of per-letter images of handwritten Hebrew
+letters**. Each *set* groups crops that were produced from documents written
+by the **same writer** (the same person/scribe). Within a set, the same
+Hebrew letter may appear multiple times, each variant cut from a different
+document or scan written by that writer.
+
+The corpus is the *downstream* product of two upstream things:
+
+- [HeOCR/public-domain-hand-written-hebrew-scans] is the canonical source
+  of page-level scans. Every per-letter image entry in this repo cites
+  the upstream scan (`source_id`, `entry_id`, `sha256`) it was cut from.
+- [HeOCR/hletterscriptgen] is the framework that turns those page scans
+  into per-letter crops. Each entry records which version of that tool
+  produced it.
+
+The intended downstream consumers are synthetic-document generators
+([HeOCR/hocrsyngen]) and the synthetic / real Hebrew handwriting datasets
+they feed into ([HeOCR/HeOCRsynth], [HeOCR/HeOCR]).
+
+[HeOCR/public-domain-hand-written-hebrew-scans]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans
+[HeOCR/hletterscriptgen]: https://github.com/HeOCR/hletterscriptgen
+[HeOCR/hocrsyngen]: https://github.com/HeOCR/hocrsyngen
+[HeOCR/HeOCRsynth]: https://github.com/HeOCR/HeOCRsynth
+[HeOCR/HeOCR]: https://github.com/HeOCR/HeOCR
+
+## Recommended repository layout
+
+```text
+data/
+  index/
+    writers.jsonl          # One row per writer/scribe (a "set" of letter images).
+    entries.jsonl          # One row per per-letter image crop.
+  letters/
+    <writer_id>/
+      <letter_name>/
+        <entry_id>.<ext>   # The cropped letter image itself.
+docs/
+  dataset_structure.md     # This file.
+  letters.md               # Canonical Hebrew letter enumeration.
+  release_process.md       # Runbook for cutting a new release.
+schemas/
+  writer.schema.json
+  entry.schema.json
+scripts/
+  validate_indexes.py
+  generate_release_artifacts.py
+  release_recipe.json
+tests/
+  test_validate_indexes.py
+  test_generate_release_artifacts.py
+.github/
+  workflows/
+    ci.yml
+  pull_request_template.md
+```
+
+The `data/index/*.jsonl` files are the canonical catalogs. Image bytes
+live under `data/letters/` and are tracked via **Git LFS** from day one
+(see `.gitattributes`). Contributors must run `git lfs install` and
+`git lfs pull` after cloning to populate the actual image bytes; CI
+does the equivalent before validating.
+
+## Serialization format
+
+Same convention as the upstream scans repo: newline-delimited JSON.
+
+- one complete JSON object per line,
+- UTF-8,
+- no comments,
+- no trailing commas,
+- stable sorted keys when generated by tooling,
+- ISO 8601 timestamps where known,
+- `null` for unknown scalar values.
+
+JSONL is preferred over CSV because each writer and each letter image needs
+nested rights evidence, upstream provenance, extraction provenance, file
+checksums, and quality annotations. Analytics-oriented exports (CSV,
+Parquet, SQLite) can be generated later as derived artefacts.
+
+## Writer index
+
+`data/index/writers.jsonl` is the **set-level catalog**. One row per
+writer/scribe. A writer row defines the identity of the *set* of letter
+images attributed to that person; it does **not** by itself imply rights or
+scope decisions over any individual image — those live on the entries.
+
+Required core fields:
+
+- `writer_id`: stable lowercase identifier (regex
+  `^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$`).
+- `status`: `candidate`, `verified`, `rejected`, or `needs_review`.
+- `display_name`: human-readable name in Latin script.
+- `also_known_as`: list of alternate spellings (Latin, Hebrew, Yiddish, etc.).
+- `dates`: birth and death years with precision flags.
+- `languages_written`: BCP-47 tags the writer is known to have written in.
+- `scripts_written`: ISO 15924 codes (almost always at least `Hebr`).
+- `period`: free-text range describing the rough span of known writings.
+- `description`: short prose summary of the writer's relevance.
+- `references`: at least one citation backing the biographical claims.
+- `ingest`: agent-side notes about provenance and any blockers.
+
+Writers are first-class even if they have zero verified entries — that is
+how research leads are tracked before letter crops have been extracted.
+
+## Entry index
+
+`data/index/entries.jsonl` is the **image-level catalog**. One row per
+cropped per-letter image. Required core fields:
+
+- `entry_id`: stable identifier matching
+  `^<writer_id>__<letter_name>__v[0-9]{4}$`.
+- `writer_id`: foreign key into `writers.jsonl`.
+- `letter`: codepoint + Unicode char + slug name + form (`regular` /
+  `final`). Cross-field consistency is enforced by the validator.
+- `upstream`: citation back to the source page in the upstream scans
+  repo — `source_id`, `entry_id`, scan `sha256`, bounding box `(x,y,w,h)`,
+  and the upstream `commit` (plus optional `release_tag`) at which the extraction was performed.
+- `image`: local crop file — `local_path`, `sha256`, `bytes`, `mime_type`,
+  `width_px`, `height_px`, and `background` (`original`, `white`,
+  `transparent`).
+- `extraction`: how the crop was produced — `tool`, `tool_version`,
+  `method` (`manual`, `auto`, `mixed`), timestamp, actor, free-text notes.
+- `rights`: scan-level license and attribution data, inherited from the
+  upstream scan.
+- `quality`: legibility and usability flags for HTR and synthetic
+  generation pipelines.
+
+## Stable IDs
+
+```text
+writer_id = <slug_of_writers_canonical_name>
+entry_id  = <writer_id>__<letter_name>__v<zero_padded_variant>
+```
+
+`<letter_name>` is the canonical slug from `docs/letters.md`
+(`alef`, `bet`, …, `kaf_final`, `mem_final`, etc.).
+`<zero_padded_variant>` is a four-digit, zero-padded counter (e.g. `0001`)
+that is monotonic per `(writer_id, letter.name)`.
+
+### Writer disambiguation on name collisions
+
+On Latin-name collisions (e.g. two writers named "Yosef Haim"), append
+the birth year: `yosef_haim_1834`. When birth year is unknown, fall
+back to death year (`yosef_haim_d1942`), then to period start
+(`yosef_haim_p1880`), then to a provider authority ID
+(`yosef_haim_viaf12345678`). Record the rationale in
+`ingest.agent_notes` on the writer row. AGENTS.md documents the
+operational form of this rule.
+
+## Rights model (compound, inherited from upstream)
+
+Every per-letter image is a **crop / derivative** of an upstream scan whose
+rights have already been recorded in
+`public-domain-hand-written-hebrew-scans/data/index/entries.jsonl`.
+Repository policy:
+
+- **Repository-authored metadata** in this repo is dedicated to the public
+  domain under CC0 1.0. That includes `data/index/*.jsonl`, schemas,
+  scripts, docs, and generated metadata exports.
+- **Per-image rights** for each crop are recorded individually in the
+  entry's `rights` block. The crop inherits its parent scan's
+  `license_expression`, with the following inheritance rules:
+  - Public-Domain Mark / public-domain refs / CC0 upstream → crop carries
+    the same public-domain expression.
+  - CC BY-4.0 upstream → crop carries `CC-BY-4.0` and must populate
+    `attribution_required: true`, `attribution_text`, and `attribution_url`.
+  - CC BY-SA-4.0 upstream → crop is an **adaptation** of the upstream
+    scan, so the crop is itself `CC-BY-SA-4.0` with attribution; downstream
+    re-distributors of an adaptation of the crop must release under
+    `CC-BY-SA-4.0` (or compatible). This is enforced by
+    `scripts/generate_release_artifacts.py` and surfaced in `NOTICE.md`.
+- **Non-commercial / no-derivative / research-only / unknown-rights**
+  upstream scans must not be cropped into this dataset at all. They are
+  rejected at ingest time because the entire premise of the corpus is
+  that the per-letter images can be redistributed and remixed for
+  downstream synthetic generation.
+- **CC BY-SA inheritance for the dataset as a whole**: the dataset is a
+  *collection*, not a single adaptation. Aggregating CC BY-SA crops in a
+  release bundle alongside public-domain crops does not force the bundle
+  to a uniform license; the per-entry license metadata travels with each
+  file. See `LICENSE.md`.
+
+## Upstream cross-reference (`upstream` block)
+
+The `upstream` block in each entry is the *load-bearing* link to the
+source of truth for rights. The validator enforces:
+
+- `upstream.source_id` and `upstream.entry_id` follow the upstream's
+  `entry_id` regex.
+- `upstream.sha256` is a 64-char lowercase hex string. With
+  `--upstream-path PATH`, the validator additionally cross-checks this
+  against the live upstream entry's file SHA-256.
+- `upstream.commit` is an **immutable 40-character commit SHA** — never
+  a tag ref. Tags are mutable and re-pointable; recording one here would
+  silently change the meaning of the entry if the tag moves.
+- `upstream.release_tag` is optional and carries a human-readable tag
+  (e.g. `v0.1.0-rc`) corresponding to the commit. It is for
+  reader convenience only; NOTICE.md links and any code-level
+  resolution use `commit`.
+- `upstream.bbox` has `x ≥ 0`, `y ≥ 0`, `w > 0`, `h > 0`. With
+  `--upstream-path`, the validator also asserts `x+w ≤ width_px` and
+  `y+h ≤ height_px` against the upstream scan dimensions.
+
+The upstream repository URL itself is recorded once in
+`scripts/release_recipe.json::upstream_repo` — not duplicated on every
+entry. Both the validator and the release generator read it from
+there.
+
+If upstream re-encodes a scan and its `sha256` changes, every dependent
+crop in this repo must be re-verified — `--upstream-path` will flag
+the mismatch.
+
+## Ingestion flow
+
+1. Add or update a `writers.jsonl` row as `candidate`.
+2. Pick an upstream entry whose scan is permissively licensed (PD, CC0,
+   CC BY, or CC BY-SA) and whose handwriting belongs to that writer.
+3. Run the relevant `hletterscriptgen` pipeline to produce per-letter
+   crops. Each crop carries upstream `source_id`, `entry_id`, `sha256`,
+   bbox, and the generator's `tool_version`.
+4. Add one row to `entries.jsonl` per crop. Inherit rights from the
+   upstream entry and record `verification_status` accordingly.
+5. Run:
+   ```bash
+   python3 scripts/validate_indexes.py
+   python3 scripts/generate_release_artifacts.py
+   python3 -m pytest
+   ```
+6. Open a PR. The CI workflow re-runs the same checks plus
+   `generate_release_artifacts.py --check`.
+
+## Release artefacts and two-timestamp model
+
+`NOTICE.md`, `CITATION.cff`, and `datapackage.json` at the repo root
+are generated deterministically from `data/index/*.jsonl` and
+`scripts/release_recipe.json`. Do not edit them by hand.
+
+The generator emits two timestamps with **deliberately different
+semantics**:
+
+| Field                                | Source                                                      | Meaning                                          |
+| ------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ |
+| `datapackage.json::released_at`      | `max(extraction.extracted_at)` (fallback: `initial_release_date`) | *Corpus state.* Bumps every ingest PR.           |
+| `CITATION.cff::date-released`        | `release_recipe.json::version_released_date`                | *Release date of this version.* Stable per version. |
+
+Citations are pinned to the version's release date, not to the latest
+extraction. Ingest PRs are free to add data without invalidating
+existing citations. Bumping a version (and its `version_released_date`)
+is a deliberate, human-driven step — see
+[`docs/release_process.md`](release_process.md).
diff --git a/docs/letters.md b/docs/letters.md
new file mode 100644
index 0000000..b90bea8
--- /dev/null
+++ b/docs/letters.md
@@ -0,0 +1,70 @@
+# Hebrew Letter Enumeration
+
+This file is the human-readable companion to the `letter` block in
+`schemas/entry.schema.json`. The schema is authoritative; this table exists
+for readability and must be kept in sync with it.
+
+Per-letter image entries use lowercase ASCII `letter.name` slugs in
+`entry_id`s, file paths, and statistics. The five letters that take a final
+form get two distinct slugs (`kaf` / `kaf_final`, etc.) — final-form glyphs
+are never collapsed into their base letter, because handwriting style varies
+between the two.
+
+| `letter.name`   | `letter.codepoint` | `letter.unicode_char` | `letter.form` | Hebrew name |
+| --------------- | ------------------ | --------------------- | ------------- | ----------- |
+| `alef`          | `U+05D0`           | א                     | `regular`     | אלף         |
+| `bet`           | `U+05D1`           | ב                     | `regular`     | בית         |
+| `gimel`         | `U+05D2`           | ג                     | `regular`     | גימל        |
+| `dalet`         | `U+05D3`           | ד                     | `regular`     | דלת         |
+| `he`            | `U+05D4`           | ה                     | `regular`     | הא          |
+| `vav`           | `U+05D5`           | ו                     | `regular`     | וו          |
+| `zayin`         | `U+05D6`           | ז                     | `regular`     | זין         |
+| `het`           | `U+05D7`           | ח                     | `regular`     | חית         |
+| `tet`           | `U+05D8`           | ט                     | `regular`     | טית         |
+| `yod`           | `U+05D9`           | י                     | `regular`     | יוד         |
+| `kaf_final`     | `U+05DA`           | ך                     | `final`       | כף סופית    |
+| `kaf`           | `U+05DB`           | כ                     | `regular`     | כף          |
+| `lamed`         | `U+05DC`           | ל                     | `regular`     | למד         |
+| `mem_final`     | `U+05DD`           | ם                     | `final`       | מם סופית    |
+| `mem`           | `U+05DE`           | מ                     | `regular`     | מם          |
+| `nun_final`     | `U+05DF`           | ן                     | `final`       | נון סופית   |
+| `nun`           | `U+05E0`           | נ                     | `regular`     | נון         |
+| `samekh`        | `U+05E1`           | ס                     | `regular`     | סמך         |
+| `ayin`          | `U+05E2`           | ע                     | `regular`     | עין         |
+| `pe_final`      | `U+05E3`           | ף                     | `final`       | פא סופית    |
+| `pe`            | `U+05E4`           | פ                     | `regular`     | פא          |
+| `tsadi_final`   | `U+05E5`           | ץ                     | `final`       | צדי סופית   |
+| `tsadi`         | `U+05E6`           | צ                     | `regular`     | צדי         |
+| `qof`           | `U+05E7`           | ק                     | `regular`     | קוף         |
+| `resh`          | `U+05E8`           | ר                     | `regular`     | ריש         |
+| `shin`          | `U+05E9`           | ש                     | `regular`     | שין         |
+| `tav`           | `U+05EA`           | ת                     | `regular`     | תו          |
+
+## Letters this dataset does NOT split out
+
+- **Pointed (niqqud) vowel marks** (`U+05B0`–`U+05BC`) and the rafe / paseq
+  marks (`U+05BF`, `U+05C0`) are diacritics, not letters, and are
+  out of scope for the per-letter image corpus.
+- **Yiddish digraphs** `װ` (`U+05F0`), `ױ` (`U+05F1`), `ײ` (`U+05F2`) are
+  composed glyphs; they are out of scope. Underlying Yiddish handwriting
+  that uses the standard 27 forms above is in scope.
+- **Shin / sin dot variants** `שׁ` (`U+FB2A`) and `שׂ` (`U+FB2B`) are normalised
+  to the bare `shin` slug. The pointed variant lives in `letter.notes` if
+  the original page has the dot.
+
+## File-path convention
+
+Per-letter image files live at:
+
+```text
+data/letters/<writer_id>/<letter_name>/<entry_id>.<ext>
+```
+
+For example, the first verified alef variant from writer `chaim_nachman_bialik`:
+
+```text
+data/letters/chaim_nachman_bialik/alef/chaim_nachman_bialik__alef__v0001.png
+```
+
+`<entry_id>` always matches `^<writer_id>__<letter_name>__v[0-9]{4}$` and is
+enforced by `schemas/entry.schema.json` and `scripts/validate_indexes.py`.
diff --git a/docs/release_process.md b/docs/release_process.md
new file mode 100644
index 0000000..18fbd87
--- /dev/null
+++ b/docs/release_process.md
@@ -0,0 +1,76 @@
+# Release Process
+
+This document is the runbook for cutting a new dataset release. Releases
+are tagged on `main` and follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html):
+
+- **MAJOR** — backwards-incompatible schema changes, ID renames, or
+  rights-policy changes that change what consumers can do with the data.
+- **MINOR** — additive schema fields, new writers, or substantial new
+  per-letter ingestion batches.
+- **PATCH** — bug-fix re-extractions, metadata corrections, validator
+  fixes, single-entry rights re-verifications.
+
+Pre-1.0 releases use `0.X.Y-rc` (release candidate) suffixes; the
+`-rc` is dropped at `1.0.0`.
+
+## Two timestamps, deliberately distinct
+
+The release generator emits two different timestamps with different
+semantics. Keep them separated mentally:
+
+| Field                                          | Source                                                       | Meaning                                              | Bumps when                                              |
+| ---------------------------------------------- | ------------------------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------- |
+| `CITATION.cff::date-released`                  | `release_recipe.json::version_released_date`                 | The date *this version* of the dataset was released. | A human bumps `version` *and* `version_released_date`.  |
+| `datapackage.json::released_at`                | `max(extraction.extracted_at)` across all entries            | Latest corpus-state timestamp.                       | Every ingest PR that adds or replaces an entry.         |
+
+`date-released` is what citations should be reproducible against and what
+Zenodo/GitHub indexers expect. `released_at` is informational metadata
+about how fresh the corpus is right now. **Never collapse them into
+one** — that was the v0.0.0-rc design's original bug.
+
+## Cutting a release
+
+1. **Choose the new version.** Decide MAJOR/MINOR/PATCH per the rules
+   above. Open a release PR (label `release:vX.Y.Z`).
+2. **Bump the recipe.** Edit `scripts/release_recipe.json`:
+   - `version` → the new version.
+   - `version_released_date` → today's date (YYYY-MM-DD).
+3. **Regenerate artefacts.**
+   ```bash
+   python3 scripts/generate_release_artifacts.py
+   ```
+   Stage the resulting `NOTICE.md`, `CITATION.cff`, and `datapackage.json`.
+4. **Update the changelog.** Move the `[Unreleased]` section to a new
+   `[X.Y.Z] - YYYY-MM-DD` section and add a fresh empty `[Unreleased]`
+   at the top. Update the link references at the bottom of the file.
+5. **Re-run pre-merge checks.**
+   ```bash
+   python3 scripts/validate_indexes.py
+   python3 scripts/generate_release_artifacts.py --check
+   python3 -m pytest
+   ```
+6. **Merge the release PR.** Squash-merge into `main`.
+7. **Tag the release.**
+   ```bash
+   git checkout main && git pull
+   git tag -a vX.Y.Z -m "Release vX.Y.Z"
+   git push origin vX.Y.Z
+   ```
+8. **Cut the GitHub release** from that tag. The body should be the
+   relevant `CHANGELOG.md` section.
+
+## When NOT to bump the version
+
+Ingest PRs that add writers or entries do **not** bump the version on
+their own. They bump `datapackage.json::released_at` (automatically, via
+`max(extraction.extracted_at)`), and they update the per-license,
+per-writer, and per-letter stats — but the version stays the same until
+a human deliberately cuts a release. This keeps `CITATION.cff` stable
+between releases.
+
+## Pre-1.0 versioning
+
+While the dataset is small and the schema may still shift, releases
+carry the `-rc` suffix. The first non-rc release is `1.0.0`, signalling
+that the schema and ID conventions are stable enough that downstream
+consumers can build long-lived pipelines on top.
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..dd4bb23
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,7 @@
+# Development dependencies for hletterscript validators, release tooling,
+# and tests. Requires Python >= 3.11 (validate_indexes.py uses
+# hashlib.file_digest, which was added in 3.11). CI pins 3.12.
+jsonschema>=4.0,<5
+pytest>=8.0,<9
+PyYAML>=6,<7
+frictionless>=5,<6
diff --git a/schemas/entry.schema.json b/schemas/entry.schema.json
new file mode 100644
index 0000000..004717d
--- /dev/null
+++ b/schemas/entry.schema.json
@@ -0,0 +1,408 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://github.com/HeOCR/hletterscript/schemas/entry.schema.json",
+  "title": "Handwritten Hebrew Per-Letter Image Entry",
+  "description": "One row per cropped per-letter image. Each entry is a derivative of a specific upstream scan in HeOCR/public-domain-hand-written-hebrew-scans.",
+  "type": "object",
+  "required": [
+    "entry_id",
+    "writer_id",
+    "letter",
+    "upstream",
+    "image",
+    "extraction",
+    "rights",
+    "quality"
+  ],
+  "additionalProperties": false,
+  "allOf": [
+    {
+      "if": {
+        "properties": {
+          "rights": {
+            "properties": {
+              "verification_status": {
+                "enum": ["unverified", "source_note_only", "conflicting"]
+              }
+            },
+            "required": ["verification_status"]
+          }
+        },
+        "required": ["rights"]
+      },
+      "then": {
+        "properties": {
+          "rights": {
+            "properties": {
+              "commercial_use_allowed": {
+                "not": {
+                  "const": true
+                }
+              },
+              "derivatives_allowed": {
+                "not": {
+                  "const": true
+                }
+              },
+              "redistribution_allowed": {
+                "not": {
+                  "const": true
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    {
+      "if": {
+        "properties": {
+          "rights": {
+            "properties": {
+              "attribution_required": {
+                "const": true
+              }
+            },
+            "required": ["attribution_required"]
+          }
+        },
+        "required": ["rights"]
+      },
+      "then": {
+        "properties": {
+          "rights": {
+            "properties": {
+              "attribution_text": {
+                "type": "string",
+                "minLength": 1
+              },
+              "attribution_url": {
+                "type": "string",
+                "format": "uri",
+                "minLength": 1
+              }
+            },
+            "required": ["attribution_text", "attribution_url"]
+          }
+        }
+      }
+    },
+    {
+      "if": {
+        "properties": {
+          "image": {
+            "properties": {
+              "background": {
+                "const": "transparent"
+              }
+            },
+            "required": ["background"]
+          }
+        },
+        "required": ["image"]
+      },
+      "then": {
+        "properties": {
+          "image": {
+            "properties": {
+              "mime_type": {
+                "enum": ["image/png", "image/webp", "image/tiff"]
+              }
+            }
+          }
+        }
+      }
+    }
+  ],
+  "properties": {
+    "entry_id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*__[a-z][a-z0-9_]*__v[0-9]{4}$",
+      "description": "Format: <writer_id>__<letter.name>__v<4-digit variant>."
+    },
+    "writer_id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$"
+    },
+    "letter": {
+      "type": "object",
+      "required": ["codepoint", "unicode_char", "name", "form"],
+      "additionalProperties": false,
+      "properties": {
+        "codepoint": {
+          "type": "string",
+          "description": "Hebrew Unicode codepoint of the letter (e.g. 'U+05D0'). Cross-validated against the canonical LETTER_TABLE by scripts/validate_indexes.py — no schema-level regex, since the table is the canonical source of truth."
+        },
+        "unicode_char": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 1
+        },
+        "name": {
+          "type": "string",
+          "enum": [
+            "alef", "bet", "gimel", "dalet", "he", "vav", "zayin", "het",
+            "tet", "yod", "kaf", "kaf_final", "lamed", "mem", "mem_final",
+            "nun", "nun_final", "samekh", "ayin", "pe", "pe_final",
+            "tsadi", "tsadi_final", "qof", "resh", "shin", "tav"
+          ]
+        },
+        "form": {
+          "type": "string",
+          "enum": ["regular", "final"]
+        },
+        "style": {
+          "type": "string",
+          "enum": [
+            "unknown",
+            "cursive_ashkenazi",
+            "cursive_sephardi",
+            "cursive_mizrahi",
+            "cursive_yemenite",
+            "block_ashkenazi",
+            "block_sephardi",
+            "block_modern",
+            "rashi",
+            "yiddish_handwriting",
+            "other"
+          ],
+          "description": "Per-image handwriting style. Optional. Downstream syngen and HTR consumers should filter on this when style consistency matters."
+        },
+        "notes": {
+          "type": ["string", "null"]
+        }
+      }
+    },
+    "upstream": {
+      "type": "object",
+      "description": "Reference back to the scan in HeOCR/public-domain-hand-written-hebrew-scans this crop was extracted from. The upstream repository URL is recorded once in scripts/release_recipe.json (`upstream_repo`); it is not duplicated on every entry.",
+      "required": ["source_id", "entry_id", "sha256", "commit", "bbox"],
+      "additionalProperties": false,
+      "properties": {
+        "source_id": {
+          "type": "string",
+          "pattern": "^[a-z0-9]+(?:__[a-z0-9_]+)+$"
+        },
+        "entry_id": {
+          "type": "string",
+          "pattern": "^[a-z0-9]+(?:__[a-z0-9_]+)+__p[0-9]{4,6}$"
+        },
+        "sha256": {
+          "type": "string",
+          "pattern": "^[a-f0-9]{64}$",
+          "description": "SHA-256 of the upstream scan file at the time of extraction. The validator (with --upstream-path) re-checks this against the live upstream entry to catch upstream re-encodes."
+        },
+        "commit": {
+          "type": "string",
+          "pattern": "^[a-f0-9]{40}$",
+          "description": "Immutable 40-char upstream commit SHA at which extraction was performed. Tags are mutable; never put a tag ref here. Use `release_tag` for human-readable release labels."
+        },
+        "release_tag": {
+          "type": ["string", "null"],
+          "pattern": "^v[0-9]+\\.[0-9]+\\.[0-9]+(?:-[A-Za-z0-9.]+)?$",
+          "description": "Optional upstream release tag (e.g. 'v0.1.0-rc') corresponding to `commit`, recorded for human readability. The commit SHA is what NOTICE.md links to."
+        },
+        "bbox": {
+          "type": "object",
+          "required": ["x", "y", "w", "h"],
+          "additionalProperties": false,
+          "properties": {
+            "x": {"type": "integer", "minimum": 0},
+            "y": {"type": "integer", "minimum": 0},
+            "w": {"type": "integer", "minimum": 1},
+            "h": {"type": "integer", "minimum": 1}
+          }
+        }
+      }
+    },
+    "image": {
+      "type": "object",
+      "required": [
+        "local_path",
+        "sha256",
+        "mime_type",
+        "bytes",
+        "width_px",
+        "height_px",
+        "background"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "local_path": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Repo-relative path under data/letters/."
+        },
+        "sha256": {
+          "type": "string",
+          "pattern": "^[a-f0-9]{64}$"
+        },
+        "mime_type": {
+          "type": "string",
+          "enum": ["image/png", "image/jpeg", "image/webp", "image/tiff"]
+        },
+        "bytes": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "width_px": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "height_px": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "background": {
+          "type": "string",
+          "enum": [
+            "original",
+            "white",
+            "black",
+            "gray",
+            "binarized",
+            "transparent"
+          ],
+          "description": "Background of the crop. `original` preserves source pixels; named colors mean the background has been bleached to that color; `binarized` is a 1-bit map; `transparent` requires an alpha channel (PNG/WebP/TIFF only — enforced by the schema)."
+        }
+      }
+    },
+    "extraction": {
+      "type": "object",
+      "required": [
+        "tool",
+        "tool_version",
+        "method",
+        "extracted_at",
+        "extracted_by",
+        "notes"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "tool": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Typically 'hletterscriptgen' or a sub-tool name."
+        },
+        "tool_version": {
+          "type": "string",
+          "pattern": "^v?[0-9]+\\.[0-9]+\\.[0-9]+(?:-[A-Za-z0-9.-]+)?(?:\\+[A-Za-z0-9.-]+)?$",
+          "description": "SemVer or `git describe --tags` output. Accepts prerelease (-rc1), build metadata (+build.5), and git-describe distance/hash (-3-gabc1234)."
+        },
+        "method": {
+          "type": "string",
+          "enum": ["manual", "auto", "mixed"]
+        },
+        "extracted_at": {
+          "type": "string",
+          "format": "date-time"
+        },
+        "extracted_by": {
+          "type": "string",
+          "minLength": 1
+        },
+        "notes": {
+          "type": ["string", "null"]
+        }
+      }
+    },
+    "rights": {
+      "type": "object",
+      "description": "Inherited from the upstream scan entry; the crop is a derivative. The validator cross-checks `rights_basis` against `license_expression` via LICENSE_BASIS_MAP.",
+      "required": [
+        "rights_basis",
+        "license_expression",
+        "commercial_use_allowed",
+        "derivatives_allowed",
+        "redistribution_allowed",
+        "attribution_required",
+        "attribution_text",
+        "attribution_url",
+        "verification_status",
+        "evidence_text",
+        "verified_at"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "rights_basis": {
+          "type": "string",
+          "enum": ["public_domain", "cc0", "cc_by", "cc_by_sa", "unknown"]
+        },
+        "license_expression": {
+          "type": ["string", "null"]
+        },
+        "commercial_use_allowed": {
+          "type": ["boolean", "null"]
+        },
+        "derivatives_allowed": {
+          "type": ["boolean", "null"]
+        },
+        "redistribution_allowed": {
+          "type": ["boolean", "null"]
+        },
+        "attribution_required": {
+          "type": ["boolean", "null"]
+        },
+        "attribution_text": {
+          "type": ["string", "null"],
+          "minLength": 1
+        },
+        "attribution_url": {
+          "type": ["string", "null"],
+          "format": "uri"
+        },
+        "verification_status": {
+          "type": "string",
+          "enum": [
+            "unverified",
+            "source_note_only",
+            "inherited_from_upstream",
+            "primary_page_checked",
+            "conflicting",
+            "rejected"
+          ]
+        },
+        "evidence_text": {
+          "type": ["string", "null"]
+        },
+        "verified_at": {
+          "type": ["string", "null"],
+          "format": "date"
+        }
+      }
+    },
+    "quality": {
+      "type": "object",
+      "required": [
+        "usable_for_htr",
+        "usable_for_syngen",
+        "legibility",
+        "exclusion_reasons"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "usable_for_htr": {
+          "type": ["boolean", "null"],
+          "description": "Suitable as ground truth for handwritten text recognition training."
+        },
+        "usable_for_syngen": {
+          "type": ["boolean", "null"],
+          "description": "Suitable as a glyph for HeOCR/hocrsyngen synthetic-document generation."
+        },
+        "legibility": {
+          "type": "string",
+          "enum": ["high", "medium", "low", "unknown"]
+        },
+        "exclusion_reasons": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "notes": {
+          "type": ["string", "null"]
+        }
+      }
+    }
+  }
+}
diff --git a/schemas/writer.schema.json b/schemas/writer.schema.json
new file mode 100644
index 0000000..abc6b3b
--- /dev/null
+++ b/schemas/writer.schema.json
@@ -0,0 +1,163 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://github.com/HeOCR/hletterscript/schemas/writer.schema.json",
+  "title": "Handwritten Hebrew Letter-Image Writer",
+  "description": "One row per writer/scribe. Each writer defines a 'set' of per-letter image entries attributed to that person.",
+  "type": "object",
+  "required": [
+    "writer_id",
+    "status",
+    "display_name",
+    "also_known_as",
+    "description",
+    "dates",
+    "languages_written",
+    "scripts_written",
+    "period",
+    "references",
+    "ingest"
+  ],
+  "additionalProperties": false,
+  "allOf": [
+    {
+      "if": {
+        "properties": {
+          "status": {
+            "enum": ["verified", "rejected"]
+          }
+        },
+        "required": ["status"]
+      },
+      "then": {
+        "properties": {
+          "references": {
+            "minItems": 1
+          }
+        }
+      }
+    }
+  ],
+  "properties": {
+    "writer_id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$",
+      "description": "Stable lowercase slug. On Latin-name collision (e.g. two writers named 'Yosef Haim'), disambiguate by appending the birth year: `yosef_haim_1834`. If the birth year is unknown, use death year; if both are unknown, use the start year of `period`. Document the disambiguation in `ingest.agent_notes`."
+    },
+    "status": {
+      "type": "string",
+      "enum": ["candidate", "verified", "rejected", "needs_review"]
+    },
+    "display_name": {
+      "type": "string",
+      "minLength": 1
+    },
+    "also_known_as": {
+      "type": "array",
+      "items": {
+        "type": "string",
+        "minLength": 1
+      }
+    },
+    "description": {
+      "type": ["string", "null"]
+    },
+    "dates": {
+      "type": "object",
+      "required": [
+        "birth_year",
+        "birth_precision",
+        "death_year",
+        "death_precision"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "birth_year": {
+          "type": ["integer", "null"]
+        },
+        "birth_precision": {
+          "type": "string",
+          "enum": ["exact", "circa", "decade", "unknown"]
+        },
+        "death_year": {
+          "type": ["integer", "null"]
+        },
+        "death_precision": {
+          "type": "string",
+          "enum": ["exact", "circa", "decade", "alive", "unknown"]
+        }
+      }
+    },
+    "languages_written": {
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "string",
+        "minLength": 2
+      }
+    },
+    "scripts_written": {
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "string",
+        "enum": ["Hebr", "Latn", "Arab", "Cyrl", "unknown"]
+      }
+    },
+    "period": {
+      "type": "object",
+      "required": ["start", "end", "precision"],
+      "additionalProperties": false,
+      "properties": {
+        "start": {
+          "type": ["string", "null"]
+        },
+        "end": {
+          "type": ["string", "null"]
+        },
+        "precision": {
+          "type": "string",
+          "enum": ["day", "month", "year", "decade", "range", "circa", "unknown"]
+        }
+      }
+    },
+    "references": {
+      "type": "array",
+      "description": "Biographical evidence. `candidate` and `needs_review` writers may have zero references; `verified` and `rejected` writers must have at least one (enforced by the conditional at the top of this schema).",
+      "items": {
+        "type": "object",
+        "required": ["kind", "citation"],
+        "additionalProperties": false,
+        "properties": {
+          "kind": {
+            "type": "string",
+            "enum": ["repo_note", "primary_url", "secondary_url", "authority_record", "agent_assessment"]
+          },
+          "citation": {
+            "type": "string",
+            "minLength": 1
+          },
+          "quote": {
+            "type": ["string", "null"]
+          },
+          "url": {
+            "type": ["string", "null"],
+            "format": "uri"
+          }
+        }
+      }
+    },
+    "ingest": {
+      "type": "object",
+      "required": ["agent_notes", "blocked_reason"],
+      "additionalProperties": false,
+      "properties": {
+        "agent_notes": {
+          "type": ["string", "null"]
+        },
+        "blocked_reason": {
+          "type": ["string", "null"]
+        }
+      }
+    }
+  }
+}
diff --git a/scripts/generate_release_artifacts.py b/scripts/generate_release_artifacts.py
new file mode 100644
index 0000000..a168d3f
--- /dev/null
+++ b/scripts/generate_release_artifacts.py
@@ -0,0 +1,590 @@
+#!/usr/bin/env python3
+"""Generate deterministic release artefacts from data/index/*.jsonl.
+
+Emits three files at the repo root:
+
+  - NOTICE.md         human-readable attribution roll-up.
+  - CITATION.cff      Citation File Format 1.2.0.
+  - datapackage.json  Frictionless Data Package manifest.
+
+The script is fully deterministic: same indexes + recipe in,
+byte-identical files out. No datetime.now(), no random ordering, no
+UUIDs.
+
+Two timestamps with deliberately different semantics:
+
+  - `datapackage.json::released_at` = max(extraction.extracted_at)
+    across entries — the *corpus-state timestamp*. Bumps on every
+    ingest PR. When the corpus is empty it falls back to
+    `release_recipe.json::initial_release_date`.
+
+  - `CITATION.cff::date-released` = `release_recipe.json::version_released_date`
+    — the date this `version` was released. Stable per release. Bumped
+    manually on `version` bump (see docs/release_process.md). This is
+    what citations should be reproducible against.
+
+Use `--check` to verify the on-disk artefacts match what would be
+generated without touching the tree.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+try:
+    import yaml
+except ImportError as exc:  # pragma: no cover - exercised when deps are absent.
+    raise SystemExit(
+        "Missing dependency: PyYAML. Install development dependencies with "
+        "`python3 -m pip install -r requirements-dev.txt`."
+    ) from exc
+
+
+# Repository root: the parent of the directory that contains this script.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+# Inputs: the two JSONL indexes and the release recipe.
+WRITERS_PATH = REPO_ROOT / "data" / "index" / "writers.jsonl"
+ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl"
+RECIPE_PATH = REPO_ROOT / "scripts" / "release_recipe.json"
+# Outputs: the three generated artefacts, all at the repository root.
+NOTICE_PATH = REPO_ROOT / "NOTICE.md"
+CITATION_PATH = REPO_ROOT / "CITATION.cff"
+DATAPACKAGE_PATH = REPO_ROOT / "datapackage.json"
+
+# SPDX identifiers of the licenses whose terms require attribution.
+# Membership in this set drives both NOTICE.md inclusion and the
+# attribution consistency check below. Keep in sync with
+# scripts/validate_indexes.py::LICENSE_BASIS_MAP and the inheritance
+# table in docs/dataset_structure.md.
+ATTRIBUTION_REQUIRING_LICENSES: frozenset[str] = frozenset({
+    "CC-BY-4.0",
+    "CC-BY-SA-4.0",
+})
+
+
+def _load_jsonl(path: Path) -> list[dict[str, Any]]:
+    if not path.exists():
+        raise SystemExit(f"{path}: file does not exist")
+    rows: list[dict[str, Any]] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for line_number, line in enumerate(handle, start=1):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            try:
+                rows.append(json.loads(stripped))
+            except json.JSONDecodeError as exc:
+                raise SystemExit(f"{path}:{line_number}: invalid JSON: {exc}") from exc
+    return rows
+
+
+def _load_recipe(path: Path) -> dict[str, Any]:
+    if not path.exists():
+        raise SystemExit(f"{path}: file does not exist")
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"{path}: invalid JSON: {exc}") from exc
+
+
+def _derive_released_at(entries: list[dict[str, Any]], recipe: dict[str, Any]) -> str:
+    """Corpus-state timestamp for datapackage.json. Bumps every ingest."""
+    extracted = [
+        entry["extraction"]["extracted_at"]
+        for entry in entries
+        if entry.get("extraction") and entry["extraction"].get("extracted_at")
+    ]
+    if extracted:
+        return max(extracted)
+    # Empty corpus: initial-setup state. Fall back to the recipe's
+    # `initial_release_date` so generation is deterministic.
+    initial = recipe.get("initial_release_date")
+    if not isinstance(initial, str) or not initial:
+        raise SystemExit(
+            "no extraction.extracted_at values found in entries.jsonl, and "
+            "release_recipe.json has no initial_release_date fallback"
+        )
+    return initial
+
+
+def _resolve_citation_date(recipe: dict[str, Any]) -> str:
+    """Citation date for CITATION.cff. Stable per version."""
+    date = recipe.get("version_released_date")
+    if not isinstance(date, str) or not date:
+        raise SystemExit(
+            "release_recipe.json::version_released_date is missing; this is "
+            "the stable per-version release date used by CITATION.cff. Set it "
+            "when you bump `version` (see docs/release_process.md)."
+        )
+    return date
+
+
+def _license_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]:
+    counts = Counter(entry["rights"]["license_expression"] for entry in entries)
+    return {key: counts[key] for key in sorted(counts, key=lambda k: (k is None, k))}
+
+
+def _writer_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]:
+    counts = Counter(entry["writer_id"] for entry in entries)
+    return {key: counts[key] for key in sorted(counts)}
+
+
+def _letter_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]:
+    counts = Counter(entry["letter"]["name"] for entry in entries)
+    return {key: counts[key] for key in sorted(counts)}
+
+
+def _image_byte_count(entries: list[dict[str, Any]]) -> int:
+    total = 0
+    for entry in entries:
+        byte_size = entry["image"].get("bytes")
+        if isinstance(byte_size, int):
+            total += byte_size
+    return total
+
+
+def _check_attribution_consistency(entries: list[dict[str, Any]]) -> None:
+    # Any entry whose license demands attribution must carry the flag,
+    # text, and url. The schema enforces text+url *given* the flag; this
+    # layer catches the prior failure mode of "license is CC-BY-SA but
+    # ingester forgot the flag", which would silently drop the entry
+    # from NOTICE.md.
+    for entry in entries:
+        rights = entry["rights"]
+        license_expr = rights.get("license_expression")
+        if license_expr in ATTRIBUTION_REQUIRING_LICENSES:
+            if rights.get("attribution_required") is not True:
+                raise SystemExit(
+                    f"{entry['entry_id']}: license {license_expr} requires "
+                    f"rights.attribution_required: true (found "
+                    f"{rights.get('attribution_required')!r})"
+                )
+            for field in ("attribution_text", "attribution_url"):
+                value = rights.get(field)
+                if not isinstance(value, str) or not value.strip():
+                    raise SystemExit(
+                        f"{entry['entry_id']}: license {license_expr} requires "
+                        f"rights.{field}, but it is null, blank, or "
+                        f"whitespace-only"
+                    )
+
+
+def _attribution_entries(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    selected = [
+        entry
+        for entry in entries
+        if entry["rights"].get("license_expression") in ATTRIBUTION_REQUIRING_LICENSES
+    ]
+    return sorted(selected, key=lambda entry: entry["entry_id"])
+
+
+def _notice_stanza(
+    entry: dict[str, Any],
+    recipe: dict[str, Any],
+    upstream_repo_url: str,
+) -> str:
+    license_names: dict[str, str] = recipe["license_names"]
+    license_urls: dict[str, str] = recipe["license_urls"]
+    rights = entry["rights"]
+    license_expr = rights["license_expression"]
+    license_name = license_names.get(license_expr, license_expr)
+    license_url = license_urls.get(license_expr)
+
+    if license_url:
+        license_line = f"- License: [{license_name} ({license_expr})]({license_url})"
+    else:
+        license_line = f"- License: {license_name} ({license_expr})"
+
+    letter = entry["letter"]
+    title = (
+        f"{letter['unicode_char']} ({letter['name']}, {letter['form']}) "
+        f"by writer `{entry['writer_id']}`"
+    )
+
+    upstream = entry["upstream"]
+    # `upstream.commit` is enforced as a 40-char SHA by the schema; this
+    # URL form always resolves on GitHub. Never use `release_tag` here —
+    # tags are mutable and the link must outlive tag re-pointing.
+    upstream_link = (
+        f"{upstream_repo_url}/blob/{upstream['commit']}/data/index/entries.jsonl"
+    )
+
+    lines = [
+        f"### {title}",
+        "",
+        f"- Entry: `{entry['entry_id']}`",
+        license_line,
+        f"- Licensor: {rights['attribution_text']}",
+        f"- Source page: <{rights['attribution_url']}>",
+        f"- Upstream scan entry: `{upstream['entry_id']}` "
+        f"(<{upstream_link}>)",
+    ]
+    return "\n".join(lines)
+
+
+# Skeleton for NOTICE.md, filled in by build_notice() via str.format().
+# Placeholders: {upstream_repo_url}, {license_set}, {version},
+# {released_at}, {stanzas}. A trailing backslash inside the literal joins
+# the next source line, so each paragraph is emitted as one long line.
+NOTICE_TEMPLATE = """\
+# NOTICE
+
+This file is generated by `scripts/generate_release_artifacts.py` from \
+`data/index/entries.jsonl`. Do not edit by hand.
+
+Repository-authored metadata is dedicated to the public domain under \
+CC0 1.0 Universal. See [`LICENSE`](LICENSE) and [`LICENSE.md`](LICENSE.md) \
+for the full compound-licensing policy.
+
+Per-letter image crops are derivatives of upstream scans in \
+[HeOCR/public-domain-hand-written-hebrew-scans]({upstream_repo_url}) and \
+carry per-entry rights inherited from the source page. The entries \
+listed below carry a license that requires attribution (currently \
+{license_set}). Anyone redistributing or reusing these crops must keep \
+the listed credit and link to the source page on which the rights claim \
+was verified.
+
+- Corpus release: `{version}`
+- Released at (corpus state): `{released_at}`
+
+## Attribution-required entries
+
+{stanzas}
+
+## Full per-entry rights
+
+Every entry, attribution-required or not, ships with its rights record in \
+[`data/index/entries.jsonl`](data/index/entries.jsonl). Consumers that \
+need machine-readable rights metadata should read that file directly; the \
+manifest at [`datapackage.json`](datapackage.json) summarises the license \
+breakdown.
+"""
+
+
+def build_notice(
+    entries: list[dict[str, Any]],
+    recipe: dict[str, Any],
+    released_at: str,
+    upstream_repo_url: str,
+) -> str:
+    required = _attribution_entries(entries)
+    if required:
+        stanzas = "\n\n".join(
+            _notice_stanza(entry, recipe, upstream_repo_url) for entry in required
+        )
+    else:
+        stanzas = "_No entries in this release require attribution._"
+
+    license_set = ", ".join(sorted(ATTRIBUTION_REQUIRING_LICENSES))
+    return NOTICE_TEMPLATE.format(
+        license_set=license_set,
+        version=recipe["version"],
+        released_at=released_at,
+        stanzas=stanzas,
+        upstream_repo_url=upstream_repo_url,
+    )
+
+
+def build_citation(
+    entries: list[dict[str, Any]],
+    writers: list[dict[str, Any]],
+    recipe: dict[str, Any],
+    citation_date: str,
+) -> str:
+    license_counts = _license_breakdown(entries)
+    if license_counts:
+        breakdown_summary = ", ".join(
+            f"{count} {license_id}" for license_id, count in license_counts.items()
+        )
+        entry_writer_count = len({entry["writer_id"] for entry in entries})
+        abstract = (
+            f"{recipe['description']} Release {recipe['version']} contains "
+            f"{len(entries)} per-letter image entries drawn from "
+            f"{entry_writer_count} verified writers ({breakdown_summary})."
+        )
+    else:
+        abstract = (
+            f"{recipe['description']} Release {recipe['version']} is the "
+            f"initial-setup release: the corpus contains no per-letter image "
+            f"entries yet. The repository ships the schemas, validation "
+            f"tooling, CI, and licensing policy needed to start ingesting."
+        )
+
+    document: dict[str, Any] = {
+        "cff-version": "1.2.0",
+        "message": "Please cite this dataset using the metadata below.",
+        "type": "dataset",
+        "title": recipe["title"],
+        "abstract": abstract,
+        "authors": [{"name": author["name"]} for author in recipe["authors"]],
+        "version": recipe["version"],
+        "date-released": citation_date,
+        "repository-code": recipe["repository_code"],
+        "url": recipe["homepage"],
+        "license": recipe["metadata_license"]["spdx"],
+        "keywords": sorted(recipe["keywords"]),
+    }
+    identifiers = recipe.get("citation_identifiers") or []
+    if identifiers:
+        document["identifiers"] = identifiers
+
+    header = "# Generated by scripts/generate_release_artifacts.py. Do not edit by hand.\n"
+    body = yaml.safe_dump(
+        document,
+        default_flow_style=False,
+        sort_keys=False,
+        allow_unicode=True,
+        width=10_000,
+    )
+    return header + body
+
+
+def build_datapackage(
+    entries: list[dict[str, Any]],
+    writers: list[dict[str, Any]],
+    recipe: dict[str, Any],
+    released_at: str,
+    citation_date: str,
+    entries_path: Path,
+    writers_path: Path,
+) -> dict[str, Any]:
+    license_names: dict[str, str] = recipe["license_names"]
+    license_urls: dict[str, str] = recipe["license_urls"]
+    license_counts = _license_breakdown(entries)
+    writer_status_counts = Counter(writer.get("status") for writer in writers)  # key is None when "status" is absent
+    writer_status_breakdown = {
+        key: writer_status_counts[key]
+        for key in sorted(writer_status_counts)
+        if key is not None  # writers without a status are left out of the breakdown
+    }
+
+    license_listings: list[dict[str, Any]] = []
+    license_listings.append({  # the metadata license is always listed first
+        "name": recipe["metadata_license"]["spdx"],
+        "path": recipe["metadata_license"]["url"],
+        "title": license_names.get(
+            recipe["metadata_license"]["spdx"], recipe["metadata_license"]["spdx"]
+        ),
+        "scope": "metadata",
+    })
+    for license_id in sorted(k for k in license_counts if k is not None):  # image licenses, sorted by id
+        listing: dict[str, Any] = {
+            "name": license_id,
+            "title": license_names.get(license_id, license_id),  # falls back to the bare id
+            "scope": "images",
+        }
+        url = license_urls.get(license_id)
+        if url:  # "path" is omitted when the recipe has no URL for this id
+            listing["path"] = url
+        license_listings.append(listing)
+
+    resource_path_for: dict[str, Path] = {
+        "entries": entries_path,
+        "writers": writers_path,
+    }
+    resource_records_for: dict[str, int] = {
+        "entries": len(entries),
+        "writers": len(writers),
+    }
+
+    resources: list[dict[str, Any]] = []
+    for name in sorted(recipe["resources"]):  # sorted for a stable resource order
+        spec = recipe["resources"][name]
+        # Note: no `schema` field. Frictionless reserves
+        # `resources[].schema` for Table Schema (column definitions), but
+        # our data is nested JSON validated against JSON Schema. We
+        # expose the JSON Schema URLs via the top-level `schemas` block
+        # as a custom extension instead.
+        resources.append({
+            "name": name,
+            "path": spec["path"],
+            "profile": "data-resource",
+            "format": spec["format"],
+            "mediatype": spec["mediatype"],
+            "encoding": spec["encoding"],
+            "description": spec["description"],
+            "record_count": resource_records_for[name],  # KeyError for any resource other than entries/writers
+            "bytes": resource_path_for[name].stat().st_size,  # size of the index file as it is on disk now
+        })
+
+    return {
+        "profile": "data-package",
+        "name": recipe["name"],
+        "title": recipe["title"],
+        "description": recipe["description"],
+        "version": recipe["version"],
+        "version_released_date": citation_date,
+        "released_at": released_at,
+        "homepage": recipe["homepage"],
+        "upstream_repo": recipe["upstream_repo"],
+        "keywords": sorted(recipe["keywords"]),  # sorted regardless of recipe order
+        "contributors": [
+            {"title": author["name"], "role": author.get("role", "author")}  # role defaults to "author"
+            for author in recipe["authors"]
+        ],
+        "licenses": license_listings,
+        "schemas": {
+            "writer": recipe["schema_urls"]["writer"],
+            "entry": recipe["schema_urls"]["entry"],
+        },
+        "stats": {
+            "record_count": len(entries),
+            "entry_writer_count": len({entry["writer_id"] for entry in entries}),  # distinct writers referenced by entries
+            "writer_record_count": len(writers),
+            "writer_status_breakdown": writer_status_breakdown,
+            "image_byte_count": _image_byte_count(entries),
+            "attribution_required_count": len(_attribution_entries(entries)),
+            "license_breakdown": license_counts,
+            "letter_breakdown": _letter_breakdown(entries),
+            "writer_breakdown": _writer_breakdown(entries),
+        },
+        "resources": resources,
+    }
+
+
+def _serialise_text(text: str) -> str:
+    return text + ("" if text.endswith("\n") else "\n")  # ensure a trailing newline
+
+
+def _serialise_json(data: dict[str, Any]) -> str:
+    return f"{json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)}\n"
+
+
+def _require_recipe_fields(recipe: dict[str, Any]) -> None:
+    """Abort with SystemExit if the recipe lacks any mandatory top-level key."""
+    mandatory = (
+        "name", "title", "version", "version_released_date",
+        "description", "homepage", "repository_code", "upstream_repo",
+        "authors", "keywords", "metadata_license",
+        "license_urls", "license_names", "schema_urls", "resources",
+    )
+    absent = ", ".join(key for key in mandatory if key not in recipe)
+    if not absent:
+        return
+    raise SystemExit(f"release_recipe.json missing required field(s): {absent}")
+
+
+def _render(
+    writers_path: Path,
+    entries_path: Path,
+    recipe_path: Path,
+) -> dict[str, str]:
+    """Load the indexes and the recipe, then render every release artefact as text.
+
+    Returns a dict keyed "notice", "citation" and "datapackage".
+    """
+    writers = _load_jsonl(writers_path)
+    entries = _load_jsonl(entries_path)
+    recipe = _load_recipe(recipe_path)
+    _require_recipe_fields(recipe)
+    _check_attribution_consistency(entries)
+    released_at = _derive_released_at(entries, recipe)
+    citation_date = _resolve_citation_date(recipe)
+
+    notice = build_notice(entries, recipe, released_at, recipe["upstream_repo"])
+    citation = build_citation(entries, writers, recipe, citation_date)
+    package = build_datapackage(
+        entries, writers, recipe, released_at, citation_date,
+        entries_path=entries_path, writers_path=writers_path,
+    )
+    return {
+        "notice": _serialise_text(notice),
+        "citation": _serialise_text(citation),
+        "datapackage": _serialise_json(package),
+    }
+
+
+def generate(
+    writers_path: Path = WRITERS_PATH,
+    entries_path: Path = ENTRIES_PATH,
+    recipe_path: Path = RECIPE_PATH,
+    notice_path: Path = NOTICE_PATH,
+    citation_path: Path = CITATION_PATH,
+    datapackage_path: Path = DATAPACKAGE_PATH,
+) -> dict[str, Path]:
+    """Render the release artefacts, write each as UTF-8, and return their paths."""
+    rendered = _render(writers_path, entries_path, recipe_path)
+    # The returned mapping doubles as the write plan: artefact kind -> destination.
+    targets = {
+        "notice": notice_path, "citation": citation_path, "datapackage": datapackage_path,
+    }
+    for kind, target in targets.items():
+        target.write_text(rendered[kind], encoding="utf-8")
+    return targets
+
+
+def check(
+    writers_path: Path = WRITERS_PATH,
+    entries_path: Path = ENTRIES_PATH,
+    recipe_path: Path = RECIPE_PATH,
+    notice_path: Path = NOTICE_PATH,
+    citation_path: Path = CITATION_PATH,
+    datapackage_path: Path = DATAPACKAGE_PATH,
+) -> list[Path]:
+    """Return the artefact paths whose on-disk text differs from a fresh render.
+
+    A missing file is compared as if it were empty.
+    """
+    rendered = _render(writers_path, entries_path, recipe_path)
+    on_disk = {
+        "notice": notice_path, "citation": citation_path, "datapackage": datapackage_path,
+    }
+    def current(path: Path) -> str:
+        return path.read_text(encoding="utf-8") if path.exists() else ""
+    return [path for kind, path in on_disk.items() if current(path) != rendered[kind]]
+
+
+def main() -> None:
+    """Command-line entry point.
+
+    Without --check, regenerate the artefacts; with --check, exit 1 if any is stale.
+    """
+    parser = argparse.ArgumentParser()
+    for flag, default in (
+        ("--writers", WRITERS_PATH),
+        ("--entries", ENTRIES_PATH),
+        ("--recipe", RECIPE_PATH),
+        ("--notice", NOTICE_PATH),
+        ("--citation", CITATION_PATH),
+        ("--datapackage", DATAPACKAGE_PATH),
+    ):
+        parser.add_argument(flag, type=Path, default=default)
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Verify on-disk artefacts match what would be generated. Exit 1 if not.",
+    )
+    args = parser.parse_args()
+    paths = {
+        "writers_path": args.writers,
+        "entries_path": args.entries,
+        "recipe_path": args.recipe,
+        "notice_path": args.notice,
+        "citation_path": args.citation,
+        "datapackage_path": args.datapackage,
+    }
+
+    def _display(path: Path) -> Path:
+        # Prefer a repo-relative path; fall back to the path as given.
+        try:
+            return path.relative_to(REPO_ROOT)
+        except ValueError:
+            return path
+
+    if not args.check:
+        for label, path in generate(**paths).items():
+            print(f"wrote {label}: {_display(path)}")
+        return
+
+    stale = check(**paths)
+    if not stale:
+        print("ok: release artefacts are up to date")
+        return
+    for path in stale:
+        print(f"stale: {_display(path)}", file=sys.stderr)
+    print(
+        "Run `python3 scripts/generate_release_artifacts.py` to regenerate.",
+        file=sys.stderr,
+    )
+    raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/release_recipe.json b/scripts/release_recipe.json
new file mode 100644
index 0000000..6cf2eb1
--- /dev/null
+++ b/scripts/release_recipe.json
@@ -0,0 +1,63 @@
+{
+  "name": "hletterscript",
+  "title": "Hebrew Handwritten Per-Letter Image Dataset",
+  "version": "0.0.0-rc",
+  "version_released_date": "2026-05-12",
+  "initial_release_date": "2026-05-12T00:00:00Z",
+  "description": "Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL).",
+  "homepage": "https://github.com/HeOCR/hletterscript",
+  "repository_code": "https://github.com/HeOCR/hletterscript",
+  "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans",
+  "authors": [
+    {"name": "Shay Palachy-Affek", "role": "maintainer"}
+  ],
+  "keywords": [
+    "Hebrew",
+    "dataset",
+    "handwriting",
+    "handwritten-text-recognition",
+    "letters",
+    "glyphs",
+    "synthetic-generation",
+    "public-domain"
+  ],
+  "metadata_license": {
+    "spdx": "CC0-1.0",
+    "url": "https://creativecommons.org/publicdomain/zero/1.0/"
+  },
+  "license_urls": {
+    "PDM-1.0": "https://creativecommons.org/publicdomain/mark/1.0/",
+    "CC0-1.0": "https://creativecommons.org/publicdomain/zero/1.0/",
+    "CC-BY-4.0": "https://creativecommons.org/licenses/by/4.0/",
+    "CC-BY-SA-4.0": "https://creativecommons.org/licenses/by-sa/4.0/"
+  },
+  "license_names": {
+    "PDM-1.0": "Public Domain Mark 1.0",
+    "CC0-1.0": "Creative Commons Zero v1.0 Universal",
+    "CC-BY-4.0": "Creative Commons Attribution 4.0 International",
+    "CC-BY-SA-4.0": "Creative Commons Attribution-ShareAlike 4.0 International",
+    "LicenseRef-Public-Domain-Israel": "Public Domain (Israel; life + 70)",
+    "LicenseRef-Public-Domain-Ukraine": "Public Domain (Ukraine; life + 70)"
+  },
+  "schema_urls": {
+    "writer": "https://github.com/HeOCR/hletterscript/blob/main/schemas/writer.schema.json",
+    "entry": "https://github.com/HeOCR/hletterscript/blob/main/schemas/entry.schema.json"
+  },
+  "citation_identifiers": [],
+  "resources": {
+    "entries": {
+      "path": "data/index/entries.jsonl",
+      "format": "jsonl",
+      "mediatype": "application/x-ndjson",
+      "encoding": "utf-8",
+      "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights."
+    },
+    "writers": {
+      "path": "data/index/writers.jsonl",
+      "format": "jsonl",
+      "mediatype": "application/x-ndjson",
+      "encoding": "utf-8",
+      "description": "Writer-level catalog. One JSON object per writer; each writer defines a 'set' of letter images."
+    }
+  }
+}
diff --git a/scripts/validate_indexes.py b/scripts/validate_indexes.py
new file mode 100644
index 0000000..ef001d6
--- /dev/null
+++ b/scripts/validate_indexes.py
@@ -0,0 +1,532 @@
+#!/usr/bin/env python3
+"""Validate the JSONL dataset indexes against their JSON Schemas.
+
+Validates writers.jsonl + entries.jsonl, enforces referential integrity
+between them, checks Hebrew-letter codepoint/name/form consistency,
+cross-checks `rights_basis` against `license_expression`, and
+re-verifies image file checksums and sizes on disk. With
+`--upstream-path` it also cross-checks each entry's
+`upstream.sha256` and `upstream.bbox` against the live upstream
+dataset.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    from jsonschema import Draft202012Validator, FormatChecker
+    from jsonschema.exceptions import SchemaError
+except ImportError as exc:  # pragma: no cover - exercised when deps are absent.
+    raise SystemExit(
+        "Missing dependency: jsonschema. Install development dependencies with "
+        "`python3 -m pip install -r requirements-dev.txt`."
+    ) from exc
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+WRITERS_PATH = REPO_ROOT / "data" / "index" / "writers.jsonl"
+ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl"
+WRITER_SCHEMA_PATH = REPO_ROOT / "schemas" / "writer.schema.json"
+ENTRY_SCHEMA_PATH = REPO_ROOT / "schemas" / "entry.schema.json"
+
+# Canonical Hebrew letter table. Mirrors docs/letters.md and the
+# `letter.name` enum in schemas/entry.schema.json. The validator uses
+# this table to enforce cross-field consistency on the `letter` block.
+LETTER_TABLE: list[tuple[str, str, str, str]] = [  # rows are (codepoint, unicode_char, name, form)
+    ("U+05D0", "א", "alef", "regular"),
+    ("U+05D1", "ב", "bet", "regular"),
+    ("U+05D2", "ג", "gimel", "regular"),
+    ("U+05D3", "ד", "dalet", "regular"),
+    ("U+05D4", "ה", "he", "regular"),
+    ("U+05D5", "ו", "vav", "regular"),
+    ("U+05D6", "ז", "zayin", "regular"),
+    ("U+05D7", "ח", "het", "regular"),
+    ("U+05D8", "ט", "tet", "regular"),
+    ("U+05D9", "י", "yod", "regular"),
+    ("U+05DA", "ך", "kaf_final", "final"),
+    ("U+05DB", "כ", "kaf", "regular"),
+    ("U+05DC", "ל", "lamed", "regular"),
+    ("U+05DD", "ם", "mem_final", "final"),
+    ("U+05DE", "מ", "mem", "regular"),
+    ("U+05DF", "ן", "nun_final", "final"),
+    ("U+05E0", "נ", "nun", "regular"),
+    ("U+05E1", "ס", "samekh", "regular"),
+    ("U+05E2", "ע", "ayin", "regular"),
+    ("U+05E3", "ף", "pe_final", "final"),
+    ("U+05E4", "פ", "pe", "regular"),
+    ("U+05E5", "ץ", "tsadi_final", "final"),
+    ("U+05E6", "צ", "tsadi", "regular"),
+    ("U+05E7", "ק", "qof", "regular"),
+    ("U+05E8", "ר", "resh", "regular"),
+    ("U+05E9", "ש", "shin", "regular"),
+    ("U+05EA", "ת", "tav", "regular"),
+]
+LETTER_BY_NAME: dict[str, tuple[str, str, str, str]] = {row[2]: row for row in LETTER_TABLE}  # keyed by name
+
+# Permitted file extensions per `image.mime_type`. The first entry is
+# the preferred extension; subsequent ones are accepted aliases.
+MIME_EXTENSIONS: dict[str, tuple[str, ...]] = {
+    "image/png": (".png",),
+    "image/jpeg": (".jpg", ".jpeg"),
+    "image/webp": (".webp",),
+    "image/tiff": (".tif", ".tiff"),
+}
+
+# Canonical map from `license_expression` to `rights_basis`. The
+# validator hard-fails if an entry's pair doesn't match this map. Adding
+# a new accepted license means adding it here AND to AGENTS.md AND to
+# scripts/release_recipe.json::license_names + license_urls.
+LICENSE_BASIS_MAP: dict[str, str] = {
+    "CC0-1.0": "cc0",
+    "PDM-1.0": "public_domain",
+    "CC-BY-4.0": "cc_by",
+    "CC-BY-SA-4.0": "cc_by_sa",
+    "LicenseRef-Public-Domain-Israel": "public_domain",
+    "LicenseRef-Public-Domain-Ukraine": "public_domain",
+}
+
+
+def _err(file: Path | None, line: int | None, row_id: str | None, message: str) -> str:
+    """Format a uniform diagnostic: ``<file>:<line>: <id>: <message>``.
+
+    A None part (or empty ``row_id``) is omitted; ``line`` needs ``file``.
+    """
+    parts: list[str] = []
+    if file is not None:
+        parts.append(str(file) if line is None else f"{file}:{line}")
+    if row_id:
+        parts.append(f"{row_id}")
+    return ": ".join([*parts, message])
+
+
+def load_schema(path: Path) -> dict[str, Any]:
+    """Load a JSON Schema file, exiting via SystemExit if it is missing or invalid."""
+    if not path.exists():
+        raise SystemExit(_err(path, None, None, "file does not exist"))
+    try:
+        schema = json.loads(path.read_text(encoding="utf-8"))
+        Draft202012Validator.check_schema(schema)
+    except json.JSONDecodeError as exc:
+        raise SystemExit(_err(path, None, None, f"invalid JSON schema: {exc}")) from exc
+    except SchemaError as exc:
+        raise SystemExit(_err(path, None, None, f"invalid JSON schema: {exc.message}")) from exc
+    return schema
+
+
+def load_jsonl(
+    path: Path,
+    validator: Draft202012Validator,
+    id_key: str,
+) -> list[dict[str, Any]]:
+    """Parse a JSONL index, schema-validating each row and rejecting duplicate ids.
+
+    Blank lines are skipped. The first problem found ends the run via
+    SystemExit with a ``<file>:<line>: <id>: <message>`` diagnostic.
+    """
+    if not path.exists():
+        raise SystemExit(_err(path, None, None, "file does not exist"))
+
+    def fail(line_number: int, row_id: str | None, message: str) -> SystemExit:
+        return SystemExit(_err(path, line_number, row_id, message))
+
+    rows: list[dict[str, Any]] = []
+    seen: set[str] = set()
+    with path.open("r", encoding="utf-8") as handle:
+        for line_number, line in enumerate(handle, start=1):
+            text = line.strip()
+            if not text:
+                continue
+            try:
+                row = json.loads(text)
+            except json.JSONDecodeError as exc:
+                raise fail(line_number, None, f"invalid JSON: {exc}") from exc
+            if not isinstance(row, dict):
+                raise fail(line_number, None, "row must be a JSON object")
+
+            row_id = row.get(id_key)
+            errors = sorted(validator.iter_errors(row), key=lambda err: list(err.path))
+            if errors:
+                first = errors[0]
+                where = ".".join(str(part) for part in first.path) or "<root>"
+                label = row_id if isinstance(row_id, str) else None
+                raise fail(line_number, label, f"{where}: {first.message}")
+            if not isinstance(row_id, str) or not row_id:
+                raise fail(line_number, None, f"{id_key} must be a non-empty string")
+            if row_id in seen:
+                raise fail(line_number, row_id, f"duplicate {id_key}")
+            seen.add(row_id)
+            rows.append(row)
+    return rows
+
+
+def _check_letter_consistency(
+    entries_path: Path, line: int, entry_id: str, letter: dict[str, Any]
+) -> None:
+    """Cross-check the ``letter`` block against the canonical LETTER_TABLE row.
+
+    ``letter.name`` selects the row; ``codepoint``, ``unicode_char`` and
+    ``form`` must then equal it. Exits via SystemExit on the first mismatch.
+    """
+    name = letter["name"]
+    if name not in LETTER_BY_NAME:
+        # The schema's enum should have caught this already; defensive.
+        raise SystemExit(_err(entries_path, line, entry_id, f"unknown letter.name: {name}"))
+    codepoint, char, _, form = LETTER_BY_NAME[name]
+
+    # Checked in this order; unicode_char is shown with repr(), the rest as-is.
+    for field, expected, show in (
+        ("codepoint", codepoint, str),
+        ("unicode_char", char, repr),
+        ("form", form, str),
+    ):
+        actual = letter[field]
+        if actual != expected:
+            raise SystemExit(_err(
+                entries_path, line, entry_id,
+                f"letter.{field} mismatch for {name}: "
+                f"expected {show(expected)}, got {show(actual)}",
+            ))
+
+
+def _check_upstream_shape(
+    entries_path: Path, line: int, entry_id: str, upstream: dict[str, Any]
+) -> None:
+    """Require upstream.entry_id to begin with ``<upstream.source_id>__p``."""
+    upstream_id, source_id = upstream["entry_id"], upstream["source_id"]
+    if not upstream_id.startswith(f"{source_id}__p"):
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            f"upstream.entry_id ({upstream_id}) must start with "
+            f"upstream.source_id ({source_id}) plus '__p'",
+        ))
+
+
+def _check_local_path(
+    entries_path: Path,
+    line: int,
+    entry_id: str,
+    writer_id: str,
+    letter_name: str,
+    image: dict[str, Any],
+) -> None:
+    """Require image.local_path to be data/letters/<writer>/<letter>/<entry_id><ext>.
+
+    ``<ext>`` must be listed in MIME_EXTENSIONS for image.mime_type.
+    """
+    local_path = image["local_path"]
+    local_path_obj = Path(local_path)
+    if local_path_obj.is_absolute() or ".." in local_path_obj.parts:
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            f"image.local_path must be repo-relative without '..': {local_path}",
+        ))
+    expected_prefix = f"data/letters/{writer_id}/{letter_name}/"
+    if not local_path.startswith(expected_prefix):
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            f"image.local_path must start with {expected_prefix!r}, got {local_path!r}",
+        ))
+    suffix = local_path_obj.suffix.lower()
+    expected_exts = MIME_EXTENSIONS.get(image["mime_type"], ())
+    if suffix not in expected_exts:
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            f"image.local_path extension {suffix!r} does not match "
+            f"image.mime_type {image['mime_type']!r} (allowed: {list(expected_exts)})",
+        ))
+    expected_stem = f"data/letters/{writer_id}/{letter_name}/{entry_id}"
+    # as_posix(): the index always uses '/', but str(Path) yields '\\' on Windows.
+    actual_stem = local_path_obj.with_suffix("").as_posix()
+    if actual_stem != expected_stem:
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            f"image.local_path stem must equal {expected_stem!r}, got {actual_stem!r}",
+        ))
+
+
+def _check_attribution_fields(
+    entries_path: Path, line: int, entry_id: str, rights: dict[str, Any]
+) -> None:
+    """Require attribution text and URL whenever attribution is required.
+
+    No-op unless rights.attribution_required is exactly True. Then attribution_text
+    and attribution_url must each be a non-blank string, checked in that order.
+    """
+    if rights.get("attribution_required") is not True:
+        return
+    for field in ("attribution_text", "attribution_url"):
+        value = rights.get(field)
+        if isinstance(value, str) and value.strip():
+            continue
+        raise SystemExit(_err(
+            entries_path, line, entry_id,
+            "rights.attribution_required is true but "
+            f"rights.{field} is null, blank, or whitespace-only",
+        ))
+
+
+def _check_rights_basis_matches_license(
+    entries_path: Path, line: int, entry_id: str, rights: dict[str, Any]
+) -> None:
+    """Require rights.rights_basis to agree with rights.license_expression.
+
+    Null license needs basis 'unknown'; otherwise LICENSE_BASIS_MAP decides.
+    """
+    def fail(message: str) -> SystemExit:
+        return SystemExit(_err(entries_path, line, entry_id, message))
+    license_expression = rights.get("license_expression")
+    rights_basis = rights.get("rights_basis")
+    if license_expression is None:
+        if rights_basis == "unknown":
+            return
+        raise fail(
+            f"rights.license_expression is null but rights.rights_basis "
+            f"is {rights_basis!r} (expected 'unknown')"
+        )
+    expected_basis = LICENSE_BASIS_MAP.get(license_expression)
+    if expected_basis is None:
+        raise fail(
+            f"rights.license_expression {license_expression!r} is not in "
+            f"the accepted-license map (LICENSE_BASIS_MAP). Update both this "
+            f"validator and AGENTS.md if a new license is being added."
+        )
+    if rights_basis != expected_basis:
+        raise fail(
+            f"rights.rights_basis ({rights_basis!r}) does not match "
+            f"rights.license_expression ({license_expression!r}); expected "
+            f"rights_basis = {expected_basis!r}"
+        )
+
+
+def validate_entries(
+    entries: list[dict[str, Any]],
+    writer_ids: set[str],
+    entries_path: Path,
+) -> None:
+    """Run the referential and cross-field checks on every entry, in list order.
+
+    Per entry: writer_id must be in ``writer_ids``, entry_id must start with
+    ``<writer_id>__<letter.name>__v`` and be unique, then the ``_check_*``
+    helpers run. The first failure exits via SystemExit.
+
+    Diagnostics use the entry's 1-based position in ``entries`` as the line
+    number.
+    """
+    seen_entry_ids: set[str] = set()
+    for line, entry in enumerate(entries, start=1):
+        entry_id = entry["entry_id"]
+        writer_id = entry["writer_id"]
+        letter = entry["letter"]
+        # Shared (file, line, id) prefix for every diagnostic about this entry.
+        where = (entries_path, line, entry_id)
+
+        if writer_id not in writer_ids:
+            raise SystemExit(_err(*where, f"unknown writer_id: {writer_id}"))
+
+        expected_prefix = f"{writer_id}__{letter['name']}__v"
+        if not entry_id.startswith(expected_prefix):
+            raise SystemExit(_err(*where, f"entry_id must start with {expected_prefix!r}"))
+
+        if entry_id in seen_entry_ids:
+            raise SystemExit(_err(*where, "duplicate entry_id"))
+        seen_entry_ids.add(entry_id)
+
+        # Cross-field checks; each helper exits on its own first failure.
+        _check_letter_consistency(*where, letter)
+        _check_upstream_shape(*where, entry["upstream"])
+        _check_local_path(*where, writer_id, letter["name"], entry["image"])
+        _check_attribution_fields(*where, entry["rights"])
+        _check_rights_basis_matches_license(*where, entry["rights"])
+
+
+def _sha256_file(path: Path) -> str:
+    with open(path, mode="rb") as stream:  # binary mode: digest the raw bytes
+        return hashlib.file_digest(stream, "sha256").hexdigest()
+
+
+def validate_entry_files(
+    entries: list[dict[str, Any]],
+    repo_root: Path,
+    entries_path: Path,
+) -> int:
+    """Verify each entry's image file on disk and return how many were verified.
+
+    For every entry, ``repo_root / image.local_path`` must be an existing file
+    whose size equals image.bytes and whose SHA-256 equals image.sha256. The
+    first failure exits via SystemExit.
+    """
+    for line, entry in enumerate(entries, start=1):
+        entry_id, image = entry["entry_id"], entry["image"]
+        local_path = image["local_path"]
+        where = (entries_path, line, entry_id)
+        absolute = repo_root / local_path
+        if not absolute.is_file():
+            raise SystemExit(_err(*where, f"file does not exist: {local_path}"))
+        # Cheap size comparison first; hashing only runs when the size matches.
+        actual_bytes = absolute.stat().st_size
+        if actual_bytes != image["bytes"]:
+            raise SystemExit(_err(
+                *where,
+                f"byte size mismatch for {local_path}: "
+                f"expected {image['bytes']}, got {actual_bytes}",
+            ))
+        actual_sha = _sha256_file(absolute)
+        if actual_sha != image["sha256"]:
+            raise SystemExit(_err(
+                *where,
+                f"sha256 mismatch for {local_path}: "
+                f"expected {image['sha256']}, got {actual_sha}",
+            ))
+    return len(entries)
+
+
+def _load_upstream_entries(upstream_root: Path) -> dict[str, dict[str, Any]]:
+    """Load the upstream entries.jsonl as {entry_id: row}; bad input exits via SystemExit."""
+    source = upstream_root / "data" / "index" / "entries.jsonl"
+    def fail(line_number: int | None, message: str) -> SystemExit:
+        return SystemExit(_err(source, line_number, None, message))
+    if not source.is_file():
+        raise fail(
+            None, "upstream entries.jsonl not found; --upstream-path must point at a clone of "
+            "public-domain-hand-written-hebrew-scans",
+        )
+    by_id: dict[str, dict[str, Any]] = {}
+    with source.open("r", encoding="utf-8") as handle:
+        for line_number, line in enumerate(handle, start=1):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            try:
+                row = json.loads(stripped)
+            except json.JSONDecodeError as exc:
+                raise fail(line_number, f"invalid JSON in upstream entries: {exc}") from exc
+            if not isinstance(row, dict):  # a list/number/string row has no .get()
+                raise fail(line_number, "upstream row must be a JSON object")
+            if isinstance(row.get("entry_id"), str):  # rows without a string id are skipped
+                by_id[row["entry_id"]] = row
+    return by_id
+
+
+def validate_against_upstream(
+    entries: list[dict[str, Any]],
+    upstream_root: Path,
+    entries_path: Path,
+) -> int:
+    """Cross-check `upstream.sha256` and `upstream.bbox` against the live
+    upstream dataset. Returns the number of entries cross-checked. Called
+    only when --upstream-path is set; CI does set it.
+    """
+    upstream_by_id = _load_upstream_entries(upstream_root)  # upstream rows keyed by their entry_id
+    cross_checked = 0
+    for line, entry in enumerate(entries, start=1):  # `line` is the entry's 1-based position in `entries`
+        entry_id = entry["entry_id"]
+        upstream = entry["upstream"]
+        upstream_entry_id = upstream["entry_id"]
+        ref = upstream_by_id.get(upstream_entry_id)
+        if ref is None:
+            raise SystemExit(_err(
+                entries_path, line, entry_id,
+                f"upstream.entry_id {upstream_entry_id!r} not found in "
+                f"{upstream_root}/data/index/entries.jsonl",
+            ))
+
+        # Find the upstream file record whose sha256 matches.
+        files = ref.get("files") or []  # a missing or null "files" means no candidates
+        upstream_sha = upstream["sha256"]
+        matching = next(  # first record with the same sha256, else None
+            (f for f in files if f.get("sha256") == upstream_sha), None
+        )
+        if matching is None:
+            recorded_shas = [f.get("sha256") for f in files if f.get("sha256")]
+            raise SystemExit(_err(
+                entries_path, line, entry_id,
+                f"upstream.sha256 {upstream_sha!r} does not match any file in "
+                f"upstream entry {upstream_entry_id!r}; upstream recorded "
+                f"{recorded_shas}",
+            ))
+
+        width = matching.get("width_px")
+        height = matching.get("height_px")
+        bbox = upstream["bbox"]
+        if isinstance(width, int):  # skipped when upstream recorded no integer width
+            if bbox["x"] + bbox["w"] > width:  # only the right edge is tested here
+                raise SystemExit(_err(
+                    entries_path, line, entry_id,
+                    f"upstream.bbox extends beyond upstream scan width: "
+                    f"x+w = {bbox['x'] + bbox['w']} > width_px = {width}",
+                ))
+        if isinstance(height, int):  # skipped when upstream recorded no integer height
+            if bbox["y"] + bbox["h"] > height:  # only the bottom edge is tested here
+                raise SystemExit(_err(
+                    entries_path, line, entry_id,
+                    f"upstream.bbox extends beyond upstream scan height: "
+                    f"y+h = {bbox['y'] + bbox['h']} > height_px = {height}",
+                ))
+        cross_checked += 1
+    return cross_checked
+
+
+def main() -> None:
+    """Command-line entry point: validate both indexes and print a one-line summary.
+
+    Validation failures exit via SystemExit raised by the helpers called here.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--writers", type=Path, default=WRITERS_PATH)
+    parser.add_argument("--entries", type=Path, default=ENTRIES_PATH)
+    parser.add_argument("--writer-schema", type=Path, default=WRITER_SCHEMA_PATH)
+    parser.add_argument("--entry-schema", type=Path, default=ENTRY_SCHEMA_PATH)
+    parser.add_argument(
+        "--repo-root",
+        type=Path,
+        default=REPO_ROOT,
+        help=(
+            "Repo root used to resolve image.local_path during file-integrity "
+            "checks. Defaults to this repository. Mainly intended for tests "
+            "that need to validate fixture corpora outside the real tree."
+        ),
+    )
+    parser.add_argument(
+        "--upstream-path",
+        type=Path,
+        default=None,
+        help=(
+            "Path to a local clone of HeOCR/public-domain-hand-written-hebrew-scans. "
+            "When set, the validator additionally cross-checks each entry's "
+            "upstream.sha256 against the upstream file record and verifies "
+            "upstream.bbox fits inside the upstream scan dimensions."
+        ),
+    )
+    args = parser.parse_args()
+
+    def _validator(schema_path: Path) -> Draft202012Validator:
+        return Draft202012Validator(load_schema(schema_path), format_checker=FormatChecker())
+
+    # Both schemas are loaded before either index is read.
+    writer_validator = _validator(args.writer_schema)
+    entry_validator = _validator(args.entry_schema)
+    writers = load_jsonl(args.writers, writer_validator, "writer_id")
+    entries = load_jsonl(args.entries, entry_validator, "entry_id")
+    known_writer_ids = {writer["writer_id"] for writer in writers}
+    validate_entries(entries, known_writer_ids, args.entries)
+    verified = validate_entry_files(entries, args.repo_root, args.entries)
+
+    summary = (
+        f"ok: {len(writers)} writers, {len(entries)} entries, "
+        f"{verified} files verified"
+    )
+    # Upstream cross-check is opt-in; its count is appended to the summary.
+    if args.upstream_path is not None:
+        cross_checked = validate_against_upstream(entries, args.upstream_path, args.entries)
+        summary += f", {cross_checked} upstream-cross-checked"
+    print(summary)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_generate_release_artifacts.py b/tests/test_generate_release_artifacts.py
new file mode 100644
index 0000000..1fb2e3d
--- /dev/null
+++ b/tests/test_generate_release_artifacts.py
@@ -0,0 +1,563 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+import yaml
+from frictionless import Package
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+GENERATOR = REPO_ROOT / "scripts" / "generate_release_artifacts.py"
+RECIPE = REPO_ROOT / "scripts" / "release_recipe.json"
+WRITERS = REPO_ROOT / "data" / "index" / "writers.jsonl"
+ENTRIES = REPO_ROOT / "data" / "index" / "entries.jsonl"
+NOTICE = REPO_ROOT / "NOTICE.md"
+CITATION = REPO_ROOT / "CITATION.cff"
+DATAPACKAGE = REPO_ROOT / "datapackage.json"
+
+
+def _load_entries() -> list[dict]:
+    """Return the committed entries index as a list of parsed JSON records."""
+    # JSONL: one document per line; whitespace-only lines are filtered out
+    # before the remaining lines are decoded.
+    rows = filter(str.strip, ENTRIES.read_text(encoding="utf-8").splitlines())
+    return list(map(json.loads, rows))
+
+
+def _load_writers() -> list[dict]:
+    """Return the committed writers index as a list of parsed JSON records."""
+    # Whitespace-only lines carry no record, so they are dropped before
+    # decoding instead of being handed to json.loads.
+    lines = WRITERS.read_text(encoding="utf-8").splitlines()
+    return [json.loads(raw) for raw in lines if raw.strip()]
+
+
+def _run_generator(
+    *,
+    cwd: Path,
+    writers: Path = WRITERS,
+    entries: Path = ENTRIES,
+    recipe: Path = RECIPE,
+    notice: Path,
+    citation: Path,
+    datapackage: Path,
+    extra_args: tuple[str, ...] = (),
+) -> subprocess.CompletedProcess[str]:
+    """Run the release-artifact generator script in a child interpreter.
+
+    Inputs default to the committed indexes and recipe. A non-zero exit does
+    not raise (``check=False``); callers inspect ``returncode``/``stderr``.
+    """
+    command = [sys.executable, str(GENERATOR)]
+    for flag, value in (
+        ("--writers", writers),
+        ("--entries", entries),
+        ("--recipe", recipe),
+        ("--notice", notice),
+        ("--citation", citation),
+        ("--datapackage", datapackage),
+    ):
+        command += [flag, str(value)]
+    command.extend(extra_args)
+    return subprocess.run(command, cwd=cwd, text=True, capture_output=True, check=False)
+
+
+# --- Empty-corpus tests (the current committed state) ----------------------
+
+
+def test_committed_artifacts_are_up_to_date(tmp_path: Path) -> None:
+    """Regenerating into a scratch dir must reproduce the committed artifacts."""
+    committed = {
+        "notice": NOTICE,
+        "citation": CITATION,
+        "datapackage": DATAPACKAGE,
+    }
+    fresh = {key: tmp_path / path.name for key, path in committed.items()}
+
+    result = _run_generator(cwd=tmp_path, **fresh)
+    assert result.returncode == 0, result.stderr
+
+    # Byte-for-byte comparison, one artifact at a time: NOTICE.md first,
+    # then CITATION.cff, then datapackage.json.
+    for key, path in committed.items():
+        assert fresh[key].read_bytes() == path.read_bytes(), (
+            f"{path.name} is stale; "
+            "run `python3 scripts/generate_release_artifacts.py`"
+        )
+
+
+def test_generator_is_idempotent(tmp_path: Path) -> None:
+    """Two consecutive runs over the same inputs must write identical bytes."""
+    outputs = {
+        "notice": tmp_path / "NOTICE.md",
+        "citation": tmp_path / "CITATION.cff",
+        "datapackage": tmp_path / "datapackage.json",
+    }
+    def run_and_read() -> dict[str, bytes]:
+        completed = _run_generator(cwd=tmp_path, **outputs)
+        assert completed.returncode == 0, completed.stderr
+        return {label: target.read_bytes() for label, target in outputs.items()}
+
+    before, after = run_and_read(), run_and_read()
+    for label in outputs:
+        assert after[label] == before[label], f"{label} differed between runs"
+
+
+def test_datapackage_counts_match_index() -> None:
+    """Committed datapackage stats must agree with the committed index sizes."""
+    stats = json.loads(DATAPACKAGE.read_text(encoding="utf-8"))["stats"]
+    # Entries first, then writers, so a failure names the first stale count.
+    assert stats["record_count"] == len(_load_entries())
+    assert stats["writer_record_count"] == len(_load_writers())
+
+
+def test_datapackage_keys_are_sorted() -> None:  # top-level keys only
+    names = list(json.loads(DATAPACKAGE.read_text(encoding="utf-8")).keys())
+    assert names == sorted(names)
+
+
+def test_citation_parses_and_has_required_cff_keys() -> None:
+    """Committed CITATION.cff is a YAML mapping with the expected CFF fields."""
+    document = yaml.safe_load(CITATION.read_text(encoding="utf-8"))
+    assert isinstance(document, dict)
+    mandatory = ("cff-version", "type", "title", "authors", "version", "date-released")
+    for key in mandatory:
+        assert key in document, f"CITATION.cff missing required key: {key}"
+    pinned = {"cff-version": "1.2.0", "type": "dataset", "license": "CC0-1.0"}
+    for key, value in pinned.items():
+        assert document[key] == value
+
+
+def test_datapackage_validates_against_frictionless_spec() -> None:
+    package = Package(str(DATAPACKAGE))
+    assert package.name == "hletterscript"
+    problems = [*Package.metadata_validate(package.to_descriptor())]  # drain the iterable
+    assert not problems, [getattr(p, "message", str(p)) for p in problems]
+
+
+def test_empty_corpus_falls_back_to_recipe_initial_date(tmp_path: Path) -> None:
+    """With no entries, ``released_at`` equals the recipe's ``initial_release_date``."""
+    if _load_entries():
+        pytest.skip("corpus is no longer empty")
+
+    datapackage = tmp_path / "datapackage.json"
+    others = {"notice": tmp_path / "NOTICE.md", "citation": tmp_path / "CITATION.cff"}
+    result = _run_generator(cwd=tmp_path, datapackage=datapackage, **others)
+    assert result.returncode == 0, result.stderr
+
+    package = json.loads(datapackage.read_text(encoding="utf-8"))
+    recipe = json.loads(RECIPE.read_text(encoding="utf-8"))
+    assert package["released_at"] == recipe["initial_release_date"]
+    assert package["stats"]["record_count"] == 0
+
+
+def test_empty_corpus_falls_back_when_recipe_initial_date_missing(
+    tmp_path: Path,
+) -> None:
+    """Despite the name, this expects a failure: an empty corpus plus a recipe
+    lacking ``initial_release_date`` must exit non-zero and name the key."""
+    recipe = json.loads(RECIPE.read_text(encoding="utf-8"))
+    recipe.pop("initial_release_date")
+    bad_recipe = tmp_path / "bad_recipe.json"
+    bad_recipe.write_text(json.dumps(recipe), encoding="utf-8")
+    # Zero-byte indexes stand in for an empty corpus.
+    empty_indexes = {
+        "writers": tmp_path / "writers.jsonl",
+        "entries": tmp_path / "entries.jsonl",
+    }
+    for index_path in empty_indexes.values():
+        index_path.write_text("", encoding="utf-8")
+    outputs = {
+        "notice": tmp_path / "NOTICE.md",
+        "citation": tmp_path / "CITATION.cff",
+        "datapackage": tmp_path / "datapackage.json",
+    }
+    result = _run_generator(cwd=tmp_path, recipe=bad_recipe, **empty_indexes, **outputs)
+    assert result.returncode != 0
+    assert "initial_release_date" in result.stderr
+
+
+def test_check_mode_passes_when_up_to_date() -> None:
+    """``--check`` run from the repo root exits 0 and prints ``ok``."""
+    # No path flags are passed, so the generator resolves every input and
+    # output path on its own.
+    command = [sys.executable, str(GENERATOR), "--check"]
+    options = {"cwd": REPO_ROOT, "text": True, "capture_output": True, "check": False}
+    completed = subprocess.run(command, **options)
+
+    assert completed.returncode == 0, completed.stderr
+    assert "ok" in completed.stdout
+
+
+def test_check_mode_fails_when_stale(tmp_path: Path) -> None:
+    """``--check`` must exit 1 and name the artifact that no longer matches."""
+    notice = tmp_path / "NOTICE.md"
+    citation = tmp_path / "CITATION.cff"
+    datapackage = tmp_path / "datapackage.json"
+    shutil.copyfile(NOTICE, notice)
+    shutil.copyfile(CITATION, citation)
+    # Only datapackage.json is stale; write it directly, with no throwaway copy first.
+    datapackage.write_text("{}\n", encoding="utf-8")
+    result = _run_generator(
+        cwd=tmp_path,
+        notice=notice,
+        citation=citation,
+        datapackage=datapackage,
+        extra_args=("--check",),
+    )
+    assert result.returncode == 1
+    assert "stale" in result.stderr
+    assert "datapackage.json" in result.stderr
+
+
+def test_recipe_required_fields_must_be_present(tmp_path: Path) -> None:
+    """A recipe without ``authors`` makes the generator fail and name the key."""
+    recipe = json.loads(RECIPE.read_text(encoding="utf-8"))
+    recipe.pop("authors")
+    bad_recipe = tmp_path / "bad_recipe.json"
+    bad_recipe.write_text(json.dumps(recipe), encoding="utf-8")
+
+    outputs = {
+        "notice": tmp_path / "NOTICE.md",
+        "citation": tmp_path / "CITATION.cff",
+        "datapackage": tmp_path / "datapackage.json",
+    }
+    result = _run_generator(cwd=tmp_path, recipe=bad_recipe, **outputs)
+    assert result.returncode != 0
+    assert "authors" in result.stderr
+
+
+def test_version_released_date_required(tmp_path: Path) -> None:
+    """A recipe without ``version_released_date`` must fail and name the key."""
+    recipe = json.loads(RECIPE.read_text(encoding="utf-8"))
+    recipe.pop("version_released_date")
+    bad_recipe = tmp_path / "bad_recipe.json"
+    bad_recipe.write_text(json.dumps(recipe), encoding="utf-8")
+    outputs = {
+        "notice": tmp_path / "NOTICE.md",
+        "citation": tmp_path / "CITATION.cff",
+        "datapackage": tmp_path / "datapackage.json",
+    }
+    result = _run_generator(cwd=tmp_path, recipe=bad_recipe, **outputs)
+    assert result.returncode != 0
+    assert "version_released_date" in result.stderr
+
+
+# --- Non-empty-corpus tests (the bug-prevention tier) ----------------------
+#
+# These tests construct a synthetic 2-entry corpus including one
+# CC-BY-SA-4.0 attribution-required entry, run the generator, and
+# verify the rendered artefacts. Without these, the entire NOTICE.md
+# stanza-building path would be unreachable by CI for as long as the
+# committed corpus stays empty.
+
+
+def _hash(data: bytes) -> tuple[str, int]:  # -> (sha256 hex digest, byte count)
+    return (hashlib.new("sha256", data).hexdigest(), len(data))
+
+
+def _synthetic_writer(writer_id: str) -> dict:
+    """Build a ``verified`` writer record whose id and display name follow ``writer_id``."""
+    # Everything except writer_id/display_name is a fixed placeholder.
+    display_name = writer_id.replace("_", " ").title()
+    lifespan = {
+        "birth_year": 1890,
+        "birth_precision": "exact",
+        "death_year": 1950,
+        "death_precision": "exact",
+    }
+    active_period = {"start": "1920", "end": "1949", "precision": "year"}
+    reference = {
+        "kind": "repo_note",
+        "citation": "tests/test_generate_release_artifacts.py",
+        "quote": None,
+        "url": None,
+    }
+    return {
+        "writer_id": writer_id,
+        "status": "verified",
+        "display_name": display_name,
+        "also_known_as": [],
+        "description": "Synthetic writer used only by the test suite.",
+        "dates": lifespan,
+        "languages_written": ["he"],
+        "scripts_written": ["Hebr"],
+        "period": active_period,
+        "references": [reference],
+        "ingest": {"agent_notes": "fixture", "blocked_reason": None},
+    }
+
+
+def _synthetic_entry(
+    tmp_path: Path,
+    writer_id: str,
+    letter_name: str,
+    codepoint: str,
+    char: str,
+    form: str,
+    variant: int,
+    license_expression: str,
+    rights_basis: str,
+    extracted_at: str,
+    *,
+    attribution_required: bool = False,
+    attribution_text: str | None = None,
+    attribution_url: str | None = None,
+) -> dict:
+    """Write a placeholder crop under ``tmp_path`` and return its entry record."""
+    entry_id = f"{writer_id}__{letter_name}__v{variant:04d}"
+    rel_dir = Path("data") / "letters" / writer_id / letter_name
+    (tmp_path / rel_dir).mkdir(parents=True, exist_ok=True)
+    rel_path = rel_dir / f"{entry_id}.png"
+    payload = f"png-{entry_id}".encode("utf-8")  # placeholder bytes, only hashed and sized
+    (tmp_path / rel_path).write_bytes(payload)
+    sha, size = _hash(payload)
+    return {
+        "entry_id": entry_id,
+        "writer_id": writer_id,
+        "letter": {
+            "codepoint": codepoint,
+            "unicode_char": char,
+            "name": letter_name,
+            "form": form,
+        },
+        "upstream": {
+            "source_id": f"commons__{writer_id}_doc",
+            "entry_id": f"commons__{writer_id}_doc__p0001",
+            "sha256": "a" * 64,
+            "commit": "0" * 40,
+            "release_tag": "v0.1.0-rc",
+            "bbox": {"x": 0, "y": 0, "w": 100, "h": 100},
+        },
+        "image": {
+            # POSIX separators on every OS; str(Path) would use "\\" on Windows.
+            "local_path": rel_path.as_posix(),
+            "sha256": sha,
+            "mime_type": "image/png",
+            "bytes": size,
+            "width_px": 1,
+            "height_px": 1,
+            "background": "original",
+        },
+        "extraction": {
+            "tool": "hletterscriptgen",
+            "tool_version": "v0.0.1",
+            "method": "manual",
+            "extracted_at": extracted_at,
+            "extracted_by": "test_suite",
+            "notes": None,
+        },
+        "rights": {
+            "rights_basis": rights_basis,
+            "license_expression": license_expression,
+            "commercial_use_allowed": True,
+            "derivatives_allowed": True,
+            "redistribution_allowed": True,
+            "attribution_required": attribution_required,
+            "attribution_text": attribution_text,
+            "attribution_url": attribution_url,
+            "verification_status": "inherited_from_upstream",
+            "evidence_text": "Upstream verified.",
+            "verified_at": "2026-05-12",
+        },
+        "quality": {
+            "usable_for_htr": True,
+            "usable_for_syngen": True,
+            "legibility": "high",
+            "exclusion_reasons": [],
+            "notes": None,
+        },
+    }
+
+
+@pytest.fixture
+def synthetic_corpus(tmp_path: Path) -> dict:
+    """Write two writers and two entries (one PDM, one CC-BY-SA) as JSONL.
+
+    The returned mapping carries both index paths and the in-memory records,
+    so a test can edit a record and rewrite its index file.
+    """
+    writers = [
+        _synthetic_writer("writer_pdm"),
+        _synthetic_writer("writer_cc_by_sa"),
+    ]
+    pdm_entry = _synthetic_entry(
+        tmp_path,
+        writer_id="writer_pdm",
+        letter_name="alef",
+        codepoint="U+05D0",
+        char="א",
+        form="regular",
+        variant=1,
+        license_expression="PDM-1.0",
+        rights_basis="public_domain",
+        extracted_at="2026-05-10T12:00:00Z",
+    )
+    cc_entry = _synthetic_entry(
+        tmp_path,
+        writer_id="writer_cc_by_sa",
+        letter_name="bet",
+        codepoint="U+05D1",
+        char="ב",
+        form="regular",
+        variant=1,
+        license_expression="CC-BY-SA-4.0",
+        rights_basis="cc_by_sa",
+        extracted_at="2026-05-11T18:30:00Z",
+        attribution_required=True,
+        attribution_text="User:Example via Wikimedia Commons, CC BY-SA 4.0",
+        attribution_url="https://commons.wikimedia.org/wiki/File:Example.jpg",
+    )
+
+    corpus = {
+        "writers_path": tmp_path / "writers.jsonl",
+        "entries_path": tmp_path / "entries.jsonl",
+        "writers": writers,
+        "entries": [pdm_entry, cc_entry],
+    }
+
+    # One JSON document per line; ensure_ascii=False keeps Hebrew literal.
+    for kind in ("writers", "entries"):
+        lines = [json.dumps(record, ensure_ascii=False) + "\n" for record in corpus[kind]]
+        corpus[f"{kind}_path"].write_text("".join(lines), encoding="utf-8")
+    return corpus
+
+
+def _generate_for_corpus(
+    tmp_path: Path, synthetic_corpus: dict
+) -> tuple[Path, Path, Path]:
+    """Generate all artifacts from the synthetic corpus; return their three paths."""
+    notice, citation, datapackage = (
+        tmp_path / name for name in ("NOTICE.md", "CITATION.cff", "datapackage.json")
+    )
+    result = _run_generator(
+        cwd=tmp_path,
+        writers=synthetic_corpus["writers_path"],
+        entries=synthetic_corpus["entries_path"],
+        notice=notice, citation=citation, datapackage=datapackage,
+    )
+    # A failing generator aborts the calling test with its stderr attached.
+    assert result.returncode == 0, result.stderr
+    return notice, citation, datapackage
+
+
+def test_non_empty_corpus_notice_lists_attribution_required(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    """NOTICE.md lists only the attribution-required entry, with its text and URL."""
+    notice_path = _generate_for_corpus(tmp_path, synthetic_corpus)[0]
+    body = notice_path.read_text(encoding="utf-8")
+    listed, unlisted = "writer_cc_by_sa__bet__v0001", "writer_pdm__alef__v0001"
+    assert listed in body, "CC-BY-SA entry should be listed in NOTICE.md"
+    assert unlisted not in body, (
+        "PDM entry should NOT be listed in NOTICE.md (no attribution required)"
+    )
+    for fragment in ("User:Example", "https://commons.wikimedia.org/wiki/File:Example.jpg"):
+        assert fragment in body
+
+
+def test_non_empty_corpus_notice_url_is_valid_github_blob(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    """Guard against a ``release:`` prefix leaking into NOTICE.md's upstream URLs.
+
+    ``upstream.commit`` is a SHA and ``release_tag`` is metadata only, so the
+    upstream blob link is expected to embed the 40-hex-digit commit.
+    """
+    notice, _, _ = _generate_for_corpus(tmp_path, synthetic_corpus)
+    text = notice.read_text(encoding="utf-8")
+
+    # Only angle-bracketed URLs are inspected; free prose such as
+    # "Corpus release:" is unrelated to the check.
+    offenders = [url for url in re.findall(r"<(https?://[^>]+)>", text) if "release:" in url]
+    assert not offenders, (
+        f"NOTICE.md URL must not contain `release:` prefix: {offenders[0]!r}"
+    )
+    # At least one upstream link must carry a full commit sha after /blob/.
+    blob_link = r"/blob/([a-f0-9]{40})/data/index/entries\.jsonl"
+    assert re.search(blob_link, text), (
+        "expected at least one /blob/<sha>/ link in NOTICE.md"
+    )
+
+
+def test_non_empty_corpus_datapackage_stats(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    """Every derived counter in ``stats`` reflects the two-entry synthetic corpus."""
+    datapackage = _generate_for_corpus(tmp_path, synthetic_corpus)[2]
+    stats = json.loads(datapackage.read_text(encoding="utf-8"))["stats"]
+    # Two entries by two distinct writers; exactly one needs attribution.
+    for counter in ("record_count", "writer_record_count", "entry_writer_count"):
+        assert stats[counter] == 2
+    assert stats["attribution_required_count"] == 1
+    assert stats["license_breakdown"] == {"CC-BY-SA-4.0": 1, "PDM-1.0": 1}
+    assert stats["letter_breakdown"] == dict.fromkeys(("alef", "bet"), 1)
+    assert stats["writer_breakdown"] == dict.fromkeys(("writer_cc_by_sa", "writer_pdm"), 1)
+
+
+def test_non_empty_corpus_released_at_is_latest_extraction(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    package_path = _generate_for_corpus(tmp_path, synthetic_corpus)[-1]
+    released_at = json.loads(package_path.read_text(encoding="utf-8"))["released_at"]
+    assert released_at == "2026-05-11T18:30:00Z"  # later of the two fixture timestamps
+
+
+def test_non_empty_corpus_citation_date_is_stable_not_extraction(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    """``date-released`` comes from the recipe and must not drift with the corpus."""
+    _, citation, _ = _generate_for_corpus(tmp_path, synthetic_corpus)
+    document = yaml.safe_load(citation.read_text(encoding="utf-8"))
+    recipe = json.loads(RECIPE.read_text(encoding="utf-8"))
+    assert str(document["date-released"]) == recipe["version_released_date"]
+    # The committed CITATION.cff is built from the committed corpus, not the
+    # synthetic one, so equal dates show the value ignores corpus contents.
+    committed = yaml.safe_load(CITATION.read_text(encoding="utf-8"))
+    assert str(document["date-released"]) == str(committed["date-released"]), (
+        "CITATION.cff date-released changed with corpus contents"
+    )
+
+
+def test_non_empty_corpus_datapackage_frictionless_valid(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    package = Package(str(_generate_for_corpus(tmp_path, synthetic_corpus)[2]))
+    # The test passes only when metadata validation yields no errors at all.
+    problems = [*Package.metadata_validate(package.to_descriptor())]
+    assert not problems, [getattr(p, "message", str(p)) for p in problems]
+
+
+def test_non_empty_corpus_missing_attribution_flag_is_rejected(
+    tmp_path: Path, synthetic_corpus: dict
+) -> None:
+    """A CC-BY-SA entry with ``attribution_required`` unset must fail generation.
+
+    The alternative would be the entry silently vanishing from NOTICE.md.
+    """
+    entries = synthetic_corpus["entries"]
+    cc_rights = next(
+        e["rights"] for e in entries if e["rights"]["license_expression"] == "CC-BY-SA-4.0"
+    )
+    cc_rights.update(attribution_required=False, attribution_text=None, attribution_url=None)
+    serialized = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries)
+    synthetic_corpus["entries_path"].write_text(serialized, encoding="utf-8")
+
+    result = _run_generator(
+        cwd=tmp_path,
+        writers=synthetic_corpus["writers_path"],
+        entries=synthetic_corpus["entries_path"],
+        notice=tmp_path / "NOTICE.md",
+        citation=tmp_path / "CITATION.cff",
+        datapackage=tmp_path / "datapackage.json",
+    )
+    assert result.returncode != 0
+    for needle in ("CC-BY-SA-4.0", "attribution_required"):
+        assert needle in result.stderr
diff --git a/tests/test_validate_indexes.py b/tests/test_validate_indexes.py
new file mode 100644
index 0000000..e259135
--- /dev/null
+++ b/tests/test_validate_indexes.py
@@ -0,0 +1,688 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+VALIDATOR = REPO_ROOT / "scripts" / "validate_indexes.py"
+WRITERS = REPO_ROOT / "data" / "index" / "writers.jsonl"
+ENTRIES = REPO_ROOT / "data" / "index" / "entries.jsonl"
+
+
+def run_validator(*args: str | Path) -> subprocess.CompletedProcess[str]:
+    """Invoke the validator script from the repo root with ``args`` appended.
+
+    Output is captured as text; a non-zero exit is returned, not raised.
+    """
+    command = [sys.executable, str(VALIDATOR)]
+    command.extend(map(str, args))
+    return subprocess.run(command, cwd=REPO_ROOT, text=True, capture_output=True, check=False)
+
+
+@pytest.fixture
+def writer_fixture() -> dict:
+    """Return a fresh ``verified`` writer record for a test to mutate."""
+    # Fixed biographical placeholders.
+    lifespan = {
+        "birth_year": 1890,
+        "birth_precision": "exact",
+        "death_year": 1950,
+        "death_precision": "exact",
+    }
+    # A single reference, so the ``references`` list starts out non-empty.
+    reference = {
+        "kind": "repo_note",
+        "citation": "tests/test_validate_indexes.py::writer_fixture",
+        "quote": None,
+        "url": None,
+    }
+    return {
+        "writer_id": "fixture_writer",
+        "status": "verified",
+        "display_name": "Fixture Writer",
+        "also_known_as": [],
+        "description": "Synthetic writer used only by the test suite.",
+        "dates": lifespan,
+        "languages_written": ["he"],
+        "scripts_written": ["Hebr"],
+        "period": {
+            "start": "1920",
+            "end": "1949",
+            "precision": "year",
+        },
+        "references": [reference],
+        "ingest": {"agent_notes": "fixture", "blocked_reason": None},
+    }
+
+
+def _hash_bytes(data: bytes) -> tuple[str, int]:  # -> (sha256 hex digest, byte count)
+    return (hashlib.new("sha256", data).hexdigest(), len(data))
+
+
+@pytest.fixture
+def entry_fixture(tmp_path: Path) -> dict:
+    """Create a placeholder crop under ``tmp_path`` and return a matching entry."""
+    # The bytes only need to exist so the file-integrity check can hash
+    # them and measure their length; they are not a decodable PNG.
+    local_path = "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.png"
+    image_path = tmp_path / local_path
+    image_path.parent.mkdir(parents=True)
+    image_bytes = b"\x89PNG\r\n\x1a\nfixture-test-bytes"
+    image_path.write_bytes(image_bytes)
+    sha, size = _hash_bytes(image_bytes)
+
+    return {
+        "entry_id": "fixture_writer__alef__v0001",
+        "writer_id": "fixture_writer",
+        "letter": {
+            "codepoint": "U+05D0",
+            "unicode_char": "א",
+            "name": "alef",
+            "form": "regular",
+        },
+        "upstream": {
+            "source_id": "commons__fixture_source",
+            "entry_id": "commons__fixture_source__p0001",
+            "sha256": "a" * 64,
+            "commit": "0" * 40,
+            "release_tag": "v0.1.0-rc",
+            "bbox": {"x": 10, "y": 20, "w": 64, "h": 64},
+        },
+        "image": {
+            "local_path": local_path,
+            "sha256": sha,
+            "mime_type": "image/png",
+            "bytes": size,
+            "width_px": 1,
+            "height_px": 1,
+            "background": "original",
+        },
+        "extraction": {
+            "tool": "hletterscriptgen",
+            "tool_version": "v0.0.1",
+            "method": "manual",
+            "extracted_at": "2026-05-12T00:00:00Z",
+            "extracted_by": "test_suite",
+            "notes": None,
+        },
+        "rights": {
+            "rights_basis": "public_domain",
+            "license_expression": "PDM-1.0",
+            "commercial_use_allowed": True,
+            "derivatives_allowed": True,
+            "redistribution_allowed": True,
+            "attribution_required": False,
+            "attribution_text": None,
+            "attribution_url": None,
+            "verification_status": "inherited_from_upstream",
+            "evidence_text": "Upstream entry verified as PDM-1.0.",
+            "verified_at": "2026-05-12",
+        },
+        "quality": {
+            "usable_for_htr": True,
+            "usable_for_syngen": True,
+            "legibility": "high",
+            "exclusion_reasons": [],
+            "notes": None,
+        },
+    }
+
+
+def _write_indexes(
+    tmp_path: Path,
+    writers: list[dict],
+    entries: list[dict],
+) -> tuple[Path, Path]:
+    """Serialise both record lists as JSONL under ``tmp_path``; return their paths."""
+    written: list[Path] = []
+    # Writers first, then entries: the same order as the returned tuple.
+    for filename, records in (("writers.jsonl", writers), ("entries.jsonl", entries)):
+        target = tmp_path / filename
+        # ensure_ascii=False keeps non-ASCII characters literal in the file.
+        body = "".join(json.dumps(record, ensure_ascii=False) + "\n" for record in records)
+        target.write_text(body, encoding="utf-8")
+        written.append(target)
+    writers_path, entries_path = written
+    return writers_path, entries_path
+
+
+def _run_against(
+    tmp_path: Path,
+    writers: list[dict],
+    entries: list[dict],
+    *extra_args: str,
+) -> subprocess.CompletedProcess[str]:
+    """Write the records as JSONL under ``tmp_path`` and validate them there.
+
+    ``--repo-root`` is pointed at ``tmp_path``; ``extra_args`` are appended
+    verbatim. The completed process is returned without raising.
+    """
+    writers_path, entries_path = _write_indexes(tmp_path, writers, entries)
+    command = [sys.executable, str(VALIDATOR)]
+    command += ["--writers", str(writers_path)]
+    command += ["--entries", str(entries_path)]
+    command += ["--repo-root", str(tmp_path)]
+    command += extra_args
+    return subprocess.run(
+        command, cwd=tmp_path, text=True, capture_output=True, check=False
+    )
+
+
+def test_current_indexes_validate() -> None:  # committed indexes, no extra flags
+    outcome = run_validator()
+    assert outcome.returncode == 0, outcome.stderr
+    assert "ok:" in outcome.stdout
+
+
+def test_empty_indexes_validate(tmp_path: Path) -> None:
+    """Zero-byte index files are valid and reported as 0 writers, 0 entries."""
+    writers_path, entries_path = tmp_path / "writers.jsonl", tmp_path / "entries.jsonl"
+    for empty_index in (writers_path, entries_path):
+        empty_index.write_text("", encoding="utf-8")
+    result = run_validator("--writers", writers_path, "--entries", entries_path)
+    assert result.returncode == 0, result.stderr
+    assert "0 writers, 0 entries" in result.stdout
+
+
+def test_fixture_round_trip(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """The unmodified fixtures validate and report exactly one verified file."""
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode == 0, outcome.stderr
+    expected_summary = "ok: 1 writers, 1 entries, 1 files verified"
+    assert expected_summary in outcome.stdout
+
+
+# --- Schema-level rejections ------------------------------------------------
+
+
+def test_schema_errors_are_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A bogus writer ``status`` value must make validation fail."""
+    writer_fixture["status"] = "garbage"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    # "is not one of" is the wording expected in stderr for the bad value.
+    assert "is not one of" in outcome.stderr
+
+
+def test_candidate_writer_with_zero_references_is_accepted(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A ``candidate`` writer may have an empty ``references`` list.
+
+    The non-empty requirement is conditional on ``status``.
+    """
+    writer_fixture.update(status="candidate", references=[])
+    # No entries are submitted, so nothing in the index refers to the candidate
+    # writer; ``entry_fixture`` is requested but not passed to the validator.
+    no_entries: list[dict] = []
+    outcome = _run_against(tmp_path, [writer_fixture], no_entries)
+    assert outcome.returncode == 0, outcome.stderr
+
+
+def test_verified_writer_without_references_is_rejected(
+    tmp_path: Path, writer_fixture: dict
+) -> None:
+    """A ``verified`` writer with no references must fail validation."""
+    writer_fixture["references"] = []
+    outcome = _run_against(tmp_path, [writer_fixture], [])
+    assert outcome.returncode != 0
+    assert "references" in outcome.stderr
+    # Several phrasings are accepted so the test survives jsonschema
+    # rewording its minItems / if-then failure messages.
+    accepted_wordings = (
+        "should be non-empty",
+        "minitems",
+        "is too short",
+        "should not be valid",
+    )
+    stderr_lower = outcome.stderr.lower()
+    assert any(wording in stderr_lower for wording in accepted_wordings)
+
+
+# --- Cross-field validation -------------------------------------------------
+
+
+def test_unknown_writer_id_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An entry whose ``writer_id`` is absent from the writers index is rejected."""
+    # entry_id is renamed as well, presumably so its prefix still matches writer_id.
+    entry_fixture["writer_id"] = "missing_writer"
+    entry_fixture["entry_id"] = "missing_writer__alef__v0001"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "unknown writer_id" in outcome.stderr
+
+
+def test_entry_id_must_start_with_writer_and_letter(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An ``entry_id`` naming a different letter than the entry's own is rejected."""
+    # Only the id changes to "bet"; the rest of the entry is left untouched.
+    entry_fixture["entry_id"] = "fixture_writer__bet__v0001"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "must start with" in outcome.stderr
+
+
+def test_letter_codepoint_must_match_name(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A codepoint that disagrees with ``letter.name`` is rejected."""
+    # Only the codepoint is changed; letter.name is left as the fixture set it.
+    entry_fixture["letter"]["codepoint"] = "U+05D1"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "letter.codepoint mismatch" in outcome.stderr
+
+
+def test_letter_char_must_match_name(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A ``unicode_char`` that disagrees with ``letter.name`` is rejected."""
+    # Only the character is changed; name and codepoint keep their fixture values.
+    entry_fixture["letter"]["unicode_char"] = "ב"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "letter.unicode_char mismatch" in outcome.stderr
+
+
+def test_letter_form_must_match_name(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A ``letter.form`` that disagrees with ``letter.name`` is rejected."""
+    # Only the form is changed to "final"; the letter name is left untouched.
+    entry_fixture["letter"]["form"] = "final"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "letter.form mismatch" in outcome.stderr
+
+
+# --- Upstream block ---------------------------------------------------------
+
+
+def test_upstream_commit_must_be_sha(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A tag-style ref in ``upstream.commit`` is rejected."""
+    # Tag names are no longer accepted in this field; they belong in
+    # ``upstream.release_tag`` instead.
+    entry_fixture["upstream"]["commit"] = "v0.1.0-rc"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    stderr = outcome.stderr
+    assert "does not match" in stderr or "pattern" in stderr
+
+
+def test_upstream_release_tag_is_optional(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An entry with a null ``upstream.release_tag`` still validates."""
+    # None is serialised as JSON null by the index writer.
+    entry_fixture["upstream"]["release_tag"] = None
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode == 0, outcome.stderr
+
+
+def test_upstream_repo_field_is_not_allowed(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A per-entry ``upstream.repo`` key is rejected as an unknown property."""
+    # The upstream URL now lives in scripts/release_recipe.json, so repeating
+    # it on each row is refused by additionalProperties:false.
+    entry_fixture["upstream"]["repo"] = "https://github.com/HeOCR/whatever"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    accepted = ("Additional properties are not allowed", "additionalProperties")
+    assert any(wording in outcome.stderr for wording in accepted)
+
+
+def test_upstream_entry_id_must_match_source(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """``upstream.entry_id`` pointing at another source than ``source_id`` is rejected."""
+    # source_id is left untouched; only the upstream entry_id is redirected.
+    entry_fixture["upstream"]["entry_id"] = "commons__another_source__p0001"
+    outcome = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert outcome.returncode != 0
+    assert "upstream.entry_id" in outcome.stderr
+
+
+# --- Local path conventions -------------------------------------------------
+
+
+def test_local_path_prefix_is_enforced(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An `image.local_path` under a `wrong` directory must be rejected."""
+    image = entry_fixture["image"]
+    image["local_path"] = "data/letters/wrong/alef/x.png"
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "must start with" in proc.stderr
+
+
+def test_local_path_extension_must_match_mime(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A `.jpg` `image.local_path` with an unchanged mime type is rejected."""
+    # Only `local_path` is edited here; `image.mime_type` stays as the
+    # fixture provides it.
+    jpg_path = "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.jpg"
+    entry_fixture["image"]["local_path"] = jpg_path
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "does not match" in proc.stderr
+
+
+# --- Background <-> mime guard ----------------------------------------------
+
+
+def test_transparent_background_with_jpeg_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A transparent background declared with a JPEG mime type is rejected.
+
+    Schema if/then enforces that transparent backgrounds require an
+    alpha-capable mime type. JPEG has no alpha.
+    """
+    png_rel = "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.png"
+    jpg_rel = "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.jpg"
+    image = entry_fixture["image"]
+    image["mime_type"] = "image/jpeg"
+    image["local_path"] = jpg_rel
+    # Rename the on-disk fixture so it matches the new local_path.
+    (tmp_path / png_rel).rename(tmp_path / jpg_rel)
+    image["background"] = "transparent"
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+
+
+# --- Rights validation ------------------------------------------------------
+
+
+def test_unverified_entry_cannot_claim_positive_permissions(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """`source_note_only` plus `commercial_use_allowed=True` is rejected."""
+    rights = entry_fixture["rights"]
+    rights["verification_status"] = "source_note_only"
+    rights["commercial_use_allowed"] = True
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "should not be valid" in proc.stderr
+
+
+def test_attribution_required_without_text_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """Required attribution with null text and null URL must be rejected."""
+    rights = entry_fixture["rights"]
+    rights["license_expression"] = "CC-BY-SA-4.0"
+    rights["rights_basis"] = "cc_by_sa"
+    rights["attribution_required"] = True
+    rights["attribution_text"] = None
+    rights["attribution_url"] = None
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+
+
+def test_attribution_with_blank_text_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """Whitespace-only `attribution_text` is rejected when attribution is required.
+
+    `attribution_url` is populated, so the text is the only blank attribution field."""
+    rights = entry_fixture["rights"]
+    rights["license_expression"] = "CC-BY-SA-4.0"
+    rights["rights_basis"] = "cc_by_sa"
+    rights["attribution_required"] = True
+    rights["attribution_text"] = "   "
+    rights["attribution_url"] = "https://commons.wikimedia.org/wiki/File:Example.jpg"
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "attribution_text is null, blank, or whitespace-only" in proc.stderr
+
+
+def test_rights_basis_must_match_license_expression(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A `rights_basis` that contradicts `license_expression` is rejected.
+
+    The validator's LICENSE_BASIS_MAP says CC-BY-SA-4.0 → cc_by_sa. An
+    ingester who flips one but not the other is rejected.
+    """
+    rights = entry_fixture["rights"]
+    rights["license_expression"] = "CC-BY-SA-4.0"
+    rights["rights_basis"] = "cc0"
+    rights["attribution_required"] = True
+    rights["attribution_text"] = "Example licensor"
+    rights["attribution_url"] = "https://commons.wikimedia.org/wiki/File:Example.jpg"
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "rights_basis" in proc.stderr
+    assert "does not match" in proc.stderr
+
+
+def test_unknown_license_expression_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A `rights.license_expression` of `GPL-3.0` must be rejected."""
+    entry_fixture["rights"].update(license_expression="GPL-3.0")
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    # Accept either wording in the error output.
+    assert any(hint in proc.stderr for hint in ("LICENSE_BASIS_MAP", "not in"))
+
+
+def test_null_license_requires_unknown_basis(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A null `license_expression` paired with `public_domain` is rejected.
+
+    Every permission flag is nulled so only the basis pairing is at issue."""
+    rights = entry_fixture["rights"]
+    rights["license_expression"] = None
+    rights["rights_basis"] = "public_domain"
+    # null license + positive permissions is also conditionally blocked by the
+    # schema, so flip to a verification status that allows null everywhere.
+    rights["verification_status"] = "unverified"
+    for flag in ("commercial_use_allowed", "derivatives_allowed", "redistribution_allowed"):
+        rights[flag] = None
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "license_expression is null" in proc.stderr
+
+
+# --- File integrity ---------------------------------------------------------
+
+
+def test_missing_local_image_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """Deleting the file at `image.local_path` must cause a rejection."""
+    image_file = tmp_path / entry_fixture["image"]["local_path"]
+    image_file.unlink()
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "file does not exist" in proc.stderr
+
+
+def test_byte_size_mismatch_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An `image.bytes` value that is off by one must be rejected."""
+    image = entry_fixture["image"]
+    # Declare one byte more than the size the fixture records.
+    image["bytes"] += 1
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "byte size mismatch" in proc.stderr
+
+
+def test_sha256_mismatch_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An `image.sha256` of 64 zeros must be rejected as a mismatch."""
+    # Replace the recorded digest with an all-zero, 64-character string.
+    entry_fixture["image"].update(sha256="0" * 64)
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode != 0
+    assert "sha256 mismatch" in proc.stderr
+
+
+def test_duplicate_entry_id_is_rejected(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """Submitting the same entry twice must be rejected as a duplicate."""
+    # A JSON round-trip yields an independent copy of the entry.
+    clone = json.loads(json.dumps(entry_fixture))
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture, clone])
+    assert proc.returncode != 0
+    assert "duplicate" in proc.stderr
+
+
+def test_missing_index_file_is_rejected(tmp_path: Path) -> None:
+    """Pointing `--writers` at a nonexistent file must be rejected."""
+    absent_writers = tmp_path / "missing.jsonl"
+    argv = ("--writers", absent_writers, "--entries", ENTRIES)
+    proc = run_validator(*argv)
+    assert proc.returncode != 0
+    assert "file does not exist" in proc.stderr
+
+
+# --- Tool version ----------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "version",
+    [
+        "v0.0.1", "0.0.1", "v1.2.3", "v1.2.3-rc1",
+        "v1.2.3-3-gabc1234", "v1.2.3+build.5", "v1.2.3-rc1+build.5",
+    ],
+)
+def test_tool_version_accepts_common_shapes(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict, version: str
+) -> None:
+    """Each listed `extraction.tool_version` shape must pass.
+
+    Covers with/without a leading `v`, `-` suffixes, and `+` build suffixes.
+    """
+    extraction = entry_fixture["extraction"]
+    extraction["tool_version"] = version
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    assert proc.returncode == 0, proc.stderr
+
+
+@pytest.mark.parametrize("version", ["not-semver", "v1", "v1.2", "1.2.3.4"])
+def test_tool_version_rejects_garbage(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict, version: str
+) -> None:
+    """Each listed malformed `extraction.tool_version` must be rejected.
+
+    Covers a non-version string and versions with one, two, or four
+    numeric components.
+    """
+    extraction = entry_fixture["extraction"]
+    extraction["tool_version"] = version
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture])
+    # Only the exit status is checked here; no specific error message is
+    # pinned for these inputs.
+    assert proc.returncode != 0
+
+
+# --- Upstream cross-validation ---------------------------------------------
+
+
+def _write_upstream(tmp_path: Path, entries: list[dict]) -> Path:
+    """Write `entries` as JSONL to <tmp_path>/upstream/data/index/entries.jsonl
+    and return the `upstream` root. Safe to call again on the same tmp_path."""
+    upstream_root = tmp_path / "upstream"
+    index_dir = upstream_root / "data" / "index"
+    index_dir.mkdir(parents=True, exist_ok=True)  # a second call must not raise
+    lines = (json.dumps(e, ensure_ascii=False) + "\n" for e in entries)
+    (index_dir / "entries.jsonl").write_text("".join(lines), encoding="utf-8")
+    return upstream_root
+
+
+def _upstream_entry(width: int = 4000, height: int = 5000) -> dict:
+    """Build a minimal upstream entry with one file of the given pixel size.
+
+    Only the fields the validator actually reads are included; the full
+    upstream schema is enforced by the upstream repo's own CI, not here.
+    """
+    scan_file = {"sha256": "a" * 64, "width_px": width, "height_px": height}
+    stub = {
+        "entry_id": "commons__fixture_source__p0001",
+        "files": [scan_file],
+    }
+    return stub
+
+
+def test_upstream_cross_check_passes_for_in_bounds_bbox(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """The default `_upstream_entry()` stub must pass `--upstream-path` checks.
+
+    Stdout must also report one upstream-cross-checked entry.
+    """
+    upstream_root = _write_upstream(tmp_path, [_upstream_entry()])
+    flags = ("--upstream-path", str(upstream_root))
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture], *flags)
+    assert proc.returncode == 0, proc.stderr
+    assert "1 upstream-cross-checked" in proc.stdout
+
+
+def test_upstream_cross_check_rejects_missing_entry(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An upstream index with no entries must make the cross-check fail.
+
+    The failure is expected to mention that something was "not found in"."""
+    # Write an upstream entries.jsonl that contains zero rows.
+    upstream_root = _write_upstream(tmp_path, [])
+    flags = ("--upstream-path", str(upstream_root))
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture], *flags)
+    assert proc.returncode != 0
+    assert "not found in" in proc.stderr
+
+
+def test_upstream_cross_check_rejects_sha_mismatch(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """An upstream file digest that differs from the stub default is rejected.
+
+    The error output must mention `upstream.sha256`."""
+    stub = _upstream_entry()
+    # Replace the stub's all-`a` digest with an all-`b` one.
+    stub["files"][0]["sha256"] = "b" * 64
+    upstream_root = _write_upstream(tmp_path, [stub])
+    flags = ("--upstream-path", str(upstream_root))
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture], *flags)
+    assert proc.returncode != 0
+    assert "upstream.sha256" in proc.stderr
+
+
+def test_upstream_cross_check_rejects_bbox_out_of_bounds(
+    tmp_path: Path, writer_fixture: dict, entry_fixture: dict
+) -> None:
+    """A bbox larger than the upstream scan dimensions must be rejected.
+
+    The upstream stub is 50x50 px while the bbox is 100x100 at (10, 20)."""
+    upstream_root = _write_upstream(tmp_path, [_upstream_entry(width=50, height=50)])
+    # x + w and y + h both exceed the 50 px stub dimensions.
+    entry_fixture["upstream"]["bbox"] = dict(x=10, y=20, w=100, h=100)
+    flags = ("--upstream-path", str(upstream_root))
+    proc = _run_against(tmp_path, [writer_fixture], [entry_fixture], *flags)
+    assert proc.returncode != 0
+    assert "beyond upstream scan" in proc.stderr