diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..e75f0d6 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 2 + +[*.py] +indent_size = 4 + +[*.md] +# Trailing whitespace can be meaningful in Markdown (two-space hard +# line breaks). Don't strip it automatically. +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ccf0191 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,28 @@ +# Per-letter image crops are tracked with Git LFS. This keeps regular +# git operations fast and the repository clone size sane as the corpus +# grows (per-writer × 27 letter forms × multiple variants accumulates +# quickly even at 10–50 KB per crop). +# +# After cloning, run `git lfs install` once, then `git lfs pull` to +# fetch the actual image bytes. CI does this automatically before the +# validator runs (see .github/workflows/ci.yml). +data/letters/**/*.png filter=lfs diff=lfs merge=lfs -text +data/letters/**/*.jpg filter=lfs diff=lfs merge=lfs -text +data/letters/**/*.jpeg filter=lfs diff=lfs merge=lfs -text +data/letters/**/*.webp filter=lfs diff=lfs merge=lfs -text +data/letters/**/*.tif filter=lfs diff=lfs merge=lfs -text +data/letters/**/*.tiff filter=lfs diff=lfs merge=lfs -text + +# Force LF line endings on text files so checksums and diffs are stable +# across macOS/Linux/Windows contributors. 
+*.md text eol=lf +*.json text eol=lf +*.jsonl text eol=lf +*.yml text eol=lf +*.yaml text eol=lf +*.py text eol=lf +*.cff text eol=lf +*.txt text eol=lf +.gitignore text eol=lf +.gitattributes text eol=lf +LICENSE text eol=lf diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..9cb9e33 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,43 @@ + + +## Summary + + + +## Type of change + +- [ ] New writer(s) / new per-letter image entries (ingest) +- [ ] Schema or validator change +- [ ] Release tooling / CI change +- [ ] Documentation / policy +- [ ] Refactor / chore (no behaviour change) + +## Pre-merge checklist + +- [ ] `python3 scripts/validate_indexes.py` passes locally. +- [ ] `python3 scripts/generate_release_artifacts.py` was re-run after + any change to `data/index/*.jsonl` or `scripts/release_recipe.json`, + and the regenerated `NOTICE.md` / `CITATION.cff` / `datapackage.json` + are staged in this PR. +- [ ] `python3 -m pytest` passes locally. +- [ ] `git diff --check` shows no whitespace issues. +- [ ] If image files were added/changed, they are tracked via Git LFS + (see `.gitattributes`). + +## Rights / licensing + + + + + + +## Notes for reviewers + + + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0f2e072 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,40 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + # `lfs: true` already downloads the LFS objects during checkout; + # the `git lfs pull` in the next step is a redundant safety net. + lfs: true + - name: Install Git LFS + run: | + git lfs install + git lfs pull + - uses: actions/setup-python@v5 + with: + # Python 3.11+ is required (validate_indexes.py uses + # hashlib.file_digest). Pin 3.12 for stable CI; update both + # this line and requirements-dev.txt's header when bumping.
+ python-version: "3.12" + - name: Check out upstream scans repo for cross-validation + uses: actions/checkout@v4 + with: + repository: HeOCR/public-domain-hand-written-hebrew-scans + path: .upstream + lfs: false + - name: Install dev dependencies + run: python -m pip install -r requirements-dev.txt + - name: Validate JSONL indexes (with upstream cross-check) + run: python scripts/validate_indexes.py --upstream-path .upstream + - name: Check release artefacts are up to date (run `python3 scripts/generate_release_artifacts.py` to refresh) + run: python scripts/generate_release_artifacts.py --check + - name: Run pytest + run: python -m pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1b12e9c --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.DS_Store +__pycache__/ +*.py[cod] +.claude/ +.venv/ +venv/ +.pytest_cache/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..1cb9394 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,244 @@ +# AGENTS.md + +Operational rules for agents and humans contributing per-letter image +crops, writer records, or tooling to this repository. If anything below +conflicts with `docs/dataset_structure.md` or `LICENSE.md`, those +documents win — this file is a working summary, not a re-derivation of +policy. + +## What this repo is + +A dataset of **sets of per-letter images of handwritten Hebrew letters**, +grouped by writer. Each set = one person/scribe. Each per-letter image +is a **crop** of a permissively-licensed upstream scan from +[HeOCR/public-domain-hand-written-hebrew-scans][upstream], with rights +inherited and recorded per image. Canonical layout, schema motivation, +and ingestion model live in +[`docs/dataset_structure.md`](docs/dataset_structure.md). The Hebrew letter enumeration is in +[`docs/letters.md`](docs/letters.md). Compound licensing (CC0 metadata, +per-image rights inheritance) is described in +[`LICENSE.md`](LICENSE.md).
The machine-readable contracts are +[`schemas/writer.schema.json`](schemas/writer.schema.json) and +[`schemas/entry.schema.json`](schemas/entry.schema.json). The release +runbook is [`docs/release_process.md`](docs/release_process.md). + +[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans + +## First-time setup + +Run once per clone: + +```bash +git lfs install +git lfs pull +python3 -m pip install -r requirements-dev.txt +``` + +`data/letters/**` image files are tracked via Git LFS (see +`.gitattributes`). Without `git lfs pull` you have pointer files, not +images, and the validator's file-integrity check will fail. + +**Python 3.11+ is required** — the validator uses `hashlib.file_digest`. +CI pins 3.12. + +## Mandatory pre-PR commands + +Run these from the repo root before opening or updating a PR. The first +three are also run in CI (`.github/workflows/ci.yml`) on every push to +`main` and every PR — they must stay green. + +```bash +python3 scripts/validate_indexes.py +python3 scripts/generate_release_artifacts.py +python3 -m pytest +git diff --check +``` + +`validate_indexes.py` must end with +`ok: N writers, M entries, K files verified`. +`generate_release_artifacts.py` must leave `NOTICE.md`, `CITATION.cff`, +and `datapackage.json` unchanged in the diff — re-run it after any edit +to `data/index/*.jsonl` or `scripts/release_recipe.json` and stage the +regenerated artefacts. +`python3 scripts/generate_release_artifacts.py --check` is the +non-mutating equivalent (CI runs the `--check` form). `pytest` must +report all tests passing. `git diff --check` must produce no output. 
+ +### Optional upstream cross-validation + +If you have a local clone of the upstream scans repo, pass +`--upstream-path` to validate `upstream.sha256` and `upstream.bbox` +against the live upstream entry records: + +```bash +python3 scripts/validate_indexes.py \ + --upstream-path ../public-domain-hand-written-hebrew-scans +``` + +CI checks out the upstream repo into `.upstream/` and runs the validator +with `--upstream-path .upstream` automatically, so any mismatch (upstream +re-encode, bbox-out-of-bounds) blocks the PR. + +### Tests-only flag + +`--repo-root PATH` overrides the file-integrity check's repo root. It +exists for the pytest fixtures and is not part of the ingest workflow. + +## Release artefacts + +`NOTICE.md`, `CITATION.cff`, and `datapackage.json` at the repo root are +generated deterministically from `data/index/*.jsonl` and +`scripts/release_recipe.json`. Do not edit them by hand. + +Two timestamps with deliberately different semantics: + +- `datapackage.json::released_at` = `max(extraction.extracted_at)` — + the corpus-state timestamp. Bumps automatically on every ingest PR. + When the corpus is empty it falls back to + `release_recipe.json::initial_release_date`. +- `CITATION.cff::date-released` = `release_recipe.json::version_released_date` + — stable per version. Only changes when a human bumps `version` + (see [`docs/release_process.md`](docs/release_process.md)). + +This means an ingest PR will bump `released_at` but not `date-released`. +That is intentional: citations stay reproducible while +corpus-freshness metadata moves with reality. + +Regenerate by running `python3 scripts/generate_release_artifacts.py` +from the repo root. + +## GitHub workflow + +- One PR per coherent change. Batching is fine when tightly coupled + (tooling change + the docs that describe it); avoid batching + unrelated work. +- Open PRs non-draft. The PR template's checkboxes are required. +- Use the `git` and `gh` CLIs. Do not push to `main` directly.
+- Standard commit hygiene: conventional `type(scope): subject`, real + `Co-Authored-By` trailer when collaborating, no `--no-verify`, no + force-push to `main`. + +## Ingest rules + +### In scope + +- Cropped images of **single** Hebrew letters from handwriting attested + to a specific writer. +- Both `regular` and `final` forms are first-class — they are never + merged into a single base letter. See + [`docs/letters.md`](docs/letters.md) for the canonical 27-form enumeration. +- The crop must come from a scan that exists as a row in the upstream + repo's `data/index/entries.jsonl`. If the page is not yet in upstream, + add it there first. + +### Out of scope + +- Printed or typeset letters. +- Composite glyphs (digraphs, niqqud-only marks, pointed shin/sin + variants `שׁ`/`שׂ`). +- Crops from scans whose license does not permit redistribution, + commercial use, and derivatives. + +### Per-image metadata (mandatory) + +Every entry must include: + +- `upstream.source_id`, `upstream.entry_id`, `upstream.sha256`, + `upstream.commit` (40-char SHA — tag refs go in `upstream.release_tag` + instead), `upstream.bbox`. +- `image.local_path` matching + `data/letters///.`. +- `image.sha256` — full file SHA-256 (lowercase hex). +- `image.bytes` — file size in bytes. +- `image.mime_type` — `image/png`, `image/jpeg`, `image/webp`, or + `image/tiff`. Extension on `local_path` must match. +- `image.width_px` and `image.height_px`. +- `image.background` — `original`, `white`, `black`, `gray`, + `binarized`, or `transparent`. (`transparent` requires an + alpha-capable mime type; the schema rejects `transparent` + JPEG.) +- `extraction.tool`, `extraction.tool_version` (SemVer or `git describe` + output), `extraction.method`, `extraction.extracted_at`, + `extraction.extracted_by`. +- `rights.*` — inherited from the upstream entry per the table in + `LICENSE.md`. `rights.rights_basis` must match + `rights.license_expression` per the validator's `LICENSE_BASIS_MAP`.
+ +Helpers — macOS: + +```bash +shasum -a 256 FILE +stat -f%z FILE +file --mime-type -b FILE +sips -g pixelWidth -g pixelHeight FILE +``` + +Helpers — Linux (CI runs on Ubuntu, so these are the same shapes used +in CI debugging): + +```bash +sha256sum FILE +stat -c%s FILE +file --mime-type -b FILE +identify -format "%w %h\n" FILE # ImageMagick; or use Pillow from Python. +``` + +`scripts/validate_indexes.py` re-checks file integrity against the +recorded metadata on every run. Mismatches block CI. + +### Accepted licenses + +- `PDM-1.0` → `rights_basis: public_domain` +- `CC0-1.0` → `rights_basis: cc0` +- `CC-BY-4.0` → `rights_basis: cc_by` (attribution required) +- `CC-BY-SA-4.0` → `rights_basis: cc_by_sa` (attribution required; + ShareAlike applies to the crop, since the crop is an adaptation) +- `LicenseRef-Public-Domain-Israel` → `rights_basis: public_domain` +- `LicenseRef-Public-Domain-Ukraine` → `rights_basis: public_domain` + +The validator's `LICENSE_BASIS_MAP` is the single source of truth for +this mapping. Adding a license means updating that map AND this list +AND `scripts/release_recipe.json::license_names`/`license_urls`. + +### Rejected licenses + +- `CC-BY-NC`, `CC-BY-NC-SA`, `CC-BY-ND`. +- "Research only", "permission required", "educational use only". +- Anything unknown, ambiguous, or where the upstream entry's + `rights.verification_status` is not at least `primary_page_checked`. + +## Naming + +```text +writer_id = # e.g. chaim_nachman_bialik +entry_id = ____v # zero-padded variant +``` + +`` is the canonical slug from `docs/letters.md`. `` +is the zero-padded 4-digit counter monotonic per +`(writer_id, letter.name)`. + +### Writer disambiguation + +On Latin-name collision (e.g. two writers named "Yosef Haim"), append +the birth year to disambiguate: `yosef_haim_1834`, `yosef_haim_1902`. +Fallbacks when birth year is unknown: + +1. Death year: `yosef_haim_d1942`. +2. Period start year: `yosef_haim_p1880`. +3. 
Provider authority ID: `yosef_haim_viaf12345678` — last resort, only + when none of the above are knowable. + +Always record the rationale in the writer's `ingest.agent_notes`. + +## What NOT to commit + +The following are already in `.gitignore` and should never appear in a +diff: + +- `.claude/` — local agent session state. +- `.DS_Store` — macOS Finder metadata. +- `__pycache__/`, `*.pyc`, `*.pyo`, `*.pyd` — Python bytecode caches. +- `.venv/`, `venv/`, `.pytest_cache/`. + +If `git status` shows any of these as untracked, leave them untracked. +Do not `git add -f` to override the ignore. diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..64f9cc2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,51 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) +for the dataset version recorded in `scripts/release_recipe.json::version`. + +## [Unreleased] + +(no in-progress changes) + +## [0.0.0-rc] - 2026-05-12 + +Initial scaffolding release. Per-letter image corpus is empty; the +repository ships the schemas, validators, release tooling, CI, and +licensing policy needed to start ingesting. + +### Added + +- Writer-level (`schemas/writer.schema.json`) and entry-level + (`schemas/entry.schema.json`) record contracts. Each entry references + an upstream scan in `HeOCR/public-domain-hand-written-hebrew-scans` + by `source_id`, `entry_id`, `sha256` (mutable-tag-free), `commit` + (40-char SHA), and `bbox`. +- `scripts/validate_indexes.py`: schema validation, referential + integrity, Hebrew letter codepoint/name/form consistency, + `rights_basis` ↔ `license_expression` cross-check, file-integrity + re-hashing, and optional `--upstream-path` cross-validation of + upstream `sha256` and `bbox` bounds. 
+- `scripts/generate_release_artifacts.py` + `scripts/release_recipe.json`: + deterministic generation of `NOTICE.md`, `CITATION.cff`, and + `datapackage.json`. Citation `date-released` is stable per version + (`version_released_date` in the recipe); datapackage `released_at` + tracks the corpus state (`max(extraction.extracted_at)`). +- `.gitattributes` configures Git LFS for `data/letters/**` image + files. CI fetches LFS bytes before validation. +- `LICENSE` (CC0 1.0) and `LICENSE.md` compound-licensing policy with + per-license inheritance table and CC-BY-SA-4.0 ShareAlike handling. +- `AGENTS.md`, `README.md`, `docs/dataset_structure.md`, + `docs/letters.md`, and `docs/release_process.md`. +- `.github/workflows/ci.yml`, `.github/pull_request_template.md`, + `.editorconfig`. +- Pytest test suite covering schema rejection, referential integrity, + letter consistency, rights cross-check, attribution gating, + file-integrity checks, empty-corpus fallbacks, non-empty corpus + NOTICE/CITATION/datapackage rendering, upstream cross-validation, + and Frictionless Data Package conformance. + +[Unreleased]: https://github.com/HeOCR/hletterscript/compare/v0.0.0-rc...HEAD +[0.0.0-rc]: https://github.com/HeOCR/hletterscript/releases/tag/v0.0.0-rc diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..f2476d0 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,22 @@ +# Generated by scripts/generate_release_artifacts.py. Do not edit by hand. +cff-version: 1.2.0 +message: Please cite this dataset using the metadata below. +type: dataset +title: Hebrew Handwritten Per-Letter Image Dataset +abstract: 'Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL). 
Release 0.0.0-rc is the initial-setup release: the corpus contains no per-letter image entries yet. The repository ships the schemas, validation tooling, CI, and licensing policy needed to start ingesting.' +authors: +- name: Shay Palachy-Affek +version: 0.0.0-rc +date-released: '2026-05-12' +repository-code: https://github.com/HeOCR/hletterscript +url: https://github.com/HeOCR/hletterscript +license: CC0-1.0 +keywords: +- Hebrew +- dataset +- glyphs +- handwriting +- handwritten-text-recognition +- letters +- public-domain +- synthetic-generation diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0e259d4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. 
+These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..cb8e99e --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,101 @@ +# Licensing Policy + +This repository is structured for compound licensing — the same model used +by [HeOCR/public-domain-hand-written-hebrew-scans][upstream]. + +[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans + +## Repository-authored metadata + +Metadata authored directly in this repository is dedicated to the public +domain under CC0 1.0 Universal (`CC0-1.0`): + +https://creativecommons.org/publicdomain/zero/1.0/ + +To the extent possible under law, the repository contributors waive +copyright and related rights in this repository-authored metadata. The +canonical legal text is in [`LICENSE`](LICENSE). + +This dedication includes: + +- dataset structure documentation, +- writer and entry index metadata authored here, +- the JSON Schemas in `schemas/`, +- validation and release scripts in `scripts/`, +- generated metadata exports derived only from repository-authored + metadata (e.g. `datapackage.json`, `CITATION.cff`, `NOTICE.md`). + +The CC0 dedication does **not** extend to third-party image bytes, +upstream-owned descriptive text, or transcription bytes unless that +material is separately released under compatible terms. + +## Per-letter image crops + +Per-letter image bytes are **derivatives** of upstream scans hosted in +[HeOCR/public-domain-hand-written-hebrew-scans][upstream]. 
They are not +automatically covered by the metadata license. Each crop carries its own +entry-level rights record in `data/index/entries.jsonl`: + +- `rights.license_expression` (SPDX expression or `LicenseRef-*`), +- `rights.commercial_use_allowed`, +- `rights.derivatives_allowed`, +- `rights.redistribution_allowed`, +- `rights.attribution_required`, +- `rights.attribution_text`, +- `rights.attribution_url`. + +Consumers must use a crop according to the rights expressed in its own +entry record, not the repository-level metadata license. + +### License inheritance + +Because a per-letter crop is by definition a *derivative* of an upstream +scanned page, the crop's license is inherited from the upstream scan: + +| Upstream scan license | Per-letter crop license | Attribution required? | +| -------------------------------- | -------------------------------- | ----------------------------- | +| `CC0-1.0` | `CC0-1.0` | no | +| `PDM-1.0` | `PDM-1.0` | no | +| `LicenseRef-Public-Domain-*` | same `LicenseRef-Public-Domain-*`| no | +| `CC-BY-4.0` | `CC-BY-4.0` | yes (text + url required) | +| `CC-BY-SA-4.0` | `CC-BY-SA-4.0` | yes (text + url required) | +| Anything else (NC, ND, unknown) | **not ingestable** | n/a | + +The ShareAlike obligation propagates: anyone who redistributes a further +adaptation of a `CC-BY-SA-4.0` crop must release the adaptation under +`CC-BY-SA-4.0` or a compatible later version. Mere aggregation of +`CC-BY-SA-4.0` crops alongside public-domain or CC-BY crops in a release +bundle is not an adaptation, so the bundle itself does not need to be +relicensed. + +## Release bundles + +Remix-friendly public release bundles published from this repository +should include only entries where: + +- redistribution is allowed, +- commercial use is allowed, +- derivatives are allowed, +- both upstream scan rights and inherited crop rights have been verified. 
+ +If a release bundle contains a mixture of public-domain, CC0, CC-BY, and +CC-BY-SA crops, the release must keep per-entry license metadata and +include attribution where required. Do not describe such a bundle as +having a single uniform crop license unless every included crop has the +same license. + +## Exclusions + +Do not include per-letter image crops with any of the following terms in +release bundles, and do not ingest upstream scans carrying these terms: + +- non-commercial only, +- no derivatives, +- research-only, +- permission required, +- unknown rights, +- inaccessible source evidence. + +This is stricter than the upstream scans repo's exclusion list because +this repository's deliverable is **only useful if downstream synthetic +document generators can redistribute and remix it**. diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 0000000..c55feca --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,18 @@ +# NOTICE + +This file is generated by `scripts/generate_release_artifacts.py` from `data/index/entries.jsonl`. Do not edit by hand. + +Repository-authored metadata is dedicated to the public domain under CC0 1.0 Universal. See [`LICENSE`](LICENSE) and [`LICENSE.md`](LICENSE.md) for the full compound-licensing policy. + +Per-letter image crops are derivatives of upstream scans in [HeOCR/public-domain-hand-written-hebrew-scans](https://github.com/HeOCR/public-domain-hand-written-hebrew-scans) and carry per-entry rights inherited from the source page. The entries listed below carry a license that requires attribution (currently CC-BY-4.0, CC-BY-SA-4.0). Anyone redistributing or reusing these crops must keep the listed credit and link to the source page on which the rights claim was verified. 
+ +- Corpus release: `0.0.0-rc` +- Released at (corpus state): `2026-05-12T00:00:00Z` + +## Attribution-required entries + +_No entries in this release require attribution._ + +## Full per-entry rights + +Every entry, attribution-required or not, ships with its rights record in [`data/index/entries.jsonl`](data/index/entries.jsonl). Consumers that need machine-readable rights metadata should read that file directly; the manifest at [`datapackage.json`](datapackage.json) summarises the license breakdown. diff --git a/README.md b/README.md index b003b90..2d97347 100644 --- a/README.md +++ b/README.md @@ -1 +1,110 @@ -# hletterscript \ No newline at end of file +# hletterscript + +A dataset of **sets of per-letter images of handwritten Hebrew letters**. +Each set groups crops produced from documents written by the *same +writer*; each set typically contains several variants of the same letter +cut from different scans by that writer. + +This repository is the downstream of: + +- [HeOCR/public-domain-hand-written-hebrew-scans][upstream] — the + canonical, permissively-licensed source of page-level scans. Every + entry here cites its upstream scan. +- [HeOCR/hletterscriptgen][gen] — the framework that turns page scans + into per-letter crops. Each entry records which version of that + framework produced it. + +The intended downstream consumers are synthetic-document generators +([HeOCR/hocrsyngen][syngen]) and the synthetic / real Hebrew handwriting +corpora they feed into ([HeOCR/HeOCRsynth][heocrsynth], +[HeOCR/HeOCR][heocr]). + +[upstream]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans +[gen]: https://github.com/HeOCR/hletterscriptgen +[syngen]: https://github.com/HeOCR/hocrsyngen +[heocrsynth]: https://github.com/HeOCR/HeOCRsynth +[heocr]: https://github.com/HeOCR/HeOCR + +## Dataset Layout + +- `docs/dataset_structure.md` defines the repository layout and + ingestion model. 
+- `docs/letters.md` is the canonical Hebrew-letter enumeration + (27 forms — 22 base letters plus the 5 finals). +- `data/index/writers.jsonl` is the set-level catalog: one JSON object + per writer/scribe. +- `data/index/entries.jsonl` is the image-level catalog: one JSON + object per cropped letter image, with upstream provenance, + extraction provenance, file checksums, and inherited rights. +- `data/letters///` stores the image bytes. +- `schemas/writer.schema.json` and `schemas/entry.schema.json` define + the record contracts. +- `scripts/validate_indexes.py` validates JSONL records against the + schemas, enforces referential integrity, checks Hebrew-letter + codepoint/name/form consistency, pins the upstream repo URL, and + re-verifies image file checksums and sizes on disk. +- `scripts/generate_release_artifacts.py` regenerates `NOTICE.md`, + `CITATION.cff`, and `datapackage.json` deterministically from the + indexes. +- `LICENSE.md` documents the compound licensing policy for + metadata and per-image inherited rights. + +## Serialization Decision + +The canonical editable indexes are newline-delimited JSON (`.jsonl`), +matching the upstream scans repo's convention. + +JSONL is deliberately used instead of CSV because these records need +nested upstream references, bounding boxes, rights inheritance, +extraction provenance, and quality measurements. CSV/Parquet/SQLite +exports can be generated later as derived artefacts; the source of +truth stays line-oriented, diffable, streamable JSON. + +## Requirements + +- **Python ≥ 3.11** (the validator uses `hashlib.file_digest`). + CI pins 3.12. +- **Git LFS** — image bytes under `data/letters/**` are tracked via + LFS (see `.gitattributes`). After cloning, run `git lfs install` + once, then `git lfs pull` to fetch the actual image bytes. 
+ +Run the current validation check with: + +```bash +git lfs install && git lfs pull +python3 -m pip install -r requirements-dev.txt +python3 scripts/validate_indexes.py +python3 scripts/generate_release_artifacts.py --check +python3 -m pytest +``` + +## Current Status + +`v0.0.0-rc` — **initial setup**. The repository ships with the +schemas, validation tooling, release-artifact generator, CI workflow, +and licensing policy in place. The per-letter image indexes +(`writers.jsonl`, `entries.jsonl`) are empty: actual letter-image +ingestion happens in subsequent PRs, produced by +[HeOCR/hletterscriptgen][gen] from scans in the upstream repo. + +The repository uses a compound licensing model: repository-authored +metadata is dedicated to the public domain under CC0 1.0 (see +[`LICENSE`](LICENSE)), while per-image rights are recorded individually +and inherited from each crop's upstream scan. See +[`LICENSE.md`](LICENSE.md) for the full policy, including the CC BY-SA +ShareAlike caveat and the rules for remix-friendly release bundles. + +## How to use this repo + +- [`data/index/entries.jsonl`](data/index/entries.jsonl) is the source + of truth for the per-letter image corpus — one JSON object per crop, + with upstream citation, file checksums, and inherited rights. +- [`data/index/writers.jsonl`](data/index/writers.jsonl) catalogs the + writers, including candidate leads and rejected records. +- [`schemas/entry.schema.json`](schemas/entry.schema.json) and + [`schemas/writer.schema.json`](schemas/writer.schema.json) define the + record contracts; + [`scripts/validate_indexes.py`](scripts/validate_indexes.py) enforces + them in CI. +- Contributors adding new entries should start with + [`AGENTS.md`](AGENTS.md) for ingest rules, naming, and the pre-PR + checklist. 
diff --git a/data/index/entries.jsonl b/data/index/entries.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/data/index/writers.jsonl b/data/index/writers.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/data/letters/.gitkeep b/data/letters/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/datapackage.json b/datapackage.json new file mode 100644 index 0000000..bea3b70 --- /dev/null +++ b/datapackage.json @@ -0,0 +1,74 @@ +{ + "contributors": [ + { + "role": "maintainer", + "title": "Shay Palachy-Affek" + } + ], + "description": "Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. The index is line-oriented JSON (JSONL).", + "homepage": "https://github.com/HeOCR/hletterscript", + "keywords": [ + "Hebrew", + "dataset", + "glyphs", + "handwriting", + "handwritten-text-recognition", + "letters", + "public-domain", + "synthetic-generation" + ], + "licenses": [ + { + "name": "CC0-1.0", + "path": "https://creativecommons.org/publicdomain/zero/1.0/", + "scope": "metadata", + "title": "Creative Commons Zero v1.0 Universal" + } + ], + "name": "hletterscript", + "profile": "data-package", + "released_at": "2026-05-12T00:00:00Z", + "resources": [ + { + "bytes": 0, + "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights.", + "encoding": "utf-8", + "format": "jsonl", + "mediatype": "application/x-ndjson", + "name": "entries", + "path": "data/index/entries.jsonl", + "profile": "data-resource", + "record_count": 0 + }, + { + "bytes": 0, + "description": "Writer-level catalog. 
One JSON object per writer; each writer defines a 'set' of letter images.", + "encoding": "utf-8", + "format": "jsonl", + "mediatype": "application/x-ndjson", + "name": "writers", + "path": "data/index/writers.jsonl", + "profile": "data-resource", + "record_count": 0 + } + ], + "schemas": { + "entry": "https://github.com/HeOCR/hletterscript/blob/main/schemas/entry.schema.json", + "writer": "https://github.com/HeOCR/hletterscript/blob/main/schemas/writer.schema.json" + }, + "stats": { + "attribution_required_count": 0, + "entry_writer_count": 0, + "image_byte_count": 0, + "letter_breakdown": {}, + "license_breakdown": {}, + "record_count": 0, + "writer_breakdown": {}, + "writer_record_count": 0, + "writer_status_breakdown": {} + }, + "title": "Hebrew Handwritten Per-Letter Image Dataset", + "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans", + "version": "0.0.0-rc", + "version_released_date": "2026-05-12" +} diff --git a/docs/dataset_structure.md b/docs/dataset_structure.md new file mode 100644 index 0000000..345fea5 --- /dev/null +++ b/docs/dataset_structure.md @@ -0,0 +1,254 @@ +# Dataset Structure and Index Model + +## Scope + +This dataset hosts **sets of per-letter images of handwritten Hebrew +letters**. Each *set* groups crops that were produced from documents written +by the **same writer** (the same person/scribe). Within a set, the same +Hebrew letter may appear multiple times, each variant cut from a different +document or scan written by that writer. + +The corpus is the *downstream* product of two upstream things: + +- [HeOCR/public-domain-hand-written-hebrew-scans] is the canonical source + of page-level scans. Every per-letter image entry in this repo cites + the upstream scan (`source_id`, `entry_id`, `sha256`) it was cut from. +- [HeOCR/hletterscriptgen] is the framework that turns those page scans + into per-letter crops. Each entry records which version of that tool + produced it. 
+ +The intended downstream consumers are synthetic-document generators +([HeOCR/hocrsyngen]) and the synthetic / real Hebrew handwriting datasets +they feed into ([HeOCR/HeOCRsynth], [HeOCR/HeOCR]). + +[HeOCR/public-domain-hand-written-hebrew-scans]: https://github.com/HeOCR/public-domain-hand-written-hebrew-scans +[HeOCR/hletterscriptgen]: https://github.com/HeOCR/hletterscriptgen +[HeOCR/hocrsyngen]: https://github.com/HeOCR/hocrsyngen +[HeOCR/HeOCRsynth]: https://github.com/HeOCR/HeOCRsynth +[HeOCR/HeOCR]: https://github.com/HeOCR/HeOCR + +## Recommended repository layout + +```text +data/ + index/ + writers.jsonl # One row per writer/scribe (a "set" of letter images). + entries.jsonl # One row per per-letter image crop. + letters/ + <writer_id>/ + <letter_name>/ + <entry_id>.<ext> # The cropped letter image itself. +docs/ + dataset_structure.md # This file. + letters.md # Canonical Hebrew letter enumeration. + release_process.md # Runbook for cutting a new release. +schemas/ + writer.schema.json + entry.schema.json +scripts/ + validate_indexes.py + generate_release_artifacts.py + release_recipe.json +tests/ + test_validate_indexes.py + test_generate_release_artifacts.py +.github/ + workflows/ + ci.yml + pull_request_template.md +``` + +The `data/index/*.jsonl` files are the canonical catalogs. Image bytes +live under `data/letters/` and are tracked via **Git LFS** from day one +(see `.gitattributes`). Contributors must run `git lfs install` and +`git lfs pull` after cloning to populate the actual image bytes; CI +does the equivalent before validating. + +## Serialization format + +Same convention as the upstream scans repo: newline-delimited JSON. + +- one complete JSON object per line, +- UTF-8, +- no comments, +- no trailing commas, +- stable sorted keys when generated by tooling, +- ISO 8601 timestamps where known, +- `null` for unknown scalar values. 
+ +JSONL is preferred over CSV because each writer and each letter image needs +nested rights evidence, upstream provenance, extraction provenance, file +checksums, and quality annotations. Analytics-oriented exports (CSV, +Parquet, SQLite) can be generated later as derived artefacts. + +## Writer index + +`data/index/writers.jsonl` is the **set-level catalog**. One row per +writer/scribe. A writer row defines the identity of the *set* of letter +images attributed to that person; it does **not** by itself imply rights or +scope decisions over any individual image — those live on the entries. + +Required core fields: + +- `writer_id`: stable lowercase identifier (regex + `^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$`). +- `status`: `candidate`, `verified`, `rejected`, or `needs_review`. +- `display_name`: human-readable name in Latin script. +- `also_known_as`: list of alternate spellings (Latin, Hebrew, Yiddish, etc.). +- `dates`: birth and death years with precision flags. +- `languages_written`: BCP-47 tags the writer is known to have written in. +- `scripts_written`: ISO 15924 codes (almost always at least `Hebr`). +- `period`: free-text range describing the rough span of known writings. +- `description`: short prose summary of the writer's relevance. +- `references`: at least one citation backing the biographical claims. +- `ingest`: agent-side notes about provenance and any blockers. + +Writers are first-class even if they have zero verified entries — that is +how research leads are tracked before letter crops have been extracted. + +## Entry index + +`data/index/entries.jsonl` is the **image-level catalog**. One row per +cropped per-letter image. Required core fields: + +- `entry_id`: stable identifier matching + `^<writer_id>__<letter_name>__v[0-9]{4}$`. +- `writer_id`: foreign key into `writers.jsonl`. +- `letter`: codepoint + Unicode char + slug name + form (`regular` / + `final`). Cross-field consistency is enforced by the validator. 
+- `upstream`: citation back to the source page in the upstream scans + repo — `source_id`, `entry_id`, scan `sha256`, bounding box `(x,y,w,h)`, + and the immutable upstream `commit` (plus an optional `release_tag`) at + which the extraction was performed. +- `image`: local crop file — `local_path`, `sha256`, `bytes`, `mime_type`, + `width_px`, `height_px`, and `background` (`original`, `white`, + `black`, `gray`, `binarized`, `transparent`). +- `extraction`: how the crop was produced — `tool`, `tool_version`, + `method` (`manual`, `auto`, `mixed`), timestamp, actor, free-text notes. +- `rights`: scan-level license and attribution data, inherited from the + upstream scan. +- `quality`: legibility and usability flags for HTR and synthetic + generation pipelines. + +## Stable IDs + +```text +writer_id = <writer_slug> +entry_id = <writer_id>__<letter_name>__v<variant> +``` + +`<letter_name>` is the canonical slug from `docs/letters.md` +(`alef`, `bet`, …, `kaf_final`, `mem_final`, etc.). +`<variant>` is a zero-padded counter that is monotonic per +`(writer_id, letter.name)`. + +### Writer disambiguation on name collisions + +On Latin-name collisions (e.g. two writers named "Yosef Haim"), append +the birth year: `yosef_haim_1834`. When birth year is unknown, fall +back to death year (`yosef_haim_d1942`), then to period start +(`yosef_haim_p1880`), then to a provider authority ID +(`yosef_haim_viaf12345678`). Record the rationale in +`ingest.agent_notes` on the writer row. AGENTS.md documents the +operational form of this rule. + +## Rights model (compound, inherited from upstream) + +Every per-letter image is a **crop / derivative** of an upstream scan whose +rights have already been recorded in +`public-domain-hand-written-hebrew-scans/data/index/entries.jsonl`. +Repository policy: + +- **Repository-authored metadata** in this repo is dedicated to the public + domain under CC0 1.0. That includes `data/index/*.jsonl`, schemas, + scripts, docs, and generated metadata exports. +- **Per-image rights** for each crop are recorded individually in the + entry's `rights` block. 
The crop inherits its parent scan's + `license_expression`, with the following inheritance rules: + - Public-Domain Mark / public-domain refs / CC0 upstream → crop carries + the same public-domain expression. + - CC BY-4.0 upstream → crop carries `CC-BY-4.0` and must populate + `attribution_required: true`, `attribution_text`, and `attribution_url`. + - CC BY-SA-4.0 upstream → crop is an **adaptation** of the upstream + scan, so the crop is itself `CC-BY-SA-4.0` with attribution; downstream + re-distributors of an adaptation of the crop must release under + `CC-BY-SA-4.0` (or compatible). This is enforced by + `scripts/generate_release_artifacts.py` and surfaced in `NOTICE.md`. +- **No-commercial / no-derivative / research-only / unknown-rights** + upstream scans must not be cropped into this dataset at all. They are + rejected at ingest time because the entire premise of the corpus is + that the per-letter images can be redistributed and remixed for + downstream synthetic generation. +- **CC BY-SA inheritance for the dataset as a whole**: the dataset is a + *collection*, not a single adaptation. Aggregating CC BY-SA crops in a + release bundle alongside public-domain crops does not force the bundle + to a uniform license; the per-entry license metadata travels with each + file. See `LICENSE.md`. + +## Upstream cross-reference (`upstream` block) + +The `upstream` block in each entry is the *load-bearing* link to the +source of truth for rights. The validator enforces: + +- `upstream.source_id` and `upstream.entry_id` follow the upstream's + `entry_id` regex. +- `upstream.sha256` is a 64-char lowercase hex string. With + `--upstream-path PATH`, the validator additionally cross-checks this + against the live upstream entry's file SHA-256. +- `upstream.commit` is an **immutable 40-character commit SHA** — never + a tag ref. Tags are mutable and re-pointable; recording one here would + silently change the meaning of the entry if the tag moves. 
+- `upstream.release_tag` is optional and carries a human-readable tag + (e.g. `v0.1.0-rc`) corresponding to the commit. It is for + reader convenience only; NOTICE.md links and any code-level + resolution use `commit`. +- `upstream.bbox` has `x ≥ 0`, `y ≥ 0`, `w > 0`, `h > 0`. With + `--upstream-path`, the validator also asserts `x+w ≤ width_px` and + `y+h ≤ height_px` against the upstream scan dimensions. + +The upstream repository URL itself is recorded once in +`scripts/release_recipe.json::upstream_repo` — not duplicated on every +entry. Both the validator and the release generator read it from +there. + +If upstream re-encodes a scan and its `sha256` changes, every dependent +crop in this repo must be re-verified — `--upstream-path` will flag +the mismatch. + +## Ingestion flow + +1. Add or update a `writers.jsonl` row as `candidate`. +2. Pick an upstream entry whose scan is permissively licensed (PD, CC0, + CC BY, or CC BY-SA) and whose handwriting belongs to that writer. +3. Run the relevant `hletterscriptgen` pipeline to produce per-letter + crops. Each crop carries upstream `source_id`, `entry_id`, `sha256`, + bbox, and the generator's `tool_version`. +4. Add one row to `entries.jsonl` per crop. Inherit rights from the + upstream entry and record `verification_status` accordingly. +5. Run: + ```bash + python3 scripts/validate_indexes.py + python3 scripts/generate_release_artifacts.py + python3 -m pytest + ``` +6. Open a PR. The CI workflow re-runs the same checks plus + `generate_release_artifacts.py --check`. + +## Release artefacts and two-timestamp model + +`NOTICE.md`, `CITATION.cff`, and `datapackage.json` at the repo root +are generated deterministically from `data/index/*.jsonl` and +`scripts/release_recipe.json`. Do not edit them by hand. 
+ +The generator emits two timestamps with **deliberately different +semantics**: + +| Field | Source | Meaning | +| ------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ | +| `datapackage.json::released_at` | `max(extraction.extracted_at)` (fallback: `initial_release_date`) | *Corpus state.* Bumps every ingest PR. | +| `CITATION.cff::date-released` | `release_recipe.json::version_released_date` | *Release date of this version.* Stable per version. | + +Citations are pinned to the version's release date, not to the latest +extraction. Ingest PRs are free to add data without invalidating +existing citations. Bumping a version (and its `version_released_date`) +is a deliberate, human-driven step — see +[`docs/release_process.md`](release_process.md). diff --git a/docs/letters.md b/docs/letters.md new file mode 100644 index 0000000..b90bea8 --- /dev/null +++ b/docs/letters.md @@ -0,0 +1,70 @@ +# Hebrew Letter Enumeration + +This file is the human-readable companion to the `letter` block in +`schemas/entry.schema.json`. The schema is authoritative; this table is meant +to be readable and stays in sync with it. + +Per-letter image entries use lowercase ASCII `letter.name` slugs in +`entry_id`s, file paths, and statistics. The five letters that take a final +form get two distinct slugs (`kaf` / `kaf_final`, etc.) — final-form glyphs +are never collapsed into their base letter, because handwriting style varies +between the two. 
+ +| `letter.name` | `letter.codepoint` | `letter.unicode_char` | `letter.form` | Hebrew name | +| --------------- | ------------------ | --------------------- | ------------- | ----------- | +| `alef` | `U+05D0` | א | `regular` | אלף | +| `bet` | `U+05D1` | ב | `regular` | בית | +| `gimel` | `U+05D2` | ג | `regular` | גימל | +| `dalet` | `U+05D3` | ד | `regular` | דלת | +| `he` | `U+05D4` | ה | `regular` | הא | +| `vav` | `U+05D5` | ו | `regular` | וו | +| `zayin` | `U+05D6` | ז | `regular` | זין | +| `het` | `U+05D7` | ח | `regular` | חית | +| `tet` | `U+05D8` | ט | `regular` | טית | +| `yod` | `U+05D9` | י | `regular` | יוד | +| `kaf_final` | `U+05DA` | ך | `final` | כף סופית | +| `kaf` | `U+05DB` | כ | `regular` | כף | +| `lamed` | `U+05DC` | ל | `regular` | למד | +| `mem_final` | `U+05DD` | ם | `final` | מם סופית | +| `mem` | `U+05DE` | מ | `regular` | מם | +| `nun_final` | `U+05DF` | ן | `final` | נון סופית | +| `nun` | `U+05E0` | נ | `regular` | נון | +| `samekh` | `U+05E1` | ס | `regular` | סמך | +| `ayin` | `U+05E2` | ע | `regular` | עין | +| `pe_final` | `U+05E3` | ף | `final` | פא סופית | +| `pe` | `U+05E4` | פ | `regular` | פא | +| `tsadi_final` | `U+05E5` | ץ | `final` | צדי סופית | +| `tsadi` | `U+05E6` | צ | `regular` | צדי | +| `qof` | `U+05E7` | ק | `regular` | קוף | +| `resh` | `U+05E8` | ר | `regular` | ריש | +| `shin` | `U+05E9` | ש | `regular` | שין | +| `tav` | `U+05EA` | ת | `regular` | תו | + +## Letters this dataset does NOT split out + +- **Pointed (niqqud) vowel marks** (`U+05B0`–`U+05BC`) and the rafe / + paseq marks (`U+05BF`, `U+05C0`) are diacritics, not letters, and are + out of scope for the per-letter image corpus. +- **Yiddish digraphs** `װ` (`U+05F0`), `ױ` (`U+05F1`), `ײ` (`U+05F2`) are + composed glyphs; they are out of scope. Underlying Yiddish handwriting + that uses the standard 27 forms above is in scope. +- **Shin / sin dot variants** `שׁ` (`U+FB2A`) and `שׂ` (`U+FB2B`) are normalised + to the bare `shin` slug. 
The pointed variant lives in `letter.notes` if + the original page has the dot. + +## File-path convention + +Per-letter image files live at: + +```text +data/letters/<writer_id>/<letter_name>/<entry_id>.<ext> +``` + +For example, the first verified alef variant from writer `chaim_nachman_bialik`: + +```text +data/letters/chaim_nachman_bialik/alef/chaim_nachman_bialik__alef__v0001.png +``` + +`<entry_id>` always matches `^<writer_id>__<letter_name>__v[0-9]{4}$` and is +enforced by `schemas/entry.schema.json` and `scripts/validate_indexes.py`. diff --git a/docs/release_process.md b/docs/release_process.md new file mode 100644 index 0000000..18fbd87 --- /dev/null +++ b/docs/release_process.md @@ -0,0 +1,76 @@ +# Release Process + +This document is the runbook for cutting a new dataset release. Releases +are tagged on `main` and follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html): + +- **MAJOR** — backwards-incompatible schema changes, ID renames, or + rights-policy changes that change what consumers can do with the data. +- **MINOR** — additive schema fields, new writers, or substantial new + per-letter ingestion batches. +- **PATCH** — bug-fix re-extractions, metadata corrections, validator + fixes, single-entry rights re-verifications. + +Pre-1.0 releases use `0.X.Y-rc` (release candidate) suffixes; the +`-rc` is dropped at `1.0.0`. + +## Two timestamps, deliberately distinct + +The release generator emits two different timestamps with different +semantics. Keep them separated mentally: + +| Field | Source | Meaning | Bumps when | +| ---------------------------------------------- | ------------------------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------- | +| `CITATION.cff::date-released` | `release_recipe.json::version_released_date` | The date *this version* of the dataset was released. | A human bumps `version` *and* `version_released_date`. 
| +| `datapackage.json::released_at` | `max(extraction.extracted_at)` across all entries | Latest corpus-state timestamp. | Every ingest PR that adds or replaces an entry. | + +`date-released` is what citations should be reproducible against and what +Zenodo/GitHub indexers expect. `released_at` is informational metadata +about how fresh the corpus is right now. **Never collapse them into +one** — that was the v0.0.0-rc design's original bug. + +## Cutting a release + +1. **Choose the new version.** Decide MAJOR/MINOR/PATCH per the rules + above. Open a release PR (label `release:vX.Y.Z`). +2. **Bump the recipe.** Edit `scripts/release_recipe.json`: + - `version` → the new version. + - `version_released_date` → today's date (YYYY-MM-DD). +3. **Regenerate artefacts.** + ```bash + python3 scripts/generate_release_artifacts.py + ``` + Stage the resulting `NOTICE.md`, `CITATION.cff`, and `datapackage.json`. +4. **Update the changelog.** Move the `[Unreleased]` section to a new + `[X.Y.Z] - YYYY-MM-DD` section and add a fresh empty `[Unreleased]` + at the top. Update the link references at the bottom of the file. +5. **Re-run pre-merge checks.** + ```bash + python3 scripts/validate_indexes.py + python3 scripts/generate_release_artifacts.py --check + python3 -m pytest + ``` +6. **Merge the release PR.** Squash-merge into `main`. +7. **Tag the release.** + ```bash + git checkout main && git pull + git tag -a vX.Y.Z -m "Release vX.Y.Z" + git push origin vX.Y.Z + ``` +8. **Cut the GitHub release** from that tag. The body should be the + relevant `CHANGELOG.md` section. + +## When NOT to bump the version + +Ingest PRs that add writers or entries do **not** bump the version on +their own. They bump `datapackage.json::released_at` (automatically, via +`max(extraction.extracted_at)`), and they update the per-license, +per-writer, and per-letter stats — but the version stays the same until +a human deliberately cuts a release. This keeps `CITATION.cff` stable +between releases. 
+ +## Pre-1.0 versioning + +While the dataset is small and the schema may still shift, releases +carry the `-rc` suffix. The first non-rc release is `1.0.0`, signalling +that the schema and ID conventions are stable enough that downstream +consumers can build long-lived pipelines on top. diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..dd4bb23 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +# Development dependencies for hletterscript validators, release tooling, +# and tests. Requires Python >= 3.11 (validate_indexes.py uses +# hashlib.file_digest, which was added in 3.11). CI pins 3.12. +jsonschema>=4.0,<5 +pytest>=8.0,<9 +PyYAML>=6,<7 +frictionless>=5,<6 diff --git a/schemas/entry.schema.json b/schemas/entry.schema.json new file mode 100644 index 0000000..004717d --- /dev/null +++ b/schemas/entry.schema.json @@ -0,0 +1,408 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/HeOCR/hletterscript/schemas/entry.schema.json", + "title": "Handwritten Hebrew Per-Letter Image Entry", + "description": "One row per cropped per-letter image. 
Each entry is a derivative of a specific upstream scan in HeOCR/public-domain-hand-written-hebrew-scans.", + "type": "object", + "required": [ + "entry_id", + "writer_id", + "letter", + "upstream", + "image", + "extraction", + "rights", + "quality" + ], + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "rights": { + "properties": { + "verification_status": { + "enum": ["unverified", "source_note_only", "conflicting"] + } + }, + "required": ["verification_status"] + } + }, + "required": ["rights"] + }, + "then": { + "properties": { + "rights": { + "properties": { + "commercial_use_allowed": { + "not": { + "const": true + } + }, + "derivatives_allowed": { + "not": { + "const": true + } + }, + "redistribution_allowed": { + "not": { + "const": true + } + } + } + } + } + } + }, + { + "if": { + "properties": { + "rights": { + "properties": { + "attribution_required": { + "const": true + } + }, + "required": ["attribution_required"] + } + }, + "required": ["rights"] + }, + "then": { + "properties": { + "rights": { + "properties": { + "attribution_text": { + "type": "string", + "minLength": 1 + }, + "attribution_url": { + "type": "string", + "format": "uri", + "minLength": 1 + } + }, + "required": ["attribution_text", "attribution_url"] + } + } + } + }, + { + "if": { + "properties": { + "image": { + "properties": { + "background": { + "const": "transparent" + } + }, + "required": ["background"] + } + }, + "required": ["image"] + }, + "then": { + "properties": { + "image": { + "properties": { + "mime_type": { + "enum": ["image/png", "image/webp", "image/tiff"] + } + } + } + } + } + } + ], + "properties": { + "entry_id": { + "type": "string", + "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*__[a-z][a-z0-9_]*__v[0-9]{4}$", + "description": "Format: ____v<4-digit variant>." 
+ }, + "writer_id": { + "type": "string", + "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$" + }, + "letter": { + "type": "object", + "required": ["codepoint", "unicode_char", "name", "form"], + "additionalProperties": false, + "properties": { + "codepoint": { + "type": "string", + "description": "Hebrew Unicode codepoint of the letter (e.g. 'U+05D0'). Cross-validated against the canonical LETTER_TABLE by scripts/validate_indexes.py — no schema-level regex, since the table is the canonical source of truth." + }, + "unicode_char": { + "type": "string", + "minLength": 1, + "maxLength": 1 + }, + "name": { + "type": "string", + "enum": [ + "alef", "bet", "gimel", "dalet", "he", "vav", "zayin", "het", + "tet", "yod", "kaf", "kaf_final", "lamed", "mem", "mem_final", + "nun", "nun_final", "samekh", "ayin", "pe", "pe_final", + "tsadi", "tsadi_final", "qof", "resh", "shin", "tav" + ] + }, + "form": { + "type": "string", + "enum": ["regular", "final"] + }, + "style": { + "type": "string", + "enum": [ + "unknown", + "cursive_ashkenazi", + "cursive_sephardi", + "cursive_mizrahi", + "cursive_yemenite", + "block_ashkenazi", + "block_sephardi", + "block_modern", + "rashi", + "yiddish_handwriting", + "other" + ], + "description": "Per-image handwriting style. Optional. Downstream syngen and HTR consumers should filter on this when style consistency matters." + }, + "notes": { + "type": ["string", "null"] + } + } + }, + "upstream": { + "type": "object", + "description": "Reference back to the scan in HeOCR/public-domain-hand-written-hebrew-scans this crop was extracted from. 
The upstream repository URL is recorded once in scripts/release_recipe.json (`upstream_repo`); it is not duplicated on every entry.", + "required": ["source_id", "entry_id", "sha256", "commit", "bbox"], + "additionalProperties": false, + "properties": { + "source_id": { + "type": "string", + "pattern": "^[a-z0-9]+(?:__[a-z0-9_]+)+$" + }, + "entry_id": { + "type": "string", + "pattern": "^[a-z0-9]+(?:__[a-z0-9_]+)+__p[0-9]{4,6}$" + }, + "sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$", + "description": "SHA-256 of the upstream scan file at the time of extraction. The validator (with --upstream-path) re-checks this against the live upstream entry to catch upstream re-encodes." + }, + "commit": { + "type": "string", + "pattern": "^[a-f0-9]{40}$", + "description": "Immutable 40-char upstream commit SHA at which extraction was performed. Tags are mutable; never put a tag ref here. Use `release_tag` for human-readable release labels." + }, + "release_tag": { + "type": ["string", "null"], + "pattern": "^v[0-9]+\\.[0-9]+\\.[0-9]+(?:-[A-Za-z0-9.]+)?$", + "description": "Optional upstream release tag (e.g. 'v0.1.0-rc') corresponding to `commit`, recorded for human readability. The commit SHA is what NOTICE.md links to." + }, + "bbox": { + "type": "object", + "required": ["x", "y", "w", "h"], + "additionalProperties": false, + "properties": { + "x": {"type": "integer", "minimum": 0}, + "y": {"type": "integer", "minimum": 0}, + "w": {"type": "integer", "minimum": 1}, + "h": {"type": "integer", "minimum": 1} + } + } + } + }, + "image": { + "type": "object", + "required": [ + "local_path", + "sha256", + "mime_type", + "bytes", + "width_px", + "height_px", + "background" + ], + "additionalProperties": false, + "properties": { + "local_path": { + "type": "string", + "minLength": 1, + "description": "Repo-relative path under data/letters/." 
+ }, + "sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$" + }, + "mime_type": { + "type": "string", + "enum": ["image/png", "image/jpeg", "image/webp", "image/tiff"] + }, + "bytes": { + "type": "integer", + "minimum": 1 + }, + "width_px": { + "type": "integer", + "minimum": 1 + }, + "height_px": { + "type": "integer", + "minimum": 1 + }, + "background": { + "type": "string", + "enum": [ + "original", + "white", + "black", + "gray", + "binarized", + "transparent" + ], + "description": "Background of the crop. `original` preserves source pixels; named colors mean the background has been bleached to that color; `binarized` is a 1-bit map; `transparent` requires an alpha channel (PNG/WebP/TIFF only — enforced by the schema)." + } + } + }, + "extraction": { + "type": "object", + "required": [ + "tool", + "tool_version", + "method", + "extracted_at", + "extracted_by", + "notes" + ], + "additionalProperties": false, + "properties": { + "tool": { + "type": "string", + "minLength": 1, + "description": "Typically 'hletterscriptgen' or a sub-tool name." + }, + "tool_version": { + "type": "string", + "pattern": "^v?[0-9]+\\.[0-9]+\\.[0-9]+(?:-[A-Za-z0-9.-]+)?(?:\\+[A-Za-z0-9.-]+)?$", + "description": "SemVer or `git describe --tags` output. Accepts prerelease (-rc1), build metadata (+build.5), and git-describe distance/hash (-3-gabc1234)." + }, + "method": { + "type": "string", + "enum": ["manual", "auto", "mixed"] + }, + "extracted_at": { + "type": "string", + "format": "date-time" + }, + "extracted_by": { + "type": "string", + "minLength": 1 + }, + "notes": { + "type": ["string", "null"] + } + } + }, + "rights": { + "type": "object", + "description": "Inherited from the upstream scan entry; the crop is a derivative. 
The validator cross-checks `rights_basis` against `license_expression` via LICENSE_BASIS_MAP.", + "required": [ + "rights_basis", + "license_expression", + "commercial_use_allowed", + "derivatives_allowed", + "redistribution_allowed", + "attribution_required", + "attribution_text", + "attribution_url", + "verification_status", + "evidence_text", + "verified_at" + ], + "additionalProperties": false, + "properties": { + "rights_basis": { + "type": "string", + "enum": ["public_domain", "cc0", "cc_by", "cc_by_sa", "unknown"] + }, + "license_expression": { + "type": ["string", "null"] + }, + "commercial_use_allowed": { + "type": ["boolean", "null"] + }, + "derivatives_allowed": { + "type": ["boolean", "null"] + }, + "redistribution_allowed": { + "type": ["boolean", "null"] + }, + "attribution_required": { + "type": ["boolean", "null"] + }, + "attribution_text": { + "type": ["string", "null"], + "minLength": 1 + }, + "attribution_url": { + "type": ["string", "null"], + "format": "uri" + }, + "verification_status": { + "type": "string", + "enum": [ + "unverified", + "source_note_only", + "inherited_from_upstream", + "primary_page_checked", + "conflicting", + "rejected" + ] + }, + "evidence_text": { + "type": ["string", "null"] + }, + "verified_at": { + "type": ["string", "null"], + "format": "date" + } + } + }, + "quality": { + "type": "object", + "required": [ + "usable_for_htr", + "usable_for_syngen", + "legibility", + "exclusion_reasons" + ], + "additionalProperties": false, + "properties": { + "usable_for_htr": { + "type": ["boolean", "null"], + "description": "Suitable as ground truth for handwritten text recognition training." + }, + "usable_for_syngen": { + "type": ["boolean", "null"], + "description": "Suitable as a glyph for HeOCR/hocrsyngen synthetic-document generation." 
+ }, + "legibility": { + "type": "string", + "enum": ["high", "medium", "low", "unknown"] + }, + "exclusion_reasons": { + "type": "array", + "items": { + "type": "string" + } + }, + "notes": { + "type": ["string", "null"] + } + } + } + } +} diff --git a/schemas/writer.schema.json b/schemas/writer.schema.json new file mode 100644 index 0000000..abc6b3b --- /dev/null +++ b/schemas/writer.schema.json @@ -0,0 +1,163 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/HeOCR/hletterscript/schemas/writer.schema.json", + "title": "Handwritten Hebrew Letter-Image Writer", + "description": "One row per writer/scribe. Each writer defines a 'set' of per-letter image entries attributed to that person.", + "type": "object", + "required": [ + "writer_id", + "status", + "display_name", + "also_known_as", + "description", + "dates", + "languages_written", + "scripts_written", + "period", + "references", + "ingest" + ], + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "status": { + "enum": ["verified", "rejected"] + } + }, + "required": ["status"] + }, + "then": { + "properties": { + "references": { + "minItems": 1 + } + } + } + } + ], + "properties": { + "writer_id": { + "type": "string", + "pattern": "^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$", + "description": "Stable lowercase slug. On Latin-name collision (e.g. two writers named 'Yosef Haim'), disambiguate by appending the birth year: `yosef_haim_1834`. If the birth year is unknown, use death year; if both are unknown, use the start year of `period`. Document the disambiguation in `ingest.agent_notes`." 
+ }, + "status": { + "type": "string", + "enum": ["candidate", "verified", "rejected", "needs_review"] + }, + "display_name": { + "type": "string", + "minLength": 1 + }, + "also_known_as": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "description": { + "type": ["string", "null"] + }, + "dates": { + "type": "object", + "required": [ + "birth_year", + "birth_precision", + "death_year", + "death_precision" + ], + "additionalProperties": false, + "properties": { + "birth_year": { + "type": ["integer", "null"] + }, + "birth_precision": { + "type": "string", + "enum": ["exact", "circa", "decade", "unknown"] + }, + "death_year": { + "type": ["integer", "null"] + }, + "death_precision": { + "type": "string", + "enum": ["exact", "circa", "decade", "alive", "unknown"] + } + } + }, + "languages_written": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 2 + } + }, + "scripts_written": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "enum": ["Hebr", "Latn", "Arab", "Cyrl", "unknown"] + } + }, + "period": { + "type": "object", + "required": ["start", "end", "precision"], + "additionalProperties": false, + "properties": { + "start": { + "type": ["string", "null"] + }, + "end": { + "type": ["string", "null"] + }, + "precision": { + "type": "string", + "enum": ["day", "month", "year", "decade", "range", "circa", "unknown"] + } + } + }, + "references": { + "type": "array", + "description": "Biographical evidence. 
`candidate` and `needs_review` writers may have zero references; `verified` and `rejected` writers must have at least one (enforced by the conditional at the top of this schema).", + "items": { + "type": "object", + "required": ["kind", "citation"], + "additionalProperties": false, + "properties": { + "kind": { + "type": "string", + "enum": ["repo_note", "primary_url", "secondary_url", "authority_record", "agent_assessment"] + }, + "citation": { + "type": "string", + "minLength": 1 + }, + "quote": { + "type": ["string", "null"] + }, + "url": { + "type": ["string", "null"], + "format": "uri" + } + } + } + }, + "ingest": { + "type": "object", + "required": ["agent_notes", "blocked_reason"], + "additionalProperties": false, + "properties": { + "agent_notes": { + "type": ["string", "null"] + }, + "blocked_reason": { + "type": ["string", "null"] + } + } + } + } +} diff --git a/scripts/generate_release_artifacts.py b/scripts/generate_release_artifacts.py new file mode 100644 index 0000000..a168d3f --- /dev/null +++ b/scripts/generate_release_artifacts.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +"""Generate deterministic release artefacts from data/index/*.jsonl. + +Emits three files at the repo root: + + - NOTICE.md human-readable attribution roll-up. + - CITATION.cff Citation File Format 1.2.0. + - datapackage.json Frictionless Data Package manifest. + +The script is fully deterministic: same indexes + recipe in, +byte-identical files out. No datetime.now(), no random ordering, no +UUIDs. + +Two timestamps with deliberately different semantics: + + - `datapackage.json::released_at` = max(extraction.extracted_at) + across entries — the *corpus-state timestamp*. Bumps on every + ingest PR. When the corpus is empty it falls back to + `release_recipe.json::initial_release_date`. + + - `CITATION.cff::date-released` = `release_recipe.json::version_released_date` + — the date this `version` was released. Stable per release. 
Bumped + manually on `version` bump (see docs/release_process.md). This is + what citations should be reproducible against. + +Use `--check` to verify the on-disk artefacts match what would be +generated without touching the tree. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from pathlib import Path +from typing import Any + +try: + import yaml +except ImportError as exc: # pragma: no cover - exercised when deps are absent. + raise SystemExit( + "Missing dependency: PyYAML. Install development dependencies with " + "`python3 -m pip install -r requirements-dev.txt`." + ) from exc + + +REPO_ROOT = Path(__file__).resolve().parents[1] +WRITERS_PATH = REPO_ROOT / "data" / "index" / "writers.jsonl" +ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl" +RECIPE_PATH = REPO_ROOT / "scripts" / "release_recipe.json" +NOTICE_PATH = REPO_ROOT / "NOTICE.md" +CITATION_PATH = REPO_ROOT / "CITATION.cff" +DATAPACKAGE_PATH = REPO_ROOT / "datapackage.json" + +# Licenses whose terms require attribution. Drives both NOTICE.md +# inclusion and the consistency check below. Keep in sync with +# scripts/validate_indexes.py::LICENSE_BASIS_MAP and the inheritance +# table in docs/dataset_structure.md. 
+ATTRIBUTION_REQUIRING_LICENSES: frozenset[str] = frozenset({ + "CC-BY-4.0", + "CC-BY-SA-4.0", +}) + + +def _load_jsonl(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + raise SystemExit(f"{path}: file does not exist") + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + try: + rows.append(json.loads(stripped)) + except json.JSONDecodeError as exc: + raise SystemExit(f"{path}:{line_number}: invalid JSON: {exc}") from exc + return rows + + +def _load_recipe(path: Path) -> dict[str, Any]: + if not path.exists(): + raise SystemExit(f"{path}: file does not exist") + try: + return json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SystemExit(f"{path}: invalid JSON: {exc}") from exc + + +def _derive_released_at(entries: list[dict[str, Any]], recipe: dict[str, Any]) -> str: + """Corpus-state timestamp for datapackage.json. Bumps every ingest.""" + extracted = [ + entry["extraction"]["extracted_at"] + for entry in entries + if entry.get("extraction") and entry["extraction"].get("extracted_at") + ] + if extracted: + return max(extracted) + # Empty corpus: initial-setup state. Fall back to the recipe's + # `initial_release_date` so generation is deterministic. + initial = recipe.get("initial_release_date") + if not isinstance(initial, str) or not initial: + raise SystemExit( + "no extraction.extracted_at values found in entries.jsonl, and " + "release_recipe.json has no initial_release_date fallback" + ) + return initial + + +def _resolve_citation_date(recipe: dict[str, Any]) -> str: + """Citation date for CITATION.cff. Stable per version.""" + date = recipe.get("version_released_date") + if not isinstance(date, str) or not date: + raise SystemExit( + "release_recipe.json::version_released_date is missing; this is " + "the stable per-version release date used by CITATION.cff. 
Set it " + "when you bump `version` (see docs/release_process.md)." + ) + return date + + +def _license_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]: + counts = Counter(entry["rights"]["license_expression"] for entry in entries) + return {key: counts[key] for key in sorted(counts, key=lambda k: (k is None, k))} + + +def _writer_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]: + counts = Counter(entry["writer_id"] for entry in entries) + return {key: counts[key] for key in sorted(counts)} + + +def _letter_breakdown(entries: list[dict[str, Any]]) -> dict[str, int]: + counts = Counter(entry["letter"]["name"] for entry in entries) + return {key: counts[key] for key in sorted(counts)} + + +def _image_byte_count(entries: list[dict[str, Any]]) -> int: + total = 0 + for entry in entries: + byte_size = entry["image"].get("bytes") + if isinstance(byte_size, int): + total += byte_size + return total + + +def _check_attribution_consistency(entries: list[dict[str, Any]]) -> None: + # Any entry whose license demands attribution must carry the flag, + # text, and url. The schema enforces text+url *given* the flag; this + # layer catches the prior failure mode of "license is CC-BY-SA but + # ingester forgot the flag", which would silently drop the entry + # from NOTICE.md. 
+ for entry in entries: + rights = entry["rights"] + license_expr = rights.get("license_expression") + if license_expr in ATTRIBUTION_REQUIRING_LICENSES: + if rights.get("attribution_required") is not True: + raise SystemExit( + f"{entry['entry_id']}: license {license_expr} requires " + f"rights.attribution_required: true (found " + f"{rights.get('attribution_required')!r})" + ) + for field in ("attribution_text", "attribution_url"): + value = rights.get(field) + if not isinstance(value, str) or not value.strip(): + raise SystemExit( + f"{entry['entry_id']}: license {license_expr} requires " + f"rights.{field}, but it is null, blank, or " + f"whitespace-only" + ) + + +def _attribution_entries(entries: list[dict[str, Any]]) -> list[dict[str, Any]]: + selected = [ + entry + for entry in entries + if entry["rights"].get("license_expression") in ATTRIBUTION_REQUIRING_LICENSES + ] + return sorted(selected, key=lambda entry: entry["entry_id"]) + + +def _notice_stanza( + entry: dict[str, Any], + recipe: dict[str, Any], + upstream_repo_url: str, +) -> str: + license_names: dict[str, str] = recipe["license_names"] + license_urls: dict[str, str] = recipe["license_urls"] + rights = entry["rights"] + license_expr = rights["license_expression"] + license_name = license_names.get(license_expr, license_expr) + license_url = license_urls.get(license_expr) + + if license_url: + license_line = f"- License: [{license_name} ({license_expr})]({license_url})" + else: + license_line = f"- License: {license_name} ({license_expr})" + + letter = entry["letter"] + title = ( + f"{letter['unicode_char']} ({letter['name']}, {letter['form']}) " + f"by writer `{entry['writer_id']}`" + ) + + upstream = entry["upstream"] + # `upstream.commit` is enforced as a 40-char SHA by the schema; this + # URL form always resolves on GitHub. Never use `release_tag` here — + # tags are mutable and the link must outlive tag re-pointing. 
NOTICE_TEMPLATE = """\
# NOTICE

This file is generated by `scripts/generate_release_artifacts.py` from \
`data/index/entries.jsonl`. Do not edit by hand.

Repository-authored metadata is dedicated to the public domain under \
CC0 1.0 Universal. See [`LICENSE`](LICENSE) and [`LICENSE.md`](LICENSE.md) \
for the full compound-licensing policy.

Per-letter image crops are derivatives of upstream scans in \
[HeOCR/public-domain-hand-written-hebrew-scans]({upstream_repo_url}) and \
carry per-entry rights inherited from the source page. The entries \
listed below carry a license that requires attribution (currently \
{license_set}). Anyone redistributing or reusing these crops must keep \
the listed credit and link to the source page on which the rights claim \
was verified.

- Corpus release: `{version}`
- Released at (corpus state): `{released_at}`

## Attribution-required entries

{stanzas}

## Full per-entry rights

Every entry, attribution-required or not, ships with its rights record in \
[`data/index/entries.jsonl`](data/index/entries.jsonl). Consumers that \
need machine-readable rights metadata should read that file directly; the \
manifest at [`datapackage.json`](datapackage.json) summarises the license \
breakdown.
"""


def build_notice(
    entries: list[dict[str, Any]],
    recipe: dict[str, Any],
    released_at: str,
    upstream_repo_url: str,
) -> str:
    """Render NOTICE.md from the attribution-requiring entries.

    One stanza is emitted per entry returned by ``_attribution_entries``,
    separated by blank lines; when there are none, a fixed placeholder
    sentence takes their place. The result is ``NOTICE_TEMPLATE`` filled
    with the recipe version, the corpus timestamp, and the upstream URL.
    """
    credited = _attribution_entries(entries)
    rendered = [_notice_stanza(item, recipe, upstream_repo_url) for item in credited]
    if rendered:
        body = "\n\n".join(rendered)
    else:
        body = "_No entries in this release require attribution._"

    return NOTICE_TEMPLATE.format(
        upstream_repo_url=upstream_repo_url,
        license_set=", ".join(sorted(ATTRIBUTION_REQUIRING_LICENSES)),
        version=recipe["version"],
        released_at=released_at,
        stanzas=body,
    )
+ ) + + document: dict[str, Any] = { + "cff-version": "1.2.0", + "message": "Please cite this dataset using the metadata below.", + "type": "dataset", + "title": recipe["title"], + "abstract": abstract, + "authors": [{"name": author["name"]} for author in recipe["authors"]], + "version": recipe["version"], + "date-released": citation_date, + "repository-code": recipe["repository_code"], + "url": recipe["homepage"], + "license": recipe["metadata_license"]["spdx"], + "keywords": sorted(recipe["keywords"]), + } + identifiers = recipe.get("citation_identifiers") or [] + if identifiers: + document["identifiers"] = identifiers + + header = "# Generated by scripts/generate_release_artifacts.py. Do not edit by hand.\n" + body = yaml.safe_dump( + document, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + width=10_000, + ) + return header + body + + +def build_datapackage( + entries: list[dict[str, Any]], + writers: list[dict[str, Any]], + recipe: dict[str, Any], + released_at: str, + citation_date: str, + entries_path: Path, + writers_path: Path, +) -> dict[str, Any]: + license_names: dict[str, str] = recipe["license_names"] + license_urls: dict[str, str] = recipe["license_urls"] + license_counts = _license_breakdown(entries) + writer_status_counts = Counter(writer.get("status") for writer in writers) + writer_status_breakdown = { + key: writer_status_counts[key] + for key in sorted(writer_status_counts) + if key is not None + } + + license_listings: list[dict[str, Any]] = [] + license_listings.append({ + "name": recipe["metadata_license"]["spdx"], + "path": recipe["metadata_license"]["url"], + "title": license_names.get( + recipe["metadata_license"]["spdx"], recipe["metadata_license"]["spdx"] + ), + "scope": "metadata", + }) + for license_id in sorted(k for k in license_counts if k is not None): + listing: dict[str, Any] = { + "name": license_id, + "title": license_names.get(license_id, license_id), + "scope": "images", + } + url = 
license_urls.get(license_id) + if url: + listing["path"] = url + license_listings.append(listing) + + resource_path_for: dict[str, Path] = { + "entries": entries_path, + "writers": writers_path, + } + resource_records_for: dict[str, int] = { + "entries": len(entries), + "writers": len(writers), + } + + resources: list[dict[str, Any]] = [] + for name in sorted(recipe["resources"]): + spec = recipe["resources"][name] + # Note: no `schema` field. Frictionless reserves + # `resources[].schema` for Table Schema (column definitions), but + # our data is nested JSON validated against JSON Schema. We + # expose the JSON Schema URLs via the top-level `schemas` block + # as a custom extension instead. + resources.append({ + "name": name, + "path": spec["path"], + "profile": "data-resource", + "format": spec["format"], + "mediatype": spec["mediatype"], + "encoding": spec["encoding"], + "description": spec["description"], + "record_count": resource_records_for[name], + "bytes": resource_path_for[name].stat().st_size, + }) + + return { + "profile": "data-package", + "name": recipe["name"], + "title": recipe["title"], + "description": recipe["description"], + "version": recipe["version"], + "version_released_date": citation_date, + "released_at": released_at, + "homepage": recipe["homepage"], + "upstream_repo": recipe["upstream_repo"], + "keywords": sorted(recipe["keywords"]), + "contributors": [ + {"title": author["name"], "role": author.get("role", "author")} + for author in recipe["authors"] + ], + "licenses": license_listings, + "schemas": { + "writer": recipe["schema_urls"]["writer"], + "entry": recipe["schema_urls"]["entry"], + }, + "stats": { + "record_count": len(entries), + "entry_writer_count": len({entry["writer_id"] for entry in entries}), + "writer_record_count": len(writers), + "writer_status_breakdown": writer_status_breakdown, + "image_byte_count": _image_byte_count(entries), + "attribution_required_count": len(_attribution_entries(entries)), + 
"license_breakdown": license_counts, + "letter_breakdown": _letter_breakdown(entries), + "writer_breakdown": _writer_breakdown(entries), + }, + "resources": resources, + } + + +def _serialise_text(text: str) -> str: + return text if text.endswith("\n") else text + "\n" + + +def _serialise_json(data: dict[str, Any]) -> str: + return json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True) + "\n" + + +def _require_recipe_fields(recipe: dict[str, Any]) -> None: + required = [ + "name", "title", "version", "version_released_date", + "description", "homepage", "repository_code", "upstream_repo", + "authors", "keywords", "metadata_license", + "license_urls", "license_names", "schema_urls", "resources", + ] + missing = [field for field in required if field not in recipe] + if missing: + raise SystemExit( + f"release_recipe.json missing required field(s): {', '.join(missing)}" + ) + + +def _render( + writers_path: Path, + entries_path: Path, + recipe_path: Path, +) -> dict[str, str]: + writers = _load_jsonl(writers_path) + entries = _load_jsonl(entries_path) + recipe = _load_recipe(recipe_path) + _require_recipe_fields(recipe) + _check_attribution_consistency(entries) + released_at = _derive_released_at(entries, recipe) + citation_date = _resolve_citation_date(recipe) + upstream_repo_url = recipe["upstream_repo"] + + return { + "notice": _serialise_text( + build_notice(entries, recipe, released_at, upstream_repo_url) + ), + "citation": _serialise_text( + build_citation(entries, writers, recipe, citation_date) + ), + "datapackage": _serialise_json( + build_datapackage( + entries, writers, recipe, released_at, citation_date, + entries_path=entries_path, writers_path=writers_path, + ) + ), + } + + +def generate( + writers_path: Path = WRITERS_PATH, + entries_path: Path = ENTRIES_PATH, + recipe_path: Path = RECIPE_PATH, + notice_path: Path = NOTICE_PATH, + citation_path: Path = CITATION_PATH, + datapackage_path: Path = DATAPACKAGE_PATH, +) -> dict[str, Path]: + rendered 
def check(
    writers_path: Path = WRITERS_PATH,
    entries_path: Path = ENTRIES_PATH,
    recipe_path: Path = RECIPE_PATH,
    notice_path: Path = NOTICE_PATH,
    citation_path: Path = CITATION_PATH,
    datapackage_path: Path = DATAPACKAGE_PATH,
) -> list[Path]:
    """Return the artefact paths whose on-disk text differs from a fresh render.

    A missing file is treated as empty text, so it is always reported.
    Paths come back in notice, citation, datapackage order; nothing is
    written.
    """
    expected = _render(writers_path, entries_path, recipe_path)
    targets = {
        "notice": notice_path,
        "citation": citation_path,
        "datapackage": datapackage_path,
    }

    def _on_disk(target: Path) -> str:
        if not target.exists():
            return ""
        return target.read_text(encoding="utf-8")

    return [
        target
        for kind, target in targets.items()
        if _on_disk(target) != expected[kind]
    ]
Exit 1 if not.", + ) + args = parser.parse_args() + + if args.check: + stale = check( + writers_path=args.writers, + entries_path=args.entries, + recipe_path=args.recipe, + notice_path=args.notice, + citation_path=args.citation, + datapackage_path=args.datapackage, + ) + if stale: + for path in stale: + try: + display = path.relative_to(REPO_ROOT) + except ValueError: + display = path + print(f"stale: {display}", file=sys.stderr) + print( + "Run `python3 scripts/generate_release_artifacts.py` to regenerate.", + file=sys.stderr, + ) + raise SystemExit(1) + print("ok: release artefacts are up to date") + return + + written = generate( + writers_path=args.writers, + entries_path=args.entries, + recipe_path=args.recipe, + notice_path=args.notice, + citation_path=args.citation, + datapackage_path=args.datapackage, + ) + for label, path in written.items(): + try: + display = path.relative_to(REPO_ROOT) + except ValueError: + display = path + print(f"wrote {label}: {display}") + + +if __name__ == "__main__": + main() diff --git a/scripts/release_recipe.json b/scripts/release_recipe.json new file mode 100644 index 0000000..6cf2eb1 --- /dev/null +++ b/scripts/release_recipe.json @@ -0,0 +1,63 @@ +{ + "name": "hletterscript", + "title": "Hebrew Handwritten Per-Letter Image Dataset", + "version": "0.0.0-rc", + "version_released_date": "2026-05-12", + "initial_release_date": "2026-05-12T00:00:00Z", + "description": "Per-letter image crops of handwritten Hebrew letters, grouped into sets by writer. Each crop is a derivative of a permissively-licensed upstream scan in HeOCR/public-domain-hand-written-hebrew-scans, with per-image rights inherited and attribution recorded. 
The index is line-oriented JSON (JSONL).", + "homepage": "https://github.com/HeOCR/hletterscript", + "repository_code": "https://github.com/HeOCR/hletterscript", + "upstream_repo": "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans", + "authors": [ + {"name": "Shay Palachy-Affek", "role": "maintainer"} + ], + "keywords": [ + "Hebrew", + "dataset", + "handwriting", + "handwritten-text-recognition", + "letters", + "glyphs", + "synthetic-generation", + "public-domain" + ], + "metadata_license": { + "spdx": "CC0-1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "license_urls": { + "PDM-1.0": "https://creativecommons.org/publicdomain/mark/1.0/", + "CC0-1.0": "https://creativecommons.org/publicdomain/zero/1.0/", + "CC-BY-4.0": "https://creativecommons.org/licenses/by/4.0/", + "CC-BY-SA-4.0": "https://creativecommons.org/licenses/by-sa/4.0/" + }, + "license_names": { + "PDM-1.0": "Public Domain Mark 1.0", + "CC0-1.0": "Creative Commons Zero v1.0 Universal", + "CC-BY-4.0": "Creative Commons Attribution 4.0 International", + "CC-BY-SA-4.0": "Creative Commons Attribution-ShareAlike 4.0 International", + "LicenseRef-Public-Domain-Israel": "Public Domain (Israel; life + 70)", + "LicenseRef-Public-Domain-Ukraine": "Public Domain (Ukraine; life + 70)" + }, + "schema_urls": { + "writer": "https://github.com/HeOCR/hletterscript/blob/main/schemas/writer.schema.json", + "entry": "https://github.com/HeOCR/hletterscript/blob/main/schemas/entry.schema.json" + }, + "citation_identifiers": [], + "resources": { + "entries": { + "path": "data/index/entries.jsonl", + "format": "jsonl", + "mediatype": "application/x-ndjson", + "encoding": "utf-8", + "description": "Per-letter image index. One JSON object per cropped letter image, with upstream provenance, extraction provenance, file checksums, and inherited rights." 
+ }, + "writers": { + "path": "data/index/writers.jsonl", + "format": "jsonl", + "mediatype": "application/x-ndjson", + "encoding": "utf-8", + "description": "Writer-level catalog. One JSON object per writer; each writer defines a 'set' of letter images." + } + } +} diff --git a/scripts/validate_indexes.py b/scripts/validate_indexes.py new file mode 100644 index 0000000..ef001d6 --- /dev/null +++ b/scripts/validate_indexes.py @@ -0,0 +1,532 @@ +#!/usr/bin/env python3 +"""Validate the JSONL dataset indexes against their JSON Schemas. + +Validates writers.jsonl + entries.jsonl, enforces referential integrity +between them, checks Hebrew-letter codepoint/name/form consistency, +cross-checks `rights_basis` against `license_expression`, and +re-verifies image file checksums and sizes on disk. With +`--upstream-path` it also cross-checks each entry's +`upstream.sha256` and `upstream.bbox` against the live upstream +dataset. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from pathlib import Path +from typing import Any + +try: + from jsonschema import Draft202012Validator, FormatChecker + from jsonschema.exceptions import SchemaError +except ImportError as exc: # pragma: no cover - exercised when deps are absent. + raise SystemExit( + "Missing dependency: jsonschema. Install development dependencies with " + "`python3 -m pip install -r requirements-dev.txt`." + ) from exc + + +REPO_ROOT = Path(__file__).resolve().parents[1] +WRITERS_PATH = REPO_ROOT / "data" / "index" / "writers.jsonl" +ENTRIES_PATH = REPO_ROOT / "data" / "index" / "entries.jsonl" +WRITER_SCHEMA_PATH = REPO_ROOT / "schemas" / "writer.schema.json" +ENTRY_SCHEMA_PATH = REPO_ROOT / "schemas" / "entry.schema.json" + +# Canonical Hebrew letter table. Mirrors docs/letters.md and the +# `letter.name` enum in schemas/entry.schema.json. The validator uses +# this table to enforce cross-field consistency on the `letter` block. 
# Each row is (codepoint, unicode_char, name, form). The five final forms
# have their own rows with form "final"; every other letter is "regular".
LETTER_TABLE: list[tuple[str, str, str, str]] = [
    ("U+05D0", "א", "alef", "regular"),
    ("U+05D1", "ב", "bet", "regular"),
    ("U+05D2", "ג", "gimel", "regular"),
    ("U+05D3", "ד", "dalet", "regular"),
    ("U+05D4", "ה", "he", "regular"),
    ("U+05D5", "ו", "vav", "regular"),
    ("U+05D6", "ז", "zayin", "regular"),
    ("U+05D7", "ח", "het", "regular"),
    ("U+05D8", "ט", "tet", "regular"),
    ("U+05D9", "י", "yod", "regular"),
    ("U+05DA", "ך", "kaf_final", "final"),
    ("U+05DB", "כ", "kaf", "regular"),
    ("U+05DC", "ל", "lamed", "regular"),
    ("U+05DD", "ם", "mem_final", "final"),
    ("U+05DE", "מ", "mem", "regular"),
    ("U+05DF", "ן", "nun_final", "final"),
    ("U+05E0", "נ", "nun", "regular"),
    ("U+05E1", "ס", "samekh", "regular"),
    ("U+05E2", "ע", "ayin", "regular"),
    ("U+05E3", "ף", "pe_final", "final"),
    ("U+05E4", "פ", "pe", "regular"),
    ("U+05E5", "ץ", "tsadi_final", "final"),
    ("U+05E6", "צ", "tsadi", "regular"),
    ("U+05E7", "ק", "qof", "regular"),
    ("U+05E8", "ר", "resh", "regular"),
    ("U+05E9", "ש", "shin", "regular"),
    ("U+05EA", "ת", "tav", "regular"),
]
# Lookup from letter name (row[2]) to its full LETTER_TABLE row.
LETTER_BY_NAME: dict[str, tuple[str, str, str, str]] = {row[2]: row for row in LETTER_TABLE}

# Permitted file extensions per `image.mime_type`. The first entry is
# the preferred extension; subsequent ones are accepted aliases.
MIME_EXTENSIONS: dict[str, tuple[str, ...]] = {
    "image/png": (".png",),
    "image/jpeg": (".jpg", ".jpeg"),
    "image/webp": (".webp",),
    "image/tiff": (".tif", ".tiff"),
}

# Canonical map from `license_expression` to `rights_basis`. The
# validator hard-fails if an entry's pair doesn't match this map. Adding
# a new accepted license means adding it here AND to AGENTS.md AND to
# scripts/release_recipe.json::license_names + license_urls.
+LICENSE_BASIS_MAP: dict[str, str] = { + "CC0-1.0": "cc0", + "PDM-1.0": "public_domain", + "CC-BY-4.0": "cc_by", + "CC-BY-SA-4.0": "cc_by_sa", + "LicenseRef-Public-Domain-Israel": "public_domain", + "LicenseRef-Public-Domain-Ukraine": "public_domain", +} + + +def _err(file: Path | None, line: int | None, row_id: str | None, message: str) -> str: + """Build a uniform error string: :: : .""" + head = "" + if file is not None: + head = str(file) + if line is not None: + head = f"{head}:{line}" + head = f"{head}: " + if row_id: + head = f"{head}{row_id}: " + return f"{head}{message}" + + +def load_schema(path: Path) -> dict[str, Any]: + if not path.exists(): + raise SystemExit(_err(path, None, None, "file does not exist")) + try: + schema = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SystemExit(_err(path, None, None, f"invalid JSON schema: {exc}")) from exc + try: + Draft202012Validator.check_schema(schema) + except SchemaError as exc: + raise SystemExit(_err(path, None, None, f"invalid JSON schema: {exc.message}")) from exc + return schema + + +def load_jsonl( + path: Path, + validator: Draft202012Validator, + id_key: str, +) -> list[dict[str, Any]]: + if not path.exists(): + raise SystemExit(_err(path, None, None, "file does not exist")) + + rows: list[dict[str, Any]] = [] + seen: set[str] = set() + with path.open("r", encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + try: + row = json.loads(stripped) + except json.JSONDecodeError as exc: + raise SystemExit(_err(path, line_number, None, f"invalid JSON: {exc}")) from exc + if not isinstance(row, dict): + raise SystemExit(_err(path, line_number, None, "row must be a JSON object")) + + row_id_candidate = row.get(id_key) if isinstance(row.get(id_key), str) else None + + errors = sorted(validator.iter_errors(row), key=lambda error: list(error.path)) + if errors: + first = errors[0] + 
location = ".".join(str(part) for part in first.path) or "" + raise SystemExit( + _err(path, line_number, row_id_candidate, f"{location}: {first.message}") + ) + + row_id = row.get(id_key) + if not isinstance(row_id, str) or not row_id: + raise SystemExit( + _err(path, line_number, None, f"{id_key} must be a non-empty string") + ) + if row_id in seen: + raise SystemExit( + _err(path, line_number, row_id, f"duplicate {id_key}") + ) + seen.add(row_id) + rows.append(row) + return rows + + +def _check_letter_consistency( + entries_path: Path, line: int, entry_id: str, letter: dict[str, Any] +) -> None: + name = letter["name"] + canonical = LETTER_BY_NAME.get(name) + if canonical is None: + # The schema's enum should have caught this already; defensive. + raise SystemExit(_err(entries_path, line, entry_id, f"unknown letter.name: {name}")) + expected_codepoint, expected_char, _, expected_form = canonical + if letter["codepoint"] != expected_codepoint: + raise SystemExit(_err( + entries_path, line, entry_id, + f"letter.codepoint mismatch for {name}: " + f"expected {expected_codepoint}, got {letter['codepoint']}", + )) + if letter["unicode_char"] != expected_char: + raise SystemExit(_err( + entries_path, line, entry_id, + f"letter.unicode_char mismatch for {name}: " + f"expected {expected_char!r}, got {letter['unicode_char']!r}", + )) + if letter["form"] != expected_form: + raise SystemExit(_err( + entries_path, line, entry_id, + f"letter.form mismatch for {name}: " + f"expected {expected_form}, got {letter['form']}", + )) + + +def _check_upstream_shape( + entries_path: Path, line: int, entry_id: str, upstream: dict[str, Any] +) -> None: + upstream_entry_id = upstream["entry_id"] + upstream_source_id = upstream["source_id"] + if not upstream_entry_id.startswith(f"{upstream_source_id}__p"): + raise SystemExit(_err( + entries_path, line, entry_id, + f"upstream.entry_id ({upstream_entry_id}) must start with " + f"upstream.source_id ({upstream_source_id}) plus '__p'", + )) + + 
+def _check_local_path( + entries_path: Path, + line: int, + entry_id: str, + writer_id: str, + letter_name: str, + image: dict[str, Any], +) -> None: + local_path = image["local_path"] + local_path_obj = Path(local_path) + if local_path_obj.is_absolute() or ".." in local_path_obj.parts: + raise SystemExit(_err( + entries_path, line, entry_id, + f"image.local_path must be repo-relative without '..': {local_path}", + )) + + expected_prefix = f"data/letters/{writer_id}/{letter_name}/" + if not local_path.startswith(expected_prefix): + raise SystemExit(_err( + entries_path, line, entry_id, + f"image.local_path must start with {expected_prefix!r}, " + f"got {local_path!r}", + )) + + suffix = local_path_obj.suffix.lower() + expected_exts = MIME_EXTENSIONS.get(image["mime_type"], ()) + if suffix not in expected_exts: + raise SystemExit(_err( + entries_path, line, entry_id, + f"image.local_path extension {suffix!r} does not match " + f"image.mime_type {image['mime_type']!r} (allowed: {list(expected_exts)})", + )) + + expected_stem = f"data/letters/{writer_id}/{letter_name}/{entry_id}" + actual_stem = str(local_path_obj.with_suffix("")) + if actual_stem != expected_stem: + raise SystemExit(_err( + entries_path, line, entry_id, + f"image.local_path stem must equal {expected_stem!r}, " + f"got {actual_stem!r}", + )) + + +def _check_attribution_fields( + entries_path: Path, line: int, entry_id: str, rights: dict[str, Any] +) -> None: + if rights.get("attribution_required") is not True: + return + attribution_text = rights.get("attribution_text") + if not isinstance(attribution_text, str) or not attribution_text.strip(): + raise SystemExit(_err( + entries_path, line, entry_id, + "rights.attribution_required is true but " + "rights.attribution_text is null, blank, or whitespace-only", + )) + attribution_url = rights.get("attribution_url") + if not isinstance(attribution_url, str) or not attribution_url.strip(): + raise SystemExit(_err( + entries_path, line, entry_id, + 
"rights.attribution_required is true but " + "rights.attribution_url is null, blank, or whitespace-only", + )) + + +def _check_rights_basis_matches_license( + entries_path: Path, line: int, entry_id: str, rights: dict[str, Any] +) -> None: + license_expression = rights.get("license_expression") + rights_basis = rights.get("rights_basis") + if license_expression is None: + # license_expression is allowed to be null only when rights_basis + # is `unknown`; any other null is a denormalization that will + # produce broken release artefacts. + if rights_basis != "unknown": + raise SystemExit(_err( + entries_path, line, entry_id, + f"rights.license_expression is null but rights.rights_basis " + f"is {rights_basis!r} (expected 'unknown')", + )) + return + expected_basis = LICENSE_BASIS_MAP.get(license_expression) + if expected_basis is None: + raise SystemExit(_err( + entries_path, line, entry_id, + f"rights.license_expression {license_expression!r} is not in " + f"the accepted-license map (LICENSE_BASIS_MAP). 
Update both this " + f"validator and AGENTS.md if a new license is being added.", + )) + if rights_basis != expected_basis: + raise SystemExit(_err( + entries_path, line, entry_id, + f"rights.rights_basis ({rights_basis!r}) does not match " + f"rights.license_expression ({license_expression!r}); expected " + f"rights_basis = {expected_basis!r}", + )) + + +def validate_entries( + entries: list[dict[str, Any]], + writer_ids: set[str], + entries_path: Path, +) -> None: + seen_entry_ids: set[str] = set() + for line, entry in enumerate(entries, start=1): + entry_id = entry["entry_id"] + writer_id = entry["writer_id"] + letter = entry["letter"] + + if writer_id not in writer_ids: + raise SystemExit(_err( + entries_path, line, entry_id, + f"unknown writer_id: {writer_id}", + )) + + expected_prefix = f"{writer_id}__{letter['name']}__v" + if not entry_id.startswith(expected_prefix): + raise SystemExit(_err( + entries_path, line, entry_id, + f"entry_id must start with {expected_prefix!r}", + )) + + if entry_id in seen_entry_ids: + raise SystemExit(_err( + entries_path, line, entry_id, "duplicate entry_id" + )) + seen_entry_ids.add(entry_id) + + _check_letter_consistency(entries_path, line, entry_id, letter) + _check_upstream_shape(entries_path, line, entry_id, entry["upstream"]) + _check_local_path( + entries_path, line, entry_id, writer_id, letter["name"], entry["image"] + ) + _check_attribution_fields(entries_path, line, entry_id, entry["rights"]) + _check_rights_basis_matches_license( + entries_path, line, entry_id, entry["rights"] + ) + + +def _sha256_file(path: Path) -> str: + with path.open("rb") as handle: + return hashlib.file_digest(handle, "sha256").hexdigest() + + +def validate_entry_files( + entries: list[dict[str, Any]], + repo_root: Path, + entries_path: Path, +) -> int: + verified = 0 + for line, entry in enumerate(entries, start=1): + entry_id = entry["entry_id"] + image = entry["image"] + local_path = image["local_path"] + absolute = repo_root / local_path + 
if not absolute.is_file(): + raise SystemExit(_err( + entries_path, line, entry_id, + f"file does not exist: {local_path}", + )) + + actual_bytes = absolute.stat().st_size + if actual_bytes != image["bytes"]: + raise SystemExit(_err( + entries_path, line, entry_id, + f"byte size mismatch for {local_path}: " + f"expected {image['bytes']}, got {actual_bytes}", + )) + + actual_sha = _sha256_file(absolute) + if actual_sha != image["sha256"]: + raise SystemExit(_err( + entries_path, line, entry_id, + f"sha256 mismatch for {local_path}: " + f"expected {image['sha256']}, got {actual_sha}", + )) + verified += 1 + return verified + + +def _load_upstream_entries(upstream_root: Path) -> dict[str, dict[str, Any]]: + upstream_entries_path = upstream_root / "data" / "index" / "entries.jsonl" + if not upstream_entries_path.is_file(): + raise SystemExit(_err( + upstream_entries_path, None, None, + "upstream entries.jsonl not found; --upstream-path must point at a clone of " + "public-domain-hand-written-hebrew-scans", + )) + by_id: dict[str, dict[str, Any]] = {} + with upstream_entries_path.open("r", encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + try: + row = json.loads(stripped) + except json.JSONDecodeError as exc: + raise SystemExit(_err( + upstream_entries_path, line_number, None, + f"invalid JSON in upstream entries: {exc}", + )) from exc + row_id = row.get("entry_id") + if isinstance(row_id, str): + by_id[row_id] = row + return by_id + + +def validate_against_upstream( + entries: list[dict[str, Any]], + upstream_root: Path, + entries_path: Path, +) -> int: + """Cross-check `upstream.sha256` and `upstream.bbox` against the live + upstream dataset. Returns the number of entries cross-checked. Called + only when --upstream-path is set; CI does set it. 
+ """ + upstream_by_id = _load_upstream_entries(upstream_root) + cross_checked = 0 + for line, entry in enumerate(entries, start=1): + entry_id = entry["entry_id"] + upstream = entry["upstream"] + upstream_entry_id = upstream["entry_id"] + ref = upstream_by_id.get(upstream_entry_id) + if ref is None: + raise SystemExit(_err( + entries_path, line, entry_id, + f"upstream.entry_id {upstream_entry_id!r} not found in " + f"{upstream_root}/data/index/entries.jsonl", + )) + + # Find the upstream file record whose sha256 matches. + files = ref.get("files") or [] + upstream_sha = upstream["sha256"] + matching = next( + (f for f in files if f.get("sha256") == upstream_sha), None + ) + if matching is None: + recorded_shas = [f.get("sha256") for f in files if f.get("sha256")] + raise SystemExit(_err( + entries_path, line, entry_id, + f"upstream.sha256 {upstream_sha!r} does not match any file in " + f"upstream entry {upstream_entry_id!r}; upstream recorded " + f"{recorded_shas}", + )) + + width = matching.get("width_px") + height = matching.get("height_px") + bbox = upstream["bbox"] + if isinstance(width, int): + if bbox["x"] + bbox["w"] > width: + raise SystemExit(_err( + entries_path, line, entry_id, + f"upstream.bbox extends beyond upstream scan width: " + f"x+w = {bbox['x'] + bbox['w']} > width_px = {width}", + )) + if isinstance(height, int): + if bbox["y"] + bbox["h"] > height: + raise SystemExit(_err( + entries_path, line, entry_id, + f"upstream.bbox extends beyond upstream scan height: " + f"y+h = {bbox['y'] + bbox['h']} > height_px = {height}", + )) + cross_checked += 1 + return cross_checked + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--writers", type=Path, default=WRITERS_PATH) + parser.add_argument("--entries", type=Path, default=ENTRIES_PATH) + parser.add_argument("--writer-schema", type=Path, default=WRITER_SCHEMA_PATH) + parser.add_argument("--entry-schema", type=Path, default=ENTRY_SCHEMA_PATH) + parser.add_argument( + 
"--repo-root", + type=Path, + default=REPO_ROOT, + help=( + "Repo root used to resolve image.local_path during file-integrity " + "checks. Defaults to this repository. Mainly intended for tests " + "that need to validate fixture corpora outside the real tree." + ), + ) + parser.add_argument( + "--upstream-path", + type=Path, + default=None, + help=( + "Path to a local clone of HeOCR/public-domain-hand-written-hebrew-scans. " + "When set, the validator additionally cross-checks each entry's " + "upstream.sha256 against the upstream file record and verifies " + "upstream.bbox fits inside the upstream scan dimensions." + ), + ) + args = parser.parse_args() + + writer_validator = Draft202012Validator( + load_schema(args.writer_schema), format_checker=FormatChecker() + ) + entry_validator = Draft202012Validator( + load_schema(args.entry_schema), format_checker=FormatChecker() + ) + + writers = load_jsonl(args.writers, writer_validator, "writer_id") + entries = load_jsonl(args.entries, entry_validator, "entry_id") + validate_entries( + entries, {writer["writer_id"] for writer in writers}, args.entries + ) + verified = validate_entry_files(entries, args.repo_root, args.entries) + + if args.upstream_path is not None: + cross_checked = validate_against_upstream(entries, args.upstream_path, args.entries) + print( + f"ok: {len(writers)} writers, {len(entries)} entries, " + f"{verified} files verified, {cross_checked} upstream-cross-checked" + ) + else: + print( + f"ok: {len(writers)} writers, {len(entries)} entries, " + f"{verified} files verified" + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_generate_release_artifacts.py b/tests/test_generate_release_artifacts.py new file mode 100644 index 0000000..1fb2e3d --- /dev/null +++ b/tests/test_generate_release_artifacts.py @@ -0,0 +1,563 @@ +from __future__ import annotations + +import hashlib +import json +import re +import shutil +import subprocess +import sys +from pathlib import Path + +import pytest 
+import yaml +from frictionless import Package + + +REPO_ROOT = Path(__file__).resolve().parents[1] +GENERATOR = REPO_ROOT / "scripts" / "generate_release_artifacts.py" +RECIPE = REPO_ROOT / "scripts" / "release_recipe.json" +WRITERS = REPO_ROOT / "data" / "index" / "writers.jsonl" +ENTRIES = REPO_ROOT / "data" / "index" / "entries.jsonl" +NOTICE = REPO_ROOT / "NOTICE.md" +CITATION = REPO_ROOT / "CITATION.cff" +DATAPACKAGE = REPO_ROOT / "datapackage.json" + + +def _load_entries() -> list[dict]: + return [ + json.loads(line) + for line in ENTRIES.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def _load_writers() -> list[dict]: + return [ + json.loads(line) + for line in WRITERS.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def _run_generator( + *, + cwd: Path, + writers: Path = WRITERS, + entries: Path = ENTRIES, + recipe: Path = RECIPE, + notice: Path, + citation: Path, + datapackage: Path, + extra_args: tuple[str, ...] = (), +) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [ + sys.executable, + str(GENERATOR), + "--writers", str(writers), + "--entries", str(entries), + "--recipe", str(recipe), + "--notice", str(notice), + "--citation", str(citation), + "--datapackage", str(datapackage), + *extra_args, + ], + cwd=cwd, + text=True, + capture_output=True, + check=False, + ) + + +# --- Empty-corpus tests (the current committed state) ---------------------- + + +def test_committed_artifacts_are_up_to_date(tmp_path: Path) -> None: + notice = tmp_path / "NOTICE.md" + citation = tmp_path / "CITATION.cff" + datapackage = tmp_path / "datapackage.json" + + result = _run_generator( + cwd=tmp_path, notice=notice, citation=citation, datapackage=datapackage + ) + assert result.returncode == 0, result.stderr + + assert notice.read_bytes() == NOTICE.read_bytes(), ( + "NOTICE.md is stale; run `python3 scripts/generate_release_artifacts.py`" + ) + assert citation.read_bytes() == CITATION.read_bytes(), ( + "CITATION.cff is 
stale; run `python3 scripts/generate_release_artifacts.py`" + ) + assert datapackage.read_bytes() == DATAPACKAGE.read_bytes(), ( + "datapackage.json is stale; run `python3 scripts/generate_release_artifacts.py`" + ) + + +def test_generator_is_idempotent(tmp_path: Path) -> None: + paths = { + "notice": tmp_path / "NOTICE.md", + "citation": tmp_path / "CITATION.cff", + "datapackage": tmp_path / "datapackage.json", + } + + first = _run_generator(cwd=tmp_path, **paths) + assert first.returncode == 0, first.stderr + snapshot = {name: path.read_bytes() for name, path in paths.items()} + + second = _run_generator(cwd=tmp_path, **paths) + assert second.returncode == 0, second.stderr + for name, path in paths.items(): + assert path.read_bytes() == snapshot[name], f"{name} differed between runs" + + +def test_datapackage_counts_match_index() -> None: + entries = _load_entries() + writers = _load_writers() + package = json.loads(DATAPACKAGE.read_text(encoding="utf-8")) + assert package["stats"]["record_count"] == len(entries) + assert package["stats"]["writer_record_count"] == len(writers) + + +def test_datapackage_keys_are_sorted() -> None: + package = json.loads(DATAPACKAGE.read_text(encoding="utf-8")) + assert list(package.keys()) == sorted(package.keys()) + + +def test_citation_parses_and_has_required_cff_keys() -> None: + document = yaml.safe_load(CITATION.read_text(encoding="utf-8")) + assert isinstance(document, dict) + for required in ( + "cff-version", "type", "title", "authors", "version", "date-released" + ): + assert required in document, f"CITATION.cff missing required key: {required}" + assert document["cff-version"] == "1.2.0" + assert document["type"] == "dataset" + assert document["license"] == "CC0-1.0" + + +def test_datapackage_validates_against_frictionless_spec() -> None: + package = Package(str(DATAPACKAGE)) + assert package.name == "hletterscript" + errors = list(Package.metadata_validate(package.to_descriptor())) + assert errors == [], [getattr(e, 
"message", str(e)) for e in errors] + + +def test_empty_corpus_falls_back_to_recipe_initial_date(tmp_path: Path) -> None: + if _load_entries(): + pytest.skip("corpus is no longer empty") + notice = tmp_path / "NOTICE.md" + citation = tmp_path / "CITATION.cff" + datapackage = tmp_path / "datapackage.json" + result = _run_generator( + cwd=tmp_path, notice=notice, citation=citation, datapackage=datapackage + ) + assert result.returncode == 0, result.stderr + package = json.loads(datapackage.read_text(encoding="utf-8")) + recipe = json.loads(RECIPE.read_text(encoding="utf-8")) + assert package["released_at"] == recipe["initial_release_date"] + assert package["stats"]["record_count"] == 0 + + +def test_empty_corpus_falls_back_when_recipe_initial_date_missing( + tmp_path: Path, +) -> None: + recipe = json.loads(RECIPE.read_text(encoding="utf-8")) + del recipe["initial_release_date"] + bad_recipe = tmp_path / "bad_recipe.json" + bad_recipe.write_text(json.dumps(recipe), encoding="utf-8") + + writers_path = tmp_path / "writers.jsonl" + entries_path = tmp_path / "entries.jsonl" + writers_path.write_text("", encoding="utf-8") + entries_path.write_text("", encoding="utf-8") + + result = _run_generator( + cwd=tmp_path, + writers=writers_path, + entries=entries_path, + recipe=bad_recipe, + notice=tmp_path / "NOTICE.md", + citation=tmp_path / "CITATION.cff", + datapackage=tmp_path / "datapackage.json", + ) + assert result.returncode != 0 + assert "initial_release_date" in result.stderr + + +def test_check_mode_passes_when_up_to_date() -> None: + result = subprocess.run( + [sys.executable, str(GENERATOR), "--check"], + cwd=REPO_ROOT, + text=True, + capture_output=True, + check=False, + ) + assert result.returncode == 0, result.stderr + assert "ok" in result.stdout + + +def test_check_mode_fails_when_stale(tmp_path: Path) -> None: + notice = tmp_path / "NOTICE.md" + citation = tmp_path / "CITATION.cff" + datapackage = tmp_path / "datapackage.json" + shutil.copyfile(NOTICE, notice) 
+ shutil.copyfile(CITATION, citation) + shutil.copyfile(DATAPACKAGE, datapackage) + datapackage.write_text("{}\n", encoding="utf-8") + + result = _run_generator( + cwd=tmp_path, + notice=notice, + citation=citation, + datapackage=datapackage, + extra_args=("--check",), + ) + assert result.returncode == 1 + assert "stale" in result.stderr + assert "datapackage.json" in result.stderr + + +def test_recipe_required_fields_must_be_present(tmp_path: Path) -> None: + recipe = json.loads(RECIPE.read_text(encoding="utf-8")) + del recipe["authors"] + bad_recipe = tmp_path / "bad_recipe.json" + bad_recipe.write_text(json.dumps(recipe), encoding="utf-8") + + result = _run_generator( + cwd=tmp_path, + recipe=bad_recipe, + notice=tmp_path / "NOTICE.md", + citation=tmp_path / "CITATION.cff", + datapackage=tmp_path / "datapackage.json", + ) + assert result.returncode != 0 + assert "authors" in result.stderr + + +def test_version_released_date_required(tmp_path: Path) -> None: + recipe = json.loads(RECIPE.read_text(encoding="utf-8")) + del recipe["version_released_date"] + bad_recipe = tmp_path / "bad_recipe.json" + bad_recipe.write_text(json.dumps(recipe), encoding="utf-8") + result = _run_generator( + cwd=tmp_path, + recipe=bad_recipe, + notice=tmp_path / "NOTICE.md", + citation=tmp_path / "CITATION.cff", + datapackage=tmp_path / "datapackage.json", + ) + assert result.returncode != 0 + assert "version_released_date" in result.stderr + + +# --- Non-empty-corpus tests (the bug-prevention tier) ---------------------- +# +# These tests construct a synthetic 2-entry corpus including one +# CC-BY-SA-4.0 attribution-required entry, run the generator, and +# verify the rendered artefacts. Without these, the entire NOTICE.md +# stanza-building path would be unreachable by CI for as long as the +# committed corpus stays empty. 
+ + +def _hash(data: bytes) -> tuple[str, int]: + return hashlib.sha256(data).hexdigest(), len(data) + + +def _synthetic_writer(writer_id: str) -> dict: + return { + "writer_id": writer_id, + "status": "verified", + "display_name": writer_id.replace("_", " ").title(), + "also_known_as": [], + "description": "Synthetic writer used only by the test suite.", + "dates": { + "birth_year": 1890, + "birth_precision": "exact", + "death_year": 1950, + "death_precision": "exact", + }, + "languages_written": ["he"], + "scripts_written": ["Hebr"], + "period": { + "start": "1920", + "end": "1949", + "precision": "year", + }, + "references": [ + { + "kind": "repo_note", + "citation": "tests/test_generate_release_artifacts.py", + "quote": None, + "url": None, + } + ], + "ingest": {"agent_notes": "fixture", "blocked_reason": None}, + } + + +def _synthetic_entry( + tmp_path: Path, + writer_id: str, + letter_name: str, + codepoint: str, + char: str, + form: str, + variant: int, + license_expression: str, + rights_basis: str, + extracted_at: str, + *, + attribution_required: bool = False, + attribution_text: str | None = None, + attribution_url: str | None = None, +) -> dict: + entry_id = f"{writer_id}__{letter_name}__v{variant:04d}" + rel_dir = Path("data") / "letters" / writer_id / letter_name + abs_dir = tmp_path / rel_dir + abs_dir.mkdir(parents=True, exist_ok=True) + rel_path = rel_dir / f"{entry_id}.png" + abs_path = tmp_path / rel_path + payload = f"png-{entry_id}".encode("utf-8") + abs_path.write_bytes(payload) + sha, size = _hash(payload) + return { + "entry_id": entry_id, + "writer_id": writer_id, + "letter": { + "codepoint": codepoint, + "unicode_char": char, + "name": letter_name, + "form": form, + }, + "upstream": { + "source_id": f"commons__{writer_id}_doc", + "entry_id": f"commons__{writer_id}_doc__p0001", + "sha256": "a" * 64, + "commit": "0" * 40, + "release_tag": "v0.1.0-rc", + "bbox": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, + "image": { + "local_path": 
str(rel_path), + "sha256": sha, + "mime_type": "image/png", + "bytes": size, + "width_px": 1, + "height_px": 1, + "background": "original", + }, + "extraction": { + "tool": "hletterscriptgen", + "tool_version": "v0.0.1", + "method": "manual", + "extracted_at": extracted_at, + "extracted_by": "test_suite", + "notes": None, + }, + "rights": { + "rights_basis": rights_basis, + "license_expression": license_expression, + "commercial_use_allowed": True, + "derivatives_allowed": True, + "redistribution_allowed": True, + "attribution_required": attribution_required, + "attribution_text": attribution_text, + "attribution_url": attribution_url, + "verification_status": "inherited_from_upstream", + "evidence_text": "Upstream verified.", + "verified_at": "2026-05-12", + }, + "quality": { + "usable_for_htr": True, + "usable_for_syngen": True, + "legibility": "high", + "exclusion_reasons": [], + "notes": None, + }, + } + + +@pytest.fixture +def synthetic_corpus(tmp_path: Path) -> dict: + writers = [ + _synthetic_writer("writer_pdm"), + _synthetic_writer("writer_cc_by_sa"), + ] + entries = [ + _synthetic_entry( + tmp_path, + writer_id="writer_pdm", + letter_name="alef", + codepoint="U+05D0", + char="א", + form="regular", + variant=1, + license_expression="PDM-1.0", + rights_basis="public_domain", + extracted_at="2026-05-10T12:00:00Z", + ), + _synthetic_entry( + tmp_path, + writer_id="writer_cc_by_sa", + letter_name="bet", + codepoint="U+05D1", + char="ב", + form="regular", + variant=1, + license_expression="CC-BY-SA-4.0", + rights_basis="cc_by_sa", + extracted_at="2026-05-11T18:30:00Z", + attribution_required=True, + attribution_text="User:Example via Wikimedia Commons, CC BY-SA 4.0", + attribution_url="https://commons.wikimedia.org/wiki/File:Example.jpg", + ), + ] + writers_path = tmp_path / "writers.jsonl" + entries_path = tmp_path / "entries.jsonl" + writers_path.write_text( + "".join(json.dumps(w, ensure_ascii=False) + "\n" for w in writers), + encoding="utf-8", + ) + 
entries_path.write_text( + "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries), + encoding="utf-8", + ) + return { + "writers_path": writers_path, + "entries_path": entries_path, + "writers": writers, + "entries": entries, + } + + +def _generate_for_corpus( + tmp_path: Path, synthetic_corpus: dict +) -> tuple[Path, Path, Path]: + notice = tmp_path / "NOTICE.md" + citation = tmp_path / "CITATION.cff" + datapackage = tmp_path / "datapackage.json" + result = _run_generator( + cwd=tmp_path, + writers=synthetic_corpus["writers_path"], + entries=synthetic_corpus["entries_path"], + notice=notice, + citation=citation, + datapackage=datapackage, + ) + assert result.returncode == 0, result.stderr + return notice, citation, datapackage + + +def test_non_empty_corpus_notice_lists_attribution_required( + tmp_path: Path, synthetic_corpus: dict +) -> None: + notice, _, _ = _generate_for_corpus(tmp_path, synthetic_corpus) + text = notice.read_text(encoding="utf-8") + assert "writer_cc_by_sa__bet__v0001" in text, ( + "CC-BY-SA entry should be listed in NOTICE.md" + ) + assert "writer_pdm__alef__v0001" not in text, ( + "PDM entry should NOT be listed in NOTICE.md (no attribution required)" + ) + assert "User:Example" in text + assert "https://commons.wikimedia.org/wiki/File:Example.jpg" in text + + +def test_non_empty_corpus_notice_url_is_valid_github_blob( + tmp_path: Path, synthetic_corpus: dict +) -> None: + # The bug-fix verification: NOTICE.md must NOT embed a `release:` + # prefix in the upstream blob URL. The commit field is a SHA; the + # release_tag is metadata only. A `release:` substring in any URL + # would indicate the bug from the original PR has regressed. + notice, _, _ = _generate_for_corpus(tmp_path, synthetic_corpus) + text = notice.read_text(encoding="utf-8") + + # Scan only URLs inside angle brackets (the markdown-link form the + # generator uses). Free prose like "Corpus release:" is unrelated. 
+ urls = re.findall(r"<(https?://[^>]+)>", text) + for url in urls: + assert "release:" not in url, ( + f"NOTICE.md URL must not contain `release:` prefix: {url!r}" + ) + + # The upstream link should contain a 40-char hex sha after /blob/. + pattern = re.compile(r"/blob/([a-f0-9]{40})/data/index/entries\.jsonl") + matches = pattern.findall(text) + assert matches, "expected at least one /blob// link in NOTICE.md" + + +def test_non_empty_corpus_datapackage_stats( + tmp_path: Path, synthetic_corpus: dict +) -> None: + _, _, datapackage = _generate_for_corpus(tmp_path, synthetic_corpus) + package = json.loads(datapackage.read_text(encoding="utf-8")) + stats = package["stats"] + assert stats["record_count"] == 2 + assert stats["writer_record_count"] == 2 + assert stats["entry_writer_count"] == 2 + assert stats["attribution_required_count"] == 1 + assert stats["license_breakdown"] == {"CC-BY-SA-4.0": 1, "PDM-1.0": 1} + assert stats["letter_breakdown"] == {"alef": 1, "bet": 1} + assert stats["writer_breakdown"] == {"writer_cc_by_sa": 1, "writer_pdm": 1} + + +def test_non_empty_corpus_released_at_is_latest_extraction( + tmp_path: Path, synthetic_corpus: dict +) -> None: + _, _, datapackage = _generate_for_corpus(tmp_path, synthetic_corpus) + package = json.loads(datapackage.read_text(encoding="utf-8")) + assert package["released_at"] == "2026-05-11T18:30:00Z" + + +def test_non_empty_corpus_citation_date_is_stable_not_extraction( + tmp_path: Path, synthetic_corpus: dict +) -> None: + # The whole point of separating `version_released_date` from + # `released_at`: citations must not drift as entries accumulate. + _, citation, datapackage = _generate_for_corpus(tmp_path, synthetic_corpus) + document = yaml.safe_load(citation.read_text(encoding="utf-8")) + recipe = json.loads(RECIPE.read_text(encoding="utf-8")) + assert str(document["date-released"]) == recipe["version_released_date"] + # And it must NOT equal the (later) corpus-state timestamp. 
+ package = json.loads(datapackage.read_text(encoding="utf-8")) + assert str(document["date-released"]) != package["released_at"][:10] or ( + recipe["version_released_date"] == package["released_at"][:10] + ) + + +def test_non_empty_corpus_datapackage_frictionless_valid( + tmp_path: Path, synthetic_corpus: dict +) -> None: + _, _, datapackage = _generate_for_corpus(tmp_path, synthetic_corpus) + package = Package(str(datapackage)) + errors = list(Package.metadata_validate(package.to_descriptor())) + assert errors == [], [getattr(e, "message", str(e)) for e in errors] + + +def test_non_empty_corpus_missing_attribution_flag_is_rejected( + tmp_path: Path, synthetic_corpus: dict +) -> None: + # If a CC-BY-SA entry forgets attribution_required=True the + # generator's consistency check must fail loudly rather than silently + # dropping the entry from NOTICE.md. + entries = synthetic_corpus["entries"] + cc_entry = next(e for e in entries if e["rights"]["license_expression"] == "CC-BY-SA-4.0") + cc_entry["rights"]["attribution_required"] = False + cc_entry["rights"]["attribution_text"] = None + cc_entry["rights"]["attribution_url"] = None + synthetic_corpus["entries_path"].write_text( + "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries), + encoding="utf-8", + ) + result = _run_generator( + cwd=tmp_path, + writers=synthetic_corpus["writers_path"], + entries=synthetic_corpus["entries_path"], + notice=tmp_path / "NOTICE.md", + citation=tmp_path / "CITATION.cff", + datapackage=tmp_path / "datapackage.json", + ) + assert result.returncode != 0 + assert "CC-BY-SA-4.0" in result.stderr + assert "attribution_required" in result.stderr diff --git a/tests/test_validate_indexes.py b/tests/test_validate_indexes.py new file mode 100644 index 0000000..e259135 --- /dev/null +++ b/tests/test_validate_indexes.py @@ -0,0 +1,688 @@ +from __future__ import annotations + +import hashlib +import json +import subprocess +import sys +from pathlib import Path + +import pytest + + 
# Repository layout, resolved relative to this test module so the suite
# works regardless of the directory pytest is launched from.
REPO_ROOT = Path(__file__).resolve().parents[1]
VALIDATOR = REPO_ROOT / "scripts" / "validate_indexes.py"
WRITERS = REPO_ROOT / "data" / "index" / "writers.jsonl"
ENTRIES = REPO_ROOT / "data" / "index" / "entries.jsonl"


def run_validator(
    *args: str | Path,
    cwd: Path = REPO_ROOT,
) -> subprocess.CompletedProcess[str]:
    """Run the validator script in a child interpreter and capture its output.

    Args:
        *args: Command-line arguments forwarded to the validator; each one
            is converted with ``str()`` so ``Path`` objects are accepted.
        cwd: Working directory for the child process. Defaults to the
            repository root.

    Returns:
        The completed process with text-mode ``stdout``/``stderr``. A
        non-zero exit status does not raise (``check=False``); callers
        assert on ``returncode`` themselves.
    """
    return subprocess.run(
        [sys.executable, str(VALIDATOR), *(str(arg) for arg in args)],
        cwd=cwd,
        text=True,
        capture_output=True,
        check=False,
    )


@pytest.fixture
def writer_fixture() -> dict:
    """Return a fresh writer row that tests mutate to provoke failures."""
    return {
        "writer_id": "fixture_writer",
        "status": "verified",
        "display_name": "Fixture Writer",
        "also_known_as": [],
        "description": "Synthetic writer used only by the test suite.",
        "dates": {
            "birth_year": 1890,
            "birth_precision": "exact",
            "death_year": 1950,
            "death_precision": "exact",
        },
        "languages_written": ["he"],
        "scripts_written": ["Hebr"],
        "period": {
            "start": "1920",
            "end": "1949",
            "precision": "year",
        },
        "references": [
            {
                "kind": "repo_note",
                "citation": "tests/test_validate_indexes.py::writer_fixture",
                "quote": None,
                "url": None,
            }
        ],
        "ingest": {
            "agent_notes": "fixture",
            "blocked_reason": None,
        },
    }


def _hash_bytes(data: bytes) -> tuple[str, int]:
    """Return ``(sha256 hex digest, length in bytes)`` for *data*."""
    return hashlib.sha256(data).hexdigest(), len(data)


@pytest.fixture
def entry_fixture(tmp_path: Path) -> dict:
    """Create a placeholder image under *tmp_path* and return its entry row.

    The returned row references ``fixture_writer`` and records the real
    sha256 and byte size of the file written to disk.
    """
    # Write a tiny placeholder file (PNG signature plus filler bytes) so
    # the file-integrity check has something real to hash. The validator
    # only cares about size and sha256; the bytes do not need to decode
    # as a valid PNG.
    image_dir = tmp_path / "data" / "letters" / "fixture_writer" / "alef"
    image_dir.mkdir(parents=True)
    image_path = image_dir / "fixture_writer__alef__v0001.png"
    image_bytes = b"\x89PNG\r\n\x1a\nfixture-test-bytes"
    image_path.write_bytes(image_bytes)
    sha, size = _hash_bytes(image_bytes)

    return {
        "entry_id": "fixture_writer__alef__v0001",
        "writer_id": "fixture_writer",
        "letter": {
            "codepoint": "U+05D0",
            "unicode_char": "א",
            "name": "alef",
            "form": "regular",
        },
        "upstream": {
            "source_id": "commons__fixture_source",
            "entry_id": "commons__fixture_source__p0001",
            "sha256": "a" * 64,
            "commit": "0" * 40,
            "release_tag": "v0.1.0-rc",
            "bbox": {"x": 10, "y": 20, "w": 64, "h": 64},
        },
        "image": {
            # Derived from the file created above so the row and the disk
            # cannot drift apart; evaluates to
            # "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.png".
            "local_path": image_path.relative_to(tmp_path).as_posix(),
            "sha256": sha,
            "mime_type": "image/png",
            "bytes": size,
            "width_px": 1,
            "height_px": 1,
            "background": "original",
        },
        "extraction": {
            "tool": "hletterscriptgen",
            "tool_version": "v0.0.1",
            "method": "manual",
            "extracted_at": "2026-05-12T00:00:00Z",
            "extracted_by": "test_suite",
            "notes": None,
        },
        "rights": {
            "rights_basis": "public_domain",
            "license_expression": "PDM-1.0",
            "commercial_use_allowed": True,
            "derivatives_allowed": True,
            "redistribution_allowed": True,
            "attribution_required": False,
            "attribution_text": None,
            "attribution_url": None,
            "verification_status": "inherited_from_upstream",
            "evidence_text": "Upstream entry verified as PDM-1.0.",
            "verified_at": "2026-05-12",
        },
        "quality": {
            "usable_for_htr": True,
            "usable_for_syngen": True,
            "legibility": "high",
            "exclusion_reasons": [],
            "notes": None,
        },
    }


def _dump_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines (one object per line)."""
    path.write_text(
        "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows),
        encoding="utf-8",
    )


def _write_indexes(
    tmp_path: Path,
    writers: list[dict],
    entries: list[dict],
) -> tuple[Path, Path]:
    """Write both index files into *tmp_path*.

    Returns:
        ``(writers_path, entries_path)``.
    """
    writers_path = tmp_path / "writers.jsonl"
    entries_path = tmp_path / "entries.jsonl"
    _dump_jsonl(writers_path, writers)
    _dump_jsonl(entries_path, entries)
    return writers_path, entries_path


def _run_against(
    tmp_path: Path,
    writers: list[dict],
    entries: list[dict],
    *extra_args: str,
) -> subprocess.CompletedProcess[str]:
    """Write the given rows under *tmp_path* and validate them there.

    The validator is invoked with ``--writers``, ``--entries`` and
    ``--repo-root`` all pointing into *tmp_path*, followed by
    *extra_args*, with *tmp_path* as the working directory.
    """
    writers_path, entries_path = _write_indexes(tmp_path, writers, entries)
    return run_validator(
        "--writers", writers_path,
        "--entries", entries_path,
        "--repo-root", tmp_path,
        *extra_args,
        cwd=tmp_path,
    )


def test_current_indexes_validate() -> None:
    result = run_validator()
    assert result.returncode == 0, result.stderr
    assert "ok:" in result.stdout


def test_empty_indexes_validate(tmp_path: Path) -> None:
    writers_path = tmp_path / "writers.jsonl"
    entries_path = tmp_path / "entries.jsonl"
    writers_path.write_text("", encoding="utf-8")
    entries_path.write_text("", encoding="utf-8")
    result = run_validator("--writers", writers_path, "--entries", entries_path)
    assert result.returncode == 0, result.stderr
    assert "0 writers, 0 entries" in result.stdout


def test_fixture_round_trip(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode == 0, result.stderr
    assert "ok: 1 writers, 1 entries, 1 files verified" in result.stdout


# --- Schema-level rejections ------------------------------------------------


def test_schema_errors_are_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    writer_fixture["status"] = "garbage"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "is not one of" in result.stderr


def test_candidate_writer_with_zero_references_is_accepted(
    tmp_path: Path,
    writer_fixture: dict,
) -> None:
    # The references.minItems requirement is conditional on status; a
    # `candidate` writer is allowed to ship with no references yet.
    writer_fixture["status"] = "candidate"
    writer_fixture["references"] = []
    # No entry rows are supplied, so the writer can be candidate without
    # a verified crop referencing it.
    result = _run_against(tmp_path, [writer_fixture], [])
    assert result.returncode == 0, result.stderr


def test_verified_writer_without_references_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
) -> None:
    writer_fixture["references"] = []
    result = _run_against(tmp_path, [writer_fixture], [])
    assert result.returncode != 0
    assert "references" in result.stderr
    # jsonschema's exact wording for "minItems violated" is
    # "[] should be non-empty"; accept either that or a generic
    # if/then failure message for forward compatibility.
    lower = result.stderr.lower()
    assert any(needle in lower for needle in (
        "should be non-empty",
        "minitems",
        "is too short",
        "should not be valid",
    ))


# --- Cross-field validation -------------------------------------------------


def test_unknown_writer_id_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["writer_id"] = "missing_writer"
    entry_fixture["entry_id"] = "missing_writer__alef__v0001"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "unknown writer_id" in result.stderr


def test_entry_id_must_start_with_writer_and_letter(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["entry_id"] = "fixture_writer__bet__v0001"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "must start with" in result.stderr


def test_letter_codepoint_must_match_name(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["letter"]["codepoint"] = "U+05D1"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "letter.codepoint mismatch" in result.stderr


def test_letter_char_must_match_name(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["letter"]["unicode_char"] = "ב"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "letter.unicode_char mismatch" in result.stderr


def test_letter_form_must_match_name(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["letter"]["form"] = "final"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "letter.form mismatch" in result.stderr


# --- Upstream block ---------------------------------------------------------


def test_upstream_commit_must_be_sha(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    # Tag-style refs are no longer accepted in `upstream.commit`. They
    # belong in `upstream.release_tag` instead.
    entry_fixture["upstream"]["commit"] = "v0.1.0-rc"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "does not match" in result.stderr or "pattern" in result.stderr


def test_upstream_release_tag_is_optional(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["upstream"]["release_tag"] = None
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode == 0, result.stderr


def test_upstream_repo_field_is_not_allowed(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    # The upstream URL lives in scripts/release_recipe.json now; per-row
    # duplication is rejected by additionalProperties:false.
    entry_fixture["upstream"]["repo"] = "https://github.com/HeOCR/whatever"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert (
        "Additional properties are not allowed" in result.stderr
        or "additionalProperties" in result.stderr
    )


def test_upstream_entry_id_must_match_source(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["upstream"]["entry_id"] = "commons__another_source__p0001"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "upstream.entry_id" in result.stderr


# --- Local path conventions -------------------------------------------------


def test_local_path_prefix_is_enforced(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["image"]["local_path"] = "data/letters/wrong/alef/x.png"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "must start with" in result.stderr


def test_local_path_extension_must_match_mime(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["image"]["local_path"] = (
        "data/letters/fixture_writer/alef/fixture_writer__alef__v0001.jpg"
    )
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "does not match" in result.stderr


# --- Background <-> mime guard ----------------------------------------------


def test_transparent_background_with_jpeg_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    # Schema if/then enforces that transparent backgrounds require an
    # alpha-capable mime type. JPEG has no alpha.
    png_path = tmp_path / entry_fixture["image"]["local_path"]
    jpg_path = png_path.with_suffix(".jpg")
    # Rename the on-disk fixture so the path, extension and mime type all
    # agree and only the background/mime combination is at fault.
    png_path.rename(jpg_path)
    entry_fixture["image"]["mime_type"] = "image/jpeg"
    entry_fixture["image"]["local_path"] = (
        jpg_path.relative_to(tmp_path).as_posix()
    )
    entry_fixture["image"]["background"] = "transparent"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0


# --- Rights validation ------------------------------------------------------


def test_unverified_entry_cannot_claim_positive_permissions(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["rights"]["verification_status"] = "source_note_only"
    entry_fixture["rights"]["commercial_use_allowed"] = True
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "should not be valid" in result.stderr


def test_attribution_required_without_text_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["rights"]["license_expression"] = "CC-BY-SA-4.0"
    entry_fixture["rights"]["rights_basis"] = "cc_by_sa"
    entry_fixture["rights"]["attribution_required"] = True
    entry_fixture["rights"]["attribution_text"] = None
    entry_fixture["rights"]["attribution_url"] = None
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0


def test_attribution_with_blank_text_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["rights"]["license_expression"] = "CC-BY-SA-4.0"
    entry_fixture["rights"]["rights_basis"] = "cc_by_sa"
    entry_fixture["rights"]["attribution_required"] = True
    entry_fixture["rights"]["attribution_text"] = "   "
    entry_fixture["rights"]["attribution_url"] = (
        "https://commons.wikimedia.org/wiki/File:Example.jpg"
    )
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "attribution_text is null, blank, or whitespace-only" in result.stderr


def test_rights_basis_must_match_license_expression(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    # The validator's LICENSE_BASIS_MAP says CC-BY-SA-4.0 → cc_by_sa.
    # An ingester who flips one but not the other is rejected.
    entry_fixture["rights"]["license_expression"] = "CC-BY-SA-4.0"
    entry_fixture["rights"]["rights_basis"] = "cc0"
    entry_fixture["rights"]["attribution_required"] = True
    entry_fixture["rights"]["attribution_text"] = "Example licensor"
    entry_fixture["rights"]["attribution_url"] = (
        "https://commons.wikimedia.org/wiki/File:Example.jpg"
    )
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "rights_basis" in result.stderr
    assert "does not match" in result.stderr


def test_unknown_license_expression_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["rights"]["license_expression"] = "GPL-3.0"
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "LICENSE_BASIS_MAP" in result.stderr or "not in" in result.stderr


def test_null_license_requires_unknown_basis(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["rights"]["license_expression"] = None
    entry_fixture["rights"]["rights_basis"] = "public_domain"
    # null license + positive permissions is also conditionally blocked
    # by the schema, so flip to a verification status that allows null
    # everywhere.
    entry_fixture["rights"]["verification_status"] = "unverified"
    entry_fixture["rights"]["commercial_use_allowed"] = None
    entry_fixture["rights"]["derivatives_allowed"] = None
    entry_fixture["rights"]["redistribution_allowed"] = None
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "license_expression is null" in result.stderr


# --- File integrity ---------------------------------------------------------


def test_missing_local_image_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    (tmp_path / entry_fixture["image"]["local_path"]).unlink()
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "file does not exist" in result.stderr


def test_byte_size_mismatch_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    real_bytes = entry_fixture["image"]["bytes"]
    entry_fixture["image"]["bytes"] = real_bytes + 1
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "byte size mismatch" in result.stderr


def test_sha256_mismatch_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    entry_fixture["image"]["sha256"] = "0" * 64
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0
    assert "sha256 mismatch" in result.stderr


def test_duplicate_entry_id_is_rejected(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    # Each row is serialised independently when the index is written, so
    # listing the same dict twice yields two identical JSONL lines.
    result = _run_against(
        tmp_path, [writer_fixture], [entry_fixture, entry_fixture],
    )
    assert result.returncode != 0
    assert "duplicate" in result.stderr


def test_missing_index_file_is_rejected(tmp_path: Path) -> None:
    result = run_validator(
        "--writers", tmp_path / "missing.jsonl",
        "--entries", ENTRIES,
    )
    assert result.returncode != 0
    assert "file does not exist" in result.stderr


# --- Tool version ----------------------------------------------------------


@pytest.mark.parametrize("version", [
    "v0.0.1",
    "0.0.1",
    "v1.2.3",
    "v1.2.3-rc1",
    "v1.2.3-3-gabc1234",
    "v1.2.3+build.5",
    "v1.2.3-rc1+build.5",
])
def test_tool_version_accepts_common_shapes(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
    version: str,
) -> None:
    entry_fixture["extraction"]["tool_version"] = version
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode == 0, result.stderr


@pytest.mark.parametrize("version", [
    "not-semver",
    "v1",
    "v1.2",
    "1.2.3.4",
])
def test_tool_version_rejects_garbage(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
    version: str,
) -> None:
    entry_fixture["extraction"]["tool_version"] = version
    result = _run_against(tmp_path, [writer_fixture], [entry_fixture])
    assert result.returncode != 0


# --- Upstream cross-validation ---------------------------------------------


def _write_upstream(tmp_path: Path, entries: list[dict]) -> Path:
    """Create ``upstream/data/index/entries.jsonl`` under *tmp_path*.

    Returns:
        The ``upstream`` directory, suitable for ``--upstream-path``.
    """
    upstream_root = tmp_path / "upstream"
    index_dir = upstream_root / "data" / "index"
    index_dir.mkdir(parents=True)
    _dump_jsonl(index_dir / "entries.jsonl", entries)
    return upstream_root


def _upstream_entry(width: int = 4000, height: int = 5000) -> dict:
    """Minimal upstream entry shape (only the fields the validator
    actually reads). The full upstream schema is enforced by the upstream
    repo's own CI, not here."""
    return {
        "entry_id": "commons__fixture_source__p0001",
        "files": [{
            "sha256": "a" * 64,
            "width_px": width,
            "height_px": height,
        }],
    }


def test_upstream_cross_check_passes_for_in_bounds_bbox(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    upstream_root = _write_upstream(tmp_path, [_upstream_entry()])
    result = _run_against(
        tmp_path, [writer_fixture], [entry_fixture],
        "--upstream-path", str(upstream_root),
    )
    assert result.returncode == 0, result.stderr
    assert "1 upstream-cross-checked" in result.stdout


def test_upstream_cross_check_rejects_missing_entry(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    upstream_root = _write_upstream(tmp_path, [])
    result = _run_against(
        tmp_path, [writer_fixture], [entry_fixture],
        "--upstream-path", str(upstream_root),
    )
    assert result.returncode != 0
    assert "not found in" in result.stderr


def test_upstream_cross_check_rejects_sha_mismatch(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    upstream = _upstream_entry()
    upstream["files"][0]["sha256"] = "b" * 64
    upstream_root = _write_upstream(tmp_path, [upstream])
    result = _run_against(
        tmp_path, [writer_fixture], [entry_fixture],
        "--upstream-path", str(upstream_root),
    )
    assert result.returncode != 0
    assert "upstream.sha256" in result.stderr


def test_upstream_cross_check_rejects_bbox_out_of_bounds(
    tmp_path: Path,
    writer_fixture: dict,
    entry_fixture: dict,
) -> None:
    upstream_root = _write_upstream(
        tmp_path, [_upstream_entry(width=50, height=50)],
    )
    entry_fixture["upstream"]["bbox"] = {"x": 10, "y": 20, "w": 100, "h": 100}
    result = _run_against(
        tmp_path, [writer_fixture], [entry_fixture],
        "--upstream-path", str(upstream_root),
    )
    assert result.returncode != 0
    assert "beyond upstream scan" in result.stderr