diff --git a/docs/upstream_integration.md b/docs/upstream_integration.md index 3b79c29..b99c802 100644 --- a/docs/upstream_integration.md +++ b/docs/upstream_integration.md @@ -26,7 +26,7 @@ fetches scans on its own. 1. **Pin an upstream revision.** [`upstream_pin_from_checkout(path)`](../src/hletterscriptgen/upstream.py) - returns `(repo, revision)` for the + returns an `UpstreamPin(repo, revision)` for the [`letter_set.v1.upstream`](letter_set_v1.md#upstream) block. `repo` is the `owner/name` form derived from `git remote get-url origin` (https, ssh, and `.git`-suffixed URLs all normalize the same way); @@ -50,9 +50,9 @@ fetches scans on its own. - `rights.commercial_use_allowed`, `rights.derivatives_allowed`, and `rights.scan_redistribution_allowed` are all `True`. `None` fails the gate intentionally — only positive assertions pass. - - `rights.verification_status ∉ FORBIDDEN_VERIFICATION_STATUSES` - (`unverified`, `source_note_only`, `conflicting`, `rejected`). - Only `primary_page_checked` is a positive verification. + - `rights.verification_status ∈ ALLOWED_VERIFICATION_STATUSES` + (currently only `primary_page_checked`). An allowlist is used so + that new upstream statuses fail closed until explicitly reviewed. - `quality.usable_for_htr` is `True`. The sibling helper diff --git a/src/hletterscriptgen/upstream.py b/src/hletterscriptgen/upstream.py index 063c9c9..c50b406 100644 --- a/src/hletterscriptgen/upstream.py +++ b/src/hletterscriptgen/upstream.py @@ -14,8 +14,8 @@ * :func:`load_entries` — stream a local ``entries.jsonl``. * :func:`is_eligible` / :func:`explain_ineligible` — enforce the rights-and-quality gate declared in ``LICENSE-POLICY.md``. -* :func:`upstream_pin_from_checkout` — derive the ``(repo, revision)`` - pin recorded in ``letter_set.v1.upstream`` from a clean checkout. +* :func:`upstream_pin_from_checkout` — derive the ``UpstreamPin`` + recorded in ``letter_set.v1.upstream`` from a clean checkout. The eligibility gate is intentionally strict — entries pass only when the upstream record positively asserts each required property. ``None`` @@ -34,17 +34,19 @@ from hletterscriptgen import ALLOWED_LICENSES -# Upstream ``rights.verification_status`` values that disqualify an -# entry. The upstream enum is -# ``{unverified, source_note_only, primary_page_checked, conflicting, rejected}``; -# only ``primary_page_checked`` represents a positive verification, so -# every other state is rejected here. -FORBIDDEN_VERIFICATION_STATUSES: frozenset[str] = frozenset( - {"unverified", "source_note_only", "conflicting", "rejected"} -) +# Upstream ``rights.verification_status`` values that represent a +# positive verification. The upstream enum is +# ``{unverified, source_note_only, primary_page_checked, conflicting, rejected}``. +# Using an allowlist (rather than a blocklist) ensures any new status +# added upstream fails closed until explicitly reviewed and added here. +ALLOWED_VERIFICATION_STATUSES: frozenset[str] = frozenset({"primary_page_checked"}) -class UpstreamLoadError(ValueError): +class UpstreamError(Exception): + """Base class for all errors raised by :mod:`hletterscriptgen.upstream`.""" + + +class UpstreamLoadError(UpstreamError): """Raised when an ``entries.jsonl`` line cannot be parsed. ``line_number`` is 1-based and refers to the offending line in the @@ -57,7 +59,7 @@ def __init__(self, message: str, *, path: Path, line_number: int) -> None: self.line_number = line_number -class UpstreamCheckoutDirtyError(RuntimeError): +class UpstreamCheckoutDirtyError(UpstreamError): """Raised when an upstream checkout has uncommitted changes. Reproducibility requires the pinned revision to fully describe the @@ -65,7 +67,7 @@ class UpstreamCheckoutDirtyError(RuntimeError): """ -class UpstreamDetachedHeadError(RuntimeError): +class UpstreamDetachedHeadError(UpstreamError): """Raised when an upstream checkout is in detached HEAD state. A detached HEAD commit may exist only locally, making the pinned @@ -76,14 +78,30 @@ class UpstreamDetachedHeadError(RuntimeError): @dataclass(frozen=True) class UpstreamCreator: + """A creator record from the upstream ``creators[]`` list. + + ``name`` and ``role`` are always present. ``death_year`` is ``None`` + when the upstream record does not include it (e.g. for anonymous + writers or living persons). ``authority_url`` is ``None`` when no + authority record is linked. + """ + name: str - role: str + role: str # e.g. "writer", "scribe" death_year: int | None authority_url: str | None = None @dataclass(frozen=True) class UpstreamFile: + """A file record from the upstream ``files[]`` list. + + ``role`` is always present (e.g. ``"original"``, ``"thumbnail"``). + All other fields are optional in the upstream schema — ``None`` when + not recorded. ``local_path`` is relative to the upstream checkout + root; combine it with the checkout ``Path`` to get an absolute path. + """ + role: str local_path: str | None sha256: str | None @@ -94,21 +112,47 @@ class UpstreamFile: @dataclass(frozen=True) class UpstreamRights: - license_expression: str | None + """Rights metadata for an upstream entry. + + ``verification_status`` is always present (required by the upstream + schema). All other fields are optional (``None`` = "not recorded"). + Only entries whose ``verification_status`` is in + :data:`ALLOWED_VERIFICATION_STATUSES` pass the eligibility gate; + ``None`` fields uniformly fail the gate — the gate requires positive + assertion, not absence of denial. + """ + + license_expression: str | None # SPDX expression, e.g. "CC0-1.0" commercial_use_allowed: bool | None derivatives_allowed: bool | None scan_redistribution_allowed: bool | None - verification_status: str + verification_status: str # required; see ALLOWED_VERIFICATION_STATUSES @dataclass(frozen=True) class UpstreamQuality: + """Quality assessment for an upstream entry. + + ``legibility`` is always present (required by the upstream schema). + ``usable_for_htr`` is optional; ``None`` fails the eligibility gate. + Valid ``legibility`` values: ``"high"``, ``"medium"``, ``"low"``. + """ + usable_for_htr: bool | None - legibility: str + legibility: str # e.g. "high", "medium", "low" @dataclass(frozen=True) class UpstreamEntry: + """A typed, frozen subset of one upstream scan entry. + + Models the fields the generator pipeline actually consumes; the full + upstream schema (``schemas/entry.schema.json``) is broader. + ``entry_id`` and ``source_id`` are always present and non-empty. + ``creators`` and ``files`` may be empty tuples when the upstream + record has no entries in those lists. + """ + entry_id: str source_id: str creators: tuple[UpstreamCreator, ...] @@ -117,6 +161,19 @@ class UpstreamEntry: quality: UpstreamQuality +@dataclass(frozen=True) +class UpstreamPin: + """The ``(repo, revision)`` pair written to ``letter_set.v1.upstream``. + + ``repo`` is the ``owner/name`` form of the upstream remote (e.g. + ``"HeOCR/public-domain-hand-written-hebrew-scans"``). ``revision`` + is the full SHA of the pinned ``HEAD`` commit. + """ + + repo: str + revision: str + + def _parse_creator(raw: dict[str, Any]) -> UpstreamCreator: return UpstreamCreator( name=raw["name"], @@ -168,10 +225,13 @@ def _parse_entry(raw: dict[str, Any]) -> UpstreamEntry: def load_entries(path: Path) -> Iterator[UpstreamEntry]: """Stream :class:`UpstreamEntry` records from ``entries.jsonl``. - Blank lines (whitespace-only) are skipped. A line that is not valid - JSON, or whose shape does not match the modelled subset, raises + Raises :class:`UpstreamError` when ``path`` does not exist. Blank + lines (whitespace-only) are skipped. A line that is not valid JSON, + or whose shape does not match the modelled subset, raises :class:`UpstreamLoadError` carrying the 1-based ``line_number``. """ + if not path.is_file(): + raise UpstreamError(f"entries file not found: {path}") with path.open("r", encoding="utf-8") as fh: for line_number, raw_line in enumerate(fh, start=1): if not raw_line.strip(): @@ -200,11 +260,29 @@ def load_entries(path: Path) -> Iterator[UpstreamEntry]: ) from exc +def is_eligible(entry: UpstreamEntry) -> bool: + """Whether ``entry`` satisfies the LICENSE-POLICY rights-and-quality gate. + + Implemented with short-circuit evaluation — safe to call in tight + filter loops over large entry sets. For human-readable failure + reasons use :func:`explain_ineligible`. + """ + rights = entry.rights + return ( + rights.license_expression in ALLOWED_LICENSES + and rights.commercial_use_allowed is True + and rights.derivatives_allowed is True + and rights.scan_redistribution_allowed is True + and rights.verification_status in ALLOWED_VERIFICATION_STATUSES + and entry.quality.usable_for_htr is True + ) + + def explain_ineligible(entry: UpstreamEntry) -> list[str]: - """Return human-readable reasons ``entry`` fails eligibility. + """Return human-readable reasons ``entry`` fails the eligibility gate. - Returns an empty list when the entry passes. The check mirrors - :func:`is_eligible`; the two stay in lockstep. + Returns an empty list when the entry passes all checks. Intended for + diagnostic output (CLI, logs); use :func:`is_eligible` in filter loops. """ reasons: list[str] = [] rights = entry.rights @@ -219,20 +297,16 @@ def explain_ineligible(entry: UpstreamEntry) -> list[str]: reasons.append("rights.derivatives_allowed is not True") if rights.scan_redistribution_allowed is not True: reasons.append("rights.scan_redistribution_allowed is not True") - if rights.verification_status in FORBIDDEN_VERIFICATION_STATUSES: + if rights.verification_status not in ALLOWED_VERIFICATION_STATUSES: reasons.append( - f"rights.verification_status {rights.verification_status!r} is forbidden" + f"rights.verification_status {rights.verification_status!r} " + "is not in the set of allowed verification statuses" ) if entry.quality.usable_for_htr is not True: reasons.append("quality.usable_for_htr is not True") return reasons -def is_eligible(entry: UpstreamEntry) -> bool: - """Whether ``entry`` satisfies the LICENSE-POLICY rights-and-quality gate.""" - return not explain_ineligible(entry) - - # Match the trailing ``owner/name`` of an ``origin`` URL. Handles # ``https://host/owner/name(.git)``, ``git@host:owner/name(.git)``, # ``ssh://git@host/owner/name(.git)``, and ``git://`` variants. @@ -242,30 +316,36 @@ def is_eligible(entry: UpstreamEntry) -> bool: def _normalize_remote_url(url: str) -> str: match = _REMOTE_TAIL_RE.search(url.strip()) if not match: - raise ValueError(f"could not parse 'owner/name' from remote URL: {url!r}") + raise UpstreamError(f"could not parse 'owner/name' from remote URL: {url!r}") return f"{match.group(1)}/{match.group(2)}" -def _run_git(path: Path, *args: str) -> str: - result = subprocess.run( +def _run_git_raw(path: Path, *args: str) -> subprocess.CompletedProcess[str]: + """Run a git command and return the full result without raising.""" + return subprocess.run( ["git", "-C", str(path), *args], capture_output=True, text=True, check=False, ) + + +def _run_git(path: Path, *args: str) -> str: + """Run a git command; raise :class:`UpstreamError` on non-zero exit.""" + result = _run_git_raw(path, *args) if result.returncode != 0: - raise RuntimeError( + raise UpstreamError( f"git {' '.join(args)} failed in {path}: {result.stderr.strip()}" ) return result.stdout -def upstream_pin_from_checkout(path: Path) -> tuple[str, str]: - """Return ``(repo, revision)`` suitable for ``letter_set.v1.upstream``. +def upstream_pin_from_checkout(path: Path) -> UpstreamPin: + """Return an :class:`UpstreamPin` suitable for ``letter_set.v1.upstream``. - * ``repo`` is the ``owner/name`` form derived from + * ``pin.repo`` is the ``owner/name`` form derived from ``git remote get-url origin``. - * ``revision`` is the current ``HEAD`` SHA. + * ``pin.revision`` is the current ``HEAD`` SHA. Raises :class:`UpstreamCheckoutDirtyError` when the working tree is dirty, and :class:`UpstreamDetachedHeadError` when HEAD is detached. @@ -279,31 +359,29 @@ def upstream_pin_from_checkout(path: Path) -> tuple[str, str]: f"upstream checkout at {path} has uncommitted changes; " "commit, stash, or reset before pinning" ) - # ``git symbolic-ref`` exits non-zero when HEAD is detached. - sym_result = subprocess.run( - ["git", "-C", str(path), "symbolic-ref", "--quiet", "HEAD"], - capture_output=True, - text=True, - check=False, - ) - if sym_result.returncode != 0: + # ``git symbolic-ref`` exits non-zero when HEAD is detached; use + # _run_git_raw so we can inspect the return code without raising. + sym = _run_git_raw(path, "symbolic-ref", "--quiet", "HEAD") + if sym.returncode != 0: raise UpstreamDetachedHeadError( f"upstream checkout at {path} is in detached HEAD state; " "check out a branch or tag before pinning" ) remote_url = _run_git(path, "remote", "get-url", "origin").strip() revision = _run_git(path, "rev-parse", "HEAD").strip() - return _normalize_remote_url(remote_url), revision + return UpstreamPin(repo=_normalize_remote_url(remote_url), revision=revision) __all__ = [ - "FORBIDDEN_VERIFICATION_STATUSES", + "ALLOWED_VERIFICATION_STATUSES", "UpstreamCheckoutDirtyError", "UpstreamCreator", "UpstreamDetachedHeadError", "UpstreamEntry", + "UpstreamError", "UpstreamFile", "UpstreamLoadError", + "UpstreamPin", "UpstreamQuality", "UpstreamRights", "explain_ineligible", diff --git a/tests/test_upstream.py b/tests/test_upstream.py index 4c1aaef..66db8a4 100644 --- a/tests/test_upstream.py +++ b/tests/test_upstream.py @@ -11,7 +11,10 @@ UpstreamCheckoutDirtyError, UpstreamDetachedHeadError, UpstreamEntry, + UpstreamError, UpstreamLoadError, + UpstreamPin, + _normalize_remote_url, explain_ineligible, is_eligible, load_entries, @@ -58,7 +61,7 @@ def test_load_entries_skips_blank_lines(tmp_path: Path) -> None: target = tmp_path / "entries.jsonl" target.write_text( "\n" - ' \n' + " \n" '{"entry_id":"a__b__p0001","source_id":"a__b","creators":[],"files":[],' '"rights":{"verification_status":"primary_page_checked"},' '"quality":{"legibility":"high"}}\n' @@ -70,6 +73,12 @@ def test_load_entries_skips_blank_lines(tmp_path: Path) -> None: assert entries[0].entry_id == "a__b__p0001" +def test_load_entries_raises_on_missing_file(tmp_path: Path) -> None: + missing = tmp_path / "nonexistent.jsonl" + with pytest.raises(UpstreamError): + list(load_entries(missing)) + + def test_load_entries_raises_on_malformed_json(tmp_path: Path) -> None: target = tmp_path / "entries.jsonl" target.write_text( @@ -96,6 +105,14 @@ def test_load_entries_raises_on_missing_required_field(tmp_path: Path) -> None: assert excinfo.value.line_number == 1 +def test_load_error_is_upstream_error(tmp_path: Path) -> None: + """UpstreamLoadError must be catchable as UpstreamError.""" + target = tmp_path / "entries.jsonl" + target.write_text("{not valid\n", encoding="utf-8") + with pytest.raises(UpstreamError): + list(load_entries(target)) + + # --- eligibility ------------------------------------------------------------- @@ -125,6 +142,32 @@ def test_reject_unverified_status(entries_by_id: dict[str, UpstreamEntry]) -> No assert any("verification_status" in r for r in reasons) +def test_is_eligible_and_explain_ineligible_agree( + entries_by_id: dict[str, UpstreamEntry], +) -> None: + """is_eligible and explain_ineligible must agree on every fixture entry.""" + for entry in entries_by_id.values(): + assert is_eligible(entry) == (explain_ineligible(entry) == []) + + +# --- _normalize_remote_url (unit tests — no subprocess overhead) ------------- + + +@pytest.mark.parametrize( + ("url", "expected"), + [ + ("https://github.com/HeOCR/foo.git", "HeOCR/foo"), + ("https://github.com/HeOCR/foo", "HeOCR/foo"), + ("https://github.com/HeOCR/foo/", "HeOCR/foo"), + ("git@github.com:HeOCR/foo.git", "HeOCR/foo"), + ("ssh://git@github.com/HeOCR/foo.git", "HeOCR/foo"), + ("git://github.com/HeOCR/foo.git", "HeOCR/foo"), + ], +) +def test_normalize_remote_url(url: str, expected: str) -> None: + assert _normalize_remote_url(url) == expected + + # --- upstream_pin_from_checkout ---------------------------------------------- @@ -151,12 +194,12 @@ def _init_repo(path: Path, remote_url: str) -> str: return rev -def test_pin_returns_repo_and_revision(tmp_path: Path) -> None: +def test_pin_returns_upstream_pin(tmp_path: Path) -> None: repo = tmp_path / "upstream" rev = _init_repo(repo, "https://github.com/HeOCR/public-domain-hand-written-hebrew-scans.git") - assert upstream_pin_from_checkout(repo) == ( - "HeOCR/public-domain-hand-written-hebrew-scans", - rev, + assert upstream_pin_from_checkout(repo) == UpstreamPin( + repo="HeOCR/public-domain-hand-written-hebrew-scans", + revision=rev, ) @@ -176,6 +219,15 @@ def test_pin_refuses_detached_head(tmp_path: Path) -> None: upstream_pin_from_checkout(repo) +def test_checkout_errors_are_upstream_errors(tmp_path: Path) -> None: + """UpstreamCheckoutDirtyError and UpstreamDetachedHeadError must be UpstreamError.""" + repo = tmp_path / "upstream" + rev = _init_repo(repo, "https://github.com/HeOCR/foo.git") + _git(repo, "checkout", "--detach", rev) + with pytest.raises(UpstreamError): + upstream_pin_from_checkout(repo) + + @pytest.mark.parametrize( "remote_url", [ @@ -186,7 +238,8 @@ def test_pin_refuses_detached_head(tmp_path: Path) -> None: ], ) def test_pin_normalizes_remote_url(tmp_path: Path, remote_url: str) -> None: - repo = tmp_path / f"upstream_{abs(hash(remote_url))}" + # Each parametrize invocation gets its own tmp_path — no name collision. + repo = tmp_path / "upstream" _init_repo(repo, remote_url) - repo_name, _ = upstream_pin_from_checkout(repo) - assert repo_name == "HeOCR/foo" + pin = upstream_pin_from_checkout(repo) + assert pin.repo == "HeOCR/foo"