Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/upstream_integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fetches scans on its own.

1. **Pin an upstream revision.**
[`upstream_pin_from_checkout(path)`](../src/hletterscriptgen/upstream.py)
returns `(repo, revision)` for the
returns an `UpstreamPin(repo, revision)` for the
[`letter_set.v1.upstream`](letter_set_v1.md#upstream) block. `repo`
is the `owner/name` form derived from `git remote get-url origin`
(https, ssh, and `.git`-suffixed URLs all normalize the same way);
Expand All @@ -50,9 +50,9 @@ fetches scans on its own.
- `rights.commercial_use_allowed`, `rights.derivatives_allowed`,
and `rights.scan_redistribution_allowed` are all `True`. `None`
fails the gate intentionally — only positive assertions pass.
- `rights.verification_status ∉ FORBIDDEN_VERIFICATION_STATUSES`
(`unverified`, `source_note_only`, `conflicting`, `rejected`).
Only `primary_page_checked` is a positive verification.
- `rights.verification_status ∈ ALLOWED_VERIFICATION_STATUSES`
(currently only `primary_page_checked`). An allowlist is used so
that new upstream statuses fail closed until explicitly reviewed.
- `quality.usable_for_htr` is `True`.

The sibling helper
Expand Down
172 changes: 125 additions & 47 deletions src/hletterscriptgen/upstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
* :func:`load_entries` — stream a local ``entries.jsonl``.
* :func:`is_eligible` / :func:`explain_ineligible` — enforce the
rights-and-quality gate declared in ``LICENSE-POLICY.md``.
* :func:`upstream_pin_from_checkout` — derive the ``(repo, revision)``
pin recorded in ``letter_set.v1.upstream`` from a clean checkout.
* :func:`upstream_pin_from_checkout` — derive the ``UpstreamPin``
recorded in ``letter_set.v1.upstream`` from a clean checkout.

The eligibility gate is intentionally strict — entries pass only when
the upstream record positively asserts each required property. ``None``
Expand All @@ -34,17 +34,19 @@

from hletterscriptgen import ALLOWED_LICENSES

# Upstream ``rights.verification_status`` values that disqualify an
# entry. The upstream enum is
# ``{unverified, source_note_only, primary_page_checked, conflicting, rejected}``;
# only ``primary_page_checked`` represents a positive verification, so
# every other state is rejected here.
FORBIDDEN_VERIFICATION_STATUSES: frozenset[str] = frozenset(
{"unverified", "source_note_only", "conflicting", "rejected"}
)
# Upstream ``rights.verification_status`` values that represent a
# positive verification. The upstream enum is
# ``{unverified, source_note_only, primary_page_checked, conflicting, rejected}``.
# Using an allowlist (rather than a blocklist) ensures any new status
# added upstream fails closed until explicitly reviewed and added here.
ALLOWED_VERIFICATION_STATUSES: frozenset[str] = frozenset({"primary_page_checked"})


class UpstreamLoadError(ValueError):
class UpstreamError(Exception):
"""Base class for all errors raised by :mod:`hletterscriptgen.upstream`."""


class UpstreamLoadError(UpstreamError):
"""Raised when an ``entries.jsonl`` line cannot be parsed.

``line_number`` is 1-based and refers to the offending line in the
Expand All @@ -57,15 +59,15 @@ def __init__(self, message: str, *, path: Path, line_number: int) -> None:
self.line_number = line_number


class UpstreamCheckoutDirtyError(RuntimeError):
class UpstreamCheckoutDirtyError(UpstreamError):
"""Raised when an upstream checkout has uncommitted changes.

Reproducibility requires the pinned revision to fully describe the
bytes the generator read.
"""


class UpstreamDetachedHeadError(RuntimeError):
class UpstreamDetachedHeadError(UpstreamError):
"""Raised when an upstream checkout is in detached HEAD state.

A detached HEAD commit may exist only locally, making the pinned
Expand All @@ -76,14 +78,30 @@ class UpstreamDetachedHeadError(RuntimeError):

@dataclass(frozen=True)
class UpstreamCreator:
"""A creator record from the upstream ``creators[]`` list.

``name`` and ``role`` are always present. ``death_year`` is ``None``
when the upstream record does not include it (e.g. for anonymous
writers or living persons). ``authority_url`` is ``None`` when no
authority record is linked.
"""

name: str
role: str
role: str # e.g. "writer", "scribe"
death_year: int | None
authority_url: str | None = None


@dataclass(frozen=True)
class UpstreamFile:
"""A file record from the upstream ``files[]`` list.

``role`` is always present (e.g. ``"original"``, ``"thumbnail"``).
All other fields are optional in the upstream schema — ``None`` when
not recorded. ``local_path`` is relative to the upstream checkout
root; combine it with the checkout ``Path`` to get an absolute path.
"""

role: str
local_path: str | None
sha256: str | None
Expand All @@ -94,21 +112,47 @@ class UpstreamFile:

@dataclass(frozen=True)
class UpstreamRights:
license_expression: str | None
"""Rights metadata for an upstream entry.

``verification_status`` is always present (required by the upstream
schema). All other fields are optional (``None`` = "not recorded").
Only entries whose ``verification_status`` is in
:data:`ALLOWED_VERIFICATION_STATUSES` pass the eligibility gate;
``None`` fields uniformly fail the gate — the gate requires positive
assertion, not absence of denial.
"""

license_expression: str | None # SPDX expression, e.g. "CC0-1.0"
commercial_use_allowed: bool | None
derivatives_allowed: bool | None
scan_redistribution_allowed: bool | None
verification_status: str
verification_status: str # required; see ALLOWED_VERIFICATION_STATUSES


@dataclass(frozen=True)
class UpstreamQuality:
"""Quality assessment for an upstream entry.

``legibility`` is always present (required by the upstream schema).
``usable_for_htr`` is optional; ``None`` fails the eligibility gate.
Valid ``legibility`` values: ``"high"``, ``"medium"``, ``"low"``.
"""

usable_for_htr: bool | None
legibility: str
legibility: str # e.g. "high", "medium", "low"


@dataclass(frozen=True)
class UpstreamEntry:
"""A typed, frozen subset of one upstream scan entry.

Models the fields the generator pipeline actually consumes; the full
upstream schema (``schemas/entry.schema.json``) is broader.
``entry_id`` and ``source_id`` are always present and non-empty.
``creators`` and ``files`` may be empty tuples when the upstream
record has no entries in those lists.
"""

entry_id: str
source_id: str
creators: tuple[UpstreamCreator, ...]
Expand All @@ -117,6 +161,19 @@ class UpstreamEntry:
quality: UpstreamQuality


@dataclass(frozen=True)
class UpstreamPin:
"""The ``(repo, revision)`` pair written to ``letter_set.v1.upstream``.

``repo`` is the ``owner/name`` form of the upstream remote (e.g.
``"HeOCR/public-domain-hand-written-hebrew-scans"``). ``revision``
is the full SHA of the pinned ``HEAD`` commit.
"""

repo: str
revision: str


def _parse_creator(raw: dict[str, Any]) -> UpstreamCreator:
return UpstreamCreator(
name=raw["name"],
Expand Down Expand Up @@ -168,10 +225,13 @@ def _parse_entry(raw: dict[str, Any]) -> UpstreamEntry:
def load_entries(path: Path) -> Iterator[UpstreamEntry]:
"""Stream :class:`UpstreamEntry` records from ``entries.jsonl``.

Blank lines (whitespace-only) are skipped. A line that is not valid
JSON, or whose shape does not match the modelled subset, raises
Raises :class:`UpstreamError` when ``path`` does not exist. Blank
lines (whitespace-only) are skipped. A line that is not valid JSON,
or whose shape does not match the modelled subset, raises
:class:`UpstreamLoadError` carrying the 1-based ``line_number``.
"""
if not path.is_file():
raise UpstreamError(f"entries file not found: {path}")
with path.open("r", encoding="utf-8") as fh:
for line_number, raw_line in enumerate(fh, start=1):
if not raw_line.strip():
Expand Down Expand Up @@ -200,11 +260,29 @@ def load_entries(path: Path) -> Iterator[UpstreamEntry]:
) from exc


def is_eligible(entry: UpstreamEntry) -> bool:
"""Whether ``entry`` satisfies the LICENSE-POLICY rights-and-quality gate.

Implemented with short-circuit evaluation — safe to call in tight
filter loops over large entry sets. For human-readable failure
reasons use :func:`explain_ineligible`.
"""
rights = entry.rights
return (
rights.license_expression in ALLOWED_LICENSES
and rights.commercial_use_allowed is True
and rights.derivatives_allowed is True
and rights.scan_redistribution_allowed is True
and rights.verification_status in ALLOWED_VERIFICATION_STATUSES
and entry.quality.usable_for_htr is True
)


def explain_ineligible(entry: UpstreamEntry) -> list[str]:
"""Return human-readable reasons ``entry`` fails eligibility.
"""Return human-readable reasons ``entry`` fails the eligibility gate.

Returns an empty list when the entry passes. The check mirrors
:func:`is_eligible`; the two stay in lockstep.
Returns an empty list when the entry passes all checks. Intended for
diagnostic output (CLI, logs); use :func:`is_eligible` in filter loops.
"""
reasons: list[str] = []
rights = entry.rights
Expand All @@ -219,20 +297,16 @@ def explain_ineligible(entry: UpstreamEntry) -> list[str]:
reasons.append("rights.derivatives_allowed is not True")
if rights.scan_redistribution_allowed is not True:
reasons.append("rights.scan_redistribution_allowed is not True")
if rights.verification_status in FORBIDDEN_VERIFICATION_STATUSES:
if rights.verification_status not in ALLOWED_VERIFICATION_STATUSES:
reasons.append(
f"rights.verification_status {rights.verification_status!r} is forbidden"
f"rights.verification_status {rights.verification_status!r} "
"is not in the set of allowed verification statuses"
)
if entry.quality.usable_for_htr is not True:
reasons.append("quality.usable_for_htr is not True")
return reasons


def is_eligible(entry: UpstreamEntry) -> bool:
"""Whether ``entry`` satisfies the LICENSE-POLICY rights-and-quality gate."""
return not explain_ineligible(entry)


# Match the trailing ``owner/name`` of an ``origin`` URL. Handles
# ``https://host/owner/name(.git)``, ``git@host:owner/name(.git)``,
# ``ssh://git@host/owner/name(.git)``, and ``git://`` variants.
Expand All @@ -242,30 +316,36 @@ def is_eligible(entry: UpstreamEntry) -> bool:
def _normalize_remote_url(url: str) -> str:
match = _REMOTE_TAIL_RE.search(url.strip())
if not match:
raise ValueError(f"could not parse 'owner/name' from remote URL: {url!r}")
raise UpstreamError(f"could not parse 'owner/name' from remote URL: {url!r}")
return f"{match.group(1)}/{match.group(2)}"


def _run_git(path: Path, *args: str) -> str:
result = subprocess.run(
def _run_git_raw(path: Path, *args: str) -> subprocess.CompletedProcess[str]:
"""Run a git command and return the full result without raising."""
return subprocess.run(
["git", "-C", str(path), *args],
capture_output=True,
text=True,
check=False,
)


def _run_git(path: Path, *args: str) -> str:
"""Run a git command; raise :class:`UpstreamError` on non-zero exit."""
result = _run_git_raw(path, *args)
if result.returncode != 0:
raise RuntimeError(
raise UpstreamError(
f"git {' '.join(args)} failed in {path}: {result.stderr.strip()}"
)
return result.stdout


def upstream_pin_from_checkout(path: Path) -> tuple[str, str]:
"""Return ``(repo, revision)`` suitable for ``letter_set.v1.upstream``.
def upstream_pin_from_checkout(path: Path) -> UpstreamPin:
"""Return an :class:`UpstreamPin` suitable for ``letter_set.v1.upstream``.

* ``repo`` is the ``owner/name`` form derived from
* ``pin.repo`` is the ``owner/name`` form derived from
``git remote get-url origin``.
* ``revision`` is the current ``HEAD`` SHA.
* ``pin.revision`` is the current ``HEAD`` SHA.

Raises :class:`UpstreamCheckoutDirtyError` when the working tree is
dirty, and :class:`UpstreamDetachedHeadError` when HEAD is detached.
Expand All @@ -279,31 +359,29 @@ def upstream_pin_from_checkout(path: Path) -> tuple[str, str]:
f"upstream checkout at {path} has uncommitted changes; "
"commit, stash, or reset before pinning"
)
# ``git symbolic-ref`` exits non-zero when HEAD is detached.
sym_result = subprocess.run(
["git", "-C", str(path), "symbolic-ref", "--quiet", "HEAD"],
capture_output=True,
text=True,
check=False,
)
if sym_result.returncode != 0:
# ``git symbolic-ref`` exits non-zero when HEAD is detached; use
# _run_git_raw so we can inspect the return code without raising.
sym = _run_git_raw(path, "symbolic-ref", "--quiet", "HEAD")
if sym.returncode != 0:
raise UpstreamDetachedHeadError(
f"upstream checkout at {path} is in detached HEAD state; "
"check out a branch or tag before pinning"
)
remote_url = _run_git(path, "remote", "get-url", "origin").strip()
revision = _run_git(path, "rev-parse", "HEAD").strip()
return _normalize_remote_url(remote_url), revision
return UpstreamPin(repo=_normalize_remote_url(remote_url), revision=revision)


__all__ = [
"FORBIDDEN_VERIFICATION_STATUSES",
"ALLOWED_VERIFICATION_STATUSES",
"UpstreamCheckoutDirtyError",
"UpstreamCreator",
"UpstreamDetachedHeadError",
"UpstreamEntry",
"UpstreamError",
"UpstreamFile",
"UpstreamLoadError",
"UpstreamPin",
"UpstreamQuality",
"UpstreamRights",
"explain_ineligible",
Expand Down
Loading