From ea6b8b1f39b20bed626e54db6bbc5c314b44ebc6 Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Sun, 3 May 2026 13:56:29 +1000 Subject: [PATCH 1/8] fix: pin-freshness audit normalises sub-path actions before API call (#83) --- .github/scripts/check_pin_freshness.py | 21 +++++++++++-- .github/workflows/ci.yml | 16 +++++----- .github/workflows/codeql.yml | 4 +-- .github/workflows/eval-nightly.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/security.yml | 2 +- pyproject.toml | 2 +- tests/test_check_pin_freshness.py | 43 ++++++++++++++++++++++++++ uv.lock | 2 +- 9 files changed, 77 insertions(+), 17 deletions(-) diff --git a/.github/scripts/check_pin_freshness.py b/.github/scripts/check_pin_freshness.py index f901bdc..85ac943 100644 --- a/.github/scripts/check_pin_freshness.py +++ b/.github/scripts/check_pin_freshness.py @@ -109,6 +109,22 @@ def _fetch_json(url: str, token: str) -> dict[str, object] | None: return payload if isinstance(payload, dict) else None +def _action_repo(action: str) -> str: + """Return `owner/repo` for an action string that may carry a sub-path. + + Action references can be `owner/repo` or `owner/repo/path/to/subaction` + (e.g. `github/codeql-action/init`). Only the first two slash-segments + name the GitHub repository — the trailing segments are paths within + the repo's tree (containing per-subaction `action.yml` files). The + REST API endpoint we hit (`/repos/<owner>/<repo>/git/...`) only + accepts the `owner/repo` form; passing the full action string would + 404 on every sub-path action and surface as a false-positive + "tag no longer resolves" finding. + """ + parts = action.split("/", 2) + return "/".join(parts[:2]) if len(parts) >= 2 else action + + def _resolve_tag_sha(action: str, tag: str, token: str) -> str | None: """Return the commit SHA the tag points at, or None on missing/error. @@ -117,7 +133,8 @@ def _resolve_tag_sha(action: str, tag: str, token: str) -> str | None: commit. 
Lightweight tags resolve in one GET (the ref's `object.sha` is the commit directly). """ - ref = _fetch_json(f"{_API_BASE}/repos/{action}/git/refs/tags/{tag}", token) + repo = _action_repo(action) + ref = _fetch_json(f"{_API_BASE}/repos/{repo}/git/refs/tags/{tag}", token) if ref is None: return None obj = ref.get("object") @@ -131,7 +148,7 @@ def _resolve_tag_sha(action: str, tag: str, token: str) -> str | None: return obj_sha if obj_type == "tag": # Annotated tag — dereference to the commit it points at. - annotated = _fetch_json(f"{_API_BASE}/repos/{action}/git/tags/{obj_sha}", token) + annotated = _fetch_json(f"{_API_BASE}/repos/{repo}/git/tags/{obj_sha}", token) if annotated is None: return None inner = annotated.get("object") diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0c9bf08..2a22fb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -44,7 +44,7 @@ jobs: # Pure in-process tests — completes fast so PR authors get quick feedback. 
steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -57,7 +57,7 @@ jobs: # Enforces [tool.coverage.report].fail_under from pyproject.toml (75%). steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -69,7 +69,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -84,7 +84,7 @@ jobs: # secret past the first defence layer. steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -218,7 +218,7 @@ jobs: # actual workflow jobs on disk. 
steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" @@ -234,7 +234,7 @@ jobs: # while PR titles fail in CI (or vice versa). steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a06c133..85c73ad 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,12 +44,12 @@ jobs: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/eval-nightly.yml b/.github/workflows/eval-nightly.yml index 3b069a9..9020446 100644 --- a/.github/workflows/eval-nightly.yml +++ b/.github/workflows/eval-nightly.yml @@ -39,7 +39,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: ${{ inputs.python_version || '3.14' }} diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index 577bef4..60d3df2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,7 +29,7 @@ jobs: # annotation when a new release lands and you've reviewed the diff. - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 04c6894..8a01886 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -44,7 +44,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.14" diff --git a/pyproject.toml b/pyproject.toml index c1f5158..a01deaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.9" +version = "0.2.10" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." 
readme = "README.md" requires-python = ">=3.14" diff --git a/tests/test_check_pin_freshness.py b/tests/test_check_pin_freshness.py index 4f62ee2..6aabccb 100644 --- a/tests/test_check_pin_freshness.py +++ b/tests/test_check_pin_freshness.py @@ -76,6 +76,49 @@ def test_resolve_returns_none_on_malformed_payload() -> None: assert cpf._resolve_tag_sha("foo/bar", "v1.0.0", "fake") is None +# ---------- _action_repo (sub-path normalisation) ---------- + + +def test_action_repo_passthrough_for_owner_repo() -> None: + assert cpf._action_repo("actions/checkout") == "actions/checkout" + + +def test_action_repo_strips_subpath() -> None: + """`github/codeql-action/init` → `github/codeql-action` (subpath isn't a repo).""" + assert cpf._action_repo("github/codeql-action/init") == "github/codeql-action" + + +def test_action_repo_strips_deep_subpath() -> None: + """Deeply nested sub-actions still strip back to owner/repo.""" + assert cpf._action_repo("owner/repo/path/to/sub-action") == "owner/repo" + + +def test_resolve_tag_sha_uses_owner_repo_for_subpath_action( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Regression for the false-positive 404 on sub-path actions. + + Before this fix, _resolve_tag_sha passed `github/codeql-action/init` as + the API path segment, hitting `/repos/github/codeql-action/init/...` + which 404s (init is a tree path, not a repo). The audit then reported + `init@v4 — upstream tag no longer resolves` even though `v4` resolves + fine on `github/codeql-action`. 
+ """ + seen_urls: list[str] = [] + + def fake_fetch(url: str, _token: str) -> dict[str, object] | None: + seen_urls.append(url) + return {"object": {"type": "commit", "sha": "deadbeef" * 5}} + + monkeypatch.setattr(cpf, "_fetch_json", fake_fetch) + sha = cpf._resolve_tag_sha("github/codeql-action/init", "v4", "fake") + assert sha == "deadbeef" * 5 + assert ( + seen_urls[0] + == "https://api.github.com/repos/github/codeql-action/git/refs/tags/v4" + ), seen_urls + + # ---------- _check_tag_pin ---------- diff --git a/uv.lock b/uv.lock index 8fe0b4f..cdc1eae 100644 --- a/uv.lock +++ b/uv.lock @@ -328,7 +328,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.9" +version = "0.2.10" source = { virtual = "." } dependencies = [ { name = "fastapi" }, From d256e32a833ddf92da5a51b841dcf176a70cedbc Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:17:23 +1000 Subject: [PATCH 2/8] fix: bump idna + starlette to patched versions, 0.2.11 (#103) pip-audit on develop is flagging two transitive-dep CVEs: - idna 3.13 CVE-2026-45409 (fix in 3.15+) - starlette 1.0.0 PYSEC-2026-161 (fix in 1.0.1+) Both are surfaced via fastapi/httpx. Bumps via: uv lock --upgrade-package idna --upgrade-package starlette Resolves to idna 3.16 (3.15 was the listed fix; 3.16 is a further patch with the same fix) and starlette 1.1.0 (minor bump; FastAPI is compatible with it). All 192 unit tests pass on the upgraded lock. Bumps the project self-version 0.2.10 -> 0.2.11 per docs/DEVELOPMENT.md. Unblocks the pip-audit CI gate on #99, #100, #101, #102 (and any other PRs currently sitting on develop), all of which inherit the flagged transitive CVEs from develop and cannot pass that gate until this lands. 
--- pyproject.toml | 2 +- uv.lock | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a01deaa..0651387 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.10" +version = "0.2.11" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/uv.lock b/uv.lock index cdc1eae..e8fcd8c 100644 --- a/uv.lock +++ b/uv.lock @@ -328,7 +328,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.10" +version = "0.2.11" source = { virtual = "." } dependencies = [ { name = "fastapi" }, @@ -438,11 +438,11 @@ wheels = [ [[package]] name = "idna" -version = "3.13" +version = "3.16" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ce/cc/762dfb036166873f0059f3b7de4565e1b5bc3d6f28a414c13da27e442f99/idna-3.13.tar.gz", hash = "sha256:585ea8fe5d69b9181ec1afba340451fba6ba764af97026f92a91d4eef164a242", size = 194210, upload-time = "2026-04-22T16:42:42.314Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/88/bcf9709822fe69d02c2a6a77956c98ce6ea8ca8767a9aadcedc7eb6a2390/idna-3.16.tar.gz", hash = "sha256:d7a6da03db833450fca25d2358ac9ff06cd624577a4aea3a596d5c0f77b8e03d", size = 203770, upload-time = "2026-05-22T00:16:18.781Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" }, + { url = "https://files.pythonhosted.org/packages/94/16/70255075a9859a0e3adb789b68ceb0e210dec03934245fd98d248226572f/idna-3.16-py3-none-any.whl", hash = "sha256:cc246e3a3f89580c3a951b5ad298ca4638078b2cdd4f115654332b5c26daded5", 
size = 74165, upload-time = "2026-05-22T00:16:16.698Z" }, ] [[package]] @@ -1104,14 +1104,14 @@ wheels = [ [[package]] name = "starlette" -version = "1.0.0" +version = "1.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/81/69/17425771797c36cded50b7fe44e850315d039f28b15901ab44839e70b593/starlette-1.0.0.tar.gz", hash = "sha256:6a4beaf1f81bb472fd19ea9b918b50dc3a77a6f2e190a12954b25e6ed5eea149", size = 2655289, upload-time = "2026-03-22T18:29:46.779Z" } +sdist = { url = "https://files.pythonhosted.org/packages/95/66/4d20cdf39a8d6a51e663b7038e3b828ff211d3891a43a713fe7e4643f3a8/starlette-1.1.0.tar.gz", hash = "sha256:e83c7fe0ddecd8719c5b840080325aec0260acec86e9832899e377b91d65e90f", size = 2660060, upload-time = "2026-05-23T16:55:41.376Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/c9/584bc9651441b4ba60cc4d557d8a547b5aff901af35bda3a4ee30c819b82/starlette-1.0.0-py3-none-any.whl", hash = "sha256:d3ec55e0bb321692d275455ddfd3df75fff145d009685eb40dc91fc66b03d38b", size = 72651, upload-time = "2026-03-22T18:29:45.111Z" }, + { url = "https://files.pythonhosted.org/packages/93/79/920b8e0a8b20f793e8d64855095cb8febabf6175b8550b6f7a547d813891/starlette-1.1.0-py3-none-any.whl", hash = "sha256:7f0dfd38e428aad5cb6f9f667f0ca1d2d8ca3f3385dccac8305f79ec98458382", size = 72899, upload-time = "2026-05-23T16:55:39.201Z" }, ] [[package]] From 18b4d30120159d61af388838d96ef9b696bb5ca6 Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:21:46 +1000 Subject: [PATCH 3/8] feat: eval pattern examples calling Azure OpenAI (#104) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: eval pattern examples calling Azure OpenAI (#94) The eval slice previously shipped one toy case (echo-hello) and a disabled-by-default nightly. 
A reader expecting an LLM-eval story found the infrastructure without conviction. Adds four worked-pattern cases that exercise the existing three tolerance modes against a real Azure OpenAI deployment. These are not benchmarks — they demonstrate what an eval case *looks like* for the four LLM-eval patterns you most often need to write: - factual-http-200 exact_match format-constrained recall - numeric-seconds-per-day numeric_close numeric reasoning + tolerance - definitional-fastapi-depends semantic_similar free-form judge-scored prose - structured-json-status exact_match structured-output adherence When the template is forked for a real project, replace these four with cases that exercise the project's own prompts; the patterns transfer regardless of what product is bolted on. Provider choice — Azure OpenAI via the openai SDK with AzureOpenAI client — is intentionally distinct from the rest of the harness (which uses Claude via Claude Code). Demonstrates that the LLMClient Protocol in src/eval/judge.py does its job: the eval core never imports openai, vendor lock-in lives only in the adapter. Changes: - src/eval/adapters/azure_openai.py — implements LLMClient via the openai.AzureOpenAI SDK. Reads endpoint/key/deployment/api-version from env. Lazy-imports the SDK so the module is importable without the optional extra installed; the adapter raises a clear AzureOpenAIConfigError if the env or SDK is missing. - eval/golden_patterns.json — the four cases with notes explaining which pattern each demonstrates. - eval/test_golden_patterns.py — separate test file gated on the Azure env vars via pytestmark. Skipped on a stock checkout, so `uv run pytest eval/` always exits 0. The toy test_golden_qa.py keeps running as before. - pyproject.toml — new optional [project.optional-dependencies] eval extra (just `openai>=1.40.0`), mypy override for openai.* matching the existing opentelemetry.* pattern, and a 0.2.10 -> 0.2.11 self-version bump. 
- .github/workflows/eval-nightly.yml — env vars renamed from the placeholder LLM_* set to AZURE_OPENAI_*. Header comment updated with the Azure setup recipe. uv sync now passes --extra eval. - docs/EVAL_HARNESS.md — new "Worked patterns" section with the table mapping case -> tolerance -> pattern, the local setup recipe, and a "Swapping providers" note documenting the Protocol-based extension path. Local gates: mypy --strict clean on 42 source files (was 31), ruff clean, ruff format clean, import-linter both contracts kept, 192 unit tests pass, eval/ runs 1 passed + 4 skipped without LLM env. Closes #94 * test: add adapter unit tests + adapters README (#94 review fixes) Addresses two gate failures on #104 surfaced by code review: 1. "Tests required" gate — feat: prefix declared a behaviour change but tests/ had no test for the new adapter (the eval/-side test only runs with live Azure credentials). Adds tests/test_eval_azure_openai_adapter.py: 13 fully-offline cases covering _resolve_config (defaults, override, empty-string fallback, missing-env error listing), the constructor (env wiring, explicit API version, missing-env, missing-SDK), and the two SDK call paths (complete_json structured-output mode, complete user-message dispatch, null-content returns "" / "{}"). The SDK is mocked at sys.modules level so the test never hits the network and never requires the openai extra to be installed. 2. "src/ README audit" gate — every src/ package needs a README.md per CLAUDE.md. Adds src/eval/adapters/README.md documenting the layer's purpose, the current adapter, a 7-step "adding a new adapter" recipe, and why the layer lives at the top of the import order. Also applies the reviewer's non-blocking sentinel-string suggestion: the magic "azure-deployment" string passed as judge_model in eval/test_golden_patterns.py is now the named constant _AZURE_DEPLOYMENT_SENTINEL with a comment explaining why the runner threads it through but the Azure adapter discards it. 
Local gates: 205 unit tests pass (was 192, +13 new), mypy clean on 43 source files, ruff/format/import-linter all green. Refs #94 * docs: add Key interfaces section to adapters README (#94 review) src/ README audit gate looks for a `## Key interfaces` (or `## Public surface`) anchor — the existing README had purpose / table / extension recipe / layering rationale, but no exported-names section. Adds a `## Key interfaces` section listing the two exported names: - AzureOpenAIClient — the LLMClient implementation with notes on complete() vs complete_json() and the discarded `model` arg (Azure dispatches by deployment, not model). - AzureOpenAIConfigError — the construction-time error type, noting that it batches every missing env var into a single message instead of failing-and-retrying. Both already documented in the adapter docstrings; this section hoists them to the README anchor the audit gate enforces. Refs #94 * chore: bump version to 0.2.12 (rebase onto develop after #103) --- .github/workflows/eval-nightly.yml | 35 ++-- docs/EVAL_HARNESS.md | 52 +++++- eval/golden_patterns.json | 38 ++++ eval/test_golden_patterns.py | 86 +++++++++ pyproject.toml | 14 +- src/eval/adapters/README.md | 40 ++++ src/eval/adapters/__init__.py | 13 ++ src/eval/adapters/azure_openai.py | 123 +++++++++++++ tests/test_eval_azure_openai_adapter.py | 234 ++++++++++++++++++++++++ uv.lock | 92 +++++++++- 10 files changed, 702 insertions(+), 25 deletions(-) create mode 100644 eval/golden_patterns.json create mode 100644 eval/test_golden_patterns.py create mode 100644 src/eval/adapters/README.md create mode 100644 src/eval/adapters/__init__.py create mode 100644 src/eval/adapters/azure_openai.py create mode 100644 tests/test_eval_azure_openai_adapter.py diff --git a/.github/workflows/eval-nightly.yml b/.github/workflows/eval-nightly.yml index 9020446..2ca8981 100644 --- a/.github/workflows/eval-nightly.yml +++ b/.github/workflows/eval-nightly.yml @@ -1,12 +1,15 @@ # Eval harness nightly — 
disabled-by-default. # -# This workflow runs the golden QA dataset against the agent / LLM loop. It -# is `workflow_dispatch`-only by default to prevent accidental LLM API -# spend. To enable nightly runs: +# This workflow runs the golden QA dataset + worked-pattern cases against a +# real Azure OpenAI deployment. It is `workflow_dispatch`-only by default +# to prevent accidental API spend. To enable nightly runs: +# +# 1. Set the Azure OpenAI secrets in repo settings: +# AZURE_OPENAI_ENDPOINT e.g. https://my.openai.azure.com +# AZURE_OPENAI_API_KEY the Azure resource key +# AZURE_OPENAI_DEPLOYMENT deployment name, e.g. gpt-4o-mini +# AZURE_OPENAI_API_VERSION optional, defaults to 2024-10-21 # -# 1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum; -# LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from -# OpenAI defaults). # 2. Replace the `on:` block below with: # # on: @@ -14,9 +17,13 @@ # - cron: "0 6 * * *" # daily 06:00 UTC # workflow_dispatch: # -# 3. Add the `eval-nightly.yml` to EXEMPT_WORKFLOWS in -# `.github/scripts/check_required_contexts.py` if it's not already -# there (it is, by default — scheduled runs never gate PRs). +# 3. Confirm `eval-nightly.yml` is in EXEMPT_WORKFLOWS in +# `.github/scripts/check_required_contexts.py` (it is, by default +# — scheduled runs never gate PRs). +# +# When the Azure secrets are absent, eval/test_golden_patterns.py is +# skipped via pytestmark — the toy eval/test_golden_qa.py case still +# runs as a smoke check on the runner mechanics. # # See docs/EVAL_HARNESS.md for the full setup story. 
@@ -43,11 +50,11 @@ jobs: - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: ${{ inputs.python_version || '3.14' }} - - run: uv sync --frozen --extra dev + - run: uv sync --frozen --extra dev --extra eval - name: Run pytest eval/ env: - LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} - LLM_MODEL: ${{ secrets.LLM_MODEL }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }} + AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }} run: uv run pytest eval/ -v diff --git a/docs/EVAL_HARNESS.md b/docs/EVAL_HARNESS.md index ec115b1..d352636 100644 --- a/docs/EVAL_HARNESS.md +++ b/docs/EVAL_HARNESS.md @@ -6,15 +6,19 @@ LLM-driven systems regress in ways unit tests don't catch: the prompt drifts, th ``` src/eval/ -├── models.py # EvalCase, EvalResult (Pydantic) -├── runner.py # EvalRunner — generic, takes a Callable[[str], str] -├── judge.py # LLMClient Protocol + semantic-similarity judge -├── report.py # Markdown report generator -└── __main__.py # python -m src.eval +├── models.py # EvalCase, EvalResult (Pydantic) +├── runner.py # EvalRunner — generic, takes a Callable[[str], str] +├── judge.py # LLMClient Protocol + semantic-similarity judge +├── report.py # Markdown report generator +├── __main__.py # python -m src.eval +└── adapters/ + └── azure_openai.py # Concrete LLMClient for Azure OpenAI (optional extra) eval/ -├── golden_qa.json # The dataset (one trivial example case ships) -└── test_golden_qa.py # Parametrised pytest runner +├── golden_qa.json # Toy smoke case — runs without LLM credentials +├── test_golden_qa.py # Parametrised runner for the toy case +├── golden_patterns.json # Four worked-pattern cases — require Azure OpenAI +└── test_golden_patterns.py # Skipped unless AZURE_OPENAI_* env 
vars are set ``` ## How it works @@ -86,11 +90,43 @@ python -m src.eval # CLI runner — prints the markdown report The pytest invocation is marked `@pytest.mark.eval`, so the default `pytest tests/` skips it. +## Worked patterns (Azure OpenAI) + +The four cases in `eval/golden_patterns.json` are *not* benchmarks. They exist to demonstrate what an eval case looks like against each of the runner's tolerance modes; together they cover the four LLM-eval patterns you most often need to write: + +| Case ID | Tolerance | Pattern demonstrated | +|---|---|---| +| `factual-http-200` | `exact_match` | Format-constrained factual recall. The prompt forces a single canonical token; if the model wraps the answer in prose, the case fails loudly. | +| `numeric-seconds-per-day` | `numeric_close` | Numeric reasoning with extraction tolerance. The runner pulls the first number from each side and compares within 1 %, so `86,400` and `86400 seconds` both match. | +| `definitional-fastapi-depends` | `semantic_similar` | Free-form prose scored by an LLM judge at ≥ 0.8. Use for explanations and any case where wording can vary but the underlying claim is checkable. | +| `structured-json-status` | `exact_match` | Structured-output adherence. The prompt asks for raw JSON; markdown-fenced or prose-wrapped responses fail — which is the failure mode downstream parsers also hit. | + +The cases all call a real Azure OpenAI deployment via the adapter at `src/eval/adapters/azure_openai.py`. When you fork the template for a real project, replace these four with cases that exercise your own product's prompts; the patterns transfer. + +### Setup + +```sh +uv sync --extra dev --extra eval # installs the openai SDK + +export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com" +export AZURE_OPENAI_API_KEY="..." 
+export AZURE_OPENAI_DEPLOYMENT="gpt-4o-mini" # or whatever you deployed +export AZURE_OPENAI_API_VERSION="2024-10-21" # optional, this is the default + +uv run pytest eval/test_golden_patterns.py -v +``` + +Without the env vars, `eval/test_golden_patterns.py` is skipped via `pytestmark` — `eval/test_golden_qa.py` still runs as a smoke check on the runner mechanics, so `uv run pytest eval/` always exits 0 on a fresh checkout. + +### Swapping providers + +`src/eval/judge.py` defines `LLMClient` as a `Protocol` — the eval core does not import `openai` anywhere. To target a different provider (Anthropic, vLLM, vanilla OpenAI), write a new adapter under `src/eval/adapters/` that implements `complete_json(*, model, prompt) -> str` and update the runner fixture in your test file. Nothing in `src/eval/` itself changes. + ## Nightly opt-in `.github/workflows/eval-nightly.yml` ships `workflow_dispatch`-only by default to avoid accidental LLM API spend. To turn on a real nightly: -1. Add the LLM secrets in repo settings: `LLM_API_KEY` (required), `LLM_PROVIDER`, `LLM_BASE_URL`, `LLM_MODEL` (optional, depending on adapter). +1. Add the Azure OpenAI secrets in repo settings: `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_DEPLOYMENT`, and optionally `AZURE_OPENAI_API_VERSION`. 2. Replace the workflow's `on:` block with: diff --git a/eval/golden_patterns.json b/eval/golden_patterns.json new file mode 100644 index 0000000..d0b7316 --- /dev/null +++ b/eval/golden_patterns.json @@ -0,0 +1,38 @@ +[ + { + "id": "factual-http-200", + "question": "What HTTP status code means OK? Respond with only the number, no prose.", + "category": "factual-recall", + "expected_answer": "200", + "tolerance": "exact_match", + "difficulty": "easy", + "notes": "Pattern: factual recall with format-constrained output. exact_match works because the prompt forces a single canonical token. 
If the model adds prose (\"The status code is 200.\") this fails loudly — which is the point: format adherence is part of the assertion." + }, + { + "id": "numeric-seconds-per-day", + "question": "How many seconds are in 24 hours? Respond with the integer only.", + "category": "numeric-reasoning", + "expected_answer": "86400", + "tolerance": "numeric_close", + "difficulty": "easy", + "notes": "Pattern: numeric extraction with 1% tolerance. The runner pulls the first number from each side and compares ratios, so '86,400', '86400 seconds', and '86400.0' all match. Use this tolerance for math, conversions, and any case where formatting around the number is uninteresting." + }, + { + "id": "definitional-fastapi-depends", + "question": "In one sentence: what does FastAPI's Depends() do?", + "category": "definitional", + "expected_answer": "Depends declares a callable that FastAPI resolves at request time and injects the result into the parameter, enabling dependency injection for things like authentication, database sessions, or settings.", + "tolerance": "semantic_similar", + "difficulty": "medium", + "notes": "Pattern: free-form prose scored by LLM judge. semantic_similar passes at score >= 0.8 via the judge in src/eval/judge.py. Use this for definitions, explanations, and any case where wording can legitimately vary but the underlying claim is checkable." + }, + { + "id": "structured-json-status", + "question": "Return exactly this JSON object and nothing else (no markdown fence, no prose, no trailing newline): {\"ok\": true, \"version\": 1}", + "category": "structured-output", + "expected_answer": "{\"ok\": true, \"version\": 1}", + "tolerance": "exact_match", + "difficulty": "medium", + "notes": "Pattern: format adherence on structured output. Models commonly wrap JSON in ```json``` fences or add a preamble; exact_match after normalisation (lowercase + whitespace-collapse) accepts a clean response but rejects the fenced or prose-wrapped version. 
This is the failure mode you want to catch — downstream parsers break the same way." + } +] diff --git a/eval/test_golden_patterns.py b/eval/test_golden_patterns.py new file mode 100644 index 0000000..bf42432 --- /dev/null +++ b/eval/test_golden_patterns.py @@ -0,0 +1,86 @@ +"""LLM-eval pattern showcase — four worked cases that exercise the existing +tolerance modes against a real Azure OpenAI deployment. + +Each case demonstrates a different eval *pattern* (see notes inside +`eval/golden_patterns.json`): + + - factual recall with exact_match + - numeric reasoning with numeric_close + - free-form definitional with semantic_similar + - structured-output adherence with exact_match + +This file is *skipped entirely* unless the Azure OpenAI env vars are set +(`AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_DEPLOYMENT`). +Run with:: + + uv sync --extra eval --extra dev + AZURE_OPENAI_ENDPOINT=... AZURE_OPENAI_API_KEY=... \\ + AZURE_OPENAI_DEPLOYMENT=... uv run pytest eval/test_golden_patterns.py + +The toy `eval/test_golden_qa.py` runs without any credentials — that one +exercises the runner mechanics; this one exercises the runner against a +real model. +""" + +from __future__ import annotations + +import os +from pathlib import Path + +import pytest + +from src.eval.models import EvalCase +from src.eval.runner import EvalRunner, load_golden_dataset + +_PATTERNS_PATH = Path(__file__).resolve().parent / "golden_patterns.json" +_REQUIRED_ENV = ( + "AZURE_OPENAI_ENDPOINT", + "AZURE_OPENAI_API_KEY", + "AZURE_OPENAI_DEPLOYMENT", +) + +_missing = [name for name in _REQUIRED_ENV if not os.environ.get(name)] +pytestmark = [ + pytest.mark.eval, + pytest.mark.skipif( + bool(_missing), + reason=f"requires Azure OpenAI env vars: missing {', '.join(_missing)}", + ), +] + +patterns = load_golden_dataset(_PATTERNS_PATH) + +# Sentinel passed to EvalRunner.judge_model. 
The runner threads this through +# to LLMClient.complete_json(model=...), where the Azure adapter discards it +# — Azure addresses by deployment name (set at adapter construction), not by +# the model parameter. Named constant makes the intent obvious to a reader +# of this fixture without needing to chase into the adapter. +_AZURE_DEPLOYMENT_SENTINEL = "azure-deployment-from-env" + + +@pytest.fixture(scope="module") +def runner() -> EvalRunner: + """Construct the runner with one Azure client serving both roles + (answer_fn and judge_client). Same deployment for cost simplicity; + a real project might split subject and judge models.""" + from src.eval.adapters.azure_openai import AzureOpenAIClient + + client = AzureOpenAIClient() + return EvalRunner( + answer_fn=client.complete, + judge_client=client, + judge_model=_AZURE_DEPLOYMENT_SENTINEL, + ) + + +@pytest.mark.parametrize("case", patterns, ids=lambda c: c.id) +def test_golden_patterns(case: EvalCase, runner: EvalRunner) -> None: + """Run one worked pattern case against the live Azure deployment.""" + result = runner.evaluate(case) + assert result.pass_result, ( + f"[{case.id}] {case.category}/{case.difficulty}\n" + f"Q: {case.question}\n" + f"Expected: {case.expected_answer}\n" + f"Got: {result.actual_answer}\n" + f"Reason: {result.failure_reason}" + ) diff --git a/pyproject.toml b/pyproject.toml index 0651387..71c6d76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.11" +version = "0.2.12" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" @@ -55,6 +55,13 @@ dev = [ "commitizen>=4.0.0", "pyyaml>=6.0.3", ] +# Optional extra for the eval harness's LLM-backed pattern cases. Kept +# separate from `dev` so a contributor working on backend/frontend code +# never pulls the openai SDK or its transitive deps. 
See +# docs/EVAL_HARNESS.md for the full setup. +eval = [ + "openai>=1.40.0", +] [project.urls] Homepage = "https://github.com/constk/harness-python-react" @@ -122,6 +129,11 @@ warn_unused_ignores = true [[tool.mypy.overrides]] module = [ "opentelemetry.*", + # `openai` is an optional extra (see [project.optional-dependencies]). + # mypy on a stock `uv sync --extra dev` checkout doesn't see it; the + # adapter in src/eval/adapters/azure_openai.py wraps it in `Any` at + # the import boundary so the rest of src/ stays fully typed. + "openai.*", ] ignore_missing_imports = true diff --git a/src/eval/adapters/README.md b/src/eval/adapters/README.md new file mode 100644 index 0000000..e5a66e4 --- /dev/null +++ b/src/eval/adapters/README.md @@ -0,0 +1,40 @@ +# `src/eval/adapters` + +Concrete `LLMClient` adapters for the eval harness. The judge in [`src/eval/judge.py`](../judge.py) calls an `LLMClient` Protocol — never a vendor SDK directly. Each adapter in this package implements that Protocol for one provider, so the eval core stays vendor-neutral and a downstream consumer can swap providers by changing one wiring line in their test fixture. + +## Key interfaces + +Exported from this package: + +- **`AzureOpenAIClient`** — implements `src.eval.judge.LLMClient`. Construct from env via `AzureOpenAIClient()`; call `complete(prompt)` for runner `answer_fn` use, `complete_json(*, model, prompt)` for judge use. The `model` argument on `complete_json` is accepted for Protocol conformance and discarded — Azure addresses by deployment name (set at construction time, read from `AZURE_OPENAI_DEPLOYMENT`). +- **`AzureOpenAIConfigError`** — raised at construction when required env is missing or the optional `openai` extra is not installed. Subclass of `RuntimeError`. The error message names every missing env var in one go so the caller doesn't have to fix-and-retry. + +## Why this layer exists + +Without the Protocol seam, swapping LLM providers would mean touching the eval core. 
With it, vendor lock-in is confined to one file per provider. The layer demonstrates that the harness's "provider-agnostic" claim is structural, not aspirational: the eval core has zero imports of any vendor SDK. + +## Current adapters + +| File | Provider | Optional extra | Env contract | +|---|---|---|---| +| [`azure_openai.py`](azure_openai.py) | Azure OpenAI | `uv sync --extra eval` | `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_DEPLOYMENT`, optional `AZURE_OPENAI_API_VERSION` (default `2024-10-21`) | + +## Adding a new adapter + +1. Add the SDK to `[project.optional-dependencies]` in `pyproject.toml` — either to the existing `eval` extra or a new provider-scoped one. +2. Add the SDK's top-level module to `[[tool.mypy.overrides]]` with `ignore_missing_imports = true`, matching the existing `openai.*` / `opentelemetry.*` entries. This keeps mypy clean on stock `uv sync --extra dev` checkouts. +3. Implement `complete_json(*, model: str, prompt: str) -> str` per the `LLMClient` Protocol in [`src/eval/judge.py`](../judge.py). Optionally add a `complete(prompt: str) -> str` for use as an `EvalRunner.answer_fn`. +4. **Lazy-import the SDK inside `__init__`** so the adapter module remains importable without the optional extra installed. The import error path should raise a clear, named exception (e.g. `AzureOpenAIConfigError`) telling the reader which `uv sync --extra ...` to run. +5. Read configuration from environment variables at construction time. Raise the same named exception listing every missing var when env is incomplete — fail fast, fail clear. +6. Add an offline unit test in [`tests/`](../../../tests/) that mocks the SDK at the `sys.modules` level (see `tests/test_eval_azure_openai_adapter.py` for the pattern). This keeps the unit suite credential-free; live-credential paths are exercised by [`eval/test_golden_patterns.py`](../../../eval/test_golden_patterns.py). +7. 
Document the env contract in this README's table above and in [`docs/EVAL_HARNESS.md`](../../../docs/EVAL_HARNESS.md)'s "Worked patterns" section. + +## Why adapters live under `src/eval/` + +The import-linter contract in `pyproject.toml` puts `src.eval` at the top of the layered import order: + +``` +api | eval -> agent -> tools -> data -> observability -> models +``` + +Adapters can therefore depend on anything in `src/`; nothing in `src/` depends on them. That asymmetry is exactly what the layered architecture exists to encode — vendor-specific code stays at the boundary, never leaks down into the eval primitives or the model layer. diff --git a/src/eval/adapters/__init__.py b/src/eval/adapters/__init__.py new file mode 100644 index 0000000..7a11e47 --- /dev/null +++ b/src/eval/adapters/__init__.py @@ -0,0 +1,13 @@ +"""Concrete LLM-client adapters for the eval harness. + +The judge in `src.eval.judge` calls an `LLMClient` Protocol — never an SDK +directly. Each adapter in this package implements that Protocol for one +provider, so the eval core stays vendor-neutral and a downstream consumer +can swap providers by changing one wiring line. + +Adapters are intentionally thin: env-driven construction, lazy SDK import, +one `complete_json(...)` method. No retries, no streaming, no batching — +the goal is "works for nightly eval runs", not "production-grade client". +""" + +from __future__ import annotations diff --git a/src/eval/adapters/azure_openai.py b/src/eval/adapters/azure_openai.py new file mode 100644 index 0000000..a8ed742 --- /dev/null +++ b/src/eval/adapters/azure_openai.py @@ -0,0 +1,123 @@ +"""Azure OpenAI adapter implementing the eval-harness `LLMClient` Protocol. + +Why Azure and not vanilla OpenAI: the eval slice is intentionally +provider-distinct from the rest of the harness (which uses Claude via +Claude Code). Demonstrates that the `LLMClient` Protocol does its job — +the eval core in `src/eval/judge.py` doesn't import the `openai` SDK +anywhere. 
+ +Env vars (read at construction time; all required except API version): + + AZURE_OPENAI_ENDPOINT e.g. https://my-resource.openai.azure.com + AZURE_OPENAI_API_KEY the Azure resource key + AZURE_OPENAI_DEPLOYMENT deployment name, e.g. "gpt-4o-mini" + AZURE_OPENAI_API_VERSION optional; defaults to 2024-10-21 + +The `openai` SDK is an *optional* extra (`uv sync --extra eval`). Importing +this module does not require the SDK; only constructing `AzureOpenAIClient` +does. That keeps the rest of the harness importable on a stock +`uv sync --extra dev` checkout. +""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Mapping + + +class AzureOpenAIConfigError(RuntimeError): + """Raised when required Azure OpenAI configuration is missing.""" + + +_REQUIRED_ENV = ( + "AZURE_OPENAI_ENDPOINT", + "AZURE_OPENAI_API_KEY", + "AZURE_OPENAI_DEPLOYMENT", +) +_DEFAULT_API_VERSION = "2024-10-21" + + +def _resolve_config(env: Mapping[str, str]) -> tuple[str, str, str, str]: + """Read the four config values from env; raise with all missing names.""" + endpoint = env.get("AZURE_OPENAI_ENDPOINT", "") + api_key = env.get("AZURE_OPENAI_API_KEY", "") + deployment = env.get("AZURE_OPENAI_DEPLOYMENT", "") + api_version = env.get("AZURE_OPENAI_API_VERSION", "") or _DEFAULT_API_VERSION + + missing = [name for name in _REQUIRED_ENV if not env.get(name)] + if missing: + raise AzureOpenAIConfigError( + f"Missing required Azure OpenAI env vars: {', '.join(missing)}. " + "See docs/EVAL_HARNESS.md for the full setup." + ) + return endpoint, api_key, deployment, api_version + + +class AzureOpenAIClient: + """Implements `src.eval.judge.LLMClient` against an Azure OpenAI deployment. + + Used in two roles by `eval/test_golden_patterns.py`: + + 1. As the `answer_fn` — the thing whose output we are evaluating. + 2. As the `judge_client` — the LLM that scores `semantic_similar` + cases. 
Same deployment serves both for cost simplicity; a real + project might split judge and subject. + """ + + def __init__(self) -> None: + endpoint, api_key, deployment, api_version = _resolve_config(os.environ) + self._deployment = deployment + + # Lazy SDK import: keeps the module importable without `openai` + # installed. Constructing the client without the extra is the + # error case, not importing the module. + try: + from openai import AzureOpenAI + except ImportError as exc: # pragma: no cover - env-dependent + raise AzureOpenAIConfigError( + "openai SDK not installed. Run: uv sync --extra eval" + ) from exc + + self._client: Any = AzureOpenAI( + azure_endpoint=endpoint, + api_key=api_key, + api_version=api_version, + ) + + def complete(self, prompt: str) -> str: + """Return the assistant's plain-text response to `prompt`. + + Used as the eval runner's `answer_fn`. Returns "" if the model + returns no content (rare but possible for safety-filtered prompts). + """ + response = self._client.chat.completions.create( + model=self._deployment, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content or "" + + def complete_json(self, *, model: str, prompt: str) -> str: + """Return the assistant's response as a raw JSON string. + + Implements the `LLMClient` Protocol. The `model` argument is + accepted for Protocol conformance but ignored — Azure addresses + by deployment name, set at construction time. Uses Azure's + structured-output mode (`response_format={"type": "json_object"}`) + to guarantee parseable JSON. + """ + del model # Azure dispatches by deployment, not model + response = self._client.chat.completions.create( + model=self._deployment, + messages=[ + { + "role": "system", + "content": "Respond with valid JSON only. 
No prose, no markdown.", + }, + {"role": "user", "content": prompt}, + ], + response_format={"type": "json_object"}, + ) + return response.choices[0].message.content or "{}" diff --git a/tests/test_eval_azure_openai_adapter.py b/tests/test_eval_azure_openai_adapter.py new file mode 100644 index 0000000..7b08bca --- /dev/null +++ b/tests/test_eval_azure_openai_adapter.py @@ -0,0 +1,234 @@ +"""Offline unit tests for the Azure OpenAI eval adapter. + +These tests never hit the network. The `openai` SDK is replaced at the +`sys.modules` level so the adapter's lazy import resolves to a `MagicMock`, +which lets us assert on the constructor arguments and the chat-completions +call shape without an API key. + +The live-credential path is exercised by `eval/test_golden_patterns.py`, +which is skipped on stock checkouts. +""" + +from __future__ import annotations + +import sys +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from src.eval.adapters.azure_openai import ( + _DEFAULT_API_VERSION, + AzureOpenAIClient, + AzureOpenAIConfigError, + _resolve_config, +) + +# --------------------------------------------------------------------------- +# _resolve_config — pure function, no SDK involved +# --------------------------------------------------------------------------- + + +class TestResolveConfig: + """`_resolve_config` reads env, applies the default API version, and + raises a single `AzureOpenAIConfigError` naming every missing var.""" + + def test_returns_env_values_with_default_api_version(self) -> None: + env = { + "AZURE_OPENAI_ENDPOINT": "https://x.openai.azure.com", + "AZURE_OPENAI_API_KEY": "key", + "AZURE_OPENAI_DEPLOYMENT": "gpt-4o-mini", + } + endpoint, key, deploy, version = _resolve_config(env) + assert endpoint == "https://x.openai.azure.com" + assert key == "key" + assert deploy == "gpt-4o-mini" + assert version == _DEFAULT_API_VERSION + + def test_explicit_api_version_overrides_default(self) -> None: + env = { + 
"AZURE_OPENAI_ENDPOINT": "https://x.openai.azure.com", + "AZURE_OPENAI_API_KEY": "key", + "AZURE_OPENAI_DEPLOYMENT": "deploy", + "AZURE_OPENAI_API_VERSION": "2025-01-01", + } + _, _, _, version = _resolve_config(env) + assert version == "2025-01-01" + + def test_empty_api_version_falls_back_to_default(self) -> None: + env = { + "AZURE_OPENAI_ENDPOINT": "https://x.openai.azure.com", + "AZURE_OPENAI_API_KEY": "key", + "AZURE_OPENAI_DEPLOYMENT": "deploy", + "AZURE_OPENAI_API_VERSION": "", + } + _, _, _, version = _resolve_config(env) + assert version == _DEFAULT_API_VERSION + + def test_raises_listing_all_missing_when_none_set(self) -> None: + with pytest.raises(AzureOpenAIConfigError) as exc: + _resolve_config({}) + msg = str(exc.value) + assert "AZURE_OPENAI_ENDPOINT" in msg + assert "AZURE_OPENAI_API_KEY" in msg + assert "AZURE_OPENAI_DEPLOYMENT" in msg + + def test_raises_listing_only_missing(self) -> None: + env = { + "AZURE_OPENAI_ENDPOINT": "x", + "AZURE_OPENAI_DEPLOYMENT": "d", + # AZURE_OPENAI_API_KEY missing + } + with pytest.raises(AzureOpenAIConfigError) as exc: + _resolve_config(env) + msg = str(exc.value) + assert "AZURE_OPENAI_API_KEY" in msg + assert "AZURE_OPENAI_ENDPOINT" not in msg + assert "AZURE_OPENAI_DEPLOYMENT" not in msg + + +# --------------------------------------------------------------------------- +# AzureOpenAIClient — SDK is mocked at sys.modules level +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _env(monkeypatch: pytest.MonkeyPatch) -> None: + """Populate the three required env vars with test values.""" + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com") + monkeypatch.setenv("AZURE_OPENAI_API_KEY", "test-key") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "test-deploy") + monkeypatch.delenv("AZURE_OPENAI_API_VERSION", raising=False) + + +@pytest.fixture +def _mock_openai(monkeypatch: pytest.MonkeyPatch) -> MagicMock: + """Install a fake `openai` 
module exporting an `AzureOpenAI` constructor. + + The adapter's lazy `from openai import AzureOpenAI` will resolve to the + `MagicMock` returned here, so call-args assertions work without any SDK + installed. + """ + mock_constructor = MagicMock(name="AzureOpenAI") + fake_module = SimpleNamespace(AzureOpenAI=mock_constructor) + monkeypatch.setitem(sys.modules, "openai", fake_module) + return mock_constructor + + +class TestAzureOpenAIClientConstruction: + """Constructor wires env config into the SDK client and surfaces clear + errors when prerequisites are missing.""" + + def test_init_constructs_sdk_with_resolved_env_config( + self, _env: None, _mock_openai: MagicMock + ) -> None: + AzureOpenAIClient() + _mock_openai.assert_called_once_with( + azure_endpoint="https://x.openai.azure.com", + api_key="test-key", + api_version=_DEFAULT_API_VERSION, + ) + + def test_init_passes_explicit_api_version( + self, + _env: None, + _mock_openai: MagicMock, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("AZURE_OPENAI_API_VERSION", "2025-01-01") + AzureOpenAIClient() + kwargs = _mock_openai.call_args.kwargs + assert kwargs["api_version"] == "2025-01-01" + + def test_init_raises_when_env_missing( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + for name in ( + "AZURE_OPENAI_ENDPOINT", + "AZURE_OPENAI_API_KEY", + "AZURE_OPENAI_DEPLOYMENT", + ): + monkeypatch.delenv(name, raising=False) + with pytest.raises(AzureOpenAIConfigError, match="AZURE_OPENAI_ENDPOINT"): + AzureOpenAIClient() + + def test_init_raises_when_openai_sdk_missing( + self, + _env: None, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + # Force the lazy import inside __init__ to ImportError. Setting the + # module to None makes `from openai import AzureOpenAI` raise the + # exact ImportError the adapter catches. 
+ monkeypatch.setitem(sys.modules, "openai", None) + with pytest.raises(AzureOpenAIConfigError, match="openai SDK not installed"): + AzureOpenAIClient() + + +class TestAzureOpenAIClientCalls: + """`complete` and `complete_json` dispatch correctly to the SDK and + return the assistant message content.""" + + @staticmethod + def _mock_response(content: str | None) -> MagicMock: + """Build a ChatCompletion-shaped MagicMock with the given content.""" + message = MagicMock() + message.content = content + choice = MagicMock() + choice.message = message + response = MagicMock() + response.choices = [choice] + return response + + def test_complete_json_uses_structured_output_mode( + self, _env: None, _mock_openai: MagicMock + ) -> None: + sdk_instance = _mock_openai.return_value + sdk_instance.chat.completions.create.return_value = self._mock_response( + '{"ok": true}' + ) + + client = AzureOpenAIClient() + body = client.complete_json(model="ignored-per-Protocol", prompt="judge this") + + assert body == '{"ok": true}' + call = sdk_instance.chat.completions.create.call_args + assert call.kwargs["model"] == "test-deploy" + assert call.kwargs["response_format"] == {"type": "json_object"} + messages = call.kwargs["messages"] + assert messages[0]["role"] == "system" + assert "JSON" in messages[0]["content"] + assert messages[1] == {"role": "user", "content": "judge this"} + + def test_complete_json_returns_empty_json_on_null_content( + self, _env: None, _mock_openai: MagicMock + ) -> None: + sdk_instance = _mock_openai.return_value + sdk_instance.chat.completions.create.return_value = self._mock_response(None) + + client = AzureOpenAIClient() + assert client.complete_json(model="x", prompt="x") == "{}" + + def test_complete_dispatches_user_message_to_deployment( + self, _env: None, _mock_openai: MagicMock + ) -> None: + sdk_instance = _mock_openai.return_value + sdk_instance.chat.completions.create.return_value = self._mock_response("hi") + + client = AzureOpenAIClient() + 
assert client.complete("say hi") == "hi" + + call = sdk_instance.chat.completions.create.call_args + assert call.kwargs["model"] == "test-deploy" + assert call.kwargs["messages"] == [{"role": "user", "content": "say hi"}] + # complete() does not pin response_format — only complete_json does + assert "response_format" not in call.kwargs + + def test_complete_returns_empty_string_on_null_content( + self, _env: None, _mock_openai: MagicMock + ) -> None: + sdk_instance = _mock_openai.return_value + sdk_instance.chat.completions.create.return_value = self._mock_response(None) + + client = AzureOpenAIClient() + assert client.complete("x") == "" diff --git a/uv.lock b/uv.lock index e8fcd8c..1b94326 100644 --- a/uv.lock +++ b/uv.lock @@ -226,6 +226,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "fastapi" version = "0.136.1" @@ -328,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.11" +version = "0.2.12" source = { virtual = "." 
} dependencies = [ { name = "fastapi" }, @@ -357,6 +366,9 @@ dev = [ { name = "pyyaml" }, { name = "ruff" }, ] +eval = [ + { name = "openai" }, +] [package.metadata] requires-dist = [ @@ -365,6 +377,7 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "import-linter", marker = "extra == 'dev'", specifier = ">=2.0.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.15.0" }, + { name = "openai", marker = "extra == 'eval'", specifier = ">=1.40.0" }, { name = "opentelemetry-api", specifier = ">=1.33.0" }, { name = "opentelemetry-exporter-otlp-proto-grpc", specifier = ">=1.33.0" }, { name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.62b0" }, @@ -382,7 +395,7 @@ requires-dist = [ { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.11.0" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" }, ] -provides-extras = ["dev"] +provides-extras = ["dev", "eval"] [[package]] name = "httpcore" @@ -493,6 +506,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jiter" +version = "0.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/b5/55f06bb281d92fb3cc86d14e1def2bd908bb77693183e7cb1f5a3c388b0c/jiter-0.15.0.tar.gz", hash = "sha256:4251acc80e2b7c9b7b8823456ea0fceeb0734dac2df7636d3c711b38476b5a76", size = 166640, upload-time = "2026-05-19T10:09:48.361Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/d2/079f350ebf7859d081de30aa890f9e3be68516f754f3ba32366ffff4dcee/jiter-0.15.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:ac0d9ddea4350974be7a221fc25895f251a8fee748c889bdced2141c0fec1a49", size = 308884, upload-time = "2026-05-19T10:08:31.667Z" 
}, + { url = "https://files.pythonhosted.org/packages/04/4e/a2c30a7f69b48c03b20935d647479106fe932f6e63f75faf53937197e05d/jiter-0.15.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:01a8222cf05ab1128e239421156c207949808acaaea2bdfd33130ae666786e86", size = 310028, upload-time = "2026-05-19T10:08:33.304Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/2e7cdfd3cf8ca967be38c48f5cf474d79f089efaf559a40f15984a77ae69/jiter-0.15.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:182226cbc930c9fab81bc2e41a4da672f89539906dadb05e75670ac07b94f71f", size = 337485, upload-time = "2026-05-19T10:08:35.259Z" }, + { url = "https://files.pythonhosted.org/packages/9b/11/15a1aa28b120b8ee5b4f1fb894c125046225f09847738bd64233d3b84883/jiter-0.15.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:71683c38c825452999b5717fcae07ea708e8c93003e808be4319c1b02e3d176e", size = 364223, upload-time = "2026-05-19T10:08:36.694Z" }, + { url = "https://files.pythonhosted.org/packages/b7/25/f442e8af5f3d0dcf47b39e83a0efd9ee45ea946aa6d04625dc3181eae3b6/jiter-0.15.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30f2218e6a9e5c18bc10fe6d41ac189c442c88eacf11bad9f28ef95a9bef00e6", size = 456387, upload-time = "2026-05-19T10:08:38.143Z" }, + { url = "https://files.pythonhosted.org/packages/da/f4/37f2d2c9f64f49af7da652ed7532bb5a2372e588e6927c3fdd76f911db65/jiter-0.15.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5157de9f76eb4bc5ea74a1219366a25f945ad305641d74e04f59c54087091aa9", size = 374461, upload-time = "2026-05-19T10:08:39.869Z" }, + { url = "https://files.pythonhosted.org/packages/60/28/edcfbbbf0cb15436f36664a8908a0df47ab9006298d4cd937dc08ea932d6/jiter-0.15.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90c5db5527c221249a876160663ab891ace358c17f7b9c93ec1478b7f0550e5c", size = 345924, upload-time = "2026-05-19T10:08:41.668Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/13/89fba6398dab7f202b7278c4b4aac122399d2c0183971c4a57a3b7088df5/jiter-0.15.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:3e4540b8e74e4268811ac05db226a6a128ff572e7e0ce3f1163b693cadb184cd", size = 352283, upload-time = "2026-05-19T10:08:43.091Z" }, + { url = "https://files.pythonhosted.org/packages/1b/da/0f6af8cef2c565a1ab44d970f268c43ccaa72707386ea6388e6fe2b6cd26/jiter-0.15.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:62ebd14e47e9aed9df4472afcb2663668ce4d74891cd54f86bf6e44029d6dc89", size = 389985, upload-time = "2026-05-19T10:08:44.915Z" }, + { url = "https://files.pythonhosted.org/packages/a1/ec/b9cb7d6d29e24ee14910266157d2a279d7a8f60ee0df7fa840882976ba64/jiter-0.15.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0be6f5ad41a809f303f416d17cec92a7a725902fb9b4f3de3d19362ac0ef8554", size = 517695, upload-time = "2026-05-19T10:08:46.486Z" }, + { url = "https://files.pythonhosted.org/packages/64/5e/6d1bda880723aae0ad86b4b763f044362448efe31e3e819635d41cb03451/jiter-0.15.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:813dfbb17d65328bf86e5f0905dd277ba2265d3ca20556e86c0c7035b7182e5a", size = 548868, upload-time = "2026-05-19T10:08:48.026Z" }, + { url = "https://files.pythonhosted.org/packages/0c/72/7de501cf38dcacaf35098796f3a50e0f2e338baba18a58946c618544b809/jiter-0.15.0-cp314-cp314-win32.whl", hash = "sha256:50e51156192722a9c58db112837d3f8ef96fb3c5ecc14e95f409134b08b158ec", size = 206380, upload-time = "2026-05-19T10:08:49.738Z" }, + { url = "https://files.pythonhosted.org/packages/1e/a9/e19addf4b0c1bdce52c6da12351e6bc42c340c45e7c09e2158e46d293ccc/jiter-0.15.0-cp314-cp314-win_amd64.whl", hash = "sha256:30ce1a5d16b5641dc935d50ef775af6a0871e3d14ab05d6fc54dff371b78e558", size = 197687, upload-time = "2026-05-19T10:08:51.088Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/c9/776b1db01db25fc6c1d58d1979a37b0a9fe787e5f5b1d062d2eaacb77923/jiter-0.15.0-cp314-cp314-win_arm64.whl", hash = "sha256:510c8b3c17a0ed9ac69850c0438dada3c9b82d9c4d589fcb62002a5a9cf3a866", size = 192571, upload-time = "2026-05-19T10:08:52.451Z" }, + { url = "https://files.pythonhosted.org/packages/a0/f6/45bb4670bacf300fd2c7abadbfb3af376e5f1b6ae75fd9bc069891d15870/jiter-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7553333dd0930c104a5a0db8df72bf7219fe663d731383b576bb6ed6351c984d", size = 317151, upload-time = "2026-05-19T10:08:53.867Z" }, + { url = "https://files.pythonhosted.org/packages/d7/68/ed635ad5acd7b73e454283083bbb7c8205ad10e88b0d9d7d793b09fe8226/jiter-0.15.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2143ab06181d2b029eedcb6af3cebe95f11bbac62441781860f98ee9330a6a6", size = 341243, upload-time = "2026-05-19T10:08:55.383Z" }, + { url = "https://files.pythonhosted.org/packages/5d/db/3ff4176b817b8ea33879e71e13d8bc2b0d481a7ed3fe9e080f333d415c16/jiter-0.15.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eac374c5c975709b69c10f09afd199df74150172156ad10c8d4fd785b7da995", size = 363629, upload-time = "2026-05-19T10:08:56.928Z" }, + { url = "https://files.pythonhosted.org/packages/ab/24/5f8270e0ba9c883582f96f722f8a0b58015c7ce1f8c6d4571cf394e99b6b/jiter-0.15.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b3b3b775e33d3bfaec9899edc526ae97b0da0bf9d071a46124ba419149a414f8", size = 456198, upload-time = "2026-05-19T10:08:58.618Z" }, + { url = "https://files.pythonhosted.org/packages/45/5b/76fc02b0b5c54c3d18c60653156e2f76fde1816f9b4722db68d6ee2c897e/jiter-0.15.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3071db3346334beae1360b46da4606da57bf3528c167b3c38533afaf9f2c5", size = 373710, upload-time = "2026-05-19T10:09:00.151Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/52/4310821b0ea9277994d3e1f49fc6a4b34e4800caebacb2c0af81da59a454/jiter-0.15.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6694a173ecabc12eb60efbc0b474464ead1951ff65cd8b1e72100715c64512b", size = 349901, upload-time = "2026-05-19T10:09:01.621Z" }, + { url = "https://files.pythonhosted.org/packages/93/fe/67648c35b3594fba8854ac64cc8a826d8bcd18324bbdb53d77697c60b6ef/jiter-0.15.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:a254e10b593624d230c365b6d616b22ca0ad65e63a16e6631c2b3466022e6ba8", size = 352438, upload-time = "2026-05-19T10:09:03.216Z" }, + { url = "https://files.pythonhosted.org/packages/cb/28/0a1879d07ad6b3e025a2750027363452ced93c2d16d1c9d4b153ffd51c91/jiter-0.15.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d8d2955167274e15d79a7a020afdd9b39c990eb80b2d89fca695d92dcfdd38ec", size = 388152, upload-time = "2026-05-19T10:09:04.741Z" }, + { url = "https://files.pythonhosted.org/packages/c1/78/46c6f6b56ba85c90021f4afd72ed42f691f8f84daacb5fe27277070e3858/jiter-0.15.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:acf4ee4d1fc55917239fe72972fb292dd773055d05eb040d36f4326e02cc2c0e", size = 517707, upload-time = "2026-05-19T10:09:06.231Z" }, + { url = "https://files.pythonhosted.org/packages/ca/cb/720662d4c88fcad606e826fef5424365527ba43ce4868a479aed8f8c507e/jiter-0.15.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:e7196e56f1cd69af1dbb07dff02dcfb260a50b45a82d409d92a06fedb32473b5", size = 548241, upload-time = "2026-05-19T10:09:08.093Z" }, + { url = "https://files.pythonhosted.org/packages/60/e3/935b8034fd143f21125c87d51404a9e0e1449186a494405721ff5d1d695e/jiter-0.15.0-cp314-cp314t-win32.whl", hash = "sha256:7f6163c0f10b055245f814dcc59f4818da60dfe72f3e72ab89fc24b6bd5e9c52", size = 207950, upload-time = "2026-05-19T10:09:09.616Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/59/984fd9ece895953dad3e0880a650e766f5a2da2c5514f0eafdaaabbeb5f9/jiter-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:980c256edb05b78a111b99c4de3b1d32e31634b867fd1fc2cf726e7b7bba9854", size = 200055, upload-time = "2026-05-19T10:09:11.367Z" }, + { url = "https://files.pythonhosted.org/packages/0e/a4/cf8d779feb133a27a2e3bc833bccb9e13aa332cdf820497ebf72c10ce8c3/jiter-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:66b1880df2d01e206e8339769d1c7c1753bcb653efd6289e203f6f24ebada0c0", size = 191244, upload-time = "2026-05-19T10:09:12.74Z" }, +] + [[package]] name = "librt" version = "0.9.0" @@ -625,6 +673,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, ] +[[package]] +name = "openai" +version = "2.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8f/12/cfa322c5f5dd8fa21aab9a7a8e979e7a11123800f86ca8d82eb68a83d213/openai-2.38.0.tar.gz", hash = "sha256:798694c6cf74145541fda94325b6f8f72d8e1fd0262cc137c8d728177a6a4ce3", size = 772764, upload-time = "2026-05-21T21:23:42.105Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/bf/ccff9be562e24207716d04ef9dc931c76aff0c89a7265da43e2104d7fe06/openai-2.38.0-py3-none-any.whl", hash = "sha256:ec6661c57b2dcc47414a767e6e3335c7ed3d19c9696999283a3c82e95c756a3c", size = 1344910, upload-time = "2026-05-21T21:23:39.636Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.41.1" @@ -1102,6 +1169,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c0/98/6beb4b351e472e5f4c4613f7c35a5290b8be2497e183825310c4c3a3984b/ruff-0.15.12-py3-none-win_arm64.whl", hash = "sha256:a538f7a82d061cee7be55542aca1d86d1393d55d81d4fcc314370f4340930d4f", size = 11120821, upload-time = "2026-04-24T18:16:57.979Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "starlette" version = "1.1.0" @@ -1132,6 +1208,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" }, ] +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = 
"sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" From eb0136e350713bc8a711f92b0a8a42a50fec15fd Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:27:47 +1000 Subject: [PATCH 4/8] chore: add optional Beads issue queue guidance (#86) (#106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: add optional Beads issue queue guidance * chore: address PR-86 review feedback (BEADS doc + template + CI-script compile gate) Applies the actionable items from the PR-86 review: - docs/BEADS.md: lead with a one-sentence "what Beads is" + upstream link; state the stance explicitly (optional/additive, recommended for agent-driven flows, GitHub remains authoritative); add a YAML example block under Recommended Bead fields; replace the duplicated Closure checklist with a Bead-specific narrowing that cites the PR template + CONTRIBUTING; call out that .beads/ is wiped by git clean -fdx. - .github/pull_request_template.md: collapse the "Local Beads" section into an HTML-commented opt-in block so it is invisible in the rendered preview until a Beads-using team uncomments it. - CONTRIBUTING.md: document the one-shot git renormalisation step for Windows clones after the .gitattributes change lands. - tests/test_scripts_compile.py: regression gate that py_compiles every .github/scripts/*.py. The "scripts unparseable" review finding was based on an older local Python — PEP 758 (3.14) makes the unparenthesised except clauses valid, so the scripts ARE fine on the project pin. The test guards against an actual syntax error landing in future. 
* chore: bump version to 0.2.11 --------- Co-authored-by: jakelindsay87 --- .gitattributes | 11 ++ .github/pull_request_template.md | 9 ++ .github/scripts/check_aspirational_tickets.py | 3 +- .github/scripts/check_pin_freshness.py | 3 +- .github/scripts/check_tests_present.py | 4 +- .github/scripts/check_version_bump.py | 3 +- .gitignore | 4 + CONTRIBUTING.md | 22 ++- README.md | 4 +- docs/BEADS.md | 149 ++++++++++++++++++ docs/DEVELOPMENT.md | 1 + docs/HARNESS.md | 8 +- docs/HARNESS_PRIMER.md | 3 + pyproject.toml | 2 +- tests/test_scripts_compile.py | 28 ++++ uv.lock | 2 +- 16 files changed, 243 insertions(+), 13 deletions(-) create mode 100644 .gitattributes create mode 100644 docs/BEADS.md create mode 100644 tests/test_scripts_compile.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7f90681 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +# The pre-commit hook stack enforces LF line endings. Keep checkout behavior +# aligned across Windows, macOS, and Linux so `pre-commit run --all-files` does +# not rewrite the working tree on Windows clones with global autocrlf enabled. +* text=auto eol=lf + +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.ico binary +*.pdf binary diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 7693700..a180c91 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -40,6 +40,15 @@ + + + ## Linked issue Closes # diff --git a/.github/scripts/check_aspirational_tickets.py b/.github/scripts/check_aspirational_tickets.py index ded7013..42f2a05 100644 --- a/.github/scripts/check_aspirational_tickets.py +++ b/.github/scripts/check_aspirational_tickets.py @@ -57,6 +57,7 @@ from pathlib import Path INVARIANTS_DOC = Path("docs/INVARIANTS.md") +GITHUB_API_ERRORS = (urllib.error.URLError, TimeoutError, json.JSONDecodeError) # A marker line *starts* with one or two asterisks immediately followed by # `Aspirational` and a word boundary. 
Avoids picking up mid-sentence prose @@ -88,7 +89,7 @@ def _issue_state(repo: str, number: str, token: str) -> str | None: try: with urllib.request.urlopen(req, timeout=5) as response: # noqa: S310 payload = json.loads(response.read().decode("utf-8")) - except urllib.error.URLError, TimeoutError, json.JSONDecodeError: + except GITHUB_API_ERRORS: return None state = payload.get("state") return state if isinstance(state, str) else None diff --git a/.github/scripts/check_pin_freshness.py b/.github/scripts/check_pin_freshness.py index 85ac943..27a1a0c 100644 --- a/.github/scripts/check_pin_freshness.py +++ b/.github/scripts/check_pin_freshness.py @@ -84,6 +84,7 @@ def _load_pin_module() -> ModuleType: _pins = _load_pin_module() _API_BASE = "https://api.github.com" +GITHUB_API_ERRORS = (urllib.error.URLError, TimeoutError, json.JSONDecodeError) def _fetch_json(url: str, token: str) -> dict[str, object] | None: @@ -104,7 +105,7 @@ def _fetch_json(url: str, token: str) -> dict[str, object] | None: try: with urllib.request.urlopen(req, timeout=10) as response: # noqa: S310 payload = json.loads(response.read().decode("utf-8")) - except urllib.error.URLError, TimeoutError, json.JSONDecodeError: + except GITHUB_API_ERRORS: return None return payload if isinstance(payload, dict) else None diff --git a/.github/scripts/check_tests_present.py b/.github/scripts/check_tests_present.py index 3c689b1..92069ea 100644 --- a/.github/scripts/check_tests_present.py +++ b/.github/scripts/check_tests_present.py @@ -43,6 +43,8 @@ import sys from pathlib import Path +EVENT_READ_ERRORS = (OSError, json.JSONDecodeError) + # Prefixes that declare a behaviour change → tests required. 
BLOCKING_PREFIXES: frozenset[str] = frozenset({"feat", "fix"}) @@ -59,7 +61,7 @@ def pr_title_from_event() -> str | None: return None try: data = json.loads(Path(event_path).read_text(encoding="utf-8")) - except OSError, json.JSONDecodeError: + except EVENT_READ_ERRORS: return None pr = data.get("pull_request") if not isinstance(pr, dict): diff --git a/.github/scripts/check_version_bump.py b/.github/scripts/check_version_bump.py index 4510c0f..cdf7959 100644 --- a/.github/scripts/check_version_bump.py +++ b/.github/scripts/check_version_bump.py @@ -39,6 +39,7 @@ PYPROJECT = Path("pyproject.toml") UV_LOCK = Path("uv.lock") PACKAGE_NAME = "harness-python-react" +EVENT_READ_ERRORS = (OSError, json.JSONDecodeError) # Match the project's self-version block in uv.lock: # @@ -105,7 +106,7 @@ def pr_title_from_event() -> str | None: return None try: data = json.loads(Path(event_path).read_text(encoding="utf-8")) - except OSError, json.JSONDecodeError: + except EVENT_READ_ERRORS: return None pr = data.get("pull_request") if not isinstance(pr, dict): diff --git a/.gitignore b/.gitignore index 1c8bc4a..a3f61fd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ .claude/bash-log.txt .claude/worktrees/ +# Optional local Beads queue state +.beads/ +beads/ + # Node / Frontend node_modules/ frontend/dist/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ba47e27..3565676 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,14 +34,15 @@ The subject is **lowercase** after the colon. Title Case prose (`Add the thing`) 1. Open the issue first. Use a feature/bug template; fill every section. 2. Branch off `develop` with the matching name. -3. Land one logical change per PR. Stack PRs if the work is naturally split. -4. The PR template asks five things — answer each (`None` is valid where applicable): +3. If your team uses Beads, mirror or claim the linked issue in the local Beads queue after the issue exists. 
Beads track local ready/blocked execution only; GitHub Issues remain canonical for scope, discussion, PR linkage, and closure. +4. Land one logical change per PR. Stack PRs if the work is naturally split. +5. The PR template asks five things — answer each (`None` is valid where applicable): - **What & why** (1–3 lines) - **Test plan** (checkbox list; CI covers most of it) - **Invariants affected** — cite numbered rules from `docs/INVARIANTS.md` - **New deps / actions / external surface** (anchor for supply-chain review) - **Screenshots** (UI changes only) -5. Wait for green CI + a code-owner review before merging. +6. Wait for green CI + a code-owner review before merging. ### Solo-owner merge policy @@ -55,6 +56,21 @@ gh pr merge --admin --squash --delete-branch When a second collaborator joins, drop the `--admin` flag and adopt standard PR review. Update this section + `CODEOWNERS` in the same PR. +## Line endings (Windows clones) + +This repo enforces LF line endings via `.gitattributes` (`* text=auto eol=lf`) +and the pre-commit hygiene hook. If you cloned on Windows with +`core.autocrlf=true`, the first checkout after pulling the `.gitattributes` +change can leave the working tree out of sync with the index. Renormalise +once: + +```sh +git add --renormalize . +git commit -m "chore: renormalise line endings" +``` + +After that, day-to-day work is unaffected. + ## Local pre-push gate ```sh diff --git a/README.md b/README.md index 2c4d4da..b972f7a 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,10 @@ - **Backend:** Python 3.14, FastAPI, Pydantic v2 (`StrictModel` base), `uv` deps, OpenTelemetry SDK + OTLP exporter, structured JSON logs, generic tool-registry pattern. - **Frontend:** Node 24 LTS, React 19.2, Vite 8, TypeScript strict, ESLint 10 flat config, Prettier, Vitest + jsdom + Testing Library. 
- **Eval harness:** provider-agnostic runner + LLM-judge `Protocol`, three tolerance modes (exact / numeric / semantic), one example golden case, nightly workflow (disabled by default). -- **CI:** 15 required status checks across `ci.yml` (lint/format, mypy strict, unit tests, coverage ≥75%, import-linter architecture, pre-commit, frontend build, frontend quality, branch-protection sync, commit-type sync) + `security.yml` (gitleaks, pip-audit, npm audit, trivy) + PR-title lint. +- **CI:** 21 required status checks across `ci.yml` (lint/format, mypy strict, unit tests, coverage, import-linter architecture, pre-commit, frontend build, frontend quality, branch-protection sync, commit-type sync, version/action/tests/docs audits) + `security.yml` (gitleaks, pip-audit, npm audit, trivy) + PR-title lint. - **Release:** tag-triggered workflow that builds the image, pushes to `ghcr.io`, generates a CycloneDX SBOM, and publishes the GitHub Release. - **Agent integration:** `.claude/hooks/` (forbidden-flag blocker, secret scan, formatter dispatch, SessionStart context) + six auto-activating skills (architect / code-reviewer / devops / frontend / qa-engineer / technical-writer). +- **Issue execution:** GitHub Issues remain the external source of truth; optional Beads guidance adds a local dependency-aware execution queue without changing issue closure authority. - **Docker:** multi-stage Dockerfile (non-root, healthcheck), `docker compose up` boots app + frontend + Jaeger. ## Quickstart @@ -114,6 +115,7 @@ See [`docs/HARNESS.md`](docs/HARNESS.md) for the full umbrella. 
Highlights: | [`docs/BOUNDARIES.md`](docs/BOUNDARIES.md) | Module layering + the import-linter contracts | | [`docs/DEVELOPMENT.md`](docs/DEVELOPMENT.md) | Local setup, branching, justfile, CI | | [`docs/EVAL_HARNESS.md`](docs/EVAL_HARNESS.md) | Eval flywheel + opt-in for the nightly workflow | +| [`docs/BEADS.md`](docs/BEADS.md) | Optional local Beads queue layered under GitHub Issues | | [`docs/SECURITY.md`](docs/SECURITY.md) | Threat model + defence-in-depth map | | [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) | Scaffold-level component view | | [`CONTRIBUTING.md`](CONTRIBUTING.md) | Branching, commit format, PR flow | diff --git a/docs/BEADS.md b/docs/BEADS.md new file mode 100644 index 0000000..d3271cc --- /dev/null +++ b/docs/BEADS.md @@ -0,0 +1,149 @@ +# Optional Beads execution queue + +[Beads](https://github.com/steveyegge/beads) is an open-source +dependency-aware issue tracker designed for AI coding agents — it gives an +agent a local ready/blocked view of work, a dependency graph, and restart-safe +task claims that GitHub Issues alone do not. + +This document is **optional and additive**. The base harness does not assume +Beads; if your team has no agent or multi-actor execution concern, GitHub +Issues plus the PR template is sufficient and you can skip this doc entirely. +Beads is recommended specifically when you are coordinating an LLM agent (or +several) against this repo and want dependency planning the public issue +tracker does not provide. The README and `docs/HARNESS.md` references describe +Beads as optional infrastructure, not part of the standard contributor flow. + +Wherever Beads is used, GitHub Issues remain the external source of truth and +the authority for issue closure. 
+ +## Review of existing GitHub issue guidance + +The current harness already treats GitHub as the public planning and merge +record: + +- `.github/ISSUE_TEMPLATE/bug.md`, `feature.md`, and `eval-regression.md` + define the supported intake paths, and blank issues are disabled in + `.github/ISSUE_TEMPLATE/config.yml`. +- `CONTRIBUTING.md` requires one issue per branch, short-lived branches named + `/-`, and green CI plus review before merge. +- `.github/pull_request_template.md` requires What & why, Test plan, + Invariants affected, supply-chain surface, Screenshots when relevant, and a + linked issue. +- `CLAUDE.md` and `docs/DEVELOPMENT.md` describe the same one-issue, + one-branch, `develop` to `main` release flow for agent and human operators. +- `docs/TASKS.md` is a project-local planning map cross-referenced with GitHub + issues and the project board. + +There is no Beads-specific policy in the base harness today. Any Beads addition +must therefore be additive and must not make GitHub issue state ambiguous. + +## GitHub Issues vs Beads + +| System | Owns | Does not own | +|---|---|---| +| GitHub Issues | Public backlog, user-facing requirements, labels, project board state, discussion, acceptance criteria, links from PRs, and final issue closure. | Local agent claims, transient execution notes, or dependency scheduling that would be noisy in the public issue. | +| Beads | Local execution queue, ready/blocked views, dependency graph, implementation notes, reviewer handoff notes, and restart-safe task claims. | The canonical requirement, public status, release notes, or authority to close a GitHub issue. | + +The rule is simple: **GitHub answers what work exists and whether it is +externally done; Beads answers what the local execution system should pick up +next.** + +## Sync contract + +When using Beads with this harness: + +1. Create or confirm the GitHub issue first. +2. 
Mirror the issue into Beads with an immutable external reference: + - GitHub repository owner/name. + - GitHub issue number. + - GitHub issue URL. + - Original issue title. +3. Use Beads for local status only: `ready`, `in_progress`, `blocked`, + `review`, or `done` are execution states, not replacements for the GitHub + issue state. +4. Put the Bead id in local notes, branch notes, or PR body when useful, but + keep `Closes #` pointing at the GitHub issue. +5. Do not close a GitHub issue because a Bead is marked done. Close only after + the PR is merged, required checks are green, any required manual or browser + validation is recorded, and a human-readable note has been added to the + issue or PR. + +If the GitHub issue changes after import, update the Bead from GitHub before +continuing. GitHub wins on scope, acceptance criteria, and user-visible status. + +## Recommended Bead fields + +A Bead should carry enough information for a new agent or contributor to resume +without reopening every browser tab: + +| Field | Purpose | +|---|---| +| `external_ref` | GitHub issue URL, for example `https://github.com/owner/repo/issues/123`. | +| `github_issue` | Numeric issue id used by branches and PRs. | +| `acceptance` | The current acceptance criteria copied or summarized from GitHub. | +| `dependencies` | Other Beads or GitHub issues that must land first. | +| `status` | Local execution state. | +| `owner` | Optional local agent or human claim. | +| `evidence` | Paths or URLs for test output, review notes, screenshots, or deploy checks. | +| `closeout` | Merge SHA, PR URL, and verification notes once complete. | + +A short YAML example: + +```yaml +external_ref: https://github.com/owner/repo/issues/123 +github_issue: 123 +acceptance: | + /api/v1/echo rejects payloads >1KiB with HTTP 413. 
+dependencies: [122] # other Bead ids or GitHub issues +status: ready +owner: agent-a +evidence: + - tests/test_api.py::test_echo_size_cap +closeout: null +``` + +Avoid storing secrets, tokens, credentials, private customer data, or raw +production payloads in Beads. Treat Beads data as local operational metadata. +Note that `.beads/` is gitignored, so anything Beads stores locally — including +agent-action audit logs — is wiped by `git clean -fdx`; commit deliberate +summaries to the repo if you need them to survive workspace resets. + +## PR discipline when Beads are used + +The existing PR template still applies. Add Beads information without deleting +any required section: + +- `Linked issue` remains `Closes #`. +- Mention the Bead id or local queue reference under `What & why` or the + optional Beads section. +- Include Beads-derived evidence paths in `Test plan` only when they are useful + to a reviewer. +- If the Bead changed scope, update the GitHub issue before asking for review. +- If the Bead was blocked by an external dependency, note that in the PR or + issue rather than hiding it in the local queue. + +## Local artifact hygiene + +Beads state is usually local execution metadata. Do not commit raw Beads +databases, scratch exports, or agent logs by default. Commit only intentional +summaries or docs that reviewers need. + +If a downstream project decides to version Beads state, document that policy in +that project and make sure secret scanning, review, and retention expectations +are explicit. + +## Closure checklist + +The PR-merge and issue-closure gates already live in +`.github/pull_request_template.md` and `CONTRIBUTING.md` — don't duplicate them +here. The Bead-specific closure rule is narrower: + +- Do not mark a Bead done until the GitHub issue's closure conditions (per the + PR template and `CONTRIBUTING.md`) are met. Beads track the local execution + state of work GitHub already authorised; they don't grant new closure + authority. 
+- If the Bead and the GitHub issue disagree on scope, acceptance, or status, + stop and reconcile against GitHub before continuing. + +Beads improve local throughput only if they reduce ambiguity. If a Bead and a +GitHub issue disagree, the GitHub issue wins. diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index db7bb90..a5827cd 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -73,6 +73,7 @@ Every recipe uses `uv run --frozen` — bare `uv run` silently re-resolves when - `main` is protected: every required CI context must pass + 1 review + commit-type sync + branch-protection sync. - `develop` is the integration branch; same gates as `main` minus a strictness flag (`strict: false` so PRs don't need rebases). - Feature branches are short-lived and named `/-`. +- Optional Beads queues can mirror GitHub issues for local execution, but GitHub remains the source of truth for requirements, PR linkage, and closure. See `docs/BEADS.md`. ## Commit messages diff --git a/docs/HARNESS.md b/docs/HARNESS.md index e24007c..6a7a7e8 100644 --- a/docs/HARNESS.md +++ b/docs/HARNESS.md @@ -13,7 +13,7 @@ The "harness" is the set of mechanical controls that make LLM-driven coding prod | **Tests** | Behaviour | `pytest tests/`, `pytest eval/`, `vitest` | | **Coverage** | ≥ 75% on `src/` | `pyproject.toml` `[tool.coverage.report]` | | **Pre-commit** | Local-first defence | `.pre-commit-config.yaml` (ruff, gitleaks, commitizen, mypy, hygiene) | -| **CI** | Non-bypassable | `.github/workflows/ci.yml` (15 contexts) + `security.yml` + `pr-title.yml` + `release.yml` + `release-drafter.yml` | +| **CI** | Non-bypassable | `.github/workflows/ci.yml` + `security.yml` + `pr-title.yml` (21 required contexts) plus release and maintenance workflows | | **Branch protection** | Declarative, drift-checked | `.github/branch-protection/{develop,main}.json` + `branch-protection.yml` apply workflow + `check_required_contexts.py` meta-gate | | **Commit format** | Seven prefixes only | 
`[tool.commitizen]` schema + `pr-title.yml` allowlist + `check_commit_types.py` meta-gate | | **Secret scan** | Three checkpoints | local hook → pre-commit → `security.yml` gitleaks | @@ -21,6 +21,7 @@ The "harness" is the set of mechanical controls that make LLM-driven coding prod | **Dep scan** | Pinned + audited | pip-audit, npm audit | | **Release** | Reproducible artefacts | `release.yml` (image push to GHCR + CycloneDX SBOM) | | **Eval** | LLM-output regressions | `src/eval/`, `eval/`, `eval-nightly.yml` (workflow_dispatch by default) | +| **Issue execution** | GitHub stays canonical; Beads can drive local ready/blocked work | GitHub issue templates + PR template + optional `docs/BEADS.md` queue guidance | | **Agent hooks** | LLM coder side enforcement | `.claude/hooks/{pretooluse_bash, posttooluse_writeedit, sessionstart}.py` + `settings.local.json.example` | | **Skills** | Auto-activated agent guidance | `.claude/skills/{architect, code-reviewer, devops, frontend, qa-engineer, technical-writer}` | @@ -40,5 +41,6 @@ For an engineer setting up the template: 2. **`docs/BOUNDARIES.md`** — module layering and the import-linter contracts. 3. **`docs/DEVELOPMENT.md`** — local setup, the `justfile`, the CI pipeline. 4. **`docs/EVAL_HARNESS.md`** — the eval flywheel; how to add a case, how to opt the nightly into running. -5. **`docs/SECURITY.md`** — threat model + the defence-in-depth map. -6. **`docs/ARCHITECTURE.md`** — scaffold-level diagram; expand as your domain lands. +5. **`docs/BEADS.md`** — optional local execution queue layered under GitHub Issues. +6. **`docs/SECURITY.md`** — threat model + the defence-in-depth map. +7. **`docs/ARCHITECTURE.md`** — scaffold-level diagram; expand as your domain lands. 
diff --git a/docs/HARNESS_PRIMER.md b/docs/HARNESS_PRIMER.md index 7d9606d..737ac4c 100644 --- a/docs/HARNESS_PRIMER.md +++ b/docs/HARNESS_PRIMER.md @@ -269,6 +269,7 @@ Distinct from the **build harness** (everything above), the **evaluation harness |---|---| | PR template | [.github/pull_request_template.md](../.github/pull_request_template.md). | | Issue templates | [.github/ISSUE_TEMPLATE/](../.github/ISSUE_TEMPLATE/): `bug.md`, `feature.md`, `eval-regression.md`. Blank issues disabled. | +| Optional Beads queue | [docs/BEADS.md](BEADS.md): GitHub Issues remain canonical while Beads can track local ready/blocked execution. | | Code ownership | [.github/CODEOWNERS](../.github/CODEOWNERS). | | Branch protection | [.github/branch-protection/{main,develop}.json](../.github/branch-protection/) declarative configs, re-applied weekly by [branch-protection.yml](../.github/workflows/branch-protection.yml). | | Commit message shape | Commitizen, configured in `pyproject.toml`. | @@ -359,6 +360,7 @@ The error names the offending module, line, and contract — no guessing. | **OpenTelemetry (OTel)** | Vendor-neutral standard for traces, metrics, logs. The repo follows `gen_ai.*` and `db.*` semantic conventions for attribute names. | | **CycloneDX** | An SBOM format. Generated per release and attached to the GitHub Release. | | **gitleaks** | Pattern-based secret scanner. | +| **Beads** | Optional local issue queue used for dependency-aware execution and handoffs; GitHub Issues remain canonical. | --- @@ -372,4 +374,5 @@ The error names the offending module, line, and contract — no guessing. | [ARCHITECTURE.md](ARCHITECTURE.md) | The system design — components, request flow. | | [SECURITY.md](SECURITY.md) | Threat model + defence-in-depth mapping. | | [EVAL_HARNESS.md](EVAL_HARNESS.md) | The eval flywheel. | +| [BEADS.md](BEADS.md) | Optional local Beads queue layered under GitHub Issues. | | [DEVELOPMENT.md](DEVELOPMENT.md) | Local setup, branching, releases. 
| diff --git a/pyproject.toml b/pyproject.toml index 71c6d76..701f7a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.12" +version = "0.2.13" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/tests/test_scripts_compile.py b/tests/test_scripts_compile.py new file mode 100644 index 0000000..214abbb --- /dev/null +++ b/tests/test_scripts_compile.py @@ -0,0 +1,28 @@ +"""Every script in `.github/scripts/` must parse on the project's pinned Python. + +Catches a class of regression where a script lands with a syntax error that +the corresponding CI gate happens to not exercise. Cheap and broad: one test, +one `py_compile.compile` per script. +""" + +from __future__ import annotations + +import py_compile +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent +SCRIPTS_DIR = REPO_ROOT / ".github" / "scripts" + + +def _ci_scripts() -> list[Path]: + return sorted(p for p in SCRIPTS_DIR.glob("*.py") if p.is_file()) + + +@pytest.mark.parametrize("script", _ci_scripts(), ids=lambda p: p.name) +def test_ci_script_compiles(script: Path) -> None: + try: + py_compile.compile(str(script), doraise=True) + except py_compile.PyCompileError as exc: # pragma: no cover — failure path + pytest.fail(f"{script.name} failed to compile: {exc.msg}") diff --git a/uv.lock b/uv.lock index 1b94326..1d71286 100644 --- a/uv.lock +++ b/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.12" +version = "0.2.13" source = { virtual = "." 
} dependencies = [ { name = "fastapi" }, From 722293d63046dac3a912c4d8801f7f19b69f3f70 Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:31:17 +1000 Subject: [PATCH 5/8] docs: mark admin-merge policy as transitional solo-owner state (#101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: mark admin-merge policy as transitional solo-owner state (#93) The existing "Solo-owner merge policy" section accurately documented how merges work today, but read as standing policy. From an external contributor's perspective it could look like the maintainer routinely bypasses their own gates. Adds a leading "Transitional" blockquote framing this as a single-owner workaround, not standing policy, and replaces the closing sentence with a numbered exit checklist (drop --admin, remove the subsection, update CODEOWNERS, optionally flip enforce_admins to true). All four changes land together when a second collaborator is onboarded. Mechanics of the merge command itself are unchanged. Closes #93 * chore: bump version to 0.2.11 * docs: make enforce_admins flip required in exit checklist (#93 review) Code review on #101 pushed back on step 4 of the "When the exemption ends" checklist: "Optionally flip enforce_admins to true". Leaving it false in a 2-person setup keeps the admin-bypass door open even after the single-owner workaround is no longer needed — which defeats the point of having an exit checklist. Drops "Optionally" and adds a one-line rationale so a future reader understands why the flip is non-optional. Refs #93 --- CONTRIBUTING.md | 15 ++++++++++++--- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3565676..a0c4b23 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,15 +46,24 @@ The subject is **lowercase** after the colon. 
Title Case prose (`Add the thing`) ### Solo-owner merge policy -This repo runs with a single code owner (`* @constk` in `CODEOWNERS`). GitHub forbids a PR author from approving their own PR, so the standard "1 code-owner review" gate cannot be satisfied without an admin override. While in this state, the **intended workflow is**: +> **Transitional — only while this repo has a single code owner.** Standard practice is a code-owner review on every PR. The flow below exists because GitHub forbids self-approval, so a single-owner repo cannot satisfy the "1 code-owner review" gate any other way. The exemption is **removed** the moment a second collaborator with merge rights joins. + +This repo currently runs with a single code owner (`* @constk` in `CODEOWNERS`). While in this state, the intended merge command is: ```sh gh pr merge --admin --squash --delete-branch ``` -…for `feat:` / `fix:` / `chore:` PRs, and `--admin --merge` (preserves history) for `release:` PRs. The `enforce_admins: false` line in `.github/branch-protection/{develop,main}.json` is the documented escape hatch — admin merge here is the policy, not a deviation from it. +…for `feat:` / `fix:` / `chore:` PRs, and `--admin --merge` (preserves history) for `release:` PRs. The `enforce_admins: false` line in `.github/branch-protection/{develop,main}.json` is the documented escape hatch — admin merge here is the documented single-owner workaround, not bypass of the gates (every required status check still has to pass). + +**When the exemption ends.** As soon as a second collaborator with merge rights is onboarded: + +1. Drop the `--admin` flag from the merge command and adopt standard PR review. +2. Remove this entire subsection. +3. Update `CODEOWNERS` to add the new collaborator. +4. Flip `enforce_admins` to `true` in the branch-protection JSON for both branches. 
Leaving it `false` would keep the admin-bypass door open even after the single-owner workaround is no longer needed — defeats the point of removing the workaround. -When a second collaborator joins, drop the `--admin` flag and adopt standard PR review. Update this section + `CODEOWNERS` in the same PR. +All four changes land in a single PR. ## Line endings (Windows clones) diff --git a/pyproject.toml b/pyproject.toml index 701f7a5..c7affac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.13" +version = "0.2.14" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/uv.lock b/uv.lock index 1d71286..851edd4 100644 --- a/uv.lock +++ b/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.13" +version = "0.2.14" source = { virtual = "." } dependencies = [ { name = "fastapi" }, From 59ad7f07d42b21714ee2e558a5400e605fadd108 Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:33:50 +1000 Subject: [PATCH 6/8] docs: reframe README opener around the human+agent audience (#99) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: reframe README opener around the human+agent audience (#90) The previous opener led with what the harness is (a coding harness for Python+React) and folded the audience into a trailing clause. The new opener leads with who it's for — teams pairing AI agents with human engineers — and keeps the mechanism punchline ("every gate enforced mechanically in CI, not by discipline") that makes the harness story distinctive. Wording matches the repo's GitHub description for consistency between the two surfaces. 
Closes #90 * docs: tighten README opener — harness vocab + 0.2.11 bump (#90) Review feedback on #99: - "Production-grade SDLC scaffold" -> "Production-grade SDLC harness". Everywhere else (package name, docs/HARNESS.md, CLAUDE.md) calls it a harness; "scaffold" was an unintentional vocabulary drift. - "regardless of who's at the keyboard" -> "regardless of who shipped the code". Agents don't have keyboards; the original metaphor leaked. The new phrasing covers humans and agents without forcing the human-only mental model. - README opener now also mirrors the GitHub repo description verbatim ("human-LLM coding collaborations"), so the two surfaces stay aligned. Also bumps the project version 0.2.10 -> 0.2.11 (docs change -> PATCH per docs/DEVELOPMENT.md) in pyproject.toml and the self-version line in uv.lock, unblocking the "Version bump check" CI gate that flagged the original commit. The "enforced mechanically in CI, not by discipline" punchline is preserved verbatim. Refs #90 --- README.md | 2 +- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b972f7a..da50e18 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![React 19.2](https://img.shields.io/badge/react-19.2-61dafb.svg)](https://react.dev/) [![Coverage 98%](https://img.shields.io/badge/coverage-98%25-brightgreen.svg)](docs/HARNESS.md) -> A production-quality coding harness for Python (FastAPI) + Vite/React/TypeScript projects. Designed for LLM-driven development: every gate — lint, types, architecture, security, eval — is enforced mechanically so code quality stays consistent across many human and AI contributors. +> Production-grade SDLC harness for human–LLM coding collaborations — keeping quality consistent regardless of who shipped the code. Python (FastAPI) + Vite/React/TypeScript, with every gate (lint, types, architecture, security, eval) enforced mechanically in CI, not by discipline. 
## What ships diff --git a/pyproject.toml b/pyproject.toml index c7affac..7961745 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.14" +version = "0.2.15" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/uv.lock b/uv.lock index 851edd4..9e4e858 100644 --- a/uv.lock +++ b/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.14" +version = "0.2.15" source = { virtual = "." } dependencies = [ { name = "fastapi" }, From 7c84f1810d57c4924ceb8dd77f497f28781963c2 Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:36:26 +1000 Subject: [PATCH 7/8] docs: add concrete agent-failure example to README (#100) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add concrete agent-failure example to "Why a harness" (#91) The "harness IS the product" claim reads abstract without a worked example. Adds a blockquoted, 3-line sidebar inside the "Why a harness" section showing one realistic failure mode: an agent reaches for a reverse import (src.models → src.tools), import-linter blocks it in CI against the "src.models depends on nothing in src/" contract, the agent's next iteration routes around it via docs/BOUNDARIES.md. Names a real gate, cites the real contract, links the real doc — so the example is verifiable, not theatre. Closes #91 * chore: bump version to 0.2.11 --- README.md | 2 ++ pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index da50e18..a69899a 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ analogous to "Hello page" above. 
The differentiator isn't the scaffold — it's that every layer of the pipeline catches a different failure class **without relying on the human or LLM coder remembering to run anything**. The same posture protects code regardless of who wrote it. +> **Example.** An agent added `from src.tools import ...` inside `src.models` for type reuse. `lint-imports` failed CI — the `src.models depends on nothing in src/` contract broke — and pointed the next iteration at [`docs/BOUNDARIES.md`](docs/BOUNDARIES.md). The type moved into `src.models` instead. Never shipped. + See [`docs/HARNESS.md`](docs/HARNESS.md) for the full umbrella. Highlights: - **Pydantic `StrictModel` everywhere a contract crosses a seam** (rejects unknown keys at construction). diff --git a/pyproject.toml b/pyproject.toml index 7961745..8e2df15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.15" +version = "0.2.16" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/uv.lock b/uv.lock index 9e4e858..8e9e9f8 100644 --- a/uv.lock +++ b/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.15" +version = "0.2.16" source = { virtual = "." } dependencies = [ { name = "fastapi" }, From 8938eb796a0250d1054ad03addf98d8084b5982f Mon Sep 17 00:00:00 2001 From: Constantinos <41453723+constk@users.noreply.github.com> Date: Tue, 26 May 2026 15:39:03 +1000 Subject: [PATCH 8/8] docs: replace Jaeger screenshot TODO with section scaffold (#105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: replace Jaeger screenshot TODO with section scaffold (#92) The observability story in README has one visible loose end: a TODO block where the Jaeger trace screenshot should go. The rest of the section reads cleanly, so the TODO sticks out. 
Promotes the placeholder to a real subsection ("Jaeger trace") with the explanatory caption already written: what boots the stack, what endpoint produces the trace, where to view it, and that span attributes use only the constant-defined semconv keys from src/observability/spans.py. The image itself still needs to be captured. The original capture recipe is preserved as an HTML comment so it remains discoverable, and the comment includes the exact one-line markdown to paste in once docs/images/jaeger-trace.png lands. Hiding the placeholder inside an HTML comment (rather than a broken-image ref) keeps the rendered README clean while the PNG is outstanding. The image-capture step itself is a follow-up — needs the maintainer to run docker compose locally and take the screenshot. Closes #92 (capture step tracked separately as a single-line README edit when the PNG is committed). * chore: bump version to 0.2.11 --- README.md | 22 ++++++++++++++-------- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a69899a..c222e0a 100644 --- a/README.md +++ b/README.md @@ -81,17 +81,23 @@ The scaffold's React page hits `/api/v1/health` on load and renders the version ![Hello page](docs/images/hello-page.png) +### Jaeger trace (`docker compose up` + `/api/v1/health`) + +The full stack — backend, frontend, Jaeger collector — boots with `docker compose up`. Hitting `/api/v1/health` once produces an OpenTelemetry trace exported via OTLP/gRPC; the span hierarchy is visible in the Jaeger UI under the `harness-python-react` service, with `agent_span(...)` attributes attached using only the keys constant-defined at the top of [`src/observability/spans.py`](src/observability/spans.py).
+ ## Why a harness diff --git a/pyproject.toml b/pyproject.toml index 8e2df15..3e2e858 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harness-python-react" -version = "0.2.16" +version = "0.2.17" description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend." readme = "README.md" requires-python = ">=3.14" diff --git a/uv.lock b/uv.lock index 8e9e9f8..77c0d17 100644 --- a/uv.lock +++ b/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "harness-python-react" -version = "0.2.16" +version = "0.2.17" source = { virtual = "." } dependencies = [ { name = "fastapi" },