feat: eval harness scaffold + 1 example golden case + nightly workflow_dispatch (#24) (#60)

constk · web-flow · commit 00880e513dc5 · 2026-04-27T18:27:14.000+10:00
diff --git a/.github/workflows/eval-nightly.yml b/.github/workflows/eval-nightly.yml
@@ -0,0 +1,53 @@
+# Eval harness nightly — disabled-by-default.
+#
+# This workflow runs the golden QA dataset against the agent / LLM loop. It
+# is `workflow_dispatch`-only by default to prevent accidental LLM API
+# spend. To enable nightly runs:
+#
+#   1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum;
+#      LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from
+#      OpenAI defaults).
+#   2. Replace the `on:` block below with:
+#
+#        on:
+#          schedule:
+#            - cron: "0 6 * * *"   # daily 06:00 UTC
+#          workflow_dispatch:
+#
+#   3. Confirm `eval-nightly.yml` is listed in EXEMPT_WORKFLOWS in
+#      `.github/scripts/check_required_contexts.py`, adding it if missing
+#      (it is there by default — scheduled runs never gate PRs).
+#
+# See docs/EVAL_HARNESS.md for the full setup story.
+
+name: Eval nightly
+
+on:
+  workflow_dispatch:
+    inputs:
+      python_version:
+        description: "Python version to use"
+        required: false
+        default: "3.14"
+
+permissions:
+  contents: read
+
+jobs:
+  eval:
+    name: Run golden QA dataset
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57  # v8
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: ${{ inputs.python_version || '3.14' }}
+      - run: uv sync --frozen --extra dev
+      - name: Run pytest eval/
+        env:
+          LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+        run: uv run pytest eval/ -v
diff --git a/eval/golden_qa.json b/eval/golden_qa.json
@@ -0,0 +1,11 @@
+[
+  {
+    "id": "echo-hello",
+    "question": "echo hello",
+    "category": "smoke",
+    "expected_answer": "hello",
+    "tolerance": "exact_match",
+    "difficulty": "easy",
+    "notes": "Trivial example: exercises the runner without an LLM call."
+  }
+]
diff --git a/eval/test_golden_qa.py b/eval/test_golden_qa.py
@@ -0,0 +1,49 @@
+"""Parametrised evaluation tests from the golden QA dataset.
+
+Marked with ``@pytest.mark.eval`` so the default ``just check`` /
+``pytest tests/`` invocations skip them. Run explicitly with::
+
+    uv run pytest eval/
+
+Wire a real LLM-backed ``answer_fn`` by editing ``_answer_fn`` below or
+constructing ``EvalRunner`` with your own callable. The default strips a
+leading ``echo `` from the question, which makes the trivial example case
+(``echo hello`` expects ``hello``) pass without any LLM credentials.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from src.eval.models import EvalCase
+from src.eval.runner import EvalRunner, load_golden_dataset
+
+# Load cases at import so pytest can parametrise over them; no runner is built here.
+golden = load_golden_dataset()
+
+
+def _answer_fn(question: str) -> str:
+    """Stand-in agent: drop one leading ``echo `` prefix and trim whitespace;
+    real use replaces this with the project's agent loop / LLM client."""
+    without_prefix = question.removeprefix("echo ")
+    return without_prefix.strip()
+
+
+@pytest.fixture(scope="module")
+def runner() -> EvalRunner:
+    """Provide one module-scoped ``EvalRunner`` backed by ``_answer_fn``."""
+    return EvalRunner(answer_fn=_answer_fn)
+
+
+@pytest.mark.eval
+@pytest.mark.parametrize("case", golden, ids=lambda c: c.id)
+def test_golden_qa(case: EvalCase, runner: EvalRunner) -> None:
+    """Run one golden case through the ``runner`` fixture and require a pass."""
+    outcome = runner.evaluate(case)
+    assert outcome.pass_result, (
+        f"[{case.id}] {case.category}/{case.difficulty}\n"
+        f"Q: {case.question}\n"
+        f"Expected: {case.expected_answer}\n"
+        f"Got: {outcome.actual_answer}\n"
+        f"Reason: {outcome.failure_reason}"
+    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -159,6 +159,7 @@ info = "See docs/DEVELOPMENT.md#commit-messages for the allowed prefixes."
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
+pythonpath = ["."]
 asyncio_mode = "auto"
 timeout = 30
 markers = [
diff --git a/src/eval/__main__.py b/src/eval/__main__.py
@@ -0,0 +1,31 @@
+"""Eval-harness CLI: ``python -m src.eval`` runs the golden dataset and
+prints the markdown report. The default ``answer_fn`` echoes the question
+verbatim — to wire in your real agent loop, import ``EvalRunner`` from
+``src.eval.runner`` and construct ``EvalRunner(answer_fn=your_callable)``.
+"""
+
+from __future__ import annotations
+
+from src.eval.report import generate_report
+from src.eval.runner import EvalRunner
+
+
+def _identity_answer(question: str) -> str:
+    """Return *question* unchanged; the CLI's placeholder agent.
+
+    A real deployment passes its own ``Callable[[str], str]`` that calls an
+    LLM / agent loop; this stub lets the CLI run without LLM credentials.
+    """
+    return question
+
+
+def main() -> int:
+    """Run ``evaluate_all``, print the report; return 0 iff every result passed."""
+    outcomes = EvalRunner(answer_fn=_identity_answer).evaluate_all()
+    print(generate_report(outcomes))
+    any_failed = not all(o.pass_result for o in outcomes)
+    return 1 if any_failed else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/eval/judge.py b/src/eval/judge.py
@@ -0,0 +1,82 @@
+"""LLM judge for semantic-similarity evaluation — provider-agnostic.
+
+The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
+your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
+eval harness stays decoupled. The default behaviour when no client is wired
+is `(None, "no LLM client configured")`, which the runner treats as
+inconclusive rather than a hard failure.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Protocol
+
+logger = logging.getLogger(__name__)
+
+_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
+answer, and an actual answer from an AI agent, score how well the actual
+answer matches the expected answer.
+
+Question: {question}
+Expected answer: {expected_answer}
+Actual answer: {actual_answer}
+
+Respond with JSON only:
+{{
+  "score": <float 0.0 to 1.0>,
+  "explanation": "<one line explaining the score>"
+}}
+
+Scoring guide:
+- 1.0: Semantically identical, same information conveyed
+- 0.8-0.9: Correct answer with minor wording or formatting differences
+- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
+- 0.1-0.4: Substantially wrong but shows some relevant information
+- 0.0: Completely wrong or unrelated"""
+
+
+class LLMClient(Protocol):
+    """Structural interface the judge requires from an LLM client.
+
+    Any object exposing a matching ``complete_json`` method satisfies it, so
+    an adapter for OpenAI, Anthropic, Azure OpenAI, or a self-hosted vLLM
+    endpoint can be supplied without this module importing a vendor SDK.
+    Concrete adapters are expected to live alongside the agent code.
+    """
+
+    def complete_json(self, *, model: str, prompt: str) -> str:
+        """Send *prompt* to *model* and return the raw JSON response body."""
+
+
+def evaluate_semantic_similarity(
+    question: str,
+    expected: str,
+    actual: str,
+    client: LLMClient | None,
+    model: str,
+) -> tuple[float | None, str]:
+    """Ask the LLM judge how closely *actual* matches *expected*.
+
+    Returns ``(score, explanation)`` with the score clamped to [0.0, 1.0].
+    Returns ``(None, reason)`` when no client is given, the call or parse
+    raises, or the score is NaN; callers treat ``None`` as inconclusive.
+    """
+    import math  # local import: only the NaN guard below needs it
+
+    if client is None:
+        return (None, "no LLM client configured")
+    prompt = _JUDGE_PROMPT.format(
+        question=question, expected_answer=expected, actual_answer=actual
+    )
+    try:
+        parsed = json.loads(client.complete_json(model=model, prompt=prompt))
+        score = float(parsed.get("score", 0.0))
+        if math.isnan(score):  # json accepts NaN; min/max would clamp it to 1.0
+            return (None, "Judge returned a NaN score")
+        explanation = str(parsed.get("explanation", "No explanation"))
+        return (max(0.0, min(1.0, score)), explanation)
+    except Exception as exc:
+        logger.exception("LLM judge call failed")
+        return (None, f"Judge call failed: {exc}")
diff --git a/src/eval/models.py b/src/eval/models.py
@@ -0,0 +1,57 @@
+"""Pydantic models for the evaluation harness."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class EvalCase(BaseModel):
+    """A single test case from the golden QA dataset."""
+
+    id: str = Field(description="Unique test case identifier")
+    question: str = Field(description="Natural language question / input")
+    category: str = Field(default="general", description="Test category")
+    expected_answer: str = Field(description="Expected answer text")
+    expected_tools: list[str] = Field(
+        default_factory=list,
+        description="Tools the agent should call (informational; not asserted)",
+    )
+    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
+        description="How to compare actual vs expected"
+    )  # no default is given here, unlike ``difficulty`` below
+    difficulty: Literal["easy", "medium", "hard"] = Field(
+        default="easy",
+        description="Difficulty level",
+    )  # the Literal restricts values to these three levels
+    notes: str = Field(default="", description="Why this test case exists")
+
+
+class EvalResult(BaseModel):
+    """Result of evaluating a single test case."""
+
+    case_id: str = Field(description="Test case ID")
+    question: str = Field(description="The question asked")
+    category: str = Field(description="Test category")
+    difficulty: str = Field(description="Difficulty level")  # plain str, not a Literal
+    expected_answer: str = Field(description="Expected answer")
+    actual_answer: str = Field(description="Agent's actual answer")
+    tools_called: list[str] = Field(
+        default_factory=list,
+        description="Tools the agent invoked",
+    )
+    reasoning_trace: list[str] = Field(
+        default_factory=list,
+        description="Chain of thought steps",
+    )
+    latency_ms: int = Field(description="Wall clock time in ms")
+    pass_result: bool = Field(description="Whether the test passed")
+    score: float | None = Field(
+        default=None,
+        description="LLM judge score for semantic_similar",
+    )  # defaults to None when no score is supplied
+    failure_reason: str | None = Field(
+        default=None,
+        description="Why the test failed",
+    )  # defaults to None when no reason is supplied
diff --git a/src/eval/report.py b/src/eval/report.py
@@ -0,0 +1,75 @@
+"""Generate a markdown evaluation report from results."""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.eval.models import EvalResult
+
+logger = logging.getLogger(__name__)
+
+
+def generate_report(results: list[EvalResult]) -> str:
+    """Render evaluation results as a markdown report.
+
+    Sections, in order: overall accuracy, accuracy by category (sorted by
+    name), accuracy by difficulty (easy, medium, hard, then any other
+    value sorted by name), and a failure analysis that is emitted only
+    when at least one result failed. An empty *results* list reports
+    0/0 passed (0.0%) with empty tables.
+    """
+    total = len(results)
+    passed = sum(1 for r in results if r.pass_result)
+    pct = (passed / total * 100) if total > 0 else 0
+
+    lines: list[str] = ["# Evaluation Report\n", "## Overall Accuracy\n"]
+    lines.append(f"**{passed}/{total} passed ({pct:.1f}%)**\n")
+
+    by_cat: dict[str, list[EvalResult]] = defaultdict(list)
+    by_diff: dict[str, list[EvalResult]] = defaultdict(list)
+    for r in results:
+        by_cat[r.category].append(r)
+        by_diff[r.difficulty].append(r)
+
+    lines.extend(_accuracy_table("Category", by_cat, sorted(by_cat)))
+
+    # Known levels keep their natural order; any other value follows, sorted,
+    # so no result is silently left out of the difficulty table.
+    known = ["easy", "medium", "hard"]
+    diff_order = [d for d in known if d in by_diff]
+    diff_order += sorted(d for d in by_diff if d not in known)
+    lines.extend(_accuracy_table("Difficulty", by_diff, diff_order))
+
+    failures = [r for r in results if not r.pass_result]
+    if failures:
+        lines.append("## Failure Analysis\n")
+        for r in failures:
+            lines.append(f"### {r.case_id} ({r.category}/{r.difficulty})\n")
+            lines.append(f"**Question:** {r.question}\n")
+            lines.append(f"**Expected:** {r.expected_answer}\n")
+            lines.append(f"**Actual:** {r.actual_answer}\n")
+            lines.append(f"**Reason:** {r.failure_reason}\n")
+
+    return "\n".join(lines)
+
+
+def _accuracy_table(
+    label: str, groups: dict[str, list[EvalResult]], order: list[str]
+) -> list[str]:
+    """Build one "Accuracy by <label>" section: heading, table, blank line."""
+    rows = [
+        f"## Accuracy by {label}\n",
+        f"| {label} | Passed | Total | Rate |",
+        "|---|---|---|---|",
+    ]
+    # *order* selects which keys of *groups* to emit and in what sequence.
+    for key in order:
+        members = groups[key]
+        hits = sum(1 for r in members if r.pass_result)
+        rate = (hits / len(members) * 100) if members else 0
+        rows.append(f"| {key} | {hits} | {len(members)} | {rate:.0f}% |")
+    rows.append("")
+    return rows
diff --git a/src/eval/runner.py b/src/eval/runner.py