From 220b3db5606ffa65436f662311e86fc0d0df2d35 Mon Sep 17 00:00:00 2001
From: "const.koutsakis@aurecongroup.com" <constantinos.koutsakis@gmail.com>
Date: Mon, 27 Apr 2026 18:26:56 +1000
Subject: [PATCH] feat: eval harness scaffold + 1 example golden case + nightly
 workflow_dispatch (#24)

---
 .github/workflows/eval-nightly.yml |  53 ++++++++
 eval/golden_qa.json                |  11 ++
 eval/test_golden_qa.py             |  49 ++++++++
 pyproject.toml                     |   1 +
 src/eval/__main__.py               |  31 +++++
 src/eval/judge.py                  |  82 +++++++++++++
 src/eval/models.py                 |  57 +++++++++
 src/eval/report.py                 |  75 ++++++++++++
 src/eval/runner.py                 | 190 +++++++++++++++++++++++++++++
 9 files changed, 549 insertions(+)
 create mode 100644 .github/workflows/eval-nightly.yml
 create mode 100644 eval/golden_qa.json
 create mode 100644 eval/test_golden_qa.py
 create mode 100644 src/eval/__main__.py
 create mode 100644 src/eval/judge.py
 create mode 100644 src/eval/models.py
 create mode 100644 src/eval/report.py
 create mode 100644 src/eval/runner.py

diff --git a/.github/workflows/eval-nightly.yml b/.github/workflows/eval-nightly.yml
new file mode 100644
index 0000000..3b069a9
--- /dev/null
+++ b/.github/workflows/eval-nightly.yml
@@ -0,0 +1,53 @@
+# Eval harness nightly — disabled-by-default.
+#
+# This workflow runs the golden QA dataset against the agent / LLM loop. It
+# is `workflow_dispatch`-only by default to prevent accidental LLM API
+# spend. To enable nightly runs:
+#
+#   1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum;
+#      LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from
+#      OpenAI defaults).
+#   2. Replace the `on:` block below with:
+#
+#        on:
+#          schedule:
+#            - cron: "0 6 * * *"   # daily 06:00 UTC
+#          workflow_dispatch:
+#
+#   3. Make sure `eval-nightly.yml` is listed in EXEMPT_WORKFLOWS in
+#      `.github/scripts/check_required_contexts.py` (it is by default,
+#      because scheduled runs never gate PRs).
+#
+# See docs/EVAL_HARNESS.md for the full setup story.
+
+name: Eval nightly
+
+on:
+  workflow_dispatch:
+    inputs:
+      python_version:
+        description: "Python version to use"
+        required: false
+        default: "3.14"
+
+permissions:
+  contents: read
+
+jobs:
+  eval:
+    name: Run golden QA dataset
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57  # v8
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: ${{ inputs.python_version || '3.14' }}
+      - run: uv sync --frozen --extra dev
+      - name: Run pytest eval/
+        env:
+          LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+        run: uv run pytest eval/ -v
diff --git a/eval/golden_qa.json b/eval/golden_qa.json
new file mode 100644
index 0000000..d8348de
--- /dev/null
+++ b/eval/golden_qa.json
@@ -0,0 +1,11 @@
+[
+  {
+    "id": "echo-hello",
+    "question": "echo hello",
+    "category": "smoke",
+    "expected_answer": "hello",
+    "tolerance": "exact_match",
+    "difficulty": "easy",
+    "notes": "Trivial example: exercises the runner without an LLM call."
+  }
+]
diff --git a/eval/test_golden_qa.py b/eval/test_golden_qa.py
new file mode 100644
index 0000000..bf290d9
--- /dev/null
+++ b/eval/test_golden_qa.py
@@ -0,0 +1,49 @@
+"""Parametrised evaluation tests from the golden QA dataset.
+
+Marked with ``@pytest.mark.eval`` so the default ``just check`` /
+``pytest tests/`` invocations skip them. Run explicitly with::
+
+    uv run pytest eval/
+
+Wire a real LLM-backed ``answer_fn`` by editing ``_answer_fn`` below or
+constructing ``EvalRunner`` with your own callable. The default strips a
+leading ``echo `` from the question, which makes the trivial example case
+(``echo hello`` expects ``hello``) pass without any LLM credentials.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from src.eval.models import EvalCase
+from src.eval.runner import EvalRunner, load_golden_dataset
+
+# Load cases without initialising a runner (no side effects at import time)
+golden = load_golden_dataset()
+
+
+def _answer_fn(question: str) -> str:
+    """Placeholder agent. Strips a leading ``echo `` so the example case
+    resolves to the expected answer; in real use this calls the agent
+    loop / LLM client wired by the project."""
+    return question.removeprefix("echo ").strip()
+
+
+@pytest.fixture(scope="module")
+def runner() -> EvalRunner:
+    """Single runner shared across cases in this module."""
+    return EvalRunner(answer_fn=_answer_fn)
+
+
+@pytest.mark.eval
+@pytest.mark.parametrize("case", golden, ids=lambda c: c.id)
+def test_golden_qa(case: EvalCase, runner: EvalRunner) -> None:
+    """Evaluate a golden QA test case against the configured agent."""
+    result = runner.evaluate(case)
+    assert result.pass_result, (
+        f"[{case.id}] {case.category}/{case.difficulty}\n"
+        f"Q: {case.question}\n"
+        f"Expected: {case.expected_answer}\n"
+        f"Got: {result.actual_answer}\n"
+        f"Reason: {result.failure_reason}"
+    )
diff --git a/pyproject.toml b/pyproject.toml
index ae42e70..1d4574a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -159,6 +159,7 @@ info = "See docs/DEVELOPMENT.md#commit-messages for the allowed prefixes."
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
+pythonpath = ["."]
 asyncio_mode = "auto"
 timeout = 30
 markers = [
diff --git a/src/eval/__main__.py b/src/eval/__main__.py
new file mode 100644
index 0000000..0cf1103
--- /dev/null
+++ b/src/eval/__main__.py
@@ -0,0 +1,31 @@
+"""Eval-harness CLI: ``python -m src.eval`` runs the golden dataset and
+prints the markdown report. The default ``answer_fn`` strips a leading
+``echo `` from the question — wire your real agent loop in by importing this
+module and constructing ``EvalRunner(answer_fn=your_callable)``.
+"""
+
+from __future__ import annotations
+
+from src.eval.report import generate_report
+from src.eval.runner import EvalRunner
+
+
+def _echo_answer(question: str) -> str:
+    """Default placeholder agent — the question minus a leading ``echo ``.
+
+    Mirrors the placeholder in ``eval/test_golden_qa.py`` so the bundled
+    ``echo hello`` case passes and the CLI exits 0 without LLM credentials.
+    """
+    return question.removeprefix("echo ").strip()
+
+
+def main() -> int:
+    """Run every golden case, print the report, return 0 only if all pass."""
+    runner = EvalRunner(answer_fn=_echo_answer)
+    results = runner.evaluate_all()
+    print(generate_report(results))
+    return 0 if all(r.pass_result for r in results) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/eval/judge.py b/src/eval/judge.py
new file mode 100644
index 0000000..d2d2591
--- /dev/null
+++ b/src/eval/judge.py
@@ -0,0 +1,82 @@
+"""LLM judge for semantic-similarity evaluation — provider-agnostic.
+
+The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
+your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
+eval harness stays decoupled. The default behaviour when no client is wired
+is `(None, "no LLM client configured")`, which the runner treats as
+inconclusive rather than a hard failure.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Protocol
+
+logger = logging.getLogger(__name__)
+
+_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
+answer, and an actual answer from an AI agent, score how well the actual
+answer matches the expected answer.
+
+Question: {question}
+Expected answer: {expected_answer}
+Actual answer: {actual_answer}
+
+Respond with JSON only:
+{{
+  "score": <float 0.0 to 1.0>,
+  "explanation": "<one line explaining the score>"
+}}
+
+Scoring guide:
+- 1.0: Semantically identical, same information conveyed
+- 0.8-0.9: Correct answer with minor wording or formatting differences
+- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
+- 0.1-0.4: Substantially wrong but shows some relevant information
+- 0.0: Completely wrong or unrelated"""
+
+
+class LLMClient(Protocol):
+    """Minimum surface the judge needs from any LLM SDK.
+
+    Concrete adapters live alongside the agent code. The Protocol form lets
+    a downstream consumer wire OpenAI's SDK, Anthropic's, Azure OpenAI, or a
+    self-hosted vLLM endpoint without the eval harness importing any
+    vendor-specific module.
+    """
+
+    def complete_json(self, *, model: str, prompt: str) -> str:
+        """Send *prompt* to *model* and return the raw JSON response body."""
+
+
+def evaluate_semantic_similarity(
+    question: str,
+    expected: str,
+    actual: str,
+    client: LLMClient | None,
+    model: str,
+) -> tuple[float | None, str]:
+    """Score semantic similarity between expected and actual answers.
+
+    Returns ``(score in [0.0, 1.0], explanation)``. On failure — no client,
+    a raising client, unparseable JSON, or a NaN score — returns
+    ``(None, error)``, which the caller treats as inconclusive, not a fail.
+    """
+    if client is None:
+        return (None, "no LLM client configured")
+
+    prompt = _JUDGE_PROMPT.format(
+        question=question, expected_answer=expected, actual_answer=actual
+    )
+    try:
+        body = client.complete_json(model=model, prompt=prompt)
+        parsed = json.loads(body)
+        score = float(parsed.get("score", 0.0))
+        if score != score:  # NaN: min()/max() would silently clamp it to 1.0
+            return (None, "Judge call failed: score is NaN")
+        explanation = str(parsed.get("explanation", "No explanation"))
+        return (max(0.0, min(1.0, score)), explanation)
+    except Exception as exc:
+        logger.exception("LLM judge call failed")
+        return (None, f"Judge call failed: {exc}")
diff --git a/src/eval/models.py b/src/eval/models.py
new file mode 100644
index 0000000..548768a
--- /dev/null
+++ b/src/eval/models.py
@@ -0,0 +1,57 @@
+"""Pydantic models for the evaluation harness."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class EvalCase(BaseModel):
+    """A single test case from the golden QA dataset."""
+
+    id: str = Field(description="Unique test case identifier")
+    question: str = Field(description="Natural language question / input")
+    category: str = Field(default="general", description="Test category")
+    expected_answer: str = Field(description="Expected answer text")
+    expected_tools: list[str] = Field(
+        default_factory=list,
+        description="Tools the agent should call (informational; not asserted)",
+    )
+    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
+        description="How to compare actual vs expected"
+    )
+    difficulty: Literal["easy", "medium", "hard"] = Field(
+        default="easy",
+        description="Difficulty level",
+    )
+    notes: str = Field(default="", description="Why this test case exists")
+
+
+class EvalResult(BaseModel):
+    """Result of evaluating a single test case."""
+
+    case_id: str = Field(description="Test case ID")
+    question: str = Field(description="The question asked")
+    category: str = Field(description="Test category")
+    difficulty: str = Field(description="Difficulty level")
+    expected_answer: str = Field(description="Expected answer")
+    actual_answer: str = Field(description="Agent's actual answer")
+    tools_called: list[str] = Field(
+        default_factory=list,
+        description="Tools the agent invoked",
+    )
+    reasoning_trace: list[str] = Field(
+        default_factory=list,
+        description="Chain of thought steps",
+    )
+    latency_ms: int = Field(description="Wall clock time in ms")
+    pass_result: bool = Field(description="Whether the test passed")
+    score: float | None = Field(
+        default=None,
+        description="LLM judge score for semantic_similar",
+    )
+    failure_reason: str | None = Field(
+        default=None,
+        description="Why the test failed",
+    )
diff --git a/src/eval/report.py b/src/eval/report.py
new file mode 100644
index 0000000..22e1661
--- /dev/null
+++ b/src/eval/report.py
@@ -0,0 +1,75 @@
+"""Generate a markdown evaluation report from results."""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.eval.models import EvalResult
+
+logger = logging.getLogger(__name__)
+
+
+def generate_report(results: list[EvalResult]) -> str:
+    """Generate a markdown report from evaluation results.
+
+    Sections: overall accuracy, by category, by difficulty, failure analysis.
+    """
+    total = len(results)
+    passed = sum(1 for r in results if r.pass_result)
+
+    lines: list[str] = []
+    lines.append("# Evaluation Report\n")
+
+    pct = (passed / total * 100) if total > 0 else 0
+    lines.append("## Overall Accuracy\n")
+    lines.append(f"**{passed}/{total} passed ({pct:.1f}%)**\n")
+
+    lines.append("## Accuracy by Category\n")
+    lines.append("| Category | Passed | Total | Rate |")
+    lines.append("|---|---|---|---|")
+
+    by_cat: dict[str, list[EvalResult]] = defaultdict(list)
+    for r in results:
+        by_cat[r.category].append(r)
+
+    for cat in sorted(by_cat):
+        cat_results = by_cat[cat]
+        cat_passed = sum(1 for r in cat_results if r.pass_result)
+        cat_total = len(cat_results)
+        cat_pct = (cat_passed / cat_total * 100) if cat_total > 0 else 0
+        lines.append(f"| {cat} | {cat_passed} | {cat_total} | {cat_pct:.0f}% |")
+
+    lines.append("")
+
+    lines.append("## Accuracy by Difficulty\n")
+    lines.append("| Difficulty | Passed | Total | Rate |")
+    lines.append("|---|---|---|---|")
+
+    by_diff: dict[str, list[EvalResult]] = defaultdict(list)
+    for r in results:
+        by_diff[r.difficulty].append(r)
+
+    for diff in ["easy", "medium", "hard"]:
+        if diff in by_diff:
+            diff_results = by_diff[diff]
+            diff_passed = sum(1 for r in diff_results if r.pass_result)
+            diff_total = len(diff_results)
+            diff_pct = (diff_passed / diff_total * 100) if diff_total > 0 else 0
+            lines.append(f"| {diff} | {diff_passed} | {diff_total} | {diff_pct:.0f}% |")
+
+    lines.append("")
+
+    failures = [r for r in results if not r.pass_result]
+    if failures:
+        lines.append("## Failure Analysis\n")
+        for r in failures:
+            lines.append(f"### {r.case_id} ({r.category}/{r.difficulty})\n")
+            lines.append(f"**Question:** {r.question}\n")
+            lines.append(f"**Expected:** {r.expected_answer}\n")
+            lines.append(f"**Actual:** {r.actual_answer}\n")
+            lines.append(f"**Reason:** {r.failure_reason}\n")
+
+    return "\n".join(lines)
diff --git a/src/eval/runner.py b/src/eval/runner.py
new file mode 100644
index 0000000..6e08bfe
--- /dev/null
+++ b/src/eval/runner.py
@@ -0,0 +1,190 @@
+"""Evaluation runner: executes golden QA cases against a caller-supplied
+``answer_fn``. The harness is provider-agnostic — wire any agent loop,
+direct LLM call, or stub by passing a function ``str -> str`` to
+``EvalRunner``.
+
+The runner does NOT spin up tracing, databases, or LLM clients on its own.
+That keeps it cheap to import in unit tests and lets the agent code own its
+own startup story.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from src.eval.judge import evaluate_semantic_similarity
+from src.eval.models import EvalCase, EvalResult
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from src.eval.judge import LLMClient
+
+logger = logging.getLogger(__name__)
+
+_GOLDEN_QA_PATH = (
+    Path(__file__).resolve().parent.parent.parent / "eval" / "golden_qa.json"
+)
+
+
+def load_golden_dataset(path: Path | None = None) -> list[EvalCase]:
+    """Load and validate test cases from a golden QA JSON file.
+
+    *path* defaults to the repo's ``eval/golden_qa.json``. Standalone so
+    callers can inspect the dataset without constructing an ``EvalRunner``.
+    """
+    qa_path = path or _GOLDEN_QA_PATH
+    with qa_path.open(encoding="utf-8") as fh:  # JSON is UTF-8, not locale
+        data = json.load(fh)
+    return [EvalCase.model_validate(item) for item in data]
+
+
+class EvalRunner:
+    """Runs the golden QA dataset and collects results."""
+
+    def __init__(
+        self,
+        answer_fn: Callable[[str], str],
+        judge_client: LLMClient | None = None,
+        judge_model: str = "",
+    ) -> None:
+        self._answer_fn = answer_fn
+        self._judge_client = judge_client
+        self._judge_model = judge_model
+
+    def evaluate(self, case: EvalCase) -> EvalResult:
+        """Evaluate one case; a raising ``answer_fn`` fails without comparison."""
+        start = time.monotonic()
+        try:
+            actual_answer = self._answer_fn(case.question)
+            crashed = None
+        except Exception as exc:
+            actual_answer = f"ERROR: {exc}"
+            crashed = (False, None, f"answer_fn raised: {exc!r}")
+        latency_ms = int((time.monotonic() - start) * 1000)
+        passed, score, reason = crashed or self._compare(case, actual_answer)
+        return EvalResult(
+            case_id=case.id,
+            question=case.question,
+            category=case.category,
+            difficulty=case.difficulty,
+            expected_answer=case.expected_answer,
+            actual_answer=actual_answer,
+            tools_called=[],
+            reasoning_trace=[],
+            latency_ms=latency_ms,
+            pass_result=passed,
+            score=score,
+            failure_reason=reason,
+        )
+
+    def evaluate_all(self) -> list[EvalResult]:
+        """Evaluate every case in the loaded golden dataset."""
+        cases = load_golden_dataset()
+        results: list[EvalResult] = []
+        for case in cases:
+            logger.info("Evaluating %s: %s", case.id, case.question[:50])
+            result = self.evaluate(case)
+            status = "PASS" if result.pass_result else "FAIL"
+            logger.info("  %s %s", status, result.failure_reason or "")
+            results.append(result)
+        return results
+
+    # ---------------------------------------------------------------------
+
+    def _compare(
+        self,
+        case: EvalCase,
+        actual: str,
+    ) -> tuple[bool, float | None, str | None]:
+        if case.tolerance == "exact_match":
+            return self._exact_match(case.expected_answer, actual)
+        if case.tolerance == "numeric_close":
+            return self._numeric_close(case.expected_answer, actual)
+        if case.tolerance == "semantic_similar":
+            return self._semantic_similar(case, actual)
+        return (False, None, f"Unknown tolerance: {case.tolerance}")
+
+    @staticmethod
+    def _exact_match(
+        expected: str, actual: str
+    ) -> tuple[bool, float | None, str | None]:
+        norm_expected = _normalise(expected)
+        norm_actual = _normalise(actual)
+        if norm_expected == norm_actual:
+            return (True, None, None)
+        return (False, None, f"Exact match failed: '{actual[:100]}'")
+
+    @staticmethod
+    def _numeric_close(
+        expected: str, actual: str
+    ) -> tuple[bool, float | None, str | None]:
+        expected_nums = _extract_numbers(expected)
+        actual_nums = _extract_numbers(actual)
+
+        if not expected_nums:
+            return (False, None, "No numbers found in expected answer")
+        if not actual_nums:
+            return (False, None, "No numbers found in actual answer")
+
+        target = expected_nums[0]
+        for num in actual_nums:
+            if target == 0:
+                if num == 0:
+                    return (True, None, None)
+            elif abs(num - target) / abs(target) <= 0.01:
+                return (True, None, None)
+
+        return (
+            False,
+            None,
+            f"Numeric mismatch: expected ~{target}, got {actual_nums}",
+        )
+
+    def _semantic_similar(
+        self,
+        case: EvalCase,
+        actual: str,
+    ) -> tuple[bool, float | None, str | None]:
+        score, explanation = evaluate_semantic_similarity(
+            question=case.question,
+            expected=case.expected_answer,
+            actual=actual,
+            client=self._judge_client,
+            model=self._judge_model,
+        )
+        if score is None:
+            return (True, None, f"Judge inconclusive: {explanation}")
+        if score >= 0.8:
+            return (True, score, None)
+        return (False, score, f"Semantic score {score:.2f}: {explanation}")
+
+
+def _normalise(text: str) -> str:
+    """Lowercase, strip, collapse whitespace."""
+    return " ".join(text.lower().strip().split())
+
+
+def _extract_numbers(text: str) -> list[float]:
+    """Pull decimals + integers out of free-form text.
+
+    Filters out 4-digit values in the year range (2020-2029) so a question
+    referencing a year doesn't accidentally provide the comparison target.
+    """
+    pattern = r"\d[\d,]*\.?\d*"
+    matches = re.findall(pattern, text)
+    numbers: list[float] = []
+    for m in matches:
+        try:
+            val = float(m.replace(",", ""))
+        except ValueError:
+            continue
+        if val == int(val) and 2020 <= val <= 2029:
+            continue
+        numbers.append(val)
+    return numbers