From 220b3db5606ffa65436f662311e86fc0d0df2d35 Mon Sep 17 00:00:00 2001 From: "const.koutsakis@aurecongroup.com" Date: Mon, 27 Apr 2026 18:26:56 +1000 Subject: [PATCH] feat: eval harness scaffold + 1 example golden case + nightly workflow_dispatch (#24) --- .github/workflows/eval-nightly.yml | 53 ++++++++ eval/golden_qa.json | 11 ++ eval/test_golden_qa.py | 49 ++++++++ pyproject.toml | 1 + src/eval/__main__.py | 31 +++++ src/eval/judge.py | 82 +++++++++++++ src/eval/models.py | 57 +++++++++ src/eval/report.py | 75 ++++++++++++ src/eval/runner.py | 190 +++++++++++++++++++++++++++++ 9 files changed, 549 insertions(+) create mode 100644 .github/workflows/eval-nightly.yml create mode 100644 eval/golden_qa.json create mode 100644 eval/test_golden_qa.py create mode 100644 src/eval/__main__.py create mode 100644 src/eval/judge.py create mode 100644 src/eval/models.py create mode 100644 src/eval/report.py create mode 100644 src/eval/runner.py diff --git a/.github/workflows/eval-nightly.yml b/.github/workflows/eval-nightly.yml new file mode 100644 index 0000000..3b069a9 --- /dev/null +++ b/.github/workflows/eval-nightly.yml @@ -0,0 +1,53 @@ +# Eval harness nightly — disabled-by-default. +# +# This workflow runs the golden QA dataset against the agent / LLM loop. It +# is `workflow_dispatch`-only by default to prevent accidental LLM API +# spend. To enable nightly runs: +# +# 1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum; +# LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from +# OpenAI defaults). +# 2. Replace the `on:` block below with: +# +# on: +# schedule: +# - cron: "0 6 * * *" # daily 06:00 UTC +# workflow_dispatch: +# +# 3. Add the `eval-nightly.yml` to EXEMPT_WORKFLOWS in +# `.github/scripts/check_required_contexts.py` if it's not already +# there (it is, by default — scheduled runs never gate PRs). +# +# See docs/EVAL_HARNESS.md for the full setup story. 
+ +name: Eval nightly + +on: + workflow_dispatch: + inputs: + python_version: + description: "Python version to use" + required: false + default: "3.14" + +permissions: + contents: read + +jobs: + eval: + name: Run golden QA dataset + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: ${{ inputs.python_version || '3.14' }} + - run: uv sync --frozen --extra dev + - name: Run pytest eval/ + env: + LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + run: uv run pytest eval/ -v diff --git a/eval/golden_qa.json b/eval/golden_qa.json new file mode 100644 index 0000000..d8348de --- /dev/null +++ b/eval/golden_qa.json @@ -0,0 +1,11 @@ +[ + { + "id": "echo-hello", + "question": "echo hello", + "category": "smoke", + "expected_answer": "hello", + "tolerance": "exact_match", + "difficulty": "easy", + "notes": "Trivial example: exercises the runner without an LLM call." + } +] diff --git a/eval/test_golden_qa.py b/eval/test_golden_qa.py new file mode 100644 index 0000000..bf290d9 --- /dev/null +++ b/eval/test_golden_qa.py @@ -0,0 +1,49 @@ +"""Parametrised evaluation tests from the golden QA dataset. + +Marked with ``@pytest.mark.eval`` so the default ``just check`` / +``pytest tests/`` invocations skip them. Run explicitly with:: + + uv run pytest eval/ + +Wire a real LLM-backed ``answer_fn`` by editing ``_answer_fn`` below or +constructing ``EvalRunner`` with your own callable. The default echoes the +question verbatim, which makes the trivial example case (``echo hello`` +expects ``hello``) pass without any LLM credentials. 
+""" + +from __future__ import annotations + +import pytest + +from src.eval.models import EvalCase +from src.eval.runner import EvalRunner, load_golden_dataset + +# Load cases without initialising a runner (no side effects at import time) +golden = load_golden_dataset() + + +def _answer_fn(question: str) -> str: + """Placeholder agent. Strips a leading ``echo `` so the example case + resolves to the expected answer; in real use this calls the agent + loop / LLM client wired by the project.""" + return question.removeprefix("echo ").strip() + + +@pytest.fixture(scope="module") +def runner() -> EvalRunner: + """Single runner shared across cases in this module.""" + return EvalRunner(answer_fn=_answer_fn) + + +@pytest.mark.eval +@pytest.mark.parametrize("case", golden, ids=lambda c: c.id) +def test_golden_qa(case: EvalCase, runner: EvalRunner) -> None: + """Evaluate a golden QA test case against the configured agent.""" + result = runner.evaluate(case) + assert result.pass_result, ( + f"[{case.id}] {case.category}/{case.difficulty}\n" + f"Q: {case.question}\n" + f"Expected: {case.expected_answer}\n" + f"Got: {result.actual_answer}\n" + f"Reason: {result.failure_reason}" + ) diff --git a/pyproject.toml b/pyproject.toml index ae42e70..1d4574a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -159,6 +159,7 @@ info = "See docs/DEVELOPMENT.md#commit-messages for the allowed prefixes." [tool.pytest.ini_options] testpaths = ["tests"] +pythonpath = ["."] asyncio_mode = "auto" timeout = 30 markers = [ diff --git a/src/eval/__main__.py b/src/eval/__main__.py new file mode 100644 index 0000000..0cf1103 --- /dev/null +++ b/src/eval/__main__.py @@ -0,0 +1,31 @@ +"""Eval-harness CLI: ``python -m src.eval`` runs the golden dataset and +prints the markdown report. The default ``answer_fn`` echoes the question +verbatim — wire your real agent loop in by importing this module and +constructing ``EvalRunner(answer_fn=your_callable)``. 
+""" + +from __future__ import annotations + +from src.eval.report import generate_report +from src.eval.runner import EvalRunner + + +def _identity_answer(question: str) -> str: + """Default placeholder agent — returns the question verbatim. + + Real users wire a ``Callable[[str], str]`` that hits their LLM / agent + loop. This default keeps the CLI runnable without LLM credentials. + """ + return question + + +def main() -> int: + runner = EvalRunner(answer_fn=_identity_answer) + results = runner.evaluate_all() + report = generate_report(results) + print(report) + return 0 if all(r.pass_result for r in results) else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/eval/judge.py b/src/eval/judge.py new file mode 100644 index 0000000..d2d2591 --- /dev/null +++ b/src/eval/judge.py @@ -0,0 +1,82 @@ +"""LLM judge for semantic-similarity evaluation — provider-agnostic. + +The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire +your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the +eval harness stays decoupled. The default behaviour when no client is wired +is `(None, "no LLM client configured")`, which the runner treats as +inconclusive rather than a hard failure. +""" + +from __future__ import annotations + +import json +import logging +from typing import Protocol + +logger = logging.getLogger(__name__) + +_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected +answer, and an actual answer from an AI agent, score how well the actual +answer matches the expected answer. 
+ +Question: {question} +Expected answer: {expected_answer} +Actual answer: {actual_answer} + +Respond with JSON only: +{{ + "score": , + "explanation": "" +}} + +Scoring guide: +- 1.0: Semantically identical, same information conveyed +- 0.8-0.9: Correct answer with minor wording or formatting differences +- 0.5-0.7: Partially correct, missing key details or minor inaccuracies +- 0.1-0.4: Substantially wrong but shows some relevant information +- 0.0: Completely wrong or unrelated""" + + +class LLMClient(Protocol): + """Minimum surface the judge needs from any LLM SDK. + + Concrete adapters live alongside the agent code. The Protocol form lets + a downstream consumer wire OpenAI's SDK, Anthropic's, Azure OpenAI, or a + self-hosted vLLM endpoint without the eval harness importing any + vendor-specific module. + """ + + def complete_json(self, *, model: str, prompt: str) -> str: + """Send *prompt* to *model* and return the raw JSON response body.""" + + +def evaluate_semantic_similarity( + question: str, + expected: str, + actual: str, + client: LLMClient | None, + model: str, +) -> tuple[float | None, str]: + """Score semantic similarity between expected and actual answers. + + Returns ``(score in [0.0, 1.0], explanation)``. On failure returns + ``(None, error)`` — the caller treats this as inconclusive, not a fail. 
+ """ + if client is None: + return (None, "no LLM client configured") + + prompt = _JUDGE_PROMPT.format( + question=question, + expected_answer=expected, + actual_answer=actual, + ) + + try: + body = client.complete_json(model=model, prompt=prompt) + parsed = json.loads(body) + score = float(parsed.get("score", 0.0)) + explanation = str(parsed.get("explanation", "No explanation")) + return (max(0.0, min(1.0, score)), explanation) + except Exception as exc: + logger.exception("LLM judge call failed") + return (None, f"Judge call failed: {exc}") diff --git a/src/eval/models.py b/src/eval/models.py new file mode 100644 index 0000000..548768a --- /dev/null +++ b/src/eval/models.py @@ -0,0 +1,57 @@ +"""Pydantic models for the evaluation harness.""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +class EvalCase(BaseModel): + """A single test case from the golden QA dataset.""" + + id: str = Field(description="Unique test case identifier") + question: str = Field(description="Natural language question / input") + category: str = Field(default="general", description="Test category") + expected_answer: str = Field(description="Expected answer text") + expected_tools: list[str] = Field( + default_factory=list, + description="Tools the agent should call (informational; not asserted)", + ) + tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field( + description="How to compare actual vs expected" + ) + difficulty: Literal["easy", "medium", "hard"] = Field( + default="easy", + description="Difficulty level", + ) + notes: str = Field(default="", description="Why this test case exists") + + +class EvalResult(BaseModel): + """Result of evaluating a single test case.""" + + case_id: str = Field(description="Test case ID") + question: str = Field(description="The question asked") + category: str = Field(description="Test category") + difficulty: str = Field(description="Difficulty level") + 
# Difficulty levels rendered first, in this order, in the difficulty table.
_DIFFICULTY_ORDER = ("easy", "medium", "hard")


def _table_cell(value: object) -> str:
    """Render *value* as markdown table-cell text.

    A literal ``|`` is escaped so a category or difficulty name cannot split
    the row into extra columns.
    """
    return str(value).replace("|", "\\|")


def _accuracy_rows(groups: dict[str, list], keys: list[str]) -> list[str]:
    """Build one ``| key | passed | total | rate |`` row per entry of *keys*.

    *groups* maps a key to the results in that group; every key in *keys*
    must already be present in *groups*.
    """
    rows: list[str] = []
    for key in keys:
        bucket = groups[key]
        ok = sum(1 for r in bucket if r.pass_result)
        count = len(bucket)
        rate = (ok / count * 100) if count > 0 else 0
        rows.append(f"| {_table_cell(key)} | {ok} | {count} | {rate:.0f}% |")
    return rows


def generate_report(results: list[EvalResult]) -> str:
    """Generate a markdown report from evaluation results.

    Sections: overall accuracy, by category, by difficulty, failure analysis
    (the last one only when at least one result failed).

    The difficulty table lists ``easy``, ``medium`` and ``hard`` first, then
    any other difficulty value found in *results* in sorted order. The
    result model types ``difficulty`` as a plain string, so without the
    extra rows such results would be missing from the table and its totals
    would not add up to the overall count.

    Args:
        results: Evaluated cases; may be empty, in which case the report
            shows ``0/0 passed (0.0%)`` and empty tables.

    Returns:
        The report as one newline-joined markdown string.
    """
    total = len(results)
    passed = sum(1 for r in results if r.pass_result)
    pct = (passed / total * 100) if total > 0 else 0

    by_cat: dict[str, list] = defaultdict(list)
    by_diff: dict[str, list] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)
        by_diff[r.difficulty].append(r)

    known = [d for d in _DIFFICULTY_ORDER if d in by_diff]
    extra = sorted(d for d in by_diff if d not in _DIFFICULTY_ORDER)

    lines: list[str] = [
        "# Evaluation Report\n",
        "## Overall Accuracy\n",
        f"**{passed}/{total} passed ({pct:.1f}%)**\n",
        "## Accuracy by Category\n",
        "| Category | Passed | Total | Rate |",
        "|---|---|---|---|",
        *_accuracy_rows(by_cat, sorted(by_cat)),
        "",
        "## Accuracy by Difficulty\n",
        "| Difficulty | Passed | Total | Rate |",
        "|---|---|---|---|",
        *_accuracy_rows(by_diff, known + extra),
        "",
    ]

    failures = [r for r in results if not r.pass_result]
    if failures:
        lines.append("## Failure Analysis\n")
        for r in failures:
            lines.append(f"### {r.case_id} ({r.category}/{r.difficulty})\n")
            lines.append(f"**Question:** {r.question}\n")
            lines.append(f"**Expected:** {r.expected_answer}\n")
            lines.append(f"**Actual:** {r.actual_answer}\n")
            lines.append(f"**Reason:** {r.failure_reason}\n")

    return "\n".join(lines)
0000000..6e08bfe --- /dev/null +++ b/src/eval/runner.py @@ -0,0 +1,190 @@ +"""Evaluation runner: executes golden QA cases against a caller-supplied +``answer_fn``. The harness is provider-agnostic — wire any agent loop, +direct LLM call, or stub by passing a function ``str -> str`` to +``EvalRunner``. + +The runner does NOT spin up tracing, databases, or LLM clients on its own. +That keeps it cheap to import in unit tests and lets the agent code own its +own startup story. +""" + +from __future__ import annotations + +import json +import logging +import re +import time +from pathlib import Path +from typing import TYPE_CHECKING + +from src.eval.judge import evaluate_semantic_similarity +from src.eval.models import EvalCase, EvalResult + +if TYPE_CHECKING: + from collections.abc import Callable + + from src.eval.judge import LLMClient + +logger = logging.getLogger(__name__) + +_GOLDEN_QA_PATH = ( + Path(__file__).resolve().parent.parent.parent / "eval" / "golden_qa.json" +) + + +def load_golden_dataset(path: Path | None = None) -> list[EvalCase]: + """Load test cases from the golden QA JSON file. + + Standalone so callers can introspect the dataset without paying the + cost of constructing an ``EvalRunner``. 
+ """ + qa_path = path or _GOLDEN_QA_PATH + with qa_path.open() as fh: + data = json.load(fh) + return [EvalCase.model_validate(item) for item in data] + + +class EvalRunner: + """Runs the golden QA dataset and collects results.""" + + def __init__( + self, + answer_fn: Callable[[str], str], + judge_client: LLMClient | None = None, + judge_model: str = "", + ) -> None: + self._answer_fn = answer_fn + self._judge_client = judge_client + self._judge_model = judge_model + + def evaluate(self, case: EvalCase) -> EvalResult: + """Evaluate a single test case.""" + start = time.monotonic() + try: + actual_answer = self._answer_fn(case.question) + except Exception as exc: + actual_answer = f"ERROR: {exc}" + latency_ms = int((time.monotonic() - start) * 1000) + + passed, score, reason = self._compare(case, actual_answer) + + return EvalResult( + case_id=case.id, + question=case.question, + category=case.category, + difficulty=case.difficulty, + expected_answer=case.expected_answer, + actual_answer=actual_answer, + tools_called=[], + reasoning_trace=[], + latency_ms=latency_ms, + pass_result=passed, + score=score, + failure_reason=reason, + ) + + def evaluate_all(self) -> list[EvalResult]: + """Evaluate every case in the loaded golden dataset.""" + cases = load_golden_dataset() + results: list[EvalResult] = [] + for case in cases: + logger.info("Evaluating %s: %s", case.id, case.question[:50]) + result = self.evaluate(case) + status = "PASS" if result.pass_result else "FAIL" + logger.info(" %s %s", status, result.failure_reason or "") + results.append(result) + return results + + # --------------------------------------------------------------------- + + def _compare( + self, + case: EvalCase, + actual: str, + ) -> tuple[bool, float | None, str | None]: + if case.tolerance == "exact_match": + return self._exact_match(case.expected_answer, actual) + if case.tolerance == "numeric_close": + return self._numeric_close(case.expected_answer, actual) + if case.tolerance == 
"semantic_similar": + return self._semantic_similar(case, actual) + return (False, None, f"Unknown tolerance: {case.tolerance}") + + @staticmethod + def _exact_match( + expected: str, actual: str + ) -> tuple[bool, float | None, str | None]: + norm_expected = _normalise(expected) + norm_actual = _normalise(actual) + if norm_expected == norm_actual: + return (True, None, None) + return (False, None, f"Exact match failed: '{actual[:100]}'") + + @staticmethod + def _numeric_close( + expected: str, actual: str + ) -> tuple[bool, float | None, str | None]: + expected_nums = _extract_numbers(expected) + actual_nums = _extract_numbers(actual) + + if not expected_nums: + return (False, None, "No numbers found in expected answer") + if not actual_nums: + return (False, None, "No numbers found in actual answer") + + target = expected_nums[0] + for num in actual_nums: + if target == 0: + if num == 0: + return (True, None, None) + elif abs(num - target) / abs(target) <= 0.01: + return (True, None, None) + + return ( + False, + None, + f"Numeric mismatch: expected ~{target}, got {actual_nums}", + ) + + def _semantic_similar( + self, + case: EvalCase, + actual: str, + ) -> tuple[bool, float | None, str | None]: + score, explanation = evaluate_semantic_similarity( + question=case.question, + expected=case.expected_answer, + actual=actual, + client=self._judge_client, + model=self._judge_model, + ) + if score is None: + return (True, None, f"Judge inconclusive: {explanation}") + if score >= 0.8: + return (True, score, None) + return (False, score, f"Semantic score {score:.2f}: {explanation}") + + +def _normalise(text: str) -> str: + """Lowercase, strip, collapse whitespace.""" + return " ".join(text.lower().strip().split()) + + +def _extract_numbers(text: str) -> list[float]: + """Pull decimals + integers out of free-form text. + + Filters out 4-digit values in the year range (2020-2029) so a question + referencing a year doesn't accidentally provide the comparison target. 
+ """ + pattern = r"\d[\d,]*\.?\d*" + matches = re.findall(pattern, text) + numbers: list[float] = [] + for m in matches: + try: + val = float(m.replace(",", "")) + except ValueError: + continue + if val == int(val) and 2020 <= val <= 2029: + continue + numbers.append(val) + return numbers