Skip to content

Commit 00880e5

Browse files
authored
feat: eval harness scaffold + 1 example golden case + nightly workflow_dispatch (#24) (#60)
1 parent 3d71869 commit 00880e5

9 files changed

Lines changed: 549 additions & 0 deletions

File tree

.github/workflows/eval-nightly.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Eval harness nightly — disabled by default.
#
# This workflow runs the golden QA dataset against the agent / LLM loop. It
# is `workflow_dispatch`-only by default to prevent accidental LLM API
# spend. To enable nightly runs:
#
#   1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum;
#      LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from
#      OpenAI defaults).
#   2. Replace the `on:` block below with:
#
#        on:
#          schedule:
#            - cron: "0 6 * * *"   # daily 06:00 UTC
#          workflow_dispatch:
#
#   3. Add `eval-nightly.yml` to EXEMPT_WORKFLOWS in
#      `.github/scripts/check_required_contexts.py` if it's not already
#      there (it is, by default — scheduled runs never gate PRs).
#
# See docs/EVAL_HARNESS.md for the full setup story.

name: Eval nightly

on:
  workflow_dispatch:
    inputs:
      python_version:
        description: "Python version to use"
        required: false
        default: "3.14"

# Read-only token: the job only checks out the repo and runs tests.
permissions:
  contents: read

jobs:
  eval:
    name: Run golden QA dataset
    runs-on: ubuntu-latest
    # Cap the job so a hung LLM call cannot run (and bill) up to the
    # 360-minute GitHub default.
    timeout-minutes: 30
    steps:
      # Actions are pinned to full commit SHAs; the trailing comment records
      # the major version each SHA corresponds to.
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
        with:
          # `inputs` is empty on non-dispatch triggers (e.g. `schedule`), so
          # fall back to the same default as the dispatch input.
          python-version: ${{ inputs.python_version || '3.14' }}
      - run: uv sync --frozen --extra dev
      - name: Run pytest eval/
        env:
          # NOTE(review): a secret that is not configured expands to an empty
          # string, so these variables are always *set* — confirm the LLM
          # client treats "" the same as "unset" when applying its defaults.
          LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
        run: uv run pytest eval/ -v

eval/golden_qa.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[
2+
{
3+
"id": "echo-hello",
4+
"question": "echo hello",
5+
"category": "smoke",
6+
"expected_answer": "hello",
7+
"tolerance": "exact_match",
8+
"difficulty": "easy",
9+
"notes": "Trivial example: exercises the runner without an LLM call."
10+
}
11+
]

eval/test_golden_qa.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""Parametrised evaluation tests from the golden QA dataset.
2+
3+
Marked with ``@pytest.mark.eval`` so the default ``just check`` /
4+
``pytest tests/`` invocations skip them. Run explicitly with::
5+
6+
uv run pytest eval/
7+
8+
Wire a real LLM-backed ``answer_fn`` by editing ``_answer_fn`` below or
9+
constructing ``EvalRunner`` with your own callable. The default strips a
10+
leading ``echo `` from the question, which makes the trivial example case
11+
(``echo hello`` expects ``hello``) pass without any LLM credentials.
12+
"""
13+
14+
from __future__ import annotations
15+
16+
import pytest
17+
18+
from src.eval.models import EvalCase
19+
from src.eval.runner import EvalRunner, load_golden_dataset
20+
21+
# Load the golden cases at import time so ``parametrize`` below can enumerate
# them during collection; no ``EvalRunner`` is constructed at this point.
golden = load_golden_dataset()
23+
24+
25+
def _answer_fn(question: str) -> str:
26+
"""Placeholder agent. Strips a leading ``echo `` so the example case
27+
resolves to the expected answer; in real use this calls the agent
28+
loop / LLM client wired by the project."""
29+
return question.removeprefix("echo ").strip()
30+
31+
32+
@pytest.fixture(scope="module")
def runner() -> EvalRunner:
    """Build the module-scoped ``EvalRunner`` that every case in this file reuses."""
    shared_runner = EvalRunner(answer_fn=_answer_fn)
    return shared_runner
36+
37+
38+
@pytest.mark.eval
@pytest.mark.parametrize("case", golden, ids=lambda c: c.id)
def test_golden_qa(case: EvalCase, runner: EvalRunner) -> None:
    """Run one golden case through the shared runner and require a pass.

    On failure the assertion message lists the case id, category/difficulty,
    question, expected answer, actual answer and the runner's failure reason.
    """
    outcome = runner.evaluate(case)
    assert outcome.pass_result, "\n".join(
        (
            f"[{case.id}] {case.category}/{case.difficulty}",
            f"Q: {case.question}",
            f"Expected: {case.expected_answer}",
            f"Got: {outcome.actual_answer}",
            f"Reason: {outcome.failure_reason}",
        )
    )

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ info = "See docs/DEVELOPMENT.md#commit-messages for the allowed prefixes."
159159

160160
[tool.pytest.ini_options]
161161
testpaths = ["tests"]
162+
pythonpath = ["."]
162163
asyncio_mode = "auto"
163164
timeout = 30
164165
markers = [

src/eval/__main__.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Eval-harness CLI: ``python -m src.eval`` runs the golden dataset and
2+
prints the markdown report. The default ``answer_fn`` echoes the question
3+
verbatim — wire your real agent loop in by importing this module and
4+
constructing ``EvalRunner(answer_fn=your_callable)``.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from src.eval.report import generate_report
10+
from src.eval.runner import EvalRunner
11+
12+
13+
def _identity_answer(question: str) -> str:
14+
"""Default placeholder agent — returns the question verbatim.
15+
16+
Real users wire a ``Callable[[str], str]`` that hits their LLM / agent
17+
loop. This default keeps the CLI runnable without LLM credentials.
18+
"""
19+
return question
20+
21+
22+
def main() -> int:
    """Evaluate the golden dataset with the placeholder agent and print the report.

    Returns:
        ``0`` when every evaluated case passed (including the case where
        there were no results at all), ``1`` when at least one case failed.
        The value is used as the process exit status.
    """
    outcomes = EvalRunner(answer_fn=_identity_answer).evaluate_all()
    print(generate_report(outcomes))
    any_failed = any(not outcome.pass_result for outcome in outcomes)
    return 1 if any_failed else 0
28+
29+
30+
if __name__ == "__main__":
31+
raise SystemExit(main())

src/eval/judge.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""LLM judge for semantic-similarity evaluation — provider-agnostic.
2+
3+
The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
4+
your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
5+
eval harness stays decoupled. The default behaviour when no client is wired
6+
is `(None, "no LLM client configured")`, which the runner treats as
7+
inconclusive rather than a hard failure.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import json
13+
import logging
14+
from typing import Protocol
15+
16+
logger = logging.getLogger(__name__)
17+
18+
_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
19+
answer, and an actual answer from an AI agent, score how well the actual
20+
answer matches the expected answer.
21+
22+
Question: {question}
23+
Expected answer: {expected_answer}
24+
Actual answer: {actual_answer}
25+
26+
Respond with JSON only:
27+
{{
28+
"score": <float 0.0 to 1.0>,
29+
"explanation": "<one line explaining the score>"
30+
}}
31+
32+
Scoring guide:
33+
- 1.0: Semantically identical, same information conveyed
34+
- 0.8-0.9: Correct answer with minor wording or formatting differences
35+
- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
36+
- 0.1-0.4: Substantially wrong but shows some relevant information
37+
- 0.0: Completely wrong or unrelated"""
38+
39+
40+
class LLMClient(Protocol):
    """Structural interface for whatever LLM backend the judge talks to.

    Any object exposing a compatible ``complete_json`` method satisfies this
    Protocol, so adapters for OpenAI, Anthropic, Azure OpenAI or a
    self-hosted vLLM endpoint can be written next to the agent code while
    the eval harness itself imports no vendor-specific module.
    """

    def complete_json(self, *, model: str, prompt: str) -> str:
        """Send *prompt* to *model* and return the raw JSON response body."""
        ...
51+
52+
53+
def evaluate_semantic_similarity(
    question: str,
    expected: str,
    actual: str,
    client: LLMClient | None,
    model: str,
) -> tuple[float | None, str]:
    """Ask the LLM judge how closely *actual* matches *expected*.

    Args:
        question: The question that was put to the agent.
        expected: The reference answer.
        actual: The answer the agent produced.
        client: Judge backend; ``None`` means no judge is configured.
        model: Model identifier forwarded to ``client.complete_json``.

    Returns:
        ``(score, explanation)`` with ``score`` clamped to ``[0.0, 1.0]``.
        When no verdict can be obtained — no client, the call raised, the
        body is not a JSON object, the ``score`` field is missing or not
        numeric, or the score is NaN / infinite — returns ``(None, error)``
        so the caller can treat the case as inconclusive rather than failed.
    """
    import math  # function-scope import: only needed for the finiteness check

    if client is None:
        return (None, "no LLM client configured")

    prompt = _JUDGE_PROMPT.format(
        question=question,
        expected_answer=expected,
        actual_answer=actual,
    )

    # Deliberately broad: the client is an arbitrary third-party adapter and a
    # judge outage must never crash the evaluation run.
    try:
        body = client.complete_json(model=model, prompt=prompt)
        parsed = json.loads(body)
        # A response without a score is malformed, not a zero: report it as
        # inconclusive instead of silently failing the case with 0.0.
        if not isinstance(parsed, dict) or "score" not in parsed:
            raise ValueError("judge response is not a JSON object with a 'score' field")
        score = float(parsed["score"])
        # json.loads accepts NaN/Infinity, and min(1.0, nan) evaluates to 1.0,
        # so an unchecked NaN would be clamped into a perfect score.
        if not math.isfinite(score):
            raise ValueError(f"judge returned a non-finite score: {score!r}")
        explanation = str(parsed.get("explanation", "No explanation"))
        return (max(0.0, min(1.0, score)), explanation)
    except Exception as exc:
        logger.exception("LLM judge call failed")
        return (None, f"Judge call failed: {exc}")

src/eval/models.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Pydantic models for the evaluation harness."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Literal
6+
7+
from pydantic import BaseModel, Field
8+
9+
10+
class EvalCase(BaseModel):
    """A single test case from the golden QA dataset.

    ``id``, ``question``, ``expected_answer`` and ``tolerance`` have no
    default and must be supplied; every other field falls back to the
    default declared below.
    """

    id: str = Field(description="Unique test case identifier")
    question: str = Field(description="Natural language question / input")
    category: str = Field(default="general", description="Test category")
    expected_answer: str = Field(description="Expected answer text")
    # Recorded for documentation only; per the description nothing asserts it.
    expected_tools: list[str] = Field(
        default_factory=list,
        description="Tools the agent should call (informational; not asserted)",
    )
    # Selects the comparison strategy; restricted to these three literals.
    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
        description="How to compare actual vs expected"
    )
    difficulty: Literal["easy", "medium", "hard"] = Field(
        default="easy",
        description="Difficulty level",
    )
    notes: str = Field(default="", description="Why this test case exists")
29+
30+
31+
class EvalResult(BaseModel):
    """Result of evaluating a single test case.

    Carries the case's identifying fields alongside what the agent actually
    produced, how long it took, and the pass/fail verdict.
    """

    case_id: str = Field(description="Test case ID")
    question: str = Field(description="The question asked")
    category: str = Field(description="Test category")
    # Plain ``str`` here, so values other than easy/medium/hard are accepted.
    difficulty: str = Field(description="Difficulty level")
    expected_answer: str = Field(description="Expected answer")
    actual_answer: str = Field(description="Agent's actual answer")
    tools_called: list[str] = Field(
        default_factory=list,
        description="Tools the agent invoked",
    )
    reasoning_trace: list[str] = Field(
        default_factory=list,
        description="Chain of thought steps",
    )
    latency_ms: int = Field(description="Wall clock time in ms")
    pass_result: bool = Field(description="Whether the test passed")
    # Optional: defaults to ``None`` when no judge score is recorded.
    score: float | None = Field(
        default=None,
        description="LLM judge score for semantic_similar",
    )
    # Optional: defaults to ``None`` when no failure reason is recorded.
    failure_reason: str | None = Field(
        default=None,
        description="Why the test failed",
    )

src/eval/report.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Generate a markdown evaluation report from results."""
2+
3+
from __future__ import annotations
4+
5+
import logging
6+
from collections import defaultdict
7+
from typing import TYPE_CHECKING
8+
9+
if TYPE_CHECKING:
10+
from src.eval.models import EvalResult
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def _accuracy_rows(groups: dict[str, list[EvalResult]], order: list[str]) -> list[str]:
    """Render one ``| name | passed | total | rate |`` table row per group, in *order*."""
    rows: list[str] = []
    for name in order:
        members = groups[name]
        hits = sum(1 for r in members if r.pass_result)
        # Groups are built by appending results, so ``members`` is never empty.
        rate = hits / len(members) * 100
        rows.append(f"| {name} | {hits} | {len(members)} | {rate:.0f}% |")
    return rows


def generate_report(results: list[EvalResult]) -> str:
    """Generate a markdown report from evaluation results.

    Sections: overall accuracy, accuracy by category (sorted by name),
    accuracy by difficulty, and — only when something failed — a failure
    analysis with one subsection per failing case.

    The difficulty table lists ``easy``, ``medium`` and ``hard`` first (when
    present), followed by any other difficulty values in sorted order, so
    every result is counted in the table and its totals match the overall
    count.

    Args:
        results: Evaluated cases; may be empty, in which case the tables
            contain only their headers and the overall rate is 0.0%.

    Returns:
        The report as a single markdown string.
    """
    total = len(results)
    passed = sum(1 for r in results if r.pass_result)
    pct = (passed / total * 100) if total > 0 else 0

    by_cat: dict[str, list[EvalResult]] = defaultdict(list)
    by_diff: dict[str, list[EvalResult]] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)
        by_diff[r.difficulty].append(r)

    # Canonical difficulties keep their natural order; anything else is
    # appended rather than silently dropped from the table.
    known = ["easy", "medium", "hard"]
    diff_order = [d for d in known if d in by_diff]
    diff_order += sorted(d for d in by_diff if d not in known)

    lines: list[str] = [
        "# Evaluation Report\n",
        "## Overall Accuracy\n",
        f"**{passed}/{total} passed ({pct:.1f}%)**\n",
        "## Accuracy by Category\n",
        "| Category | Passed | Total | Rate |",
        "|---|---|---|---|",
        *_accuracy_rows(by_cat, sorted(by_cat)),
        "",
        "## Accuracy by Difficulty\n",
        "| Difficulty | Passed | Total | Rate |",
        "|---|---|---|---|",
        *_accuracy_rows(by_diff, diff_order),
        "",
    ]

    failures = [r for r in results if not r.pass_result]
    if failures:
        lines.append("## Failure Analysis\n")
        for r in failures:
            lines.append(f"### {r.case_id} ({r.category}/{r.difficulty})\n")
            lines.append(f"**Question:** {r.question}\n")
            lines.append(f"**Expected:** {r.expected_answer}\n")
            lines.append(f"**Actual:** {r.actual_answer}\n")
            lines.append(f"**Reason:** {r.failure_reason}\n")

    return "\n".join(lines)

0 commit comments

Comments
 (0)