Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .github/workflows/eval-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Eval harness nightly — disabled-by-default.
#
# This workflow runs the golden QA dataset against the agent / LLM loop. It
# is `workflow_dispatch`-only by default to prevent accidental LLM API
# spend. To enable nightly runs:
#
# 1. Set the LLM secrets in repo settings (LLM_API_KEY at minimum;
# LLM_BASE_URL / LLM_MODEL / LLM_PROVIDER if your judge differs from
# OpenAI defaults).
# 2. Replace the `on:` block below with:
#
# on:
# schedule:
# - cron: "0 6 * * *" # daily 06:00 UTC
# workflow_dispatch:
#
# 3. Confirm `eval-nightly.yml` is listed in EXEMPT_WORKFLOWS in
# `.github/scripts/check_required_contexts.py`, and add it if it is
# missing (it is listed by default — scheduled runs never gate PRs).
#
# See docs/EVAL_HARNESS.md for the full setup story.

name: Eval nightly

# Manual trigger only; the header comment above describes how to add a
# schedule once the LLM secrets are configured.
on:
  workflow_dispatch:
    inputs:
      python_version:
        description: "Python version to use"
        required: false
        default: "3.14"

# Read-only token: the job only checks out the repository and runs tests.
permissions:
  contents: read

jobs:
  eval:
    name: Run golden QA dataset
    runs-on: ubuntu-latest
    steps:
      # Actions are pinned to full commit SHAs; the trailing comment records
      # the release tag each SHA corresponds to.
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
        with:
          # Falls back to 3.14 when the dispatch input is empty.
          python-version: ${{ inputs.python_version || '3.14' }}
      - run: uv sync --frozen --extra dev
      - name: Run pytest eval/
        # LLM settings come from repository secrets and are exposed to this
        # step only.
        env:
          LLM_PROVIDER: ${{ secrets.LLM_PROVIDER }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
        run: uv run pytest eval/ -v
11 changes: 11 additions & 0 deletions eval/golden_qa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"id": "echo-hello",
"question": "echo hello",
"category": "smoke",
"expected_answer": "hello",
"tolerance": "exact_match",
"difficulty": "easy",
"notes": "Trivial example: exercises the runner without an LLM call."
}
]
49 changes: 49 additions & 0 deletions eval/test_golden_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Parametrised evaluation tests from the golden QA dataset.

Marked with ``@pytest.mark.eval`` so the default ``just check`` /
``pytest tests/`` invocations skip them. Run explicitly with::

uv run pytest eval/

Wire a real LLM-backed ``answer_fn`` by editing ``_answer_fn`` below or
constructing ``EvalRunner`` with your own callable. The default placeholder
strips a leading ``echo `` from the question and returns the rest, which
makes the trivial example case (``echo hello`` expects ``hello``) pass
without any LLM credentials.
"""

from __future__ import annotations

import pytest

from src.eval.models import EvalCase
from src.eval.runner import EvalRunner, load_golden_dataset

# Load cases without initialising a runner (no side effects at import time)
golden = load_golden_dataset()


def _answer_fn(question: str) -> str:
    """Stand-in agent used until a real LLM-backed callable is wired in.

    Drops one leading ``echo `` from *question* (if present) and returns the
    remainder with surrounding whitespace removed, so the bundled
    ``echo hello`` example yields ``hello``.
    """
    prefix = "echo "
    if question.startswith(prefix):
        remainder = question[len(prefix):]
    else:
        remainder = question
    return remainder.strip()


@pytest.fixture(scope="module")
def runner() -> EvalRunner:
    """Build one ``EvalRunner`` per module, backed by the placeholder ``_answer_fn``."""
    shared_runner = EvalRunner(answer_fn=_answer_fn)
    return shared_runner


@pytest.mark.eval
@pytest.mark.parametrize("case", golden, ids=lambda c: c.id)
def test_golden_qa(case: EvalCase, runner: EvalRunner) -> None:
    """Run one golden QA case through the shared runner and require a pass."""
    outcome = runner.evaluate(case)

    def _failure_message() -> str:
        # Rendered lazily: only evaluated when the assertion below fails.
        return (
            f"[{case.id}] {case.category}/{case.difficulty}\n"
            f"Q: {case.question}\n"
            f"Expected: {case.expected_answer}\n"
            f"Got: {outcome.actual_answer}\n"
            f"Reason: {outcome.failure_reason}"
        )

    assert outcome.pass_result, _failure_message()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ info = "See docs/DEVELOPMENT.md#commit-messages for the allowed prefixes."

[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["."]
asyncio_mode = "auto"
timeout = 30
markers = [
Expand Down
31 changes: 31 additions & 0 deletions src/eval/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Eval-harness CLI: ``python -m src.eval`` runs the golden dataset and
prints the markdown report. The default ``answer_fn`` echoes the question
verbatim — wire your real agent loop in by importing this module and
constructing ``EvalRunner(answer_fn=your_callable)``.
"""

from __future__ import annotations

from src.eval.report import generate_report
from src.eval.runner import EvalRunner


def _identity_answer(question: str) -> str:
    """Default placeholder agent for the CLI.

    Behaves like a shell ``echo``: a leading ``echo `` is dropped and the
    rest of the question is returned with surrounding whitespace removed.
    Questions without that prefix come back unchanged apart from the
    whitespace trim.

    This mirrors the placeholder used by ``eval/test_golden_qa.py`` so the
    bundled example case (``echo hello`` expects ``hello``) resolves to its
    expected answer here too, instead of the CLI reporting a failure and
    exiting non-zero on the shipped dataset. The name is kept for callers
    that already reference it.

    Real users wire a ``Callable[[str], str]`` that hits their LLM / agent
    loop. This default keeps the CLI runnable without LLM credentials.
    """
    return question.removeprefix("echo ").strip()


def main() -> int:
    """Evaluate the golden dataset with the placeholder agent and print the report.

    Returns the process exit status: ``0`` when every case passed, ``1`` as
    soon as at least one case did not.
    """
    results = EvalRunner(answer_fn=_identity_answer).evaluate_all()
    print(generate_report(results))
    any_failed = any(not r.pass_result for r in results)
    return 1 if any_failed else 0


if __name__ == "__main__":
raise SystemExit(main())
82 changes: 82 additions & 0 deletions src/eval/judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""LLM judge for semantic-similarity evaluation — provider-agnostic.

The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
eval harness stays decoupled. The default behaviour when no client is wired
is `(None, "no LLM client configured")`, which the runner treats as
inconclusive rather than a hard failure.
"""

from __future__ import annotations

import json
import logging
from typing import Protocol

logger = logging.getLogger(__name__)

# Prompt template for the LLM judge. It is filled in with ``str.format``
# (placeholders: question, expected_answer, actual_answer), which is why the
# literal braces of the JSON example are doubled.
_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
answer, and an actual answer from an AI agent, score how well the actual
answer matches the expected answer.

Question: {question}
Expected answer: {expected_answer}
Actual answer: {actual_answer}

Respond with JSON only:
{{
"score": <float 0.0 to 1.0>,
"explanation": "<one line explaining the score>"
}}

Scoring guide:
- 1.0: Semantically identical, same information conveyed
- 0.8-0.9: Correct answer with minor wording or formatting differences
- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
- 0.1-0.4: Substantially wrong but shows some relevant information
- 0.0: Completely wrong or unrelated"""


class LLMClient(Protocol):
    """Structural interface the judge expects from an LLM client.

    Any object exposing a matching ``complete_json`` method satisfies it, so
    the eval harness never has to import a vendor SDK (OpenAI, Anthropic,
    Azure OpenAI, a self-hosted vLLM endpoint, ...) itself.
    """

    def complete_json(self, *, model: str, prompt: str) -> str:
        """Submit *prompt* to *model*; return the raw JSON response body as text."""
        ...


def evaluate_semantic_similarity(
    question: str,
    expected: str,
    actual: str,
    client: LLMClient | None,
    model: str,
) -> tuple[float | None, str]:
    """Score semantic similarity between expected and actual answers.

    Args:
        question: The question that was put to the agent.
        expected: The reference answer.
        actual: The answer the agent produced.
        client: LLM client used as the judge, or ``None`` when none is wired.
        model: Model identifier forwarded to ``client.complete_json``.

    Returns:
        ``(score, explanation)`` with ``score`` clamped to ``[0.0, 1.0]``.
        Returns ``(None, error)`` when no verdict could be obtained: no
        client, the client call raised, the reply was not a JSON object, or
        the reply carried no finite numeric ``score``. The caller treats
        ``None`` as inconclusive, not as a fail.
    """
    import math  # local import: needed only for the finite-score check

    if client is None:
        return (None, "no LLM client configured")

    prompt = _JUDGE_PROMPT.format(
        question=question,
        expected_answer=expected,
        actual_answer=actual,
    )

    # The client is provider-agnostic, so any exception type may surface
    # here; this is the boundary where it is logged and made inconclusive.
    try:
        body = client.complete_json(model=model, prompt=prompt)
    except Exception as exc:
        logger.exception("LLM judge call failed")
        return (None, f"Judge call failed: {exc}")

    try:
        parsed = json.loads(body)
    except (TypeError, ValueError) as exc:
        logger.exception("LLM judge returned unparseable JSON")
        return (None, f"Judge call failed: {exc}")

    if not isinstance(parsed, dict):
        return (None, "Judge call failed: response is not a JSON object")

    # A missing or malformed score is a judge failure, not a score of 0.0.
    raw_score = parsed.get("score")
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        return (None, f"Judge call failed: missing or non-numeric score {raw_score!r}")

    # json.loads accepts NaN/Infinity; clamping NaN with min()/max() would
    # silently turn it into a perfect 1.0.
    if not math.isfinite(score):
        return (None, f"Judge call failed: non-finite score {raw_score!r}")

    explanation = str(parsed.get("explanation", "No explanation"))
    return (max(0.0, min(1.0, score)), explanation)
57 changes: 57 additions & 0 deletions src/eval/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Pydantic models for the evaluation harness."""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, Field


class EvalCase(BaseModel):
    """One entry of the golden QA dataset: the input, the reference answer,
    and the rule used to compare the agent's answer against it."""

    # Identity and input.
    id: str = Field(description="Unique test case identifier")
    question: str = Field(description="Natural language question / input")
    category: str = Field(description="Test category", default="general")

    # Expectations.
    expected_answer: str = Field(description="Expected answer text")
    expected_tools: list[str] = Field(
        description="Tools the agent should call (informational; not asserted)",
        default_factory=list,
    )

    # Comparison rule; required, and restricted to the three listed modes.
    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
        description="How to compare actual vs expected",
    )

    # Bookkeeping.
    difficulty: Literal["easy", "medium", "hard"] = Field(
        description="Difficulty level",
        default="easy",
    )
    notes: str = Field(description="Why this test case exists", default="")


class EvalResult(BaseModel):
    """Outcome of running a single test case through the harness."""

    # Fields describing the evaluated case.
    case_id: str = Field(description="Test case ID")
    question: str = Field(description="The question asked")
    category: str = Field(description="Test category")
    difficulty: str = Field(description="Difficulty level")
    expected_answer: str = Field(description="Expected answer")

    # What the agent produced.
    actual_answer: str = Field(description="Agent's actual answer")
    tools_called: list[str] = Field(
        description="Tools the agent invoked",
        default_factory=list,
    )
    reasoning_trace: list[str] = Field(
        description="Chain of thought steps",
        default_factory=list,
    )
    latency_ms: int = Field(description="Wall clock time in ms")

    # Verdict. ``score`` and ``failure_reason`` are optional and default to None.
    pass_result: bool = Field(description="Whether the test passed")
    score: float | None = Field(
        description="LLM judge score for semantic_similar",
        default=None,
    )
    failure_reason: str | None = Field(
        description="Why the test failed",
        default=None,
    )
75 changes: 75 additions & 0 deletions src/eval/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Generate a markdown evaluation report from results."""

from __future__ import annotations

import logging
from collections import defaultdict
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from src.eval.models import EvalResult

logger = logging.getLogger(__name__)


def _accuracy_table(
    label: str,
    groups: dict[str, list[EvalResult]],
    order: list[str],
) -> list[str]:
    """Render one "Accuracy by <label>" section as markdown lines, one row per key in *order*."""
    section = [
        f"## Accuracy by {label}\n",
        f"| {label} | Passed | Total | Rate |",
        "|---|---|---|---|",
    ]
    for key in order:
        bucket = groups[key]
        bucket_total = len(bucket)
        bucket_passed = sum(1 for r in bucket if r.pass_result)
        rate = (bucket_passed / bucket_total * 100) if bucket_total > 0 else 0
        section.append(f"| {key} | {bucket_passed} | {bucket_total} | {rate:.0f}% |")
    section.append("")  # blank line that closes the table
    return section


def generate_report(results: list[EvalResult]) -> str:
    """Generate a markdown report from evaluation results.

    Sections: overall accuracy, by category, by difficulty, failure analysis.
    The failure-analysis section is emitted only when at least one result
    did not pass.

    Categories are listed alphabetically. Difficulties are listed as
    easy, medium, hard, followed by any other difficulty values present in
    *results* (alphabetically), so that no result is silently left out of
    the table.

    Args:
        results: Evaluated cases; may be empty.

    Returns:
        The report as a single markdown string.
    """
    standard_difficulties = ("easy", "medium", "hard")

    total = len(results)
    passed = sum(1 for r in results if r.pass_result)
    pct = (passed / total * 100) if total > 0 else 0

    lines: list[str] = [
        "# Evaluation Report\n",
        "## Overall Accuracy\n",
        f"**{passed}/{total} passed ({pct:.1f}%)**\n",
    ]

    by_cat: dict[str, list[EvalResult]] = defaultdict(list)
    by_diff: dict[str, list[EvalResult]] = defaultdict(list)
    for r in results:
        by_cat[r.category].append(r)
        by_diff[r.difficulty].append(r)

    lines.extend(_accuracy_table("Category", by_cat, sorted(by_cat)))

    # EvalResult.difficulty is a plain string, so values outside the standard
    # three can occur; append them after the standard ones instead of dropping them.
    difficulty_order = [d for d in standard_difficulties if d in by_diff]
    difficulty_order += sorted(d for d in by_diff if d not in standard_difficulties)
    lines.extend(_accuracy_table("Difficulty", by_diff, difficulty_order))

    failures = [r for r in results if not r.pass_result]
    if failures:
        lines.append("## Failure Analysis\n")
        for r in failures:
            lines.append(f"### {r.case_id} ({r.category}/{r.difficulty})\n")
            lines.append(f"**Question:** {r.question}\n")
            lines.append(f"**Expected:** {r.expected_answer}\n")
            lines.append(f"**Actual:** {r.actual_answer}\n")
            lines.append(f"**Reason:** {r.failure_reason}\n")

    return "\n".join(lines)
Loading
Loading