|
| 1 | +"""LLM judge for semantic-similarity evaluation — provider-agnostic. |
| 2 | +
|
| 3 | +The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire |
| 4 | +your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the |
| 5 | +eval harness stays decoupled. The default behaviour when no client is wired |
| 6 | +is `(None, "no LLM client configured")`, which the runner treats as |
| 7 | +inconclusive rather than a hard failure. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import json |
| 13 | +import logging |
| 14 | +from typing import Protocol |
| 15 | + |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the judge model. It is rendered with ``str.format``
# using the ``question``, ``expected_answer`` and ``actual_answer`` fields.
# The doubled braces around the JSON example are ``str.format`` escapes and
# render as single literal braces in the final prompt.
_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
answer, and an actual answer from an AI agent, score how well the actual
answer matches the expected answer.

Question: {question}
Expected answer: {expected_answer}
Actual answer: {actual_answer}

Respond with JSON only:
{{
 "score": <float 0.0 to 1.0>,
 "explanation": "<one line explaining the score>"
}}

Scoring guide:
- 1.0: Semantically identical, same information conveyed
- 0.8-0.9: Correct answer with minor wording or formatting differences
- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
- 0.1-0.4: Substantially wrong but shows some relevant information
- 0.0: Completely wrong or unrelated"""
| 38 | + |
| 39 | + |
class LLMClient(Protocol):
    """Structural interface the judge requires from an LLM backend.

    Any object exposing a matching ``complete_json`` method satisfies this
    Protocol, so adapters for OpenAI, Anthropic, Azure OpenAI, or a
    self-hosted vLLM endpoint can be plugged in without this module
    importing a vendor-specific SDK.
    """

    def complete_json(self, *, model: str, prompt: str) -> str:
        """Submit *prompt* to *model* and return the raw JSON response text.

        Both arguments are keyword-only.
        """
        ...
| 51 | + |
| 52 | + |
def evaluate_semantic_similarity(
    question: str,
    expected: str,
    actual: str,
    client: LLMClient | None,
    model: str,
) -> tuple[float | None, str]:
    """Score semantic similarity between expected and actual answers.

    Args:
        question: The question that was posed to the agent.
        expected: The reference answer.
        actual: The answer the agent produced.
        client: LLM adapter used to run the judge prompt, or ``None`` when
            no judge is configured.
        model: Model identifier forwarded to ``client.complete_json``.

    Returns:
        ``(score, explanation)`` with ``score`` clamped to ``[0.0, 1.0]``.
        On any failure -- no client, the client raising, a body that is not
        valid JSON, a body that is not a JSON object, a missing or
        non-numeric ``"score"``, or a NaN score -- returns ``(None, error)``.
        The caller treats that as inconclusive, not a fail.
    """
    if client is None:
        return (None, "no LLM client configured")

    prompt = _JUDGE_PROMPT.format(
        question=question,
        expected_answer=expected,
        actual_answer=actual,
    )

    # Broad catch is deliberate: the client is an arbitrary adapter and any
    # failure here must degrade to "inconclusive" instead of aborting the run.
    try:
        body = client.complete_json(model=model, prompt=prompt)
        parsed = json.loads(body)
        if not isinstance(parsed, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        # A missing score is a malformed response, not a 0.0 ("completely
        # wrong") verdict, so it must not be defaulted.
        if "score" not in parsed:
            raise ValueError("judge response has no 'score' field")
        score = float(parsed["score"])
        # json.loads accepts the NaN literal, and min(1.0, nan) evaluates to
        # 1.0, so an unchecked NaN would be reported as a perfect match.
        # NaN is the only float that compares unequal to itself.
        if score != score:
            raise ValueError("judge returned a NaN score")
        explanation = str(parsed.get("explanation", "No explanation"))
        return (max(0.0, min(1.0, score)), explanation)
    except Exception as exc:
        logger.exception("LLM judge call failed")
        return (None, f"Judge call failed: {exc}")
0 commit comments