-
-
Notifications
You must be signed in to change notification settings - Fork 0
Writing Eval Cases
gaafa edited this page Apr 9, 2026
·
1 revision
→ Full docs: https://synapsekit.github.io/synapsekit-docs/docs/evalci/writing-evals
from synapsekit import eval_case
@eval_case(min_score=0.80, max_cost_usd=0.01, max_latency_ms=3000)
async def eval_my_pipeline():
result = await my_pipeline.run("some input")
score = compute_score(result)
return {
"score": score, # required if min_score set
"cost_usd": result.cost, # optional
"latency_ms": result.ms, # optional, auto-measured if omitted
}| Parameter | Type | Default | Description |
|---|---|---|---|
| min_score | float | None | Case fails if score < min_score |
| max_cost_usd | float | None | Case fails if cost_usd > max_cost_usd |
| max_latency_ms | float | None | Case fails if latency_ms > max_latency_ms |
| tags | list[str] | [] | Tags for filtering with --tag |
| Key | Type | Required | Description |
|---|---|---|---|
| score | float | Yes (if min_score set) | Quality score 0.0–1.0 |
| cost_usd | float | No | Cost in USD |
| latency_ms | float | No | Latency in ms (auto-measured if omitted) |
from synapsekit import eval_case, RelevancyMetric, FaithfulnessMetric


@eval_case(min_score=0.80)
async def eval_relevancy():
    """Score answer relevancy with an LLM-judged metric.

    Fails the case when the relevancy score drops below 0.80.
    """
    result = await pipeline.ask("What is RAG?")
    metric = RelevancyMetric(llm=llm)
    score = await metric.score(question="What is RAG?", answer=result.answer)
    return {"score": score, "cost_usd": result.cost_usd}
@eval_case(min_score=0.75)
async def eval_faithfulness():
result = await pipeline.ask("How many providers does SynapseKit support?")
metric = FaithfulnessMetric(llm=llm)
score = await metric.score(
question="How many providers?",
answer=result.answer,
contexts=result.source_documents,
)
return {"score": score}EvalCI only discovers files matching eval_*.py or *_eval.py:
eval_rag.py ✅
eval_agents.py ✅
rag_eval.py ✅
test_rag.py ❌ (not discovered)
Both async def and def are supported:
@eval_case(min_score=0.90)
def eval_deterministic():
    """Synchronous eval case: deterministic pass/fail quality score.

    Shows that plain ``def`` works as well as ``async def``; the score is
    1.0 when quality exceeds 0.9, else 0.0.
    """
    result = my_sync_pipeline("test input")
    return {"score": float(result.quality > 0.9)}