diff --git a/__tests__/golden-seed.test.ts b/__tests__/golden-seed.test.ts
new file mode 100644
index 000000000..4e113efbc
--- /dev/null
+++ b/__tests__/golden-seed.test.ts
@@ -0,0 +1,58 @@
+/**
+ * GEPA golden-seed: proves the seed source that bridges the hand-graded static
+ * cases into the evaluation_golden_cases table (which the eval/PGR benchmark reads)
+ * is complete, deduplicable, and maps cleanly to a DB insert row. Pure - no DB.
+ */
+import { describe, expect, it } from "vitest";
+import { getSeedGoldenCases, toGoldenCaseInsert } from "@/lib/eval/golden-seed";
+import { GOLDEN_CASES_V1 } from "@/lib/eval/golden-cases-v1";
+import { GOLDEN_CASES_RECOVERY_V1 } from "@/lib/eval/golden-cases-recovery-v1";
+
+describe("golden-seed - bridges the hand-graded cases into the eval benchmark", () => {
+  const cases = getSeedGoldenCases();
+
+  it("combines both static sets with no loss", () => {
+    expect(cases.length).toBe(GOLDEN_CASES_V1.length + GOLDEN_CASES_RECOVERY_V1.length);
+    expect(cases.length).toBeGreaterThan(100);
+  });
+
+  it("covers the four revenue workflows plus morning-brief the benchmark grades", () => {
+    const formations = new Set(cases.map((c) => c.formation_key));
+    for (const f of [
+      "lapsed-winback",
+      "estimate-recovery",
+      "review-lift",
+      "slot-rescue",
+      "morning-brief",
+    ]) {
+      expect(formations.has(f)).toBe(true);
+    }
+  });
+
+  it("has unique titles - the idempotency key the seed dedups on", () => {
+    const titles = cases.map((c) => c.title);
+    expect(new Set(titles).size).toBe(titles.length);
+  });
+
+  it("maps to a DB insert row that drops the client id/timestamps and keeps graded dims", () => {
+    // Copy into an indexable record so absent keys (id, timestamps) can be probed by name.
+    const row: Record<string, unknown> = { ...toGoldenCaseInsert(cases[0]) };
+    expect(row.id).toBeUndefined();
+    expect(row.created_at).toBeUndefined();
+    expect(row.updated_at).toBeUndefined();
+    expect(row.title).toBe(cases[0].title);
+    expect(row.formation_key).toBe(cases[0].formation_key);
+    expect(row.expected_min_quality).toBe(cases[0].expected_min_quality);
+    expect(row.is_active).toBe(true);
+  });
+
+  it("every seeded row carries a reference_verdict or graded thresholds (real grading, not schema-only)", () => {
+    for (const c of cases) {
+      const graded =
+        c.reference_verdict != null ||
+        c.expected_evidence_quality != null ||
+        c.expected_reasoning_depth != null;
+      expect(graded).toBe(true);
+    }
+  });
+});
diff --git a/lib/eval/golden-seed.ts b/lib/eval/golden-seed.ts
new file mode 100644
index 000000000..67bc3600a
--- /dev/null
+++ b/lib/eval/golden-seed.ts
@@ -0,0 +1,66 @@
+/**
+ * Golden-set seed source (GEPA benchmark keystone).
+ *
+ * The eval/PGR benchmark runner reads golden cases from the
+ * `evaluation_golden_cases` Supabase table (see lib/eval/golden-dataset.ts).
+ * The 100+ hand-graded cases live as static TS constants
+ * (GOLDEN_CASES_V1 + GOLDEN_CASES_RECOVERY_V1) but were never bridged into that
+ * table - so run-first-eval.ts bails with "No active golden cases found. Seed
+ * cases first." This module is the single seed source; scripts/seed-golden-cases.ts
+ * loads it. Pure (no IO) so it can be unit-tested without a database.
+ */
+import type { GoldenCase } from "./types";
+import { GOLDEN_CASES_V1 } from "./golden-cases-v1";
+import { GOLDEN_CASES_RECOVERY_V1 } from "./golden-cases-recovery-v1";
+
+/** DB insert payload for evaluation_golden_cases (the table generates id + timestamps). */
+export interface GoldenCaseInsert {
+  title: string;
+  question: string;
+  formation_key: string;
+  expected_min_quality: number;
+  expected_evidence_quality: number | null;
+  expected_reasoning_depth: number | null;
+  expected_novelty: number | null;
+  expected_contrarian_value: number | null;
+  reference_verdict: string | null;
+  sector_id: string | null;
+  tags: string[];
+  is_active: boolean;
+}
+
+/**
+ * The full hand-graded golden set the benchmark grades against: the four revenue
+ * workflows (lapsed-winback, estimate-recovery, review-lift, slot-rescue) plus the
+ * general workflows including morning-brief, churn-prediction, and isa-routing.
+ *
+ * @returns A new array: the GOLDEN_CASES_V1 entries followed by the GOLDEN_CASES_RECOVERY_V1 entries.
+ */
+export function getSeedGoldenCases(): GoldenCase[] {
+  return [...GOLDEN_CASES_V1, ...GOLDEN_CASES_RECOVERY_V1];
+}
+
+/**
+ * Map a static GoldenCase to a DB insert row. Drops the client-side string `id`
+ * (the table uses a UUID default) and the created_at/updated_at timestamps (DB
+ * defaults). Title is the natural idempotency key the seed dedups on.
+ *
+ * @param c - Static golden case to convert.
+ * @returns The insert payload; `tags` is the same array instance as `c.tags`, not a copy.
+ */
+export function toGoldenCaseInsert(c: GoldenCase): GoldenCaseInsert {
+  return {
+    title: c.title,
+    question: c.question,
+    formation_key: c.formation_key,
+    expected_min_quality: c.expected_min_quality,
+    expected_evidence_quality: c.expected_evidence_quality,
+    expected_reasoning_depth: c.expected_reasoning_depth,
+    expected_novelty: c.expected_novelty,
+    expected_contrarian_value: c.expected_contrarian_value,
+    reference_verdict: c.reference_verdict,
+    sector_id: c.sector_id,
+    tags: c.tags,
+    is_active: c.is_active,
+  };
+}
diff --git a/scripts/seed-golden-cases.ts b/scripts/seed-golden-cases.ts
new file mode 100644
index 000000000..0bf14f19f
--- /dev/null
+++ b/scripts/seed-golden-cases.ts
@@ -0,0 +1,78 @@
+/**
+ * Seed the hand-graded golden cases into `evaluation_golden_cases` so the eval/PGR
+ * benchmark can actually run. Without this, run-first-eval.ts bails with "No active
+ * golden cases found. Seed cases first." - the graded content existed only as static
+ * TS constants, orphaned from the table the runner reads.
+ *
+ * Idempotent: inserts only cases whose title is not already present (safe to re-run;
+ * never deletes or overwrites). Founder-gated on prod (writes to Supabase); use
+ * --dry-run to preview.
+ *
+ * Usage: npx tsx scripts/seed-golden-cases.ts [--dry-run]
+ * Requires NEXT_PUBLIC_SUPABASE_URL + SUPABASE_SERVICE_ROLE_KEY in .env.local
+ */
+import { createClient } from "@supabase/supabase-js";
+import { getSeedGoldenCases, toGoldenCaseInsert } from "../lib/eval/golden-seed";
+
+// The `!` assertions only satisfy the type checker; the guard below is the real check.
+const SUPABASE_URL = process.env.NEXT_PUBLIC_SUPABASE_URL!;
+const SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY!;
+const dryRun = process.argv.includes("--dry-run");
+
+if (!SUPABASE_URL || !SERVICE_ROLE_KEY) {
+  console.error("Missing NEXT_PUBLIC_SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY in .env.local");
+  process.exit(1);
+}
+
+/**
+ * Reads the titles already in `evaluation_golden_cases`, then inserts the seed
+ * cases whose title is missing. With --dry-run it only lists the pending cases
+ * (the table is still read, so credentials are required either way). Exits with
+ * code 1 when the read or the insert reports an error.
+ */
+async function main() {
+  const cases = getSeedGoldenCases();
+  const workflows = new Set(cases.map((c) => c.formation_key));
+  console.log(
+    `${dryRun ? "=== DRY RUN - " : "=== "}golden seed: ${cases.length} hand-graded cases across ${workflows.size} workflows ===`,
+  );
+
+  const supabase = createClient(SUPABASE_URL, SERVICE_ROLE_KEY);
+  // NOTE(review): this read has no explicit limit or pagination - confirm the API's
+  // default row cap cannot truncate it, or the title dedup below would miss rows.
+  const { data: existing, error } = await supabase
+    .from("evaluation_golden_cases")
+    .select("title");
+  if (error) {
+    console.error("Failed to read existing golden cases:", error.message);
+    process.exit(1);
+  }
+
+  const existingTitles = new Set((existing ?? []).map((r: { title: string }) => r.title));
+  const toInsert = cases.filter((c) => !existingTitles.has(c.title));
+  console.log(` ${existingTitles.size} already present, ${toInsert.length} to insert`);
+
+  if (dryRun) {
+    for (const c of toInsert) console.log(` + [${c.formation_key}] ${c.title}`);
+    console.log("\nRe-run without --dry-run to seed. Then run scripts/run-first-eval.ts for PGR baselines.");
+    return;
+  }
+
+  if (toInsert.length === 0) {
+    console.log("Nothing to seed - the golden set is already up to date.");
+    return;
+  }
+
+  const rows = toInsert.map(toGoldenCaseInsert);
+  const { error: insErr } = await supabase.from("evaluation_golden_cases").insert(rows);
+  if (insErr) {
+    console.error("Insert failed:", insErr.message);
+    process.exit(1);
+  }
+  console.log(`Seeded ${rows.length} golden cases. Run scripts/run-first-eval.ts to establish PGR baselines.`);
+}
+
+main().catch((err) => {
+  console.error("Fatal error:", err);
+  process.exit(1);
+});