Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions __tests__/golden-seed.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
* GEPA golden-seed: proves the seed source that bridges the hand-graded static
* cases into the evaluation_golden_cases table (which the eval/PGR benchmark reads)
* is complete, deduplicable, and maps cleanly to a DB insert row. Pure - no DB.
*/
import { describe, expect, it } from "vitest";
import { getSeedGoldenCases, toGoldenCaseInsert } from "@/lib/eval/golden-seed";
import { GOLDEN_CASES_V1 } from "@/lib/eval/golden-cases-v1";
import { GOLDEN_CASES_RECOVERY_V1 } from "@/lib/eval/golden-cases-recovery-v1";

// Pure checks over the seed source: everything below works on in-memory data only.
describe("golden-seed - bridges the hand-graded cases into the eval benchmark", () => {
  const seeded = getSeedGoldenCases();

  // The seed must be exactly the concatenation of the two static sets.
  it("combines both static sets with no loss", () => {
    const expectedTotal = GOLDEN_CASES_V1.length + GOLDEN_CASES_RECOVERY_V1.length;
    expect(seeded.length).toBe(expectedTotal);
    expect(seeded.length).toBeGreaterThan(100);
  });

  // Every workflow listed here must appear at least once as a formation_key.
  it("covers the four revenue workflows plus morning-brief the benchmark grades", () => {
    const seenFormations = new Set(seeded.map((goldenCase) => goldenCase.formation_key));
    const requiredFormations = [
      "lapsed-winback",
      "estimate-recovery",
      "review-lift",
      "slot-rescue",
      "morning-brief",
    ];
    requiredFormations.forEach((formation) => {
      expect(seenFormations.has(formation)).toBe(true);
    });
  });

  // Titles are what the seed script compares against the table, so they must not repeat.
  it("has unique titles - the idempotency key the seed dedups on", () => {
    const allTitles = seeded.map(({ title }) => title);
    const distinctTitles = new Set(allTitles);
    expect(distinctTitles.size).toBe(allTitles.length);
  });

  // Only the first case is mapped; it stands in for the shape of every row.
  it("maps to a DB insert row that drops the client id/timestamps and keeps graded dims", () => {
    const first = seeded[0];
    const inserted = toGoldenCaseInsert(first) as Record<string, unknown>;

    // Fields the table generates itself must not be sent.
    expect(inserted.id).toBeUndefined();
    expect(inserted.created_at).toBeUndefined();
    expect(inserted.updated_at).toBeUndefined();

    // Graded content must pass through unchanged.
    expect(inserted.title).toBe(first.title);
    expect(inserted.formation_key).toBe(first.formation_key);
    expect(inserted.expected_min_quality).toBe(first.expected_min_quality);
    expect(inserted.is_active).toBe(true);
  });

  // A case counts as graded when at least one of the three signals is non-null.
  it("every seeded row carries a reference_verdict or graded thresholds (real grading, not schema-only)", () => {
    for (const goldenCase of seeded) {
      const gradingSignals = [
        goldenCase.reference_verdict,
        goldenCase.expected_evidence_quality,
        goldenCase.expected_reasoning_depth,
      ];
      expect(gradingSignals.some((signal) => signal != null)).toBe(true);
    }
  });
});
61 changes: 61 additions & 0 deletions lib/eval/golden-seed.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
* Golden-set seed source (GEPA benchmark keystone).
*
* The eval/PGR benchmark runner reads golden cases from the
* `evaluation_golden_cases` Supabase table (see lib/eval/golden-dataset.ts).
* The 100+ hand-graded cases live as static TS constants
* (GOLDEN_CASES_V1 + GOLDEN_CASES_RECOVERY_V1) but were never bridged into that
* table - so run-first-eval.ts bails with "No active golden cases found. Seed
* cases first." This module is the single seed source; scripts/seed-golden-cases.ts
* loads it. Pure (no IO) so it can be unit-tested without a database.
*/
import type { GoldenCase } from "./types";
import { GOLDEN_CASES_V1 } from "./golden-cases-v1";
import { GOLDEN_CASES_RECOVERY_V1 } from "./golden-cases-recovery-v1";

/**
 * DB insert payload for evaluation_golden_cases (the table generates id + timestamps).
 *
 * Produced by toGoldenCaseInsert from a static GoldenCase; it deliberately has no
 * `id`, `created_at` or `updated_at` members.
 */
export interface GoldenCaseInsert {
/** Case title; the seed script compares titles to decide which cases are already stored. */
title: string;
/** The question text of the case, copied through unchanged. */
question: string;
/** Workflow the case belongs to, e.g. "lapsed-winback" or "morning-brief". */
formation_key: string;
/** Minimum quality threshold; the only graded field that is never null. NOTE(review): scale/range not visible here - confirm. */
expected_min_quality: number;
/** Optional graded threshold for evidence quality; null when not graded. */
expected_evidence_quality: number | null;
/** Optional graded threshold for reasoning depth; null when not graded. */
expected_reasoning_depth: number | null;
/** Optional graded threshold for novelty; null when not graded. */
expected_novelty: number | null;
/** Optional graded threshold for contrarian value; null when not graded. */
expected_contrarian_value: number | null;
/** Optional reference verdict for the case; null when none was recorded. */
reference_verdict: string | null;
/** Optional sector identifier; null when the case is not tied to a sector. */
sector_id: string | null;
/** Free-form tags carried over from the static case. */
tags: string[];
/** Whether the case is active. NOTE(review): the eval runner reportedly reads only active cases - confirm. */
is_active: boolean;
}

/**
 * Build the complete hand-graded golden set used as the seed source.
 *
 * Returns a fresh array holding every GOLDEN_CASES_V1 entry followed by every
 * GOLDEN_CASES_RECOVERY_V1 entry, in their original order. The case objects
 * themselves are shared with the static constants, not cloned.
 *
 * The accompanying test suite expects the result to cover the lapsed-winback,
 * estimate-recovery, review-lift, slot-rescue and morning-brief workflows.
 */
export function getSeedGoldenCases(): GoldenCase[] {
  const combined: GoldenCase[] = [];
  for (const staticSet of [GOLDEN_CASES_V1, GOLDEN_CASES_RECOVERY_V1]) {
    for (const goldenCase of staticSet) {
      combined.push(goldenCase);
    }
  }
  return combined;
}

/**
 * Map a static GoldenCase to a DB insert row.
 *
 * Drops the client-side string `id` (the table uses a UUID default) and the
 * created_at/updated_at timestamps (DB defaults); every other field is copied
 * through unchanged. Title is the natural idempotency key the seed dedups on.
 *
 * @param c - The static golden case to convert. It is not modified.
 * @returns A new row object that shares no mutable state with `c`.
 */
export function toGoldenCaseInsert(c: GoldenCase): GoldenCaseInsert {
  return {
    title: c.title,
    question: c.question,
    formation_key: c.formation_key,
    expected_min_quality: c.expected_min_quality,
    expected_evidence_quality: c.expected_evidence_quality,
    expected_reasoning_depth: c.expected_reasoning_depth,
    expected_novelty: c.expected_novelty,
    expected_contrarian_value: c.expected_contrarian_value,
    reference_verdict: c.reference_verdict,
    sector_id: c.sector_id,
    // Copy the array: the source cases are shared module-level constants, so a
    // caller mutating the row's tags must never be able to alter them.
    tags: [...c.tags],
    is_active: c.is_active,
  };
}
69 changes: 69 additions & 0 deletions scripts/seed-golden-cases.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* Seed the hand-graded golden cases into `evaluation_golden_cases` so the eval/PGR
* benchmark can actually run. Without this, run-first-eval.ts bails with "No active
* golden cases found. Seed cases first." - the graded content existed only as static
* TS constants, orphaned from the table the runner reads.
*
* Idempotent: inserts only cases whose title is not already present (safe to re-run;
* never deletes or overwrites). Founder-gated on prod (writes to Supabase); use
* --dry-run to preview.
*
* Usage: npx tsx scripts/seed-golden-cases.ts [--dry-run]
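* Requires NEXT_PUBLIC_SUPABASE_URL + SUPABASE_SERVICE_ROLE_KEY to be set in the environment.
* Note: this script reads process.env directly and does not load .env.local itself, so
* export the values from .env.local (or run through an env loader) before invoking it.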
*/
import { createClient } from "@supabase/supabase-js";
import { getSeedGoldenCases, toGoldenCaseInsert } from "../lib/eval/golden-seed";

// Connection settings are read straight from the process environment. The
// non-null assertions only satisfy the type checker; the guard below is the
// real runtime check.
const SUPABASE_URL = process.env["NEXT_PUBLIC_SUPABASE_URL"]!;
const SERVICE_ROLE_KEY = process.env["SUPABASE_SERVICE_ROLE_KEY"]!;

// Preview mode: when the flag is present anywhere on the command line, nothing is written.
const dryRun = process.argv.indexOf("--dry-run") !== -1;

// Abort before doing any work unless both credentials are non-empty.
if (!(SUPABASE_URL && SERVICE_ROLE_KEY)) {
  console.error("Missing NEXT_PUBLIC_SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY in .env.local");
  process.exit(1);
}

/**
 * Seed entry point: inserts every seed golden case whose title is not yet in
 * `evaluation_golden_cases`.
 *
 * Steps:
 *  1. Load the seed cases and print a summary banner.
 *  2. Read ALL existing titles from the table, page by page.
 *  3. Keep only the seed cases whose title is absent.
 *  4. In --dry-run mode list them and stop; otherwise insert them in one batch.
 *
 * Exits the process with code 1 when reading or inserting fails. Never updates
 * or deletes existing rows.
 *
 * NOTE(review): the read-then-insert sequence is not atomic; two concurrent
 * runs could both insert the same title unless the table enforces uniqueness
 * on `title` - confirm against the schema.
 */
async function main() {
  const cases = getSeedGoldenCases();
  const workflows = new Set(cases.map((c) => c.formation_key));
  console.log(
    `${dryRun ? "=== DRY RUN - " : "=== "}golden seed: ${cases.length} hand-graded cases across ${workflows.size} workflows ===`,
  );

  const supabase = createClient(SUPABASE_URL, SERVICE_ROLE_KEY);

  // Supabase caps the rows returned by a single request (1,000 by default), so
  // one unpaginated select could silently miss titles and cause duplicates to
  // be inserted on a re-run. Page through the table instead. The exact count
  // tells us when to stop without ever requesting a range past the end, and
  // advancing by the number of rows actually received keeps this correct even
  // if the server-side cap is lower than PAGE_SIZE.
  const PAGE_SIZE = 1000;
  const existingTitles = new Set<string>();
  let offset = 0;
  let total = Number.POSITIVE_INFINITY;
  while (offset < total) {
    const { data, error, count } = await supabase
      .from("evaluation_golden_cases")
      .select("title", { count: "exact" })
      // A stable order is required for offset paging; ties share the same
      // title, so which tied row lands on which page does not matter here.
      .order("title", { ascending: true })
      .range(offset, offset + PAGE_SIZE - 1);
    if (error) {
      console.error("Failed to read existing golden cases:", error.message);
      process.exit(1);
    }

    const page: { title: string }[] = data ?? [];
    if (page.length === 0) break;
    for (const row of page) existingTitles.add(row.title);

    offset += page.length;
    if (count != null) total = count;
  }

  const toInsert = cases.filter((c) => !existingTitles.has(c.title));
  // Report how many SEED cases were found in the table, not how many titles the
  // table holds overall (it may contain cases that are not part of the seed).
  const alreadyPresent = cases.length - toInsert.length;
  console.log(` ${alreadyPresent} already present, ${toInsert.length} to insert`);

  if (dryRun) {
    for (const c of toInsert) console.log(` + [${c.formation_key}] ${c.title}`);
    console.log("\nRe-run without --dry-run to seed. Then run scripts/run-first-eval.ts for PGR baselines.");
    return;
  }

  if (toInsert.length === 0) {
    console.log("Nothing to seed - the golden set is already up to date.");
    return;
  }

  const rows = toInsert.map(toGoldenCaseInsert);
  const { error: insErr } = await supabase.from("evaluation_golden_cases").insert(rows);
  if (insErr) {
    console.error("Insert failed:", insErr.message);
    process.exit(1);
  }
  console.log(`Seeded ${rows.length} golden cases. Run scripts/run-first-eval.ts to establish PGR baselines.`);
}

// Run the seed. Any rejection that escapes main() is logged and turned into a
// non-zero exit code so callers can detect the failure.
main().then(undefined, (fatal) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});