Relay-Launch · Victor "David" Medina (Victor-David-Medina) · Jul 2, 2026
diff --git a/__tests__/golden-seed.test.ts b/__tests__/golden-seed.test.ts
@@ -0,0 +1,57 @@
+/**
+ * GEPA golden-seed: proves the seed source that bridges the hand-graded static
+ * cases into the evaluation_golden_cases table (which the eval/PGR benchmark reads)
+ * is complete, deduplicable, and maps cleanly to a DB insert row. Pure - no DB.
+ */
+import { describe, expect, it } from "vitest";
+import { getSeedGoldenCases, toGoldenCaseInsert } from "@/lib/eval/golden-seed";
+import { GOLDEN_CASES_V1 } from "@/lib/eval/golden-cases-v1";
+import { GOLDEN_CASES_RECOVERY_V1 } from "@/lib/eval/golden-cases-recovery-v1";
+
+describe("golden-seed - bridges the hand-graded cases into the eval benchmark", () => {
+  const seeded = getSeedGoldenCases();
+
+  it("combines both static sets with no loss", () => {
+    const expectedTotal = GOLDEN_CASES_V1.length + GOLDEN_CASES_RECOVERY_V1.length;
+    expect(seeded.length).toBe(expectedTotal);
+    expect(seeded.length).toBeGreaterThan(100);
+  });
+
+  it("covers the four revenue workflows plus morning-brief the benchmark grades", () => {
+    const present = new Set(seeded.map((entry) => entry.formation_key));
+    const required = [
+      "lapsed-winback",
+      "estimate-recovery",
+      "review-lift",
+      "slot-rescue",
+      "morning-brief",
+    ];
+    required.forEach((key) => expect(present.has(key)).toBe(true));
+  });
+
+  it("has unique titles - the idempotency key the seed dedups on", () => {
+    // Repeated titles collapse inside the Set, so equal sizes mean no duplicates.
+    const distinctTitles = new Set(seeded.map((entry) => entry.title));
+    expect(distinctTitles.size).toBe(seeded.length);
+  });
+
+  it("maps to a DB insert row that drops the client id/timestamps and keeps graded dims", () => {
+    const first = seeded[0];
+    const row = toGoldenCaseInsert(first) as Record<string, unknown>;
+    for (const dropped of ["id", "created_at", "updated_at"]) {
+      expect(row[dropped]).toBeUndefined();
+    }
+    for (const kept of ["title", "formation_key", "expected_min_quality"] as const) {
+      expect(row[kept]).toBe(first[kept]);
+    }
+    expect(row.is_active).toBe(true);
+  });
+
+  it("every seeded row carries a reference_verdict or graded thresholds (real grading, not schema-only)", () => {
+    for (const entry of seeded) {
+      const { reference_verdict, expected_evidence_quality, expected_reasoning_depth } = entry;
+      const signals = [reference_verdict, expected_evidence_quality, expected_reasoning_depth];
+      expect(signals.some((signal) => signal != null)).toBe(true);
+    }
+  });
+});
diff --git a/lib/eval/golden-seed.ts b/lib/eval/golden-seed.ts
@@ -0,0 +1,61 @@
+/**
+ * Golden-set seed source (GEPA benchmark keystone).
+ *
+ * The eval/PGR benchmark runner reads golden cases from the
+ * `evaluation_golden_cases` Supabase table (see lib/eval/golden-dataset.ts).
+ * The 100+ hand-graded cases live as static TS constants
+ * (GOLDEN_CASES_V1 + GOLDEN_CASES_RECOVERY_V1) but were never bridged into that
+ * table - so run-first-eval.ts bails with "No active golden cases found. Seed
+ * cases first." This module is the single seed source; scripts/seed-golden-cases.ts
+ * loads it. Pure (no IO) so it can be unit-tested without a database.
+ */
+import type { GoldenCase } from "./types";
+import { GOLDEN_CASES_V1 } from "./golden-cases-v1";
+import { GOLDEN_CASES_RECOVERY_V1 } from "./golden-cases-recovery-v1";
+
+/** Row shape inserted into evaluation_golden_cases; the table supplies id + timestamps. */
+export interface GoldenCaseInsert {
+  title: string; // natural idempotency key the seed dedups on
+  question: string;
+  formation_key: string;
+  sector_id: string | null;
+  tags: string[];
+  is_active: boolean;
+  reference_verdict: string | null;
+  expected_min_quality: number; // the only graded dimension that is never null
+  expected_evidence_quality: number | null;
+  expected_reasoning_depth: number | null;
+  expected_novelty: number | null;
+  expected_contrarian_value: number | null;
+}
+
+/**
+ * Complete hand-graded golden set in a fresh array (V1 first, then recovery). Intended
+ * coverage: lapsed-winback, estimate-recovery, review-lift, slot-rescue (revenue) plus
+ * general workflows including morning-brief, churn-prediction, and isa-routing.
+ */
+export function getSeedGoldenCases(): GoldenCase[] {
+  return new Array<GoldenCase>().concat(GOLDEN_CASES_V1, GOLDEN_CASES_RECOVERY_V1);
+}
+
+/**
+ * Map a static GoldenCase to a DB insert row. Drops the client-side string `id` and the
+ * created_at/updated_at timestamps (the table defaults them). `tags` is cloned so a row
+ * can be mutated without touching the shared static case. Title is the seed's dedup key.
+ */
+export function toGoldenCaseInsert(c: GoldenCase): GoldenCaseInsert {
+  return {
+    title: c.title,
+    question: c.question,
+    formation_key: c.formation_key,
+    expected_min_quality: c.expected_min_quality,
+    expected_evidence_quality: c.expected_evidence_quality,
+    expected_reasoning_depth: c.expected_reasoning_depth,
+    expected_novelty: c.expected_novelty,
+    expected_contrarian_value: c.expected_contrarian_value,
+    reference_verdict: c.reference_verdict,
+    sector_id: c.sector_id,
+    tags: [...c.tags],
+    is_active: c.is_active,
+  };
+}
diff --git a/scripts/seed-golden-cases.ts b/scripts/seed-golden-cases.ts
@@ -0,0 +1,69 @@
+/**
+ * Seed the hand-graded golden cases into `evaluation_golden_cases` so the eval/PGR
+ * benchmark can actually run. Without this, run-first-eval.ts bails with "No active
+ * golden cases found. Seed cases first." - the graded content existed only as static
+ * TS constants, orphaned from the table the runner reads.
+ *
+ * Idempotent: inserts only cases whose title is not already present (safe to re-run;
+ * never deletes or overwrites). Founder-gated on prod (writes to Supabase); use
+ * --dry-run to preview.
+ *
+ * Usage: npx tsx scripts/seed-golden-cases.ts [--dry-run]
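+ * Requires NEXT_PUBLIC_SUPABASE_URL + SUPABASE_SERVICE_ROLE_KEY in the environment (e.g. exported from .env.local - this script reads process.env and does not load that file itself)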
+ */
+import { createClient } from "@supabase/supabase-js";
+import { getSeedGoldenCases, toGoldenCaseInsert } from "../lib/eval/golden-seed";
+
+// The `!` assertions only satisfy the compiler; the guard below is the real check.
+const SUPABASE_URL = process.env.NEXT_PUBLIC_SUPABASE_URL!;
+const SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY!;
+const dryRun = process.argv.includes("--dry-run");
+if (!(SUPABASE_URL && SERVICE_ROLE_KEY)) {
+  console.error("Missing NEXT_PUBLIC_SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY in .env.local");
+  process.exit(1);
+}
+
+/** Seeds evaluation_golden_cases with every seed case whose title is absent; --dry-run only lists them. */
+async function main() {
+  const seedCases = getSeedGoldenCases();
+  const workflowCount = new Set(seedCases.map((entry) => entry.formation_key)).size;
+  const banner = dryRun ? "=== DRY RUN - " : "=== ";
+  console.log(
+    `${banner}golden seed: ${seedCases.length} hand-graded cases across ${workflowCount} workflows ===`,
+  );
+
+  const supabase = createClient(SUPABASE_URL, SERVICE_ROLE_KEY);
+  const lookup = await supabase.from("evaluation_golden_cases").select("title");
+  if (lookup.error) {
+    console.error("Failed to read existing golden cases:", lookup.error.message);
+    process.exit(1);
+  }
+
+  // Title is the dedup key: only cases whose title is absent from the table are pending.
+  const knownTitles = new Set((lookup.data ?? []).map((r: { title: string }) => r.title));
+  const pending = seedCases.filter((entry) => !knownTitles.has(entry.title));
+  console.log(`  ${knownTitles.size} already present, ${pending.length} to insert`);
+
+  if (dryRun) {
+    pending.forEach((entry) => console.log(`  + [${entry.formation_key}] ${entry.title}`));
+    console.log("\nRe-run without --dry-run to seed. Then run scripts/run-first-eval.ts for PGR baselines.");
+    return;
+  }
+  if (pending.length === 0) {
+    console.log("Nothing to seed - the golden set is already up to date.");
+    return;
+  }
+
+  const rows = pending.map((entry) => toGoldenCaseInsert(entry));
+  const insertion = await supabase.from("evaluation_golden_cases").insert(rows);
+  if (insertion.error) {
+    console.error("Insert failed:", insertion.error.message);
+    process.exit(1);
+  }
+  console.log(`Seeded ${rows.length} golden cases. Run scripts/run-first-eval.ts to establish PGR baselines.`);
+}
+
+main().catch((reason: unknown) => {
+  console.error("Fatal error:", reason);
+  process.exit(1); // report the failure through the exit status
+});