From b41e58e8ce9144623c3c14b136289b71b597d7bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 21:58:35 +0200
Subject: [PATCH 01/73] refactor(auto-routing): move classifier core into
 contracts package

---
 packages/auto-routing-contracts/package.json  |   5 +-
 .../src/classifier/index.ts                   |  20 ++
 .../src/classifier}/model-classifier.test.ts  |   4 +-
 .../src/classifier/model-classifier.ts        | 200 ++++++++++++++++
 .../src/classifier/output-fallback.test.ts    |   4 +-
 .../src/classifier/output-fallback.ts         |   4 +-
 .../src/classifier/output.test.ts             |   2 +-
 .../src/classifier/output.ts                  |   4 +-
 .../src/classifier/prompt.test.ts             |   4 +-
 .../src/classifier/prompt.ts                  |   4 +-
 .../src/classifier/taxonomy.json              |   0
 .../src/classifier/taxonomy.test.ts           |   2 +-
 packages/auto-routing-contracts/tsconfig.json |   2 +
 .../auto-routing-contracts/vitest.config.ts   |   9 +
 pnpm-lock.yaml                                |  12 +-
 .../src/admin-classifier-model.ts             |   2 +-
 .../src/classifier-analytics.test.ts          |   2 +-
 .../auto-routing/src/classifier-analytics.ts  |   2 +-
 .../src/classifier-config.test.ts             |   2 +-
 .../auto-routing/src/classifier-config.ts     |   2 +-
 services/auto-routing/src/decide.ts           |   2 +-
 .../auto-routing/src/decision-cache.test.ts   |   2 +-
 services/auto-routing/src/model-classifier.ts | 215 ++----------------
 23 files changed, 278 insertions(+), 227 deletions(-)
 create mode 100644 packages/auto-routing-contracts/src/classifier/index.ts
 rename {services/auto-routing/src => packages/auto-routing-contracts/src/classifier}/model-classifier.test.ts (97%)
 create mode 100644 packages/auto-routing-contracts/src/classifier/model-classifier.ts
 rename services/auto-routing/src/classifier-output/fallback.test.ts => packages/auto-routing-contracts/src/classifier/output-fallback.test.ts (92%)
 rename services/auto-routing/src/classifier-output/fallback.ts => packages/auto-routing-contracts/src/classifier/output-fallback.ts (92%)
 rename services/auto-routing/src/classifier-output/index.test.ts => packages/auto-routing-contracts/src/classifier/output.test.ts (99%)
 rename services/auto-routing/src/classifier-output/index.ts => packages/auto-routing-contracts/src/classifier/output.ts (98%)
 rename services/auto-routing/src/classifier-prompt.test.ts => packages/auto-routing-contracts/src/classifier/prompt.test.ts (97%)
 rename services/auto-routing/src/classifier-prompt.ts => packages/auto-routing-contracts/src/classifier/prompt.ts (96%)
 rename services/auto-routing/src/classifier-taxonomy.json => packages/auto-routing-contracts/src/classifier/taxonomy.json (100%)
 rename services/auto-routing/src/classifier-taxonomy.test.ts => packages/auto-routing-contracts/src/classifier/taxonomy.test.ts (96%)
 create mode 100644 packages/auto-routing-contracts/vitest.config.ts

diff --git a/packages/auto-routing-contracts/package.json b/packages/auto-routing-contracts/package.json
index 43e1bd2cfd..6ea28e8178 100644
--- a/packages/auto-routing-contracts/package.json
+++ b/packages/auto-routing-contracts/package.json
@@ -6,7 +6,8 @@
   "main": "./src/index.ts",
   "types": "./src/index.ts",
   "exports": {
-    ".": "./src/index.ts"
+    ".": "./src/index.ts",
+    "./classifier": "./src/classifier/index.ts"
   },
   "scripts": {
     "typecheck": "tsgo --noEmit",
@@ -14,9 +15,11 @@
     "test": "vitest run"
   },
   "dependencies": {
+    "@openrouter/sdk": "^0.12.79",
     "zod": "catalog:"
   },
   "devDependencies": {
+    "@types/node": "catalog:",
     "@typescript/native-preview": "catalog:",
     "typescript": "catalog:",
     "vitest": "catalog:"
diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts
new file mode 100644
index 0000000000..d3422ad6a7
--- /dev/null
+++ b/packages/auto-routing-contracts/src/classifier/index.ts
@@ -0,0 +1,20 @@
+export {
+  buildClassifierMessages,
+  CLASSIFIER_MAX_TOKENS,
+  DEFAULT_CLASSIFIER_MODEL,
+} from './prompt';
+export {
+  ClassifierOutputParseError,
+  parseClassifierOutput,
+  type ClassifierOutput,
+} from './output';
+export { fallbackClassifierOutput } from './output-fallback';
+export {
+  classifyWithOpenRouter,
+  ClassifierRunError,
+  type ClassifierCallOptions,
+  type ClassifierModelCallMeta,
+  type ClassifierRunFailureMetadata,
+  type ClassifierRunFallbackMetadata,
+  type ClassifierRunResult,
+} from './model-classifier';
diff --git a/services/auto-routing/src/model-classifier.test.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
similarity index 97%
rename from services/auto-routing/src/model-classifier.test.ts
rename to packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
index 622409612e..de54484a8d 100644
--- a/services/auto-routing/src/model-classifier.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
@@ -1,9 +1,9 @@
 import { describe, expect, it, vi } from 'vitest';
 import type { OpenRouter } from '@openrouter/sdk';
 import type { ChatResult } from '@openrouter/sdk/models';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from './prompt';
 import { ClassifierRunError, classifyWithOpenRouter } from './model-classifier';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import type { NormalizedClassifierInput } from '../index';
 
 const normalizedInput = {
   apiKind: 'responses',
diff --git a/packages/auto-routing-contracts/src/classifier/model-classifier.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.ts
new file mode 100644
index 0000000000..645276dd6a
--- /dev/null
+++ b/packages/auto-routing-contracts/src/classifier/model-classifier.ts
@@ -0,0 +1,200 @@
+import type { OpenRouter } from '@openrouter/sdk';
+import type { ChatResult } from '@openrouter/sdk/models';
+import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './prompt';
+import type { NormalizedClassifierInput } from '../index';
+import { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output';
+import { fallbackClassifierOutput } from './output-fallback';
+
+export type ClassifierRunResult = {
+  cost: number | null; // summed over both attempts when retried; null if no cost was reported
+  classifierModel: string;
+  classification: ClassifierOutput; // parsed model output, or fallbackClassifierOutput(input)
+  fallback?: ClassifierRunFallbackMetadata; // present when the fallback classification was used
+  modelCallMeta?: ClassifierModelCallMeta;
+  retried?: boolean; // set to true when a second attempt ran
+  // Why the first attempt was retried; present only when retried is true.
+  firstAttemptFailure?: {
+    reason: string; // the first attempt's fallback reason
+    failureStage: string | null;
+    finishReason: string | null; // finish reason of the first attempt's response, if any
+  };
+};
+
+export type ClassifierModelCallMeta = {
+  finishReason: string | null; // from the first choice of the response
+  completionTokens: number | null; // from usage.completionTokens
+  reasoningTokens: number | null; // from usage.completionTokensDetails.reasoningTokens
+  // Length only — the raw output is derived from untrusted, mirrored user
+  // prompts and must not reach persistent logs. Combined with finishReason
+  // and token counts this still distinguishes truncation from prompt echo.
+  textLength: number | null; // null when no usable text was extracted
+};
+
+export type ClassifierRunFailureMetadata = {
+  cost: number | null; // e.g. the first attempt's cost when the retry throws
+  classifierModel: string;
+  failureStage?: string;
+  schemaIssueSummary?: string[]; // ClassifierRunError stores [] when omitted
+  topLevelKeys?: string[]; // ClassifierRunError stores [] when omitted
+};
+
+export type ClassifierRunFallbackMetadata = {
+  reason: 'no_text' | 'invalid_output'; // no usable text vs. text that failed parsing
+  failureStage?: string; // copied from ClassifierOutputParseError when available
+  schemaIssueSummary?: string[]; // copied from ClassifierOutputParseError when available
+  topLevelKeys?: string[]; // copied from ClassifierOutputParseError when available
+};
+
+export class ClassifierRunError extends Error {
+  readonly cost: number | null; // cost carried alongside the failure so spend is still reported
+  readonly classifierModel: string;
+  readonly failureStage?: string;
+  readonly schemaIssueSummary: string[]; // never undefined; defaults to []
+  readonly topLevelKeys: string[]; // never undefined; defaults to []
+
+  constructor(message: string, metadata: ClassifierRunFailureMetadata) {
+    super(message);
+    this.name = 'ClassifierRunError'; // replaces the inherited 'Error' name
+    this.cost = metadata.cost;
+    this.classifierModel = metadata.classifierModel;
+    this.failureStage = metadata.failureStage;
+    this.schemaIssueSummary = metadata.schemaIssueSummary ?? []; // normalize a missing list to empty
+    this.topLevelKeys = metadata.topLevelKeys ?? []; // normalize a missing list to empty
+  }
+}
+
+export type ClassifierCallOptions = {
+  // Sticky routing key passed to OpenRouter so requests from the same
+  // session land on the same provider and reuse its prompt cache.
+  openrouterSessionId?: string; // forwarded as chatRequest.sessionId; omitted when empty
+};
+
+export async function classifyWithOpenRouter(
+  client: OpenRouter,
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  options: ClassifierCallOptions = {}
+): Promise<ClassifierRunResult> {
+  // Invalid output is usually a transient provider glitch (responses cut
+  // off after a handful of tokens with a "stop" finish reason), so one
+  // retry recovers most of those classifications.
+  const firstAttempt = await runClassifierAttempt(client, input, classifierModel, options);
+  if (!firstAttempt.fallback) {
+    return firstAttempt; // clean parse: no retry, `retried` stays unset
+  }
+
+  let retryAttempt: ClassifierRunResult;
+  try {
+    retryAttempt = await runClassifierAttempt(client, input, classifierModel, options);
+  } catch (error) {
+    // The retry threw (e.g. a transport error) after the first attempt had
+    // already billed and produced diagnostics. Surface those rather than
+    // letting the raw error escape and underreport spend.
+    throw new ClassifierRunError(
+      error instanceof Error ? error.message : 'classifier retry failed',
+      {
+        cost: firstAttempt.cost, // only the first attempt produced a cost figure
+        classifierModel,
+        failureStage: firstAttempt.fallback.failureStage ?? firstAttempt.fallback.reason,
+        schemaIssueSummary: firstAttempt.fallback.schemaIssueSummary,
+        topLevelKeys: firstAttempt.fallback.topLevelKeys,
+      }
+    );
+  }
+  return {
+    ...retryAttempt, // the retry's outcome is returned even if it is itself a fallback
+    cost: sumCosts(firstAttempt.cost, retryAttempt.cost), // report spend for both calls
+    retried: true,
+    firstAttemptFailure: {
+      reason: firstAttempt.fallback.reason,
+      failureStage: firstAttempt.fallback.failureStage ?? null,
+      finishReason: firstAttempt.modelCallMeta?.finishReason ?? null,
+    },
+  };
+}
+
+function sumCosts(first: number | null, second: number | null): number | null {
+  if (first === null && second === null) return null; // unknown stays unknown, not 0
+  return (first ?? 0) + (second ?? 0); // a single missing cost counts as 0
+}
+
+async function runClassifierAttempt(
+  client: OpenRouter,
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  options: ClassifierCallOptions
+): Promise<ClassifierRunResult> {
+  const result = await client.chat.send({
+    chatRequest: {
+      model: classifierModel,
+      messages: buildClassifierMessages(input),
+      responseFormat: { type: 'json_object' }, // the reply is parsed by parseClassifierOutput
+      stream: false,
+      temperature: 0,
+      maxTokens: CLASSIFIER_MAX_TOKENS,
+      ...(options.openrouterSessionId ? { sessionId: options.openrouterSessionId } : {}),
+    },
+  });
+
+  const cost = result.usage?.cost ?? null; // null when the response reports no cost
+  const text = extractClassifierText(result); // null for missing, non-string, or blank content
+  const modelCallMeta = extractModelCallMeta(result, text);
+  if (!text) {
+    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
+      reason: 'no_text',
+    });
+  }
+
+  try {
+    return {
+      cost,
+      classifierModel,
+      classification: parseClassifierOutput(text), // a throw here is handled by the catch below
+      modelCallMeta,
+    };
+  } catch (error) {
+    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
+      reason: 'invalid_output', // any parse failure degrades to a fallback instead of throwing
+      ...(error instanceof ClassifierOutputParseError
+        ? {
+            failureStage: error.failureStage,
+            schemaIssueSummary: error.schemaIssueSummary,
+            topLevelKeys: error.topLevelKeys,
+          }
+        : {}),
+    });
+  }
+}
+
+function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta {
+  return {
+    finishReason: result.choices[0]?.finishReason ?? null, // only the first choice is inspected
+    completionTokens: result.usage?.completionTokens ?? null,
+    reasoningTokens: result.usage?.completionTokensDetails?.reasoningTokens ?? null,
+    textLength: text?.length ?? null, // length only; the text itself is not recorded
+  };
+}
+
+function fallbackClassifierResult(
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  cost: number | null,
+  modelCallMeta: ClassifierModelCallMeta,
+  fallback: ClassifierRunFallbackMetadata
+): ClassifierRunResult {
+  return {
+    cost, // passed through unchanged from the model call
+    classifierModel,
+    classification: fallbackClassifierOutput(input), // computed from the input alone
+    fallback, // marks this result as a fallback for callers
+    modelCallMeta,
+  };
+}
+
+function extractClassifierText(result: ChatResult) {
+  const content: unknown = result.choices[0]?.message.content; // only the first choice is read
+  if (typeof content === 'string' && content.trim().length > 0) {
+    return content; // returned untrimmed
+  }
+  return null; // missing, non-string, or whitespace-only content
+}
diff --git a/services/auto-routing/src/classifier-output/fallback.test.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
similarity index 92%
rename from services/auto-routing/src/classifier-output/fallback.test.ts
rename to packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
index c5ee6394a1..6bafe4acf3 100644
--- a/services/auto-routing/src/classifier-output/fallback.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import { fallbackClassifierOutput } from './fallback';
+import type { NormalizedClassifierInput } from '../index';
+import { fallbackClassifierOutput } from './output-fallback';
 
 const input = {
   apiKind: 'chat_completions',
diff --git a/services/auto-routing/src/classifier-output/fallback.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.ts
similarity index 92%
rename from services/auto-routing/src/classifier-output/fallback.ts
rename to packages/auto-routing-contracts/src/classifier/output-fallback.ts
index c047813e50..969374b893 100644
--- a/services/auto-routing/src/classifier-output/fallback.ts
+++ b/packages/auto-routing-contracts/src/classifier/output-fallback.ts
@@ -1,5 +1,5 @@
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import type { ClassifierOutput } from './index';
+import type { NormalizedClassifierInput } from '../index';
+import type { ClassifierOutput } from './output';
 
 type IntentRule = {
   taskType: ClassifierOutput['taskType'];
diff --git a/services/auto-routing/src/classifier-output/index.test.ts b/packages/auto-routing-contracts/src/classifier/output.test.ts
similarity index 99%
rename from services/auto-routing/src/classifier-output/index.test.ts
rename to packages/auto-routing-contracts/src/classifier/output.test.ts
index d57003b00d..e842a4b178 100644
--- a/services/auto-routing/src/classifier-output/index.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/output.test.ts
@@ -4,7 +4,7 @@ import {
   parseClassifierOutput,
   type ClassifierOutputParseError,
   type ClassifierOutput,
-} from './index';
+} from './output';
 
 const validOutput = {
   taskType: 'debugging',
diff --git a/services/auto-routing/src/classifier-output/index.ts b/packages/auto-routing-contracts/src/classifier/output.ts
similarity index 98%
rename from services/auto-routing/src/classifier-output/index.ts
rename to packages/auto-routing-contracts/src/classifier/output.ts
index 1796e4b724..8acd5392fc 100644
--- a/services/auto-routing/src/classifier-output/index.ts
+++ b/packages/auto-routing-contracts/src/classifier/output.ts
@@ -1,5 +1,5 @@
-import { ClassifierOutputSchema, type ClassifierOutput } from '@kilocode/auto-routing-contracts';
-import classifierTaxonomy from '../classifier-taxonomy.json';
+import { ClassifierOutputSchema, type ClassifierOutput } from '../index';
+import classifierTaxonomy from './taxonomy.json';
 
 export const classifierOutputSchema = ClassifierOutputSchema;
 export type { ClassifierOutput };
diff --git a/services/auto-routing/src/classifier-prompt.test.ts b/packages/auto-routing-contracts/src/classifier/prompt.test.ts
similarity index 97%
rename from services/auto-routing/src/classifier-prompt.test.ts
rename to packages/auto-routing-contracts/src/classifier/prompt.test.ts
index 782c5a22c6..e3444fedc4 100644
--- a/services/auto-routing/src/classifier-prompt.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/prompt.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
-import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './prompt';
+import type { NormalizedClassifierInput } from '../index';
 
 const input = {
   apiKind: 'chat_completions',
diff --git a/services/auto-routing/src/classifier-prompt.ts b/packages/auto-routing-contracts/src/classifier/prompt.ts
similarity index 96%
rename from services/auto-routing/src/classifier-prompt.ts
rename to packages/auto-routing-contracts/src/classifier/prompt.ts
index 641df0fb24..efaf1793fd 100644
--- a/services/auto-routing/src/classifier-prompt.ts
+++ b/packages/auto-routing-contracts/src/classifier/prompt.ts
@@ -1,5 +1,5 @@
-import classifierTaxonomy from './classifier-taxonomy.json';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import classifierTaxonomy from './taxonomy.json';
+import type { NormalizedClassifierInput } from '../index';
 
 export const DEFAULT_CLASSIFIER_MODEL = 'google/gemini-2.5-flash-lite';
 // The classification JSON needs ~60 tokens; the headroom avoids truncated
diff --git a/services/auto-routing/src/classifier-taxonomy.json b/packages/auto-routing-contracts/src/classifier/taxonomy.json
similarity index 100%
rename from services/auto-routing/src/classifier-taxonomy.json
rename to packages/auto-routing-contracts/src/classifier/taxonomy.json
diff --git a/services/auto-routing/src/classifier-taxonomy.test.ts b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
similarity index 96%
rename from services/auto-routing/src/classifier-taxonomy.test.ts
rename to packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
index dc510492cf..b3a3ab7dd0 100644
--- a/services/auto-routing/src/classifier-taxonomy.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
@@ -46,7 +46,7 @@ const TaxonomySchema = z.object({
 });
 
 async function readTaxonomy() {
-  const file = await readFile(join(__dirname, 'classifier-taxonomy.json'), 'utf8');
+  const file = await readFile(join(__dirname, 'taxonomy.json'), 'utf8');
   return TaxonomySchema.parse(JSON.parse(file));
 }
 
diff --git a/packages/auto-routing-contracts/tsconfig.json b/packages/auto-routing-contracts/tsconfig.json
index 76473b226e..b293f0f4ef 100644
--- a/packages/auto-routing-contracts/tsconfig.json
+++ b/packages/auto-routing-contracts/tsconfig.json
@@ -4,11 +4,13 @@
     "module": "ESNext",
     "moduleResolution": "bundler",
     "lib": ["ESNext", "WebWorker"],
+    "types": ["node"],
     "strict": true,
     "skipLibCheck": true,
     "forceConsistentCasingInFileNames": true,
     "noEmit": true,
     "isolatedModules": true,
+    "resolveJsonModule": true,
     "noImplicitReturns": true,
     "noFallthroughCasesInSwitch": true
   },
diff --git a/packages/auto-routing-contracts/vitest.config.ts b/packages/auto-routing-contracts/vitest.config.ts
new file mode 100644
index 0000000000..7dd13254e7
--- /dev/null
+++ b/packages/auto-routing-contracts/vitest.config.ts
@@ -0,0 +1,9 @@
+import { defineConfig } from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    globals: true,
+    environment: 'node', // taxonomy.test.ts reads taxonomy.json from disk
+    include: ['src/**/*.test.ts'], // covers the tests moved into src/classifier
+  },
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e208522daa..7677b03452 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -990,10 +990,16 @@ importers:
 
   packages/auto-routing-contracts:
     dependencies:
+      '@openrouter/sdk':
+        specifier: ^0.12.79
+        version: 0.12.79
       zod:
         specifier: 'catalog:'
         version: 4.4.3
     devDependencies:
+      '@types/node':
+        specifier: 'catalog:'
+        version: 24.12.4
       '@typescript/native-preview':
         specifier: 'catalog:'
         version: 7.0.0-dev.20260514.1
@@ -1002,7 +1008,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: 'catalog:'
-        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   packages/cloud-agent-profile:
     dependencies:
@@ -1530,7 +1536,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: 'catalog:'
-        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
       wrangler:
         specifier: 'catalog:'
         version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
@@ -23833,7 +23839,7 @@ snapshots:
 
   '@types/pg@8.18.0':
     dependencies:
-      '@types/node': 25.5.2
+      '@types/node': 24.12.4
       pg-protocol: 1.13.0
       pg-types: 2.2.0
 
diff --git a/services/auto-routing/src/admin-classifier-model.ts b/services/auto-routing/src/admin-classifier-model.ts
index 7fc6660e31..8eb5f2f0dd 100644
--- a/services/auto-routing/src/admin-classifier-model.ts
+++ b/services/auto-routing/src/admin-classifier-model.ts
@@ -3,7 +3,7 @@ import {
   type AutoRoutingClassifierModelResponse,
 } from '@kilocode/auto-routing-contracts';
 import type { Handler } from 'hono';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import { getClassifierModel, setClassifierModel } from './classifier-config';
 import type { HonoEnv } from './hono-env';
 
diff --git a/services/auto-routing/src/classifier-analytics.test.ts b/services/auto-routing/src/classifier-analytics.test.ts
index e3ebc38e0c..11a8d5f12e 100644
--- a/services/auto-routing/src/classifier-analytics.test.ts
+++ b/services/auto-routing/src/classifier-analytics.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it, vi } from 'vitest';
 import { writeClassifierMetricsDataPoint } from './classifier-analytics';
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 
 const classification = {
   taskType: 'debugging',
diff --git a/services/auto-routing/src/classifier-analytics.ts b/services/auto-routing/src/classifier-analytics.ts
index b0ceb9a4c4..08c5c0deb3 100644
--- a/services/auto-routing/src/classifier-analytics.ts
+++ b/services/auto-routing/src/classifier-analytics.ts
@@ -1,4 +1,4 @@
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 
 export type ClassifierAnalyticsStatus =
   | 'classified'
diff --git a/services/auto-routing/src/classifier-config.test.ts b/services/auto-routing/src/classifier-config.test.ts
index fbd3a0e8c4..08c83c4b59 100644
--- a/services/auto-routing/src/classifier-config.test.ts
+++ b/services/auto-routing/src/classifier-config.test.ts
@@ -1,5 +1,5 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import {
   CLASSIFIER_MODEL_CONFIG_KEY,
   clearClassifierConfigCache,
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index 6b0687a539..e9025a9c95 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -1,5 +1,5 @@
 import { formatError } from '@kilocode/worker-utils';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import { ttlCached } from './ttl-cache';
 
 export const CLASSIFIER_MODEL_CONFIG_KEY = 'classifier_model';
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index 3cc94edc56..4303192f60 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -9,7 +9,7 @@ import type { Handler } from 'hono';
 import { writeClassifierMetricsDataPoint } from './classifier-analytics';
 import type { ClassifierAnalyticsStatus } from './classifier-analytics';
 import { getClassifierModel, getDecisionLogSampleRate } from './classifier-config';
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 import {
   computeContentHashes,
   deriveConversationKey,
diff --git a/services/auto-routing/src/decision-cache.test.ts b/services/auto-routing/src/decision-cache.test.ts
index 1e3245835d..c61cd2eb97 100644
--- a/services/auto-routing/src/decision-cache.test.ts
+++ b/services/auto-routing/src/decision-cache.test.ts
@@ -1,5 +1,5 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 import { AutoRoutingDecisionCacheDO } from './decision-cache';
 
 const classification = {
diff --git a/services/auto-routing/src/model-classifier.ts b/services/auto-routing/src/model-classifier.ts
index 94d7f672cf..e9a9898f13 100644
--- a/services/auto-routing/src/model-classifier.ts
+++ b/services/auto-routing/src/model-classifier.ts
@@ -1,81 +1,22 @@
-import type { OpenRouter } from '@openrouter/sdk';
-import type { ChatResult } from '@openrouter/sdk/models';
-import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './classifier-prompt';
+import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
+import type {
+  ClassifierCallOptions,
+  ClassifierRunResult,
+} from '@kilocode/auto-routing-contracts/classifier';
 import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import {
-  ClassifierOutputParseError,
-  parseClassifierOutput,
-  type ClassifierOutput,
-} from './classifier-output';
-import { fallbackClassifierOutput } from './classifier-output/fallback';
 import { createOpenRouterClient } from './openrouter';
 
-export type ClassifierRunResult = {
-  cost: number | null;
-  classifierModel: string;
-  classification: ClassifierOutput;
-  fallback?: ClassifierRunFallbackMetadata;
-  modelCallMeta?: ClassifierModelCallMeta;
-  retried?: boolean;
-  // Why the first attempt was retried; present only when retried is true.
-  firstAttemptFailure?: {
-    reason: string;
-    failureStage: string | null;
-    finishReason: string | null;
-  };
-};
-
-export type ClassifierModelCallMeta = {
-  finishReason: string | null;
-  completionTokens: number | null;
-  reasoningTokens: number | null;
-  // Length only — the raw output is derived from untrusted, mirrored user
-  // prompts and must not reach persistent logs. Combined with finishReason
-  // and token counts this still distinguishes truncation from prompt echo.
-  textLength: number | null;
-};
-
-export type ClassifierRunFailureMetadata = {
-  cost: number | null;
-  classifierModel: string;
-  failureStage?: string;
-  schemaIssueSummary?: string[];
-  topLevelKeys?: string[];
-};
-
-export type ClassifierRunFallbackMetadata = {
-  reason: 'no_text' | 'invalid_output';
-  failureStage?: string;
-  schemaIssueSummary?: string[];
-  topLevelKeys?: string[];
-};
-
-export class ClassifierRunError extends Error {
-  readonly cost: number | null;
-  readonly classifierModel: string;
-  readonly failureStage?: string;
-  readonly schemaIssueSummary: string[];
-  readonly topLevelKeys: string[];
-
-  constructor(message: string, metadata: ClassifierRunFailureMetadata) {
-    super(message);
-    this.name = 'ClassifierRunError';
-    this.cost = metadata.cost;
-    this.classifierModel = metadata.classifierModel;
-    this.failureStage = metadata.failureStage;
-    this.schemaIssueSummary = metadata.schemaIssueSummary ?? [];
-    this.topLevelKeys = metadata.topLevelKeys ?? [];
-  }
-}
+export {
+  ClassifierRunError,
+  classifyWithOpenRouter,
+} from '@kilocode/auto-routing-contracts/classifier';
+export type {
+  ClassifierCallOptions,
+  ClassifierRunResult,
+} from '@kilocode/auto-routing-contracts/classifier';
 
 type ClassifierEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
 
-export type ClassifierCallOptions = {
-  // Sticky routing key passed to OpenRouter so requests from the same
-  // session land on the same provider and reuse its prompt cache.
-  openrouterSessionId?: string;
-};
-
 export async function classifyNormalizedInput(
   env: ClassifierEnv,
   input: NormalizedClassifierInput,
@@ -85,133 +26,3 @@ export async function classifyNormalizedInput(
   const client = await createOpenRouterClient(env);
   return classifyWithOpenRouter(client, input, classifierModel, options);
 }
-
-export async function classifyWithOpenRouter(
-  client: OpenRouter,
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  options: ClassifierCallOptions = {}
-): Promise<ClassifierRunResult> {
-  // Invalid output is usually a transient provider glitch (responses cut
-  // off after a handful of tokens with a "stop" finish reason), so one
-  // retry recovers most of those classifications.
-  const firstAttempt = await runClassifierAttempt(client, input, classifierModel, options);
-  if (!firstAttempt.fallback) {
-    return firstAttempt;
-  }
-
-  let retryAttempt: ClassifierRunResult;
-  try {
-    retryAttempt = await runClassifierAttempt(client, input, classifierModel, options);
-  } catch (error) {
-    // The retry threw (e.g. a transport error) after the first attempt had
-    // already billed and produced diagnostics. Surface those rather than
-    // letting the raw error escape and underreport spend.
-    throw new ClassifierRunError(
-      error instanceof Error ? error.message : 'classifier retry failed',
-      {
-        cost: firstAttempt.cost,
-        classifierModel,
-        failureStage: firstAttempt.fallback.failureStage ?? firstAttempt.fallback.reason,
-        schemaIssueSummary: firstAttempt.fallback.schemaIssueSummary,
-        topLevelKeys: firstAttempt.fallback.topLevelKeys,
-      }
-    );
-  }
-  return {
-    ...retryAttempt,
-    cost: sumCosts(firstAttempt.cost, retryAttempt.cost),
-    retried: true,
-    firstAttemptFailure: {
-      reason: firstAttempt.fallback.reason,
-      failureStage: firstAttempt.fallback.failureStage ?? null,
-      finishReason: firstAttempt.modelCallMeta?.finishReason ?? null,
-    },
-  };
-}
-
-function sumCosts(first: number | null, second: number | null): number | null {
-  if (first === null && second === null) return null;
-  return (first ?? 0) + (second ?? 0);
-}
-
-async function runClassifierAttempt(
-  client: OpenRouter,
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  options: ClassifierCallOptions
-): Promise<ClassifierRunResult> {
-  const result = await client.chat.send({
-    chatRequest: {
-      model: classifierModel,
-      messages: buildClassifierMessages(input),
-      responseFormat: { type: 'json_object' },
-      stream: false,
-      temperature: 0,
-      maxTokens: CLASSIFIER_MAX_TOKENS,
-      ...(options.openrouterSessionId ? { sessionId: options.openrouterSessionId } : {}),
-    },
-  });
-
-  const cost = result.usage?.cost ?? null;
-  const text = extractClassifierText(result);
-  const modelCallMeta = extractModelCallMeta(result, text);
-  if (!text) {
-    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
-      reason: 'no_text',
-    });
-  }
-
-  try {
-    return {
-      cost,
-      classifierModel,
-      classification: parseClassifierOutput(text),
-      modelCallMeta,
-    };
-  } catch (error) {
-    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
-      reason: 'invalid_output',
-      ...(error instanceof ClassifierOutputParseError
-        ? {
-            failureStage: error.failureStage,
-            schemaIssueSummary: error.schemaIssueSummary,
-            topLevelKeys: error.topLevelKeys,
-          }
-        : {}),
-    });
-  }
-}
-
-function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta {
-  return {
-    finishReason: result.choices[0]?.finishReason ?? null,
-    completionTokens: result.usage?.completionTokens ?? null,
-    reasoningTokens: result.usage?.completionTokensDetails?.reasoningTokens ?? null,
-    textLength: text?.length ?? null,
-  };
-}
-
-function fallbackClassifierResult(
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  cost: number | null,
-  modelCallMeta: ClassifierModelCallMeta,
-  fallback: ClassifierRunFallbackMetadata
-): ClassifierRunResult {
-  return {
-    cost,
-    classifierModel,
-    classification: fallbackClassifierOutput(input),
-    fallback,
-    modelCallMeta,
-  };
-}
-
-function extractClassifierText(result: ChatResult) {
-  const content: unknown = result.choices[0]?.message.content;
-  if (typeof content === 'string' && content.trim().length > 0) {
-    return content;
-  }
-  return null;
-}

From 1fb85f5c5e83165046a58c5876de23f641ce9bbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:04:48 +0200
Subject: [PATCH 02/73] feat(auto-routing): add tier, routing-table, decision
 and benchmark contracts

---
 .../auto-routing-contracts/src/benchmark.ts   | 63 +++++++++++++++++++
 .../src/classifier/index.ts                   | 12 +---
 packages/auto-routing-contracts/src/index.ts  | 17 ++++-
 .../src/routing-table.test.ts                 | 44 +++++++++++++
 .../src/routing-table.ts                      | 49 +++++++++++++++
 .../auto-routing-contracts/src/tiers.test.ts  | 60 ++++++++++++++++++
 packages/auto-routing-contracts/src/tiers.ts  | 32 ++++++++++
 7 files changed, 266 insertions(+), 11 deletions(-)
 create mode 100644 packages/auto-routing-contracts/src/benchmark.ts
 create mode 100644 packages/auto-routing-contracts/src/routing-table.test.ts
 create mode 100644 packages/auto-routing-contracts/src/routing-table.ts
 create mode 100644 packages/auto-routing-contracts/src/tiers.test.ts
 create mode 100644 packages/auto-routing-contracts/src/tiers.ts

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
new file mode 100644
index 0000000000..7c14447a40
--- /dev/null
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -0,0 +1,63 @@
+import * as z from 'zod';
+import { ClassifierApiKindSchema } from './routing-table';
+import { DifficultyTierSchema } from './tiers';
+
+export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
+export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
+
+export const BenchmarkDeciderModelSchema = z.object({
+  id: z.string().trim().min(1),
+  // Which gateway API kinds this model can serve when chosen by the router.
+  // The benchmark itself always exercises chat completions.
+  supportedApiKinds: z.array(ClassifierApiKindSchema).min(1).default(['chat_completions']),
+});
+export type BenchmarkDeciderModel = z.infer<typeof BenchmarkDeciderModelSchema>;
+
+export const BenchmarkConfigSchema = z.object({
+  classifierModels: z.array(z.string().trim().min(1)).min(1),
+  deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
+  // Accuracy threshold for "gets the job done" (per tier).
+  minAccuracy: z.number().min(0).max(1),
+  // Parallel OpenRouter calls per queue message.
+  maxConcurrency: z.number().int().min(1).max(16),
+  updatedAt: z.string().nullable(),
+  updatedBy: z.string().nullable(),
+});
+export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
+
+export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']);
+
+export const BenchmarkModelSummarySchema = z.object({
+  model: z.string(),
+  // '*' for classifier runs (no tiering), otherwise the difficulty tier.
+  tier: z.union([DifficultyTierSchema, z.literal('*')]),
+  accuracy: z.number(),
+  avgCostUsd: z.number().nullable(),
+  avgLatencyMs: z.number(),
+  p50LatencyMs: z.number().nullable(),
+  cases: z.number().int(),
+  errors: z.number().int(),
+});
+export type BenchmarkModelSummary = z.infer<typeof BenchmarkModelSummarySchema>;
+
+export const BenchmarkRunSchema = z.object({
+  id: z.string(),
+  kind: BenchmarkKindSchema,
+  status: BenchmarkRunStatusSchema,
+  startedAt: z.string(),
+  completedAt: z.string().nullable(),
+  error: z.string().nullable(),
+  summaries: z.array(BenchmarkModelSummarySchema),
+});
+export type BenchmarkRun = z.infer<typeof BenchmarkRunSchema>;
+
+export const BenchmarkRunsResponseSchema = z.object({ runs: z.array(BenchmarkRunSchema) });
+export const BenchmarkConfigResponseSchema = z.object({
+  config: BenchmarkConfigSchema,
+  defaults: BenchmarkConfigSchema,
+});
+export const StartBenchmarkRunRequestSchema = z.object({ kind: BenchmarkKindSchema });
+export const StartBenchmarkRunResponseSchema = z.object({
+  runId: z.string(),
+  enqueuedModels: z.number().int(),
+});
diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts
index d3422ad6a7..78c27cb244 100644
--- a/packages/auto-routing-contracts/src/classifier/index.ts
+++ b/packages/auto-routing-contracts/src/classifier/index.ts
@@ -1,13 +1,5 @@
-export {
-  buildClassifierMessages,
-  CLASSIFIER_MAX_TOKENS,
-  DEFAULT_CLASSIFIER_MODEL,
-} from './prompt';
-export {
-  ClassifierOutputParseError,
-  parseClassifierOutput,
-  type ClassifierOutput,
-} from './output';
+export { buildClassifierMessages, CLASSIFIER_MAX_TOKENS, DEFAULT_CLASSIFIER_MODEL } from './prompt';
+export { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output';
 export { fallbackClassifierOutput } from './output-fallback';
 export {
   classifyWithOpenRouter,
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index ef537f600e..c7022c5477 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -1,5 +1,6 @@
 import * as z from 'zod';
 import { NormalizedClassifierInputSchema } from './input';
+import { DifficultyTierSchema } from './tiers';
 
 export {
   NormalizedClassifierInputSchema,
@@ -96,9 +97,19 @@ export const ClassifierOutputSchema = z
   });
 export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;
 
+export const AutoRoutingDecisionSchema = z.object({
+  model: z.string(),
+  tier: DifficultyTierSchema,
+  source: z.enum(['benchmark', 'default']),
+  tableVersion: z.string(),
+});
+export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
+
 export const AutoRoutingDecisionResponseSchema = z.object({
   cost: z.number(),
-  decision: z.null(),
+  // Null when classification failed or no table candidate supports the
+  // request's API kind; the gateway then falls back to its static default.
+  decision: AutoRoutingDecisionSchema.nullable(),
   classifierResult: z
     .object({
       classification: ClassifierOutputSchema,
@@ -158,3 +169,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<
 >;
 
 export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';
+
+export * from './tiers';
+export * from './routing-table';
+export * from './benchmark';
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
new file mode 100644
index 0000000000..c1180b5371
--- /dev/null
+++ b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -0,0 +1,44 @@
+import { describe, expect, it } from 'vitest';
+import { rankCandidates, RoutingTableSchema } from './routing-table';
+
+const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
+  model,
+  accuracy,
+  avgCostUsd,
+  meetsThreshold: false,
+  supportedApiKinds: ['chat_completions' as const],
+});
+
+describe('rankCandidates', () => {
+  it('puts the cheapest above-threshold candidate first', () => {
+    const ranked = rankCandidates(
+      [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
+      0.7
+    );
+    expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
+    expect(ranked[0].meetsThreshold).toBe(true);
+    expect(ranked[2].meetsThreshold).toBe(false);
+  });
+  it('falls back to highest accuracy when nothing meets the threshold', () => {
+    const ranked = rankCandidates([candidate('a', 0.5, 1), candidate('b', 0.6, 5)], 0.9);
+    expect(ranked[0].model).toBe('b');
+  });
+  it('breaks cost ties by accuracy', () => {
+    const ranked = rankCandidates([candidate('a', 0.8, 1), candidate('b', 0.9, 1)], 0.7);
+    expect(ranked[0].model).toBe('b');
+  });
+});
+
+describe('RoutingTableSchema', () => {
+  it('requires at least one candidate per tier', () => {
+    expect(
+      RoutingTableSchema.safeParse({
+        version: 'v',
+        generatedAt: new Date(0).toISOString(),
+        minAccuracy: 0.7,
+        source: 'benchmark',
+        tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
+      }).success
+    ).toBe(false);
+  });
+});
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
new file mode 100644
index 0000000000..acb892cbd8
--- /dev/null
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -0,0 +1,49 @@
+import * as z from 'zod';
+import { DifficultyTierSchema } from './tiers';
+
+export const ClassifierApiKindSchema = z.enum(['chat_completions', 'responses', 'messages']);
+
+export const RankedCandidateSchema = z.object({
+  model: z.string().trim().min(1),
+  // Benchmark accuracy in [0, 1] for this tier.
+  accuracy: z.number().min(0).max(1),
+  // Average observed OpenRouter cost per benchmark case, in USD credits.
+  avgCostUsd: z.number().nonnegative(),
+  meetsThreshold: z.boolean(),
+  supportedApiKinds: z.array(ClassifierApiKindSchema).min(1),
+});
+export type RankedCandidate = z.infer<typeof RankedCandidateSchema>;
+
+export const RoutingTableSchema = z.object({
+  // Benchmark run id (or 'default' for the built-in table).
+  version: z.string().min(1),
+  generatedAt: z.string().min(1),
+  minAccuracy: z.number().min(0).max(1),
+  source: z.enum(['benchmark', 'default']),
+  tiers: z.object({
+    low: z.array(RankedCandidateSchema).min(1),
+    medium: z.array(RankedCandidateSchema).min(1),
+    high: z.array(RankedCandidateSchema).min(1),
+  }),
+});
+export type RoutingTable = z.infer<typeof RoutingTableSchema>;
+
+export const ROUTING_TABLE_KV_KEY = 'routing_table_v1';
+
+// "Best bang for buck": candidates meeting the accuracy threshold come
+// first, cheapest first (accuracy breaks ties); below-threshold candidates
+// follow, most accurate first (cost breaks ties), so a degenerate table still routes sensibly.
+export function rankCandidates(
+  candidates: ReadonlyArray<Omit<RankedCandidate, 'meetsThreshold'> & { meetsThreshold?: boolean }>,
+  minAccuracy: number
+): RankedCandidate[] {
+  const flagged = candidates.map(c => ({ ...c, meetsThreshold: c.accuracy >= minAccuracy }));
+  return flagged.toSorted((a, b) => {
+    if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? -1 : 1;
+    if (a.meetsThreshold) {
+      return a.avgCostUsd - b.avgCostUsd || b.accuracy - a.accuracy;
+    }
+    return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd;
+  });
+}
+
diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts
new file mode 100644
index 0000000000..edf3a9d6c8
--- /dev/null
+++ b/packages/auto-routing-contracts/src/tiers.test.ts
@@ -0,0 +1,60 @@
+import { describe, expect, it } from 'vitest';
+import { deriveDifficultyTier } from './tiers';
+import type { ClassifierOutput } from './index';
+
+function classification(overrides: Partial<ClassifierOutput>): ClassifierOutput {
+  return {
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    contextComplexity: 'small',
+    reasoningComplexity: 'low',
+    riskLevel: 'low',
+    executionMode: 'answer_only',
+    requiresTools: false,
+    confidence: 0.9,
+    ...overrides,
+  };
+}
+
+describe('deriveDifficultyTier', () => {
+  it('classifies trivial answer-only requests as low', () => {
+    expect(deriveDifficultyTier(classification({}))).toBe('low');
+  });
+  it('classifies mid-size code changes as medium', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          contextComplexity: 'medium',
+          reasoningComplexity: 'medium',
+          executionMode: 'code_change',
+        })
+      )
+    ).toBe('medium');
+  });
+  it('classifies high-reasoning multi-step work as high', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          contextComplexity: 'large',
+          reasoningComplexity: 'high',
+          executionMode: 'multi_step_project',
+          riskLevel: 'high',
+        })
+      )
+    ).toBe('high');
+  });
+  it('is monotonic: bumping reasoning complexity never lowers the tier', () => {
+    const tiers = ['low', 'medium', 'high'] as const;
+    for (const ctx of ['small', 'medium', 'large'] as const) {
+      let prev = 0;
+      for (const reasoning of ['low', 'medium', 'high'] as const) {
+        const tier = deriveDifficultyTier(
+          classification({ contextComplexity: ctx, reasoningComplexity: reasoning })
+        );
+        const idx = tiers.indexOf(tier);
+        expect(idx).toBeGreaterThanOrEqual(prev);
+        prev = idx;
+      }
+    }
+  });
+});
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
new file mode 100644
index 0000000000..d0f4cb4c7e
--- /dev/null
+++ b/packages/auto-routing-contracts/src/tiers.ts
@@ -0,0 +1,32 @@
+import * as z from 'zod';
+import type { ClassifierOutput } from './index';
+
+export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
+export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
+
+export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high'];
+
+const REASONING_POINTS = { low: 0, medium: 2, high: 4 } as const;
+const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const;
+const EXECUTION_POINTS = {
+  answer_only: 0,
+  code_change: 1,
+  command_execution: 1,
+  multi_step_project: 2,
+} as const;
+const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const;
+
+// Deterministic mapping from the classifier taxonomy to a difficulty tier.
+// Reasoning complexity dominates (twice the context weight) because it is the
+// strongest signal for whether a cheap model can complete the task; context
+// size, execution mode and a high risk level nudge borderline cases up.
+export function deriveDifficultyTier(classification: ClassifierOutput): DifficultyTier {
+  const score =
+    REASONING_POINTS[classification.reasoningComplexity] +
+    CONTEXT_POINTS[classification.contextComplexity] +
+    EXECUTION_POINTS[classification.executionMode] +
+    RISK_POINTS[classification.riskLevel];
+  if (score <= 2) return 'low';
+  if (score <= 5) return 'medium';
+  return 'high';
+}

From 39acfdb2ccaf1196fe8fd50606ddc38ff0e36d7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:08:38 +0200
Subject: [PATCH 03/73] feat(auto-routing): add benchmark-driven decision
 engine and KV routing table

---
 .../auto-routing/src/decision-engine.test.ts  | 47 +++++++++++++
 services/auto-routing/src/decision-engine.ts  | 18 +++++
 .../auto-routing/src/routing-table.test.ts    | 34 ++++++++++
 services/auto-routing/src/routing-table.ts    | 66 +++++++++++++++++++
 4 files changed, 165 insertions(+)
 create mode 100644 services/auto-routing/src/decision-engine.test.ts
 create mode 100644 services/auto-routing/src/decision-engine.ts
 create mode 100644 services/auto-routing/src/routing-table.test.ts
 create mode 100644 services/auto-routing/src/routing-table.ts

diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
new file mode 100644
index 0000000000..1dc79c4572
--- /dev/null
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -0,0 +1,47 @@
+import { describe, expect, it } from 'vitest';
+import type { ClassifierOutput, RoutingTable } from '@kilocode/auto-routing-contracts';
+import { computeDecision } from './decision-engine';
+
+const classification: ClassifierOutput = {
+  taskType: 'implementation',
+  subtaskType: 'code_generation',
+  contextComplexity: 'small',
+  reasoningComplexity: 'low',
+  riskLevel: 'low',
+  executionMode: 'answer_only',
+  requiresTools: false,
+  confidence: 0.9,
+};
+
+const table: RoutingTable = {
+  version: 'run-1',
+  generatedAt: '2026-06-11T00:00:00.000Z',
+  minAccuracy: 0.7,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      { model: 'cheap/messages-only', accuracy: 0.9, avgCostUsd: 0.001, meetsThreshold: true, supportedApiKinds: ['messages'] },
+      { model: 'cheap/chat', accuracy: 0.85, avgCostUsd: 0.002, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+    ],
+    medium: [
+      { model: 'mid/chat', accuracy: 0.8, avgCostUsd: 0.01, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages'] },
+    ],
+    high: [
+      { model: 'big/chat', accuracy: 0.9, avgCostUsd: 0.1, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+    ],
+  },
+};
+
+describe('computeDecision', () => {
+  it('picks the first candidate supporting the request api kind', () => {
+    const decision = computeDecision(classification, 'chat_completions', table);
+    expect(decision).toEqual({ model: 'cheap/chat', tier: 'low', source: 'benchmark', tableVersion: 'run-1' });
+  });
+  it('uses the tier derived from the classification', () => {
+    const hard: ClassifierOutput = { ...classification, reasoningComplexity: 'high', contextComplexity: 'large', executionMode: 'multi_step_project' };
+    expect(computeDecision(hard, 'chat_completions', table)?.model).toBe('big/chat');
+  });
+  it('returns null when no candidate supports the api kind', () => {
+    expect(computeDecision(classification, 'responses', table)).toBeNull();
+  });
+});
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
new file mode 100644
index 0000000000..26645ead7f
--- /dev/null
+++ b/services/auto-routing/src/decision-engine.ts
@@ -0,0 +1,18 @@
+import {
+  deriveDifficultyTier,
+  type AutoRoutingDecision,
+  type ClassifierOutput,
+  type NormalizedClassifierInput,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+export function computeDecision(
+  classification: ClassifierOutput,
+  apiKind: NormalizedClassifierInput['apiKind'],
+  table: RoutingTable
+): AutoRoutingDecision | null {
+  const tier = deriveDifficultyTier(classification);
+  const candidate = table.tiers[tier].find(c => c.supportedApiKinds.includes(apiKind));
+  if (!candidate) return null;
+  return { model: candidate.model, tier, source: table.source, tableVersion: table.version };
+}
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
new file mode 100644
index 0000000000..788d866945
--- /dev/null
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -0,0 +1,34 @@
+import { afterEach, describe, expect, it } from 'vitest';
+import { clearRoutingTableCache, DEFAULT_ROUTING_TABLE, getRoutingTable } from './routing-table';
+
+type KvStub = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
+const kvEnv = (value: string | null, onGet?: () => void): KvStub =>
+  ({
+    AUTO_ROUTING_CONFIG: {
+      get: async () => {
+        onGet?.();
+        return value;
+      },
+    },
+  }) as unknown as KvStub;
+
+afterEach(() => clearRoutingTableCache());
+
+describe('getRoutingTable', () => {
+  it('returns the default table when the key is missing', async () => {
+    expect(await getRoutingTable(kvEnv(null))).toEqual(DEFAULT_ROUTING_TABLE);
+  });
+  it('returns the default table when the stored JSON is invalid', async () => {
+    expect(await getRoutingTable(kvEnv('{"nope":true}'))).toEqual(DEFAULT_ROUTING_TABLE);
+    clearRoutingTableCache();
+    expect(await getRoutingTable(kvEnv('not json at all'))).toEqual(DEFAULT_ROUTING_TABLE);
+  });
+  it('parses and caches a valid stored table', async () => {
+    let reads = 0;
+    const env = kvEnv(JSON.stringify(DEFAULT_ROUTING_TABLE), () => reads++);
+    const first = await getRoutingTable(env);
+    await getRoutingTable(env);
+    expect(first.version).toBe(DEFAULT_ROUTING_TABLE.version);
+    expect(reads).toBe(1);
+  });
+});
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
new file mode 100644
index 0000000000..7293cebebe
--- /dev/null
+++ b/services/auto-routing/src/routing-table.ts
@@ -0,0 +1,66 @@
+import { formatError } from '@kilocode/worker-utils';
+import {
+  ROUTING_TABLE_KV_KEY,
+  RoutingTableSchema,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+import { ttlCached } from './ttl-cache';
+
+// Safety net used until the first decider benchmark publishes a table (and
+// whenever the stored table is missing or unparseable). Mirrors the static
+// defaults the gateway uses for kilo-auto/balanced today.
+export const DEFAULT_ROUTING_TABLE: RoutingTable = {
+  version: 'default',
+  generatedAt: '2026-06-11T00:00:00.000Z',
+  minAccuracy: 0.7,
+  source: 'default',
+  tiers: {
+    low: [
+      { model: 'google/gemini-2.5-flash', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+    ],
+    medium: [
+      { model: 'qwen/qwen3.7-plus', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+      { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] },
+    ],
+    high: [
+      { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] },
+    ],
+  },
+};
+
+const ROUTING_TABLE_CACHE_TTL_MS = 60_000;
+
+type RoutingTableEnv = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
+
+const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: RoutingTableEnv) => {
+  const raw = await env.AUTO_ROUTING_CONFIG.get(ROUTING_TABLE_KV_KEY);
+  if (raw === null) return DEFAULT_ROUTING_TABLE;
+  try {
+    const parsed = RoutingTableSchema.safeParse(JSON.parse(raw));
+    if (!parsed.success) {
+      console.warn(
+        JSON.stringify({
+          event: 'auto_routing_table_invalid',
+          issues: parsed.error.issues.slice(0, 5).map(i => `${i.path.join('.')}: ${i.code}`),
+        })
+      );
+      return DEFAULT_ROUTING_TABLE;
+    }
+    return parsed.data;
+  } catch {
+    return DEFAULT_ROUTING_TABLE;
+  }
+});
+
+export function clearRoutingTableCache(): void {
+  routingTableCache.clear();
+}
+
+export function getRoutingTable(env: RoutingTableEnv): Promise<RoutingTable> {
+  return routingTableCache.get(env).catch((error: unknown) => {
+    console.warn(
+      JSON.stringify({ event: 'auto_routing_table_read_failed', ...formatError(error) })
+    );
+    return DEFAULT_ROUTING_TABLE;
+  });
+}

From bd83fdc65ee6a797cef41b40120647193733a861 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:10:30 +0200
Subject: [PATCH 04/73] feat(auto-routing): return routing decisions from
 /decide

---
 .../src/routing-table.ts                      |  1 -
 services/auto-routing/src/decide.ts           | 51 ++++++++++++++-----
 .../auto-routing/src/decision-engine.test.ts  | 46 ++++++++++++++---
 services/auto-routing/src/index.test.ts       | 16 +++++-
 services/auto-routing/src/routing-table.ts    | 32 ++++++++++--
 5 files changed, 119 insertions(+), 27 deletions(-)

diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index acb892cbd8..82ca7e7dfc 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -46,4 +46,3 @@ export function rankCandidates(
     return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd;
   });
 }
-
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index 4303192f60..c976d9d6e9 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -1,5 +1,6 @@
 import { MirrorPayloadSchema } from '@kilocode/auto-routing-contracts';
 import type {
+  AutoRoutingDecision,
   AutoRoutingDecisionResponse,
   MirrorPayload,
   NormalizedClassifierInput,
@@ -18,8 +19,10 @@ import {
 } from './conversation-identity';
 import type { ContentHashes } from './conversation-identity';
 import { getCachedClassification, putCachedClassification } from './decision-cache';
+import { computeDecision } from './decision-engine';
 import { ClassifierRunError, classifyNormalizedInput } from './model-classifier';
 import type { ClassifierRunResult } from './model-classifier';
+import { getRoutingTable } from './routing-table';
 import type { HonoEnv } from './hono-env';
 
 // Isolate-scoped request counter, used to correlate latency with isolate
@@ -29,11 +32,12 @@ let isolateRequestSeq = 0;
 function decisionResponse(
   cost: number,
   classification: ClassifierOutput,
-  normalized: NormalizedClassifierInput
+  normalized: NormalizedClassifierInput,
+  decision: AutoRoutingDecision | null
 ): AutoRoutingDecisionResponse {
   return {
     cost,
-    decision: null,
+    decision,
     classifierResult: { classification, normalized },
   };
 }
@@ -194,7 +198,8 @@ function recordDecision(
   env: Env,
   ctx: DecisionContext,
   durationMs: number,
-  outcome: DecisionOutcome
+  outcome: DecisionOutcome,
+  decision: AutoRoutingDecision | null = null
 ): void {
   const summary = summarizeOutcome(outcome);
 
@@ -243,6 +248,9 @@ function recordDecision(
       hasMachineId: ctx.payload.machineId !== null,
       mode: ctx.payload.mode,
       uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null,
+      decidedModel: decision?.model ?? null,
+      decidedTier: decision?.tier ?? null,
+      decisionSource: decision?.source ?? null,
       ...summary.details,
     })
   );
@@ -265,11 +273,12 @@ export const decideHandler: Handler<HonoEnv> = async c => {
 
   const payload = parsed.data;
   const startedAt = performance.now();
-  const [hashes, userIdHash, classifierModel, successSampleRate] = await Promise.all([
+  const [hashes, userIdHash, classifierModel, successSampleRate, routingTable] = await Promise.all([
     computeContentHashes(payload.input),
     hashIdentifierForTelemetry(payload.userId),
     getClassifierModel(c.env),
     getDecisionLogSampleRate(c.env),
+    getRoutingTable(c.env),
   ]);
   const ctx: DecisionContext = {
     payload,
@@ -288,12 +297,15 @@ export const decideHandler: Handler<HonoEnv> = async c => {
     classifierModel
   );
   if (cached) {
-    recordDecision(c.env, ctx, performance.now() - startedAt, {
-      kind: 'cache_hit',
-      classifierModel,
-      classification: cached,
-    });
-    return c.json(decisionResponse(0, cached, payload.input));
+    const decision = computeDecision(cached, payload.input.apiKind, routingTable);
+    recordDecision(
+      c.env,
+      ctx,
+      performance.now() - startedAt,
+      { kind: 'cache_hit', classifierModel, classification: cached },
+      decision
+    );
+    return c.json(decisionResponse(0, cached, payload.input, decision));
   }
 
   try {
@@ -311,10 +323,21 @@ export const decideHandler: Handler<HonoEnv> = async c => {
         )
       );
     }
-    recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'model', classifier });
-    // When routing decisions are implemented, include the prior decision for
-    // this session as an input alongside classifier output.
-    return c.json(decisionResponse(classifier.cost ?? 0, classifier.classification, payload.input));
+    const decision = computeDecision(
+      classifier.classification,
+      payload.input.apiKind,
+      routingTable
+    );
+    recordDecision(
+      c.env,
+      ctx,
+      performance.now() - startedAt,
+      { kind: 'model', classifier },
+      decision
+    );
+    return c.json(
+      decisionResponse(classifier.cost ?? 0, classifier.classification, payload.input, decision)
+    );
   } catch (error) {
     recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'error', error });
     // A failed run can still have billed the first attempt (e.g. a valid-but-
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index 1dc79c4572..16c36ed4f7 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -20,14 +20,38 @@ const table: RoutingTable = {
   source: 'benchmark',
   tiers: {
     low: [
-      { model: 'cheap/messages-only', accuracy: 0.9, avgCostUsd: 0.001, meetsThreshold: true, supportedApiKinds: ['messages'] },
-      { model: 'cheap/chat', accuracy: 0.85, avgCostUsd: 0.002, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+      {
+        model: 'cheap/messages-only',
+        accuracy: 0.9,
+        avgCostUsd: 0.001,
+        meetsThreshold: true,
+        supportedApiKinds: ['messages'],
+      },
+      {
+        model: 'cheap/chat',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+      },
     ],
     medium: [
-      { model: 'mid/chat', accuracy: 0.8, avgCostUsd: 0.01, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages'] },
+      {
+        model: 'mid/chat',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages'],
+      },
     ],
     high: [
-      { model: 'big/chat', accuracy: 0.9, avgCostUsd: 0.1, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+      {
+        model: 'big/chat',
+        accuracy: 0.9,
+        avgCostUsd: 0.1,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+      },
     ],
   },
 };
@@ -35,10 +59,20 @@ const table: RoutingTable = {
 describe('computeDecision', () => {
   it('picks the first candidate supporting the request api kind', () => {
     const decision = computeDecision(classification, 'chat_completions', table);
-    expect(decision).toEqual({ model: 'cheap/chat', tier: 'low', source: 'benchmark', tableVersion: 'run-1' });
+    expect(decision).toEqual({
+      model: 'cheap/chat',
+      tier: 'low',
+      source: 'benchmark',
+      tableVersion: 'run-1',
+    });
   });
   it('uses the tier derived from the classification', () => {
-    const hard: ClassifierOutput = { ...classification, reasoningComplexity: 'high', contextComplexity: 'large', executionMode: 'multi_step_project' };
+    const hard: ClassifierOutput = {
+      ...classification,
+      reasoningComplexity: 'high',
+      contextComplexity: 'large',
+      executionMode: 'multi_step_project',
+    };
     expect(computeDecision(hard, 'chat_completions', table)?.model).toBe('big/chat');
   });
   it('returns null when no candidate supports the api kind', () => {
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 89b9ba675c..d8a3991117 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -1,5 +1,6 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import { clearClassifierConfigCache } from './classifier-config';
+import { clearRoutingTableCache } from './routing-table';
 import { app } from './index';
 import { ClassifierRunError } from './model-classifier';
 import type * as ModelClassifierModule from './model-classifier';
@@ -117,6 +118,7 @@ function decideRequest(payload: unknown) {
 describe('auto routing worker', () => {
   beforeEach(() => {
     clearClassifierConfigCache();
+    clearRoutingTableCache();
     classifyNormalizedInput.mockReset();
     classifyNormalizedInput.mockResolvedValue(mockClassifierResult);
     writeDataPoint.mockReset();
@@ -158,7 +160,12 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       cost: 0.00000123,
-      decision: null,
+      decision: {
+        model: expect.any(String),
+        tier: expect.stringMatching(/^(low|medium|high)$/),
+        source: 'default',
+        tableVersion: 'default',
+      },
       classifierResult: {
         classification: mockClassification,
         normalized: normalizedInput,
@@ -215,7 +222,12 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toMatchObject({
       cost: 0,
-      decision: null,
+      decision: {
+        model: expect.any(String),
+        tier: expect.stringMatching(/^(low|medium|high)$/),
+        source: 'default',
+        tableVersion: 'default',
+      },
       classifierResult: { classification: mockClassification },
     });
     expect(cacheIdFromName).toHaveBeenCalledWith('user:user-1:task:task-123');
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
index 7293cebebe..524f4d526f 100644
--- a/services/auto-routing/src/routing-table.ts
+++ b/services/auto-routing/src/routing-table.ts
@@ -16,14 +16,38 @@ export const DEFAULT_ROUTING_TABLE: RoutingTable = {
   source: 'default',
   tiers: {
     low: [
-      { model: 'google/gemini-2.5-flash', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
+      {
+        model: 'google/gemini-2.5-flash',
+        accuracy: 1,
+        avgCostUsd: 0,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+      },
     ],
     medium: [
-      { model: 'qwen/qwen3.7-plus', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] },
-      { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] },
+      {
+        model: 'qwen/qwen3.7-plus',
+        accuracy: 1,
+        avgCostUsd: 0,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+      },
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 1,
+        avgCostUsd: 0,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+      },
     ],
     high: [
-      { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] },
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 1,
+        avgCostUsd: 0,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+      },
     ],
   },
 };

From 9621d62036c8121803271dc83e185d8d4f6ce548 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:13:06 +0200
Subject: [PATCH 05/73] fix(auto-routing): log unparseable routing table JSON
 before falling back

---
 services/auto-routing/src/index.test.ts    | 3 +++
 services/auto-routing/src/routing-table.ts | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index d8a3991117..5bc7a08146 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -123,6 +123,9 @@ describe('auto routing worker', () => {
     classifyNormalizedInput.mockResolvedValue(mockClassifierResult);
     writeDataPoint.mockReset();
     configGet.mockReset();
+    // Real KV returns null for missing keys; an undefined here would send the
+    // routing-table loader down the JSON.parse-throw path instead.
+    configGet.mockResolvedValue(null);
     configPut.mockReset();
     analyticsTokenGet.mockReset();
     analyticsTokenGet.mockResolvedValue('analytics-token');
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
index 524f4d526f..aa2baccce4 100644
--- a/services/auto-routing/src/routing-table.ts
+++ b/services/auto-routing/src/routing-table.ts
@@ -71,7 +71,8 @@ const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: Rout
       return DEFAULT_ROUTING_TABLE;
     }
     return parsed.data;
-  } catch {
+  } catch (error) {
+    console.warn(JSON.stringify({ event: 'auto_routing_table_invalid', ...formatError(error) }));
     return DEFAULT_ROUTING_TABLE;
   }
 });

From 7af1b6dd37af81695c3a55b3a16af0d8e3756c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:18:32 +0200
Subject: [PATCH 06/73] feat(auto-routing-benchmark): scaffold benchmark worker
 with D1 schema

---
 pnpm-lock.yaml                                |  37 +++
 .../migrations/0001_init.sql                  |  49 ++++
 services/auto-routing-benchmark/package.json  |  29 ++
 services/auto-routing-benchmark/src/auth.ts   |   6 +
 .../auto-routing-benchmark/src/db.test.ts     | 139 +++++++++
 services/auto-routing-benchmark/src/db.ts     | 264 ++++++++++++++++++
 .../auto-routing-benchmark/src/hono-env.ts    |   1 +
 services/auto-routing-benchmark/src/index.ts  |  25 ++
 .../auto-routing-benchmark/src/openrouter.ts  |  26 ++
 .../auto-routing-benchmark/src/ttl-cache.ts   |  35 +++
 services/auto-routing-benchmark/tsconfig.json |  16 ++
 .../auto-routing-benchmark/vitest.config.ts   |   9 +
 .../worker-configuration.d.ts                 |  16 ++
 .../auto-routing-benchmark/wrangler.jsonc     |  54 ++++
 14 files changed, 706 insertions(+)
 create mode 100644 services/auto-routing-benchmark/migrations/0001_init.sql
 create mode 100644 services/auto-routing-benchmark/package.json
 create mode 100644 services/auto-routing-benchmark/src/auth.ts
 create mode 100644 services/auto-routing-benchmark/src/db.test.ts
 create mode 100644 services/auto-routing-benchmark/src/db.ts
 create mode 100644 services/auto-routing-benchmark/src/hono-env.ts
 create mode 100644 services/auto-routing-benchmark/src/index.ts
 create mode 100644 services/auto-routing-benchmark/src/openrouter.ts
 create mode 100644 services/auto-routing-benchmark/src/ttl-cache.ts
 create mode 100644 services/auto-routing-benchmark/tsconfig.json
 create mode 100644 services/auto-routing-benchmark/vitest.config.ts
 create mode 100644 services/auto-routing-benchmark/worker-configuration.d.ts
 create mode 100644 services/auto-routing-benchmark/wrangler.jsonc

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7677b03452..0c48fc8fe1 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1510,6 +1510,43 @@ importers:
         specifier: 'catalog:'
         version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
 
+  services/auto-routing-benchmark:
+    dependencies:
+      '@kilocode/auto-routing-contracts':
+        specifier: workspace:*
+        version: link:../../packages/auto-routing-contracts
+      '@kilocode/worker-utils':
+        specifier: workspace:*
+        version: link:../../packages/worker-utils
+      '@openrouter/sdk':
+        specifier: ^0.12.79
+        version: 0.12.79
+      hono:
+        specifier: 4.12.18
+        version: 4.12.18
+      zod:
+        specifier: 'catalog:'
+        version: 4.4.3
+    devDependencies:
+      '@cloudflare/workers-types':
+        specifier: 'catalog:'
+        version: 4.20260605.1
+      '@types/node':
+        specifier: 'catalog:'
+        version: 24.12.4
+      '@typescript/native-preview':
+        specifier: 'catalog:'
+        version: 7.0.0-dev.20260514.1
+      typescript:
+        specifier: 'catalog:'
+        version: 5.9.3
+      vitest:
+        specifier: 'catalog:'
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      wrangler:
+        specifier: 'catalog:'
+        version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
+
   services/auto-triage-infra:
     dependencies:
       '@kilocode/worker-utils':
diff --git a/services/auto-routing-benchmark/migrations/0001_init.sql b/services/auto-routing-benchmark/migrations/0001_init.sql
new file mode 100644
index 0000000000..6452dcfd1b
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/0001_init.sql
@@ -0,0 +1,49 @@
+CREATE TABLE benchmark_runs ( -- one row per benchmark run
+  id TEXT PRIMARY KEY,
+  kind TEXT NOT NULL CHECK (kind IN ('classifier', 'decider')),
+  status TEXT NOT NULL CHECK (status IN ('running', 'completed', 'failed')),
+  started_at TEXT NOT NULL,
+  completed_at TEXT, -- nullable: not supplied when a run is inserted
+  config_json TEXT NOT NULL,
+  error TEXT
+);
+
+CREATE TABLE case_results ( -- one row per (run, model, case) result
+  run_id TEXT NOT NULL REFERENCES benchmark_runs(id),
+  model TEXT NOT NULL,
+  case_id TEXT NOT NULL,
+  tier TEXT,
+  score REAL NOT NULL,
+  latency_ms INTEGER NOT NULL,
+  cost_usd REAL,
+  detail_json TEXT,
+  error TEXT,
+  PRIMARY KEY (run_id, model, case_id)
+);
+CREATE INDEX idx_case_results_run ON case_results (run_id); -- NOTE(review): run_id already leads the primary key, this index may be redundant
+
+CREATE TABLE model_summaries ( -- per-run aggregate for each (model, tier) pair
+  run_id TEXT NOT NULL REFERENCES benchmark_runs(id),
+  model TEXT NOT NULL,
+  tier TEXT NOT NULL,
+  accuracy REAL NOT NULL,
+  avg_cost_usd REAL,
+  avg_latency_ms REAL NOT NULL,
+  p50_latency_ms REAL,
+  cases INTEGER NOT NULL,
+  errors INTEGER NOT NULL,
+  PRIMARY KEY (run_id, model, tier)
+);
+
+CREATE TABLE routing_tables ( -- at most one stored routing table per run
+  run_id TEXT PRIMARY KEY REFERENCES benchmark_runs(id),
+  published_at TEXT NOT NULL,
+  table_json TEXT NOT NULL
+);
+
+CREATE TABLE benchmark_config ( -- single-row table
+  id INTEGER PRIMARY KEY CHECK (id = 1), -- the CHECK only admits id 1
+  config_json TEXT NOT NULL,
+  updated_at TEXT NOT NULL,
+  updated_by TEXT
+);
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
new file mode 100644
index 0000000000..ba51b15107
--- /dev/null
+++ b/services/auto-routing-benchmark/package.json
@@ -0,0 +1,29 @@
+{
+  "name": "auto-routing-benchmark",
+  "version": "1.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "deploy": "wrangler deploy",
+    "dev": "wrangler dev",
+    "types": "wrangler types --include-runtime=false",
+    "typecheck": "tsgo --noEmit",
+    "lint": "pnpm -w exec oxlint --config .oxlintrc.json services/auto-routing-benchmark/src",
+    "test": "vitest run"
+  },
+  "dependencies": {
+    "@kilocode/auto-routing-contracts": "workspace:*",
+    "@kilocode/worker-utils": "workspace:*",
+    "@openrouter/sdk": "^0.12.79",
+    "hono": "catalog:",
+    "zod": "catalog:"
+  },
+  "devDependencies": {
+    "@cloudflare/workers-types": "catalog:",
+    "@types/node": "catalog:",
+    "@typescript/native-preview": "catalog:",
+    "typescript": "catalog:",
+    "vitest": "catalog:",
+    "wrangler": "catalog:"
+  }
+}
diff --git a/services/auto-routing-benchmark/src/auth.ts b/services/auto-routing-benchmark/src/auth.ts
new file mode 100644
index 0000000000..62d86cfe71
--- /dev/null
+++ b/services/auto-routing-benchmark/src/auth.ts
@@ -0,0 +1,6 @@
+import { backendAuthMiddleware } from '@kilocode/worker-utils';
+import type { HonoEnv } from './hono-env';
+
+export const authMiddleware = backendAuthMiddleware<HonoEnv>(c =>
+  c.env.INTERNAL_API_SECRET_PROD.get() // secret read from the INTERNAL_API_SECRET_PROD binding
+);
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
new file mode 100644
index 0000000000..163786350b
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -0,0 +1,139 @@
+import { describe, it, expect } from 'vitest';
+import { mapSummaryRow, mapRunRow } from './db';
+import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+
+describe('mapSummaryRow', () => {
+  it('maps snake_case columns to camelCase BenchmarkModelSummary', () => {
+    const row = {
+      run_id: 'run-1', // present on the row, absent from the expected result below
+      model: 'openai/gpt-4o',
+      tier: 'high',
+      accuracy: 0.92,
+      avg_cost_usd: 0.0015,
+      avg_latency_ms: 320.5,
+      p50_latency_ms: 300.0,
+      cases: 50,
+      errors: 2,
+    };
+    const result = mapSummaryRow(row);
+    expect(result).toEqual<BenchmarkModelSummary>({
+      model: 'openai/gpt-4o',
+      tier: 'high',
+      accuracy: 0.92,
+      avgCostUsd: 0.0015,
+      avgLatencyMs: 320.5,
+      p50LatencyMs: 300.0,
+      cases: 50,
+      errors: 2,
+    });
+  });
+
+  it('handles null avg_cost_usd and p50_latency_ms', () => {
+    const row = {
+      run_id: 'run-2',
+      model: 'anthropic/claude-3-haiku',
+      tier: '*', // expected to pass through unchanged
+      accuracy: 0.85,
+      avg_cost_usd: null,
+      avg_latency_ms: 150.0,
+      p50_latency_ms: null,
+      cases: 30,
+      errors: 0,
+    };
+    const result = mapSummaryRow(row);
+    expect(result.avgCostUsd).toBeNull(); // nulls are preserved, not defaulted
+    expect(result.p50LatencyMs).toBeNull();
+    expect(result.tier).toBe('*');
+    expect(result.errors).toBe(0);
+  });
+});
+
+describe('mapRunRow', () => {
+  it('maps a RunRow and attaches its summaries', () => {
+    const runRow = {
+      id: 'run-abc',
+      kind: 'classifier' as const,
+      status: 'completed' as const,
+      started_at: '2026-06-10T04:10:00.000Z',
+      completed_at: '2026-06-10T04:25:00.000Z',
+      config_json: '{}', // not asserted on: this test never reads it from the result
+      error: null,
+    };
+    const summaries: BenchmarkModelSummary[] = [
+      {
+        model: 'openai/gpt-4o-mini',
+        tier: '*',
+        accuracy: 0.78,
+        avgCostUsd: 0.0002,
+        avgLatencyMs: 120,
+        p50LatencyMs: 110,
+        cases: 100,
+        errors: 5,
+      },
+    ];
+    const result = mapRunRow(runRow, summaries);
+    expect(result.id).toBe('run-abc');
+    expect(result.kind).toBe('classifier');
+    expect(result.status).toBe('completed');
+    expect(result.startedAt).toBe('2026-06-10T04:10:00.000Z');
+    expect(result.completedAt).toBe('2026-06-10T04:25:00.000Z');
+    expect(result.error).toBeNull();
+    expect(result.summaries).toHaveLength(1);
+    expect(result.summaries[0].model).toBe('openai/gpt-4o-mini');
+  });
+
+  it('attaches an empty summaries array when none are provided', () => {
+    const runRow = {
+      id: 'run-xyz',
+      kind: 'decider' as const,
+      status: 'running' as const,
+      started_at: '2026-06-11T05:10:00.000Z',
+      completed_at: null, // a run that has not finished yet
+      config_json: '{}',
+      error: null,
+    };
+    const result = mapRunRow(runRow, []);
+    expect(result.summaries).toEqual([]);
+    expect(result.completedAt).toBeNull();
+  });
+
+  it('summaries are attached to the correct run (not mixed up)', () => {
+    const runRow1 = {
+      id: 'run-1',
+      kind: 'classifier' as const,
+      status: 'completed' as const,
+      started_at: '2026-06-01T04:10:00.000Z',
+      completed_at: '2026-06-01T04:20:00.000Z',
+      config_json: '{}',
+      error: null,
+    };
+    const runRow2 = {
+      id: 'run-2',
+      kind: 'decider' as const,
+      status: 'failed' as const,
+      started_at: '2026-06-02T05:10:00.000Z',
+      completed_at: null,
+      config_json: '{}',
+      error: 'timed out',
+    };
+    const summariesForRun1: BenchmarkModelSummary[] = [
+      {
+        model: 'model-a',
+        tier: '*',
+        accuracy: 0.9,
+        avgCostUsd: null,
+        avgLatencyMs: 200,
+        p50LatencyMs: null,
+        cases: 10,
+        errors: 1,
+      },
+    ];
+    const result1 = mapRunRow(runRow1, summariesForRun1); // each call receives its own array
+    const result2 = mapRunRow(runRow2, []); // NOTE(review): covers mapRunRow only, not listRuns' grouping by run_id
+
+    expect(result1.summaries).toHaveLength(1);
+    expect(result1.summaries[0].model).toBe('model-a');
+    expect(result2.summaries).toHaveLength(0);
+    expect(result2.error).toBe('timed out');
+  });
+});
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
new file mode 100644
index 0000000000..27a817006f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -0,0 +1,264 @@
+import type {
+  BenchmarkKind,
+  BenchmarkModelSummary,
+  BenchmarkRun,
+} from '@kilocode/auto-routing-contracts';
+
+export type CaseResultRow = {
+  run_id: string; // fields mirror the case_results columns, snake_case included
+  model: string;
+  case_id: string;
+  tier: string | null; // nullable column
+  score: number;
+  latency_ms: number;
+  cost_usd: number | null; // nullable column
+  detail_json: string | null; // nullable column
+  error: string | null; // nullable column
+};
+
+export type RunRow = {
+  id: string; // fields mirror the benchmark_runs columns
+  kind: BenchmarkKind;
+  status: 'running' | 'completed' | 'failed'; // same three values as the column's CHECK constraint
+  started_at: string;
+  completed_at: string | null; // nullable column
+  config_json: string;
+  error: string | null; // nullable column
+};
+
+type ModelSummaryRow = {
+  run_id: string; // fields mirror the model_summaries columns; type is module-private
+  model: string;
+  tier: string; // plain text here; mapSummaryRow casts it to BenchmarkModelSummary['tier']
+  accuracy: number;
+  avg_cost_usd: number | null; // nullable column
+  avg_latency_ms: number;
+  p50_latency_ms: number | null; // nullable column
+  cases: number;
+  errors: number;
+};
+
+export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
+  return {
+    model: row.model, // run_id is intentionally not copied into the result
+    tier: row.tier as BenchmarkModelSummary['tier'], // unchecked cast: the stored text is not validated here
+    accuracy: row.accuracy,
+    avgCostUsd: row.avg_cost_usd, // null is passed through as-is
+    avgLatencyMs: row.avg_latency_ms,
+    p50LatencyMs: row.p50_latency_ms, // null is passed through as-is
+    cases: row.cases,
+    errors: row.errors,
+  };
+}
+
+export function mapRunRow(row: RunRow, summaries: BenchmarkModelSummary[]): BenchmarkRun {
+  return {
+    id: row.id, // config_json is the one RunRow field not copied into the result
+    kind: row.kind,
+    status: row.status,
+    startedAt: row.started_at,
+    completedAt: row.completed_at,
+    error: row.error,
+    summaries, // the caller's array is attached by reference, not copied
+  };
+}
+
+export async function insertRun(
+  db: D1Database,
+  run: { id: string; kind: BenchmarkKind; startedAt: string; configJson: string }
+): Promise<void> {
+  await db
+    .prepare(
+      `INSERT INTO benchmark_runs (id, kind, status, started_at, config_json)
+       VALUES (?1, ?2, 'running', ?3, ?4)`
+    )
+    .bind(run.id, run.kind, run.startedAt, run.configJson) // argument order matches ?1..?4
+    .run(); // new runs always start as 'running'; completed_at and error are not set
+}
+
+export async function getRun(db: D1Database, runId: string): Promise<RunRow | null> {
+  const row = await db
+    .prepare('SELECT * FROM benchmark_runs WHERE id = ?1')
+    .bind(runId)
+    .first<RunRow>(); // id is the primary key, so at most one row can match
+  return row ?? null; // a missing run is reported as null
+}
+
+export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Promise<void> {
+  await db
+    .prepare(
+      `INSERT OR REPLACE INTO case_results
+       (run_id, model, case_id, tier, score, latency_ms, cost_usd, detail_json, error)
+       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)`
+    )
+    .bind(
+      row.run_id, // ?1..?3 form the table's primary key
+      row.model,
+      row.case_id,
+      row.tier,
+      row.score,
+      row.latency_ms,
+      row.cost_usd,
+      row.detail_json,
+      row.error
+    )
+    .run(); // OR REPLACE: a row with the same (run_id, model, case_id) is overwritten
+}
+
+export async function countCaseResults(db: D1Database, runId: string): Promise<number> {
+  // Counts the case_results rows stored for runId. A missing row or a missing
+  // count falls back to 0.
+  const countSql = 'SELECT COUNT(*) AS n FROM case_results WHERE run_id = ?1';
+  const counted = await db.prepare(countSql).bind(runId).first<{ n: number }>();
+  return counted?.n ?? 0;
+}
+
+export async function getCaseResults(db: D1Database, runId: string): Promise<CaseResultRow[]> {
+  // Loads every stored case result for runId as raw rows. The query has no
+  // ORDER BY, so callers should not rely on the row order.
+  const statement = db.prepare('SELECT * FROM case_results WHERE run_id = ?1').bind(runId);
+  const queryResult = await statement.all<CaseResultRow>();
+  return queryResult.results;
+}
+
+export async function replaceModelSummaries(
+  db: D1Database,
+  runId: string,
+  summaries: BenchmarkModelSummary[]
+): Promise<void> {
+  const statements = [
+    db.prepare('DELETE FROM model_summaries WHERE run_id = ?1').bind(runId), // drop the run's old summaries first
+    ...summaries.map(s =>
+      db
+        .prepare(
+          `INSERT INTO model_summaries
+           (run_id, model, tier, accuracy, avg_cost_usd, avg_latency_ms, p50_latency_ms, cases, errors)
+           VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)`
+        )
+        .bind(
+          runId, // the run id comes from the argument, not from the summary object
+          s.model,
+          s.tier,
+          s.accuracy,
+          s.avgCostUsd,
+          s.avgLatencyMs,
+          s.p50LatencyMs,
+          s.cases,
+          s.errors
+        )
+    ),
+  ];
+  await db.batch(statements); // delete + inserts go out as one batch; NOTE(review): D1 documents batch() as transactional - confirm
+}
+
+export async function getSummaries(
+  db: D1Database,
+  runId: string
+): Promise<BenchmarkModelSummary[]> {
+  // Reads the run's model_summaries rows and converts each one with mapSummaryRow.
+  const statement = db.prepare('SELECT * FROM model_summaries WHERE run_id = ?1').bind(runId);
+  const queryResult = await statement.all<ModelSummaryRow>();
+  const summaryRows = queryResult.results;
+  return summaryRows.map(summaryRow => mapSummaryRow(summaryRow));
+}
+
+export async function listRuns(db: D1Database, limit: number): Promise<BenchmarkRun[]> {
+  // Newest `limit` runs by started_at, each with its model summaries attached.
+  const recentRuns = await db
+    .prepare('SELECT * FROM benchmark_runs ORDER BY started_at DESC LIMIT ?1')
+    .bind(limit)
+    .all<RunRow>();
+  const runRows = recentRuns.results;
+  if (runRows.length === 0) {
+    // Nothing to list; this also avoids building an empty IN () clause below.
+    return [];
+  }
+
+  // One placeholder per run id. NOTE(review): D1 caps bound parameters per query - confirm limit.
+  const runIds = runRows.map(runRow => runRow.id);
+  const placeholders = runIds.map((_, index) => `?${index + 1}`).join(', ');
+  const summaryQuery = await db
+    .prepare(`SELECT * FROM model_summaries WHERE run_id IN (${placeholders})`)
+    .bind(...runIds)
+    .all<ModelSummaryRow>();
+
+  const summariesByRunId = new Map<string, BenchmarkModelSummary[]>();
+  for (const summaryRow of summaryQuery.results) {
+    const bucket = summariesByRunId.get(summaryRow.run_id) ?? [];
+    bucket.push(mapSummaryRow(summaryRow));
+    summariesByRunId.set(summaryRow.run_id, bucket);
+  }
+  return runRows.map(runRow => mapRunRow(runRow, summariesByRunId.get(runRow.id) ?? []));
+}
+
+export async function markRunCompleted(db: D1Database, runId: string): Promise<void> {
+  await db
+    .prepare(
+      `UPDATE benchmark_runs SET status = 'completed', completed_at = ?2
+       WHERE id = ?1 AND status = 'running'`
+    )
+    .bind(runId, new Date().toISOString()) // completed_at is the current time as an ISO-8601 UTC string
+    .run(); // matches nothing unless the run is still 'running', so failed runs are not overwritten
+}
+
+export async function markStaleRunsFailed(
+  db: D1Database,
+  olderThanIso: string // cutoff; runs whose started_at sorts before it are failed
+): Promise<number> {
+  const result = await db
+    .prepare(
+      `UPDATE benchmark_runs SET status = 'failed', error = 'timed out'
+       WHERE status = 'running' AND started_at < ?1`
+    )
+    .bind(olderThanIso) // started_at is a TEXT column, so this is a string comparison
+    .run(); // NOTE(review): completed_at is left untouched for timed-out runs - confirm intended
+  return result.meta.changes; // change count reported by D1 for the UPDATE
+}
+
+export async function saveRoutingTable(
+  db: D1Database,
+  runId: string,
+  publishedAt: string,
+  tableJson: string // stored verbatim in table_json; not parsed or validated here
+): Promise<void> {
+  await db
+    .prepare(
+      `INSERT OR REPLACE INTO routing_tables (run_id, published_at, table_json)
+       VALUES (?1, ?2, ?3)`
+    )
+    .bind(runId, publishedAt, tableJson)
+    .run(); // OR REPLACE: a table already stored for this run_id is overwritten
+}
+
+export async function getLatestRoutingTable(
+  db: D1Database
+): Promise<{ run_id: string; published_at: string; table_json: string } | null> {
+  const row = await db
+    .prepare('SELECT * FROM routing_tables ORDER BY published_at DESC LIMIT 1')
+    .first<{ run_id: string; published_at: string; table_json: string }>(); // row with the greatest published_at
+  return row ?? null; // null when no routing table has been stored yet
+}
+
+export async function getConfigRow(
+  db: D1Database
+): Promise<{ config_json: string; updated_at: string; updated_by: string | null } | null> {
+  const row = await db
+    .prepare('SELECT config_json, updated_at, updated_by FROM benchmark_config WHERE id = 1')
+    .first<{ config_json: string; updated_at: string; updated_by: string | null }>(); // the config lives in the single id = 1 row
+  return row ?? null; // null when no config has been saved yet
+}
+
+export async function saveConfigRow(
+  db: D1Database,
+  configJson: string, // stored verbatim in config_json; not parsed or validated here
+  updatedAt: string,
+  updatedBy: string | null
+): Promise<void> {
+  await db
+    .prepare(
+      `INSERT OR REPLACE INTO benchmark_config (id, config_json, updated_at, updated_by)
+       VALUES (1, ?1, ?2, ?3)`
+    )
+    .bind(configJson, updatedAt, updatedBy)
+    .run(); // id is fixed to 1, so every save replaces the single config row
+}
diff --git a/services/auto-routing-benchmark/src/hono-env.ts b/services/auto-routing-benchmark/src/hono-env.ts
new file mode 100644
index 0000000000..deb5b5bea3
--- /dev/null
+++ b/services/auto-routing-benchmark/src/hono-env.ts
@@ -0,0 +1 @@
+export type HonoEnv = { Bindings: Env }; // Hono env generic: types c.env as the worker's Env bindings
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
new file mode 100644
index 0000000000..feb6d991e2
--- /dev/null
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -0,0 +1,25 @@
+import { Hono } from 'hono';
+import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils';
+import { authMiddleware } from './auth';
+import type { HonoEnv } from './hono-env';
+
+export const app = new Hono<HonoEnv>();
+app.use('*', authMiddleware); // NOTE(review): registered for every path, so /health sits behind auth too - confirm
+app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' }));
+app.notFound(createNotFoundHandler()); // unmatched routes use the shared worker-utils handler
+app.onError(createErrorHandler()); // thrown errors use the shared worker-utils handler
+
+export default {
+  fetch: app.fetch, // HTTP requests are served by the Hono app
+  // Wired up in later tasks (run orchestration + admin endpoints).
+  async scheduled(
+    _controller: ScheduledController,
+    _env: Env,
+    _ctx: ExecutionContext
+  ): Promise<void> {}, // scheduled handler: currently a no-op
+  async queue(
+    _batch: MessageBatch<unknown>,
+    _env: Env,
+    _ctx: ExecutionContext
+  ): Promise<void> {}, // queue handler: currently a no-op, batches are not processed
+};
diff --git a/services/auto-routing-benchmark/src/openrouter.ts b/services/auto-routing-benchmark/src/openrouter.ts
new file mode 100644
index 0000000000..4d8608d6f5
--- /dev/null
+++ b/services/auto-routing-benchmark/src/openrouter.ts
@@ -0,0 +1,26 @@
+import { OpenRouter } from '@openrouter/sdk';
+import { ttlCached } from './ttl-cache';
+
+type OpenRouterEnv = Pick<Env, 'OPENROUTER_API_KEY'>; // only the one binding this module reads
+
+export const OPENROUTER_HTTP_REFERER = 'https://kilocode.ai'; // passed to the SDK client as httpReferer
+export const OPENROUTER_APP_TITLE = 'Kilo Code'; // passed to the SDK client as appTitle
+
+// Only the API key string is cached at module scope (plain value, not a
+// transport-owning SDK object), so each classification skips the
+// secrets-store read. The client itself is constructed per request; that is
+// just object setup around global fetch. The TTL keeps key rotations
+// effective within five minutes.
+const API_KEY_CACHE_TTL_MS = 300_000; // five minutes, in milliseconds
+
+const apiKeyCache = ttlCached(API_KEY_CACHE_TTL_MS, (env: OpenRouterEnv) =>
+  env.OPENROUTER_API_KEY.get() // secrets-store read; skipped while a cached key is still fresh
+);
+
+export async function createOpenRouterClient(env: OpenRouterEnv): Promise<OpenRouter> {
+  return new OpenRouter({
+    apiKey: await apiKeyCache.get(env), // TTL-cached key; a failed secret read rejects this call
+    httpReferer: OPENROUTER_HTTP_REFERER,
+    appTitle: OPENROUTER_APP_TITLE,
+  }); // a new client object is built on every call; only the key string is cached
+}
diff --git a/services/auto-routing-benchmark/src/ttl-cache.ts b/services/auto-routing-benchmark/src/ttl-cache.ts
new file mode 100644
index 0000000000..f773b9c4fc
--- /dev/null
+++ b/services/auto-routing-benchmark/src/ttl-cache.ts
@@ -0,0 +1,35 @@
+// Isolate-local TTL memoization for per-request lookups that change rarely
+// (KV config, secrets-backed clients). Values are cached as promises so
+// concurrent callers share one load; rejected loads are evicted immediately
+// so a transient failure is not pinned for the TTL.
+export type TtlCache<TEnv, T> = {
+  get(env: TEnv): Promise<T>; // the cached value, loading it first when absent or expired
+  clear(): void; // forget the cached value so the next get() loads again
+};
+
+export function ttlCached<TEnv, T>(
+  ttlMs: number, // how long a load stays cached, in milliseconds
+  load: (env: TEnv) => Promise<T> // receives the env of whichever get() call triggers the load
+): TtlCache<TEnv, T> {
+  let cached: { promise: Promise<T>; expiresAt: number } | null = null; // single slot: not keyed by env
+
+  return {
+    get(env: TEnv): Promise<T> {
+      if (cached && cached.expiresAt > Date.now()) {
+        return cached.promise; // still fresh: callers share the same in-flight or settled promise
+      }
+      const promise = load(env);
+      const entry = { promise, expiresAt: Date.now() + ttlMs }; // the TTL counts from when the load starts
+      cached = entry; // stored before the load settles so concurrent callers reuse it
+      promise.catch(() => {
+        if (cached === entry) {
+          cached = null; // evict the failed load, but only if no newer entry replaced it
+        }
+      });
+      return promise; // the caller still observes the rejection; the catch above only evicts
+    },
+    clear(): void {
+      cached = null; // the next get() reloads regardless of the TTL
+    },
+  };
+}
diff --git a/services/auto-routing-benchmark/tsconfig.json b/services/auto-routing-benchmark/tsconfig.json
new file mode 100644
index 0000000000..4f765c05f6
--- /dev/null
+++ b/services/auto-routing-benchmark/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "esnext",
+    "lib": ["esnext"],
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "types": ["@types/node", "@cloudflare/workers-types", "./worker-configuration.d.ts"],
+    "esModuleInterop": true,
+    "resolveJsonModule": true,
+    "forceConsistentCasingInFileNames": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "noEmit": true
+  },
+  "include": ["worker-configuration.d.ts", "src/**/*.ts", "src/**/*.d.ts", "vitest.config.ts"]
+}
diff --git a/services/auto-routing-benchmark/vitest.config.ts b/services/auto-routing-benchmark/vitest.config.ts
new file mode 100644
index 0000000000..7dd13254e7
--- /dev/null
+++ b/services/auto-routing-benchmark/vitest.config.ts
@@ -0,0 +1,9 @@
+import { defineConfig } from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    globals: true, // describe/it/expect are available without importing them
+    environment: 'node', // NOTE(review): tests run in Node rather than a Workers runtime - confirm that is intended
+    include: ['src/**/*.test.ts'], // only test files under src/ are collected
+  },
+});
diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts
new file mode 100644
index 0000000000..5952f82e1b
--- /dev/null
+++ b/services/auto-routing-benchmark/worker-configuration.d.ts
@@ -0,0 +1,16 @@
+/* eslint-disable */
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 8d542fe6f931aa8df862b4b96f2474be)
+interface __BaseEnv_Env {
+	AUTO_ROUTING_CONFIG: KVNamespace;
+	BENCH_DB: D1Database;
+	BENCH_QUEUE: Queue;
+	INTERNAL_API_SECRET_PROD: SecretsStoreSecret;
+	OPENROUTER_API_KEY: SecretsStoreSecret;
+}
+declare namespace Cloudflare {
+	interface GlobalProps {
+		mainModule: typeof import("./src/index");
+	}
+	interface Env extends __BaseEnv_Env {}
+}
+interface Env extends __BaseEnv_Env {}
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
new file mode 100644
index 0000000000..5f3b67f6b5
--- /dev/null
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -0,0 +1,54 @@
+{
+  "$schema": "node_modules/wrangler/config-schema.json",
+  "account_id": "e115e769bcdd4c3d66af59d3332cb394",
+  "name": "auto-routing-benchmark",
+  "main": "src/index.ts",
+  "compatibility_date": "2026-05-15",
+  "compatibility_flags": ["nodejs_compat"],
+  "workers_dev": false,
+  "preview_urls": false,
+  "logpush": true,
+  "routes": [{ "pattern": "auto-routing-benchmark.kiloapps.io", "custom_domain": true }],
+  "dev": { "port": 8814, "local_protocol": "http", "ip": "0.0.0.0" },
+  "observability": { "enabled": true },
+  "triggers": {
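+    // 04:10 UTC daily: classifier benchmark. 05:10 UTC weekly (intended: Monday): decider benchmark. NOTE(review): Cloudflare cron numbers weekdays 1-7 starting at Sunday, so `1` likely fires on Sunday; confirm, and switch to `MON` (plus any cron-string match in the scheduled handler) if Monday is required.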
+    "crons": ["10 4 * * *", "10 5 * * 1"]
+  },
+  "d1_databases": [
+    {
+      "binding": "BENCH_DB",
+      "database_name": "auto-routing-benchmark",
+      "database_id": "92f2c88a-5ee6-4fd0-b118-75bd141b5cac",
+      "migrations_dir": "migrations"
+    }
+  ],
+  "queues": {
+    "producers": [{ "binding": "BENCH_QUEUE", "queue": "auto-routing-benchmark-jobs" }],
+    "consumers": [
+      {
+        "queue": "auto-routing-benchmark-jobs",
+        "max_batch_size": 1,
+        "max_retries": 2,
+        "max_concurrency": 4
+      }
+    ]
+  },
+  "kv_namespaces": [
+    // Shared with the auto-routing worker: the decider benchmark publishes
+    // the routing table here and auto-routing reads it on /decide.
+    { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" }
+  ],
+  "secrets_store_secrets": [
+    {
+      "binding": "INTERNAL_API_SECRET_PROD",
+      "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
+      "secret_name": "INTERNAL_API_SECRET_PROD"
+    },
+    {
+      "binding": "OPENROUTER_API_KEY",
+      "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
+      "secret_name": "OPENROUTER_API_KEY"
+    }
+  ]
+}

From 22de71333886423b858af5941364f280b7c64887 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:24:39 +0200
Subject: [PATCH 07/73] feat(auto-routing-benchmark): classifier golden dataset
 and grading

---
 .../src/datasets/classifier-cases.test.ts     |  51 ++
 .../src/datasets/classifier-cases.ts          | 610 ++++++++++++++++++
 .../src/grading.test.ts                       |  58 ++
 .../auto-routing-benchmark/src/grading.ts     |  31 +
 4 files changed, 750 insertions(+)
 create mode 100644 services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
 create mode 100644 services/auto-routing-benchmark/src/datasets/classifier-cases.ts
 create mode 100644 services/auto-routing-benchmark/src/grading.test.ts
 create mode 100644 services/auto-routing-benchmark/src/grading.ts

diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
new file mode 100644
index 0000000000..08523eaa4f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
@@ -0,0 +1,51 @@
+import { describe, expect, it } from 'vitest';
+import { NormalizedClassifierInputSchema } from '@kilocode/auto-routing-contracts';
+import { CLASSIFIER_CASES } from './classifier-cases';
+
+describe('CLASSIFIER_CASES', () => {
+  it('has exactly 36 cases', () => {
+    expect(CLASSIFIER_CASES.length).toBe(36);
+  });
+
+  it('has unique ids and valid inputs', () => {
+    const ids = new Set(CLASSIFIER_CASES.map(c => c.id));
+    expect(ids.size).toBe(CLASSIFIER_CASES.length);
+    for (const c of CLASSIFIER_CASES) {
+      const result = NormalizedClassifierInputSchema.safeParse(c.input);
+      expect(result.success, `case ${c.id}: ${JSON.stringify(result.error?.issues)}`).toBe(true);
+    }
+  });
+
+  it('covers every task type with exactly 6 cases', () => {
+    const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType);
+    for (const taskType of [
+      'implementation',
+      'debugging',
+      'refactoring',
+      'planning_design',
+      'investigation',
+      'agentic_execution',
+    ] as const) {
+      expect(byType.get(taskType)?.length ?? 0, taskType).toBe(6);
+    }
+  });
+
+  it('covers every reasoning complexity at least 8 times', () => {
+    for (const level of ['low', 'medium', 'high'] as const) {
+      expect(
+        CLASSIFIER_CASES.filter(c => c.expected.reasoningComplexity === level).length,
+        level
+      ).toBeGreaterThanOrEqual(8);
+    }
+  });
+
+  it('has at least one of each reasoning complexity within every task type', () => {
+    const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType);
+    for (const [taskType, cases] of byType) {
+      const levels = new Set(cases.map(c => c.expected.reasoningComplexity));
+      for (const level of ['low', 'medium', 'high'] as const) {
+        expect(levels.has(level), `${taskType} missing ${level}`).toBe(true);
+      }
+    }
+  });
+});
diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
new file mode 100644
index 0000000000..a857cd3169
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
@@ -0,0 +1,610 @@
+import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import type { ClassifierExpectation } from '../grading';
+
+export type ClassifierCase = {
+  id: string; // stable slug, e.g. 'impl-low-regex-helper'
+  input: NormalizedClassifierInput;
+  expected: ClassifierExpectation;
+};
+
+const AGENT_TOOLS_SYSTEM =
+  'You are Kilo Code, an AI coding assistant operating in an agentic loop with access to read_file, write_file, apply_diff, run_command and search_files tools. Work step by step and verify your changes.';
+const AGENT_PLAIN_SYSTEM =
+  'You are Kilo Code, an AI coding assistant. You help the user write and modify code in their workspace. Follow the user instructions precisely.';
+const CHAT_ASSISTANT_SYSTEM =
+  'You are a helpful senior software engineer. Answer the user clearly and concisely. Do not assume access to the user files unless they are pasted in the conversation.';
+
+const HINTS = { provider: null, providerOptions: null } as const;
+
+function chat(
+  systemPromptPrefix: string,
+  userPromptPrefix: string,
+  opts: {
+    messageCount: number;
+    hasTools: boolean;
+    latestUserPromptPrefix?: string | null;
+  }
+): NormalizedClassifierInput {
+  return {
+    apiKind: 'chat_completions',
+    requestedModel: 'kilo-auto/efficient',
+    systemPromptPrefix,
+    userPromptPrefix,
+    latestUserPromptPrefix: opts.latestUserPromptPrefix ?? null,
+    messageCount: opts.messageCount,
+    hasTools: opts.hasTools,
+    stream: true,
+    providerHints: HINTS,
+  };
+}
+
+export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
+  // ---------------------------------------------------------------------------
+  // implementation (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-low-regex-helper',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'Write a TypeScript helper function isValidSemver(version: string): boolean that returns true for valid semantic version strings like 1.2.3 and false otherwise. No external dependencies.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'impl-low-add-zod-schema',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add a Zod schema named PaginationParamsSchema to src/schemas/pagination.ts with optional page (positive int, default 1) and pageSize (positive int, max 100, default 20) fields, and export its inferred type.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-medium-rest-endpoint',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add a new GET /api/projects/:id/members endpoint to our Express router in src/routes/projects.ts. Reuse the existing requireAuth middleware and the ProjectService.getMembers method, and return 404 when the project does not exist.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-medium-react-hook',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a useDebouncedValue(value, delayMs) React hook in src/hooks and use it in the SearchBar component so the onSearch callback fires at most once every 300ms. Keep the existing controlled-input behavior.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-high-realtime-collab',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Build real-time collaborative editing for our document editor. We have a React frontend, a Node WebSocket gateway, and a Postgres store. Decide and implement a conflict-resolution strategy (OT vs CRDT), wire presence, persistence, and reconnection, and make it consistent across all three layers.',
+      { messageCount: 18, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-high-rate-limiter',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a distributed sliding-window rate limiter that works across our 4 API replicas backed by Redis. It must handle clock skew between nodes, degrade gracefully if Redis is unavailable, and expose per-tenant limits configured in src/config/limits.ts. Integrate it into the existing middleware chain.',
+      { messageCount: 16, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-low-typo-import',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      "Running the app throws \"TypeError: formatDate is not a function\" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.",
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-low-off-by-one',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'This pagination function returns one too few items on the last page. Here is the code: `return items.slice(page * size, page * size + size - 1)`. What is wrong and how do I fix it?',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'debug-medium-failing-test',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our test "UserService > createUser persists the hashed password" started failing after I changed the bcrypt cost factor. The assertion expects a 60-char hash but now gets undefined. Figure out whether the service or the test is wrong and fix it so the suite passes.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-medium-cors-error',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Browser requests to our /api/upload endpoint fail with "blocked by CORS policy: No Access-Control-Allow-Origin header". GET requests to other endpoints work fine. The cors middleware is configured in src/server.ts. Find why only upload is affected and fix it.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-high-race-condition',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our payment webhook handler intermittently double-charges customers under load. We use a Postgres advisory lock around the charge, but the duplicate rows have timestamps 2-3ms apart. The handler runs on 3 replicas behind a queue with at-least-once delivery. Investigate the root cause across the worker, queue consumer, and DB layers and fix it.',
+      { messageCount: 14, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-high-memory-leak',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our Node service RSS grows by ~50MB/hour in production and OOMs after a day, but it is stable locally. Heap snapshots show growing retained closures referencing our EventEmitter-based cache. It spans the cache module, the websocket session manager, and a third-party metrics client. Trace the leak across these and fix it.',
+      { messageCount: 22, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-low-rename-var',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'In src/cart.ts rename the variable `x` to `lineItemTotal` everywhere it is used in the calculateTotal function. No behavior change.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-low-extract-constant',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The magic number 86400 appears three times in src/scheduler.ts. Extract it into a named constant SECONDS_PER_DAY at the top of the file and use it in all three places. Keep behavior identical.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-medium-extract-service',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The OrderController in src/controllers/order.ts has grown to 400 lines and mixes HTTP handling with business logic. Extract the business logic into an OrderService class, keep the controller thin, and update the existing controller tests to match. Behavior must stay the same.',
+      { messageCount: 11, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-medium-promise-to-async',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Convert the .then()/.catch() promise chains in src/api/client.ts to async/await. There are about six methods. Preserve the existing error-handling semantics and return types exactly.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-high-modularize-monolith',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our monolithic src/app.ts wires routing, auth, database access, and background jobs in one 1200-line file with tangled circular imports. Restructure it into clear modules with one-directional dependencies, without changing any external behavior or public routes. Decide the boundaries and migrate incrementally.',
+      { messageCount: 26, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-high-orm-migration',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate our data layer from the legacy hand-written SQL query helpers spread across 30 files to Drizzle ORM, preserving every query result shape and transaction boundary. Plan the sequence so the app keeps passing tests at each step, then carry it out.',
+      { messageCount: 30, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-low-naming-choice',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'I have a function that both validates and saves a user. What is a good single name for it, or should I split it? Just give me a recommendation, no code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-low-folder-structure',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'For a small Express API with about 8 endpoints, what is a sensible folder structure for routes, controllers, and services? Just describe the layout, do not write code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-medium-caching-strategy',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We have a read-heavy product catalog API hitting Postgres directly. Walk me through the tradeoffs of adding Redis caching vs HTTP cache headers vs a materialized view, and recommend one for a team of three with moderate traffic. No implementation yet.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-medium-rollout-steps',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We want to add optimistic UI updates to our existing React + tRPC todo app. Break the work into an ordered implementation plan (state, mutation handling, rollback on error, tests). Just the plan, I will implement it.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-high-multitenant-architecture',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design a multi-tenant architecture for our B2B SaaS. We need tenant isolation, per-tenant data residency (EU vs US), noisy-neighbor protection, and a path to enterprise single-tenant deployments later. Compare schema-per-tenant, row-level, and database-per-tenant, and recommend an approach with its failure modes. Design only.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-high-event-driven-migration',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We run a synchronous request/response monolith and want to move order processing to an event-driven design with a message broker. Design the target architecture: event schema/versioning, idempotency, ordering guarantees, dead-letter handling, and how we cut over without downtime. Tradeoffs and a recommended broker, no code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-low-find-usage',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Where in the codebase is the function getFeatureFlags defined and which files import it? Just tell me, do not change anything.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-low-explain-function',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'Explain what this reducer does, step by step. It handles ADD_ITEM, REMOVE_ITEM, and CLEAR_CART actions. I just want to understand the logic.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'invest-medium-trace-auth-flow',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Explain how a login request flows through our app from the /auth/login route to the session cookie being set. Cover the controller, the AuthService, and the session middleware. I want to understand it before changing anything.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-medium-research-sdk',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Look up the current Stripe Node SDK and summarize how to verify a webhook signature and what the recommended way to handle idempotency keys is. I need to know the current recommended API before I write any code.',
+      { messageCount: 1, hasTools: true, latestUserPromptPrefix: null }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-high-perf-regression-analysis',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our checkout p95 latency doubled over the last two weeks but no single deploy stands out. Investigate across the API, the database query patterns, the cache hit rates, and the third-party payment calls, and tell me the most likely contributors ranked by evidence. Do not fix anything yet, just analyze.',
+      { messageCount: 20, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-high-understand-legacy-pipeline',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'We inherited an undocumented data pipeline spanning a cron service, three Lambda functions, an SQS queue, and a Redshift loader. Map out how data flows end to end, what each component assumes about the others, and where the implicit coupling and failure points are. Understanding only, no changes.',
+      { messageCount: 24, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution (2 low, 2 medium, 2 high)
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-low-run-tests',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run the test suite with `pnpm test` and tell me if it passes.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-low-check-git-status',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run git status and git log --oneline -5 and show me the output so I know what state this checkout is in.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-medium-start-dev-server',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Start the local dev environment with `pnpm dev`, wait for it to boot, then curl http://localhost:3000/health and report whether the service and its database connection are healthy.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-medium-docker-logs',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The api container keeps restarting. Run docker compose ps, then docker compose logs api --tail 100, identify which command in the logs is failing on boot, and report it back. Just diagnose via the commands, do not edit files.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-high-release-pipeline',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Cut a release: bump the version, run the full build and test suite, build and push the multi-arch Docker image to our registry, tag the git commit, and verify the staging deploy comes up healthy. Stop and report if any step fails.',
+      { messageCount: 28, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-high-recover-broken-env',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'My local environment is broken after a branch switch: migrations are out of sync, node_modules looks stale, and the worker will not start. Diagnose and recover it end to end by running the right commands in order, re-running checks after each fix, until pnpm dev comes up clean. Report what you changed.',
+      {
+        messageCount: 32,
+        hasTools: true,
+        latestUserPromptPrefix:
+          'Also clear the local cache before reinstalling, I think it is corrupt.',
+      }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+];
diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts
new file mode 100644
index 0000000000..7094da8d09
--- /dev/null
+++ b/services/auto-routing-benchmark/src/grading.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
+import {
+  CLASSIFIER_FIELD_WEIGHTS,
+  gradeClassifierOutput,
+  type ClassifierExpectation,
+} from './grading';
+
+const expected: ClassifierExpectation = {
+  taskType: 'implementation',
+  contextComplexity: 'small',
+  reasoningComplexity: 'low',
+  executionMode: 'answer_only',
+  requiresTools: false,
+};
+
+function actualFrom(overrides: Partial<ClassifierOutput>): ClassifierOutput {
+  return {
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    contextComplexity: 'small',
+    reasoningComplexity: 'low',
+    riskLevel: 'low',
+    executionMode: 'answer_only',
+    requiresTools: false,
+    confidence: 0.9,
+    ...overrides,
+  };
+}
+
+describe('gradeClassifierOutput', () => {
+  it('scores a full match as 1', () => {
+    expect(gradeClassifierOutput(expected, actualFrom({}))).toBe(1);
+  });
+
+  it('scores a taskType mismatch alone as 0.7', () => {
+    expect(gradeClassifierOutput(expected, actualFrom({ taskType: 'debugging' }))).toBe(0.7);
+  });
+
+  it('scores a requiresTools mismatch alone as 0.9', () => {
+    expect(gradeClassifierOutput(expected, actualFrom({ requiresTools: true }))).toBe(0.9);
+  });
+
+  it('ignores ungraded fields like subtaskType and riskLevel', () => {
+    expect(
+      gradeClassifierOutput(
+        expected,
+        actualFrom({ subtaskType: 'feature_development', riskLevel: 'high' })
+      )
+    ).toBe(1);
+  });
+});
+
+describe('CLASSIFIER_FIELD_WEIGHTS', () => {
+  it('sums to 1', () => {
+    expect(Object.values(CLASSIFIER_FIELD_WEIGHTS).reduce((a, b) => a + b, 0)).toBeCloseTo(1);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
new file mode 100644
index 0000000000..746e68a546
--- /dev/null
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -0,0 +1,31 @@
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
+
+// Golden labels cover only the axes the decision engine actually consumes.
+// subtaskType is deliberately left ungraded (its labels are highly ambiguous
+// and deriveDifficultyTier does not use it), and riskLevel likewise;
+// requiresTools is graded, but only with a small weight.
+export type ClassifierExpectation = Pick<
+  ClassifierOutput,
+  'taskType' | 'contextComplexity' | 'reasoningComplexity' | 'executionMode'
+> & {
+  requiresTools: boolean;
+};
+
+export const CLASSIFIER_FIELD_WEIGHTS: Record<keyof ClassifierExpectation, number> = {
+  taskType: 0.3,
+  reasoningComplexity: 0.25,
+  contextComplexity: 0.15,
+  executionMode: 0.2,
+  requiresTools: 0.1, // the five weights sum to 1, so a full match grades as 1
+};
+
+export function gradeClassifierOutput(
+  expected: ClassifierExpectation,
+  actual: ClassifierOutput
+): number {
+  let score = 0; // weight sum of exactly-matching fields; rounded to 4 decimals on return
+  for (const key of Object.keys(CLASSIFIER_FIELD_WEIGHTS) as (keyof ClassifierExpectation)[]) {
+    if (actual[key] === expected[key]) score += CLASSIFIER_FIELD_WEIGHTS[key];
+  }
+  return Number(score.toFixed(4));
+}

From 878e49b1c011ddc53854eff0585e2ac3d95ede1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:25:17 +0200
Subject: [PATCH 08/73] style(auto-routing-benchmark): apply oxfmt formatting

---
 .../auto-routing-benchmark/src/datasets/classifier-cases.ts | 2 +-
 services/auto-routing-benchmark/src/db.ts                   | 5 +----
 services/auto-routing-benchmark/src/index.ts                | 6 +-----
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
index a857cd3169..33baacaefb 100644
--- a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
@@ -140,7 +140,7 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     id: 'debug-low-typo-import',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      "Running the app throws \"TypeError: formatDate is not a function\" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.",
+      'Running the app throws "TypeError: formatDate is not a function" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.',
       { messageCount: 4, hasTools: true }
     ),
     expected: {
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 27a817006f..48130a90fd 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -201,10 +201,7 @@ export async function markRunCompleted(db: D1Database, runId: string): Promise<v
     .run();
 }
 
-export async function markStaleRunsFailed(
-  db: D1Database,
-  olderThanIso: string
-): Promise<number> {
+export async function markStaleRunsFailed(db: D1Database, olderThanIso: string): Promise<number> {
   const result = await db
     .prepare(
       `UPDATE benchmark_runs SET status = 'failed', error = 'timed out'
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index feb6d991e2..3c0a00cf8b 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -17,9 +17,5 @@ export default {
     _env: Env,
     _ctx: ExecutionContext
   ): Promise<void> {},
-  async queue(
-    _batch: MessageBatch<unknown>,
-    _env: Env,
-    _ctx: ExecutionContext
-  ): Promise<void> {},
+  async queue(_batch: MessageBatch<unknown>, _env: Env, _ctx: ExecutionContext): Promise<void> {},
 };

From 662717ce08faf6e571ec53fcf9c8a54cc178c74d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:31:54 +0200
Subject: [PATCH 09/73] feat(auto-routing-benchmark): decider golden dataset
 with deterministic checkers

---
 .../src/datasets/decider-cases.test.ts        |  64 ++++
 .../src/datasets/decider-cases.ts             | 328 ++++++++++++++++++
 .../src/grading.test.ts                       |  72 ++++
 .../auto-routing-benchmark/src/grading.ts     |  75 ++++
 4 files changed, 539 insertions(+)
 create mode 100644 services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
 create mode 100644 services/auto-routing-benchmark/src/datasets/decider-cases.ts

diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
new file mode 100644
index 0000000000..92a734700c
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -0,0 +1,64 @@
+import { describe, expect, it } from 'vitest';
+import { DECIDER_CASES } from './decider-cases';
+
+describe('DECIDER_CASES', () => {
+  it('has exactly 30 cases with unique ids', () => {
+    expect(DECIDER_CASES.length).toBe(30);
+    const ids = new Set(DECIDER_CASES.map(c => c.id));
+    expect(ids.size).toBe(DECIDER_CASES.length);
+  });
+
+  it('has exactly 10 cases per tier', () => {
+    for (const tier of ['low', 'medium', 'high'] as const) {
+      expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBe(10);
+    }
+  });
+
+  it('covers at least 4 distinct task types per tier', () => {
+    for (const tier of ['low', 'medium', 'high'] as const) {
+      const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType));
+      expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('has compilable regex patterns', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check;
+      if (check.kind === 'regex') {
+        expect(() => new RegExp(check.pattern, check.flags), c.id).not.toThrow();
+      }
+    }
+  });
+
+  it('has json_equal values that round-trip through JSON', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check;
+      if (check.kind === 'json_equal') {
+        expect(JSON.parse(JSON.stringify(check.value)), c.id).toEqual(check.value);
+      }
+    }
+  });
+
+  it('has generous maxTokens and nonempty prompts', () => {
+    for (const c of DECIDER_CASES) {
+      expect(c.maxTokens, c.id).toBeGreaterThanOrEqual(512);
+      expect(c.systemPrompt.length, c.id).toBeGreaterThan(0);
+      expect(c.userPrompt.length, c.id).toBeGreaterThan(0);
+    }
+  });
+
+  it('has nonempty exact and contains_all values', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check;
+      if (check.kind === 'exact') {
+        expect(check.value.length, c.id).toBeGreaterThan(0);
+      }
+      if (check.kind === 'contains_all') {
+        expect(check.values.length, c.id).toBeGreaterThan(0);
+        for (const v of check.values) {
+          expect(v.length, c.id).toBeGreaterThan(0);
+        }
+      }
+    }
+  });
+});
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
new file mode 100644
index 0000000000..561995d520
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -0,0 +1,328 @@
+import type { ClassifierTaskType, DifficultyTier } from '@kilocode/auto-routing-contracts';
+import type { DeciderCheck } from '../grading';
+
+export type DeciderCase = {
+  id: string; // unique across DECIDER_CASES (asserted in decider-cases.test.ts)
+  tier: DifficultyTier; // the dataset test expects exactly 10 cases per tier
+  taskType: ClassifierTaskType; // the dataset test expects >= 4 distinct task types per tier
+  systemPrompt: string;
+  userPrompt: string;
+  maxTokens: number; // the dataset test requires at least 512
+  check: DeciderCheck;
+};
+
+const CODE_SYS = 'You are a precise coding assistant. Answer with only what is asked, no explanations.';
+const SYS_SYS = 'You are a precise systems engineer. Answer with only what is asked, no explanations.';
+
+// Golden answers below were each worked through by hand. Every case has a
+// single unambiguous, mechanically-checkable answer. Checks tolerate
+// formatting noise (fences/case/whitespace) but never wrong values. For
+// json_equal cases the prompt pins the exact key set in the same order as the
+// expected value (the comparison is JSON.stringify-based and order-sensitive).
+export const DECIDER_CASES: readonly DeciderCase[] = [
+  // ---------------- LOW (mechanical lookups / trivial evaluation) ----------------
+  {
+    id: 'low-impl-array-pipeline',
+    tier: 'low',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst xs = [1, 2, 3, 4].filter(x => x % 2 === 0).map(x => x * 10);\nconsole.log(xs.join("-"));',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '20-40' },
+  },
+  {
+    id: 'low-impl-sort-numeric',
+    tier: 'low',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log([5, 3, 8, 1].sort((a, b) => a - b).join(","));',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '1,3,5,8' },
+  },
+  {
+    id: 'low-impl-string-upper',
+    tier: 'low',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log("hello".toUpperCase());',
+    maxTokens: 512,
+    check: { kind: 'regex', pattern: '^\\s*HELLO\\s*$', flags: 'm' }, // case is the answer
+  },
+  {
+    id: 'low-impl-ternary-parity',
+    tier: 'low',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");',
+    maxTokens: 512,
+    check: { kind: 'exact', value: 'odd' },
+  },
+  {
+    id: 'low-debug-compound-assign',
+    tier: 'low',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What is the final value printed? Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '30' },
+  },
+  {
+    id: 'low-debug-parseint-suffix',
+    tier: 'low',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with only the number.\n\nconsole.log(parseInt("42px", 10));',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '42' },
+  },
+  {
+    id: 'low-investigation-char-count',
+    tier: 'low',
+    taskType: 'investigation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many times does the letter "a" appear in the word "banana"? Answer with only the number.',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'low-investigation-object-keys',
+    tier: 'low',
+    taskType: 'investigation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'low-planning-http-created',
+    tier: 'low',
+    taskType: 'planning_design',
+    systemPrompt: 'You are a precise web API expert. Answer with only what is asked, no explanations.',
+    userPrompt:
+      'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '201' },
+  },
+  {
+    id: 'low-refactoring-reduce-sum',
+    tier: 'low',
+    taskType: 'refactoring',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop sums an array. What value does it produce? Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);',
+    maxTokens: 512,
+    check: { kind: 'exact', value: '12' },
+  },
+
+  // ---------------- MEDIUM (multi-step reasoning, off-by-one, spec application) -------------
+  {
+    id: 'medium-debug-off-by-one',
+    tier: 'medium',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line, exactly, preserving original spacing>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
+    maxTokens: 2048,
+    check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
+  },
+  {
+    id: 'medium-impl-reduce-trace',
+    tier: 'medium',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst r = [1, 2, 3, 4].reduce((acc, x) => acc + x * x, 0);\nconsole.log(r);',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '30' },
+  },
+  {
+    id: 'medium-impl-closure-counter',
+    tier: 'medium',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What is the final printed value? Answer with only the number.\n\nfunction make() {\n  let c = 0;\n  return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'medium-debug-async-order',
+    tier: 'medium',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this program print, in order? Answer with the four uppercase letters joined by commas, e.g. "A,B,C,D".\n\nconsole.log("A");\nPromise.resolve().then(() => console.log("B"));\nsetTimeout(() => console.log("C"), 0);\nconsole.log("D");',
+    maxTokens: 2048,
+    check: { kind: 'regex', pattern: '^\\s*A\\s*,\\s*D\\s*,\\s*B\\s*,\\s*C\\s*$', flags: 'im' },
+  },
+  {
+    id: 'medium-impl-map-set-dedup',
+    tier: 'medium',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What is the size of the resulting Set? Answer with only the number.\n\nconst s = new Set([1, 2, 2, 3, 3, 3, 4]);\nconsole.log(s.size);',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '4' },
+  },
+  {
+    id: 'medium-investigation-regex-groups',
+    tier: 'medium',
+    taskType: 'investigation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '06' },
+  },
+  {
+    id: 'medium-impl-recursion-fib',
+    tier: 'medium',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '13' },
+  },
+  {
+    id: 'medium-debug-mutation-shared-ref',
+    tier: 'medium',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '4' },
+  },
+  {
+    id: 'medium-planning-rate-limit-window',
+    tier: 'medium',
+    taskType: 'planning_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A fixed-window rate limiter allows 100 requests per 60-second window. A client sends 80 requests in the first 30 seconds of a window, then 40 more requests in the next 20 seconds (same window). How many of the 40 later requests are rejected? Answer with only the number.',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '20' },
+  },
+  {
+    id: 'medium-refactoring-equivalent-output',
+    tier: 'medium',
+    taskType: 'refactoring',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);',
+    maxTokens: 2048,
+    check: { kind: 'exact', value: '40' },
+  },
+
+  // ---------------- HIGH (deep multi-constraint reasoning, subtle semantics) -------------
+  {
+    id: 'high-investigation-queue-trace',
+    tier: 'high',
+    taskType: 'investigation',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms before inserting its row or acking. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": <number>, "winner": "<A|B|C>"}.',
+    maxTokens: 4096,
+    check: { kind: 'json_equal', value: { rows: 1, winner: 'B' } }, // B acks first (t=75ms)
+  },
+  {
+    id: 'high-debug-closure-loop-var',
+    tier: 'high',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    maxTokens: 4096,
+    check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' },
+  },
+  {
+    id: 'high-debug-closure-let-var',
+    tier: 'high',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    maxTokens: 4096,
+    check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' },
+  },
+  {
+    id: 'high-impl-this-binding',
+    tier: 'high',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst obj = {\n  v: 10,\n  get() {\n    return [1, 2].map(function () {\n      return this?.v ?? 0;\n    }).reduce((a, b) => a + b, 0);\n  },\n};\nconsole.log(obj.get());',
+    maxTokens: 4096,
+    check: { kind: 'exact', value: '0' },
+  },
+  {
+    id: 'high-investigation-deadlock-order',
+    tier: 'high',
+    taskType: 'investigation',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.',
+    maxTokens: 4096,
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'high-debug-float-equality',
+    tier: 'high',
+    taskType: 'debugging',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.',
+    maxTokens: 4096,
+    check: { kind: 'exact', value: 'false' },
+  },
+  {
+    id: 'high-investigation-txn-isolation',
+    tier: 'high',
+    taskType: 'investigation',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.',
+    maxTokens: 4096,
+    check: { kind: 'exact', value: '6' },
+  },
+  {
+    id: 'high-impl-generator-trace',
+    tier: 'high',
+    taskType: 'implementation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with the values joined by commas, e.g. "1,2,3".\n\nfunction* g() {\n  yield 1;\n  yield* [2, 3];\n  yield 4;\n}\nconsole.log([...g()].join(","));',
+    maxTokens: 4096,
+    check: { kind: 'regex', pattern: '^\\s*1\\s*,\\s*2\\s*,\\s*3\\s*,\\s*4\\s*$', flags: 'm' },
+  },
+  {
+    id: 'high-planning-cache-invalidation',
+    tier: 'high',
+    taskType: 'planning_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": <number>, "second": <number>}.',
+    maxTokens: 4096,
+    check: { kind: 'json_equal', value: { first: 1, second: 2 } },
+  },
+  {
+    id: 'high-refactoring-short-circuit',
+    tier: 'high',
+    taskType: 'refactoring',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n  calls++;\n  return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);',
+    maxTokens: 4096,
+    check: { kind: 'exact', value: '2' },
+  },
+];
diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts
index 7094da8d09..3ea7abad76 100644
--- a/services/auto-routing-benchmark/src/grading.test.ts
+++ b/services/auto-routing-benchmark/src/grading.test.ts
@@ -3,6 +3,8 @@ import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
 import {
   CLASSIFIER_FIELD_WEIGHTS,
   gradeClassifierOutput,
+  normalizeAnswer,
+  runDeciderCheck,
   type ClassifierExpectation,
 } from './grading';
 
@@ -56,3 +58,73 @@ describe('CLASSIFIER_FIELD_WEIGHTS', () => {
     expect(Object.values(CLASSIFIER_FIELD_WEIGHTS).reduce((a, b) => a + b, 0)).toBeCloseTo(1);
   });
 });
+
+describe('normalizeAnswer', () => {
+  it('strips fences, lowercases and trims', () => {
+    expect(normalizeAnswer('```js\n  Hello World  \n```')).toBe('hello world');
+  });
+});
+
+describe('runDeciderCheck: exact', () => {
+  it('passes with surrounding code fences and different case', () => {
+    expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '```\n20-40\n```')).toBe(true);
+    expect(runDeciderCheck({ kind: 'exact', value: 'Hello' }, 'HELLO')).toBe(true);
+  });
+
+  it('fails on a wrong answer', () => {
+    expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '20-30')).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: contains_all', () => {
+  it('passes regardless of order and case', () => {
+    expect(
+      runDeciderCheck({ kind: 'contains_all', values: ['Alpha', 'Beta'] }, 'beta then ALPHA')
+    ).toBe(true);
+  });
+
+  it('fails when one value is missing', () => {
+    expect(
+      runDeciderCheck({ kind: 'contains_all', values: ['alpha', 'beta'] }, 'only alpha here')
+    ).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: regex', () => {
+  it('passes a basic match with flags', () => {
+    expect(
+      runDeciderCheck({ kind: 'regex', pattern: '^answer: \\d+$', flags: 'im' }, 'ANSWER: 42')
+    ).toBe(true);
+  });
+
+  it('fails when the pattern does not match', () => {
+    expect(runDeciderCheck({ kind: 'regex', pattern: '^\\d+$' }, 'not a number')).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: json_equal', () => {
+  it('passes with a json fence plus prose before and after', () => {
+    const output = 'Here you go:\n```json\n{"a":1}\n```\nLet me know!';
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, output)).toBe(true);
+  });
+
+  it('passes with bare JSON', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { line: 6 } }, '{"line": 6}')).toBe(true);
+  });
+
+  it('fails on unparseable output', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, 'sorry, no idea')).toBe(false);
+  });
+
+  it('fails when values differ', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, '{"a": 2}')).toBe(false);
+  });
+
+  // Documents current behavior: comparison is JSON.stringify-based, so key
+  // ORDER is significant. Dataset authoring must mirror the prompted key order.
+  it('is sensitive to object key order (documented behavior)', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1, b: 2 } }, '{"b": 2, "a": 1}')).toBe(
+      false
+    );
+  });
+});
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
index 746e68a546..0c3291c1d7 100644
--- a/services/auto-routing-benchmark/src/grading.ts
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -29,3 +29,78 @@ export function gradeClassifierOutput(
   }
   return Number(score.toFixed(4));
 }
+
+export type DeciderCheck =
+  | { kind: 'exact'; value: string } // output equals value after normalizeAnswer on both
+  | { kind: 'contains_all'; values: readonly string[] } // each value is a normalized substring
+  | { kind: 'regex'; pattern: string; flags?: string } // RegExp tested against the raw output
+  | { kind: 'json_equal'; value: unknown }; // JSON.stringify equality, so key order matters
+
+// Mechanical pass/fail grading keeps the decider benchmark deterministic (no
+// LLM judges). Fences, outer whitespace and case are dropped. A language tag
+// is only stripped when it ends its fence line, so ```odd``` still yields odd.
+export function normalizeAnswer(text: string): string {
+  return text
+    .replace(/```(?:[a-z][\w+#-]*)?[ \t]*\r?\n/gi, '')
+    .replace(/```/g, '')
+    .trim()
+    .toLowerCase();
+}
+
+// Return the first balanced `{...}`/`[...]` span that parses as JSON, so prose
+// before or after the payload doesn't break parsing. A bracketed span that
+// is not valid JSON (e.g. "[note]" ahead of the answer) is skipped.
+function extractJson(text: string): unknown {
+  const stripped = text.replace(/```(?:json)?\n?/gi, '').replace(/```/g, '');
+  const opener = /[[{]/g;
+  for (let m = opener.exec(stripped); m !== null; m = opener.exec(stripped)) {
+    const end = matchingClose(stripped, m.index);
+    if (end === -1) continue;
+    try {
+      return JSON.parse(stripped.slice(m.index, end + 1));
+    } catch {
+      // Balanced but not JSON; fall through to the next opening bracket.
+    }
+  }
+  throw new Error('no JSON found');
+}
+
+// String-aware index of the bracket closing `s[start]`, or -1 when unbalanced.
+function matchingClose(s: string, start: number): number {
+  const open = s[start];
+  const close = open === '{' ? '}' : ']';
+  let depth = 0;
+  let inString = false;
+  for (let i = start; i < s.length; i++) {
+    const ch = s[i];
+    if (inString) {
+      if (ch === '\\') i++; // skip the escaped character
+      else if (ch === '"') inString = false;
+    } else if (ch === '"') {
+      inString = true;
+    } else if (ch === open) {
+      depth++;
+    } else if (ch === close && --depth === 0) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+// Grades one decider output against its check. Regex checks see the raw
+// output; exact/contains_all compare normalized text; bad JSON is a failure.
+export function runDeciderCheck(check: DeciderCheck, output: string): boolean {
+  if (check.kind === 'regex') {
+    return new RegExp(check.pattern, check.flags).test(output);
+  }
+  if (check.kind === 'json_equal') {
+    try {
+      return JSON.stringify(extractJson(output)) === JSON.stringify(check.value);
+    } catch {
+      return false;
+    }
+  }
+  const normalized = normalizeAnswer(output);
+  if (check.kind === 'exact') return normalized === normalizeAnswer(check.value);
+  return check.values.every(v => normalized.includes(normalizeAnswer(v)));
+}

From 110cbd9ea44fab7e8ee51cfd1a5d823d19155eac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:35:49 +0200
Subject: [PATCH 10/73] fix(auto-routing-benchmark): unambiguous whitespace
 instruction in off-by-one case

---
 .../src/datasets/decider-cases.ts                     | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index 561995d520..b14ec63d2c 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -11,8 +11,10 @@ export type DeciderCase = {
   check: DeciderCheck;
 };
 
-const CODE_SYS = 'You are a precise coding assistant. Answer with only what is asked, no explanations.';
-const SYS_SYS = 'You are a precise systems engineer. Answer with only what is asked, no explanations.';
+const CODE_SYS =
+  'You are a precise coding assistant. Answer with only what is asked, no explanations.';
+const SYS_SYS =
+  'You are a precise systems engineer. Answer with only what is asked, no explanations.';
 
 // Golden answers below were each worked through by hand. Every case has a
 // single unambiguous, mechanically-checkable answer. Checks tolerate
@@ -105,7 +107,8 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     id: 'low-planning-http-created',
     tier: 'low',
     taskType: 'planning_design',
-    systemPrompt: 'You are a precise web API expert. Answer with only what is asked, no explanations.',
+    systemPrompt:
+      'You are a precise web API expert. Answer with only what is asked, no explanations.',
     userPrompt:
       'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
     maxTokens: 512,
@@ -129,7 +132,7 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     taskType: 'debugging',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line, exactly, preserving original spacing>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
+      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
     maxTokens: 2048,
     check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
   },

From 5ce86212b37463d0a39cb64525000c31c74fa937 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:40:49 +0200
Subject: [PATCH 11/73] feat(auto-routing-benchmark): queue-driven benchmark
 runs with aggregation and table publish

---
 .../auto-routing-benchmark/src/config.test.ts |  41 +++
 services/auto-routing-benchmark/src/config.ts |  52 ++++
 services/auto-routing-benchmark/src/index.ts  |  20 +-
 .../src/routing-table-builder.test.ts         | 227 ++++++++++++++
 .../src/routing-table-builder.ts              |  52 ++++
 .../auto-routing-benchmark/src/run.test.ts    | 191 ++++++++++++
 services/auto-routing-benchmark/src/run.ts    | 283 ++++++++++++++++++
 7 files changed, 859 insertions(+), 7 deletions(-)
 create mode 100644 services/auto-routing-benchmark/src/config.test.ts
 create mode 100644 services/auto-routing-benchmark/src/config.ts
 create mode 100644 services/auto-routing-benchmark/src/routing-table-builder.test.ts
 create mode 100644 services/auto-routing-benchmark/src/routing-table-builder.ts
 create mode 100644 services/auto-routing-benchmark/src/run.test.ts
 create mode 100644 services/auto-routing-benchmark/src/run.ts

diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
new file mode 100644
index 0000000000..32c04e3a86
--- /dev/null
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -0,0 +1,41 @@
+import { describe, expect, it } from 'vitest';
+import { DEFAULT_BENCHMARK_CONFIG, parseConfigJson } from './config';
+
+describe('parseConfigJson', () => {
+  it('returns defaults on null', () => {
+    expect(parseConfigJson(null)).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  });
+
+  it('returns defaults on invalid JSON string', () => {
+    expect(parseConfigJson('not valid json {{{')).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  });
+
+  it('returns defaults on schema-invalid JSON', () => {
+    const invalid = JSON.stringify({ classifierModels: 'not-an-array', minAccuracy: 'bad' });
+    expect(parseConfigJson(invalid)).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  });
+
+  it('returns defaults on empty object', () => {
+    expect(parseConfigJson('{}')).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  });
+
+  it('round-trips a valid config', () => {
+    const config = {
+      ...DEFAULT_BENCHMARK_CONFIG,
+      classifierModels: ['some/model'],
+      minAccuracy: 0.8,
+      maxConcurrency: 2,
+      updatedAt: '2026-01-01T00:00:00.000Z',
+      updatedBy: 'admin@example.com',
+    };
+    expect(parseConfigJson(JSON.stringify(config))).toEqual(config);
+  });
+
+  it('returns defaults when classifierModels is empty array (schema violation)', () => {
+    const invalid = JSON.stringify({
+      ...DEFAULT_BENCHMARK_CONFIG,
+      classifierModels: [],
+    });
+    expect(parseConfigJson(invalid)).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
new file mode 100644
index 0000000000..e609090760
--- /dev/null
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -0,0 +1,52 @@
+import { BenchmarkConfigSchema, type BenchmarkConfig } from '@kilocode/auto-routing-contracts';
+import { getConfigRow, saveConfigRow } from './db';
+
+export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = { // returned by parseConfigJson when no valid stored config exists
+  classifierModels: [
+    'google/gemini-2.5-flash-lite',
+    'google/gemini-2.5-flash',
+    'openai/gpt-5-mini',
+    'qwen/qwen3.7-plus',
+  ],
+  deciderModels: [
+    { id: 'google/gemini-2.5-flash-lite', supportedApiKinds: ['chat_completions'] },
+    { id: 'google/gemini-2.5-flash', supportedApiKinds: ['chat_completions'] },
+    { id: 'qwen/qwen3.7-plus', supportedApiKinds: ['chat_completions'] },
+    { id: 'openai/gpt-5.5', supportedApiKinds: ['chat_completions', 'responses'] },
+    {
+      id: 'anthropic/claude-sonnet-4.6',
+      supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+    },
+  ],
+  minAccuracy: 0.7,
+  maxConcurrency: 4,
+  updatedAt: null, // saveBenchmarkConfig sets updatedAt/updatedBy on the copy it persists
+  updatedBy: null,
+};
+
+// Decodes a stored config blob; null, malformed JSON and schema failures all yield the defaults. Pure (no D1).
+export function parseConfigJson(raw: string | null): BenchmarkConfig {
+  try {
+    const outcome = raw === null ? null : BenchmarkConfigSchema.safeParse(JSON.parse(raw));
+    if (outcome !== null && outcome.success) return outcome.data;
+  } catch {
+    // Malformed JSON: fall through to the defaults below.
+  }
+  return DEFAULT_BENCHMARK_CONFIG;
+}
+
+export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig> {
+  const stored = await getConfigRow(db); // a missing row or missing config_json is passed on as null
+  return parseConfigJson(stored?.config_json ?? null);
+}
+
+export async function saveBenchmarkConfig(
+  db: D1Database,
+  config: BenchmarkConfig,
+  updatedBy: string | null
+): Promise<BenchmarkConfig> {
+  const stampedAt = new Date().toISOString(); // one timestamp, used inside the JSON and as a separate argument
+  const persisted: BenchmarkConfig = { ...config, updatedAt: stampedAt, updatedBy };
+  await saveConfigRow(db, JSON.stringify(persisted), stampedAt, updatedBy);
+  return persisted;
+}
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index 3c0a00cf8b..542e9e3ce6 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -2,6 +2,7 @@ import { Hono } from 'hono';
 import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils';
 import { authMiddleware } from './auth';
 import type { HonoEnv } from './hono-env';
+import { processJob, startRun, type BenchmarkJobMessage } from './run';
 
 export const app = new Hono<HonoEnv>();
 app.use('*', authMiddleware);
@@ -9,13 +10,18 @@ app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark'
 app.notFound(createNotFoundHandler());
 app.onError(createErrorHandler());
 
+const DECIDER_CRON = '10 5 * * 1';
+
 export default {
   fetch: app.fetch,
-  // Wired up in later tasks (run orchestration + admin endpoints).
-  async scheduled(
-    _controller: ScheduledController,
-    _env: Env,
-    _ctx: ExecutionContext
-  ): Promise<void> {},
-  async queue(_batch: MessageBatch<unknown>, _env: Env, _ctx: ExecutionContext): Promise<void> {},
+  async scheduled(controller: ScheduledController, env: Env, ctx: ExecutionContext): Promise<void> {
+    const kind = controller.cron === DECIDER_CRON ? 'decider' : 'classifier'; // any other cron string starts a classifier run
+    ctx.waitUntil(startRun(env, kind)); // the run start is handed to waitUntil rather than awaited here
+  },
+  async queue(batch: MessageBatch<BenchmarkJobMessage>, env: Env): Promise<void> {
+    for (const message of batch.messages) { // jobs are processed one at a time, in batch order
+      await processJob(env, message.body);
+      message.ack(); // acked only after processJob resolves; if it throws, this and later messages stay un-acked
+    }
+  },
 };
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
new file mode 100644
index 0000000000..892ef5c88d
--- /dev/null
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -0,0 +1,227 @@
+import { describe, expect, it } from 'vitest';
+import type { BenchmarkConfig, BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+import { buildRoutingTable } from './routing-table-builder';
+
+const BASE_CONFIG: BenchmarkConfig = {
+  classifierModels: ['some/classifier'],
+  deciderModels: [
+    { id: 'model/cheap', supportedApiKinds: ['chat_completions'] },
+    { id: 'model/expensive', supportedApiKinds: ['chat_completions', 'responses'] },
+    { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'] },
+  ],
+  minAccuracy: 0.7,
+  maxConcurrency: 4,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+// Fixture factory for one (model, tier) summary row.
+function summary(
+  model: string,
+  tier: BenchmarkModelSummary['tier'],
+  accuracy: number,
+  avgCostUsd: number | null = 0.001
+): BenchmarkModelSummary {
+  // Only the four arguments vary between fixtures; the remaining stats are
+  // constants that none of the caller-supplied values influence.
+  const fixedStats = {
+    avgLatencyMs: 500,
+    p50LatencyMs: 450,
+    cases: 10,
+    errors: 0,
+  };
+  return { model, tier, accuracy, avgCostUsd, ...fixedStats };
+}
+
+const ALL_TIERS_SUMMARIES: BenchmarkModelSummary[] = [
+  summary('model/cheap', 'low', 0.9, 0.001),
+  summary('model/expensive', 'low', 0.95, 0.01),
+  summary('model/mid', 'low', 0.8, 0.005),
+  summary('model/cheap', 'medium', 0.75, 0.001),
+  summary('model/expensive', 'medium', 0.85, 0.01),
+  summary('model/mid', 'medium', 0.72, 0.005),
+  summary('model/cheap', 'high', 0.6, 0.001),
+  summary('model/expensive', 'high', 0.9, 0.01),
+  summary('model/mid', 'high', 0.75, 0.005),
+];
+
+describe('buildRoutingTable', () => {
+  it('cheapest above-threshold model comes first per tier', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-1',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    // low tier: cheap (0.001) and mid (0.005) and expensive (0.01) all meet threshold (0.7)
+    // cheapest first
+    expect(table.tiers.low[0].model).toBe('model/cheap');
+    expect(table.tiers.low[1].model).toBe('model/mid');
+    expect(table.tiers.low[2].model).toBe('model/expensive');
+
+    // medium tier: all meet threshold, cheapest first
+    expect(table.tiers.medium[0].model).toBe('model/cheap');
+    expect(table.tiers.medium[1].model).toBe('model/mid');
+    expect(table.tiers.medium[2].model).toBe('model/expensive');
+
+    // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not
+    // meeting threshold first, then by cost; cheap last (below threshold)
+    expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper
+    expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive
+    expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold
+  });
+
+  it('marks meetsThreshold correctly', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-2',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    for (const candidate of table.tiers.low) {
+      expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7);
+    }
+  });
+
+  it('excludes a model absent from a tier summaries', () => {
+    // model/cheap has no 'high' summary entry
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/cheap', 'low', 0.9),
+      summary('model/cheap', 'medium', 0.8),
+      // no 'high' entry for model/cheap
+      summary('model/expensive', 'low', 0.9),
+      summary('model/expensive', 'medium', 0.8),
+      summary('model/expensive', 'high', 0.9),
+      summary('model/mid', 'low', 0.8),
+      summary('model/mid', 'medium', 0.75),
+      summary('model/mid', 'high', 0.75),
+    ];
+
+    const table = buildRoutingTable({
+      runId: 'test-run-3',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries,
+    });
+
+    const highModels = table.tiers.high.map(c => c.model);
+    expect(highModels).not.toContain('model/cheap');
+    expect(highModels).toContain('model/expensive');
+    expect(highModels).toContain('model/mid');
+  });
+
+  it('carries supportedApiKinds from config', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-4',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive');
+    expect(expensiveInLow?.supportedApiKinds).toEqual(['chat_completions', 'responses']);
+
+    const midInLow = table.tiers.low.find(c => c.model === 'model/mid');
+    expect(midInLow?.supportedApiKinds).toEqual(['chat_completions', 'messages']);
+  });
+
+  it('defaults supportedApiKinds to chat_completions when model missing from config', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/unknown', 'low', 0.9),
+      summary('model/cheap', 'low', 0.8),
+      summary('model/cheap', 'medium', 0.8),
+      summary('model/cheap', 'high', 0.8),
+      summary('model/unknown', 'medium', 0.9),
+      summary('model/unknown', 'high', 0.9),
+    ];
+
+    // model/unknown is present in the summaries but absent from BASE_CONFIG.deciderModels.
+    const config = { ...BASE_CONFIG };
+
+    const table = buildRoutingTable({
+      runId: 'test-run-5',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config,
+      summaries,
+    });
+
+    const unknown = table.tiers.low.find(c => c.model === 'model/unknown');
+    expect(unknown?.supportedApiKinds).toEqual(['chat_completions']);
+  });
+
+  it('throws when a tier has no candidates', () => {
+    // Only low and medium summaries — high is missing entirely
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/cheap', 'low', 0.9),
+      summary('model/expensive', 'low', 0.9),
+      summary('model/mid', 'low', 0.9),
+      summary('model/cheap', 'medium', 0.9),
+      summary('model/expensive', 'medium', 0.9),
+      summary('model/mid', 'medium', 0.9),
+    ];
+
+    expect(() =>
+      buildRoutingTable({
+        runId: 'test-run-6',
+        generatedAt: '2026-01-01T00:00:00.000Z',
+        config: BASE_CONFIG,
+        summaries,
+      })
+    ).toThrow();
+  });
+
+  it('throws when a tier has only zero-case entries', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'),
+      // high tier entries with 0 cases — should be excluded
+      { ...summary('model/cheap', 'high', 0.9), cases: 0 },
+      { ...summary('model/expensive', 'high', 0.9), cases: 0 },
+      { ...summary('model/mid', 'high', 0.9), cases: 0 },
+    ];
+
+    expect(() =>
+      buildRoutingTable({
+        runId: 'test-run-7',
+        generatedAt: '2026-01-01T00:00:00.000Z',
+        config: BASE_CONFIG,
+        summaries,
+      })
+    ).toThrow();
+  });
+
+  it('ignores classifier-style * tier summaries', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      ...ALL_TIERS_SUMMARIES,
+      // classifier summaries with '*' tier — should be ignored
+      summary('model/cheap', '*', 0.95),
+      summary('model/expensive', '*', 0.95),
+    ];
+
+    // Should not throw and * tier entries should not affect output
+    const table = buildRoutingTable({
+      runId: 'test-run-8',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries,
+    });
+
+    expect(table.tiers.low.length).toBe(3);
+    expect(table.tiers.medium.length).toBe(3);
+  });
+
+  it('sets version and generatedAt from params', () => {
+    const table = buildRoutingTable({
+      runId: 'decider-2026-01-01',
+      generatedAt: '2026-01-01T12:00:00.000Z',
+      config: BASE_CONFIG,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    expect(table.version).toBe('decider-2026-01-01');
+    expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z');
+    expect(table.source).toBe('benchmark');
+    expect(table.minAccuracy).toBe(0.7);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
new file mode 100644
index 0000000000..16bc21a9e8
--- /dev/null
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -0,0 +1,52 @@
+import {
+  rankCandidates,
+  RoutingTableSchema,
+  type BenchmarkConfig,
+  type BenchmarkModelSummary,
+  type DifficultyTier,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+// Assembles the published routing table from decider summaries (one summary
+// per model and tier). A summary only counts for its tier when it has at
+// least one graded case. The result is validated before being returned, so a
+// table the schema rejects (e.g. an empty tier) throws instead of publishing.
+export function buildRoutingTable(params: {
+  runId: string;
+  generatedAt: string;
+  config: BenchmarkConfig;
+  summaries: BenchmarkModelSummary[];
+}): RoutingTable {
+  const { runId, generatedAt, config, summaries } = params;
+  const { minAccuracy } = config;
+  const kindsById = new Map(config.deciderModels.map(entry => [entry.id, entry.supportedApiKinds] as const));
+  // Used for summaries whose model is not listed in config.deciderModels.
+  const fallbackKinds = ['chat_completions'] as const;
+
+  function rankTier(tier: DifficultyTier) {
+    const graded = summaries.filter(row => row.tier === tier && row.cases > 0);
+    return rankCandidates(
+      graded.map(row => ({
+        model: row.model,
+        accuracy: row.accuracy,
+        // NOTE(review): a null average cost becomes 0 before ranking — confirm that is intended.
+        avgCostUsd: row.avgCostUsd ?? 0,
+        // Copied into a fresh mutable array (per the original note, to satisfy the readonly typing).
+        supportedApiKinds: [...(kindsById.get(row.model) ?? fallbackKinds)],
+      })),
+      minAccuracy
+    );
+  }
+
+  const draft: RoutingTable = {
+    version: runId,
+    generatedAt,
+    minAccuracy,
+    source: 'benchmark',
+    tiers: { low: rankTier('low'), medium: rankTier('medium'), high: rankTier('high') },
+  };
+
+  // Schema validation is the final gate; the tests in this package expect a
+  // throw when a tier has no candidates.
+  return RoutingTableSchema.parse(draft);
+}
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
new file mode 100644
index 0000000000..3797f74adb
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -0,0 +1,191 @@
+import { describe, expect, it } from 'vitest';
+import type { CaseResultRow } from './db';
+import { runCasesWithConcurrency, summarize } from './run';
+
+function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
+  const baseline: CaseResultRow = {
+    run_id: 'run-1',
+    model: 'model/a',
+    case_id: 'case-1',
+    tier: null,
+    score: 1,
+    latency_ms: 100,
+    cost_usd: 0.001,
+    detail_json: null,
+    error: null,
+  };
+  return { ...baseline, ...overrides }; // any field passed in replaces the baseline value
+}
+
+describe('summarize — classifier kind', () => {
+  it('groups all classifier rows under * tier', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ model: 'model/a', case_id: 'c1', tier: null, score: 1, latency_ms: 100, cost_usd: 0.001 }),
+      makeRow({ model: 'model/a', case_id: 'c2', tier: null, score: 0.5, latency_ms: 200, cost_usd: 0.002 }),
+    ];
+
+    const summaries = summarize(rows, 'classifier');
+    expect(summaries).toHaveLength(1);
+    const [s] = summaries;
+    expect(s.model).toBe('model/a');
+    expect(s.tier).toBe('*');
+    expect(s.cases).toBe(2);
+  });
+
+  it('computes accuracy correctly', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ score: 1.0 }),
+      makeRow({ case_id: 'c2', score: 0.5 }),
+      makeRow({ case_id: 'c3', score: 0.0 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // (1.0 + 0.5 + 0.0) / 3 = 0.5
+    expect(s.accuracy).toBe(0.5);
+  });
+
+  it('computes avgCostUsd excluding null cost rows', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', cost_usd: 0.002 }),
+      makeRow({ case_id: 'c2', cost_usd: null }),
+      makeRow({ case_id: 'c3', cost_usd: 0.004 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // (0.002 + 0.004) / 2 = 0.003
+    expect(s.avgCostUsd).toBe(0.003);
+  });
+
+  it('returns null avgCostUsd when all cost_usd are null', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', cost_usd: null }),
+      makeRow({ case_id: 'c2', cost_usd: null }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    expect(s.avgCostUsd).toBeNull();
+  });
+
+  it('computes p50LatencyMs', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', latency_ms: 100 }),
+      makeRow({ case_id: 'c2', latency_ms: 300 }),
+      makeRow({ case_id: 'c3', latency_ms: 200 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // sorted: [100, 200, 300], floor(3/2) = 1 → 200
+    expect(s.p50LatencyMs).toBe(200);
+  });
+
+  it('counts errors correctly', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', score: 0, error: 'timeout' }),
+      makeRow({ case_id: 'c2', score: 1, error: null }),
+      makeRow({ case_id: 'c3', score: 0, error: 'rate_limit' }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    expect(s.errors).toBe(2);
+    // error rows have score 0 which drags accuracy down
+    expect(s.accuracy).toBe(Number((1 / 3).toFixed(4)));
+  });
+});
+
+describe('summarize — decider kind', () => {
+  it('groups by tier', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }),
+      makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }),
+      makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }),
+      makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }),
+    ];
+
+    const summaries = summarize(rows, 'decider');
+    expect(summaries).toHaveLength(3);
+
+    const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low');
+    expect(aLow?.cases).toBe(2);
+    expect(aLow?.accuracy).toBe(0.5);
+
+    const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium');
+    expect(aMed?.cases).toBe(1);
+    expect(aMed?.accuracy).toBe(1);
+
+    const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low');
+    expect(bLow?.cases).toBe(1);
+  });
+
+  it('uses * fallback when tier is null', () => {
+    const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })];
+    const [s] = summarize(rows, 'decider');
+    expect(s.tier).toBe('*');
+  });
+
+  it('computes avgLatencyMs as rounded mean', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }),
+      makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }),
+    ];
+
+    const [s] = summarize(rows, 'decider');
+    expect(s.avgLatencyMs).toBe(Math.round((100 + 301) / 2));
+  });
+
+  it('handles single-element groups for p50', () => {
+    const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })];
+    const [s] = summarize(rows, 'decider');
+    expect(s.p50LatencyMs).toBe(500);
+  });
+});
+
+describe('runCasesWithConcurrency', () => {
+  it('processes all items exactly once', async () => {
+    const processed: number[] = [];
+    await runCasesWithConcurrency([1, 2, 3, 4, 5], 2, async item => {
+      processed.push(item);
+    });
+    expect(processed.sort((a, b) => a - b)).toEqual([1, 2, 3, 4, 5]);
+  });
+
+  it('processes empty array without error', async () => {
+    await expect(runCasesWithConcurrency([], 4, async () => {})).resolves.toBeUndefined();
+  });
+
+  it('respects the concurrency cap', async () => {
+    let inFlight = 0;
+    let maxInFlight = 0;
+    const concurrency = 3;
+
+    await runCasesWithConcurrency(
+      Array.from({ length: 10 }, (_, i) => i),
+      concurrency,
+      async () => {
+        inFlight++;
+        maxInFlight = Math.max(maxInFlight, inFlight);
+        // Yield to allow other workers to start
+        await new Promise(resolve => setTimeout(resolve, 0));
+        inFlight--;
+      }
+    );
+
+    expect(maxInFlight).toBeLessThanOrEqual(concurrency);
+    expect(maxInFlight).toBeGreaterThan(0);
+  });
+
+  it('works when concurrency exceeds item count', async () => {
+    const processed: number[] = [];
+    await runCasesWithConcurrency([1, 2], 10, async item => {
+      processed.push(item);
+    });
+    expect(processed.sort((a, b) => a - b)).toEqual([1, 2]);
+  });
+
+  it('propagates errors from the callback', async () => {
+    await expect(
+      runCasesWithConcurrency([1], 1, async () => {
+        throw new Error('test error');
+      })
+    ).rejects.toThrow('test error');
+  });
+});
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
new file mode 100644
index 0000000000..519944d3a9
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -0,0 +1,283 @@
+import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
+import {
+  BenchmarkConfigSchema,
+  ROUTING_TABLE_KV_KEY,
+  type BenchmarkConfig,
+  type BenchmarkKind,
+  type BenchmarkModelSummary,
+} from '@kilocode/auto-routing-contracts';
+import { formatError } from '@kilocode/worker-utils';
+import * as z from 'zod';
+import { getBenchmarkConfig } from './config';
+import { CLASSIFIER_CASES } from './datasets/classifier-cases';
+import { DECIDER_CASES } from './datasets/decider-cases';
+import {
+  countCaseResults,
+  getCaseResults,
+  getRun,
+  insertRun,
+  markRunCompleted,
+  markStaleRunsFailed,
+  replaceModelSummaries,
+  saveRoutingTable,
+  upsertCaseResult,
+  type CaseResultRow,
+} from './db';
+import { gradeClassifierOutput, runDeciderCheck } from './grading';
+import { createOpenRouterClient } from './openrouter';
+import { buildRoutingTable } from './routing-table-builder';
+
+export type BenchmarkJobMessage = { runId: string; kind: BenchmarkKind; model: string };
+
+export const BenchmarkJobMessageSchema = z.object({ // runtime shape check that processJob applies to raw queue bodies
+  runId: z.string().min(1), // must be non-empty
+  kind: z.enum(['classifier', 'decider']),
+  model: z.string().min(1), // must be non-empty
+});
+
+const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
+
+export async function startRun(
+  env: Env,
+  kind: BenchmarkKind
+): Promise<{ runId: string; enqueuedModels: number }> {
+  // Sweep first: runs still 'running' past the stale cutoff are marked failed
+  // (the original note: their queue retries are exhausted) before a new run starts.
+  const staleCutoff = new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString();
+  await markStaleRunsFailed(env.BENCH_DB, staleCutoff);
+
+  const config = await getBenchmarkConfig(env.BENCH_DB);
+  const modelIds =
+    kind === 'classifier' ? config.classifierModels : config.deciderModels.map(entry => entry.id);
+  // Run id: the kind plus an ISO timestamp with ':' and '.' replaced by '-'.
+  const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
+  const startedAt = new Date().toISOString();
+  // The config is stored with the run as JSON — presumably the copy getRunConfig reads back per job.
+  await insertRun(env.BENCH_DB, { id: runId, kind, startedAt, configJson: JSON.stringify(config) });
+  // One queue message per model.
+  const jobs = modelIds.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }));
+  await env.BENCH_QUEUE.sendBatch(jobs);
+  console.log(JSON.stringify({ event: 'benchmark_run_started', runId, kind, models: modelIds }));
+  return { runId, enqueuedModels: modelIds.length };
+}
+
+export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
+  // Runs every benchmark case for one (run, model) job and upserts one result row per case.
+  // Bodies that fail schema validation are logged and dropped (plain return) instead of throwing.
+  const parsed = BenchmarkJobMessageSchema.safeParse(rawMessage);
+  if (!parsed.success) {
+    console.warn(
+      JSON.stringify({
+        event: 'benchmark_job_invalid_message',
+        error: parsed.error.message,
+        raw: String(JSON.stringify(rawMessage)).slice(0, 200), // stringify returns undefined for e.g. an undefined body
+      })
+    );
+    return;
+  }
+
+  const message = parsed.data;
+  const config = await getRunConfig(env, message.runId);
+  // Create the OpenRouter client inside processJob — no module-scope transport clients.
+  const client = await createOpenRouterClient(env);
+
+  if (message.kind === 'classifier') {
+    await runCasesWithConcurrency(CLASSIFIER_CASES, config.maxConcurrency, async benchCase => {
+      const startedAt = performance.now();
+      try {
+        const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
+        const score = result.fallback ? 0 : gradeClassifierOutput(benchCase.expected, result.classification); // a fallback result scores 0
+        await upsertCaseResult(env.BENCH_DB, {
+          run_id: message.runId,
+          model: message.model,
+          case_id: benchCase.id,
+          tier: null, // classifier rows carry no tier
+          score,
+          latency_ms: Math.round(performance.now() - startedAt),
+          cost_usd: result.cost,
+          detail_json: JSON.stringify({
+            classification: result.fallback ? null : result.classification,
+            fallback: result.fallback?.reason ?? null,
+            retried: result.retried ?? false,
+          }),
+          error: null,
+        });
+      } catch (error) {
+        await upsertCaseResult(env.BENCH_DB, failedRow(message, benchCase.id, null, startedAt, error)); // a thrown case is still recorded, as a failed row
+      }
+    });
+  } else {
+    // Determinism note: temperature 0, fixed maxTokens, pinned prompts, mechanical checks.
+    // Provider-side nondeterminism can't be fully eliminated, which is why grading is
+    // binary on a single canonical answer.
+    await runCasesWithConcurrency(DECIDER_CASES, config.maxConcurrency, async benchCase => {
+      const startedAt = performance.now();
+      try {
+        const result = await client.chat.send({
+          chatRequest: {
+            model: message.model,
+            messages: [
+              { role: 'system', content: benchCase.systemPrompt },
+              { role: 'user', content: benchCase.userPrompt },
+            ],
+            stream: false,
+            temperature: 0,
+            maxTokens: benchCase.maxTokens,
+          },
+        });
+        const content: unknown = result.choices[0]?.message.content;
+        const text = typeof content === 'string' ? content : ''; // non-string content is treated as empty
+        const passed = text.length > 0 && runDeciderCheck(benchCase.check, text); // empty output never passes
+        await upsertCaseResult(env.BENCH_DB, {
+          run_id: message.runId,
+          model: message.model,
+          case_id: benchCase.id,
+          tier: benchCase.tier,
+          score: passed ? 1 : 0,
+          latency_ms: Math.round(performance.now() - startedAt),
+          cost_usd: result.usage?.cost ?? null,
+          detail_json: JSON.stringify({
+            finishReason: result.choices[0]?.finishReason ?? null,
+            outputPrefix: text.slice(0, 200),
+          }),
+          error: null,
+        });
+      } catch (error) {
+        await upsertCaseResult(
+          env.BENCH_DB,
+          failedRow(message, benchCase.id, benchCase.tier, startedAt, error)
+        );
+      }
+    });
+  }
+
+  await finalizeRunIfComplete(env, message.runId, message.kind); // returns early unless all of the run's results are in
+}
+
+// Builds the result row stored when a case attempt throws.
+function failedRow(
+  message: BenchmarkJobMessage,
+  caseId: string,
+  tier: string | null,
+  startedAt: number,
+  error: unknown
+): CaseResultRow {
+  // Which case of which run/model this row belongs to.
+  const identity = { run_id: message.runId, model: message.model, case_id: caseId, tier };
+
+  // A failure scores 0 and has no cost; the time spent up to the failure is
+  // still recorded as the latency.
+  const outcome = { score: 0, latency_ms: Math.round(performance.now() - startedAt), cost_usd: null };
+
+  // The formatted error is serialized and capped at 500 characters.
+  const errorText = JSON.stringify(formatError(error)).slice(0, 500);
+  return { ...identity, ...outcome, detail_json: null, error: errorText };
+}
+
+async function getRunConfig(env: Env, runId: string): Promise<BenchmarkConfig> {
+  // Uses the config JSON stored on the run row rather than the live config; it is re-validated on every read.
+  const storedRun = await getRun(env.BENCH_DB, runId);
+  if (!storedRun) throw new Error(`unknown run ${runId}`);
+  return BenchmarkConfigSchema.parse(JSON.parse(storedRun.config_json));
+}
+
+// Runs `fn` over `cases` with up to `concurrency` calls in flight; a limit below 1 (or NaN) is treated as 1.
+export async function runCasesWithConcurrency<T>(
+  cases: readonly T[],
+  concurrency: number,
+  fn: (item: T) => Promise<void>
+): Promise<void> {
+  const limit = Math.max(1, Math.floor(concurrency) || 1);
+  const pending = [...cases].values(); // one shared iterator over a snapshot: `undefined` items no longer end a worker early
+  const workers = Array.from({ length: Math.min(limit, cases.length) }, async () => {
+    for (const item of pending) await fn(item);
+  });
+  await Promise.all(workers); // rejects with the first error thrown by `fn`
+}
+
+// Called at the end of every job. Once a result row exists for each
+// (model, case) pair of the run's config snapshot, aggregates the rows, marks
+// the run completed and, for decider runs, tries to publish a routing table.
+async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKind): Promise<void> {
+  const config = await getRunConfig(env, runId);
+  const isClassifier = kind === 'classifier';
+  const modelCount = isClassifier
+    ? config.classifierModels.length
+    : config.deciderModels.length;
+  const casesPerModel = isClassifier ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
+  const recorded = await countCaseResults(env.BENCH_DB, runId);
+
+  // Not every (model, case) result is in yet: leave the run as it is.
+  if (recorded < modelCount * casesPerModel) return;
+
+  // More than one consumer can get here for the same run. The original design
+  // note calls that harmless: both aggregate the same rows, replaceModelSummaries
+  // is a batched delete+insert, markRunCompleted guards on status='running',
+  // and the KV put is idempotent.
+  const caseRows = await getCaseResults(env.BENCH_DB, runId);
+  const summaries = summarize(caseRows, kind);
+  await replaceModelSummaries(env.BENCH_DB, runId, summaries);
+  await markRunCompleted(env.BENCH_DB, runId);
+
+  if (kind === 'decider') {
+    // Best-effort publish: any failure below (table build, D1 write, KV put) is
+    // logged as a skip and swallowed.
+    const generatedAt = new Date().toISOString();
+    try {
+      const table = buildRoutingTable({ runId, generatedAt, config, summaries });
+      const serialized = JSON.stringify(table);
+      await saveRoutingTable(env.BENCH_DB, runId, generatedAt, serialized);
+      await env.AUTO_ROUTING_CONFIG.put(ROUTING_TABLE_KV_KEY, serialized);
+      const published = { event: 'routing_table_published', runId, version: table.version };
+      console.log(JSON.stringify(published));
+    } catch (error) {
+      const skipped = {
+        event: 'routing_table_publish_skipped',
+        runId,
+        ...formatError(error),
+      };
+      console.warn(JSON.stringify(skipped));
+    }
+  }
+
+  // Logged whether or not a routing table was published.
+  const completed = { event: 'benchmark_run_completed', runId, kind, summaries };
+  console.log(JSON.stringify(completed));
+}
+
+export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
+  // Bucket rows per (model, tier). Classifier runs always use the '*' tier;
+  // decider runs use each row's tier, falling back to '*' when it is null.
+  const buckets = new Map<string, CaseResultRow[]>();
+  for (const row of rows) {
+    const tierKey = kind === 'classifier' ? '*' : (row.tier ?? '*');
+    const bucketKey = `${row.model}\0${tierKey}`;
+    const bucket = buckets.get(bucketKey);
+    if (bucket === undefined) buckets.set(bucketKey, [row]);
+    else bucket.push(row);
+  }
+
+  // Left-to-right sum from 0 divided by the element count.
+  const mean = (values: number[]) => values.reduce((sum, value) => sum + value, 0) / values.length;
+
+  const summaries: BenchmarkModelSummary[] = [];
+  for (const [bucketKey, bucket] of buckets) {
+    const [model, tier] = bucketKey.split('\0');
+    const latencies = bucket.map(row => row.latency_ms);
+    const sortedLatencies = [...latencies].sort((a, b) => a - b);
+    // Rows without a cost are left out of the cost average entirely.
+    const knownCosts = bucket.filter(row => row.cost_usd !== null).map(row => row.cost_usd ?? 0);
+    summaries.push({
+      model,
+      tier: tier as BenchmarkModelSummary['tier'],
+      accuracy: Number(mean(bucket.map(row => row.score)).toFixed(4)),
+      avgCostUsd: knownCosts.length > 0 ? Number(mean(knownCosts).toFixed(8)) : null,
+      avgLatencyMs: Math.round(mean(latencies)),
+      // Element at floor(n / 2) of the sorted latencies (the upper middle for even n).
+      p50LatencyMs: sortedLatencies[Math.floor(sortedLatencies.length / 2)] ?? null,
+      cases: bucket.length,
+      errors: bucket.filter(row => row.error !== null).length,
+    });
+  }
+  return summaries;
+}

From 0c763cec3054aba6501f75211ffeafc0a7aabd17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:46:12 +0200
Subject: [PATCH 12/73] feat(auto-routing-benchmark): admin config, runs and
 routing-table endpoints

---
 .../auto-routing-benchmark/src/admin.test.ts  | 284 ++++++++++++++++++
 services/auto-routing-benchmark/src/admin.ts  |  56 ++++
 services/auto-routing-benchmark/src/index.ts  |  14 +
 .../src/routing-table-builder.ts              |   4 +-
 .../auto-routing-benchmark/src/run.test.ts    |  18 +-
 services/auto-routing-benchmark/src/run.ts    |  21 +-
 6 files changed, 387 insertions(+), 10 deletions(-)
 create mode 100644 services/auto-routing-benchmark/src/admin.test.ts
 create mode 100644 services/auto-routing-benchmark/src/admin.ts

diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
new file mode 100644
index 0000000000..6a74bdc7e3
--- /dev/null
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -0,0 +1,284 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+import { DEFAULT_BENCHMARK_CONFIG } from './config';
+import { app } from './index';
+
+// ---------------------------------------------------------------------------
+// Env / binding stubs
+// ---------------------------------------------------------------------------
+
+const tokenGet = vi.fn<() => Promise<string>>();
+const dbFirst = vi.fn();
+const dbAll = vi.fn();
+const dbRun = vi.fn();
+const dbBind = vi.fn();
+const dbPrepare = vi.fn();
+const queueSendBatch = vi.fn();
+
+// Minimal chainable D1 stub.
+// prepare() → { bind() → { first(), all(), run() } }
+function makeD1Stub() {
+  const stmt = {
+    bind: (..._args: unknown[]) => {
+      dbBind(..._args);
+      return stmt;
+    },
+    first: dbFirst,
+    all: dbAll,
+    run: dbRun,
+  };
+  dbPrepare.mockReturnValue(stmt);
+  return {
+    prepare: (sql: string) => {
+      dbPrepare(sql);
+      return stmt;
+    },
+    batch: vi.fn().mockResolvedValue([]),
+  } as unknown as D1Database;
+}
+
+const env = {
+  INTERNAL_API_SECRET_PROD: { get: tokenGet },
+  BENCH_DB: null as unknown as D1Database,
+  BENCH_QUEUE: { sendBatch: queueSendBatch },
+  AUTO_ROUTING_CONFIG: { put: vi.fn(), get: vi.fn() },
+} as unknown as Env;
+
+const executionCtx = {
+  waitUntil: () => {},
+  passThroughOnException: () => {},
+} as unknown as ExecutionContext;
+
+function request(path: string, init: RequestInit = {}) {
+  return app.request(`https://bench.example.com${path}`, init, env, executionCtx);
+}
+
+function authedGet(path: string) {
+  return request(path, { headers: { authorization: 'Bearer bench-token' } });
+}
+
+function authedPost(path: string, body: unknown) {
+  return request(path, {
+    method: 'POST',
+    headers: { authorization: 'Bearer bench-token', 'content-type': 'application/json' },
+    body: JSON.stringify(body),
+  });
+}
+
+function authedPut(path: string, body: unknown, extraHeaders: Record<string, string> = {}) {
+  return request(path, {
+    method: 'PUT',
+    headers: {
+      authorization: 'Bearer bench-token',
+      'content-type': 'application/json',
+      ...extraHeaders,
+    },
+    body: JSON.stringify(body),
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Setup
+// ---------------------------------------------------------------------------
+
+beforeEach(() => {
+  vi.clearAllMocks(); // the mocks are shared module-level fns: drop call history between tests
+  tokenGet.mockResolvedValue('bench-token');
+  dbFirst.mockResolvedValue(null);
+  dbAll.mockResolvedValue({ results: [] });
+  dbRun.mockResolvedValue({ meta: { changes: 0 } });
+  queueSendBatch.mockResolvedValue(undefined);
+  // Rebuild the D1 stub each test so env.BENCH_DB is a new object wired to the shared mocks.
+  (env as unknown as Record<string, unknown>).BENCH_DB = makeD1Stub();
+});
+
+// ---------------------------------------------------------------------------
+// Auth guard
+// ---------------------------------------------------------------------------
+
+describe('auth middleware', () => {
+  it('rejects requests without a bearer token', async () => {
+    const res = await request('/admin/config');
+    expect(res.status).toBe(401);
+    await expect(res.json()).resolves.toEqual({ error: 'Unauthorized' });
+  });
+
+  it('rejects requests with the wrong bearer token', async () => {
+    const res = await request('/admin/config', {
+      headers: { authorization: 'Bearer wrong-token' },
+    });
+    expect(res.status).toBe(401);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/config
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/config', () => {
+  it('returns defaults when the DB row is absent', async () => {
+    // dbFirst already returns null by default
+    const res = await authedGet('/admin/config');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({
+      config: DEFAULT_BENCHMARK_CONFIG,
+      defaults: DEFAULT_BENCHMARK_CONFIG,
+    });
+  });
+
+  it('returns the stored config when a DB row exists', async () => {
+    const storedConfig = {
+      ...DEFAULT_BENCHMARK_CONFIG,
+      minAccuracy: 0.9,
+      updatedAt: '2026-06-01T00:00:00.000Z',
+      updatedBy: 'admin@example.com',
+    };
+    dbFirst.mockResolvedValueOnce({ config_json: JSON.stringify(storedConfig) });
+
+    const res = await authedGet('/admin/config');
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { config: typeof storedConfig };
+    expect(body.config.minAccuracy).toBe(0.9);
+    expect(body.config.updatedBy).toBe('admin@example.com');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// PUT /admin/config
+// ---------------------------------------------------------------------------
+
+describe('PUT /admin/config', () => {
+  it('returns 400 for a non-JSON body', async () => {
+    const res = await request('/admin/config', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer bench-token',
+        'content-type': 'application/json',
+      },
+      body: 'not json {{{',
+    });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' });
+  });
+
+  it('returns 400 for a schema-invalid config', async () => {
+    const res = await authedPut('/admin/config', { classifierModels: 'oops' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toEqual({ error: 'Invalid benchmark config' });
+    expect(dbRun).not.toHaveBeenCalled();
+  });
+
+  it('persists a valid config and returns it with defaults', async () => {
+    const validConfig = {
+      ...DEFAULT_BENCHMARK_CONFIG,
+      minAccuracy: 0.85,
+      updatedAt: null,
+      updatedBy: null,
+    };
+
+    const res = await authedPut('/admin/config', validConfig, {
+      'x-updated-by': 'igor@kilocode.ai',
+    });
+
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as {
+      config: { minAccuracy: number; updatedBy: string | null; updatedAt: string | null };
+      defaults: typeof DEFAULT_BENCHMARK_CONFIG;
+    };
+    // Returned config carries the stamped fields.
+    expect(body.config.minAccuracy).toBe(0.85);
+    expect(body.config.updatedBy).toBe('igor@kilocode.ai');
+    expect(typeof body.config.updatedAt).toBe('string');
+    expect(body.defaults).toEqual(DEFAULT_BENCHMARK_CONFIG);
+
+    // The INSERT was actually executed (dbRun was called on the saveConfigRow stmt).
+    expect(dbRun).toHaveBeenCalled();
+    // The SQL should be an INSERT OR REPLACE into benchmark_config.
+    const insertCall = dbPrepare.mock.calls.find(
+      (args: unknown[]) => typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config')
+    );
+    expect(insertCall).toBeDefined();
+    // The updatedBy value was forwarded via bind.
+    const bindCalls: unknown[][] = dbBind.mock.calls;
+    const foundUpdatedBy = bindCalls.some(args => args.includes('igor@kilocode.ai'));
+    expect(foundUpdatedBy).toBe(true);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/runs
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/runs', () => {
+  it('returns an empty runs array when the table is empty', async () => {
+    // dbAll returns { results: [] } by default
+    const res = await authedGet('/admin/runs');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ runs: [] });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// POST /admin/runs
+// ---------------------------------------------------------------------------
+
+describe('POST /admin/runs', () => {
+  it('returns 400 for a non-JSON body', async () => {
+    const res = await request('/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer bench-token',
+        'content-type': 'application/json',
+      },
+      body: '<<<',
+    });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' });
+  });
+
+  it('returns 400 for an invalid kind', async () => {
+    const res = await authedPost('/admin/runs', { kind: 'turbo' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toEqual({ error: 'Invalid run request' });
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
+  it('starts a classifier run and returns runId + enqueuedModels', async () => {
+    // markStaleRunsFailed → run (UPDATE), getBenchmarkConfig → first (null → defaults),
+    // insertRun → run, then sendBatch.
+    const res = await authedPost('/admin/runs', { kind: 'classifier' });
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { runId: string; enqueuedModels: number };
+    expect(body.runId).toMatch(/^classifier-/);
+    expect(body.enqueuedModels).toBe(DEFAULT_BENCHMARK_CONFIG.classifierModels.length);
+    expect(queueSendBatch).toHaveBeenCalledOnce();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/routing-table
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/routing-table', () => {
+  it('returns {table: null, publishedAt: null} when no rows exist', async () => {
+    // dbFirst already returns null by default
+    const res = await authedGet('/admin/routing-table');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ table: null, publishedAt: null });
+  });
+
+  it('returns the parsed table and publishedAt when a row exists', async () => {
+    const tableData = { version: 'test-v1', tiers: {} };
+    dbFirst.mockResolvedValueOnce({
+      run_id: 'run-123',
+      published_at: '2026-06-01T10:00:00.000Z',
+      table_json: JSON.stringify(tableData),
+    });
+
+    const res = await authedGet('/admin/routing-table');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({
+      table: tableData,
+      publishedAt: '2026-06-01T10:00:00.000Z',
+    });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
new file mode 100644
index 0000000000..5bb6649a69
--- /dev/null
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -0,0 +1,56 @@
+import {
+  BenchmarkConfigSchema,
+  StartBenchmarkRunRequestSchema,
+  type BenchmarkRun,
+} from '@kilocode/auto-routing-contracts';
+import type { Handler } from 'hono';
+import { DEFAULT_BENCHMARK_CONFIG, getBenchmarkConfig, saveBenchmarkConfig } from './config';
+import { getLatestRoutingTable, listRuns } from './db';
+import { startRun } from './run';
+import type { HonoEnv } from './hono-env';
+
+export const getConfigHandler: Handler<HonoEnv> = async c =>
+  c.json({
+    config: await getBenchmarkConfig(c.env.BENCH_DB),
+    defaults: DEFAULT_BENCHMARK_CONFIG,
+  });
+
+export const putConfigHandler: Handler<HonoEnv> = async c => {
+  let body: unknown;
+  try {
+    body = await c.req.json();
+  } catch {
+    return c.json({ error: 'Invalid JSON body' }, 400);
+  }
+  const parsed = BenchmarkConfigSchema.safeParse(body);
+  if (!parsed.success) return c.json({ error: 'Invalid benchmark config' }, 400);
+  const updatedBy = c.req.header('x-updated-by')?.trim() || null;
+  const saved = await saveBenchmarkConfig(c.env.BENCH_DB, parsed.data, updatedBy);
+  return c.json({ config: saved, defaults: DEFAULT_BENCHMARK_CONFIG });
+};
+
+export const listRunsHandler: Handler<HonoEnv> = async c => {
+  const limit = Math.min(Math.max(Math.trunc(Number(c.req.query('limit') ?? 20)) || 20, 1), 100);
+  const runs: BenchmarkRun[] = await listRuns(c.env.BENCH_DB, limit);
+  return c.json({ runs });
+};
+
+export const startRunHandler: Handler<HonoEnv> = async c => {
+  let body: unknown;
+  try {
+    body = await c.req.json();
+  } catch {
+    return c.json({ error: 'Invalid JSON body' }, 400);
+  }
+  const parsed = StartBenchmarkRunRequestSchema.safeParse(body);
+  if (!parsed.success) return c.json({ error: 'Invalid run request' }, 400);
+  return c.json(await startRun(c.env, parsed.data.kind));
+};
+
+export const getRoutingTableHandler: Handler<HonoEnv> = async c => {
+  const latest = await getLatestRoutingTable(c.env.BENCH_DB);
+  return c.json({
+    table: latest ? (JSON.parse(latest.table_json) as unknown) : null,
+    publishedAt: latest?.published_at ?? null,
+  });
+};
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index 542e9e3ce6..e78437b9dd 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -1,12 +1,26 @@
 import { Hono } from 'hono';
 import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils';
 import { authMiddleware } from './auth';
+import {
+  getConfigHandler,
+  putConfigHandler,
+  listRunsHandler,
+  startRunHandler,
+  getRoutingTableHandler,
+} from './admin';
 import type { HonoEnv } from './hono-env';
 import { processJob, startRun, type BenchmarkJobMessage } from './run';
 
 export const app = new Hono<HonoEnv>();
 app.use('*', authMiddleware);
 app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' }));
+
+app.get('/admin/config', getConfigHandler);
+app.put('/admin/config', putConfigHandler);
+app.get('/admin/runs', listRunsHandler);
+app.post('/admin/runs', startRunHandler);
+app.get('/admin/routing-table', getRoutingTableHandler);
+
 app.notFound(createNotFoundHandler());
 app.onError(createErrorHandler());
 
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 16bc21a9e8..71bfa772d3 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -17,7 +17,9 @@ export function buildRoutingTable(params: {
   summaries: BenchmarkModelSummary[];
 }): RoutingTable {
   const { runId, generatedAt, config, summaries } = params;
-  const apiKindsByModel = new Map(config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const));
+  const apiKindsByModel = new Map(
+    config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const)
+  );
 
   const tierCandidates = (t: DifficultyTier) =>
     rankCandidates(
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 3797f74adb..1c9826c640 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -20,8 +20,22 @@ function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
 describe('summarize — classifier kind', () => {
   it('groups all classifier rows under * tier', () => {
     const rows: CaseResultRow[] = [
-      makeRow({ model: 'model/a', case_id: 'c1', tier: null, score: 1, latency_ms: 100, cost_usd: 0.001 }),
-      makeRow({ model: 'model/a', case_id: 'c2', tier: null, score: 0.5, latency_ms: 200, cost_usd: 0.002 }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'c1',
+        tier: null,
+        score: 1,
+        latency_ms: 100,
+        cost_usd: 0.001,
+      }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'c2',
+        tier: null,
+        score: 0.5,
+        latency_ms: 200,
+        cost_usd: 0.002,
+      }),
     ];
 
     const summaries = summarize(rows, 'classifier');
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 519944d3a9..4efd91e93f 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -43,10 +43,14 @@ export async function startRun(
 ): Promise<{ runId: string; enqueuedModels: number }> {
   // Stale-run sweeper: anything still 'running' after 6h is dead (queue
   // retries exhausted); fail it so the admin panel shows the truth.
-  await markStaleRunsFailed(env.BENCH_DB, new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString());
+  await markStaleRunsFailed(
+    env.BENCH_DB,
+    new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString()
+  );
 
   const config = await getBenchmarkConfig(env.BENCH_DB);
-  const models = kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
+  const models =
+    kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
   const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
   await insertRun(env.BENCH_DB, {
     id: runId,
@@ -86,7 +90,9 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
       const startedAt = performance.now();
       try {
         const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
-        const score = result.fallback ? 0 : gradeClassifierOutput(benchCase.expected, result.classification);
+        const score = result.fallback
+          ? 0
+          : gradeClassifierOutput(benchCase.expected, result.classification);
         await upsertCaseResult(env.BENCH_DB, {
           run_id: message.runId,
           model: message.model,
@@ -103,7 +109,10 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
           error: null,
         });
       } catch (error) {
-        await upsertCaseResult(env.BENCH_DB, failedRow(message, benchCase.id, null, startedAt, error));
+        await upsertCaseResult(
+          env.BENCH_DB,
+          failedRow(message, benchCase.id, null, startedAt, error)
+        );
       }
     });
   } else {
@@ -270,9 +279,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
       tier: tier as BenchmarkModelSummary['tier'],
       accuracy: Number((group.reduce((a, r) => a + r.score, 0) / group.length).toFixed(4)),
       avgCostUsd: costs.length
-        ? Number(
-            (costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8)
-          )
+        ? Number((costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8))
         : null,
       avgLatencyMs: Math.round(group.reduce((a, r) => a + r.latency_ms, 0) / group.length),
       p50LatencyMs: latencies[Math.floor(latencies.length / 2)] ?? null,

From c749be26d4257cc9e7639b931961ff571e779907 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 22:53:14 +0200
Subject: [PATCH 13/73] feat(admin): proxy routes for auto-routing benchmark
 service

---
 .../auto-routing/benchmark-config/route.ts    |  37 ++++
 .../benchmark-routing-table/route.ts          |  11 +
 .../api/auto-routing/benchmark-runs/route.ts  |  36 +++
 ...uto-routing-benchmark-admin-client.test.ts | 207 ++++++++++++++++++
 .../auto-routing-benchmark-admin-client.ts    | 108 +++++++++
 apps/web/src/lib/config.server.ts             |   5 +
 .../auto-routing-benchmark/src/admin.test.ts  |   3 +-
 7 files changed, 406 insertions(+), 1 deletion(-)
 create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
 create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
 create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
 create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
 create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
new file mode 100644
index 0000000000..d81cc4f69c
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
@@ -0,0 +1,37 @@
+import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+export async function GET() {
+  const { authFailedResponse } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  const result = await getBenchmarkConfig();
+  return NextResponse.json(result.body, { status: result.status });
+}
+
+export async function PUT(request: NextRequest) {
+  const { authFailedResponse, user } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  let rawBody: unknown;
+  try {
+    rawBody = await request.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  const parsed = BenchmarkConfigSchema.safeParse(rawBody);
+  if (!parsed.success) {
+    return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
+  }
+
+  const email = user?.google_user_email ?? '';
+  const result = await updateBenchmarkConfig(parsed.data, email);
+  return NextResponse.json(result.body, { status: result.status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
new file mode 100644
index 0000000000..26fdc8eef1
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
@@ -0,0 +1,11 @@
+import { NextResponse } from 'next/server';
+import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+export async function GET() {
+  const { authFailedResponse } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  const result = await getBenchmarkRoutingTable();
+  return NextResponse.json(result.body, { status: result.status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
new file mode 100644
index 0000000000..afb3f47f65
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
@@ -0,0 +1,36 @@
+import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  listBenchmarkRuns,
+  startBenchmarkRun,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+export async function GET() {
+  const { authFailedResponse } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  const result = await listBenchmarkRuns();
+  return NextResponse.json(result.body, { status: result.status });
+}
+
+export async function POST(request: NextRequest) {
+  const { authFailedResponse } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  let rawBody: unknown;
+  try {
+    rawBody = await request.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  const parsed = StartBenchmarkRunRequestSchema.safeParse(rawBody);
+  if (!parsed.success) {
+    return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
+  }
+
+  const result = await startBenchmarkRun(parsed.data.kind);
+  return NextResponse.json(result.body, { status: result.status });
+}
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
new file mode 100644
index 0000000000..f0e62f1d80
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -0,0 +1,207 @@
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+  listBenchmarkRuns,
+  startBenchmarkRun,
+  getBenchmarkRoutingTable,
+} from './auto-routing-benchmark-admin-client';
+
+jest.mock('@/lib/config.server', () => ({
+  AUTO_ROUTING_BENCHMARK_WORKER_URL: 'https://benchmark-worker.example.com',
+  INTERNAL_API_SECRET: 'test-internal-secret',
+}));
+
+const mockFetch = jest.fn();
+global.fetch = mockFetch;
+
+const configResponse = {
+  config: {
+    classifierModels: ['anthropic/claude-haiku-4'],
+    deciderModels: [
+      {
+        id: 'anthropic/claude-sonnet-4',
+        supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[],
+      },
+    ],
+    minAccuracy: 0.8,
+    maxConcurrency: 4,
+    updatedAt: null,
+    updatedBy: null,
+  },
+  defaults: {
+    classifierModels: ['anthropic/claude-haiku-4'],
+    deciderModels: [
+      {
+        id: 'anthropic/claude-sonnet-4',
+        supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[],
+      },
+    ],
+    minAccuracy: 0.8,
+    maxConcurrency: 4,
+    updatedAt: null,
+    updatedBy: null,
+  },
+};
+
+const runsResponse = {
+  runs: [
+    {
+      id: 'run-1',
+      kind: 'classifier',
+      status: 'completed',
+      startedAt: '2026-06-01T00:00:00Z',
+      completedAt: '2026-06-01T01:00:00Z',
+      error: null,
+      summaries: [],
+    },
+  ],
+};
+
+describe('auto routing benchmark admin client', () => {
+  beforeEach(() => {
+    mockFetch.mockReset();
+  });
+
+  it('gets the benchmark config and sends bearer auth header', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(configResponse),
+    });
+
+    await expect(getBenchmarkConfig()).resolves.toEqual({
+      status: 200,
+      body: configResponse,
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/config',
+      {
+        method: 'GET',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+        },
+      }
+    );
+  });
+
+  it('propagates error body when upstream responds with a non-OK status', async () => {
+    mockFetch.mockResolvedValue({
+      status: 404,
+      ok: false,
+      json: () => Promise.resolve({ error: 'not found' }),
+    });
+
+    await expect(getBenchmarkConfig()).resolves.toEqual({
+      status: 404,
+      body: { error: 'not found' },
+    });
+  });
+
+  it('updates the benchmark config and sends x-updated-by header', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(configResponse),
+    });
+
+    await updateBenchmarkConfig(configResponse.config, 'admin@kilocode.ai');
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/config',
+      {
+        method: 'PUT',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+          'content-type': 'application/json',
+          'x-updated-by': 'admin@kilocode.ai',
+        },
+        body: JSON.stringify(configResponse.config),
+      }
+    );
+  });
+
+  it('lists benchmark runs', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(runsResponse),
+    });
+
+    await expect(listBenchmarkRuns()).resolves.toEqual({
+      status: 200,
+      body: runsResponse,
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/runs',
+      {
+        method: 'GET',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+        },
+      }
+    );
+  });
+
+  it('propagates error body from listBenchmarkRuns on non-OK status', async () => {
+    mockFetch.mockResolvedValue({
+      status: 401,
+      ok: false,
+      json: () => Promise.resolve({ error: 'unauthorized' }),
+    });
+
+    await expect(listBenchmarkRuns()).resolves.toEqual({
+      status: 401,
+      body: { error: 'unauthorized' },
+    });
+  });
+
+  it('starts a benchmark run with the given kind', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve({ runId: 'run-2', enqueuedModels: 3 }),
+    });
+
+    await expect(startBenchmarkRun('classifier')).resolves.toEqual({
+      status: 200,
+      body: { runId: 'run-2', enqueuedModels: 3 },
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/runs',
+      {
+        method: 'POST',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+          'content-type': 'application/json',
+        },
+        body: JSON.stringify({ kind: 'classifier' }),
+      }
+    );
+  });
+
+  it('gets the benchmark routing table', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve({ table: null, publishedAt: null }),
+    });
+
+    await expect(getBenchmarkRoutingTable()).resolves.toEqual({
+      status: 200,
+      body: { table: null, publishedAt: null },
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/routing-table',
+      {
+        method: 'GET',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+        },
+      }
+    );
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
new file mode 100644
index 0000000000..52ebee417a
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -0,0 +1,108 @@
+import {
+  BenchmarkConfigResponseSchema,
+  BenchmarkRunsResponseSchema,
+  StartBenchmarkRunResponseSchema,
+  RoutingTableSchema,
+  type BenchmarkConfig,
+  type BenchmarkKind,
+} from '@kilocode/auto-routing-contracts';
+import { AUTO_ROUTING_BENCHMARK_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
+import * as z from 'zod';
+
+export type AutoRoutingAdminResult<T> = {
+  status: number;
+  body: T;
+};
+
+type ErrorBody = { error: string };
+const ErrorBodySchema = z.object({ error: z.string() });
+
+export const BenchmarkRoutingTableResponseSchema = z.object({
+  table: RoutingTableSchema.nullable(),
+  publishedAt: z.string().nullable(),
+});
+export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
+
+type AutoRoutingBenchmarkAdminRequestInit = Omit<RequestInit, 'headers'> & {
+  headers?: Record<string, string>;
+};
+
+async function fetchBenchmarkAdmin<T>(
+  path: string,
+  init: AutoRoutingBenchmarkAdminRequestInit,
+  schema: z.ZodType<T>
+): Promise<AutoRoutingAdminResult<T | ErrorBody>> {
+  if (!AUTO_ROUTING_BENCHMARK_WORKER_URL || !INTERNAL_API_SECRET) {
+    return {
+      status: 500,
+      body: { error: 'Auto routing benchmark worker is not configured' },
+    };
+  }
+
+  const response = await fetch(`${AUTO_ROUTING_BENCHMARK_WORKER_URL}${path}`, {
+    ...init,
+    headers: {
+      authorization: `Bearer ${INTERNAL_API_SECRET}`,
+      ...init.headers,
+    },
+  });
+  // A non-JSON body (e.g. a gateway HTML error page) must still reach the status fallback below.
+  const body: unknown = await response.json().catch(() => null);
+  if (!response.ok) {
+    const parsedError = ErrorBodySchema.safeParse(body);
+    return {
+      status: response.status,
+      body: parsedError.success
+        ? parsedError.data
+        : { error: `Request failed: ${response.status}` },
+    };
+  }
+
+  return {
+    status: response.status,
+    body: schema.parse(body),
+  };
+}
+
+export function getBenchmarkConfig() {
+  return fetchBenchmarkAdmin('/admin/config', { method: 'GET' }, BenchmarkConfigResponseSchema);
+}
+
+export function updateBenchmarkConfig(config: BenchmarkConfig, updatedByEmail: string) {
+  return fetchBenchmarkAdmin(
+    '/admin/config',
+    {
+      method: 'PUT',
+      headers: {
+        'content-type': 'application/json',
+        'x-updated-by': updatedByEmail,
+      },
+      body: JSON.stringify(config),
+    },
+    BenchmarkConfigResponseSchema
+  );
+}
+
+export function listBenchmarkRuns() {
+  return fetchBenchmarkAdmin('/admin/runs', { method: 'GET' }, BenchmarkRunsResponseSchema);
+}
+
+export function startBenchmarkRun(kind: BenchmarkKind) {
+  return fetchBenchmarkAdmin(
+    '/admin/runs',
+    {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ kind }),
+    },
+    StartBenchmarkRunResponseSchema
+  );
+}
+
+export function getBenchmarkRoutingTable() {
+  return fetchBenchmarkAdmin(
+    '/admin/routing-table',
+    { method: 'GET' },
+    BenchmarkRoutingTableResponseSchema
+  );
+}
diff --git a/apps/web/src/lib/config.server.ts b/apps/web/src/lib/config.server.ts
index 6240690665..a0812e0c36 100644
--- a/apps/web/src/lib/config.server.ts
+++ b/apps/web/src/lib/config.server.ts
@@ -369,6 +369,11 @@ export const SESSION_INGEST_WORKER_URL = getEnvVariable('SESSION_INGEST_WORKER_U
 // Auto routing worker
 export const AUTO_ROUTING_WORKER_URL = getEnvVariable('AUTO_ROUTING_WORKER_URL') || '';
 
+// Auto routing benchmark worker (falls back to the hosted URL below when the env var is unset/empty)
+export const AUTO_ROUTING_BENCHMARK_WORKER_URL =
+  getEnvVariable('AUTO_ROUTING_BENCHMARK_WORKER_URL') ||
+  'https://auto-routing-benchmark.kiloapps.io';
+
 // Security Agent sync Worker command ingress
 export const SECURITY_SYNC_WORKER_URL = getEnvVariable('SECURITY_SYNC_WORKER_URL') || '';
 // Security Agent auto-analysis Worker command ingress
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 6a74bdc7e3..99da4c6a8a 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -194,7 +194,8 @@ describe('PUT /admin/config', () => {
     expect(dbRun).toHaveBeenCalled();
     // The SQL should be an INSERT OR REPLACE into benchmark_config.
     const insertCall = dbPrepare.mock.calls.find(
-      (args: unknown[]) => typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config')
+      (args: unknown[]) =>
+        typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config')
     );
     expect(insertCall).toBeDefined();
     // The updatedBy value was forwarded via bind.

From 0e34c020848573d00005d6ea74ff0bd8443dd726 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:00:32 +0200
Subject: [PATCH 14/73] feat(admin): benchmark config, runs and routing table
 panel

---
 .../auto-routing/AutoRoutingAdminContent.tsx  |   3 +
 .../auto-routing/BenchmarksSection.test.ts    |  51 ++
 .../admin/auto-routing/BenchmarksSection.tsx  | 817 ++++++++++++++++++
 .../auto-routing/BenchmarksSection.types.ts   |   8 +
 ...uto-routing-benchmark-admin-client.test.ts |  82 +-
 5 files changed, 918 insertions(+), 43 deletions(-)
 create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
 create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
 create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts

diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
index d893f27382..f55d1bccdc 100644
--- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
+++ b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
@@ -31,6 +31,7 @@ import {
   type OpenRouterModelsResponse,
 } from '@/lib/organizations/organization-types';
 import { cn } from '@/lib/utils';
+import { BenchmarksSection } from './BenchmarksSection';
 
 const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [
   { value: '1h', label: '1h' },
@@ -600,6 +601,8 @@ export function AutoRoutingAdminContent() {
           />
         </>
       )}
+
+      <BenchmarksSection />
     </div>
   );
 }
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
new file mode 100644
index 0000000000..768545f81f
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -0,0 +1,51 @@
+import { describe, expect, it } from '@jest/globals';
+import { formatAccuracy, formatUsd } from './BenchmarksSection';
+
+describe('formatAccuracy', () => {
+  it('formats 0.8542 as 85.4%', () => {
+    expect(formatAccuracy(0.8542)).toBe('85.4%');
+  });
+
+  it('formats 1.0 as 100.0%', () => {
+    expect(formatAccuracy(1.0)).toBe('100.0%');
+  });
+
+  it('formats 0 as 0.0%', () => {
+    expect(formatAccuracy(0)).toBe('0.0%');
+  });
+
+  it('formats 0.5 as 50.0%', () => {
+    expect(formatAccuracy(0.5)).toBe('50.0%');
+  });
+
+  it('rounds to one decimal place', () => {
+    expect(formatAccuracy(0.9999)).toBe('100.0%');
+    expect(formatAccuracy(0.9994)).toBe('99.9%');
+  });
+});
+
+describe('formatUsd', () => {
+  it('returns em dash for null', () => {
+    expect(formatUsd(null)).toBe('—');
+  });
+
+  it('formats a small cost with 6 decimal places', () => {
+    expect(formatUsd(0.000123)).toBe('$0.000123');
+  });
+
+  it('trims trailing zeros', () => {
+    expect(formatUsd(0.1)).toBe('$0.1');
+  });
+
+  it('formats zero as $0.0', () => {
+    expect(formatUsd(0)).toBe('$0.0');
+  });
+
+  it('formats a typical cost', () => {
+    expect(formatUsd(0.001234)).toBe('$0.001234');
+  });
+
+  it('formats a cost that fits exactly at 6dp', () => {
+    expect(formatUsd(0.000001)).toBe('$0.000001');
+  });
+});
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
new file mode 100644
index 0000000000..4d13898548
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -0,0 +1,817 @@
+'use client';
+
+import {
+  BenchmarkConfigResponseSchema,
+  BenchmarkRunsResponseSchema,
+  StartBenchmarkRunResponseSchema,
+  type BenchmarkConfig,
+  type BenchmarkRun,
+  type BenchmarkModelSummary,
+} from '@kilocode/auto-routing-contracts';
+import React, { useCallback, useEffect, useRef, useState } from 'react';
+import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
+import { toast } from 'sonner';
+import { ChevronDown, ChevronRight, Play, Plus, RotateCcw, Save, Trash2 } from 'lucide-react';
+import * as z from 'zod';
+import { Badge } from '@/components/ui/badge';
+import { Button } from '@/components/ui/button';
+import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
+import { Checkbox } from '@/components/ui/checkbox';
+import { Input } from '@/components/ui/input';
+import { Label } from '@/components/ui/label';
+import { Skeleton } from '@/components/ui/skeleton';
+import {
+  Table,
+  TableBody,
+  TableCell,
+  TableHead,
+  TableHeader,
+  TableRow,
+} from '@/components/ui/table';
+import { Textarea } from '@/components/ui/textarea';
+import {
+  BenchmarkRoutingTableResponseSchema,
+  type BenchmarkRoutingTableResponse,
+} from './BenchmarksSection.types';
+
+// ---------------------------------------------------------------------------
+// Pure helpers (exported for unit tests)
+// ---------------------------------------------------------------------------
+
+export function formatAccuracy(n: number): string {
+  return `${(n * 100).toFixed(1)}%`; // n is a ratio (1 === 100%); result keeps one decimal place
+}
+
+export function formatUsd(n: number | null): string {
+  if (n === null) return '—';
+  // Round to 6 decimal places; amounts that round to zero therefore render as "$0.0".
+  const fixed = n.toFixed(6);
+  // Strip trailing zeros but always keep one digit after the decimal point ("0.100000" → "0.1").
+  const trimmed = fixed.replace(/(\.\d*?)0+$/, '$1').replace(/\.$/, '.0');
+  return `$${trimmed}`;
+}
+
+// ---------------------------------------------------------------------------
+// API error helper (mirrors the one in AutoRoutingAdminContent.tsx)
+// ---------------------------------------------------------------------------
+
+const AdminApiErrorSchema = z.object({ error: z.string().optional() });
+
+// Parses an admin API response: throws an Error carrying the server-provided message (or a
+// generic status message) when the response is not OK, otherwise validates the body with `schema`.
+async function parseAdminResponse<T extends object>(
+  response: Response,
+  schema: z.ZodType<T>
+): Promise<T> {
+  // A non-JSON body (e.g. an HTML error page from a proxy) must not mask the HTTP status.
+  const body: unknown = await response.json().catch(() => null);
+  if (!response.ok) {
+    const parsedError = AdminApiErrorSchema.safeParse(body);
+    const message = parsedError.success ? parsedError.data.error : undefined;
+    throw new Error(message || `Request failed: ${response.status}`);
+  }
+  return schema.parse(body);
+}
+
+// ---------------------------------------------------------------------------
+// Fetch helpers
+// ---------------------------------------------------------------------------
+
+async function fetchBenchmarkConfig() {
+  const response = await fetch('/admin/api/auto-routing/benchmark-config');
+  return parseAdminResponse(response, BenchmarkConfigResponseSchema);
+}
+
+async function saveBenchmarkConfig(config: BenchmarkConfig) {
+  const response = await fetch('/admin/api/auto-routing/benchmark-config', {
+    method: 'PUT',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(config),
+  });
+  return parseAdminResponse(response, BenchmarkConfigResponseSchema);
+}
+
+async function fetchBenchmarkRuns() {
+  const response = await fetch('/admin/api/auto-routing/benchmark-runs');
+  return parseAdminResponse(response, BenchmarkRunsResponseSchema);
+}
+
+async function startBenchmarkRun(kind: 'classifier' | 'decider') {
+  const response = await fetch('/admin/api/auto-routing/benchmark-runs', {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify({ kind }),
+  });
+  return parseAdminResponse(response, StartBenchmarkRunResponseSchema);
+}
+
+async function fetchBenchmarkRoutingTable() {
+  const response = await fetch('/admin/api/auto-routing/benchmark-routing-table');
+  return parseAdminResponse<BenchmarkRoutingTableResponse>(
+    response,
+    BenchmarkRoutingTableResponseSchema
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Local form state type for decider model rows
+// ---------------------------------------------------------------------------
+
+type DeciderModelRow = {
+  id: string;
+  chat_completions: boolean;
+  responses: boolean;
+  messages: boolean;
+};
+
+function configToFormState(config: BenchmarkConfig): {
+  classifierModels: string;
+  deciderModels: DeciderModelRow[];
+  minAccuracy: number;
+  maxConcurrency: number;
+} {
+  return {
+    classifierModels: config.classifierModels.join('\n'),
+    deciderModels: config.deciderModels.map(m => ({
+      id: m.id,
+      chat_completions: m.supportedApiKinds.includes('chat_completions'),
+      responses: m.supportedApiKinds.includes('responses'),
+      messages: m.supportedApiKinds.includes('messages'),
+    })),
+    minAccuracy: config.minAccuracy,
+    maxConcurrency: config.maxConcurrency,
+  };
+}
+
+// Converts editor form state back into a BenchmarkConfig payload: blank classifier lines and
+// decider rows without an id are dropped; updatedAt/updatedBy are carried over from `base`.
+function formStateToConfig(
+  state: ReturnType<typeof configToFormState>,
+  base: BenchmarkConfig
+): BenchmarkConfig {
+  const { minAccuracy, maxConcurrency } = state;
+  const { updatedAt, updatedBy } = base;
+
+  const classifierModels = state.classifierModels
+    .split('\n')
+    .map(line => line.trim())
+    .filter(line => line.length > 0);
+  const deciderModels = state.deciderModels
+    .filter(row => row.id.trim().length > 0)
+    .map(row => {
+      // Checked API kinds, in fixed order; a row with none checked defaults to chat_completions.
+      const checked = (['chat_completions', 'responses', 'messages'] as const).filter(k => row[k]);
+      return {
+        id: row.id.trim(),
+        supportedApiKinds: checked.length ? checked : ['chat_completions' as const],
+      };
+    });
+
+  return { classifierModels, deciderModels, minAccuracy, maxConcurrency, updatedAt, updatedBy };
+}
+
+// ---------------------------------------------------------------------------
+// Config editor sub-component
+// ---------------------------------------------------------------------------
+
+function BenchmarkConfigEditor({
+  config,
+  defaults,
+  onSaved,
+}: {
+  config: BenchmarkConfig;
+  defaults: BenchmarkConfig;
+  onSaved: (next: { config: BenchmarkConfig; defaults: BenchmarkConfig }) => void;
+}) {
+  const [form, setForm] = useState(() => configToFormState(config));
+
+  // Sync when config changes from outside (initial load / after save)
+  const prevConfigRef = useRef(config);
+  useEffect(() => {
+    if (prevConfigRef.current !== config) {
+      prevConfigRef.current = config;
+      setForm(configToFormState(config));
+    }
+  }, [config]);
+
+  const saveMutation = useMutation({
+    mutationFn: saveBenchmarkConfig,
+    onSuccess: data => {
+      onSaved(data);
+      toast.success('Benchmark config saved');
+    },
+    onError: (error: unknown) => {
+      toast.error(error instanceof Error ? error.message : 'Failed to save benchmark config');
+    },
+  });
+
+  const handleResetToDefaults = useCallback(() => {
+    setForm(configToFormState(defaults));
+  }, [defaults]);
+
+  const handleAddDeciderRow = useCallback(() => {
+    setForm(prev => ({
+      ...prev,
+      deciderModels: [
+        ...prev.deciderModels,
+        { id: '', chat_completions: true, responses: false, messages: false },
+      ],
+    }));
+  }, []);
+
+  const handleRemoveDeciderRow = useCallback((index: number) => {
+    setForm(prev => ({
+      ...prev,
+      deciderModels: prev.deciderModels.filter((_, i) => i !== index),
+    }));
+  }, []);
+
+  const handleDeciderRowChange = useCallback(
+    (index: number, patch: Partial<DeciderModelRow>) => {
+      setForm(prev => ({
+        ...prev,
+        deciderModels: prev.deciderModels.map((row, i) =>
+          i === index ? { ...row, ...patch } : row
+        ),
+      }));
+    },
+    []
+  );
+
+  const handleSave = useCallback(() => {
+    saveMutation.mutate(formStateToConfig(form, config));
+  }, [form, config, saveMutation]);
+
+  return (
+    <Card className="rounded-lg">
+      <CardHeader className="p-4 pb-2">
+        <CardTitle className="text-base">Benchmark Config</CardTitle>
+      </CardHeader>
+      <CardContent className="flex flex-col gap-4 p-4 pt-0">
+        {/* Classifier models */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-classifier-models" className="text-sm font-medium">
+            Classifier models (one per line)
+          </Label>
+          <Textarea
+            id="benchmark-classifier-models"
+            value={form.classifierModels}
+            onChange={e => setForm(prev => ({ ...prev, classifierModels: e.target.value }))}
+            rows={4}
+            className="font-mono text-xs"
+            placeholder="openai/gpt-4o-mini"
+          />
+        </div>
+
+        {/* Decider models table */}
+        <div className="flex flex-col gap-1.5">
+          <Label className="text-sm font-medium">Decider models</Label>
+          <div className="rounded-md border">
+            <Table>
+              <TableHeader>
+                <TableRow>
+                  <TableHead>Model ID</TableHead>
+                  <TableHead className="w-32 text-center">chat_completions</TableHead>
+                  <TableHead className="w-24 text-center">responses</TableHead>
+                  <TableHead className="w-24 text-center">messages</TableHead>
+                  <TableHead className="w-12" />
+                </TableRow>
+              </TableHeader>
+              <TableBody>
+                {form.deciderModels.map((row, index) => (
+                  <TableRow key={index}>
+                    <TableCell className="py-2">
+                      <Input
+                        value={row.id}
+                        onChange={e => handleDeciderRowChange(index, { id: e.target.value })}
+                        className="h-8 font-mono text-xs"
+                        placeholder="openai/gpt-4o"
+                        aria-label={`Decider model ${index + 1} ID`}
+                      />
+                    </TableCell>
+                    <TableCell className="py-2 text-center">
+                      <Checkbox
+                        checked={row.chat_completions}
+                        onCheckedChange={checked =>
+                          handleDeciderRowChange(index, { chat_completions: checked === true })
+                        }
+                        aria-label={`Model ${index + 1} supports chat_completions`}
+                      />
+                    </TableCell>
+                    <TableCell className="py-2 text-center">
+                      <Checkbox
+                        checked={row.responses}
+                        onCheckedChange={checked =>
+                          handleDeciderRowChange(index, { responses: checked === true })
+                        }
+                        aria-label={`Model ${index + 1} supports responses`}
+                      />
+                    </TableCell>
+                    <TableCell className="py-2 text-center">
+                      <Checkbox
+                        checked={row.messages}
+                        onCheckedChange={checked =>
+                          handleDeciderRowChange(index, { messages: checked === true })
+                        }
+                        aria-label={`Model ${index + 1} supports messages`}
+                      />
+                    </TableCell>
+                    <TableCell className="py-2">
+                      <Button
+                        type="button"
+                        variant="ghost"
+                        size="icon"
+                        className="h-8 w-8 text-destructive hover:text-destructive"
+                        onClick={() => handleRemoveDeciderRow(index)}
+                        aria-label={`Remove decider model ${index + 1}`}
+                      >
+                        <Trash2 className="size-3.5" />
+                      </Button>
+                    </TableCell>
+                  </TableRow>
+                ))}
+              </TableBody>
+            </Table>
+          </div>
+          <Button
+            type="button"
+            variant="outline"
+            size="sm"
+            className="w-fit"
+            onClick={handleAddDeciderRow}
+          >
+            <Plus className="size-3.5" />
+            Add model
+          </Button>
+        </div>
+
+        {/* Numeric inputs */}
+        <div className="grid gap-4 sm:grid-cols-2">
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-min-accuracy" className="text-sm font-medium">
+              Min accuracy (0–1)
+            </Label>
+            <Input
+              id="benchmark-min-accuracy"
+              type="number"
+              min={0}
+              max={1}
+              step={0.05}
+              value={form.minAccuracy}
+              onChange={e =>
+                setForm(prev => ({ ...prev, minAccuracy: parseFloat(e.target.value) || 0 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
+              Max concurrency (1–16)
+            </Label>
+            <Input
+              id="benchmark-max-concurrency"
+              type="number"
+              min={1}
+              max={16}
+              step={1}
+              value={form.maxConcurrency}
+              onChange={e =>
+                setForm(prev => ({ ...prev, maxConcurrency: parseInt(e.target.value, 10) || 1 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+        </div>
+
+        {/* Actions + metadata */}
+        <div className="flex flex-col gap-2">
+          <div className="flex flex-wrap gap-2">
+            <Button
+              type="button"
+              onClick={handleSave}
+              disabled={saveMutation.isPending}
+            >
+              <Save className="size-4" />
+              Save config
+            </Button>
+            <Button type="button" variant="outline" onClick={handleResetToDefaults}>
+              <RotateCcw className="size-4" />
+              Reset to defaults
+            </Button>
+          </div>
+          {config.updatedAt ? (
+            <p className="text-muted-foreground text-xs">
+              Last updated {config.updatedAt}
+              {config.updatedBy ? ` by ${config.updatedBy}` : ''}
+            </p>
+          ) : null}
+        </div>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Run summaries expandable table
+// ---------------------------------------------------------------------------
+
+function RunSummariesTable({ run }: { run: BenchmarkRun }) {
+  const isDecider = run.kind === 'decider';
+  // Decider runs: order by tier (low → medium → high → other), best accuracy first within a tier.
+  const sortedSummaries: BenchmarkModelSummary[] = isDecider
+    ? [...run.summaries].sort((a, b) => {
+        const tierOrder = { low: 0, medium: 1, high: 2, '*': 3 };
+        const tierDiff =
+          (tierOrder[a.tier as keyof typeof tierOrder] ?? 3) -
+          (tierOrder[b.tier as keyof typeof tierOrder] ?? 3);
+        if (tierDiff !== 0) return tierDiff;
+        return b.accuracy - a.accuracy;
+      })
+    : run.summaries;
+  // These rows are rendered inside the runs table body, so cells span all six of its columns.
+  if (sortedSummaries.length === 0) {
+    return (
+      <TableRow>
+        <TableCell colSpan={6} className="text-muted-foreground h-10 text-center text-xs">
+          No summaries
+        </TableCell>
+      </TableRow>
+    );
+  }
+
+  return (
+    <>
+      <TableRow className="bg-muted/30">
+        <TableCell colSpan={6} className="px-4 py-2">
+          <Table>
+            <TableHeader>
+              <TableRow>
+                <TableHead className="text-xs">Model</TableHead>
+                {isDecider ? <TableHead className="text-xs">Tier</TableHead> : null}
+                <TableHead className="text-right text-xs">Accuracy</TableHead>
+                <TableHead className="text-right text-xs">Avg cost</TableHead>
+                <TableHead className="text-right text-xs">Avg latency</TableHead>
+                <TableHead className="text-right text-xs">p50 latency</TableHead>
+                <TableHead className="text-right text-xs">Cases</TableHead>
+                <TableHead className="text-right text-xs">Errors</TableHead>
+              </TableRow>
+            </TableHeader>
+            <TableBody>
+              {sortedSummaries.map((s, i) => (
+                <TableRow key={`${s.model}-${s.tier}-${i}`}>
+                  <TableCell className="max-w-56 truncate font-mono text-xs">{s.model}</TableCell>
+                  {isDecider ? (
+                    <TableCell className="text-xs capitalize">{s.tier}</TableCell>
+                  ) : null}
+                  <TableCell className="text-right tabular-nums text-xs">
+                    {formatAccuracy(s.accuracy)}
+                  </TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">
+                    {formatUsd(s.avgCostUsd)}
+                  </TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">
+                    {s.avgLatencyMs.toFixed(0)} ms
+                  </TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">
+                    {s.p50LatencyMs !== null ? `${s.p50LatencyMs.toFixed(0)} ms` : '—'}
+                  </TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">{s.cases}</TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">{s.errors}</TableCell>
+                </TableRow>
+              ))}
+            </TableBody>
+          </Table>
+        </TableCell>
+      </TableRow>
+    </>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Runs table
+// ---------------------------------------------------------------------------
+
+function statusBadgeVariant(
+  status: BenchmarkRun['status']
+): 'default' | 'secondary' | 'destructive' {
+  return status === 'completed' ? 'default' : status === 'running' ? 'secondary' : 'destructive';
+}
+
+function BenchmarkRunsTable({ runs }: { runs: BenchmarkRun[] }) {
+  const [expandedIds, setExpandedIds] = useState<Set<string>>(new Set());
+
+  const toggleExpand = useCallback((id: string) => {
+    setExpandedIds(prev => {
+      const next = new Set(prev);
+      if (next.has(id)) {
+        next.delete(id);
+      } else {
+        next.add(id);
+      }
+      return next;
+    });
+  }, []);
+
+  if (runs.length === 0) {
+    return (
+      <TableRow>
+        <TableCell colSpan={6} className="text-muted-foreground h-16 text-center">
+          No runs yet
+        </TableCell>
+      </TableRow>
+    );
+  }
+
+  return (
+    <>
+      {runs.map(run => {
+        const expanded = expandedIds.has(run.id);
+        return (
+          <React.Fragment key={run.id}>
+            <TableRow
+              className="cursor-pointer"
+              onClick={() => toggleExpand(run.id)}
+              aria-expanded={expanded}
+            >
+              <TableCell className="w-8 py-2">
+                {expanded ? (
+                  <ChevronDown className="size-4" />
+                ) : (
+                  <ChevronRight className="size-4" />
+                )}
+              </TableCell>
+              <TableCell className="py-2 capitalize text-sm">{run.kind}</TableCell>
+              <TableCell className="py-2">
+                <Badge variant={statusBadgeVariant(run.status)} className="capitalize">
+                  {run.status}
+                </Badge>
+              </TableCell>
+              <TableCell className="py-2 text-xs tabular-nums">{run.startedAt}</TableCell>
+              <TableCell className="py-2 text-xs tabular-nums">
+                {run.completedAt ?? '—'}
+              </TableCell>
+              <TableCell className="py-2 text-xs text-destructive max-w-48 truncate">
+                {run.error ?? ''}
+              </TableCell>
+            </TableRow>
+            {expanded ? <RunSummariesTable run={run} /> : null}
+          </React.Fragment>
+        );
+      })}
+    </>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Routing table view
+// ---------------------------------------------------------------------------
+
+function RoutingTableView({
+  data,
+}: {
+  data: BenchmarkRoutingTableResponse;
+}) {
+  if (!data.table) {
+    return (
+      <p className="text-muted-foreground text-sm">No routing table published yet.</p>
+    );
+  }
+
+  const { table } = data;
+  const tierEntries = [
+    { tier: 'low', candidates: table.tiers.low },
+    { tier: 'medium', candidates: table.tiers.medium },
+    { tier: 'high', candidates: table.tiers.high },
+  ] as const;
+
+  return (
+    <div className="flex flex-col gap-3">
+      <div className="text-muted-foreground text-xs flex flex-wrap gap-x-4 gap-y-1">
+        <span>Version: <span className="font-mono">{table.version}</span></span>
+        <span>Generated: {table.generatedAt}</span>
+        <span>Min accuracy: {formatAccuracy(table.minAccuracy)}</span>
+        <span>Source: <span className="capitalize">{table.source}</span></span>
+      </div>
+
+      {tierEntries.map(({ tier, candidates }) => (
+        <div key={tier}>
+          <p className="text-sm font-medium capitalize mb-1.5">{tier} tier</p>
+          <div className="rounded-md border">
+            <Table>
+              <TableHeader>
+                <TableRow>
+                  <TableHead>Model</TableHead>
+                  <TableHead className="text-right">Accuracy</TableHead>
+                  <TableHead className="text-right">Avg cost</TableHead>
+                  <TableHead>Threshold</TableHead>
+                  <TableHead>API kinds</TableHead>
+                </TableRow>
+              </TableHeader>
+              <TableBody>
+                {candidates.map((c, i) => (
+                  <TableRow key={`${tier}-${c.model}-${i}`}>
+                    <TableCell className="max-w-56 truncate font-mono text-xs">{c.model}</TableCell>
+                    <TableCell className="text-right tabular-nums text-xs">
+                      {formatAccuracy(c.accuracy)}
+                    </TableCell>
+                    <TableCell className="text-right tabular-nums text-xs">
+                      {formatUsd(c.avgCostUsd)}
+                    </TableCell>
+                    <TableCell>
+                      <Badge variant={c.meetsThreshold ? 'default' : 'secondary'}>
+                        {c.meetsThreshold ? 'meets' : 'below'}
+                      </Badge>
+                    </TableCell>
+                    <TableCell className="text-xs">
+                      <div className="flex flex-wrap gap-1">
+                        {c.supportedApiKinds.map(kind => (
+                          <Badge key={kind} variant="outline" className="font-mono text-xs px-1">
+                            {kind}
+                          </Badge>
+                        ))}
+                      </div>
+                    </TableCell>
+                  </TableRow>
+                ))}
+              </TableBody>
+            </Table>
+          </div>
+        </div>
+      ))}
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Main exported section component
+// ---------------------------------------------------------------------------
+
+export function BenchmarksSection() {
+  const queryClient = useQueryClient();
+
+  const configQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-config'],
+    queryFn: fetchBenchmarkConfig,
+  });
+
+  const runsQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-runs'],
+    queryFn: fetchBenchmarkRuns,
+  });
+
+  const routingTableQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-routing-table'],
+    queryFn: fetchBenchmarkRoutingTable,
+  });
+
+  // Poll runs every 30s while any run is 'running'. Depend on the stable `refetch` function, not
+  // the per-render query result object, so unrelated re-renders do not keep restarting the timer.
+  const hasRunningRun = runsQuery.data?.runs.some(r => r.status === 'running') ?? false;
+  const refetchRuns = runsQuery.refetch;
+  useEffect(() => {
+    if (!hasRunningRun) return;
+    const id = setInterval(() => void refetchRuns(), 30_000);
+    return () => clearInterval(id);
+  }, [hasRunningRun, refetchRuns]);
+
+  const startRunMutation = useMutation({
+    mutationFn: startBenchmarkRun,
+    onSuccess: (data, kind) => {
+      toast.success(
+        `${kind === 'classifier' ? 'Classifier' : 'Decider'} benchmark started — ${data.enqueuedModels} models enqueued`
+      );
+      void queryClient.invalidateQueries({ queryKey: ['auto-routing', 'benchmark-runs'] });
+    },
+    onError: (error: unknown) => {
+      toast.error(error instanceof Error ? error.message : 'Failed to start benchmark run');
+    },
+  });
+
+  const handleConfigSaved = useCallback(
+    (next: { config: BenchmarkConfig; defaults: BenchmarkConfig }) => {
+      queryClient.setQueryData(['auto-routing', 'benchmark-config'], next);
+    },
+    [queryClient]
+  );
+
+  const anyRunning =
+    hasRunningRun || startRunMutation.isPending;
+
+  return (
+    <div className="flex flex-col gap-4">
+      <div>
+        <h2 className="text-lg font-semibold">Benchmarks</h2>
+        <p className="text-muted-foreground text-sm">
+          Benchmark configuration, runs, and published routing table.
+        </p>
+      </div>
+
+      {/* Config editor */}
+      {configQuery.isLoading ? (
+        <Card className="rounded-lg">
+          <CardContent className="p-4">
+            <Skeleton className="h-48 w-full" />
+          </CardContent>
+        </Card>
+      ) : configQuery.error ? (
+        <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+          {configQuery.error instanceof Error
+            ? configQuery.error.message
+            : 'Failed to load benchmark config'}
+        </div>
+      ) : configQuery.data ? (
+        <BenchmarkConfigEditor
+          config={configQuery.data.config}
+          defaults={configQuery.data.defaults}
+          onSaved={handleConfigSaved}
+        />
+      ) : null}
+
+      {/* Run controls */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Run Benchmark</CardTitle>
+        </CardHeader>
+        <CardContent className="flex flex-wrap gap-2 p-4 pt-0">
+          <Button
+            type="button"
+            variant="outline"
+            disabled={anyRunning}
+            onClick={() => startRunMutation.mutate('classifier')}
+          >
+            <Play className="size-4" />
+            Run classifier benchmark
+          </Button>
+          <Button
+            type="button"
+            variant="outline"
+            disabled={anyRunning}
+            onClick={() => startRunMutation.mutate('decider')}
+          >
+            <Play className="size-4" />
+            Run decider benchmark
+          </Button>
+          {hasRunningRun ? (
+            <p className="text-muted-foreground self-center text-xs">
+              A benchmark is running — refreshing every 30 s
+            </p>
+          ) : null}
+        </CardContent>
+      </Card>
+
+      {/* Runs table */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Benchmark Runs</CardTitle>
+        </CardHeader>
+        <CardContent className="p-4 pt-0">
+          {runsQuery.isLoading ? (
+            <Skeleton className="h-24 w-full" />
+          ) : runsQuery.error ? (
+            <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+              {runsQuery.error instanceof Error
+                ? runsQuery.error.message
+                : 'Failed to load benchmark runs'}
+            </div>
+          ) : (
+            <Table>
+              <TableHeader>
+                <TableRow>
+                  <TableHead className="w-8" />
+                  <TableHead>Kind</TableHead>
+                  <TableHead>Status</TableHead>
+                  <TableHead>Started</TableHead>
+                  <TableHead>Completed</TableHead>
+                  <TableHead>Error</TableHead>
+                </TableRow>
+              </TableHeader>
+              <TableBody>
+                <BenchmarkRunsTable runs={runsQuery.data?.runs ?? []} />
+              </TableBody>
+            </Table>
+          )}
+        </CardContent>
+      </Card>
+
+      {/* Routing table */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Published Routing Table</CardTitle>
+        </CardHeader>
+        <CardContent className="p-4 pt-0">
+          {routingTableQuery.isLoading ? (
+            <Skeleton className="h-32 w-full" />
+          ) : routingTableQuery.error ? (
+            <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+              {routingTableQuery.error instanceof Error
+                ? routingTableQuery.error.message
+                : 'Failed to load routing table'}
+            </div>
+          ) : routingTableQuery.data ? (
+            <RoutingTableView data={routingTableQuery.data} />
+          ) : null}
+        </CardContent>
+      </Card>
+    </div>
+  );
+}
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
new file mode 100644
index 0000000000..17ccd94cbf
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
@@ -0,0 +1,8 @@
+import * as z from 'zod';
+import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
+
+export const BenchmarkRoutingTableResponseSchema = z.object({
+  table: RoutingTableSchema.nullable(),
+  publishedAt: z.string().nullable(),
+});
+export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index f0e62f1d80..8db475b305 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -20,7 +20,11 @@ const configResponse = {
     deciderModels: [
       {
         id: 'anthropic/claude-sonnet-4',
-        supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[],
+        supportedApiKinds: ['chat_completions' as const] as (
+          | 'chat_completions'
+          | 'responses'
+          | 'messages'
+        )[],
       },
     ],
     minAccuracy: 0.8,
@@ -33,7 +37,11 @@ const configResponse = {
     deciderModels: [
       {
         id: 'anthropic/claude-sonnet-4',
-        supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[],
+        supportedApiKinds: ['chat_completions' as const] as (
+          | 'chat_completions'
+          | 'responses'
+          | 'messages'
+        )[],
       },
     ],
     minAccuracy: 0.8,
@@ -74,15 +82,12 @@ describe('auto routing benchmark admin client', () => {
       body: configResponse,
     });
 
-    expect(mockFetch).toHaveBeenCalledWith(
-      'https://benchmark-worker.example.com/admin/config',
-      {
-        method: 'GET',
-        headers: {
-          authorization: 'Bearer test-internal-secret',
-        },
-      }
-    );
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/config', {
+      method: 'GET',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+      },
+    });
   });
 
   it('propagates error body when upstream responds with a non-OK status', async () => {
@@ -107,18 +112,15 @@ describe('auto routing benchmark admin client', () => {
 
     await updateBenchmarkConfig(configResponse.config, 'admin@kilocode.ai');
 
-    expect(mockFetch).toHaveBeenCalledWith(
-      'https://benchmark-worker.example.com/admin/config',
-      {
-        method: 'PUT',
-        headers: {
-          authorization: 'Bearer test-internal-secret',
-          'content-type': 'application/json',
-          'x-updated-by': 'admin@kilocode.ai',
-        },
-        body: JSON.stringify(configResponse.config),
-      }
-    );
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/config', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+        'x-updated-by': 'admin@kilocode.ai',
+      },
+      body: JSON.stringify(configResponse.config),
+    });
   });
 
   it('lists benchmark runs', async () => {
@@ -133,15 +135,12 @@ describe('auto routing benchmark admin client', () => {
       body: runsResponse,
     });
 
-    expect(mockFetch).toHaveBeenCalledWith(
-      'https://benchmark-worker.example.com/admin/runs',
-      {
-        method: 'GET',
-        headers: {
-          authorization: 'Bearer test-internal-secret',
-        },
-      }
-    );
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'GET',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+      },
+    });
   });
 
   it('propagates error body from listBenchmarkRuns on non-OK status', async () => {
@@ -169,17 +168,14 @@ describe('auto routing benchmark admin client', () => {
       body: { runId: 'run-2', enqueuedModels: 3 },
     });
 
-    expect(mockFetch).toHaveBeenCalledWith(
-      'https://benchmark-worker.example.com/admin/runs',
-      {
-        method: 'POST',
-        headers: {
-          authorization: 'Bearer test-internal-secret',
-          'content-type': 'application/json',
-        },
-        body: JSON.stringify({ kind: 'classifier' }),
-      }
-    );
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ kind: 'classifier' }),
+    });
   });
 
   it('gets the benchmark routing table', async () => {

From fb084c32ee8e1617df993439b4decc446eeb6ffe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:06:05 +0200
Subject: [PATCH 15/73] fix(admin): stabilize benchmark runs polling interval
 dependencies

---
 .../admin/auto-routing/BenchmarksSection.tsx  | 62 ++++++++-----------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 4d13898548..c7ae9db12a 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -158,7 +158,10 @@ function formStateToConfig(
       if (row.chat_completions) kinds.push('chat_completions');
       if (row.responses) kinds.push('responses');
       if (row.messages) kinds.push('messages');
-      return { id: row.id.trim(), supportedApiKinds: kinds.length ? kinds : ['chat_completions' as const] };
+      return {
+        id: row.id.trim(),
+        supportedApiKinds: kinds.length ? kinds : ['chat_completions' as const],
+      };
     });
   return {
     classifierModels,
@@ -226,17 +229,12 @@ function BenchmarkConfigEditor({
     }));
   }, []);
 
-  const handleDeciderRowChange = useCallback(
-    (index: number, patch: Partial<DeciderModelRow>) => {
-      setForm(prev => ({
-        ...prev,
-        deciderModels: prev.deciderModels.map((row, i) =>
-          i === index ? { ...row, ...patch } : row
-        ),
-      }));
-    },
-    []
-  );
+  const handleDeciderRowChange = useCallback((index: number, patch: Partial<DeciderModelRow>) => {
+    setForm(prev => ({
+      ...prev,
+      deciderModels: prev.deciderModels.map((row, i) => (i === index ? { ...row, ...patch } : row)),
+    }));
+  }, []);
 
   const handleSave = useCallback(() => {
     saveMutation.mutate(formStateToConfig(form, config));
@@ -386,11 +384,7 @@ function BenchmarkConfigEditor({
         {/* Actions + metadata */}
         <div className="flex flex-col gap-2">
           <div className="flex flex-wrap gap-2">
-            <Button
-              type="button"
-              onClick={handleSave}
-              disabled={saveMutation.isPending}
-            >
+            <Button type="button" onClick={handleSave} disabled={saveMutation.isPending}>
               <Save className="size-4" />
               Save config
             </Button>
@@ -491,7 +485,9 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
 // Runs table
 // ---------------------------------------------------------------------------
 
-function statusBadgeVariant(status: BenchmarkRun['status']): 'default' | 'secondary' | 'destructive' {
+function statusBadgeVariant(
+  status: BenchmarkRun['status']
+): 'default' | 'secondary' | 'destructive' {
   if (status === 'completed') return 'default';
   if (status === 'running') return 'secondary';
   return 'destructive';
@@ -547,9 +543,7 @@ function BenchmarkRunsTable({ runs }: { runs: BenchmarkRun[] }) {
                 </Badge>
               </TableCell>
               <TableCell className="py-2 text-xs tabular-nums">{run.startedAt}</TableCell>
-              <TableCell className="py-2 text-xs tabular-nums">
-                {run.completedAt ?? '—'}
-              </TableCell>
+              <TableCell className="py-2 text-xs tabular-nums">{run.completedAt ?? '—'}</TableCell>
               <TableCell className="py-2 text-xs text-destructive max-w-48 truncate">
                 {run.error ?? ''}
               </TableCell>
@@ -566,15 +560,9 @@ function BenchmarkRunsTable({ runs }: { runs: BenchmarkRun[] }) {
 // Routing table view
 // ---------------------------------------------------------------------------
 
-function RoutingTableView({
-  data,
-}: {
-  data: BenchmarkRoutingTableResponse;
-}) {
+function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
   if (!data.table) {
-    return (
-      <p className="text-muted-foreground text-sm">No routing table published yet.</p>
-    );
+    return <p className="text-muted-foreground text-sm">No routing table published yet.</p>;
   }
 
   const { table } = data;
@@ -587,10 +575,14 @@ function RoutingTableView({
   return (
     <div className="flex flex-col gap-3">
       <div className="text-muted-foreground text-xs flex flex-wrap gap-x-4 gap-y-1">
-        <span>Version: <span className="font-mono">{table.version}</span></span>
+        <span>
+          Version: <span className="font-mono">{table.version}</span>
+        </span>
         <span>Generated: {table.generatedAt}</span>
         <span>Min accuracy: {formatAccuracy(table.minAccuracy)}</span>
-        <span>Source: <span className="capitalize">{table.source}</span></span>
+        <span>
+          Source: <span className="capitalize">{table.source}</span>
+        </span>
       </div>
 
       {tierEntries.map(({ tier, candidates }) => (
@@ -666,13 +658,14 @@ export function BenchmarksSection() {
 
   // Poll runs every 30s while any run is 'running'
   const hasRunningRun = runsQuery.data?.runs.some(r => r.status === 'running') ?? false;
+  const refetchRuns = runsQuery.refetch;
   useEffect(() => {
     if (!hasRunningRun) return;
     const id = setInterval(() => {
-      void runsQuery.refetch();
+      void refetchRuns();
     }, 30_000);
     return () => clearInterval(id);
-  }, [hasRunningRun, runsQuery]);
+  }, [hasRunningRun, refetchRuns]);
 
   const startRunMutation = useMutation({
     mutationFn: startBenchmarkRun,
@@ -694,8 +687,7 @@ export function BenchmarksSection() {
     [queryClient]
   );
 
-  const anyRunning =
-    hasRunningRun || startRunMutation.isPending;
+  const anyRunning = hasRunningRun || startRunMutation.isPending;
 
   return (
     <div className="flex flex-col gap-4">

From 9f2d876e5f655270136cb82a8d9ec10d97beab98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:20:41 +0200
Subject: [PATCH 16/73] feat(web): internal token mint endpoint for
 auto-routing benchmark

Mints a short-lived (6h) user API token for a given userId, guarded by the
shared internal secret over Authorization: Bearer. The decider benchmark uses
this to authenticate the kilo CLI against the gateway under a real user's
identity.
---
 .../token/route.test.ts                       | 85 +++++++++++++++++++
 .../auto-routing-benchmark/token/route.ts     | 82 ++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
 create mode 100644 apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts

diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
new file mode 100644
index 0000000000..33c01fe1d1
--- /dev/null
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
@@ -0,0 +1,85 @@
+import { NextRequest } from 'next/server';
+import { generateApiToken } from '@/lib/tokens';
+
+jest.mock('@/lib/config.server', () => ({
+  INTERNAL_API_SECRET: 'internal-secret',
+}));
+
+// Chainable drizzle query builder mock. `.limit()` resolves to the rows we set.
+const mockRows: unknown[] = [];
+jest.mock('@/lib/drizzle', () => ({
+  db: {
+    select: () => ({
+      from: () => ({
+        where: () => ({
+          limit: () => Promise.resolve(mockRows),
+        }),
+      }),
+    }),
+  },
+}));
+
+jest.mock('@/lib/tokens', () => ({
+  generateApiToken: jest.fn(() => 'minted-token'),
+}));
+
+import { POST } from './route';
+
+const mockGenerateApiToken = jest.mocked(generateApiToken);
+
+function createRequest(body: unknown, headers: Record<string, string> = {}) {
+  const url = 'http://localhost:3000/api/internal/auto-routing-benchmark/token';
+  // Spread caller headers last so a test can override the JSON content-type default.
+  const mergedHeaders = { 'content-type': 'application/json', ...headers };
+  const init = { method: 'POST', body: JSON.stringify(body), headers: mergedHeaders };
+  return new NextRequest(url, init);
+}
+
+describe('POST /api/internal/auto-routing-benchmark/token', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockRows.length = 0;
+  });
+
+  it('returns 401 without the bearer secret', async () => {
+    mockRows.push({ id: 'user-1', api_token_pepper: 'pepper' });
+    const res = await POST(createRequest({ userId: 'user-1' }));
+    expect(res.status).toBe(401);
+    expect(mockGenerateApiToken).not.toHaveBeenCalled();
+  });
+
+  it('returns 401 with the wrong bearer secret', async () => {
+    const res = await POST(
+      createRequest({ userId: 'user-1' }, { authorization: 'Bearer wrong' })
+    );
+    expect(res.status).toBe(401);
+  });
+
+  it('returns 400 for an invalid body', async () => {
+    const res = await POST(createRequest({}, { authorization: 'Bearer internal-secret' }));
+    expect(res.status).toBe(400);
+  });
+
+  it('returns 404 when the user does not exist', async () => {
+    const res = await POST(
+      createRequest({ userId: 'missing' }, { authorization: 'Bearer internal-secret' })
+    );
+    expect(res.status).toBe(404);
+    expect(mockGenerateApiToken).not.toHaveBeenCalled();
+  });
+
+  it('mints a 6h token for an existing user', async () => {
+    const user = { id: 'user-1', api_token_pepper: 'pepper' };
+    mockRows.push(user);
+    const res = await POST(
+      createRequest({ userId: 'user-1' }, { authorization: 'Bearer internal-secret' })
+    );
+    expect(res.status).toBe(200);
+    const json = (await res.json()) as { token: string; expiresAt: string };
+    expect(json.token).toBe('minted-token');
+    expect(typeof json.expiresAt).toBe('string');
+    expect(mockGenerateApiToken).toHaveBeenCalledWith(user, undefined, {
+      expiresIn: 6 * 60 * 60,
+    });
+  });
+});
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
new file mode 100644
index 0000000000..5278cce9db
--- /dev/null
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
@@ -0,0 +1,82 @@
+/**
+ * Internal API: mint a short-lived user API token for the auto-routing
+ * decider benchmark.
+ *
+ * Called by:
+ * - services/auto-routing-benchmark — the decider benchmark runs each case
+ *   through the real `kilo` CLI inside a Cloudflare Container. The CLI
+ *   authenticates against the gateway with a user API token, so the worker
+ *   fetches a fresh, short-lived token for the configured benchmark user
+ *   once per queue message.
+ *
+ * Auth: shared internal secret over `Authorization: Bearer <secret>` — this
+ * is the exact header the benchmark worker sends
+ * (`Authorization: Bearer ${INTERNAL_API_SECRET_PROD}`), and
+ * INTERNAL_API_SECRET_PROD holds the same value as INTERNAL_API_SECRET here.
+ *
+ * The minted token is a full user API token (includes apiTokenPepper) so the
+ * gateway accepts it as a real user token; an internal-service token would be
+ * rejected by gateway pepper validation. It expires in 6 hours.
+ *
+ * URL: POST /api/internal/auto-routing-benchmark/token
+ */
+
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import { z } from 'zod';
+import { eq } from 'drizzle-orm';
+import { kilocode_users } from '@kilocode/db/schema';
+import { db } from '@/lib/drizzle';
+import { generateApiToken } from '@/lib/tokens';
+import { INTERNAL_API_SECRET } from '@/lib/config.server';
+
+const RequestSchema = z.object({ userId: z.string().min(1) });
+
+const SIX_HOURS_IN_SECONDS = 6 * 60 * 60;
+
+// Inline bearer extraction (case-insensitive prefix, RFC 6750 §2.1). Kept local
+// to avoid importing @kilocode/worker-utils, whose transitive `jose` ESM import
+// breaks under jest's CJS transform.
+function extractBearerToken(authHeader: string | null): string | null {
+  const header = (authHeader ?? '').trim();
+  const hasBearerScheme = header.substring(0, 7).toLowerCase() === 'bearer ';
+  const credentials = header.substring(7).trim();
+  return hasBearerScheme && credentials !== '' ? credentials : null;
+}
+
+export async function POST(req: NextRequest) {
+  // Auth gate runs first: without the secret, no body is read and no user is looked up.
+  const presented = extractBearerToken(req.headers.get('authorization'));
+  const authorized = Boolean(INTERNAL_API_SECRET) && presented === INTERNAL_API_SECRET;
+  if (!authorized) {
+    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
+  }
+
+  let payload: unknown;
+  try {
+    payload = await req.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  const validation = RequestSchema.safeParse(payload);
+  if (!validation.success) {
+    const failure = { error: 'Invalid request body', issues: validation.error.issues };
+    return NextResponse.json(failure, { status: 400 });
+  }
+
+  const rows = await db
+    .select()
+    .from(kilocode_users)
+    .where(eq(kilocode_users.id, validation.data.userId))
+    .limit(1);
+  const user = rows[0];
+  if (!user) {
+    return NextResponse.json({ error: 'User not found' }, { status: 404 });
+  }
+
+  const apiToken = generateApiToken(user, undefined, { expiresIn: SIX_HOURS_IN_SECONDS });
+  // Reported expiry is computed here from the same TTL constant passed to generateApiToken.
+  const expiresAt = new Date(Date.now() + SIX_HOURS_IN_SECONDS * 1000).toISOString();
+  return NextResponse.json({ token: apiToken, expiresAt });
+}

From 7a31d4a1712a43e527d22b1648e83599e0dfb228 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:21:00 +0200
Subject: [PATCH 17/73] feat(auto-routing-benchmark): run decider cases through
 kilo CLI in a container

The decider benchmark now executes each case through the stable kilo CLI
(@kilocode/cli) running in a Cloudflare Container, instead of bare OpenRouter
chat completions, so it measures the real agent harness.

- Container (Dockerfile + dependency-free server.mjs) spawns `kilo run
  --format json --auto` per case; the kilo user token is injected only as a
  child-process env var, never logged or written to disk.
- BenchRunnerContainer DO + wrangler containers/durable_objects/migrations.
- kilo-events.ts: pure parser for the CLI JSON event stream (text + cost),
  tolerant of both part.* and flattened event shapes.
- cli-runner.ts: proxies a case to the container and parses the result.
- run.ts: chunks decider cases (10/chunk) into per-(model,chunk) queue
  messages; fetches a short-lived user token once per message; fails fast when
  benchmarkUserId is unset (plus a defensive per-case guard). Classifier path
  unchanged.
- New benchmarkUserId config field (nullable) on BenchmarkConfig.
- vitest aliases @cloudflare/containers to a node-safe stub so unit tests can
  import the worker entry without the cloudflare:workers chain.
---
 ...uto-routing-benchmark-admin-client.test.ts |   2 +
 .../auto-routing-contracts/src/benchmark.ts   |   3 +
 pnpm-lock.yaml                                |   9 +-
 .../container/Dockerfile                      |  16 ++
 .../container/server.mjs                      | 141 ++++++++++++
 services/auto-routing-benchmark/package.json  |   1 +
 .../src/bench-runner-container.ts             |  10 +
 .../auto-routing-benchmark/src/cli-runner.ts  |  61 +++++
 services/auto-routing-benchmark/src/config.ts |   1 +
 .../src/datasets/decider-cases.ts             |   2 +
 services/auto-routing-benchmark/src/index.ts  |   3 +
 .../src/kilo-events.test.ts                   |  63 ++++++
 .../auto-routing-benchmark/src/kilo-events.ts |  73 ++++++
 .../src/routing-table-builder.test.ts         |   1 +
 .../auto-routing-benchmark/src/run.test.ts    |  30 ++-
 services/auto-routing-benchmark/src/run.ts    | 213 ++++++++++++++----
 .../test/stubs/cloudflare-containers.ts       |  14 ++
 .../auto-routing-benchmark/vitest.config.ts   |  10 +
 .../worker-configuration.d.ts                 |  11 +-
 .../auto-routing-benchmark/wrangler.jsonc     |  18 ++
 20 files changed, 629 insertions(+), 53 deletions(-)
 create mode 100644 services/auto-routing-benchmark/container/Dockerfile
 create mode 100644 services/auto-routing-benchmark/container/server.mjs
 create mode 100644 services/auto-routing-benchmark/src/bench-runner-container.ts
 create mode 100644 services/auto-routing-benchmark/src/cli-runner.ts
 create mode 100644 services/auto-routing-benchmark/src/kilo-events.test.ts
 create mode 100644 services/auto-routing-benchmark/src/kilo-events.ts
 create mode 100644 services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 8db475b305..673119734d 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -29,6 +29,7 @@ const configResponse = {
     ],
     minAccuracy: 0.8,
     maxConcurrency: 4,
+    benchmarkUserId: null,
     updatedAt: null,
     updatedBy: null,
   },
@@ -46,6 +47,7 @@ const configResponse = {
     ],
     minAccuracy: 0.8,
     maxConcurrency: 4,
+    benchmarkUserId: null,
     updatedAt: null,
     updatedBy: null,
   },
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 7c14447a40..5f915dd9e8 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -20,6 +20,9 @@ export const BenchmarkConfigSchema = z.object({
   minAccuracy: z.number().min(0).max(1),
   // Parallel OpenRouter calls per queue message.
   maxConcurrency: z.number().int().min(1).max(16),
+  // The Kilo user whose identity/billing the decider CLI runs execute under.
+  // Null until an admin configures it; decider runs fail fast while null.
+  benchmarkUserId: z.string().trim().min(1).nullable(),
   updatedAt: z.string().nullable(),
   updatedBy: z.string().nullable(),
 });
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 0c48fc8fe1..5eee7fd65d 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1512,6 +1512,9 @@ importers:
 
   services/auto-routing-benchmark:
     dependencies:
+      '@cloudflare/containers':
+        specifier: 0.1.1
+        version: 0.1.1
       '@kilocode/auto-routing-contracts':
         specifier: workspace:*
         version: link:../../packages/auto-routing-contracts
@@ -18019,7 +18022,7 @@ snapshots:
       cjs-module-lexer: 1.2.3
       esbuild: 0.27.4
       miniflare: 4.20260603.0(bufferutil@4.1.0)(utf-8-validate@6.0.6)
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
       wrangler: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
       zod: 3.25.76
     transitivePeerDependencies:
@@ -24129,7 +24132,7 @@ snapshots:
       obug: 2.1.1
       std-env: 4.0.0
       tinyrainbow: 3.1.0
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   '@vitest/expect@3.2.4':
     dependencies:
@@ -24215,7 +24218,7 @@ snapshots:
       sirv: 3.0.2
       tinyglobby: 0.2.16
       tinyrainbow: 3.1.0
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   '@vitest/utils@3.2.4':
     dependencies:
diff --git a/services/auto-routing-benchmark/container/Dockerfile b/services/auto-routing-benchmark/container/Dockerfile
new file mode 100644
index 0000000000..25550a3da1
--- /dev/null
+++ b/services/auto-routing-benchmark/container/Dockerfile
@@ -0,0 +1,16 @@
+# Decider-benchmark runner container.
+#
+# Runs the stable `kilo` CLI (@kilocode/cli, dist-tag `latest`) for one decider
+# case at a time. `wrangler deploy` builds and pushes this image automatically.
+#
+# NOTE: `@kilocode/cli@latest` is resolved at IMAGE BUILD time (i.e. at deploy
+# time), so each deploy pins whatever version was `latest` then. Re-deploy to
+# pick up a newer stable CLI.
+FROM node:22-slim
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates git python3 g++ make \
+  && rm -rf /var/lib/apt/lists/* \
+  && npm install -g @kilocode/cli@latest
+WORKDIR /app
+COPY server.mjs ./
+EXPOSE 3000
+CMD ["node", "server.mjs"]
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
new file mode 100644
index 0000000000..1931864c3b
--- /dev/null
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -0,0 +1,141 @@
+// Dependency-free HTTP server that runs one decider-benchmark case through the
+// stable `kilo` CLI per request. Intentionally dumb: it spawns the CLI, caps
+// output, and returns raw stdout lines. All event parsing happens in the
+// worker (src/kilo-events.ts), not here.
+//
+// The Kilo user token is passed in the request body and injected only as a
+// child-process env var (KILO_AUTH_CONTENT). It is never written to disk and
+// never logged.
+
+import { createServer } from 'node:http';
+import { spawn } from 'node:child_process';
+import { mkdtemp, rm } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+const PORT = 3000;
+const DEFAULT_TIMEOUT_MS = 180_000;
+const STDOUT_CAP_BYTES = 2 * 1024 * 1024; // 2MB
+const STDERR_CAP_BYTES = 4 * 1024; // 4KB tail
+
+function sendJson(res, status, body) {
+  // Serialize first: if JSON.stringify throws, nothing has been written to the response yet.
+  const serialized = JSON.stringify(body);
+  res.writeHead(status, { 'content-type': 'application/json' }).end(serialized);
+}
+
+async function readBody(req) {
+  const received = [];
+  for await (const piece of req) received.push(piece);
+  return Buffer.concat(received).toString('utf8');
+}
+
+// Runs one case via `kilo run` in a throwaway cwd; resolves with the raw result and rejects
+// only when the temp dir cannot be created or spawn() throws synchronously.
+async function runCase({ model, prompt, kiloToken, timeoutMs }) {
+  // Awaited outside the Promise executor: a failure here rejects the returned promise
+  // instead of vanishing inside a detached async IIFE and leaving the request pending.
+  const dir = await mkdtemp(join(tmpdir(), 'kilo-bench-'));
+  const startedAt = Date.now();
+  let killTimer;
+  try {
+    return await new Promise(resolve => {
+      let stdout = '';
+      let stdoutTruncated = false;
+      let stderrTail = '';
+
+      const child = spawn(
+        'kilo',
+        ['run', '--format', 'json', '--auto', '-m', `kilo/${model}`, prompt],
+        {
+          cwd: dir,
+          env: {
+            ...process.env,
+            KILO_AUTH_CONTENT: JSON.stringify({ kilo: { type: 'api', key: kiloToken } }),
+            NO_COLOR: '1',
+          },
+          stdio: ['ignore', 'pipe', 'pipe'],
+        }
+      );
+      // Let the streams decode UTF-8 so a multi-byte character split across two chunks
+      // is reassembled instead of becoming replacement characters.
+      child.stdout.setEncoding('utf8');
+      child.stderr.setEncoding('utf8');
+      killTimer = setTimeout(() => child.kill('SIGKILL'), timeoutMs);
+
+      child.stdout.on('data', text => {
+        if (stdoutTruncated) return;
+        // The cap is measured in string length, so it only approximates a byte count.
+        const room = STDOUT_CAP_BYTES - stdout.length;
+        stdoutTruncated = text.length > room;
+        stdout += stdoutTruncated ? text.slice(0, room) : text;
+      });
+      child.stderr.on('data', text => {
+        stderrTail = (stderrTail + text).slice(-STDERR_CAP_BYTES);
+      });
+
+      const finish = exitCode => {
+        resolve({
+          exitCode,
+          durationMs: Date.now() - startedAt,
+          stdoutLines: stdout.split('\n').filter(line => line.length > 0),
+          stderrTail,
+        });
+      };
+      // 'error' can be followed by 'close'; only the first resolve() takes effect.
+      child.on('error', err => {
+        stderrTail = (stderrTail + `\nspawn error: ${err.message}`).slice(-STDERR_CAP_BYTES);
+        finish(-1);
+      });
+      child.on('close', code => finish(code ?? -1));
+    });
+  } finally {
+    // Runs on every path, so the temp dir is removed even when spawn() throws.
+    clearTimeout(killTimer);
+    await rm(dir, { recursive: true, force: true }).catch(() => {});
+  }
+}
+
+const server = createServer((req, res) => {
+  void (async () => {
+    const { method, url } = req;
+    if (method === 'GET' && url === '/health') {
+      sendJson(res, 200, { ok: true });
+      return;
+    }
+    if (method !== 'POST' || url !== '/run') {
+      sendJson(res, 404, { error: 'not found' });
+      return;
+    }
+
+    let request;
+    try {
+      request = JSON.parse(await readBody(req));
+    } catch {
+      sendJson(res, 400, { error: 'invalid JSON body' });
+      return;
+    }
+
+    // `?? {}` lets a JSON `null` body fall through to the 400 below instead of throwing.
+    const { model, prompt, kiloToken } = request ?? {};
+    const requestedTimeout = request?.timeoutMs;
+    const hasTimeout = typeof requestedTimeout === 'number' && requestedTimeout > 0;
+    const timeoutMs = hasTimeout ? requestedTimeout : DEFAULT_TIMEOUT_MS;
+
+    const fieldsAreStrings = [model, prompt, kiloToken].every(value => typeof value === 'string');
+    if (!fieldsAreStrings) {
+      sendJson(res, 400, { error: 'model, prompt and kiloToken are required strings' });
+      return;
+    }
+
+    try {
+      sendJson(res, 200, await runCase({ model, prompt, kiloToken, timeoutMs }));
+    } catch (err) {
+      sendJson(res, 500, { error: err instanceof Error ? err.message : 'run failed' });
+    }
+  })();
+});
+
+server.listen(PORT, () => {
+  console.log(`decider-benchmark runner listening on :${PORT}`);
+});
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
index ba51b15107..c347e6be44 100644
--- a/services/auto-routing-benchmark/package.json
+++ b/services/auto-routing-benchmark/package.json
@@ -12,6 +12,7 @@
     "test": "vitest run"
   },
   "dependencies": {
+    "@cloudflare/containers": "0.1.1",
     "@kilocode/auto-routing-contracts": "workspace:*",
     "@kilocode/worker-utils": "workspace:*",
     "@openrouter/sdk": "^0.12.79",
diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
new file mode 100644
index 0000000000..c61a6ac57e
--- /dev/null
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -0,0 +1,10 @@
+import { Container } from '@cloudflare/containers';
+
+// Cloudflare Container that runs the stable `kilo` CLI for decider benchmark
+// cases. The worker proxies POST /run to the container's HTTP server (see
+// container/server.mjs) via this DO. One instance is keyed per
+// (runId, model, chunk) so concurrent chunks/models don't share state.
+export class BenchRunnerContainer extends Container<Env> {
+  defaultPort = 3000;
+  sleepAfter = '5m';
+}
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
new file mode 100644
index 0000000000..d80eb19e3f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -0,0 +1,61 @@
+import { parseKiloRunEvents } from './kilo-events';
+import type { DeciderCase } from './datasets/decider-cases';
+
+export type CliRunResult = {
+  text: string; // assistant answer parsed from the CLI's stdout event lines
+  costUsd: number | null; // cost parsed from the event stream; null if none was reported
+  latencyMs: number; // container-reported duration, else worker-side wall-clock time
+  exitCode: number; // exit code as reported by the container
+  stderrTail: string; // stderr tail from the container; '' when it was omitted
+};
+
+const DECIDER_CLI_TIMEOUT_MS = 180_000;
+
+type ContainerRunResponse = {
+  exitCode: number; // surfaced unchanged as CliRunResult.exitCode
+  durationMs: number; // preferred latency source; worker wall clock is used if missing
+  stdoutLines: string[]; // fed to parseKiloRunEvents; treated as [] if missing
+  stderrTail: string; // treated as '' if missing
+};
+
+/**
+ * Executes one decider benchmark case with the `kilo` CLI hosted in a Cloudflare
+ * Container; resolves with the parsed answer, cost, latency and exit status, and
+ * throws when the container answers with a non-2xx status.
+ *
+ * The caller picks `instanceName` (e.g. `${runId}:${model}:${chunk}`) to select the
+ * DO instance. The CLI has no system-prompt flag, so both prompts are sent as one.
+ */
+export async function runDeciderCaseViaCli(
+  env: Env,
+  params: { instanceName: string; model: string; benchCase: DeciderCase; kiloToken: string }
+): Promise<CliRunResult> {
+  const { systemPrompt, userPrompt } = params.benchCase;
+  const container = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName));
+  const payload = {
+    model: params.model,
+    prompt: `${systemPrompt}\n\n${userPrompt}`,
+    kiloToken: params.kiloToken,
+    timeoutMs: DECIDER_CLI_TIMEOUT_MS,
+  };
+
+  const startedAt = Date.now();
+  const runRequest = new Request('http://container/run', {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(payload),
+  });
+  const response = await container.fetch(runRequest);
+  if (!response.ok) {
+    const errorBody = await response.text().catch(() => '');
+    throw new Error(`container /run failed: HTTP ${response.status} ${errorBody.slice(0, 500)}`);
+  }
+
+  // The cast is unchecked; the `??` fallbacks cover fields the response might omit.
+  const report = (await response.json()) as ContainerRunResponse;
+  const { text, costUsd } = parseKiloRunEvents(report.stdoutLines ?? []);
+  // Prefer the container's own timing; otherwise use the worker-side wall clock.
+  const latencyMs = report.durationMs ?? Date.now() - startedAt;
+  const stderrTail = report.stderrTail ?? '';
+  return { text, costUsd, latencyMs, exitCode: report.exitCode, stderrTail };
+}
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index e609090760..30b1e6a072 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -20,6 +20,7 @@ export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
   ],
   minAccuracy: 0.7,
   maxConcurrency: 4,
+  benchmarkUserId: null,
   updatedAt: null,
   updatedBy: null,
 };
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index b14ec63d2c..1ac7a2af35 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -7,6 +7,8 @@ export type DeciderCase = {
   taskType: ClassifierTaskType;
   systemPrompt: string;
   userPrompt: string;
+  // Retained as metadata only. The decider now runs cases through the kilo CLI
+  // (no chat-completions maxTokens knob), so this field is no longer consumed.
   maxTokens: number;
   check: DeciderCheck;
 };
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index e78437b9dd..c6449e82a9 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -11,6 +11,9 @@ import {
 import type { HonoEnv } from './hono-env';
 import { processJob, startRun, type BenchmarkJobMessage } from './run';
 
+// Re-exported so the Durable Object class binding (BENCH_RUNNER) can find it.
+export { BenchRunnerContainer } from './bench-runner-container';
+
 export const app = new Hono<HonoEnv>();
 app.use('*', authMiddleware);
 app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' }));
diff --git a/services/auto-routing-benchmark/src/kilo-events.test.ts b/services/auto-routing-benchmark/src/kilo-events.test.ts
new file mode 100644
index 0000000000..937753f97a
--- /dev/null
+++ b/services/auto-routing-benchmark/src/kilo-events.test.ts
@@ -0,0 +1,63 @@
+import { describe, expect, it } from 'vitest';
+import { parseKiloRunEvents } from './kilo-events';
+
+describe('parseKiloRunEvents', () => {
+  it('assembles completed text parts and sums step-finish costs (part.* shape)', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', part: { text: 'partial', time: { start: 1 } } }), // no end → skipped
+      JSON.stringify({ type: 'text', part: { text: 'The answer is', time: { end: 10 } } }),
+      JSON.stringify({ type: 'step-finish', part: { cost: 0.0012, tokens: { input: 5 } } }),
+      JSON.stringify({ type: 'text', part: { text: '```\n20-40\n```', time: { end: 20 } } }),
+      JSON.stringify({ type: 'step-finish', part: { cost: 0.0008 } }),
+    ];
+
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('The answer is\n```\n20-40\n```');
+    expect(costUsd).toBeCloseTo(0.002, 10);
+  });
+
+  it('skips unparseable lines without throwing', () => {
+    const lines = [
+      'not json',
+      '',
+      JSON.stringify({ type: 'text', part: { text: 'hello', time: { end: 1 } } }),
+      '{ broken',
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('hello');
+    expect(costUsd).toBeNull();
+  });
+
+  it('returns null cost when no step-finish event is seen', () => {
+    const lines = [JSON.stringify({ type: 'text', part: { text: 'x', time: { end: 1 } } })];
+    expect(parseKiloRunEvents(lines).costUsd).toBeNull();
+  });
+
+  it('accepts the flattened top-level event shape (evt.text / evt.cost)', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', text: 'flat answer', time: { end: 5 } }),
+      JSON.stringify({ type: 'step-finish', cost: 0.5 }),
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('flat answer');
+    expect(costUsd).toBe(0.5);
+  });
+
+  it('prefers part.* over top-level fields when both present', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', text: 'top', part: { text: 'nested', time: { end: 1 } } }),
+      JSON.stringify({ type: 'step-finish', cost: 9, part: { cost: 0.01 } }),
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('nested');
+    expect(costUsd).toBe(0.01);
+  });
+
+  it('returns empty text and null cost for no relevant events', () => {
+    const lines = [
+      JSON.stringify({ type: 'tool', part: { name: 'read' } }),
+      JSON.stringify({ type: 'start' }),
+    ];
+    expect(parseKiloRunEvents(lines)).toEqual({ text: '', costUsd: null });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/kilo-events.ts b/services/auto-routing-benchmark/src/kilo-events.ts
new file mode 100644
index 0000000000..5bd874e638
--- /dev/null
+++ b/services/auto-routing-benchmark/src/kilo-events.ts
@@ -0,0 +1,73 @@
+// Pure parser for the `kilo run --format json` event stream.
+//
+// The CLI emits one JSON event per line on stdout. We care about two things:
+//   1. The final assistant answer — assembled from completed `text` events
+//      (those whose part has `time.end` set), concatenated in order.
+//   2. Total cost — summed across `step-finish` events' `part.cost` (USD).
+//
+// Event shapes vary across CLI versions; we accept both the documented
+// `evt.part.*` shape and a flattened `evt.*` shape, preferring `part.*`.
+// Everything is optional-chained so malformed lines can't throw.
+
+export type ParsedKiloRun = {
+  text: string;
+  costUsd: number | null;
+};
+
+type LooseEvent = {
+  type?: unknown;
+  text?: unknown;
+  cost?: unknown;
+  time?: { end?: unknown };
+  part?: {
+    text?: unknown;
+    cost?: unknown;
+    time?: { end?: unknown };
+  };
+};
+
+function isCompletedTextEvent(evt: LooseEvent): boolean {
+  const endedAt = evt.part?.time?.end ?? evt.time?.end; // nested shape wins over flat
+  return !(endedAt === undefined || endedAt === null);
+}
+
+function readText(evt: LooseEvent): string | null {
+  // First string wins: nested `part.text`, then the flattened top-level `text`.
+  // An empty string still counts as text, so fall back with `??` rather than `||`.
+  const isString = (value: unknown): value is string => typeof value === 'string';
+  return [evt.part?.text, evt.text].find(isString) ?? null;
+}
+
+function readCost(evt: LooseEvent): number | null {
+  // First finite number wins: nested `part.cost`, then the flattened top-level `cost`.
+  const isFiniteNumber = (value: unknown): value is number =>
+    typeof value === 'number' && Number.isFinite(value);
+  return [evt.part?.cost, evt.cost].find(isFiniteNumber) ?? null;
+}
+
+export function parseKiloRunEvents(lines: string[]): ParsedKiloRun {
+  const answerParts: string[] = [];
+  const stepCosts: number[] = [];
+
+  lines.forEach(rawLine => {
+    let decoded: unknown;
+    try {
+      decoded = JSON.parse(rawLine);
+    } catch {
+      return; // not JSON (blank line, log noise): ignore
+    }
+    if (typeof decoded !== 'object' || decoded === null) return;
+    const evt = decoded as LooseEvent;
+    if (evt.type === 'text') {
+      const text = isCompletedTextEvent(evt) ? readText(evt) : null;
+      if (text !== null) answerParts.push(text);
+    } else if (evt.type === 'step-finish') {
+      const cost = readCost(evt);
+      if (cost !== null) stepCosts.push(cost);
+    }
+  });
+
+  // Cost stays null (rather than 0) when no step-finish event carried a usable cost.
+  const costUsd = stepCosts.length > 0 ? stepCosts.reduce((sum, cost) => sum + cost, 0) : null;
+  return { text: answerParts.join('\n'), costUsd };
+}
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 892ef5c88d..5e8e23e7d4 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -11,6 +11,7 @@ const BASE_CONFIG: BenchmarkConfig = {
   ],
   minAccuracy: 0.7,
   maxConcurrency: 4,
+  benchmarkUserId: null,
   updatedAt: null,
   updatedBy: null,
 };
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 1c9826c640..a9c57baedf 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 import type { CaseResultRow } from './db';
-import { runCasesWithConcurrency, summarize } from './run';
+import { chunkArray, runCasesWithConcurrency, summarize } from './run';
 
 function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
   return {
@@ -203,3 +203,31 @@ describe('runCasesWithConcurrency', () => {
     ).rejects.toThrow('test error');
   });
 });
+
+describe('chunkArray', () => {
+  it('splits into 10-per-chunk with a partial final chunk', () => {
+    const items = Array.from({ length: 23 }, (_, i) => i);
+    const chunks = chunkArray(items, 10);
+    expect(chunks).toHaveLength(3);
+    expect(chunks[0]).toHaveLength(10);
+    expect(chunks[1]).toHaveLength(10);
+    expect(chunks[2]).toHaveLength(3);
+  });
+
+  it('round-trips caseIds: flatten equals the original order', () => {
+    const ids = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'];
+    const chunks = chunkArray(ids, 10);
+    expect(chunks).toHaveLength(2);
+    expect(chunks.flat()).toEqual(ids);
+  });
+
+  it('returns a single full chunk when items fit exactly', () => {
+    const chunks = chunkArray([1, 2, 3, 4, 5], 5);
+    expect(chunks).toHaveLength(1);
+    expect(chunks[0]).toEqual([1, 2, 3, 4, 5]);
+  });
+
+  it('returns no chunks for an empty array', () => {
+    expect(chunkArray([], 10)).toEqual([]);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 4efd91e93f..721dc7f9e0 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -26,15 +26,39 @@ import {
 import { gradeClassifierOutput, runDeciderCheck } from './grading';
 import { createOpenRouterClient } from './openrouter';
 import { buildRoutingTable } from './routing-table-builder';
+import { runDeciderCaseViaCli } from './cli-runner';
 
-export type BenchmarkJobMessage = { runId: string; kind: BenchmarkKind; model: string };
+export type BenchmarkJobMessage = {
+  runId: string;
+  kind: BenchmarkKind;
+  model: string;
+  // Decider only: the case ids this message is responsible for, plus the chunk
+  // index used to key the container instance. Absent for classifier messages.
+  caseIds?: string[];
+  chunk?: number;
+};
 
 export const BenchmarkJobMessageSchema = z.object({
   runId: z.string().min(1),
   kind: z.enum(['classifier', 'decider']),
   model: z.string().min(1),
+  caseIds: z.array(z.string().min(1)).optional(),
+  chunk: z.number().int().min(0).optional(),
 });
 
+// Decider cases run through the real `kilo` CLI in a container (up to ~3 min
+// each). Chunking caps how many cases a single queue invocation processes so
+// each stays well under CF's wall-clock limit.
+const DECIDER_CHUNK_SIZE = 10;
+
+export function chunkArray<T>(items: readonly T[], size: number): T[][] {
+  // Reject sizes that would never advance `i` (0 loops forever) or yield uneven chunks.
+  if (!Number.isInteger(size) || size < 1) throw new RangeError(`invalid chunk size: ${size}`);
+  const chunks: T[][] = [];
+  for (let i = 0; i < items.length; i += size) chunks.push(items.slice(i, i + size));
+  return chunks;
+}
+
 const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
 
 export async function startRun(
@@ -51,6 +75,16 @@ export async function startRun(
   const config = await getBenchmarkConfig(env.BENCH_DB);
   const models =
     kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
+
+  // Decider runs execute through the kilo CLI under a real Kilo user's
+  // identity/billing. Fail fast (before inserting the run) when that user
+  // isn't configured so the admin POST surfaces the misconfiguration.
+  if (kind === 'decider' && !config.benchmarkUserId) {
+    throw new Error(
+      'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
+    );
+  }
+
   const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
   await insertRun(env.BENCH_DB, {
     id: runId,
@@ -58,10 +92,33 @@ export async function startRun(
     startedAt: new Date().toISOString(),
     configJson: JSON.stringify(config),
   });
-  await env.BENCH_QUEUE.sendBatch(
-    models.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }))
+
+  if (kind === 'classifier') {
+    await env.BENCH_QUEUE.sendBatch(
+      models.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }))
+    );
+    console.log(JSON.stringify({ event: 'benchmark_run_started', runId, kind, models }));
+    return { runId, enqueuedModels: models.length };
+  }
+
+  // Decider: one message per (model, chunk) so each queue invocation stays
+  // bounded. finalizeRunIfComplete still expects models × DECIDER_CASES rows.
+  const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
+  const messages = models.flatMap(model =>
+    chunks.map((chunkCases, chunk) => ({
+      body: {
+        runId,
+        kind,
+        model,
+        chunk,
+        caseIds: chunkCases.map(c => c.id),
+      } satisfies BenchmarkJobMessage,
+    }))
+  );
+  await env.BENCH_QUEUE.sendBatch(messages);
+  console.log(
+    JSON.stringify({ event: 'benchmark_run_started', runId, kind, models, chunks: chunks.length })
   );
-  console.log(JSON.stringify({ event: 'benchmark_run_started', runId, kind, models }));
   return { runId, enqueuedModels: models.length };
 }
 
@@ -82,10 +139,10 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
 
   const message = parsed.data;
   const config = await getRunConfig(env, message.runId);
-  // Create the OpenRouter client inside processJob — no module-scope transport clients.
-  const client = await createOpenRouterClient(env);
 
   if (message.kind === 'classifier') {
+    // Create the OpenRouter client inside processJob — no module-scope transport clients.
+    const client = await createOpenRouterClient(env);
     await runCasesWithConcurrency(CLASSIFIER_CASES, config.maxConcurrency, async benchCase => {
       const startedAt = performance.now();
       try {
@@ -116,53 +173,113 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
       }
     });
   } else {
-    // Determinism note: temperature 0, fixed maxTokens, pinned prompts, mechanical checks.
-    // Provider-side nondeterminism can't be fully eliminated, which is why grading is
-    // binary on a single canonical answer.
-    await runCasesWithConcurrency(DECIDER_CASES, config.maxConcurrency, async benchCase => {
-      const startedAt = performance.now();
-      try {
-        const result = await client.chat.send({
-          chatRequest: {
-            model: message.model,
-            messages: [
-              { role: 'system', content: benchCase.systemPrompt },
-              { role: 'user', content: benchCase.userPrompt },
-            ],
-            stream: false,
-            temperature: 0,
-            maxTokens: benchCase.maxTokens,
-          },
-        });
-        const content: unknown = result.choices[0]?.message.content;
-        const text = typeof content === 'string' ? content : '';
-        const passed = text.length > 0 && runDeciderCheck(benchCase.check, text);
-        await upsertCaseResult(env.BENCH_DB, {
-          run_id: message.runId,
-          model: message.model,
-          case_id: benchCase.id,
-          tier: benchCase.tier,
-          score: passed ? 1 : 0,
-          latency_ms: Math.round(performance.now() - startedAt),
-          cost_usd: result.usage?.cost ?? null,
-          detail_json: JSON.stringify({
-            finishReason: result.choices[0]?.finishReason ?? null,
-            outputPrefix: text.slice(0, 200),
-          }),
-          error: null,
-        });
-      } catch (error) {
-        await upsertCaseResult(
-          env.BENCH_DB,
-          failedRow(message, benchCase.id, benchCase.tier, startedAt, error)
-        );
-      }
-    });
+    await processDeciderJob(env, message, config);
   }
 
   await finalizeRunIfComplete(env, message.runId, message.kind);
 }
 
+async function processDeciderJob(
+  env: Env,
+  message: BenchmarkJobMessage,
+  config: BenchmarkConfig
+): Promise<void> {
+  const { runId, model, caseIds } = message;
+
+  // A chunked message names the case ids it owns; a message without ids (legacy,
+  // un-chunked) is responsible for the whole decider dataset.
+  const ownedCases =
+    caseIds && caseIds.length > 0
+      ? DECIDER_CASES.filter(candidate => caseIds.includes(candidate.id))
+      : DECIDER_CASES;
+
+  // Columns shared by every row this job writes for a given case.
+  const rowKey = (benchCase: (typeof ownedCases)[number]) => ({
+    run_id: runId,
+    model,
+    case_id: benchCase.id,
+    tier: benchCase.tier,
+  });
+
+  // startRun already refuses to start a decider run without a benchmark user. If
+  // the stored run snapshot still lacks one, record an explicit failure row per
+  // owned case instead of throwing, so the run can still be finalized.
+  const benchmarkUserId = config.benchmarkUserId;
+  if (!benchmarkUserId) {
+    for (const benchCase of ownedCases) {
+      await upsertCaseResult(env.BENCH_DB, {
+        ...rowKey(benchCase),
+        score: 0,
+        latency_ms: 0,
+        cost_usd: null,
+        detail_json: null,
+        error: 'benchmark user not configured',
+      });
+    }
+    return;
+  }
+
+  // One token mint per queue message, shared by all of its cases. A failed mint
+  // throws before any case runs, so the queue can retry. Never log the token.
+  const kiloToken = await fetchBenchmarkUserToken(env, benchmarkUserId);
+  // Every case of this message runs against the same container instance.
+  const instanceName = [runId, model, message.chunk ?? 0].join(':');
+
+  await runCasesWithConcurrency(ownedCases, config.maxConcurrency, async benchCase => {
+    const startedAt = performance.now();
+    try {
+      const run = await runDeciderCaseViaCli(env, { instanceName, model, benchCase, kiloToken });
+      const cliFailed = run.exitCode !== 0;
+      // Pass = clean exit, non-empty answer, and the case's mechanical check accepts it.
+      const passed =
+        !cliFailed && run.text.length > 0 && runDeciderCheck(benchCase.check, run.text);
+      await upsertCaseResult(env.BENCH_DB, {
+        ...rowKey(benchCase),
+        score: passed ? 1 : 0,
+        latency_ms: run.latencyMs,
+        cost_usd: run.costUsd,
+        detail_json: JSON.stringify({
+          exitCode: run.exitCode,
+          outputPrefix: run.text.slice(0, 200),
+        }),
+        error: cliFailed ? run.stderrTail.slice(0, 500) : null,
+      });
+    } catch (error) {
+      // Anything thrown above (container call, grading, result write) becomes a failed row.
+      const row = failedRow(message, benchCase.id, benchCase.tier, startedAt, error);
+      await upsertCaseResult(env.BENCH_DB, row);
+    }
+  });
+}
+
+const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.string() });
+
+// Mints a short-lived API token for `userId` through apps/web's internal
+// auto-routing-benchmark endpoint, authenticating with the internal API secret.
+// Throws on a non-2xx status or an unexpected payload. The token is never logged.
+async function fetchBenchmarkUserToken(env: Env, userId: string): Promise<string> {
+  const endpoint = `${env.KILO_WEB_API_BASE_URL}/api/internal/auto-routing-benchmark/token`;
+  const internalSecret = await env.INTERNAL_API_SECRET_PROD.get();
+  const requestHeaders = {
+    'content-type': 'application/json',
+    authorization: `Bearer ${internalSecret}`,
+  };
+
+  const response = await fetch(endpoint, {
+    method: 'POST',
+    headers: requestHeaders,
+    body: JSON.stringify({ userId }),
+  });
+  if (!response.ok) {
+    const errorBody = await response.text().catch(() => '');
+    throw new Error(`token mint failed: HTTP ${response.status} ${errorBody.slice(0, 200)}`);
+  }
+
+  const payload = TokenResponseSchema.safeParse(await response.json());
+  if (payload.success) return payload.data.token;
+  throw new Error('token mint returned unexpected response shape');
+}
+
 function failedRow(
   message: BenchmarkJobMessage,
   caseId: string,
diff --git a/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts b/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts
new file mode 100644
index 0000000000..bc5bed4fdf
--- /dev/null
+++ b/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts
@@ -0,0 +1,14 @@
+// Node-safe stub for `@cloudflare/containers`, aliased in vitest.config.ts.
+//
+// The real package imports `cloudflare:workers`, which only exists in the
+// workerd runtime. Unit tests run in the node pool and merely need the worker
+// entry (src/index.ts) to import without pulling in that chain — they never
+// instantiate the container DO. This stub provides the minimal `Container`
+// base class so `class BenchRunnerContainer extends Container<Env>` resolves.
+
+export class Container<Env = unknown> {
+  defaultPort?: number;
+  sleepAfter?: string;
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  constructor(_ctx: unknown, _env: Env) {}
+}
diff --git a/services/auto-routing-benchmark/vitest.config.ts b/services/auto-routing-benchmark/vitest.config.ts
index 7dd13254e7..6a49fa250d 100644
--- a/services/auto-routing-benchmark/vitest.config.ts
+++ b/services/auto-routing-benchmark/vitest.config.ts
@@ -1,6 +1,16 @@
+import { resolve } from 'node:path';
 import { defineConfig } from 'vitest/config';
 
 export default defineConfig({
+  resolve: {
+    alias: {
+      // The real package imports `cloudflare:workers` (workerd-only). Unit
+      // tests run in the node pool, so alias it to a node-safe stub. Tests
+      // never instantiate the container DO; they only need the worker entry to
+      // import cleanly.
+      '@cloudflare/containers': resolve(__dirname, 'test/stubs/cloudflare-containers.ts'),
+    },
+  },
   test: {
     globals: true,
     environment: 'node',
diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts
index 5952f82e1b..a4c1d95ae1 100644
--- a/services/auto-routing-benchmark/worker-configuration.d.ts
+++ b/services/auto-routing-benchmark/worker-configuration.d.ts
@@ -1,16 +1,25 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 8d542fe6f931aa8df862b4b96f2474be)
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: a4dd2037113d28278748c75791163aa0)
 interface __BaseEnv_Env {
 	AUTO_ROUTING_CONFIG: KVNamespace;
 	BENCH_DB: D1Database;
 	BENCH_QUEUE: Queue;
 	INTERNAL_API_SECRET_PROD: SecretsStoreSecret;
 	OPENROUTER_API_KEY: SecretsStoreSecret;
+	KILO_WEB_API_BASE_URL: "https://app.kilo.ai";
+	BENCH_RUNNER: DurableObjectNamespace<import("./src/index").BenchRunnerContainer>;
 }
 declare namespace Cloudflare {
 	interface GlobalProps {
 		mainModule: typeof import("./src/index");
+		durableNamespaces: "BenchRunnerContainer";
 	}
 	interface Env extends __BaseEnv_Env {}
 }
 interface Env extends __BaseEnv_Env {}
+type StringifyValues<EnvType extends Record<string, unknown>> = {
+	[Binding in keyof EnvType]: EnvType[Binding] extends string ? EnvType[Binding] : string;
+};
+declare namespace NodeJS {
+	interface ProcessEnv extends StringifyValues<Pick<Cloudflare.Env, "KILO_WEB_API_BASE_URL">> {}
+}
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 5f3b67f6b5..3b7d06d2f1 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -11,6 +11,24 @@
   "routes": [{ "pattern": "auto-routing-benchmark.kiloapps.io", "custom_domain": true }],
   "dev": { "port": 8814, "local_protocol": "http", "ip": "0.0.0.0" },
   "observability": { "enabled": true },
+  "vars": {
+    // Base URL for reaching apps/web's /api/internal/* routes. Other workers
+    // that call apps/web internal endpoints use app.kilo.ai.
+    "KILO_WEB_API_BASE_URL": "https://app.kilo.ai"
+  },
+  "containers": [
+    {
+      "name": "auto-routing-benchmark-runner",
+      "class_name": "BenchRunnerContainer",
+      "image": "./container/Dockerfile",
+      "instance_type": "standard-2",
+      "max_instances": 15
+    }
+  ],
+  "durable_objects": {
+    "bindings": [{ "name": "BENCH_RUNNER", "class_name": "BenchRunnerContainer" }]
+  },
+  "migrations": [{ "tag": "v1", "new_sqlite_classes": ["BenchRunnerContainer"] }],
   "triggers": {
     // 04:10 UTC daily: classifier benchmark. 05:10 UTC Monday: decider benchmark.
     "crons": ["10 4 * * *", "10 5 * * 1"]

From d0f13b04d86cbafd39b0f4cf611c65c42454c9b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:21:05 +0200
Subject: [PATCH 18/73] feat(admin): benchmark user id config field

Adds a Benchmark user id input to the benchmark config editor (empty -> null),
with help text noting decider runs fail until it is set. Round-trips through
configToFormState/formStateToConfig.
---
 .../admin/auto-routing/BenchmarksSection.tsx  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index c7ae9db12a..8d7f2dc881 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -129,6 +129,7 @@ function configToFormState(config: BenchmarkConfig): {
   deciderModels: DeciderModelRow[];
   minAccuracy: number;
   maxConcurrency: number;
+  benchmarkUserId: string;
 } {
   return {
     classifierModels: config.classifierModels.join('\n'),
@@ -140,6 +141,7 @@ function configToFormState(config: BenchmarkConfig): {
     })),
     minAccuracy: config.minAccuracy,
     maxConcurrency: config.maxConcurrency,
+    benchmarkUserId: config.benchmarkUserId ?? '',
   };
 }
 
@@ -163,11 +165,13 @@ function formStateToConfig(
         supportedApiKinds: kinds.length ? kinds : ['chat_completions' as const],
       };
     });
+  const benchmarkUserId = state.benchmarkUserId.trim();
   return {
     classifierModels,
     deciderModels,
     minAccuracy: state.minAccuracy,
     maxConcurrency: state.maxConcurrency,
+    benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null,
     updatedAt: base.updatedAt,
     updatedBy: base.updatedBy,
   };
@@ -381,6 +385,23 @@ function BenchmarkConfigEditor({
           </div>
         </div>
 
+        {/* Benchmark user id */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-user-id" className="text-sm font-medium">
+            Benchmark user id
+          </Label>
+          <Input
+            id="benchmark-user-id"
+            value={form.benchmarkUserId}
+            onChange={e => setForm(prev => ({ ...prev, benchmarkUserId: e.target.value }))}
+            className="h-8 font-mono text-xs"
+            placeholder="(unset)"
+          />
+          <p className="text-muted-foreground text-xs">
+            Kilo user the decider CLI runs bill to; decider runs fail until set.
+          </p>
+        </div>
+
         {/* Actions + metadata */}
         <div className="flex flex-col gap-2">
           <div className="flex flex-wrap gap-2">

From fdc652003b81208615c96b3bee7401a91256aa17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:29:53 +0200
Subject: [PATCH 19/73] feat(gateway): add kilo-auto/efficient with blocking
 auto-routing decisions

---
 .../token/route.test.ts                       |   4 +-
 .../src/app/api/openrouter/[...path]/route.ts |  57 ++++--
 .../src/lib/ai-gateway/auto-model/index.ts    |  17 ++
 .../ai-gateway/auto-model/resolution.test.ts  | 112 ++++++++++++
 .../lib/ai-gateway/auto-model/resolution.ts   |  16 ++
 .../ai-gateway/auto-routing-decision.test.ts  | 168 ++++++++++++++++++
 .../lib/ai-gateway/auto-routing-decision.ts   |  89 ++++++++++
 7 files changed, 445 insertions(+), 18 deletions(-)
 create mode 100644 apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
 create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
 create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-decision.ts

diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
index 33c01fe1d1..61564185e8 100644
--- a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
@@ -49,9 +49,7 @@ describe('POST /api/internal/auto-routing-benchmark/token', () => {
   });
 
   it('returns 401 with the wrong bearer secret', async () => {
-    const res = await POST(
-      createRequest({ userId: 'user-1' }, { authorization: 'Bearer wrong' })
-    );
+    const res = await POST(createRequest({ userId: 'user-1' }, { authorization: 'Bearer wrong' }));
     expect(res.status).toBe(401);
   });
 
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.ts b/apps/web/src/app/api/openrouter/[...path]/route.ts
index 179f0f3116..f4d456730f 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.ts
@@ -89,8 +89,13 @@ import {
 import { normalizeModelId } from '@/lib/ai-gateway/model-utils';
 import { isForbiddenFreeModel } from '@/lib/ai-gateway/forbidden-free-models';
 import { isCloudflareIP } from '@/lib/cloudflare-ip';
-import { isKiloAutoModel, KILO_AUTO_FREE_MODEL } from '@/lib/ai-gateway/auto-model';
+import {
+  isKiloAutoModel,
+  KILO_AUTO_FREE_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
+} from '@/lib/ai-gateway/auto-model';
 import { applyResolvedAutoModel } from '@/lib/ai-gateway/auto-model/resolution';
+import { fetchEfficientAutoDecision } from '@/lib/ai-gateway/auto-routing-decision';
 import type { MicrodollarUsageContext } from '@/lib/ai-gateway/processUsage.types';
 import {
   getMaxTokens,
@@ -263,6 +268,25 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
   let autoModel: string | null = null;
   if (isKiloAutoModel(requestedModelLowerCased)) {
     autoModel = requestedModelLowerCased;
+    const efficientDecision =
+      requestedModelLowerCased === KILO_AUTO_EFFICIENT_MODEL.id
+        ? async () => {
+            const user = (await authPromise).user;
+            return fetchEfficientAutoDecision({
+              apiKind: requestBodyParsed.kind,
+              body: requestBodyParsed.body,
+              requestedModel,
+              providerHints: mirrorProviderHints,
+              bodyBytes: Buffer.byteLength(requestBodyText),
+              userId: user?.id ?? `anon:${ipAddress ?? 'unknown'}`,
+              sessionId: taskId ?? sessionHeader,
+              machineId: machineIdHeader,
+              clientRequestId,
+              mode: modeHeader,
+              userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
+            });
+          }
+        : undefined;
     const autoResult = await applyResolvedAutoModel(
       {
         model: requestedModelLowerCased,
@@ -271,6 +295,7 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
         sessionId: taskId ?? null,
         apiKind: requestBodyParsed.kind,
         clientIp: ipAddress ?? null,
+        efficientDecision,
       },
       requestBodyParsed,
       authPromise.then(res => res.user),
@@ -718,20 +743,22 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     await sleepForRulesEngineAction(rulesEngineDecision.delayMs);
   }
 
-  scheduleAutoRoutingMirror({
-    apiKind: requestBodyParsed.kind,
-    body: requestBodyParsed.body,
-    requestedModel,
-    providerHints: mirrorProviderHints,
-    bodyBytes: Buffer.byteLength(requestBodyText),
-    userId: user.id,
-    sessionId: taskId ?? sessionHeader,
-    machineId: machineIdHeader,
-    clientRequestId,
-    mode: modeHeader,
-    userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
-    authContext: Promise.resolve({ organizationId }),
-  });
+  if (autoModel !== KILO_AUTO_EFFICIENT_MODEL.id) {
+    scheduleAutoRoutingMirror({
+      apiKind: requestBodyParsed.kind,
+      body: requestBodyParsed.body,
+      requestedModel,
+      providerHints: mirrorProviderHints,
+      bodyBytes: Buffer.byteLength(requestBodyText),
+      userId: user.id,
+      sessionId: taskId ?? sessionHeader,
+      machineId: machineIdHeader,
+      clientRequestId,
+      mode: modeHeader,
+      userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
+      authContext: Promise.resolve({ organizationId }),
+    });
+  }
 
   const observesProvider = effectiveProviderContext.provider.id === 'custom';
   const attemptId = observesProvider ? crypto.randomUUID() : null;
diff --git a/apps/web/src/lib/ai-gateway/auto-model/index.ts b/apps/web/src/lib/ai-gateway/auto-model/index.ts
index c5cb6cc72c..581785702c 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/index.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/index.ts
@@ -164,9 +164,26 @@ export const KILO_AUTO_SMALL_MODEL: AutoModel = {
   opencode_settings: undefined,
 };
 
+export const KILO_AUTO_EFFICIENT_MODEL: AutoModel = {
+  id: 'kilo-auto/efficient',
+  name: 'Auto Efficient',
+  description:
+    'Routes each request to the cheapest model that gets the job done, based on continuously benchmarked accuracy and cost.',
+  context_length: 262_144,
+  max_completion_tokens: 32_768,
+  prompt_price: '0.000000325',
+  completion_price: '0.00000195',
+  input_cache_read_price: undefined,
+  input_cache_write_price: undefined,
+  supports_images: false,
+  supports_pdf: false,
+  opencode_settings: undefined,
+};
+
 export const AUTO_MODELS = [
   KILO_AUTO_FRONTIER_MODEL,
   KILO_AUTO_BALANCED_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
   KILO_AUTO_FREE_MODEL,
   KILO_AUTO_SMALL_MODEL,
 ];
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
new file mode 100644
index 0000000000..813bfed596
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -0,0 +1,112 @@
+import { describe, expect, it, jest } from '@jest/globals';
+
+jest.mock('@/lib/ai-gateway/providers/gateway-models-cache', () => ({
+  getOpenRouterModels: jest.fn(async () => new Set<string>()),
+}));
+
+jest.mock('@/lib/kiloclaw/setup-promo', () => ({
+  userIsWithinFirstKiloClawInstanceWindow: jest.fn(async () => false),
+}));
+
+import { resolveAutoModel } from './resolution';
+import {
+  BALANCED_RESPONSES_FALLBACK_MODEL,
+  BALANCED_MESSAGES_FALLBACK_MODEL,
+  BALANCED_QWEN_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
+} from '@/lib/ai-gateway/auto-model';
+import type { AutoRoutingDecision } from '@kilocode/auto-routing-contracts';
+
+const baseParams = {
+  model: KILO_AUTO_EFFICIENT_MODEL.id,
+  modeHeader: null,
+  featureHeader: null,
+  sessionId: null,
+  clientIp: null,
+};
+
+const nullUserPromise = Promise.resolve(null);
+const zeroBalancePromise = Promise.resolve(0);
+
+const sampleDecision: AutoRoutingDecision = {
+  model: 'anthropic/claude-haiku-4',
+  tier: 'low',
+  source: 'benchmark',
+  tableVersion: 'v1',
+};
+
+describe('resolveAutoModel — kilo-auto/efficient branch', () => {
+  it('resolves to decision.model when the thunk returns a decision', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => sampleDecision,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
+  });
+
+  it('falls back to BALANCED_RESPONSES_FALLBACK_MODEL when no thunk is provided and apiKind=responses', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'responses' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_RESPONSES_FALLBACK_MODEL });
+  });
+
+  it('falls back to BALANCED_MESSAGES_FALLBACK_MODEL when no thunk is provided and apiKind=messages', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'messages' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_MESSAGES_FALLBACK_MODEL });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=chat_completions', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'chat_completions' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when thunk returns null and apiKind=chat_completions', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => null,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('does not call the thunk more than once', async () => {
+    const thunk = jest.fn(async () => sampleDecision);
+
+    await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: thunk,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(thunk).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
index 1417dc37f2..05377f8796 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
@@ -9,10 +9,12 @@ import type {
 } from '@/lib/ai-gateway/providers/openrouter/types';
 import type OpenAI from 'openai';
 import type { User } from '@kilocode/db';
+import type { AutoRoutingDecision } from '@kilocode/auto-routing-contracts';
 import {
   KILO_AUTO_FREE_MODEL,
   KILO_AUTO_SMALL_MODEL,
   KILO_AUTO_BALANCED_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
   modeSchema,
   BALANCED_CLAW_SETUP_MODEL,
   BALANCED_QWEN_MODEL,
@@ -41,6 +43,9 @@ type ResolveAutoModelParams = {
   sessionId: string | null;
   apiKind: GatewayRequest['kind'] | null;
   clientIp: string | null;
+  // Lazily fetches the auto-routing worker's decision; only set for
+  // kilo-auto/efficient requests (route.ts owns the request-body capture).
+  efficientDecision?: () => Promise<AutoRoutingDecision | null>;
 };
 
 function resolveMode(modeHeader: string | null, featureHeader: FeatureValue | null) {
@@ -117,6 +122,17 @@ export async function resolveAutoModel(
       },
     };
   }
+  if (model === KILO_AUTO_EFFICIENT_MODEL.id) {
+    const decision = params.efficientDecision ? await params.efficientDecision() : null;
+    if (decision) {
+      return { kind: 'ok', resolved: { model: decision.model } };
+    }
+    // Static fallback when the worker is slow/unavailable: same shape as
+    // balanced so an efficient request never degrades below balanced.
+    if (apiKind === 'responses') return { kind: 'ok', resolved: BALANCED_RESPONSES_FALLBACK_MODEL };
+    if (apiKind === 'messages') return { kind: 'ok', resolved: BALANCED_MESSAGES_FALLBACK_MODEL };
+    return { kind: 'ok', resolved: BALANCED_QWEN_MODEL };
+  }
   const mode = resolveMode(modeHeader, featureHeader);
   if (model === KILO_AUTO_BALANCED_MODEL.id || model === KILO_AUTO_LEGACY_MODEL) {
     if (mode === 'claw' && featureHeader === 'kiloclaw') {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
new file mode 100644
index 0000000000..1eefcb2aa4
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -0,0 +1,168 @@
+import { afterEach, beforeEach, describe, expect, it, jest } from '@jest/globals';
+
+const mockedWarnExceptInTest = jest.fn();
+
+jest.mock('@/lib/config.server', () => ({
+  AUTO_ROUTING_WORKER_URL: '',
+  INTERNAL_API_SECRET: '',
+}));
+
+jest.mock('@/lib/utils.server', () => ({
+  warnExceptInTest: (...args: unknown[]) => mockedWarnExceptInTest(...args),
+}));
+
+import { fetchEfficientAutoDecision } from './auto-routing-decision';
+import type { EfficientDecisionParams } from './auto-routing-decision';
+
+const originalFetch = globalThis.fetch;
+const mockedFetch = jest.fn() as jest.MockedFunction<typeof globalThis.fetch>;
+
+function makeParams(): EfficientDecisionParams {
+  return {
+    apiKind: 'chat_completions',
+    body: {
+      model: 'kilo-auto/efficient',
+      stream: true,
+      messages: [
+        { role: 'system', content: 'You are Kilo Code.' },
+        { role: 'user', content: 'Fix the parser bug.' },
+      ],
+    },
+    requestedModel: 'kilo-auto/efficient',
+    providerHints: { provider: null, providerOptions: null },
+    bodyBytes: 512,
+    userId: 'user-1',
+    sessionId: 'task-123',
+    machineId: 'machine-1',
+    clientRequestId: 'req-1',
+    mode: 'code',
+    userAgent: 'Kilo-Code/1.2.3',
+  };
+}
+
+const options = {
+  workerUrl: 'https://auto-routing.example.com',
+  authToken: 'classifier-token',
+};
+
+const validDecision = {
+  model: 'anthropic/claude-haiku-4',
+  tier: 'low' as const,
+  source: 'benchmark' as const,
+  tableVersion: 'v1',
+};
+
+const validResponse = {
+  cost: 0.001,
+  decision: validDecision,
+  classifierResult: null,
+};
+
+describe('fetchEfficientAutoDecision', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    globalThis.fetch = mockedFetch;
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  it('returns the decision on a 200 response with valid body', async () => {
+    mockedFetch.mockResolvedValueOnce(
+      new Response(JSON.stringify(validResponse), { status: 200 })
+    );
+
+    const result = await fetchEfficientAutoDecision(makeParams(), options);
+
+    expect(mockedFetch).toHaveBeenCalledTimes(1);
+    const [url, init] = mockedFetch.mock.calls[0];
+    expect(url).toBe('https://auto-routing.example.com/decide');
+    expect(init).toMatchObject({ method: 'POST' });
+    const headers = init?.headers as Headers;
+    expect(headers.get('authorization')).toBe('Bearer classifier-token');
+    expect(headers.get('content-type')).toBe('application/json');
+    expect(result).toEqual(validDecision);
+  });
+
+  it('returns null and calls onError on a non-OK response', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockResolvedValueOnce(new Response('Internal Server Error', { status: 500 }));
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision request failed', {
+      error: 'status 500',
+    });
+  });
+
+  it('returns null and calls onError when fetch rejects (timeout/abort)', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockRejectedValueOnce(new Error('The operation was aborted'));
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision request failed', {
+      error: 'The operation was aborted',
+    });
+  });
+
+  it('returns null and calls onError on a schema-invalid response body', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockResolvedValueOnce(
+      new Response(JSON.stringify({ unexpected: 'shape' }), { status: 200 })
+    );
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision response invalid', {
+      error: 'invalid_response',
+    });
+  });
+
+  it('returns null when normalization fails (unclassifiable body)', async () => {
+    const result = await fetchEfficientAutoDecision(
+      { ...makeParams(), body: { stream: true } },
+      options
+    );
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns null when workerUrl is not configured', async () => {
+    const result = await fetchEfficientAutoDecision(makeParams(), {
+      ...options,
+      workerUrl: '',
+    });
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns null when authToken is not configured', async () => {
+    const result = await fetchEfficientAutoDecision(makeParams(), {
+      ...options,
+      authToken: '',
+    });
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns null (not the decision object) when the worker returns a null decision', async () => {
+    mockedFetch.mockResolvedValueOnce(
+      new Response(
+        JSON.stringify({ cost: 0.001, decision: null, classifierResult: null }),
+        { status: 200 }
+      )
+    );
+
+    const result = await fetchEfficientAutoDecision(makeParams(), options);
+
+    expect(result).toBeNull();
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
new file mode 100644
index 0000000000..bf8b3b1f95
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
@@ -0,0 +1,89 @@
+import {
+  AutoRoutingDecisionResponseSchema,
+  normalizeClassifierInput,
+  type AutoRoutingDecision,
+  type ClassifierApiKind,
+  type MirrorPayload,
+} from '@kilocode/auto-routing-contracts';
+import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
+import { warnExceptInTest } from '@/lib/utils.server';
+
+export const EFFICIENT_DECISION_TIMEOUT_MS = 2_000;
+
+export type EfficientDecisionParams = {
+  apiKind: ClassifierApiKind;
+  body: unknown;
+  requestedModel: string;
+  providerHints: MirrorPayload['input']['providerHints'];
+  bodyBytes: number;
+  userId: string;
+  sessionId: string | null;
+  machineId: string | null;
+  clientRequestId: string | null;
+  mode: string | null;
+  userAgent: string | null;
+};
+
+type FetchEfficientDecisionOptions = {
+  workerUrl?: string;
+  authToken?: string;
+  timeoutMs?: number;
+  onError?: (message: string, data: { error: string }) => void;
+};
+
+// Blocking counterpart of the fire-and-forget mirror: kilo-auto/efficient waits
+// for the worker's routing decision (cache hits ~20ms, classifier misses ~1.2s).
+// Resolves to null on timeout, error, or missing config; the caller applies the static default.
+export async function fetchEfficientAutoDecision(
+  params: EfficientDecisionParams,
+  options: FetchEfficientDecisionOptions = {}
+): Promise<AutoRoutingDecision | null> {
+  const workerUrl = options.workerUrl ?? AUTO_ROUTING_WORKER_URL;
+  const authToken = options.authToken ?? INTERNAL_API_SECRET;
+  const onError = options.onError ?? warnExceptInTest;
+  if (!workerUrl || !authToken) return null;
+
+  const normalizedInput = normalizeClassifierInput(params.apiKind, params.body, {
+    requestedModel: params.requestedModel,
+    providerHints: params.providerHints,
+  });
+  if (!normalizedInput) return null;
+
+  const payload: MirrorPayload = {
+    input: normalizedInput,
+    userId: params.userId,
+    sessionId: params.sessionId,
+    machineId: params.machineId,
+    clientRequestId: params.clientRequestId,
+    mode: params.mode,
+    userAgent: params.userAgent,
+    bodyBytes: params.bodyBytes,
+  };
+
+  try {
+    const response = await fetch(`${workerUrl}/decide`, {
+      method: 'POST',
+      headers: new Headers({
+        authorization: `Bearer ${authToken}`,
+        'content-type': 'application/json',
+      }),
+      body: JSON.stringify(payload),
+      signal: AbortSignal.timeout(options.timeoutMs ?? EFFICIENT_DECISION_TIMEOUT_MS),
+    });
+    if (!response.ok) {
+      onError('Efficient auto decision request failed', { error: `status ${response.status}` });
+      return null;
+    }
+    const parsed = AutoRoutingDecisionResponseSchema.safeParse(await response.json());
+    if (!parsed.success) {
+      onError('Efficient auto decision response invalid', { error: 'invalid_response' });
+      return null;
+    }
+    return parsed.data.decision;
+  } catch (error) {
+    onError('Efficient auto decision request failed', {
+      error: error instanceof Error ? error.message : String(error),
+    });
+    return null;
+  }
+}

From 813ea0eee3246edca8d938e33b1b721978fd96d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:36:32 +0200
Subject: [PATCH 20/73] chore(auto-routing): drop unused import in
 routing-table contracts

---
 packages/auto-routing-contracts/src/routing-table.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index 82ca7e7dfc..d569b2a8e7 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -1,5 +1,4 @@
 import * as z from 'zod';
-import { DifficultyTierSchema } from './tiers';
 
 export const ClassifierApiKindSchema = z.enum(['chat_completions', 'responses', 'messages']);
 

From 9b69edfda701860ebf888cf0fcd6a988b5cd8e61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Thu, 11 Jun 2026 23:53:03 +0200
Subject: [PATCH 21/73] fix(auto-routing-benchmark): harden decider CLI
 parsing, grading and retries

- accept step_finish (underscore) events so per-case cost is summed
- retry once when a CLI session ends with no assistant text
- exact checks also accept the last non-empty output line
- append a uniform final-answer suffix to decider prompts
- add an /admin/debug-cli endpoint returning raw CLI events for diagnosis
---
 .../ai-gateway/auto-routing-decision.test.ts  | 11 ++--
 services/auto-routing-benchmark/src/admin.ts  | 28 ++++++++++
 .../auto-routing-benchmark/src/cli-runner.ts  | 55 ++++++++++++++++++-
 .../auto-routing-benchmark/src/grading.ts     | 16 +++++-
 services/auto-routing-benchmark/src/index.ts  |  2 +
 .../src/kilo-events.test.ts                   |  2 +-
 .../auto-routing-benchmark/src/kilo-events.ts | 17 +++++-
 services/auto-routing-benchmark/src/run.ts    | 25 ++++++++-
 8 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
index 1eefcb2aa4..433bc8517e 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -69,9 +69,7 @@ describe('fetchEfficientAutoDecision', () => {
   });
 
   it('returns the decision on a 200 response with valid body', async () => {
-    mockedFetch.mockResolvedValueOnce(
-      new Response(JSON.stringify(validResponse), { status: 200 })
-    );
+    mockedFetch.mockResolvedValueOnce(new Response(JSON.stringify(validResponse), { status: 200 }));
 
     const result = await fetchEfficientAutoDecision(makeParams(), options);
 
@@ -155,10 +153,9 @@ describe('fetchEfficientAutoDecision', () => {
 
   it('returns null (not the decision object) when the worker returns a null decision', async () => {
     mockedFetch.mockResolvedValueOnce(
-      new Response(
-        JSON.stringify({ cost: 0.001, decision: null, classifierResult: null }),
-        { status: 200 }
-      )
+      new Response(JSON.stringify({ cost: 0.001, decision: null, classifierResult: null }), {
+        status: 200,
+      })
     );
 
     const result = await fetchEfficientAutoDecision(makeParams(), options);
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 5bb6649a69..395635bd53 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -1,3 +1,4 @@
+import * as z from 'zod';
 import {
   BenchmarkConfigSchema,
   StartBenchmarkRunRequestSchema,
@@ -5,6 +6,8 @@ import {
 } from '@kilocode/auto-routing-contracts';
 import type { Handler } from 'hono';
 import { DEFAULT_BENCHMARK_CONFIG, getBenchmarkConfig, saveBenchmarkConfig } from './config';
+import { debugRunCli } from './cli-runner';
+import { fetchBenchmarkUserToken } from './run';
 import { getLatestRoutingTable, listRuns } from './db';
 import { startRun } from './run';
 import type { HonoEnv } from './hono-env';
@@ -54,3 +57,28 @@ export const getRoutingTableHandler: Handler<HonoEnv> = async c => {
     publishedAt: latest?.published_at ?? null,
   });
 };
+
+const DebugCliRequestSchema = z.object({
+  model: z.string().trim().min(1),
+  prompt: z.string().min(1),
+});
+
+// Runs one ad-hoc prompt through the kilo CLI container and returns raw
+// (truncated) stdout lines plus the parsed result. Diagnostic-only.
+export const debugCliHandler: Handler<HonoEnv> = async c => {
+  let body: unknown;
+  try {
+    body = await c.req.json();
+  } catch {
+    return c.json({ error: 'Invalid JSON body' }, 400);
+  }
+  const parsed = DebugCliRequestSchema.safeParse(body);
+  if (!parsed.success) return c.json({ error: 'Invalid debug request' }, 400);
+  const config = await getBenchmarkConfig(c.env.BENCH_DB);
+  if (!config.benchmarkUserId) {
+    return c.json({ error: 'benchmarkUserId is not configured' }, 400);
+  }
+  const kiloToken = await fetchBenchmarkUserToken(c.env, config.benchmarkUserId);
+  const result = await debugRunCli(c.env, { ...parsed.data, kiloToken });
+  return c.json(result);
+};
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index d80eb19e3f..8d7e370d0f 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -7,10 +7,18 @@ export type CliRunResult = {
   latencyMs: number;
   exitCode: number;
   stderrTail: string;
+  eventCount: number;
+  lastEventTypes: string[];
 };
 
 const DECIDER_CLI_TIMEOUT_MS = 180_000;
 
+// Appended to every decider prompt: the agent harness tends to wrap answers
+// in prose ("The output is: ..."), which strict mechanical checks reject.
+// One uniform instruction across all candidate models keeps grading fair.
+const FINAL_ANSWER_SUFFIX =
+  '\n\nIMPORTANT: Your final message must contain ONLY the answer in the exact requested format - no explanations, no preamble, no extra words.';
+
 type ContainerRunResponse = {
   exitCode: number;
   durationMs: number;
@@ -32,7 +40,7 @@ export async function runDeciderCaseViaCli(
 ): Promise<CliRunResult> {
   const { instanceName, model, benchCase, kiloToken } = params;
   const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(instanceName));
-  const prompt = `${benchCase.systemPrompt}\n\n${benchCase.userPrompt}`;
+  const prompt = `${benchCase.systemPrompt}\n\n${benchCase.userPrompt}${FINAL_ANSWER_SUFFIX}`;
 
   const startedAt = Date.now();
   const response = await stub.fetch(
@@ -49,7 +57,7 @@ export async function runDeciderCaseViaCli(
   }
 
   const body = (await response.json()) as ContainerRunResponse;
-  const { text, costUsd } = parseKiloRunEvents(body.stdoutLines ?? []);
+  const { text, costUsd, eventCount, lastEventTypes } = parseKiloRunEvents(body.stdoutLines ?? []);
 
   return {
     text,
@@ -57,5 +65,48 @@ export async function runDeciderCaseViaCli(
     latencyMs: body.durationMs ?? Date.now() - startedAt,
     exitCode: body.exitCode,
     stderrTail: body.stderrTail ?? '',
+    eventCount,
+    lastEventTypes,
+  };
+}
+
+// Ad-hoc CLI run for the /admin/debug-cli endpoint: returns raw (truncated)
+// stdout lines alongside the parsed result so empty-output cases in prod can
+// be diagnosed without redeploying.
+export async function debugRunCli(
+  env: Env,
+  params: { model: string; prompt: string; kiloToken: string }
+): Promise<{
+  exitCode: number;
+  durationMs: number;
+  stderrTail: string;
+  stdoutLines: string[];
+  parsed: ReturnType<typeof parseKiloRunEvents>;
+}> {
+  const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(`debug:${params.model}`));
+  const response = await stub.fetch(
+    new Request('http://container/run', {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({
+        model: params.model,
+        prompt: params.prompt,
+        kiloToken: params.kiloToken,
+        timeoutMs: DECIDER_CLI_TIMEOUT_MS,
+      }),
+    })
+  );
+  if (!response.ok) {
+    const detail = (await response.text().catch(() => '')).slice(0, 500);
+    throw new Error(`container /run failed: HTTP ${response.status} ${detail}`);
+  }
+  const body = (await response.json()) as ContainerRunResponse;
+  const stdoutLines = (body.stdoutLines ?? []).slice(0, 80).map(l => l.slice(0, 600));
+  return {
+    exitCode: body.exitCode,
+    durationMs: body.durationMs,
+    stderrTail: body.stderrTail ?? '',
+    stdoutLines,
+    parsed: parseKiloRunEvents(body.stdoutLines ?? []),
   };
 }
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
index 0c3291c1d7..6a489be73e 100644
--- a/services/auto-routing-benchmark/src/grading.ts
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -89,8 +89,20 @@ function extractJson(text: string): unknown {
 
 export function runDeciderCheck(check: DeciderCheck, output: string): boolean {
   switch (check.kind) {
-    case 'exact':
-      return normalizeAnswer(output) === normalizeAnswer(check.value);
+    case 'exact': {
+      // Agent harnesses sometimes prepend prose despite instructions; accept
+      // the answer when the whole output OR its last non-empty line matches.
+      // Wrong answers fail either way.
+      const normalized = normalizeAnswer(output);
+      const expected = normalizeAnswer(check.value);
+      if (normalized === expected) return true;
+      const lastLine =
+        normalized
+          .split('\n')
+          .filter(l => l.trim().length > 0)
+          .at(-1) ?? '';
+      return lastLine.trim() === expected;
+    }
     case 'contains_all':
       return check.values.every(v => normalizeAnswer(output).includes(normalizeAnswer(v)));
     case 'regex':
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index c6449e82a9..bfc5d9929d 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -7,6 +7,7 @@ import {
   listRunsHandler,
   startRunHandler,
   getRoutingTableHandler,
+  debugCliHandler,
 } from './admin';
 import type { HonoEnv } from './hono-env';
 import { processJob, startRun, type BenchmarkJobMessage } from './run';
@@ -23,6 +24,7 @@ app.put('/admin/config', putConfigHandler);
 app.get('/admin/runs', listRunsHandler);
 app.post('/admin/runs', startRunHandler);
 app.get('/admin/routing-table', getRoutingTableHandler);
+app.post('/admin/debug-cli', debugCliHandler);
 
 app.notFound(createNotFoundHandler());
 app.onError(createErrorHandler());
diff --git a/services/auto-routing-benchmark/src/kilo-events.test.ts b/services/auto-routing-benchmark/src/kilo-events.test.ts
index 937753f97a..1f0c1078dc 100644
--- a/services/auto-routing-benchmark/src/kilo-events.test.ts
+++ b/services/auto-routing-benchmark/src/kilo-events.test.ts
@@ -58,6 +58,6 @@ describe('parseKiloRunEvents', () => {
       JSON.stringify({ type: 'tool', part: { name: 'read' } }),
       JSON.stringify({ type: 'start' }),
     ];
-    expect(parseKiloRunEvents(lines)).toEqual({ text: '', costUsd: null });
+    expect(parseKiloRunEvents(lines)).toMatchObject({ text: '', costUsd: null });
   });
 });
diff --git a/services/auto-routing-benchmark/src/kilo-events.ts b/services/auto-routing-benchmark/src/kilo-events.ts
index 5bd874e638..53efff642b 100644
--- a/services/auto-routing-benchmark/src/kilo-events.ts
+++ b/services/auto-routing-benchmark/src/kilo-events.ts
@@ -12,6 +12,10 @@
 export type ParsedKiloRun = {
   text: string;
   costUsd: number | null;
+  // Diagnostics for empty-output investigations: how many parsed events carried a
+  // string `type`, and the last three such types (never payloads, which may be sensitive).
+  eventCount: number;
+  lastEventTypes: string[];
 };
 
 type LooseEvent = {
@@ -48,6 +52,7 @@ function readCost(evt: LooseEvent): number | null {
 export function parseKiloRunEvents(lines: string[]): ParsedKiloRun {
   const textParts: string[] = [];
   let costUsd: number | null = null;
+  const eventTypes: string[] = [];
 
   for (const line of lines) {
     let evt: LooseEvent;
@@ -57,17 +62,25 @@ export function parseKiloRunEvents(lines: string[]): ParsedKiloRun {
       continue;
     }
     if (evt === null || typeof evt !== 'object') continue;
+    if (typeof evt.type === 'string') eventTypes.push(evt.type);
 
     if (evt.type === 'text' && isCompletedTextEvent(evt)) {
       const text = readText(evt);
       if (text !== null) textParts.push(text);
     }
 
-    if (evt.type === 'step-finish') {
+    // The CLI emits `step_finish` at the top level (part.type is the
+    // hyphenated `step-finish`); accept both spellings across versions.
+    if (evt.type === 'step_finish' || evt.type === 'step-finish') {
       const cost = readCost(evt);
       if (cost !== null) costUsd = (costUsd ?? 0) + cost;
     }
   }
 
-  return { text: textParts.join('\n'), costUsd };
+  return {
+    text: textParts.join('\n'),
+    costUsd,
+    eventCount: eventTypes.length,
+    lastEventTypes: eventTypes.slice(-3),
+  };
 }
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 721dc7f9e0..fdb980e655 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -219,12 +219,30 @@ async function processDeciderJob(
   await runCasesWithConcurrency(cases, config.maxConcurrency, async benchCase => {
     const startedAt = performance.now();
     try {
-      const result = await runDeciderCaseViaCli(env, {
+      let result = await runDeciderCaseViaCli(env, {
         instanceName,
         model: message.model,
         benchCase,
         kiloToken,
       });
+      // The CLI occasionally ends a session with no assistant text at all
+      // (transient empty completion: a lone step_finish with cost 0). Mirror
+      // the production classifier's policy and retry once.
+      let retried = false;
+      if (result.exitCode === 0 && result.text.length === 0) {
+        retried = true;
+        const retry = await runDeciderCaseViaCli(env, {
+          instanceName,
+          model: message.model,
+          benchCase,
+          kiloToken,
+        });
+        retry.costUsd =
+          retry.costUsd === null && result.costUsd === null
+            ? null
+            : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
+        result = retry;
+      }
       const succeeded =
         result.exitCode === 0 &&
         result.text.length > 0 &&
@@ -240,6 +258,9 @@ async function processDeciderJob(
         detail_json: JSON.stringify({
           exitCode: result.exitCode,
           outputPrefix: result.text.slice(0, 200),
+          eventCount: result.eventCount,
+          lastEventTypes: result.lastEventTypes,
+          retried,
         }),
         error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
       });
@@ -256,7 +277,7 @@ const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.st
 
 // Calls apps/web's internal endpoint to mint a short-lived user API token for
 // the decider CLI. Never logs the token.
-async function fetchBenchmarkUserToken(env: Env, userId: string): Promise<string> {
+export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise<string> {
   const secret = await env.INTERNAL_API_SECRET_PROD.get();
   const response = await fetch(
     `${env.KILO_WEB_API_BASE_URL}/api/internal/auto-routing-benchmark/token`,

From 5ff4b08832c16a0b2d673ad90329bf11515614f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 00:02:10 +0200
Subject: [PATCH 22/73] fix(auto-routing-benchmark): warm up CLI container
 before concurrent decider cases

---
 services/auto-routing-benchmark/src/run.ts | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index fdb980e655..c701d39e18 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -216,6 +216,25 @@ async function processDeciderJob(
   const kiloToken = await fetchBenchmarkUserToken(env, config.benchmarkUserId);
   const instanceName = `${message.runId}:${message.model}:${message.chunk ?? 0}`;
 
+  // The CLI performs a one-time sqlite migration on each fresh container
+  // instance; concurrent first runs against the migrating database end with
+  // empty event streams (exit 0, zero events). One sequential warmup run
+  // completes the migration before the concurrent case loop starts.
+  await runDeciderCaseViaCli(env, {
+    instanceName,
+    model: message.model,
+    benchCase: {
+      id: 'warmup',
+      tier: 'low',
+      taskType: 'implementation',
+      systemPrompt: 'You are a terse assistant.',
+      userPrompt: 'Reply with exactly: ok',
+      maxTokens: 512,
+      check: { kind: 'exact', value: 'ok' },
+    },
+    kiloToken,
+  }).catch(() => {});
+
   await runCasesWithConcurrency(cases, config.maxConcurrency, async benchCase => {
     const startedAt = performance.now();
     try {

From 06836cc388251c0a1c36543e466ca9266124118d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 00:04:47 +0200
Subject: [PATCH 23/73] fix(auto-routing-benchmark): faster container turnover
 to avoid instance exhaustion

---
 services/auto-routing-benchmark/src/bench-runner-container.ts | 2 +-
 services/auto-routing-benchmark/wrangler.jsonc                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
index c61a6ac57e..3c7f5233f0 100644
--- a/services/auto-routing-benchmark/src/bench-runner-container.ts
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -6,5 +6,5 @@ import { Container } from '@cloudflare/containers';
 // (runId, model, chunk) so concurrent chunks/models don't share state.
 export class BenchRunnerContainer extends Container<Env> {
   defaultPort = 3000;
-  sleepAfter = '5m';
+  sleepAfter = '2m';
 }
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 3b7d06d2f1..e5340b3679 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -22,7 +22,7 @@
       "class_name": "BenchRunnerContainer",
       "image": "./container/Dockerfile",
       "instance_type": "standard-2",
-      "max_instances": 15
+      "max_instances": 25
     }
   ],
   "durable_objects": {

From 2faee138f6b36aca8268e7620eeea4cd95f899bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 00:18:44 +0200
Subject: [PATCH 24/73] fix(auto-routing-benchmark): address review findings

- serialize CLI runs per container and run decider cases sequentially
  (the CLI sqlite migration is unsafe under concurrent sessions)
- add dead-letter queue and raise container instance ceiling
- redact the kilo token from captured stderr before it leaves the container
- timing-safe secret comparison and tokenSource audit field on minted tokens
- validate persisted routing tables before serving them from the admin API
- regenerate worker types with the production web base URL
- dedupe the routing-table response schema; tier boundary tests
---
 .../auto-routing/BenchmarksSection.types.ts   | 12 ++++--------
 .../token/route.test.ts                       | 10 +++++++---
 .../auto-routing-benchmark/token/route.ts     | 17 +++++++++++++++--
 .../auto-routing-contracts/src/tiers.test.ts  | 19 +++++++++++++++++++
 .../container/server.mjs                      | 16 ++++++++++++++--
 .../auto-routing-benchmark/src/admin.test.ts  | 15 ++++++++++++++-
 services/auto-routing-benchmark/src/admin.ts  |  8 ++++++--
 services/auto-routing-benchmark/src/run.ts    |  5 ++++-
 .../auto-routing-benchmark/wrangler.jsonc     |  5 +++--
 9 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
index 17ccd94cbf..520a21dcde 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
@@ -1,8 +1,4 @@
-import * as z from 'zod';
-import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
-
-export const BenchmarkRoutingTableResponseSchema = z.object({
-  table: RoutingTableSchema.nullable(),
-  publishedAt: z.string().nullable(),
-});
-export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
+export {
+  BenchmarkRoutingTableResponseSchema,
+  type BenchmarkRoutingTableResponse,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
index 61564185e8..798af89524 100644
--- a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
@@ -76,8 +76,12 @@ describe('POST /api/internal/auto-routing-benchmark/token', () => {
     const json = (await res.json()) as { token: string; expiresAt: string };
     expect(json.token).toBe('minted-token');
     expect(typeof json.expiresAt).toBe('string');
-    expect(mockGenerateApiToken).toHaveBeenCalledWith(user, undefined, {
-      expiresIn: 6 * 60 * 60,
-    });
+    expect(mockGenerateApiToken).toHaveBeenCalledWith(
+      user,
+      { tokenSource: 'auto-routing-benchmark' },
+      {
+        expiresIn: 6 * 60 * 60,
+      }
+    );
   });
 });
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
index 5278cce9db..82b4e6d79d 100644
--- a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
@@ -23,6 +23,7 @@
 
 import type { NextRequest } from 'next/server';
 import { NextResponse } from 'next/server';
+import { timingSafeEqual } from 'node:crypto';
 import { z } from 'zod';
 import { eq } from 'drizzle-orm';
 import { kilocode_users } from '@kilocode/db/schema';
@@ -44,9 +45,17 @@ function extractBearerToken(authHeader: string | null): string | null {
   return trimmed.slice(7).trim() || null;
 }
 
+function timingSafeStringEqual(a: string, b: string): boolean {
+  // timingSafeEqual throws on unequal byte lengths, so reject those up front.
+  const utf8 = new TextEncoder();
+  const left = utf8.encode(a);
+  const right = utf8.encode(b);
+  return left.byteLength === right.byteLength && timingSafeEqual(left, right);
+}
+
 export async function POST(req: NextRequest) {
   const token = extractBearerToken(req.headers.get('authorization'));
-  if (!INTERNAL_API_SECRET || token !== INTERNAL_API_SECRET) {
+  if (!INTERNAL_API_SECRET || !token || !timingSafeStringEqual(token, INTERNAL_API_SECRET)) {
     return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
   }
 
@@ -75,7 +84,11 @@ export async function POST(req: NextRequest) {
     return NextResponse.json({ error: 'User not found' }, { status: 404 });
   }
 
-  const apiToken = generateApiToken(user, undefined, { expiresIn: SIX_HOURS_IN_SECONDS });
+  const apiToken = generateApiToken(
+    user,
+    { tokenSource: 'auto-routing-benchmark' },
+    { expiresIn: SIX_HOURS_IN_SECONDS }
+  );
   const expiresAt = new Date(Date.now() + SIX_HOURS_IN_SECONDS * 1000).toISOString();
 
   return NextResponse.json({ token: apiToken, expiresAt });
diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts
index edf3a9d6c8..5d62f7259f 100644
--- a/packages/auto-routing-contracts/src/tiers.test.ts
+++ b/packages/auto-routing-contracts/src/tiers.test.ts
@@ -43,6 +43,25 @@ describe('deriveDifficultyTier', () => {
       )
     ).toBe('high');
   });
+  it('high risk tips an otherwise-low request to medium', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({ executionMode: 'multi_step_project', riskLevel: 'high' })
+      )
+    ).toBe('medium');
+  });
+  it('high risk tips an otherwise-medium request to high', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          reasoningComplexity: 'medium',
+          contextComplexity: 'large',
+          executionMode: 'code_change',
+          riskLevel: 'high',
+        })
+      )
+    ).toBe('high');
+  });
   it('is monotonic: bumping reasoning complexity never lowers the tier', () => {
     const tiers = ['low', 'medium', 'high'] as const;
     for (const ctx of ['small', 'medium', 'large'] as const) {
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index 1931864c3b..d2d9969788 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -30,6 +30,16 @@ async function readBody(req) {
   return Buffer.concat(chunks).toString('utf8');
 }
 
+// Serialize every CLI execution in this instance (requests simply queue): the
+// CLI's one-time sqlite migration and state dir are unsafe under concurrency.
+let runChain = Promise.resolve();
+function runCaseSerialized(params) {
+  const queued = runChain.then(() => runCase(params));
+  // Chain on a rejection-swallowing copy so one failed run cannot stall later ones.
+  runChain = queued.catch(() => {});
+  return queued;
+}
+
 function runCase({ model, prompt, kiloToken, timeoutMs }) {
   return new Promise(resolve => {
     void (async () => {
@@ -77,11 +87,13 @@ function runCase({ model, prompt, kiloToken, timeoutMs }) {
         clearTimeout(killTimer);
         await rm(dir, { recursive: true, force: true }).catch(() => {});
         const stdoutLines = stdout.split('\n').filter(line => line.length > 0);
+        // Defense in case a future CLI version echoes auth material to stderr.
+        const redactedStderrTail = stderrTail.split(kiloToken).join('[redacted]');
         resolve({
           exitCode,
           durationMs: Date.now() - startedAt,
           stdoutLines,
-          stderrTail,
+          stderrTail: redactedStderrTail,
         });
       };
 
@@ -124,7 +136,7 @@ const server = createServer((req, res) => {
       }
 
       try {
-        const result = await runCase({ model, prompt, kiloToken, timeoutMs });
+        const result = await runCaseSerialized({ model, prompt, kiloToken, timeoutMs });
         sendJson(res, 200, result);
       } catch (err) {
         sendJson(res, 500, { error: err instanceof Error ? err.message : 'run failed' });
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 99da4c6a8a..02a9077580 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -268,7 +268,20 @@ describe('GET /admin/routing-table', () => {
   });
 
   it('returns the parsed table and publishedAt when a row exists', async () => {
-    const tableData = { version: 'test-v1', tiers: {} };
+    const candidate = {
+      model: 'm',
+      accuracy: 1,
+      avgCostUsd: 0.1,
+      meetsThreshold: true,
+      supportedApiKinds: ['chat_completions'],
+    };
+    const tableData = {
+      version: 'test-v1',
+      generatedAt: '2026-06-01T10:00:00.000Z',
+      minAccuracy: 0.7,
+      source: 'benchmark',
+      tiers: { low: [candidate], medium: [candidate], high: [candidate] },
+    };
     dbFirst.mockResolvedValueOnce({
       run_id: 'run-123',
       published_at: '2026-06-01T10:00:00.000Z',
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 395635bd53..627bbea1bb 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -1,6 +1,7 @@
 import * as z from 'zod';
 import {
   BenchmarkConfigSchema,
+  RoutingTableSchema,
   StartBenchmarkRunRequestSchema,
   type BenchmarkRun,
 } from '@kilocode/auto-routing-contracts';
@@ -52,9 +53,12 @@ export const startRunHandler: Handler<HonoEnv> = async c => {
 
 export const getRoutingTableHandler: Handler<HonoEnv> = async c => {
   const latest = await getLatestRoutingTable(c.env.BENCH_DB);
+  // Validated at publish time, but re-validated before crossing the contract
+  // boundary; a row that fails the current schema is reported as "no table".
+  const parsed = latest ? RoutingTableSchema.safeParse(JSON.parse(latest.table_json)) : null;
   return c.json({
-    table: latest ? (JSON.parse(latest.table_json) as unknown) : null,
-    publishedAt: latest?.published_at ?? null,
+    table: parsed?.success ? parsed.data : null,
+    publishedAt: parsed?.success ? (latest?.published_at ?? null) : null,
   });
 };
 
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index c701d39e18..43435a7449 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -235,7 +235,10 @@ async function processDeciderJob(
     kiloToken,
   }).catch(() => {});
 
-  await runCasesWithConcurrency(cases, config.maxConcurrency, async benchCase => {
+  // Concurrency 1: the CLI's sqlite state in the container is not safe under
+  // concurrent sessions (partial-migration crashes); the container serializes
+  // too, so higher concurrency here would only hold HTTP requests open.
+  await runCasesWithConcurrency(cases, 1, async benchCase => {
     const startedAt = performance.now();
     try {
       let result = await runDeciderCaseViaCli(env, {
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index e5340b3679..37ae5a4e6e 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -22,7 +22,7 @@
       "class_name": "BenchRunnerContainer",
       "image": "./container/Dockerfile",
       "instance_type": "standard-2",
-      "max_instances": 25
+      "max_instances": 40
     }
   ],
   "durable_objects": {
@@ -48,7 +48,8 @@
         "queue": "auto-routing-benchmark-jobs",
         "max_batch_size": 1,
         "max_retries": 2,
-        "max_concurrency": 4
+        "max_concurrency": 4,
+        "dead_letter_queue": "auto-routing-benchmark-dlq"
       }
     ]
   },

From cac57b796a996d603e90d48269d0aed7ef6c1500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 01:12:37 +0200
Subject: [PATCH 25/73] style(auto-routing-benchmark): format wrangler.jsonc and server.mjs

---
 .../container/server.mjs                      |  8 +++--
 .../auto-routing-benchmark/wrangler.jsonc     | 30 +++++++++----------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index d2d9969788..6ec6020f55 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -83,7 +83,7 @@ function runCase({ model, prompt, kiloToken, timeoutMs }) {
         stderrTail = (stderrTail + chunk.toString('utf8')).slice(-STDERR_CAP_BYTES);
       });
 
-      const finish = async (exitCode) => {
+      const finish = async exitCode => {
         clearTimeout(killTimer);
         await rm(dir, { recursive: true, force: true }).catch(() => {});
         const stdoutLines = stdout.split('\n').filter(line => line.length > 0);
@@ -130,7 +130,11 @@ const server = createServer((req, res) => {
           ? parsed.timeoutMs
           : DEFAULT_TIMEOUT_MS;
 
-      if (typeof model !== 'string' || typeof prompt !== 'string' || typeof kiloToken !== 'string') {
+      if (
+        typeof model !== 'string' ||
+        typeof prompt !== 'string' ||
+        typeof kiloToken !== 'string'
+      ) {
         sendJson(res, 400, { error: 'model, prompt and kiloToken are required strings' });
         return;
       }
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 37ae5a4e6e..0b02d01c4a 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -14,7 +14,7 @@
   "vars": {
     // Base URL for reaching apps/web's /api/internal/* routes. Other workers
     // that call apps/web internal endpoints use app.kilo.ai.
-    "KILO_WEB_API_BASE_URL": "https://app.kilo.ai"
+    "KILO_WEB_API_BASE_URL": "https://app.kilo.ai",
   },
   "containers": [
     {
@@ -22,24 +22,24 @@
       "class_name": "BenchRunnerContainer",
       "image": "./container/Dockerfile",
       "instance_type": "standard-2",
-      "max_instances": 40
-    }
+      "max_instances": 40,
+    },
   ],
   "durable_objects": {
-    "bindings": [{ "name": "BENCH_RUNNER", "class_name": "BenchRunnerContainer" }]
+    "bindings": [{ "name": "BENCH_RUNNER", "class_name": "BenchRunnerContainer" }],
   },
   "migrations": [{ "tag": "v1", "new_sqlite_classes": ["BenchRunnerContainer"] }],
   "triggers": {
     // 04:10 UTC daily: classifier benchmark. 05:10 UTC Monday: decider benchmark.
-    "crons": ["10 4 * * *", "10 5 * * 1"]
+    "crons": ["10 4 * * *", "10 5 * * 1"],
   },
   "d1_databases": [
     {
       "binding": "BENCH_DB",
       "database_name": "auto-routing-benchmark",
       "database_id": "92f2c88a-5ee6-4fd0-b118-75bd141b5cac",
-      "migrations_dir": "migrations"
-    }
+      "migrations_dir": "migrations",
+    },
   ],
   "queues": {
     "producers": [{ "binding": "BENCH_QUEUE", "queue": "auto-routing-benchmark-jobs" }],
@@ -49,25 +49,25 @@
         "max_batch_size": 1,
         "max_retries": 2,
         "max_concurrency": 4,
-        "dead_letter_queue": "auto-routing-benchmark-dlq"
-      }
-    ]
+        "dead_letter_queue": "auto-routing-benchmark-dlq",
+      },
+    ],
   },
   "kv_namespaces": [
     // Shared with the auto-routing worker: the decider benchmark publishes
     // the routing table here and auto-routing reads it on /decide.
-    { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" }
+    { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" },
   ],
   "secrets_store_secrets": [
     {
       "binding": "INTERNAL_API_SECRET_PROD",
       "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
-      "secret_name": "INTERNAL_API_SECRET_PROD"
+      "secret_name": "INTERNAL_API_SECRET_PROD",
     },
     {
       "binding": "OPENROUTER_API_KEY",
       "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
-      "secret_name": "OPENROUTER_API_KEY"
-    }
-  ]
+      "secret_name": "OPENROUTER_API_KEY",
+    },
+  ],
 }

From ccc9c9d62980d5781bf789edaf98bf382b3a6041 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 01:20:53 +0200
Subject: [PATCH 26/73] fix(auto-routing-benchmark): guard against double
 finish on spawn failure

Also documents the queue handler's throw-to-retry contract.
---
 services/auto-routing-benchmark/container/server.mjs | 5 +++++
 services/auto-routing-benchmark/src/index.ts         | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index 6ec6020f55..dee2b62554 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -83,7 +83,12 @@ function runCase({ model, prompt, kiloToken, timeoutMs }) {
         stderrTail = (stderrTail + chunk.toString('utf8')).slice(-STDERR_CAP_BYTES);
       });
 
+      // 'error' and 'close' can both fire for the same child (Node emits
+      // 'close' after 'error' on spawn failure); only the first wins.
+      let finished = false;
       const finish = async exitCode => {
+        if (finished) return;
+        finished = true;
         clearTimeout(killTimer);
         await rm(dir, { recursive: true, force: true }).catch(() => {});
         const stdoutLines = stdout.split('\n').filter(line => line.length > 0);
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index bfc5d9929d..a1b7a0664f 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -39,6 +39,11 @@ export default {
   },
   async queue(batch: MessageBatch<BenchmarkJobMessage>, env: Env): Promise<void> {
     for (const message of batch.messages) {
+      // Deliberately no try/catch: a throw from processJob (transient token,
+      // D1 or container failures) must skip the ack so the queue retries the
+      // whole (run, model, chunk) unit, dead-lettering after max_retries.
+      // Case-level failures are recorded as failed rows inside processJob and
+      // do not throw. Swallowing the throw here would silently drop chunks.
       await processJob(env, message.body);
       message.ack();
     }

From ba3b3be8bb42d4b2eaabebcb7ecd1614990c0ed5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 01:26:53 +0200
Subject: [PATCH 27/73] fix(auto-routing): break contracts module cycle and
 keep response schema client-safe

madge flagged tiers.ts -> index.ts (type-only but counted); tier derivation
now takes a structural subset of ClassifierOutput. The routing-table response
schema moves into contracts so the client component no longer pulls
config.server (server-only) through the admin client re-export.
---
 .../admin/auto-routing/BenchmarksSection.types.ts    |  2 +-
 .../auto-routing-benchmark-admin-client.ts           | 12 ++++++------
 packages/auto-routing-contracts/src/benchmark.ts     |  8 +++++++-
 packages/auto-routing-contracts/src/tiers.ts         | 12 ++++++++++--
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
index 520a21dcde..04f0376bd3 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
@@ -1,4 +1,4 @@
 export {
   BenchmarkRoutingTableResponseSchema,
   type BenchmarkRoutingTableResponse,
-} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+} from '@kilocode/auto-routing-contracts';
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
index 52ebee417a..735aa3371e 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -1,4 +1,5 @@
 import {
+  BenchmarkRoutingTableResponseSchema,
   BenchmarkConfigResponseSchema,
   BenchmarkRunsResponseSchema,
   StartBenchmarkRunResponseSchema,
@@ -6,6 +7,11 @@ import {
   type BenchmarkConfig,
   type BenchmarkKind,
 } from '@kilocode/auto-routing-contracts';
+
+export {
+  BenchmarkRoutingTableResponseSchema,
+  type BenchmarkRoutingTableResponse,
+} from '@kilocode/auto-routing-contracts';
 import { AUTO_ROUTING_BENCHMARK_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
 import * as z from 'zod';
 
@@ -17,12 +23,6 @@ export type AutoRoutingAdminResult<T> = {
 type ErrorBody = { error: string };
 const ErrorBodySchema = z.object({ error: z.string() });
 
-export const BenchmarkRoutingTableResponseSchema = z.object({
-  table: RoutingTableSchema.nullable(),
-  publishedAt: z.string().nullable(),
-});
-export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
-
 type AutoRoutingBenchmarkAdminRequestInit = Omit<RequestInit, 'headers'> & {
   headers?: Record<string, string>;
 };
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 5f915dd9e8..9c01d9687b 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,5 +1,5 @@
 import * as z from 'zod';
-import { ClassifierApiKindSchema } from './routing-table';
+import { ClassifierApiKindSchema, RoutingTableSchema } from './routing-table';
 import { DifficultyTierSchema } from './tiers';
 
 export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
@@ -64,3 +64,9 @@ export const StartBenchmarkRunResponseSchema = z.object({
   runId: z.string(),
   enqueuedModels: z.number().int(),
 });
+
+export const BenchmarkRoutingTableResponseSchema = z.object({
+  table: RoutingTableSchema.nullable(),
+  publishedAt: z.string().nullable(),
+});
+export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
index d0f4cb4c7e..5315174c84 100644
--- a/packages/auto-routing-contracts/src/tiers.ts
+++ b/packages/auto-routing-contracts/src/tiers.ts
@@ -1,5 +1,4 @@
 import * as z from 'zod';
-import type { ClassifierOutput } from './index';
 
 export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
 export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
@@ -20,7 +19,16 @@ const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const;
 // Reasoning complexity dominates (weight 2x) because it is the strongest
 // signal for whether a cheap model can complete the task; context size,
 // execution mode and blast radius nudge borderline cases up.
-export function deriveDifficultyTier(classification: ClassifierOutput): DifficultyTier {
+// Structural subset of ClassifierOutput: importing the full type from
+// ./index would create a module cycle (index re-exports this file).
+export type DifficultyTierSignal = {
+  reasoningComplexity: 'low' | 'medium' | 'high';
+  contextComplexity: 'small' | 'medium' | 'large';
+  executionMode: 'answer_only' | 'code_change' | 'command_execution' | 'multi_step_project';
+  riskLevel: 'low' | 'medium' | 'high';
+};
+
+export function deriveDifficultyTier(classification: DifficultyTierSignal): DifficultyTier {
   const score =
     REASONING_POINTS[classification.reasoningComplexity] +
     CONTEXT_POINTS[classification.contextComplexity] +

From 6776db02e2dc1b023b1b82fcab655e82933fd2c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 01:32:57 +0200
Subject: [PATCH 28/73] chore(admin): drop unused import after schema move

---
 .../src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
index 735aa3371e..d12990de14 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -3,7 +3,6 @@ import {
   BenchmarkConfigResponseSchema,
   BenchmarkRunsResponseSchema,
   StartBenchmarkRunResponseSchema,
-  RoutingTableSchema,
   type BenchmarkConfig,
   type BenchmarkKind,
 } from '@kilocode/auto-routing-contracts';

From c0320c7c602c1fceb6fb9422d6de00df7409a764 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 01:53:20 +0200
Subject: [PATCH 29/73] feat(auto-routing): classifier model becomes an admin
 override over the benchmark winner

---
 .../auto-routing-contracts/src/benchmark.ts   | 26 ++++++-
 .../src/contracts.test.ts                     |  2 +
 packages/auto-routing-contracts/src/index.ts  |  8 ++-
 .../src/routing-table.ts                      |  3 +
 .../src/admin-classifier-model.ts             | 19 ++---
 .../auto-routing/src/classifier-config.ts     | 60 +++++++++++++---
 .../auto-routing/src/decision-engine.test.ts  |  1 +
 services/auto-routing/src/decision-engine.ts  |  8 ++-
 services/auto-routing/src/index.test.ts       | 72 ++++++++++++++++++-
 9 files changed, 176 insertions(+), 23 deletions(-)

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 9c01d9687b..5620736444 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -5,11 +5,18 @@ import { DifficultyTierSchema } from './tiers';
 export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
 export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
 
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
+
 export const BenchmarkDeciderModelSchema = z.object({
   id: z.string().trim().min(1),
   // Which gateway API kinds this model can serve when chosen by the router.
   // The benchmark itself always exercises chat completions.
   supportedApiKinds: z.array(ClassifierApiKindSchema).min(1).default(['chat_completions']),
+  // Passed to the kilo CLI as --variant during the benchmark and carried into
+  // the routing table so serving uses the same effort the model was graded
+  // with. Null for models without (or not using) configurable reasoning.
+  reasoningEffort: ReasoningEffortSchema.nullable().default(null),
 });
 export type BenchmarkDeciderModel = z.infer<typeof BenchmarkDeciderModelSchema>;
 
@@ -59,10 +66,15 @@ export const BenchmarkConfigResponseSchema = z.object({
   config: BenchmarkConfigSchema,
   defaults: BenchmarkConfigSchema,
 });
-export const StartBenchmarkRunRequestSchema = z.object({ kind: BenchmarkKindSchema });
+export const StartBenchmarkRunRequestSchema = z.object({
+  kind: BenchmarkKindSchema,
+  // Re-run every configured model even when prior results exist.
+  force: z.boolean().default(false),
+});
 export const StartBenchmarkRunResponseSchema = z.object({
   runId: z.string(),
   enqueuedModels: z.number().int(),
+  skippedModels: z.array(z.string()).default([]),
 });
 
 export const BenchmarkRoutingTableResponseSchema = z.object({
@@ -70,3 +82,15 @@ export const BenchmarkRoutingTableResponseSchema = z.object({
   publishedAt: z.string().nullable(),
 });
 export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
+
+// Published to the auto-routing KV namespace when a classifier benchmark run
+// completes: the cheapest candidate meeting the accuracy threshold.
+export const ClassifierWinnerSchema = z.object({
+  model: z.string().trim().min(1),
+  runId: z.string(),
+  accuracy: z.number(),
+  generatedAt: z.string(),
+});
+export type ClassifierWinner = z.infer<typeof ClassifierWinnerSchema>;
+
+export const CLASSIFIER_WINNER_KV_KEY = 'classifier_benchmark_winner';
diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
index 56257f8f05..7c0efc2799 100644
--- a/packages/auto-routing-contracts/src/contracts.test.ts
+++ b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -94,6 +94,8 @@ describe('auto routing contracts', () => {
     expect(
       AutoRoutingClassifierModelResponseSchema.parse({
         model: 'google/gemini-2.5-flash-lite',
+        override: null,
+        benchmarkWinner: 'google/gemini-2.5-flash-lite',
         defaultModel: 'google/gemini-2.5-flash-lite',
       })
     ).toMatchObject({ model: 'google/gemini-2.5-flash-lite' });
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index c7022c5477..9743536692 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -102,6 +102,8 @@ export const AutoRoutingDecisionSchema = z.object({
   tier: DifficultyTierSchema,
   source: z.enum(['benchmark', 'default']),
   tableVersion: z.string(),
+  // Mirrors the effort the chosen model was benchmarked with, when set.
+  reasoningEffort: z.enum(['minimal', 'low', 'medium', 'high']).nullable().optional(),
 });
 export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
 
@@ -119,13 +121,17 @@ export const AutoRoutingDecisionResponseSchema = z.object({
 });
 export type AutoRoutingDecisionResponse = z.infer<typeof AutoRoutingDecisionResponseSchema>;
 
+// model: null clears the admin override (benchmark winner takes effect).
 export const UpdateClassifierModelRequestSchema = z.object({
-  model: z.string().trim().min(1),
+  model: z.string().trim().min(1).nullable(),
 });
 export type UpdateClassifierModelRequest = z.infer<typeof UpdateClassifierModelRequestSchema>;
 
 export const AutoRoutingClassifierModelResponseSchema = z.object({
+  // Effective model used by /decide: override ?? benchmark winner ?? default.
   model: z.string(),
+  override: z.string().nullable(),
+  benchmarkWinner: z.string().nullable(),
   defaultModel: z.string(),
 });
 export type AutoRoutingClassifierModelResponse = z.infer<
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index d569b2a8e7..f134cba8db 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -10,6 +10,9 @@ export const RankedCandidateSchema = z.object({
   avgCostUsd: z.number().nonnegative(),
   meetsThreshold: z.boolean(),
   supportedApiKinds: z.array(ClassifierApiKindSchema).min(1),
+  // Reasoning effort the model was benchmarked with; serving mirrors it.
+  // Optional so tables published before this field existed stay valid.
+  reasoningEffort: z.enum(['minimal', 'low', 'medium', 'high']).nullable().optional(),
 });
 export type RankedCandidate = z.infer<typeof RankedCandidateSchema>;
 
diff --git a/services/auto-routing/src/admin-classifier-model.ts b/services/auto-routing/src/admin-classifier-model.ts
index 8eb5f2f0dd..472298d1a0 100644
--- a/services/auto-routing/src/admin-classifier-model.ts
+++ b/services/auto-routing/src/admin-classifier-model.ts
@@ -4,19 +4,22 @@ import {
 } from '@kilocode/auto-routing-contracts';
 import type { Handler } from 'hono';
 import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
-import { getClassifierModel, setClassifierModel } from './classifier-config';
+import { getClassifierModelInfo, setClassifierModel } from './classifier-config';
+import type { ClassifierModelInfo } from './classifier-config';
 import type { HonoEnv } from './hono-env';
 
-function classifierModelResponse(model: string): AutoRoutingClassifierModelResponse {
+function classifierModelResponse(info: ClassifierModelInfo): AutoRoutingClassifierModelResponse {
   return {
-    model,
+    model: info.model,
+    override: info.override,
+    benchmarkWinner: info.benchmarkWinner,
     defaultModel: DEFAULT_CLASSIFIER_MODEL,
   };
 }
 
 export const getClassifierModelHandler: Handler<HonoEnv> = async c => {
-  const model = await getClassifierModel(c.env);
-  return c.json(classifierModelResponse(model));
+  const info = await getClassifierModelInfo(c.env);
+  return c.json(classifierModelResponse(info));
 };
 
 export const putClassifierModelHandler: Handler<HonoEnv> = async c => {
@@ -32,10 +35,10 @@ export const putClassifierModelHandler: Handler<HonoEnv> = async c => {
     return c.json({ error: 'Invalid classifier model' }, 400);
   }
 
-  const model = await setClassifierModel(c.env, parsed.data.model);
-  if (!model) {
+  const info = await setClassifierModel(c.env, parsed.data.model);
+  if (!info) {
     return c.json({ error: 'Invalid classifier model' }, 400);
   }
 
-  return c.json(classifierModelResponse(model));
+  return c.json(classifierModelResponse(info));
 };
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index e9025a9c95..6801a6d6a5 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -1,4 +1,5 @@
 import { formatError } from '@kilocode/worker-utils';
+import { CLASSIFIER_WINNER_KV_KEY, ClassifierWinnerSchema } from '@kilocode/auto-routing-contracts';
 import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import { ttlCached } from './ttl-cache';
 
@@ -18,10 +19,35 @@ const CONFIG_CACHE_TTL_MS = 60_000;
 
 type ClassifierConfigEnv = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
 
+export type ClassifierModelInfo = {
+  // Effective model used by /decide: override ?? benchmark winner ?? default.
+  model: string;
+  override: string | null;
+  benchmarkWinner: string | null;
+};
+
+function parseBenchmarkWinner(raw: string | null): string | null {
+  // A missing key, unparseable JSON, or a schema-invalid payload all yield null (no winner).
+  try {
+    const winner = raw === null ? null : ClassifierWinnerSchema.safeParse(JSON.parse(raw));
+    return winner !== null && winner.success ? winner.data.model : null;
+  } catch {
+    return null;
+  }
+}
+
 const classifierModelCache = ttlCached(CONFIG_CACHE_TTL_MS, async (env: ClassifierConfigEnv) => {
-  const configuredModel = await env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_MODEL_CONFIG_KEY);
-  const trimmedModel = configuredModel?.trim();
-  return trimmedModel && trimmedModel.length > 0 ? trimmedModel : DEFAULT_CLASSIFIER_MODEL;
+  const [configuredModel, winnerRaw] = await Promise.all([
+    env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_MODEL_CONFIG_KEY),
+    env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_WINNER_KV_KEY),
+  ]);
+  const override = configuredModel?.trim() || null;
+  const benchmarkWinner = parseBenchmarkWinner(winnerRaw);
+  return {
+    model: override ?? benchmarkWinner ?? DEFAULT_CLASSIFIER_MODEL,
+    override,
+    benchmarkWinner,
+  } satisfies ClassifierModelInfo;
 });
 
 const decisionLogSampleRateCache = ttlCached(
@@ -57,10 +83,20 @@ function failClosed<T>(key: string, fallback: T): (error: unknown) => T {
   };
 }
 
-export function getClassifierModel(env: ClassifierConfigEnv): Promise<string> {
+const DEFAULT_CLASSIFIER_MODEL_INFO: ClassifierModelInfo = {
+  model: DEFAULT_CLASSIFIER_MODEL,
+  override: null,
+  benchmarkWinner: null,
+};
+
+export function getClassifierModelInfo(env: ClassifierConfigEnv): Promise<ClassifierModelInfo> {
   return classifierModelCache
     .get(env)
-    .catch(failClosed(CLASSIFIER_MODEL_CONFIG_KEY, DEFAULT_CLASSIFIER_MODEL));
+    .catch(failClosed(CLASSIFIER_MODEL_CONFIG_KEY, DEFAULT_CLASSIFIER_MODEL_INFO));
+}
+
+export async function getClassifierModel(env: ClassifierConfigEnv): Promise<string> {
+  return (await getClassifierModelInfo(env)).model;
 }
 
 export function getDecisionLogSampleRate(env: ClassifierConfigEnv): Promise<number> {
@@ -69,16 +105,22 @@ export function getDecisionLogSampleRate(env: ClassifierConfigEnv): Promise<numb
     .catch(failClosed(DECISION_LOG_SAMPLE_RATE_CONFIG_KEY, DEFAULT_DECISION_LOG_SAMPLE_RATE));
 }
 
+// model: null clears the admin override so the benchmark winner (or the
+// built-in default) takes effect.
 export async function setClassifierModel(
   env: ClassifierConfigEnv,
-  model: string
-): Promise<string | null> {
+  model: string | null
+): Promise<ClassifierModelInfo | null> {
+  if (model === null) {
+    await env.AUTO_ROUTING_CONFIG.delete(CLASSIFIER_MODEL_CONFIG_KEY);
+    classifierModelCache.clear();
+    return getClassifierModelInfo(env);
+  }
   const trimmedModel = model.trim();
   if (trimmedModel.length === 0) {
     return null;
   }
-
   await env.AUTO_ROUTING_CONFIG.put(CLASSIFIER_MODEL_CONFIG_KEY, trimmedModel);
   classifierModelCache.clear();
-  return trimmedModel;
+  return getClassifierModelInfo(env);
 }
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index 16c36ed4f7..41c9cbef7e 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -64,6 +64,7 @@ describe('computeDecision', () => {
       tier: 'low',
       source: 'benchmark',
       tableVersion: 'run-1',
+      reasoningEffort: null,
     });
   });
   it('uses the tier derived from the classification', () => {
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index 26645ead7f..d41d0961ea 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -14,5 +14,11 @@ export function computeDecision(
   const tier = deriveDifficultyTier(classification);
   const candidate = table.tiers[tier].find(c => c.supportedApiKinds.includes(apiKind));
   if (!candidate) return null;
-  return { model: candidate.model, tier, source: table.source, tableVersion: table.version };
+  return {
+    model: candidate.model,
+    tier,
+    source: table.source,
+    tableVersion: table.version,
+    reasoningEffort: candidate.reasoningEffort ?? null,
+  };
 }
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 5bc7a08146..41b9670041 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -14,6 +14,7 @@ vi.mock('./model-classifier', async importOriginal => {
 
 const writeDataPoint = vi.fn();
 const configGet = vi.fn();
+const configDelete = vi.fn();
 const configPut = vi.fn();
 const analyticsTokenGet = vi.fn();
 const cacheGetEntry = vi.fn();
@@ -28,6 +29,7 @@ const env = {
   },
   AUTO_ROUTING_CONFIG: {
     get: configGet,
+    delete: configDelete,
     put: configPut,
   },
   AUTO_ROUTING_CLASSIFIER_METRICS_V2: {
@@ -126,6 +128,8 @@ describe('auto routing worker', () => {
     // Real KV returns null for missing keys; an undefined here would send the
     // routing-table loader down the JSON.parse-throw path instead.
     configGet.mockResolvedValue(null);
+    configDelete.mockReset();
+    configDelete.mockResolvedValue(undefined);
     configPut.mockReset();
     analyticsTokenGet.mockReset();
     analyticsTokenGet.mockResolvedValue('analytics-token');
@@ -168,6 +172,7 @@ describe('auto routing worker', () => {
         tier: expect.stringMatching(/^(low|medium|high)$/),
         source: 'default',
         tableVersion: 'default',
+        reasoningEffort: null,
       },
       classifierResult: {
         classification: mockClassification,
@@ -230,6 +235,7 @@ describe('auto routing worker', () => {
         tier: expect.stringMatching(/^(low|medium|high)$/),
         source: 'default',
         tableVersion: 'default',
+        reasoningEffort: null,
       },
       classifierResult: { classification: mockClassification },
     });
@@ -463,8 +469,10 @@ describe('auto routing worker', () => {
     expect(classifyNormalizedInput).not.toHaveBeenCalled();
   });
 
-  it('returns the configured classifier model', async () => {
-    configGet.mockResolvedValueOnce('google/gemini-2.5-flash-lite');
+  it('returns the override as the effective classifier model', async () => {
+    configGet.mockImplementation(key =>
+      Promise.resolve(key === 'classifier_model' ? 'google/gemini-2.5-flash-lite' : null)
+    );
 
     const response = await request('/admin/classifier-model', {
       headers: { authorization: 'Bearer classifier-token' },
@@ -473,12 +481,48 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       model: 'google/gemini-2.5-flash-lite',
+      override: 'google/gemini-2.5-flash-lite',
+      benchmarkWinner: null,
       defaultModel: 'google/gemini-2.5-flash-lite',
     });
     expect(configGet).toHaveBeenCalledWith('classifier_model');
   });
 
-  it('updates the configured classifier model', async () => {
+  it('falls back to the benchmark winner when no override is set', async () => {
+    configGet.mockImplementation(key =>
+      Promise.resolve(
+        key === 'classifier_benchmark_winner'
+          ? JSON.stringify({
+              model: 'qwen/qwen3.7-plus',
+              runId: 'classifier-run-1',
+              accuracy: 0.93,
+              generatedAt: '2026-06-12T00:00:00.000Z',
+            })
+          : null
+      )
+    );
+
+    const response = await request('/admin/classifier-model', {
+      headers: { authorization: 'Bearer classifier-token' },
+    });
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toEqual({
+      model: 'qwen/qwen3.7-plus',
+      override: null,
+      benchmarkWinner: 'qwen/qwen3.7-plus',
+      defaultModel: 'google/gemini-2.5-flash-lite',
+    });
+  });
+
+  it('updates the classifier model override', async () => {
+    const stored = new Map<string, string>();
+    configGet.mockImplementation(key => Promise.resolve(stored.get(key) ?? null));
+    configPut.mockImplementation((key, value) => {
+      stored.set(key, value);
+      return Promise.resolve();
+    });
+
     const response = await request('/admin/classifier-model', {
       method: 'PUT',
       headers: {
@@ -491,11 +535,33 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       model: 'google/gemini-2.5-flash-lite:free',
+      override: 'google/gemini-2.5-flash-lite:free',
+      benchmarkWinner: null,
       defaultModel: 'google/gemini-2.5-flash-lite',
     });
     expect(configPut).toHaveBeenCalledWith('classifier_model', 'google/gemini-2.5-flash-lite:free');
   });
 
+  it('clears the override when model is null', async () => {
+    const response = await request('/admin/classifier-model', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer classifier-token',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ model: null }),
+    });
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toEqual({
+      model: 'google/gemini-2.5-flash-lite',
+      override: null,
+      benchmarkWinner: null,
+      defaultModel: 'google/gemini-2.5-flash-lite',
+    });
+    expect(configDelete).toHaveBeenCalledWith('classifier_model');
+  });
+
   it('rejects blank classifier model updates', async () => {
     const response = await request('/admin/classifier-model', {
       method: 'PUT',

From 7bb504868abd6c4175a4b4d553b74f43402315db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 02:12:21 +0200
Subject: [PATCH 30/73] feat(auto-routing): manual benchmark runs, classifier
 override, decider reasoning effort

- benchmark runs start only from the admin panel; models with existing
  results are skipped (latest summaries carried forward) unless forced
- classifier benchmark publishes a winner; the admin-set classifier model
  becomes an override on top of it (clearable from the panel)
- decider models accept a reasoning effort, forwarded to the kilo CLI as
  --variant and mirrored in the routing table and live decisions
---
 .../api/auto-routing/benchmark-runs/route.ts  |   2 +-
 .../auto-routing/AutoRoutingAdminContent.tsx  | 107 +++++++++-----
 .../admin/auto-routing/BenchmarksSection.tsx  | 132 ++++++++++++++----
 .../auto-routing-admin-client.test.ts         |  24 ++++
 .../ai-gateway/auto-routing-admin-client.ts   |   2 +-
 ...uto-routing-benchmark-admin-client.test.ts |  32 ++++-
 .../auto-routing-benchmark-admin-client.ts    |   4 +-
 .../container/server.mjs                      |  36 ++---
 .../migrations/0002_carried_results.sql       |   1 +
 services/auto-routing-benchmark/src/admin.ts  |   2 +-
 .../auto-routing-benchmark/src/cli-runner.ts  |  18 ++-
 services/auto-routing-benchmark/src/config.ts |  21 ++-
 .../auto-routing-benchmark/src/db.test.ts     |   4 +
 services/auto-routing-benchmark/src/db.ts     |  64 ++++++++-
 services/auto-routing-benchmark/src/index.ts  |   8 +-
 .../src/routing-table-builder.test.ts         |  10 +-
 .../src/routing-table-builder.ts              |   9 +-
 .../auto-routing-benchmark/src/run.test.ts    |  43 +++++-
 services/auto-routing-benchmark/src/run.ts    | 127 ++++++++++++++---
 .../auto-routing-benchmark/wrangler.jsonc     |   4 -
 20 files changed, 510 insertions(+), 140 deletions(-)
 create mode 100644 services/auto-routing-benchmark/migrations/0002_carried_results.sql

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
index afb3f47f65..efbfebdde3 100644
--- a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
@@ -31,6 +31,6 @@ export async function POST(request: NextRequest) {
     return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
   }
 
-  const result = await startBenchmarkRun(parsed.data.kind);
+  const result = await startBenchmarkRun(parsed.data.kind, parsed.data.force);
   return NextResponse.json(result.body, { status: result.status });
 }
diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
index f55d1bccdc..185dc097f9 100644
--- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
+++ b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
@@ -66,7 +66,7 @@ async function fetchClassifierModel() {
   );
 }
 
-async function saveClassifierModel(model: string) {
+async function saveClassifierModel(model: string | null) {
   const response = await fetch('/admin/api/auto-routing/classifier-model', {
     method: 'PUT',
     headers: { 'content-type': 'application/json' },
@@ -398,10 +398,10 @@ export function AutoRoutingAdminContent() {
   });
 
   useEffect(() => {
-    if (classifierModelQuery.data?.model) {
-      setSelectedModel(classifierModelQuery.data.model);
+    if (classifierModelQuery.data) {
+      setSelectedModel(classifierModelQuery.data.override ?? classifierModelQuery.data.model);
     }
-  }, [classifierModelQuery.data?.model]);
+  }, [classifierModelQuery.data]);
 
   const modelOptions = useMemo<ModelOption[]>(() => {
     return (
@@ -415,10 +415,14 @@ export function AutoRoutingAdminContent() {
 
   const saveMutation = useMutation({
     mutationFn: saveClassifierModel,
-    onSuccess: data => {
+    onSuccess: (data, model) => {
       queryClient.setQueryData(['auto-routing', 'classifier-model'], data);
-      setSelectedModel(data.model);
-      toast.success('Classifier model updated');
+      setSelectedModel(data.override ?? data.model);
+      // Clearing falls back to the benchmark winner, or to the built-in default when no
+      // winner has been published yet, so the toast names whichever actually takes effect.
+      const fallback = data.benchmarkWinner === null ? 'default model' : 'benchmark winner';
+      const cleared = `Override cleared — ${fallback} in effect`;
+      toast.success(model === null ? cleared : 'Classifier model override saved');
     },
     onError: error => {
       toast.error(error instanceof Error ? error.message : 'Failed to update classifier model');
@@ -433,10 +437,12 @@ export function AutoRoutingAdminContent() {
     classifierModelQuery.error instanceof Error ? classifierModelQuery.error.message : undefined;
   const openRouterModelsError =
     openRouterModelsQuery.error instanceof Error ? openRouterModelsQuery.error.message : undefined;
-  const currentModel = classifierModelQuery.data?.model ?? '';
-  const hasClassifierModelLoaded = classifierModelQuery.isSuccess && currentModel.length > 0;
+  const currentOverride = classifierModelQuery.data?.override ?? null;
+  const hasClassifierModelLoaded = classifierModelQuery.isSuccess;
   const hasModelChange =
-    hasClassifierModelLoaded && selectedModel.trim().length > 0 && selectedModel !== currentModel;
+    hasClassifierModelLoaded &&
+    selectedModel.trim().length > 0 &&
+    selectedModel !== (currentOverride ?? '');
   const summary = analyticsQuery.data?.summary;
   const totalRequests = summary?.totalRequests ?? 0;
   const { classifiedRate, cacheHitRate, fallbackRate } = summaryRates(summary);
@@ -473,32 +479,67 @@ export function AutoRoutingAdminContent() {
 
       <Card className="rounded-lg">
         <CardHeader className="flex flex-row items-center justify-between space-y-0 p-4 pb-2">
-          <CardTitle className="text-base">Classifier Model</CardTitle>
+          <CardTitle className="text-base">Classifier model override</CardTitle>
           <MetricHelp
-            label="Classifier Model"
-            description="The OpenRouter model used by the auto-routing classifier. Saving changes updates KV config, so the classifier can change without a redeploy."
+            label="Classifier model override"
+            description="When unset, the latest classifier benchmark winner is used. Setting an override bypasses the benchmark winner. Saving updates KV config without a redeploy."
           />
         </CardHeader>
-        <CardContent className="grid gap-4 p-4 pt-0 lg:grid-cols-[1fr_auto] lg:items-end">
-          <ModelCombobox
-            label="Model"
-            models={modelOptions}
-            value={selectedModel}
-            onValueChange={setSelectedModel}
-            isLoading={openRouterModelsQuery.isLoading || classifierModelQuery.isLoading}
-            error={classifierModelError ?? openRouterModelsError}
-            placeholder={classifierModelQuery.data?.defaultModel ?? 'Select classifier model'}
-            className="w-full"
-          />
-          <Button
-            type="button"
-            onClick={() => saveMutation.mutate(selectedModel)}
-            disabled={!hasModelChange || saveMutation.isPending}
-            className="w-full lg:w-auto"
-          >
-            <Save className="size-4" />
-            Save model
-          </Button>
+        <CardContent className="flex flex-col gap-4 p-4 pt-0">
+          <dl className="grid grid-cols-[auto_1fr] gap-x-4 gap-y-1 text-sm">
+            <dt className="text-muted-foreground">Effective model</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.data?.model ?? <Skeleton className="h-4 w-48" />}
+            </dd>
+            <dt className="text-muted-foreground">Override</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.isLoading ? (
+                <Skeleton className="h-4 w-48" />
+              ) : (
+                (classifierModelQuery.data?.override ?? 'none')
+              )}
+            </dd>
+            <dt className="text-muted-foreground">Benchmark winner</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.isLoading ? (
+                <Skeleton className="h-4 w-48" />
+              ) : (
+                (classifierModelQuery.data?.benchmarkWinner ?? 'not yet published')
+              )}
+            </dd>
+          </dl>
+          <div className="grid gap-4 lg:grid-cols-[1fr_auto_auto] lg:items-end">
+            <ModelCombobox
+              label="Set override"
+              models={modelOptions}
+              value={selectedModel}
+              onValueChange={setSelectedModel}
+              isLoading={openRouterModelsQuery.isLoading || classifierModelQuery.isLoading}
+              error={classifierModelError ?? openRouterModelsError}
+              placeholder={classifierModelQuery.data?.defaultModel ?? 'Select classifier model'}
+              className="w-full"
+            />
+            <Button
+              type="button"
+              onClick={() => saveMutation.mutate(selectedModel)}
+              disabled={!hasModelChange || saveMutation.isPending}
+              className="w-full lg:w-auto"
+            >
+              <Save className="size-4" />
+              Save override
+            </Button>
+            {currentOverride !== null ? (
+              <Button
+                type="button"
+                variant="outline"
+                onClick={() => saveMutation.mutate(null)}
+                disabled={saveMutation.isPending}
+                className="w-full lg:w-auto text-destructive hover:text-destructive"
+              >
+                Clear override
+              </Button>
+            ) : null}
+          </div>
         </CardContent>
       </Card>
 
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 8d7f2dc881..e2509c7649 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -7,6 +7,7 @@ import {
   type BenchmarkConfig,
   type BenchmarkRun,
   type BenchmarkModelSummary,
+  type ReasoningEffort,
 } from '@kilocode/auto-routing-contracts';
 import React, { useCallback, useEffect, useRef, useState } from 'react';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
@@ -19,6 +20,13 @@ import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
 import { Checkbox } from '@/components/ui/checkbox';
 import { Input } from '@/components/ui/input';
 import { Label } from '@/components/ui/label';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
 import { Skeleton } from '@/components/ui/skeleton';
 import {
   Table,
@@ -96,11 +104,17 @@ async function fetchBenchmarkRuns() {
   return parseAdminResponse(response, BenchmarkRunsResponseSchema);
 }
 
-async function startBenchmarkRun(kind: 'classifier' | 'decider') {
+async function startBenchmarkRun({
+  kind,
+  force,
+}: {
+  kind: 'classifier' | 'decider';
+  force: boolean;
+}) {
   const response = await fetch('/admin/api/auto-routing/benchmark-runs', {
     method: 'POST',
     headers: { 'content-type': 'application/json' },
-    body: JSON.stringify({ kind }),
+    body: JSON.stringify({ kind, force }),
   });
   return parseAdminResponse(response, StartBenchmarkRunResponseSchema);
 }
@@ -122,6 +136,7 @@ type DeciderModelRow = {
   chat_completions: boolean;
   responses: boolean;
   messages: boolean;
+  reasoningEffort: ReasoningEffort | null;
 };
 
 function configToFormState(config: BenchmarkConfig): {
@@ -138,6 +153,7 @@ function configToFormState(config: BenchmarkConfig): {
       chat_completions: m.supportedApiKinds.includes('chat_completions'),
       responses: m.supportedApiKinds.includes('responses'),
       messages: m.supportedApiKinds.includes('messages'),
+      reasoningEffort: m.reasoningEffort ?? null,
     })),
     minAccuracy: config.minAccuracy,
     maxConcurrency: config.maxConcurrency,
@@ -163,6 +179,7 @@ function formStateToConfig(
       return {
         id: row.id.trim(),
         supportedApiKinds: kinds.length ? kinds : ['chat_completions' as const],
+        reasoningEffort: row.reasoningEffort ?? null,
       };
     });
   const benchmarkUserId = state.benchmarkUserId.trim();
@@ -221,7 +238,13 @@ function BenchmarkConfigEditor({
       ...prev,
       deciderModels: [
         ...prev.deciderModels,
-        { id: '', chat_completions: true, responses: false, messages: false },
+        {
+          id: '',
+          chat_completions: true,
+          responses: false,
+          messages: false,
+          reasoningEffort: null,
+        },
       ],
     }));
   }, []);
@@ -276,6 +299,7 @@ function BenchmarkConfigEditor({
                   <TableHead className="w-32 text-center">chat_completions</TableHead>
                   <TableHead className="w-24 text-center">responses</TableHead>
                   <TableHead className="w-24 text-center">messages</TableHead>
+                  <TableHead className="w-36">Reasoning effort</TableHead>
                   <TableHead className="w-12" />
                 </TableRow>
               </TableHeader>
@@ -318,6 +342,30 @@ function BenchmarkConfigEditor({
                         aria-label={`Model ${index + 1} supports messages`}
                       />
                     </TableCell>
+                    <TableCell className="py-2">
+                      <Select
+                        value={row.reasoningEffort ?? 'none'}
+                        onValueChange={value =>
+                          handleDeciderRowChange(index, {
+                            reasoningEffort: value === 'none' ? null : (value as ReasoningEffort),
+                          })
+                        }
+                      >
+                        <SelectTrigger
+                          className="h-8 text-xs"
+                          aria-label={`Model ${index + 1} reasoning effort`}
+                        >
+                          <SelectValue />
+                        </SelectTrigger>
+                        <SelectContent>
+                          <SelectItem value="none">None</SelectItem>
+                          <SelectItem value="minimal">minimal</SelectItem>
+                          <SelectItem value="low">low</SelectItem>
+                          <SelectItem value="medium">medium</SelectItem>
+                          <SelectItem value="high">high</SelectItem>
+                        </SelectContent>
+                      </Select>
+                    </TableCell>
                     <TableCell className="py-2">
                       <Button
                         type="button"
@@ -661,6 +709,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
 
 export function BenchmarksSection() {
   const queryClient = useQueryClient();
+  const [forceRerun, setForceRerun] = useState(false);
 
   const configQuery = useQuery({
     queryKey: ['auto-routing', 'benchmark-config'],
@@ -690,10 +739,15 @@ export function BenchmarksSection() {
 
   const startRunMutation = useMutation({
     mutationFn: startBenchmarkRun,
-    onSuccess: (data, kind) => {
-      toast.success(
-        `${kind === 'classifier' ? 'Classifier' : 'Decider'} benchmark started — ${data.enqueuedModels} models enqueued`
-      );
+    onSuccess: (data, variables) => {
+      const kindLabel = variables.kind === 'classifier' ? 'Classifier' : 'Decider';
+      if (data.enqueuedModels === 0) {
+        toast.success(`All models already have results — republished from existing data`);
+      } else {
+        toast.success(
+          `${kindLabel} benchmark started — ${data.enqueuedModels} models enqueued, ${data.skippedModels.length} skipped`
+        );
+      }
       void queryClient.invalidateQueries({ queryKey: ['auto-routing', 'benchmark-runs'] });
     },
     onError: (error: unknown) => {
@@ -745,30 +799,46 @@ export function BenchmarksSection() {
         <CardHeader className="p-4 pb-2">
           <CardTitle className="text-base">Run Benchmark</CardTitle>
         </CardHeader>
-        <CardContent className="flex flex-wrap gap-2 p-4 pt-0">
-          <Button
-            type="button"
-            variant="outline"
-            disabled={anyRunning}
-            onClick={() => startRunMutation.mutate('classifier')}
-          >
-            <Play className="size-4" />
-            Run classifier benchmark
-          </Button>
-          <Button
-            type="button"
-            variant="outline"
-            disabled={anyRunning}
-            onClick={() => startRunMutation.mutate('decider')}
-          >
-            <Play className="size-4" />
-            Run decider benchmark
-          </Button>
-          {hasRunningRun ? (
-            <p className="text-muted-foreground self-center text-xs">
-              A benchmark is running — refreshing every 30 s
-            </p>
-          ) : null}
+        <CardContent className="flex flex-col gap-3 p-4 pt-0">
+          <p className="text-muted-foreground text-xs">
+            Runs are triggered manually. Models with existing results are skipped unless
+            &quot;Re-run models with existing results&quot; is checked.
+          </p>
+          <div className="flex items-center gap-2">
+            <Checkbox
+              id="force-rerun"
+              checked={forceRerun}
+              onCheckedChange={checked => setForceRerun(checked === true)}
+            />
+            <Label htmlFor="force-rerun" className="text-sm font-normal cursor-pointer">
+              Re-run models with existing results
+            </Label>
+          </div>
+          <div className="flex flex-wrap gap-2">
+            <Button
+              type="button"
+              variant="outline"
+              disabled={anyRunning}
+              onClick={() => startRunMutation.mutate({ kind: 'classifier', force: forceRerun })}
+            >
+              <Play className="size-4" />
+              Run classifier benchmark
+            </Button>
+            <Button
+              type="button"
+              variant="outline"
+              disabled={anyRunning}
+              onClick={() => startRunMutation.mutate({ kind: 'decider', force: forceRerun })}
+            >
+              <Play className="size-4" />
+              Run decider benchmark
+            </Button>
+            {hasRunningRun ? (
+              <p className="text-muted-foreground self-center text-xs">
+                A benchmark is running — refreshing every 30 s
+              </p>
+            ) : null}
+          </div>
         </CardContent>
       </Card>
 
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
index f98da3baf2..adc58001b3 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
@@ -14,6 +14,8 @@ global.fetch = mockFetch;
 
 const classifierModelResponse = {
   model: 'google/gemini-2.5-flash-lite',
+  override: null,
+  benchmarkWinner: null,
   defaultModel: 'google/gemini-2.5-flash-lite',
 };
 
@@ -86,6 +88,28 @@ describe('auto routing admin client', () => {
     );
   });
 
+  it('clears the classifier model override by sending null', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(classifierModelResponse),
+    });
+
+    await updateAutoRoutingClassifierModel(null);
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://auto-routing.example.com/admin/classifier-model',
+      {
+        method: 'PUT',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+          'content-type': 'application/json',
+        },
+        body: JSON.stringify({ model: null }),
+      }
+    );
+  });
+
   it('queries classifier analytics for the selected period', async () => {
     mockFetch.mockResolvedValue({
       status: 200,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
index fe67e003d2..1af226d8c7 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
@@ -65,7 +65,7 @@ export function getAutoRoutingClassifierModel() {
   );
 }
 
-export function updateAutoRoutingClassifierModel(model: string) {
+export function updateAutoRoutingClassifierModel(model: string | null) {
   return fetchAutoRoutingAdmin(
     '/admin/classifier-model',
     {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 673119734d..164525eb19 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -25,6 +25,7 @@ const configResponse = {
           | 'responses'
           | 'messages'
         )[],
+        reasoningEffort: null,
       },
     ],
     minAccuracy: 0.8,
@@ -43,6 +44,7 @@ const configResponse = {
           | 'responses'
           | 'messages'
         )[],
+        reasoningEffort: null,
       },
     ],
     minAccuracy: 0.8,
@@ -158,16 +160,16 @@ describe('auto routing benchmark admin client', () => {
     });
   });
 
-  it('starts a benchmark run with the given kind', async () => {
+  it('starts a benchmark run with the given kind and force flag', async () => {
     mockFetch.mockResolvedValue({
       status: 200,
       ok: true,
-      json: () => Promise.resolve({ runId: 'run-2', enqueuedModels: 3 }),
+      json: () => Promise.resolve({ runId: 'run-2', enqueuedModels: 3, skippedModels: [] }),
     });
 
-    await expect(startBenchmarkRun('classifier')).resolves.toEqual({
+    await expect(startBenchmarkRun('classifier', false)).resolves.toEqual({
       status: 200,
-      body: { runId: 'run-2', enqueuedModels: 3 },
+      body: { runId: 'run-2', enqueuedModels: 3, skippedModels: [] },
     });
 
     expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
@@ -176,7 +178,27 @@ describe('auto routing benchmark admin client', () => {
         authorization: 'Bearer test-internal-secret',
         'content-type': 'application/json',
       },
-      body: JSON.stringify({ kind: 'classifier' }),
+      body: JSON.stringify({ kind: 'classifier', force: false }),
+    });
+  });
+
+  it('starts a benchmark run with force=true to re-run existing models', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      // A forced run re-executes every model, so the worker reports nothing skipped.
+      json: () => Promise.resolve({ runId: 'run-3', enqueuedModels: 3, skippedModels: [] }),
+    });
+
+    await startBenchmarkRun('decider', true);
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ kind: 'decider', force: true }),
     });
   });
 
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
index d12990de14..71780f149f 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -86,13 +86,13 @@ export function listBenchmarkRuns() {
   return fetchBenchmarkAdmin('/admin/runs', { method: 'GET' }, BenchmarkRunsResponseSchema);
 }
 
-export function startBenchmarkRun(kind: BenchmarkKind) {
+export function startBenchmarkRun(kind: BenchmarkKind, force: boolean) {
   return fetchBenchmarkAdmin(
     '/admin/runs',
     {
       method: 'POST',
       headers: { 'content-type': 'application/json' },
-      body: JSON.stringify({ kind }),
+      body: JSON.stringify({ kind, force }),
     },
     StartBenchmarkRunResponseSchema
   );
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index dee2b62554..67799c87e8 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -40,7 +40,7 @@ function runCaseSerialized(params) {
   return next;
 }
 
-function runCase({ model, prompt, kiloToken, timeoutMs }) {
+function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
   return new Promise(resolve => {
     void (async () => {
       const dir = await mkdtemp(join(tmpdir(), 'kilo-bench-'));
@@ -50,19 +50,19 @@ function runCase({ model, prompt, kiloToken, timeoutMs }) {
       let stdoutTruncated = false;
       let stderrTail = '';
 
-      const child = spawn(
-        'kilo',
-        ['run', '--format', 'json', '--auto', '-m', `kilo/${model}`, prompt],
-        {
-          cwd: dir,
-          env: {
-            ...process.env,
-            KILO_AUTH_CONTENT: JSON.stringify({ kilo: { type: 'api', key: kiloToken } }),
-            NO_COLOR: '1',
-          },
-          stdio: ['ignore', 'pipe', 'pipe'],
-        }
-      );
+      const args = ['run', '--format', 'json', '--auto', '-m', `kilo/${model}`];
+      // Reasoning effort: forwarded as the CLI's provider-specific variant.
+      if (typeof variant === 'string' && variant.length > 0) args.push('--variant', variant);
+      args.push(prompt);
+      const child = spawn('kilo', args, {
+        cwd: dir,
+        env: {
+          ...process.env,
+          KILO_AUTH_CONTENT: JSON.stringify({ kilo: { type: 'api', key: kiloToken } }),
+          NO_COLOR: '1',
+        },
+        stdio: ['ignore', 'pipe', 'pipe'],
+      });
 
       const killTimer = setTimeout(() => {
         child.kill('SIGKILL');
@@ -129,7 +129,7 @@ const server = createServer((req, res) => {
         return;
       }
 
-      const { model, prompt, kiloToken } = parsed ?? {};
+      const { model, prompt, kiloToken, variant } = parsed ?? {};
       const timeoutMs =
         typeof parsed?.timeoutMs === 'number' && parsed.timeoutMs > 0
           ? parsed.timeoutMs
@@ -145,7 +145,11 @@ const server = createServer((req, res) => {
       }
 
       try {
-        const result = await runCaseSerialized({ model, prompt, kiloToken, timeoutMs });
+        if (variant !== undefined && variant !== null && typeof variant !== 'string') {
+          sendJson(res, 400, { error: 'variant must be a string when provided' });
+          return;
+        }
+        const result = await runCaseSerialized({ model, prompt, kiloToken, timeoutMs, variant });
         sendJson(res, 200, result);
       } catch (err) {
         sendJson(res, 500, { error: err instanceof Error ? err.message : 'run failed' });
diff --git a/services/auto-routing-benchmark/migrations/0002_carried_results.sql b/services/auto-routing-benchmark/migrations/0002_carried_results.sql
new file mode 100644
index 0000000000..d1c66da8d3
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/0002_carried_results.sql
@@ -0,0 +1 @@
+ALTER TABLE benchmark_runs ADD COLUMN runtime_json TEXT;
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 627bbea1bb..abc1a3b79c 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -48,7 +48,7 @@ export const startRunHandler: Handler<HonoEnv> = async c => {
   }
   const parsed = StartBenchmarkRunRequestSchema.safeParse(body);
   if (!parsed.success) return c.json({ error: 'Invalid run request' }, 400);
-  return c.json(await startRun(c.env, parsed.data.kind));
+  return c.json(await startRun(c.env, parsed.data.kind, { force: parsed.data.force }));
 };
 
 export const getRoutingTableHandler: Handler<HonoEnv> = async c => {
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index 8d7e370d0f..c766bab3e3 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -36,9 +36,15 @@ type ContainerRunResponse = {
  */
 export async function runDeciderCaseViaCli(
   env: Env,
-  params: { instanceName: string; model: string; benchCase: DeciderCase; kiloToken: string }
+  params: {
+    instanceName: string;
+    model: string;
+    benchCase: DeciderCase;
+    kiloToken: string;
+    reasoningEffort?: string | null;
+  }
 ): Promise<CliRunResult> {
-  const { instanceName, model, benchCase, kiloToken } = params;
+  const { instanceName, model, benchCase, kiloToken, reasoningEffort } = params;
   const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(instanceName));
   const prompt = `${benchCase.systemPrompt}\n\n${benchCase.userPrompt}${FINAL_ANSWER_SUFFIX}`;
 
@@ -47,7 +53,13 @@ export async function runDeciderCaseViaCli(
     new Request('http://container/run', {
       method: 'POST',
       headers: { 'content-type': 'application/json' },
-      body: JSON.stringify({ model, prompt, kiloToken, timeoutMs: DECIDER_CLI_TIMEOUT_MS }),
+      body: JSON.stringify({
+        model,
+        prompt,
+        kiloToken,
+        timeoutMs: DECIDER_CLI_TIMEOUT_MS,
+        variant: reasoningEffort ?? null,
+      }),
     })
   );
 
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 30b1e6a072..9e04172801 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -9,13 +9,26 @@ export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
     'qwen/qwen3.7-plus',
   ],
   deciderModels: [
-    { id: 'google/gemini-2.5-flash-lite', supportedApiKinds: ['chat_completions'] },
-    { id: 'google/gemini-2.5-flash', supportedApiKinds: ['chat_completions'] },
-    { id: 'qwen/qwen3.7-plus', supportedApiKinds: ['chat_completions'] },
-    { id: 'openai/gpt-5.5', supportedApiKinds: ['chat_completions', 'responses'] },
+    {
+      id: 'google/gemini-2.5-flash-lite',
+      supportedApiKinds: ['chat_completions'],
+      reasoningEffort: null,
+    },
+    {
+      id: 'google/gemini-2.5-flash',
+      supportedApiKinds: ['chat_completions'],
+      reasoningEffort: null,
+    },
+    { id: 'qwen/qwen3.7-plus', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
+    {
+      id: 'openai/gpt-5.5',
+      supportedApiKinds: ['chat_completions', 'responses'],
+      reasoningEffort: null,
+    },
     {
       id: 'anthropic/claude-sonnet-4.6',
       supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+      reasoningEffort: null,
     },
   ],
   minAccuracy: 0.7,
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 163786350b..bce49d937c 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -57,6 +57,7 @@ describe('mapRunRow', () => {
       started_at: '2026-06-10T04:10:00.000Z',
       completed_at: '2026-06-10T04:25:00.000Z',
       config_json: '{}',
+      runtime_json: null,
       error: null,
     };
     const summaries: BenchmarkModelSummary[] = [
@@ -90,6 +91,7 @@ describe('mapRunRow', () => {
       started_at: '2026-06-11T05:10:00.000Z',
       completed_at: null,
       config_json: '{}',
+      runtime_json: null,
       error: null,
     };
     const result = mapRunRow(runRow, []);
@@ -105,6 +107,7 @@ describe('mapRunRow', () => {
       started_at: '2026-06-01T04:10:00.000Z',
       completed_at: '2026-06-01T04:20:00.000Z',
       config_json: '{}',
+      runtime_json: null,
       error: null,
     };
     const runRow2 = {
@@ -114,6 +117,7 @@ describe('mapRunRow', () => {
       started_at: '2026-06-02T05:10:00.000Z',
       completed_at: null,
       config_json: '{}',
+      runtime_json: null,
       error: 'timed out',
     };
     const summariesForRun1: BenchmarkModelSummary[] = [
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 48130a90fd..9a8450b1a1 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -23,6 +23,10 @@ export type RunRow = {
   started_at: string;
   completed_at: string | null;
   config_json: string;
+  // Run-scoped execution state: which models were actually enqueued and the
+  // summaries carried forward for models skipped because they already had
+  // results. Null on rows created before the column existed.
+  runtime_json: string | null;
   error: string | null;
 };
 
@@ -65,14 +69,20 @@ export function mapRunRow(row: RunRow, summaries: BenchmarkModelSummary[]): Benc
 
 export async function insertRun(
   db: D1Database,
-  run: { id: string; kind: BenchmarkKind; startedAt: string; configJson: string }
+  run: {
+    id: string;
+    kind: BenchmarkKind;
+    startedAt: string;
+    configJson: string;
+    runtimeJson: string;
+  }
 ): Promise<void> {
   await db
     .prepare(
-      `INSERT INTO benchmark_runs (id, kind, status, started_at, config_json)
-       VALUES (?1, ?2, 'running', ?3, ?4)`
+      `INSERT INTO benchmark_runs (id, kind, status, started_at, config_json, runtime_json)
+       VALUES (?1, ?2, 'running', ?3, ?4, ?5)`
     )
-    .bind(run.id, run.kind, run.startedAt, run.configJson)
+    .bind(run.id, run.kind, run.startedAt, run.configJson, run.runtimeJson)
     .run();
 }
 
@@ -259,3 +269,49 @@ export async function saveConfigRow(
     .bind(configJson, updatedAt, updatedBy)
     .run();
 }
+
+type LatestSummaryRow = {
+  run_id: string;
+  started_at: string;
+  model: string;
+  tier: string;
+  accuracy: number;
+  avg_cost_usd: number | null;
+  avg_latency_ms: number;
+  p50_latency_ms: number | null;
+  cases: number;
+  errors: number;
+};
+
+// For each model, returns every tier summary from the newest COMPLETED run of
+// `kind` that included it. Rows come back newest run first, so the first row
+// seen for a model pins its run; tiers from older runs are never mixed in.
+export async function getLatestSummariesByModel(
+  db: D1Database,
+  kind: BenchmarkKind
+): Promise<Map<string, BenchmarkModelSummary[]>> {
+  const statement = db
+    .prepare(
+      `SELECT ms.run_id, r.started_at, ms.model, ms.tier, ms.accuracy, ms.avg_cost_usd,
+              ms.avg_latency_ms, ms.p50_latency_ms, ms.cases, ms.errors
+       FROM model_summaries ms
+       JOIN benchmark_runs r ON r.id = ms.run_id
+       WHERE r.kind = ?1 AND r.status = 'completed'
+       ORDER BY r.started_at DESC`
+    )
+    .bind(kind);
+  const { results } = await statement.all<LatestSummaryRow>();
+
+  const pinnedRun = new Map<string, string>();
+  const grouped = new Map<string, BenchmarkModelSummary[]>();
+  for (const row of results) {
+    const pinned = pinnedRun.get(row.model);
+    if (pinned === undefined) {
+      pinnedRun.set(row.model, row.run_id);
+      grouped.set(row.model, [mapSummaryRow(row)]);
+    } else if (pinned === row.run_id) {
+      grouped.get(row.model)?.push(mapSummaryRow(row));
+    }
+  }
+  return grouped;
+}
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index a1b7a0664f..6259f9866c 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -10,7 +10,7 @@ import {
   debugCliHandler,
 } from './admin';
 import type { HonoEnv } from './hono-env';
-import { processJob, startRun, type BenchmarkJobMessage } from './run';
+import { processJob, type BenchmarkJobMessage } from './run';
 
 // Re-exported so the Durable Object class binding (BENCH_RUNNER) can find it.
 export { BenchRunnerContainer } from './bench-runner-container';
@@ -29,14 +29,8 @@ app.post('/admin/debug-cli', debugCliHandler);
 app.notFound(createNotFoundHandler());
 app.onError(createErrorHandler());
 
-const DECIDER_CRON = '10 5 * * 1';
-
 export default {
   fetch: app.fetch,
-  async scheduled(controller: ScheduledController, env: Env, ctx: ExecutionContext): Promise<void> {
-    const kind = controller.cron === DECIDER_CRON ? 'decider' : 'classifier';
-    ctx.waitUntil(startRun(env, kind));
-  },
   async queue(batch: MessageBatch<BenchmarkJobMessage>, env: Env): Promise<void> {
     for (const message of batch.messages) {
       // Deliberately no try/catch: a throw from processJob (transient token,
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 5e8e23e7d4..c08ef576be 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -5,9 +5,13 @@ import { buildRoutingTable } from './routing-table-builder';
 const BASE_CONFIG: BenchmarkConfig = {
   classifierModels: ['some/classifier'],
   deciderModels: [
-    { id: 'model/cheap', supportedApiKinds: ['chat_completions'] },
-    { id: 'model/expensive', supportedApiKinds: ['chat_completions', 'responses'] },
-    { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'] },
+    { id: 'model/cheap', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
+    {
+      id: 'model/expensive',
+      supportedApiKinds: ['chat_completions', 'responses'],
+      reasoningEffort: null,
+    },
+    { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'], reasoningEffort: null },
   ],
   minAccuracy: 0.7,
   maxConcurrency: 4,
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 71bfa772d3..d6d6e625a3 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -17,9 +17,7 @@ export function buildRoutingTable(params: {
   summaries: BenchmarkModelSummary[];
 }): RoutingTable {
   const { runId, generatedAt, config, summaries } = params;
-  const apiKindsByModel = new Map(
-    config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const)
-  );
+  const modelConfigById = new Map(config.deciderModels.map(m => [m.id, m] as const));
 
   const tierCandidates = (t: DifficultyTier) =>
     rankCandidates(
@@ -30,7 +28,10 @@ export function buildRoutingTable(params: {
           accuracy: s.accuracy,
           avgCostUsd: s.avgCostUsd ?? 0,
           // Spread into a mutable array so tsgo is happy with the readonly type.
-          supportedApiKinds: [...(apiKindsByModel.get(s.model) ?? (['chat_completions'] as const))],
+          supportedApiKinds: [
+            ...(modelConfigById.get(s.model)?.supportedApiKinds ?? (['chat_completions'] as const)),
+          ],
+          reasoningEffort: modelConfigById.get(s.model)?.reasoningEffort ?? null,
         })),
       config.minAccuracy
     );
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index a9c57baedf..64f5964c52 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 import type { CaseResultRow } from './db';
-import { chunkArray, runCasesWithConcurrency, summarize } from './run';
+import { chunkArray, pickClassifierWinner, runCasesWithConcurrency, summarize } from './run';
 
 function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
   return {
@@ -231,3 +231,44 @@ describe('chunkArray', () => {
     expect(chunkArray([], 10)).toEqual([]);
   });
 });
+
+describe('pickClassifierWinner', () => {
+  const summary = (model: string, accuracy: number, avgCostUsd: number | null) => ({
+    model,
+    tier: '*' as const,
+    accuracy,
+    avgCostUsd,
+    avgLatencyMs: 100,
+    p50LatencyMs: 90,
+    cases: 36,
+    errors: 0,
+  });
+
+  it('picks the cheapest model meeting the threshold', () => {
+    const winner = pickClassifierWinner(
+      [summary('pricy', 0.95, 0.01), summary('cheap', 0.9, 0.001), summary('weak', 0.5, 0.0001)],
+      0.7
+    );
+    expect(winner?.model).toBe('cheap');
+  });
+
+  it('falls back to highest accuracy when nothing meets the threshold', () => {
+    const winner = pickClassifierWinner([summary('a', 0.5, 0.001), summary('b', 0.6, 0.01)], 0.9);
+    expect(winner?.model).toBe('b');
+  });
+
+  it('treats null cost as most expensive', () => {
+    const winner = pickClassifierWinner(
+      [summary('nocost', 0.95, null), summary('cheap', 0.9, 0.001)],
+      0.7
+    );
+    expect(winner?.model).toBe('cheap');
+  });
+
+  it('ignores decider-tier summaries and returns null when nothing is graded', () => {
+    expect(
+      pickClassifierWinner([{ ...summary('m', 1, 0.001), tier: 'low' as const }], 0.7)
+    ).toBeNull();
+    expect(pickClassifierWinner([], 0.7)).toBeNull();
+  });
+});
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 43435a7449..337c18e901 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -1,10 +1,13 @@
 import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
 import {
   BenchmarkConfigSchema,
+  BenchmarkModelSummarySchema,
+  CLASSIFIER_WINNER_KV_KEY,
   ROUTING_TABLE_KV_KEY,
   type BenchmarkConfig,
   type BenchmarkKind,
   type BenchmarkModelSummary,
+  type ClassifierWinner,
 } from '@kilocode/auto-routing-contracts';
 import { formatError } from '@kilocode/worker-utils';
 import * as z from 'zod';
@@ -14,6 +17,7 @@ import { DECIDER_CASES } from './datasets/decider-cases';
 import {
   countCaseResults,
   getCaseResults,
+  getLatestSummariesByModel,
   getRun,
   insertRun,
   markRunCompleted,
@@ -61,10 +65,20 @@ export function chunkArray<T>(items: readonly T[], size: number): T[][] {
 
 const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
 
+// Per-run execution state stored in benchmark_runs.runtime_json: which models
+// were actually enqueued, and the latest prior summaries carried forward for
+// models skipped because they already had results.
+const RunRuntimeSchema = z.object({
+  enqueuedModels: z.array(z.string()),
+  carriedSummaries: z.array(BenchmarkModelSummarySchema),
+});
+type RunRuntime = z.infer<typeof RunRuntimeSchema>;
+
 export async function startRun(
   env: Env,
-  kind: BenchmarkKind
-): Promise<{ runId: string; enqueuedModels: number }> {
+  kind: BenchmarkKind,
+  options: { force?: boolean } = {}
+): Promise<{ runId: string; enqueuedModels: number; skippedModels: string[] }> {
   // Stale-run sweeper: anything still 'running' after 6h is dead (queue
   // retries exhausted); fail it so the admin panel shows the truth.
   await markStaleRunsFailed(
@@ -76,35 +90,57 @@ export async function startRun(
   const models =
     kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
 
+  // Models with prior results are skipped (their latest summaries are carried
+  // into this run's aggregate) unless the admin forces a full re-run.
+  const priorSummaries = options.force
+    ? new Map<string, BenchmarkModelSummary[]>()
+    : await getLatestSummariesByModel(env.BENCH_DB, kind);
+  const enqueuedModels = models.filter(m => !priorSummaries.has(m));
+  const skippedModels = models.filter(m => priorSummaries.has(m));
+  const carriedSummaries = skippedModels.flatMap(m => priorSummaries.get(m) ?? []);
+
   // Decider runs execute through the kilo CLI under a real Kilo user's
   // identity/billing. Fail fast (before inserting the run) when that user
   // isn't configured so the admin POST surfaces the misconfiguration.
-  if (kind === 'decider' && !config.benchmarkUserId) {
+  if (kind === 'decider' && enqueuedModels.length > 0 && !config.benchmarkUserId) {
     throw new Error(
       'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
     );
   }
 
   const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
+  const runtime: RunRuntime = { enqueuedModels, carriedSummaries };
   await insertRun(env.BENCH_DB, {
     id: runId,
     kind,
     startedAt: new Date().toISOString(),
     configJson: JSON.stringify(config),
+    runtimeJson: JSON.stringify(runtime),
   });
 
+  console.log(
+    JSON.stringify({ event: 'benchmark_run_started', runId, kind, enqueuedModels, skippedModels })
+  );
+
+  if (enqueuedModels.length === 0) {
+    // Everything already has results: complete immediately and republish the
+    // aggregate so config-only changes (model removed, threshold tweaked)
+    // take effect without re-running any model.
+    await finalizeRunIfComplete(env, runId, kind);
+    return { runId, enqueuedModels: 0, skippedModels };
+  }
+
   if (kind === 'classifier') {
     await env.BENCH_QUEUE.sendBatch(
-      models.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }))
+      enqueuedModels.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }))
     );
-    console.log(JSON.stringify({ event: 'benchmark_run_started', runId, kind, models }));
-    return { runId, enqueuedModels: models.length };
+    return { runId, enqueuedModels: enqueuedModels.length, skippedModels };
   }
 
   // Decider: one message per (model, chunk) so each queue invocation stays
-  // bounded. finalizeRunIfComplete still expects models × DECIDER_CASES rows.
+  // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES rows.
   const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
-  const messages = models.flatMap(model =>
+  const messages = enqueuedModels.flatMap(model =>
     chunks.map((chunkCases, chunk) => ({
       body: {
         runId,
@@ -116,10 +152,7 @@ export async function startRun(
     }))
   );
   await env.BENCH_QUEUE.sendBatch(messages);
-  console.log(
-    JSON.stringify({ event: 'benchmark_run_started', runId, kind, models, chunks: chunks.length })
-  );
-  return { runId, enqueuedModels: models.length };
+  return { runId, enqueuedModels: enqueuedModels.length, skippedModels };
 }
 
 export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
@@ -215,6 +248,8 @@ async function processDeciderJob(
   // queue retries the message. The token is never logged.
   const kiloToken = await fetchBenchmarkUserToken(env, config.benchmarkUserId);
   const instanceName = `${message.runId}:${message.model}:${message.chunk ?? 0}`;
+  const reasoningEffort =
+    config.deciderModels.find(m => m.id === message.model)?.reasoningEffort ?? null;
 
   // The CLI performs a one-time sqlite migration on each fresh container
   // instance; concurrent first runs against the migrating database end with
@@ -246,6 +281,7 @@ async function processDeciderJob(
         model: message.model,
         benchCase,
         kiloToken,
+        reasoningEffort,
       });
       // The CLI occasionally ends a session with no assistant text at all
       // (transient empty completion: a lone step_finish with cost 0). Mirror
@@ -258,6 +294,7 @@ async function processDeciderJob(
           model: message.model,
           benchCase,
           kiloToken,
+          reasoningEffort,
         });
         retry.costUsd =
           retry.costUsd === null && result.costUsd === null
@@ -343,11 +380,27 @@ function failedRow(
   };
 }
 
-async function getRunConfig(env: Env, runId: string): Promise<BenchmarkConfig> {
-  // Snapshot taken at startRun time so a mid-run admin edit can't skew it.
+async function getRunState(
+  env: Env,
+  runId: string
+): Promise<{ config: BenchmarkConfig; runtime: RunRuntime }> {
+  // Snapshots taken at startRun time so a mid-run admin edit can't skew them.
   const run = await getRun(env.BENCH_DB, runId);
   if (!run) throw new Error(`unknown run ${runId}`);
-  return BenchmarkConfigSchema.parse(JSON.parse(run.config_json));
+  const config = BenchmarkConfigSchema.parse(JSON.parse(run.config_json));
+  const runtime = run.runtime_json
+    ? RunRuntimeSchema.parse(JSON.parse(run.runtime_json))
+    : // Rows written before runtime_json existed ran every configured model.
+      {
+        enqueuedModels:
+          run.kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id),
+        carriedSummaries: [],
+      };
+  return { config, runtime };
+}
+
+async function getRunConfig(env: Env, runId: string): Promise<BenchmarkConfig> {
+  return (await getRunState(env, runId)).config;
 }
 
 export async function runCasesWithConcurrency<T>(
@@ -365,11 +418,9 @@ export async function runCasesWithConcurrency<T>(
 }
 
 async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKind): Promise<void> {
-  const config = await getRunConfig(env, runId);
-  const models =
-    kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
+  const { config, runtime } = await getRunState(env, runId);
   const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
-  const expected = models.length * caseCount;
+  const expected = runtime.enqueuedModels.length * caseCount;
   const actual = await countCaseResults(env.BENCH_DB, runId);
 
   if (actual < expected) return;
@@ -379,10 +430,29 @@ async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKin
   // is a batched delete+insert; markRunCompleted guards on status='running';
   // KV put is idempotent.
   const rows = await getCaseResults(env.BENCH_DB, runId);
-  const summaries = summarize(rows, kind);
+  // Fresh results plus the carried-forward summaries of skipped models.
+  const summaries = [...summarize(rows, kind), ...runtime.carriedSummaries];
   await replaceModelSummaries(env.BENCH_DB, runId, summaries);
   await markRunCompleted(env.BENCH_DB, runId);
 
+  if (kind === 'classifier') {
+    const winner = pickClassifierWinner(summaries, config.minAccuracy);
+    if (winner) {
+      const payload: ClassifierWinner = {
+        model: winner.model,
+        runId,
+        accuracy: winner.accuracy,
+        generatedAt: new Date().toISOString(),
+      };
+      await env.AUTO_ROUTING_CONFIG.put(CLASSIFIER_WINNER_KV_KEY, JSON.stringify(payload));
+      console.log(
+        JSON.stringify({ event: 'classifier_winner_published', runId, model: winner.model })
+      );
+    } else {
+      console.warn(JSON.stringify({ event: 'classifier_winner_skipped', runId }));
+    }
+  }
+
   if (kind === 'decider') {
     const generatedAt = new Date().toISOString();
     try {
@@ -414,6 +484,23 @@ async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKin
   );
 }
 
+// Same bang-for-buck rule as the routing table, applied to classifier
+// summaries (tier '*'): cheapest candidate meeting the accuracy threshold,
+// else the most accurate one. Null when there are no graded summaries.
+export function pickClassifierWinner(
+  summaries: BenchmarkModelSummary[],
+  minAccuracy: number
+): BenchmarkModelSummary | null {
+  const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
+  if (graded.length === 0) return null;
+  const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
+  const meeting = graded.filter(s => s.accuracy >= minAccuracy);
+  if (meeting.length > 0) {
+    return meeting.toSorted((a, b) => cost(a) - cost(b) || b.accuracy - a.accuracy)[0];
+  }
+  return graded.toSorted((a, b) => b.accuracy - a.accuracy || cost(a) - cost(b))[0];
+}
+
 export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
   // Group by "model tier-key" using a plain reduce so this works in all runtimes.
   // Classifier rows use '*' as the tier (no tiering); decider rows use the actual tier
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 0b02d01c4a..d48f588d52 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -29,10 +29,6 @@
     "bindings": [{ "name": "BENCH_RUNNER", "class_name": "BenchRunnerContainer" }],
   },
   "migrations": [{ "tag": "v1", "new_sqlite_classes": ["BenchRunnerContainer"] }],
-  "triggers": {
-    // 04:10 UTC daily: classifier benchmark. 05:10 UTC Monday: decider benchmark.
-    "crons": ["10 4 * * *", "10 5 * * 1"],
-  },
   "d1_databases": [
     {
       "binding": "BENCH_DB",

From f3c0128401ace1fd3b03cbb057108611de576cdd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 10:42:16 +0200
Subject: [PATCH 31/73] refactor(auto-routing): simplification pass

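- benchmark worker: single run-state read per queue message; decider chunks
  require caseIds (legacy fallback removed); dead defensive branch and unused
  DeciderCase.maxTokens dropped; container owns CLI warmup via /warmup
  instead of a synthetic benchmark case; admin routes use zodJsonValidator
  like sibling services. Behavior change: a malformed JSON body on the admin
  routes now surfaces as a 500 from the framework error handler instead of
  a 400 "Invalid JSON body", and schema-validation failures now include
  `success: false` alongside the error message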
- apps/web: parseAdminResponse and the worker-admin fetch wrapper are shared
  modules instead of per-file copies; BenchmarksSection.types re-export shim
  deleted; dead prevConfigRef guard removed; classifier-model sync effect
  keyed on stable primitives; tier sort order hoisted to module scope
---
 .../auto-routing/AutoRoutingAdminContent.tsx  |  28 +---
 .../admin/auto-routing/BenchmarksSection.tsx  |  45 ++----
 .../auto-routing/BenchmarksSection.types.ts   |   4 -
 .../src/app/admin/auto-routing/admin-fetch.ts |  19 +++
 .../ai-gateway/auto-routing-admin-client.ts   |  57 ++------
 .../auto-routing-benchmark-admin-client.ts    |  57 ++------
 .../src/lib/ai-gateway/worker-admin-fetch.ts  |  56 ++++++++
 .../container/server.mjs                      |  25 ++++
 .../auto-routing-benchmark/src/admin.test.ts  |  24 ++--
 services/auto-routing-benchmark/src/admin.ts  | 132 ++++++++----------
 .../auto-routing-benchmark/src/cli-runner.ts  |  19 +++
 .../src/datasets/decider-cases.test.ts        |   8 --
 .../src/datasets/decider-cases.ts             |  33 -----
 services/auto-routing-benchmark/src/index.ts  |  16 +--
 services/auto-routing-benchmark/src/run.ts    |  88 +++++-------
 15 files changed, 262 insertions(+), 349 deletions(-)
 delete mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
 create mode 100644 apps/web/src/app/admin/auto-routing/admin-fetch.ts
 create mode 100644 apps/web/src/lib/ai-gateway/worker-admin-fetch.ts

diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
index 185dc097f9..f6e262d43d 100644
--- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
+++ b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
@@ -11,7 +11,6 @@ import React, { useEffect, useMemo, useState, type ReactNode } from 'react';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { toast } from 'sonner';
 import { BarChart3, Clock3, DollarSign, HelpCircle, RefreshCw, Route, Save } from 'lucide-react';
-import * as z from 'zod';
 import { ModelCombobox, type ModelOption } from '@/components/shared/ModelCombobox';
 import { Badge } from '@/components/ui/badge';
 import { Button } from '@/components/ui/button';
@@ -32,6 +31,7 @@ import {
 } from '@/lib/organizations/organization-types';
 import { cn } from '@/lib/utils';
 import { BenchmarksSection } from './BenchmarksSection';
+import { parseAdminResponse } from './admin-fetch';
 
 const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [
   { value: '1h', label: '1h' },
@@ -40,24 +40,6 @@ const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [
   { value: '30d', label: '30d' },
 ];
 
-const AdminApiErrorSchema = z.object({ error: z.string().optional() });
-
-async function parseAdminResponse<T extends object>(
-  response: Response,
-  schema: z.ZodType<T>
-): Promise<T> {
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = AdminApiErrorSchema.safeParse(body);
-    throw new Error(
-      parsedError.success && parsedError.data.error
-        ? parsedError.data.error
-        : `Request failed: ${response.status}`
-    );
-  }
-  return schema.parse(body);
-}
-
 async function fetchClassifierModel() {
   const response = await fetch('/admin/api/auto-routing/classifier-model');
   return parseAdminResponse<AutoRoutingClassifierModelResponse>(
@@ -398,10 +380,12 @@ export function AutoRoutingAdminContent() {
   });
 
   useEffect(() => {
-    if (classifierModelQuery.data) {
-      setSelectedModel(classifierModelQuery.data.override ?? classifierModelQuery.data.model);
+    const override = classifierModelQuery.data?.override;
+    const model = classifierModelQuery.data?.model;
+    if (model !== undefined) {
+      setSelectedModel(override ?? model);
     }
-  }, [classifierModelQuery.data]);
+  }, [classifierModelQuery.data?.override, classifierModelQuery.data?.model]);
 
   const modelOptions = useMemo<ModelOption[]>(() => {
     return (
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index e2509c7649..feea0023bd 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -2,18 +2,19 @@
 
 import {
   BenchmarkConfigResponseSchema,
+  BenchmarkRoutingTableResponseSchema,
   BenchmarkRunsResponseSchema,
   StartBenchmarkRunResponseSchema,
   type BenchmarkConfig,
+  type BenchmarkRoutingTableResponse,
   type BenchmarkRun,
   type BenchmarkModelSummary,
   type ReasoningEffort,
 } from '@kilocode/auto-routing-contracts';
-import React, { useCallback, useEffect, useRef, useState } from 'react';
+import React, { useCallback, useEffect, useState } from 'react';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { toast } from 'sonner';
 import { ChevronDown, ChevronRight, Play, Plus, RotateCcw, Save, Trash2 } from 'lucide-react';
-import * as z from 'zod';
 import { Badge } from '@/components/ui/badge';
 import { Button } from '@/components/ui/button';
 import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
@@ -37,10 +38,7 @@ import {
   TableRow,
 } from '@/components/ui/table';
 import { Textarea } from '@/components/ui/textarea';
-import {
-  BenchmarkRoutingTableResponseSchema,
-  type BenchmarkRoutingTableResponse,
-} from './BenchmarksSection.types';
+import { parseAdminResponse } from './admin-fetch';
 
 // ---------------------------------------------------------------------------
 // Pure helpers (exported for unit tests)
@@ -59,28 +57,6 @@ export function formatUsd(n: number | null): string {
   return `$${trimmed}`;
 }
 
-// ---------------------------------------------------------------------------
-// API error helper (mirrors the one in AutoRoutingAdminContent.tsx)
-// ---------------------------------------------------------------------------
-
-const AdminApiErrorSchema = z.object({ error: z.string().optional() });
-
-async function parseAdminResponse<T extends object>(
-  response: Response,
-  schema: z.ZodType<T>
-): Promise<T> {
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = AdminApiErrorSchema.safeParse(body);
-    throw new Error(
-      parsedError.success && parsedError.data.error
-        ? parsedError.data.error
-        : `Request failed: ${response.status}`
-    );
-  }
-  return schema.parse(body);
-}
-
 // ---------------------------------------------------------------------------
 // Fetch helpers
 // ---------------------------------------------------------------------------
@@ -210,12 +186,8 @@ function BenchmarkConfigEditor({
   const [form, setForm] = useState(() => configToFormState(config));
 
   // Sync when config changes from outside (initial load / after save)
-  const prevConfigRef = useRef(config);
   useEffect(() => {
-    if (prevConfigRef.current !== config) {
-      prevConfigRef.current = config;
-      setForm(configToFormState(config));
-    }
+    setForm(configToFormState(config));
   }, [config]);
 
   const saveMutation = useMutation({
@@ -478,15 +450,16 @@ function BenchmarkConfigEditor({
 // Run summaries expandable table
 // ---------------------------------------------------------------------------
 
+const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;
+
 function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   const isDecider = run.kind === 'decider';
 
   const sortedSummaries: BenchmarkModelSummary[] = isDecider
     ? [...run.summaries].sort((a, b) => {
-        const tierOrder = { low: 0, medium: 1, high: 2, '*': 3 };
         const tierDiff =
-          (tierOrder[a.tier as keyof typeof tierOrder] ?? 3) -
-          (tierOrder[b.tier as keyof typeof tierOrder] ?? 3);
+          (TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) -
+          (TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3);
         if (tierDiff !== 0) return tierDiff;
         return b.accuracy - a.accuracy;
       })
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
deleted file mode 100644
index 04f0376bd3..0000000000
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts
+++ /dev/null
@@ -1,4 +0,0 @@
-export {
-  BenchmarkRoutingTableResponseSchema,
-  type BenchmarkRoutingTableResponse,
-} from '@kilocode/auto-routing-contracts';
diff --git a/apps/web/src/app/admin/auto-routing/admin-fetch.ts b/apps/web/src/app/admin/auto-routing/admin-fetch.ts
new file mode 100644
index 0000000000..b29d538e09
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/admin-fetch.ts
@@ -0,0 +1,19 @@
+import * as z from 'zod';
+
+const AdminApiErrorSchema = z.object({ error: z.string().optional() });
+
+export async function parseAdminResponse<T extends object>(
+  response: Response,
+  schema: z.ZodType<T>
+): Promise<T> {
+  const body: unknown = await response.json();
+  if (!response.ok) {
+    const parsedError = AdminApiErrorSchema.safeParse(body);
+    throw new Error(
+      parsedError.success && parsedError.data.error
+        ? parsedError.data.error
+        : `Request failed: ${response.status}`
+    );
+  }
+  return schema.parse(body);
+}
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
index 1af226d8c7..49d4d2ca07 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
@@ -3,57 +3,16 @@ import {
   AutoRoutingClassifierModelResponseSchema,
   type AutoRoutingAnalyticsPeriod,
 } from '@kilocode/auto-routing-contracts';
-import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
-import * as z from 'zod';
+import { AUTO_ROUTING_WORKER_URL } from '@/lib/config.server';
+import { createWorkerAdminFetch } from './worker-admin-fetch';
+import type { WorkerAdminResult } from './worker-admin-fetch';
 
-export type AutoRoutingAdminResult<T> = {
-  status: number;
-  body: T;
-};
+export type AutoRoutingAdminResult<T> = WorkerAdminResult<T>;
 
-type ErrorBody = { error: string };
-const ErrorBodySchema = z.object({ error: z.string() });
-
-type AutoRoutingAdminRequestInit = Omit<RequestInit, 'headers'> & {
-  headers?: Record<string, string>;
-};
-
-async function fetchAutoRoutingAdmin<T>(
-  path: string,
-  init: AutoRoutingAdminRequestInit,
-  schema: z.ZodType<T>
-): Promise<AutoRoutingAdminResult<T | ErrorBody>> {
-  if (!AUTO_ROUTING_WORKER_URL || !INTERNAL_API_SECRET) {
-    return {
-      status: 500,
-      body: { error: 'Auto routing worker is not configured' },
-    };
-  }
-
-  const response = await fetch(`${AUTO_ROUTING_WORKER_URL}${path}`, {
-    ...init,
-    headers: {
-      authorization: `Bearer ${INTERNAL_API_SECRET}`,
-      ...init.headers,
-    },
-  });
-
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = ErrorBodySchema.safeParse(body);
-    return {
-      status: response.status,
-      body: parsedError.success
-        ? parsedError.data
-        : { error: `Request failed: ${response.status}` },
-    };
-  }
-
-  return {
-    status: response.status,
-    body: schema.parse(body),
-  };
-}
+const fetchAutoRoutingAdmin = createWorkerAdminFetch({
+  workerUrl: AUTO_ROUTING_WORKER_URL,
+  unconfiguredError: 'Auto routing worker is not configured',
+});
 
 export function getAutoRoutingClassifierModel() {
   return fetchAutoRoutingAdmin(
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
index 71780f149f..3939234c55 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -11,57 +11,16 @@ export {
   BenchmarkRoutingTableResponseSchema,
   type BenchmarkRoutingTableResponse,
 } from '@kilocode/auto-routing-contracts';
-import { AUTO_ROUTING_BENCHMARK_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
-import * as z from 'zod';
+import { AUTO_ROUTING_BENCHMARK_WORKER_URL } from '@/lib/config.server';
+import { createWorkerAdminFetch } from './worker-admin-fetch';
+import type { WorkerAdminResult } from './worker-admin-fetch';
 
-export type AutoRoutingAdminResult<T> = {
-  status: number;
-  body: T;
-};
+export type AutoRoutingAdminResult<T> = WorkerAdminResult<T>;
 
-type ErrorBody = { error: string };
-const ErrorBodySchema = z.object({ error: z.string() });
-
-type AutoRoutingBenchmarkAdminRequestInit = Omit<RequestInit, 'headers'> & {
-  headers?: Record<string, string>;
-};
-
-async function fetchBenchmarkAdmin<T>(
-  path: string,
-  init: AutoRoutingBenchmarkAdminRequestInit,
-  schema: z.ZodType<T>
-): Promise<AutoRoutingAdminResult<T | ErrorBody>> {
-  if (!AUTO_ROUTING_BENCHMARK_WORKER_URL || !INTERNAL_API_SECRET) {
-    return {
-      status: 500,
-      body: { error: 'Auto routing benchmark worker is not configured' },
-    };
-  }
-
-  const response = await fetch(`${AUTO_ROUTING_BENCHMARK_WORKER_URL}${path}`, {
-    ...init,
-    headers: {
-      authorization: `Bearer ${INTERNAL_API_SECRET}`,
-      ...init.headers,
-    },
-  });
-
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = ErrorBodySchema.safeParse(body);
-    return {
-      status: response.status,
-      body: parsedError.success
-        ? parsedError.data
-        : { error: `Request failed: ${response.status}` },
-    };
-  }
-
-  return {
-    status: response.status,
-    body: schema.parse(body),
-  };
-}
+const fetchBenchmarkAdmin = createWorkerAdminFetch({
+  workerUrl: AUTO_ROUTING_BENCHMARK_WORKER_URL,
+  unconfiguredError: 'Auto routing benchmark worker is not configured',
+});
 
 export function getBenchmarkConfig() {
   return fetchBenchmarkAdmin('/admin/config', { method: 'GET' }, BenchmarkConfigResponseSchema);
diff --git a/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts b/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts
new file mode 100644
index 0000000000..855f99b920
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts
@@ -0,0 +1,56 @@
+import { INTERNAL_API_SECRET } from '@/lib/config.server';
+import * as z from 'zod';
+
+export type WorkerAdminResult<T> = {
+  status: number;
+  body: T;
+};
+
+export type ErrorBody = { error: string };
+export const ErrorBodySchema = z.object({ error: z.string() });
+
+type WorkerAdminRequestInit = Omit<RequestInit, 'headers'> & {
+  headers?: Record<string, string>;
+};
+
+export function createWorkerAdminFetch(options: {
+  workerUrl: string | undefined;
+  unconfiguredError: string;
+}) {
+  return async function fetchAdmin<T>(
+    path: string,
+    init: WorkerAdminRequestInit,
+    schema: z.ZodType<T>
+  ): Promise<WorkerAdminResult<T | ErrorBody>> {
+    if (!options.workerUrl || !INTERNAL_API_SECRET) {
+      return {
+        status: 500,
+        body: { error: options.unconfiguredError },
+      };
+    }
+
+    const response = await fetch(`${options.workerUrl}${path}`, {
+      ...init,
+      headers: {
+        authorization: `Bearer ${INTERNAL_API_SECRET}`,
+        ...init.headers,
+      },
+    });
+
+    const body: unknown = await response.json();
+    if (!response.ok) {
+      const parsedError = ErrorBodySchema.safeParse(body);
+      return {
+        status: response.status,
+        body: parsedError.success
+          ? parsedError.data
+          : { error: `Request failed: ${response.status}` },
+      };
+    }
+
+    return {
+      status: response.status,
+      body: schema.parse(body),
+    };
+  };
+}
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index 67799c87e8..91ef1a19c4 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -120,6 +120,31 @@ const server = createServer((req, res) => {
       return;
     }
 
+    // One-time CLI warmup (sqlite migration on a fresh instance): a trivial
+    // serialized run so real cases never burn their timeout on it.
+    if (req.method === 'POST' && req.url === '/warmup') {
+      let parsed;
+      try {
+        parsed = JSON.parse(await readBody(req));
+      } catch {
+        sendJson(res, 400, { error: 'invalid JSON body' });
+        return;
+      }
+      const { model, kiloToken } = parsed ?? {};
+      if (typeof model !== 'string' || typeof kiloToken !== 'string') {
+        sendJson(res, 400, { error: 'model and kiloToken are required strings' });
+        return;
+      }
+      const result = await runCaseSerialized({
+        model,
+        prompt: 'Reply with exactly: ok',
+        kiloToken,
+        timeoutMs: DEFAULT_TIMEOUT_MS,
+      });
+      sendJson(res, 200, { exitCode: result.exitCode, durationMs: result.durationMs });
+      return;
+    }
+
     if (req.method === 'POST' && req.url === '/run') {
       let parsed;
       try {
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 02a9077580..f30a50aa23 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -147,7 +147,7 @@ describe('GET /admin/config', () => {
 // ---------------------------------------------------------------------------
 
 describe('PUT /admin/config', () => {
-  it('returns 400 for a non-JSON body', async () => {
+  it('rejects a non-JSON body', async () => {
     const res = await request('/admin/config', {
       method: 'PUT',
       headers: {
@@ -156,14 +156,18 @@ describe('PUT /admin/config', () => {
       },
       body: 'not json {{{',
     });
-    expect(res.status).toBe(400);
-    await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' });
+    // Malformed JSON surfaces via the framework error handler (same behavior
+    // as the other zodJsonValidator-based services).
+    expect(res.status).toBe(500);
   });
 
   it('returns 400 for a schema-invalid config', async () => {
     const res = await authedPut('/admin/config', { classifierModels: 'oops' });
     expect(res.status).toBe(400);
-    await expect(res.json()).resolves.toEqual({ error: 'Invalid benchmark config' });
+    await expect(res.json()).resolves.toMatchObject({
+      success: false,
+      error: 'Invalid benchmark config',
+    });
     expect(dbRun).not.toHaveBeenCalled();
   });
 
@@ -223,7 +227,7 @@ describe('GET /admin/runs', () => {
 // ---------------------------------------------------------------------------
 
 describe('POST /admin/runs', () => {
-  it('returns 400 for a non-JSON body', async () => {
+  it('rejects a non-JSON body', async () => {
     const res = await request('/admin/runs', {
       method: 'POST',
       headers: {
@@ -232,14 +236,18 @@ describe('POST /admin/runs', () => {
       },
       body: '<<<',
     });
-    expect(res.status).toBe(400);
-    await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' });
+    // Malformed JSON surfaces via the framework error handler (same behavior
+    // as the other zodJsonValidator-based services).
+    expect(res.status).toBe(500);
   });
 
   it('returns 400 for an invalid kind', async () => {
     const res = await authedPost('/admin/runs', { kind: 'turbo' });
     expect(res.status).toBe(400);
-    await expect(res.json()).resolves.toEqual({ error: 'Invalid run request' });
+    await expect(res.json()).resolves.toMatchObject({
+      success: false,
+      error: 'Invalid run request',
+    });
     expect(queueSendBatch).not.toHaveBeenCalled();
   });
 
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index abc1a3b79c..a539850040 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -5,84 +5,76 @@ import {
   StartBenchmarkRunRequestSchema,
   type BenchmarkRun,
 } from '@kilocode/auto-routing-contracts';
-import type { Handler } from 'hono';
+import { zodJsonValidator } from '@kilocode/worker-utils';
+import type { Hono } from 'hono';
 import { DEFAULT_BENCHMARK_CONFIG, getBenchmarkConfig, saveBenchmarkConfig } from './config';
 import { debugRunCli } from './cli-runner';
-import { fetchBenchmarkUserToken } from './run';
+import { fetchBenchmarkUserToken, startRun } from './run';
 import { getLatestRoutingTable, listRuns } from './db';
-import { startRun } from './run';
 import type { HonoEnv } from './hono-env';
 
-export const getConfigHandler: Handler<HonoEnv> = async c =>
-  c.json({
-    config: await getBenchmarkConfig(c.env.BENCH_DB),
-    defaults: DEFAULT_BENCHMARK_CONFIG,
-  });
-
-export const putConfigHandler: Handler<HonoEnv> = async c => {
-  let body: unknown;
-  try {
-    body = await c.req.json();
-  } catch {
-    return c.json({ error: 'Invalid JSON body' }, 400);
-  }
-  const parsed = BenchmarkConfigSchema.safeParse(body);
-  if (!parsed.success) return c.json({ error: 'Invalid benchmark config' }, 400);
-  const updatedBy = c.req.header('x-updated-by') ?? null;
-  const saved = await saveBenchmarkConfig(c.env.BENCH_DB, parsed.data, updatedBy);
-  return c.json({ config: saved, defaults: DEFAULT_BENCHMARK_CONFIG });
-};
+const DebugCliRequestSchema = z.object({
+  model: z.string().trim().min(1),
+  prompt: z.string().min(1),
+});
 
-export const listRunsHandler: Handler<HonoEnv> = async c => {
-  const limit = Math.min(Number(c.req.query('limit') ?? 20) || 20, 100);
-  const runs: BenchmarkRun[] = await listRuns(c.env.BENCH_DB, limit);
-  return c.json({ runs });
-};
+export function registerAdminRoutes(app: Hono<HonoEnv>): void {
+  app.get('/admin/config', async c =>
+    c.json({
+      config: await getBenchmarkConfig(c.env.BENCH_DB),
+      defaults: DEFAULT_BENCHMARK_CONFIG,
+    })
+  );
 
-export const startRunHandler: Handler<HonoEnv> = async c => {
-  let body: unknown;
-  try {
-    body = await c.req.json();
-  } catch {
-    return c.json({ error: 'Invalid JSON body' }, 400);
-  }
-  const parsed = StartBenchmarkRunRequestSchema.safeParse(body);
-  if (!parsed.success) return c.json({ error: 'Invalid run request' }, 400);
-  return c.json(await startRun(c.env, parsed.data.kind, { force: parsed.data.force }));
-};
+  app.put(
+    '/admin/config',
+    zodJsonValidator(BenchmarkConfigSchema, { errorMessage: 'Invalid benchmark config' }),
+    async c => {
+      const updatedBy = c.req.header('x-updated-by') ?? null;
+      const saved = await saveBenchmarkConfig(c.env.BENCH_DB, c.req.valid('json'), updatedBy);
+      return c.json({ config: saved, defaults: DEFAULT_BENCHMARK_CONFIG });
+    }
+  );
 
-export const getRoutingTableHandler: Handler<HonoEnv> = async c => {
-  const latest = await getLatestRoutingTable(c.env.BENCH_DB);
-  // Validated at publish time, but re-validate before crossing the contract
-  // boundary so a schema change can never surface a stale incompatible table.
-  const parsed = latest ? RoutingTableSchema.safeParse(JSON.parse(latest.table_json)) : null;
-  return c.json({
-    table: parsed?.success ? parsed.data : null,
-    publishedAt: parsed?.success ? (latest?.published_at ?? null) : null,
+  app.get('/admin/runs', async c => {
+    const limit = Math.min(Number(c.req.query('limit') ?? 20) || 20, 100);
+    const runs: BenchmarkRun[] = await listRuns(c.env.BENCH_DB, limit);
+    return c.json({ runs });
   });
-};
 
-const DebugCliRequestSchema = z.object({
-  model: z.string().trim().min(1),
-  prompt: z.string().min(1),
-});
+  app.post(
+    '/admin/runs',
+    zodJsonValidator(StartBenchmarkRunRequestSchema, { errorMessage: 'Invalid run request' }),
+    async c => {
+      const { kind, force } = c.req.valid('json');
+      return c.json(await startRun(c.env, kind, { force }));
+    }
+  );
+
+  app.get('/admin/routing-table', async c => {
+    const latest = await getLatestRoutingTable(c.env.BENCH_DB);
+    // Validated at publish time, but re-validate before crossing the contract
+    // boundary so a schema change can never surface a stale incompatible table.
+    const parsed = latest ? RoutingTableSchema.safeParse(JSON.parse(latest.table_json)) : null;
+    return c.json({
+      table: parsed?.success ? parsed.data : null,
+      publishedAt: parsed?.success ? (latest?.published_at ?? null) : null,
+    });
+  });
 
-// Runs one ad-hoc prompt through the kilo CLI container and returns raw
-// (truncated) stdout lines plus the parsed result. Diagnostic-only.
-export const debugCliHandler: Handler<HonoEnv> = async c => {
-  let body: unknown;
-  try {
-    body = await c.req.json();
-  } catch {
-    return c.json({ error: 'Invalid JSON body' }, 400);
-  }
-  const parsed = DebugCliRequestSchema.safeParse(body);
-  if (!parsed.success) return c.json({ error: 'Invalid debug request' }, 400);
-  const config = await getBenchmarkConfig(c.env.BENCH_DB);
-  if (!config.benchmarkUserId) {
-    return c.json({ error: 'benchmarkUserId is not configured' }, 400);
-  }
-  const kiloToken = await fetchBenchmarkUserToken(c.env, config.benchmarkUserId);
-  const result = await debugRunCli(c.env, { ...parsed.data, kiloToken });
-  return c.json(result);
-};
+  // Runs one ad-hoc prompt through the kilo CLI container and returns raw
+  // (truncated) stdout lines plus the parsed result. Diagnostic-only.
+  app.post(
+    '/admin/debug-cli',
+    zodJsonValidator(DebugCliRequestSchema, { errorMessage: 'Invalid debug request' }),
+    async c => {
+      const config = await getBenchmarkConfig(c.env.BENCH_DB);
+      if (!config.benchmarkUserId) {
+        return c.json({ error: 'benchmarkUserId is not configured' }, 400);
+      }
+      const kiloToken = await fetchBenchmarkUserToken(c.env, config.benchmarkUserId);
+      const result = await debugRunCli(c.env, { ...c.req.valid('json'), kiloToken });
+      return c.json(result);
+    }
+  );
+}
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index c766bab3e3..0eef033f8c 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -122,3 +122,22 @@ export async function debugRunCli(
     parsed: parseKiloRunEvents(body.stdoutLines ?? []),
   };
 }
+
+// Asks the container to run its one-time CLI warmup (sqlite migration etc.)
+// before the case loop starts. Throws on a non-OK response; callers treat it as best-effort.
+export async function warmUpCliContainer(
+  env: Env,
+  params: { instanceName: string; model: string; kiloToken: string }
+): Promise<void> {
+  const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName));
+  const response = await stub.fetch(
+    new Request('http://container/warmup', {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ model: params.model, kiloToken: params.kiloToken }),
+    })
+  );
+  if (!response.ok) {
+    throw new Error(`container /warmup failed: HTTP ${response.status}`);
+  }
+}
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
index 92a734700c..824a049293 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -39,14 +39,6 @@ describe('DECIDER_CASES', () => {
     }
   });
 
-  it('has generous maxTokens and nonempty prompts', () => {
-    for (const c of DECIDER_CASES) {
-      expect(c.maxTokens, c.id).toBeGreaterThanOrEqual(512);
-      expect(c.systemPrompt.length, c.id).toBeGreaterThan(0);
-      expect(c.userPrompt.length, c.id).toBeGreaterThan(0);
-    }
-  });
-
   it('has nonempty exact and contains_all values', () => {
     for (const c of DECIDER_CASES) {
       const check = c.check;
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index 1ac7a2af35..31ba56e17e 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -7,9 +7,6 @@ export type DeciderCase = {
   taskType: ClassifierTaskType;
   systemPrompt: string;
   userPrompt: string;
-  // Retained as metadata only. The decider now runs cases through the kilo CLI
-  // (no chat-completions maxTokens knob), so this field is no longer consumed.
-  maxTokens: number;
   check: DeciderCheck;
 };
 
@@ -32,7 +29,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with the exact output line only.\n\nconst xs = [1, 2, 3, 4].filter(x => x % 2 === 0).map(x => x * 10);\nconsole.log(xs.join("-"));',
-    maxTokens: 512,
     check: { kind: 'exact', value: '20-40' },
   },
   {
@@ -42,7 +38,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log([5, 3, 8, 1].sort((a, b) => a - b).join(","));',
-    maxTokens: 512,
     check: { kind: 'exact', value: '1,3,5,8' },
   },
   {
@@ -52,7 +47,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log("hello".toUpperCase());',
-    maxTokens: 512,
     check: { kind: 'exact', value: 'HELLO' },
   },
   {
@@ -62,7 +56,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");',
-    maxTokens: 512,
     check: { kind: 'exact', value: 'odd' },
   },
   {
@@ -72,7 +65,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What is the final value printed? Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);',
-    maxTokens: 512,
     check: { kind: 'exact', value: '30' },
   },
   {
@@ -82,7 +74,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with only the number.\n\nconsole.log(parseInt("42px", 10));',
-    maxTokens: 512,
     check: { kind: 'exact', value: '42' },
   },
   {
@@ -92,7 +83,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'How many times does the letter "a" appear in the word "banana"? Answer with only the number.',
-    maxTokens: 512,
     check: { kind: 'exact', value: '3' },
   },
   {
@@ -102,7 +92,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };',
-    maxTokens: 512,
     check: { kind: 'exact', value: '3' },
   },
   {
@@ -113,7 +102,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
       'You are a precise web API expert. Answer with only what is asked, no explanations.',
     userPrompt:
       'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
-    maxTokens: 512,
     check: { kind: 'exact', value: '201' },
   },
   {
@@ -123,7 +111,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'A loop sums an array. What value does it produce? Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);',
-    maxTokens: 512,
     check: { kind: 'exact', value: '12' },
   },
 
@@ -135,7 +122,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
-    maxTokens: 2048,
     check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
   },
   {
@@ -145,7 +131,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with only the number.\n\nconst r = [1, 2, 3, 4].reduce((acc, x) => acc + x * x, 0);\nconsole.log(r);',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '30' },
   },
   {
@@ -155,7 +140,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What is the final printed value? Answer with only the number.\n\nfunction make() {\n  let c = 0;\n  return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '3' },
   },
   {
@@ -165,7 +149,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this program print, in order? Answer with the four uppercase letters joined by commas, e.g. "A,B,C,D".\n\nconsole.log("A");\nPromise.resolve().then(() => console.log("B"));\nsetTimeout(() => console.log("C"), 0);\nconsole.log("D");',
-    maxTokens: 2048,
     check: { kind: 'regex', pattern: '^\\s*A\\s*,\\s*D\\s*,\\s*B\\s*,\\s*C\\s*$', flags: 'im' },
   },
   {
@@ -175,7 +158,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What is the size of the resulting Set? Answer with only the number.\n\nconst s = new Set([1, 2, 2, 3, 3, 3, 4]);\nconsole.log(s.size);',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '4' },
   },
   {
@@ -185,7 +167,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '06' },
   },
   {
@@ -195,7 +176,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '13' },
   },
   {
@@ -205,7 +185,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '4' },
   },
   {
@@ -215,7 +194,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: SYS_SYS,
     userPrompt:
       'A fixed-window rate limiter allows 100 requests per 60-second window. A client sends 80 requests in the first 30 seconds of a window, then 40 more requests in the next 20 seconds (same window). How many of the 40 later requests are rejected? Answer with only the number.',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '20' },
   },
   {
@@ -225,7 +203,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);',
-    maxTokens: 2048,
     check: { kind: 'exact', value: '40' },
   },
 
@@ -237,7 +214,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: SYS_SYS,
     userPrompt:
       'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms before ack. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": <number>, "winner": "<A|B|C>"}.',
-    maxTokens: 4096,
     check: { kind: 'json_equal', value: { rows: 1, winner: 'B' } },
   },
   {
@@ -247,7 +223,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
-    maxTokens: 4096,
     check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' },
   },
   {
@@ -257,7 +232,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
-    maxTokens: 4096,
     check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' },
   },
   {
@@ -267,7 +241,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with only the number.\n\nconst obj = {\n  v: 10,\n  get() {\n    return [1, 2].map(function () {\n      return this?.v ?? 0;\n    }).reduce((a, b) => a + b, 0);\n  },\n};\nconsole.log(obj.get());',
-    maxTokens: 4096,
     check: { kind: 'exact', value: '0' },
   },
   {
@@ -277,7 +250,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: SYS_SYS,
     userPrompt:
       'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.',
-    maxTokens: 4096,
     check: { kind: 'exact', value: '2' },
   },
   {
@@ -287,7 +259,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.',
-    maxTokens: 4096,
     check: { kind: 'exact', value: 'false' },
   },
   {
@@ -297,7 +268,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: SYS_SYS,
     userPrompt:
       'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.',
-    maxTokens: 4096,
     check: { kind: 'exact', value: '6' },
   },
   {
@@ -307,7 +277,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with the values joined by commas, e.g. "1,2,3".\n\nfunction* g() {\n  yield 1;\n  yield* [2, 3];\n  yield 4;\n}\nconsole.log([...g()].join(","));',
-    maxTokens: 4096,
     check: { kind: 'regex', pattern: '^\\s*1\\s*,\\s*2\\s*,\\s*3\\s*,\\s*4\\s*$', flags: 'm' },
   },
   {
@@ -317,7 +286,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: SYS_SYS,
     userPrompt:
       'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": <number>, "second": <number>}.',
-    maxTokens: 4096,
     check: { kind: 'json_equal', value: { first: 1, second: 2 } },
   },
   {
@@ -327,7 +295,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this print? Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n  calls++;\n  return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);',
-    maxTokens: 4096,
     check: { kind: 'exact', value: '2' },
   },
 ];
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index 6259f9866c..dd431b5ce8 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -1,14 +1,7 @@
 import { Hono } from 'hono';
 import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils';
+import { registerAdminRoutes } from './admin';
 import { authMiddleware } from './auth';
-import {
-  getConfigHandler,
-  putConfigHandler,
-  listRunsHandler,
-  startRunHandler,
-  getRoutingTableHandler,
-  debugCliHandler,
-} from './admin';
 import type { HonoEnv } from './hono-env';
 import { processJob, type BenchmarkJobMessage } from './run';
 
@@ -19,12 +12,7 @@ export const app = new Hono<HonoEnv>();
 app.use('*', authMiddleware);
 app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' }));
 
-app.get('/admin/config', getConfigHandler);
-app.put('/admin/config', putConfigHandler);
-app.get('/admin/runs', listRunsHandler);
-app.post('/admin/runs', startRunHandler);
-app.get('/admin/routing-table', getRoutingTableHandler);
-app.post('/admin/debug-cli', debugCliHandler);
+registerAdminRoutes(app);
 
 app.notFound(createNotFoundHandler());
 app.onError(createErrorHandler());
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 337c18e901..00ead718c1 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -30,7 +30,7 @@ import {
 import { gradeClassifierOutput, runDeciderCheck } from './grading';
 import { createOpenRouterClient } from './openrouter';
 import { buildRoutingTable } from './routing-table-builder';
-import { runDeciderCaseViaCli } from './cli-runner';
+import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner';
 
 export type BenchmarkJobMessage = {
   runId: string;
@@ -126,7 +126,7 @@ export async function startRun(
     // Everything already has results: complete immediately and republish the
     // aggregate so config-only changes (model removed, threshold tweaked)
     // take effect without re-running any model.
-    await finalizeRunIfComplete(env, runId, kind);
+    await finalizeRunIfComplete(env, runId, kind, { config, runtime });
     return { runId, enqueuedModels: 0, skippedModels };
   }
 
@@ -171,7 +171,8 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
   }
 
   const message = parsed.data;
-  const config = await getRunConfig(env, message.runId);
+  const state = await getRunState(env, message.runId);
+  const { config } = state;
 
   if (message.kind === 'classifier') {
     // Create the OpenRouter client inside processJob — no module-scope transport clients.
@@ -209,7 +210,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
     await processDeciderJob(env, message, config);
   }
 
-  await finalizeRunIfComplete(env, message.runId, message.kind);
+  await finalizeRunIfComplete(env, message.runId, message.kind, state);
 }
 
 async function processDeciderJob(
@@ -217,32 +218,20 @@ async function processDeciderJob(
   message: BenchmarkJobMessage,
   config: BenchmarkConfig
 ): Promise<void> {
-  // Only the cases this message owns (chunked); fall back to the full set for
-  // legacy/un-chunked messages.
-  const cases =
-    message.caseIds && message.caseIds.length > 0
-      ? DECIDER_CASES.filter(c => message.caseIds?.includes(c.id))
-      : DECIDER_CASES;
-
-  // Defensive guard mirroring the startRun fail-fast: if the run snapshot has
-  // no benchmark user, every case in this chunk fails with a clear error so
-  // the run still completes and surfaces the misconfiguration.
-  if (!config.benchmarkUserId) {
-    for (const benchCase of cases) {
-      await upsertCaseResult(env.BENCH_DB, {
-        run_id: message.runId,
-        model: message.model,
-        case_id: benchCase.id,
-        tier: benchCase.tier,
-        score: 0,
-        latency_ms: 0,
-        cost_usd: null,
-        detail_json: null,
-        error: 'benchmark user not configured',
-      });
-    }
+  // Decider messages always carry their chunk's case ids; anything else is
+  // malformed and dropped (same policy as unparseable messages).
+  if (!message.caseIds?.length) {
+    console.warn(JSON.stringify({ event: 'benchmark_job_missing_case_ids', runId: message.runId }));
     return;
   }
+  const caseIds = new Set(message.caseIds);
+  const cases = DECIDER_CASES.filter(c => caseIds.has(c.id));
+
+  if (!config.benchmarkUserId) {
+    // startRun fails fast before enqueueing, so this only happens if the run
+    // snapshot was tampered with; throwing lets the queue retry/dead-letter.
+    throw new Error(`run ${message.runId} has no benchmarkUserId`);
+  }
 
   // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
   // queue retries the message. The token is never logged.
@@ -251,24 +240,11 @@ async function processDeciderJob(
   const reasoningEffort =
     config.deciderModels.find(m => m.id === message.model)?.reasoningEffort ?? null;
 
-  // The CLI performs a one-time sqlite migration on each fresh container
-  // instance; concurrent first runs against the migrating database end with
-  // empty event streams (exit 0, zero events). One sequential warmup run
-  // completes the migration before the concurrent case loop starts.
-  await runDeciderCaseViaCli(env, {
-    instanceName,
-    model: message.model,
-    benchCase: {
-      id: 'warmup',
-      tier: 'low',
-      taskType: 'implementation',
-      systemPrompt: 'You are a terse assistant.',
-      userPrompt: 'Reply with exactly: ok',
-      maxTokens: 512,
-      check: { kind: 'exact', value: 'ok' },
-    },
-    kiloToken,
-  }).catch(() => {});
+  // Fresh container instances run the CLI's one-time sqlite migration; the
+  // container owns that via its /warmup endpoint so the first real case
+  // doesn't burn its timeout on it. Failures are non-fatal: the first case
+  // simply absorbs whatever warmup work remains.
+  await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(() => {});
 
   // Concurrency 1: the CLI's sqlite state in the container is not safe under
   // concurrent sessions (partial-migration crashes); the container serializes
@@ -380,10 +356,9 @@ function failedRow(
   };
 }
 
-async function getRunState(
-  env: Env,
-  runId: string
-): Promise<{ config: BenchmarkConfig; runtime: RunRuntime }> {
+type RunState = { config: BenchmarkConfig; runtime: RunRuntime };
+
+async function getRunState(env: Env, runId: string): Promise<RunState> {
   // Snapshots taken at startRun time so a mid-run admin edit can't skew them.
   const run = await getRun(env.BENCH_DB, runId);
   if (!run) throw new Error(`unknown run ${runId}`);
@@ -399,10 +374,6 @@ async function getRunState(
   return { config, runtime };
 }
 
-async function getRunConfig(env: Env, runId: string): Promise<BenchmarkConfig> {
-  return (await getRunState(env, runId)).config;
-}
-
 export async function runCasesWithConcurrency<T>(
   cases: readonly T[],
   concurrency: number,
@@ -417,8 +388,13 @@ export async function runCasesWithConcurrency<T>(
   await Promise.all(workers);
 }
 
-async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKind): Promise<void> {
-  const { config, runtime } = await getRunState(env, runId);
+async function finalizeRunIfComplete(
+  env: Env,
+  runId: string,
+  kind: BenchmarkKind,
+  state: RunState
+): Promise<void> {
+  const { config, runtime } = state;
   const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
   const expected = runtime.enqueuedModels.length * caseCount;
   const actual = await countCaseResults(env.BENCH_DB, runId);

From 641f6efddd3f6632fd6bae149a97fb9d29b22e5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 12:08:32 +0200
Subject: [PATCH 32/73] refactor(auto-routing-benchmark): use drizzle for all
 D1 access

---
 pnpm-lock.yaml                                |   9 +-
 services/auto-routing-benchmark/package.json  |   1 +
 .../auto-routing-benchmark/src/admin.test.ts  | 109 +++---
 .../auto-routing-benchmark/src/db-schema.ts   |  67 ++++
 services/auto-routing-benchmark/src/db.ts     | 315 ++++++++----------
 5 files changed, 270 insertions(+), 231 deletions(-)
 create mode 100644 services/auto-routing-benchmark/src/db-schema.ts

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 5eee7fd65d..891de54e82 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1524,6 +1524,9 @@ importers:
       '@openrouter/sdk':
         specifier: ^0.12.79
         version: 0.12.79
+      drizzle-orm:
+        specifier: 0.45.2
+        version: 0.45.2(@cloudflare/workers-types@4.20260605.1)(@opentelemetry/api@1.9.1)(@types/pg@8.18.0)(@upstash/redis@1.38.0)(bun-types@1.3.14)(kysely@0.29.2)(pg@8.20.0)
       hono:
         specifier: 4.12.18
         version: 4.12.18
@@ -18022,7 +18025,7 @@ snapshots:
       cjs-module-lexer: 1.2.3
       esbuild: 0.27.4
       miniflare: 4.20260603.0(bufferutil@4.1.0)(utf-8-validate@6.0.6)
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
       wrangler: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
       zod: 3.25.76
     transitivePeerDependencies:
@@ -24132,7 +24135,7 @@ snapshots:
       obug: 2.1.1
       std-env: 4.0.0
       tinyrainbow: 3.1.0
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   '@vitest/expect@3.2.4':
     dependencies:
@@ -24218,7 +24221,7 @@ snapshots:
       sirv: 3.0.2
       tinyglobby: 0.2.16
       tinyrainbow: 3.1.0
-      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      vitest: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   '@vitest/utils@3.2.4':
     dependencies:
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
index c347e6be44..92c8f9605d 100644
--- a/services/auto-routing-benchmark/package.json
+++ b/services/auto-routing-benchmark/package.json
@@ -16,6 +16,7 @@
     "@kilocode/auto-routing-contracts": "workspace:*",
     "@kilocode/worker-utils": "workspace:*",
     "@openrouter/sdk": "^0.12.79",
+    "drizzle-orm": "catalog:",
     "hono": "catalog:",
     "zod": "catalog:"
   },
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index f30a50aa23..4218a7f3c8 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -1,44 +1,43 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
 import { DEFAULT_BENCHMARK_CONFIG } from './config';
 import { app } from './index';
+import type * as DbModule from './db';
 
 // ---------------------------------------------------------------------------
-// Env / binding stubs
+// Stubs: the db module is mocked at its function boundary (drizzle generates
+// the SQL, so statement-level stubbing would couple tests to its internals).
 // ---------------------------------------------------------------------------
 
+vi.mock('./db', async importOriginal => {
+  const actual = await importOriginal<typeof DbModule>();
+  return {
+    ...actual,
+    getConfigRow: vi.fn(),
+    saveConfigRow: vi.fn(),
+    listRuns: vi.fn(),
+    getLatestRoutingTable: vi.fn(),
+    getLatestSummariesByModel: vi.fn(),
+    insertRun: vi.fn(),
+    markStaleRunsFailed: vi.fn(),
+  };
+});
+
+import {
+  getConfigRow,
+  getLatestRoutingTable,
+  getLatestSummariesByModel,
+  insertRun,
+  listRuns,
+  markStaleRunsFailed,
+  saveConfigRow,
+} from './db';
+
 const tokenGet = vi.fn<() => Promise<string>>();
-const dbFirst = vi.fn();
-const dbAll = vi.fn();
-const dbRun = vi.fn();
-const dbBind = vi.fn();
-const dbPrepare = vi.fn();
 const queueSendBatch = vi.fn();
 
-// Minimal chainable D1 stub.
-// prepare() → { bind() → { first(), all(), run() } }
-function makeD1Stub() {
-  const stmt = {
-    bind: (..._args: unknown[]) => {
-      dbBind(..._args);
-      return stmt;
-    },
-    first: dbFirst,
-    all: dbAll,
-    run: dbRun,
-  };
-  dbPrepare.mockReturnValue(stmt);
-  return {
-    prepare: (sql: string) => {
-      dbPrepare(sql);
-      return stmt;
-    },
-    batch: vi.fn().mockResolvedValue([]),
-  } as unknown as D1Database;
-}
-
 const env = {
   INTERNAL_API_SECRET_PROD: { get: tokenGet },
-  BENCH_DB: null as unknown as D1Database,
+  BENCH_DB: {} as D1Database,
   BENCH_QUEUE: { sendBatch: queueSendBatch },
   AUTO_ROUTING_CONFIG: { put: vi.fn(), get: vi.fn() },
 } as unknown as Env;
@@ -81,14 +80,16 @@ function authedPut(path: string, body: unknown, extraHeaders: Record<string, str
 // ---------------------------------------------------------------------------
 
 beforeEach(() => {
+  vi.clearAllMocks();
   tokenGet.mockResolvedValue('bench-token');
-  dbFirst.mockResolvedValue(null);
-  dbAll.mockResolvedValue({ results: [] });
-  dbRun.mockResolvedValue({ meta: { changes: 0 } });
+  vi.mocked(getConfigRow).mockResolvedValue(null);
+  vi.mocked(saveConfigRow).mockResolvedValue(undefined);
+  vi.mocked(listRuns).mockResolvedValue([]);
+  vi.mocked(getLatestRoutingTable).mockResolvedValue(null);
+  vi.mocked(getLatestSummariesByModel).mockResolvedValue(new Map());
+  vi.mocked(insertRun).mockResolvedValue(undefined);
+  vi.mocked(markStaleRunsFailed).mockResolvedValue(undefined);
   queueSendBatch.mockResolvedValue(undefined);
-
-  // Rebuild the D1 stub each test so prepare/bind point to fresh mocks.
-  (env as unknown as Record<string, unknown>).BENCH_DB = makeD1Stub();
 });
 
 // ---------------------------------------------------------------------------
@@ -116,7 +117,7 @@ describe('auth middleware', () => {
 
 describe('GET /admin/config', () => {
   it('returns defaults when the DB row is absent', async () => {
-    // dbFirst already returns null by default
+    // getConfigRow already returns null by default
     const res = await authedGet('/admin/config');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({
@@ -132,7 +133,11 @@ describe('GET /admin/config', () => {
       updatedAt: '2026-06-01T00:00:00.000Z',
       updatedBy: 'admin@example.com',
     };
-    dbFirst.mockResolvedValueOnce({ config_json: JSON.stringify(storedConfig) });
+    vi.mocked(getConfigRow).mockResolvedValueOnce({
+      config_json: JSON.stringify(storedConfig),
+      updated_at: '2026-06-01T00:00:00.000Z',
+      updated_by: 'admin@example.com',
+    });
 
     const res = await authedGet('/admin/config');
     expect(res.status).toBe(200);
@@ -168,7 +173,7 @@ describe('PUT /admin/config', () => {
       success: false,
       error: 'Invalid benchmark config',
     });
-    expect(dbRun).not.toHaveBeenCalled();
+    expect(saveConfigRow).not.toHaveBeenCalled();
   });
 
   it('persists a valid config and returns it with defaults', async () => {
@@ -194,18 +199,12 @@ describe('PUT /admin/config', () => {
     expect(typeof body.config.updatedAt).toBe('string');
     expect(body.defaults).toEqual(DEFAULT_BENCHMARK_CONFIG);
 
-    // The INSERT was actually executed (dbRun was called on the saveConfigRow stmt).
-    expect(dbRun).toHaveBeenCalled();
-    // The SQL should be an INSERT OR REPLACE into benchmark_config.
-    const insertCall = dbPrepare.mock.calls.find(
-      (args: unknown[]) =>
-        typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config')
-    );
-    expect(insertCall).toBeDefined();
-    // The updatedBy value was forwarded via bind.
-    const bindCalls: unknown[][] = dbBind.mock.calls;
-    const foundUpdatedBy = bindCalls.some(args => args.includes('igor@kilocode.ai'));
-    expect(foundUpdatedBy).toBe(true);
+    // The row was persisted with the stamped config and updatedBy.
+    expect(saveConfigRow).toHaveBeenCalledOnce();
+    const [, configJson, updatedAt, updatedBy] = vi.mocked(saveConfigRow).mock.calls[0];
+    expect(JSON.parse(configJson).minAccuracy).toBe(0.85);
+    expect(typeof updatedAt).toBe('string');
+    expect(updatedBy).toBe('igor@kilocode.ai');
   });
 });
 
@@ -215,7 +214,7 @@ describe('PUT /admin/config', () => {
 
 describe('GET /admin/runs', () => {
   it('returns an empty runs array when the table is empty', async () => {
-    // dbAll returns { results: [] } by default
+    // listRuns returns [] by default
     const res = await authedGet('/admin/runs');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({ runs: [] });
@@ -252,13 +251,13 @@ describe('POST /admin/runs', () => {
   });
 
   it('starts a classifier run and returns runId + enqueuedModels', async () => {
-    // markStaleRunsFailed → run (UPDATE), getBenchmarkConfig → first (null → defaults),
-    // insertRun → run, then sendBatch.
+    // No prior summaries → every configured model is enqueued.
     const res = await authedPost('/admin/runs', { kind: 'classifier' });
     expect(res.status).toBe(200);
     const body = (await res.json()) as { runId: string; enqueuedModels: number };
     expect(body.runId).toMatch(/^classifier-/);
     expect(body.enqueuedModels).toBe(DEFAULT_BENCHMARK_CONFIG.classifierModels.length);
+    expect(insertRun).toHaveBeenCalledOnce();
     expect(queueSendBatch).toHaveBeenCalledOnce();
   });
 });
@@ -269,7 +268,7 @@ describe('POST /admin/runs', () => {
 
 describe('GET /admin/routing-table', () => {
   it('returns {table: null, publishedAt: null} when no rows exist', async () => {
-    // dbFirst already returns null by default
+    // getLatestRoutingTable already returns null by default
     const res = await authedGet('/admin/routing-table');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({ table: null, publishedAt: null });
@@ -290,7 +289,7 @@ describe('GET /admin/routing-table', () => {
       source: 'benchmark',
       tiers: { low: [candidate], medium: [candidate], high: [candidate] },
     };
-    dbFirst.mockResolvedValueOnce({
+    vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({
       run_id: 'run-123',
       published_at: '2026-06-01T10:00:00.000Z',
       table_json: JSON.stringify(tableData),
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
new file mode 100644
index 0000000000..d18177d154
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -0,0 +1,67 @@
+import { index, integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core';
+import type { BenchmarkKind } from '@kilocode/auto-routing-contracts';
+
+// Mirrors migrations/*.sql (the source of truth, applied via wrangler). Keep
+// the two in sync when adding columns.
+
+export const benchmarkRuns = sqliteTable('benchmark_runs', {
+  id: text('id').primaryKey(),
+  kind: text('kind').$type<BenchmarkKind>().notNull(),
+  status: text('status').$type<'running' | 'completed' | 'failed'>().notNull(),
+  started_at: text('started_at').notNull(),
+  completed_at: text('completed_at'),
+  config_json: text('config_json').notNull(),
+  // Run-scoped execution state: which models were actually enqueued and the
+  // summaries carried forward for models skipped because they already had
+  // results. Null on rows created before the column existed.
+  runtime_json: text('runtime_json'),
+  error: text('error'),
+});
+
+export const caseResults = sqliteTable(
+  'case_results',
+  {
+    run_id: text('run_id').notNull(),
+    model: text('model').notNull(),
+    case_id: text('case_id').notNull(),
+    tier: text('tier'),
+    score: real('score').notNull(),
+    latency_ms: integer('latency_ms').notNull(),
+    cost_usd: real('cost_usd'),
+    detail_json: text('detail_json'),
+    error: text('error'),
+  },
+  table => [
+    primaryKey({ columns: [table.run_id, table.model, table.case_id] }),
+    index('idx_case_results_run').on(table.run_id),
+  ]
+);
+
+export const modelSummaries = sqliteTable(
+  'model_summaries',
+  {
+    run_id: text('run_id').notNull(),
+    model: text('model').notNull(),
+    tier: text('tier').notNull(),
+    accuracy: real('accuracy').notNull(),
+    avg_cost_usd: real('avg_cost_usd'),
+    avg_latency_ms: real('avg_latency_ms').notNull(),
+    p50_latency_ms: real('p50_latency_ms'),
+    cases: integer('cases').notNull(),
+    errors: integer('errors').notNull(),
+  },
+  table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })]
+);
+
+export const routingTables = sqliteTable('routing_tables', {
+  run_id: text('run_id').primaryKey(),
+  published_at: text('published_at').notNull(),
+  table_json: text('table_json').notNull(),
+});
+
+export const benchmarkConfig = sqliteTable('benchmark_config', {
+  id: integer('id').primaryKey(),
+  config_json: text('config_json').notNull(),
+  updated_at: text('updated_at').notNull(),
+  updated_by: text('updated_by'),
+});
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 9a8450b1a1..1b6314dc30 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -3,44 +3,19 @@ import type {
   BenchmarkModelSummary,
   BenchmarkRun,
 } from '@kilocode/auto-routing-contracts';
+import { and, count, desc, eq, inArray, lt } from 'drizzle-orm';
+import { drizzle } from 'drizzle-orm/d1';
+import {
+  benchmarkConfig,
+  benchmarkRuns,
+  caseResults,
+  modelSummaries,
+  routingTables,
+} from './db-schema';
 
-export type CaseResultRow = {
-  run_id: string;
-  model: string;
-  case_id: string;
-  tier: string | null;
-  score: number;
-  latency_ms: number;
-  cost_usd: number | null;
-  detail_json: string | null;
-  error: string | null;
-};
-
-export type RunRow = {
-  id: string;
-  kind: BenchmarkKind;
-  status: 'running' | 'completed' | 'failed';
-  started_at: string;
-  completed_at: string | null;
-  config_json: string;
-  // Run-scoped execution state: which models were actually enqueued and the
-  // summaries carried forward for models skipped because they already had
-  // results. Null on rows created before the column existed.
-  runtime_json: string | null;
-  error: string | null;
-};
-
-type ModelSummaryRow = {
-  run_id: string;
-  model: string;
-  tier: string;
-  accuracy: number;
-  avg_cost_usd: number | null;
-  avg_latency_ms: number;
-  p50_latency_ms: number | null;
-  cases: number;
-  errors: number;
-};
+export type CaseResultRow = typeof caseResults.$inferSelect;
+export type RunRow = typeof benchmarkRuns.$inferSelect;
+type ModelSummaryRow = typeof modelSummaries.$inferSelect;
 
 export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
   return {
@@ -77,58 +52,53 @@ export async function insertRun(
     runtimeJson: string;
   }
 ): Promise<void> {
-  await db
-    .prepare(
-      `INSERT INTO benchmark_runs (id, kind, status, started_at, config_json, runtime_json)
-       VALUES (?1, ?2, 'running', ?3, ?4, ?5)`
-    )
-    .bind(run.id, run.kind, run.startedAt, run.configJson, run.runtimeJson)
-    .run();
+  await drizzle(db).insert(benchmarkRuns).values({
+    id: run.id,
+    kind: run.kind,
+    status: 'running',
+    started_at: run.startedAt,
+    config_json: run.configJson,
+    runtime_json: run.runtimeJson,
+  });
 }
 
 export async function getRun(db: D1Database, runId: string): Promise<RunRow | null> {
-  const row = await db
-    .prepare('SELECT * FROM benchmark_runs WHERE id = ?1')
-    .bind(runId)
-    .first<RunRow>();
+  const row = await drizzle(db)
+    .select()
+    .from(benchmarkRuns)
+    .where(eq(benchmarkRuns.id, runId))
+    .get();
   return row ?? null;
 }
 
 export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Promise<void> {
-  await db
-    .prepare(
-      `INSERT OR REPLACE INTO case_results
-       (run_id, model, case_id, tier, score, latency_ms, cost_usd, detail_json, error)
-       VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)`
-    )
-    .bind(
-      row.run_id,
-      row.model,
-      row.case_id,
-      row.tier,
-      row.score,
-      row.latency_ms,
-      row.cost_usd,
-      row.detail_json,
-      row.error
-    )
-    .run();
+  await drizzle(db)
+    .insert(caseResults)
+    .values(row)
+    .onConflictDoUpdate({
+      target: [caseResults.run_id, caseResults.model, caseResults.case_id],
+      set: {
+        tier: row.tier,
+        score: row.score,
+        latency_ms: row.latency_ms,
+        cost_usd: row.cost_usd,
+        detail_json: row.detail_json,
+        error: row.error,
+      },
+    });
 }
 
 export async function countCaseResults(db: D1Database, runId: string): Promise<number> {
-  const row = await db
-    .prepare('SELECT COUNT(*) AS n FROM case_results WHERE run_id = ?1')
-    .bind(runId)
-    .first<{ n: number }>();
+  const row = await drizzle(db)
+    .select({ n: count() })
+    .from(caseResults)
+    .where(eq(caseResults.run_id, runId))
+    .get();
   return row?.n ?? 0;
 }
 
 export async function getCaseResults(db: D1Database, runId: string): Promise<CaseResultRow[]> {
-  const { results } = await db
-    .prepare('SELECT * FROM case_results WHERE run_id = ?1')
-    .bind(runId)
-    .all<CaseResultRow>();
-  return results;
+  return drizzle(db).select().from(caseResults).where(eq(caseResults.run_id, runId));
 }
 
 export async function replaceModelSummaries(
@@ -136,57 +106,62 @@ export async function replaceModelSummaries(
   runId: string,
   summaries: BenchmarkModelSummary[]
 ): Promise<void> {
-  const statements = [
-    db.prepare('DELETE FROM model_summaries WHERE run_id = ?1').bind(runId),
-    ...summaries.map(s =>
-      db
-        .prepare(
-          `INSERT INTO model_summaries
-           (run_id, model, tier, accuracy, avg_cost_usd, avg_latency_ms, p50_latency_ms, cases, errors)
-           VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)`
-        )
-        .bind(
-          runId,
-          s.model,
-          s.tier,
-          s.accuracy,
-          s.avgCostUsd,
-          s.avgLatencyMs,
-          s.p50LatencyMs,
-          s.cases,
-          s.errors
-        )
+  const orm = drizzle(db);
+  const deleteExisting = orm.delete(modelSummaries).where(eq(modelSummaries.run_id, runId));
+  if (summaries.length === 0) {
+    await deleteExisting;
+    return;
+  }
+  await orm.batch([
+    deleteExisting,
+    orm.insert(modelSummaries).values(
+      summaries.map(s => ({
+        run_id: runId,
+        model: s.model,
+        tier: s.tier,
+        accuracy: s.accuracy,
+        avg_cost_usd: s.avgCostUsd,
+        avg_latency_ms: s.avgLatencyMs,
+        p50_latency_ms: s.p50LatencyMs,
+        cases: s.cases,
+        errors: s.errors,
+      }))
     ),
-  ];
-  await db.batch(statements);
+  ]);
 }
 
 export async function getSummaries(
   db: D1Database,
   runId: string
 ): Promise<BenchmarkModelSummary[]> {
-  const { results } = await db
-    .prepare('SELECT * FROM model_summaries WHERE run_id = ?1')
-    .bind(runId)
-    .all<ModelSummaryRow>();
-  return results.map(mapSummaryRow);
+  const rows = await drizzle(db)
+    .select()
+    .from(modelSummaries)
+    .where(eq(modelSummaries.run_id, runId));
+  return rows.map(mapSummaryRow);
 }
 
 export async function listRuns(db: D1Database, limit: number): Promise<BenchmarkRun[]> {
-  const { results: runRows } = await db
-    .prepare('SELECT * FROM benchmark_runs ORDER BY started_at DESC LIMIT ?1')
-    .bind(limit)
-    .all<RunRow>();
+  const orm = drizzle(db);
+  const runRows = await orm
+    .select()
+    .from(benchmarkRuns)
+    .orderBy(desc(benchmarkRuns.started_at))
+    .limit(limit);
 
   if (runRows.length === 0) {
     return [];
   }
 
-  const placeholders = runRows.map((_, i) => `?${i + 1}`).join(', ');
-  const { results: summaryRows } = await db
-    .prepare(`SELECT * FROM model_summaries WHERE run_id IN (${placeholders})`)
-    .bind(...runRows.map(r => r.id))
-    .all<ModelSummaryRow>();
+  const summaryRows = await orm
+    .select()
+    .from(modelSummaries)
+    .where(
+      inArray(
+        modelSummaries.run_id,
+        runRows.map(r => r.id)
+      )
+    );
 
   const summariesByRunId = new Map<string, BenchmarkModelSummary[]>();
   for (const row of summaryRows) {
@@ -202,24 +177,17 @@ export async function listRuns(db: D1Database, limit: number): Promise<Benchmark
 }
 
 export async function markRunCompleted(db: D1Database, runId: string): Promise<void> {
-  await db
-    .prepare(
-      `UPDATE benchmark_runs SET status = 'completed', completed_at = ?2
-       WHERE id = ?1 AND status = 'running'`
-    )
-    .bind(runId, new Date().toISOString())
-    .run();
+  await drizzle(db)
+    .update(benchmarkRuns)
+    .set({ status: 'completed', completed_at: new Date().toISOString() })
+    .where(and(eq(benchmarkRuns.id, runId), eq(benchmarkRuns.status, 'running')));
 }
 
-export async function markStaleRunsFailed(db: D1Database, olderThanIso: string): Promise<number> {
-  const result = await db
-    .prepare(
-      `UPDATE benchmark_runs SET status = 'failed', error = 'timed out'
-       WHERE status = 'running' AND started_at < ?1`
-    )
-    .bind(olderThanIso)
-    .run();
-  return result.meta.changes;
+export async function markStaleRunsFailed(db: D1Database, olderThanIso: string): Promise<void> {
+  await drizzle(db)
+    .update(benchmarkRuns)
+    .set({ status: 'failed', error: 'timed out' })
+    .where(and(eq(benchmarkRuns.status, 'running'), lt(benchmarkRuns.started_at, olderThanIso)));
 }
 
 export async function saveRoutingTable(
@@ -228,30 +196,39 @@ export async function saveRoutingTable(
   publishedAt: string,
   tableJson: string
 ): Promise<void> {
-  await db
-    .prepare(
-      `INSERT OR REPLACE INTO routing_tables (run_id, published_at, table_json)
-       VALUES (?1, ?2, ?3)`
-    )
-    .bind(runId, publishedAt, tableJson)
-    .run();
+  await drizzle(db)
+    .insert(routingTables)
+    .values({ run_id: runId, published_at: publishedAt, table_json: tableJson })
+    .onConflictDoUpdate({
+      target: routingTables.run_id,
+      set: { published_at: publishedAt, table_json: tableJson },
+    });
 }
 
 export async function getLatestRoutingTable(
   db: D1Database
-): Promise<{ run_id: string; published_at: string; table_json: string } | null> {
-  const row = await db
-    .prepare('SELECT * FROM routing_tables ORDER BY published_at DESC LIMIT 1')
-    .first<{ run_id: string; published_at: string; table_json: string }>();
+): Promise<typeof routingTables.$inferSelect | null> {
+  const row = await drizzle(db)
+    .select()
+    .from(routingTables)
+    .orderBy(desc(routingTables.published_at))
+    .limit(1)
+    .get();
   return row ?? null;
 }
 
 export async function getConfigRow(
   db: D1Database
-): Promise<{ config_json: string; updated_at: string; updated_by: string | null } | null> {
-  const row = await db
-    .prepare('SELECT config_json, updated_at, updated_by FROM benchmark_config WHERE id = 1')
-    .first<{ config_json: string; updated_at: string; updated_by: string | null }>();
+): Promise<Omit<typeof benchmarkConfig.$inferSelect, 'id'> | null> {
+  const row = await drizzle(db)
+    .select({
+      config_json: benchmarkConfig.config_json,
+      updated_at: benchmarkConfig.updated_at,
+      updated_by: benchmarkConfig.updated_by,
+    })
+    .from(benchmarkConfig)
+    .where(eq(benchmarkConfig.id, 1))
+    .get();
   return row ?? null;
 }
 
@@ -261,28 +238,15 @@ export async function saveConfigRow(
   updatedAt: string,
   updatedBy: string | null
 ): Promise<void> {
-  await db
-    .prepare(
-      `INSERT OR REPLACE INTO benchmark_config (id, config_json, updated_at, updated_by)
-       VALUES (1, ?1, ?2, ?3)`
-    )
-    .bind(configJson, updatedAt, updatedBy)
-    .run();
+  await drizzle(db)
+    .insert(benchmarkConfig)
+    .values({ id: 1, config_json: configJson, updated_at: updatedAt, updated_by: updatedBy })
+    .onConflictDoUpdate({
+      target: benchmarkConfig.id,
+      set: { config_json: configJson, updated_at: updatedAt, updated_by: updatedBy },
+    });
 }
 
-type LatestSummaryRow = {
-  run_id: string;
-  started_at: string;
-  model: string;
-  tier: string;
-  accuracy: number;
-  avg_cost_usd: number | null;
-  avg_latency_ms: number;
-  p50_latency_ms: number | null;
-  cases: number;
-  errors: number;
-};
-
 // Latest summaries per model for a benchmark kind: for each model, all tiers
 // from the most recent COMPLETED run that included it (mixing tiers across
 // runs would pair incomparable numbers).
@@ -290,17 +254,22 @@ export async function getLatestSummariesByModel(
   db: D1Database,
   kind: BenchmarkKind
 ): Promise<Map<string, BenchmarkModelSummary[]>> {
-  const { results } = await db
-    .prepare(
-      `SELECT ms.run_id, r.started_at, ms.model, ms.tier, ms.accuracy, ms.avg_cost_usd,
-              ms.avg_latency_ms, ms.p50_latency_ms, ms.cases, ms.errors
-       FROM model_summaries ms
-       JOIN benchmark_runs r ON r.id = ms.run_id
-       WHERE r.kind = ?1 AND r.status = 'completed'
-       ORDER BY r.started_at DESC`
-    )
-    .bind(kind)
-    .all<LatestSummaryRow>();
+  const results = await drizzle(db)
+    .select({
+      run_id: modelSummaries.run_id,
+      model: modelSummaries.model,
+      tier: modelSummaries.tier,
+      accuracy: modelSummaries.accuracy,
+      avg_cost_usd: modelSummaries.avg_cost_usd,
+      avg_latency_ms: modelSummaries.avg_latency_ms,
+      p50_latency_ms: modelSummaries.p50_latency_ms,
+      cases: modelSummaries.cases,
+      errors: modelSummaries.errors,
+    })
+    .from(modelSummaries)
+    .innerJoin(benchmarkRuns, eq(benchmarkRuns.id, modelSummaries.run_id))
+    .where(and(eq(benchmarkRuns.kind, kind), eq(benchmarkRuns.status, 'completed')))
+    .orderBy(desc(benchmarkRuns.started_at));
 
   const latestRunByModel = new Map<string, string>();
   for (const row of results) {

From 2d2691f6c4f88911388dc9f56ffe80035f619a8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 12:47:30 +0200
Subject: [PATCH 33/73] refactor(auto-routing-benchmark): normalize D1 schema
 and adopt drizzle-kit migrations

Eliminate all JSON blob columns from the benchmark worker's D1 database:
- Add drizzle-kit, drizzle.config.ts, and pnpm db:generate script
- Replace config_json/runtime_json blobs with dedicated tables
  (config_classifier_models, config_decider_models) and snapshot columns
  on benchmark_runs (min_accuracy, max_concurrency, benchmark_user_id)
- Replace detail_json blob in case_results with explicit diagnostic columns
  (fallback_reason, retried, exit_code, output_prefix, event_count,
  last_event_types)
- Add run_models table for per-run model config snapshots (enqueued flag,
  api kind flags, reasoning_effort)
- Add carried flag to model_summaries (true = prior-run summary copied in
  at startRun for skipped models)
- Explode routing_tables.table_json into routing_table_candidates rows
- Squash old migrations into a single baseline 0000 migration

Rewrite storage layer accordingly: apiKindsToFlags/flagsToApiKinds helpers,
getConfigRows/replaceConfig, insertRun(run, models, carried), getRunWithModels,
saveRoutingTable(table, publishedAt), getLatestRoutingTable returning RoutingTable
with safeParse, getClassifierWinner from D1 directly.

Move pickClassifierWinner to src/winner.ts (pure, no D1 dep).
Add GET /admin/classifier-winner endpoint.
Add ClassifierWinnerResponseSchema to contracts.
KV puts are removed; finalizeRunIfComplete now only deletes the KV keys, so the
auto-routing worker repopulates them, treating KV as a read-through cache.
---
 .../auto-routing-contracts/src/benchmark.ts   |   5 +
 pnpm-lock.yaml                                |   3 +
 .../auto-routing-benchmark/drizzle.config.ts  |   6 +
 .../migrations/0000_supreme_captain_flint.sql |  99 +++
 .../migrations/0001_init.sql                  |  49 --
 .../migrations/0002_carried_results.sql       |   1 -
 .../migrations/meta/0000_snapshot.json        | 628 ++++++++++++++++++
 .../migrations/meta/_journal.json             |  13 +
 services/auto-routing-benchmark/package.json  |   2 +
 .../auto-routing-benchmark/src/admin.test.ts  | 112 ++--
 services/auto-routing-benchmark/src/admin.ts  |  15 +-
 .../auto-routing-benchmark/src/config.test.ts | 102 ++-
 services/auto-routing-benchmark/src/config.ts |  69 +-
 .../auto-routing-benchmark/src/db-schema.ts   | 111 +++-
 .../auto-routing-benchmark/src/db.test.ts     | 218 ++++--
 services/auto-routing-benchmark/src/db.ts     | 438 ++++++++++--
 .../auto-routing-benchmark/src/run.test.ts    |  10 +-
 services/auto-routing-benchmark/src/run.ts    | 246 +++----
 services/auto-routing-benchmark/src/winner.ts |  18 +
 19 files changed, 1749 insertions(+), 396 deletions(-)
 create mode 100644 services/auto-routing-benchmark/drizzle.config.ts
 create mode 100644 services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql
 delete mode 100644 services/auto-routing-benchmark/migrations/0001_init.sql
 delete mode 100644 services/auto-routing-benchmark/migrations/0002_carried_results.sql
 create mode 100644 services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
 create mode 100644 services/auto-routing-benchmark/migrations/meta/_journal.json
 create mode 100644 services/auto-routing-benchmark/src/winner.ts

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 5620736444..61c38bcff6 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -94,3 +94,8 @@ export const ClassifierWinnerSchema = z.object({
 export type ClassifierWinner = z.infer<typeof ClassifierWinnerSchema>;
 
 export const CLASSIFIER_WINNER_KV_KEY = 'classifier_benchmark_winner';
+
+export const ClassifierWinnerResponseSchema = z.object({
+  winner: ClassifierWinnerSchema.nullable(),
+});
+export type ClassifierWinnerResponse = z.infer<typeof ClassifierWinnerResponseSchema>;
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 891de54e82..1182b1ce3c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1543,6 +1543,9 @@ importers:
       '@typescript/native-preview':
         specifier: 'catalog:'
         version: 7.0.0-dev.20260514.1
+      drizzle-kit:
+        specifier: 'catalog:'
+        version: 0.31.10
       typescript:
         specifier: 'catalog:'
         version: 5.9.3
diff --git a/services/auto-routing-benchmark/drizzle.config.ts b/services/auto-routing-benchmark/drizzle.config.ts
new file mode 100644
index 0000000000..3214ffe4b8
--- /dev/null
+++ b/services/auto-routing-benchmark/drizzle.config.ts
@@ -0,0 +1,6 @@
+import { defineConfig } from 'drizzle-kit';
+export default defineConfig({
+  out: './migrations',
+  schema: './src/db-schema.ts',
+  dialect: 'sqlite',
+});
diff --git a/services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql b/services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql
new file mode 100644
index 0000000000..7760e97e5f
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql
@@ -0,0 +1,99 @@
+CREATE TABLE `benchmark_config` (
+	`id` integer PRIMARY KEY NOT NULL,
+	`min_accuracy` real NOT NULL,
+	`max_concurrency` integer NOT NULL,
+	`benchmark_user_id` text,
+	`updated_at` text NOT NULL,
+	`updated_by` text
+);
+--> statement-breakpoint
+CREATE TABLE `benchmark_runs` (
+	`id` text PRIMARY KEY NOT NULL,
+	`kind` text NOT NULL,
+	`status` text NOT NULL,
+	`started_at` text NOT NULL,
+	`completed_at` text,
+	`error` text,
+	`min_accuracy` real NOT NULL,
+	`max_concurrency` integer NOT NULL,
+	`benchmark_user_id` text
+);
+--> statement-breakpoint
+CREATE TABLE `case_results` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`case_id` text NOT NULL,
+	`tier` text,
+	`score` real NOT NULL,
+	`latency_ms` integer NOT NULL,
+	`cost_usd` real,
+	`error` text,
+	`fallback_reason` text,
+	`retried` integer,
+	`exit_code` integer,
+	`output_prefix` text,
+	`event_count` integer,
+	`last_event_types` text,
+	PRIMARY KEY(`run_id`, `model`, `case_id`)
+);
+--> statement-breakpoint
+CREATE INDEX `idx_case_results_run` ON `case_results` (`run_id`);--> statement-breakpoint
+CREATE TABLE `config_classifier_models` (
+	`model` text PRIMARY KEY NOT NULL
+);
+--> statement-breakpoint
+CREATE TABLE `config_decider_models` (
+	`model` text PRIMARY KEY NOT NULL,
+	`reasoning_effort` text,
+	`supports_chat_completions` integer NOT NULL,
+	`supports_messages` integer NOT NULL,
+	`supports_responses` integer NOT NULL
+);
+--> statement-breakpoint
+CREATE TABLE `model_summaries` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`tier` text NOT NULL,
+	`accuracy` real NOT NULL,
+	`avg_cost_usd` real,
+	`avg_latency_ms` real NOT NULL,
+	`p50_latency_ms` real,
+	`cases` integer NOT NULL,
+	`errors` integer NOT NULL,
+	`carried` integer DEFAULT false NOT NULL,
+	PRIMARY KEY(`run_id`, `model`, `tier`)
+);
+--> statement-breakpoint
+CREATE TABLE `routing_table_candidates` (
+	`run_id` text NOT NULL,
+	`tier` text NOT NULL,
+	`rank` integer NOT NULL,
+	`model` text NOT NULL,
+	`accuracy` real NOT NULL,
+	`avg_cost_usd` real,
+	`meets_threshold` integer NOT NULL,
+	`reasoning_effort` text,
+	`supports_chat_completions` integer NOT NULL,
+	`supports_messages` integer NOT NULL,
+	`supports_responses` integer NOT NULL,
+	PRIMARY KEY(`run_id`, `tier`, `rank`)
+);
+--> statement-breakpoint
+CREATE TABLE `routing_tables` (
+	`run_id` text PRIMARY KEY NOT NULL,
+	`published_at` text NOT NULL,
+	`generated_at` text NOT NULL,
+	`min_accuracy` real NOT NULL,
+	`source` text NOT NULL
+);
+--> statement-breakpoint
+CREATE TABLE `run_models` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`enqueued` integer NOT NULL,
+	`reasoning_effort` text,
+	`supports_chat_completions` integer NOT NULL,
+	`supports_messages` integer NOT NULL,
+	`supports_responses` integer NOT NULL,
+	PRIMARY KEY(`run_id`, `model`)
+);
diff --git a/services/auto-routing-benchmark/migrations/0001_init.sql b/services/auto-routing-benchmark/migrations/0001_init.sql
deleted file mode 100644
index 6452dcfd1b..0000000000
--- a/services/auto-routing-benchmark/migrations/0001_init.sql
+++ /dev/null
@@ -1,49 +0,0 @@
-CREATE TABLE benchmark_runs (
-  id TEXT PRIMARY KEY,
-  kind TEXT NOT NULL CHECK (kind IN ('classifier', 'decider')),
-  status TEXT NOT NULL CHECK (status IN ('running', 'completed', 'failed')),
-  started_at TEXT NOT NULL,
-  completed_at TEXT,
-  config_json TEXT NOT NULL,
-  error TEXT
-);
-
-CREATE TABLE case_results (
-  run_id TEXT NOT NULL REFERENCES benchmark_runs(id),
-  model TEXT NOT NULL,
-  case_id TEXT NOT NULL,
-  tier TEXT,
-  score REAL NOT NULL,
-  latency_ms INTEGER NOT NULL,
-  cost_usd REAL,
-  detail_json TEXT,
-  error TEXT,
-  PRIMARY KEY (run_id, model, case_id)
-);
-CREATE INDEX idx_case_results_run ON case_results (run_id);
-
-CREATE TABLE model_summaries (
-  run_id TEXT NOT NULL REFERENCES benchmark_runs(id),
-  model TEXT NOT NULL,
-  tier TEXT NOT NULL,
-  accuracy REAL NOT NULL,
-  avg_cost_usd REAL,
-  avg_latency_ms REAL NOT NULL,
-  p50_latency_ms REAL,
-  cases INTEGER NOT NULL,
-  errors INTEGER NOT NULL,
-  PRIMARY KEY (run_id, model, tier)
-);
-
-CREATE TABLE routing_tables (
-  run_id TEXT PRIMARY KEY REFERENCES benchmark_runs(id),
-  published_at TEXT NOT NULL,
-  table_json TEXT NOT NULL
-);
-
-CREATE TABLE benchmark_config (
-  id INTEGER PRIMARY KEY CHECK (id = 1),
-  config_json TEXT NOT NULL,
-  updated_at TEXT NOT NULL,
-  updated_by TEXT
-);
diff --git a/services/auto-routing-benchmark/migrations/0002_carried_results.sql b/services/auto-routing-benchmark/migrations/0002_carried_results.sql
deleted file mode 100644
index d1c66da8d3..0000000000
--- a/services/auto-routing-benchmark/migrations/0002_carried_results.sql
+++ /dev/null
@@ -1 +0,0 @@
-ALTER TABLE benchmark_runs ADD COLUMN runtime_json TEXT;
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
new file mode 100644
index 0000000000..c297d0ce00
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -0,0 +1,628 @@
+{
+  "version": "6",
+  "dialect": "sqlite",
+  "id": "d78fc8c4-926b-42c0-b876-6877d22e28fe",
+  "prevId": "00000000-0000-0000-0000-000000000000",
+  "tables": {
+    "benchmark_config": {
+      "name": "benchmark_config",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "integer",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "updated_by": {
+          "name": "updated_by",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "benchmark_runs": {
+      "name": "benchmark_runs",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "kind": {
+          "name": "kind",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "status": {
+          "name": "status",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "started_at": {
+          "name": "started_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "completed_at": {
+          "name": "completed_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "case_results": {
+      "name": "case_results",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "case_id": {
+          "name": "case_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "score": {
+          "name": "score",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "latency_ms": {
+          "name": "latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "cost_usd": {
+          "name": "cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "fallback_reason": {
+          "name": "fallback_reason",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "retried": {
+          "name": "retried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "exit_code": {
+          "name": "exit_code",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "output_prefix": {
+          "name": "output_prefix",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "event_count": {
+          "name": "event_count",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "last_event_types": {
+          "name": "last_event_types",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {
+        "idx_case_results_run": {
+          "name": "idx_case_results_run",
+          "columns": [
+            "run_id"
+          ],
+          "isUnique": false
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "case_results_run_id_model_case_id_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "case_id"
+          ],
+          "name": "case_results_run_id_model_case_id_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_classifier_models": {
+      "name": "config_classifier_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_decider_models": {
+      "name": "config_decider_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "supports_chat_completions": {
+          "name": "supports_chat_completions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_messages": {
+          "name": "supports_messages",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_responses": {
+          "name": "supports_responses",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "model_summaries": {
+      "name": "model_summaries",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "avg_latency_ms": {
+          "name": "avg_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "p50_latency_ms": {
+          "name": "p50_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "cases": {
+          "name": "cases",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "errors": {
+          "name": "errors",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "carried": {
+          "name": "carried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "model_summaries_run_id_model_tier_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "tier"
+          ],
+          "name": "model_summaries_run_id_model_tier_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_table_candidates": {
+      "name": "routing_table_candidates",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "rank": {
+          "name": "rank",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "meets_threshold": {
+          "name": "meets_threshold",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "supports_chat_completions": {
+          "name": "supports_chat_completions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_messages": {
+          "name": "supports_messages",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_responses": {
+          "name": "supports_responses",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "routing_table_candidates_run_id_tier_rank_pk": {
+          "columns": [
+            "run_id",
+            "tier",
+            "rank"
+          ],
+          "name": "routing_table_candidates_run_id_tier_rank_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_tables": {
+      "name": "routing_tables",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "published_at": {
+          "name": "published_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "generated_at": {
+          "name": "generated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "source": {
+          "name": "source",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "run_models": {
+      "name": "run_models",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "enqueued": {
+          "name": "enqueued",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "supports_chat_completions": {
+          "name": "supports_chat_completions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_messages": {
+          "name": "supports_messages",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "supports_responses": {
+          "name": "supports_responses",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "run_models_run_id_model_pk": {
+          "columns": [
+            "run_id",
+            "model"
+          ],
+          "name": "run_models_run_id_model_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    }
+  },
+  "views": {},
+  "enums": {},
+  "_meta": {
+    "schemas": {},
+    "tables": {},
+    "columns": {}
+  },
+  "internal": {
+    "indexes": {}
+  }
+}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
new file mode 100644
index 0000000000..cb34f2ea08
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -0,0 +1,13 @@
+{
+  "version": "7",
+  "dialect": "sqlite",
+  "entries": [
+    {
+      "idx": 0,
+      "version": "6",
+      "when": 1781260685397,
+      "tag": "0000_supreme_captain_flint",
+      "breakpoints": true
+    }
+  ]
+}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
index 92c8f9605d..7a38a89ba4 100644
--- a/services/auto-routing-benchmark/package.json
+++ b/services/auto-routing-benchmark/package.json
@@ -9,6 +9,7 @@
     "types": "wrangler types --include-runtime=false",
     "typecheck": "tsgo --noEmit",
     "lint": "pnpm -w exec oxlint --config .oxlintrc.json services/auto-routing-benchmark/src",
+    "db:generate": "drizzle-kit generate",
     "test": "vitest run"
   },
   "dependencies": {
@@ -24,6 +25,7 @@
     "@cloudflare/workers-types": "catalog:",
     "@types/node": "catalog:",
     "@typescript/native-preview": "catalog:",
+    "drizzle-kit": "catalog:",
     "typescript": "catalog:",
     "vitest": "catalog:",
     "wrangler": "catalog:"
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 4218a7f3c8..42b3770e49 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -1,4 +1,5 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
+import type { RoutingTable } from '@kilocode/auto-routing-contracts';
 import { DEFAULT_BENCHMARK_CONFIG } from './config';
 import { app } from './index';
 import type * as DbModule from './db';
@@ -12,10 +13,11 @@ vi.mock('./db', async importOriginal => {
   const actual = await importOriginal<typeof DbModule>();
   return {
     ...actual,
-    getConfigRow: vi.fn(),
-    saveConfigRow: vi.fn(),
+    getConfigRows: vi.fn(),
+    replaceConfig: vi.fn(),
     listRuns: vi.fn(),
     getLatestRoutingTable: vi.fn(),
+    getClassifierWinner: vi.fn(),
     getLatestSummariesByModel: vi.fn(),
     insertRun: vi.fn(),
     markStaleRunsFailed: vi.fn(),
@@ -23,13 +25,14 @@ vi.mock('./db', async importOriginal => {
 });
 
 import {
-  getConfigRow,
+  getConfigRows,
+  getClassifierWinner,
   getLatestRoutingTable,
   getLatestSummariesByModel,
   insertRun,
   listRuns,
   markStaleRunsFailed,
-  saveConfigRow,
+  replaceConfig,
 } from './db';
 
 const tokenGet = vi.fn<() => Promise<string>>();
@@ -39,7 +42,7 @@ const env = {
   INTERNAL_API_SECRET_PROD: { get: tokenGet },
   BENCH_DB: {} as D1Database,
   BENCH_QUEUE: { sendBatch: queueSendBatch },
-  AUTO_ROUTING_CONFIG: { put: vi.fn(), get: vi.fn() },
+  AUTO_ROUTING_CONFIG: { put: vi.fn(), get: vi.fn(), delete: vi.fn() },
 } as unknown as Env;
 
 const executionCtx = {
@@ -82,10 +85,15 @@ function authedPut(path: string, body: unknown, extraHeaders: Record<string, str
 beforeEach(() => {
   vi.clearAllMocks();
   tokenGet.mockResolvedValue('bench-token');
-  vi.mocked(getConfigRow).mockResolvedValue(null);
-  vi.mocked(saveConfigRow).mockResolvedValue(undefined);
+  vi.mocked(getConfigRows).mockResolvedValue({
+    config: null,
+    classifierModels: [],
+    deciderModels: [],
+  });
+  vi.mocked(replaceConfig).mockResolvedValue(undefined);
   vi.mocked(listRuns).mockResolvedValue([]);
   vi.mocked(getLatestRoutingTable).mockResolvedValue(null);
+  vi.mocked(getClassifierWinner).mockResolvedValue(null);
   vi.mocked(getLatestSummariesByModel).mockResolvedValue(new Map());
   vi.mocked(insertRun).mockResolvedValue(undefined);
   vi.mocked(markStaleRunsFailed).mockResolvedValue(undefined);
@@ -116,8 +124,8 @@ describe('auth middleware', () => {
 // ---------------------------------------------------------------------------
 
 describe('GET /admin/config', () => {
-  it('returns defaults when the DB row is absent', async () => {
-    // getConfigRow already returns null by default
+  it('returns defaults when the DB rows are absent', async () => {
+    // getConfigRows already returns null config by default
     const res = await authedGet('/admin/config');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({
@@ -126,22 +134,33 @@ describe('GET /admin/config', () => {
     });
   });
 
-  it('returns the stored config when a DB row exists', async () => {
-    const storedConfig = {
-      ...DEFAULT_BENCHMARK_CONFIG,
-      minAccuracy: 0.9,
-      updatedAt: '2026-06-01T00:00:00.000Z',
-      updatedBy: 'admin@example.com',
-    };
-    vi.mocked(getConfigRow).mockResolvedValueOnce({
-      config_json: JSON.stringify(storedConfig),
-      updated_at: '2026-06-01T00:00:00.000Z',
-      updated_by: 'admin@example.com',
+  it('returns the stored config when DB rows exist', async () => {
+    const classifierModels = ['some/model'];
+    const deciderModels = DEFAULT_BENCHMARK_CONFIG.deciderModels.map(m => ({
+      model: m.id,
+      reasoning_effort: null,
+      supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
+      supports_messages: m.supportedApiKinds.includes('messages'),
+      supports_responses: m.supportedApiKinds.includes('responses'),
+    }));
+    vi.mocked(getConfigRows).mockResolvedValueOnce({
+      config: {
+        id: 1,
+        min_accuracy: 0.9,
+        max_concurrency: 4,
+        benchmark_user_id: null,
+        updated_at: '2026-06-01T00:00:00.000Z',
+        updated_by: 'admin@example.com',
+      },
+      classifierModels,
+      deciderModels,
     });
 
     const res = await authedGet('/admin/config');
     expect(res.status).toBe(200);
-    const body = (await res.json()) as { config: typeof storedConfig };
+    const body = (await res.json()) as {
+      config: { minAccuracy: number; updatedBy: string | null };
+    };
     expect(body.config.minAccuracy).toBe(0.9);
     expect(body.config.updatedBy).toBe('admin@example.com');
   });
@@ -161,8 +180,6 @@ describe('PUT /admin/config', () => {
       },
       body: 'not json {{{',
     });
-    // Malformed JSON surfaces via the framework error handler (same behavior
-    // as the other zodJsonValidator-based services).
     expect(res.status).toBe(500);
   });
 
@@ -173,7 +190,7 @@ describe('PUT /admin/config', () => {
       success: false,
       error: 'Invalid benchmark config',
     });
-    expect(saveConfigRow).not.toHaveBeenCalled();
+    expect(replaceConfig).not.toHaveBeenCalled();
   });
 
   it('persists a valid config and returns it with defaults', async () => {
@@ -193,18 +210,16 @@ describe('PUT /admin/config', () => {
       config: { minAccuracy: number; updatedBy: string | null; updatedAt: string | null };
       defaults: typeof DEFAULT_BENCHMARK_CONFIG;
     };
-    // Returned config carries the stamped fields.
     expect(body.config.minAccuracy).toBe(0.85);
     expect(body.config.updatedBy).toBe('igor@kilocode.ai');
     expect(typeof body.config.updatedAt).toBe('string');
     expect(body.defaults).toEqual(DEFAULT_BENCHMARK_CONFIG);
 
-    // The row was persisted with the stamped config and updatedBy.
-    expect(saveConfigRow).toHaveBeenCalledOnce();
-    const [, configJson, updatedAt, updatedBy] = vi.mocked(saveConfigRow).mock.calls[0];
-    expect(JSON.parse(configJson).minAccuracy).toBe(0.85);
-    expect(typeof updatedAt).toBe('string');
-    expect(updatedBy).toBe('igor@kilocode.ai');
+    expect(replaceConfig).toHaveBeenCalledOnce();
+    const [, configArg] = vi.mocked(replaceConfig).mock.calls[0];
+    expect(configArg.min_accuracy).toBe(0.85);
+    expect(typeof configArg.updated_at).toBe('string');
+    expect(configArg.updated_by).toBe('igor@kilocode.ai');
   });
 });
 
@@ -214,7 +229,6 @@ describe('PUT /admin/config', () => {
 
 describe('GET /admin/runs', () => {
   it('returns an empty runs array when the table is empty', async () => {
-    // listRuns returns [] by default
     const res = await authedGet('/admin/runs');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({ runs: [] });
@@ -235,8 +249,6 @@ describe('POST /admin/runs', () => {
       },
       body: '<<<',
     });
-    // Malformed JSON surfaces via the framework error handler (same behavior
-    // as the other zodJsonValidator-based services).
     expect(res.status).toBe(500);
   });
 
@@ -268,7 +280,6 @@ describe('POST /admin/runs', () => {
 
 describe('GET /admin/routing-table', () => {
   it('returns {table: null, publishedAt: null} when no rows exist', async () => {
-    // getLatestRoutingTable already returns null by default
     const res = await authedGet('/admin/routing-table');
     expect(res.status).toBe(200);
     await expect(res.json()).resolves.toEqual({ table: null, publishedAt: null });
@@ -290,9 +301,8 @@ describe('GET /admin/routing-table', () => {
       tiers: { low: [candidate], medium: [candidate], high: [candidate] },
     };
     vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({
-      run_id: 'run-123',
-      published_at: '2026-06-01T10:00:00.000Z',
-      table_json: JSON.stringify(tableData),
+      table: tableData as RoutingTable,
+      publishedAt: '2026-06-01T10:00:00.000Z',
     });
 
     const res = await authedGet('/admin/routing-table');
@@ -303,3 +313,29 @@ describe('GET /admin/routing-table', () => {
     });
   });
 });
+
+// ---------------------------------------------------------------------------
+// GET /admin/classifier-winner
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/classifier-winner', () => {
+  it('returns {winner: null} when no completed classifier run exists', async () => {
+    const res = await authedGet('/admin/classifier-winner');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ winner: null });
+  });
+
+  it('returns the winner when a completed classifier run exists', async () => {
+    const winner = {
+      model: 'google/gemini-2.5-flash-lite',
+      runId: 'classifier-2026-06-01T00-00-00-000Z',
+      accuracy: 0.92,
+      generatedAt: '2026-06-01T10:00:00.000Z',
+    };
+    vi.mocked(getClassifierWinner).mockResolvedValueOnce(winner);
+
+    const res = await authedGet('/admin/classifier-winner');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ winner });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index a539850040..e88784d434 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -1,7 +1,6 @@
 import * as z from 'zod';
 import {
   BenchmarkConfigSchema,
-  RoutingTableSchema,
   StartBenchmarkRunRequestSchema,
   type BenchmarkRun,
 } from '@kilocode/auto-routing-contracts';
@@ -10,7 +9,7 @@ import type { Hono } from 'hono';
 import { DEFAULT_BENCHMARK_CONFIG, getBenchmarkConfig, saveBenchmarkConfig } from './config';
 import { debugRunCli } from './cli-runner';
 import { fetchBenchmarkUserToken, startRun } from './run';
-import { getLatestRoutingTable, listRuns } from './db';
+import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db';
 import type { HonoEnv } from './hono-env';
 
 const DebugCliRequestSchema = z.object({
@@ -53,15 +52,17 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
 
   app.get('/admin/routing-table', async c => {
     const latest = await getLatestRoutingTable(c.env.BENCH_DB);
-    // Validated at publish time, but re-validate before crossing the contract
-    // boundary so a schema change can never surface a stale incompatible table.
-    const parsed = latest ? RoutingTableSchema.safeParse(JSON.parse(latest.table_json)) : null;
     return c.json({
-      table: parsed?.success ? parsed.data : null,
-      publishedAt: parsed?.success ? (latest?.published_at ?? null) : null,
+      table: latest?.table ?? null,
+      publishedAt: latest?.publishedAt ?? null,
     });
   });
 
+  app.get('/admin/classifier-winner', async c => {
+    const winner = await getClassifierWinner(c.env.BENCH_DB);
+    return c.json({ winner });
+  });
+
   // Runs one ad-hoc prompt through the kilo CLI container and returns raw
   // (truncated) stdout lines plus the parsed result. Diagnostic-only.
   app.post(
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
index 32c04e3a86..1d2bec0b40 100644
--- a/services/auto-routing-benchmark/src/config.test.ts
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -1,41 +1,89 @@
 import { describe, expect, it } from 'vitest';
-import { DEFAULT_BENCHMARK_CONFIG, parseConfigJson } from './config';
+import { DEFAULT_BENCHMARK_CONFIG, mapConfigRows } from './config';
+import type { ConfigDeciderModelRow } from './db';
 
-describe('parseConfigJson', () => {
-  it('returns defaults on null', () => {
-    expect(parseConfigJson(null)).toEqual(DEFAULT_BENCHMARK_CONFIG);
-  });
+const defaultDeciderRows: ConfigDeciderModelRow[] = DEFAULT_BENCHMARK_CONFIG.deciderModels.map(
+  m => ({
+    model: m.id,
+    reasoning_effort: m.reasoningEffort ?? null,
+    supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
+    supports_messages: m.supportedApiKinds.includes('messages'),
+    supports_responses: m.supportedApiKinds.includes('responses'),
+  })
+);
+
+const defaultConfigRow = {
+  id: 1 as const,
+  min_accuracy: DEFAULT_BENCHMARK_CONFIG.minAccuracy,
+  max_concurrency: DEFAULT_BENCHMARK_CONFIG.maxConcurrency,
+  benchmark_user_id: DEFAULT_BENCHMARK_CONFIG.benchmarkUserId,
+  updated_at: '2026-06-01T00:00:00.000Z',
+  updated_by: null,
+};
 
-  it('returns defaults on invalid JSON string', () => {
-    expect(parseConfigJson('not valid json {{{')).toEqual(DEFAULT_BENCHMARK_CONFIG);
+describe('mapConfigRows', () => {
+  it('returns defaults when config row is null', () => {
+    expect(mapConfigRows(null, [], [])).toEqual(DEFAULT_BENCHMARK_CONFIG);
   });
 
-  it('returns defaults on schema-invalid JSON', () => {
-    const invalid = JSON.stringify({ classifierModels: 'not-an-array', minAccuracy: 'bad' });
-    expect(parseConfigJson(invalid)).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  it('returns defaults when classifierModels array is empty', () => {
+    expect(mapConfigRows(defaultConfigRow, [], defaultDeciderRows)).toEqual(
+      DEFAULT_BENCHMARK_CONFIG
+    );
   });
 
-  it('returns defaults on empty object', () => {
-    expect(parseConfigJson('{}')).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  it('returns defaults when deciderModels array is empty', () => {
+    expect(mapConfigRows(defaultConfigRow, DEFAULT_BENCHMARK_CONFIG.classifierModels, [])).toEqual(
+      DEFAULT_BENCHMARK_CONFIG
+    );
   });
 
-  it('round-trips a valid config', () => {
-    const config = {
-      ...DEFAULT_BENCHMARK_CONFIG,
-      classifierModels: ['some/model'],
-      minAccuracy: 0.8,
-      maxConcurrency: 2,
-      updatedAt: '2026-01-01T00:00:00.000Z',
-      updatedBy: 'admin@example.com',
+  it('maps a full config row set to BenchmarkConfig', () => {
+    const configRow = {
+      id: 1 as const,
+      min_accuracy: 0.85,
+      max_concurrency: 8,
+      benchmark_user_id: 'user-123',
+      updated_at: '2026-06-01T00:00:00.000Z',
+      updated_by: 'admin@example.com',
     };
-    expect(parseConfigJson(JSON.stringify(config))).toEqual(config);
+    const classifierModels = ['some/model-a', 'some/model-b'];
+    const deciderRows: ConfigDeciderModelRow[] = [
+      {
+        model: 'some/decider',
+        reasoning_effort: 'high',
+        supports_chat_completions: true,
+        supports_messages: true,
+        supports_responses: false,
+      },
+    ];
+
+    const result = mapConfigRows(configRow, classifierModels, deciderRows);
+
+    expect(result.minAccuracy).toBe(0.85);
+    expect(result.maxConcurrency).toBe(8);
+    expect(result.benchmarkUserId).toBe('user-123');
+    expect(result.updatedAt).toBe('2026-06-01T00:00:00.000Z');
+    expect(result.updatedBy).toBe('admin@example.com');
+    expect(result.classifierModels).toEqual(classifierModels);
+    expect(result.deciderModels).toHaveLength(1);
+    expect(result.deciderModels[0].id).toBe('some/decider');
+    expect(result.deciderModels[0].reasoningEffort).toBe('high');
+    expect(result.deciderModels[0].supportedApiKinds).toEqual(['chat_completions', 'messages']);
   });
 
-  it('returns defaults when classifierModels is empty array (schema violation)', () => {
-    const invalid = JSON.stringify({
-      ...DEFAULT_BENCHMARK_CONFIG,
-      classifierModels: [],
-    });
-    expect(parseConfigJson(invalid)).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  it('round-trips the default config through rows', () => {
+    const result = mapConfigRows(
+      defaultConfigRow,
+      DEFAULT_BENCHMARK_CONFIG.classifierModels,
+      defaultDeciderRows
+    );
+    // updatedAt/updatedBy come from the row, not DEFAULT_BENCHMARK_CONFIG (which has null)
+    expect(result.minAccuracy).toBe(DEFAULT_BENCHMARK_CONFIG.minAccuracy);
+    expect(result.maxConcurrency).toBe(DEFAULT_BENCHMARK_CONFIG.maxConcurrency);
+    expect(result.classifierModels).toEqual(DEFAULT_BENCHMARK_CONFIG.classifierModels);
+    expect(result.deciderModels.map(m => m.id)).toEqual(
+      DEFAULT_BENCHMARK_CONFIG.deciderModels.map(m => m.id)
+    );
   });
 });
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 9e04172801..5ede546b70 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -1,5 +1,5 @@
-import { BenchmarkConfigSchema, type BenchmarkConfig } from '@kilocode/auto-routing-contracts';
-import { getConfigRow, saveConfigRow } from './db';
+import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts';
+import { apiKindsToFlags, getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db';
 
 export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
   classifierModels: [
@@ -38,20 +38,46 @@ export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
   updatedBy: null,
 };
 
-// Pure so the fallback path is unit-testable without D1.
-export function parseConfigJson(raw: string | null): BenchmarkConfig {
-  if (raw === null) return DEFAULT_BENCHMARK_CONFIG;
-  try {
-    const parsed = BenchmarkConfigSchema.safeParse(JSON.parse(raw));
-    return parsed.success ? parsed.data : DEFAULT_BENCHMARK_CONFIG;
-  } catch {
+// Maps the three normalized config tables to the BenchmarkConfig contract.
+// Returns DEFAULT_BENCHMARK_CONFIG when the config row is missing or either model list is empty.
+export function mapConfigRows(
+  configRow: {
+    min_accuracy: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+    updated_at: string;
+    updated_by: string | null;
+  } | null,
+  classifierModels: string[],
+  deciderModelRows: ConfigDeciderModelRow[]
+): BenchmarkConfig {
+  if (configRow === null || classifierModels.length === 0 || deciderModelRows.length === 0) {
     return DEFAULT_BENCHMARK_CONFIG;
   }
+
+  return {
+    classifierModels,
+    deciderModels: deciderModelRows.map(r => ({
+      id: r.model,
+      supportedApiKinds: [
+        ...(r.supports_chat_completions ? (['chat_completions'] as const) : []),
+        ...(r.supports_messages ? (['messages'] as const) : []),
+        ...(r.supports_responses ? (['responses'] as const) : []),
+      ],
+      reasoningEffort:
+        r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'],
+    })),
+    minAccuracy: configRow.min_accuracy,
+    maxConcurrency: configRow.max_concurrency,
+    benchmarkUserId: configRow.benchmark_user_id,
+    updatedAt: configRow.updated_at,
+    updatedBy: configRow.updated_by,
+  };
 }
 
 export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig> {
-  const row = await getConfigRow(db);
-  return parseConfigJson(row?.config_json ?? null);
+  const { config, classifierModels, deciderModels } = await getConfigRows(db);
+  return mapConfigRows(config, classifierModels, deciderModels);
 }
 
 export async function saveBenchmarkConfig(
@@ -61,6 +87,25 @@ export async function saveBenchmarkConfig(
 ): Promise<BenchmarkConfig> {
   const updatedAt = new Date().toISOString();
   const stamped: BenchmarkConfig = { ...config, updatedAt, updatedBy };
-  await saveConfigRow(db, JSON.stringify(stamped), updatedAt, updatedBy);
+
+  const deciderModelRows: ConfigDeciderModelRow[] = config.deciderModels.map(m => ({
+    model: m.id,
+    reasoning_effort: m.reasoningEffort ?? null,
+    ...apiKindsToFlags(m.supportedApiKinds),
+  }));
+
+  await replaceConfig(
+    db,
+    {
+      min_accuracy: config.minAccuracy,
+      max_concurrency: config.maxConcurrency,
+      benchmark_user_id: config.benchmarkUserId,
+      updated_at: updatedAt,
+      updated_by: updatedBy,
+    },
+    config.classifierModels,
+    deciderModelRows
+  );
+
   return stamped;
 }
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index d18177d154..0436c28ff6 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -1,8 +1,29 @@
 import { index, integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core';
 import type { BenchmarkKind } from '@kilocode/auto-routing-contracts';
 
-// Mirrors migrations/*.sql (the source of truth, applied via wrangler). Keep
-// the two in sync when adding columns.
+// Migrations are generated via `pnpm db:generate` (drizzle-kit) and applied
+// via `wrangler d1 migrations apply`.
+
+export const benchmarkConfig = sqliteTable('benchmark_config', {
+  id: integer('id').primaryKey(),
+  min_accuracy: real('min_accuracy').notNull(),
+  max_concurrency: integer('max_concurrency').notNull(),
+  benchmark_user_id: text('benchmark_user_id'),
+  updated_at: text('updated_at').notNull(),
+  updated_by: text('updated_by'),
+});
+
+export const configClassifierModels = sqliteTable('config_classifier_models', {
+  model: text('model').primaryKey(),
+});
+
+export const configDeciderModels = sqliteTable('config_decider_models', {
+  model: text('model').primaryKey(),
+  reasoning_effort: text('reasoning_effort'),
+  supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
+  supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
+  supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
+});
 
 export const benchmarkRuns = sqliteTable('benchmark_runs', {
   id: text('id').primaryKey(),
@@ -10,31 +31,26 @@ export const benchmarkRuns = sqliteTable('benchmark_runs', {
   status: text('status').$type<'running' | 'completed' | 'failed'>().notNull(),
   started_at: text('started_at').notNull(),
   completed_at: text('completed_at'),
-  config_json: text('config_json').notNull(),
-  // Run-scoped execution state: which models were actually enqueued and the
-  // summaries carried forward for models skipped because they already had
-  // results. Null on rows created before the column existed.
-  runtime_json: text('runtime_json'),
   error: text('error'),
+  // Config snapshot taken at startRun time so mid-run edits can't skew results.
+  min_accuracy: real('min_accuracy').notNull(),
+  max_concurrency: integer('max_concurrency').notNull(),
+  benchmark_user_id: text('benchmark_user_id'),
 });
 
-export const caseResults = sqliteTable(
-  'case_results',
+export const runModels = sqliteTable(
+  'run_models',
   {
     run_id: text('run_id').notNull(),
     model: text('model').notNull(),
-    case_id: text('case_id').notNull(),
-    tier: text('tier'),
-    score: real('score').notNull(),
-    latency_ms: integer('latency_ms').notNull(),
-    cost_usd: real('cost_usd'),
-    detail_json: text('detail_json'),
-    error: text('error'),
+    // enqueued=false means the model was skipped (had prior results).
+    enqueued: integer('enqueued', { mode: 'boolean' }).notNull(),
+    reasoning_effort: text('reasoning_effort'),
+    supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
+    supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
+    supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
   },
-  table => [
-    primaryKey({ columns: [table.run_id, table.model, table.case_id] }),
-    index('idx_case_results_run').on(table.run_id),
-  ]
+  table => [primaryKey({ columns: [table.run_id, table.model] })]
 );
 
 export const modelSummaries = sqliteTable(
@@ -49,19 +65,60 @@ export const modelSummaries = sqliteTable(
     p50_latency_ms: real('p50_latency_ms'),
     cases: integer('cases').notNull(),
     errors: integer('errors').notNull(),
+    // carried=true rows are prior-run summaries copied in at startRun for skipped models.
+    carried: integer('carried', { mode: 'boolean' }).notNull().default(false),
   },
   table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })]
 );
 
+export const caseResults = sqliteTable(
+  'case_results',
+  {
+    run_id: text('run_id').notNull(),
+    model: text('model').notNull(),
+    case_id: text('case_id').notNull(),
+    tier: text('tier'),
+    score: real('score').notNull(),
+    latency_ms: integer('latency_ms').notNull(),
+    cost_usd: real('cost_usd'),
+    error: text('error'),
+    // Classifier diagnostics.
+    fallback_reason: text('fallback_reason'),
+    retried: integer('retried', { mode: 'boolean' }),
+    // Decider diagnostics.
+    exit_code: integer('exit_code'),
+    output_prefix: text('output_prefix'),
+    event_count: integer('event_count'),
+    last_event_types: text('last_event_types'),
+  },
+  table => [
+    primaryKey({ columns: [table.run_id, table.model, table.case_id] }),
+    index('idx_case_results_run').on(table.run_id),
+  ]
+);
+
 export const routingTables = sqliteTable('routing_tables', {
   run_id: text('run_id').primaryKey(),
   published_at: text('published_at').notNull(),
-  table_json: text('table_json').notNull(),
+  generated_at: text('generated_at').notNull(),
+  min_accuracy: real('min_accuracy').notNull(),
+  source: text('source').notNull(),
 });
 
-export const benchmarkConfig = sqliteTable('benchmark_config', {
-  id: integer('id').primaryKey(),
-  config_json: text('config_json').notNull(),
-  updated_at: text('updated_at').notNull(),
-  updated_by: text('updated_by'),
-});
+export const routingTableCandidates = sqliteTable(
+  'routing_table_candidates',
+  {
+    run_id: text('run_id').notNull(),
+    tier: text('tier').notNull(),
+    rank: integer('rank').notNull(),
+    model: text('model').notNull(),
+    accuracy: real('accuracy').notNull(),
+    avg_cost_usd: real('avg_cost_usd'),
+    meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
+    reasoning_effort: text('reasoning_effort'),
+    supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
+    supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
+    supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
+  },
+  table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })]
+);
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index bce49d937c..17a5d10133 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -1,7 +1,100 @@
 import { describe, it, expect } from 'vitest';
-import { mapSummaryRow, mapRunRow } from './db';
+import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
+import {
+  apiKindsToFlags,
+  flagsToApiKinds,
+  mapRunRow,
+  mapSummaryRow,
+  routingTableToRows,
+  rowsToRoutingTable,
+} from './db';
 import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
 
+// ---------------------------------------------------------------------------
+// apiKindsToFlags / flagsToApiKinds round-trip
+// ---------------------------------------------------------------------------
+
+describe('apiKindsToFlags', () => {
+  it('maps all three kinds to true when all present', () => {
+    expect(apiKindsToFlags(['chat_completions', 'messages', 'responses'])).toEqual({
+      supports_chat_completions: true,
+      supports_messages: true,
+      supports_responses: true,
+    });
+  });
+
+  it('maps an empty array to all false', () => {
+    expect(apiKindsToFlags([])).toEqual({
+      supports_chat_completions: false,
+      supports_messages: false,
+      supports_responses: false,
+    });
+  });
+
+  it('maps a single kind correctly', () => {
+    expect(apiKindsToFlags(['chat_completions'])).toEqual({
+      supports_chat_completions: true,
+      supports_messages: false,
+      supports_responses: false,
+    });
+  });
+});
+
+describe('flagsToApiKinds', () => {
+  it('returns all three kinds when all flags are true', () => {
+    expect(
+      flagsToApiKinds({
+        supports_chat_completions: true,
+        supports_messages: true,
+        supports_responses: true,
+      })
+    ).toEqual(['chat_completions', 'messages', 'responses']);
+  });
+
+  it('returns empty array when all flags are false', () => {
+    expect(
+      flagsToApiKinds({
+        supports_chat_completions: false,
+        supports_messages: false,
+        supports_responses: false,
+      })
+    ).toEqual([]);
+  });
+
+  it('returns only the set flags in order: chat_completions, messages, responses', () => {
+    expect(
+      flagsToApiKinds({
+        supports_chat_completions: false,
+        supports_messages: true,
+        supports_responses: true,
+      })
+    ).toEqual(['messages', 'responses']);
+  });
+});
+
+describe('apiKindsToFlags / flagsToApiKinds round-trip', () => {
+  const cases: Parameters<typeof apiKindsToFlags>[0][] = [
+    [],
+    ['chat_completions'],
+    ['messages'],
+    ['responses'],
+    ['chat_completions', 'messages'],
+    ['chat_completions', 'responses'],
+    ['messages', 'responses'],
+    ['chat_completions', 'messages', 'responses'],
+  ];
+
+  for (const kinds of cases) {
+    it(`round-trips [${kinds.join(', ')}]`, () => {
+      expect(flagsToApiKinds(apiKindsToFlags(kinds))).toEqual(kinds);
+    });
+  }
+});
+
+// ---------------------------------------------------------------------------
+// mapSummaryRow
+// ---------------------------------------------------------------------------
+
 describe('mapSummaryRow', () => {
   it('maps snake_case columns to camelCase BenchmarkModelSummary', () => {
     const row = {
@@ -14,6 +107,7 @@ describe('mapSummaryRow', () => {
       p50_latency_ms: 300.0,
       cases: 50,
       errors: 2,
+      carried: false,
     };
     const result = mapSummaryRow(row);
     expect(result).toEqual<BenchmarkModelSummary>({
@@ -39,6 +133,7 @@ describe('mapSummaryRow', () => {
       p50_latency_ms: null,
       cases: 30,
       errors: 0,
+      carried: false,
     };
     const result = mapSummaryRow(row);
     expect(result.avgCostUsd).toBeNull();
@@ -48,6 +143,10 @@ describe('mapSummaryRow', () => {
   });
 });
 
+// ---------------------------------------------------------------------------
+// mapRunRow
+// ---------------------------------------------------------------------------
+
 describe('mapRunRow', () => {
   it('maps a RunRow and attaches its summaries', () => {
     const runRow = {
@@ -56,9 +155,10 @@ describe('mapRunRow', () => {
       status: 'completed' as const,
       started_at: '2026-06-10T04:10:00.000Z',
       completed_at: '2026-06-10T04:25:00.000Z',
-      config_json: '{}',
-      runtime_json: null,
       error: null,
+      min_accuracy: 0.7,
+      max_concurrency: 4,
+      benchmark_user_id: null,
     };
     const summaries: BenchmarkModelSummary[] = [
       {
@@ -90,54 +190,84 @@ describe('mapRunRow', () => {
       status: 'running' as const,
       started_at: '2026-06-11T05:10:00.000Z',
       completed_at: null,
-      config_json: '{}',
-      runtime_json: null,
       error: null,
+      min_accuracy: 0.7,
+      max_concurrency: 4,
+      benchmark_user_id: null,
     };
     const result = mapRunRow(runRow, []);
     expect(result.summaries).toEqual([]);
     expect(result.completedAt).toBeNull();
   });
+});
 
-  it('summaries are attached to the correct run (not mixed up)', () => {
-    const runRow1 = {
-      id: 'run-1',
-      kind: 'classifier' as const,
-      status: 'completed' as const,
-      started_at: '2026-06-01T04:10:00.000Z',
-      completed_at: '2026-06-01T04:20:00.000Z',
-      config_json: '{}',
-      runtime_json: null,
-      error: null,
-    };
-    const runRow2 = {
-      id: 'run-2',
-      kind: 'decider' as const,
-      status: 'failed' as const,
-      started_at: '2026-06-02T05:10:00.000Z',
-      completed_at: null,
-      config_json: '{}',
-      runtime_json: null,
-      error: 'timed out',
-    };
-    const summariesForRun1: BenchmarkModelSummary[] = [
-      {
-        model: 'model-a',
-        tier: '*',
-        accuracy: 0.9,
-        avgCostUsd: null,
-        avgLatencyMs: 200,
-        p50LatencyMs: null,
-        cases: 10,
-        errors: 1,
-      },
-    ];
-    const result1 = mapRunRow(runRow1, summariesForRun1);
-    const result2 = mapRunRow(runRow2, []);
+// ---------------------------------------------------------------------------
+// routingTableToRows / rowsToRoutingTable round-trip
+// ---------------------------------------------------------------------------
+
+const candidate = (model: string): RankedCandidate => ({
+  model,
+  accuracy: 0.9,
+  avgCostUsd: 0.001,
+  meetsThreshold: true,
+  supportedApiKinds: ['chat_completions', 'messages'],
+  reasoningEffort: null,
+});
+
+const sampleTable: RoutingTable = {
+  version: 'run-test-1',
+  generatedAt: '2026-06-01T10:00:00.000Z',
+  minAccuracy: 0.7,
+  source: 'benchmark',
+  tiers: {
+    low: [candidate('model-a'), candidate('model-b')],
+    medium: [candidate('model-c')],
+    high: [candidate('model-a')],
+  },
+};
+
+describe('routingTableToRows', () => {
+  it('produces a tableRow with the correct scalar fields', () => {
+    const { tableRow } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    expect(tableRow.run_id).toBe('run-test-1');
+    expect(tableRow.published_at).toBe('2026-06-01T11:00:00.000Z');
+    expect(tableRow.generated_at).toBe('2026-06-01T10:00:00.000Z');
+    expect(tableRow.min_accuracy).toBe(0.7);
+    expect(tableRow.source).toBe('benchmark');
+  });
+
+  it('assigns rank 0,1 for the two low-tier candidates', () => {
+    const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    const lowRows = candidateRows.filter(r => r.tier === 'low').sort((a, b) => a.rank - b.rank);
+    expect(lowRows).toHaveLength(2);
+    expect(lowRows[0].model).toBe('model-a');
+    expect(lowRows[0].rank).toBe(0);
+    expect(lowRows[1].model).toBe('model-b');
+    expect(lowRows[1].rank).toBe(1);
+  });
+
+  it('maps supportedApiKinds to boolean flags', () => {
+    const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    const row = candidateRows[0];
+    expect(row.supports_chat_completions).toBe(true);
+    expect(row.supports_messages).toBe(true);
+    expect(row.supports_responses).toBe(false);
+  });
+});
+
+describe('rowsToRoutingTable', () => {
+  it('round-trips: rowsToRoutingTable(routingTableToRows(table)) === table', () => {
+    const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    const reassembled = rowsToRoutingTable(tableRow, candidateRows);
+    expect(reassembled).toEqual(sampleTable);
+  });
 
-    expect(result1.summaries).toHaveLength(1);
-    expect(result1.summaries[0].model).toBe('model-a');
-    expect(result2.summaries).toHaveLength(0);
-    expect(result2.error).toBe('timed out');
+  it('preserves candidate order within each tier', () => {
+    const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    // Reverse candidateRows (a deterministic "shuffle") to verify rank-based sorting.
+    const shuffled = [...candidateRows].reverse();
+    const reassembled = rowsToRoutingTable(tableRow, shuffled);
+    expect(reassembled.tiers.low[0].model).toBe('model-a');
+    expect(reassembled.tiers.low[1].model).toBe('model-b');
   });
 });
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 1b6314dc30..3096bd0fc8 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -2,20 +2,66 @@ import type {
   BenchmarkKind,
   BenchmarkModelSummary,
   BenchmarkRun,
+  ClassifierWinner,
+  RankedCandidate,
+  RoutingTable,
 } from '@kilocode/auto-routing-contracts';
+import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
 import { and, count, desc, eq, inArray, lt } from 'drizzle-orm';
 import { drizzle } from 'drizzle-orm/d1';
 import {
   benchmarkConfig,
   benchmarkRuns,
   caseResults,
+  configClassifierModels,
+  configDeciderModels,
   modelSummaries,
+  routingTableCandidates,
   routingTables,
+  runModels,
 } from './db-schema';
+import { pickClassifierWinner } from './winner';
 
 export type CaseResultRow = typeof caseResults.$inferSelect;
 export type RunRow = typeof benchmarkRuns.$inferSelect;
+export type RunModelRow = typeof runModels.$inferSelect;
+export type ConfigDeciderModelRow = typeof configDeciderModels.$inferSelect;
 type ModelSummaryRow = typeof modelSummaries.$inferSelect;
+type ApiKind = 'chat_completions' | 'messages' | 'responses';
+
+// ---------------------------------------------------------------------------
+// ApiKind flag helpers
+// ---------------------------------------------------------------------------
+
+const ALL_API_KINDS: ApiKind[] = ['chat_completions', 'messages', 'responses'];
+
+export function apiKindsToFlags(kinds: readonly ApiKind[]): {
+  supports_chat_completions: boolean;
+  supports_messages: boolean;
+  supports_responses: boolean;
+} {
+  return {
+    supports_chat_completions: kinds.includes('chat_completions'),
+    supports_messages: kinds.includes('messages'),
+    supports_responses: kinds.includes('responses'),
+  };
+}
+
+export function flagsToApiKinds(flags: {
+  supports_chat_completions: boolean;
+  supports_messages: boolean;
+  supports_responses: boolean;
+}): ApiKind[] {
+  return ALL_API_KINDS.filter(k => {
+    if (k === 'chat_completions') return flags.supports_chat_completions;
+    if (k === 'messages') return flags.supports_messages;
+    return flags.supports_responses;
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Row mapping helpers
+// ---------------------------------------------------------------------------
 
 export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
   return {
@@ -42,35 +88,145 @@ export function mapRunRow(row: RunRow, summaries: BenchmarkModelSummary[]): Benc
   };
 }
 
+// ---------------------------------------------------------------------------
+// Config
+// ---------------------------------------------------------------------------
+
+export async function getConfigRows(db: D1Database): Promise<{
+  config: typeof benchmarkConfig.$inferSelect | null;
+  classifierModels: string[];
+  deciderModels: ConfigDeciderModelRow[];
+}> {
+  const orm = drizzle(db);
+  const [configRows, classifierRows, deciderRows] = await Promise.all([
+    orm.select().from(benchmarkConfig).where(eq(benchmarkConfig.id, 1)).limit(1),
+    orm.select().from(configClassifierModels),
+    orm.select().from(configDeciderModels),
+  ]);
+  return {
+    config: configRows[0] ?? null,
+    classifierModels: classifierRows.map(r => r.model),
+    deciderModels: deciderRows,
+  };
+}
+
+export async function replaceConfig(
+  db: D1Database,
+  config: {
+    min_accuracy: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+    updated_at: string;
+    updated_by: string | null;
+  },
+  classifierModels: string[],
+  deciderModels: ConfigDeciderModelRow[]
+): Promise<void> {
+  const orm = drizzle(db);
+  // Build the batch as a plain array; the cast is required because drizzle's
+  // batch type is a readonly non-empty tuple, but we populate it conditionally.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const stmts: any[] = [
+    orm
+      .insert(benchmarkConfig)
+      .values({ id: 1, ...config })
+      .onConflictDoUpdate({
+        target: benchmarkConfig.id,
+        set: config,
+      }),
+    orm.delete(configClassifierModels),
+    orm.delete(configDeciderModels),
+  ];
+  if (classifierModels.length > 0) {
+    stmts.push(
+      orm.insert(configClassifierModels).values(classifierModels.map(m => ({ model: m })))
+    );
+  }
+  if (deciderModels.length > 0) {
+    stmts.push(orm.insert(configDeciderModels).values(deciderModels));
+  }
+  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
+}
+
+// ---------------------------------------------------------------------------
+// Runs
+// ---------------------------------------------------------------------------
+
 export async function insertRun(
   db: D1Database,
   run: {
     id: string;
     kind: BenchmarkKind;
     startedAt: string;
-    configJson: string;
-    runtimeJson: string;
-  }
+    min_accuracy: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+  },
+  models: RunModelRow[],
+  carriedSummaries: BenchmarkModelSummary[]
 ): Promise<void> {
-  await drizzle(db).insert(benchmarkRuns).values({
+  const orm = drizzle(db);
+  const insertRunStmt = orm.insert(benchmarkRuns).values({
     id: run.id,
     kind: run.kind,
     status: 'running',
     started_at: run.startedAt,
-    config_json: run.configJson,
-    runtime_json: run.runtimeJson,
+    min_accuracy: run.min_accuracy,
+    max_concurrency: run.max_concurrency,
+    benchmark_user_id: run.benchmark_user_id,
   });
+
+  if (models.length === 0 && carriedSummaries.length === 0) {
+    await insertRunStmt;
+    return;
+  }
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const stmts: any[] = [insertRunStmt];
+
+  if (models.length > 0) {
+    stmts.push(orm.insert(runModels).values(models));
+  }
+
+  if (carriedSummaries.length > 0) {
+    stmts.push(
+      orm.insert(modelSummaries).values(
+        carriedSummaries.map(s => ({
+          run_id: run.id,
+          model: s.model,
+          tier: s.tier,
+          accuracy: s.accuracy,
+          avg_cost_usd: s.avgCostUsd,
+          avg_latency_ms: s.avgLatencyMs,
+          p50_latency_ms: s.p50LatencyMs,
+          cases: s.cases,
+          errors: s.errors,
+          carried: true,
+        }))
+      )
+    );
+  }
+
+  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
 }
 
-export async function getRun(db: D1Database, runId: string): Promise<RunRow | null> {
-  const row = await drizzle(db)
-    .select()
-    .from(benchmarkRuns)
-    .where(eq(benchmarkRuns.id, runId))
-    .get();
-  return row ?? null;
+export async function getRunWithModels(
+  db: D1Database,
+  runId: string
+): Promise<{ run: RunRow; models: RunModelRow[] } | null> {
+  const orm = drizzle(db);
+  const [run, models] = await Promise.all([
+    orm.select().from(benchmarkRuns).where(eq(benchmarkRuns.id, runId)).get(),
+    orm.select().from(runModels).where(eq(runModels.run_id, runId)),
+  ]);
+  if (!run) return null;
+  return { run, models };
 }
 
+// ---------------------------------------------------------------------------
+// Case results
+// ---------------------------------------------------------------------------
+
 export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Promise<void> {
   await drizzle(db)
     .insert(caseResults)
@@ -82,8 +238,13 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom
         score: row.score,
         latency_ms: row.latency_ms,
         cost_usd: row.cost_usd,
-        detail_json: row.detail_json,
         error: row.error,
+        fallback_reason: row.fallback_reason,
+        retried: row.retried,
+        exit_code: row.exit_code,
+        output_prefix: row.output_prefix,
+        event_count: row.event_count,
+        last_event_types: row.last_event_types,
       },
     });
 }
@@ -101,19 +262,27 @@ export async function getCaseResults(db: D1Database, runId: string): Promise<Cas
   return drizzle(db).select().from(caseResults).where(eq(caseResults.run_id, runId));
 }
 
+// ---------------------------------------------------------------------------
+// Model summaries
+// ---------------------------------------------------------------------------
+
 export async function replaceModelSummaries(
   db: D1Database,
   runId: string,
   summaries: BenchmarkModelSummary[]
 ): Promise<void> {
   const orm = drizzle(db);
-  const deleteExisting = orm.delete(modelSummaries).where(eq(modelSummaries.run_id, runId));
+  // Only delete non-carried rows; carried rows (from skipped models) stay.
+  const deleteStmt = orm
+    .delete(modelSummaries)
+    .where(and(eq(modelSummaries.run_id, runId), eq(modelSummaries.carried, false)));
+
   if (summaries.length === 0) {
-    await deleteExisting;
+    await deleteStmt;
     return;
   }
   await orm.batch([
-    deleteExisting,
+    deleteStmt,
     orm.insert(modelSummaries).values(
       summaries.map(s => ({
         run_id: runId,
@@ -125,6 +294,7 @@ export async function replaceModelSummaries(
         p50_latency_ms: s.p50LatencyMs,
         cases: s.cases,
         errors: s.errors,
+        carried: false,
       }))
     ),
   ]);
@@ -190,62 +360,9 @@ export async function markStaleRunsFailed(db: D1Database, olderThanIso: string):
     .where(and(eq(benchmarkRuns.status, 'running'), lt(benchmarkRuns.started_at, olderThanIso)));
 }
 
-export async function saveRoutingTable(
-  db: D1Database,
-  runId: string,
-  publishedAt: string,
-  tableJson: string
-): Promise<void> {
-  await drizzle(db)
-    .insert(routingTables)
-    .values({ run_id: runId, published_at: publishedAt, table_json: tableJson })
-    .onConflictDoUpdate({
-      target: routingTables.run_id,
-      set: { published_at: publishedAt, table_json: tableJson },
-    });
-}
-
-export async function getLatestRoutingTable(
-  db: D1Database
-): Promise<typeof routingTables.$inferSelect | null> {
-  const row = await drizzle(db)
-    .select()
-    .from(routingTables)
-    .orderBy(desc(routingTables.published_at))
-    .limit(1)
-    .get();
-  return row ?? null;
-}
-
-export async function getConfigRow(
-  db: D1Database
-): Promise<Omit<typeof benchmarkConfig.$inferSelect, 'id'> | null> {
-  const row = await drizzle(db)
-    .select({
-      config_json: benchmarkConfig.config_json,
-      updated_at: benchmarkConfig.updated_at,
-      updated_by: benchmarkConfig.updated_by,
-    })
-    .from(benchmarkConfig)
-    .where(eq(benchmarkConfig.id, 1))
-    .get();
-  return row ?? null;
-}
-
-export async function saveConfigRow(
-  db: D1Database,
-  configJson: string,
-  updatedAt: string,
-  updatedBy: string | null
-): Promise<void> {
-  await drizzle(db)
-    .insert(benchmarkConfig)
-    .values({ id: 1, config_json: configJson, updated_at: updatedAt, updated_by: updatedBy })
-    .onConflictDoUpdate({
-      target: benchmarkConfig.id,
-      set: { config_json: configJson, updated_at: updatedAt, updated_by: updatedBy },
-    });
-}
+// ---------------------------------------------------------------------------
+// Latest summaries per model (for skip logic and classifier winner)
+// ---------------------------------------------------------------------------
 
 // Latest summaries per model for a benchmark kind: for each model, all tiers
 // from the most recent COMPLETED run that included it (mixing tiers across
@@ -265,6 +382,7 @@ export async function getLatestSummariesByModel(
       p50_latency_ms: modelSummaries.p50_latency_ms,
       cases: modelSummaries.cases,
       errors: modelSummaries.errors,
+      carried: modelSummaries.carried,
     })
     .from(modelSummaries)
     .innerJoin(benchmarkRuns, eq(benchmarkRuns.id, modelSummaries.run_id))
@@ -284,3 +402,177 @@ export async function getLatestSummariesByModel(
   }
   return byModel;
 }
+
+// ---------------------------------------------------------------------------
+// Routing table — pure helpers for explode/reassemble
+// ---------------------------------------------------------------------------
+
+type RoutingTableRow = typeof routingTables.$inferSelect;
+type RoutingTableCandidateRow = typeof routingTableCandidates.$inferSelect;
+
+export function routingTableToRows(
+  table: RoutingTable,
+  publishedAt: string
+): { tableRow: RoutingTableRow; candidateRows: RoutingTableCandidateRow[] } {
+  const tableRow: RoutingTableRow = {
+    run_id: table.version,
+    published_at: publishedAt,
+    generated_at: table.generatedAt,
+    min_accuracy: table.minAccuracy,
+    source: table.source,
+  };
+
+  const candidateRows: RoutingTableCandidateRow[] = [];
+  for (const [tier, candidates] of Object.entries(table.tiers)) {
+    candidates.forEach((c, rank) => {
+      candidateRows.push({
+        run_id: table.version,
+        tier,
+        rank,
+        model: c.model,
+        accuracy: c.accuracy,
+        avg_cost_usd: c.avgCostUsd ?? null,
+        meets_threshold: c.meetsThreshold,
+        reasoning_effort: c.reasoningEffort ?? null,
+        ...apiKindsToFlags(c.supportedApiKinds),
+      });
+    });
+  }
+
+  return { tableRow, candidateRows };
+}
+
+export function rowsToRoutingTable(
+  tableRow: RoutingTableRow,
+  candidateRows: RoutingTableCandidateRow[]
+): RoutingTable {
+  const tierMap: Record<string, RankedCandidate[]> = { low: [], medium: [], high: [] };
+  const sorted = [...candidateRows].sort((a, b) => {
+    if (a.tier !== b.tier) return a.tier.localeCompare(b.tier);
+    return a.rank - b.rank;
+  });
+  for (const row of sorted) {
+    if (!(row.tier in tierMap)) tierMap[row.tier] = [];
+    tierMap[row.tier].push({
+      model: row.model,
+      accuracy: row.accuracy,
+      avgCostUsd: row.avg_cost_usd ?? 0,
+      meetsThreshold: row.meets_threshold,
+      supportedApiKinds: flagsToApiKinds(row),
+      reasoningEffort: row.reasoning_effort as RankedCandidate['reasoningEffort'],
+    });
+  }
+  return {
+    version: tableRow.run_id,
+    generatedAt: tableRow.generated_at,
+    minAccuracy: tableRow.min_accuracy,
+    source: tableRow.source as RoutingTable['source'],
+    tiers: {
+      low: tierMap.low ?? [],
+      medium: tierMap.medium ?? [],
+      high: tierMap.high ?? [],
+    },
+  };
+}
+
+export async function saveRoutingTable(
+  db: D1Database,
+  table: RoutingTable,
+  publishedAt: string
+): Promise<void> {
+  const orm = drizzle(db);
+  const { tableRow, candidateRows } = routingTableToRows(table, publishedAt);
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const stmts: any[] = [
+    orm.delete(routingTableCandidates).where(eq(routingTableCandidates.run_id, table.version)),
+    orm
+      .insert(routingTables)
+      .values(tableRow)
+      .onConflictDoUpdate({
+        target: routingTables.run_id,
+        set: {
+          published_at: tableRow.published_at,
+          generated_at: tableRow.generated_at,
+          min_accuracy: tableRow.min_accuracy,
+          source: tableRow.source,
+        },
+      }),
+  ];
+
+  if (candidateRows.length > 0) {
+    stmts.push(orm.insert(routingTableCandidates).values(candidateRows));
+  }
+
+  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
+}
+
+export async function getLatestRoutingTable(
+  db: D1Database
+): Promise<{ table: RoutingTable; publishedAt: string } | null> {
+  const orm = drizzle(db);
+  const tableRow = await orm
+    .select()
+    .from(routingTables)
+    .orderBy(desc(routingTables.published_at))
+    .limit(1)
+    .get();
+
+  if (!tableRow) return null;
+
+  const candidateRows = await orm
+    .select()
+    .from(routingTableCandidates)
+    .where(eq(routingTableCandidates.run_id, tableRow.run_id))
+    .orderBy(routingTableCandidates.tier, routingTableCandidates.rank);
+
+  const assembled = rowsToRoutingTable(tableRow, candidateRows);
+  const parsed = RoutingTableSchema.safeParse(assembled);
+  if (!parsed.success) {
+    console.warn(
+      JSON.stringify({
+        event: 'routing_table_invalid',
+        run_id: tableRow.run_id,
+        error: parsed.error.message,
+      })
+    );
+    return null;
+  }
+
+  return { table: parsed.data, publishedAt: tableRow.published_at };
+}
+
+// ---------------------------------------------------------------------------
+// Classifier winner
+// ---------------------------------------------------------------------------
+
+export async function getClassifierWinner(db: D1Database): Promise<ClassifierWinner | null> {
+  const orm = drizzle(db);
+  // Find the latest completed classifier run.
+  const runRow = await orm
+    .select()
+    .from(benchmarkRuns)
+    .where(and(eq(benchmarkRuns.kind, 'classifier'), eq(benchmarkRuns.status, 'completed')))
+    .orderBy(desc(benchmarkRuns.completed_at))
+    .limit(1)
+    .get();
+
+  if (!runRow) return null;
+
+  // Get the tier='*' summaries for this run (classifier uses '*' tier).
+  const summaryRows = await orm
+    .select()
+    .from(modelSummaries)
+    .where(and(eq(modelSummaries.run_id, runRow.id), eq(modelSummaries.tier, '*')));
+
+  const summaries = summaryRows.map(mapSummaryRow);
+  const winner = pickClassifierWinner(summaries, runRow.min_accuracy);
+  if (!winner) return null;
+
+  return {
+    model: winner.model,
+    runId: runRow.id,
+    accuracy: winner.accuracy,
+    generatedAt: runRow.completed_at ?? new Date().toISOString(),
+  };
+}
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 64f5964c52..11aec2cb3b 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -1,6 +1,7 @@
 import { describe, expect, it } from 'vitest';
 import type { CaseResultRow } from './db';
-import { chunkArray, pickClassifierWinner, runCasesWithConcurrency, summarize } from './run';
+import { chunkArray, runCasesWithConcurrency, summarize } from './run';
+import { pickClassifierWinner } from './winner';
 
 function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
   return {
@@ -11,8 +12,13 @@ function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
     score: 1,
     latency_ms: 100,
     cost_usd: 0.001,
-    detail_json: null,
     error: null,
+    fallback_reason: null,
+    retried: null,
+    exit_code: null,
+    output_prefix: null,
+    event_count: null,
+    last_event_types: null,
     ...overrides,
   };
 }
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 00ead718c1..449c7ddbde 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -1,24 +1,23 @@
 import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
 import {
-  BenchmarkConfigSchema,
-  BenchmarkModelSummarySchema,
   CLASSIFIER_WINNER_KV_KEY,
   ROUTING_TABLE_KV_KEY,
-  type BenchmarkConfig,
   type BenchmarkKind,
   type BenchmarkModelSummary,
-  type ClassifierWinner,
 } from '@kilocode/auto-routing-contracts';
 import { formatError } from '@kilocode/worker-utils';
 import * as z from 'zod';
 import { getBenchmarkConfig } from './config';
 import { CLASSIFIER_CASES } from './datasets/classifier-cases';
 import { DECIDER_CASES } from './datasets/decider-cases';
+import type { RunModelRow } from './db';
 import {
+  apiKindsToFlags,
   countCaseResults,
   getCaseResults,
   getLatestSummariesByModel,
-  getRun,
+  getRunWithModels,
+  getSummaries,
   insertRun,
   markRunCompleted,
   markStaleRunsFailed,
@@ -31,6 +30,7 @@ import { gradeClassifierOutput, runDeciderCheck } from './grading';
 import { createOpenRouterClient } from './openrouter';
 import { buildRoutingTable } from './routing-table-builder';
 import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner';
+import { pickClassifierWinner } from './winner';
 
 export type BenchmarkJobMessage = {
   runId: string;
@@ -65,15 +65,6 @@ export function chunkArray<T>(items: readonly T[], size: number): T[][] {
 
 const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
 
-// Per-run execution state stored in benchmark_runs.runtime_json: which models
-// were actually enqueued, and the latest prior summaries carried forward for
-// models skipped because they already had results.
-const RunRuntimeSchema = z.object({
-  enqueuedModels: z.array(z.string()),
-  carriedSummaries: z.array(BenchmarkModelSummarySchema),
-});
-type RunRuntime = z.infer<typeof RunRuntimeSchema>;
-
 export async function startRun(
   env: Env,
   kind: BenchmarkKind,
@@ -95,52 +86,89 @@ export async function startRun(
   const priorSummaries = options.force
     ? new Map<string, BenchmarkModelSummary[]>()
     : await getLatestSummariesByModel(env.BENCH_DB, kind);
-  const enqueuedModels = models.filter(m => !priorSummaries.has(m));
+  const enqueuedModelIds = models.filter(m => !priorSummaries.has(m));
   const skippedModels = models.filter(m => priorSummaries.has(m));
   const carriedSummaries = skippedModels.flatMap(m => priorSummaries.get(m) ?? []);
 
   // Decider runs execute through the kilo CLI under a real Kilo user's
   // identity/billing. Fail fast (before inserting the run) when that user
   // isn't configured so the admin POST surfaces the misconfiguration.
-  if (kind === 'decider' && enqueuedModels.length > 0 && !config.benchmarkUserId) {
+  if (kind === 'decider' && enqueuedModelIds.length > 0 && !config.benchmarkUserId) {
     throw new Error(
       'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
     );
   }
 
   const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
-  const runtime: RunRuntime = { enqueuedModels, carriedSummaries };
-  await insertRun(env.BENCH_DB, {
-    id: runId,
-    kind,
-    startedAt: new Date().toISOString(),
-    configJson: JSON.stringify(config),
-    runtimeJson: JSON.stringify(runtime),
+
+  // Build run_models rows for ALL models of this run's kind.
+  const runModelRows: RunModelRow[] = models.map(modelId => {
+    if (kind === 'classifier') {
+      return {
+        run_id: runId,
+        model: modelId,
+        enqueued: enqueuedModelIds.includes(modelId),
+        reasoning_effort: null,
+        supports_chat_completions: false,
+        supports_messages: false,
+        supports_responses: false,
+      };
+    }
+    const deciderModel = config.deciderModels.find(m => m.id === modelId);
+    return {
+      run_id: runId,
+      model: modelId,
+      enqueued: enqueuedModelIds.includes(modelId),
+      reasoning_effort: deciderModel?.reasoningEffort ?? null,
+      ...apiKindsToFlags(deciderModel?.supportedApiKinds ?? ['chat_completions']),
+    };
   });
 
+  await insertRun(
+    env.BENCH_DB,
+    {
+      id: runId,
+      kind,
+      startedAt: new Date().toISOString(),
+      min_accuracy: config.minAccuracy,
+      max_concurrency: config.maxConcurrency,
+      benchmark_user_id: config.benchmarkUserId,
+    },
+    runModelRows,
+    carriedSummaries
+  );
+
   console.log(
-    JSON.stringify({ event: 'benchmark_run_started', runId, kind, enqueuedModels, skippedModels })
+    JSON.stringify({
+      event: 'benchmark_run_started',
+      runId,
+      kind,
+      enqueuedModels: enqueuedModelIds,
+      skippedModels,
+    })
   );
 
-  if (enqueuedModels.length === 0) {
+  if (enqueuedModelIds.length === 0) {
     // Everything already has results: complete immediately and republish the
     // aggregate so config-only changes (model removed, threshold tweaked)
     // take effect without re-running any model.
-    await finalizeRunIfComplete(env, runId, kind, { config, runtime });
+    await finalizeRunIfComplete(env, runId, kind);
     return { runId, enqueuedModels: 0, skippedModels };
   }
 
   if (kind === 'classifier') {
     await env.BENCH_QUEUE.sendBatch(
-      enqueuedModels.map(model => ({ body: { runId, kind, model } satisfies BenchmarkJobMessage }))
+      enqueuedModelIds.map(model => ({
+        body: { runId, kind, model } satisfies BenchmarkJobMessage,
+      }))
     );
-    return { runId, enqueuedModels: enqueuedModels.length, skippedModels };
+    return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
   }
 
   // Decider: one message per (model, chunk) so each queue invocation stays
   // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES rows.
   const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
-  const messages = enqueuedModels.flatMap(model =>
+  const messages = enqueuedModelIds.flatMap(model =>
     chunks.map((chunkCases, chunk) => ({
       body: {
         runId,
@@ -152,7 +180,7 @@ export async function startRun(
     }))
   );
   await env.BENCH_QUEUE.sendBatch(messages);
-  return { runId, enqueuedModels: enqueuedModels.length, skippedModels };
+  return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
 }
 
 export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
@@ -172,12 +200,11 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
 
   const message = parsed.data;
   const state = await getRunState(env, message.runId);
-  const { config } = state;
 
   if (message.kind === 'classifier') {
     // Create the OpenRouter client inside processJob — no module-scope transport clients.
     const client = await createOpenRouterClient(env);
-    await runCasesWithConcurrency(CLASSIFIER_CASES, config.maxConcurrency, async benchCase => {
+    await runCasesWithConcurrency(CLASSIFIER_CASES, state.maxConcurrency, async benchCase => {
       const startedAt = performance.now();
       try {
         const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
@@ -192,12 +219,13 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
           score,
           latency_ms: Math.round(performance.now() - startedAt),
           cost_usd: result.cost,
-          detail_json: JSON.stringify({
-            classification: result.fallback ? null : result.classification,
-            fallback: result.fallback?.reason ?? null,
-            retried: result.retried ?? false,
-          }),
           error: null,
+          fallback_reason: result.fallback?.reason ?? null,
+          retried: result.retried ?? false,
+          exit_code: null,
+          output_prefix: null,
+          event_count: null,
+          last_event_types: null,
         });
       } catch (error) {
         await upsertCaseResult(
@@ -207,16 +235,36 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
       }
     });
   } else {
-    await processDeciderJob(env, message, config);
+    await processDeciderJob(env, message, state);
   }
 
-  await finalizeRunIfComplete(env, message.runId, message.kind, state);
+  await finalizeRunIfComplete(env, message.runId, message.kind);
+}
+
+type RunState = {
+  maxConcurrency: number;
+  minAccuracy: number;
+  benchmarkUserId: string | null;
+  models: RunModelRow[];
+};
+
+async function getRunState(env: Env, runId: string): Promise<RunState> {
+  // Snapshots taken at startRun time so a mid-run admin edit can't skew them.
+  const result = await getRunWithModels(env.BENCH_DB, runId);
+  if (!result) throw new Error(`unknown run ${runId}`);
+  const { run, models } = result;
+  return {
+    maxConcurrency: run.max_concurrency,
+    minAccuracy: run.min_accuracy,
+    benchmarkUserId: run.benchmark_user_id,
+    models,
+  };
 }
 
 async function processDeciderJob(
   env: Env,
   message: BenchmarkJobMessage,
-  config: BenchmarkConfig
+  state: RunState
 ): Promise<void> {
   // Decider messages always carry their chunk's case ids; anything else is
   // malformed and dropped (same policy as unparseable messages).
@@ -227,7 +275,7 @@ async function processDeciderJob(
   const caseIds = new Set(message.caseIds);
   const cases = DECIDER_CASES.filter(c => caseIds.has(c.id));
 
-  if (!config.benchmarkUserId) {
+  if (!state.benchmarkUserId) {
     // startRun fails fast before enqueueing, so this only happens if the run
     // snapshot was tampered with; throwing lets the queue retry/dead-letter.
     throw new Error(`run ${message.runId} has no benchmarkUserId`);
@@ -235,10 +283,12 @@ async function processDeciderJob(
 
   // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
   // queue retries the message. The token is never logged.
-  const kiloToken = await fetchBenchmarkUserToken(env, config.benchmarkUserId);
+  const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
   const instanceName = `${message.runId}:${message.model}:${message.chunk ?? 0}`;
-  const reasoningEffort =
-    config.deciderModels.find(m => m.id === message.model)?.reasoningEffort ?? null;
+
+  // Reasoning effort comes from the run snapshot (run_models row), not live config.
+  const modelRow = state.models.find(m => m.model === message.model);
+  const reasoningEffort = modelRow?.reasoning_effort ?? null;
 
   // Fresh container instances run the CLI's one-time sqlite migration; the
   // container owns that via its /warmup endpoint so the first real case
@@ -290,14 +340,13 @@ async function processDeciderJob(
         score: succeeded ? 1 : 0,
         latency_ms: result.latencyMs,
         cost_usd: result.costUsd,
-        detail_json: JSON.stringify({
-          exitCode: result.exitCode,
-          outputPrefix: result.text.slice(0, 200),
-          eventCount: result.eventCount,
-          lastEventTypes: result.lastEventTypes,
-          retried,
-        }),
         error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
+        fallback_reason: null,
+        retried,
+        exit_code: result.exitCode,
+        output_prefix: result.text.slice(0, 200),
+        event_count: result.eventCount,
+        last_event_types: result.lastEventTypes.join(' '),
       });
     } catch (error) {
       await upsertCaseResult(
@@ -329,11 +378,11 @@ export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise
     const detail = (await response.text().catch(() => '')).slice(0, 200);
     throw new Error(`token mint failed: HTTP ${response.status} ${detail}`);
   }
-  const parsed = TokenResponseSchema.safeParse(await response.json());
-  if (!parsed.success) {
+  const parsedToken = TokenResponseSchema.safeParse(await response.json());
+  if (!parsedToken.success) {
     throw new Error('token mint returned unexpected response shape');
   }
-  return parsed.data.token;
+  return parsedToken.data.token;
 }
 
 function failedRow(
@@ -351,29 +400,16 @@ function failedRow(
     score: 0,
     latency_ms: Math.round(performance.now() - startedAt),
     cost_usd: null,
-    detail_json: null,
     error: JSON.stringify(formatError(error)).slice(0, 500),
+    fallback_reason: null,
+    retried: null,
+    exit_code: null,
+    output_prefix: null,
+    event_count: null,
+    last_event_types: null,
   };
 }
 
-type RunState = { config: BenchmarkConfig; runtime: RunRuntime };
-
-async function getRunState(env: Env, runId: string): Promise<RunState> {
-  // Snapshots taken at startRun time so a mid-run admin edit can't skew them.
-  const run = await getRun(env.BENCH_DB, runId);
-  if (!run) throw new Error(`unknown run ${runId}`);
-  const config = BenchmarkConfigSchema.parse(JSON.parse(run.config_json));
-  const runtime = run.runtime_json
-    ? RunRuntimeSchema.parse(JSON.parse(run.runtime_json))
-    : // Rows written before runtime_json existed ran every configured model.
-      {
-        enqueuedModels:
-          run.kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id),
-        carriedSummaries: [],
-      };
-  return { config, runtime };
-}
-
 export async function runCasesWithConcurrency<T>(
   cases: readonly T[],
   concurrency: number,
@@ -388,54 +424,49 @@ export async function runCasesWithConcurrency<T>(
   await Promise.all(workers);
 }
 
-async function finalizeRunIfComplete(
-  env: Env,
-  runId: string,
-  kind: BenchmarkKind,
-  state: RunState
-): Promise<void> {
-  const { config, runtime } = state;
+async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKind): Promise<void> {
+  const state = await getRunState(env, runId);
+  const enqueuedModels = state.models.filter(m => m.enqueued);
   const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
-  const expected = runtime.enqueuedModels.length * caseCount;
+  const expected = enqueuedModels.length * caseCount;
   const actual = await countCaseResults(env.BENCH_DB, runId);
 
   if (actual < expected) return;
 
   // Two consumers may both see completion and both aggregate — harmless:
   // identical deterministic inputs → identical summaries; replaceModelSummaries
-  // is a batched delete+insert; markRunCompleted guards on status='running';
-  // KV put is idempotent.
+  // is a batched delete+insert; markRunCompleted guards on status='running'.
   const rows = await getCaseResults(env.BENCH_DB, runId);
-  // Fresh results plus the carried-forward summaries of skipped models.
-  const summaries = [...summarize(rows, kind), ...runtime.carriedSummaries];
-  await replaceModelSummaries(env.BENCH_DB, runId, summaries);
+  // Fresh results (enqueued models). Carried summaries (skipped models) stay in
+  // model_summaries with carried=true and are included via getSummaries below.
+  const freshSummaries = summarize(rows, kind);
+  await replaceModelSummaries(env.BENCH_DB, runId, freshSummaries);
   await markRunCompleted(env.BENCH_DB, runId);
 
+  // Read back all summaries (fresh + carried) for publishing.
+  const allSummaries = await getSummaries(env.BENCH_DB, runId);
+
   if (kind === 'classifier') {
-    const winner = pickClassifierWinner(summaries, config.minAccuracy);
+    const winner = pickClassifierWinner(allSummaries, state.minAccuracy);
     if (winner) {
-      const payload: ClassifierWinner = {
-        model: winner.model,
-        runId,
-        accuracy: winner.accuracy,
-        generatedAt: new Date().toISOString(),
-      };
-      await env.AUTO_ROUTING_CONFIG.put(CLASSIFIER_WINNER_KV_KEY, JSON.stringify(payload));
       console.log(
         JSON.stringify({ event: 'classifier_winner_published', runId, model: winner.model })
       );
     } else {
       console.warn(JSON.stringify({ event: 'classifier_winner_skipped', runId }));
     }
+    // Clear KV so the auto-routing worker repopulates from D1 on next request.
+    await env.AUTO_ROUTING_CONFIG.delete(CLASSIFIER_WINNER_KV_KEY);
   }
 
   if (kind === 'decider') {
     const generatedAt = new Date().toISOString();
     try {
-      const table = buildRoutingTable({ runId, generatedAt, config, summaries });
-      const tableJson = JSON.stringify(table);
-      await saveRoutingTable(env.BENCH_DB, runId, generatedAt, tableJson);
-      await env.AUTO_ROUTING_CONFIG.put(ROUTING_TABLE_KV_KEY, tableJson);
+      const config = await getBenchmarkConfig(env.BENCH_DB);
+      const table = buildRoutingTable({ runId, generatedAt, config, summaries: allSummaries });
+      await saveRoutingTable(env.BENCH_DB, table, generatedAt);
+      // Clear KV so the auto-routing worker repopulates from D1 on next request.
+      await env.AUTO_ROUTING_CONFIG.delete(ROUTING_TABLE_KV_KEY);
       console.log(
         JSON.stringify({ event: 'routing_table_published', runId, version: table.version })
       );
@@ -455,28 +486,11 @@ async function finalizeRunIfComplete(
       event: 'benchmark_run_completed',
       runId,
       kind,
-      summaries,
+      summaries: allSummaries,
     })
   );
 }
 
-// Same bang-for-buck rule as the routing table, applied to classifier
-// summaries (tier '*'): cheapest candidate meeting the accuracy threshold,
-// else the most accurate one. Null when there are no graded summaries.
-export function pickClassifierWinner(
-  summaries: BenchmarkModelSummary[],
-  minAccuracy: number
-): BenchmarkModelSummary | null {
-  const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
-  if (graded.length === 0) return null;
-  const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
-  const meeting = graded.filter(s => s.accuracy >= minAccuracy);
-  if (meeting.length > 0) {
-    return meeting.toSorted((a, b) => cost(a) - cost(b) || b.accuracy - a.accuracy)[0];
-  }
-  return graded.toSorted((a, b) => b.accuracy - a.accuracy || cost(a) - cost(b))[0];
-}
-
 export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
   // Group by "model tier-key" using a plain reduce so this works in all runtimes.
   // Classifier rows use '*' as the tier (no tiering); decider rows use the actual tier
diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts
new file mode 100644
index 0000000000..1cb0abd7e4
--- /dev/null
+++ b/services/auto-routing-benchmark/src/winner.ts
@@ -0,0 +1,18 @@
+import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+
+// Same bang-for-buck rule as the routing table, applied to classifier
+// summaries (tier '*'): cheapest candidate meeting the accuracy threshold,
+// else the most accurate one. Null when there are no graded summaries.
+export function pickClassifierWinner(
+  summaries: BenchmarkModelSummary[],
+  minAccuracy: number
+): BenchmarkModelSummary | null {
+  const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
+  if (graded.length === 0) return null;
+  const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
+  const meeting = graded.filter(s => s.accuracy >= minAccuracy);
+  if (meeting.length > 0) {
+    return meeting.toSorted((a, b) => cost(a) - cost(b) || b.accuracy - a.accuracy)[0];
+  }
+  return graded.toSorted((a, b) => b.accuracy - a.accuracy || cost(a) - cost(b))[0];
+}

From 86e2fdc0042ddaaa813f77609454baf1b9a20674 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 12:56:45 +0200
Subject: [PATCH 34/73] fix(auto-routing-benchmark): preserve null candidate
 cost and type drizzle batches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace `avg_cost_usd ?? 0` with a transparent pass-through cast so a stored
NULL is not silently promoted to 0 (cheapest) in the ranking; the downstream
RoutingTableSchema.safeParse in getLatestRoutingTable will reject a corrupted
table rather than serve it with wrong costs. Add a round-trip test confirming
null is preserved through routingTableToRows → rowsToRoutingTable.

Replace the three `any[]` + `as unknown as Parameters<typeof orm.batch>[0]`
patterns in replaceConfig, insertRun, and saveRoutingTable with the typed
`BatchItem<'sqlite'>` tuple form from drizzle-orm/batch, removing the
eslint-disable suppressions.
---
 .../auto-routing-benchmark/src/db.test.ts     | 30 +++++++++++++++++++
 services/auto-routing-benchmark/src/db.ts     | 23 +++++++-------
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 17a5d10133..549c4f2e2b 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -270,4 +270,34 @@ describe('rowsToRoutingTable', () => {
     expect(reassembled.tiers.low[0].model).toBe('model-a');
     expect(reassembled.tiers.low[1].model).toBe('model-b');
   });
+
+  it('preserves null avgCostUsd through routingTableToRows → rowsToRoutingTable', () => {
+    const nullCostCandidate: RankedCandidate = {
+      model: 'model-nullcost',
+      accuracy: 0.88,
+      avgCostUsd: null as unknown as number,
+      meetsThreshold: true,
+      supportedApiKinds: ['responses'],
+      reasoningEffort: null,
+    };
+    const tableWithNullCost: RoutingTable = {
+      version: 'run-null-cost',
+      generatedAt: '2026-06-01T10:00:00.000Z',
+      minAccuracy: 0.7,
+      source: 'benchmark',
+      tiers: {
+        low: [nullCostCandidate],
+        medium: [candidate('model-c')],
+        high: [candidate('model-a')],
+      },
+    };
+    const { tableRow, candidateRows } = routingTableToRows(
+      tableWithNullCost,
+      '2026-06-01T11:00:00.000Z'
+    );
+    const lowRow = candidateRows.find(r => r.tier === 'low');
+    expect(lowRow?.avg_cost_usd).toBeNull();
+    const reassembled = rowsToRoutingTable(tableRow, candidateRows);
+    expect(reassembled.tiers.low[0].avgCostUsd).toBeNull();
+  });
 });
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 3096bd0fc8..f1df09f16e 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -6,6 +6,7 @@ import type {
   RankedCandidate,
   RoutingTable,
 } from '@kilocode/auto-routing-contracts';
+import type { BatchItem } from 'drizzle-orm/batch';
 import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
 import { and, count, desc, eq, inArray, lt } from 'drizzle-orm';
 import { drizzle } from 'drizzle-orm/d1';
@@ -123,10 +124,7 @@ export async function replaceConfig(
   deciderModels: ConfigDeciderModelRow[]
 ): Promise<void> {
   const orm = drizzle(db);
-  // Build the batch as a plain array; the cast is required because drizzle's
-  // batch type is a readonly non-empty tuple, but we populate it conditionally.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const stmts: any[] = [
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [
     orm
       .insert(benchmarkConfig)
       .values({ id: 1, ...config })
@@ -145,7 +143,7 @@ export async function replaceConfig(
   if (deciderModels.length > 0) {
     stmts.push(orm.insert(configDeciderModels).values(deciderModels));
   }
-  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
+  await orm.batch(stmts);
 }
 
 // ---------------------------------------------------------------------------
@@ -181,8 +179,7 @@ export async function insertRun(
     return;
   }
 
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const stmts: any[] = [insertRunStmt];
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [insertRunStmt];
 
   if (models.length > 0) {
     stmts.push(orm.insert(runModels).values(models));
@@ -207,7 +204,7 @@ export async function insertRun(
     );
   }
 
-  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
+  await orm.batch(stmts);
 }
 
 export async function getRunWithModels(
@@ -456,7 +453,10 @@ export function rowsToRoutingTable(
     tierMap[row.tier].push({
       model: row.model,
       accuracy: row.accuracy,
-      avgCostUsd: row.avg_cost_usd ?? 0,
+      // Pass through the stored value; a NULL here means corrupted data.
+      // getLatestRoutingTable runs RoutingTableSchema.safeParse which will
+      // reject the table rather than serving a null cost as 0 (cheapest).
+      avgCostUsd: row.avg_cost_usd as number,
       meetsThreshold: row.meets_threshold,
       supportedApiKinds: flagsToApiKinds(row),
       reasoningEffort: row.reasoning_effort as RankedCandidate['reasoningEffort'],
@@ -483,8 +483,7 @@ export async function saveRoutingTable(
   const orm = drizzle(db);
   const { tableRow, candidateRows } = routingTableToRows(table, publishedAt);
 
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const stmts: any[] = [
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [
     orm.delete(routingTableCandidates).where(eq(routingTableCandidates.run_id, table.version)),
     orm
       .insert(routingTables)
@@ -504,7 +503,7 @@ export async function saveRoutingTable(
     stmts.push(orm.insert(routingTableCandidates).values(candidateRows));
   }
 
-  await orm.batch(stmts as unknown as Parameters<typeof orm.batch>[0]);
+  await orm.batch(stmts);
 }
 
 export async function getLatestRoutingTable(

From 0241d477cc4d129794127beca7f268546849c906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 12:58:31 +0200
Subject: [PATCH 35/73] refactor(auto-routing-benchmark): make candidate cost
 non-null to match the contract

---
 ...ptain_flint.sql => 0000_complete_lyja.sql} |  2 +-
 .../migrations/meta/0000_snapshot.json        |  4 +--
 .../migrations/meta/_journal.json             |  4 +--
 .../auto-routing-benchmark/src/db-schema.ts   |  4 ++-
 .../auto-routing-benchmark/src/db.test.ts     | 30 -------------------
 services/auto-routing-benchmark/src/db.ts     |  7 ++---
 6 files changed, 10 insertions(+), 41 deletions(-)
 rename services/auto-routing-benchmark/migrations/{0000_supreme_captain_flint.sql => 0000_complete_lyja.sql} (98%)

diff --git a/services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql b/services/auto-routing-benchmark/migrations/0000_complete_lyja.sql
similarity index 98%
rename from services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql
rename to services/auto-routing-benchmark/migrations/0000_complete_lyja.sql
index 7760e97e5f..5dbc7e2bd0 100644
--- a/services/auto-routing-benchmark/migrations/0000_supreme_captain_flint.sql
+++ b/services/auto-routing-benchmark/migrations/0000_complete_lyja.sql
@@ -70,7 +70,7 @@ CREATE TABLE `routing_table_candidates` (
 	`rank` integer NOT NULL,
 	`model` text NOT NULL,
 	`accuracy` real NOT NULL,
-	`avg_cost_usd` real,
+	`avg_cost_usd` real NOT NULL,
 	`meets_threshold` integer NOT NULL,
 	`reasoning_effort` text,
 	`supports_chat_completions` integer NOT NULL,
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index c297d0ce00..e0bfebb9ea 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "d78fc8c4-926b-42c0-b876-6877d22e28fe",
+  "id": "77565fdd-e92c-4de7-a4ee-0f832a620652",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -448,7 +448,7 @@
           "name": "avg_cost_usd",
           "type": "real",
           "primaryKey": false,
-          "notNull": false,
+          "notNull": true,
           "autoincrement": false
         },
         "meets_threshold": {
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index cb34f2ea08..826916b528 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,8 +5,8 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781260685397,
-      "tag": "0000_supreme_captain_flint",
+      "when": 1781261894022,
+      "tag": "0000_complete_lyja",
       "breakpoints": true
     }
   ]
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 0436c28ff6..9fac9afb21 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -113,7 +113,9 @@ export const routingTableCandidates = sqliteTable(
     rank: integer('rank').notNull(),
     model: text('model').notNull(),
     accuracy: real('accuracy').notNull(),
-    avg_cost_usd: real('avg_cost_usd'),
+    // Non-null unlike model_summaries: RankedCandidate.avgCostUsd is a plain
+    // nonnegative number (buildRoutingTable maps unknown costs to 0).
+    avg_cost_usd: real('avg_cost_usd').notNull(),
     meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
     reasoning_effort: text('reasoning_effort'),
     supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 549c4f2e2b..17a5d10133 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -270,34 +270,4 @@ describe('rowsToRoutingTable', () => {
     expect(reassembled.tiers.low[0].model).toBe('model-a');
     expect(reassembled.tiers.low[1].model).toBe('model-b');
   });
-
-  it('preserves null avgCostUsd through routingTableToRows → rowsToRoutingTable', () => {
-    const nullCostCandidate: RankedCandidate = {
-      model: 'model-nullcost',
-      accuracy: 0.88,
-      avgCostUsd: null as unknown as number,
-      meetsThreshold: true,
-      supportedApiKinds: ['responses'],
-      reasoningEffort: null,
-    };
-    const tableWithNullCost: RoutingTable = {
-      version: 'run-null-cost',
-      generatedAt: '2026-06-01T10:00:00.000Z',
-      minAccuracy: 0.7,
-      source: 'benchmark',
-      tiers: {
-        low: [nullCostCandidate],
-        medium: [candidate('model-c')],
-        high: [candidate('model-a')],
-      },
-    };
-    const { tableRow, candidateRows } = routingTableToRows(
-      tableWithNullCost,
-      '2026-06-01T11:00:00.000Z'
-    );
-    const lowRow = candidateRows.find(r => r.tier === 'low');
-    expect(lowRow?.avg_cost_usd).toBeNull();
-    const reassembled = rowsToRoutingTable(tableRow, candidateRows);
-    expect(reassembled.tiers.low[0].avgCostUsd).toBeNull();
-  });
 });
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index f1df09f16e..bc83f23741 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -428,7 +428,7 @@ export function routingTableToRows(
         rank,
         model: c.model,
         accuracy: c.accuracy,
-        avg_cost_usd: c.avgCostUsd ?? null,
+        avg_cost_usd: c.avgCostUsd,
         meets_threshold: c.meetsThreshold,
         reasoning_effort: c.reasoningEffort ?? null,
         ...apiKindsToFlags(c.supportedApiKinds),
@@ -453,10 +453,7 @@ export function rowsToRoutingTable(
     tierMap[row.tier].push({
       model: row.model,
       accuracy: row.accuracy,
-      // Pass through the stored value; a NULL here means corrupted data.
-      // getLatestRoutingTable runs RoutingTableSchema.safeParse which will
-      // reject the table rather than serving a null cost as 0 (cheapest).
-      avgCostUsd: row.avg_cost_usd as number,
+      avgCostUsd: row.avg_cost_usd,
       meetsThreshold: row.meets_threshold,
       supportedApiKinds: flagsToApiKinds(row),
       reasoningEffort: row.reasoning_effort as RankedCandidate['reasoningEffort'],

From 82446762d5fb56f146ddc680b3f3fee902b0d60b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 13:05:26 +0200
Subject: [PATCH 36/73] feat(auto-routing): read-through KV cache backed by the
 benchmark service

On a KV miss (or corrupt value), fetch routing-table and classifier-winner
from the benchmark worker via a service binding, write the result back with
a 1h TTL, and return it. Corrupt cached values are treated as misses. The
existing 60s isolate-level ttlCached wrappers and fail-closed defaults are
unchanged.
---
 services/auto-routing/src/benchmark-origin.ts |  46 +++++
 .../src/classifier-config.test.ts             | 170 ++++++++++++++----
 .../auto-routing/src/classifier-config.ts     |  30 +++-
 services/auto-routing/src/index.test.ts       |  15 ++
 .../auto-routing/src/kv-read-through.test.ts  | 111 ++++++++++++
 services/auto-routing/src/kv-read-through.ts  |  35 ++++
 .../auto-routing/src/routing-table.test.ts    | 110 ++++++++++--
 services/auto-routing/src/routing-table.ts    |  52 ++++--
 .../auto-routing/worker-configuration.d.ts    |   3 +-
 services/auto-routing/wrangler.jsonc          |   2 +
 10 files changed, 502 insertions(+), 72 deletions(-)
 create mode 100644 services/auto-routing/src/benchmark-origin.ts
 create mode 100644 services/auto-routing/src/kv-read-through.test.ts
 create mode 100644 services/auto-routing/src/kv-read-through.ts

diff --git a/services/auto-routing/src/benchmark-origin.ts b/services/auto-routing/src/benchmark-origin.ts
new file mode 100644
index 0000000000..a2e8ad5c39
--- /dev/null
+++ b/services/auto-routing/src/benchmark-origin.ts
@@ -0,0 +1,46 @@
+import {
+  BenchmarkRoutingTableResponseSchema,
+  ClassifierWinnerResponseSchema,
+  type ClassifierWinner,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+type BenchmarkEnv = Pick<Env, 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
+
+// Issues an authenticated GET for `path` against the benchmark worker through
+// its service binding and resolves to the decoded JSON body; non-2xx throws.
+async function fetchBenchmark(env: BenchmarkEnv, path: string): Promise<unknown> {
+  const url = `https://auto-routing-benchmark${path}`;
+  const bearer = await env.INTERNAL_API_SECRET_PROD.get();
+  const response = await env.BENCHMARK_SERVICE.fetch(url, {
+    headers: { authorization: `Bearer ${bearer}` },
+  });
+  if (response.ok) return response.json();
+  throw new Error(`benchmark origin ${path} responded ${response.status}`);
+}
+
+// Retrieves the routing table from the benchmark worker. Resolves to the
+// `table` field of the validated response; throws if validation fails.
+export async function fetchRoutingTableFromOrigin(
+  env: BenchmarkEnv
+): Promise<RoutingTable | null> {
+  const payload = await fetchBenchmark(env, '/admin/routing-table');
+  const result = BenchmarkRoutingTableResponseSchema.safeParse(payload);
+  if (result.success) return result.data.table;
+
+  const reason = result.error.issues[0]?.message ?? 'unknown';
+  throw new Error(`benchmark routing-table response invalid: ${reason}`);
+}
+
+// Retrieves the classifier winner from the benchmark worker. Resolves to the
+// `winner` field of the validated response; throws if validation fails.
+export async function fetchClassifierWinnerFromOrigin(
+  env: BenchmarkEnv
+): Promise<ClassifierWinner | null> {
+  const payload = await fetchBenchmark(env, '/admin/classifier-winner');
+  const result = ClassifierWinnerResponseSchema.safeParse(payload);
+  if (result.success) return result.data.winner;
+
+  const reason = result.error.issues[0]?.message ?? 'unknown';
+  throw new Error(`benchmark classifier-winner response invalid: ${reason}`);
+}
diff --git a/services/auto-routing/src/classifier-config.test.ts b/services/auto-routing/src/classifier-config.test.ts
index 08c83c4b59..3c9b660606 100644
--- a/services/auto-routing/src/classifier-config.test.ts
+++ b/services/auto-routing/src/classifier-config.test.ts
@@ -4,14 +4,74 @@ import {
   CLASSIFIER_MODEL_CONFIG_KEY,
   clearClassifierConfigCache,
   getClassifierModel,
+  getClassifierModelInfo,
 } from './classifier-config';
+import { CLASSIFIER_WINNER_KV_KEY } from '@kilocode/auto-routing-contracts';
 
-function createKv(value: string | null) {
-  const get = vi.fn(async () => value);
-  return {
-    kv: { get } as unknown as KVNamespace,
-    get,
-  };
+type ClassifierEnvStub = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
+
+const EXAMPLE_WINNER = {
+  model: 'google/gemini-2.5-flash-lite',
+  runId: 'run-abc',
+  accuracy: 0.95,
+  generatedAt: '2026-06-11T00:00:00.000Z',
+};
+
+type EnvSetup = {
+  env: ClassifierEnvStub;
+  configGet: ReturnType<typeof vi.fn>;
+  configPut: ReturnType<typeof vi.fn>;
+  benchmarkFetch: ReturnType<typeof vi.fn>;
+};
+
+function makeEnv(opts: {
+  overrideModel?: string | null;
+  winnerKvValue?: string | null;
+  originWinner?: typeof EXAMPLE_WINNER | null;
+  originStatus?: number;
+  originThrow?: boolean;
+  onPut?: (key: string, value: string, options: unknown) => void;
+}): EnvSetup {
+  const configGet = vi.fn(async (key: string) => {
+    if (key === CLASSIFIER_MODEL_CONFIG_KEY) {
+      return opts.overrideModel === undefined ? null : opts.overrideModel;
+    }
+    if (key === CLASSIFIER_WINNER_KV_KEY) {
+      return opts.winnerKvValue === undefined ? null : opts.winnerKvValue;
+    }
+    return null;
+  });
+  const configPut = vi.fn(async (key: string, value: string, options: unknown) => {
+    opts.onPut?.(key, value, options);
+  });
+  const benchmarkFetch = vi.fn(async () => {
+    if (opts.originThrow) throw new Error('benchmark unavailable');
+    return {
+      ok: opts.originStatus === undefined ? true : opts.originStatus < 400,
+      status: opts.originStatus ?? 200,
+      json: async () => ({
+        winner: opts.originWinner !== undefined ? opts.originWinner : null,
+      }),
+    };
+  });
+
+  const env: ClassifierEnvStub = {
+    AUTO_ROUTING_CONFIG: {
+      get: configGet,
+      put: configPut,
+    },
+    BENCHMARK_SERVICE: {
+      fetch: benchmarkFetch,
+    },
+    INTERNAL_API_SECRET_PROD: {
+      get: vi.fn(async () => 'test-secret'),
+    },
+  } as unknown as ClassifierEnvStub;
+
+  return { env, configGet, configPut, benchmarkFetch };
 }
 
 describe('classifier config', () => {
@@ -20,42 +80,92 @@ describe('classifier config', () => {
   });
 
   it('falls back to the default classifier model when KV has no value', async () => {
-    const { get, kv } = createKv(null);
+    const { env, configGet } = makeEnv({});
 
-    await expect(getClassifierModel({ AUTO_ROUTING_CONFIG: kv })).resolves.toBe(
-      DEFAULT_CLASSIFIER_MODEL
-    );
-    expect(get).toHaveBeenCalledWith(CLASSIFIER_MODEL_CONFIG_KEY);
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    expect(configGet).toHaveBeenCalledWith(CLASSIFIER_MODEL_CONFIG_KEY);
   });
 
-  it('uses the trimmed classifier model from KV', async () => {
-    await expect(
-      getClassifierModel({
-        AUTO_ROUTING_CONFIG: createKv('  google/gemini-2.5-flash-lite  ').kv,
-      })
-    ).resolves.toBe('google/gemini-2.5-flash-lite');
+  it('uses the trimmed classifier model from KV override', async () => {
+    const { env } = makeEnv({ overrideModel: '  google/gemini-2.5-flash-lite  ' });
+    await expect(getClassifierModel(env)).resolves.toBe('google/gemini-2.5-flash-lite');
   });
 
   it('falls back to the default classifier model when KV has a blank value', async () => {
-    await expect(
-      getClassifierModel({
-        AUTO_ROUTING_CONFIG: createKv('   ').kv,
-      })
-    ).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    const { env } = makeEnv({ overrideModel: '   ' });
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
   });
 
   it('fails closed to the default classifier model when the KV read rejects', async () => {
     const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
-    const kv = {
-      get: vi.fn(async () => {
-        throw new Error('KV unavailable');
-      }),
-    } as unknown as KVNamespace;
+    const configGet = vi.fn(async () => {
+      throw new Error('KV unavailable');
+    });
+    const env: ClassifierEnvStub = {
+      AUTO_ROUTING_CONFIG: {
+        get: configGet,
+        put: vi.fn(async () => {}),
+      } as unknown as KVNamespace,
+      BENCHMARK_SERVICE: { fetch: vi.fn() } as unknown as Fetcher,
+      INTERNAL_API_SECRET_PROD: {
+        get: vi.fn(async () => 'secret'),
+      } as unknown as SecretsStoreSecret,
+    };
 
-    await expect(getClassifierModel({ AUTO_ROUTING_CONFIG: kv })).resolves.toBe(
-      DEFAULT_CLASSIFIER_MODEL
-    );
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
     expect(warn).toHaveBeenCalled();
     warn.mockRestore();
   });
+
+  it('serves the benchmark winner from KV without calling origin', async () => {
+    const { env, benchmarkFetch } = makeEnv({ winnerKvValue: JSON.stringify(EXAMPLE_WINNER) });
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBe(EXAMPLE_WINNER.model);
+    expect(info.model).toBe(EXAMPLE_WINNER.model);
+    expect(benchmarkFetch).not.toHaveBeenCalled();
+  });
+
+  it('fetches from origin on winner KV miss, writes to KV with expirationTtl, and returns winner', async () => {
+    const puts: Array<{ key: string; value: string; options: unknown }> = [];
+    const { env } = makeEnv({
+      winnerKvValue: null,
+      originWinner: EXAMPLE_WINNER,
+      onPut: (key, value, options) => puts.push({ key, value, options }),
+    });
+
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBe(EXAMPLE_WINNER.model);
+    expect(
+      puts.some(
+        p =>
+          p.key === CLASSIFIER_WINNER_KV_KEY &&
+          (p.options as { expirationTtl: number }).expirationTtl === 3600
+      )
+    ).toBe(true);
+  });
+
+  it('falls back to default model when origin returns null winner', async () => {
+    const { env } = makeEnv({ winnerKvValue: null, originWinner: null });
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBeNull();
+    expect(info.model).toBe(DEFAULT_CLASSIFIER_MODEL);
+  });
+
+  it('falls back to default model when origin fails for the winner', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { env } = makeEnv({ winnerKvValue: null, originThrow: true });
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    warn.mockRestore();
+  });
+
+  it('override takes precedence over benchmark winner from origin', async () => {
+    const { env } = makeEnv({
+      overrideModel: 'openai/gpt-4o',
+      winnerKvValue: null,
+      originWinner: EXAMPLE_WINNER,
+    });
+    const info = await getClassifierModelInfo(env);
+    expect(info.override).toBe('openai/gpt-4o');
+    expect(info.model).toBe('openai/gpt-4o');
+  });
 });
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index 6801a6d6a5..e4840a3e5e 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -1,7 +1,13 @@
 import { formatError } from '@kilocode/worker-utils';
-import { CLASSIFIER_WINNER_KV_KEY, ClassifierWinnerSchema } from '@kilocode/auto-routing-contracts';
+import {
+  CLASSIFIER_WINNER_KV_KEY,
+  ClassifierWinnerSchema,
+  type ClassifierWinner,
+} from '@kilocode/auto-routing-contracts';
 import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import { ttlCached } from './ttl-cache';
+import { kvReadThrough } from './kv-read-through';
+import { fetchClassifierWinnerFromOrigin } from './benchmark-origin';
 
 export const CLASSIFIER_MODEL_CONFIG_KEY = 'classifier_model';
 export const DECISION_LOG_SAMPLE_RATE_CONFIG_KEY = 'decision_log_sample_rate';
@@ -17,7 +23,10 @@ const DEFAULT_DECISION_LOG_SAMPLE_RATE = 0.01;
 // read from every classification.
 const CONFIG_CACHE_TTL_MS = 60_000;
 
-type ClassifierConfigEnv = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
+type ClassifierConfigEnv = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
 
 export type ClassifierModelInfo = {
   // Effective model used by /decide: override ?? benchmark winner ?? default.
@@ -26,23 +35,28 @@ export type ClassifierModelInfo = {
   benchmarkWinner: string | null;
 };
 
-function parseBenchmarkWinner(raw: string | null): string | null {
-  if (raw === null) return null;
+function parseClassifierWinner(raw: string): ClassifierWinner | null {
   try {
     const parsed = ClassifierWinnerSchema.safeParse(JSON.parse(raw));
-    return parsed.success ? parsed.data.model : null;
+    return parsed.success ? parsed.data : null;
   } catch {
     return null;
   }
 }
 
 const classifierModelCache = ttlCached(CONFIG_CACHE_TTL_MS, async (env: ClassifierConfigEnv) => {
-  const [configuredModel, winnerRaw] = await Promise.all([
+  const [configuredModel, winner] = await Promise.all([
     env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_MODEL_CONFIG_KEY),
-    env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_WINNER_KV_KEY),
+    kvReadThrough<ClassifierWinner>({
+      kv: env.AUTO_ROUTING_CONFIG,
+      key: CLASSIFIER_WINNER_KV_KEY,
+      ttlSeconds: 3600,
+      fetchOrigin: () => fetchClassifierWinnerFromOrigin(env),
+      parse: parseClassifierWinner,
+    }),
   ]);
   const override = configuredModel?.trim() || null;
-  const benchmarkWinner = parseBenchmarkWinner(winnerRaw);
+  const benchmarkWinner = winner?.model ?? null;
   return {
     model: override ?? benchmarkWinner ?? DEFAULT_CLASSIFIER_MODEL,
     override,
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 41b9670041..1e7e553dc2 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -20,6 +20,7 @@ const analyticsTokenGet = vi.fn();
 const cacheGetEntry = vi.fn();
 const cachePutEntry = vi.fn();
 const cacheIdFromName = vi.fn(() => 'cache-do-id');
+const benchmarkFetch = vi.fn();
 const originalFetch = globalThis.fetch;
 const mockedFetch = vi.fn<typeof globalThis.fetch>();
 
@@ -32,6 +33,9 @@ const env = {
     delete: configDelete,
     put: configPut,
   },
+  BENCHMARK_SERVICE: {
+    fetch: benchmarkFetch,
+  },
   AUTO_ROUTING_CLASSIFIER_METRICS_V2: {
     writeDataPoint,
   },
@@ -131,6 +135,17 @@ describe('auto routing worker', () => {
     configDelete.mockReset();
     configDelete.mockResolvedValue(undefined);
     configPut.mockReset();
+    benchmarkFetch.mockReset();
+    benchmarkFetch.mockImplementation(async (url: string) => {
+      if (String(url).includes('/admin/classifier-winner')) {
+        return { ok: true, status: 200, json: async () => ({ winner: null }) };
+      }
+      return {
+        ok: true,
+        status: 200,
+        json: async () => ({ table: null, publishedAt: null }),
+      };
+    });
     analyticsTokenGet.mockReset();
     analyticsTokenGet.mockResolvedValue('analytics-token');
     cacheGetEntry.mockReset();
diff --git a/services/auto-routing/src/kv-read-through.test.ts b/services/auto-routing/src/kv-read-through.test.ts
new file mode 100644
index 0000000000..d1eca871b6
--- /dev/null
+++ b/services/auto-routing/src/kv-read-through.test.ts
@@ -0,0 +1,111 @@
+import { describe, expect, it, vi } from 'vitest';
+import { kvReadThrough } from './kv-read-through';
+
+function makeKv(value: string | null): { kv: KVNamespace; put: ReturnType<typeof vi.fn> } {
+  const put = vi.fn(async () => {});
+  const kv = {
+    get: vi.fn(async () => value),
+    put,
+  } as unknown as KVNamespace;
+  return { kv, put };
+}
+
+describe('kvReadThrough', () => {
+  it('returns cached value on KV hit without calling origin', async () => {
+    const value = { model: 'test/model', accuracy: 0.9 };
+    const { kv, put } = makeKv(JSON.stringify(value));
+    const fetchOrigin = vi.fn(async () => value);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'test-key',
+      ttlSeconds: 300,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as typeof value,
+    });
+
+    expect(result).toEqual(value);
+    expect(fetchOrigin).not.toHaveBeenCalled();
+    expect(put).not.toHaveBeenCalled();
+  });
+
+  it('treats a corrupt KV value as a miss, fetches from origin, and writes back with expirationTtl', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { kv, put } = makeKv('not valid json {{{');
+    const origin = { model: 'origin/model', accuracy: 0.8 };
+    const fetchOrigin = vi.fn(async () => origin);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'corrupt-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => {
+        try {
+          return JSON.parse(raw) as typeof origin;
+        } catch {
+          return null;
+        }
+      },
+    });
+
+    expect(result).toEqual(origin);
+    expect(fetchOrigin).toHaveBeenCalledOnce();
+    expect(put).toHaveBeenCalledWith('corrupt-key', JSON.stringify(origin), { expirationTtl: 3600 });
+    expect(warn).toHaveBeenCalled();
+    warn.mockRestore();
+  });
+
+  it('fetches from origin on KV miss and writes back with expirationTtl', async () => {
+    const { kv, put } = makeKv(null);
+    const origin = { model: 'from/origin', accuracy: 0.95 };
+    const fetchOrigin = vi.fn(async () => origin);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'missing-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as typeof origin,
+    });
+
+    expect(result).toEqual(origin);
+    expect(fetchOrigin).toHaveBeenCalledOnce();
+    expect(put).toHaveBeenCalledWith('missing-key', JSON.stringify(origin), { expirationTtl: 3600 });
+  });
+
+  it('returns null and does NOT write to KV when origin returns null', async () => {
+    const { kv, put } = makeKv(null);
+    const fetchOrigin = vi.fn(async () => null);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'empty-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as Record<string, unknown>,
+    });
+
+    expect(result).toBeNull();
+    expect(put).not.toHaveBeenCalled();
+  });
+
+  it('propagates origin errors without writing to KV', async () => {
+    const { kv, put } = makeKv(null);
+    const fetchOrigin = vi.fn(async () => {
+      throw new Error('origin unavailable');
+    });
+
+    await expect(
+      kvReadThrough({
+        kv,
+        key: 'throw-key',
+        ttlSeconds: 3600,
+        fetchOrigin,
+        parse: raw => JSON.parse(raw) as Record<string, unknown>,
+      })
+    ).rejects.toThrow('origin unavailable');
+
+    expect(put).not.toHaveBeenCalled();
+  });
+});
diff --git a/services/auto-routing/src/kv-read-through.ts b/services/auto-routing/src/kv-read-through.ts
new file mode 100644
index 0000000000..6df7d28d56
--- /dev/null
+++ b/services/auto-routing/src/kv-read-through.ts
@@ -0,0 +1,35 @@
+// Generic read-through cache on top of a KV namespace.
+// On KV hit: parse+validate; corrupt values are treated as misses.
+// On miss: fetch from origin; write to KV with expirationTtl on success.
+// Origin null → no KV write; origin throw → propagates to caller.
+// Write-back failures (serialize or put) are logged and never reach the caller.
+export async function kvReadThrough<T>(options: {
+  kv: KVNamespace;
+  key: string;
+  ttlSeconds: number;
+  fetchOrigin: () => Promise<T | null>;
+  parse: (raw: string) => T | null;
+  serialize?: (value: T) => string;
+}): Promise<T | null> {
+  const { kv, key, ttlSeconds, fetchOrigin, parse, serialize = JSON.stringify } = options;
+
+  const raw = await kv.get(key);
+  if (raw !== null) {
+    const parsed = parse(raw);
+    if (parsed !== null) return parsed;
+    console.warn(JSON.stringify({ event: 'kv_read_through_corrupt', key }));
+  }
+
+  // Miss (or corrupt value treated as miss): fetch from origin.
+  const value = await fetchOrigin();
+  if (value === null) return null;
+
+  // Fire-and-forget write-back; starting it inside the chain also traps sync throws.
+  void Promise.resolve()
+    .then(() => kv.put(key, serialize(value), { expirationTtl: ttlSeconds }))
+    .catch((error: unknown) => {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(JSON.stringify({ event: 'kv_read_through_put_failed', key, error: message }));
+    });
+  return value;
+}
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index 788d866945..54f77d2715 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -1,34 +1,116 @@
-import { afterEach, describe, expect, it } from 'vitest';
+import { afterEach, describe, expect, it, vi } from 'vitest';
 import { clearRoutingTableCache, DEFAULT_ROUTING_TABLE, getRoutingTable } from './routing-table';
 
-type KvStub = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
-const kvEnv = (value: string | null, onGet?: () => void): KvStub =>
-  ({
+type KvStub = Pick<Env, 'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
+
+function makeEnv(
+  kvValue: string | null,
+  opts: {
+    onGet?: () => void;
+    onPut?: (key: string, value: string, options: unknown) => void;
+    originTable?: unknown;
+    originStatus?: number;
+    originThrow?: boolean;
+  } = {}
+): KvStub {
+  return {
     AUTO_ROUTING_CONFIG: {
       get: async () => {
-        onGet?.();
-        return value;
+        opts.onGet?.();
+        return kvValue;
+      },
+      put: async (key: string, value: string, options: unknown) => {
+        opts.onPut?.(key, value, options);
       },
     },
-  }) as unknown as KvStub;
+    BENCHMARK_SERVICE: {
+      fetch: async () => {
+        if (opts.originThrow) throw new Error('benchmark unavailable');
+        return {
+          ok: opts.originStatus === undefined ? true : opts.originStatus < 400,
+          status: opts.originStatus ?? 200,
+          json: async () =>
+            opts.originTable !== undefined
+              ? { table: opts.originTable, publishedAt: '2026-06-11T00:00:00.000Z' }
+              : { table: null, publishedAt: null },
+        };
+      },
+    },
+    INTERNAL_API_SECRET_PROD: {
+      get: async () => 'test-secret',
+    },
+  } as unknown as KvStub;
+}
 
 afterEach(() => clearRoutingTableCache());
 
 describe('getRoutingTable', () => {
-  it('returns the default table when the key is missing', async () => {
-    expect(await getRoutingTable(kvEnv(null))).toEqual(DEFAULT_ROUTING_TABLE);
+  it('returns the default table when the key is missing and origin has no table', async () => {
+    expect(await getRoutingTable(makeEnv(null))).toEqual(DEFAULT_ROUTING_TABLE);
   });
-  it('returns the default table when the stored JSON is invalid', async () => {
-    expect(await getRoutingTable(kvEnv('{"nope":true}'))).toEqual(DEFAULT_ROUTING_TABLE);
+
+  it('returns the default table when the stored JSON is invalid and origin has no table', async () => {
+    expect(await getRoutingTable(makeEnv('{"nope":true}'))).toEqual(DEFAULT_ROUTING_TABLE);
     clearRoutingTableCache();
-    expect(await getRoutingTable(kvEnv('not json at all'))).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(await getRoutingTable(makeEnv('not json at all'))).toEqual(DEFAULT_ROUTING_TABLE);
   });
-  it('parses and caches a valid stored table', async () => {
+
+  it('parses and caches a valid stored table without calling origin', async () => {
     let reads = 0;
-    const env = kvEnv(JSON.stringify(DEFAULT_ROUTING_TABLE), () => reads++);
+    const fetchSpy = vi.fn(async () => ({
+      ok: true,
+      status: 200,
+      json: async () => ({ table: null, publishedAt: null }),
+    }));
+    const env: KvStub = {
+      AUTO_ROUTING_CONFIG: {
+        get: async () => {
+          reads++;
+          return JSON.stringify(DEFAULT_ROUTING_TABLE);
+        },
+        put: async () => {},
+      },
+      BENCHMARK_SERVICE: { fetch: fetchSpy },
+      INTERNAL_API_SECRET_PROD: { get: async () => 'secret' },
+    } as unknown as KvStub;
+
     const first = await getRoutingTable(env);
     await getRoutingTable(env);
     expect(first.version).toBe(DEFAULT_ROUTING_TABLE.version);
     expect(reads).toBe(1);
+    expect(fetchSpy).not.toHaveBeenCalled();
+  });
+
+  it('fetches from origin on KV miss, writes to KV with expirationTtl, and returns the table', async () => {
+    const puts: Array<{ key: string; value: string; options: unknown }> = [];
+    const env = makeEnv(null, {
+      originTable: DEFAULT_ROUTING_TABLE,
+      onPut: (key, value, options) => puts.push({ key, value, options }),
+    });
+
+    const result = await getRoutingTable(env);
+    expect(result).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(puts).toHaveLength(1);
+    expect(puts[0].key).toBe('routing_table_v1');
+    expect(puts[0].options).toEqual({ expirationTtl: 3600 });
+  });
+
+  it('returns the default table when origin responds non-OK', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const env = makeEnv(null, { originStatus: 500 });
+    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
+    warn.mockRestore();
+  });
+
+  it('returns the default table when origin throws', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const env = makeEnv(null, { originThrow: true });
+    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
+    warn.mockRestore();
+  });
+
+  it('returns the default table when origin returns null table', async () => {
+    const env = makeEnv(null, { originTable: undefined });
+    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
   });
 });
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
index aa2baccce4..eeaebb1973 100644
--- a/services/auto-routing/src/routing-table.ts
+++ b/services/auto-routing/src/routing-table.ts
@@ -5,6 +5,8 @@ import {
   type RoutingTable,
 } from '@kilocode/auto-routing-contracts';
 import { ttlCached } from './ttl-cache';
+import { kvReadThrough } from './kv-read-through';
+import { fetchRoutingTableFromOrigin } from './benchmark-origin';
 
 // Safety net used until the first decider benchmark publishes a table (and
 // whenever the stored table is missing or unparseable). Mirrors the static
@@ -54,27 +56,39 @@ export const DEFAULT_ROUTING_TABLE: RoutingTable = {
 
 const ROUTING_TABLE_CACHE_TTL_MS = 60_000;
 
-type RoutingTableEnv = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
+type RoutingTableEnv = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
 
 const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: RoutingTableEnv) => {
-  const raw = await env.AUTO_ROUTING_CONFIG.get(ROUTING_TABLE_KV_KEY);
-  if (raw === null) return DEFAULT_ROUTING_TABLE;
-  try {
-    const parsed = RoutingTableSchema.safeParse(JSON.parse(raw));
-    if (!parsed.success) {
-      console.warn(
-        JSON.stringify({
-          event: 'auto_routing_table_invalid',
-          issues: parsed.error.issues.slice(0, 5).map(i => `${i.path.join('.')}: ${i.code}`),
-        })
-      );
-      return DEFAULT_ROUTING_TABLE;
-    }
-    return parsed.data;
-  } catch (error) {
-    console.warn(JSON.stringify({ event: 'auto_routing_table_invalid', ...formatError(error) }));
-    return DEFAULT_ROUTING_TABLE;
-  }
+  const table = await kvReadThrough({
+    kv: env.AUTO_ROUTING_CONFIG,
+    key: ROUTING_TABLE_KV_KEY,
+    ttlSeconds: 3600,
+    fetchOrigin: () => fetchRoutingTableFromOrigin(env),
+    parse: raw => {
+      try {
+        const parsed = RoutingTableSchema.safeParse(JSON.parse(raw));
+        if (!parsed.success) {
+          console.warn(
+            JSON.stringify({
+              event: 'auto_routing_table_invalid',
+              issues: parsed.error.issues.slice(0, 5).map(i => `${i.path.join('.')}: ${i.code}`),
+            })
+          );
+          return null;
+        }
+        return parsed.data;
+      } catch (error) {
+        console.warn(
+          JSON.stringify({ event: 'auto_routing_table_invalid', ...formatError(error) })
+        );
+        return null;
+      }
+    },
+  });
+  return table ?? DEFAULT_ROUTING_TABLE;
 });
 
 export function clearRoutingTableCache(): void {
diff --git a/services/auto-routing/worker-configuration.d.ts b/services/auto-routing/worker-configuration.d.ts
index 6b69a65d5b..e91b95e923 100644
--- a/services/auto-routing/worker-configuration.d.ts
+++ b/services/auto-routing/worker-configuration.d.ts
@@ -1,5 +1,5 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 0d84c4429525cf1b432d2ffe636e1ca8)
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 7e1033a1604c8e567cadb72d6145fa58)
 interface __BaseEnv_Env {
 	AUTO_ROUTING_CONFIG: KVNamespace;
 	AUTO_ROUTING_CLASSIFIER_METRICS_V2: AnalyticsEngineDataset;
@@ -8,6 +8,7 @@ interface __BaseEnv_Env {
 	O11Y_CF_AE_API_TOKEN: SecretsStoreSecret;
 	O11Y_CF_ACCOUNT_ID: "e115e769bcdd4c3d66af59d3332cb394";
 	AUTO_ROUTING_DECISION_CACHE: DurableObjectNamespace<import("./src/index").AutoRoutingDecisionCacheDO>;
+	BENCHMARK_SERVICE: Fetcher /* auto-routing-benchmark */;
 }
 declare namespace Cloudflare {
 	interface GlobalProps {
diff --git a/services/auto-routing/wrangler.jsonc b/services/auto-routing/wrangler.jsonc
index ddcf6d9baa..297c4557e8 100644
--- a/services/auto-routing/wrangler.jsonc
+++ b/services/auto-routing/wrangler.jsonc
@@ -45,6 +45,8 @@
       "dataset": "auto_routing_classifier_metrics_v2",
     },
   ],
+  "services": [{ "binding": "BENCHMARK_SERVICE", "service": "auto-routing-benchmark" }],
+
   "kv_namespaces": [
     {
       "binding": "AUTO_ROUTING_CONFIG",

From 36f32a7a39f0fc2f9b69b10c4aa0faf9829cdb41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 13:10:26 +0200
Subject: [PATCH 37/73] fix(auto-routing): await read-through cache writes and
 surface origin error bodies

---
 services/auto-routing/src/benchmark-origin.ts    | 14 ++++++--------
 .../auto-routing/src/kv-read-through.test.ts     |  8 ++++++--
 services/auto-routing/src/kv-read-through.ts     | 16 +++++++++++-----
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/services/auto-routing/src/benchmark-origin.ts b/services/auto-routing/src/benchmark-origin.ts
index a2e8ad5c39..ecbadc2fd6 100644
--- a/services/auto-routing/src/benchmark-origin.ts
+++ b/services/auto-routing/src/benchmark-origin.ts
@@ -9,19 +9,17 @@ type BenchmarkEnv = Pick<Env, 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
 
 async function fetchBenchmark(env: BenchmarkEnv, path: string): Promise<unknown> {
   const secret = await env.INTERNAL_API_SECRET_PROD.get();
-  const res = await env.BENCHMARK_SERVICE.fetch(
-    `https://auto-routing-benchmark${path}`,
-    { headers: { authorization: `Bearer ${secret}` } }
-  );
+  const res = await env.BENCHMARK_SERVICE.fetch(`https://auto-routing-benchmark${path}`, {
+    headers: { authorization: `Bearer ${secret}` },
+  });
   if (!res.ok) {
-    throw new Error(`benchmark origin ${path} responded ${res.status}`);
+    const detail = (await res.text().catch(() => '')).slice(0, 200);
+    throw new Error(`benchmark origin ${path} responded ${res.status} ${detail}`);
   }
   return res.json();
 }
 
-export async function fetchRoutingTableFromOrigin(
-  env: BenchmarkEnv
-): Promise<RoutingTable | null> {
+export async function fetchRoutingTableFromOrigin(env: BenchmarkEnv): Promise<RoutingTable | null> {
   const body = await fetchBenchmark(env, '/admin/routing-table');
   const parsed = BenchmarkRoutingTableResponseSchema.safeParse(body);
   if (!parsed.success) {
diff --git a/services/auto-routing/src/kv-read-through.test.ts b/services/auto-routing/src/kv-read-through.test.ts
index d1eca871b6..2b154d4959 100644
--- a/services/auto-routing/src/kv-read-through.test.ts
+++ b/services/auto-routing/src/kv-read-through.test.ts
@@ -51,7 +51,9 @@ describe('kvReadThrough', () => {
 
     expect(result).toEqual(origin);
     expect(fetchOrigin).toHaveBeenCalledOnce();
-    expect(put).toHaveBeenCalledWith('corrupt-key', JSON.stringify(origin), { expirationTtl: 3600 });
+    expect(put).toHaveBeenCalledWith('corrupt-key', JSON.stringify(origin), {
+      expirationTtl: 3600,
+    });
     expect(warn).toHaveBeenCalled();
     warn.mockRestore();
   });
@@ -71,7 +73,9 @@ describe('kvReadThrough', () => {
 
     expect(result).toEqual(origin);
     expect(fetchOrigin).toHaveBeenCalledOnce();
-    expect(put).toHaveBeenCalledWith('missing-key', JSON.stringify(origin), { expirationTtl: 3600 });
+    expect(put).toHaveBeenCalledWith('missing-key', JSON.stringify(origin), {
+      expirationTtl: 3600,
+    });
   });
 
   it('returns null and does NOT write to KV when origin returns null', async () => {
diff --git a/services/auto-routing/src/kv-read-through.ts b/services/auto-routing/src/kv-read-through.ts
index 6df7d28d56..96300c91e2 100644
--- a/services/auto-routing/src/kv-read-through.ts
+++ b/services/auto-routing/src/kv-read-through.ts
@@ -18,9 +18,7 @@ export async function kvReadThrough<T>(options: {
     if (parsed !== null) {
       return parsed;
     }
-    console.warn(
-      JSON.stringify({ event: 'kv_read_through_corrupt', key })
-    );
+    console.warn(JSON.stringify({ event: 'kv_read_through_corrupt', key }));
   }
 
   // Miss (or corrupt value treated as miss): fetch from origin.
@@ -29,7 +27,15 @@ export async function kvReadThrough<T>(options: {
     return null;
   }
 
-  // Fire-and-forget the KV put; failures here should not block the response.
-  void kv.put(key, serialize(value), { expirationTtl: ttlSeconds });
+  // Awaited: an unawaited promise without waitUntil may be cancelled when the
+  // request ends, silently dropping the cache write. A put failure must not
+  // discard the value we already fetched, so it only warns.
+  await kv
+    .put(key, serialize(value), { expirationTtl: ttlSeconds })
+    .catch((error: unknown) =>
+      console.warn(
+        JSON.stringify({ event: 'kv_read_through_put_failed', key, error: String(error) })
+      )
+    );
   return value;
 }

From aa14657b4f1d0fb6937e9c4612bf43936fcc044d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 13:11:08 +0200
Subject: [PATCH 38/73] ci(workers): run worker predeploy scripts (D1
 migrations) before deploy

---
 .github/workflows/deploy-workers.yml         | 8 ++++++++
 services/auto-routing-benchmark/package.json | 1 +
 2 files changed, 9 insertions(+)

diff --git a/.github/workflows/deploy-workers.yml b/.github/workflows/deploy-workers.yml
index a24e2dc86d..4f09f92dd6 100644
--- a/.github/workflows/deploy-workers.yml
+++ b/.github/workflows/deploy-workers.yml
@@ -46,6 +46,10 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ inputs.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
 
   # ── Push to main: detect changed workers, deploy each one ─────────────────
@@ -148,4 +152,8 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ matrix.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
index 7a38a89ba4..46745f55d3 100644
--- a/services/auto-routing-benchmark/package.json
+++ b/services/auto-routing-benchmark/package.json
@@ -5,6 +5,7 @@
   "type": "module",
   "scripts": {
     "deploy": "wrangler deploy",
+    "predeploy": "wrangler d1 migrations apply auto-routing-benchmark --remote",
     "dev": "wrangler dev",
     "types": "wrangler types --include-runtime=false",
     "typecheck": "tsgo --noEmit",

From 82aef0be273ea96803ca606730f8836dfacdb08f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 14:00:01 +0200
Subject: [PATCH 39/73] fix(auto-routing-benchmark): reuse loaded run state in
 finalize and build tables from the run snapshot

---
 .../src/routing-table-builder.test.ts         | 65 ++++++++++---------
 .../src/routing-table-builder.ts              | 14 ++--
 services/auto-routing-benchmark/src/run.ts    | 39 +++++++++--
 3 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index c08ef576be..265a321714 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -1,24 +1,19 @@
 import { describe, expect, it } from 'vitest';
-import type { BenchmarkConfig, BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+import type {
+  BenchmarkDeciderModel,
+  BenchmarkModelSummary,
+} from '@kilocode/auto-routing-contracts';
 import { buildRoutingTable } from './routing-table-builder';
 
-const BASE_CONFIG: BenchmarkConfig = {
-  classifierModels: ['some/classifier'],
-  deciderModels: [
-    { id: 'model/cheap', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
-    {
-      id: 'model/expensive',
-      supportedApiKinds: ['chat_completions', 'responses'],
-      reasoningEffort: null,
-    },
-    { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'], reasoningEffort: null },
-  ],
-  minAccuracy: 0.7,
-  maxConcurrency: 4,
-  benchmarkUserId: null,
-  updatedAt: null,
-  updatedBy: null,
-};
+const DECIDER_MODELS: BenchmarkDeciderModel[] = [
+  { id: 'model/cheap', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
+  {
+    id: 'model/expensive',
+    supportedApiKinds: ['chat_completions', 'responses'],
+    reasoningEffort: null,
+  },
+  { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'], reasoningEffort: null },
+];
 
 function summary(
   model: string,
@@ -55,7 +50,8 @@ describe('buildRoutingTable', () => {
     const table = buildRoutingTable({
       runId: 'test-run-1',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
 
@@ -81,7 +77,8 @@ describe('buildRoutingTable', () => {
     const table = buildRoutingTable({
       runId: 'test-run-2',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
 
@@ -107,7 +104,8 @@ describe('buildRoutingTable', () => {
     const table = buildRoutingTable({
       runId: 'test-run-3',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries,
     });
 
@@ -117,11 +115,12 @@ describe('buildRoutingTable', () => {
     expect(highModels).toContain('model/mid');
   });
 
-  it('carries supportedApiKinds from config', () => {
+  it('carries supportedApiKinds from the run snapshot', () => {
     const table = buildRoutingTable({
       runId: 'test-run-4',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
 
@@ -132,7 +131,7 @@ describe('buildRoutingTable', () => {
     expect(midInLow?.supportedApiKinds).toEqual(['chat_completions', 'messages']);
   });
 
-  it('defaults supportedApiKinds to chat_completions when model missing from config', () => {
+  it('defaults supportedApiKinds to chat_completions when model missing from the snapshot', () => {
     const summaries: BenchmarkModelSummary[] = [
       summary('model/unknown', 'low', 0.9),
       summary('model/cheap', 'low', 0.8),
@@ -142,13 +141,11 @@ describe('buildRoutingTable', () => {
       summary('model/unknown', 'high', 0.9),
     ];
 
-    // Add a model that isn't in deciderModels
-    const config = { ...BASE_CONFIG };
-
     const table = buildRoutingTable({
       runId: 'test-run-5',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries,
     });
 
@@ -171,7 +168,8 @@ describe('buildRoutingTable', () => {
       buildRoutingTable({
         runId: 'test-run-6',
         generatedAt: '2026-01-01T00:00:00.000Z',
-        config: BASE_CONFIG,
+        minAccuracy: 0.7,
+        deciderModels: DECIDER_MODELS,
         summaries,
       })
     ).toThrow();
@@ -190,7 +188,8 @@ describe('buildRoutingTable', () => {
       buildRoutingTable({
         runId: 'test-run-7',
         generatedAt: '2026-01-01T00:00:00.000Z',
-        config: BASE_CONFIG,
+        minAccuracy: 0.7,
+        deciderModels: DECIDER_MODELS,
         summaries,
       })
     ).toThrow();
@@ -208,7 +207,8 @@ describe('buildRoutingTable', () => {
     const table = buildRoutingTable({
       runId: 'test-run-8',
       generatedAt: '2026-01-01T00:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries,
     });
 
@@ -220,7 +220,8 @@ describe('buildRoutingTable', () => {
     const table = buildRoutingTable({
       runId: 'decider-2026-01-01',
       generatedAt: '2026-01-01T12:00:00.000Z',
-      config: BASE_CONFIG,
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
 
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index d6d6e625a3..4230b9077a 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -1,7 +1,7 @@
 import {
   rankCandidates,
   RoutingTableSchema,
-  type BenchmarkConfig,
+  type BenchmarkDeciderModel,
   type BenchmarkModelSummary,
   type DifficultyTier,
   type RoutingTable,
@@ -10,14 +10,16 @@ import {
 // Builds the routing table from per-(model, tier) decider summaries. Models
 // with zero graded cases in a tier are excluded from that tier. Throws when
 // any tier ends up empty so the caller keeps the previous published table.
+// deciderModels/minAccuracy come from the run's snapshot, not live config.
 export function buildRoutingTable(params: {
   runId: string;
   generatedAt: string;
-  config: BenchmarkConfig;
+  minAccuracy: number;
+  deciderModels: BenchmarkDeciderModel[];
   summaries: BenchmarkModelSummary[];
 }): RoutingTable {
-  const { runId, generatedAt, config, summaries } = params;
-  const modelConfigById = new Map(config.deciderModels.map(m => [m.id, m] as const));
+  const { runId, generatedAt, minAccuracy, deciderModels, summaries } = params;
+  const modelConfigById = new Map(deciderModels.map(m => [m.id, m] as const));
 
   const tierCandidates = (t: DifficultyTier) =>
     rankCandidates(
@@ -33,13 +35,13 @@ export function buildRoutingTable(params: {
           ],
           reasoningEffort: modelConfigById.get(s.model)?.reasoningEffort ?? null,
         })),
-      config.minAccuracy
+      minAccuracy
     );
 
   const table: RoutingTable = {
     version: runId,
     generatedAt,
-    minAccuracy: config.minAccuracy,
+    minAccuracy,
     source: 'benchmark',
     tiers: {
       low: tierCandidates('low'),
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 449c7ddbde..5a2bb2ffa8 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -2,6 +2,7 @@ import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classif
 import {
   CLASSIFIER_WINNER_KV_KEY,
   ROUTING_TABLE_KV_KEY,
+  type BenchmarkDeciderModel,
   type BenchmarkKind,
   type BenchmarkModelSummary,
 } from '@kilocode/auto-routing-contracts';
@@ -13,6 +14,7 @@ import { DECIDER_CASES } from './datasets/decider-cases';
 import type { RunModelRow } from './db';
 import {
   apiKindsToFlags,
+  flagsToApiKinds,
   countCaseResults,
   getCaseResults,
   getLatestSummariesByModel,
@@ -151,8 +153,14 @@ export async function startRun(
   if (enqueuedModelIds.length === 0) {
     // Everything already has results: complete immediately and republish the
     // aggregate so config-only changes (model removed, threshold tweaked)
-    // take effect without re-running any model.
-    await finalizeRunIfComplete(env, runId, kind);
+    // take effect without re-running any model. The state mirrors the rows
+    // insertRun just wrote, so no re-read is needed.
+    await finalizeRunIfComplete(env, runId, kind, {
+      maxConcurrency: config.maxConcurrency,
+      minAccuracy: config.minAccuracy,
+      benchmarkUserId: config.benchmarkUserId,
+      models: runModelRows,
+    });
     return { runId, enqueuedModels: 0, skippedModels };
   }
 
@@ -238,7 +246,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
     await processDeciderJob(env, message, state);
   }
 
-  await finalizeRunIfComplete(env, message.runId, message.kind);
+  await finalizeRunIfComplete(env, message.runId, message.kind, state);
 }
 
 type RunState = {
@@ -424,8 +432,13 @@ export async function runCasesWithConcurrency<T>(
   await Promise.all(workers);
 }
 
-async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKind): Promise<void> {
-  const state = await getRunState(env, runId);
+async function finalizeRunIfComplete(
+  env: Env,
+  runId: string,
+  kind: BenchmarkKind,
+  // Run snapshot already loaded by the caller (startRun / processJob).
+  state: RunState
+): Promise<void> {
   const enqueuedModels = state.models.filter(m => m.enqueued);
   const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
   const expected = enqueuedModels.length * caseCount;
@@ -462,8 +475,20 @@ async function finalizeRunIfComplete(env: Env, runId: string, kind: BenchmarkKin
   if (kind === 'decider') {
     const generatedAt = new Date().toISOString();
     try {
-      const config = await getBenchmarkConfig(env.BENCH_DB);
-      const table = buildRoutingTable({ runId, generatedAt, config, summaries: allSummaries });
+      // Built from the run's own model snapshot, not live config, so a mid-run
+      // admin edit can't skew the published table.
+      const deciderModels: BenchmarkDeciderModel[] = state.models.map(m => ({
+        id: m.model,
+        supportedApiKinds: flagsToApiKinds(m),
+        reasoningEffort: m.reasoning_effort as BenchmarkDeciderModel['reasoningEffort'],
+      }));
+      const table = buildRoutingTable({
+        runId,
+        generatedAt,
+        minAccuracy: state.minAccuracy,
+        deciderModels,
+        summaries: allSummaries,
+      });
       await saveRoutingTable(env.BENCH_DB, table, generatedAt);
       // Clear KV so the auto-routing worker repopulates from D1 on next request.
       await env.AUTO_ROUTING_CONFIG.delete(ROUTING_TABLE_KV_KEY);

From 4a7478bd6ea136757f0459e6311ab566ddc9fdc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 14:22:21 +0200
Subject: [PATCH 40/73] refactor(auto-routing): share ttl cache, single-source
 schemas and drop dead exports

- Move TtlCache/ttlCached to @kilocode/worker-utils; delete the two
  identical service-local copies and update all import sites
- Single-source ReasoningEffortSchema in packages/auto-routing-contracts/tiers.ts;
  routing-table.ts and index.ts use it; benchmark.ts re-exports for compatibility
- Add BenchmarkRunStatus type to contracts; db-schema.ts uses it instead of
  the inline literal union
- Replace local ApiKind in benchmark db.ts with ClassifierApiKind from contracts
- Extract DecideBaseParams / buildDecidePayload shared helper from mirror into
  auto-routing-mirror.ts; auto-routing-decision.ts consumes it
- Delete AutoRoutingAdminResult<T> type alias from both admin client files
  (zero consumers); delete BenchmarkRoutingTableResponseSchema re-export from
  benchmark admin client (consumers import from contracts directly)
- Replace route.ts timingSafeStringEqual with timingSafeEqual from
  @kilocode/encryption; keep extractBearerToken local (jose/jest constraint)
- Replace inline 'classifier'|'decider' and api-kind array types in
  BenchmarksSection.tsx with BenchmarkKind and ClassifierApiKind from contracts
---
 .../admin/auto-routing/BenchmarksSection.tsx  | 12 ++---
 .../auto-routing-benchmark/token/route.ts     | 12 +----
 .../ai-gateway/auto-routing-admin-client.ts   |  3 --
 .../auto-routing-benchmark-admin-client.ts    |  8 ---
 .../lib/ai-gateway/auto-routing-decision.ts   | 37 ++-----------
 .../src/lib/ai-gateway/auto-routing-mirror.ts | 53 ++++++++++++-------
 .../auto-routing-contracts/src/benchmark.ts   |  9 ++--
 packages/auto-routing-contracts/src/index.ts  |  4 +-
 .../src/routing-table.ts                      |  3 +-
 packages/auto-routing-contracts/src/tiers.ts  |  3 ++
 packages/worker-utils/src/index.ts            |  3 ++
 .../worker-utils}/src/ttl-cache.ts            |  0
 .../auto-routing-benchmark/src/db-schema.ts   |  4 +-
 services/auto-routing-benchmark/src/db.ts     |  8 +--
 .../auto-routing-benchmark/src/openrouter.ts  |  2 +-
 .../auto-routing/src/classifier-config.ts     |  3 +-
 services/auto-routing/src/openrouter.ts       |  2 +-
 services/auto-routing/src/routing-table.ts    |  3 +-
 services/auto-routing/src/ttl-cache.ts        | 35 ------------
 19 files changed, 69 insertions(+), 135 deletions(-)
 rename {services/auto-routing-benchmark => packages/worker-utils}/src/ttl-cache.ts (100%)
 delete mode 100644 services/auto-routing/src/ttl-cache.ts

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index feea0023bd..d79be0e52a 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -6,9 +6,11 @@ import {
   BenchmarkRunsResponseSchema,
   StartBenchmarkRunResponseSchema,
   type BenchmarkConfig,
+  type BenchmarkKind,
   type BenchmarkRoutingTableResponse,
   type BenchmarkRun,
   type BenchmarkModelSummary,
+  type ClassifierApiKind,
   type ReasoningEffort,
 } from '@kilocode/auto-routing-contracts';
 import React, { useCallback, useEffect, useState } from 'react';
@@ -80,13 +82,7 @@ async function fetchBenchmarkRuns() {
   return parseAdminResponse(response, BenchmarkRunsResponseSchema);
 }
 
-async function startBenchmarkRun({
-  kind,
-  force,
-}: {
-  kind: 'classifier' | 'decider';
-  force: boolean;
-}) {
+async function startBenchmarkRun({ kind, force }: { kind: BenchmarkKind; force: boolean }) {
   const response = await fetch('/admin/api/auto-routing/benchmark-runs', {
     method: 'POST',
     headers: { 'content-type': 'application/json' },
@@ -148,7 +144,7 @@ function formStateToConfig(
   const deciderModels = state.deciderModels
     .filter(row => row.id.trim().length > 0)
     .map(row => {
-      const kinds: Array<'chat_completions' | 'responses' | 'messages'> = [];
+      const kinds: ClassifierApiKind[] = [];
       if (row.chat_completions) kinds.push('chat_completions');
       if (row.responses) kinds.push('responses');
       if (row.messages) kinds.push('messages');
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
index 82b4e6d79d..ef9bf4f93e 100644
--- a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
@@ -23,7 +23,7 @@
 
 import type { NextRequest } from 'next/server';
 import { NextResponse } from 'next/server';
-import { timingSafeEqual } from 'node:crypto';
+import { timingSafeEqual } from '@kilocode/encryption';
 import { z } from 'zod';
 import { eq } from 'drizzle-orm';
 import { kilocode_users } from '@kilocode/db/schema';
@@ -45,17 +45,9 @@ function extractBearerToken(authHeader: string | null): string | null {
   return trimmed.slice(7).trim() || null;
 }
 
-function timingSafeStringEqual(a: string, b: string): boolean {
-  const encoder = new TextEncoder();
-  const bufA = encoder.encode(a);
-  const bufB = encoder.encode(b);
-  if (bufA.byteLength !== bufB.byteLength) return false;
-  return timingSafeEqual(bufA, bufB);
-}
-
 export async function POST(req: NextRequest) {
   const token = extractBearerToken(req.headers.get('authorization'));
-  if (!INTERNAL_API_SECRET || !token || !timingSafeStringEqual(token, INTERNAL_API_SECRET)) {
+  if (!INTERNAL_API_SECRET || !token || !timingSafeEqual(token, INTERNAL_API_SECRET)) {
     return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
   }
 
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
index 49d4d2ca07..937c589cf9 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
@@ -5,9 +5,6 @@ import {
 } from '@kilocode/auto-routing-contracts';
 import { AUTO_ROUTING_WORKER_URL } from '@/lib/config.server';
 import { createWorkerAdminFetch } from './worker-admin-fetch';
-import type { WorkerAdminResult } from './worker-admin-fetch';
-
-export type AutoRoutingAdminResult<T> = WorkerAdminResult<T>;
 
 const fetchAutoRoutingAdmin = createWorkerAdminFetch({
   workerUrl: AUTO_ROUTING_WORKER_URL,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
index 3939234c55..56a345053e 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -6,16 +6,8 @@ import {
   type BenchmarkConfig,
   type BenchmarkKind,
 } from '@kilocode/auto-routing-contracts';
-
-export {
-  BenchmarkRoutingTableResponseSchema,
-  type BenchmarkRoutingTableResponse,
-} from '@kilocode/auto-routing-contracts';
 import { AUTO_ROUTING_BENCHMARK_WORKER_URL } from '@/lib/config.server';
 import { createWorkerAdminFetch } from './worker-admin-fetch';
-import type { WorkerAdminResult } from './worker-admin-fetch';
-
-export type AutoRoutingAdminResult<T> = WorkerAdminResult<T>;
 
 const fetchBenchmarkAdmin = createWorkerAdminFetch({
   workerUrl: AUTO_ROUTING_BENCHMARK_WORKER_URL,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
index bf8b3b1f95..a7e35022c4 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
@@ -1,28 +1,15 @@
 import {
   AutoRoutingDecisionResponseSchema,
-  normalizeClassifierInput,
   type AutoRoutingDecision,
-  type ClassifierApiKind,
-  type MirrorPayload,
 } from '@kilocode/auto-routing-contracts';
 import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
 import { warnExceptInTest } from '@/lib/utils.server';
+import { buildDecidePayload, type DecideBaseParams } from './auto-routing-mirror';
 
 export const EFFICIENT_DECISION_TIMEOUT_MS = 2_000;
 
-export type EfficientDecisionParams = {
-  apiKind: ClassifierApiKind;
-  body: unknown;
-  requestedModel: string;
-  providerHints: MirrorPayload['input']['providerHints'];
-  bodyBytes: number;
-  userId: string;
-  sessionId: string | null;
-  machineId: string | null;
-  clientRequestId: string | null;
-  mode: string | null;
-  userAgent: string | null;
-};
+// EfficientDecisionParams is an alias for the shared base params type.
+export type EfficientDecisionParams = DecideBaseParams;
 
 type FetchEfficientDecisionOptions = {
   workerUrl?: string;
@@ -43,22 +30,8 @@ export async function fetchEfficientAutoDecision(
   const onError = options.onError ?? warnExceptInTest;
   if (!workerUrl || !authToken) return null;
 
-  const normalizedInput = normalizeClassifierInput(params.apiKind, params.body, {
-    requestedModel: params.requestedModel,
-    providerHints: params.providerHints,
-  });
-  if (!normalizedInput) return null;
-
-  const payload: MirrorPayload = {
-    input: normalizedInput,
-    userId: params.userId,
-    sessionId: params.sessionId,
-    machineId: params.machineId,
-    clientRequestId: params.clientRequestId,
-    mode: params.mode,
-    userAgent: params.userAgent,
-    bodyBytes: params.bodyBytes,
-  };
+  const payload = buildDecidePayload(params);
+  if (!payload) return null;
 
   try {
     const response = await fetch(`${workerUrl}/decide`, {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
index 6192bb9bef..1c730446dd 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
@@ -4,11 +4,11 @@ import { after } from 'next/server';
 import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
 import { warnExceptInTest } from '@/lib/utils.server';
 
-type ScheduleAutoRoutingMirrorParams = {
+// Shared base params for both the mirror (fire-and-forget) and the
+// efficient-decision (blocking) call sites. Keep in sync with
+// EfficientDecisionParams in auto-routing-decision.ts.
+export type DecideBaseParams = {
   apiKind: ClassifierApiKind;
-  // The parsed gateway request body. Provider transforms may mutate it after
-  // scheduling, which is why the requested model and provider hints are
-  // captured separately before any mutation.
   body: unknown;
   requestedModel: string;
   providerHints: MirrorPayload['input']['providerHints'];
@@ -19,6 +19,33 @@ type ScheduleAutoRoutingMirrorParams = {
   clientRequestId: string | null;
   mode: string | null;
   userAgent: string | null;
+};
+
+// Normalizes the request body and assembles the /decide payload shared by the
+// mirror and efficient-decision callers. Returns null if normalization fails.
+export function buildDecidePayload(params: DecideBaseParams): MirrorPayload | null {
+  const normalizedInput = normalizeClassifierInput(params.apiKind, params.body, {
+    requestedModel: params.requestedModel,
+    providerHints: params.providerHints,
+  });
+  if (!normalizedInput) return null;
+
+  return {
+    input: normalizedInput,
+    userId: params.userId,
+    sessionId: params.sessionId,
+    machineId: params.machineId,
+    clientRequestId: params.clientRequestId,
+    mode: params.mode,
+    userAgent: params.userAgent,
+    bodyBytes: params.bodyBytes,
+  };
+}
+
+type ScheduleAutoRoutingMirrorParams = DecideBaseParams & {
+  // Note on the inherited `body` field: it is the parsed gateway request body.
+  // Provider transforms may mutate it after scheduling, which is why the
+  // requested model and provider hints are captured separately beforehand.
   authContext?: Promise<{ organizationId?: string | null }>;
 };
 
@@ -41,11 +68,8 @@ async function sendAutoRoutingMirror(
   // Normalizing here (in background work, off the request path) keeps the
   // mirror payload at a few KB instead of the full request body, and lets
   // requests the worker could not classify anyway skip the mirror call.
-  const normalizedInput = normalizeClassifierInput(params.apiKind, params.body, {
-    requestedModel: params.requestedModel,
-    providerHints: params.providerHints,
-  });
-  if (!normalizedInput) {
+  const payload = buildDecidePayload(params);
+  if (!payload) {
     const onError = options.onError ?? warnExceptInTest;
     onError('Auto routing mirror skipped unclassifiable request body', {
       error: 'normalize_failed',
@@ -53,17 +77,6 @@ async function sendAutoRoutingMirror(
     return;
   }
 
-  const payload: MirrorPayload = {
-    input: normalizedInput,
-    userId: params.userId,
-    sessionId: params.sessionId,
-    machineId: params.machineId,
-    clientRequestId: params.clientRequestId,
-    mode: params.mode,
-    userAgent: params.userAgent,
-    bodyBytes: params.bodyBytes,
-  };
-
   const response = await fetch(`${workerUrl}/decide`, {
     method: 'POST',
     headers: new Headers({
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 61c38bcff6..2ed7d2d106 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,13 +1,13 @@
 import * as z from 'zod';
 import { ClassifierApiKindSchema, RoutingTableSchema } from './routing-table';
-import { DifficultyTierSchema } from './tiers';
+import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+
+export { ReasoningEffortSchema } from './tiers';
+export type { ReasoningEffort } from './tiers';
 
 export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
 export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
 
-export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
-export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
-
 export const BenchmarkDeciderModelSchema = z.object({
   id: z.string().trim().min(1),
   // Which gateway API kinds this model can serve when chosen by the router.
@@ -36,6 +36,7 @@ export const BenchmarkConfigSchema = z.object({
 export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
 
 export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']);
+export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
 
 export const BenchmarkModelSummarySchema = z.object({
   model: z.string(),
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index 9743536692..ce0d57021b 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -1,6 +1,6 @@
 import * as z from 'zod';
 import { NormalizedClassifierInputSchema } from './input';
-import { DifficultyTierSchema } from './tiers';
+import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
 
 export {
   NormalizedClassifierInputSchema,
@@ -103,7 +103,7 @@ export const AutoRoutingDecisionSchema = z.object({
   source: z.enum(['benchmark', 'default']),
   tableVersion: z.string(),
   // Mirrors the effort the chosen model was benchmarked with, when set.
-  reasoningEffort: z.enum(['minimal', 'low', 'medium', 'high']).nullable().optional(),
+  reasoningEffort: ReasoningEffortSchema.nullable().optional(),
 });
 export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
 
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index f134cba8db..b48d8291e6 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -1,4 +1,5 @@
 import * as z from 'zod';
+import { ReasoningEffortSchema } from './tiers';
 
 export const ClassifierApiKindSchema = z.enum(['chat_completions', 'responses', 'messages']);
 
@@ -12,7 +13,7 @@ export const RankedCandidateSchema = z.object({
   supportedApiKinds: z.array(ClassifierApiKindSchema).min(1),
   // Reasoning effort the model was benchmarked with; serving mirrors it.
   // Optional so tables published before this field existed stay valid.
-  reasoningEffort: z.enum(['minimal', 'low', 'medium', 'high']).nullable().optional(),
+  reasoningEffort: ReasoningEffortSchema.nullable().optional(),
 });
 export type RankedCandidate = z.infer<typeof RankedCandidateSchema>;
 
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
index 5315174c84..8358c5e3bf 100644
--- a/packages/auto-routing-contracts/src/tiers.ts
+++ b/packages/auto-routing-contracts/src/tiers.ts
@@ -1,6 +1,9 @@
 import * as z from 'zod';
 
 export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
+
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
 export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
 
 export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high'];
diff --git a/packages/worker-utils/src/index.ts b/packages/worker-utils/src/index.ts
index 1c61b58ff1..066a693587 100644
--- a/packages/worker-utils/src/index.ts
+++ b/packages/worker-utils/src/index.ts
@@ -99,6 +99,9 @@ export type { RepoCoordinates } from './git-url.js';
 
 export { KILO_MODEL_PREFIX, unprefixKiloGatewayModelId } from './kilo-model-id.js';
 
+export { ttlCached } from './ttl-cache.js';
+export type { TtlCache } from './ttl-cache.js';
+
 export {
   CloudAgentQueueReportSchema,
   CloudAgentRunStatuses,
diff --git a/services/auto-routing-benchmark/src/ttl-cache.ts b/packages/worker-utils/src/ttl-cache.ts
similarity index 100%
rename from services/auto-routing-benchmark/src/ttl-cache.ts
rename to packages/worker-utils/src/ttl-cache.ts
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 9fac9afb21..90b66f62c3 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -1,5 +1,5 @@
 import { index, integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core';
-import type { BenchmarkKind } from '@kilocode/auto-routing-contracts';
+import type { BenchmarkKind, BenchmarkRunStatus } from '@kilocode/auto-routing-contracts';
 
 // Migrations are generated via `pnpm db:generate` (drizzle-kit) and applied
 // via wrangler d1 migrations apply.
@@ -28,7 +28,7 @@ export const configDeciderModels = sqliteTable('config_decider_models', {
 export const benchmarkRuns = sqliteTable('benchmark_runs', {
   id: text('id').primaryKey(),
   kind: text('kind').$type<BenchmarkKind>().notNull(),
-  status: text('status').$type<'running' | 'completed' | 'failed'>().notNull(),
+  status: text('status').$type<BenchmarkRunStatus>().notNull(),
   started_at: text('started_at').notNull(),
   completed_at: text('completed_at'),
   error: text('error'),
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index bc83f23741..ce95bd26b7 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -2,6 +2,7 @@ import type {
   BenchmarkKind,
   BenchmarkModelSummary,
   BenchmarkRun,
+  ClassifierApiKind,
   ClassifierWinner,
   RankedCandidate,
   RoutingTable,
@@ -28,15 +29,14 @@ export type RunRow = typeof benchmarkRuns.$inferSelect;
 export type RunModelRow = typeof runModels.$inferSelect;
 export type ConfigDeciderModelRow = typeof configDeciderModels.$inferSelect;
 type ModelSummaryRow = typeof modelSummaries.$inferSelect;
-type ApiKind = 'chat_completions' | 'messages' | 'responses';
 
 // ---------------------------------------------------------------------------
 // ApiKind flag helpers
 // ---------------------------------------------------------------------------
 
-const ALL_API_KINDS: ApiKind[] = ['chat_completions', 'messages', 'responses'];
+const ALL_API_KINDS: ClassifierApiKind[] = ['chat_completions', 'messages', 'responses'];
 
-export function apiKindsToFlags(kinds: ApiKind[]): {
+export function apiKindsToFlags(kinds: ClassifierApiKind[]): {
   supports_chat_completions: boolean;
   supports_messages: boolean;
   supports_responses: boolean;
@@ -52,7 +52,7 @@ export function flagsToApiKinds(flags: {
   supports_chat_completions: boolean;
   supports_messages: boolean;
   supports_responses: boolean;
-}): ApiKind[] {
+}): ClassifierApiKind[] {
   return ALL_API_KINDS.filter(k => {
     if (k === 'chat_completions') return flags.supports_chat_completions;
     if (k === 'messages') return flags.supports_messages;
diff --git a/services/auto-routing-benchmark/src/openrouter.ts b/services/auto-routing-benchmark/src/openrouter.ts
index 4d8608d6f5..8d48367720 100644
--- a/services/auto-routing-benchmark/src/openrouter.ts
+++ b/services/auto-routing-benchmark/src/openrouter.ts
@@ -1,5 +1,5 @@
 import { OpenRouter } from '@openrouter/sdk';
-import { ttlCached } from './ttl-cache';
+import { ttlCached } from '@kilocode/worker-utils';
 
 type OpenRouterEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
 
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index e4840a3e5e..17e0ad33e9 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -1,11 +1,10 @@
-import { formatError } from '@kilocode/worker-utils';
+import { formatError, ttlCached } from '@kilocode/worker-utils';
 import {
   CLASSIFIER_WINNER_KV_KEY,
   ClassifierWinnerSchema,
   type ClassifierWinner,
 } from '@kilocode/auto-routing-contracts';
 import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
-import { ttlCached } from './ttl-cache';
 import { kvReadThrough } from './kv-read-through';
 import { fetchClassifierWinnerFromOrigin } from './benchmark-origin';
 
diff --git a/services/auto-routing/src/openrouter.ts b/services/auto-routing/src/openrouter.ts
index 4d8608d6f5..8d48367720 100644
--- a/services/auto-routing/src/openrouter.ts
+++ b/services/auto-routing/src/openrouter.ts
@@ -1,5 +1,5 @@
 import { OpenRouter } from '@openrouter/sdk';
-import { ttlCached } from './ttl-cache';
+import { ttlCached } from '@kilocode/worker-utils';
 
 type OpenRouterEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
 
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
index eeaebb1973..20cc3ea141 100644
--- a/services/auto-routing/src/routing-table.ts
+++ b/services/auto-routing/src/routing-table.ts
@@ -1,10 +1,9 @@
-import { formatError } from '@kilocode/worker-utils';
+import { formatError, ttlCached } from '@kilocode/worker-utils';
 import {
   ROUTING_TABLE_KV_KEY,
   RoutingTableSchema,
   type RoutingTable,
 } from '@kilocode/auto-routing-contracts';
-import { ttlCached } from './ttl-cache';
 import { kvReadThrough } from './kv-read-through';
 import { fetchRoutingTableFromOrigin } from './benchmark-origin';
 
diff --git a/services/auto-routing/src/ttl-cache.ts b/services/auto-routing/src/ttl-cache.ts
deleted file mode 100644
index f773b9c4fc..0000000000
--- a/services/auto-routing/src/ttl-cache.ts
+++ /dev/null
@@ -1,35 +0,0 @@
-// Isolate-local TTL memoization for per-request lookups that change rarely
-// (KV config, secrets-backed clients). Values are cached as promises so
-// concurrent callers share one load; rejected loads are evicted immediately
-// so a transient failure is not pinned for the TTL.
-export type TtlCache<TEnv, T> = {
-  get(env: TEnv): Promise<T>;
-  clear(): void;
-};
-
-export function ttlCached<TEnv, T>(
-  ttlMs: number,
-  load: (env: TEnv) => Promise<T>
-): TtlCache<TEnv, T> {
-  let cached: { promise: Promise<T>; expiresAt: number } | null = null;
-
-  return {
-    get(env: TEnv): Promise<T> {
-      if (cached && cached.expiresAt > Date.now()) {
-        return cached.promise;
-      }
-      const promise = load(env);
-      const entry = { promise, expiresAt: Date.now() + ttlMs };
-      cached = entry;
-      promise.catch(() => {
-        if (cached === entry) {
-          cached = null;
-        }
-      });
-      return promise;
-    },
-    clear(): void {
-      cached = null;
-    },
-  };
-}

From a449c26eba2b9629286f08ab1bc03f60bc802c6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 14:27:58 +0200
Subject: [PATCH 41/73] docs(gateway): drop stale keep-in-sync comment on
 DecideBaseParams

---
 apps/web/src/lib/ai-gateway/auto-routing-mirror.ts | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
index 1c730446dd..210b78c8d9 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
@@ -5,8 +5,7 @@ import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.serve
 import { warnExceptInTest } from '@/lib/utils.server';
 
 // Shared base params for both the mirror (fire-and-forget) and the
-// efficient-decision (blocking) call sites. Keep in sync with
-// EfficientDecisionParams in auto-routing-decision.ts.
+// efficient-decision (blocking) call sites.
 export type DecideBaseParams = {
   apiKind: ClassifierApiKind;
   body: unknown;

From 4caa4f80431748b91b699285fc03bbab1698ac80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 14:42:27 +0200
Subject: [PATCH 42/73] feat(gateway): bill classifier cost to the user for
 kilo-auto/efficient

---
 .../api/openrouter/[...path]/route.test.ts    | 148 ++++++++++++++++++
 .../src/app/api/openrouter/[...path]/route.ts |  81 +++++++++-
 .../ai-gateway/auto-routing-decision.test.ts  |   6 +-
 .../lib/ai-gateway/auto-routing-decision.ts   |   4 +-
 4 files changed, 230 insertions(+), 9 deletions(-)

diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index 2fec32f537..47a23461bb 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -10,6 +10,9 @@ import { emitApiMetricsForResponse } from '@/lib/ai-gateway/o11y/api-metrics.ser
 import { accountForMicrodollarUsage } from '@/lib/ai-gateway/llm-proxy-helpers';
 import { redisClient } from '@/lib/redis';
 import type { Provider } from '@/lib/ai-gateway/providers/types';
+import { fetchEfficientAutoDecision } from '@/lib/ai-gateway/auto-routing-decision';
+import { logMicrodollarUsage } from '@/lib/ai-gateway/processUsage';
+import { applyResolvedAutoModel } from '@/lib/ai-gateway/auto-model/resolution';
 
 jest.mock('next/server', () => {
   return {
@@ -58,6 +61,21 @@ jest.mock('@/lib/ai-gateway/llm-proxy-helpers', () => {
     captureProxyError: jest.fn(),
   };
 });
+jest.mock('@/lib/ai-gateway/auto-routing-decision');
+jest.mock('@/lib/ai-gateway/processUsage', () => {
+  const actual = jest.requireActual('@/lib/ai-gateway/processUsage');
+  return {
+    ...(actual as Record<string, unknown>),
+    logMicrodollarUsage: jest.fn(),
+  };
+});
+jest.mock('@/lib/ai-gateway/auto-model/resolution', () => {
+  const actual = jest.requireActual('@/lib/ai-gateway/auto-model/resolution');
+  return {
+    ...(actual as Record<string, unknown>),
+    applyResolvedAutoModel: jest.fn(),
+  };
+});
 
 const mockedGetUserFromAuth = jest.mocked(getUserFromAuth);
 const mockedGetBalanceAndOrgSettings = jest.mocked(getBalanceAndOrgSettings);
@@ -69,6 +87,9 @@ const mockedEmitApiMetricsForResponse = jest.mocked(emitApiMetricsForResponse);
 const mockedAccountForMicrodollarUsage = jest.mocked(accountForMicrodollarUsage);
 const mockedRedisGet = jest.mocked(redisClient.get);
 const mockedRedisSet = jest.mocked(redisClient.set);
+const mockedFetchEfficientAutoDecision = jest.mocked(fetchEfficientAutoDecision);
+const mockedLogMicrodollarUsage = jest.mocked(logMicrodollarUsage);
+const mockedApplyResolvedAutoModel = jest.mocked(applyResolvedAutoModel);
 
 const provider = {
   id: 'openrouter',
@@ -388,3 +409,130 @@ describe('POST /api/openrouter/v1/chat/completions rules-engine actions', () =>
     expect(mockedUpstreamRequest.mock.calls[0]?.[0].body.model).toBe('openai/gpt-4o');
   });
 });
+
+describe('kilo-auto/efficient classifier billing', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    setUserAuth();
+    mockedGetProvider.mockResolvedValue({
+      kind: 'provider',
+      provider,
+      userByok: null,
+      bypassAccessCheck: false,
+    });
+    mockedClassifyAbuse.mockResolvedValue(classifyResult(null));
+    mockedRedisGet.mockResolvedValue(null);
+    mockedRedisSet.mockResolvedValue('OK');
+    mockedGetOpenRouterModels.mockResolvedValue(new Set());
+    mockedUpstreamRequest.mockResolvedValue(
+      upstreamJsonResponse({ id: 'chatcmpl-1', model: 'anthropic/claude-haiku-4', choices: [] })
+    );
+    mockedEmitApiMetricsForResponse.mockReturnValue(undefined);
+    mockedAccountForMicrodollarUsage.mockReturnValue(undefined);
+    mockedLogMicrodollarUsage.mockResolvedValue(null);
+    // Mock applyResolvedAutoModel to resolve the virtual model and invoke the efficientDecision thunk
+    mockedApplyResolvedAutoModel.mockImplementation(async (opts, request) => {
+      if (opts.efficientDecision) await opts.efficientDecision();
+      request.body.model = 'anthropic/claude-haiku-4';
+      return { kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } };
+    });
+    // after() accepts a Promise or a function; the billing path passes a Promise
+    const { after: mockedAfter } = jest.requireMock<{ after: jest.Mock }>('next/server');
+    mockedAfter.mockImplementation((_arg: unknown) => {
+      // no-op: the promise has already been started when passed to after()
+    });
+  });
+
+  it('bills classifier cost when cost > 0 and user is non-BYOK', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'v1',
+      },
+      costUsd: 0.002,
+    });
+
+    const { POST } = await import('./route');
+    const response = await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    expect(response.status).toBe(200);
+    // Flush microtasks so the billing promise handed to after() can settle
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats, ctx] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.cost_mUsd).toBe(2000); // toMicrodollars(0.002)
+    expect(stats.model).toBe('auto-routing/classifier');
+    expect(stats.inputTokens).toBe(0);
+    expect(stats.outputTokens).toBe(0);
+    expect(ctx.requested_model).toBe('kilo-auto/efficient');
+    expect(ctx.user_byok).toBe(false);
+  });
+
+  it('does not bill when classifier cost is 0 (cache hit)', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'cache',
+        tableVersion: 'v1',
+      },
+      costUsd: 0,
+    });
+
+    const { POST } = await import('./route');
+    await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).not.toHaveBeenCalled();
+  });
+
+  it('does not bill when user is BYOK', async () => {
+    mockedGetProvider.mockResolvedValue({
+      kind: 'provider',
+      provider,
+      userByok: [{ decryptedAPIKey: 'byok-key', providerId: 'openai' }],
+      bypassAccessCheck: false,
+    });
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'v1',
+      },
+      costUsd: 0.002,
+    });
+
+    const { POST } = await import('./route');
+    await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).not.toHaveBeenCalled();
+  });
+
+  it('bills classifier cost even when decision is null but cost > 0', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: null,
+      costUsd: 0.001,
+    });
+
+    const { POST } = await import('./route');
+    const response = await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    expect(response.status).toBe(200);
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.cost_mUsd).toBe(1000); // toMicrodollars(0.001)
+  });
+});
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.ts b/apps/web/src/app/api/openrouter/[...path]/route.ts
index f4d456730f..eee667699b 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.ts
@@ -1,6 +1,6 @@
-import { NextResponse, type NextResponse as NextResponseType } from 'next/server';
+import { after, NextResponse, type NextResponse as NextResponseType } from 'next/server';
 import { type NextRequest } from 'next/server';
-import { stripRequiredPrefix } from '@/lib/utils';
+import { stripRequiredPrefix, toMicrodollars } from '@/lib/utils';
 import { extractPromptInfo } from '@/lib/ai-gateway/extractPromptInfo';
 import { determineFallbackFeature } from '@/lib/ai-gateway/determineFallbackFeature';
 import {
@@ -96,7 +96,11 @@ import {
 } from '@/lib/ai-gateway/auto-model';
 import { applyResolvedAutoModel } from '@/lib/ai-gateway/auto-model/resolution';
 import { fetchEfficientAutoDecision } from '@/lib/ai-gateway/auto-routing-decision';
-import type { MicrodollarUsageContext } from '@/lib/ai-gateway/processUsage.types';
+import type {
+  MicrodollarUsageContext,
+  MicrodollarUsageStats,
+} from '@/lib/ai-gateway/processUsage.types';
+import { logMicrodollarUsage } from '@/lib/ai-gateway/processUsage';
 import {
   getMaxTokens,
   hasMiddleOutTransform,
@@ -266,13 +270,14 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
   }
 
   let autoModel: string | null = null;
+  let classifierCostUsd = 0;
   if (isKiloAutoModel(requestedModelLowerCased)) {
     autoModel = requestedModelLowerCased;
     const efficientDecision =
       requestedModelLowerCased === KILO_AUTO_EFFICIENT_MODEL.id
         ? async () => {
             const user = (await authPromise).user;
-            return fetchEfficientAutoDecision({
+            const result = await fetchEfficientAutoDecision({
               apiKind: requestBodyParsed.kind,
               body: requestBodyParsed.body,
               requestedModel,
@@ -285,6 +290,8 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
               mode: modeHeader,
               userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
             });
+            classifierCostUsd = result?.costUsd ?? 0;
+            return result?.decision ?? null;
           }
         : undefined;
     const autoResult = await applyResolvedAutoModel(
@@ -866,6 +873,72 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
 
   accountForMicrodollarUsage(clonedReponse, usageContext, openrouterRequestSpan);
 
+  if (classifierCostUsd > 0 && !usageContext.user_byok) {
+    after(
+      (async () => {
+        try {
+          if (await isFreeModel(KILO_AUTO_EFFICIENT_MODEL.id)) return;
+          const classifierStats: MicrodollarUsageStats = {
+            messageId: null,
+            model: 'auto-routing/classifier',
+            responseContent: '',
+            hasError: false,
+            inference_provider: null,
+            upstream_id: null,
+            finish_reason: null,
+            latency: null,
+            moderation_latency: null,
+            generation_time: null,
+            streamed: false,
+            cancelled: false,
+            status_code: 200,
+            cost_mUsd: toMicrodollars(classifierCostUsd),
+            inputTokens: 0,
+            outputTokens: 0,
+            cacheWriteTokens: 0,
+            cacheHitTokens: 0,
+            is_byok: false,
+          };
+          const classifierContext: MicrodollarUsageContext = {
+            api_kind: usageContext.api_kind,
+            kiloUserId: usageContext.kiloUserId,
+            fraudHeaders: usageContext.fraudHeaders,
+            organizationId: usageContext.organizationId,
+            provider: 'openrouter',
+            requested_model: KILO_AUTO_EFFICIENT_MODEL.id,
+            promptInfo: {
+              system_prompt_prefix: '',
+              system_prompt_length: 0,
+              user_prompt_prefix: '',
+            },
+            max_tokens: null,
+            has_middle_out_transform: null,
+            isStreaming: false,
+            prior_microdollar_usage: usageContext.prior_microdollar_usage,
+            posthog_distinct_id: usageContext.posthog_distinct_id,
+            project_id: usageContext.project_id,
+            status_code: 200,
+            editor_name: usageContext.editor_name,
+            machine_id: usageContext.machine_id,
+            user_byok: false,
+            has_tools: false,
+            botId: usageContext.botId,
+            tokenSource: usageContext.tokenSource,
+            feature: usageContext.feature,
+            session_id: usageContext.session_id,
+            mode: usageContext.mode,
+            auto_model: autoModel,
+            ttfb_ms: null,
+            clientRequestId,
+          };
+          await logMicrodollarUsage(classifierStats, classifierContext);
+        } catch (error) {
+          console.error('Failed to bill classifier cost for kilo-auto/efficient', error);
+        }
+      })()
+    );
+  }
+
   await handleRequestLogging({
     clonedResponse: response.clone(),
     user: maybeUser,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
index 433bc8517e..2c0afcae0f 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -80,7 +80,7 @@ describe('fetchEfficientAutoDecision', () => {
     const headers = init?.headers as Headers;
     expect(headers.get('authorization')).toBe('Bearer classifier-token');
     expect(headers.get('content-type')).toBe('application/json');
-    expect(result).toEqual(validDecision);
+    expect(result).toEqual({ decision: validDecision, costUsd: 0.001 });
   });
 
   it('returns null and calls onError on a non-OK response', async () => {
@@ -151,7 +151,7 @@ describe('fetchEfficientAutoDecision', () => {
     expect(result).toBeNull();
   });
 
-  it('returns null (not the decision object) when the worker returns a null decision', async () => {
+  it('returns decision: null with costUsd when the worker returns a null decision', async () => {
     mockedFetch.mockResolvedValueOnce(
       new Response(JSON.stringify({ cost: 0.001, decision: null, classifierResult: null }), {
         status: 200,
@@ -160,6 +160,6 @@ describe('fetchEfficientAutoDecision', () => {
 
     const result = await fetchEfficientAutoDecision(makeParams(), options);
 
-    expect(result).toBeNull();
+    expect(result).toEqual({ decision: null, costUsd: 0.001 });
   });
 });
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
index a7e35022c4..b8bef47c09 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
@@ -24,7 +24,7 @@ type FetchEfficientDecisionOptions = {
 export async function fetchEfficientAutoDecision(
   params: EfficientDecisionParams,
   options: FetchEfficientDecisionOptions = {}
-): Promise<AutoRoutingDecision | null> {
+): Promise<{ decision: AutoRoutingDecision | null; costUsd: number } | null> {
   const workerUrl = options.workerUrl ?? AUTO_ROUTING_WORKER_URL;
   const authToken = options.authToken ?? INTERNAL_API_SECRET;
   const onError = options.onError ?? warnExceptInTest;
@@ -52,7 +52,7 @@ export async function fetchEfficientAutoDecision(
       onError('Efficient auto decision response invalid', { error: 'invalid_response' });
       return null;
     }
-    return parsed.data.decision;
+    return { decision: parsed.data.decision, costUsd: parsed.data.cost };
   } catch (error) {
     onError('Efficient auto decision request failed', {
       error: error instanceof Error ? error.message : String(error),

From ec5dc3f1ea48dd7f59dcbc9314cb02c3d9aa3de5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 14:51:01 +0200
Subject: [PATCH 43/73] fix(gateway): fix type error and remove dead guard in
 classifier billing

---
 apps/web/src/app/api/openrouter/[...path]/route.test.ts | 2 +-
 apps/web/src/app/api/openrouter/[...path]/route.ts      | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index 47a23461bb..9d53a66f68 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -477,7 +477,7 @@ describe('kilo-auto/efficient classifier billing', () => {
       decision: {
         model: 'anthropic/claude-haiku-4',
         tier: 'low',
-        source: 'cache',
+        source: 'benchmark' as const,
         tableVersion: 'v1',
       },
       costUsd: 0,
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.ts b/apps/web/src/app/api/openrouter/[...path]/route.ts
index eee667699b..2d39608a67 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.ts
@@ -877,7 +877,6 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     after(
       (async () => {
         try {
-          if (await isFreeModel(KILO_AUTO_EFFICIENT_MODEL.id)) return;
           const classifierStats: MicrodollarUsageStats = {
             messageId: null,
             model: 'auto-routing/classifier',

From 0141b7199bd1e08613624426920b3777efa594d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 15:02:08 +0200
Subject: [PATCH 44/73] fix(auto-routing): apply decision reasoningEffort to
 efficient routing

---
 .../ai-gateway/auto-model/resolution.test.ts  | 34 +++++++++++++++++++
 .../lib/ai-gateway/auto-model/resolution.ts   | 12 ++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
index 813bfed596..37c6d87a95 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -50,6 +50,40 @@ describe('resolveAutoModel — kilo-auto/efficient branch', () => {
     expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
   });
 
+  it('applies the decision reasoningEffort as a reasoning config', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => ({ ...sampleDecision, reasoningEffort: 'minimal' }),
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({
+      kind: 'ok',
+      resolved: {
+        model: 'anthropic/claude-haiku-4',
+        reasoning: { enabled: true, effort: 'minimal' },
+      },
+    });
+  });
+
+  it('omits reasoning when the decision reasoningEffort is null', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => ({ ...sampleDecision, reasoningEffort: null }),
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
+  });
+
   it('falls back to BALANCED_RESPONSES_FALLBACK_MODEL when no thunk is provided and apiKind=responses', async () => {
     const result = await resolveAutoModel(
       { ...baseParams, apiKind: 'responses' },
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
index 05377f8796..bad2b8a819 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
@@ -125,7 +125,17 @@ export async function resolveAutoModel(
   if (model === KILO_AUTO_EFFICIENT_MODEL.id) {
     const decision = params.efficientDecision ? await params.efficientDecision() : null;
     if (decision) {
-      return { kind: 'ok', resolved: { model: decision.model } };
+      // Apply the candidate's pinned reasoning effort so the model runs under
+      // the same conditions the benchmark measured it at.
+      return {
+        kind: 'ok',
+        resolved: {
+          model: decision.model,
+          ...(decision.reasoningEffort
+            ? { reasoning: { enabled: true, effort: decision.reasoningEffort } }
+            : {}),
+        },
+      };
     }
     // Static fallback when the worker is slow/unavailable: same shape as
     // balanced so an efficient request never degrades below balanced.

From 6960e1a793e4e7d7403f4ce806231abe211e0ba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 15:20:24 +0200
Subject: [PATCH 45/73] feat(auto-routing): align kilo-auto/efficient catalog
 with balanced, hide from listing

---
 .../src/lib/ai-gateway/auto-model/index.ts    | 20 ++++++++++---------
 .../ai-gateway/providers/openrouter/index.ts  |  2 +-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-model/index.ts b/apps/web/src/lib/ai-gateway/auto-model/index.ts
index 581785702c..d0149c1502 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/index.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/index.ts
@@ -22,6 +22,9 @@ type AutoModel = {
   supports_images: boolean;
   supports_pdf: boolean;
   opencode_settings: OpenCodeSettings | undefined;
+  // Mirrors KiloExclusiveModel['status']. 'hidden' auto models are excluded
+  // from the /models listing but stay usable by anyone who knows the id.
+  status: 'public' | 'hidden';
 };
 
 export type ResolvedAutoModel = {
@@ -115,6 +118,7 @@ export const KILO_AUTO_FRONTIER_MODEL: AutoModel = {
     family: 'claude',
     prompt: 'anthropic',
   },
+  status: 'public',
 };
 
 export const KILO_AUTO_FREE_MODEL: AutoModel = {
@@ -132,6 +136,7 @@ export const KILO_AUTO_FREE_MODEL: AutoModel = {
   supports_images: false,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
 };
 
 export const KILO_AUTO_BALANCED_MODEL: AutoModel = {
@@ -147,6 +152,7 @@ export const KILO_AUTO_BALANCED_MODEL: AutoModel = {
   supports_images: true,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
 };
 
 export const KILO_AUTO_SMALL_MODEL: AutoModel = {
@@ -162,22 +168,18 @@ export const KILO_AUTO_SMALL_MODEL: AutoModel = {
   supports_images: true,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
 };
 
+// Same catalog properties as balanced (efficient is intended to eventually
+// replace it); hidden while the routing engine is validated on Kilo team traffic.
 export const KILO_AUTO_EFFICIENT_MODEL: AutoModel = {
+  ...KILO_AUTO_BALANCED_MODEL,
   id: 'kilo-auto/efficient',
   name: 'Auto Efficient',
   description:
     'Routes each request to the cheapest model that gets the job done, based on continuously benchmarked accuracy and cost.',
-  context_length: 262_144,
-  max_completion_tokens: 32_768,
-  prompt_price: '0.000000325',
-  completion_price: '0.00000195',
-  input_cache_read_price: undefined,
-  input_cache_write_price: undefined,
-  supports_images: false,
-  supports_pdf: false,
-  opencode_settings: undefined,
+  status: 'hidden',
 };
 
 export const AUTO_MODELS = [
diff --git a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
index d57b604274..197941a306 100644
--- a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
+++ b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
@@ -32,7 +32,7 @@ import { applyCustomPricingToModel } from '@/lib/ai-gateway/custom-pricing';
 export { normalizeModelId } from '@/lib/ai-gateway/model-utils';
 
 function buildAutoModels(): OpenRouterModel[] {
-  return AUTO_MODELS.map(m => {
+  return AUTO_MODELS.filter(m => m.status === 'public').map(m => {
     const input_modalities = ['text'];
     if (m.supports_images) {
       input_modalities.push('image');

From debdd037de1112dab6f773fc4867265b33203f0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 15:25:52 +0200
Subject: [PATCH 46/73] fix(admin): correct run-summaries colspan in benchmarks
 section

---
 apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index d79be0e52a..e24346850a 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -464,7 +464,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   if (sortedSummaries.length === 0) {
     return (
       <TableRow>
-        <TableCell colSpan={8} className="text-muted-foreground h-10 text-center text-xs">
+        <TableCell colSpan={6} className="text-muted-foreground h-10 text-center text-xs">
           No summaries
         </TableCell>
       </TableRow>
@@ -474,7 +474,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   return (
     <>
       <TableRow className="bg-muted/30">
-        <TableCell colSpan={8} className="px-4 py-2">
+        <TableCell colSpan={6} className="px-4 py-2">
           <Table>
             <TableHeader>
               <TableRow>

From a016310e5f2cb038a7b93b00ba23311af6e6fdc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:01:29 +0200
Subject: [PATCH 47/73] feat(admin): derive decider model API kinds from
 gateway provider definitions

---
 .../auto-routing/benchmark-config/route.ts    | 17 ++++-
 .../admin/auto-routing/BenchmarksSection.tsx  | 70 +++----------------
 .../lib/ai-gateway/model-api-kinds.test.ts    | 36 ++++++++++
 .../web/src/lib/ai-gateway/model-api-kinds.ts | 24 +++++++
 .../auto-routing-contracts/src/benchmark.ts   |  8 +++
 5 files changed, 92 insertions(+), 63 deletions(-)
 create mode 100644 apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
 create mode 100644 apps/web/src/lib/ai-gateway/model-api-kinds.ts

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
index d81cc4f69c..de5d38b38c 100644
--- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
@@ -1,10 +1,11 @@
-import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
+import { BenchmarkConfigUpdateSchema } from '@kilocode/auto-routing-contracts';
 import type { NextRequest } from 'next/server';
 import { NextResponse } from 'next/server';
 import {
   getBenchmarkConfig,
   updateBenchmarkConfig,
 } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { supportedApiKindsForModel } from '@/lib/ai-gateway/model-api-kinds';
 import { getUserFromAuth } from '@/lib/user/server';
 
 export async function GET() {
@@ -26,12 +27,22 @@ export async function PUT(request: NextRequest) {
     return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
   }
 
-  const parsed = BenchmarkConfigSchema.safeParse(rawBody);
+  const parsed = BenchmarkConfigUpdateSchema.safeParse(rawBody);
   if (!parsed.success) {
     return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
   }
 
+  // supportedApiKinds is server-derived from gateway provider definitions —
+  // the admin UI never sends it.
+  const config = {
+    ...parsed.data,
+    deciderModels: parsed.data.deciderModels.map(m => ({
+      ...m,
+      supportedApiKinds: supportedApiKindsForModel(m.id),
+    })),
+  };
+
   const email = user?.google_user_email ?? '';
-  const result = await updateBenchmarkConfig(parsed.data, email);
+  const result = await updateBenchmarkConfig(config, email);
   return NextResponse.json(result.body, { status: result.status });
 }
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index e24346850a..3c8b5286ae 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -8,9 +8,9 @@ import {
   type BenchmarkConfig,
   type BenchmarkKind,
   type BenchmarkRoutingTableResponse,
+  type BenchmarkConfigUpdate,
   type BenchmarkRun,
   type BenchmarkModelSummary,
-  type ClassifierApiKind,
   type ReasoningEffort,
 } from '@kilocode/auto-routing-contracts';
 import React, { useCallback, useEffect, useState } from 'react';
@@ -68,7 +68,7 @@ async function fetchBenchmarkConfig() {
   return parseAdminResponse(response, BenchmarkConfigResponseSchema);
 }
 
-async function saveBenchmarkConfig(config: BenchmarkConfig) {
+async function saveBenchmarkConfig(config: BenchmarkConfigUpdate) {
   const response = await fetch('/admin/api/auto-routing/benchmark-config', {
     method: 'PUT',
     headers: { 'content-type': 'application/json' },
@@ -103,11 +103,10 @@ async function fetchBenchmarkRoutingTable() {
 // Local form state type for decider model rows
 // ---------------------------------------------------------------------------
 
+// supportedApiKinds is intentionally absent: it is derived server-side from
+// gateway provider definitions when the config is saved.
 type DeciderModelRow = {
   id: string;
-  chat_completions: boolean;
-  responses: boolean;
-  messages: boolean;
   reasoningEffort: ReasoningEffort | null;
 };
 
@@ -122,9 +121,6 @@ function configToFormState(config: BenchmarkConfig): {
     classifierModels: config.classifierModels.join('\n'),
     deciderModels: config.deciderModels.map(m => ({
       id: m.id,
-      chat_completions: m.supportedApiKinds.includes('chat_completions'),
-      responses: m.supportedApiKinds.includes('responses'),
-      messages: m.supportedApiKinds.includes('messages'),
       reasoningEffort: m.reasoningEffort ?? null,
     })),
     minAccuracy: config.minAccuracy,
@@ -136,24 +132,17 @@ function configToFormState(config: BenchmarkConfig): {
 function formStateToConfig(
   state: ReturnType<typeof configToFormState>,
   base: BenchmarkConfig
-): BenchmarkConfig {
+): BenchmarkConfigUpdate {
   const classifierModels = state.classifierModels
     .split('\n')
     .map(s => s.trim())
     .filter(s => s.length > 0);
   const deciderModels = state.deciderModels
     .filter(row => row.id.trim().length > 0)
-    .map(row => {
-      const kinds: ClassifierApiKind[] = [];
-      if (row.chat_completions) kinds.push('chat_completions');
-      if (row.responses) kinds.push('responses');
-      if (row.messages) kinds.push('messages');
-      return {
-        id: row.id.trim(),
-        supportedApiKinds: kinds.length ? kinds : ['chat_completions' as const],
-        reasoningEffort: row.reasoningEffort ?? null,
-      };
-    });
+    .map(row => ({
+      id: row.id.trim(),
+      reasoningEffort: row.reasoningEffort ?? null,
+    }));
   const benchmarkUserId = state.benchmarkUserId.trim();
   return {
     classifierModels,
@@ -204,16 +193,7 @@ function BenchmarkConfigEditor({
   const handleAddDeciderRow = useCallback(() => {
     setForm(prev => ({
       ...prev,
-      deciderModels: [
-        ...prev.deciderModels,
-        {
-          id: '',
-          chat_completions: true,
-          responses: false,
-          messages: false,
-          reasoningEffort: null,
-        },
-      ],
+      deciderModels: [...prev.deciderModels, { id: '', reasoningEffort: null }],
     }));
   }, []);
 
@@ -264,9 +244,6 @@ function BenchmarkConfigEditor({
               <TableHeader>
                 <TableRow>
                   <TableHead>Model ID</TableHead>
-                  <TableHead className="w-32 text-center">chat_completions</TableHead>
-                  <TableHead className="w-24 text-center">responses</TableHead>
-                  <TableHead className="w-24 text-center">messages</TableHead>
                   <TableHead className="w-36">Reasoning effort</TableHead>
                   <TableHead className="w-12" />
                 </TableRow>
@@ -283,33 +260,6 @@ function BenchmarkConfigEditor({
                         aria-label={`Decider model ${index + 1} ID`}
                       />
                     </TableCell>
-                    <TableCell className="py-2 text-center">
-                      <Checkbox
-                        checked={row.chat_completions}
-                        onCheckedChange={checked =>
-                          handleDeciderRowChange(index, { chat_completions: checked === true })
-                        }
-                        aria-label={`Model ${index + 1} supports chat_completions`}
-                      />
-                    </TableCell>
-                    <TableCell className="py-2 text-center">
-                      <Checkbox
-                        checked={row.responses}
-                        onCheckedChange={checked =>
-                          handleDeciderRowChange(index, { responses: checked === true })
-                        }
-                        aria-label={`Model ${index + 1} supports responses`}
-                      />
-                    </TableCell>
-                    <TableCell className="py-2 text-center">
-                      <Checkbox
-                        checked={row.messages}
-                        onCheckedChange={checked =>
-                          handleDeciderRowChange(index, { messages: checked === true })
-                        }
-                        aria-label={`Model ${index + 1} supports messages`}
-                      />
-                    </TableCell>
                     <TableCell className="py-2">
                       <Select
                         value={row.reasoningEffort ?? 'none'}
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
new file mode 100644
index 0000000000..c75891f257
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from '@jest/globals';
+import { supportedApiKindsForModel } from './model-api-kinds';
+import { morph_warp_grep_free_model } from '@/lib/ai-gateway/providers/morph';
+import { seed_20_code_free_model } from '@/lib/ai-gateway/providers/seed';
+
+describe('supportedApiKindsForModel', () => {
+  it('returns all OpenRouter chat APIs for a plain OpenRouter model', () => {
+    expect(supportedApiKindsForModel('openai/gpt-5-mini')).toEqual([
+      'chat_completions',
+      'messages',
+      'responses',
+    ]);
+  });
+
+  it('uses the declared gateway for Kilo-exclusive models', () => {
+    expect(supportedApiKindsForModel(morph_warp_grep_free_model.public_id)).toEqual([
+      'chat_completions',
+    ]);
+  });
+
+  it('treats disabled Kilo-exclusive models like plain OpenRouter models, matching get-provider', () => {
+    expect(supportedApiKindsForModel(seed_20_code_free_model.public_id)).toEqual([
+      'chat_completions',
+      'messages',
+      'responses',
+    ]);
+  });
+
+  it('falls back to OpenRouter for unknown model ids', () => {
+    expect(supportedApiKindsForModel('made-up/model')).toEqual([
+      'chat_completions',
+      'messages',
+      'responses',
+    ]);
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
new file mode 100644
index 0000000000..63895cc279
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
@@ -0,0 +1,24 @@
+import { ClassifierApiKindSchema, type ClassifierApiKind } from '@kilocode/auto-routing-contracts';
+import { findKiloExclusiveModel } from '@/lib/ai-gateway/models';
+import PROVIDERS from '@/lib/ai-gateway/providers/provider-definitions';
+
+/**
+ * Which gateway API kinds a model can serve, derived from the provider the
+ * gateway would route it to. Mirrors get-provider.ts's static fallback
+ * resolution — a Kilo-exclusive model is served by its declared gateway,
+ * everything else (including unknown ids) by OpenRouter. The dynamic paths
+ * (BYOK, custom LLMs, experiments, Vercel re-routing) never apply to auto-routing
+ * benchmark candidates, the only models this is called for. Never returns [].
+ */
+export function supportedApiKindsForModel(modelId: string): ClassifierApiKind[] {
+  const exclusive = findKiloExclusiveModel(modelId);
+  const provider =
+    Object.values(PROVIDERS).find(p => p.id === exclusive?.gateway) ?? PROVIDERS.OPENROUTER;
+  const kinds = provider.supportedChatApis.filter((kind): kind is ClassifierApiKind =>
+    (ClassifierApiKindSchema.options as readonly string[]).includes(kind)
+  );
+  // A provider with no chat APIs (e.g. Mistral) can't serve gateway chat
+  // traffic at all; such models are not meaningful decider candidates, but
+  // the contract requires a non-empty list.
+  return kinds.length > 0 ? kinds : ['chat_completions'];
+}
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 2ed7d2d106..344b24cd86 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -35,6 +35,14 @@ export const BenchmarkConfigSchema = z.object({
 });
 export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
 
+// Admin-save payload: deciderModels carry no supportedApiKinds — the web
+// layer derives them from gateway provider definitions before forwarding the
+// full BenchmarkConfig to the benchmark worker.
+export const BenchmarkConfigUpdateSchema = BenchmarkConfigSchema.extend({
+  deciderModels: z.array(BenchmarkDeciderModelSchema.omit({ supportedApiKinds: true })).min(1),
+});
+export type BenchmarkConfigUpdate = z.infer<typeof BenchmarkConfigUpdateSchema>;
+
 export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']);
 export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
 

From fc427e5b00ff349cdd69f66afcbb9d4e8260a288 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:12:27 +0200
Subject: [PATCH 48/73] feat(auto-routing): drop default routing table; no
 table means no decision

---
 packages/auto-routing-contracts/src/index.ts  |  7 +-
 .../src/routing-table.ts                      |  4 +-
 .../auto-routing/src/decision-engine.test.ts  |  3 +
 services/auto-routing/src/decision-engine.ts  |  3 +-
 services/auto-routing/src/index.test.ts       | 71 ++++++++++++++++--
 .../auto-routing/src/routing-table.test.ts    | 74 +++++++++++++++----
 services/auto-routing/src/routing-table.ts    | 55 ++------------
 7 files changed, 141 insertions(+), 76 deletions(-)

diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index ce0d57021b..6c7ecbd792 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -100,7 +100,7 @@ export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;
 export const AutoRoutingDecisionSchema = z.object({
   model: z.string(),
   tier: DifficultyTierSchema,
-  source: z.enum(['benchmark', 'default']),
+  source: z.enum(['benchmark']),
   tableVersion: z.string(),
   // Mirrors the effort the chosen model was benchmarked with, when set.
   reasoningEffort: ReasoningEffortSchema.nullable().optional(),
@@ -109,8 +109,9 @@ export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
 
 export const AutoRoutingDecisionResponseSchema = z.object({
   cost: z.number(),
-  // Null when classification failed or no table candidate supports the
-  // request's API kind; the gateway then falls back to its static default.
+  // Null when classification failed, no routing table is published, or no
+  // table candidate supports the request's API kind; the gateway then falls
+  // back to its static balanced defaults.
   decision: AutoRoutingDecisionSchema.nullable(),
   classifierResult: z
     .object({
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index b48d8291e6..b4c1696203 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -18,11 +18,11 @@ export const RankedCandidateSchema = z.object({
 export type RankedCandidate = z.infer<typeof RankedCandidateSchema>;
 
 export const RoutingTableSchema = z.object({
-  // Benchmark run id (or 'default' for the built-in table).
+  // Benchmark run id.
   version: z.string().min(1),
   generatedAt: z.string().min(1),
   minAccuracy: z.number().min(0).max(1),
-  source: z.enum(['benchmark', 'default']),
+  source: z.enum(['benchmark']),
   tiers: z.object({
     low: z.array(RankedCandidateSchema).min(1),
     medium: z.array(RankedCandidateSchema).min(1),
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index 41c9cbef7e..59211a7329 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -79,4 +79,7 @@ describe('computeDecision', () => {
   it('returns null when no candidate supports the api kind', () => {
     expect(computeDecision(classification, 'responses', table)).toBeNull();
   });
+  it('returns null when there is no routing table', () => {
+    expect(computeDecision(classification, 'chat_completions', null)).toBeNull();
+  });
 });
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index d41d0961ea..bc9c1d8011 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -9,8 +9,9 @@ import {
 export function computeDecision(
   classification: ClassifierOutput,
   apiKind: NormalizedClassifierInput['apiKind'],
-  table: RoutingTable
+  table: RoutingTable | null
 ): AutoRoutingDecision | null {
+  if (!table) return null;
   const tier = deriveDifficultyTier(classification);
   const candidate = table.tiers[tier].find(c => c.supportedApiKinds.includes(apiKind));
   if (!candidate) return null;
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 1e7e553dc2..445fbf4f54 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -81,6 +81,45 @@ const normalizedInput = {
   },
 };
 
+const benchmarkRoutingTable = {
+  version: 'bench-run-1',
+  generatedAt: '2026-06-12T00:00:00.000Z',
+  minAccuracy: 0.7,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      {
+        model: 'google/gemini-2.5-flash-lite',
+        accuracy: 0.9,
+        avgCostUsd: 0.001,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+        reasoningEffort: null,
+      },
+    ],
+    medium: [
+      {
+        model: 'google/gemini-2.5-flash',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+        reasoningEffort: null,
+      },
+    ],
+    high: [
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+        reasoningEffort: null,
+      },
+    ],
+  },
+};
+
 function mirrorPayload(overrides: Record<string, unknown> = {}) {
   return {
     input: normalizedInput,
@@ -135,6 +174,7 @@ describe('auto routing worker', () => {
     configDelete.mockReset();
     configDelete.mockResolvedValue(undefined);
     configPut.mockReset();
+    configPut.mockResolvedValue(undefined);
     benchmarkFetch.mockReset();
     benchmarkFetch.mockImplementation(async (url: string) => {
       if (String(url).includes('/admin/classifier-winner')) {
@@ -143,7 +183,10 @@ describe('auto routing worker', () => {
       return {
         ok: true,
         status: 200,
-        json: async () => ({ table: null, publishedAt: null }),
+        json: async () => ({
+          table: benchmarkRoutingTable,
+          publishedAt: benchmarkRoutingTable.generatedAt,
+        }),
       };
     });
     analyticsTokenGet.mockReset();
@@ -185,8 +228,8 @@ describe('auto routing worker', () => {
       decision: {
         model: expect.any(String),
         tier: expect.stringMatching(/^(low|medium|high)$/),
-        source: 'default',
-        tableVersion: 'default',
+        source: 'benchmark',
+        tableVersion: 'bench-run-1',
         reasoningEffort: null,
       },
       classifierResult: {
@@ -248,8 +291,8 @@ describe('auto routing worker', () => {
       decision: {
         model: expect.any(String),
         tier: expect.stringMatching(/^(low|medium|high)$/),
-        source: 'default',
-        tableVersion: 'default',
+        source: 'benchmark',
+        tableVersion: 'bench-run-1',
         reasoningEffort: null,
       },
       classifierResult: { classification: mockClassification },
@@ -364,6 +407,24 @@ describe('auto routing worker', () => {
     });
   });
 
+  it('makes no decision when no routing table is published', async () => {
+    benchmarkFetch.mockImplementation(async (url: string) => {
+      if (String(url).includes('/admin/classifier-winner')) {
+        return { ok: true, status: 200, json: async () => ({ winner: null }) };
+      }
+      return { ok: true, status: 200, json: async () => ({ table: null, publishedAt: null }) };
+    });
+
+    const response = await decideRequest(mirrorPayload());
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toMatchObject({
+      cost: 0.00000123,
+      decision: null,
+      classifierResult: { classification: mockClassification },
+    });
+  });
+
   it('returns a null classifier result when the classifier request fails', async () => {
     const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
     classifyNormalizedInput.mockRejectedValueOnce(
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index 54f77d2715..cbed758688 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -1,5 +1,45 @@
 import { afterEach, describe, expect, it, vi } from 'vitest';
-import { clearRoutingTableCache, DEFAULT_ROUTING_TABLE, getRoutingTable } from './routing-table';
+import type { RoutingTable } from '@kilocode/auto-routing-contracts';
+import { clearRoutingTableCache, getRoutingTable } from './routing-table';
+
+const SAMPLE_TABLE: RoutingTable = {
+  version: 'bench-run-1',
+  generatedAt: '2026-06-12T00:00:00.000Z',
+  minAccuracy: 0.7,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      {
+        model: 'google/gemini-2.5-flash-lite',
+        accuracy: 0.9,
+        avgCostUsd: 0.001,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+        reasoningEffort: null,
+      },
+    ],
+    medium: [
+      {
+        model: 'google/gemini-2.5-flash',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+        reasoningEffort: null,
+      },
+    ],
+    high: [
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+        reasoningEffort: null,
+      },
+    ],
+  },
+};
 
 type KvStub = Pick<Env, 'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
 
@@ -45,14 +85,16 @@ function makeEnv(
 afterEach(() => clearRoutingTableCache());
 
 describe('getRoutingTable', () => {
-  it('returns the default table when the key is missing and origin has no table', async () => {
-    expect(await getRoutingTable(makeEnv(null))).toEqual(DEFAULT_ROUTING_TABLE);
+  it('returns null when the key is missing and origin has no table', async () => {
+    expect(await getRoutingTable(makeEnv(null))).toBeNull();
   });
 
-  it('returns the default table when the stored JSON is invalid and origin has no table', async () => {
-    expect(await getRoutingTable(makeEnv('{"nope":true}'))).toEqual(DEFAULT_ROUTING_TABLE);
+  it('returns null when the stored JSON is invalid and origin has no table', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    expect(await getRoutingTable(makeEnv('{"nope":true}'))).toBeNull();
     clearRoutingTableCache();
-    expect(await getRoutingTable(makeEnv('not json at all'))).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(await getRoutingTable(makeEnv('not json at all'))).toBeNull();
+    warn.mockRestore();
   });
 
   it('parses and caches a valid stored table without calling origin', async () => {
@@ -66,7 +108,7 @@ describe('getRoutingTable', () => {
       AUTO_ROUTING_CONFIG: {
         get: async () => {
           reads++;
-          return JSON.stringify(DEFAULT_ROUTING_TABLE);
+          return JSON.stringify(SAMPLE_TABLE);
         },
         put: async () => {},
       },
@@ -76,7 +118,7 @@ describe('getRoutingTable', () => {
 
     const first = await getRoutingTable(env);
     await getRoutingTable(env);
-    expect(first.version).toBe(DEFAULT_ROUTING_TABLE.version);
+    expect(first?.version).toBe(SAMPLE_TABLE.version);
     expect(reads).toBe(1);
     expect(fetchSpy).not.toHaveBeenCalled();
   });
@@ -84,33 +126,33 @@ describe('getRoutingTable', () => {
   it('fetches from origin on KV miss, writes to KV with expirationTtl, and returns the table', async () => {
     const puts: Array<{ key: string; value: string; options: unknown }> = [];
     const env = makeEnv(null, {
-      originTable: DEFAULT_ROUTING_TABLE,
+      originTable: SAMPLE_TABLE,
       onPut: (key, value, options) => puts.push({ key, value, options }),
     });
 
     const result = await getRoutingTable(env);
-    expect(result).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(result).toEqual(SAMPLE_TABLE);
     expect(puts).toHaveLength(1);
     expect(puts[0].key).toBe('routing_table_v1');
     expect(puts[0].options).toEqual({ expirationTtl: 3600 });
   });
 
-  it('returns the default table when origin responds non-OK', async () => {
+  it('returns null when origin responds non-OK', async () => {
     const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
     const env = makeEnv(null, { originStatus: 500 });
-    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(await getRoutingTable(env)).toBeNull();
     warn.mockRestore();
   });
 
-  it('returns the default table when origin throws', async () => {
+  it('returns null when origin throws', async () => {
     const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
     const env = makeEnv(null, { originThrow: true });
-    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(await getRoutingTable(env)).toBeNull();
     warn.mockRestore();
   });
 
-  it('returns the default table when origin returns null table', async () => {
+  it('returns null when origin returns a null table', async () => {
     const env = makeEnv(null, { originTable: undefined });
-    expect(await getRoutingTable(env)).toEqual(DEFAULT_ROUTING_TABLE);
+    expect(await getRoutingTable(env)).toBeNull();
   });
 });
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
index 20cc3ea141..5c9ad85b0d 100644
--- a/services/auto-routing/src/routing-table.ts
+++ b/services/auto-routing/src/routing-table.ts
@@ -7,52 +7,6 @@ import {
 import { kvReadThrough } from './kv-read-through';
 import { fetchRoutingTableFromOrigin } from './benchmark-origin';
 
-// Safety net used until the first decider benchmark publishes a table (and
-// whenever the stored table is missing or unparseable). Mirrors the static
-// defaults the gateway uses for kilo-auto/balanced today.
-export const DEFAULT_ROUTING_TABLE: RoutingTable = {
-  version: 'default',
-  generatedAt: '2026-06-11T00:00:00.000Z',
-  minAccuracy: 0.7,
-  source: 'default',
-  tiers: {
-    low: [
-      {
-        model: 'google/gemini-2.5-flash',
-        accuracy: 1,
-        avgCostUsd: 0,
-        meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
-      },
-    ],
-    medium: [
-      {
-        model: 'qwen/qwen3.7-plus',
-        accuracy: 1,
-        avgCostUsd: 0,
-        meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
-      },
-      {
-        model: 'anthropic/claude-sonnet-4.6',
-        accuracy: 1,
-        avgCostUsd: 0,
-        meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
-      },
-    ],
-    high: [
-      {
-        model: 'anthropic/claude-sonnet-4.6',
-        accuracy: 1,
-        avgCostUsd: 0,
-        meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
-      },
-    ],
-  },
-};
-
 const ROUTING_TABLE_CACHE_TTL_MS = 60_000;
 
 type RoutingTableEnv = Pick<
@@ -87,18 +41,21 @@ const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: Rout
       }
     },
   });
-  return table ?? DEFAULT_ROUTING_TABLE;
+  return table;
 });
 
 export function clearRoutingTableCache(): void {
   routingTableCache.clear();
 }
 
-export function getRoutingTable(env: RoutingTableEnv): Promise<RoutingTable> {
+// Null when no benchmark-published table exists (or it cannot be read):
+// /decide then makes no decision and the gateway falls back to its static
+// balanced defaults.
+export function getRoutingTable(env: RoutingTableEnv): Promise<RoutingTable | null> {
   return routingTableCache.get(env).catch((error: unknown) => {
     console.warn(
       JSON.stringify({ event: 'auto_routing_table_read_failed', ...formatError(error) })
     );
-    return DEFAULT_ROUTING_TABLE;
+    return null;
   });
 }

From 01e4bd9c725a3bd93cacf32b3600e94717bab729 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:26:29 +0200
Subject: [PATCH 49/73] fix(auto-routing): keep classifier override when
 benchmark origin is unavailable

---
 services/auto-routing/src/classifier-config.test.ts | 11 +++++++++++
 services/auto-routing/src/classifier-config.ts      | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/services/auto-routing/src/classifier-config.test.ts b/services/auto-routing/src/classifier-config.test.ts
index 3c9b660606..1ec3d438b4 100644
--- a/services/auto-routing/src/classifier-config.test.ts
+++ b/services/auto-routing/src/classifier-config.test.ts
@@ -158,6 +158,17 @@ describe('classifier config', () => {
     warn.mockRestore();
   });
 
+  it('keeps a healthy admin override when the winner origin fails', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { env } = makeEnv({ overrideModel: 'override/model', originThrow: true });
+    expect(await getClassifierModelInfo(env)).toEqual({
+      model: 'override/model',
+      override: 'override/model',
+      benchmarkWinner: null,
+    });
+    warn.mockRestore();
+  });
+
   it('override takes precedence over benchmark winner from origin', async () => {
     const { env } = makeEnv({
       overrideModel: 'openai/gpt-4o',
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index 17e0ad33e9..5e4de3c938 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -52,6 +52,17 @@ const classifierModelCache = ttlCached(CONFIG_CACHE_TTL_MS, async (env: Classifi
       ttlSeconds: 3600,
       fetchOrigin: () => fetchClassifierWinnerFromOrigin(env),
       parse: parseClassifierWinner,
+    }).catch((error: unknown) => {
+      // A benchmark-origin failure must not reject the whole load: that would
+      // discard a healthy admin override and fail closed to the default.
+      console.warn(
+        JSON.stringify({
+          event: 'auto_routing_config_read_failed',
+          key: CLASSIFIER_WINNER_KV_KEY,
+          ...formatError(error),
+        })
+      );
+      return null;
     }),
   ]);
   const override = configuredModel?.trim() || null;

From 0828e4782784391f99324d0bee8e6346c7ff3daf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:34:02 +0200
Subject: [PATCH 50/73] docs(contracts): fix stale classifier-winner comment

---
 packages/auto-routing-contracts/src/benchmark.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 344b24cd86..20d185c626 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -92,8 +92,9 @@ export const BenchmarkRoutingTableResponseSchema = z.object({
 });
 export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
 
-// Published to the auto-routing KV namespace when a classifier benchmark run
-// completes: the cheapest candidate meeting the accuracy threshold.
+// The cheapest classifier candidate meeting the accuracy threshold, derived
+// on read from the latest completed classifier run (served via
+// /admin/classifier-winner and cached in the auto-routing KV namespace).
 export const ClassifierWinnerSchema = z.object({
   model: z.string().trim().min(1),
   runId: z.string(),

From 71222caa3cc68622ed2b4df4786a95b44778def3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:39:50 +0200
Subject: [PATCH 51/73] fix(benchmark): exclude no-cost-signal summaries from
 routing table ranking

---
 services/auto-routing-benchmark/src/db-schema.ts |  3 ++-
 .../src/routing-table-builder.test.ts            | 16 ++++++++++++++++
 .../src/routing-table-builder.ts                 | 11 +++++++----
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 90b66f62c3..8c2f7184ff 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -114,7 +114,8 @@ export const routingTableCandidates = sqliteTable(
     model: text('model').notNull(),
     accuracy: real('accuracy').notNull(),
     // Non-null unlike model_summaries: RankedCandidate.avgCostUsd is a plain
-    // nonnegative number (buildRoutingTable maps unknown costs to 0).
+    // nonnegative number (buildRoutingTable excludes summaries without a
+    // cost signal, so every published candidate has one).
     avg_cost_usd: real('avg_cost_usd').notNull(),
     meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
     reasoning_effort: text('reasoning_effort'),
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 265a321714..950bd7c32c 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -73,6 +73,22 @@ describe('buildRoutingTable', () => {
     expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold
   });
 
+  it('excludes a model whose tier summary has no cost signal', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-nocost',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES.map(s =>
+        s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s
+      ),
+    });
+
+    // model/cheap would have won 'low' as cheapest; without a cost signal it
+    // must not be ranked (unknown cost is not zero cost).
+    expect(table.tiers.low.map(c => c.model)).toEqual(['model/expensive', 'model/mid']);
+  });
+
   it('marks meetsThreshold correctly', () => {
     const table = buildRoutingTable({
       runId: 'test-run-2',
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 4230b9077a..8eb62bb6bf 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -8,9 +8,12 @@ import {
 } from '@kilocode/auto-routing-contracts';
 
 // Builds the routing table from per-(model, tier) decider summaries. Models
-// with zero graded cases in a tier are excluded from that tier. Throws when
-// any tier ends up empty so the caller keeps the previous published table.
-// deciderModels/minAccuracy come from the run's snapshot, not live config.
+// with zero graded cases in a tier are excluded from that tier, as are
+// models with no cost signal at all (avgCostUsd null means every case failed
+// to report cost; ranking such a model as cheapest would hand it the tier).
+// Throws when any tier ends up empty so the caller keeps the previous
+// published table. deciderModels/minAccuracy come from the run's snapshot,
+// not live config.
 export function buildRoutingTable(params: {
   runId: string;
   generatedAt: string;
@@ -24,7 +27,7 @@ export function buildRoutingTable(params: {
   const tierCandidates = (t: DifficultyTier) =>
     rankCandidates(
       summaries
-        .filter(s => s.tier === t && s.cases > 0)
+        .filter(s => s.tier === t && s.cases > 0 && s.avgCostUsd !== null)
         .map(s => ({
           model: s.model,
           accuracy: s.accuracy,

From 6f5fd38e319f83cb4e9c298c99ccc2e2b11c6024 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:40:25 +0200
Subject: [PATCH 52/73] test(benchmark): fix expected ranking order in
 no-cost-signal test

---
 .../auto-routing-benchmark/src/routing-table-builder.test.ts    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 950bd7c32c..f3694e6d62 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -86,7 +86,7 @@ describe('buildRoutingTable', () => {
 
     // model/cheap would have won 'low' as cheapest; without a cost signal it
     // must not be ranked (unknown cost is not zero cost).
-    expect(table.tiers.low.map(c => c.model)).toEqual(['model/expensive', 'model/mid']);
+    expect(table.tiers.low.map(c => c.model)).toEqual(['model/mid', 'model/expensive']);
   });
 
   it('marks meetsThreshold correctly', () => {

From 2cd53f954794c65206dac0be9b26ba4e10cbef5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 16:52:29 +0200
Subject: [PATCH 53/73] feat(benchmark): remove fabricated default config; runs
 require a saved config

---
 .../admin/auto-routing/BenchmarksSection.tsx  |  49 ++++-----
 ...uto-routing-benchmark-admin-client.test.ts |  19 ----
 .../auto-routing-contracts/src/benchmark.ts   |   5 +-
 .../auto-routing-benchmark/src/admin.test.ts  |  70 +++++++++---
 services/auto-routing-benchmark/src/admin.ts  |  13 +--
 .../auto-routing-benchmark/src/config.test.ts | 101 ++++++------------
 services/auto-routing-benchmark/src/config.ts |  46 +-------
 services/auto-routing-benchmark/src/run.ts    |   3 +
 8 files changed, 130 insertions(+), 176 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 3c8b5286ae..1ca5d437d7 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -16,7 +16,7 @@ import {
 import React, { useCallback, useEffect, useState } from 'react';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { toast } from 'sonner';
-import { ChevronDown, ChevronRight, Play, Plus, RotateCcw, Save, Trash2 } from 'lucide-react';
+import { ChevronDown, ChevronRight, Play, Plus, Save, Trash2 } from 'lucide-react';
 import { Badge } from '@/components/ui/badge';
 import { Button } from '@/components/ui/button';
 import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
@@ -110,13 +110,24 @@ type DeciderModelRow = {
   reasoningEffort: ReasoningEffort | null;
 };
 
-function configToFormState(config: BenchmarkConfig): {
+function configToFormState(config: BenchmarkConfig | null): {
   classifierModels: string;
   deciderModels: DeciderModelRow[];
   minAccuracy: number;
   maxConcurrency: number;
   benchmarkUserId: string;
 } {
+  if (config === null) {
+    // No config saved yet: the worker fabricates nothing. Model lists start empty (the numbers
+    // are form-only starting values); the admin must enter and save a config before running.
+    return {
+      classifierModels: '',
+      deciderModels: [],
+      minAccuracy: 0.7,
+      maxConcurrency: 4,
+      benchmarkUserId: '',
+    };
+  }
   return {
     classifierModels: config.classifierModels.join('\n'),
     deciderModels: config.deciderModels.map(m => ({
@@ -131,7 +142,7 @@ function configToFormState(config: BenchmarkConfig): {
 
 function formStateToConfig(
   state: ReturnType<typeof configToFormState>,
-  base: BenchmarkConfig
+  base: BenchmarkConfig | null
 ): BenchmarkConfigUpdate {
   const classifierModels = state.classifierModels
     .split('\n')
@@ -150,8 +161,8 @@ function formStateToConfig(
     minAccuracy: state.minAccuracy,
     maxConcurrency: state.maxConcurrency,
     benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null,
-    updatedAt: base.updatedAt,
-    updatedBy: base.updatedBy,
+    updatedAt: base?.updatedAt ?? null,
+    updatedBy: base?.updatedBy ?? null,
   };
 }
 
@@ -161,12 +172,10 @@ function formStateToConfig(
 
 function BenchmarkConfigEditor({
   config,
-  defaults,
   onSaved,
 }: {
-  config: BenchmarkConfig;
-  defaults: BenchmarkConfig;
-  onSaved: (next: { config: BenchmarkConfig; defaults: BenchmarkConfig }) => void;
+  config: BenchmarkConfig | null;
+  onSaved: (next: { config: BenchmarkConfig | null }) => void;
 }) {
   const [form, setForm] = useState(() => configToFormState(config));
 
@@ -186,10 +195,6 @@ function BenchmarkConfigEditor({
     },
   });
 
-  const handleResetToDefaults = useCallback(() => {
-    setForm(configToFormState(defaults));
-  }, [defaults]);
-
   const handleAddDeciderRow = useCallback(() => {
     setForm(prev => ({
       ...prev,
@@ -375,12 +380,12 @@ function BenchmarkConfigEditor({
               <Save className="size-4" />
               Save config
             </Button>
-            <Button type="button" variant="outline" onClick={handleResetToDefaults}>
-              <RotateCcw className="size-4" />
-              Reset to defaults
-            </Button>
           </div>
-          {config.updatedAt ? (
+          {config === null ? (
+            <p className="text-muted-foreground text-xs">
+              No config saved yet — runs cannot start until one is saved.
+            </p>
+          ) : config.updatedAt ? (
             <p className="text-muted-foreground text-xs">
               Last updated {config.updatedAt}
               {config.updatedBy ? ` by ${config.updatedBy}` : ''}
@@ -675,7 +680,7 @@ export function BenchmarksSection() {
   });
 
   const handleConfigSaved = useCallback(
-    (next: { config: BenchmarkConfig; defaults: BenchmarkConfig }) => {
+    (next: { config: BenchmarkConfig | null }) => {
       queryClient.setQueryData(['auto-routing', 'benchmark-config'], next);
     },
     [queryClient]
@@ -706,11 +711,7 @@ export function BenchmarksSection() {
             : 'Failed to load benchmark config'}
         </div>
       ) : configQuery.data ? (
-        <BenchmarkConfigEditor
-          config={configQuery.data.config}
-          defaults={configQuery.data.defaults}
-          onSaved={handleConfigSaved}
-        />
+        <BenchmarkConfigEditor config={configQuery.data.config} onSaved={handleConfigSaved} />
       ) : null}
 
       {/* Run controls */}
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 164525eb19..1879f8e82b 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -34,25 +34,6 @@ const configResponse = {
     updatedAt: null,
     updatedBy: null,
   },
-  defaults: {
-    classifierModels: ['anthropic/claude-haiku-4'],
-    deciderModels: [
-      {
-        id: 'anthropic/claude-sonnet-4',
-        supportedApiKinds: ['chat_completions' as const] as (
-          | 'chat_completions'
-          | 'responses'
-          | 'messages'
-        )[],
-        reasoningEffort: null,
-      },
-    ],
-    minAccuracy: 0.8,
-    maxConcurrency: 4,
-    benchmarkUserId: null,
-    updatedAt: null,
-    updatedBy: null,
-  },
 };
 
 const runsResponse = {
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 20d185c626..53e48127e5 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -71,9 +71,10 @@ export const BenchmarkRunSchema = z.object({
 export type BenchmarkRun = z.infer<typeof BenchmarkRunSchema>;
 
 export const BenchmarkRunsResponseSchema = z.object({ runs: z.array(BenchmarkRunSchema) });
+// config is null until an admin saves one — the worker never fabricates a
+// default config, and runs cannot start without a saved one.
 export const BenchmarkConfigResponseSchema = z.object({
-  config: BenchmarkConfigSchema,
-  defaults: BenchmarkConfigSchema,
+  config: BenchmarkConfigSchema.nullable(),
 });
 export const StartBenchmarkRunRequestSchema = z.object({
   kind: BenchmarkKindSchema,
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 42b3770e49..c6c5ae158d 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -1,9 +1,49 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
-import type { RoutingTable } from '@kilocode/auto-routing-contracts';
-import { DEFAULT_BENCHMARK_CONFIG } from './config';
+import type { BenchmarkConfig, RoutingTable } from '@kilocode/auto-routing-contracts';
 import { app } from './index';
 import type * as DbModule from './db';
 
+const TEST_CONFIG: BenchmarkConfig = {
+  classifierModels: ['google/gemini-2.5-flash-lite', 'google/gemini-2.5-flash'],
+  deciderModels: [
+    {
+      id: 'google/gemini-2.5-flash-lite',
+      supportedApiKinds: ['chat_completions'],
+      reasoningEffort: null,
+    },
+    {
+      id: 'anthropic/claude-sonnet-4.6',
+      supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+      reasoningEffort: null,
+    },
+  ],
+  minAccuracy: 0.7,
+  maxConcurrency: 4,
+  benchmarkUserId: null,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+// getConfigRows rows mirroring TEST_CONFIG for mapConfigRows; only updated_at differs (set here).
+const TEST_CONFIG_ROWS = {
+  config: {
+    id: 1 as const,
+    min_accuracy: TEST_CONFIG.minAccuracy,
+    max_concurrency: TEST_CONFIG.maxConcurrency,
+    benchmark_user_id: TEST_CONFIG.benchmarkUserId,
+    updated_at: '2026-06-01T00:00:00.000Z',
+    updated_by: null,
+  },
+  classifierModels: TEST_CONFIG.classifierModels,
+  deciderModels: TEST_CONFIG.deciderModels.map(m => ({
+    model: m.id,
+    reasoning_effort: m.reasoningEffort ?? null,
+    supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
+    supports_messages: m.supportedApiKinds.includes('messages'),
+    supports_responses: m.supportedApiKinds.includes('responses'),
+  })),
+};
+
 // ---------------------------------------------------------------------------
 // Stubs: the db module is mocked at its function boundary (drizzle generates
 // the SQL, so statement-level stubbing would couple tests to its internals).
@@ -124,19 +164,16 @@ describe('auth middleware', () => {
 // ---------------------------------------------------------------------------
 
 describe('GET /admin/config', () => {
-  it('returns defaults when the DB rows are absent', async () => {
+  it('returns a null config when the DB rows are absent', async () => {
     // getConfigRows already returns null config by default
     const res = await authedGet('/admin/config');
     expect(res.status).toBe(200);
-    await expect(res.json()).resolves.toEqual({
-      config: DEFAULT_BENCHMARK_CONFIG,
-      defaults: DEFAULT_BENCHMARK_CONFIG,
-    });
+    await expect(res.json()).resolves.toEqual({ config: null });
   });
 
   it('returns the stored config when DB rows exist', async () => {
     const classifierModels = ['some/model'];
-    const deciderModels = DEFAULT_BENCHMARK_CONFIG.deciderModels.map(m => ({
+    const deciderModels = TEST_CONFIG.deciderModels.map(m => ({
       model: m.id,
       reasoning_effort: null,
       supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
@@ -193,9 +230,9 @@ describe('PUT /admin/config', () => {
     expect(replaceConfig).not.toHaveBeenCalled();
   });
 
-  it('persists a valid config and returns it with defaults', async () => {
+  it('persists a valid config and returns it', async () => {
     const validConfig = {
-      ...DEFAULT_BENCHMARK_CONFIG,
+      ...TEST_CONFIG,
       minAccuracy: 0.85,
       updatedAt: null,
       updatedBy: null,
@@ -208,12 +245,10 @@ describe('PUT /admin/config', () => {
     expect(res.status).toBe(200);
     const body = (await res.json()) as {
       config: { minAccuracy: number; updatedBy: string | null; updatedAt: string | null };
-      defaults: typeof DEFAULT_BENCHMARK_CONFIG;
     };
     expect(body.config.minAccuracy).toBe(0.85);
     expect(body.config.updatedBy).toBe('igor@kilocode.ai');
     expect(typeof body.config.updatedAt).toBe('string');
-    expect(body.defaults).toEqual(DEFAULT_BENCHMARK_CONFIG);
 
     expect(replaceConfig).toHaveBeenCalledOnce();
     const [, configArg] = vi.mocked(replaceConfig).mock.calls[0];
@@ -262,13 +297,22 @@ describe('POST /admin/runs', () => {
     expect(queueSendBatch).not.toHaveBeenCalled();
   });
 
+  it('rejects starting a run when no config has been saved', async () => {
+    // getConfigRows already returns null config by default
+    const res = await authedPost('/admin/runs', { kind: 'classifier' });
+    expect(res.status).toBe(500);
+    expect(insertRun).not.toHaveBeenCalled();
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
   it('starts a classifier run and returns runId + enqueuedModels', async () => {
     // No prior summaries → every configured model is enqueued.
+    vi.mocked(getConfigRows).mockResolvedValue(TEST_CONFIG_ROWS);
     const res = await authedPost('/admin/runs', { kind: 'classifier' });
     expect(res.status).toBe(200);
     const body = (await res.json()) as { runId: string; enqueuedModels: number };
     expect(body.runId).toMatch(/^classifier-/);
-    expect(body.enqueuedModels).toBe(DEFAULT_BENCHMARK_CONFIG.classifierModels.length);
+    expect(body.enqueuedModels).toBe(TEST_CONFIG.classifierModels.length);
     expect(insertRun).toHaveBeenCalledOnce();
     expect(queueSendBatch).toHaveBeenCalledOnce();
   });
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index e88784d434..9fc401ce08 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -6,7 +6,7 @@ import {
 } from '@kilocode/auto-routing-contracts';
 import { zodJsonValidator } from '@kilocode/worker-utils';
 import type { Hono } from 'hono';
-import { DEFAULT_BENCHMARK_CONFIG, getBenchmarkConfig, saveBenchmarkConfig } from './config';
+import { getBenchmarkConfig, saveBenchmarkConfig } from './config';
 import { debugRunCli } from './cli-runner';
 import { fetchBenchmarkUserToken, startRun } from './run';
 import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db';
@@ -18,12 +18,7 @@ const DebugCliRequestSchema = z.object({
 });
 
 export function registerAdminRoutes(app: Hono<HonoEnv>): void {
-  app.get('/admin/config', async c =>
-    c.json({
-      config: await getBenchmarkConfig(c.env.BENCH_DB),
-      defaults: DEFAULT_BENCHMARK_CONFIG,
-    })
-  );
+  app.get('/admin/config', async c => c.json({ config: await getBenchmarkConfig(c.env.BENCH_DB) }));
 
   app.put(
     '/admin/config',
@@ -31,7 +26,7 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
     async c => {
       const updatedBy = c.req.header('x-updated-by') ?? null;
       const saved = await saveBenchmarkConfig(c.env.BENCH_DB, c.req.valid('json'), updatedBy);
-      return c.json({ config: saved, defaults: DEFAULT_BENCHMARK_CONFIG });
+      return c.json({ config: saved });
     }
   );
 
@@ -70,7 +65,7 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
     zodJsonValidator(DebugCliRequestSchema, { errorMessage: 'Invalid debug request' }),
     async c => {
       const config = await getBenchmarkConfig(c.env.BENCH_DB);
-      if (!config.benchmarkUserId) {
+      if (!config?.benchmarkUserId) {
         return c.json({ error: 'benchmarkUserId is not configured' }, 400);
       }
       const kiloToken = await fetchBenchmarkUserToken(c.env, config.benchmarkUserId);
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
index 1d2bec0b40..27be15b4b0 100644
--- a/services/auto-routing-benchmark/src/config.test.ts
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -1,89 +1,54 @@
 import { describe, expect, it } from 'vitest';
-import { DEFAULT_BENCHMARK_CONFIG, mapConfigRows } from './config';
+import { mapConfigRows } from './config';
 import type { ConfigDeciderModelRow } from './db';
 
-const defaultDeciderRows: ConfigDeciderModelRow[] = DEFAULT_BENCHMARK_CONFIG.deciderModels.map(
-  m => ({
-    model: m.id,
-    reasoning_effort: m.reasoningEffort ?? null,
-    supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
-    supports_messages: m.supportedApiKinds.includes('messages'),
-    supports_responses: m.supportedApiKinds.includes('responses'),
-  })
-);
-
-const defaultConfigRow = {
+const configRow = {
   id: 1 as const,
-  min_accuracy: DEFAULT_BENCHMARK_CONFIG.minAccuracy,
-  max_concurrency: DEFAULT_BENCHMARK_CONFIG.maxConcurrency,
-  benchmark_user_id: DEFAULT_BENCHMARK_CONFIG.benchmarkUserId,
+  min_accuracy: 0.85,
+  max_concurrency: 8,
+  benchmark_user_id: 'user-123',
   updated_at: '2026-06-01T00:00:00.000Z',
-  updated_by: null,
+  updated_by: 'admin@example.com',
 };
 
+const deciderRows: ConfigDeciderModelRow[] = [
+  {
+    model: 'some/decider',
+    reasoning_effort: 'high',
+    supports_chat_completions: true,
+    supports_messages: true,
+    supports_responses: false,
+  },
+];
+
 describe('mapConfigRows', () => {
-  it('returns defaults when config row is null', () => {
-    expect(mapConfigRows(null, [], [])).toEqual(DEFAULT_BENCHMARK_CONFIG);
+  it('returns null when config row is null', () => {
+    expect(mapConfigRows(null, ['some/model'], deciderRows)).toBeNull();
   });
 
-  it('returns defaults when classifierModels array is empty', () => {
-    expect(mapConfigRows(defaultConfigRow, [], defaultDeciderRows)).toEqual(
-      DEFAULT_BENCHMARK_CONFIG
-    );
+  it('returns null when classifierModels array is empty', () => {
+    expect(mapConfigRows(configRow, [], deciderRows)).toBeNull();
   });
 
-  it('returns defaults when deciderModels array is empty', () => {
-    expect(mapConfigRows(defaultConfigRow, DEFAULT_BENCHMARK_CONFIG.classifierModels, [])).toEqual(
-      DEFAULT_BENCHMARK_CONFIG
-    );
+  it('returns null when deciderModels array is empty', () => {
+    expect(mapConfigRows(configRow, ['some/model'], [])).toBeNull();
   });
 
   it('maps a full config row set to BenchmarkConfig', () => {
-    const configRow = {
-      id: 1 as const,
-      min_accuracy: 0.85,
-      max_concurrency: 8,
-      benchmark_user_id: 'user-123',
-      updated_at: '2026-06-01T00:00:00.000Z',
-      updated_by: 'admin@example.com',
-    };
     const classifierModels = ['some/model-a', 'some/model-b'];
-    const deciderRows: ConfigDeciderModelRow[] = [
-      {
-        model: 'some/decider',
-        reasoning_effort: 'high',
-        supports_chat_completions: true,
-        supports_messages: true,
-        supports_responses: false,
-      },
-    ];
 
     const result = mapConfigRows(configRow, classifierModels, deciderRows);
 
-    expect(result.minAccuracy).toBe(0.85);
-    expect(result.maxConcurrency).toBe(8);
-    expect(result.benchmarkUserId).toBe('user-123');
-    expect(result.updatedAt).toBe('2026-06-01T00:00:00.000Z');
-    expect(result.updatedBy).toBe('admin@example.com');
-    expect(result.classifierModels).toEqual(classifierModels);
-    expect(result.deciderModels).toHaveLength(1);
-    expect(result.deciderModels[0].id).toBe('some/decider');
-    expect(result.deciderModels[0].reasoningEffort).toBe('high');
-    expect(result.deciderModels[0].supportedApiKinds).toEqual(['chat_completions', 'messages']);
-  });
-
-  it('round-trips the default config through rows', () => {
-    const result = mapConfigRows(
-      defaultConfigRow,
-      DEFAULT_BENCHMARK_CONFIG.classifierModels,
-      defaultDeciderRows
-    );
-    // updatedAt/updatedBy come from the row, not DEFAULT_BENCHMARK_CONFIG (which has null)
-    expect(result.minAccuracy).toBe(DEFAULT_BENCHMARK_CONFIG.minAccuracy);
-    expect(result.maxConcurrency).toBe(DEFAULT_BENCHMARK_CONFIG.maxConcurrency);
-    expect(result.classifierModels).toEqual(DEFAULT_BENCHMARK_CONFIG.classifierModels);
-    expect(result.deciderModels.map(m => m.id)).toEqual(
-      DEFAULT_BENCHMARK_CONFIG.deciderModels.map(m => m.id)
-    );
+    expect(result).not.toBeNull();
+    expect(result?.minAccuracy).toBe(0.85);
+    expect(result?.maxConcurrency).toBe(8);
+    expect(result?.benchmarkUserId).toBe('user-123');
+    expect(result?.updatedAt).toBe('2026-06-01T00:00:00.000Z');
+    expect(result?.updatedBy).toBe('admin@example.com');
+    expect(result?.classifierModels).toEqual(classifierModels);
+    expect(result?.deciderModels).toHaveLength(1);
+    expect(result?.deciderModels[0].id).toBe('some/decider');
+    expect(result?.deciderModels[0].reasoningEffort).toBe('high');
+    expect(result?.deciderModels[0].supportedApiKinds).toEqual(['chat_completions', 'messages']);
   });
 });
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 5ede546b70..0dd78e8e92 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -1,45 +1,9 @@
 import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts';
 import { apiKindsToFlags, getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db';
 
-export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
-  classifierModels: [
-    'google/gemini-2.5-flash-lite',
-    'google/gemini-2.5-flash',
-    'openai/gpt-5-mini',
-    'qwen/qwen3.7-plus',
-  ],
-  deciderModels: [
-    {
-      id: 'google/gemini-2.5-flash-lite',
-      supportedApiKinds: ['chat_completions'],
-      reasoningEffort: null,
-    },
-    {
-      id: 'google/gemini-2.5-flash',
-      supportedApiKinds: ['chat_completions'],
-      reasoningEffort: null,
-    },
-    { id: 'qwen/qwen3.7-plus', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
-    {
-      id: 'openai/gpt-5.5',
-      supportedApiKinds: ['chat_completions', 'responses'],
-      reasoningEffort: null,
-    },
-    {
-      id: 'anthropic/claude-sonnet-4.6',
-      supportedApiKinds: ['chat_completions', 'messages', 'responses'],
-      reasoningEffort: null,
-    },
-  ],
-  minAccuracy: 0.7,
-  maxConcurrency: 4,
-  benchmarkUserId: null,
-  updatedAt: null,
-  updatedBy: null,
-};
-
 // Maps the three normalized config tables to the BenchmarkConfig contract.
-// Falls back to DEFAULT_BENCHMARK_CONFIG fields when no config row exists.
+// Null when the config row is missing or either model list is empty — the worker
+// never fabricates a config, and runs cannot start until one has been saved.
 export function mapConfigRows(
   configRow: {
     min_accuracy: number;
@@ -50,9 +14,9 @@ export function mapConfigRows(
   } | null,
   classifierModels: string[],
   deciderModelRows: ConfigDeciderModelRow[]
-): BenchmarkConfig {
+): BenchmarkConfig | null {
   if (configRow === null || classifierModels.length === 0 || deciderModelRows.length === 0) {
-    return DEFAULT_BENCHMARK_CONFIG;
+    return null;
   }
 
   return {
@@ -75,7 +39,7 @@ export function mapConfigRows(
   };
 }
 
-export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig> {
+export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig | null> {
   const { config, classifierModels, deciderModels } = await getConfigRows(db);
   return mapConfigRows(config, classifierModels, deciderModels);
 }
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 5a2bb2ffa8..d6b2fcf818 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -80,6 +80,9 @@ export async function startRun(
   );
 
   const config = await getBenchmarkConfig(env.BENCH_DB);
+  if (!config) {
+    throw new Error('benchmark config not set: save it in the admin panel before starting a run');
+  }
   const models =
     kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
 

From 354054d4e5f13e1282969e97d9aa9ac8424eb606 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 17:01:32 +0200
Subject: [PATCH 54/73] chore(benchmark): drop redundant case_results index,
 regenerate baseline migration

---
 ...0000_complete_lyja.sql => 0000_dear_nicolaos.sql} |  1 -
 .../migrations/meta/0000_snapshot.json               | 12 ++----------
 .../migrations/meta/_journal.json                    |  4 ++--
 services/auto-routing-benchmark/src/db-schema.ts     |  9 ++++-----
 4 files changed, 8 insertions(+), 18 deletions(-)
 rename services/auto-routing-benchmark/migrations/{0000_complete_lyja.sql => 0000_dear_nicolaos.sql} (96%)

diff --git a/services/auto-routing-benchmark/migrations/0000_complete_lyja.sql b/services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql
similarity index 96%
rename from services/auto-routing-benchmark/migrations/0000_complete_lyja.sql
rename to services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql
index 5dbc7e2bd0..defeee3c79 100644
--- a/services/auto-routing-benchmark/migrations/0000_complete_lyja.sql
+++ b/services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql
@@ -37,7 +37,6 @@ CREATE TABLE `case_results` (
 	PRIMARY KEY(`run_id`, `model`, `case_id`)
 );
 --> statement-breakpoint
-CREATE INDEX `idx_case_results_run` ON `case_results` (`run_id`);--> statement-breakpoint
 CREATE TABLE `config_classifier_models` (
 	`model` text PRIMARY KEY NOT NULL
 );
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index e0bfebb9ea..6444a68a13 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "77565fdd-e92c-4de7-a4ee-0f832a620652",
+  "id": "905b4a67-d32c-491d-9206-ede5de77d0b2",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -231,15 +231,7 @@
           "autoincrement": false
         }
       },
-      "indexes": {
-        "idx_case_results_run": {
-          "name": "idx_case_results_run",
-          "columns": [
-            "run_id"
-          ],
-          "isUnique": false
-        }
-      },
+      "indexes": {},
       "foreignKeys": {},
       "compositePrimaryKeys": {
         "case_results_run_id_model_case_id_pk": {
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index 826916b528..ba4e4ef0cc 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,8 +5,8 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781261894022,
-      "tag": "0000_complete_lyja",
+      "when": 1781276443789,
+      "tag": "0000_dear_nicolaos",
       "breakpoints": true
     }
   ]
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 8c2f7184ff..897fe1b74b 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -1,4 +1,4 @@
-import { index, integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core';
+import { integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core';
 import type { BenchmarkKind, BenchmarkRunStatus } from '@kilocode/auto-routing-contracts';
 
 // Migrations are generated via `pnpm db:generate` (drizzle-kit) and applied
@@ -91,10 +91,9 @@ export const caseResults = sqliteTable(
     event_count: integer('event_count'),
     last_event_types: text('last_event_types'),
   },
-  table => [
-    primaryKey({ columns: [table.run_id, table.model, table.case_id] }),
-    index('idx_case_results_run').on(table.run_id),
-  ]
+  // The composite PK's leftmost column already serves run_id-prefix lookups
+  // (count/fetch by run); no separate run_id index is needed.
+  table => [primaryKey({ columns: [table.run_id, table.model, table.case_id] })]
 );
 
 export const routingTables = sqliteTable('routing_tables', {

From 6aba145b7161b86bed87bc234f675c31e7005ea0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 17:02:57 +0200
Subject: [PATCH 55/73] docs(benchmark): fix stale KV comment in wrangler
 config

---
 services/auto-routing-benchmark/wrangler.jsonc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index d48f588d52..088752febb 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -50,8 +50,9 @@
     ],
   },
   "kv_namespaces": [
-    // Shared with the auto-routing worker: the decider benchmark publishes
-    // the routing table here and auto-routing reads it on /decide.
+    // Shared with the auto-routing worker, which uses it as a read-through
+    // cache over this worker's D1. On publish we only DELETE the cached keys
+    // (routing table + classifier winner) so the next read repopulates.
     { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" },
   ],
   "secrets_store_secrets": [

From 8955269124691b5bc98e52e5946fd2a0c60afdba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 17:44:46 +0200
Subject: [PATCH 56/73] feat(auto-routing-benchmark): grade subtaskType and
 riskLevel, expand classifier dataset to per-pair coverage

---
 .../src/classifier/index.ts                   |   1 +
 .../src/datasets/classifier-cases.test.ts     |  63 +-
 .../src/datasets/classifier-cases.ts          | 949 ++++++++++++++++--
 .../src/grading.test.ts                       |  10 +-
 .../auto-routing-benchmark/src/grading.ts     |  17 +-
 5 files changed, 913 insertions(+), 127 deletions(-)

diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts
index 78c27cb244..c3d38367e4 100644
--- a/packages/auto-routing-contracts/src/classifier/index.ts
+++ b/packages/auto-routing-contracts/src/classifier/index.ts
@@ -1,4 +1,5 @@
 export { buildClassifierMessages, CLASSIFIER_MAX_TOKENS, DEFAULT_CLASSIFIER_MODEL } from './prompt';
+export { default as classifierTaxonomy } from './taxonomy.json';
 export { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output';
 export { fallbackClassifierOutput } from './output-fallback';
 export {
diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
index 08523eaa4f..ab1221497f 100644
--- a/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
@@ -1,10 +1,22 @@
 import { describe, expect, it } from 'vitest';
 import { NormalizedClassifierInputSchema } from '@kilocode/auto-routing-contracts';
+import { classifierTaxonomy } from '@kilocode/auto-routing-contracts/classifier';
 import { CLASSIFIER_CASES } from './classifier-cases';
 
+const TAXONOMY_PAIRS = classifierTaxonomy.taskTypes.flatMap(taskType =>
+  taskType.subtypes.map(subtype => ({ taskType: taskType.id, subtaskType: subtype.id }))
+);
+
+const SUBTYPES_BY_TASK_TYPE = new Map(
+  classifierTaxonomy.taskTypes.map(taskType => [
+    taskType.id,
+    new Set(taskType.subtypes.map(subtype => subtype.id)),
+  ])
+);
+
 describe('CLASSIFIER_CASES', () => {
-  it('has exactly 36 cases', () => {
-    expect(CLASSIFIER_CASES.length).toBe(36);
+  it('covers all 18 taxonomy pairs', () => {
+    expect(TAXONOMY_PAIRS.length).toBe(18);
   });
 
   it('has unique ids and valid inputs', () => {
@@ -16,17 +28,33 @@ describe('CLASSIFIER_CASES', () => {
     }
   });
 
-  it('covers every task type with exactly 6 cases', () => {
-    const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType);
-    for (const taskType of [
-      'implementation',
-      'debugging',
-      'refactoring',
-      'planning_design',
-      'investigation',
-      'agentic_execution',
-    ] as const) {
-      expect(byType.get(taskType)?.length ?? 0, taskType).toBe(6);
+  it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+    for (const pair of TAXONOMY_PAIRS) {
+      const count = CLASSIFIER_CASES.filter(
+        c => c.expected.taskType === pair.taskType && c.expected.subtaskType === pair.subtaskType
+      ).length;
+      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('labels every case with a subtaskType that belongs to its taskType', () => {
+    for (const c of CLASSIFIER_CASES) {
+      const subtypes = SUBTYPES_BY_TASK_TYPE.get(c.expected.taskType);
+      expect(subtypes, `unknown taskType in case ${c.id}`).toBeDefined();
+      expect(
+        subtypes?.has(c.expected.subtaskType),
+        `case ${c.id}: ${c.expected.subtaskType} does not belong to ${c.expected.taskType}`
+      ).toBe(true);
+    }
+  });
+
+  it('covers every task type with exactly 12 cases', () => {
+    const counts = new Map<string, number>();
+    for (const c of CLASSIFIER_CASES) {
+      counts.set(c.expected.taskType, (counts.get(c.expected.taskType) ?? 0) + 1);
+    }
+    for (const taskType of classifierTaxonomy.taskTypes) {
+      expect(counts.get(taskType.id) ?? 0, taskType.id).toBe(12);
     }
   });
 
@@ -39,6 +67,15 @@ describe('CLASSIFIER_CASES', () => {
     }
   });
 
+  it('covers every risk level at least 4 times', () => {
+    for (const level of ['low', 'medium', 'high'] as const) {
+      expect(
+        CLASSIFIER_CASES.filter(c => c.expected.riskLevel === level).length,
+        level
+      ).toBeGreaterThanOrEqual(4);
+    }
+  });
+
   it('has at least one of each reasoning complexity within every task type', () => {
     const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType);
     for (const [taskType, cases] of byType) {
diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
index 33baacaefb..7866c762c1 100644
--- a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
@@ -2,7 +2,7 @@ import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts
 import type { ClassifierExpectation } from '../grading';
 
 export type ClassifierCase = {
-  id: string; // stable slug, e.g. 'impl-low-regex-helper'
+  id: string; // stable slug, e.g. 'impl-gen-semver-helper' (<taskType>-<subtype>-<topic>)
   input: NormalizedClassifierInput;
   expected: ClassifierExpectation;
 };
@@ -38,12 +38,90 @@ function chat(
   };
 }
 
+// Four cases per (taskType, subtaskType) pair, with difficulty (context and
+// reasoning), execution mode, and risk varied within each pair. riskLevel
+// follows the taxonomy axis: high = auth/secrets/billing/user-data
+// migrations/production routing/destructive ops; medium = changes runtime
+// code, service config, or request contracts; low = read-only, test-only,
+// docs-only, or isolated reversible code.
 export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
   // ---------------------------------------------------------------------------
-  // implementation (2 low, 2 medium, 2 high)
+  // implementation / feature_development
   // ---------------------------------------------------------------------------
   {
-    id: 'impl-low-regex-helper',
+    id: 'impl-feat-members-endpoint',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add a new GET /api/projects/:id/members endpoint to our Express router in src/routes/projects.ts. Reuse the existing requireAuth middleware and the ProjectService.getMembers method, and return 404 when the project does not exist.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-debounced-search',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a useDebouncedValue(value, delayMs) React hook in src/hooks and use it in the SearchBar component so the onSearch callback fires at most once every 300ms. Keep the existing controlled-input behavior.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-realtime-collab',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Build real-time collaborative editing for our document editor. We have a React frontend, a Node WebSocket gateway, and a Postgres store. Decide and implement a conflict-resolution strategy (OT vs CRDT), wire presence, persistence, and reconnection, and make it consistent across all three layers.',
+      { messageCount: 18, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-rate-limiter',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a distributed sliding-window rate limiter that works across our 4 API replicas backed by Redis. It must handle clock skew between nodes, degrade gracefully if Redis is unavailable, and expose per-tenant limits configured in src/config/limits.ts. Integrate it into the existing middleware chain.',
+      { messageCount: 16, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / code_generation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-gen-semver-helper',
     input: chat(
       AGENT_PLAIN_SYSTEM,
       'Write a TypeScript helper function isValidSemver(version: string): boolean that returns true for valid semantic version strings like 1.2.3 and false otherwise. No external dependencies.',
@@ -51,14 +129,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'implementation',
+      subtaskType: 'code_generation',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'impl-low-add-zod-schema',
+    id: 'impl-gen-pagination-schema',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Add a Zod schema named PaginationParamsSchema to src/schemas/pagination.ts with optional page (positive int, default 1) and pageSize (positive int, max 100, default 20) fields, and export its inferred type.',
@@ -66,78 +146,126 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'implementation',
+      subtaskType: 'code_generation',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'impl-medium-rest-endpoint',
+    id: 'impl-gen-api-client',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Add a new GET /api/projects/:id/members endpoint to our Express router in src/routes/projects.ts. Reuse the existing requireAuth middleware and the ProjectService.getMembers method, and return 404 when the project does not exist.',
-      { messageCount: 7, hasTools: true }
+      'Generate a typed TypeScript client for our internal REST API from the OpenAPI spec at docs/openapi.yaml: one function per endpoint, a shared fetch wrapper that injects the Authorization header, and response types derived from the spec schemas. Write it to src/generated/api-client.ts; nothing imports it yet, we will wire it in later.',
+      { messageCount: 5, hasTools: true }
     ),
     expected: {
       taskType: 'implementation',
+      subtaskType: 'code_generation',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'impl-medium-react-hook',
+    id: 'impl-gen-ci-workflow',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Implement a useDebouncedValue(value, delayMs) React hook in src/hooks and use it in the SearchBar component so the onSearch callback fires at most once every 300ms. Keep the existing controlled-input behavior.',
-      { messageCount: 9, hasTools: true }
+      'Create a GitHub Actions workflow at .github/workflows/ci.yml that runs pnpm install with caching, then runs typecheck, lint, and test as parallel jobs on every pull request, using Node 22 and pnpm 9.',
+      { messageCount: 3, hasTools: true }
     ),
     expected: {
       taskType: 'implementation',
-      contextComplexity: 'medium',
-      reasoningComplexity: 'medium',
+      subtaskType: 'code_generation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'medium',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // implementation / test_creation
+  // ---------------------------------------------------------------------------
   {
-    id: 'impl-high-realtime-collab',
+    id: 'impl-test-slugify-units',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Build real-time collaborative editing for our document editor. We have a React frontend, a Node WebSocket gateway, and a Postgres store. Decide and implement a conflict-resolution strategy (OT vs CRDT), wire presence, persistence, and reconnection, and make it consistent across all three layers.',
-      { messageCount: 18, hasTools: true }
+      'Add Jest unit tests for the slugify function in src/utils/slugify.ts. Cover unicode input, repeated spaces, leading and trailing dashes, and the maxLength option. The function works correctly today, we just have no coverage.',
+      { messageCount: 2, hasTools: true }
     ),
     expected: {
       taskType: 'implementation',
-      contextComplexity: 'large',
-      reasoningComplexity: 'high',
-      executionMode: 'multi_step_project',
+      subtaskType: 'test_creation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'impl-high-rate-limiter',
+    id: 'impl-test-checkout-route',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Implement a distributed sliding-window rate limiter that works across our 4 API replicas backed by Redis. It must handle clock skew between nodes, degrade gracefully if Redis is unavailable, and expose per-tenant limits configured in src/config/limits.ts. Integrate it into the existing middleware chain.',
-      { messageCount: 16, hasTools: true }
+      'Add supertest integration tests for the POST /api/checkout route: the happy path, an invalid coupon code, and an out-of-stock item. Reuse the existing test app factory in test/helpers/app.ts and the product fixtures. The route itself works fine in production.',
+      { messageCount: 7, hasTools: true }
     ),
     expected: {
       taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-test-e2e-onboarding',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Build a Playwright E2E suite covering signup, email verification, workspace creation, and inviting a teammate, across the web app and the API. Set up seeded test users, per-test database isolation, and wire the suite into CI. Nothing is broken — we have zero end-to-end coverage today and need it before the next launch.',
+      { messageCount: 15, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'low',
       executionMode: 'multi_step_project',
       requiresTools: true,
     },
   },
+  {
+    id: 'impl-test-pasted-debounce',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Here is my debounce implementation pasted below. Write a Jest test file for it covering the delay behavior, cancellation, and the immediate=true mode. Just give me the test code, I will add it to the repo myself.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
 
   // ---------------------------------------------------------------------------
-  // debugging (2 low, 2 medium, 2 high)
+  // debugging / bug_fixing
   // ---------------------------------------------------------------------------
   {
-    id: 'debug-low-typo-import',
+    id: 'debug-fix-import-mismatch',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Running the app throws "TypeError: formatDate is not a function" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.',
@@ -145,14 +273,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'bug_fixing',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'debug-low-off-by-one',
+    id: 'debug-fix-pagination-slice',
     input: chat(
       AGENT_PLAIN_SYSTEM,
       'This pagination function returns one too few items on the last page. Here is the code: `return items.slice(page * size, page * size + size - 1)`. What is wrong and how do I fix it?',
@@ -160,78 +290,198 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'bug_fixing',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'debug-medium-failing-test',
+    id: 'debug-fix-cors-upload',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Browser requests to our /api/upload endpoint fail with "blocked by CORS policy: No Access-Control-Allow-Origin header". GET requests to other endpoints work fine. The cors middleware is configured in src/server.ts. Find why only upload is affected and fix it.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-fix-double-charge',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Our test "UserService > createUser persists the hashed password" started failing after I changed the bcrypt cost factor. The assertion expects a 60-char hash but now gets undefined. Figure out whether the service or the test is wrong and fix it so the suite passes.',
+      'Our payment webhook handler intermittently double-charges customers under load. We use a Postgres advisory lock around the charge, but the duplicate rows have timestamps 2-3ms apart. The handler runs on 3 replicas behind a queue with at-least-once delivery. Investigate the root cause across the worker, queue consumer, and DB layers and fix it.',
+      { messageCount: 14, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / test_repair
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-repair-bcrypt-stub',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our test "UserService > createUser persists the hashed password" fails since we upgraded bcryptjs to v3: the hash comes back undefined because the test still stubs the old callback-style API. The production code is verified working in staging. Update the test stub and assertions so the suite passes.',
       { messageCount: 8, hasTools: true }
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'test_repair',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'debug-medium-cors-error',
+    id: 'debug-repair-aria-snapshots',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Browser requests to our /api/upload endpoint fail with "blocked by CORS policy: No Access-Control-Allow-Origin header". GET requests to other endpoints work fine. The cors middleware is configured in src/server.ts. Find why only upload is affected and fix it.',
-      { messageCount: 10, hasTools: true }
+      'After adding an aria-label to the IconButton component, 14 Jest snapshot tests fail and every diff is just the new attribute. The new markup is intentional and correct. Update the snapshots and fix the one inline assertion in IconButton.test.tsx that checks the rendered props.',
+      { messageCount: 5, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-repair-flaky-backoff',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The "retries with exponential backoff" test in src/queue/retry.test.ts is flaky in CI: it asserts real elapsed time around setTimeout and fails when the runners are slow. The production retry logic is correct. Make the test deterministic with vitest fake timers without weakening what it asserts.',
+      { messageCount: 9, hasTools: true }
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'test_repair',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'debug-high-race-condition',
+    id: 'debug-repair-stale-fixtures',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Our payment webhook handler intermittently double-charges customers under load. We use a Postgres advisory lock around the charge, but the duplicate rows have timestamps 2-3ms apart. The handler runs on 3 replicas behind a queue with at-least-once delivery. Investigate the root cause across the worker, queue consumer, and DB layers and fix it.',
-      { messageCount: 14, hasTools: true }
+      'CI is red: nine tests in services/billing-worker fail with ZodError because the request fixtures still use the old amountCents field that was intentionally renamed to amountMinorUnits last week. The schema change is correct and already deployed. Update the fixtures to match.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / root_cause_analysis
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-rca-sidebar-overflow',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Why does this sidebar overflow horizontally on mobile only? I pasted the component and its CSS module below; min-width is set on the nav list. Explain the cause — I will fix it myself.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'debug-rca-local-401',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Requests from our Next.js app to the o11y worker return 401 in local dev, but the same code works in staging. The bearer token is read in apps/web/src/lib/workerClient.ts and validated in the worker auth middleware. Trace where the values diverge and tell me the root cause. Do not change anything yet.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-rca-search-500s',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Roughly 0.5% of requests to /api/search return a 500 with nothing in the application logs. Candidates: the Express handler, the OpenSearch client timeout config, or the nginx proxy in front. Gather evidence from the code and configs and tell me where the failures originate and why. Diagnosis only — I will decide on the fix.',
+      { messageCount: 13, hasTools: true }
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
-      executionMode: 'multi_step_project',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
       requiresTools: true,
     },
   },
   {
-    id: 'debug-high-memory-leak',
+    id: 'debug-rca-memory-leak',
     input: chat(
       AGENT_TOOLS_SYSTEM,
-      'Our Node service RSS grows by ~50MB/hour in production and OOMs after a day, but it is stable locally. Heap snapshots show growing retained closures referencing our EventEmitter-based cache. It spans the cache module, the websocket session manager, and a third-party metrics client. Trace the leak across these and fix it.',
+      'Our Node service RSS grows by ~50MB/hour in production and OOMs after a day, but it is stable locally. Heap snapshots show growing retained closures referencing our EventEmitter-based cache. It spans the cache module, the websocket session manager, and a third-party metrics client. Trace the leak across these and report the root cause with the retaining-path evidence. Do not fix anything yet — I want to review the diagnosis with the team first.',
       { messageCount: 22, hasTools: true }
     ),
     expected: {
       taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
-      executionMode: 'multi_step_project',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
       requiresTools: true,
     },
   },
 
   // ---------------------------------------------------------------------------
-  // refactoring (2 low, 2 medium, 2 high)
+  // refactoring / code_cleanup
   // ---------------------------------------------------------------------------
   {
-    id: 'refactor-low-rename-var',
+    id: 'refactor-cleanup-rename-total',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'In src/cart.ts rename the variable `x` to `lineItemTotal` everywhere it is used in the calculateTotal function. No behavior change.',
@@ -239,14 +489,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'refactor-low-extract-constant',
+    id: 'refactor-cleanup-seconds-constant',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'The magic number 86400 appears three times in src/scheduler.ts. Extract it into a named constant SECONDS_PER_DAY at the top of the file and use it in all three places. Keep behavior identical.',
@@ -254,14 +506,54 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-cleanup-shared-pagination',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'src/routes/users.ts and src/routes/orgs.ts each define a parsePagination helper that is character-for-character identical. Move it to src/lib/pagination.ts and import it in both routes. No behavior change.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-cleanup-dead-flag',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Remove the dead code paths guarded by ENABLE_OLD_DASHBOARD across src/dashboard/ — the flag has been false in every environment for over a year and the env var was deleted from our deploy configs. Delete the guarded branches, the flag helper, and the now-unused components, keeping everything else identical. Run the test suite when done.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / architecture_improvement
+  // ---------------------------------------------------------------------------
   {
-    id: 'refactor-medium-extract-service',
+    id: 'refactor-arch-order-service',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'The OrderController in src/controllers/order.ts has grown to 400 lines and mixes HTTP handling with business logic. Extract the business logic into an OrderService class, keep the controller thin, and update the existing controller tests to match. Behavior must stay the same.',
@@ -269,123 +561,304 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-arch-modular-monolith',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our monolithic src/app.ts wires routing, auth, database access, and background jobs in one 1200-line file with tangled circular imports. Restructure it into clear modules with one-directional dependencies, without changing any external behavior or public routes. Decide the boundaries and migrate incrementally.',
+      { messageCount: 26, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-arch-shared-worker-auth',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The o11y worker and the notifications worker each carry a copy of the same bearer-token auth middleware. Move it into packages/worker-utils as a shared helper and have both workers consume it. Keep the validation behavior identical and keep both workers test suites green.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'medium',
       executionMode: 'code_change',
       requiresTools: true,
     },
   },
   {
-    id: 'refactor-medium-promise-to-async',
+    id: 'refactor-arch-repository-layer',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our tRPC routers import the Drizzle client directly all over the place. Introduce a repository layer: define repository interfaces, implement them for the user and project routers first, update the wiring, and keep every procedure output identical. Set it up so the remaining routers can migrate incrementally.',
+      { messageCount: 21, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / migration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-migrate-async-await',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the .then()/.catch() promise chains in src/api/client.ts to async/await. There are about six methods. Preserve the existing error-handling semantics and return types exactly.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-drizzle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate our data layer from the legacy hand-written SQL query helpers spread across 30 files to Drizzle ORM, preserving every query result shape and transaction boundary. Plan the sequence so the app keeps passing tests at each step, then carry it out.',
+      { messageCount: 30, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-secrets-binding',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the gastown worker from plaintext vars in wrangler.jsonc to Cloudflare Secrets Store bindings for OPENROUTER_API_KEY and WEBHOOK_SIGNING_SECRET: add the secrets_store_secrets binding, update the env access in the code, and remove the plaintext values. These are live production credentials.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'high',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-oxlint',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the packages/encryption package from ESLint to oxlint to match the rest of the monorepo: add an .oxlintrc.json extending the root config, switch the lint script in its package.json, and remove the eslint devDependencies.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / architecture_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-arch-express-structure',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'For a small Express API with about 8 endpoints, what is a sensible folder structure for routes, controllers, and services? Just describe the layout, do not write code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-arch-export-responsibility',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We are adding CSV export to the reporting feature. Should it live in the existing ReportsService, which already handles querying and aggregation, or in a new ExportService? Export adds formatting and async delivery concerns. Recommend where the responsibility belongs and why — no code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-arch-dashboard-state',
     input: chat(
-      AGENT_TOOLS_SYSTEM,
-      'Convert the .then()/.catch() promise chains in src/api/client.ts to async/await. There are about six methods. Preserve the existing error-handling semantics and return types exactly.',
-      { messageCount: 6, hasTools: true }
+      CHAT_ASSISTANT_SYSTEM,
+      'Design the state-management structure for our React dashboard: we have server data via tRPC and React Query, local UI state, and filters that must survive page navigation. Propose which layer owns what (query cache vs a store vs URL params) and where the boundaries between them sit. Design only, I will implement it.',
+      { messageCount: 2, hasTools: false }
     ),
     expected: {
-      taskType: 'refactoring',
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
-      executionMode: 'code_change',
-      requiresTools: true,
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
     },
   },
   {
-    id: 'refactor-high-modularize-monolith',
+    id: 'plan-arch-cli-plugins',
     input: chat(
-      AGENT_TOOLS_SYSTEM,
-      'Our monolithic src/app.ts wires routing, auth, database access, and background jobs in one 1200-line file with tangled circular imports. Restructure it into clear modules with one-directional dependencies, without changing any external behavior or public routes. Decide the boundaries and migrate incrementally.',
-      { messageCount: 26, hasTools: true }
+      CHAT_ASSISTANT_SYSTEM,
+      'Design a plugin architecture for our internal CLI so other teams can ship commands without touching core: the plugin interface, discovery and loading, version compatibility between core and plugins, and which core APIs stay stable. There are about 40 commands today and three teams that want in. Architecture only — no implementation plan needed yet.',
+      { messageCount: 1, hasTools: false }
     ),
     expected: {
-      taskType: 'refactoring',
-      contextComplexity: 'large',
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'medium',
       reasoningComplexity: 'high',
-      executionMode: 'multi_step_project',
-      requiresTools: true,
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / technical_planning
+  // ---------------------------------------------------------------------------
   {
-    id: 'refactor-high-orm-migration',
+    id: 'plan-steps-optimistic-ui',
     input: chat(
-      AGENT_TOOLS_SYSTEM,
-      'Migrate our data layer from the legacy hand-written SQL query helpers spread across 30 files to Drizzle ORM, preserving every query result shape and transaction boundary. Plan the sequence so the app keeps passing tests at each step, then carry it out.',
-      { messageCount: 30, hasTools: true }
+      CHAT_ASSISTANT_SYSTEM,
+      'We want to add optimistic UI updates to our existing React + tRPC todo app. Break the work into an ordered implementation plan (state, mutation handling, rollback on error, tests). Just the plan, I will implement it.',
+      { messageCount: 1, hasTools: false }
     ),
     expected: {
-      taskType: 'refactoring',
-      contextComplexity: 'large',
-      reasoningComplexity: 'high',
-      executionMode: 'multi_step_project',
-      requiresTools: true,
+      taskType: 'planning_design',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
     },
   },
-
-  // ---------------------------------------------------------------------------
-  // planning_design (2 low, 2 medium, 2 high)
-  // ---------------------------------------------------------------------------
   {
-    id: 'plan-low-naming-choice',
+    id: 'plan-steps-node-upgrade',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
-      'I have a function that both validates and saves a user. What is a good single name for it, or should I split it? Just give me a recommendation, no code.',
+      'Give me an ordered checklist for upgrading our Express service from Node 20 to Node 22: what to verify beforehand, the upgrade steps, and how to validate after each step. Keep it to the sequence of steps — we already know the runtime differences barely affect our code.',
       { messageCount: 1, hasTools: false }
     ),
     expected: {
       taskType: 'planning_design',
+      subtaskType: 'technical_planning',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'plan-low-folder-structure',
+    id: 'plan-steps-user-module-cutover',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
-      'For a small Express API with about 8 endpoints, what is a sensible folder structure for routes, controllers, and services? Just describe the layout, do not write code.',
-      { messageCount: 1, hasTools: false }
+      'The target design is already approved: the user module moves from the PHP monolith to the new TypeScript service. Plan the cutover into shippable steps — sequencing, feature flags, data backfill order, verification gates, and rollback points for each step. Plan only, the architecture itself is settled.',
+      { messageCount: 3, hasTools: false }
     ),
     expected: {
       taskType: 'planning_design',
-      contextComplexity: 'small',
-      reasoningComplexity: 'low',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'plan-medium-caching-strategy',
+    id: 'plan-steps-flaky-ci-triage',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
-      'We have a read-heavy product catalog API hitting Postgres directly. Walk me through the tradeoffs of adding Redis caching vs HTTP cache headers vs a materialized view, and recommend one for a team of three with moderate traffic. No implementation yet.',
+      'Our CI is red on about 30% of runs due to flaky tests. Draft a triage plan: how to rank the worst offenders from CI history, a quarantine policy, the order to fix them in, and how to keep new flakes out. Just the plan — no test code.',
       { messageCount: 1, hasTools: false }
     ),
     expected: {
       taskType: 'planning_design',
+      subtaskType: 'technical_planning',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / system_design
+  // ---------------------------------------------------------------------------
   {
-    id: 'plan-medium-rollout-steps',
+    id: 'plan-system-catalog-caching',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
-      'We want to add optimistic UI updates to our existing React + tRPC todo app. Break the work into an ordered implementation plan (state, mutation handling, rollback on error, tests). Just the plan, I will implement it.',
+      'We have a read-heavy product catalog API hitting Postgres directly. Walk me through the tradeoffs of adding Redis caching vs HTTP cache headers vs a materialized view, and recommend one for a team of three with moderate traffic. No implementation yet.',
       { messageCount: 1, hasTools: false }
     ),
     expected: {
       taskType: 'planning_design',
+      subtaskType: 'system_design',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'plan-high-multitenant-architecture',
+    id: 'plan-system-multitenant',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
       'Design a multi-tenant architecture for our B2B SaaS. We need tenant isolation, per-tenant data residency (EU vs US), noisy-neighbor protection, and a path to enterprise single-tenant deployments later. Compare schema-per-tenant, row-level, and database-per-tenant, and recommend an approach with its failure modes. Design only.',
@@ -393,14 +866,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'planning_design',
+      subtaskType: 'system_design',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'plan-high-event-driven-migration',
+    id: 'plan-system-event-driven-orders',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
       'We run a synchronous request/response monolith and want to move order processing to an event-driven design with a message broker. Design the target architecture: event schema/versioning, idempotency, ordering guarantees, dead-letter handling, and how we cut over without downtime. Tradeoffs and a recommended broker, no code.',
@@ -408,18 +883,37 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'planning_design',
+      subtaskType: 'system_design',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-system-webhook-guarantees',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design the delivery contract for our outbound webhooks: retry schedule, idempotency keys, payload signing, ordering guarantees, and what we promise customers when their endpoint is down for hours. I want the contract and failure modes nailed down, not code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'system_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
 
   // ---------------------------------------------------------------------------
-  // investigation (2 low, 2 medium, 2 high)
+  // investigation / repo_exploration
   // ---------------------------------------------------------------------------
   {
-    id: 'invest-low-find-usage',
+    id: 'invest-repo-feature-flags',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Where in the codebase is the function getFeatureFlags defined and which files import it? Just tell me, do not change anything.',
@@ -427,14 +921,71 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-secrets-usage',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'List every worker service in this monorepo that uses secrets_store_secrets in its wrangler config, and flag any that still keep plaintext vars. Just report the list with file paths — change nothing.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-kiloclaw-todos',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Find all TODO and FIXME comments under services/kiloclaw and list them with file path and line number. Read-only, do not modify anything.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-lodash-audit',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Which packages in the monorepo still depend on lodash, and which lodash functions does each one actually import? I am assessing whether we can drop the dependency entirely. Report findings only.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'large',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: true,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // investigation / codebase_understanding
+  // ---------------------------------------------------------------------------
   {
-    id: 'invest-low-explain-function',
+    id: 'invest-code-cart-reducer',
     input: chat(
       AGENT_PLAIN_SYSTEM,
       'Explain what this reducer does, step by step. It handles ADD_ITEM, REMOVE_ITEM, and CLEAR_CART actions. I just want to understand the logic.',
@@ -442,14 +993,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: false,
     },
   },
   {
-    id: 'invest-medium-trace-auth-flow',
+    id: 'invest-code-login-flow',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Explain how a login request flows through our app from the /auth/login route to the session cookie being set. Cover the controller, the AuthService, and the session middleware. I want to understand it before changing anything.',
@@ -457,14 +1010,54 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-code-checkout-path',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Before we plan any optimization work, walk me through where time goes in our checkout path: the API handler, the database queries it runs, the cache lookups, and the synchronous third-party payment call. Explain which parts block the response and which are deferred. Understanding only — nothing is broken and nothing should change.',
+      { messageCount: 12, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-code-data-pipeline',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'We inherited an undocumented data pipeline spanning a cron service, three Lambda functions, an SQS queue, and a Redshift loader. Map out how data flows end to end, what each component assumes about the others, and where the implicit coupling and failure points are. Understanding only, no changes.',
+      { messageCount: 24, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: true,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // investigation / external_research
+  // ---------------------------------------------------------------------------
   {
-    id: 'invest-medium-research-sdk',
+    id: 'invest-ext-stripe-webhooks',
     input: chat(
       CHAT_ASSISTANT_SYSTEM,
       'Look up the current Stripe Node SDK and summarize how to verify a webhook signature and what the recommended way to handle idempotency keys is. I need to know the current recommended API before I write any code.',
@@ -472,63 +1065,163 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'investigation',
+      subtaskType: 'external_research',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: true,
     },
   },
   {
-    id: 'invest-high-perf-regression-analysis',
+    id: 'invest-ext-license-check',
     input: chat(
-      AGENT_TOOLS_SYSTEM,
-      'Our checkout p95 latency doubled over the last two weeks but no single deploy stands out. Investigate across the API, the database query patterns, the cache hit rates, and the third-party payment calls, and tell me the most likely contributors ranked by evidence. Do not fix anything yet, just analyze.',
-      { messageCount: 20, hasTools: true }
+      CHAT_ASSISTANT_SYSTEM,
+      'Check the current license of the fast-xml-parser npm package — the package page and its repository — and tell me whether we can use it in a commercial closed-source product. Report what the license actually says today, do not rely on memory.',
+      { messageCount: 1, hasTools: true }
     ),
     expected: {
       taskType: 'investigation',
-      contextComplexity: 'large',
-      reasoningComplexity: 'high',
+      subtaskType: 'external_research',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: true,
     },
   },
   {
-    id: 'invest-high-understand-legacy-pipeline',
+    id: 'invest-ext-wrangler-secrets',
     input: chat(
-      AGENT_TOOLS_SYSTEM,
-      'We inherited an undocumented data pipeline spanning a cron service, three Lambda functions, an SQS queue, and a Redshift loader. Map out how data flows end to end, what each component assumes about the others, and where the implicit coupling and failure points are. Understanding only, no changes.',
-      { messageCount: 24, hasTools: true }
+      CHAT_ASSISTANT_SYSTEM,
+      'Read the current Cloudflare Wrangler docs and summarize how wrangler secret put relates to the newer Secrets Store commands: which command writes where, and what the recommended setup for Workers is today. Current docs only — this changed recently.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'external_research',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-ext-llm-pricing',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Research current pricing and rate limits for the frontier model APIs we could route to — OpenRouter plus the major first-party providers — and compare the effective cost per million tokens for our traffic mix of 80% short completions and 20% long-context requests, with prompt caching factored in. Summarize with sources.',
+      { messageCount: 1, hasTools: true }
     ),
     expected: {
       taskType: 'investigation',
+      subtaskType: 'external_research',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'low',
       executionMode: 'answer_only',
       requiresTools: true,
     },
   },
 
   // ---------------------------------------------------------------------------
-  // agentic_execution (2 low, 2 medium, 2 high)
+  // agentic_execution / tool_usage
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-tool-pricing-toggle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Use the browser tool to open http://localhost:3000/pricing, verify the new annual-billing toggle switches the displayed prices, and take a screenshot of both states. Report what you see — do not change any code.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-flag-toggle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Use your HTTP tool to call the staging admin API: enable the newOnboarding feature flag for the qa-team tenant via POST /admin/flags, then GET it back to confirm it took effect. The admin token is in .env.staging.',
+      { messageCount: 5, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'medium',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-mobile-screenshots',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Open /dashboard, /settings, and /billing in the browser at 375px viewport width and take a screenshot of each. I need them to review the mobile layout — just capture and report, no code changes.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-signup-walkthrough',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Drive the browser through the signup flow on localhost:3000: fill the form with test+kilo@example.com, submit, enter the dev-mode verification code 000000, and confirm you land on the onboarding screen. Report the outcome of each step with a screenshot at the end.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / terminal_operations
   // ---------------------------------------------------------------------------
   {
-    id: 'agentic-low-run-tests',
+    id: 'agentic-term-run-tests',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Run the test suite with `pnpm test` and tell me if it passes.',
-      { messageCount: 2, hasTools: true }
+      {
+        messageCount: 2,
+        hasTools: true,
+      }
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'command_execution',
       requiresTools: true,
     },
   },
   {
-    id: 'agentic-low-check-git-status',
+    id: 'agentic-term-git-state',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Run git status and git log --oneline -5 and show me the output so I know what state this checkout is in.',
@@ -536,14 +1229,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
       contextComplexity: 'small',
       reasoningComplexity: 'low',
+      riskLevel: 'low',
       executionMode: 'command_execution',
       requiresTools: true,
     },
   },
   {
-    id: 'agentic-medium-start-dev-server',
+    id: 'agentic-term-dev-health',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Start the local dev environment with `pnpm dev`, wait for it to boot, then curl http://localhost:3000/health and report whether the service and its database connection are healthy.',
@@ -551,14 +1246,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'command_execution',
       requiresTools: true,
     },
   },
   {
-    id: 'agentic-medium-docker-logs',
+    id: 'agentic-term-api-container-logs',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'The api container keeps restarting. Run docker compose ps, then docker compose logs api --tail 100, identify which command in the logs is failing on boot, and report it back. Just diagnose via the commands, do not edit files.',
@@ -566,14 +1263,20 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
       contextComplexity: 'medium',
       reasoningComplexity: 'medium',
+      riskLevel: 'low',
       executionMode: 'command_execution',
       requiresTools: true,
     },
   },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / multi_step_execution
+  // ---------------------------------------------------------------------------
   {
-    id: 'agentic-high-release-pipeline',
+    id: 'agentic-multi-cut-release',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'Cut a release: bump the version, run the full build and test suite, build and push the multi-arch Docker image to our registry, tag the git commit, and verify the staging deploy comes up healthy. Stop and report if any step fails.',
@@ -581,14 +1284,16 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'high',
       executionMode: 'multi_step_project',
       requiresTools: true,
     },
   },
   {
-    id: 'agentic-high-recover-broken-env',
+    id: 'agentic-multi-env-recovery',
     input: chat(
       AGENT_TOOLS_SYSTEM,
       'My local environment is broken after a branch switch: migrations are out of sync, node_modules looks stale, and the worker will not start. Diagnose and recover it end to end by running the right commands in order, re-running checks after each fix, until pnpm dev comes up clean. Report what you changed.',
@@ -601,8 +1306,44 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
     ),
     expected: {
       taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
       contextComplexity: 'large',
       reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-multi-staging-deploy',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Deploy the notifications worker to staging: run its tests first, then wrangler deploy --env staging, tail the logs for a couple of minutes, hit the staging /health endpoint, and roll back to the previous version if anything looks wrong. Report each step.',
+      { messageCount: 11, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-multi-prod-backfill',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run the production backfill for the new display_name column: snapshot the database first, run scripts/backfill-display-name.ts against prod in batches of 1000, verify the updated row count matches the user count, and stop immediately and report if any batch errors. I will be watching — narrate each step.',
+      { messageCount: 14, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'high',
       executionMode: 'multi_step_project',
       requiresTools: true,
     },
diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts
index 3ea7abad76..3ed664ba15 100644
--- a/services/auto-routing-benchmark/src/grading.test.ts
+++ b/services/auto-routing-benchmark/src/grading.test.ts
@@ -10,8 +10,10 @@ import {
 
 const expected: ClassifierExpectation = {
   taskType: 'implementation',
+  subtaskType: 'code_generation',
   contextComplexity: 'small',
   reasoningComplexity: 'low',
+  riskLevel: 'low',
   executionMode: 'answer_only',
   requiresTools: false,
 };
@@ -35,21 +37,21 @@ describe('gradeClassifierOutput', () => {
     expect(gradeClassifierOutput(expected, actualFrom({}))).toBe(1);
   });
 
-  it('scores a taskType mismatch alone as 0.7', () => {
-    expect(gradeClassifierOutput(expected, actualFrom({ taskType: 'debugging' }))).toBe(0.7);
+  it('scores a taskType mismatch alone as 0.75', () => {
+    expect(gradeClassifierOutput(expected, actualFrom({ taskType: 'debugging' }))).toBe(0.75);
   });
 
   it('scores a requiresTools mismatch alone as 0.9', () => {
     expect(gradeClassifierOutput(expected, actualFrom({ requiresTools: true }))).toBe(0.9);
   });
 
-  it('ignores ungraded fields like subtaskType and riskLevel', () => {
+  it('scores a combined subtaskType and riskLevel mismatch as 0.85', () => {
     expect(
       gradeClassifierOutput(
         expected,
         actualFrom({ subtaskType: 'feature_development', riskLevel: 'high' })
       )
-    ).toBe(1);
+    ).toBe(0.85);
   });
 });
 
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
index 6a489be73e..0661e3ac4b 100644
--- a/services/auto-routing-benchmark/src/grading.ts
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -1,21 +1,26 @@
 import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
 
-// Golden labels grade the axes the decision engine actually consumes.
-// subtaskType is intentionally ungraded (high label ambiguity, unused by
-// deriveDifficultyTier); riskLevel likewise; requiresTools gets a small weight.
+// Golden labels grade every classifier field except confidence. subtaskType
+// is worth less than taskType: a wrong subtype under the right type is a near
+// miss. riskLevel gets a small weight matching its small influence on tier
+// derivation.
 export type ClassifierExpectation = {
   taskType: ClassifierOutput['taskType'];
+  subtaskType: ClassifierOutput['subtaskType'];
   contextComplexity: ClassifierOutput['contextComplexity'];
   reasoningComplexity: ClassifierOutput['reasoningComplexity'];
+  riskLevel: ClassifierOutput['riskLevel'];
   executionMode: ClassifierOutput['executionMode'];
   requiresTools: boolean;
 };
 
 export const CLASSIFIER_FIELD_WEIGHTS: Record<keyof ClassifierExpectation, number> = {
-  taskType: 0.3,
-  reasoningComplexity: 0.25,
+  taskType: 0.25,
+  subtaskType: 0.1,
+  reasoningComplexity: 0.2,
   contextComplexity: 0.15,
-  executionMode: 0.2,
+  executionMode: 0.15,
+  riskLevel: 0.05,
   requiresTools: 0.1,
 };
 

From ae707f33e82d40f431b02249bd06f04f49446ab2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 18:06:51 +0200
Subject: [PATCH 57/73] feat(auto-routing-benchmark): expand decider dataset to
 per-pair taxonomy coverage

Grow the decider benchmark from 30 to 76 cases so every
(taskType, subtaskType) pair in the classifier taxonomy has at least
4 mechanically-checkable cases, with at least 20 cases per difficulty
tier (23 low / 31 medium / 22 high).

- DeciderCase gains subtaskType; ids follow the abbreviated
  <taskType>-<subtype>-<topic> scheme used by the classifier dataset
  (e.g. 'impl-gen-squares-array')
- Existing cases retagged with subtypes where they genuinely fit
  (three system-behavior investigation cases moved to
  planning_design/system_design, the HTTP 201 lookup to
  investigation/external_research, and the let-closure case reframed
  as refactoring/migration)
- New agentic_execution cases are self-contained file/terminal tasks
  deterministic in the node:22-slim container
- Tests now enforce per-pair and per-tier quotas from the
  classifierTaxonomy export, subtype/taskType consistency, regex
  compilability, and json_equal round-tripping
---
 .../src/datasets/decider-cases.test.ts        |  44 +-
 .../src/datasets/decider-cases.ts             | 827 +++++++++++++++---
 2 files changed, 743 insertions(+), 128 deletions(-)

diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
index 824a049293..1fb02e8de4 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -1,16 +1,52 @@
 import { describe, expect, it } from 'vitest';
+import { classifierTaxonomy } from '@kilocode/auto-routing-contracts/classifier';
 import { DECIDER_CASES } from './decider-cases';
 
+const TAXONOMY_PAIRS = classifierTaxonomy.taskTypes.flatMap(taskType =>
+  taskType.subtypes.map(subtype => ({ taskType: taskType.id, subtaskType: subtype.id }))
+);
+
+const SUBTYPES_BY_TASK_TYPE = new Map(
+  classifierTaxonomy.taskTypes.map(taskType => [
+    taskType.id,
+    new Set(taskType.subtypes.map(subtype => subtype.id)),
+  ])
+);
+
 describe('DECIDER_CASES', () => {
-  it('has exactly 30 cases with unique ids', () => {
-    expect(DECIDER_CASES.length).toBe(30);
+  it('covers all 18 taxonomy pairs', () => {
+    expect(TAXONOMY_PAIRS.length).toBe(18);
+  });
+
+  it('has exactly 76 cases with unique ids', () => {
+    expect(DECIDER_CASES.length).toBe(76);
     const ids = new Set(DECIDER_CASES.map(c => c.id));
     expect(ids.size).toBe(DECIDER_CASES.length);
   });
 
-  it('has exactly 10 cases per tier', () => {
+  it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+    for (const pair of TAXONOMY_PAIRS) {
+      const count = DECIDER_CASES.filter(
+        c => c.taskType === pair.taskType && c.subtaskType === pair.subtaskType
+      ).length;
+      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('labels every case with a subtaskType that belongs to its taskType', () => {
+    for (const c of DECIDER_CASES) {
+      const subtypes = SUBTYPES_BY_TASK_TYPE.get(c.taskType);
+      expect(subtypes, `unknown taskType in case ${c.id}`).toBeDefined();
+      expect(
+        subtypes?.has(c.subtaskType),
+        `case ${c.id}: ${c.subtaskType} does not belong to ${c.taskType}`
+      ).toBe(true);
+    }
+  });
+
+  it('has at least 20 cases per tier', () => {
     for (const tier of ['low', 'medium', 'high'] as const) {
-      expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBe(10);
+      expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBeGreaterThanOrEqual(20);
     }
   });
 
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index 31ba56e17e..745cd423f4 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -1,10 +1,15 @@
-import type { ClassifierTaskType, DifficultyTier } from '@kilocode/auto-routing-contracts';
+import type {
+  ClassifierSubtaskType,
+  ClassifierTaskType,
+  DifficultyTier,
+} from '@kilocode/auto-routing-contracts';
 import type { DeciderCheck } from '../grading';
 
 export type DeciderCase = {
-  id: string;
+  id: string; // stable slug, e.g. 'impl-gen-squares-array' (<taskType>-<subtype>-<topic>)
   tier: DifficultyTier;
   taskType: ClassifierTaskType;
+  subtaskType: ClassifierSubtaskType;
   systemPrompt: string;
   userPrompt: string;
   check: DeciderCheck;
@@ -14,287 +19,861 @@ const CODE_SYS =
   'You are a precise coding assistant. Answer with only what is asked, no explanations.';
 const SYS_SYS =
   'You are a precise systems engineer. Answer with only what is asked, no explanations.';
+const AGENT_SYS =
+  'You are a precise coding agent with file and terminal tools available. Complete the task exactly as specified, then answer with only what is asked, no explanations.';
 
-// Golden answers below were each worked through by hand. Every case has a
-// single unambiguous, mechanically-checkable answer. Checks tolerate
-// formatting noise (fences/case/whitespace) but never wrong values. For
-// json_equal cases the prompt pins the exact key set in the same order as the
-// expected value (the comparison is JSON.stringify-based and order-sensitive).
+// Golden answers below were each worked through by hand (and re-verified
+// mechanically where a snippet could be executed). Every case has a single
+// unambiguous, mechanically-checkable answer. Checks tolerate formatting
+// noise (fences/case/whitespace) but never wrong values. For json_equal cases
+// the prompt pins the exact key set in the same order as the expected value
+// (the comparison is JSON.stringify-based and order-sensitive). Each case
+// carries exactly one difficulty tier: low = mechanical lookups / trivial
+// evaluation, medium = multi-step reasoning / off-by-one traps / spec
+// application, high = deep tracing / multi-constraint puzzles / subtle
+// semantics. agentic_execution cases are self-contained tasks performed with
+// file/terminal tools inside the benchmark container (node:22-slim, no repo,
+// no network) and every command involved is deterministic there.
 export const DECIDER_CASES: readonly DeciderCase[] = [
-  // ---------------- LOW (mechanical lookups / trivial evaluation) ----------------
+  // ---------------------------------------------------------------------------
+  // implementation / feature_development
+  // ---------------------------------------------------------------------------
   {
-    id: 'low-impl-array-pipeline',
+    id: 'impl-feat-ternary-parity',
     tier: 'low',
     taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");',
+    check: { kind: 'exact', value: 'odd' },
+  },
+  {
+    id: 'impl-feat-array-pipeline',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with the exact output line only.\n\nconst xs = [1, 2, 3, 4].filter(x => x % 2 === 0).map(x => x * 10);\nconsole.log(xs.join("-"));',
     check: { kind: 'exact', value: '20-40' },
   },
   {
-    id: 'low-impl-sort-numeric',
+    id: 'impl-feat-closure-counter',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What is the final printed value? Answer with only the number.\n\nfunction make() {\n  let c = 0;\n  return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'impl-feat-recursion-fib',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.',
+    check: { kind: 'exact', value: '13' },
+  },
+  {
+    id: 'impl-feat-this-binding',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst obj = {\n  v: 10,\n  get() {\n    return [1, 2].map(function () {\n      return this?.v ?? 0;\n    }).reduce((a, b) => a + b, 0);\n  },\n};\nconsole.log(obj.get());',
+    check: { kind: 'exact', value: '0' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / code_generation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-gen-package-manifest',
     tier: 'low',
     taskType: 'implementation',
+    subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log([5, 3, 8, 1].sort((a, b) => a - b).join(","));',
-    check: { kind: 'exact', value: '1,3,5,8' },
+      'Generate a minimal package manifest. Reply with only a JSON object with exactly the keys "name" and "version" in that order, where name is "demo-app" and version is "1.2.3".',
+    check: { kind: 'json_equal', value: { name: 'demo-app', version: '1.2.3' } },
   },
   {
-    id: 'low-impl-string-upper',
+    id: 'impl-gen-squares-array',
     tier: 'low',
     taskType: 'implementation',
+    subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log("hello".toUpperCase());',
-    check: { kind: 'exact', value: 'HELLO' },
+      'Generate a test fixture: a JSON array containing the squares of the integers 1 through 6, in increasing order. Reply with only the JSON array.',
+    check: { kind: 'json_equal', value: [1, 4, 9, 16, 25, 36] },
+  },
+  {
+    id: 'impl-gen-no-consecutive-ones',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture: a JSON array of all binary strings of length 3 that contain no two consecutive 1s, in lexicographic order, each string as a JSON string. Reply with only the JSON array.',
+    check: { kind: 'json_equal', value: ['000', '001', '010', '100', '101'] },
   },
   {
-    id: 'low-impl-ternary-parity',
+    id: 'impl-gen-two-ones-strings',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture. Reply with only a JSON object with exactly the keys "count" and "strings" in that order, where strings is the JSON array of all binary strings of length 4 containing exactly two 1s, in lexicographic order, each as a JSON string, and count is the length of that array.',
+    check: {
+      kind: 'json_equal',
+      value: { count: 6, strings: ['0011', '0101', '0110', '1001', '1010', '1100'] },
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / test_creation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-test-sort-expectation',
     tier: 'low',
     taskType: 'implementation',
+    subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");',
-    check: { kind: 'exact', value: 'odd' },
+      'You are writing a unit test. What value makes this assertion pass? Answer with the exact string only.\n\nexpect([5, 3, 8, 1].sort((a, b) => a - b).join(",")).toBe(?)',
+    check: { kind: 'exact', value: '1,3,5,8' },
   },
   {
-    id: 'low-debug-compound-assign',
+    id: 'impl-test-upper-expectation',
     tier: 'low',
-    taskType: 'debugging',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What is the final value printed? Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);',
-    check: { kind: 'exact', value: '30' },
+      'You are writing a unit test. What value makes this assertion pass? Answer with the exact string only.\n\nexpect("hello".toUpperCase()).toBe(?)',
+    check: { kind: 'exact', value: 'HELLO' },
+  },
+  {
+    id: 'impl-test-mock-call-count',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test with a mock:\n\nconst fn = vi.fn(x => x * 2);\nconst wrapped = x => fn(x) + fn(x);\nwrapped(3);\nwrapped(4);\nexpect(fn).toHaveBeenCalledTimes(?)\n\nWhat number makes the assertion pass? Answer with only the number.',
+    check: { kind: 'exact', value: '4' },
   },
   {
-    id: 'low-debug-parseint-suffix',
+    id: 'impl-test-trailing-zeros',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are adding a test for a function trailingZeros(n) that returns the number of trailing zero digits of n! (n factorial). What expected value should the test assert for trailingZeros(25)? Answer with only the number.',
+    check: { kind: 'exact', value: '6' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / bug_fixing
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-fix-parseint-suffix',
     tier: 'low',
     taskType: 'debugging',
+    subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this JavaScript print? Answer with only the number.\n\nconsole.log(parseInt("42px", 10));',
     check: { kind: 'exact', value: '42' },
   },
   {
-    id: 'low-investigation-char-count',
-    tier: 'low',
-    taskType: 'investigation',
+    id: 'debug-fix-binary-search',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'How many times does the letter "a" appear in the word "banana"? Answer with only the number.',
-    check: { kind: 'exact', value: '3' },
+      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed, keeping single spaces around operators>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
+    check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
   },
   {
-    id: 'low-investigation-object-keys',
-    tier: 'low',
-    taskType: 'investigation',
+    id: 'debug-fix-pagination-slice',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };',
-    check: { kind: 'exact', value: '3' },
+      'This pagination helper is buggy: pages([1, 2, 3, 4, 5, 6, 7], 3) should return [[1,2,3],[4,5,6],[7]] but loses elements. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed, keeping single spaces around operators>"}.\n\n1: function pages(xs, size) {\n2:   const out = [];\n3:   for (let i = 0; i < xs.length; i += size) {\n4:     out.push(xs.slice(i, size));\n5:   }\n6:   return out;\n7: }',
+    check: { kind: 'json_equal', value: { line: 4, fix: 'out.push(xs.slice(i, i + size));' } },
   },
   {
-    id: 'low-planning-http-created',
-    tier: 'low',
-    taskType: 'planning_design',
-    systemPrompt:
-      'You are a precise web API expert. Answer with only what is asked, no explanations.',
+    id: 'debug-fix-regex-lastindex',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
     userPrompt:
-      'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
-    check: { kind: 'exact', value: '201' },
+      'A validator misbehaves on its second call because of a stateful regex bug. What does this print? Answer with only the two words printed, separated by a single space.\n\nconst re = /a/g;\nconsole.log(re.test("abc"), re.test("abc"));',
+    check: { kind: 'exact', value: 'true false' },
   },
+
+  // ---------------------------------------------------------------------------
+  // debugging / test_repair
+  // ---------------------------------------------------------------------------
   {
-    id: 'low-refactoring-reduce-sum',
+    id: 'debug-repair-compound-assign',
     tier: 'low',
-    taskType: 'refactoring',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'A loop sums an array. What value does it produce? Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);',
-    check: { kind: 'exact', value: '12' },
+      'A unit test asserts that this program prints 25, and the test fails. The code is correct; the expectation is stale. What value should the updated test expect? Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);',
+    check: { kind: 'exact', value: '30' },
   },
-
-  // ---------------- MEDIUM (multi-step reasoning, off-by-one, spec application) -------------
   {
-    id: 'medium-debug-off-by-one',
+    id: 'debug-repair-date-format',
     tier: 'medium',
     taskType: 'debugging',
+    subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
-    check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
+      'A snapshot test fails after a date-formatter fix. The formatter now emits dates as zero-padded YYYY-MM-DD. What exact string should the updated snapshot expect for June 1, 2026? Answer with only the date string.',
+    check: { kind: 'exact', value: '2026-06-01' },
   },
   {
-    id: 'medium-impl-reduce-trace',
+    id: 'debug-repair-entries-shape',
     tier: 'medium',
-    taskType: 'implementation',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this print? Answer with only the number.\n\nconst r = [1, 2, 3, 4].reduce((acc, x) => acc + x * x, 0);\nconsole.log(r);',
-    check: { kind: 'exact', value: '30' },
+      'A test broke because a refactor changed a function to return Object.entries(obj) instead of obj. For obj = {a: 1, b: 2} (keys in that insertion order), what is the new return value? Reply with only that value as JSON (an array of [key, value] pairs in insertion order).',
+    check: {
+      kind: 'json_equal',
+      value: [
+        ['a', 1],
+        ['b', 2],
+      ],
+    },
   },
   {
-    id: 'medium-impl-closure-counter',
-    tier: 'medium',
-    taskType: 'implementation',
+    id: 'debug-repair-float-sum',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What is the final printed value? Answer with only the number.\n\nfunction make() {\n  let c = 0;\n  return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());',
-    check: { kind: 'exact', value: '3' },
+      'A failing test asserts expect(0.1 + 0.2).toBe(0.3). The repair pins the actual IEEE-754 value. What does console.log(0.1 + 0.2) print in JavaScript? Answer with the exact printed number only.',
+    check: { kind: 'exact', value: '0.30000000000000004' },
   },
+
+  // ---------------------------------------------------------------------------
+  // debugging / root_cause_analysis
+  // ---------------------------------------------------------------------------
   {
-    id: 'medium-debug-async-order',
+    id: 'debug-rca-async-order',
     tier: 'medium',
     taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
     userPrompt:
       'What does this program print, in order? Answer with the four uppercase letters joined by commas, e.g. "A,B,C,D".\n\nconsole.log("A");\nPromise.resolve().then(() => console.log("B"));\nsetTimeout(() => console.log("C"), 0);\nconsole.log("D");',
     check: { kind: 'regex', pattern: '^\\s*A\\s*,\\s*D\\s*,\\s*B\\s*,\\s*C\\s*$', flags: 'im' },
   },
   {
-    id: 'medium-impl-map-set-dedup',
+    id: 'debug-rca-shared-ref',
     tier: 'medium',
-    taskType: 'implementation',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What is the size of the resulting Set? Answer with only the number.\n\nconst s = new Set([1, 2, 2, 3, 3, 3, 4]);\nconsole.log(s.size);',
+      'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);',
     check: { kind: 'exact', value: '4' },
   },
   {
-    id: 'medium-investigation-regex-groups',
+    id: 'debug-rca-closure-loop-var',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' },
+  },
+  {
+    id: 'debug-rca-float-equality',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.',
+    check: { kind: 'exact', value: 'false' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / code_cleanup
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-cleanup-loop-to-reduce',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop sums an array. What value does it produce? Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);',
+    check: { kind: 'exact', value: '12' },
+  },
+  {
+    id: 'refactor-cleanup-extract-helper',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Two branches both compute s.trim().toLowerCase(), so you extract a helper norm(s) that does exactly that. What does norm("  HeLLo ") return? Answer with the exact string only.',
+    check: { kind: 'exact', value: 'hello' },
+  },
+  {
+    id: 'refactor-cleanup-map-equivalent',
     tier: 'medium',
-    taskType: 'investigation',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.',
-    check: { kind: 'exact', value: '06' },
+      'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);',
+    check: { kind: 'exact', value: '40' },
+  },
+  {
+    id: 'refactor-cleanup-short-circuit',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n  calls++;\n  return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);',
+    check: { kind: 'exact', value: '2' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / architecture_improvement
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-arch-import-updates',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Files x.ts, y.ts, and z.ts each contain exactly one import of helper.ts. helper.ts moves to a new directory, changing its import path. How many import statements must be updated? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'medium-impl-recursion-fib',
+    id: 'refactor-arch-layer-depth',
     tier: 'medium',
-    taskType: 'implementation',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.',
-    check: { kind: 'exact', value: '13' },
+      "Modules and their imports: app imports auth and billing; auth imports core; billing imports core; core imports nothing. In a layered architecture where a module's layer is 1 + the maximum layer of its imports, and core is layer 1, what layer is app? Answer with only the number.",
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'medium-debug-mutation-shared-ref',
+    id: 'refactor-arch-interface-edges',
     tier: 'medium',
-    taskType: 'debugging',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);',
-    check: { kind: 'exact', value: '4' },
+      'A module graph has edges A->B, A->C, B->D, C->D. To improve the architecture you introduce an interface module I: the edges B->D and C->D are removed and replaced by B->I, C->I, and I->D. How many edges does the new graph have? Answer with only the number.',
+    check: { kind: 'exact', value: '5' },
+  },
+  {
+    id: 'refactor-arch-cycle-cut',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A module graph has directed import edges A->B, B->C, C->A, B->D, D->B, D->E. You must make the graph acyclic by deleting the minimum number of import edges. Reply with JSON {"deleted": <minimum number of edges to delete>, "remaining": <number of edges left after deleting them>}.',
+    check: { kind: 'json_equal', value: { deleted: 2, remaining: 4 } },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / migration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-migrate-substr-slice',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are migrating code off the deprecated String.prototype.substr. The old call is "javascript".substr(4, 3). What string does the equivalent migrated call "javascript".slice(4, 7) return? Answer with the exact string only.',
+    check: { kind: 'exact', value: 'scr' },
+  },
+  {
+    id: 'refactor-migrate-promise-chain',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'After migrating a callback API to promises, the code reads:\n\nPromise.resolve(2).then(x => x + 1).then(x => x * 10).then(x => console.log(x));\n\nWhat number does it print? Answer with only the number.',
+    check: { kind: 'exact', value: '30' },
+  },
+  {
+    id: 'refactor-migrate-strict-equality',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are migrating a codebase from == to ===. How many of these four comparisons change their result after replacing == with ===?\n\n"1" == 1\nnull == undefined\n2 == 2\nNaN == NaN\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'refactor-migrate-var-to-let',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop variable was migrated from var to let. What does the migrated code print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / architecture_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-arch-three-tier',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'In a classic three-tier architecture with presentation, business, and data tiers, which tier should contain the SQL queries? Answer with only one word: presentation, business, or data.',
+    check: { kind: 'exact', value: 'data' },
   },
   {
-    id: 'medium-planning-rate-limit-window',
+    id: 'plan-arch-call-chain',
     tier: 'medium',
     taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A service design has these synchronous call edges: gateway calls auth and orders; orders calls inventory and billing; billing calls ledger. Counting edges, how long is the longest call chain starting at gateway? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'plan-arch-dependency-rules',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A layered design enforces these rules: ui may import only app; app may import domain and infra; infra may import domain; domain imports nothing. How many of these five proposed imports violate the rules?\n\nui -> app\nui -> domain\napp -> domain\ninfra -> app\ndomain -> infra\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'plan-arch-latency-budget',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A design must keep worst-case request latency within a 300 ms budget. The synchronous chain is gateway (10 ms) -> auth (40 ms) -> service (120 ms) -> db (90 ms), and in the worst case the db call is retried once (the db is called twice; all other components run once). Reply with JSON {"totalMs": <worst-case total latency in ms>, "withinBudget": <true|false>}.',
+    check: { kind: 'json_equal', value: { totalMs: 350, withinBudget: false } },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / technical_planning
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-steps-rollout-order',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A rollout plan has four steps in strict sequence: write code, code review, deploy to staging, deploy to production. Which step is third? Answer with only the exact step name.',
+    check: { kind: 'exact', value: 'deploy to staging' },
+  },
+  {
+    id: 'plan-steps-batch-count',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A data migration plan processes 1000 records in batches of up to 80 records, one batch per run. How many runs does the plan need to process all records? Answer with only the number.',
+    check: { kind: 'exact', value: '13' },
+  },
+  {
+    id: 'plan-steps-deploy-waves',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Services A, B, C, D deploy in waves: a service can only deploy after all its dependencies are deployed, and any number of services can share a wave. Dependencies: B needs A; C needs A; D needs B and C. Reply with JSON {"waves": <minimum number of waves>, "dWave": <1-based wave in which D deploys>}.',
+    check: { kind: 'json_equal', value: { waves: 3, dWave: 3 } },
+  },
+  {
+    id: 'plan-steps-critical-path',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A plan has tasks with durations in days and dependencies: A (3 days) has no dependencies; B (2 days) starts after A; C (4 days) starts after A; D (1 day) starts after both B and C; E (2 days) starts after D. With unlimited parallelism, what is the minimum number of days to finish all tasks? Answer with only the number.',
+    check: { kind: 'exact', value: '10' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / system_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-system-write-quorum',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A system replicates each write to 3 nodes and requires a majority quorum of acknowledgements before confirming the write. How many node acknowledgements are required? Answer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'plan-system-rate-limit-window',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
     userPrompt:
       'A fixed-window rate limiter allows 100 requests per 60-second window. A client sends 80 requests in the first 30 seconds of a window, then 40 more requests in the next 20 seconds (same window). How many of the 40 later requests are rejected? Answer with only the number.',
     check: { kind: 'exact', value: '20' },
   },
   {
-    id: 'medium-refactoring-equivalent-output',
+    id: 'plan-system-replica-availability',
     tier: 'medium',
-    taskType: 'refactoring',
-    systemPrompt: CODE_SYS,
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
     userPrompt:
-      'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);',
-    check: { kind: 'exact', value: '40' },
+      'A service is available when at least one of its two independent replicas is up. Each replica is up 90% of the time, independently. What is the service availability as a percentage? Answer with only the number.',
+    check: { kind: 'exact', value: '99' },
   },
-
-  // ---------------- HIGH (deep multi-constraint reasoning, subtle semantics) -------------
   {
-    id: 'high-investigation-queue-trace',
+    id: 'plan-system-cache-staleness',
     tier: 'high',
-    taskType: 'investigation',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
     userPrompt:
-      'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms before ack. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": <number>, "winner": "<A|B|C>"}.',
+      'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": <number>, "second": <number>}.',
+    check: { kind: 'json_equal', value: { first: 1, second: 2 } },
+  },
+  {
+    id: 'plan-system-queue-trace',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms, before performing the insert and before ack. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": <number>, "winner": "<A|B|C>"}.',
     check: { kind: 'json_equal', value: { rows: 1, winner: 'B' } },
   },
   {
-    id: 'high-debug-closure-loop-var',
+    id: 'plan-system-deadlock-order',
     tier: 'high',
-    taskType: 'debugging',
-    systemPrompt: CODE_SYS,
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
     userPrompt:
-      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
-    check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' },
+      'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.',
+    check: { kind: 'exact', value: '2' },
   },
   {
-    id: 'high-debug-closure-let-var',
+    id: 'plan-system-txn-isolation',
     tier: 'high',
-    taskType: 'debugging',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.',
+    check: { kind: 'exact', value: '6' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / repo_exploration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-repo-test-file-count',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
-    check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' },
+      'A repository listing shows these files:\n\nsrc/app.ts\nsrc/app.test.ts\nsrc/util.ts\nsrc/util.test.ts\nsrc/index.ts\nREADME.md\n\nHow many files end in .test.ts? Answer with only the number.',
+    check: { kind: 'exact', value: '2' },
   },
   {
-    id: 'high-impl-this-binding',
-    tier: 'high',
-    taskType: 'implementation',
+    id: 'invest-repo-glob-match',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this print? Answer with only the number.\n\nconst obj = {\n  v: 10,\n  get() {\n    return [1, 2].map(function () {\n      return this?.v ?? 0;\n    }).reduce((a, b) => a + b, 0);\n  },\n};\nconsole.log(obj.get());',
-    check: { kind: 'exact', value: '0' },
+      'Using a glob where ** matches zero or more directories, how many of these files match the pattern src/**/*.ts?\n\nsrc/a.ts\nsrc/lib/b.ts\nsrc/lib/deep/c.ts\ntest/d.ts\nsrc/e.tsx\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'high-investigation-deadlock-order',
-    tier: 'high',
+    id: 'invest-repo-grep-case',
+    tier: 'medium',
     taskType: 'investigation',
-    systemPrompt: SYS_SYS,
+    subtaskType: 'repo_exploration',
+    systemPrompt: CODE_SYS,
     userPrompt:
-      'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.',
+      'A file contains exactly these 5 lines:\n\nError: failed\nerror handled\nno problems\nERROR_CODE=7\nerrors: none\n\nHow many lines does a case-sensitive search for the string "error" match? Answer with only the number.',
     check: { kind: 'exact', value: '2' },
   },
   {
-    id: 'high-debug-float-equality',
+    id: 'invest-repo-gitignore',
     tier: 'high',
-    taskType: 'debugging',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.',
-    check: { kind: 'exact', value: 'false' },
+      'A .gitignore contains exactly these rules in order:\n\n*.log\n!important.log\nlogs/\n\nUsing standard git semantics (a pattern without a slash matches at any depth, and a file cannot be re-included if a parent directory of it is excluded), how many of these files are ignored?\n\ndebug.log\nimportant.log\nlogs/important.log\nlogs/app.txt\nsrc/trace.log\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '4' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / codebase_understanding
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-code-char-count',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many times does the letter "a" appear in the word "banana"? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'high-investigation-txn-isolation',
+    id: 'invest-code-object-keys',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'invest-code-regex-groups',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.',
+    check: { kind: 'exact', value: '06' },
+  },
+  {
+    id: 'invest-code-collatz-depth',
     tier: 'high',
     taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are reading unfamiliar code. What does f(6) return?\n\nfunction f(n) {\n  if (n <= 1) return n;\n  return n % 2 === 0 ? f(n / 2) + 1 : f(3 * n + 1);\n}\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '7' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / external_research
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-ext-http-created',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt:
+      'You are a precise web API expert. Answer with only what is asked, no explanations.',
+    userPrompt:
+      'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
+    check: { kind: 'exact', value: '201' },
+  },
+  {
+    id: 'invest-ext-utf8-euro',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
     systemPrompt: SYS_SYS,
     userPrompt:
-      'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.',
-    check: { kind: 'exact', value: '6' },
+      'Per the UTF-8 encoding specification, how many bytes does the encoding of the euro sign (U+20AC) use? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'high-impl-generator-trace',
+    id: 'invest-ext-semver-caret',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Per the npm semver range specification, consider the range ^1.4.2. Does it include version 1.5.0, and does it include version 2.0.0? Reply with JSON {"v150": <true|false>, "v200": <true|false>}.',
+    check: { kind: 'json_equal', value: { v150: true, v200: false } },
+  },
+  {
+    id: 'invest-ext-json-spec',
     tier: 'high',
-    taskType: 'implementation',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
     systemPrompt: CODE_SYS,
     userPrompt:
-      'What does this print? Answer with the values joined by commas, e.g. "1,2,3".\n\nfunction* g() {\n  yield 1;\n  yield* [2, 3];\n  yield 4;\n}\nconsole.log([...g()].join(","));',
-    check: { kind: 'regex', pattern: '^\\s*1\\s*,\\s*2\\s*,\\s*3\\s*,\\s*4\\s*$', flags: 'm' },
+      'Per the JSON specification (RFC 8259), how many of these four documents are valid JSON?\n\n{"a": 01}\n{"a": 1,}\n{"a": .5}\n{"a": 1e2}\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '1' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / tool_usage
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-tool-json-read',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Using your file tools, create a file /tmp/bench-kv.json containing exactly this JSON: {"alpha": 4, "beta": 9}. Then read the file back and answer with only the value of the key "beta".',
+    check: { kind: 'exact', value: '9' },
+  },
+  {
+    id: 'agentic-tool-notes-count',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a directory /tmp/bench-notes containing exactly three files named one.txt, two.txt, and three.txt (any content). Then list the directory and answer with only the number of files it contains.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'agentic-tool-log-grep',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-app.log containing exactly these 6 lines:\n\nINFO start\nERROR disk full\nINFO retry\nERROR timeout\nWARN slow\nERROR disk full\n\nThen search the file and answer with only the number of lines that contain the word ERROR.',
+    check: { kind: 'exact', value: '3' },
   },
   {
-    id: 'high-planning-cache-invalidation',
+    id: 'agentic-tool-csv-filter-sum',
     tier: 'high',
-    taskType: 'planning_design',
-    systemPrompt: SYS_SYS,
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
     userPrompt:
-      'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": <number>, "second": <number>}.',
-    check: { kind: 'json_equal', value: { first: 1, second: 2 } },
+      'Create a file /tmp/bench-data.csv containing exactly these 6 lines:\n\nid,qty\na,12\nb,7\ne,31\no,50\nk,9\n\nThen compute the sum of the qty column over only the rows whose id is a vowel (a, e, i, o, or u), and answer with only the number.',
+    check: { kind: 'exact', value: '93' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / terminal_operations
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-term-node-major',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Run this command in the terminal and answer with only the number it prints:\n\nnode -e "console.log(process.versions.node.split(\'.\')[0])"',
+    check: { kind: 'exact', value: '22' },
+  },
+  {
+    id: 'agentic-term-wc-lines',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Using the terminal, write a file /tmp/bench-words.txt containing exactly these 5 lines:\n\nred\ngreen\nblue\ncyan\nplum\n\nThen run: wc -l < /tmp/bench-words.txt and answer with only the number it prints.',
+    check: { kind: 'exact', value: '5' },
   },
   {
-    id: 'high-refactoring-short-circuit',
+    id: 'agentic-term-sort-pipeline',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      "Run this pipeline in the terminal and answer with only the line it prints:\n\nprintf 'pear\\napple\\nbanana\\n' | sort | head -n 1",
+    check: { kind: 'exact', value: 'apple' },
+  },
+  {
+    id: 'agentic-term-sha256-prefix',
     tier: 'high',
-    taskType: 'refactoring',
-    systemPrompt: CODE_SYS,
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
     userPrompt:
-      'What does this print? Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n  calls++;\n  return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);',
-    check: { kind: 'exact', value: '2' },
+      "Run this command in the terminal and answer with only the 8 characters it prints:\n\nnode -e \"console.log(require('crypto').createHash('sha256').update('kilo-benchmark').digest('hex').slice(0, 8))\"",
+    check: { kind: 'exact', value: 'fd99e6a4' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / multi_step_execution
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-multi-seq-sum',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-seq.txt containing the integers 1 through 10, one per line. Then use a terminal command to sum the lines and answer with only the sum.',
+    check: { kind: 'exact', value: '55' },
+  },
+  {
+    id: 'agentic-multi-node-script',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Write a file /tmp/bench-fib.js containing a Node.js script that computes f(12) for the sequence f(1) = 1, f(2) = 1, f(n) = f(n-1) + f(n-2), and prints the result. Run it with node and answer with only the number it prints.',
+    check: { kind: 'exact', value: '144' },
+  },
+  {
+    id: 'agentic-multi-find-count',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      "Create directories /tmp/bench-proj/src and /tmp/bench-proj/test. Create empty files /tmp/bench-proj/src/a.ts, /tmp/bench-proj/src/b.ts, and /tmp/bench-proj/test/a.test.ts. Then run:\n\nfind /tmp/bench-proj -name '*.ts' | wc -l\n\nand answer with only the number it prints.",
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'agentic-multi-json-transform',
+    tier: 'high',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-in.json containing exactly this JSON array: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3]. Then write and run a Node.js script that reads the file, computes the sum of the distinct values in the array, and prints it. Answer with only the number.',
+    check: { kind: 'exact', value: '30' },
   },
 ];

From adb49f527b5487ab4a6d44beae3ca0a26df38f4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 18:24:28 +0200
Subject: [PATCH 58/73] feat(auto-routing): session-sticky decisions with
 switch-cost factor

Remember the last served model per conversation in the decision-cache DO
and keep it while it meets the current tier's accuracy threshold, unless
the fresh pick is cheaper by more than the routing table's new
switchCostFactor. Switching models discards provider prompt caches, so
stickiness keeps a session whose difficulty tier oscillates from
ping-ponging between models. Decisions report a sticky flag in the
response and the auto_routing_decision log line.
---
 .../auto-routing-contracts/src/benchmark.ts   |  7 ++
 packages/auto-routing-contracts/src/index.ts  |  4 +
 .../src/routing-table.ts                      |  3 +
 services/auto-routing/src/decide.ts           | 30 ++++--
 .../auto-routing/src/decision-cache.test.ts   | 47 ++++++++-
 services/auto-routing/src/decision-cache.ts   | 52 +++++++++-
 .../auto-routing/src/decision-engine.test.ts  | 96 ++++++++++++++++++-
 services/auto-routing/src/decision-engine.ts  | 40 +++++++-
 services/auto-routing/src/index.test.ts       | 60 +++++++++++-
 .../auto-routing/src/routing-table.test.ts    |  1 +
 10 files changed, 317 insertions(+), 23 deletions(-)

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 53e48127e5..6d9db4d287 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -30,6 +30,13 @@ export const BenchmarkConfigSchema = z.object({
   // The Kilo user whose identity/billing the decider CLI runs execute under.
   // Null until an admin configures it; decider runs fail fast while null.
   benchmarkUserId: z.string().trim().min(1).nullable(),
+  // Session stickiness knob carried into published routing tables: a session
+  // stays on its incumbent model while it meets the tier's accuracy
+  // threshold, unless the fresh pick is cheaper by more than this factor.
+  // Model switches discard provider prompt caches (cache reads are far
+  // cheaper than fresh input tokens), so switching only pays off when the
+  // recurring savings clearly outweigh the cache-rebuild penalty.
+  switchCostFactor: z.number().min(1).max(100),
   updatedAt: z.string().nullable(),
   updatedBy: z.string().nullable(),
 });
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index 6c7ecbd792..a7e12222e1 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -104,6 +104,10 @@ export const AutoRoutingDecisionSchema = z.object({
   tableVersion: z.string(),
   // Mirrors the effort the chosen model was benchmarked with, when set.
   reasoningEffort: ReasoningEffortSchema.nullable().optional(),
+  // True when the session's incumbent model was kept over a cheaper fresh
+  // pick. Defaulted so responses from a not-yet-redeployed worker still
+  // parse.
+  sticky: z.boolean().default(false),
 });
 export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
 
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index b4c1696203..160742d6f9 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -22,6 +22,9 @@ export const RoutingTableSchema = z.object({
   version: z.string().min(1),
   generatedAt: z.string().min(1),
   minAccuracy: z.number().min(0).max(1),
+  // Keep a session's incumbent model unless the fresh pick is cheaper by
+  // more than this factor (see BenchmarkConfigSchema.switchCostFactor).
+  switchCostFactor: z.number().min(1),
   source: z.enum(['benchmark']),
   tiers: z.object({
     low: z.array(RankedCandidateSchema).min(1),
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index c976d9d6e9..411f6a20c6 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -18,7 +18,12 @@ import {
   hashIdentifierForTelemetry,
 } from './conversation-identity';
 import type { ContentHashes } from './conversation-identity';
-import { getCachedClassification, putCachedClassification } from './decision-cache';
+import {
+  getCachedClassification,
+  getStickyDecision,
+  putCachedClassification,
+  putStickyDecision,
+} from './decision-cache';
 import { computeDecision } from './decision-engine';
 import { ClassifierRunError, classifyNormalizedInput } from './model-classifier';
 import type { ClassifierRunResult } from './model-classifier';
@@ -251,6 +256,7 @@ function recordDecision(
       decidedModel: decision?.model ?? null,
       decidedTier: decision?.tier ?? null,
       decisionSource: decision?.source ?? null,
+      sticky: decision?.sticky ?? null,
       ...summary.details,
     })
   );
@@ -290,14 +296,16 @@ export const decideHandler: Handler<HonoEnv> = async c => {
     successSampleRate,
   };
 
-  const cached = await getCachedClassification(
-    c.env,
-    ctx.conversationKey,
-    hashes.exact,
-    classifierModel
-  );
+  // Both live in the conversation's Durable Object; fetch them together.
+  const [cached, stickyModel] = await Promise.all([
+    getCachedClassification(c.env, ctx.conversationKey, hashes.exact, classifierModel),
+    getStickyDecision(c.env, ctx.conversationKey),
+  ]);
   if (cached) {
-    const decision = computeDecision(cached, payload.input.apiKind, routingTable);
+    const decision = computeDecision(cached, payload.input.apiKind, routingTable, stickyModel);
+    if (decision) {
+      c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
+    }
     recordDecision(
       c.env,
       ctx,
@@ -326,8 +334,12 @@ export const decideHandler: Handler<HonoEnv> = async c => {
     const decision = computeDecision(
       classifier.classification,
       payload.input.apiKind,
-      routingTable
+      routingTable,
+      stickyModel
     );
+    if (decision) {
+      c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
+    }
     recordDecision(
       c.env,
       ctx,
diff --git a/services/auto-routing/src/decision-cache.test.ts b/services/auto-routing/src/decision-cache.test.ts
index c61cd2eb97..8f97f5c3ca 100644
--- a/services/auto-routing/src/decision-cache.test.ts
+++ b/services/auto-routing/src/decision-cache.test.ts
@@ -1,6 +1,6 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
-import { AutoRoutingDecisionCacheDO } from './decision-cache';
+import { AutoRoutingDecisionCacheDO, getStickyDecision, putStickyDecision } from './decision-cache';
 
 const classification = {
   taskType: 'implementation',
@@ -104,3 +104,48 @@ describe('AutoRoutingDecisionCacheDO', () => {
     await expect(storage.getAlarm()).resolves.toBeNull();
   });
 });
+
+describe('sticky decision storage', () => {
+  beforeEach(() => {
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date('2026-06-11T12:00:00Z'));
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  function createStickyEnv() {
+    const { cacheDO, storage } = createCacheDO();
+    const env = {
+      AUTO_ROUTING_DECISION_CACHE: {
+        idFromName: (name: string) => name,
+        get: () => cacheDO,
+      },
+    } as unknown as Pick<Env, 'AUTO_ROUTING_DECISION_CACHE'>;
+    return { env, cacheDO, storage };
+  }
+
+  it('round-trips the sticky model for a conversation', async () => {
+    const { env } = createStickyEnv();
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+
+    await putStickyDecision(env, 'conversation-1', 'mid/chat');
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBe('mid/chat');
+  });
+
+  it('expires sticky entries after the TTL', async () => {
+    const { env } = createStickyEnv();
+    await putStickyDecision(env, 'conversation-1', 'mid/chat');
+
+    vi.advanceTimersByTime(31 * 60 * 1000);
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+  });
+
+  it('returns null for invalid stored shapes', async () => {
+    const { env, cacheDO } = createStickyEnv();
+    await cacheDO.putEntry('sticky', { nope: true } as unknown as ClassifierOutput);
+
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+  });
+});
diff --git a/services/auto-routing/src/decision-cache.ts b/services/auto-routing/src/decision-cache.ts
index a4bd929bf8..ae98778688 100644
--- a/services/auto-routing/src/decision-cache.ts
+++ b/services/auto-routing/src/decision-cache.ts
@@ -1,5 +1,6 @@
 import { ClassifierOutputSchema, type ClassifierOutput } from '@kilocode/auto-routing-contracts';
 import { DurableObject } from 'cloudflare:workers';
+import * as z from 'zod';
 
 // Mirrored agent sessions classify the same prompt prefixes on every API
 // call, so identical classifier inputs repeat heavily within a short
@@ -13,13 +14,19 @@ const ENTRY_TTL_MS = 30 * 60 * 1000;
 // Cloudflare caps storage.delete() at 128 keys per call.
 const DELETE_BATCH_SIZE = 128;
 
+// The DO treats stored values as opaque — callers validate on read, since
+// entries may have been written by an older worker version. A concrete union
+// rather than unknown because the workers RPC stub maps non-serializable
+// method types to never.
+type StoredValue = ClassifierOutput | StickyDecision;
+
 type StoredEntry = {
-  value: ClassifierOutput;
+  value: StoredValue;
   storedAt: number;
 };
 
 export class AutoRoutingDecisionCacheDO extends DurableObject<Env> {
-  async getEntry(key: string): Promise<ClassifierOutput | null> {
+  async getEntry(key: string): Promise<StoredValue | null> {
     const entry = await this.ctx.storage.get<StoredEntry>(key);
     if (!entry) return null;
     if (Date.now() - entry.storedAt > ENTRY_TTL_MS) {
@@ -29,7 +36,7 @@ export class AutoRoutingDecisionCacheDO extends DurableObject<Env> {
     return entry.value;
   }
 
-  async putEntry(key: string, value: ClassifierOutput): Promise<void> {
+  async putEntry(key: string, value: StoredValue): Promise<void> {
     await this.ctx.storage.put(key, { value, storedAt: Date.now() } satisfies StoredEntry);
     // A fixed-period sweep (rather than an idle alarm pushed out on every
     // write) so storage stays bounded even when distinct conversations
@@ -73,6 +80,15 @@ function entryKey(contentHash: string, classifierModel: string): string {
   return `${classifierModel}:${contentHash}`;
 }
 
+// Single per-conversation slot remembering the last model the decision
+// engine served, so the session can stay on it (keeping the provider's
+// prompt cache warm) instead of ping-ponging when its tier oscillates.
+// Cannot collide with classification keys, which always contain a ':'.
+const STICKY_DECISION_KEY = 'sticky';
+
+const StickyDecisionSchema = z.object({ model: z.string().min(1) });
+type StickyDecision = z.infer<typeof StickyDecisionSchema>;
+
 export async function getCachedClassification(
   env: DecisionCacheEnv,
   conversationKey: string,
@@ -93,6 +109,36 @@ export async function getCachedClassification(
   }
 }
 
+export async function getStickyDecision(
+  env: DecisionCacheEnv,
+  conversationKey: string
+): Promise<string | null> {
+  try {
+    const value = await cacheStub(env, conversationKey).getEntry(STICKY_DECISION_KEY);
+    if (!value) return null;
+    // Entries may have been written by an older worker version; validate
+    // before serving.
+    const parsed = StickyDecisionSchema.safeParse(value);
+    return parsed.success ? parsed.data.model : null;
+  } catch {
+    return null;
+  }
+}
+
+export async function putStickyDecision(
+  env: DecisionCacheEnv,
+  conversationKey: string,
+  model: string
+): Promise<void> {
+  try {
+    await cacheStub(env, conversationKey).putEntry(STICKY_DECISION_KEY, {
+      model,
+    } satisfies StickyDecision);
+  } catch {
+    // Sticky writes are best effort and must not fail the decision.
+  }
+}
+
 export async function putCachedClassification(
   env: DecisionCacheEnv,
   conversationKey: string,
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index 59211a7329..6cc0018223 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -17,6 +17,7 @@ const table: RoutingTable = {
   version: 'run-1',
   generatedAt: '2026-06-11T00:00:00.000Z',
   minAccuracy: 0.7,
+  switchCostFactor: 3,
   source: 'benchmark',
   tiers: {
     low: [
@@ -34,6 +35,28 @@ const table: RoutingTable = {
         meetsThreshold: true,
         supportedApiKinds: ['chat_completions'],
       },
+      {
+        model: 'mid/chat',
+        accuracy: 0.8,
+        avgCostUsd: 0.005,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages'],
+        reasoningEffort: 'medium',
+      },
+      {
+        model: 'pricey/chat',
+        accuracy: 0.9,
+        avgCostUsd: 0.02,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions'],
+      },
+      {
+        model: 'weak/chat',
+        accuracy: 0.5,
+        avgCostUsd: 0.003,
+        meetsThreshold: false,
+        supportedApiKinds: ['chat_completions'],
+      },
     ],
     medium: [
       {
@@ -58,13 +81,14 @@ const table: RoutingTable = {
 
 describe('computeDecision', () => {
   it('picks the first candidate supporting the request api kind', () => {
-    const decision = computeDecision(classification, 'chat_completions', table);
+    const decision = computeDecision(classification, 'chat_completions', table, null);
     expect(decision).toEqual({
       model: 'cheap/chat',
       tier: 'low',
       source: 'benchmark',
       tableVersion: 'run-1',
       reasoningEffort: null,
+      sticky: false,
     });
   });
   it('uses the tier derived from the classification', () => {
@@ -74,12 +98,76 @@ describe('computeDecision', () => {
       contextComplexity: 'large',
       executionMode: 'multi_step_project',
     };
-    expect(computeDecision(hard, 'chat_completions', table)?.model).toBe('big/chat');
+    expect(computeDecision(hard, 'chat_completions', table, null)?.model).toBe('big/chat');
   });
   it('returns null when no candidate supports the api kind', () => {
-    expect(computeDecision(classification, 'responses', table)).toBeNull();
+    expect(computeDecision(classification, 'responses', table, null)).toBeNull();
   });
   it('returns null when there is no routing table', () => {
-    expect(computeDecision(classification, 'chat_completions', null)).toBeNull();
+    expect(computeDecision(classification, 'chat_completions', null, null)).toBeNull();
+  });
+
+  describe('session stickiness', () => {
+    it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => {
+      // The fresh pick cheap/chat (0.002) is not more than 3x cheaper than the
+      // incumbent mid/chat (0.005): 0.002 * 3 = 0.006 >= 0.005, so it stays put.
+      const decision = computeDecision(classification, 'chat_completions', table, 'mid/chat');
+      expect(decision).toEqual({
+        model: 'mid/chat',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'run-1',
+        // The incumbent's benchmarked effort, not the fresh pick's.
+        reasoningEffort: 'medium',
+        sticky: true,
+      });
+    });
+    it('keeps the incumbent at the exact switch-cost boundary', () => {
+      // Strict comparison: switch only when fresh * factor < incumbent.
+      // Integer costs avoid float noise on the equality case (1 * 3 === 3).
+      const boundaryTable: RoutingTable = {
+        ...table,
+        tiers: {
+          ...table.tiers,
+          low: [
+            { ...table.tiers.low[1]!, model: 'fresh/chat', avgCostUsd: 1 },
+            { ...table.tiers.low[2]!, model: 'incumbent/chat', avgCostUsd: 3 },
+          ],
+        },
+      };
+      const decision = computeDecision(
+        classification,
+        'chat_completions',
+        boundaryTable,
+        'incumbent/chat'
+      );
+      expect(decision).toMatchObject({ model: 'incumbent/chat', sticky: true });
+    });
+    it('switches when the fresh pick is cheaper by more than the factor', () => {
+      // pricey/chat at 0.02 vs fresh 0.002 * 3 = 0.006: switch pays off.
+      const decision = computeDecision(classification, 'chat_completions', table, 'pricey/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('switches when the incumbent no longer meets the tier threshold', () => {
+      const decision = computeDecision(classification, 'chat_completions', table, 'weak/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('serves the fresh pick when the incumbent is not in the tier', () => {
+      const decision = computeDecision(classification, 'chat_completions', table, 'gone/model');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('is not sticky when the incumbent is the fresh pick', () => {
+      const decision = computeDecision(classification, 'chat_completions', table, 'cheap/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('serves the fresh pick when the incumbent does not support the api kind', () => {
+      const decision = computeDecision(
+        classification,
+        'chat_completions',
+        table,
+        'cheap/messages-only'
+      );
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
   });
 });
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index bc9c1d8011..9f5b949616 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -9,17 +9,47 @@ import {
 export function computeDecision(
   classification: ClassifierOutput,
   apiKind: NormalizedClassifierInput['apiKind'],
-  table: RoutingTable | null
+  table: RoutingTable | null,
+  incumbentModel: string | null
 ): AutoRoutingDecision | null {
   if (!table) return null;
   const tier = deriveDifficultyTier(classification);
-  const candidate = table.tiers[tier].find(c => c.supportedApiKinds.includes(apiKind));
-  if (!candidate) return null;
+  const candidates = table.tiers[tier];
+  const freshPick = candidates.find(c => c.supportedApiKinds.includes(apiKind));
+  if (!freshPick) return null;
+
+  // Keep the session on its incumbent model when it is still good enough for
+  // the current tier. A model switch discards the provider's prompt cache,
+  // and rebuilding it costs full-price input tokens (4-10x cache-read rates)
+  // on a context that dominates agent-session spend — so a switch is only
+  // worth it when the fresh pick's recurring per-turn savings clearly exceed
+  // that one-time penalty, i.e. it is more than switchCostFactor times cheaper.
+  const incumbent =
+    incumbentModel === null
+      ? undefined
+      : candidates.find(c => c.model === incumbentModel && c.supportedApiKinds.includes(apiKind));
+  if (
+    incumbent &&
+    incumbent.meetsThreshold &&
+    incumbent.model !== freshPick.model &&
+    !(freshPick.avgCostUsd * table.switchCostFactor < incumbent.avgCostUsd)
+  ) {
+    return {
+      model: incumbent.model,
+      tier,
+      source: table.source,
+      tableVersion: table.version,
+      reasoningEffort: incumbent.reasoningEffort ?? null,
+      sticky: true,
+    };
+  }
+
   return {
-    model: candidate.model,
+    model: freshPick.model,
     tier,
     source: table.source,
     tableVersion: table.version,
-    reasoningEffort: candidate.reasoningEffort ?? null,
+    reasoningEffort: freshPick.reasoningEffort ?? null,
+    sticky: false,
   };
 }
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 445fbf4f54..825c499daf 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -85,6 +85,7 @@ const benchmarkRoutingTable = {
   version: 'bench-run-1',
   generatedAt: '2026-06-12T00:00:00.000Z',
   minAccuracy: 0.7,
+  switchCostFactor: 3,
   source: 'benchmark',
   tiers: {
     low: [
@@ -106,6 +107,17 @@ const benchmarkRoutingTable = {
         supportedApiKinds: ['chat_completions'],
         reasoningEffort: null,
       },
+      // The high-tier model also qualifies for medium, within the 3x
+      // switch-cost factor of the fresh pick (0.002 * 3 >= 0.005): a session
+      // de-escalating from high stays on it.
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.005,
+        meetsThreshold: true,
+        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
+        reasoningEffort: null,
+      },
     ],
     high: [
       {
@@ -231,6 +243,7 @@ describe('auto routing worker', () => {
         source: 'benchmark',
         tableVersion: 'bench-run-1',
         reasoningEffort: null,
+        sticky: false,
       },
       classifierResult: {
         classification: mockClassification,
@@ -274,6 +287,7 @@ describe('auto routing worker', () => {
       mode: 'code',
       uaPrefix: 'Kilo-Code/4.106.0',
       bodyBytes: 2048,
+      sticky: false,
     });
     // The raw user id (which embeds the client IP for anonymous users) must
     // never reach persisted logs.
@@ -294,12 +308,16 @@ describe('auto routing worker', () => {
         source: 'benchmark',
         tableVersion: 'bench-run-1',
         reasoningEffort: null,
+        sticky: false,
       },
       classifierResult: { classification: mockClassification },
     });
     expect(cacheIdFromName).toHaveBeenCalledWith('user:user-1:task:task-123');
     expect(classifyNormalizedInput).not.toHaveBeenCalled();
-    expect(cachePutEntry).not.toHaveBeenCalled();
+    // The classification is not re-cached; only the served model is
+    // remembered for session stickiness.
+    expect(cachePutEntry).toHaveBeenCalledTimes(1);
+    expect(cachePutEntry).toHaveBeenCalledWith('sticky', { model: expect.any(String) });
     expect(writeDataPoint).toHaveBeenCalledWith(
       expect.objectContaining({
         doubles: [0, 0, mockClassification.confidence, 1],
@@ -317,6 +335,44 @@ describe('auto routing worker', () => {
     );
   });
 
+  it('keeps the session on the incumbent model when the tier de-escalates', async () => {
+    // Back the mocked DO stub with real storage so the sticky model written
+    // by the first request is visible to the second.
+    const store = new Map<string, unknown>();
+    cacheGetEntry.mockImplementation(async (key: string) => store.get(key) ?? null);
+    cachePutEntry.mockImplementation(async (key: string, value: unknown) => {
+      store.set(key, value);
+    });
+
+    classifyNormalizedInput.mockResolvedValueOnce({
+      ...mockClassifierResult,
+      classification: {
+        ...mockClassification,
+        reasoningComplexity: 'high',
+        contextComplexity: 'large',
+        executionMode: 'multi_step_project',
+      },
+    });
+    const first = await decideRequest(mirrorPayload());
+    expect(first.status).toBe(200);
+    await expect(first.json()).resolves.toMatchObject({
+      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'high', sticky: false },
+    });
+
+    // The second turn (different prompt, same session) classifies as medium.
+    // The fresh medium pick is cheaper, but not by more than the switch-cost
+    // factor, so the session keeps its incumbent.
+    const second = await decideRequest(
+      mirrorPayload({
+        input: { ...normalizedInput, userPromptPrefix: 'Now a much easier follow-up.' },
+      })
+    );
+    expect(second.status).toBe(200);
+    await expect(second.json()).resolves.toMatchObject({
+      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'medium', sticky: true },
+    });
+  });
+
   it('falls back to a machine-scoped conversation key without a session id', async () => {
     const response = await decideRequest(mirrorPayload({ sessionId: null }));
 
@@ -423,6 +479,8 @@ describe('auto routing worker', () => {
       decision: null,
       classifierResult: { classification: mockClassification },
     });
+    // A null decision must not overwrite the session's sticky model.
+    expect(cachePutEntry).not.toHaveBeenCalledWith('sticky', expect.anything());
   });
 
   it('returns a null classifier result when the classifier request fails', async () => {
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index cbed758688..ded4744e81 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -6,6 +6,7 @@ const SAMPLE_TABLE: RoutingTable = {
   version: 'bench-run-1',
   generatedAt: '2026-06-12T00:00:00.000Z',
   minAccuracy: 0.7,
+  switchCostFactor: 3,
   source: 'benchmark',
   tiers: {
     low: [

From a24dc4d498f6ac4275f556b9a1a2a76efc9422ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 18:36:45 +0200
Subject: [PATCH 59/73] feat(auto-routing-benchmark): plumb switchCostFactor
 through config, runs, and routing table

Store the new BenchmarkConfig.switchCostFactor in the benchmark_config
singleton, snapshot it into benchmark_runs at startRun, and carry the
run's snapshotted value into published routing tables so the required
RoutingTableSchema.switchCostFactor field parses on read. Regenerate
the squashed D1 baseline migration, add a Switch cost factor field to
the admin config form, and update test fixtures (including the apps/web
decision fixtures missing the new required sticky flag).
---
 .../admin/auto-routing/BenchmarksSection.tsx  | 21 +++++++++++++++++
 .../api/openrouter/[...path]/route.test.ts    |  3 +++
 .../ai-gateway/auto-model/resolution.test.ts  |  1 +
 ...uto-routing-benchmark-admin-client.test.ts |  1 +
 ...ar_nicolaos.sql => 0000_dusty_maginty.sql} |  3 +++
 .../migrations/meta/0000_snapshot.json        | 23 ++++++++++++++++++-
 .../migrations/meta/_journal.json             |  4 ++--
 .../auto-routing-benchmark/src/admin.test.ts  |  8 +++++++
 .../auto-routing-benchmark/src/config.test.ts |  2 ++
 services/auto-routing-benchmark/src/config.ts |  3 +++
 .../auto-routing-benchmark/src/db-schema.ts   |  3 +++
 .../auto-routing-benchmark/src/db.test.ts     |  7 ++++++
 services/auto-routing-benchmark/src/db.ts     |  6 +++++
 .../src/routing-table-builder.test.ts         | 11 +++++++++
 .../src/routing-table-builder.ts              |  8 ++++---
 services/auto-routing-benchmark/src/run.ts    |  5 ++++
 16 files changed, 103 insertions(+), 6 deletions(-)
 rename services/auto-routing-benchmark/migrations/{0000_dear_nicolaos.sql => 0000_dusty_maginty.sql} (95%)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 1ca5d437d7..c8cbcffbb8 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -114,6 +114,7 @@ function configToFormState(config: BenchmarkConfig | null): {
   classifierModels: string;
   deciderModels: DeciderModelRow[];
   minAccuracy: number;
+  switchCostFactor: number;
   maxConcurrency: number;
   benchmarkUserId: string;
 } {
@@ -124,6 +125,7 @@ function configToFormState(config: BenchmarkConfig | null): {
       classifierModels: '',
       deciderModels: [],
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       maxConcurrency: 4,
       benchmarkUserId: '',
     };
@@ -135,6 +137,7 @@ function configToFormState(config: BenchmarkConfig | null): {
       reasoningEffort: m.reasoningEffort ?? null,
     })),
     minAccuracy: config.minAccuracy,
+    switchCostFactor: config.switchCostFactor,
     maxConcurrency: config.maxConcurrency,
     benchmarkUserId: config.benchmarkUserId ?? '',
   };
@@ -159,6 +162,7 @@ function formStateToConfig(
     classifierModels,
     deciderModels,
     minAccuracy: state.minAccuracy,
+    switchCostFactor: state.switchCostFactor,
     maxConcurrency: state.maxConcurrency,
     benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null,
     updatedAt: base?.updatedAt ?? null,
@@ -337,6 +341,23 @@ function BenchmarkConfigEditor({
               className="h-8 w-40 tabular-nums"
             />
           </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-switch-cost-factor" className="text-sm font-medium">
+              Switch cost factor (1–100)
+            </Label>
+            <Input
+              id="benchmark-switch-cost-factor"
+              type="number"
+              min={1}
+              max={100}
+              step={0.5}
+              value={form.switchCostFactor}
+              onChange={e =>
+                setForm(prev => ({ ...prev, switchCostFactor: parseFloat(e.target.value) || 1 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
           <div className="flex flex-col gap-1.5">
             <Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
               Max concurrency (1–16)
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index 9d53a66f68..5e88298134 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -450,6 +450,7 @@ describe('kilo-auto/efficient classifier billing', () => {
         tier: 'low',
         source: 'benchmark',
         tableVersion: 'v1',
+        sticky: false,
       },
       costUsd: 0.002,
     });
@@ -479,6 +480,7 @@ describe('kilo-auto/efficient classifier billing', () => {
         tier: 'low',
         source: 'benchmark' as const,
         tableVersion: 'v1',
+        sticky: false,
       },
       costUsd: 0,
     });
@@ -505,6 +507,7 @@ describe('kilo-auto/efficient classifier billing', () => {
         tier: 'low',
         source: 'benchmark',
         tableVersion: 'v1',
+        sticky: false,
       },
       costUsd: 0.002,
     });
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
index 37c6d87a95..64a54b0c84 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -33,6 +33,7 @@ const sampleDecision: AutoRoutingDecision = {
   tier: 'low',
   source: 'benchmark',
   tableVersion: 'v1',
+  sticky: false,
 };
 
 describe('resolveAutoModel — kilo-auto/efficient branch', () => {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 1879f8e82b..6596f8103c 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -29,6 +29,7 @@ const configResponse = {
       },
     ],
     minAccuracy: 0.8,
+    switchCostFactor: 3,
     maxConcurrency: 4,
     benchmarkUserId: null,
     updatedAt: null,
diff --git a/services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql b/services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql
similarity index 95%
rename from services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql
rename to services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql
index defeee3c79..991e7b9960 100644
--- a/services/auto-routing-benchmark/migrations/0000_dear_nicolaos.sql
+++ b/services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql
@@ -1,6 +1,7 @@
 CREATE TABLE `benchmark_config` (
 	`id` integer PRIMARY KEY NOT NULL,
 	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
 	`max_concurrency` integer NOT NULL,
 	`benchmark_user_id` text,
 	`updated_at` text NOT NULL,
@@ -15,6 +16,7 @@ CREATE TABLE `benchmark_runs` (
 	`completed_at` text,
 	`error` text,
 	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
 	`max_concurrency` integer NOT NULL,
 	`benchmark_user_id` text
 );
@@ -83,6 +85,7 @@ CREATE TABLE `routing_tables` (
 	`published_at` text NOT NULL,
 	`generated_at` text NOT NULL,
 	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
 	`source` text NOT NULL
 );
 --> statement-breakpoint
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index 6444a68a13..f966b3d2ea 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "905b4a67-d32c-491d-9206-ede5de77d0b2",
+  "id": "fcf958da-88cb-4c79-af3d-4709268e140c",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -21,6 +21,13 @@
           "notNull": true,
           "autoincrement": false
         },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
         "max_concurrency": {
           "name": "max_concurrency",
           "type": "integer",
@@ -108,6 +115,13 @@
           "notNull": true,
           "autoincrement": false
         },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
         "max_concurrency": {
           "name": "max_concurrency",
           "type": "integer",
@@ -525,6 +539,13 @@
           "notNull": true,
           "autoincrement": false
         },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
         "source": {
           "name": "source",
           "type": "text",
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index ba4e4ef0cc..238f03ad45 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,8 +5,8 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781276443789,
-      "tag": "0000_dear_nicolaos",
+      "when": 1781281890154,
+      "tag": "0000_dusty_maginty",
       "breakpoints": true
     }
   ]
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index c6c5ae158d..e2ed0517bc 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -18,6 +18,7 @@ const TEST_CONFIG: BenchmarkConfig = {
     },
   ],
   minAccuracy: 0.7,
+  switchCostFactor: 3,
   maxConcurrency: 4,
   benchmarkUserId: null,
   updatedAt: null,
@@ -29,6 +30,7 @@ const TEST_CONFIG_ROWS = {
   config: {
     id: 1 as const,
     min_accuracy: TEST_CONFIG.minAccuracy,
+    switch_cost_factor: TEST_CONFIG.switchCostFactor,
     max_concurrency: TEST_CONFIG.maxConcurrency,
     benchmark_user_id: TEST_CONFIG.benchmarkUserId,
     updated_at: '2026-06-01T00:00:00.000Z',
@@ -184,6 +186,7 @@ describe('GET /admin/config', () => {
       config: {
         id: 1,
         min_accuracy: 0.9,
+        switch_cost_factor: 3,
         max_concurrency: 4,
         benchmark_user_id: null,
         updated_at: '2026-06-01T00:00:00.000Z',
@@ -314,6 +317,10 @@ describe('POST /admin/runs', () => {
     expect(body.runId).toMatch(/^classifier-/);
     expect(body.enqueuedModels).toBe(TEST_CONFIG.classifierModels.length);
     expect(insertRun).toHaveBeenCalledOnce();
+    // The run row snapshots the live config (mid-run edits must not skew results).
+    const [, runArg] = vi.mocked(insertRun).mock.calls[0];
+    expect(runArg.min_accuracy).toBe(TEST_CONFIG.minAccuracy);
+    expect(runArg.switch_cost_factor).toBe(TEST_CONFIG.switchCostFactor);
     expect(queueSendBatch).toHaveBeenCalledOnce();
   });
 });
@@ -341,6 +348,7 @@ describe('GET /admin/routing-table', () => {
       version: 'test-v1',
       generatedAt: '2026-06-01T10:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       source: 'benchmark',
       tiers: { low: [candidate], medium: [candidate], high: [candidate] },
     };
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
index 27be15b4b0..d4ffb884f2 100644
--- a/services/auto-routing-benchmark/src/config.test.ts
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -5,6 +5,7 @@ import type { ConfigDeciderModelRow } from './db';
 const configRow = {
   id: 1 as const,
   min_accuracy: 0.85,
+  switch_cost_factor: 3,
   max_concurrency: 8,
   benchmark_user_id: 'user-123',
   updated_at: '2026-06-01T00:00:00.000Z',
@@ -41,6 +42,7 @@ describe('mapConfigRows', () => {
 
     expect(result).not.toBeNull();
     expect(result?.minAccuracy).toBe(0.85);
+    expect(result?.switchCostFactor).toBe(3);
     expect(result?.maxConcurrency).toBe(8);
     expect(result?.benchmarkUserId).toBe('user-123');
     expect(result?.updatedAt).toBe('2026-06-01T00:00:00.000Z');
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 0dd78e8e92..3b00a905da 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -7,6 +7,7 @@ import { apiKindsToFlags, getConfigRows, replaceConfig, type ConfigDeciderModelR
 export function mapConfigRows(
   configRow: {
     min_accuracy: number;
+    switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
     updated_at: string;
@@ -32,6 +33,7 @@ export function mapConfigRows(
         r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'],
     })),
     minAccuracy: configRow.min_accuracy,
+    switchCostFactor: configRow.switch_cost_factor,
     maxConcurrency: configRow.max_concurrency,
     benchmarkUserId: configRow.benchmark_user_id,
     updatedAt: configRow.updated_at,
@@ -62,6 +64,7 @@ export async function saveBenchmarkConfig(
     db,
     {
       min_accuracy: config.minAccuracy,
+      switch_cost_factor: config.switchCostFactor,
       max_concurrency: config.maxConcurrency,
       benchmark_user_id: config.benchmarkUserId,
       updated_at: updatedAt,
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 897fe1b74b..c0981e073d 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -7,6 +7,7 @@ import type { BenchmarkKind, BenchmarkRunStatus } from '@kilocode/auto-routing-c
 export const benchmarkConfig = sqliteTable('benchmark_config', {
   id: integer('id').primaryKey(),
   min_accuracy: real('min_accuracy').notNull(),
+  switch_cost_factor: real('switch_cost_factor').notNull(),
   max_concurrency: integer('max_concurrency').notNull(),
   benchmark_user_id: text('benchmark_user_id'),
   updated_at: text('updated_at').notNull(),
@@ -34,6 +35,7 @@ export const benchmarkRuns = sqliteTable('benchmark_runs', {
   error: text('error'),
   // Config snapshot taken at startRun time so mid-run edits can't skew results.
   min_accuracy: real('min_accuracy').notNull(),
+  switch_cost_factor: real('switch_cost_factor').notNull(),
   max_concurrency: integer('max_concurrency').notNull(),
   benchmark_user_id: text('benchmark_user_id'),
 });
@@ -101,6 +103,7 @@ export const routingTables = sqliteTable('routing_tables', {
   published_at: text('published_at').notNull(),
   generated_at: text('generated_at').notNull(),
   min_accuracy: real('min_accuracy').notNull(),
+  switch_cost_factor: real('switch_cost_factor').notNull(),
   source: text('source').notNull(),
 });
 
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 17a5d10133..1385c5a26e 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -1,4 +1,5 @@
 import { describe, it, expect } from 'vitest';
+import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
 import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
 import {
   apiKindsToFlags,
@@ -157,6 +158,7 @@ describe('mapRunRow', () => {
       completed_at: '2026-06-10T04:25:00.000Z',
       error: null,
       min_accuracy: 0.7,
+      switch_cost_factor: 3,
       max_concurrency: 4,
       benchmark_user_id: null,
     };
@@ -192,6 +194,7 @@ describe('mapRunRow', () => {
       completed_at: null,
       error: null,
       min_accuracy: 0.7,
+      switch_cost_factor: 3,
       max_concurrency: 4,
       benchmark_user_id: null,
     };
@@ -218,6 +221,7 @@ const sampleTable: RoutingTable = {
   version: 'run-test-1',
   generatedAt: '2026-06-01T10:00:00.000Z',
   minAccuracy: 0.7,
+  switchCostFactor: 3,
   source: 'benchmark',
   tiers: {
     low: [candidate('model-a'), candidate('model-b')],
@@ -233,6 +237,7 @@ describe('routingTableToRows', () => {
     expect(tableRow.published_at).toBe('2026-06-01T11:00:00.000Z');
     expect(tableRow.generated_at).toBe('2026-06-01T10:00:00.000Z');
     expect(tableRow.min_accuracy).toBe(0.7);
+    expect(tableRow.switch_cost_factor).toBe(3);
     expect(tableRow.source).toBe('benchmark');
   });
 
@@ -260,6 +265,8 @@ describe('rowsToRoutingTable', () => {
     const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
     const reassembled = rowsToRoutingTable(tableRow, candidateRows);
     expect(reassembled).toEqual(sampleTable);
+    // The reassembled table must satisfy the contract schema (getLatestRoutingTable parses it).
+    expect(RoutingTableSchema.parse(reassembled)).toEqual(sampleTable);
   });
 
   it('preserves candidate order within each tier', () => {
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index ce95bd26b7..c20398a8a4 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -115,6 +115,7 @@ export async function replaceConfig(
   db: D1Database,
   config: {
     min_accuracy: number;
+    switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
     updated_at: string;
@@ -157,6 +158,7 @@ export async function insertRun(
     kind: BenchmarkKind;
     startedAt: string;
     min_accuracy: number;
+    switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
   },
@@ -170,6 +172,7 @@ export async function insertRun(
     status: 'running',
     started_at: run.startedAt,
     min_accuracy: run.min_accuracy,
+    switch_cost_factor: run.switch_cost_factor,
     max_concurrency: run.max_concurrency,
     benchmark_user_id: run.benchmark_user_id,
   });
@@ -416,6 +419,7 @@ export function routingTableToRows(
     published_at: publishedAt,
     generated_at: table.generatedAt,
     min_accuracy: table.minAccuracy,
+    switch_cost_factor: table.switchCostFactor,
     source: table.source,
   };
 
@@ -463,6 +467,7 @@ export function rowsToRoutingTable(
     version: tableRow.run_id,
     generatedAt: tableRow.generated_at,
     minAccuracy: tableRow.min_accuracy,
+    switchCostFactor: tableRow.switch_cost_factor,
     source: tableRow.source as RoutingTable['source'],
     tiers: {
       low: tierMap.low ?? [],
@@ -491,6 +496,7 @@ export async function saveRoutingTable(
           published_at: tableRow.published_at,
           generated_at: tableRow.generated_at,
           min_accuracy: tableRow.min_accuracy,
+          switch_cost_factor: tableRow.switch_cost_factor,
           source: tableRow.source,
         },
       }),
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index f3694e6d62..3ae9cd6d35 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -51,6 +51,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-1',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
@@ -78,6 +79,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-nocost',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES.map(s =>
         s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s
@@ -94,6 +96,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-2',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
@@ -121,6 +124,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-3',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries,
     });
@@ -136,6 +140,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-4',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
@@ -161,6 +166,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-5',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries,
     });
@@ -185,6 +191,7 @@ describe('buildRoutingTable', () => {
         runId: 'test-run-6',
         generatedAt: '2026-01-01T00:00:00.000Z',
         minAccuracy: 0.7,
+        switchCostFactor: 3,
         deciderModels: DECIDER_MODELS,
         summaries,
       })
@@ -205,6 +212,7 @@ describe('buildRoutingTable', () => {
         runId: 'test-run-7',
         generatedAt: '2026-01-01T00:00:00.000Z',
         minAccuracy: 0.7,
+        switchCostFactor: 3,
         deciderModels: DECIDER_MODELS,
         summaries,
       })
@@ -224,6 +232,7 @@ describe('buildRoutingTable', () => {
       runId: 'test-run-8',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries,
     });
@@ -237,6 +246,7 @@ describe('buildRoutingTable', () => {
       runId: 'decider-2026-01-01',
       generatedAt: '2026-01-01T12:00:00.000Z',
       minAccuracy: 0.7,
+      switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
       summaries: ALL_TIERS_SUMMARIES,
     });
@@ -245,5 +255,6 @@ describe('buildRoutingTable', () => {
     expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z');
     expect(table.source).toBe('benchmark');
     expect(table.minAccuracy).toBe(0.7);
+    expect(table.switchCostFactor).toBe(3);
   });
 });
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 8eb62bb6bf..03ea33399a 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -12,16 +12,17 @@ import {
 // models with no cost signal at all (avgCostUsd null means every case failed
 // to report cost; ranking such a model as cheapest would hand it the tier).
 // Throws when any tier ends up empty so the caller keeps the previous
-// published table. deciderModels/minAccuracy come from the run's snapshot,
-// not live config.
+// published table. deciderModels/minAccuracy/switchCostFactor come from the
+// run's snapshot, not live config.
 export function buildRoutingTable(params: {
   runId: string;
   generatedAt: string;
   minAccuracy: number;
+  switchCostFactor: number;
   deciderModels: BenchmarkDeciderModel[];
   summaries: BenchmarkModelSummary[];
 }): RoutingTable {
-  const { runId, generatedAt, minAccuracy, deciderModels, summaries } = params;
+  const { runId, generatedAt, minAccuracy, switchCostFactor, deciderModels, summaries } = params;
   const modelConfigById = new Map(deciderModels.map(m => [m.id, m] as const));
 
   const tierCandidates = (t: DifficultyTier) =>
@@ -45,6 +46,7 @@ export function buildRoutingTable(params: {
     version: runId,
     generatedAt,
     minAccuracy,
+    switchCostFactor,
     source: 'benchmark',
     tiers: {
       low: tierCandidates('low'),
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index d6b2fcf818..f0d3bb36e7 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -136,6 +136,7 @@ export async function startRun(
       kind,
       startedAt: new Date().toISOString(),
       min_accuracy: config.minAccuracy,
+      switch_cost_factor: config.switchCostFactor,
       max_concurrency: config.maxConcurrency,
       benchmark_user_id: config.benchmarkUserId,
     },
@@ -161,6 +162,7 @@ export async function startRun(
     await finalizeRunIfComplete(env, runId, kind, {
       maxConcurrency: config.maxConcurrency,
       minAccuracy: config.minAccuracy,
+      switchCostFactor: config.switchCostFactor,
       benchmarkUserId: config.benchmarkUserId,
       models: runModelRows,
     });
@@ -255,6 +257,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
 type RunState = {
   maxConcurrency: number;
   minAccuracy: number;
+  switchCostFactor: number;
   benchmarkUserId: string | null;
   models: RunModelRow[];
 };
@@ -267,6 +270,7 @@ async function getRunState(env: Env, runId: string): Promise<RunState> {
   return {
     maxConcurrency: run.max_concurrency,
     minAccuracy: run.min_accuracy,
+    switchCostFactor: run.switch_cost_factor,
     benchmarkUserId: run.benchmark_user_id,
     models,
   };
@@ -489,6 +493,7 @@ async function finalizeRunIfComplete(
         runId,
         generatedAt,
         minAccuracy: state.minAccuracy,
+        switchCostFactor: state.switchCostFactor,
         deciderModels,
         summaries: allSummaries,
       });

From 3d5044179f6471e87f480a228497708423f9ef6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 18:46:08 +0200
Subject: [PATCH 60/73] fix(ai-gateway): align efficient fallback with
 Qwen-for-all-APIs after main merge

---
 .../lib/ai-gateway/auto-model/resolution.test.ts  | 15 +++++----------
 .../src/lib/ai-gateway/auto-model/resolution.ts   |  4 +---
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
index 64a54b0c84..f241c5f222 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -9,12 +9,7 @@ jest.mock('@/lib/kiloclaw/setup-promo', () => ({
 }));
 
 import { resolveAutoModel } from './resolution';
-import {
-  BALANCED_RESPONSES_FALLBACK_MODEL,
-  BALANCED_MESSAGES_FALLBACK_MODEL,
-  BALANCED_QWEN_MODEL,
-  KILO_AUTO_EFFICIENT_MODEL,
-} from '@/lib/ai-gateway/auto-model';
+import { BALANCED_QWEN_MODEL, KILO_AUTO_EFFICIENT_MODEL } from '@/lib/ai-gateway/auto-model';
 import type { AutoRoutingDecision } from '@kilocode/auto-routing-contracts';
 
 const baseParams = {
@@ -85,24 +80,24 @@ describe('resolveAutoModel — kilo-auto/efficient branch', () => {
     expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
   });
 
-  it('falls back to BALANCED_RESPONSES_FALLBACK_MODEL when no thunk is provided and apiKind=responses', async () => {
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=responses', async () => {
     const result = await resolveAutoModel(
       { ...baseParams, apiKind: 'responses' },
       nullUserPromise,
       zeroBalancePromise
     );
 
-    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_RESPONSES_FALLBACK_MODEL });
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
   });
 
-  it('falls back to BALANCED_MESSAGES_FALLBACK_MODEL when no thunk is provided and apiKind=messages', async () => {
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=messages', async () => {
     const result = await resolveAutoModel(
       { ...baseParams, apiKind: 'messages' },
       nullUserPromise,
       zeroBalancePromise
     );
 
-    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_MESSAGES_FALLBACK_MODEL });
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
   });
 
   it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=chat_completions', async () => {
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
index 78c748f59f..b45bf1617a 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
@@ -135,10 +135,8 @@ export async function resolveAutoModel(
         },
       };
     }
-    // Static fallback when the worker is slow/unavailable: same shape as
+    // Static fallback when the worker is slow/unavailable: same model as
     // balanced so an efficient request never degrades below balanced.
-    if (apiKind === 'responses') return { kind: 'ok', resolved: BALANCED_RESPONSES_FALLBACK_MODEL };
-    if (apiKind === 'messages') return { kind: 'ok', resolved: BALANCED_MESSAGES_FALLBACK_MODEL };
     return { kind: 'ok', resolved: BALANCED_QWEN_MODEL };
   }
   const mode = resolveMode(modeHeader, featureHeader);

From d922d92ae4b761a9008adf6af8e67a4bdfefc4c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 20:24:46 +0200
Subject: [PATCH 61/73] refactor(auto-routing): drop per-candidate API-kind
 plumbing, validate at config save

All decider candidates are served via providers that speak every gateway
chat API (in practice OpenRouter), so per-candidate supportedApiKinds was
dead weight in the contracts, decision engine, D1 schema, and routing
table. The one real failure mode - an admin configuring a model whose
serving provider is chat-completions-only - is now rejected at config
save time instead.
---
 .../benchmark-config/route.test.ts            | 94 ++++++++++++++++++
 .../auto-routing/benchmark-config/route.ts    | 35 ++++---
 .../admin/auto-routing/BenchmarksSection.tsx  | 17 +---
 ...uto-routing-benchmark-admin-client.test.ts | 12 +--
 .../lib/ai-gateway/model-api-kinds.test.ts    | 29 ++----
 .../web/src/lib/ai-gateway/model-api-kinds.ts | 41 +++++---
 .../auto-routing-contracts/src/benchmark.ts   | 13 +--
 packages/auto-routing-contracts/src/index.ts  |  5 +-
 .../src/routing-table.test.ts                 |  1 -
 .../src/routing-table.ts                      |  1 -
 ...usty_maginty.sql => 0000_amused_shard.sql} | 11 +--
 .../migrations/meta/0000_snapshot.json        | 65 +-----------
 .../migrations/meta/_journal.json             |  4 +-
 .../auto-routing-benchmark/src/admin.test.ts  | 19 +---
 .../auto-routing-benchmark/src/config.test.ts |  4 -
 services/auto-routing-benchmark/src/config.ts |  8 +-
 .../auto-routing-benchmark/src/db-schema.ts   |  9 --
 .../auto-routing-benchmark/src/db.test.ts     | 99 +------------------
 services/auto-routing-benchmark/src/db.ts     | 33 -------
 .../src/routing-table-builder.test.ts         | 20 ++--
 .../src/routing-table-builder.ts              |  4 -
 services/auto-routing-benchmark/src/run.ts    | 33 ++-----
 services/auto-routing/src/decide.ts           |  9 +-
 .../auto-routing/src/decision-engine.test.ts  | 73 ++++++--------
 services/auto-routing/src/decision-engine.ts  | 11 +--
 services/auto-routing/src/index.test.ts       |  4 -
 .../auto-routing/src/routing-table.test.ts    |  3 -
 27 files changed, 220 insertions(+), 437 deletions(-)
 create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
 rename services/auto-routing-benchmark/migrations/{0000_dusty_maginty.sql => 0000_amused_shard.sql} (85%)

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
new file mode 100644
index 0000000000..0117f52da2
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
@@ -0,0 +1,94 @@
+import { NextRequest } from 'next/server';
+import type { User } from '@kilocode/db';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+import { morph_warp_grep_free_model } from '@/lib/ai-gateway/providers/morph';
+
+jest.mock('@/lib/user/server', () => ({
+  getUserFromAuth: jest.fn(),
+}));
+
+jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
+  getBenchmarkConfig: jest.fn(),
+  updateBenchmarkConfig: jest.fn(),
+}));
+
+import { PUT } from './route';
+
+const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
+const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig);
+const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig);
+
+// Test-fixture boundary: only the fields the route actually reads.
+function adminUserFixture(): User {
+  return { id: 'admin_123', google_user_email: 'admin@kilocode.ai' } as Partial<User> as User;
+}
+
+function putRequest(body: unknown) {
+  return new NextRequest('http://localhost:3000/admin/api/auto-routing/benchmark-config', {
+    method: 'PUT',
+    body: JSON.stringify(body),
+    headers: { 'content-type': 'application/json' },
+  });
+}
+
+const validConfig = {
+  classifierModels: ['google/gemini-2.5-flash-lite'],
+  deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }],
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  maxConcurrency: 4,
+  benchmarkUserId: null,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+describe('PUT /admin/api/auto-routing/benchmark-config', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockGetUserFromAuth.mockResolvedValue({
+      user: adminUserFixture(),
+      authFailedResponse: null,
+    });
+    mockUpdateBenchmarkConfig.mockResolvedValue({
+      status: 200,
+      body: { config: validConfig },
+    });
+    mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } });
+  });
+
+  it('forwards a config whose decider models all serve every gateway chat API', async () => {
+    const response = await PUT(putRequest(validConfig));
+    expect(response.status).toBe(200);
+    expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai');
+  });
+
+  it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => {
+    const response = await PUT(
+      putRequest({
+        ...validConfig,
+        deciderModels: [
+          { id: 'openai/gpt-5-mini', reasoningEffort: null },
+          { id: morph_warp_grep_free_model.public_id, reasoningEffort: null },
+        ],
+      })
+    );
+
+    expect(response.status).toBe(400);
+    const body = (await response.json()) as { error: string };
+    expect(body.error).toContain(morph_warp_grep_free_model.public_id);
+    expect(body.error).toContain('chat_completions');
+    expect(body.error).not.toContain('openai/gpt-5-mini (');
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+
+  it('rejects a schema-invalid config with 400', async () => {
+    const response = await PUT(putRequest({ classifierModels: 'oops' }));
+    expect(response.status).toBe(400);
+    await expect(response.json()).resolves.toEqual({ error: 'Invalid benchmark config' });
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
index de5d38b38c..7d1e9cc0c4 100644
--- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
@@ -1,11 +1,14 @@
-import { BenchmarkConfigUpdateSchema } from '@kilocode/auto-routing-contracts';
+import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
 import type { NextRequest } from 'next/server';
 import { NextResponse } from 'next/server';
 import {
   getBenchmarkConfig,
   updateBenchmarkConfig,
 } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
-import { supportedApiKindsForModel } from '@/lib/ai-gateway/model-api-kinds';
+import {
+  gatewayChatApisForModel,
+  modelServesAllGatewayChatApis,
+} from '@/lib/ai-gateway/model-api-kinds';
 import { getUserFromAuth } from '@/lib/user/server';
 
 export async function GET() {
@@ -27,22 +30,28 @@ export async function PUT(request: NextRequest) {
     return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
   }
 
-  const parsed = BenchmarkConfigUpdateSchema.safeParse(rawBody);
+  const parsed = BenchmarkConfigSchema.safeParse(rawBody);
   if (!parsed.success) {
     return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
   }
 
-  // supportedApiKinds is server-derived from gateway provider definitions —
-  // the admin UI never sends it.
-  const config = {
-    ...parsed.data,
-    deciderModels: parsed.data.deciderModels.map(m => ({
-      ...m,
-      supportedApiKinds: supportedApiKindsForModel(m.id),
-    })),
-  };
+  // Routing-table candidates carry no per-protocol metadata, so every decider
+  // model must be servable on ALL gateway chat API kinds by the provider the
+  // gateway would route it to.
+  const unsupported = parsed.data.deciderModels
+    .map(m => m.id)
+    .filter(id => !modelServesAllGatewayChatApis(id))
+    .map(id => `${id} (supports: ${gatewayChatApisForModel(id).join(', ') || 'none'})`);
+  if (unsupported.length > 0) {
+    return NextResponse.json(
+      {
+        error: `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${unsupported.join('; ')}`,
+      },
+      { status: 400 }
+    );
+  }
 
   const email = user?.google_user_email ?? '';
-  const result = await updateBenchmarkConfig(config, email);
+  const result = await updateBenchmarkConfig(parsed.data, email);
   return NextResponse.json(result.body, { status: result.status });
 }
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index c8cbcffbb8..27229964a8 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -8,7 +8,6 @@ import {
   type BenchmarkConfig,
   type BenchmarkKind,
   type BenchmarkRoutingTableResponse,
-  type BenchmarkConfigUpdate,
   type BenchmarkRun,
   type BenchmarkModelSummary,
   type ReasoningEffort,
@@ -68,7 +67,7 @@ async function fetchBenchmarkConfig() {
   return parseAdminResponse(response, BenchmarkConfigResponseSchema);
 }
 
-async function saveBenchmarkConfig(config: BenchmarkConfigUpdate) {
+async function saveBenchmarkConfig(config: BenchmarkConfig) {
   const response = await fetch('/admin/api/auto-routing/benchmark-config', {
     method: 'PUT',
     headers: { 'content-type': 'application/json' },
@@ -103,8 +102,6 @@ async function fetchBenchmarkRoutingTable() {
 // Local form state type for decider model rows
 // ---------------------------------------------------------------------------
 
-// supportedApiKinds is intentionally absent: it is derived server-side from
-// gateway provider definitions when the config is saved.
 type DeciderModelRow = {
   id: string;
   reasoningEffort: ReasoningEffort | null;
@@ -146,7 +143,7 @@ function configToFormState(config: BenchmarkConfig | null): {
 function formStateToConfig(
   state: ReturnType<typeof configToFormState>,
   base: BenchmarkConfig | null
-): BenchmarkConfigUpdate {
+): BenchmarkConfig {
   const classifierModels = state.classifierModels
     .split('\n')
     .map(s => s.trim())
@@ -610,7 +607,6 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
                   <TableHead className="text-right">Accuracy</TableHead>
                   <TableHead className="text-right">Avg cost</TableHead>
                   <TableHead>Threshold</TableHead>
-                  <TableHead>API kinds</TableHead>
                 </TableRow>
               </TableHeader>
               <TableBody>
@@ -628,15 +624,6 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
                         {c.meetsThreshold ? 'meets' : 'below'}
                       </Badge>
                     </TableCell>
-                    <TableCell className="text-xs">
-                      <div className="flex flex-wrap gap-1">
-                        {c.supportedApiKinds.map(kind => (
-                          <Badge key={kind} variant="outline" className="font-mono text-xs px-1">
-                            {kind}
-                          </Badge>
-                        ))}
-                      </div>
-                    </TableCell>
                   </TableRow>
                 ))}
               </TableBody>
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 6596f8103c..275c92da1a 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -17,17 +17,7 @@ global.fetch = mockFetch;
 const configResponse = {
   config: {
     classifierModels: ['anthropic/claude-haiku-4'],
-    deciderModels: [
-      {
-        id: 'anthropic/claude-sonnet-4',
-        supportedApiKinds: ['chat_completions' as const] as (
-          | 'chat_completions'
-          | 'responses'
-          | 'messages'
-        )[],
-        reasoningEffort: null,
-      },
-    ],
+    deciderModels: [{ id: 'anthropic/claude-sonnet-4', reasoningEffort: null }],
     minAccuracy: 0.8,
     switchCostFactor: 3,
     maxConcurrency: 4,
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
index c75891f257..4ec802e1dc 100644
--- a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
@@ -1,36 +1,25 @@
 import { describe, expect, it } from '@jest/globals';
-import { supportedApiKindsForModel } from './model-api-kinds';
+import { gatewayChatApisForModel, modelServesAllGatewayChatApis } from './model-api-kinds';
 import { morph_warp_grep_free_model } from '@/lib/ai-gateway/providers/morph';
 import { seed_20_code_free_model } from '@/lib/ai-gateway/providers/seed';
 
-describe('supportedApiKindsForModel', () => {
-  it('returns all OpenRouter chat APIs for a plain OpenRouter model', () => {
-    expect(supportedApiKindsForModel('openai/gpt-5-mini')).toEqual([
-      'chat_completions',
-      'messages',
-      'responses',
-    ]);
+describe('modelServesAllGatewayChatApis', () => {
+  it('accepts a plain OpenRouter model (OpenRouter speaks all gateway chat APIs)', () => {
+    expect(modelServesAllGatewayChatApis('openai/gpt-5-mini')).toBe(true);
   });
 
-  it('uses the declared gateway for Kilo-exclusive models', () => {
-    expect(supportedApiKindsForModel(morph_warp_grep_free_model.public_id)).toEqual([
+  it('rejects a Kilo-exclusive model served by a chat-completions-only provider', () => {
+    expect(modelServesAllGatewayChatApis(morph_warp_grep_free_model.public_id)).toBe(false);
+    expect(gatewayChatApisForModel(morph_warp_grep_free_model.public_id)).toEqual([
       'chat_completions',
     ]);
   });
 
   it('treats disabled Kilo-exclusive models like plain OpenRouter models, matching get-provider', () => {
-    expect(supportedApiKindsForModel(seed_20_code_free_model.public_id)).toEqual([
-      'chat_completions',
-      'messages',
-      'responses',
-    ]);
+    expect(modelServesAllGatewayChatApis(seed_20_code_free_model.public_id)).toBe(true);
   });
 
   it('falls back to OpenRouter for unknown model ids', () => {
-    expect(supportedApiKindsForModel('made-up/model')).toEqual([
-      'chat_completions',
-      'messages',
-      'responses',
-    ]);
+    expect(modelServesAllGatewayChatApis('made-up/model')).toBe(true);
   });
 });
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
index 63895cc279..fce82c582c 100644
--- a/apps/web/src/lib/ai-gateway/model-api-kinds.ts
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
@@ -1,24 +1,35 @@
-import { ClassifierApiKindSchema, type ClassifierApiKind } from '@kilocode/auto-routing-contracts';
 import { findKiloExclusiveModel } from '@/lib/ai-gateway/models';
 import PROVIDERS from '@/lib/ai-gateway/providers/provider-definitions';
+import type { GatewayChatApiKind } from '@/lib/ai-gateway/providers/types';
+
+const GATEWAY_CHAT_API_KINDS: readonly GatewayChatApiKind[] = [
+  'chat_completions',
+  'responses',
+  'messages',
+];
 
 /**
- * Which gateway API kinds a model can serve, derived from the provider the
- * gateway would route it to. Mirrors get-provider.ts's static fallback
- * resolution — a Kilo-exclusive model is served by its declared gateway,
- * everything else by OpenRouter. The dynamic paths (BYOK, custom LLMs,
- * experiments, Vercel re-routing) never apply to auto-routing benchmark
- * candidates, which is the only consumer.
+ * The gateway chat API kinds a model can be served on: the `supportedChatApis`
+ * of the provider the gateway would route it to (possibly empty). Mirrors
+ * get-provider.ts's static fallback resolution — a Kilo-exclusive model is
+ * served by its declared gateway, everything else by OpenRouter. The dynamic
+ * paths (BYOK, custom LLMs, experiments, Vercel re-routing) never apply to
+ * auto-routing benchmark candidates, which is the only consumer.
  */
-export function supportedApiKindsForModel(modelId: string): ClassifierApiKind[] {
+export function gatewayChatApisForModel(modelId: string): ReadonlyArray<GatewayChatApiKind> {
   const exclusive = findKiloExclusiveModel(modelId);
   const provider =
     Object.values(PROVIDERS).find(p => p.id === exclusive?.gateway) ?? PROVIDERS.OPENROUTER;
-  const kinds = provider.supportedChatApis.filter((kind): kind is ClassifierApiKind =>
-    (ClassifierApiKindSchema.options as readonly string[]).includes(kind)
-  );
-  // A provider with no chat APIs (e.g. Mistral) can't serve gateway chat
-  // traffic at all; such models are not meaningful decider candidates, but
-  // the contract requires a non-empty list.
-  return kinds.length > 0 ? kinds : ['chat_completions'];
+  return provider.supportedChatApis;
+}
+
+/**
+ * Guards admin saves of the auto-routing benchmark config: routing-table
+ * candidates carry no per-protocol metadata, so every decider model must be
+ * servable on ALL gateway chat API kinds — otherwise the gateway would
+ * hard-reject requests whose protocol the model's provider can't speak.
+ */
+export function modelServesAllGatewayChatApis(modelId: string): boolean {
+  const supported = gatewayChatApisForModel(modelId);
+  return GATEWAY_CHAT_API_KINDS.every(kind => supported.includes(kind));
 }
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 6d9db4d287..f5b0d56479 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,5 +1,5 @@
 import * as z from 'zod';
-import { ClassifierApiKindSchema, RoutingTableSchema } from './routing-table';
+import { RoutingTableSchema } from './routing-table';
 import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
 
 export { ReasoningEffortSchema } from './tiers';
@@ -10,9 +10,6 @@ export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
 
 export const BenchmarkDeciderModelSchema = z.object({
   id: z.string().trim().min(1),
-  // Which gateway API kinds this model can serve when chosen by the router.
-  // The benchmark itself always exercises chat completions.
-  supportedApiKinds: z.array(ClassifierApiKindSchema).min(1).default(['chat_completions']),
   // Passed to the kilo CLI as --variant during the benchmark and carried into
   // the routing table so serving uses the same effort the model was graded
   // with. Null for models without (or not using) configurable reasoning.
@@ -42,14 +39,6 @@ export const BenchmarkConfigSchema = z.object({
 });
 export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
 
-// Admin-save payload: deciderModels carry no supportedApiKinds — the web
-// layer derives them from gateway provider definitions before forwarding the
-// full BenchmarkConfig to the benchmark worker.
-export const BenchmarkConfigUpdateSchema = BenchmarkConfigSchema.extend({
-  deciderModels: z.array(BenchmarkDeciderModelSchema.omit({ supportedApiKinds: true })).min(1),
-});
-export type BenchmarkConfigUpdate = z.infer<typeof BenchmarkConfigUpdateSchema>;
-
 export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']);
 export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
 
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index a7e12222e1..31915439ec 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -113,9 +113,8 @@ export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
 
 export const AutoRoutingDecisionResponseSchema = z.object({
   cost: z.number(),
-  // Null when classification failed, no routing table is published, or no
-  // table candidate supports the request's API kind; the gateway then falls
-  // back to its static balanced defaults.
+  // Null when classification failed or no routing table is published; the
+  // gateway then falls back to its static balanced defaults.
   decision: AutoRoutingDecisionSchema.nullable(),
   classifierResult: z
     .object({
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
index c1180b5371..edcd573b44 100644
--- a/packages/auto-routing-contracts/src/routing-table.test.ts
+++ b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -6,7 +6,6 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
   accuracy,
   avgCostUsd,
   meetsThreshold: false,
-  supportedApiKinds: ['chat_completions' as const],
 });
 
 describe('rankCandidates', () => {
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index 160742d6f9..84e0628106 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -10,7 +10,6 @@ export const RankedCandidateSchema = z.object({
   // Average observed OpenRouter cost per benchmark case, in USD credits.
   avgCostUsd: z.number().nonnegative(),
   meetsThreshold: z.boolean(),
-  supportedApiKinds: z.array(ClassifierApiKindSchema).min(1),
   // Reasoning effort the model was benchmarked with; serving mirrors it.
   // Optional so tables published before this field existed stay valid.
   reasoningEffort: ReasoningEffortSchema.nullable().optional(),
diff --git a/services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql b/services/auto-routing-benchmark/migrations/0000_amused_shard.sql
similarity index 85%
rename from services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql
rename to services/auto-routing-benchmark/migrations/0000_amused_shard.sql
index 991e7b9960..4889e7a415 100644
--- a/services/auto-routing-benchmark/migrations/0000_dusty_maginty.sql
+++ b/services/auto-routing-benchmark/migrations/0000_amused_shard.sql
@@ -45,10 +45,7 @@ CREATE TABLE `config_classifier_models` (
 --> statement-breakpoint
 CREATE TABLE `config_decider_models` (
 	`model` text PRIMARY KEY NOT NULL,
-	`reasoning_effort` text,
-	`supports_chat_completions` integer NOT NULL,
-	`supports_messages` integer NOT NULL,
-	`supports_responses` integer NOT NULL
+	`reasoning_effort` text
 );
 --> statement-breakpoint
 CREATE TABLE `model_summaries` (
@@ -74,9 +71,6 @@ CREATE TABLE `routing_table_candidates` (
 	`avg_cost_usd` real NOT NULL,
 	`meets_threshold` integer NOT NULL,
 	`reasoning_effort` text,
-	`supports_chat_completions` integer NOT NULL,
-	`supports_messages` integer NOT NULL,
-	`supports_responses` integer NOT NULL,
 	PRIMARY KEY(`run_id`, `tier`, `rank`)
 );
 --> statement-breakpoint
@@ -94,8 +88,5 @@ CREATE TABLE `run_models` (
 	`model` text NOT NULL,
 	`enqueued` integer NOT NULL,
 	`reasoning_effort` text,
-	`supports_chat_completions` integer NOT NULL,
-	`supports_messages` integer NOT NULL,
-	`supports_responses` integer NOT NULL,
 	PRIMARY KEY(`run_id`, `model`)
 );
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index f966b3d2ea..53a026135e 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "fcf958da-88cb-4c79-af3d-4709268e140c",
+  "id": "20295052-406c-424a-956f-77acc985f44a",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -293,27 +293,6 @@
           "primaryKey": false,
           "notNull": false,
           "autoincrement": false
-        },
-        "supports_chat_completions": {
-          "name": "supports_chat_completions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_messages": {
-          "name": "supports_messages",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_responses": {
-          "name": "supports_responses",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
         }
       },
       "indexes": {},
@@ -470,27 +449,6 @@
           "primaryKey": false,
           "notNull": false,
           "autoincrement": false
-        },
-        "supports_chat_completions": {
-          "name": "supports_chat_completions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_messages": {
-          "name": "supports_messages",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_responses": {
-          "name": "supports_responses",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
         }
       },
       "indexes": {},
@@ -590,27 +548,6 @@
           "primaryKey": false,
           "notNull": false,
           "autoincrement": false
-        },
-        "supports_chat_completions": {
-          "name": "supports_chat_completions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_messages": {
-          "name": "supports_messages",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "supports_responses": {
-          "name": "supports_responses",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
         }
       },
       "indexes": {},
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index 238f03ad45..c42129fb6b 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,8 +5,8 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781281890154,
-      "tag": "0000_dusty_maginty",
+      "when": 1781283444549,
+      "tag": "0000_amused_shard",
       "breakpoints": true
     }
   ]
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index e2ed0517bc..5981d6db2a 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -6,16 +6,8 @@ import type * as DbModule from './db';
 const TEST_CONFIG: BenchmarkConfig = {
   classifierModels: ['google/gemini-2.5-flash-lite', 'google/gemini-2.5-flash'],
   deciderModels: [
-    {
-      id: 'google/gemini-2.5-flash-lite',
-      supportedApiKinds: ['chat_completions'],
-      reasoningEffort: null,
-    },
-    {
-      id: 'anthropic/claude-sonnet-4.6',
-      supportedApiKinds: ['chat_completions', 'messages', 'responses'],
-      reasoningEffort: null,
-    },
+    { id: 'google/gemini-2.5-flash-lite', reasoningEffort: null },
+    { id: 'anthropic/claude-sonnet-4.6', reasoningEffort: null },
   ],
   minAccuracy: 0.7,
   switchCostFactor: 3,
@@ -40,9 +32,6 @@ const TEST_CONFIG_ROWS = {
   deciderModels: TEST_CONFIG.deciderModels.map(m => ({
     model: m.id,
     reasoning_effort: m.reasoningEffort ?? null,
-    supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
-    supports_messages: m.supportedApiKinds.includes('messages'),
-    supports_responses: m.supportedApiKinds.includes('responses'),
   })),
 };
 
@@ -178,9 +167,6 @@ describe('GET /admin/config', () => {
     const deciderModels = TEST_CONFIG.deciderModels.map(m => ({
       model: m.id,
       reasoning_effort: null,
-      supports_chat_completions: m.supportedApiKinds.includes('chat_completions'),
-      supports_messages: m.supportedApiKinds.includes('messages'),
-      supports_responses: m.supportedApiKinds.includes('responses'),
     }));
     vi.mocked(getConfigRows).mockResolvedValueOnce({
       config: {
@@ -342,7 +328,6 @@ describe('GET /admin/routing-table', () => {
       accuracy: 1,
       avgCostUsd: 0.1,
       meetsThreshold: true,
-      supportedApiKinds: ['chat_completions'],
     };
     const tableData = {
       version: 'test-v1',
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
index d4ffb884f2..ed9851b167 100644
--- a/services/auto-routing-benchmark/src/config.test.ts
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -16,9 +16,6 @@ const deciderRows: ConfigDeciderModelRow[] = [
   {
     model: 'some/decider',
     reasoning_effort: 'high',
-    supports_chat_completions: true,
-    supports_messages: true,
-    supports_responses: false,
   },
 ];
 
@@ -51,6 +48,5 @@ describe('mapConfigRows', () => {
     expect(result?.deciderModels).toHaveLength(1);
     expect(result?.deciderModels[0].id).toBe('some/decider');
     expect(result?.deciderModels[0].reasoningEffort).toBe('high');
-    expect(result?.deciderModels[0].supportedApiKinds).toEqual(['chat_completions', 'messages']);
   });
 });
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 3b00a905da..208484ce90 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -1,5 +1,5 @@
 import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts';
-import { apiKindsToFlags, getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db';
+import { getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db';
 
 // Maps the three normalized config tables to the BenchmarkConfig contract.
 // Null when no admin has saved a config yet — the worker never fabricates
@@ -24,11 +24,6 @@ export function mapConfigRows(
     classifierModels,
     deciderModels: deciderModelRows.map(r => ({
       id: r.model,
-      supportedApiKinds: [
-        ...(r.supports_chat_completions ? (['chat_completions'] as const) : []),
-        ...(r.supports_messages ? (['messages'] as const) : []),
-        ...(r.supports_responses ? (['responses'] as const) : []),
-      ],
       reasoningEffort:
         r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'],
     })),
@@ -57,7 +52,6 @@ export async function saveBenchmarkConfig(
   const deciderModelRows: ConfigDeciderModelRow[] = config.deciderModels.map(m => ({
     model: m.id,
     reasoning_effort: m.reasoningEffort ?? null,
-    ...apiKindsToFlags(m.supportedApiKinds),
   }));
 
   await replaceConfig(
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index c0981e073d..748b789737 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -21,9 +21,6 @@ export const configClassifierModels = sqliteTable('config_classifier_models', {
 export const configDeciderModels = sqliteTable('config_decider_models', {
   model: text('model').primaryKey(),
   reasoning_effort: text('reasoning_effort'),
-  supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
-  supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
-  supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
 });
 
 export const benchmarkRuns = sqliteTable('benchmark_runs', {
@@ -48,9 +45,6 @@ export const runModels = sqliteTable(
     // enqueued=false means the model was skipped (had prior results).
     enqueued: integer('enqueued', { mode: 'boolean' }).notNull(),
     reasoning_effort: text('reasoning_effort'),
-    supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
-    supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
-    supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
   },
   table => [primaryKey({ columns: [table.run_id, table.model] })]
 );
@@ -121,9 +115,6 @@ export const routingTableCandidates = sqliteTable(
     avg_cost_usd: real('avg_cost_usd').notNull(),
     meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
     reasoning_effort: text('reasoning_effort'),
-    supports_chat_completions: integer('supports_chat_completions', { mode: 'boolean' }).notNull(),
-    supports_messages: integer('supports_messages', { mode: 'boolean' }).notNull(),
-    supports_responses: integer('supports_responses', { mode: 'boolean' }).notNull(),
   },
   table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })]
 );
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 1385c5a26e..c668a14125 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -1,97 +1,9 @@
 import { describe, it, expect } from 'vitest';
 import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
 import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
-import {
-  apiKindsToFlags,
-  flagsToApiKinds,
-  mapRunRow,
-  mapSummaryRow,
-  routingTableToRows,
-  rowsToRoutingTable,
-} from './db';
+import { mapRunRow, mapSummaryRow, routingTableToRows, rowsToRoutingTable } from './db';
 import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
 
-// ---------------------------------------------------------------------------
-// apiKindsToFlags / flagsToApiKinds round-trip
-// ---------------------------------------------------------------------------
-
-describe('apiKindsToFlags', () => {
-  it('maps all three kinds to true when all present', () => {
-    expect(apiKindsToFlags(['chat_completions', 'messages', 'responses'])).toEqual({
-      supports_chat_completions: true,
-      supports_messages: true,
-      supports_responses: true,
-    });
-  });
-
-  it('maps an empty array to all false', () => {
-    expect(apiKindsToFlags([])).toEqual({
-      supports_chat_completions: false,
-      supports_messages: false,
-      supports_responses: false,
-    });
-  });
-
-  it('maps a single kind correctly', () => {
-    expect(apiKindsToFlags(['chat_completions'])).toEqual({
-      supports_chat_completions: true,
-      supports_messages: false,
-      supports_responses: false,
-    });
-  });
-});
-
-describe('flagsToApiKinds', () => {
-  it('returns all three kinds when all flags are true', () => {
-    expect(
-      flagsToApiKinds({
-        supports_chat_completions: true,
-        supports_messages: true,
-        supports_responses: true,
-      })
-    ).toEqual(['chat_completions', 'messages', 'responses']);
-  });
-
-  it('returns empty array when all flags are false', () => {
-    expect(
-      flagsToApiKinds({
-        supports_chat_completions: false,
-        supports_messages: false,
-        supports_responses: false,
-      })
-    ).toEqual([]);
-  });
-
-  it('returns only the set flags in order: chat_completions, messages, responses', () => {
-    expect(
-      flagsToApiKinds({
-        supports_chat_completions: false,
-        supports_messages: true,
-        supports_responses: true,
-      })
-    ).toEqual(['messages', 'responses']);
-  });
-});
-
-describe('apiKindsToFlags / flagsToApiKinds round-trip', () => {
-  const cases: Parameters<typeof apiKindsToFlags>[0][] = [
-    [],
-    ['chat_completions'],
-    ['messages'],
-    ['responses'],
-    ['chat_completions', 'messages'],
-    ['chat_completions', 'responses'],
-    ['messages', 'responses'],
-    ['chat_completions', 'messages', 'responses'],
-  ];
-
-  for (const kinds of cases) {
-    it(`round-trips [${kinds.join(', ')}]`, () => {
-      expect(flagsToApiKinds(apiKindsToFlags(kinds))).toEqual(kinds);
-    });
-  }
-});
-
 // ---------------------------------------------------------------------------
 // mapSummaryRow
 // ---------------------------------------------------------------------------
@@ -213,7 +125,6 @@ const candidate = (model: string): RankedCandidate => ({
   accuracy: 0.9,
   avgCostUsd: 0.001,
   meetsThreshold: true,
-  supportedApiKinds: ['chat_completions', 'messages'],
   reasoningEffort: null,
 });
 
@@ -250,14 +161,6 @@ describe('routingTableToRows', () => {
     expect(lowRows[1].model).toBe('model-b');
     expect(lowRows[1].rank).toBe(1);
   });
-
-  it('maps supportedApiKinds to boolean flags', () => {
-    const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
-    const row = candidateRows[0];
-    expect(row.supports_chat_completions).toBe(true);
-    expect(row.supports_messages).toBe(true);
-    expect(row.supports_responses).toBe(false);
-  });
 });
 
 describe('rowsToRoutingTable', () => {
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index c20398a8a4..5955c7c820 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -2,7 +2,6 @@ import type {
   BenchmarkKind,
   BenchmarkModelSummary,
   BenchmarkRun,
-  ClassifierApiKind,
   ClassifierWinner,
   RankedCandidate,
   RoutingTable,
@@ -30,36 +29,6 @@ export type RunModelRow = typeof runModels.$inferSelect;
 export type ConfigDeciderModelRow = typeof configDeciderModels.$inferSelect;
 type ModelSummaryRow = typeof modelSummaries.$inferSelect;
 
-// ---------------------------------------------------------------------------
-// ApiKind flag helpers
-// ---------------------------------------------------------------------------
-
-const ALL_API_KINDS: ClassifierApiKind[] = ['chat_completions', 'messages', 'responses'];
-
-export function apiKindsToFlags(kinds: ClassifierApiKind[]): {
-  supports_chat_completions: boolean;
-  supports_messages: boolean;
-  supports_responses: boolean;
-} {
-  return {
-    supports_chat_completions: kinds.includes('chat_completions'),
-    supports_messages: kinds.includes('messages'),
-    supports_responses: kinds.includes('responses'),
-  };
-}
-
-export function flagsToApiKinds(flags: {
-  supports_chat_completions: boolean;
-  supports_messages: boolean;
-  supports_responses: boolean;
-}): ClassifierApiKind[] {
-  return ALL_API_KINDS.filter(k => {
-    if (k === 'chat_completions') return flags.supports_chat_completions;
-    if (k === 'messages') return flags.supports_messages;
-    return flags.supports_responses;
-  });
-}
-
 // ---------------------------------------------------------------------------
 // Row mapping helpers
 // ---------------------------------------------------------------------------
@@ -435,7 +404,6 @@ export function routingTableToRows(
         avg_cost_usd: c.avgCostUsd,
         meets_threshold: c.meetsThreshold,
         reasoning_effort: c.reasoningEffort ?? null,
-        ...apiKindsToFlags(c.supportedApiKinds),
       });
     });
   }
@@ -459,7 +427,6 @@ export function rowsToRoutingTable(
       accuracy: row.accuracy,
       avgCostUsd: row.avg_cost_usd,
       meetsThreshold: row.meets_threshold,
-      supportedApiKinds: flagsToApiKinds(row),
       reasoningEffort: row.reasoning_effort as RankedCandidate['reasoningEffort'],
     });
   }
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 3ae9cd6d35..27c0f2d16f 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -6,13 +6,9 @@ import type {
 import { buildRoutingTable } from './routing-table-builder';
 
 const DECIDER_MODELS: BenchmarkDeciderModel[] = [
-  { id: 'model/cheap', supportedApiKinds: ['chat_completions'], reasoningEffort: null },
-  {
-    id: 'model/expensive',
-    supportedApiKinds: ['chat_completions', 'responses'],
-    reasoningEffort: null,
-  },
-  { id: 'model/mid', supportedApiKinds: ['chat_completions', 'messages'], reasoningEffort: null },
+  { id: 'model/cheap', reasoningEffort: null },
+  { id: 'model/expensive', reasoningEffort: 'medium' },
+  { id: 'model/mid', reasoningEffort: null },
 ];
 
 function summary(
@@ -135,7 +131,7 @@ describe('buildRoutingTable', () => {
     expect(highModels).toContain('model/mid');
   });
 
-  it('carries supportedApiKinds from the run snapshot', () => {
+  it('carries reasoningEffort from the run snapshot', () => {
     const table = buildRoutingTable({
       runId: 'test-run-4',
       generatedAt: '2026-01-01T00:00:00.000Z',
@@ -146,13 +142,13 @@ describe('buildRoutingTable', () => {
     });
 
     const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive');
-    expect(expensiveInLow?.supportedApiKinds).toEqual(['chat_completions', 'responses']);
+    expect(expensiveInLow?.reasoningEffort).toBe('medium');
 
     const midInLow = table.tiers.low.find(c => c.model === 'model/mid');
-    expect(midInLow?.supportedApiKinds).toEqual(['chat_completions', 'messages']);
+    expect(midInLow?.reasoningEffort).toBeNull();
   });
 
-  it('defaults supportedApiKinds to chat_completions when model missing from the snapshot', () => {
+  it('defaults reasoningEffort to null when model missing from the snapshot', () => {
     const summaries: BenchmarkModelSummary[] = [
       summary('model/unknown', 'low', 0.9),
       summary('model/cheap', 'low', 0.8),
@@ -172,7 +168,7 @@ describe('buildRoutingTable', () => {
     });
 
     const unknown = table.tiers.low.find(c => c.model === 'model/unknown');
-    expect(unknown?.supportedApiKinds).toEqual(['chat_completions']);
+    expect(unknown?.reasoningEffort).toBeNull();
   });
 
   it('throws when a tier has no candidates', () => {
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 03ea33399a..222f19436f 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -33,10 +33,6 @@ export function buildRoutingTable(params: {
           model: s.model,
           accuracy: s.accuracy,
           avgCostUsd: s.avgCostUsd ?? 0,
-          // Spread into a mutable array so tsgo is happy with the readonly type.
-          supportedApiKinds: [
-            ...(modelConfigById.get(s.model)?.supportedApiKinds ?? (['chat_completions'] as const)),
-          ],
           reasoningEffort: modelConfigById.get(s.model)?.reasoningEffort ?? null,
         })),
       minAccuracy
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index f0d3bb36e7..2dc33d5f0d 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -13,8 +13,6 @@ import { CLASSIFIER_CASES } from './datasets/classifier-cases';
 import { DECIDER_CASES } from './datasets/decider-cases';
 import type { RunModelRow } from './db';
 import {
-  apiKindsToFlags,
-  flagsToApiKinds,
   countCaseResults,
   getCaseResults,
   getLatestSummariesByModel,
@@ -107,27 +105,15 @@ export async function startRun(
   const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
 
   // Build run_models rows for ALL models of this run's kind.
-  const runModelRows: RunModelRow[] = models.map(modelId => {
-    if (kind === 'classifier') {
-      return {
-        run_id: runId,
-        model: modelId,
-        enqueued: enqueuedModelIds.includes(modelId),
-        reasoning_effort: null,
-        supports_chat_completions: false,
-        supports_messages: false,
-        supports_responses: false,
-      };
-    }
-    const deciderModel = config.deciderModels.find(m => m.id === modelId);
-    return {
-      run_id: runId,
-      model: modelId,
-      enqueued: enqueuedModelIds.includes(modelId),
-      reasoning_effort: deciderModel?.reasoningEffort ?? null,
-      ...apiKindsToFlags(deciderModel?.supportedApiKinds ?? ['chat_completions']),
-    };
-  });
+  const runModelRows: RunModelRow[] = models.map(modelId => ({
+    run_id: runId,
+    model: modelId,
+    enqueued: enqueuedModelIds.includes(modelId),
+    reasoning_effort:
+      kind === 'classifier'
+        ? null
+        : (config.deciderModels.find(m => m.id === modelId)?.reasoningEffort ?? null),
+  }));
 
   await insertRun(
     env.BENCH_DB,
@@ -486,7 +472,6 @@ async function finalizeRunIfComplete(
       // admin edit can't skew the published table.
       const deciderModels: BenchmarkDeciderModel[] = state.models.map(m => ({
         id: m.model,
-        supportedApiKinds: flagsToApiKinds(m),
         reasoningEffort: m.reasoning_effort as BenchmarkDeciderModel['reasoningEffort'],
       }));
       const table = buildRoutingTable({
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index 411f6a20c6..c69955ed2e 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -302,7 +302,7 @@ export const decideHandler: Handler<HonoEnv> = async c => {
     getStickyDecision(c.env, ctx.conversationKey),
   ]);
   if (cached) {
-    const decision = computeDecision(cached, payload.input.apiKind, routingTable, stickyModel);
+    const decision = computeDecision(cached, routingTable, stickyModel);
     if (decision) {
       c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
     }
@@ -331,12 +331,7 @@ export const decideHandler: Handler<HonoEnv> = async c => {
         )
       );
     }
-    const decision = computeDecision(
-      classifier.classification,
-      payload.input.apiKind,
-      routingTable,
-      stickyModel
-    );
+    const decision = computeDecision(classifier.classification, routingTable, stickyModel);
     if (decision) {
       c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
     }
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index 6cc0018223..b10fcc2e47 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -21,26 +21,17 @@ const table: RoutingTable = {
   source: 'benchmark',
   tiers: {
     low: [
-      {
-        model: 'cheap/messages-only',
-        accuracy: 0.9,
-        avgCostUsd: 0.001,
-        meetsThreshold: true,
-        supportedApiKinds: ['messages'],
-      },
       {
         model: 'cheap/chat',
         accuracy: 0.85,
         avgCostUsd: 0.002,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
       },
       {
         model: 'mid/chat',
         accuracy: 0.8,
         avgCostUsd: 0.005,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages'],
         reasoningEffort: 'medium',
       },
       {
@@ -48,14 +39,12 @@ const table: RoutingTable = {
         accuracy: 0.9,
         avgCostUsd: 0.02,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
       },
       {
         model: 'weak/chat',
         accuracy: 0.5,
         avgCostUsd: 0.003,
         meetsThreshold: false,
-        supportedApiKinds: ['chat_completions'],
       },
     ],
     medium: [
@@ -64,7 +53,6 @@ const table: RoutingTable = {
         accuracy: 0.8,
         avgCostUsd: 0.01,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages'],
       },
     ],
     high: [
@@ -73,15 +61,14 @@ const table: RoutingTable = {
         accuracy: 0.9,
         avgCostUsd: 0.1,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
       },
     ],
   },
 };
 
 describe('computeDecision', () => {
-  it('picks the first candidate supporting the request api kind', () => {
-    const decision = computeDecision(classification, 'chat_completions', table, null);
+  it('picks the first candidate of the tier', () => {
+    const decision = computeDecision(classification, table, null);
     expect(decision).toEqual({
       model: 'cheap/chat',
       tier: 'low',
@@ -98,20 +85,38 @@ describe('computeDecision', () => {
       contextComplexity: 'large',
       executionMode: 'multi_step_project',
     };
-    expect(computeDecision(hard, 'chat_completions', table, null)?.model).toBe('big/chat');
+    expect(computeDecision(hard, table, null)?.model).toBe('big/chat');
   });
-  it('returns null when no candidate supports the api kind', () => {
-    expect(computeDecision(classification, 'responses', table, null)).toBeNull();
+  it('returns a decision for every tier of a valid table', () => {
+    const byTier: Array<[ClassifierOutput, string]> = [
+      [classification, 'cheap/chat'],
+      [
+        { ...classification, reasoningComplexity: 'medium', contextComplexity: 'medium' },
+        'mid/chat',
+      ],
+      [
+        {
+          ...classification,
+          reasoningComplexity: 'high',
+          contextComplexity: 'large',
+          executionMode: 'multi_step_project',
+        },
+        'big/chat',
+      ],
+    ];
+    for (const [input, expected] of byTier) {
+      expect(computeDecision(input, table, null)?.model).toBe(expected);
+    }
   });
   it('returns null when there is no routing table', () => {
-    expect(computeDecision(classification, 'chat_completions', null, null)).toBeNull();
+    expect(computeDecision(classification, null, null)).toBeNull();
   });
 
   describe('session stickiness', () => {
     it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => {
       // Fresh pick cheap/chat at 0.002; mid/chat at 0.005 is not cheaper by
       // more than 3x (0.002 * 3 = 0.006 >= 0.005), so the session stays put.
-      const decision = computeDecision(classification, 'chat_completions', table, 'mid/chat');
+      const decision = computeDecision(classification, table, 'mid/chat');
       expect(decision).toEqual({
         model: 'mid/chat',
         tier: 'low',
@@ -130,43 +135,29 @@ describe('computeDecision', () => {
         tiers: {
           ...table.tiers,
           low: [
-            { ...table.tiers.low[1]!, model: 'fresh/chat', avgCostUsd: 1 },
-            { ...table.tiers.low[2]!, model: 'incumbent/chat', avgCostUsd: 3 },
+            { ...table.tiers.low[0]!, model: 'fresh/chat', avgCostUsd: 1 },
+            { ...table.tiers.low[1]!, model: 'incumbent/chat', avgCostUsd: 3 },
           ],
         },
       };
-      const decision = computeDecision(
-        classification,
-        'chat_completions',
-        boundaryTable,
-        'incumbent/chat'
-      );
+      const decision = computeDecision(classification, boundaryTable, 'incumbent/chat');
       expect(decision).toMatchObject({ model: 'incumbent/chat', sticky: true });
     });
     it('switches when the fresh pick is cheaper by more than the factor', () => {
       // pricey/chat at 0.02 vs fresh 0.002 * 3 = 0.006: switch pays off.
-      const decision = computeDecision(classification, 'chat_completions', table, 'pricey/chat');
+      const decision = computeDecision(classification, table, 'pricey/chat');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
     it('switches when the incumbent no longer meets the tier threshold', () => {
-      const decision = computeDecision(classification, 'chat_completions', table, 'weak/chat');
+      const decision = computeDecision(classification, table, 'weak/chat');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
     it('serves the fresh pick when the incumbent is not in the tier', () => {
-      const decision = computeDecision(classification, 'chat_completions', table, 'gone/model');
+      const decision = computeDecision(classification, table, 'gone/model');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
     it('is not sticky when the incumbent is the fresh pick', () => {
-      const decision = computeDecision(classification, 'chat_completions', table, 'cheap/chat');
-      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
-    });
-    it('serves the fresh pick when the incumbent does not support the api kind', () => {
-      const decision = computeDecision(
-        classification,
-        'chat_completions',
-        table,
-        'cheap/messages-only'
-      );
+      const decision = computeDecision(classification, table, 'cheap/chat');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
   });
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index 9f5b949616..0d641e069d 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -2,21 +2,20 @@ import {
   deriveDifficultyTier,
   type AutoRoutingDecision,
   type ClassifierOutput,
-  type NormalizedClassifierInput,
   type RoutingTable,
 } from '@kilocode/auto-routing-contracts';
 
 export function computeDecision(
   classification: ClassifierOutput,
-  apiKind: NormalizedClassifierInput['apiKind'],
   table: RoutingTable | null,
   incumbentModel: string | null
 ): AutoRoutingDecision | null {
   if (!table) return null;
   const tier = deriveDifficultyTier(classification);
   const candidates = table.tiers[tier];
-  const freshPick = candidates.find(c => c.supportedApiKinds.includes(apiKind));
-  if (!freshPick) return null;
+  // Every tier of a parsed table is non-empty (schema .min(1)), so given a
+  // table and a classification, a decision always exists.
+  const freshPick = candidates[0];
 
   // Keep the session on its incumbent model when it is still good enough for
   // the current tier. A model switch discards the provider's prompt cache,
@@ -25,9 +24,7 @@ export function computeDecision(
   // worth it when the fresh pick's recurring per-turn savings clearly exceed
   // that one-time penalty, i.e. it is cheaper by more than switchCostFactor.
   const incumbent =
-    incumbentModel === null
-      ? undefined
-      : candidates.find(c => c.model === incumbentModel && c.supportedApiKinds.includes(apiKind));
+    incumbentModel === null ? undefined : candidates.find(c => c.model === incumbentModel);
   if (
     incumbent &&
     incumbent.meetsThreshold &&
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 825c499daf..96682849b1 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -94,7 +94,6 @@ const benchmarkRoutingTable = {
         accuracy: 0.9,
         avgCostUsd: 0.001,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
         reasoningEffort: null,
       },
     ],
@@ -104,7 +103,6 @@ const benchmarkRoutingTable = {
         accuracy: 0.85,
         avgCostUsd: 0.002,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
         reasoningEffort: null,
       },
       // The high-tier model also qualifies for medium, within the 3x
@@ -115,7 +113,6 @@ const benchmarkRoutingTable = {
         accuracy: 0.8,
         avgCostUsd: 0.005,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
         reasoningEffort: null,
       },
     ],
@@ -125,7 +122,6 @@ const benchmarkRoutingTable = {
         accuracy: 0.8,
         avgCostUsd: 0.01,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
         reasoningEffort: null,
       },
     ],
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index ded4744e81..be60e909ab 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -15,7 +15,6 @@ const SAMPLE_TABLE: RoutingTable = {
         accuracy: 0.9,
         avgCostUsd: 0.001,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
         reasoningEffort: null,
       },
     ],
@@ -25,7 +24,6 @@ const SAMPLE_TABLE: RoutingTable = {
         accuracy: 0.85,
         avgCostUsd: 0.002,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions'],
         reasoningEffort: null,
       },
     ],
@@ -35,7 +33,6 @@ const SAMPLE_TABLE: RoutingTable = {
         accuracy: 0.8,
         avgCostUsd: 0.01,
         meetsThreshold: true,
-        supportedApiKinds: ['chat_completions', 'messages', 'responses'],
         reasoningEffort: null,
       },
     ],

From 427dcc208ff53325a84c3123f850f2f75ece27b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 20:37:13 +0200
Subject: [PATCH 62/73] fix(auto-routing): review-pass fixes

- never let a heuristic fallback classification re-anchor the session's
  sticky model (same trust rule as the classification cache)
- drop the dead ClassifierApiKindSchema export
- rename the decider pages-helper case so its id no longer collides with
  the classifier dataset's debug-fix-pagination-slice in shared telemetry
- trim a stale JSDoc in model-api-kinds.ts
---
 apps/web/src/lib/ai-gateway/model-api-kinds.ts                | 4 +---
 packages/auto-routing-contracts/src/routing-table.ts          | 2 --
 services/auto-routing-benchmark/src/datasets/decider-cases.ts | 4 +++-
 services/auto-routing/src/decide.ts                           | 4 +++-
 services/auto-routing/src/index.test.ts                       | 3 +++
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
index fce82c582c..77c00058bf 100644
--- a/apps/web/src/lib/ai-gateway/model-api-kinds.ts
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
@@ -12,9 +12,7 @@ const GATEWAY_CHAT_API_KINDS: readonly GatewayChatApiKind[] = [
  * The gateway chat API kinds the model's serving provider can speak, derived
  * from the provider the gateway would route it to. Mirrors get-provider.ts's
  * static fallback resolution — a Kilo-exclusive model is served by its
- * declared gateway, everything else by OpenRouter. The dynamic paths (BYOK,
- * custom LLMs, experiments, Vercel re-routing) never apply to auto-routing
- * benchmark candidates, which is the only consumer.
+ * declared gateway, everything else by OpenRouter.
  */
 export function gatewayChatApisForModel(modelId: string): ReadonlyArray<GatewayChatApiKind> {
   const exclusive = findKiloExclusiveModel(modelId);
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index 84e0628106..ff49e81578 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -1,8 +1,6 @@
 import * as z from 'zod';
 import { ReasoningEffortSchema } from './tiers';
 
-export const ClassifierApiKindSchema = z.enum(['chat_completions', 'responses', 'messages']);
-
 export const RankedCandidateSchema = z.object({
   model: z.string().trim().min(1),
   // Benchmark accuracy in [0, 1] for this tier.
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index 745cd423f4..fcb82a223f 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -204,7 +204,9 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
   },
   {
-    id: 'debug-fix-pagination-slice',
+    // 'pages' rather than 'pagination' so the id never collides with the
+    // classifier dataset's debug-fix-pagination-slice in shared telemetry.
+    id: 'debug-fix-pages-slice',
     tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'bug_fixing',
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index c69955ed2e..fd476a5668 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -332,7 +332,9 @@ export const decideHandler: Handler<HonoEnv> = async c => {
       );
     }
     const decision = computeDecision(classifier.classification, routingTable, stickyModel);
-    if (decision) {
+    // Like the classification cache, sticky state only trusts real classifier
+    // output: a heuristic fallback must not re-anchor the session's model.
+    if (decision && !classifier.fallback) {
       c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
     }
     recordDecision(
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 96682849b1..4519c7c310 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -457,6 +457,9 @@ describe('auto routing worker', () => {
       ],
       doubles: [expect.any(Number), 0.00000123, 0, 0],
     });
+    // A heuristic fallback classification is served but must not re-anchor
+    // the session's sticky model (same rule as the classification cache).
+    expect(cachePutEntry).not.toHaveBeenCalledWith('sticky', expect.anything());
   });
 
   it('makes no decision when no routing table is published', async () => {

From 053373be026dc54e16f3e56445a8497be5adf6e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 20:53:40 +0200
Subject: [PATCH 63/73] test(ai-gateway): add sticky field to decision fixture

---
 apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
index 2c0afcae0f..70d8e7e0c6 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -50,6 +50,7 @@ const validDecision = {
   tier: 'low' as const,
   source: 'benchmark' as const,
   tableVersion: 'v1',
+  sticky: false,
 };
 
 const validResponse = {

From b8a58926cf9e1b04f88efa35e5701738631fc40e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 21:06:43 +0200
Subject: [PATCH 64/73] feat(dev): move auto-routing workers into their own
 opt-in dev group

---
 dev/local/services.test.ts | 15 ++++++++++++---
 dev/local/services.ts      | 11 +++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/dev/local/services.test.ts b/dev/local/services.test.ts
index b7faf19c7f..327ff96a1d 100644
--- a/dev/local/services.test.ts
+++ b/dev/local/services.test.ts
@@ -4,15 +4,24 @@ import test from 'node:test';
 
 import { getAlwaysOnGroupIds, getService, resolveGroups } from './services';
 
-test('starts auto routing as a core dev service', () => {
+test('keeps auto routing workers in their own opt-in group', () => {
   const service = getService('auto-routing');
 
-  assert.equal(service.group, 'core');
+  assert.equal(service.group, 'auto-routing');
   assert.equal(service.type, 'worker');
   assert.equal(service.dir, 'services/auto-routing');
   assert.equal(service.port, 8810);
   assert.match(service.command.join(' '), /pnpm run dev/);
-  assert.ok(resolveGroups(getAlwaysOnGroupIds()).includes('auto-routing'));
+
+  const benchmark = getService('auto-routing-benchmark');
+  assert.equal(benchmark.group, 'auto-routing');
+  assert.equal(benchmark.type, 'worker');
+  assert.equal(benchmark.dir, 'services/auto-routing-benchmark');
+  assert.equal(benchmark.port, 8814);
+
+  const alwaysOn = resolveGroups(getAlwaysOnGroupIds());
+  assert.ok(!alwaysOn.includes('auto-routing'));
+  assert.ok(!alwaysOn.includes('auto-routing-benchmark'));
 });
 
 test('keeps auto routing package dev script compatible with local launcher flags', () => {
diff --git a/dev/local/services.ts b/dev/local/services.ts
index ac2a081187..0d434c3cee 100644
--- a/dev/local/services.ts
+++ b/dev/local/services.ts
@@ -48,6 +48,7 @@ const groups: ServiceGroup[] = [
   },
   { id: 'deploy', label: 'Deploy', alwaysOn: false },
   { id: 'observability', label: 'Observability', alwaysOn: false },
+  { id: 'auto-routing', label: 'Auto Routing', alwaysOn: false, sectionBreakBefore: true },
   { id: 'mobile', label: 'Mobile', alwaysOn: false, sectionBreakBefore: true },
   { id: 'storybook', label: 'Storybook', alwaysOn: false, sectionBreakBefore: true },
 ];
@@ -74,17 +75,23 @@ const serviceMeta: Record<string, ServiceMeta> = {
   // core
   nextjs: {
     group: 'core',
-    dependsOn: ['postgres', 'redis', 'redis-http', 'stripe', 'auto-routing'],
+    dependsOn: ['postgres', 'redis', 'redis-http', 'stripe'],
   },
   postgres: { group: 'core', dependsOn: [] },
   redis: { group: 'core', dependsOn: [] },
   'redis-http': { group: 'core', dependsOn: ['redis'] },
   stripe: { group: 'core', dependsOn: [] },
+  // auto-routing (kilo-auto/efficient decision engine + benchmark runner)
   'auto-routing': {
-    group: 'core',
+    group: 'auto-routing',
     dependsOn: [],
     dir: 'services/auto-routing',
   },
+  'auto-routing-benchmark': {
+    group: 'auto-routing',
+    dependsOn: [],
+    dir: 'services/auto-routing-benchmark',
+  },
   // cloud-agent
   'cloud-agent-next': {
     group: 'cloud-agent',

From 2f39419ad2cca7e9ba5cf93cfecba4aeba7aa2f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 21:57:20 +0200
Subject: [PATCH 65/73] fix(auto-routing): make the decider benchmark runnable
 in local dev

- Inject KILO_API_URL into the benchmark container via a new
  KILO_CLI_API_URL worker var so the kilo CLI targets the same gateway
  the worker mints tokens against (prod default: api.kilo.ai).
- Add .dev.vars.example mapping both URLs to the local apps/web dev
  server (worker-side localhost, container-side host.docker.internal).
- Add AUTO_ROUTING_BENCHMARK_WORKER_URL to the apps/web env example so
  the admin panel proxies to the local benchmark worker instead of prod.
- Work around wrangler force-pulling the amd64 container egress proxy
  on Apple Silicon (its transparent-proxy setsockopt crashes under
  emulation, failing every local container start) by pinning the arm64
  manifest digest via MINIFLARE_CONTAINER_EGRESS_IMAGE in the dev
  runner.
---
 apps/web/.env.development.local.example        |  3 +++
 dev/local/services.ts                          | 18 ++++++++++++++++++
 .../auto-routing-benchmark/.dev.vars.example   | 14 ++++++++++++++
 .../src/bench-runner-container.ts              |  4 ++++
 .../worker-configuration.d.ts                  |  7 ++++---
 services/auto-routing-benchmark/wrangler.jsonc |  4 ++++
 6 files changed, 47 insertions(+), 3 deletions(-)
 create mode 100644 services/auto-routing-benchmark/.dev.vars.example

diff --git a/apps/web/.env.development.local.example b/apps/web/.env.development.local.example
index c17511bfe9..ae816bb2c2 100644
--- a/apps/web/.env.development.local.example
+++ b/apps/web/.env.development.local.example
@@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791
 # @url auto-routing
 AUTO_ROUTING_WORKER_URL=http://localhost:8810
 
+# @url auto-routing-benchmark
+AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814
+
 # @url cloudflare-security-sync
 SECURITY_SYNC_WORKER_URL=http://localhost:8812
 
diff --git a/dev/local/services.ts b/dev/local/services.ts
index 0d434c3cee..5e377f8309 100644
--- a/dev/local/services.ts
+++ b/dev/local/services.ts
@@ -374,6 +374,23 @@ export function getAllInfraProfiles(): string[] {
   return [...new Set(Object.values(INFRA_PROFILES))];
 }
 
+// Wrangler always pulls its container egress-interceptor sidecar
+// (cloudflare/proxy-everything) with --platform linux/amd64. On Apple Silicon
+// the emulated amd64 proxy crashes at startup ("setsockopt: protocol not
+// available" — its transparent-proxy socket options don't survive Rosetta),
+// which surfaces as "Failed to start container" for every local container.
+// Point wrangler at the same proxy version's linux/arm64 manifest instead:
+// pulling a single-platform manifest digest with --platform amd64 only warns.
+// Keep the digest in sync with DEFAULT_CONTAINER_EGRESS_INTERCEPTOR_IMAGE in
+// the pinned wrangler/miniflare version (tag 3cb1195).
+const CONTAINER_EGRESS_IMAGE_ARM64 =
+  'cloudflare/proxy-everything:3cb1195@sha256:78c7910f4575a511d928d7824b1cbcaec6b7c4bf4dbb3fafaeeae3104030e73c';
+
+function containerEgressImageEnvPrefix(): string[] {
+  if (process.arch !== 'arm64') return [];
+  return ['env', `MINIFLARE_CONTAINER_EGRESS_IMAGE=${CONTAINER_EGRESS_IMAGE_ARM64}`];
+}
+
 function buildServiceDefs(): ServiceDef[] {
   const repoRoot = path.resolve(import.meta.dirname, '../..');
   const defs: ServiceDef[] = [];
@@ -520,6 +537,7 @@ function buildServiceDefs(): ServiceDef[] {
     const inspectorPort = port + 10000;
 
     const command = [
+      ...containerEgressImageEnvPrefix(),
       'pnpm',
       'run',
       'dev',
diff --git a/services/auto-routing-benchmark/.dev.vars.example b/services/auto-routing-benchmark/.dev.vars.example
new file mode 100644
index 0000000000..9f3063f8ad
--- /dev/null
+++ b/services/auto-routing-benchmark/.dev.vars.example
@@ -0,0 +1,14 @@
+# Base URL the worker uses for apps/web's /api/internal/* routes (decider
+# benchmark token mint). The worker process runs on the host, so localhost
+# reaches the local apps/web dev server directly.
+# @url nextjs
+KILO_WEB_API_BASE_URL=http://localhost:3000
+
+# Gateway base URL for the kilo CLI inside the benchmark container (injected
+# as KILO_API_URL). Containers cannot use localhost (that resolves to the
+# container itself). host.docker.internal works under OrbStack; on Docker
+# Desktop the wrangler container network may not get that mapping — use the
+# Docker Desktop host gateway IP http://192.168.65.254:3000 instead (same
+# convention as services/wasteland).
+# @url nextjs
+KILO_CLI_API_URL=http://host.docker.internal:3000
diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
index 3c7f5233f0..a3c712c4c7 100644
--- a/services/auto-routing-benchmark/src/bench-runner-container.ts
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -7,4 +7,8 @@ import { Container } from '@cloudflare/containers';
 export class BenchRunnerContainer extends Container<Env> {
   defaultPort = 3000;
   sleepAfter = '2m';
+  // The CLI resolves every gateway endpoint from KILO_API_URL. Production
+  // points at the real gateway; local dev overrides it via .dev.vars so the
+  // benchmark runs against the local apps/web instance.
+  envVars = { KILO_API_URL: this.env.KILO_CLI_API_URL };
 }
diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts
index a4c1d95ae1..b91e340c4e 100644
--- a/services/auto-routing-benchmark/worker-configuration.d.ts
+++ b/services/auto-routing-benchmark/worker-configuration.d.ts
@@ -1,12 +1,13 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: a4dd2037113d28278748c75791163aa0)
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: bb795d62b0d99d5132cd935146748ae9)
 interface __BaseEnv_Env {
 	AUTO_ROUTING_CONFIG: KVNamespace;
 	BENCH_DB: D1Database;
 	BENCH_QUEUE: Queue;
 	INTERNAL_API_SECRET_PROD: SecretsStoreSecret;
 	OPENROUTER_API_KEY: SecretsStoreSecret;
-	KILO_WEB_API_BASE_URL: "https://app.kilo.ai";
+	KILO_WEB_API_BASE_URL: string;
+	KILO_CLI_API_URL: string;
 	BENCH_RUNNER: DurableObjectNamespace<import("./src/index").BenchRunnerContainer>;
 }
 declare namespace Cloudflare {
@@ -21,5 +22,5 @@ type StringifyValues<EnvType extends Record<string, unknown>> = {
 	[Binding in keyof EnvType]: EnvType[Binding] extends string ? EnvType[Binding] : string;
 };
 declare namespace NodeJS {
-	interface ProcessEnv extends StringifyValues<Pick<Cloudflare.Env, "KILO_WEB_API_BASE_URL">> {}
+	interface ProcessEnv extends StringifyValues<Pick<Cloudflare.Env, "KILO_WEB_API_BASE_URL" | "KILO_CLI_API_URL">> {}
 }
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 088752febb..fbf655d599 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -15,6 +15,10 @@
     // Base URL for reaching apps/web's /api/internal/* routes. Other workers
     // that call apps/web internal endpoints use app.kilo.ai.
     "KILO_WEB_API_BASE_URL": "https://app.kilo.ai",
+    // Gateway base URL injected into the benchmark container as the kilo
+    // CLI's KILO_API_URL. Local dev overrides both vars via .dev.vars (see
+    // .dev.vars.example) so the decider benchmark runs fully locally.
+    "KILO_CLI_API_URL": "https://api.kilo.ai",
   },
   "containers": [
     {

From ae0cec5ceb6464dfe25e0f8c1806733e2603960d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Fri, 12 Jun 2026 22:27:05 +0200
Subject: [PATCH 66/73] fix(auto-routing): kill the whole CLI process tree on
 decider case timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kilo bin is a Node wrapper that spawns the real CLI binary as a
grandchild. SIGKILLing only the wrapper orphaned the grandchild on
timeout: it kept running (and spending) and held the stdout/stderr
pipes open, so 'close' never fired, the case promise never resolved,
and the chunk's queue message hung until the runtime cut it — then
retried from case 0 and eventually dead-lettered. Observed live: a
runaway agentic case ran 20+ minutes past the 180s cap and wedged the
whole run.

Spawn the CLI detached so it leads its own process group, kill the
group on timeout, and add an after-exit grace backstop so a stray
pipe-holder can never hang a case again.
---
 .../container/server.mjs                      | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index 91ef1a19c4..3a212cff1e 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -54,6 +54,11 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
       // Reasoning effort: forwarded as the CLI's provider-specific variant.
       if (typeof variant === 'string' && variant.length > 0) args.push('--variant', variant);
       args.push(prompt);
+      // detached: the `kilo` bin is a wrapper that spawns the real CLI binary
+      // as a grandchild. Killing only the wrapper orphans the grandchild: it
+      // keeps running (and spending) and holds the stdout/stderr pipes open,
+      // so 'close' never fires and the case hangs forever. A detached child
+      // leads its own process group, letting the timeout kill the whole tree.
       const child = spawn('kilo', args, {
         cwd: dir,
         env: {
@@ -62,11 +67,18 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
           NO_COLOR: '1',
         },
         stdio: ['ignore', 'pipe', 'pipe'],
+        detached: true,
       });
 
-      const killTimer = setTimeout(() => {
-        child.kill('SIGKILL');
-      }, timeoutMs);
+      const killProcessTree = () => {
+        // Negative pid = the child's whole process group (wrapper + real CLI).
+        try {
+          process.kill(-child.pid, 'SIGKILL');
+        } catch {
+          child.kill('SIGKILL');
+        }
+      };
+      const killTimer = setTimeout(killProcessTree, timeoutMs);
 
       child.stdout.on('data', chunk => {
         if (stdoutTruncated) return;
@@ -109,6 +121,13 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
       child.on('close', code => {
         void finish(code ?? -1);
       });
+      // Backstop for 'close' never firing: a stray process that survives the
+      // group kill (e.g. a tool process that moved to its own group) can hold
+      // the stdio pipes open indefinitely. After the child itself has exited,
+      // give the streams a short grace to flush, then finish regardless.
+      child.on('exit', code => {
+        setTimeout(() => void finish(code ?? -1), 5_000).unref();
+      });
     })();
   });
 }

From 4f04e0ad25d11ba949f1e644b4bdb332f5a651e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 01:52:47 +0200
Subject: [PATCH 67/73] feat(auto-routing): benchmark repetitions, p95 latency,
 and classifier latency gate

- Config gains classifierRepetitions, deciderRepetitions (1-5), and
  classifierMaxP95LatencyMs (null = no constraint); run rows snapshot the
  active repetition count and latency budget at start time.
- case_results PK extended with rep column; timed_out column added.
- model_summaries gains p95_latency_ms (nearest-rank p95 over all rows)
  and timeouts count.
- pickClassifierWinner enforces an optional p95 latency budget: candidates
  meeting both accuracy and latency are ranked by cost; when none meet the
  budget, falls back to lowest-p95 among accuracy-meeting models.
- classifier_winner contract surfaces the winner's p95LatencyMs.
- DECIDER_CHUNK_SIZE reduced from 10 to 5 to stay well within queue
  consumer wall-clock limits.
- Container server propagates timedOut flag through ContainerRunResponse
  and CliRunResult so timed-out cases are recorded in D1.
---
 .../auto-routing-contracts/src/benchmark.ts   |  11 +
 .../container/server.mjs                      |   7 +-
 .../migrations/0001_stormy_tarot.sql          |  32 +
 .../migrations/meta/0001_snapshot.json        | 648 ++++++++++++++++++
 .../migrations/meta/_journal.json             |   7 +
 .../auto-routing-benchmark/src/admin.test.ts  |  10 +
 .../auto-routing-benchmark/src/cli-runner.ts  |   3 +
 .../auto-routing-benchmark/src/config.test.ts |   6 +
 services/auto-routing-benchmark/src/config.ts |   9 +
 .../auto-routing-benchmark/src/db-schema.ts   |  13 +-
 .../auto-routing-benchmark/src/db.test.ts     |  14 +
 services/auto-routing-benchmark/src/db.ts     |  26 +-
 .../src/routing-table-builder.test.ts         |   2 +
 .../auto-routing-benchmark/src/run.test.ts    | 146 +++-
 services/auto-routing-benchmark/src/run.ts    | 139 ++--
 services/auto-routing-benchmark/src/winner.ts |  34 +-
 16 files changed, 1044 insertions(+), 63 deletions(-)
 create mode 100644 services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
 create mode 100644 services/auto-routing-benchmark/migrations/meta/0001_snapshot.json

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index f5b0d56479..92672bba59 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -34,6 +34,14 @@ export const BenchmarkConfigSchema = z.object({
   // cheaper than fresh input tokens), so switching only pays off when the
   // recurring savings clearly outweigh the cache-rebuild penalty.
   switchCostFactor: z.number().min(1).max(100),
+  // How many times to repeat each case for classifier / decider benchmarks.
+  // Repeated runs reduce variance; the default of 1 preserves the current
+  // single-pass behaviour.
+  classifierRepetitions: z.number().int().min(1).max(5).default(1),
+  deciderRepetitions: z.number().int().min(1).max(5).default(1),
+  // Maximum acceptable p95 latency for the classifier winner; null means no
+  // constraint (cost-only selection).
+  classifierMaxP95LatencyMs: z.number().int().positive().nullable().default(1000),
   updatedAt: z.string().nullable(),
   updatedBy: z.string().nullable(),
 });
@@ -50,8 +58,10 @@ export const BenchmarkModelSummarySchema = z.object({
   avgCostUsd: z.number().nullable(),
   avgLatencyMs: z.number(),
   p50LatencyMs: z.number().nullable(),
+  p95LatencyMs: z.number().nullable(),
   cases: z.number().int(),
   errors: z.number().int(),
+  timeouts: z.number().int().default(0),
 });
 export type BenchmarkModelSummary = z.infer<typeof BenchmarkModelSummarySchema>;
 
@@ -96,6 +106,7 @@ export const ClassifierWinnerSchema = z.object({
   model: z.string().trim().min(1),
   runId: z.string(),
   accuracy: z.number(),
+  p95LatencyMs: z.number().nullable().default(null),
   generatedAt: z.string(),
 });
 export type ClassifierWinner = z.infer<typeof ClassifierWinnerSchema>;
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
index 3a212cff1e..719c54d68e 100644
--- a/services/auto-routing-benchmark/container/server.mjs
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -45,6 +45,7 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
     void (async () => {
       const dir = await mkdtemp(join(tmpdir(), 'kilo-bench-'));
       const startedAt = Date.now();
+      let timedOut = false;
 
       let stdout = '';
       let stdoutTruncated = false;
@@ -78,7 +79,10 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
           child.kill('SIGKILL');
         }
       };
-      const killTimer = setTimeout(killProcessTree, timeoutMs);
+      const killTimer = setTimeout(() => {
+        timedOut = true;
+        killProcessTree();
+      }, timeoutMs);
 
       child.stdout.on('data', chunk => {
         if (stdoutTruncated) return;
@@ -111,6 +115,7 @@ function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
           durationMs: Date.now() - startedAt,
           stdoutLines,
           stderrTail: redactedStderrTail,
+          timedOut,
         });
       };
 
diff --git a/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql b/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
new file mode 100644
index 0000000000..7117c47ed3
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
@@ -0,0 +1,32 @@
+PRAGMA foreign_keys=OFF;--> statement-breakpoint
+CREATE TABLE `__new_case_results` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`case_id` text NOT NULL,
+	`tier` text,
+	`score` real NOT NULL,
+	`latency_ms` integer NOT NULL,
+	`cost_usd` real,
+	`error` text,
+	`fallback_reason` text,
+	`retried` integer,
+	`exit_code` integer,
+	`output_prefix` text,
+	`event_count` integer,
+	`last_event_types` text,
+	`rep` integer DEFAULT 0 NOT NULL,
+	`timed_out` integer DEFAULT 0 NOT NULL,
+	PRIMARY KEY(`run_id`, `model`, `case_id`, `rep`)
+);
+--> statement-breakpoint
+INSERT INTO `__new_case_results`("run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types") SELECT "run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types" FROM `case_results`;--> statement-breakpoint
+DROP TABLE `case_results`;--> statement-breakpoint
+ALTER TABLE `__new_case_results` RENAME TO `case_results`;--> statement-breakpoint
+PRAGMA foreign_keys=ON;--> statement-breakpoint
+ALTER TABLE `benchmark_config` ADD `classifier_repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
+ALTER TABLE `benchmark_config` ADD `decider_repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
+ALTER TABLE `benchmark_config` ADD `classifier_max_p95_latency_ms` integer;--> statement-breakpoint
+ALTER TABLE `benchmark_runs` ADD `repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
+ALTER TABLE `benchmark_runs` ADD `classifier_max_p95_latency_ms` integer;--> statement-breakpoint
+ALTER TABLE `model_summaries` ADD `p95_latency_ms` real;--> statement-breakpoint
+ALTER TABLE `model_summaries` ADD `timeouts` integer DEFAULT 0 NOT NULL;
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json
new file mode 100644
index 0000000000..bfda46aad3
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json
@@ -0,0 +1,648 @@
+{
+  "version": "6",
+  "dialect": "sqlite",
+  "id": "4a066f6c-0cac-485c-9489-fa4e0728622d",
+  "prevId": "20295052-406c-424a-956f-77acc985f44a",
+  "tables": {
+    "benchmark_config": {
+      "name": "benchmark_config",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "integer",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "classifier_repetitions": {
+          "name": "classifier_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "decider_repetitions": {
+          "name": "decider_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "updated_by": {
+          "name": "updated_by",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "benchmark_runs": {
+      "name": "benchmark_runs",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "kind": {
+          "name": "kind",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "status": {
+          "name": "status",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "started_at": {
+          "name": "started_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "completed_at": {
+          "name": "completed_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "repetitions": {
+          "name": "repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "case_results": {
+      "name": "case_results",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "case_id": {
+          "name": "case_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "score": {
+          "name": "score",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "latency_ms": {
+          "name": "latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "cost_usd": {
+          "name": "cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "fallback_reason": {
+          "name": "fallback_reason",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "retried": {
+          "name": "retried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "exit_code": {
+          "name": "exit_code",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "output_prefix": {
+          "name": "output_prefix",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "event_count": {
+          "name": "event_count",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "last_event_types": {
+          "name": "last_event_types",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "rep": {
+          "name": "rep",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
+        "timed_out": {
+          "name": "timed_out",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "case_results_run_id_model_case_id_rep_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "case_id",
+            "rep"
+          ],
+          "name": "case_results_run_id_model_case_id_rep_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_classifier_models": {
+      "name": "config_classifier_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_decider_models": {
+      "name": "config_decider_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "model_summaries": {
+      "name": "model_summaries",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "avg_latency_ms": {
+          "name": "avg_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "p50_latency_ms": {
+          "name": "p50_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "cases": {
+          "name": "cases",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "errors": {
+          "name": "errors",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "p95_latency_ms": {
+          "name": "p95_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "timeouts": {
+          "name": "timeouts",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
+        "carried": {
+          "name": "carried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "model_summaries_run_id_model_tier_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "tier"
+          ],
+          "name": "model_summaries_run_id_model_tier_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_table_candidates": {
+      "name": "routing_table_candidates",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "rank": {
+          "name": "rank",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "meets_threshold": {
+          "name": "meets_threshold",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "routing_table_candidates_run_id_tier_rank_pk": {
+          "columns": [
+            "run_id",
+            "tier",
+            "rank"
+          ],
+          "name": "routing_table_candidates_run_id_tier_rank_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_tables": {
+      "name": "routing_tables",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "published_at": {
+          "name": "published_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "generated_at": {
+          "name": "generated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "source": {
+          "name": "source",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "run_models": {
+      "name": "run_models",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "enqueued": {
+          "name": "enqueued",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "run_models_run_id_model_pk": {
+          "columns": [
+            "run_id",
+            "model"
+          ],
+          "name": "run_models_run_id_model_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    }
+  },
+  "views": {},
+  "enums": {},
+  "_meta": {
+    "schemas": {},
+    "tables": {},
+    "columns": {}
+  },
+  "internal": {
+    "indexes": {}
+  }
+}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index c42129fb6b..7ec4e5d40d 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -8,6 +8,13 @@
       "when": 1781283444549,
       "tag": "0000_amused_shard",
       "breakpoints": true
+    },
+    {
+      "idx": 1,
+      "version": "6",
+      "when": 1781307943215,
+      "tag": "0001_stormy_tarot",
+      "breakpoints": true
     }
   ]
 }
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 5981d6db2a..7c0cc73164 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -13,6 +13,9 @@ const TEST_CONFIG: BenchmarkConfig = {
   switchCostFactor: 3,
   maxConcurrency: 4,
   benchmarkUserId: null,
+  classifierRepetitions: 1,
+  deciderRepetitions: 1,
+  classifierMaxP95LatencyMs: 1000,
   updatedAt: null,
   updatedBy: null,
 };
@@ -25,6 +28,9 @@ const TEST_CONFIG_ROWS = {
     switch_cost_factor: TEST_CONFIG.switchCostFactor,
     max_concurrency: TEST_CONFIG.maxConcurrency,
     benchmark_user_id: TEST_CONFIG.benchmarkUserId,
+    classifier_repetitions: TEST_CONFIG.classifierRepetitions,
+    decider_repetitions: TEST_CONFIG.deciderRepetitions,
+    classifier_max_p95_latency_ms: TEST_CONFIG.classifierMaxP95LatencyMs,
     updated_at: '2026-06-01T00:00:00.000Z',
     updated_by: null,
   },
@@ -175,6 +181,9 @@ describe('GET /admin/config', () => {
         switch_cost_factor: 3,
         max_concurrency: 4,
         benchmark_user_id: null,
+        classifier_repetitions: 1,
+        decider_repetitions: 1,
+        classifier_max_p95_latency_ms: null,
         updated_at: '2026-06-01T00:00:00.000Z',
         updated_by: 'admin@example.com',
       },
@@ -367,6 +376,7 @@ describe('GET /admin/classifier-winner', () => {
       model: 'google/gemini-2.5-flash-lite',
       runId: 'classifier-2026-06-01T00-00-00-000Z',
       accuracy: 0.92,
+      p95LatencyMs: null,
       generatedAt: '2026-06-01T10:00:00.000Z',
     };
     vi.mocked(getClassifierWinner).mockResolvedValueOnce(winner);
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index 0eef033f8c..9f22cb3695 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -9,6 +9,7 @@ export type CliRunResult = {
   stderrTail: string;
   eventCount: number;
   lastEventTypes: string[];
+  timedOut: boolean;
 };
 
 const DECIDER_CLI_TIMEOUT_MS = 180_000;
@@ -24,6 +25,7 @@ type ContainerRunResponse = {
   durationMs: number;
   stdoutLines: string[];
   stderrTail: string;
+  timedOut?: boolean;
 };
 
 /**
@@ -79,6 +81,7 @@ export async function runDeciderCaseViaCli(
     stderrTail: body.stderrTail ?? '',
     eventCount,
     lastEventTypes,
+    timedOut: body.timedOut ?? false,
   };
 }
 
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
index ed9851b167..02bf051239 100644
--- a/services/auto-routing-benchmark/src/config.test.ts
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -8,6 +8,9 @@ const configRow = {
   switch_cost_factor: 3,
   max_concurrency: 8,
   benchmark_user_id: 'user-123',
+  classifier_repetitions: 1,
+  decider_repetitions: 1,
+  classifier_max_p95_latency_ms: null,
   updated_at: '2026-06-01T00:00:00.000Z',
   updated_by: 'admin@example.com',
 };
@@ -48,5 +51,8 @@ describe('mapConfigRows', () => {
     expect(result?.deciderModels).toHaveLength(1);
     expect(result?.deciderModels[0].id).toBe('some/decider');
     expect(result?.deciderModels[0].reasoningEffort).toBe('high');
+    expect(result?.classifierRepetitions).toBe(1);
+    expect(result?.deciderRepetitions).toBe(1);
+    expect(result?.classifierMaxP95LatencyMs).toBeNull();
   });
 });
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
index 208484ce90..99f34e2cb6 100644
--- a/services/auto-routing-benchmark/src/config.ts
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -10,6 +10,9 @@ export function mapConfigRows(
     switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
+    classifier_repetitions: number;
+    decider_repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
     updated_at: string;
     updated_by: string | null;
   } | null,
@@ -31,6 +34,9 @@ export function mapConfigRows(
     switchCostFactor: configRow.switch_cost_factor,
     maxConcurrency: configRow.max_concurrency,
     benchmarkUserId: configRow.benchmark_user_id,
+    classifierRepetitions: configRow.classifier_repetitions,
+    deciderRepetitions: configRow.decider_repetitions,
+    classifierMaxP95LatencyMs: configRow.classifier_max_p95_latency_ms,
     updatedAt: configRow.updated_at,
     updatedBy: configRow.updated_by,
   };
@@ -61,6 +67,9 @@ export async function saveBenchmarkConfig(
       switch_cost_factor: config.switchCostFactor,
       max_concurrency: config.maxConcurrency,
       benchmark_user_id: config.benchmarkUserId,
+      classifier_repetitions: config.classifierRepetitions,
+      decider_repetitions: config.deciderRepetitions,
+      classifier_max_p95_latency_ms: config.classifierMaxP95LatencyMs,
       updated_at: updatedAt,
       updated_by: updatedBy,
     },
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 748b789737..c99d2bad2a 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -10,6 +10,9 @@ export const benchmarkConfig = sqliteTable('benchmark_config', {
   switch_cost_factor: real('switch_cost_factor').notNull(),
   max_concurrency: integer('max_concurrency').notNull(),
   benchmark_user_id: text('benchmark_user_id'),
+  classifier_repetitions: integer('classifier_repetitions').notNull().default(1),
+  decider_repetitions: integer('decider_repetitions').notNull().default(1),
+  classifier_max_p95_latency_ms: integer('classifier_max_p95_latency_ms'),
   updated_at: text('updated_at').notNull(),
   updated_by: text('updated_by'),
 });
@@ -35,6 +38,8 @@ export const benchmarkRuns = sqliteTable('benchmark_runs', {
   switch_cost_factor: real('switch_cost_factor').notNull(),
   max_concurrency: integer('max_concurrency').notNull(),
   benchmark_user_id: text('benchmark_user_id'),
+  repetitions: integer('repetitions').notNull().default(1),
+  classifier_max_p95_latency_ms: integer('classifier_max_p95_latency_ms'),
 });
 
 export const runModels = sqliteTable(
@@ -61,6 +66,8 @@ export const modelSummaries = sqliteTable(
     p50_latency_ms: real('p50_latency_ms'),
     cases: integer('cases').notNull(),
     errors: integer('errors').notNull(),
+    p95_latency_ms: real('p95_latency_ms'),
+    timeouts: integer('timeouts').notNull().default(0),
     // carried=true rows are prior-run summaries copied in at startRun for skipped models.
     carried: integer('carried', { mode: 'boolean' }).notNull().default(false),
   },
@@ -86,10 +93,14 @@ export const caseResults = sqliteTable(
     output_prefix: text('output_prefix'),
     event_count: integer('event_count'),
     last_event_types: text('last_event_types'),
+    // Repetition index (0-based); together with run_id/model/case_id forms the PK.
+    rep: integer('rep').notNull().default(0),
+    // 1 when the case was killed by the wall-clock timeout, 0 otherwise.
+    timed_out: integer('timed_out').notNull().default(0),
   },
   // The composite PK's leftmost column already serves run_id-prefix lookups
   // (count/fetch by run); no separate run_id index is needed.
-  table => [primaryKey({ columns: [table.run_id, table.model, table.case_id] })]
+  table => [primaryKey({ columns: [table.run_id, table.model, table.case_id, table.rep] })]
 );
 
 export const routingTables = sqliteTable('routing_tables', {
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index c668a14125..23ff1d6d32 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -18,8 +18,10 @@ describe('mapSummaryRow', () => {
       avg_cost_usd: 0.0015,
       avg_latency_ms: 320.5,
       p50_latency_ms: 300.0,
+      p95_latency_ms: 300.0,
       cases: 50,
       errors: 2,
+      timeouts: 0,
       carried: false,
     };
     const result = mapSummaryRow(row);
@@ -30,8 +32,10 @@ describe('mapSummaryRow', () => {
       avgCostUsd: 0.0015,
       avgLatencyMs: 320.5,
       p50LatencyMs: 300.0,
+      p95LatencyMs: 300.0,
       cases: 50,
       errors: 2,
+      timeouts: 0,
     });
   });
 
@@ -44,15 +48,19 @@ describe('mapSummaryRow', () => {
       avg_cost_usd: null,
       avg_latency_ms: 150.0,
       p50_latency_ms: null,
+      p95_latency_ms: null,
       cases: 30,
       errors: 0,
+      timeouts: 0,
       carried: false,
     };
     const result = mapSummaryRow(row);
     expect(result.avgCostUsd).toBeNull();
     expect(result.p50LatencyMs).toBeNull();
+    expect(result.p95LatencyMs).toBeNull();
     expect(result.tier).toBe('*');
     expect(result.errors).toBe(0);
+    expect(result.timeouts).toBe(0);
   });
 });
 
@@ -73,6 +81,8 @@ describe('mapRunRow', () => {
       switch_cost_factor: 3,
       max_concurrency: 4,
       benchmark_user_id: null,
+      repetitions: 1,
+      classifier_max_p95_latency_ms: null,
     };
     const summaries: BenchmarkModelSummary[] = [
       {
@@ -82,8 +92,10 @@ describe('mapRunRow', () => {
         avgCostUsd: 0.0002,
         avgLatencyMs: 120,
         p50LatencyMs: 110,
+        p95LatencyMs: null,
         cases: 100,
         errors: 5,
+        timeouts: 0,
       },
     ];
     const result = mapRunRow(runRow, summaries);
@@ -109,6 +121,8 @@ describe('mapRunRow', () => {
       switch_cost_factor: 3,
       max_concurrency: 4,
       benchmark_user_id: null,
+      repetitions: 1,
+      classifier_max_p95_latency_ms: null,
     };
     const result = mapRunRow(runRow, []);
     expect(result.summaries).toEqual([]);
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 5955c7c820..8e4696c383 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -41,8 +41,10 @@ export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
     avgCostUsd: row.avg_cost_usd,
     avgLatencyMs: row.avg_latency_ms,
     p50LatencyMs: row.p50_latency_ms,
+    p95LatencyMs: row.p95_latency_ms,
     cases: row.cases,
     errors: row.errors,
+    timeouts: row.timeouts,
   };
 }
 
@@ -87,6 +89,9 @@ export async function replaceConfig(
     switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
+    classifier_repetitions: number;
+    decider_repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
     updated_at: string;
     updated_by: string | null;
   },
@@ -130,6 +135,8 @@ export async function insertRun(
     switch_cost_factor: number;
     max_concurrency: number;
     benchmark_user_id: string | null;
+    repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
   },
   models: RunModelRow[],
   carriedSummaries: BenchmarkModelSummary[]
@@ -144,6 +151,8 @@ export async function insertRun(
     switch_cost_factor: run.switch_cost_factor,
     max_concurrency: run.max_concurrency,
     benchmark_user_id: run.benchmark_user_id,
+    repetitions: run.repetitions,
+    classifier_max_p95_latency_ms: run.classifier_max_p95_latency_ms,
   });
 
   if (models.length === 0 && carriedSummaries.length === 0) {
@@ -168,8 +177,10 @@ export async function insertRun(
           avg_cost_usd: s.avgCostUsd,
           avg_latency_ms: s.avgLatencyMs,
           p50_latency_ms: s.p50LatencyMs,
+          p95_latency_ms: s.p95LatencyMs,
           cases: s.cases,
           errors: s.errors,
+          timeouts: s.timeouts,
           carried: true,
         }))
       )
@@ -201,7 +212,7 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom
     .insert(caseResults)
     .values(row)
     .onConflictDoUpdate({
-      target: [caseResults.run_id, caseResults.model, caseResults.case_id],
+      target: [caseResults.run_id, caseResults.model, caseResults.case_id, caseResults.rep],
       set: {
         tier: row.tier,
         score: row.score,
@@ -214,6 +225,8 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom
         output_prefix: row.output_prefix,
         event_count: row.event_count,
         last_event_types: row.last_event_types,
+        rep: row.rep,
+        timed_out: row.timed_out,
       },
     });
 }
@@ -261,8 +274,10 @@ export async function replaceModelSummaries(
         avg_cost_usd: s.avgCostUsd,
         avg_latency_ms: s.avgLatencyMs,
         p50_latency_ms: s.p50LatencyMs,
+        p95_latency_ms: s.p95LatencyMs,
         cases: s.cases,
         errors: s.errors,
+        timeouts: s.timeouts,
         carried: false,
       }))
     ),
@@ -349,8 +364,10 @@ export async function getLatestSummariesByModel(
       avg_cost_usd: modelSummaries.avg_cost_usd,
       avg_latency_ms: modelSummaries.avg_latency_ms,
       p50_latency_ms: modelSummaries.p50_latency_ms,
+      p95_latency_ms: modelSummaries.p95_latency_ms,
       cases: modelSummaries.cases,
       errors: modelSummaries.errors,
+      timeouts: modelSummaries.timeouts,
       carried: modelSummaries.carried,
     })
     .from(modelSummaries)
@@ -535,13 +552,18 @@ export async function getClassifierWinner(db: D1Database): Promise<ClassifierWin
     .where(and(eq(modelSummaries.run_id, runRow.id), eq(modelSummaries.tier, '*')));
 
   const summaries = summaryRows.map(mapSummaryRow);
-  const winner = pickClassifierWinner(summaries, runRow.min_accuracy);
+  const winner = pickClassifierWinner(
+    summaries,
+    runRow.min_accuracy,
+    runRow.classifier_max_p95_latency_ms
+  );
   if (!winner) return null;
 
   return {
     model: winner.model,
     runId: runRow.id,
     accuracy: winner.accuracy,
+    p95LatencyMs: winner.p95LatencyMs,
     generatedAt: runRow.completed_at ?? new Date().toISOString(),
   };
 }
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 27c0f2d16f..8c124ee496 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -24,8 +24,10 @@ function summary(
     avgCostUsd,
     avgLatencyMs: 500,
     p50LatencyMs: 450,
+    p95LatencyMs: null,
     cases: 10,
     errors: 0,
+    timeouts: 0,
   };
 }
 
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 11aec2cb3b..707b7330bc 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 import type { CaseResultRow } from './db';
-import { chunkArray, runCasesWithConcurrency, summarize } from './run';
+import { BenchmarkJobMessageSchema, chunkArray, runCasesWithConcurrency, summarize } from './run';
 import { pickClassifierWinner } from './winner';
 
 function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
@@ -19,6 +19,8 @@ function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
     output_prefix: null,
     event_count: null,
     last_event_types: null,
+    rep: 0,
+    timed_out: 0,
     ...overrides,
   };
 }
@@ -211,12 +213,12 @@ describe('runCasesWithConcurrency', () => {
 });
 
 describe('chunkArray', () => {
-  it('splits into 10-per-chunk with a partial final chunk', () => {
-    const items = Array.from({ length: 23 }, (_, i) => i);
-    const chunks = chunkArray(items, 10);
+  it('splits into 5-per-chunk with a partial final chunk', () => {
+    const items = Array.from({ length: 13 }, (_, i) => i);
+    const chunks = chunkArray(items, 5);
     expect(chunks).toHaveLength(3);
-    expect(chunks[0]).toHaveLength(10);
-    expect(chunks[1]).toHaveLength(10);
+    expect(chunks[0]).toHaveLength(5);
+    expect(chunks[1]).toHaveLength(5);
     expect(chunks[2]).toHaveLength(3);
   });
 
@@ -246,8 +248,10 @@ describe('pickClassifierWinner', () => {
     avgCostUsd,
     avgLatencyMs: 100,
     p50LatencyMs: 90,
+    p95LatencyMs: 90,
     cases: 36,
     errors: 0,
+    timeouts: 0,
   });
 
   it('picks the cheapest model meeting the threshold', () => {
@@ -277,4 +281,134 @@ describe('pickClassifierWinner', () => {
     ).toBeNull();
     expect(pickClassifierWinner([], 0.7)).toBeNull();
   });
+
+  // helper with explicit p95LatencyMs
+  const summaryWithLatency = (
+    model: string,
+    accuracy: number,
+    avgCostUsd: number | null,
+    p95: number | null = 90
+  ) => ({
+    model,
+    tier: '*' as const,
+    accuracy,
+    avgCostUsd,
+    avgLatencyMs: 100,
+    p50LatencyMs: 80,
+    p95LatencyMs: p95,
+    timeouts: 0,
+    cases: 36,
+    errors: 0,
+  });
+
+  it('latency gate: picks cheapest within budget when both meet accuracy and latency', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('fast-cheap', 0.9, 0.001, 800),
+        summaryWithLatency('fast-pricy', 0.95, 0.01, 500),
+        summaryWithLatency('slow', 0.9, 0.0005, 1500),
+      ],
+      0.7,
+      1000
+    );
+    expect(winner?.model).toBe('fast-cheap');
+  });
+
+  it('latency gate fallback: picks lowest p95 among accuracy-meeting when none in budget', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('almost', 0.9, 0.001, 1200),
+        summaryWithLatency('closest', 0.85, 0.002, 1100),
+        summaryWithLatency('way-off', 0.9, 0.0005, 2000),
+      ],
+      0.8,
+      1000
+    );
+    expect(winner?.model).toBe('closest');
+  });
+
+  it('null budget disables latency gate', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('cheap-slow', 0.9, 0.001, 5000),
+        summaryWithLatency('pricy-fast', 0.95, 0.01, 100),
+      ],
+      0.7,
+      null
+    );
+    expect(winner?.model).toBe('cheap-slow');
+  });
+
+  it('null p95 on summary fails non-null latency constraint', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('no-p95', 0.9, 0.001, null),
+        summaryWithLatency('has-p95', 0.85, 0.01, 800),
+      ],
+      0.7,
+      1000
+    );
+    // no-p95 fails the gate (null p95 cannot meet non-null constraint)
+    // has-p95 meets both → wins
+    expect(winner?.model).toBe('has-p95');
+  });
+});
+
+describe('summarize — p95 and timeouts', () => {
+  it('computes p95LatencyMs using nearest-rank formula', () => {
+    // 20 rows, sorted latencies at 95th percentile: ceil(0.95*20)-1 = 18
+    const rows = Array.from({ length: 20 }, (_, i) =>
+      makeRow({ case_id: `c${i}`, latency_ms: (i + 1) * 100 })
+    );
+    const [s] = summarize(rows, 'classifier');
+    // sorted latencies: [100, 200, ..., 2000], index 18 = 1900
+    expect(s.p95LatencyMs).toBe(1900);
+  });
+
+  it('counts timeouts', () => {
+    const rows = [
+      makeRow({ case_id: 'c1', timed_out: 1 }),
+      makeRow({ case_id: 'c2', timed_out: 0 }),
+      makeRow({ case_id: 'c3', timed_out: 1 }),
+    ];
+    const [s] = summarize(rows, 'classifier');
+    expect(s.timeouts).toBe(2);
+  });
+
+  it('aggregates multi-rep rows correctly (same case_id different rep)', () => {
+    const rows = [
+      makeRow({ case_id: 'c1', rep: 0, score: 1, latency_ms: 100 }),
+      makeRow({ case_id: 'c1', rep: 1, score: 0, latency_ms: 200 }),
+      makeRow({ case_id: 'c2', rep: 0, score: 1, latency_ms: 150 }),
+      makeRow({ case_id: 'c2', rep: 1, score: 1, latency_ms: 250 }),
+    ];
+    const [s] = summarize(rows, 'classifier');
+    expect(s.cases).toBe(4);
+    expect(s.accuracy).toBe(0.75);
+  });
+});
+
+describe('decider message fan-out', () => {
+  it('DECIDER_CHUNK_SIZE is 5 (chunk count for 76 cases)', () => {
+    // DECIDER_CASES = 76, chunk size 5 → ceil(76/5) = 16 chunks
+    const chunks = chunkArray(
+      Array.from({ length: 76 }, (_, i) => String(i)),
+      5
+    );
+    expect(chunks).toHaveLength(16);
+  });
+
+  it('message schema accepts and defaults rep', () => {
+    const msg = BenchmarkJobMessageSchema.parse({ runId: 'r1', kind: 'classifier', model: 'm1' });
+    expect(msg.rep).toBeUndefined();
+    const withRep = BenchmarkJobMessageSchema.parse({
+      runId: 'r1',
+      kind: 'decider',
+      model: 'm1',
+      rep: 2,
+      caseIds: ['a'],
+      chunk: 0,
+    });
+    expect(withRep.rep).toBe(2);
+  });
 });
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 2dc33d5f0d..2a71dde05a 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -40,6 +40,8 @@ export type BenchmarkJobMessage = {
   // index used to key the container instance. Absent for classifier messages.
   caseIds?: string[];
   chunk?: number;
+  // Repetition index (0-based). Absent for classifier messages.
+  rep?: number;
 };
 
 export const BenchmarkJobMessageSchema = z.object({
@@ -48,12 +50,13 @@ export const BenchmarkJobMessageSchema = z.object({
   model: z.string().min(1),
   caseIds: z.array(z.string().min(1)).optional(),
   chunk: z.number().int().min(0).optional(),
+  rep: z.number().int().min(0).optional(),
 });
 
 // Decider cases run through the real `kilo` CLI in a container (up to ~3 min
 // each). Chunking caps how many cases a single queue invocation processes so
 // each stays well under CF's wall-clock limit.
-const DECIDER_CHUNK_SIZE = 10;
+const DECIDER_CHUNK_SIZE = 5;
 
 export function chunkArray<T>(items: readonly T[], size: number): T[][] {
   const chunks: T[][] = [];
@@ -81,6 +84,8 @@ export async function startRun(
   if (!config) {
     throw new Error('benchmark config not set: save it in the admin panel before starting a run');
   }
+  const repetitions =
+    kind === 'classifier' ? config.classifierRepetitions : config.deciderRepetitions;
   const models =
     kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
 
@@ -125,6 +130,9 @@ export async function startRun(
       switch_cost_factor: config.switchCostFactor,
       max_concurrency: config.maxConcurrency,
       benchmark_user_id: config.benchmarkUserId,
+      repetitions,
+      classifier_max_p95_latency_ms:
+        kind === 'classifier' ? config.classifierMaxP95LatencyMs : null,
     },
     runModelRows,
     carriedSummaries
@@ -151,6 +159,8 @@ export async function startRun(
       switchCostFactor: config.switchCostFactor,
       benchmarkUserId: config.benchmarkUserId,
       models: runModelRows,
+      repetitions,
+      classifierMaxP95LatencyMs: kind === 'classifier' ? config.classifierMaxP95LatencyMs : null,
     });
     return { runId, enqueuedModels: 0, skippedModels };
   }
@@ -164,19 +174,22 @@ export async function startRun(
     return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
   }
 
-  // Decider: one message per (model, chunk) so each queue invocation stays
-  // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES rows.
+  // Decider: one message per (model, rep, chunk) so each queue invocation stays
+  // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows.
   const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
   const messages = enqueuedModelIds.flatMap(model =>
-    chunks.map((chunkCases, chunk) => ({
-      body: {
-        runId,
-        kind,
-        model,
-        chunk,
-        caseIds: chunkCases.map(c => c.id),
-      } satisfies BenchmarkJobMessage,
-    }))
+    Array.from({ length: repetitions }, (_, rep) =>
+      chunks.map((chunkCases, chunk) => ({
+        body: {
+          runId,
+          kind,
+          model,
+          chunk,
+          rep,
+          caseIds: chunkCases.map(c => c.id),
+        } satisfies BenchmarkJobMessage,
+      }))
+    ).flat()
   );
   await env.BENCH_QUEUE.sendBatch(messages);
   return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
@@ -203,36 +216,49 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
   if (message.kind === 'classifier') {
     // Create the OpenRouter client inside processJob — no module-scope transport clients.
     const client = await createOpenRouterClient(env);
-    await runCasesWithConcurrency(CLASSIFIER_CASES, state.maxConcurrency, async benchCase => {
-      const startedAt = performance.now();
-      try {
-        const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
-        const score = result.fallback
-          ? 0
-          : gradeClassifierOutput(benchCase.expected, result.classification);
-        await upsertCaseResult(env.BENCH_DB, {
-          run_id: message.runId,
-          model: message.model,
-          case_id: benchCase.id,
-          tier: null,
-          score,
-          latency_ms: Math.round(performance.now() - startedAt),
-          cost_usd: result.cost,
-          error: null,
-          fallback_reason: result.fallback?.reason ?? null,
-          retried: result.retried ?? false,
-          exit_code: null,
-          output_prefix: null,
-          event_count: null,
-          last_event_types: null,
-        });
-      } catch (error) {
-        await upsertCaseResult(
-          env.BENCH_DB,
-          failedRow(message, benchCase.id, null, startedAt, error)
-        );
+    // Expand cases × repetitions into a flat work list.
+    const expandedItems: { benchCase: (typeof CLASSIFIER_CASES)[number]; rep: number }[] = [];
+    for (let rep = 0; rep < state.repetitions; rep++) {
+      for (const benchCase of CLASSIFIER_CASES) {
+        expandedItems.push({ benchCase, rep });
       }
-    });
+    }
+    await runCasesWithConcurrency(
+      expandedItems,
+      state.maxConcurrency,
+      async ({ benchCase, rep }) => {
+        const startedAt = performance.now();
+        try {
+          const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
+          const score = result.fallback
+            ? 0
+            : gradeClassifierOutput(benchCase.expected, result.classification);
+          await upsertCaseResult(env.BENCH_DB, {
+            run_id: message.runId,
+            model: message.model,
+            case_id: benchCase.id,
+            tier: null,
+            score,
+            latency_ms: Math.round(performance.now() - startedAt),
+            cost_usd: result.cost,
+            error: null,
+            fallback_reason: result.fallback?.reason ?? null,
+            retried: result.retried ?? false,
+            exit_code: null,
+            output_prefix: null,
+            event_count: null,
+            last_event_types: null,
+            rep,
+            timed_out: 0,
+          });
+        } catch (error) {
+          await upsertCaseResult(
+            env.BENCH_DB,
+            failedRow(message, benchCase.id, null, startedAt, error, rep)
+          );
+        }
+      }
+    );
   } else {
     await processDeciderJob(env, message, state);
   }
@@ -246,6 +272,8 @@ type RunState = {
   switchCostFactor: number;
   benchmarkUserId: string | null;
   models: RunModelRow[];
+  repetitions: number;
+  classifierMaxP95LatencyMs: number | null;
 };
 
 async function getRunState(env: Env, runId: string): Promise<RunState> {
@@ -259,6 +287,8 @@ async function getRunState(env: Env, runId: string): Promise<RunState> {
     switchCostFactor: run.switch_cost_factor,
     benchmarkUserId: run.benchmark_user_id,
     models,
+    repetitions: run.repetitions,
+    classifierMaxP95LatencyMs: run.classifier_max_p95_latency_ms,
   };
 }
 
@@ -285,7 +315,8 @@ async function processDeciderJob(
   // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
   // queue retries the message. The token is never logged.
   const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
-  const instanceName = `${message.runId}:${message.model}:${message.chunk ?? 0}`;
+  const rep = message.rep ?? 0;
+  const instanceName = `${message.runId}:${message.model}:${rep}:${message.chunk ?? 0}`;
 
   // Reasoning effort comes from the run snapshot (run_models row), not live config.
   const modelRow = state.models.find(m => m.model === message.model);
@@ -348,11 +379,13 @@ async function processDeciderJob(
         output_prefix: result.text.slice(0, 200),
         event_count: result.eventCount,
         last_event_types: result.lastEventTypes.join(' '),
+        rep,
+        timed_out: result.timedOut ? 1 : 0,
       });
     } catch (error) {
       await upsertCaseResult(
         env.BENCH_DB,
-        failedRow(message, benchCase.id, benchCase.tier, startedAt, error)
+        failedRow(message, benchCase.id, benchCase.tier, startedAt, error, rep)
       );
     }
   });
@@ -391,7 +424,8 @@ function failedRow(
   caseId: string,
   tier: string | null,
   startedAt: number,
-  error: unknown
+  error: unknown,
+  rep: number = 0
 ): CaseResultRow {
   return {
     run_id: message.runId,
@@ -408,6 +442,8 @@ function failedRow(
     output_prefix: null,
     event_count: null,
     last_event_types: null,
+    rep,
+    timed_out: 0,
   };
 }
 
@@ -434,7 +470,7 @@ async function finalizeRunIfComplete(
 ): Promise<void> {
   const enqueuedModels = state.models.filter(m => m.enqueued);
   const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
-  const expected = enqueuedModels.length * caseCount;
+  const expected = enqueuedModels.length * caseCount * state.repetitions;
   const actual = await countCaseResults(env.BENCH_DB, runId);
 
   if (actual < expected) return;
@@ -453,7 +489,11 @@ async function finalizeRunIfComplete(
   const allSummaries = await getSummaries(env.BENCH_DB, runId);
 
   if (kind === 'classifier') {
-    const winner = pickClassifierWinner(allSummaries, state.minAccuracy);
+    const winner = pickClassifierWinner(
+      allSummaries,
+      state.minAccuracy,
+      state.classifierMaxP95LatencyMs
+    );
     if (winner) {
       console.log(
         JSON.stringify({ event: 'classifier_winner_published', runId, model: winner.model })
@@ -529,6 +569,11 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
     const [model, tier] = key.split('\0');
     const latencies = group.map(r => r.latency_ms).toSorted((a, b) => a - b);
     const costs = group.filter(r => r.cost_usd !== null);
+    const p95LatencyMs =
+      latencies.length > 0
+        ? (latencies[Math.min(latencies.length - 1, Math.ceil(0.95 * latencies.length) - 1)] ??
+          null)
+        : null;
     return {
       model,
       tier: tier as BenchmarkModelSummary['tier'],
@@ -538,8 +583,10 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
         : null,
       avgLatencyMs: Math.round(group.reduce((a, r) => a + r.latency_ms, 0) / group.length),
       p50LatencyMs: latencies[Math.floor(latencies.length / 2)] ?? null,
+      p95LatencyMs,
       cases: group.length,
       errors: group.filter(r => r.error !== null).length,
+      timeouts: group.filter(r => r.timed_out).length,
     };
   });
 }
diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts
index 1cb0abd7e4..318809c4a3 100644
--- a/services/auto-routing-benchmark/src/winner.ts
+++ b/services/auto-routing-benchmark/src/winner.ts
@@ -1,18 +1,38 @@
 import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
 
-// Same bang-for-buck rule as the routing table, applied to classifier
-// summaries (tier '*'): cheapest candidate meeting the accuracy threshold,
-// else the most accurate one. Null when there are no graded summaries.
+// Picks the best classifier candidate from summaries (tier '*') applying:
+//   1. Accuracy gate: must meet minAccuracy.
+//   2. Optional p95 latency gate: when maxP95LatencyMs is non-null, prefer
+//      candidates whose measured p95 latency is within budget.
+// Selection order:
+//   - Candidates meeting BOTH accuracy and latency → cheapest (tie: highest accuracy).
+//   - Candidates meeting accuracy only (latency gate not met) → lowest p95
+//     (tie: cheapest). This ensures the admin always sees a winner, even
+//     when all models are over budget.
+//   - No accuracy threshold met → most accurate (tie: cheapest).
+// Returns null when there are no graded summaries at all.
 export function pickClassifierWinner(
   summaries: BenchmarkModelSummary[],
-  minAccuracy: number
+  minAccuracy: number,
+  maxP95LatencyMs: number | null = null
 ): BenchmarkModelSummary | null {
   const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
   if (graded.length === 0) return null;
   const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
-  const meeting = graded.filter(s => s.accuracy >= minAccuracy);
-  if (meeting.length > 0) {
-    return meeting.toSorted((a, b) => cost(a) - cost(b) || b.accuracy - a.accuracy)[0];
+  const p95 = (s: BenchmarkModelSummary) => s.p95LatencyMs ?? Number.POSITIVE_INFINITY;
+
+  const meetingAccuracy = graded.filter(s => s.accuracy >= minAccuracy);
+  const meetingBoth =
+    maxP95LatencyMs !== null
+      ? meetingAccuracy.filter(s => s.p95LatencyMs !== null && s.p95LatencyMs <= maxP95LatencyMs)
+      : meetingAccuracy;
+
+  if (meetingBoth.length > 0) {
+    return meetingBoth.toSorted((a, b) => cost(a) - cost(b) || b.accuracy - a.accuracy)[0];
+  }
+  if (meetingAccuracy.length > 0) {
+    // Latency gate not met: pick lowest p95 (null p95 sorts last), tie-break cheapest.
+    return meetingAccuracy.toSorted((a, b) => p95(a) - p95(b) || cost(a) - cost(b))[0];
   }
   return graded.toSorted((a, b) => b.accuracy - a.accuracy || cost(a) - cost(b))[0];
 }

From 1eae06f27db6b977ace862d0b8f8ba9b74f97ada Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 02:01:03 +0200
Subject: [PATCH 68/73] fix(auto-routing): correct case_results migration
 backfill and close test gaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

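- Migration 0001: replace the "rep"/"timed_out" column refs in the
  INSERT...SELECT backfill with literal 0, 0. The old table lacks those
  columns, and D1 (SQLite) silently treats a double-quoted identifier that
  matches no column as a string literal, so the backfill would have written
  the strings 'rep'/'timed_out' into NOT NULL integer columns.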
- Contracts: add BenchmarkConfigSchema defaults test (classifierRepetitions=1,
  deciderRepetitions=1, classifierMaxP95LatencyMs=1000 when omitted).
- Benchmark: extract buildDeciderMessages() pure function; add fan-out test
  asserting models × reps × ceil(76/5) messages each carrying the correct rep.
---
 .../src/contracts.test.ts                     | 20 +++++++++
 .../migrations/0001_stormy_tarot.sql          |  2 +-
 .../auto-routing-benchmark/src/run.test.ts    | 43 ++++++++++++++++++-
 services/auto-routing-benchmark/src/run.ts    | 41 ++++++++++++------
 4 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
index 7c0efc2799..e9cd6bedb0 100644
--- a/packages/auto-routing-contracts/src/contracts.test.ts
+++ b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -6,6 +6,7 @@ import {
   MirrorPayloadSchema,
   UpdateClassifierModelRequestSchema,
 } from './index';
+import { BenchmarkConfigSchema } from './benchmark';
 
 describe('auto routing contracts', () => {
   it('validates the cross-service request and response contracts', () => {
@@ -128,3 +129,22 @@ describe('auto routing contracts', () => {
     ).toMatchObject({ period: '24h' });
   });
 });
+
+describe('BenchmarkConfigSchema defaults', () => {
+  it('applies defaults of 1/1/1000 for classifierRepetitions, deciderRepetitions, classifierMaxP95LatencyMs', () => {
+    const result = BenchmarkConfigSchema.parse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 4,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+      // classifierRepetitions, deciderRepetitions, classifierMaxP95LatencyMs intentionally omitted
+    });
+    expect(result.classifierRepetitions).toBe(1);
+    expect(result.deciderRepetitions).toBe(1);
+    expect(result.classifierMaxP95LatencyMs).toBe(1000);
+  });
+});
diff --git a/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql b/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
index 7117c47ed3..ab4c317e70 100644
--- a/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
+++ b/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
@@ -19,7 +19,7 @@ CREATE TABLE `__new_case_results` (
 	PRIMARY KEY(`run_id`, `model`, `case_id`, `rep`)
 );
 --> statement-breakpoint
-INSERT INTO `__new_case_results`("run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", "rep", "timed_out") SELECT "run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", "rep", "timed_out" FROM `case_results`;--> statement-breakpoint
+INSERT INTO `__new_case_results`("run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", "rep", "timed_out") SELECT "run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", 0, 0 FROM `case_results`;--> statement-breakpoint
 DROP TABLE `case_results`;--> statement-breakpoint
 ALTER TABLE `__new_case_results` RENAME TO `case_results`;--> statement-breakpoint
 PRAGMA foreign_keys=ON;--> statement-breakpoint
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 707b7330bc..14ce3de784 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -1,6 +1,12 @@
 import { describe, expect, it } from 'vitest';
 import type { CaseResultRow } from './db';
-import { BenchmarkJobMessageSchema, chunkArray, runCasesWithConcurrency, summarize } from './run';
+import {
+  BenchmarkJobMessageSchema,
+  buildDeciderMessages,
+  chunkArray,
+  runCasesWithConcurrency,
+  summarize,
+} from './run';
 import { pickClassifierWinner } from './winner';
 
 function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
@@ -411,4 +417,39 @@ describe('decider message fan-out', () => {
     });
     expect(withRep.rep).toBe(2);
   });
+
+  it('buildDeciderMessages: produces models × reps × ceil(76/5) messages with correct rep', () => {
+    // 76 cases, chunk size 5 → 16 chunks
+    const cases76 = Array.from({ length: 76 }, (_, i) => ({ id: `case-${i}` }));
+    const chunks = chunkArray(cases76, 5);
+    expect(chunks).toHaveLength(16);
+
+    const models = ['model/a', 'model/b'];
+    const repetitions = 3;
+    const messages = buildDeciderMessages('run-test', 'decider', models, repetitions, chunks);
+
+    // Total: 2 models × 3 reps × 16 chunks = 96 messages
+    expect(messages).toHaveLength(models.length * repetitions * chunks.length);
+
+    // Each rep index (0..2) should appear exactly models.length × chunks.length times
+    for (let rep = 0; rep < repetitions; rep++) {
+      const forRep = messages.filter(m => m.body.rep === rep);
+      expect(forRep).toHaveLength(models.length * chunks.length);
+    }
+
+    // Every message carries the correct rep in its body
+    for (const { body } of messages) {
+      expect(typeof body.rep).toBe('number');
+      expect(body.rep).toBeGreaterThanOrEqual(0);
+      expect(body.rep).toBeLessThan(repetitions);
+    }
+
+    // caseIds on each message match the chunk
+    for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
+      const forChunk = messages.filter(m => m.body.chunk === chunkIdx);
+      for (const { body } of forChunk) {
+        expect(body.caseIds).toEqual(chunks[chunkIdx].map(c => c.id));
+      }
+    }
+  });
 });
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 2a71dde05a..c48c10a221 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -68,6 +68,32 @@ export function chunkArray<T>(items: readonly T[], size: number): T[][] {
 
 const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
 
+/** Pure helper: produces the sendBatch bodies for a decider run fan-out.
+ * Extracted for unit-testability; the shape is models × reps × chunks messages.
+ */
+export function buildDeciderMessages(
+  runId: string,
+  kind: BenchmarkKind,
+  modelIds: string[],
+  repetitions: number,
+  chunks: readonly (readonly { id: string }[])[]
+): { body: BenchmarkJobMessage }[] {
+  return modelIds.flatMap(model =>
+    Array.from({ length: repetitions }, (_, rep) =>
+      chunks.map((chunkCases, chunk) => ({
+        body: {
+          runId,
+          kind,
+          model,
+          chunk,
+          rep,
+          caseIds: chunkCases.map(c => c.id),
+        } satisfies BenchmarkJobMessage,
+      }))
+    ).flat()
+  );
+}
+
 export async function startRun(
   env: Env,
   kind: BenchmarkKind,
@@ -177,20 +203,7 @@ export async function startRun(
   // Decider: one message per (model, rep, chunk) so each queue invocation stays
   // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows.
   const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
-  const messages = enqueuedModelIds.flatMap(model =>
-    Array.from({ length: repetitions }, (_, rep) =>
-      chunks.map((chunkCases, chunk) => ({
-        body: {
-          runId,
-          kind,
-          model,
-          chunk,
-          rep,
-          caseIds: chunkCases.map(c => c.id),
-        } satisfies BenchmarkJobMessage,
-      }))
-    ).flat()
-  );
+  const messages = buildDeciderMessages(runId, kind, enqueuedModelIds, repetitions, chunks);
   await env.BENCH_QUEUE.sendBatch(messages);
   return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
 }

From 715125635e1ef45e6dbebfd9516077d07ef789c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 02:09:11 +0200
Subject: [PATCH 69/73] feat(admin): benchmark repetitions, latency budget, and
 p95/timeout columns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add classifier/decider repetitions (1–5) and classifierMaxP95LatencyMs
inputs to the Benchmark Config card; add p95 latency and Timeouts
columns to the run summaries table; update test fixtures with new fields.
---
 .../benchmark-config/route.test.ts            |  3 +
 .../admin/auto-routing/BenchmarksSection.tsx  | 87 ++++++++++++++++++-
 ...uto-routing-benchmark-admin-client.test.ts |  3 +
 3 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
index 0117f52da2..49a7ffdf79 100644
--- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
@@ -42,6 +42,9 @@ const validConfig = {
   switchCostFactor: 3,
   maxConcurrency: 4,
   benchmarkUserId: null,
+  classifierRepetitions: 1,
+  deciderRepetitions: 1,
+  classifierMaxP95LatencyMs: 1000,
   updatedAt: null,
   updatedBy: null,
 };
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 27229964a8..14ab5d6e52 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -114,6 +114,9 @@ function configToFormState(config: BenchmarkConfig | null): {
   switchCostFactor: number;
   maxConcurrency: number;
   benchmarkUserId: string;
+  classifierRepetitions: number;
+  deciderRepetitions: number;
+  classifierMaxP95LatencyMs: string;
 } {
   if (config === null) {
     // No config saved yet: the worker fabricates nothing, so the form starts
@@ -125,6 +128,9 @@ function configToFormState(config: BenchmarkConfig | null): {
       switchCostFactor: 3,
       maxConcurrency: 4,
       benchmarkUserId: '',
+      classifierRepetitions: 1,
+      deciderRepetitions: 1,
+      classifierMaxP95LatencyMs: '1000',
     };
   }
   return {
@@ -137,6 +143,10 @@ function configToFormState(config: BenchmarkConfig | null): {
     switchCostFactor: config.switchCostFactor,
     maxConcurrency: config.maxConcurrency,
     benchmarkUserId: config.benchmarkUserId ?? '',
+    classifierRepetitions: config.classifierRepetitions,
+    deciderRepetitions: config.deciderRepetitions,
+    classifierMaxP95LatencyMs:
+      config.classifierMaxP95LatencyMs !== null ? String(config.classifierMaxP95LatencyMs) : '',
   };
 }
 
@@ -155,6 +165,8 @@ function formStateToConfig(
       reasoningEffort: row.reasoningEffort ?? null,
     }));
   const benchmarkUserId = state.benchmarkUserId.trim();
+  const rawLatency = state.classifierMaxP95LatencyMs.trim();
+  const classifierMaxP95LatencyMs = rawLatency.length > 0 ? parseInt(rawLatency, 10) || null : null;
   return {
     classifierModels,
     deciderModels,
@@ -162,6 +174,9 @@ function formStateToConfig(
     switchCostFactor: state.switchCostFactor,
     maxConcurrency: state.maxConcurrency,
     benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null,
+    classifierRepetitions: state.classifierRepetitions,
+    deciderRepetitions: state.deciderRepetitions,
+    classifierMaxP95LatencyMs,
     updatedAt: base?.updatedAt ?? null,
     updatedBy: base?.updatedBy ?? null,
   };
@@ -372,6 +387,46 @@ function BenchmarkConfigEditor({
               className="h-8 w-40 tabular-nums"
             />
           </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-classifier-repetitions" className="text-sm font-medium">
+              Classifier repetitions (1–5)
+            </Label>
+            <Input
+              id="benchmark-classifier-repetitions"
+              type="number"
+              min={1}
+              max={5}
+              step={1}
+              value={form.classifierRepetitions}
+              onChange={e =>
+                setForm(prev => ({
+                  ...prev,
+                  classifierRepetitions: parseInt(e.target.value, 10) || 1,
+                }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-decider-repetitions" className="text-sm font-medium">
+              Decider repetitions (1–5)
+            </Label>
+            <Input
+              id="benchmark-decider-repetitions"
+              type="number"
+              min={1}
+              max={5}
+              step={1}
+              value={form.deciderRepetitions}
+              onChange={e =>
+                setForm(prev => ({
+                  ...prev,
+                  deciderRepetitions: parseInt(e.target.value, 10) || 1,
+                }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
         </div>
 
         {/* Benchmark user id */}
@@ -391,6 +446,28 @@ function BenchmarkConfigEditor({
           </p>
         </div>
 
+        {/* Classifier max p95 latency */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-classifier-max-p95-latency" className="text-sm font-medium">
+            Classifier max p95 latency (ms)
+          </Label>
+          <Input
+            id="benchmark-classifier-max-p95-latency"
+            type="number"
+            min={1}
+            step={1}
+            value={form.classifierMaxP95LatencyMs}
+            onChange={e =>
+              setForm(prev => ({ ...prev, classifierMaxP95LatencyMs: e.target.value }))
+            }
+            className="h-8 w-40 tabular-nums"
+            placeholder="(no limit)"
+          />
+          <p className="text-muted-foreground text-xs">
+            Winner must classify under this p95 latency; empty disables the latency gate.
+          </p>
+        </div>
+
         {/* Actions + metadata */}
         <div className="flex flex-col gap-2">
           <div className="flex flex-wrap gap-2">
@@ -437,7 +514,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   if (sortedSummaries.length === 0) {
     return (
       <TableRow>
-        <TableCell colSpan={6} className="text-muted-foreground h-10 text-center text-xs">
+        <TableCell colSpan={8} className="text-muted-foreground h-10 text-center text-xs">
           No summaries
         </TableCell>
       </TableRow>
@@ -447,7 +524,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   return (
     <>
       <TableRow className="bg-muted/30">
-        <TableCell colSpan={6} className="px-4 py-2">
+        <TableCell colSpan={8} className="px-4 py-2">
           <Table>
             <TableHeader>
               <TableRow>
@@ -457,8 +534,10 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
                 <TableHead className="text-right text-xs">Avg cost</TableHead>
                 <TableHead className="text-right text-xs">Avg latency</TableHead>
                 <TableHead className="text-right text-xs">p50 latency</TableHead>
+                <TableHead className="text-right text-xs">p95 latency</TableHead>
                 <TableHead className="text-right text-xs">Cases</TableHead>
                 <TableHead className="text-right text-xs">Errors</TableHead>
+                <TableHead className="text-right text-xs">Timeouts</TableHead>
               </TableRow>
             </TableHeader>
             <TableBody>
@@ -480,8 +559,12 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
                   <TableCell className="text-right tabular-nums text-xs">
                     {s.p50LatencyMs !== null ? `${s.p50LatencyMs.toFixed(0)} ms` : '—'}
                   </TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">
+                    {s.p95LatencyMs !== null ? `${s.p95LatencyMs.toFixed(0)} ms` : '—'}
+                  </TableCell>
                   <TableCell className="text-right tabular-nums text-xs">{s.cases}</TableCell>
                   <TableCell className="text-right tabular-nums text-xs">{s.errors}</TableCell>
+                  <TableCell className="text-right tabular-nums text-xs">{s.timeouts}</TableCell>
                 </TableRow>
               ))}
             </TableBody>
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
index 275c92da1a..d8f209d8df 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -22,6 +22,9 @@ const configResponse = {
     switchCostFactor: 3,
     maxConcurrency: 4,
     benchmarkUserId: null,
+    classifierRepetitions: 1,
+    deciderRepetitions: 1,
+    classifierMaxP95LatencyMs: 1000,
     updatedAt: null,
     updatedBy: null,
   },

From 17a8c01d32aa4483563da20009835f5bed3c25d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 02:15:17 +0200
Subject: [PATCH 70/73] fix(admin): correct runs-table colSpan and cover config
 form round-trip

Set both RunSummariesTable colSpan values back to 6 to match the outer
BenchmarkRunsTable's 6-column header (chevron, Kind, Status, Started,
Completed, Error). Export configToFormState and formStateToConfig for
unit testing and add focused tests covering null-config defaults,
round-trip preservation of repetitions/latency fields, and empty-string
classifierMaxP95LatencyMs coercing to null.
---
 .../auto-routing/BenchmarksSection.test.ts    | 53 ++++++++++++++++++-
 .../admin/auto-routing/BenchmarksSection.tsx  |  8 +--
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
index 768545f81f..11a8a6a0e3 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -1,5 +1,10 @@
 import { describe, expect, it } from '@jest/globals';
-import { formatAccuracy, formatUsd } from './BenchmarksSection';
+import {
+  configToFormState,
+  formatAccuracy,
+  formatUsd,
+  formStateToConfig,
+} from './BenchmarksSection';
 
 describe('formatAccuracy', () => {
   it('formats 0.8542 as 85.4%', () => {
@@ -49,3 +54,49 @@ describe('formatUsd', () => {
     expect(formatUsd(0.000001)).toBe('$0.000001');
   });
 });
+
+describe('configToFormState', () => {
+  it('yields defaults including classifierMaxP95LatencyMs "1000" when config is null', () => {
+    const state = configToFormState(null);
+    expect(state.classifierRepetitions).toBe(1);
+    expect(state.deciderRepetitions).toBe(1);
+    expect(state.classifierMaxP95LatencyMs).toBe('1000');
+    expect(state.classifierModels).toBe('');
+    expect(state.deciderModels).toEqual([]);
+  });
+});
+
+describe('formStateToConfig round-trip', () => {
+  const baseConfig = {
+    classifierModels: ['model-a', 'model-b'],
+    deciderModels: [{ id: 'model-c', reasoningEffort: null }],
+    minAccuracy: 0.8,
+    switchCostFactor: 3,
+    maxConcurrency: 4,
+    benchmarkUserId: 'user-123',
+    classifierRepetitions: 3,
+    deciderRepetitions: 2,
+    classifierMaxP95LatencyMs: 500,
+    updatedAt: null,
+    updatedBy: null,
+  };
+
+  it('preserves classifierRepetitions, deciderRepetitions, and classifierMaxP95LatencyMs', () => {
+    const state = configToFormState(baseConfig);
+    expect(state.classifierRepetitions).toBe(3);
+    expect(state.deciderRepetitions).toBe(2);
+    expect(state.classifierMaxP95LatencyMs).toBe('500');
+
+    const result = formStateToConfig(state, baseConfig);
+    expect(result.classifierRepetitions).toBe(3);
+    expect(result.deciderRepetitions).toBe(2);
+    expect(result.classifierMaxP95LatencyMs).toBe(500);
+  });
+
+  it('converts empty-string classifierMaxP95LatencyMs form value to null in config', () => {
+    const state = configToFormState(baseConfig);
+    const stateWithEmpty = { ...state, classifierMaxP95LatencyMs: '' };
+    const result = formStateToConfig(stateWithEmpty, baseConfig);
+    expect(result.classifierMaxP95LatencyMs).toBeNull();
+  });
+});
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 14ab5d6e52..088b20f7a5 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -107,7 +107,7 @@ type DeciderModelRow = {
   reasoningEffort: ReasoningEffort | null;
 };
 
-function configToFormState(config: BenchmarkConfig | null): {
+export function configToFormState(config: BenchmarkConfig | null): {
   classifierModels: string;
   deciderModels: DeciderModelRow[];
   minAccuracy: number;
@@ -150,7 +150,7 @@ function configToFormState(config: BenchmarkConfig | null): {
   };
 }
 
-function formStateToConfig(
+export function formStateToConfig(
   state: ReturnType<typeof configToFormState>,
   base: BenchmarkConfig | null
 ): BenchmarkConfig {
@@ -514,7 +514,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   if (sortedSummaries.length === 0) {
     return (
       <TableRow>
-        <TableCell colSpan={8} className="text-muted-foreground h-10 text-center text-xs">
+        <TableCell colSpan={6} className="text-muted-foreground h-10 text-center text-xs">
           No summaries
         </TableCell>
       </TableRow>
@@ -524,7 +524,7 @@ function RunSummariesTable({ run }: { run: BenchmarkRun }) {
   return (
     <>
       <TableRow className="bg-muted/30">
-        <TableCell colSpan={8} className="px-4 py-2">
+        <TableCell colSpan={6} className="px-4 py-2">
           <Table>
             <TableHeader>
               <TableRow>

From 1a5d858d64dacf7174712d6d9b72554754e6c95b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 02:16:23 +0200
Subject: [PATCH 71/73] chore(auto-routing): squash benchmark D1 migrations
 into one baseline

---
 ..._shard.sql => 0000_dashing_lightspeed.sql} |  13 +-
 .../migrations/0001_stormy_tarot.sql          |  32 -
 .../migrations/meta/0000_snapshot.json        |  78 ++-
 .../migrations/meta/0001_snapshot.json        | 648 ------------------
 .../migrations/meta/_journal.json             |  11 +-
 5 files changed, 87 insertions(+), 695 deletions(-)
 rename services/auto-routing-benchmark/migrations/{0000_amused_shard.sql => 0000_dashing_lightspeed.sql} (83%)
 delete mode 100644 services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
 delete mode 100644 services/auto-routing-benchmark/migrations/meta/0001_snapshot.json

diff --git a/services/auto-routing-benchmark/migrations/0000_amused_shard.sql b/services/auto-routing-benchmark/migrations/0000_dashing_lightspeed.sql
similarity index 83%
rename from services/auto-routing-benchmark/migrations/0000_amused_shard.sql
rename to services/auto-routing-benchmark/migrations/0000_dashing_lightspeed.sql
index 4889e7a415..156f459501 100644
--- a/services/auto-routing-benchmark/migrations/0000_amused_shard.sql
+++ b/services/auto-routing-benchmark/migrations/0000_dashing_lightspeed.sql
@@ -4,6 +4,9 @@ CREATE TABLE `benchmark_config` (
 	`switch_cost_factor` real NOT NULL,
 	`max_concurrency` integer NOT NULL,
 	`benchmark_user_id` text,
+	`classifier_repetitions` integer DEFAULT 1 NOT NULL,
+	`decider_repetitions` integer DEFAULT 1 NOT NULL,
+	`classifier_max_p95_latency_ms` integer,
 	`updated_at` text NOT NULL,
 	`updated_by` text
 );
@@ -18,7 +21,9 @@ CREATE TABLE `benchmark_runs` (
 	`min_accuracy` real NOT NULL,
 	`switch_cost_factor` real NOT NULL,
 	`max_concurrency` integer NOT NULL,
-	`benchmark_user_id` text
+	`benchmark_user_id` text,
+	`repetitions` integer DEFAULT 1 NOT NULL,
+	`classifier_max_p95_latency_ms` integer
 );
 --> statement-breakpoint
 CREATE TABLE `case_results` (
@@ -36,7 +41,9 @@ CREATE TABLE `case_results` (
 	`output_prefix` text,
 	`event_count` integer,
 	`last_event_types` text,
-	PRIMARY KEY(`run_id`, `model`, `case_id`)
+	`rep` integer DEFAULT 0 NOT NULL,
+	`timed_out` integer DEFAULT 0 NOT NULL,
+	PRIMARY KEY(`run_id`, `model`, `case_id`, `rep`)
 );
 --> statement-breakpoint
 CREATE TABLE `config_classifier_models` (
@@ -58,6 +65,8 @@ CREATE TABLE `model_summaries` (
 	`p50_latency_ms` real,
 	`cases` integer NOT NULL,
 	`errors` integer NOT NULL,
+	`p95_latency_ms` real,
+	`timeouts` integer DEFAULT 0 NOT NULL,
 	`carried` integer DEFAULT false NOT NULL,
 	PRIMARY KEY(`run_id`, `model`, `tier`)
 );
diff --git a/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql b/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
deleted file mode 100644
index ab4c317e70..0000000000
--- a/services/auto-routing-benchmark/migrations/0001_stormy_tarot.sql
+++ /dev/null
@@ -1,32 +0,0 @@
-PRAGMA foreign_keys=OFF;--> statement-breakpoint
-CREATE TABLE `__new_case_results` (
-	`run_id` text NOT NULL,
-	`model` text NOT NULL,
-	`case_id` text NOT NULL,
-	`tier` text,
-	`score` real NOT NULL,
-	`latency_ms` integer NOT NULL,
-	`cost_usd` real,
-	`error` text,
-	`fallback_reason` text,
-	`retried` integer,
-	`exit_code` integer,
-	`output_prefix` text,
-	`event_count` integer,
-	`last_event_types` text,
-	`rep` integer DEFAULT 0 NOT NULL,
-	`timed_out` integer DEFAULT 0 NOT NULL,
-	PRIMARY KEY(`run_id`, `model`, `case_id`, `rep`)
-);
---> statement-breakpoint
-INSERT INTO `__new_case_results`("run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", "rep", "timed_out") SELECT "run_id", "model", "case_id", "tier", "score", "latency_ms", "cost_usd", "error", "fallback_reason", "retried", "exit_code", "output_prefix", "event_count", "last_event_types", 0, 0 FROM `case_results`;--> statement-breakpoint
-DROP TABLE `case_results`;--> statement-breakpoint
-ALTER TABLE `__new_case_results` RENAME TO `case_results`;--> statement-breakpoint
-PRAGMA foreign_keys=ON;--> statement-breakpoint
-ALTER TABLE `benchmark_config` ADD `classifier_repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
-ALTER TABLE `benchmark_config` ADD `decider_repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
-ALTER TABLE `benchmark_config` ADD `classifier_max_p95_latency_ms` integer;--> statement-breakpoint
-ALTER TABLE `benchmark_runs` ADD `repetitions` integer DEFAULT 1 NOT NULL;--> statement-breakpoint
-ALTER TABLE `benchmark_runs` ADD `classifier_max_p95_latency_ms` integer;--> statement-breakpoint
-ALTER TABLE `model_summaries` ADD `p95_latency_ms` real;--> statement-breakpoint
-ALTER TABLE `model_summaries` ADD `timeouts` integer DEFAULT 0 NOT NULL;
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index 53a026135e..0615f25069 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "20295052-406c-424a-956f-77acc985f44a",
+  "id": "d5601eb8-dce7-4679-bd48-f73b4f88e2d3",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -42,6 +42,29 @@
           "notNull": false,
           "autoincrement": false
         },
+        "classifier_repetitions": {
+          "name": "classifier_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "decider_repetitions": {
+          "name": "decider_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
         "updated_at": {
           "name": "updated_at",
           "type": "text",
@@ -135,6 +158,21 @@
           "primaryKey": false,
           "notNull": false,
           "autoincrement": false
+        },
+        "repetitions": {
+          "name": "repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
         }
       },
       "indexes": {},
@@ -243,18 +281,35 @@
           "primaryKey": false,
           "notNull": false,
           "autoincrement": false
+        },
+        "rep": {
+          "name": "rep",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
+        "timed_out": {
+          "name": "timed_out",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
         }
       },
       "indexes": {},
       "foreignKeys": {},
       "compositePrimaryKeys": {
-        "case_results_run_id_model_case_id_pk": {
+        "case_results_run_id_model_case_id_rep_pk": {
           "columns": [
             "run_id",
             "model",
-            "case_id"
+            "case_id",
+            "rep"
           ],
-          "name": "case_results_run_id_model_case_id_pk"
+          "name": "case_results_run_id_model_case_id_rep_pk"
         }
       },
       "uniqueConstraints": {},
@@ -367,6 +422,21 @@
           "notNull": true,
           "autoincrement": false
         },
+        "p95_latency_ms": {
+          "name": "p95_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "timeouts": {
+          "name": "timeouts",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
         "carried": {
           "name": "carried",
           "type": "integer",
diff --git a/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json
deleted file mode 100644
index bfda46aad3..0000000000
--- a/services/auto-routing-benchmark/migrations/meta/0001_snapshot.json
+++ /dev/null
@@ -1,648 +0,0 @@
-{
-  "version": "6",
-  "dialect": "sqlite",
-  "id": "4a066f6c-0cac-485c-9489-fa4e0728622d",
-  "prevId": "20295052-406c-424a-956f-77acc985f44a",
-  "tables": {
-    "benchmark_config": {
-      "name": "benchmark_config",
-      "columns": {
-        "id": {
-          "name": "id",
-          "type": "integer",
-          "primaryKey": true,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "min_accuracy": {
-          "name": "min_accuracy",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "switch_cost_factor": {
-          "name": "switch_cost_factor",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "max_concurrency": {
-          "name": "max_concurrency",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "benchmark_user_id": {
-          "name": "benchmark_user_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "classifier_repetitions": {
-          "name": "classifier_repetitions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 1
-        },
-        "decider_repetitions": {
-          "name": "decider_repetitions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 1
-        },
-        "classifier_max_p95_latency_ms": {
-          "name": "classifier_max_p95_latency_ms",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "updated_at": {
-          "name": "updated_at",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "updated_by": {
-          "name": "updated_by",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {},
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "benchmark_runs": {
-      "name": "benchmark_runs",
-      "columns": {
-        "id": {
-          "name": "id",
-          "type": "text",
-          "primaryKey": true,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "kind": {
-          "name": "kind",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "status": {
-          "name": "status",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "started_at": {
-          "name": "started_at",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "completed_at": {
-          "name": "completed_at",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "error": {
-          "name": "error",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "min_accuracy": {
-          "name": "min_accuracy",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "switch_cost_factor": {
-          "name": "switch_cost_factor",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "max_concurrency": {
-          "name": "max_concurrency",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "benchmark_user_id": {
-          "name": "benchmark_user_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "repetitions": {
-          "name": "repetitions",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 1
-        },
-        "classifier_max_p95_latency_ms": {
-          "name": "classifier_max_p95_latency_ms",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {},
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "case_results": {
-      "name": "case_results",
-      "columns": {
-        "run_id": {
-          "name": "run_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "case_id": {
-          "name": "case_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "tier": {
-          "name": "tier",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "score": {
-          "name": "score",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "latency_ms": {
-          "name": "latency_ms",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "cost_usd": {
-          "name": "cost_usd",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "error": {
-          "name": "error",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "fallback_reason": {
-          "name": "fallback_reason",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "retried": {
-          "name": "retried",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "exit_code": {
-          "name": "exit_code",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "output_prefix": {
-          "name": "output_prefix",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "event_count": {
-          "name": "event_count",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "last_event_types": {
-          "name": "last_event_types",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "rep": {
-          "name": "rep",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 0
-        },
-        "timed_out": {
-          "name": "timed_out",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 0
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {
-        "case_results_run_id_model_case_id_rep_pk": {
-          "columns": [
-            "run_id",
-            "model",
-            "case_id",
-            "rep"
-          ],
-          "name": "case_results_run_id_model_case_id_rep_pk"
-        }
-      },
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "config_classifier_models": {
-      "name": "config_classifier_models",
-      "columns": {
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": true,
-          "notNull": true,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {},
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "config_decider_models": {
-      "name": "config_decider_models",
-      "columns": {
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": true,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "reasoning_effort": {
-          "name": "reasoning_effort",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {},
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "model_summaries": {
-      "name": "model_summaries",
-      "columns": {
-        "run_id": {
-          "name": "run_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "tier": {
-          "name": "tier",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "accuracy": {
-          "name": "accuracy",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "avg_cost_usd": {
-          "name": "avg_cost_usd",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "avg_latency_ms": {
-          "name": "avg_latency_ms",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "p50_latency_ms": {
-          "name": "p50_latency_ms",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "cases": {
-          "name": "cases",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "errors": {
-          "name": "errors",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "p95_latency_ms": {
-          "name": "p95_latency_ms",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        },
-        "timeouts": {
-          "name": "timeouts",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": 0
-        },
-        "carried": {
-          "name": "carried",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false,
-          "default": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {
-        "model_summaries_run_id_model_tier_pk": {
-          "columns": [
-            "run_id",
-            "model",
-            "tier"
-          ],
-          "name": "model_summaries_run_id_model_tier_pk"
-        }
-      },
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "routing_table_candidates": {
-      "name": "routing_table_candidates",
-      "columns": {
-        "run_id": {
-          "name": "run_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "tier": {
-          "name": "tier",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "rank": {
-          "name": "rank",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "accuracy": {
-          "name": "accuracy",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "avg_cost_usd": {
-          "name": "avg_cost_usd",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "meets_threshold": {
-          "name": "meets_threshold",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "reasoning_effort": {
-          "name": "reasoning_effort",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {
-        "routing_table_candidates_run_id_tier_rank_pk": {
-          "columns": [
-            "run_id",
-            "tier",
-            "rank"
-          ],
-          "name": "routing_table_candidates_run_id_tier_rank_pk"
-        }
-      },
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "routing_tables": {
-      "name": "routing_tables",
-      "columns": {
-        "run_id": {
-          "name": "run_id",
-          "type": "text",
-          "primaryKey": true,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "published_at": {
-          "name": "published_at",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "generated_at": {
-          "name": "generated_at",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "min_accuracy": {
-          "name": "min_accuracy",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "switch_cost_factor": {
-          "name": "switch_cost_factor",
-          "type": "real",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "source": {
-          "name": "source",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {},
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    },
-    "run_models": {
-      "name": "run_models",
-      "columns": {
-        "run_id": {
-          "name": "run_id",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "model": {
-          "name": "model",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "enqueued": {
-          "name": "enqueued",
-          "type": "integer",
-          "primaryKey": false,
-          "notNull": true,
-          "autoincrement": false
-        },
-        "reasoning_effort": {
-          "name": "reasoning_effort",
-          "type": "text",
-          "primaryKey": false,
-          "notNull": false,
-          "autoincrement": false
-        }
-      },
-      "indexes": {},
-      "foreignKeys": {},
-      "compositePrimaryKeys": {
-        "run_models_run_id_model_pk": {
-          "columns": [
-            "run_id",
-            "model"
-          ],
-          "name": "run_models_run_id_model_pk"
-        }
-      },
-      "uniqueConstraints": {},
-      "checkConstraints": {}
-    }
-  },
-  "views": {},
-  "enums": {},
-  "_meta": {
-    "schemas": {},
-    "tables": {},
-    "columns": {}
-  },
-  "internal": {
-    "indexes": {}
-  }
-}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index 7ec4e5d40d..8e9697a3dd 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,15 +5,8 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781283444549,
-      "tag": "0000_amused_shard",
-      "breakpoints": true
-    },
-    {
-      "idx": 1,
-      "version": "6",
-      "when": 1781307943215,
-      "tag": "0001_stormy_tarot",
+      "when": 1781309761949,
+      "tag": "0000_dashing_lightspeed",
       "breakpoints": true
     }
   ]

From 9eaae607d2c524e4f6d417761e41b050220634f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Sat, 13 Jun 2026 02:25:56 +0200
Subject: [PATCH 72/73] test(ai-gateway): stop depending on removed morph model
 in API-kind tests

PR #4004, now merged into main, deleted the morph provider. The two test
files that exercised the rejection branch of modelServesAllGatewayChatApis
used morph as the only available Kilo-exclusive model on a
chat_completions-only gateway. With morph gone, no real catalog entry
satisfies that condition.

Both test files now stub findKiloExclusiveModel via jest.mock/requireActual
so that the marker id 'test-exclusive/alibaba-only' resolves to a
KiloExclusiveModel with gateway: 'alibaba'. The real PROVIDERS.ALIBABA
definition supports only chat_completions, so the rejection path is
exercised without relying on any specific provider file being present in
the catalog.
---
 .../benchmark-config/route.test.ts            | 32 +++++++++++++++--
 .../lib/ai-gateway/model-api-kinds.test.ts    | 35 ++++++++++++++++---
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
index 49a7ffdf79..92f6159c0f 100644
--- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
@@ -5,7 +5,8 @@ import {
   updateBenchmarkConfig,
 } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
 import { getUserFromAuth } from '@/lib/user/server';
-import { morph_warp_grep_free_model } from '@/lib/ai-gateway/providers/morph';
+import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
+import type * as ModelsModule from '@/lib/ai-gateway/models';
 
 jest.mock('@/lib/user/server', () => ({
   getUserFromAuth: jest.fn(),
@@ -16,6 +17,31 @@ jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
   updateBenchmarkConfig: jest.fn(),
 }));
 
+// Stub the catalog so tests don't depend on any specific provider file.
+// 'test-exclusive/alibaba-only' maps to the alibaba gateway (chat_completions only).
+jest.mock('@/lib/ai-gateway/models', () => {
+  const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
+  const stubModel: KiloExclusiveModel = {
+    public_id: 'test-exclusive/alibaba-only',
+    display_name: 'Test Alibaba-only',
+    description: 'stub for unit tests',
+    context_length: 8192,
+    max_completion_tokens: 4096,
+    status: 'public',
+    flags: [],
+    gateway: 'alibaba',
+    internal_id: 'stub-internal',
+    pricing: null,
+    exclusive_to: [],
+    inference_provider_restriction: [],
+  };
+  return {
+    ...actual,
+    findKiloExclusiveModel: (id: string) =>
+      id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
+  };
+});
+
 import { PUT } from './route';
 
 const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
@@ -75,14 +101,14 @@ describe('PUT /admin/api/auto-routing/benchmark-config', () => {
         ...validConfig,
         deciderModels: [
           { id: 'openai/gpt-5-mini', reasoningEffort: null },
-          { id: morph_warp_grep_free_model.public_id, reasoningEffort: null },
+          { id: 'test-exclusive/alibaba-only', reasoningEffort: null },
         ],
       })
     );
 
     expect(response.status).toBe(400);
     const body = (await response.json()) as { error: string };
-    expect(body.error).toContain(morph_warp_grep_free_model.public_id);
+    expect(body.error).toContain('test-exclusive/alibaba-only');
     expect(body.error).toContain('chat_completions');
     expect(body.error).not.toContain('openai/gpt-5-mini (');
     expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
index 4ec802e1dc..e2e8eaa01c 100644
--- a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
@@ -1,7 +1,34 @@
 import { describe, expect, it } from '@jest/globals';
 import { gatewayChatApisForModel, modelServesAllGatewayChatApis } from './model-api-kinds';
-import { morph_warp_grep_free_model } from '@/lib/ai-gateway/providers/morph';
 import { seed_20_code_free_model } from '@/lib/ai-gateway/providers/seed';
+import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
+import type * as ModelsModule from '@/lib/ai-gateway/models';
+
+// Stub the catalog so the rejection test doesn't depend on any specific provider file.
+// 'test-exclusive/alibaba-only' resolves to a KiloExclusiveModel on the alibaba gateway,
+// which only supports chat_completions, exercising the rejection branch.
+jest.mock('@/lib/ai-gateway/models', () => {
+  const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
+  const stubModel: KiloExclusiveModel = {
+    public_id: 'test-exclusive/alibaba-only',
+    display_name: 'Test Alibaba-only',
+    description: 'stub for unit tests',
+    context_length: 8192,
+    max_completion_tokens: 4096,
+    status: 'public',
+    flags: [],
+    gateway: 'alibaba',
+    internal_id: 'stub-internal',
+    pricing: null,
+    exclusive_to: [],
+    inference_provider_restriction: [],
+  };
+  return {
+    ...actual,
+    findKiloExclusiveModel: (id: string) =>
+      id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
+  };
+});
 
 describe('modelServesAllGatewayChatApis', () => {
   it('accepts a plain OpenRouter model (OpenRouter speaks all gateway chat APIs)', () => {
@@ -9,10 +36,8 @@ describe('modelServesAllGatewayChatApis', () => {
   });
 
   it('rejects a Kilo-exclusive model served by a chat-completions-only provider', () => {
-    expect(modelServesAllGatewayChatApis(morph_warp_grep_free_model.public_id)).toBe(false);
-    expect(gatewayChatApisForModel(morph_warp_grep_free_model.public_id)).toEqual([
-      'chat_completions',
-    ]);
+    expect(modelServesAllGatewayChatApis('test-exclusive/alibaba-only')).toBe(false);
+    expect(gatewayChatApisForModel('test-exclusive/alibaba-only')).toEqual(['chat_completions']);
   });
 
   it('treats disabled Kilo-exclusive models like plain OpenRouter models, matching get-provider', () => {

From 165240b37d7ee01b3e3e64a2a91fef19e2065f7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= <iscekic@protonmail.com>
Date: Mon, 15 Jun 2026 11:47:06 +0200
Subject: [PATCH 73/73] fix(auto-routing-benchmark): return 400 when starting a
 run without config

The POST /admin/runs handler let startRun's "config not set" precondition
error propagate to the global error handler, surfacing a client-side
precondition as HTTP 500. Guard the null config in the route handler,
mirroring the /admin/debug-cli pattern, and return 400 instead.
---
 services/auto-routing-benchmark/src/admin.test.ts | 7 +++++--
 services/auto-routing-benchmark/src/admin.ts      | 7 +++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 7c0cc73164..62df829df7 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -295,10 +295,13 @@ describe('POST /admin/runs', () => {
     expect(queueSendBatch).not.toHaveBeenCalled();
   });
 
-  it('rejects starting a run when no config has been saved', async () => {
+  it('returns 400 when no config has been saved', async () => {
     // getConfigRows already returns null config by default
     const res = await authedPost('/admin/runs', { kind: 'classifier' });
-    expect(res.status).toBe(500);
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      error: 'benchmark config not set: save it in the admin panel before starting a run',
+    });
     expect(insertRun).not.toHaveBeenCalled();
     expect(queueSendBatch).not.toHaveBeenCalled();
   });
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 9fc401ce08..3b470a56fd 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -41,6 +41,13 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
     zodJsonValidator(StartBenchmarkRunRequestSchema, { errorMessage: 'Invalid run request' }),
     async c => {
       const { kind, force } = c.req.valid('json');
+      const config = await getBenchmarkConfig(c.env.BENCH_DB);
+      if (!config) {
+        return c.json(
+          { error: 'benchmark config not set: save it in the admin panel before starting a run' },
+          400
+        );
+      }
       return c.json(await startRun(c.env, kind, { force }));
     }
   );