From b41e58e8ce9144623c3c14b136289b71b597d7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 21:58:35 +0200 Subject: [PATCH 01/73] refactor(auto-routing): move classifier core into contracts package --- packages/auto-routing-contracts/package.json | 5 +- .../src/classifier/index.ts | 20 ++ .../src/classifier}/model-classifier.test.ts | 4 +- .../src/classifier/model-classifier.ts | 200 ++++++++++++++++ .../src/classifier/output-fallback.test.ts | 4 +- .../src/classifier/output-fallback.ts | 4 +- .../src/classifier/output.test.ts | 2 +- .../src/classifier/output.ts | 4 +- .../src/classifier/prompt.test.ts | 4 +- .../src/classifier/prompt.ts | 4 +- .../src/classifier/taxonomy.json | 0 .../src/classifier/taxonomy.test.ts | 2 +- packages/auto-routing-contracts/tsconfig.json | 2 + .../auto-routing-contracts/vitest.config.ts | 9 + pnpm-lock.yaml | 12 +- .../src/admin-classifier-model.ts | 2 +- .../src/classifier-analytics.test.ts | 2 +- .../auto-routing/src/classifier-analytics.ts | 2 +- .../src/classifier-config.test.ts | 2 +- .../auto-routing/src/classifier-config.ts | 2 +- services/auto-routing/src/decide.ts | 2 +- .../auto-routing/src/decision-cache.test.ts | 2 +- services/auto-routing/src/model-classifier.ts | 215 ++---------------- 23 files changed, 278 insertions(+), 227 deletions(-) create mode 100644 packages/auto-routing-contracts/src/classifier/index.ts rename {services/auto-routing/src => packages/auto-routing-contracts/src/classifier}/model-classifier.test.ts (97%) create mode 100644 packages/auto-routing-contracts/src/classifier/model-classifier.ts rename services/auto-routing/src/classifier-output/fallback.test.ts => packages/auto-routing-contracts/src/classifier/output-fallback.test.ts (92%) rename services/auto-routing/src/classifier-output/fallback.ts => packages/auto-routing-contracts/src/classifier/output-fallback.ts (92%) rename services/auto-routing/src/classifier-output/index.test.ts => 
packages/auto-routing-contracts/src/classifier/output.test.ts (99%) rename services/auto-routing/src/classifier-output/index.ts => packages/auto-routing-contracts/src/classifier/output.ts (98%) rename services/auto-routing/src/classifier-prompt.test.ts => packages/auto-routing-contracts/src/classifier/prompt.test.ts (97%) rename services/auto-routing/src/classifier-prompt.ts => packages/auto-routing-contracts/src/classifier/prompt.ts (96%) rename services/auto-routing/src/classifier-taxonomy.json => packages/auto-routing-contracts/src/classifier/taxonomy.json (100%) rename services/auto-routing/src/classifier-taxonomy.test.ts => packages/auto-routing-contracts/src/classifier/taxonomy.test.ts (96%) create mode 100644 packages/auto-routing-contracts/vitest.config.ts diff --git a/packages/auto-routing-contracts/package.json b/packages/auto-routing-contracts/package.json index 43e1bd2cfd..6ea28e8178 100644 --- a/packages/auto-routing-contracts/package.json +++ b/packages/auto-routing-contracts/package.json @@ -6,7 +6,8 @@ "main": "./src/index.ts", "types": "./src/index.ts", "exports": { - ".": "./src/index.ts" + ".": "./src/index.ts", + "./classifier": "./src/classifier/index.ts" }, "scripts": { "typecheck": "tsgo --noEmit", @@ -14,9 +15,11 @@ "test": "vitest run" }, "dependencies": { + "@openrouter/sdk": "^0.12.79", "zod": "catalog:" }, "devDependencies": { + "@types/node": "catalog:", "@typescript/native-preview": "catalog:", "typescript": "catalog:", "vitest": "catalog:" diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts new file mode 100644 index 0000000000..d3422ad6a7 --- /dev/null +++ b/packages/auto-routing-contracts/src/classifier/index.ts @@ -0,0 +1,20 @@ +export { + buildClassifierMessages, + CLASSIFIER_MAX_TOKENS, + DEFAULT_CLASSIFIER_MODEL, +} from './prompt'; +export { + ClassifierOutputParseError, + parseClassifierOutput, + type ClassifierOutput, +} from './output'; +export { 
fallbackClassifierOutput } from './output-fallback'; +export { + classifyWithOpenRouter, + ClassifierRunError, + type ClassifierCallOptions, + type ClassifierModelCallMeta, + type ClassifierRunFailureMetadata, + type ClassifierRunFallbackMetadata, + type ClassifierRunResult, +} from './model-classifier'; diff --git a/services/auto-routing/src/model-classifier.test.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts similarity index 97% rename from services/auto-routing/src/model-classifier.test.ts rename to packages/auto-routing-contracts/src/classifier/model-classifier.test.ts index 622409612e..de54484a8d 100644 --- a/services/auto-routing/src/model-classifier.test.ts +++ b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts @@ -1,9 +1,9 @@ import { describe, expect, it, vi } from 'vitest'; import type { OpenRouter } from '@openrouter/sdk'; import type { ChatResult } from '@openrouter/sdk/models'; -import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt'; +import { DEFAULT_CLASSIFIER_MODEL } from './prompt'; import { ClassifierRunError, classifyWithOpenRouter } from './model-classifier'; -import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; +import type { NormalizedClassifierInput } from '../index'; const normalizedInput = { apiKind: 'responses', diff --git a/packages/auto-routing-contracts/src/classifier/model-classifier.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.ts new file mode 100644 index 0000000000..645276dd6a --- /dev/null +++ b/packages/auto-routing-contracts/src/classifier/model-classifier.ts @@ -0,0 +1,200 @@ +import type { OpenRouter } from '@openrouter/sdk'; +import type { ChatResult } from '@openrouter/sdk/models'; +import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './prompt'; +import type { NormalizedClassifierInput } from '../index'; +import { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from 
'./output'; +import { fallbackClassifierOutput } from './output-fallback'; + +export type ClassifierRunResult = { + cost: number | null; + classifierModel: string; + classification: ClassifierOutput; + fallback?: ClassifierRunFallbackMetadata; + modelCallMeta?: ClassifierModelCallMeta; + retried?: boolean; + // Why the first attempt was retried; present only when retried is true. + firstAttemptFailure?: { + reason: string; + failureStage: string | null; + finishReason: string | null; + }; +}; + +export type ClassifierModelCallMeta = { + finishReason: string | null; + completionTokens: number | null; + reasoningTokens: number | null; + // Length only — the raw output is derived from untrusted, mirrored user + // prompts and must not reach persistent logs. Combined with finishReason + // and token counts this still distinguishes truncation from prompt echo. + textLength: number | null; +}; + +export type ClassifierRunFailureMetadata = { + cost: number | null; + classifierModel: string; + failureStage?: string; + schemaIssueSummary?: string[]; + topLevelKeys?: string[]; +}; + +export type ClassifierRunFallbackMetadata = { + reason: 'no_text' | 'invalid_output'; + failureStage?: string; + schemaIssueSummary?: string[]; + topLevelKeys?: string[]; +}; + +export class ClassifierRunError extends Error { + readonly cost: number | null; + readonly classifierModel: string; + readonly failureStage?: string; + readonly schemaIssueSummary: string[]; + readonly topLevelKeys: string[]; + + constructor(message: string, metadata: ClassifierRunFailureMetadata) { + super(message); + this.name = 'ClassifierRunError'; + this.cost = metadata.cost; + this.classifierModel = metadata.classifierModel; + this.failureStage = metadata.failureStage; + this.schemaIssueSummary = metadata.schemaIssueSummary ?? []; + this.topLevelKeys = metadata.topLevelKeys ?? 
[]; + } +} + +export type ClassifierCallOptions = { + // Sticky routing key passed to OpenRouter so requests from the same + // session land on the same provider and reuse its prompt cache. + openrouterSessionId?: string; +}; + +export async function classifyWithOpenRouter( + client: OpenRouter, + input: NormalizedClassifierInput, + classifierModel: string, + options: ClassifierCallOptions = {} +): Promise { + // Invalid output is usually a transient provider glitch (responses cut + // off after a handful of tokens with a "stop" finish reason), so one + // retry recovers most of those classifications. + const firstAttempt = await runClassifierAttempt(client, input, classifierModel, options); + if (!firstAttempt.fallback) { + return firstAttempt; + } + + let retryAttempt: ClassifierRunResult; + try { + retryAttempt = await runClassifierAttempt(client, input, classifierModel, options); + } catch (error) { + // The retry threw (e.g. a transport error) after the first attempt had + // already billed and produced diagnostics. Surface those rather than + // letting the raw error escape and underreport spend. + throw new ClassifierRunError( + error instanceof Error ? error.message : 'classifier retry failed', + { + cost: firstAttempt.cost, + classifierModel, + failureStage: firstAttempt.fallback.failureStage ?? firstAttempt.fallback.reason, + schemaIssueSummary: firstAttempt.fallback.schemaIssueSummary, + topLevelKeys: firstAttempt.fallback.topLevelKeys, + } + ); + } + return { + ...retryAttempt, + cost: sumCosts(firstAttempt.cost, retryAttempt.cost), + retried: true, + firstAttemptFailure: { + reason: firstAttempt.fallback.reason, + failureStage: firstAttempt.fallback.failureStage ?? null, + finishReason: firstAttempt.modelCallMeta?.finishReason ?? null, + }, + }; +} + +function sumCosts(first: number | null, second: number | null): number | null { + if (first === null && second === null) return null; + return (first ?? 0) + (second ?? 
0); +} + +async function runClassifierAttempt( + client: OpenRouter, + input: NormalizedClassifierInput, + classifierModel: string, + options: ClassifierCallOptions +): Promise { + const result = await client.chat.send({ + chatRequest: { + model: classifierModel, + messages: buildClassifierMessages(input), + responseFormat: { type: 'json_object' }, + stream: false, + temperature: 0, + maxTokens: CLASSIFIER_MAX_TOKENS, + ...(options.openrouterSessionId ? { sessionId: options.openrouterSessionId } : {}), + }, + }); + + const cost = result.usage?.cost ?? null; + const text = extractClassifierText(result); + const modelCallMeta = extractModelCallMeta(result, text); + if (!text) { + return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, { + reason: 'no_text', + }); + } + + try { + return { + cost, + classifierModel, + classification: parseClassifierOutput(text), + modelCallMeta, + }; + } catch (error) { + return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, { + reason: 'invalid_output', + ...(error instanceof ClassifierOutputParseError + ? { + failureStage: error.failureStage, + schemaIssueSummary: error.schemaIssueSummary, + topLevelKeys: error.topLevelKeys, + } + : {}), + }); + } +} + +function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta { + return { + finishReason: result.choices[0]?.finishReason ?? null, + completionTokens: result.usage?.completionTokens ?? null, + reasoningTokens: result.usage?.completionTokensDetails?.reasoningTokens ?? null, + textLength: text?.length ?? 
null, + }; +} + +function fallbackClassifierResult( + input: NormalizedClassifierInput, + classifierModel: string, + cost: number | null, + modelCallMeta: ClassifierModelCallMeta, + fallback: ClassifierRunFallbackMetadata +): ClassifierRunResult { + return { + cost, + classifierModel, + classification: fallbackClassifierOutput(input), + fallback, + modelCallMeta, + }; +} + +function extractClassifierText(result: ChatResult) { + const content: unknown = result.choices[0]?.message.content; + if (typeof content === 'string' && content.trim().length > 0) { + return content; + } + return null; +} diff --git a/services/auto-routing/src/classifier-output/fallback.test.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts similarity index 92% rename from services/auto-routing/src/classifier-output/fallback.test.ts rename to packages/auto-routing-contracts/src/classifier/output-fallback.test.ts index c5ee6394a1..6bafe4acf3 100644 --- a/services/auto-routing/src/classifier-output/fallback.test.ts +++ b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; -import { fallbackClassifierOutput } from './fallback'; +import type { NormalizedClassifierInput } from '../index'; +import { fallbackClassifierOutput } from './output-fallback'; const input = { apiKind: 'chat_completions', diff --git a/services/auto-routing/src/classifier-output/fallback.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.ts similarity index 92% rename from services/auto-routing/src/classifier-output/fallback.ts rename to packages/auto-routing-contracts/src/classifier/output-fallback.ts index c047813e50..969374b893 100644 --- a/services/auto-routing/src/classifier-output/fallback.ts +++ b/packages/auto-routing-contracts/src/classifier/output-fallback.ts @@ -1,5 +1,5 @@ -import type { 
NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; -import type { ClassifierOutput } from './index'; +import type { NormalizedClassifierInput } from '../index'; +import type { ClassifierOutput } from './output'; type IntentRule = { taskType: ClassifierOutput['taskType']; diff --git a/services/auto-routing/src/classifier-output/index.test.ts b/packages/auto-routing-contracts/src/classifier/output.test.ts similarity index 99% rename from services/auto-routing/src/classifier-output/index.test.ts rename to packages/auto-routing-contracts/src/classifier/output.test.ts index d57003b00d..e842a4b178 100644 --- a/services/auto-routing/src/classifier-output/index.test.ts +++ b/packages/auto-routing-contracts/src/classifier/output.test.ts @@ -4,7 +4,7 @@ import { parseClassifierOutput, type ClassifierOutputParseError, type ClassifierOutput, -} from './index'; +} from './output'; const validOutput = { taskType: 'debugging', diff --git a/services/auto-routing/src/classifier-output/index.ts b/packages/auto-routing-contracts/src/classifier/output.ts similarity index 98% rename from services/auto-routing/src/classifier-output/index.ts rename to packages/auto-routing-contracts/src/classifier/output.ts index 1796e4b724..8acd5392fc 100644 --- a/services/auto-routing/src/classifier-output/index.ts +++ b/packages/auto-routing-contracts/src/classifier/output.ts @@ -1,5 +1,5 @@ -import { ClassifierOutputSchema, type ClassifierOutput } from '@kilocode/auto-routing-contracts'; -import classifierTaxonomy from '../classifier-taxonomy.json'; +import { ClassifierOutputSchema, type ClassifierOutput } from '../index'; +import classifierTaxonomy from './taxonomy.json'; export const classifierOutputSchema = ClassifierOutputSchema; export type { ClassifierOutput }; diff --git a/services/auto-routing/src/classifier-prompt.test.ts b/packages/auto-routing-contracts/src/classifier/prompt.test.ts similarity index 97% rename from services/auto-routing/src/classifier-prompt.test.ts 
rename to packages/auto-routing-contracts/src/classifier/prompt.test.ts index 782c5a22c6..e3444fedc4 100644 --- a/services/auto-routing/src/classifier-prompt.test.ts +++ b/packages/auto-routing-contracts/src/classifier/prompt.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt'; -import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; +import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './prompt'; +import type { NormalizedClassifierInput } from '../index'; const input = { apiKind: 'chat_completions', diff --git a/services/auto-routing/src/classifier-prompt.ts b/packages/auto-routing-contracts/src/classifier/prompt.ts similarity index 96% rename from services/auto-routing/src/classifier-prompt.ts rename to packages/auto-routing-contracts/src/classifier/prompt.ts index 641df0fb24..efaf1793fd 100644 --- a/services/auto-routing/src/classifier-prompt.ts +++ b/packages/auto-routing-contracts/src/classifier/prompt.ts @@ -1,5 +1,5 @@ -import classifierTaxonomy from './classifier-taxonomy.json'; -import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; +import classifierTaxonomy from './taxonomy.json'; +import type { NormalizedClassifierInput } from '../index'; export const DEFAULT_CLASSIFIER_MODEL = 'google/gemini-2.5-flash-lite'; // The classification JSON needs ~60 tokens; the headroom avoids truncated diff --git a/services/auto-routing/src/classifier-taxonomy.json b/packages/auto-routing-contracts/src/classifier/taxonomy.json similarity index 100% rename from services/auto-routing/src/classifier-taxonomy.json rename to packages/auto-routing-contracts/src/classifier/taxonomy.json diff --git a/services/auto-routing/src/classifier-taxonomy.test.ts b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts similarity index 96% rename from services/auto-routing/src/classifier-taxonomy.test.ts 
rename to packages/auto-routing-contracts/src/classifier/taxonomy.test.ts index dc510492cf..b3a3ab7dd0 100644 --- a/services/auto-routing/src/classifier-taxonomy.test.ts +++ b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts @@ -46,7 +46,7 @@ const TaxonomySchema = z.object({ }); async function readTaxonomy() { - const file = await readFile(join(__dirname, 'classifier-taxonomy.json'), 'utf8'); + const file = await readFile(join(__dirname, 'taxonomy.json'), 'utf8'); return TaxonomySchema.parse(JSON.parse(file)); } diff --git a/packages/auto-routing-contracts/tsconfig.json b/packages/auto-routing-contracts/tsconfig.json index 76473b226e..b293f0f4ef 100644 --- a/packages/auto-routing-contracts/tsconfig.json +++ b/packages/auto-routing-contracts/tsconfig.json @@ -4,11 +4,13 @@ "module": "ESNext", "moduleResolution": "bundler", "lib": ["ESNext", "WebWorker"], + "types": ["node"], "strict": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true, "noEmit": true, "isolatedModules": true, + "resolveJsonModule": true, "noImplicitReturns": true, "noFallthroughCasesInSwitch": true }, diff --git a/packages/auto-routing-contracts/vitest.config.ts b/packages/auto-routing-contracts/vitest.config.ts new file mode 100644 index 0000000000..7dd13254e7 --- /dev/null +++ b/packages/auto-routing-contracts/vitest.config.ts @@ -0,0 +1,9 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + globals: true, + environment: 'node', + include: ['src/**/*.test.ts'], + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e208522daa..7677b03452 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -990,10 +990,16 @@ importers: packages/auto-routing-contracts: dependencies: + '@openrouter/sdk': + specifier: ^0.12.79 + version: 0.12.79 zod: specifier: 'catalog:' version: 4.4.3 devDependencies: + '@types/node': + specifier: 'catalog:' + version: 24.12.4 '@typescript/native-preview': specifier: 'catalog:' version: 
7.0.0-dev.20260514.1 @@ -1002,7 +1008,7 @@ importers: version: 5.9.3 vitest: specifier: 'catalog:' - version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4) + version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4) packages/cloud-agent-profile: dependencies: @@ -1530,7 +1536,7 @@ importers: version: 5.9.3 vitest: specifier: 'catalog:' - version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4) + version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4) wrangler: specifier: 'catalog:' version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6) @@ -23833,7 +23839,7 @@ snapshots: '@types/pg@8.18.0': dependencies: - '@types/node': 25.5.2 + '@types/node': 24.12.4 pg-protocol: 1.13.0 pg-types: 2.2.0 diff --git a/services/auto-routing/src/admin-classifier-model.ts b/services/auto-routing/src/admin-classifier-model.ts index 7fc6660e31..8eb5f2f0dd 100644 --- a/services/auto-routing/src/admin-classifier-model.ts +++ b/services/auto-routing/src/admin-classifier-model.ts @@ -3,7 +3,7 @@ import { type AutoRoutingClassifierModelResponse, } from '@kilocode/auto-routing-contracts'; import type { Handler } from 'hono'; -import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt'; +import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier'; import { getClassifierModel, setClassifierModel } from './classifier-config'; import type { HonoEnv } from './hono-env'; diff --git a/services/auto-routing/src/classifier-analytics.test.ts 
b/services/auto-routing/src/classifier-analytics.test.ts index e3ebc38e0c..11a8d5f12e 100644 --- a/services/auto-routing/src/classifier-analytics.test.ts +++ b/services/auto-routing/src/classifier-analytics.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it, vi } from 'vitest'; import { writeClassifierMetricsDataPoint } from './classifier-analytics'; -import type { ClassifierOutput } from './classifier-output'; +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier'; const classification = { taskType: 'debugging', diff --git a/services/auto-routing/src/classifier-analytics.ts b/services/auto-routing/src/classifier-analytics.ts index b0ceb9a4c4..08c5c0deb3 100644 --- a/services/auto-routing/src/classifier-analytics.ts +++ b/services/auto-routing/src/classifier-analytics.ts @@ -1,4 +1,4 @@ -import type { ClassifierOutput } from './classifier-output'; +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier'; export type ClassifierAnalyticsStatus = | 'classified' diff --git a/services/auto-routing/src/classifier-config.test.ts b/services/auto-routing/src/classifier-config.test.ts index fbd3a0e8c4..08c83c4b59 100644 --- a/services/auto-routing/src/classifier-config.test.ts +++ b/services/auto-routing/src/classifier-config.test.ts @@ -1,5 +1,5 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; -import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt'; +import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier'; import { CLASSIFIER_MODEL_CONFIG_KEY, clearClassifierConfigCache, diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts index 6b0687a539..e9025a9c95 100644 --- a/services/auto-routing/src/classifier-config.ts +++ b/services/auto-routing/src/classifier-config.ts @@ -1,5 +1,5 @@ import { formatError } from '@kilocode/worker-utils'; -import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt'; +import { 
DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier'; import { ttlCached } from './ttl-cache'; export const CLASSIFIER_MODEL_CONFIG_KEY = 'classifier_model'; diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts index 3cc94edc56..4303192f60 100644 --- a/services/auto-routing/src/decide.ts +++ b/services/auto-routing/src/decide.ts @@ -9,7 +9,7 @@ import type { Handler } from 'hono'; import { writeClassifierMetricsDataPoint } from './classifier-analytics'; import type { ClassifierAnalyticsStatus } from './classifier-analytics'; import { getClassifierModel, getDecisionLogSampleRate } from './classifier-config'; -import type { ClassifierOutput } from './classifier-output'; +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier'; import { computeContentHashes, deriveConversationKey, diff --git a/services/auto-routing/src/decision-cache.test.ts b/services/auto-routing/src/decision-cache.test.ts index 1e3245835d..c61cd2eb97 100644 --- a/services/auto-routing/src/decision-cache.test.ts +++ b/services/auto-routing/src/decision-cache.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; -import type { ClassifierOutput } from './classifier-output'; +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier'; import { AutoRoutingDecisionCacheDO } from './decision-cache'; const classification = { diff --git a/services/auto-routing/src/model-classifier.ts b/services/auto-routing/src/model-classifier.ts index 94d7f672cf..e9a9898f13 100644 --- a/services/auto-routing/src/model-classifier.ts +++ b/services/auto-routing/src/model-classifier.ts @@ -1,81 +1,22 @@ -import type { OpenRouter } from '@openrouter/sdk'; -import type { ChatResult } from '@openrouter/sdk/models'; -import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './classifier-prompt'; +import { classifyWithOpenRouter } from 
'@kilocode/auto-routing-contracts/classifier'; +import type { + ClassifierCallOptions, + ClassifierRunResult, +} from '@kilocode/auto-routing-contracts/classifier'; import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; -import { - ClassifierOutputParseError, - parseClassifierOutput, - type ClassifierOutput, -} from './classifier-output'; -import { fallbackClassifierOutput } from './classifier-output/fallback'; import { createOpenRouterClient } from './openrouter'; -export type ClassifierRunResult = { - cost: number | null; - classifierModel: string; - classification: ClassifierOutput; - fallback?: ClassifierRunFallbackMetadata; - modelCallMeta?: ClassifierModelCallMeta; - retried?: boolean; - // Why the first attempt was retried; present only when retried is true. - firstAttemptFailure?: { - reason: string; - failureStage: string | null; - finishReason: string | null; - }; -}; - -export type ClassifierModelCallMeta = { - finishReason: string | null; - completionTokens: number | null; - reasoningTokens: number | null; - // Length only — the raw output is derived from untrusted, mirrored user - // prompts and must not reach persistent logs. Combined with finishReason - // and token counts this still distinguishes truncation from prompt echo. 
- textLength: number | null; -}; - -export type ClassifierRunFailureMetadata = { - cost: number | null; - classifierModel: string; - failureStage?: string; - schemaIssueSummary?: string[]; - topLevelKeys?: string[]; -}; - -export type ClassifierRunFallbackMetadata = { - reason: 'no_text' | 'invalid_output'; - failureStage?: string; - schemaIssueSummary?: string[]; - topLevelKeys?: string[]; -}; - -export class ClassifierRunError extends Error { - readonly cost: number | null; - readonly classifierModel: string; - readonly failureStage?: string; - readonly schemaIssueSummary: string[]; - readonly topLevelKeys: string[]; - - constructor(message: string, metadata: ClassifierRunFailureMetadata) { - super(message); - this.name = 'ClassifierRunError'; - this.cost = metadata.cost; - this.classifierModel = metadata.classifierModel; - this.failureStage = metadata.failureStage; - this.schemaIssueSummary = metadata.schemaIssueSummary ?? []; - this.topLevelKeys = metadata.topLevelKeys ?? []; - } -} +export { + ClassifierRunError, + classifyWithOpenRouter, +} from '@kilocode/auto-routing-contracts/classifier'; +export type { + ClassifierCallOptions, + ClassifierRunResult, +} from '@kilocode/auto-routing-contracts/classifier'; type ClassifierEnv = Pick; -export type ClassifierCallOptions = { - // Sticky routing key passed to OpenRouter so requests from the same - // session land on the same provider and reuse its prompt cache. 
- openrouterSessionId?: string; -}; - export async function classifyNormalizedInput( env: ClassifierEnv, input: NormalizedClassifierInput, @@ -85,133 +26,3 @@ export async function classifyNormalizedInput( const client = await createOpenRouterClient(env); return classifyWithOpenRouter(client, input, classifierModel, options); } - -export async function classifyWithOpenRouter( - client: OpenRouter, - input: NormalizedClassifierInput, - classifierModel: string, - options: ClassifierCallOptions = {} -): Promise { - // Invalid output is usually a transient provider glitch (responses cut - // off after a handful of tokens with a "stop" finish reason), so one - // retry recovers most of those classifications. - const firstAttempt = await runClassifierAttempt(client, input, classifierModel, options); - if (!firstAttempt.fallback) { - return firstAttempt; - } - - let retryAttempt: ClassifierRunResult; - try { - retryAttempt = await runClassifierAttempt(client, input, classifierModel, options); - } catch (error) { - // The retry threw (e.g. a transport error) after the first attempt had - // already billed and produced diagnostics. Surface those rather than - // letting the raw error escape and underreport spend. - throw new ClassifierRunError( - error instanceof Error ? error.message : 'classifier retry failed', - { - cost: firstAttempt.cost, - classifierModel, - failureStage: firstAttempt.fallback.failureStage ?? firstAttempt.fallback.reason, - schemaIssueSummary: firstAttempt.fallback.schemaIssueSummary, - topLevelKeys: firstAttempt.fallback.topLevelKeys, - } - ); - } - return { - ...retryAttempt, - cost: sumCosts(firstAttempt.cost, retryAttempt.cost), - retried: true, - firstAttemptFailure: { - reason: firstAttempt.fallback.reason, - failureStage: firstAttempt.fallback.failureStage ?? null, - finishReason: firstAttempt.modelCallMeta?.finishReason ?? 
null, - }, - }; -} - -function sumCosts(first: number | null, second: number | null): number | null { - if (first === null && second === null) return null; - return (first ?? 0) + (second ?? 0); -} - -async function runClassifierAttempt( - client: OpenRouter, - input: NormalizedClassifierInput, - classifierModel: string, - options: ClassifierCallOptions -): Promise { - const result = await client.chat.send({ - chatRequest: { - model: classifierModel, - messages: buildClassifierMessages(input), - responseFormat: { type: 'json_object' }, - stream: false, - temperature: 0, - maxTokens: CLASSIFIER_MAX_TOKENS, - ...(options.openrouterSessionId ? { sessionId: options.openrouterSessionId } : {}), - }, - }); - - const cost = result.usage?.cost ?? null; - const text = extractClassifierText(result); - const modelCallMeta = extractModelCallMeta(result, text); - if (!text) { - return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, { - reason: 'no_text', - }); - } - - try { - return { - cost, - classifierModel, - classification: parseClassifierOutput(text), - modelCallMeta, - }; - } catch (error) { - return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, { - reason: 'invalid_output', - ...(error instanceof ClassifierOutputParseError - ? { - failureStage: error.failureStage, - schemaIssueSummary: error.schemaIssueSummary, - topLevelKeys: error.topLevelKeys, - } - : {}), - }); - } -} - -function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta { - return { - finishReason: result.choices[0]?.finishReason ?? null, - completionTokens: result.usage?.completionTokens ?? null, - reasoningTokens: result.usage?.completionTokensDetails?.reasoningTokens ?? null, - textLength: text?.length ?? 
null, - }; -} - -function fallbackClassifierResult( - input: NormalizedClassifierInput, - classifierModel: string, - cost: number | null, - modelCallMeta: ClassifierModelCallMeta, - fallback: ClassifierRunFallbackMetadata -): ClassifierRunResult { - return { - cost, - classifierModel, - classification: fallbackClassifierOutput(input), - fallback, - modelCallMeta, - }; -} - -function extractClassifierText(result: ChatResult) { - const content: unknown = result.choices[0]?.message.content; - if (typeof content === 'string' && content.trim().length > 0) { - return content; - } - return null; -} From 1fb85f5c5e83165046a58c5876de23f641ce9bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:04:48 +0200 Subject: [PATCH 02/73] feat(auto-routing): add tier, routing-table, decision and benchmark contracts --- .../auto-routing-contracts/src/benchmark.ts | 63 +++++++++++++++++++ .../src/classifier/index.ts | 12 +--- packages/auto-routing-contracts/src/index.ts | 17 ++++- .../src/routing-table.test.ts | 44 +++++++++++++ .../src/routing-table.ts | 49 +++++++++++++++ .../auto-routing-contracts/src/tiers.test.ts | 60 ++++++++++++++++++ packages/auto-routing-contracts/src/tiers.ts | 32 ++++++++++ 7 files changed, 266 insertions(+), 11 deletions(-) create mode 100644 packages/auto-routing-contracts/src/benchmark.ts create mode 100644 packages/auto-routing-contracts/src/routing-table.test.ts create mode 100644 packages/auto-routing-contracts/src/routing-table.ts create mode 100644 packages/auto-routing-contracts/src/tiers.test.ts create mode 100644 packages/auto-routing-contracts/src/tiers.ts diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts new file mode 100644 index 0000000000..7c14447a40 --- /dev/null +++ b/packages/auto-routing-contracts/src/benchmark.ts @@ -0,0 +1,63 @@ +import * as z from 'zod'; +import { ClassifierApiKindSchema } from './routing-table'; +import { 
DifficultyTierSchema } from './tiers'; + +export const BenchmarkKindSchema = z.enum(['classifier', 'decider']); +export type BenchmarkKind = z.infer; + +export const BenchmarkDeciderModelSchema = z.object({ + id: z.string().trim().min(1), + // Which gateway API kinds this model can serve when chosen by the router. + // The benchmark itself always exercises chat completions. + supportedApiKinds: z.array(ClassifierApiKindSchema).min(1).default(['chat_completions']), +}); +export type BenchmarkDeciderModel = z.infer; + +export const BenchmarkConfigSchema = z.object({ + classifierModels: z.array(z.string().trim().min(1)).min(1), + deciderModels: z.array(BenchmarkDeciderModelSchema).min(1), + // Accuracy threshold for "gets the job done" (per tier). + minAccuracy: z.number().min(0).max(1), + // Parallel OpenRouter calls per queue message. + maxConcurrency: z.number().int().min(1).max(16), + updatedAt: z.string().nullable(), + updatedBy: z.string().nullable(), +}); +export type BenchmarkConfig = z.infer; + +export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']); + +export const BenchmarkModelSummarySchema = z.object({ + model: z.string(), + // '*' for classifier runs (no tiering), otherwise the difficulty tier. 
+ tier: z.union([DifficultyTierSchema, z.literal('*')]), + accuracy: z.number(), + avgCostUsd: z.number().nullable(), + avgLatencyMs: z.number(), + p50LatencyMs: z.number().nullable(), + cases: z.number().int(), + errors: z.number().int(), +}); +export type BenchmarkModelSummary = z.infer; + +export const BenchmarkRunSchema = z.object({ + id: z.string(), + kind: BenchmarkKindSchema, + status: BenchmarkRunStatusSchema, + startedAt: z.string(), + completedAt: z.string().nullable(), + error: z.string().nullable(), + summaries: z.array(BenchmarkModelSummarySchema), +}); +export type BenchmarkRun = z.infer; + +export const BenchmarkRunsResponseSchema = z.object({ runs: z.array(BenchmarkRunSchema) }); +export const BenchmarkConfigResponseSchema = z.object({ + config: BenchmarkConfigSchema, + defaults: BenchmarkConfigSchema, +}); +export const StartBenchmarkRunRequestSchema = z.object({ kind: BenchmarkKindSchema }); +export const StartBenchmarkRunResponseSchema = z.object({ + runId: z.string(), + enqueuedModels: z.number().int(), +}); diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts index d3422ad6a7..78c27cb244 100644 --- a/packages/auto-routing-contracts/src/classifier/index.ts +++ b/packages/auto-routing-contracts/src/classifier/index.ts @@ -1,13 +1,5 @@ -export { - buildClassifierMessages, - CLASSIFIER_MAX_TOKENS, - DEFAULT_CLASSIFIER_MODEL, -} from './prompt'; -export { - ClassifierOutputParseError, - parseClassifierOutput, - type ClassifierOutput, -} from './output'; +export { buildClassifierMessages, CLASSIFIER_MAX_TOKENS, DEFAULT_CLASSIFIER_MODEL } from './prompt'; +export { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output'; export { fallbackClassifierOutput } from './output-fallback'; export { classifyWithOpenRouter, diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts index 
ef537f600e..c7022c5477 100644 --- a/packages/auto-routing-contracts/src/index.ts +++ b/packages/auto-routing-contracts/src/index.ts @@ -1,5 +1,6 @@ import * as z from 'zod'; import { NormalizedClassifierInputSchema } from './input'; +import { DifficultyTierSchema } from './tiers'; export { NormalizedClassifierInputSchema, @@ -96,9 +97,19 @@ export const ClassifierOutputSchema = z }); export type ClassifierOutput = z.infer; +export const AutoRoutingDecisionSchema = z.object({ + model: z.string(), + tier: DifficultyTierSchema, + source: z.enum(['benchmark', 'default']), + tableVersion: z.string(), +}); +export type AutoRoutingDecision = z.infer; + export const AutoRoutingDecisionResponseSchema = z.object({ cost: z.number(), - decision: z.null(), + // Null when classification failed or no table candidate supports the + // request's API kind; the gateway then falls back to its static default. + decision: AutoRoutingDecisionSchema.nullable(), classifierResult: z .object({ classification: ClassifierOutputSchema, @@ -158,3 +169,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer< >; export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize'; + +export * from './tiers'; +export * from './routing-table'; +export * from './benchmark'; diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts new file mode 100644 index 0000000000..c1180b5371 --- /dev/null +++ b/packages/auto-routing-contracts/src/routing-table.test.ts @@ -0,0 +1,44 @@ +import { describe, expect, it } from 'vitest'; +import { rankCandidates, RoutingTableSchema } from './routing-table'; + +const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({ + model, + accuracy, + avgCostUsd, + meetsThreshold: false, + supportedApiKinds: ['chat_completions' as const], +}); + +describe('rankCandidates', () => { + it('puts the cheapest above-threshold candidate first', () => { + 
const ranked = rankCandidates( + [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)], + 0.7 + ); + expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']); + expect(ranked[0].meetsThreshold).toBe(true); + expect(ranked[2].meetsThreshold).toBe(false); + }); + it('falls back to highest accuracy when nothing meets the threshold', () => { + const ranked = rankCandidates([candidate('a', 0.5, 1), candidate('b', 0.6, 5)], 0.9); + expect(ranked[0].model).toBe('b'); + }); + it('breaks cost ties by accuracy', () => { + const ranked = rankCandidates([candidate('a', 0.8, 1), candidate('b', 0.9, 1)], 0.7); + expect(ranked[0].model).toBe('b'); + }); +}); + +describe('RoutingTableSchema', () => { + it('requires at least one candidate per tier', () => { + expect( + RoutingTableSchema.safeParse({ + version: 'v', + generatedAt: new Date(0).toISOString(), + minAccuracy: 0.7, + source: 'benchmark', + tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] }, + }).success + ).toBe(false); + }); +}); diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts new file mode 100644 index 0000000000..acb892cbd8 --- /dev/null +++ b/packages/auto-routing-contracts/src/routing-table.ts @@ -0,0 +1,49 @@ +import * as z from 'zod'; +import { DifficultyTierSchema } from './tiers'; + +export const ClassifierApiKindSchema = z.enum(['chat_completions', 'responses', 'messages']); + +export const RankedCandidateSchema = z.object({ + model: z.string().trim().min(1), + // Benchmark accuracy in [0, 1] for this tier. + accuracy: z.number().min(0).max(1), + // Average observed OpenRouter cost per benchmark case, in USD credits. 
+ avgCostUsd: z.number().nonnegative(), + meetsThreshold: z.boolean(), + supportedApiKinds: z.array(ClassifierApiKindSchema).min(1), +}); +export type RankedCandidate = z.infer; + +export const RoutingTableSchema = z.object({ + // Benchmark run id (or 'default' for the built-in table). + version: z.string().min(1), + generatedAt: z.string().min(1), + minAccuracy: z.number().min(0).max(1), + source: z.enum(['benchmark', 'default']), + tiers: z.object({ + low: z.array(RankedCandidateSchema).min(1), + medium: z.array(RankedCandidateSchema).min(1), + high: z.array(RankedCandidateSchema).min(1), + }), +}); +export type RoutingTable = z.infer; + +export const ROUTING_TABLE_KV_KEY = 'routing_table_v1'; + +// "Best bang for buck": candidates meeting the accuracy threshold come +// first, cheapest first (accuracy breaks ties); below-threshold candidates +// follow ordered by accuracy so a degenerate table still routes sensibly. +export function rankCandidates( + candidates: ReadonlyArray & { meetsThreshold?: boolean }>, + minAccuracy: number +): RankedCandidate[] { + const flagged = candidates.map(c => ({ ...c, meetsThreshold: c.accuracy >= minAccuracy })); + return flagged.toSorted((a, b) => { + if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? 
-1 : 1; + if (a.meetsThreshold) { + return a.avgCostUsd - b.avgCostUsd || b.accuracy - a.accuracy; + } + return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd; + }); +} + diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts new file mode 100644 index 0000000000..edf3a9d6c8 --- /dev/null +++ b/packages/auto-routing-contracts/src/tiers.test.ts @@ -0,0 +1,60 @@ +import { describe, expect, it } from 'vitest'; +import { deriveDifficultyTier } from './tiers'; +import type { ClassifierOutput } from './index'; + +function classification(overrides: Partial): ClassifierOutput { + return { + taskType: 'implementation', + subtaskType: 'code_generation', + contextComplexity: 'small', + reasoningComplexity: 'low', + riskLevel: 'low', + executionMode: 'answer_only', + requiresTools: false, + confidence: 0.9, + ...overrides, + }; +} + +describe('deriveDifficultyTier', () => { + it('classifies trivial answer-only requests as low', () => { + expect(deriveDifficultyTier(classification({}))).toBe('low'); + }); + it('classifies mid-size code changes as medium', () => { + expect( + deriveDifficultyTier( + classification({ + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + }) + ) + ).toBe('medium'); + }); + it('classifies high-reasoning multi-step work as high', () => { + expect( + deriveDifficultyTier( + classification({ + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + riskLevel: 'high', + }) + ) + ).toBe('high'); + }); + it('is monotonic: bumping reasoning complexity never lowers the tier', () => { + const tiers = ['low', 'medium', 'high'] as const; + for (const ctx of ['small', 'medium', 'large'] as const) { + let prev = 0; + for (const reasoning of ['low', 'medium', 'high'] as const) { + const tier = deriveDifficultyTier( + classification({ contextComplexity: ctx, reasoningComplexity: reasoning }) + ); + const idx = 
tiers.indexOf(tier); + expect(idx).toBeGreaterThanOrEqual(prev); + prev = idx; + } + } + }); +}); diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts new file mode 100644 index 0000000000..d0f4cb4c7e --- /dev/null +++ b/packages/auto-routing-contracts/src/tiers.ts @@ -0,0 +1,32 @@ +import * as z from 'zod'; +import type { ClassifierOutput } from './index'; + +export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']); +export type DifficultyTier = z.infer; + +export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high']; + +const REASONING_POINTS = { low: 0, medium: 2, high: 4 } as const; +const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const; +const EXECUTION_POINTS = { + answer_only: 0, + code_change: 1, + command_execution: 1, + multi_step_project: 2, +} as const; +const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const; + +// Deterministic mapping from the classifier taxonomy to a difficulty tier. +// Reasoning complexity dominates (weight 2x) because it is the strongest +// signal for whether a cheap model can complete the task; context size, +// execution mode and blast radius nudge borderline cases up. 
+export function deriveDifficultyTier(classification: ClassifierOutput): DifficultyTier { + const score = + REASONING_POINTS[classification.reasoningComplexity] + + CONTEXT_POINTS[classification.contextComplexity] + + EXECUTION_POINTS[classification.executionMode] + + RISK_POINTS[classification.riskLevel]; + if (score <= 2) return 'low'; + if (score <= 5) return 'medium'; + return 'high'; +} From 39acfdb2ccaf1196fe8fd50606ddc38ff0e36d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:08:38 +0200 Subject: [PATCH 03/73] feat(auto-routing): add benchmark-driven decision engine and KV routing table --- .../auto-routing/src/decision-engine.test.ts | 47 +++++++++++++ services/auto-routing/src/decision-engine.ts | 18 +++++ .../auto-routing/src/routing-table.test.ts | 34 ++++++++++ services/auto-routing/src/routing-table.ts | 66 +++++++++++++++++++ 4 files changed, 165 insertions(+) create mode 100644 services/auto-routing/src/decision-engine.test.ts create mode 100644 services/auto-routing/src/decision-engine.ts create mode 100644 services/auto-routing/src/routing-table.test.ts create mode 100644 services/auto-routing/src/routing-table.ts diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts new file mode 100644 index 0000000000..1dc79c4572 --- /dev/null +++ b/services/auto-routing/src/decision-engine.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from 'vitest'; +import type { ClassifierOutput, RoutingTable } from '@kilocode/auto-routing-contracts'; +import { computeDecision } from './decision-engine'; + +const classification: ClassifierOutput = { + taskType: 'implementation', + subtaskType: 'code_generation', + contextComplexity: 'small', + reasoningComplexity: 'low', + riskLevel: 'low', + executionMode: 'answer_only', + requiresTools: false, + confidence: 0.9, +}; + +const table: RoutingTable = { + version: 'run-1', + generatedAt: 
'2026-06-11T00:00:00.000Z', + minAccuracy: 0.7, + source: 'benchmark', + tiers: { + low: [ + { model: 'cheap/messages-only', accuracy: 0.9, avgCostUsd: 0.001, meetsThreshold: true, supportedApiKinds: ['messages'] }, + { model: 'cheap/chat', accuracy: 0.85, avgCostUsd: 0.002, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + ], + medium: [ + { model: 'mid/chat', accuracy: 0.8, avgCostUsd: 0.01, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages'] }, + ], + high: [ + { model: 'big/chat', accuracy: 0.9, avgCostUsd: 0.1, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + ], + }, +}; + +describe('computeDecision', () => { + it('picks the first candidate supporting the request api kind', () => { + const decision = computeDecision(classification, 'chat_completions', table); + expect(decision).toEqual({ model: 'cheap/chat', tier: 'low', source: 'benchmark', tableVersion: 'run-1' }); + }); + it('uses the tier derived from the classification', () => { + const hard: ClassifierOutput = { ...classification, reasoningComplexity: 'high', contextComplexity: 'large', executionMode: 'multi_step_project' }; + expect(computeDecision(hard, 'chat_completions', table)?.model).toBe('big/chat'); + }); + it('returns null when no candidate supports the api kind', () => { + expect(computeDecision(classification, 'responses', table)).toBeNull(); + }); +}); diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts new file mode 100644 index 0000000000..26645ead7f --- /dev/null +++ b/services/auto-routing/src/decision-engine.ts @@ -0,0 +1,18 @@ +import { + deriveDifficultyTier, + type AutoRoutingDecision, + type ClassifierOutput, + type NormalizedClassifierInput, + type RoutingTable, +} from '@kilocode/auto-routing-contracts'; + +export function computeDecision( + classification: ClassifierOutput, + apiKind: NormalizedClassifierInput['apiKind'], + table: RoutingTable +): AutoRoutingDecision | 
null { + const tier = deriveDifficultyTier(classification); + const candidate = table.tiers[tier].find(c => c.supportedApiKinds.includes(apiKind)); + if (!candidate) return null; + return { model: candidate.model, tier, source: table.source, tableVersion: table.version }; +} diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts new file mode 100644 index 0000000000..788d866945 --- /dev/null +++ b/services/auto-routing/src/routing-table.test.ts @@ -0,0 +1,34 @@ +import { afterEach, describe, expect, it } from 'vitest'; +import { clearRoutingTableCache, DEFAULT_ROUTING_TABLE, getRoutingTable } from './routing-table'; + +type KvStub = Pick; +const kvEnv = (value: string | null, onGet?: () => void): KvStub => + ({ + AUTO_ROUTING_CONFIG: { + get: async () => { + onGet?.(); + return value; + }, + }, + }) as unknown as KvStub; + +afterEach(() => clearRoutingTableCache()); + +describe('getRoutingTable', () => { + it('returns the default table when the key is missing', async () => { + expect(await getRoutingTable(kvEnv(null))).toEqual(DEFAULT_ROUTING_TABLE); + }); + it('returns the default table when the stored JSON is invalid', async () => { + expect(await getRoutingTable(kvEnv('{"nope":true}'))).toEqual(DEFAULT_ROUTING_TABLE); + clearRoutingTableCache(); + expect(await getRoutingTable(kvEnv('not json at all'))).toEqual(DEFAULT_ROUTING_TABLE); + }); + it('parses and caches a valid stored table', async () => { + let reads = 0; + const env = kvEnv(JSON.stringify(DEFAULT_ROUTING_TABLE), () => reads++); + const first = await getRoutingTable(env); + await getRoutingTable(env); + expect(first.version).toBe(DEFAULT_ROUTING_TABLE.version); + expect(reads).toBe(1); + }); +}); diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts new file mode 100644 index 0000000000..7293cebebe --- /dev/null +++ b/services/auto-routing/src/routing-table.ts @@ -0,0 +1,66 @@ +import { formatError } 
from '@kilocode/worker-utils'; +import { + ROUTING_TABLE_KV_KEY, + RoutingTableSchema, + type RoutingTable, +} from '@kilocode/auto-routing-contracts'; +import { ttlCached } from './ttl-cache'; + +// Safety net used until the first decider benchmark publishes a table (and +// whenever the stored table is missing or unparseable). Mirrors the static +// defaults the gateway uses for kilo-auto/balanced today. +export const DEFAULT_ROUTING_TABLE: RoutingTable = { + version: 'default', + generatedAt: '2026-06-11T00:00:00.000Z', + minAccuracy: 0.7, + source: 'default', + tiers: { + low: [ + { model: 'google/gemini-2.5-flash', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + ], + medium: [ + { model: 'qwen/qwen3.7-plus', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] }, + ], + high: [ + { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] }, + ], + }, +}; + +const ROUTING_TABLE_CACHE_TTL_MS = 60_000; + +type RoutingTableEnv = Pick; + +const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: RoutingTableEnv) => { + const raw = await env.AUTO_ROUTING_CONFIG.get(ROUTING_TABLE_KV_KEY); + if (raw === null) return DEFAULT_ROUTING_TABLE; + try { + const parsed = RoutingTableSchema.safeParse(JSON.parse(raw)); + if (!parsed.success) { + console.warn( + JSON.stringify({ + event: 'auto_routing_table_invalid', + issues: parsed.error.issues.slice(0, 5).map(i => `${i.path.join('.')}: ${i.code}`), + }) + ); + return DEFAULT_ROUTING_TABLE; + } + return parsed.data; + } catch { + return DEFAULT_ROUTING_TABLE; + } +}); + +export function clearRoutingTableCache(): void { + routingTableCache.clear(); +} + +export function 
getRoutingTable(env: RoutingTableEnv): Promise { + return routingTableCache.get(env).catch((error: unknown) => { + console.warn( + JSON.stringify({ event: 'auto_routing_table_read_failed', ...formatError(error) }) + ); + return DEFAULT_ROUTING_TABLE; + }); +} From bd83fdc65ee6a797cef41b40120647193733a861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:10:30 +0200 Subject: [PATCH 04/73] feat(auto-routing): return routing decisions from /decide --- .../src/routing-table.ts | 1 - services/auto-routing/src/decide.ts | 51 ++++++++++++++----- .../auto-routing/src/decision-engine.test.ts | 46 ++++++++++++++--- services/auto-routing/src/index.test.ts | 16 +++++- services/auto-routing/src/routing-table.ts | 32 ++++++++++-- 5 files changed, 119 insertions(+), 27 deletions(-) diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts index acb892cbd8..82ca7e7dfc 100644 --- a/packages/auto-routing-contracts/src/routing-table.ts +++ b/packages/auto-routing-contracts/src/routing-table.ts @@ -46,4 +46,3 @@ export function rankCandidates( return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd; }); } - diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts index 4303192f60..c976d9d6e9 100644 --- a/services/auto-routing/src/decide.ts +++ b/services/auto-routing/src/decide.ts @@ -1,5 +1,6 @@ import { MirrorPayloadSchema } from '@kilocode/auto-routing-contracts'; import type { + AutoRoutingDecision, AutoRoutingDecisionResponse, MirrorPayload, NormalizedClassifierInput, @@ -18,8 +19,10 @@ import { } from './conversation-identity'; import type { ContentHashes } from './conversation-identity'; import { getCachedClassification, putCachedClassification } from './decision-cache'; +import { computeDecision } from './decision-engine'; import { ClassifierRunError, classifyNormalizedInput } from './model-classifier'; import type { 
ClassifierRunResult } from './model-classifier'; +import { getRoutingTable } from './routing-table'; import type { HonoEnv } from './hono-env'; // Isolate-scoped request counter, used to correlate latency with isolate @@ -29,11 +32,12 @@ let isolateRequestSeq = 0; function decisionResponse( cost: number, classification: ClassifierOutput, - normalized: NormalizedClassifierInput + normalized: NormalizedClassifierInput, + decision: AutoRoutingDecision | null ): AutoRoutingDecisionResponse { return { cost, - decision: null, + decision, classifierResult: { classification, normalized }, }; } @@ -194,7 +198,8 @@ function recordDecision( env: Env, ctx: DecisionContext, durationMs: number, - outcome: DecisionOutcome + outcome: DecisionOutcome, + decision: AutoRoutingDecision | null = null ): void { const summary = summarizeOutcome(outcome); @@ -243,6 +248,9 @@ function recordDecision( hasMachineId: ctx.payload.machineId !== null, mode: ctx.payload.mode, uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null, + decidedModel: decision?.model ?? null, + decidedTier: decision?.tier ?? null, + decisionSource: decision?.source ?? 
null, ...summary.details, }) ); @@ -265,11 +273,12 @@ export const decideHandler: Handler = async c => { const payload = parsed.data; const startedAt = performance.now(); - const [hashes, userIdHash, classifierModel, successSampleRate] = await Promise.all([ + const [hashes, userIdHash, classifierModel, successSampleRate, routingTable] = await Promise.all([ computeContentHashes(payload.input), hashIdentifierForTelemetry(payload.userId), getClassifierModel(c.env), getDecisionLogSampleRate(c.env), + getRoutingTable(c.env), ]); const ctx: DecisionContext = { payload, @@ -288,12 +297,15 @@ export const decideHandler: Handler = async c => { classifierModel ); if (cached) { - recordDecision(c.env, ctx, performance.now() - startedAt, { - kind: 'cache_hit', - classifierModel, - classification: cached, - }); - return c.json(decisionResponse(0, cached, payload.input)); + const decision = computeDecision(cached, payload.input.apiKind, routingTable); + recordDecision( + c.env, + ctx, + performance.now() - startedAt, + { kind: 'cache_hit', classifierModel, classification: cached }, + decision + ); + return c.json(decisionResponse(0, cached, payload.input, decision)); } try { @@ -311,10 +323,21 @@ export const decideHandler: Handler = async c => { ) ); } - recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'model', classifier }); - // When routing decisions are implemented, include the prior decision for - // this session as an input alongside classifier output. - return c.json(decisionResponse(classifier.cost ?? 0, classifier.classification, payload.input)); + const decision = computeDecision( + classifier.classification, + payload.input.apiKind, + routingTable + ); + recordDecision( + c.env, + ctx, + performance.now() - startedAt, + { kind: 'model', classifier }, + decision + ); + return c.json( + decisionResponse(classifier.cost ?? 
0, classifier.classification, payload.input, decision) + ); } catch (error) { recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'error', error }); // A failed run can still have billed the first attempt (e.g. a valid-but- diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts index 1dc79c4572..16c36ed4f7 100644 --- a/services/auto-routing/src/decision-engine.test.ts +++ b/services/auto-routing/src/decision-engine.test.ts @@ -20,14 +20,38 @@ const table: RoutingTable = { source: 'benchmark', tiers: { low: [ - { model: 'cheap/messages-only', accuracy: 0.9, avgCostUsd: 0.001, meetsThreshold: true, supportedApiKinds: ['messages'] }, - { model: 'cheap/chat', accuracy: 0.85, avgCostUsd: 0.002, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + { + model: 'cheap/messages-only', + accuracy: 0.9, + avgCostUsd: 0.001, + meetsThreshold: true, + supportedApiKinds: ['messages'], + }, + { + model: 'cheap/chat', + accuracy: 0.85, + avgCostUsd: 0.002, + meetsThreshold: true, + supportedApiKinds: ['chat_completions'], + }, ], medium: [ - { model: 'mid/chat', accuracy: 0.8, avgCostUsd: 0.01, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages'] }, + { + model: 'mid/chat', + accuracy: 0.8, + avgCostUsd: 0.01, + meetsThreshold: true, + supportedApiKinds: ['chat_completions', 'messages'], + }, ], high: [ - { model: 'big/chat', accuracy: 0.9, avgCostUsd: 0.1, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + { + model: 'big/chat', + accuracy: 0.9, + avgCostUsd: 0.1, + meetsThreshold: true, + supportedApiKinds: ['chat_completions'], + }, ], }, }; @@ -35,10 +59,20 @@ const table: RoutingTable = { describe('computeDecision', () => { it('picks the first candidate supporting the request api kind', () => { const decision = computeDecision(classification, 'chat_completions', table); - expect(decision).toEqual({ model: 'cheap/chat', tier: 'low', source: 'benchmark', 
tableVersion: 'run-1' }); + expect(decision).toEqual({ + model: 'cheap/chat', + tier: 'low', + source: 'benchmark', + tableVersion: 'run-1', + }); }); it('uses the tier derived from the classification', () => { - const hard: ClassifierOutput = { ...classification, reasoningComplexity: 'high', contextComplexity: 'large', executionMode: 'multi_step_project' }; + const hard: ClassifierOutput = { + ...classification, + reasoningComplexity: 'high', + contextComplexity: 'large', + executionMode: 'multi_step_project', + }; expect(computeDecision(hard, 'chat_completions', table)?.model).toBe('big/chat'); }); it('returns null when no candidate supports the api kind', () => { diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts index 89b9ba675c..d8a3991117 100644 --- a/services/auto-routing/src/index.test.ts +++ b/services/auto-routing/src/index.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { clearClassifierConfigCache } from './classifier-config'; +import { clearRoutingTableCache } from './routing-table'; import { app } from './index'; import { ClassifierRunError } from './model-classifier'; import type * as ModelClassifierModule from './model-classifier'; @@ -117,6 +118,7 @@ function decideRequest(payload: unknown) { describe('auto routing worker', () => { beforeEach(() => { clearClassifierConfigCache(); + clearRoutingTableCache(); classifyNormalizedInput.mockReset(); classifyNormalizedInput.mockResolvedValue(mockClassifierResult); writeDataPoint.mockReset(); @@ -158,7 +160,12 @@ describe('auto routing worker', () => { expect(response.status).toBe(200); await expect(response.json()).resolves.toEqual({ cost: 0.00000123, - decision: null, + decision: { + model: expect.any(String), + tier: expect.stringMatching(/^(low|medium|high)$/), + source: 'default', + tableVersion: 'default', + }, classifierResult: { classification: mockClassification, normalized: normalizedInput, @@ 
-215,7 +222,12 @@ describe('auto routing worker', () => { expect(response.status).toBe(200); await expect(response.json()).resolves.toMatchObject({ cost: 0, - decision: null, + decision: { + model: expect.any(String), + tier: expect.stringMatching(/^(low|medium|high)$/), + source: 'default', + tableVersion: 'default', + }, classifierResult: { classification: mockClassification }, }); expect(cacheIdFromName).toHaveBeenCalledWith('user:user-1:task:task-123'); diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts index 7293cebebe..524f4d526f 100644 --- a/services/auto-routing/src/routing-table.ts +++ b/services/auto-routing/src/routing-table.ts @@ -16,14 +16,38 @@ export const DEFAULT_ROUTING_TABLE: RoutingTable = { source: 'default', tiers: { low: [ - { model: 'google/gemini-2.5-flash', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, + { + model: 'google/gemini-2.5-flash', + accuracy: 1, + avgCostUsd: 0, + meetsThreshold: true, + supportedApiKinds: ['chat_completions'], + }, ], medium: [ - { model: 'qwen/qwen3.7-plus', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions'] }, - { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] }, + { + model: 'qwen/qwen3.7-plus', + accuracy: 1, + avgCostUsd: 0, + meetsThreshold: true, + supportedApiKinds: ['chat_completions'], + }, + { + model: 'anthropic/claude-sonnet-4.6', + accuracy: 1, + avgCostUsd: 0, + meetsThreshold: true, + supportedApiKinds: ['chat_completions', 'messages', 'responses'], + }, ], high: [ - { model: 'anthropic/claude-sonnet-4.6', accuracy: 1, avgCostUsd: 0, meetsThreshold: true, supportedApiKinds: ['chat_completions', 'messages', 'responses'] }, + { + model: 'anthropic/claude-sonnet-4.6', + accuracy: 1, + avgCostUsd: 0, + meetsThreshold: true, + supportedApiKinds: 
['chat_completions', 'messages', 'responses'], + }, ], }, }; From 9621d62036c8121803271dc83e185d8d4f6ce548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:13:06 +0200 Subject: [PATCH 05/73] fix(auto-routing): log unparseable routing table JSON before falling back --- services/auto-routing/src/index.test.ts | 3 +++ services/auto-routing/src/routing-table.ts | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts index d8a3991117..5bc7a08146 100644 --- a/services/auto-routing/src/index.test.ts +++ b/services/auto-routing/src/index.test.ts @@ -123,6 +123,9 @@ describe('auto routing worker', () => { classifyNormalizedInput.mockResolvedValue(mockClassifierResult); writeDataPoint.mockReset(); configGet.mockReset(); + // Real KV returns null for missing keys; an undefined here would send the + // routing-table loader down the JSON.parse-throw path instead. 
+ configGet.mockResolvedValue(null); configPut.mockReset(); analyticsTokenGet.mockReset(); analyticsTokenGet.mockResolvedValue('analytics-token'); diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts index 524f4d526f..aa2baccce4 100644 --- a/services/auto-routing/src/routing-table.ts +++ b/services/auto-routing/src/routing-table.ts @@ -71,7 +71,8 @@ const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: Rout return DEFAULT_ROUTING_TABLE; } return parsed.data; - } catch { + } catch (error) { + console.warn(JSON.stringify({ event: 'auto_routing_table_invalid', ...formatError(error) })); return DEFAULT_ROUTING_TABLE; } }); From 7af1b6dd37af81695c3a55b3a16af0d8e3756c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:18:32 +0200 Subject: [PATCH 06/73] feat(auto-routing-benchmark): scaffold benchmark worker with D1 schema --- pnpm-lock.yaml | 37 +++ .../migrations/0001_init.sql | 49 ++++ services/auto-routing-benchmark/package.json | 29 ++ services/auto-routing-benchmark/src/auth.ts | 6 + .../auto-routing-benchmark/src/db.test.ts | 139 +++++++++ services/auto-routing-benchmark/src/db.ts | 264 ++++++++++++++++++ .../auto-routing-benchmark/src/hono-env.ts | 1 + services/auto-routing-benchmark/src/index.ts | 25 ++ .../auto-routing-benchmark/src/openrouter.ts | 26 ++ .../auto-routing-benchmark/src/ttl-cache.ts | 35 +++ services/auto-routing-benchmark/tsconfig.json | 16 ++ .../auto-routing-benchmark/vitest.config.ts | 9 + .../worker-configuration.d.ts | 16 ++ .../auto-routing-benchmark/wrangler.jsonc | 54 ++++ 14 files changed, 706 insertions(+) create mode 100644 services/auto-routing-benchmark/migrations/0001_init.sql create mode 100644 services/auto-routing-benchmark/package.json create mode 100644 services/auto-routing-benchmark/src/auth.ts create mode 100644 services/auto-routing-benchmark/src/db.test.ts create mode 100644 
services/auto-routing-benchmark/src/db.ts create mode 100644 services/auto-routing-benchmark/src/hono-env.ts create mode 100644 services/auto-routing-benchmark/src/index.ts create mode 100644 services/auto-routing-benchmark/src/openrouter.ts create mode 100644 services/auto-routing-benchmark/src/ttl-cache.ts create mode 100644 services/auto-routing-benchmark/tsconfig.json create mode 100644 services/auto-routing-benchmark/vitest.config.ts create mode 100644 services/auto-routing-benchmark/worker-configuration.d.ts create mode 100644 services/auto-routing-benchmark/wrangler.jsonc diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7677b03452..0c48fc8fe1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1510,6 +1510,43 @@ importers: specifier: 'catalog:' version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6) + services/auto-routing-benchmark: + dependencies: + '@kilocode/auto-routing-contracts': + specifier: workspace:* + version: link:../../packages/auto-routing-contracts + '@kilocode/worker-utils': + specifier: workspace:* + version: link:../../packages/worker-utils + '@openrouter/sdk': + specifier: ^0.12.79 + version: 0.12.79 + hono: + specifier: 4.12.18 + version: 4.12.18 + zod: + specifier: 'catalog:' + version: 4.4.3 + devDependencies: + '@cloudflare/workers-types': + specifier: 'catalog:' + version: 4.20260605.1 + '@types/node': + specifier: 'catalog:' + version: 24.12.4 + '@typescript/native-preview': + specifier: 'catalog:' + version: 7.0.0-dev.20260514.1 + typescript: + specifier: 'catalog:' + version: 5.9.3 + vitest: + specifier: 'catalog:' + version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4) + wrangler: + specifier: 'catalog:' + version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6) + services/auto-triage-infra: dependencies: '@kilocode/worker-utils': 
diff --git a/services/auto-routing-benchmark/migrations/0001_init.sql b/services/auto-routing-benchmark/migrations/0001_init.sql new file mode 100644 index 0000000000..6452dcfd1b --- /dev/null +++ b/services/auto-routing-benchmark/migrations/0001_init.sql @@ -0,0 +1,49 @@ +CREATE TABLE benchmark_runs ( + id TEXT PRIMARY KEY, + kind TEXT NOT NULL CHECK (kind IN ('classifier', 'decider')), + status TEXT NOT NULL CHECK (status IN ('running', 'completed', 'failed')), + started_at TEXT NOT NULL, + completed_at TEXT, + config_json TEXT NOT NULL, + error TEXT +); + +CREATE TABLE case_results ( + run_id TEXT NOT NULL REFERENCES benchmark_runs(id), + model TEXT NOT NULL, + case_id TEXT NOT NULL, + tier TEXT, + score REAL NOT NULL, + latency_ms INTEGER NOT NULL, + cost_usd REAL, + detail_json TEXT, + error TEXT, + PRIMARY KEY (run_id, model, case_id) +); +CREATE INDEX idx_case_results_run ON case_results (run_id); + +CREATE TABLE model_summaries ( + run_id TEXT NOT NULL REFERENCES benchmark_runs(id), + model TEXT NOT NULL, + tier TEXT NOT NULL, + accuracy REAL NOT NULL, + avg_cost_usd REAL, + avg_latency_ms REAL NOT NULL, + p50_latency_ms REAL, + cases INTEGER NOT NULL, + errors INTEGER NOT NULL, + PRIMARY KEY (run_id, model, tier) +); + +CREATE TABLE routing_tables ( + run_id TEXT PRIMARY KEY REFERENCES benchmark_runs(id), + published_at TEXT NOT NULL, + table_json TEXT NOT NULL +); + +CREATE TABLE benchmark_config ( + id INTEGER PRIMARY KEY CHECK (id = 1), + config_json TEXT NOT NULL, + updated_at TEXT NOT NULL, + updated_by TEXT +); diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json new file mode 100644 index 0000000000..ba51b15107 --- /dev/null +++ b/services/auto-routing-benchmark/package.json @@ -0,0 +1,29 @@ +{ + "name": "auto-routing-benchmark", + "version": "1.0.0", + "private": true, + "type": "module", + "scripts": { + "deploy": "wrangler deploy", + "dev": "wrangler dev", + "types": "wrangler types 
--include-runtime=false", + "typecheck": "tsgo --noEmit", + "lint": "pnpm -w exec oxlint --config .oxlintrc.json services/auto-routing-benchmark/src", + "test": "vitest run" + }, + "dependencies": { + "@kilocode/auto-routing-contracts": "workspace:*", + "@kilocode/worker-utils": "workspace:*", + "@openrouter/sdk": "^0.12.79", + "hono": "catalog:", + "zod": "catalog:" + }, + "devDependencies": { + "@cloudflare/workers-types": "catalog:", + "@types/node": "catalog:", + "@typescript/native-preview": "catalog:", + "typescript": "catalog:", + "vitest": "catalog:", + "wrangler": "catalog:" + } +} diff --git a/services/auto-routing-benchmark/src/auth.ts b/services/auto-routing-benchmark/src/auth.ts new file mode 100644 index 0000000000..62d86cfe71 --- /dev/null +++ b/services/auto-routing-benchmark/src/auth.ts @@ -0,0 +1,6 @@ +import { backendAuthMiddleware } from '@kilocode/worker-utils'; +import type { HonoEnv } from './hono-env'; + +export const authMiddleware = backendAuthMiddleware(c => + c.env.INTERNAL_API_SECRET_PROD.get() +); diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts new file mode 100644 index 0000000000..163786350b --- /dev/null +++ b/services/auto-routing-benchmark/src/db.test.ts @@ -0,0 +1,139 @@ +import { describe, it, expect } from 'vitest'; +import { mapSummaryRow, mapRunRow } from './db'; +import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts'; + +describe('mapSummaryRow', () => { + it('maps snake_case columns to camelCase BenchmarkModelSummary', () => { + const row = { + run_id: 'run-1', + model: 'openai/gpt-4o', + tier: 'high', + accuracy: 0.92, + avg_cost_usd: 0.0015, + avg_latency_ms: 320.5, + p50_latency_ms: 300.0, + cases: 50, + errors: 2, + }; + const result = mapSummaryRow(row); + expect(result).toEqual({ + model: 'openai/gpt-4o', + tier: 'high', + accuracy: 0.92, + avgCostUsd: 0.0015, + avgLatencyMs: 320.5, + p50LatencyMs: 300.0, + cases: 50, + errors: 2, + 
}); + }); + + it('handles null avg_cost_usd and p50_latency_ms', () => { + const row = { + run_id: 'run-2', + model: 'anthropic/claude-3-haiku', + tier: '*', + accuracy: 0.85, + avg_cost_usd: null, + avg_latency_ms: 150.0, + p50_latency_ms: null, + cases: 30, + errors: 0, + }; + const result = mapSummaryRow(row); + expect(result.avgCostUsd).toBeNull(); + expect(result.p50LatencyMs).toBeNull(); + expect(result.tier).toBe('*'); + expect(result.errors).toBe(0); + }); +}); + +describe('mapRunRow', () => { + it('maps a RunRow and attaches its summaries', () => { + const runRow = { + id: 'run-abc', + kind: 'classifier' as const, + status: 'completed' as const, + started_at: '2026-06-10T04:10:00.000Z', + completed_at: '2026-06-10T04:25:00.000Z', + config_json: '{}', + error: null, + }; + const summaries: BenchmarkModelSummary[] = [ + { + model: 'openai/gpt-4o-mini', + tier: '*', + accuracy: 0.78, + avgCostUsd: 0.0002, + avgLatencyMs: 120, + p50LatencyMs: 110, + cases: 100, + errors: 5, + }, + ]; + const result = mapRunRow(runRow, summaries); + expect(result.id).toBe('run-abc'); + expect(result.kind).toBe('classifier'); + expect(result.status).toBe('completed'); + expect(result.startedAt).toBe('2026-06-10T04:10:00.000Z'); + expect(result.completedAt).toBe('2026-06-10T04:25:00.000Z'); + expect(result.error).toBeNull(); + expect(result.summaries).toHaveLength(1); + expect(result.summaries[0].model).toBe('openai/gpt-4o-mini'); + }); + + it('attaches an empty summaries array when none are provided', () => { + const runRow = { + id: 'run-xyz', + kind: 'decider' as const, + status: 'running' as const, + started_at: '2026-06-11T05:10:00.000Z', + completed_at: null, + config_json: '{}', + error: null, + }; + const result = mapRunRow(runRow, []); + expect(result.summaries).toEqual([]); + expect(result.completedAt).toBeNull(); + }); + + it('summaries are attached to the correct run (not mixed up)', () => { + const runRow1 = { + id: 'run-1', + kind: 'classifier' as const, + status: 
'completed' as const, + started_at: '2026-06-01T04:10:00.000Z', + completed_at: '2026-06-01T04:20:00.000Z', + config_json: '{}', + error: null, + }; + const runRow2 = { + id: 'run-2', + kind: 'decider' as const, + status: 'failed' as const, + started_at: '2026-06-02T05:10:00.000Z', + completed_at: null, + config_json: '{}', + error: 'timed out', + }; + const summariesForRun1: BenchmarkModelSummary[] = [ + { + model: 'model-a', + tier: '*', + accuracy: 0.9, + avgCostUsd: null, + avgLatencyMs: 200, + p50LatencyMs: null, + cases: 10, + errors: 1, + }, + ]; + const result1 = mapRunRow(runRow1, summariesForRun1); + const result2 = mapRunRow(runRow2, []); + + expect(result1.summaries).toHaveLength(1); + expect(result1.summaries[0].model).toBe('model-a'); + expect(result2.summaries).toHaveLength(0); + expect(result2.error).toBe('timed out'); + }); +}); diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts new file mode 100644 index 0000000000..27a817006f --- /dev/null +++ b/services/auto-routing-benchmark/src/db.ts @@ -0,0 +1,264 @@ +import type { + BenchmarkKind, + BenchmarkModelSummary, + BenchmarkRun, +} from '@kilocode/auto-routing-contracts'; + +export type CaseResultRow = { + run_id: string; + model: string; + case_id: string; + tier: string | null; + score: number; + latency_ms: number; + cost_usd: number | null; + detail_json: string | null; + error: string | null; +}; + +export type RunRow = { + id: string; + kind: BenchmarkKind; + status: 'running' | 'completed' | 'failed'; + started_at: string; + completed_at: string | null; + config_json: string; + error: string | null; +}; + +type ModelSummaryRow = { + run_id: string; + model: string; + tier: string; + accuracy: number; + avg_cost_usd: number | null; + avg_latency_ms: number; + p50_latency_ms: number | null; + cases: number; + errors: number; +}; + +export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary { + return { + model: row.model, + tier: 
row.tier as BenchmarkModelSummary['tier'], + accuracy: row.accuracy, + avgCostUsd: row.avg_cost_usd, + avgLatencyMs: row.avg_latency_ms, + p50LatencyMs: row.p50_latency_ms, + cases: row.cases, + errors: row.errors, + }; +} + +export function mapRunRow(row: RunRow, summaries: BenchmarkModelSummary[]): BenchmarkRun { + return { + id: row.id, + kind: row.kind, + status: row.status, + startedAt: row.started_at, + completedAt: row.completed_at, + error: row.error, + summaries, + }; +} + +export async function insertRun( + db: D1Database, + run: { id: string; kind: BenchmarkKind; startedAt: string; configJson: string } +): Promise { + await db + .prepare( + `INSERT INTO benchmark_runs (id, kind, status, started_at, config_json) + VALUES (?1, ?2, 'running', ?3, ?4)` + ) + .bind(run.id, run.kind, run.startedAt, run.configJson) + .run(); +} + +export async function getRun(db: D1Database, runId: string): Promise { + const row = await db + .prepare('SELECT * FROM benchmark_runs WHERE id = ?1') + .bind(runId) + .first(); + return row ?? null; +} + +export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Promise { + await db + .prepare( + `INSERT OR REPLACE INTO case_results + (run_id, model, case_id, tier, score, latency_ms, cost_usd, detail_json, error) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)` + ) + .bind( + row.run_id, + row.model, + row.case_id, + row.tier, + row.score, + row.latency_ms, + row.cost_usd, + row.detail_json, + row.error + ) + .run(); +} + +export async function countCaseResults(db: D1Database, runId: string): Promise { + const row = await db + .prepare('SELECT COUNT(*) AS n FROM case_results WHERE run_id = ?1') + .bind(runId) + .first<{ n: number }>(); + return row?.n ?? 
0; +} + +export async function getCaseResults(db: D1Database, runId: string): Promise { + const { results } = await db + .prepare('SELECT * FROM case_results WHERE run_id = ?1') + .bind(runId) + .all(); + return results; +} + +export async function replaceModelSummaries( + db: D1Database, + runId: string, + summaries: BenchmarkModelSummary[] +): Promise { + const statements = [ + db.prepare('DELETE FROM model_summaries WHERE run_id = ?1').bind(runId), + ...summaries.map(s => + db + .prepare( + `INSERT INTO model_summaries + (run_id, model, tier, accuracy, avg_cost_usd, avg_latency_ms, p50_latency_ms, cases, errors) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)` + ) + .bind( + runId, + s.model, + s.tier, + s.accuracy, + s.avgCostUsd, + s.avgLatencyMs, + s.p50LatencyMs, + s.cases, + s.errors + ) + ), + ]; + await db.batch(statements); +} + +export async function getSummaries( + db: D1Database, + runId: string +): Promise { + const { results } = await db + .prepare('SELECT * FROM model_summaries WHERE run_id = ?1') + .bind(runId) + .all(); + return results.map(mapSummaryRow); +} + +export async function listRuns(db: D1Database, limit: number): Promise { + const { results: runRows } = await db + .prepare('SELECT * FROM benchmark_runs ORDER BY started_at DESC LIMIT ?1') + .bind(limit) + .all(); + + if (runRows.length === 0) { + return []; + } + + const placeholders = runRows.map((_, i) => `?${i + 1}`).join(', '); + const { results: summaryRows } = await db + .prepare(`SELECT * FROM model_summaries WHERE run_id IN (${placeholders})`) + .bind(...runRows.map(r => r.id)) + .all(); + + const summariesByRunId = new Map(); + for (const row of summaryRows) { + const existing = summariesByRunId.get(row.run_id); + if (existing) { + existing.push(mapSummaryRow(row)); + } else { + summariesByRunId.set(row.run_id, [mapSummaryRow(row)]); + } + } + + return runRows.map(row => mapRunRow(row, summariesByRunId.get(row.id) ?? 
[])); +} + +export async function markRunCompleted(db: D1Database, runId: string): Promise { + await db + .prepare( + `UPDATE benchmark_runs SET status = 'completed', completed_at = ?2 + WHERE id = ?1 AND status = 'running'` + ) + .bind(runId, new Date().toISOString()) + .run(); +} + +export async function markStaleRunsFailed( + db: D1Database, + olderThanIso: string +): Promise { + const result = await db + .prepare( + `UPDATE benchmark_runs SET status = 'failed', error = 'timed out' + WHERE status = 'running' AND started_at < ?1` + ) + .bind(olderThanIso) + .run(); + return result.meta.changes; +} + +export async function saveRoutingTable( + db: D1Database, + runId: string, + publishedAt: string, + tableJson: string +): Promise { + await db + .prepare( + `INSERT OR REPLACE INTO routing_tables (run_id, published_at, table_json) + VALUES (?1, ?2, ?3)` + ) + .bind(runId, publishedAt, tableJson) + .run(); +} + +export async function getLatestRoutingTable( + db: D1Database +): Promise<{ run_id: string; published_at: string; table_json: string } | null> { + const row = await db + .prepare('SELECT * FROM routing_tables ORDER BY published_at DESC LIMIT 1') + .first<{ run_id: string; published_at: string; table_json: string }>(); + return row ?? null; +} + +export async function getConfigRow( + db: D1Database +): Promise<{ config_json: string; updated_at: string; updated_by: string | null } | null> { + const row = await db + .prepare('SELECT config_json, updated_at, updated_by FROM benchmark_config WHERE id = 1') + .first<{ config_json: string; updated_at: string; updated_by: string | null }>(); + return row ?? 
null; +} + +export async function saveConfigRow( + db: D1Database, + configJson: string, + updatedAt: string, + updatedBy: string | null +): Promise { + await db + .prepare( + `INSERT OR REPLACE INTO benchmark_config (id, config_json, updated_at, updated_by) + VALUES (1, ?1, ?2, ?3)` + ) + .bind(configJson, updatedAt, updatedBy) + .run(); +} diff --git a/services/auto-routing-benchmark/src/hono-env.ts b/services/auto-routing-benchmark/src/hono-env.ts new file mode 100644 index 0000000000..deb5b5bea3 --- /dev/null +++ b/services/auto-routing-benchmark/src/hono-env.ts @@ -0,0 +1 @@ +export type HonoEnv = { Bindings: Env }; diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts new file mode 100644 index 0000000000..feb6d991e2 --- /dev/null +++ b/services/auto-routing-benchmark/src/index.ts @@ -0,0 +1,25 @@ +import { Hono } from 'hono'; +import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils'; +import { authMiddleware } from './auth'; +import type { HonoEnv } from './hono-env'; + +export const app = new Hono(); +app.use('*', authMiddleware); +app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' })); +app.notFound(createNotFoundHandler()); +app.onError(createErrorHandler()); + +export default { + fetch: app.fetch, + // Wired up in later tasks (run orchestration + admin endpoints). 
+ async scheduled( + _controller: ScheduledController, + _env: Env, + _ctx: ExecutionContext + ): Promise {}, + async queue( + _batch: MessageBatch, + _env: Env, + _ctx: ExecutionContext + ): Promise {}, +}; diff --git a/services/auto-routing-benchmark/src/openrouter.ts b/services/auto-routing-benchmark/src/openrouter.ts new file mode 100644 index 0000000000..4d8608d6f5 --- /dev/null +++ b/services/auto-routing-benchmark/src/openrouter.ts @@ -0,0 +1,26 @@ +import { OpenRouter } from '@openrouter/sdk'; +import { ttlCached } from './ttl-cache'; + +type OpenRouterEnv = Pick; + +export const OPENROUTER_HTTP_REFERER = 'https://kilocode.ai'; +export const OPENROUTER_APP_TITLE = 'Kilo Code'; + +// Only the API key string is cached at module scope (plain value, not a +// transport-owning SDK object), so each classification skips the +// secrets-store read. The client itself is constructed per request; that is +// just object setup around global fetch. The TTL keeps key rotations +// effective within five minutes. +const API_KEY_CACHE_TTL_MS = 300_000; + +const apiKeyCache = ttlCached(API_KEY_CACHE_TTL_MS, (env: OpenRouterEnv) => + env.OPENROUTER_API_KEY.get() +); + +export async function createOpenRouterClient(env: OpenRouterEnv): Promise { + return new OpenRouter({ + apiKey: await apiKeyCache.get(env), + httpReferer: OPENROUTER_HTTP_REFERER, + appTitle: OPENROUTER_APP_TITLE, + }); +} diff --git a/services/auto-routing-benchmark/src/ttl-cache.ts b/services/auto-routing-benchmark/src/ttl-cache.ts new file mode 100644 index 0000000000..f773b9c4fc --- /dev/null +++ b/services/auto-routing-benchmark/src/ttl-cache.ts @@ -0,0 +1,35 @@ +// Isolate-local TTL memoization for per-request lookups that change rarely +// (KV config, secrets-backed clients). Values are cached as promises so +// concurrent callers share one load; rejected loads are evicted immediately +// so a transient failure is not pinned for the TTL. 
+export type TtlCache = { + get(env: TEnv): Promise; + clear(): void; +}; + +export function ttlCached( + ttlMs: number, + load: (env: TEnv) => Promise +): TtlCache { + let cached: { promise: Promise; expiresAt: number } | null = null; + + return { + get(env: TEnv): Promise { + if (cached && cached.expiresAt > Date.now()) { + return cached.promise; + } + const promise = load(env); + const entry = { promise, expiresAt: Date.now() + ttlMs }; + cached = entry; + promise.catch(() => { + if (cached === entry) { + cached = null; + } + }); + return promise; + }, + clear(): void { + cached = null; + }, + }; +} diff --git a/services/auto-routing-benchmark/tsconfig.json b/services/auto-routing-benchmark/tsconfig.json new file mode 100644 index 0000000000..4f765c05f6 --- /dev/null +++ b/services/auto-routing-benchmark/tsconfig.json @@ -0,0 +1,16 @@ +{ + "compilerOptions": { + "target": "esnext", + "lib": ["esnext"], + "module": "esnext", + "moduleResolution": "bundler", + "types": ["@types/node", "@cloudflare/workers-types", "./worker-configuration.d.ts"], + "esModuleInterop": true, + "resolveJsonModule": true, + "forceConsistentCasingInFileNames": true, + "strict": true, + "skipLibCheck": true, + "noEmit": true + }, + "include": ["worker-configuration.d.ts", "src/**/*.ts", "src/**/*.d.ts", "vitest.config.ts"] +} diff --git a/services/auto-routing-benchmark/vitest.config.ts b/services/auto-routing-benchmark/vitest.config.ts new file mode 100644 index 0000000000..7dd13254e7 --- /dev/null +++ b/services/auto-routing-benchmark/vitest.config.ts @@ -0,0 +1,9 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + globals: true, + environment: 'node', + include: ['src/**/*.test.ts'], + }, +}); diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts new file mode 100644 index 0000000000..5952f82e1b --- /dev/null +++ 
b/services/auto-routing-benchmark/worker-configuration.d.ts @@ -0,0 +1,16 @@ +/* eslint-disable */ +// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 8d542fe6f931aa8df862b4b96f2474be) +interface __BaseEnv_Env { + AUTO_ROUTING_CONFIG: KVNamespace; + BENCH_DB: D1Database; + BENCH_QUEUE: Queue; + INTERNAL_API_SECRET_PROD: SecretsStoreSecret; + OPENROUTER_API_KEY: SecretsStoreSecret; +} +declare namespace Cloudflare { + interface GlobalProps { + mainModule: typeof import("./src/index"); + } + interface Env extends __BaseEnv_Env {} +} +interface Env extends __BaseEnv_Env {} diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc new file mode 100644 index 0000000000..5f3b67f6b5 --- /dev/null +++ b/services/auto-routing-benchmark/wrangler.jsonc @@ -0,0 +1,54 @@ +{ + "$schema": "node_modules/wrangler/config-schema.json", + "account_id": "e115e769bcdd4c3d66af59d3332cb394", + "name": "auto-routing-benchmark", + "main": "src/index.ts", + "compatibility_date": "2026-05-15", + "compatibility_flags": ["nodejs_compat"], + "workers_dev": false, + "preview_urls": false, + "logpush": true, + "routes": [{ "pattern": "auto-routing-benchmark.kiloapps.io", "custom_domain": true }], + "dev": { "port": 8814, "local_protocol": "http", "ip": "0.0.0.0" }, + "observability": { "enabled": true }, + "triggers": { + // 04:10 UTC daily: classifier benchmark. 05:10 UTC weekly: decider benchmark. NOTE(review): Cloudflare cron weekdays run 1 = Sunday .. 7 = Saturday, so day-of-week 1 fires on Sunday, not Monday; use MON (and update any handler matching on the cron string) if Monday is intended.
+ "crons": ["10 4 * * *", "10 5 * * 1"] + }, + "d1_databases": [ + { + "binding": "BENCH_DB", + "database_name": "auto-routing-benchmark", + "database_id": "92f2c88a-5ee6-4fd0-b118-75bd141b5cac", + "migrations_dir": "migrations" + } + ], + "queues": { + "producers": [{ "binding": "BENCH_QUEUE", "queue": "auto-routing-benchmark-jobs" }], + "consumers": [ + { + "queue": "auto-routing-benchmark-jobs", + "max_batch_size": 1, + "max_retries": 2, + "max_concurrency": 4 + } + ] + }, + "kv_namespaces": [ + // Shared with the auto-routing worker: the decider benchmark publishes + // the routing table here and auto-routing reads it on /decide. + { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" } + ], + "secrets_store_secrets": [ + { + "binding": "INTERNAL_API_SECRET_PROD", + "store_id": "342a86d9e3a94da698e82d0c6e2a36f0", + "secret_name": "INTERNAL_API_SECRET_PROD" + }, + { + "binding": "OPENROUTER_API_KEY", + "store_id": "342a86d9e3a94da698e82d0c6e2a36f0", + "secret_name": "OPENROUTER_API_KEY" + } + ] +} From 22de71333886423b858af5941364f280b7c64887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:24:39 +0200 Subject: [PATCH 07/73] feat(auto-routing-benchmark): classifier golden dataset and grading --- .../src/datasets/classifier-cases.test.ts | 51 ++ .../src/datasets/classifier-cases.ts | 610 ++++++++++++++++++ .../src/grading.test.ts | 58 ++ .../auto-routing-benchmark/src/grading.ts | 31 + 4 files changed, 750 insertions(+) create mode 100644 services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts create mode 100644 services/auto-routing-benchmark/src/datasets/classifier-cases.ts create mode 100644 services/auto-routing-benchmark/src/grading.test.ts create mode 100644 services/auto-routing-benchmark/src/grading.ts diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts new file mode 
100644 index 0000000000..08523eaa4f --- /dev/null +++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from 'vitest'; +import { NormalizedClassifierInputSchema } from '@kilocode/auto-routing-contracts'; +import { CLASSIFIER_CASES } from './classifier-cases'; + +describe('CLASSIFIER_CASES', () => { + it('has exactly 36 cases', () => { + expect(CLASSIFIER_CASES.length).toBe(36); + }); + + it('has unique ids and valid inputs', () => { + const ids = new Set(CLASSIFIER_CASES.map(c => c.id)); + expect(ids.size).toBe(CLASSIFIER_CASES.length); + for (const c of CLASSIFIER_CASES) { + const result = NormalizedClassifierInputSchema.safeParse(c.input); + expect(result.success, `case ${c.id}: ${JSON.stringify(result.error?.issues)}`).toBe(true); + } + }); + + it('covers every task type with exactly 6 cases', () => { + const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType); + for (const taskType of [ + 'implementation', + 'debugging', + 'refactoring', + 'planning_design', + 'investigation', + 'agentic_execution', + ] as const) { + expect(byType.get(taskType)?.length ?? 
0, taskType).toBe(6); + } + }); + + it('covers every reasoning complexity at least 8 times', () => { + for (const level of ['low', 'medium', 'high'] as const) { + expect( + CLASSIFIER_CASES.filter(c => c.expected.reasoningComplexity === level).length, + level + ).toBeGreaterThanOrEqual(8); + } + }); + + it('has at least one of each reasoning complexity within every task type', () => { + const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType); + for (const [taskType, cases] of byType) { + const levels = new Set(cases.map(c => c.expected.reasoningComplexity)); + for (const level of ['low', 'medium', 'high'] as const) { + expect(levels.has(level), `${taskType} missing ${level}`).toBe(true); + } + } + }); +}); diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts new file mode 100644 index 0000000000..a857cd3169 --- /dev/null +++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts @@ -0,0 +1,610 @@ +import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts'; +import type { ClassifierExpectation } from '../grading'; + +export type ClassifierCase = { + id: string; // stable slug, e.g. 'impl-low-regex-helper' + input: NormalizedClassifierInput; + expected: ClassifierExpectation; +}; + +const AGENT_TOOLS_SYSTEM = + 'You are Kilo Code, an AI coding assistant operating in an agentic loop with access to read_file, write_file, apply_diff, run_command and search_files tools. Work step by step and verify your changes.'; +const AGENT_PLAIN_SYSTEM = + 'You are Kilo Code, an AI coding assistant. You help the user write and modify code in their workspace. Follow the user instructions precisely.'; +const CHAT_ASSISTANT_SYSTEM = + 'You are a helpful senior software engineer. Answer the user clearly and concisely. 
Do not assume access to the user files unless they are pasted in the conversation.'; + +const HINTS = { provider: null, providerOptions: null } as const; + +function chat( + systemPromptPrefix: string, + userPromptPrefix: string, + opts: { + messageCount: number; + hasTools: boolean; + latestUserPromptPrefix?: string | null; + } +): NormalizedClassifierInput { + return { + apiKind: 'chat_completions', + requestedModel: 'kilo-auto/efficient', + systemPromptPrefix, + userPromptPrefix, + latestUserPromptPrefix: opts.latestUserPromptPrefix ?? null, + messageCount: opts.messageCount, + hasTools: opts.hasTools, + stream: true, + providerHints: HINTS, + }; +} + +export const CLASSIFIER_CASES: readonly ClassifierCase[] = [ + // --------------------------------------------------------------------------- + // implementation (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'impl-low-regex-helper', + input: chat( + AGENT_PLAIN_SYSTEM, + 'Write a TypeScript helper function isValidSemver(version: string): boolean that returns true for valid semantic version strings like 1.2.3 and false otherwise. 
No external dependencies.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'impl-low-add-zod-schema', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Add a Zod schema named PaginationParamsSchema to src/schemas/pagination.ts with optional page (positive int, default 1) and pageSize (positive int, max 100, default 20) fields, and export its inferred type.', + { messageCount: 3, hasTools: true } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'impl-medium-rest-endpoint', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Add a new GET /api/projects/:id/members endpoint to our Express router in src/routes/projects.ts. Reuse the existing requireAuth middleware and the ProjectService.getMembers method, and return 404 when the project does not exist.', + { messageCount: 7, hasTools: true } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'impl-medium-react-hook', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Implement a useDebouncedValue(value, delayMs) React hook in src/hooks and use it in the SearchBar component so the onSearch callback fires at most once every 300ms. Keep the existing controlled-input behavior.', + { messageCount: 9, hasTools: true } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'impl-high-realtime-collab', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Build real-time collaborative editing for our document editor. We have a React frontend, a Node WebSocket gateway, and a Postgres store. 
Decide and implement a conflict-resolution strategy (OT vs CRDT), wire presence, persistence, and reconnection, and make it consistent across all three layers.', + { messageCount: 18, hasTools: true } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + { + id: 'impl-high-rate-limiter', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Implement a distributed sliding-window rate limiter that works across our 4 API replicas backed by Redis. It must handle clock skew between nodes, degrade gracefully if Redis is unavailable, and expose per-tenant limits configured in src/config/limits.ts. Integrate it into the existing middleware chain.', + { messageCount: 16, hasTools: true } + ), + expected: { + taskType: 'implementation', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + + // --------------------------------------------------------------------------- + // debugging (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'debug-low-typo-import', + input: chat( + AGENT_TOOLS_SYSTEM, + "Running the app throws \"TypeError: formatDate is not a function\" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.", + { messageCount: 4, hasTools: true } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'debug-low-off-by-one', + input: chat( + AGENT_PLAIN_SYSTEM, + 'This pagination function returns one too few items on the last page. Here is the code: `return items.slice(page * size, page * size + size - 1)`. 
What is wrong and how do I fix it?', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'debug-medium-failing-test', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Our test "UserService > createUser persists the hashed password" started failing after I changed the bcrypt cost factor. The assertion expects a 60-char hash but now gets undefined. Figure out whether the service or the test is wrong and fix it so the suite passes.', + { messageCount: 8, hasTools: true } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'debug-medium-cors-error', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Browser requests to our /api/upload endpoint fail with "blocked by CORS policy: No Access-Control-Allow-Origin header". GET requests to other endpoints work fine. The cors middleware is configured in src/server.ts. Find why only upload is affected and fix it.', + { messageCount: 10, hasTools: true } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'debug-high-race-condition', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Our payment webhook handler intermittently double-charges customers under load. We use a Postgres advisory lock around the charge, but the duplicate rows have timestamps 2-3ms apart. The handler runs on 3 replicas behind a queue with at-least-once delivery. 
Investigate the root cause across the worker, queue consumer, and DB layers and fix it.', + { messageCount: 14, hasTools: true } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + { + id: 'debug-high-memory-leak', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Our Node service RSS grows by ~50MB/hour in production and OOMs after a day, but it is stable locally. Heap snapshots show growing retained closures referencing our EventEmitter-based cache. It spans the cache module, the websocket session manager, and a third-party metrics client. Trace the leak across these and fix it.', + { messageCount: 22, hasTools: true } + ), + expected: { + taskType: 'debugging', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + + // --------------------------------------------------------------------------- + // refactoring (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'refactor-low-rename-var', + input: chat( + AGENT_TOOLS_SYSTEM, + 'In src/cart.ts rename the variable `x` to `lineItemTotal` everywhere it is used in the calculateTotal function. No behavior change.', + { messageCount: 3, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'refactor-low-extract-constant', + input: chat( + AGENT_TOOLS_SYSTEM, + 'The magic number 86400 appears three times in src/scheduler.ts. Extract it into a named constant SECONDS_PER_DAY at the top of the file and use it in all three places. 
Keep behavior identical.', + { messageCount: 2, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'refactor-medium-extract-service', + input: chat( + AGENT_TOOLS_SYSTEM, + 'The OrderController in src/controllers/order.ts has grown to 400 lines and mixes HTTP handling with business logic. Extract the business logic into an OrderService class, keep the controller thin, and update the existing controller tests to match. Behavior must stay the same.', + { messageCount: 11, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'refactor-medium-promise-to-async', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Convert the .then()/.catch() promise chains in src/api/client.ts to async/await. There are about six methods. Preserve the existing error-handling semantics and return types exactly.', + { messageCount: 6, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'code_change', + requiresTools: true, + }, + }, + { + id: 'refactor-high-modularize-monolith', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Our monolithic src/app.ts wires routing, auth, database access, and background jobs in one 1200-line file with tangled circular imports. Restructure it into clear modules with one-directional dependencies, without changing any external behavior or public routes. 
Decide the boundaries and migrate incrementally.', + { messageCount: 26, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + { + id: 'refactor-high-orm-migration', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Migrate our data layer from the legacy hand-written SQL query helpers spread across 30 files to Drizzle ORM, preserving every query result shape and transaction boundary. Plan the sequence so the app keeps passing tests at each step, then carry it out.', + { messageCount: 30, hasTools: true } + ), + expected: { + taskType: 'refactoring', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + + // --------------------------------------------------------------------------- + // planning_design (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'plan-low-naming-choice', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'I have a function that both validates and saves a user. What is a good single name for it, or should I split it? Just give me a recommendation, no code.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'plan-low-folder-structure', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'For a small Express API with about 8 endpoints, what is a sensible folder structure for routes, controllers, and services? 
Just describe the layout, do not write code.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'plan-medium-caching-strategy', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'We have a read-heavy product catalog API hitting Postgres directly. Walk me through the tradeoffs of adding Redis caching vs HTTP cache headers vs a materialized view, and recommend one for a team of three with moderate traffic. No implementation yet.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'plan-medium-rollout-steps', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'We want to add optimistic UI updates to our existing React + tRPC todo app. Break the work into an ordered implementation plan (state, mutation handling, rollback on error, tests). Just the plan, I will implement it.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'plan-high-multitenant-architecture', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'Design a multi-tenant architecture for our B2B SaaS. We need tenant isolation, per-tenant data residency (EU vs US), noisy-neighbor protection, and a path to enterprise single-tenant deployments later. Compare schema-per-tenant, row-level, and database-per-tenant, and recommend an approach with its failure modes. 
Design only.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'plan-high-event-driven-migration', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'We run a synchronous request/response monolith and want to move order processing to an event-driven design with a message broker. Design the target architecture: event schema/versioning, idempotency, ordering guarantees, dead-letter handling, and how we cut over without downtime. Tradeoffs and a recommended broker, no code.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'planning_design', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + + // --------------------------------------------------------------------------- + // investigation (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'invest-low-find-usage', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Where in the codebase is the function getFeatureFlags defined and which files import it? Just tell me, do not change anything.', + { messageCount: 2, hasTools: true } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: true, + }, + }, + { + id: 'invest-low-explain-function', + input: chat( + AGENT_PLAIN_SYSTEM, + 'Explain what this reducer does, step by step. It handles ADD_ITEM, REMOVE_ITEM, and CLEAR_CART actions. 
I just want to understand the logic.', + { messageCount: 1, hasTools: false } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, + }, + }, + { + id: 'invest-medium-trace-auth-flow', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Explain how a login request flows through our app from the /auth/login route to the session cookie being set. Cover the controller, the AuthService, and the session middleware. I want to understand it before changing anything.', + { messageCount: 6, hasTools: true } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'answer_only', + requiresTools: true, + }, + }, + { + id: 'invest-medium-research-sdk', + input: chat( + CHAT_ASSISTANT_SYSTEM, + 'Look up the current Stripe Node SDK and summarize how to verify a webhook signature and what the recommended way to handle idempotency keys is. I need to know the current recommended API before I write any code.', + { messageCount: 1, hasTools: true, latestUserPromptPrefix: null } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'answer_only', + requiresTools: true, + }, + }, + { + id: 'invest-high-perf-regression-analysis', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Our checkout p95 latency doubled over the last two weeks but no single deploy stands out. Investigate across the API, the database query patterns, the cache hit rates, and the third-party payment calls, and tell me the most likely contributors ranked by evidence. 
Do not fix anything yet, just analyze.', + { messageCount: 20, hasTools: true } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'answer_only', + requiresTools: true, + }, + }, + { + id: 'invest-high-understand-legacy-pipeline', + input: chat( + AGENT_TOOLS_SYSTEM, + 'We inherited an undocumented data pipeline spanning a cron service, three Lambda functions, an SQS queue, and a Redshift loader. Map out how data flows end to end, what each component assumes about the others, and where the implicit coupling and failure points are. Understanding only, no changes.', + { messageCount: 24, hasTools: true } + ), + expected: { + taskType: 'investigation', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'answer_only', + requiresTools: true, + }, + }, + + // --------------------------------------------------------------------------- + // agentic_execution (2 low, 2 medium, 2 high) + // --------------------------------------------------------------------------- + { + id: 'agentic-low-run-tests', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Run the test suite with `pnpm test` and tell me if it passes.', + { messageCount: 2, hasTools: true } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'command_execution', + requiresTools: true, + }, + }, + { + id: 'agentic-low-check-git-status', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Run git status and git log --oneline -5 and show me the output so I know what state this checkout is in.', + { messageCount: 3, hasTools: true } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'command_execution', + requiresTools: true, + }, + }, + { + id: 'agentic-medium-start-dev-server', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Start the local dev environment with `pnpm dev`, wait for it to boot, 
then curl http://localhost:3000/health and report whether the service and its database connection are healthy.', + { messageCount: 8, hasTools: true } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'command_execution', + requiresTools: true, + }, + }, + { + id: 'agentic-medium-docker-logs', + input: chat( + AGENT_TOOLS_SYSTEM, + 'The api container keeps restarting. Run docker compose ps, then docker compose logs api --tail 100, identify which command in the logs is failing on boot, and report it back. Just diagnose via the commands, do not edit files.', + { messageCount: 10, hasTools: true } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'medium', + reasoningComplexity: 'medium', + executionMode: 'command_execution', + requiresTools: true, + }, + }, + { + id: 'agentic-high-release-pipeline', + input: chat( + AGENT_TOOLS_SYSTEM, + 'Cut a release: bump the version, run the full build and test suite, build and push the multi-arch Docker image to our registry, tag the git commit, and verify the staging deploy comes up healthy. Stop and report if any step fails.', + { messageCount: 28, hasTools: true } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, + { + id: 'agentic-high-recover-broken-env', + input: chat( + AGENT_TOOLS_SYSTEM, + 'My local environment is broken after a branch switch: migrations are out of sync, node_modules looks stale, and the worker will not start. Diagnose and recover it end to end by running the right commands in order, re-running checks after each fix, until pnpm dev comes up clean. 
Report what you changed.', + { + messageCount: 32, + hasTools: true, + latestUserPromptPrefix: + 'Also clear the local cache before reinstalling, I think it is corrupt.', + } + ), + expected: { + taskType: 'agentic_execution', + contextComplexity: 'large', + reasoningComplexity: 'high', + executionMode: 'multi_step_project', + requiresTools: true, + }, + }, +]; diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts new file mode 100644 index 0000000000..7094da8d09 --- /dev/null +++ b/services/auto-routing-benchmark/src/grading.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts'; +import { + CLASSIFIER_FIELD_WEIGHTS, + gradeClassifierOutput, + type ClassifierExpectation, +} from './grading'; + +const expected: ClassifierExpectation = { + taskType: 'implementation', + contextComplexity: 'small', + reasoningComplexity: 'low', + executionMode: 'answer_only', + requiresTools: false, +}; + +function actualFrom(overrides: Partial<ClassifierOutput>): ClassifierOutput { + return { + taskType: 'implementation', + subtaskType: 'code_generation', + contextComplexity: 'small', + reasoningComplexity: 'low', + riskLevel: 'low', + executionMode: 'answer_only', + requiresTools: false, + confidence: 0.9, + ...overrides, + }; +} + +describe('gradeClassifierOutput', () => { + it('scores a full match as 1', () => { + expect(gradeClassifierOutput(expected, actualFrom({}))).toBe(1); + }); + + it('scores a taskType mismatch alone as 0.7', () => { + expect(gradeClassifierOutput(expected, actualFrom({ taskType: 'debugging' }))).toBe(0.7); + }); + + it('scores a requiresTools mismatch alone as 0.9', () => { + expect(gradeClassifierOutput(expected, actualFrom({ requiresTools: true }))).toBe(0.9); + }); + + it('ignores ungraded fields like subtaskType and riskLevel', () => { + expect( + gradeClassifierOutput( + expected, + actualFrom({ subtaskType: 
'feature_development', riskLevel: 'high' }) + ) + ).toBe(1); + }); +}); + +describe('CLASSIFIER_FIELD_WEIGHTS', () => { + it('sums to 1', () => { + expect(Object.values(CLASSIFIER_FIELD_WEIGHTS).reduce((a, b) => a + b, 0)).toBeCloseTo(1); + }); +}); diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts new file mode 100644 index 0000000000..746e68a546 --- /dev/null +++ b/services/auto-routing-benchmark/src/grading.ts @@ -0,0 +1,31 @@ +import type { ClassifierOutput } from '@kilocode/auto-routing-contracts'; + +// Golden labels grade the axes the decision engine actually consumes. +// subtaskType is intentionally ungraded (high label ambiguity, unused by +// deriveDifficultyTier); riskLevel likewise; requiresTools gets a small weight. +export type ClassifierExpectation = { + taskType: ClassifierOutput['taskType']; + contextComplexity: ClassifierOutput['contextComplexity']; + reasoningComplexity: ClassifierOutput['reasoningComplexity']; + executionMode: ClassifierOutput['executionMode']; + requiresTools: boolean; +}; + +export const CLASSIFIER_FIELD_WEIGHTS: Record<keyof ClassifierExpectation, number> = { + taskType: 0.3, + reasoningComplexity: 0.25, + contextComplexity: 0.15, + executionMode: 0.2, + requiresTools: 0.1, +}; + +export function gradeClassifierOutput( + expected: ClassifierExpectation, + actual: ClassifierOutput +): number { + let score = 0; + for (const key of Object.keys(CLASSIFIER_FIELD_WEIGHTS) as (keyof ClassifierExpectation)[]) { + if (actual[key] === expected[key]) score += CLASSIFIER_FIELD_WEIGHTS[key]; + } + return Number(score.toFixed(4)); +} From 878e49b1c011ddc53854eff0585e2ac3d95ede1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:25:17 +0200 Subject: [PATCH 08/73] style(auto-routing-benchmark): apply oxfmt formatting --- .../auto-routing-benchmark/src/datasets/classifier-cases.ts | 2 +- services/auto-routing-benchmark/src/db.ts | 5 +---- 
services/auto-routing-benchmark/src/index.ts | 6 +----- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts index a857cd3169..33baacaefb 100644 --- a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts +++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts @@ -140,7 +140,7 @@ export const CLASSIFIER_CASES: readonly ClassifierCase[] = [ id: 'debug-low-typo-import', input: chat( AGENT_TOOLS_SYSTEM, - "Running the app throws \"TypeError: formatDate is not a function\" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.", + 'Running the app throws "TypeError: formatDate is not a function" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.', { messageCount: 4, hasTools: true } ), expected: { diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts index 27a817006f..48130a90fd 100644 --- a/services/auto-routing-benchmark/src/db.ts +++ b/services/auto-routing-benchmark/src/db.ts @@ -201,10 +201,7 @@ export async function markRunCompleted(db: D1Database, runId: string): Promise { +export async function markStaleRunsFailed(db: D1Database, olderThanIso: string): Promise { const result = await db .prepare( `UPDATE benchmark_runs SET status = 'failed', error = 'timed out' diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts index feb6d991e2..3c0a00cf8b 100644 --- a/services/auto-routing-benchmark/src/index.ts +++ b/services/auto-routing-benchmark/src/index.ts @@ -17,9 +17,5 @@ export default { _env: Env, _ctx: ExecutionContext ): Promise {}, - async queue( - _batch: MessageBatch, - _env: Env, - _ctx: ExecutionContext - ): Promise {}, + async queue(_batch: 
MessageBatch, _env: Env, _ctx: ExecutionContext): Promise {}, }; From 662717ce08faf6e571ec53fcf9c8a54cc178c74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:31:54 +0200 Subject: [PATCH 09/73] feat(auto-routing-benchmark): decider golden dataset with deterministic checkers --- .../src/datasets/decider-cases.test.ts | 64 ++++ .../src/datasets/decider-cases.ts | 328 ++++++++++++++++++ .../src/grading.test.ts | 72 ++++ .../auto-routing-benchmark/src/grading.ts | 75 ++++ 4 files changed, 539 insertions(+) create mode 100644 services/auto-routing-benchmark/src/datasets/decider-cases.test.ts create mode 100644 services/auto-routing-benchmark/src/datasets/decider-cases.ts diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts new file mode 100644 index 0000000000..92a734700c --- /dev/null +++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from 'vitest'; +import { DECIDER_CASES } from './decider-cases'; + +describe('DECIDER_CASES', () => { + it('has exactly 30 cases with unique ids', () => { + expect(DECIDER_CASES.length).toBe(30); + const ids = new Set(DECIDER_CASES.map(c => c.id)); + expect(ids.size).toBe(DECIDER_CASES.length); + }); + + it('has exactly 10 cases per tier', () => { + for (const tier of ['low', 'medium', 'high'] as const) { + expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBe(10); + } + }); + + it('covers at least 4 distinct task types per tier', () => { + for (const tier of ['low', 'medium', 'high'] as const) { + const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType)); + expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4); + } + }); + + it('has compilable regex patterns', () => { + for (const c of DECIDER_CASES) { + const check = c.check; + if (check.kind === 'regex') { + expect(() => 
new RegExp(check.pattern, check.flags), c.id).not.toThrow(); + } + } + }); + + it('has json_equal values that round-trip through JSON', () => { + for (const c of DECIDER_CASES) { + const check = c.check; + if (check.kind === 'json_equal') { + expect(JSON.parse(JSON.stringify(check.value)), c.id).toEqual(check.value); + } + } + }); + + it('has generous maxTokens and nonempty prompts', () => { + for (const c of DECIDER_CASES) { + expect(c.maxTokens, c.id).toBeGreaterThanOrEqual(512); + expect(c.systemPrompt.length, c.id).toBeGreaterThan(0); + expect(c.userPrompt.length, c.id).toBeGreaterThan(0); + } + }); + + it('has nonempty exact and contains_all values', () => { + for (const c of DECIDER_CASES) { + const check = c.check; + if (check.kind === 'exact') { + expect(check.value.length, c.id).toBeGreaterThan(0); + } + if (check.kind === 'contains_all') { + expect(check.values.length, c.id).toBeGreaterThan(0); + for (const v of check.values) { + expect(v.length, c.id).toBeGreaterThan(0); + } + } + } + }); +}); diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts new file mode 100644 index 0000000000..561995d520 --- /dev/null +++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts @@ -0,0 +1,328 @@ +import type { ClassifierTaskType, DifficultyTier } from '@kilocode/auto-routing-contracts'; +import type { DeciderCheck } from '../grading'; + +export type DeciderCase = { + id: string; + tier: DifficultyTier; + taskType: ClassifierTaskType; + systemPrompt: string; + userPrompt: string; + maxTokens: number; + check: DeciderCheck; +}; + +const CODE_SYS = 'You are a precise coding assistant. Answer with only what is asked, no explanations.'; +const SYS_SYS = 'You are a precise systems engineer. Answer with only what is asked, no explanations.'; + +// Golden answers below were each worked through by hand. Every case has a +// single unambiguous, mechanically-checkable answer. 
Checks tolerate +// formatting noise (fences/case/whitespace) but never wrong values. For +// json_equal cases the prompt pins the exact key set in the same order as the +// expected value (the comparison is JSON.stringify-based and order-sensitive). +export const DECIDER_CASES: readonly DeciderCase[] = [ + // ---------------- LOW (mechanical lookups / trivial evaluation) ---------------- + { + id: 'low-impl-array-pipeline', + tier: 'low', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with the exact output line only.\n\nconst xs = [1, 2, 3, 4].filter(x => x % 2 === 0).map(x => x * 10);\nconsole.log(xs.join("-"));', + maxTokens: 512, + check: { kind: 'exact', value: '20-40' }, + }, + { + id: 'low-impl-sort-numeric', + tier: 'low', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log([5, 3, 8, 1].sort((a, b) => a - b).join(","));', + maxTokens: 512, + check: { kind: 'exact', value: '1,3,5,8' }, + }, + { + id: 'low-impl-string-upper', + tier: 'low', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with the exact output line only.\n\nconsole.log("hello".toUpperCase());', + maxTokens: 512, + check: { kind: 'exact', value: 'HELLO' }, + }, + { + id: 'low-impl-ternary-parity', + tier: 'low', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");', + maxTokens: 512, + check: { kind: 'exact', value: 'odd' }, + }, + { + id: 'low-debug-compound-assign', + tier: 'low', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What is the final value printed? 
Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);', + maxTokens: 512, + check: { kind: 'exact', value: '30' }, + }, + { + id: 'low-debug-parseint-suffix', + tier: 'low', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with only the number.\n\nconsole.log(parseInt("42px", 10));', + maxTokens: 512, + check: { kind: 'exact', value: '42' }, + }, + { + id: 'low-investigation-char-count', + tier: 'low', + taskType: 'investigation', + systemPrompt: CODE_SYS, + userPrompt: + 'How many times does the letter "a" appear in the word "banana"? Answer with only the number.', + maxTokens: 512, + check: { kind: 'exact', value: '3' }, + }, + { + id: 'low-investigation-object-keys', + tier: 'low', + taskType: 'investigation', + systemPrompt: CODE_SYS, + userPrompt: + 'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };', + maxTokens: 512, + check: { kind: 'exact', value: '3' }, + }, + { + id: 'low-planning-http-created', + tier: 'low', + taskType: 'planning_design', + systemPrompt: 'You are a precise web API expert. Answer with only what is asked, no explanations.', + userPrompt: + 'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.', + maxTokens: 512, + check: { kind: 'exact', value: '201' }, + }, + { + id: 'low-refactoring-reduce-sum', + tier: 'low', + taskType: 'refactoring', + systemPrompt: CODE_SYS, + userPrompt: + 'A loop sums an array. What value does it produce? 
Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);', + maxTokens: 512, + check: { kind: 'exact', value: '12' }, + }, + + // ---------------- MEDIUM (multi-step reasoning, off-by-one, spec application) ------------- + { + id: 'medium-debug-off-by-one', + tier: 'medium', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": ""}.\n\n1: function bsearch(a, t) {\n2: let lo = 0, hi = a.length;\n3: while (lo < hi) {\n4: const mid = (lo + hi) >> 1;\n5: if (a[mid] === t) return mid;\n6: if (a[mid] < t) lo = mid;\n7: else hi = mid;\n8: }\n9: return -1;\n10: }', + maxTokens: 2048, + check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } }, + }, + { + id: 'medium-impl-reduce-trace', + tier: 'medium', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with only the number.\n\nconst r = [1, 2, 3, 4].reduce((acc, x) => acc + x * x, 0);\nconsole.log(r);', + maxTokens: 2048, + check: { kind: 'exact', value: '30' }, + }, + { + id: 'medium-impl-closure-counter', + tier: 'medium', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What is the final printed value? Answer with only the number.\n\nfunction make() {\n let c = 0;\n return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());', + maxTokens: 2048, + check: { kind: 'exact', value: '3' }, + }, + { + id: 'medium-debug-async-order', + tier: 'medium', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this program print, in order? Answer with the four uppercase letters joined by commas, e.g. 
"A,B,C,D".\n\nconsole.log("A");\nPromise.resolve().then(() => console.log("B"));\nsetTimeout(() => console.log("C"), 0);\nconsole.log("D");', + maxTokens: 2048, + check: { kind: 'regex', pattern: '^\\s*A\\s*,\\s*D\\s*,\\s*B\\s*,\\s*C\\s*$', flags: 'im' }, + }, + { + id: 'medium-impl-map-set-dedup', + tier: 'medium', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What is the size of the resulting Set? Answer with only the number.\n\nconst s = new Set([1, 2, 2, 3, 3, 3, 4]);\nconsole.log(s.size);', + maxTokens: 2048, + check: { kind: 'exact', value: '4' }, + }, + { + id: 'medium-investigation-regex-groups', + tier: 'medium', + taskType: 'investigation', + systemPrompt: CODE_SYS, + userPrompt: + 'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.', + maxTokens: 2048, + check: { kind: 'exact', value: '06' }, + }, + { + id: 'medium-impl-recursion-fib', + tier: 'medium', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.', + maxTokens: 2048, + check: { kind: 'exact', value: '13' }, + }, + { + id: 'medium-debug-mutation-shared-ref', + tier: 'medium', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);', + maxTokens: 2048, + check: { kind: 'exact', value: '4' }, + }, + { + id: 'medium-planning-rate-limit-window', + tier: 'medium', + taskType: 'planning_design', + systemPrompt: SYS_SYS, + userPrompt: + 'A fixed-window rate limiter allows 100 requests per 60-second window. A client sends 80 requests in the first 30 seconds of a window, then 40 more requests in the next 20 seconds (same window). How many of the 40 later requests are rejected? 
Answer with only the number.', + maxTokens: 2048, + check: { kind: 'exact', value: '20' }, + }, + { + id: 'medium-refactoring-equivalent-output', + tier: 'medium', + taskType: 'refactoring', + systemPrompt: CODE_SYS, + userPrompt: + 'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);', + maxTokens: 2048, + check: { kind: 'exact', value: '40' }, + }, + + // ---------------- HIGH (deep multi-constraint reasoning, subtle semantics) ------------- + { + id: 'high-investigation-queue-trace', + tier: 'high', + taskType: 'investigation', + systemPrompt: SYS_SYS, + userPrompt: + 'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms before ack. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": , "winner": ""}.', + maxTokens: 4096, + check: { kind: 'json_equal', value: { rows: 1, winner: 'B' } }, + }, + { + id: 'high-debug-closure-loop-var', + tier: 'high', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());', + maxTokens: 4096, + check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' }, + }, + { + id: 'high-debug-closure-let-var', + tier: 'high', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? 
Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());', + maxTokens: 4096, + check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' }, + }, + { + id: 'high-impl-this-binding', + tier: 'high', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with only the number.\n\nconst obj = {\n v: 10,\n get() {\n return [1, 2].map(function () {\n return this?.v ?? 0;\n }).reduce((a, b) => a + b, 0);\n },\n};\nconsole.log(obj.get());', + maxTokens: 4096, + check: { kind: 'exact', value: '0' }, + }, + { + id: 'high-investigation-deadlock-order', + tier: 'high', + taskType: 'investigation', + systemPrompt: SYS_SYS, + userPrompt: + 'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.', + maxTokens: 4096, + check: { kind: 'exact', value: '2' }, + }, + { + id: 'high-debug-float-equality', + tier: 'high', + taskType: 'debugging', + systemPrompt: CODE_SYS, + userPrompt: + 'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.', + maxTokens: 4096, + check: { kind: 'exact', value: 'false' }, + }, + { + id: 'high-investigation-txn-isolation', + tier: 'high', + taskType: 'investigation', + systemPrompt: SYS_SYS, + userPrompt: + 'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. 
Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.', + maxTokens: 4096, + check: { kind: 'exact', value: '6' }, + }, + { + id: 'high-impl-generator-trace', + tier: 'high', + taskType: 'implementation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with the values joined by commas, e.g. "1,2,3".\n\nfunction* g() {\n yield 1;\n yield* [2, 3];\n yield 4;\n}\nconsole.log([...g()].join(","));', + maxTokens: 4096, + check: { kind: 'regex', pattern: '^\\s*1\\s*,\\s*2\\s*,\\s*3\\s*,\\s*4\\s*$', flags: 'm' }, + }, + { + id: 'high-planning-cache-invalidation', + tier: 'high', + taskType: 'planning_design', + systemPrompt: SYS_SYS, + userPrompt: + 'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": , "second": }.', + maxTokens: 4096, + check: { kind: 'json_equal', value: { first: 1, second: 2 } }, + }, + { + id: 'high-refactoring-short-circuit', + tier: 'high', + taskType: 'refactoring', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? 
Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n calls++;\n return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);', + maxTokens: 4096, + check: { kind: 'exact', value: '2' }, + }, +]; diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts index 7094da8d09..3ea7abad76 100644 --- a/services/auto-routing-benchmark/src/grading.test.ts +++ b/services/auto-routing-benchmark/src/grading.test.ts @@ -3,6 +3,8 @@ import type { ClassifierOutput } from '@kilocode/auto-routing-contracts'; import { CLASSIFIER_FIELD_WEIGHTS, gradeClassifierOutput, + normalizeAnswer, + runDeciderCheck, type ClassifierExpectation, } from './grading'; @@ -56,3 +58,73 @@ describe('CLASSIFIER_FIELD_WEIGHTS', () => { expect(Object.values(CLASSIFIER_FIELD_WEIGHTS).reduce((a, b) => a + b, 0)).toBeCloseTo(1); }); }); + +describe('normalizeAnswer', () => { + it('strips fences, lowercases and trims', () => { + expect(normalizeAnswer('```js\n Hello World \n```')).toBe('hello world'); + }); +}); + +describe('runDeciderCheck: exact', () => { + it('passes with surrounding code fences and different case', () => { + expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '```\n20-40\n```')).toBe(true); + expect(runDeciderCheck({ kind: 'exact', value: 'Hello' }, 'HELLO')).toBe(true); + }); + + it('fails on a wrong answer', () => { + expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '20-30')).toBe(false); + }); +}); + +describe('runDeciderCheck: contains_all', () => { + it('passes regardless of order and case', () => { + expect( + runDeciderCheck({ kind: 'contains_all', values: ['Alpha', 'Beta'] }, 'beta then ALPHA') + ).toBe(true); + }); + + it('fails when one value is missing', () => { + expect( + runDeciderCheck({ kind: 'contains_all', values: ['alpha', 'beta'] }, 'only alpha here') + ).toBe(false); + }); +}); + +describe('runDeciderCheck: regex', () => { + it('passes a basic match with flags', () => { 
+ expect( + runDeciderCheck({ kind: 'regex', pattern: '^answer: \\d+$', flags: 'im' }, 'ANSWER: 42') + ).toBe(true); + }); + + it('fails when the pattern does not match', () => { + expect(runDeciderCheck({ kind: 'regex', pattern: '^\\d+$' }, 'not a number')).toBe(false); + }); +}); + +describe('runDeciderCheck: json_equal', () => { + it('passes with a json fence plus prose before and after', () => { + const output = 'Here you go:\n```json\n{"a":1}\n```\nLet me know!'; + expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, output)).toBe(true); + }); + + it('passes with bare JSON', () => { + expect(runDeciderCheck({ kind: 'json_equal', value: { line: 6 } }, '{"line": 6}')).toBe(true); + }); + + it('fails on unparseable output', () => { + expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, 'sorry, no idea')).toBe(false); + }); + + it('fails when values differ', () => { + expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, '{"a": 2}')).toBe(false); + }); + + // Documents current behavior: comparison is JSON.stringify-based, so key + // ORDER is significant. Dataset authoring must mirror the prompted key order. 
// A mechanical assertion applied to a decider model's raw text output:
//  - exact:        normalized output equals the normalized expected value
//  - contains_all: normalized output contains every normalized expected value
//  - regex:        pattern (with optional flags) matches the RAW output
//  - json_equal:   first JSON payload found in the output serializes
//                  identically to the expected value (key order matters)
export type DeciderCheck =
  | { kind: 'exact'; value: string }
  | { kind: 'contains_all'; values: readonly string[] }
  | { kind: 'regex'; pattern: string; flags?: string }
  | { kind: 'json_equal'; value: unknown };

// Mechanical pass/fail grading keeps the decider benchmark deterministic:
// no LLM judges. Normalization tolerates formatting noise (whitespace,
// case, markdown fences) without weakening the assertion.
//
// A fence's language tag is only removed when it is followed by a line break
// (the shape of a real opening fence). Without that anchor an inline fenced
// answer such as "```false```" would lose the answer itself.
export function normalizeAnswer(text: string): string {
  return text
    .replace(/```[a-z]*[ \t]*\r?\n/gi, '')
    .replace(/```/g, '')
    .trim()
    .toLowerCase();
}

// Returns the index of the bracket that closes the `{`/`[` at `start`, or -1
// when it is never closed. String-aware so brackets inside string literals
// (and escaped quotes inside those strings) are ignored.
function findBalancedEnd(text: string, start: number): number {
  const open = text[start];
  const close = open === '{' ? '}' : ']';
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === open) {
      depth++;
    } else if (ch === close) {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

// Finds the first parseable JSON object/array in `text`. Each `{`/`[` is tried
// in order as a candidate start and balance-scanned to its matching close, so
// trailing prose after the payload doesn't break parsing, and bracketed prose
// before it (e.g. "[see above] {...}") is skipped instead of failing the check.
// Throws when no candidate parses.
function extractJson(text: string): unknown {
  const stripped = text.replace(/```(?:json)?\n?/gi, '').replace(/```/g, '');
  let sawCandidate = false;

  for (let start = 0; start < stripped.length; start++) {
    const ch = stripped[start];
    if (ch !== '{' && ch !== '[') continue;
    sawCandidate = true;

    const end = findBalancedEnd(stripped, start);
    if (end === -1) continue;
    try {
      return JSON.parse(stripped.slice(start, end + 1));
    } catch {
      // Balanced but not valid JSON (prose in brackets); try the next candidate.
    }
  }
  throw new Error(sawCandidate ? 'unbalanced or invalid JSON' : 'no JSON found');
}

// Applies `check` to a model's raw `output` and returns pass/fail.
// `json_equal` never throws: unparseable output is a failed check. A `regex`
// check with an invalid pattern or flags throws from the RegExp constructor,
// which surfaces dataset authoring mistakes instead of hiding them.
export function runDeciderCheck(check: DeciderCheck, output: string): boolean {
  switch (check.kind) {
    case 'exact':
      return normalizeAnswer(output) === normalizeAnswer(check.value);
    case 'contains_all': {
      // Normalize the output once rather than once per expected value.
      const haystack = normalizeAnswer(output);
      return check.values.every(v => haystack.includes(normalizeAnswer(v)));
    }
    case 'regex':
      return new RegExp(check.pattern, check.flags).test(output);
    case 'json_equal': {
      try {
        // JSON.stringify comparison: object key ORDER is significant.
        return JSON.stringify(extractJson(output)) === JSON.stringify(check.value);
      } catch {
        return false;
      }
    }
    default: {
      // Compile-time exhaustiveness guard for future check kinds.
      const unreachable: never = check;
      throw new Error(`unknown decider check: ${JSON.stringify(unreachable)}`);
    }
  }
}
Answer with only what is asked, no explanations.'; -const SYS_SYS = 'You are a precise systems engineer. Answer with only what is asked, no explanations.'; +const CODE_SYS = + 'You are a precise coding assistant. Answer with only what is asked, no explanations.'; +const SYS_SYS = + 'You are a precise systems engineer. Answer with only what is asked, no explanations.'; // Golden answers below were each worked through by hand. Every case has a // single unambiguous, mechanically-checkable answer. Checks tolerate @@ -105,7 +107,8 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ id: 'low-planning-http-created', tier: 'low', taskType: 'planning_design', - systemPrompt: 'You are a precise web API expert. Answer with only what is asked, no explanations.', + systemPrompt: + 'You are a precise web API expert. Answer with only what is asked, no explanations.', userPrompt: 'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.', maxTokens: 512, @@ -129,7 +132,7 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ taskType: 'debugging', systemPrompt: CODE_SYS, userPrompt: - 'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": ""}.\n\n1: function bsearch(a, t) {\n2: let lo = 0, hi = a.length;\n3: while (lo < hi) {\n4: const mid = (lo + hi) >> 1;\n5: if (a[mid] === t) return mid;\n6: if (a[mid] < t) lo = mid;\n7: else hi = mid;\n8: }\n9: return -1;\n10: }', + 'This binary search has a bug. 
Reply with JSON {"line": <1-based line number of the buggy line>, "fix": ""}.\n\n1: function bsearch(a, t) {\n2: let lo = 0, hi = a.length;\n3: while (lo < hi) {\n4: const mid = (lo + hi) >> 1;\n5: if (a[mid] === t) return mid;\n6: if (a[mid] < t) lo = mid;\n7: else hi = mid;\n8: }\n9: return -1;\n10: }', maxTokens: 2048, check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } }, }, From 5ce86212b37463d0a39cb64525000c31c74fa937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:40:49 +0200 Subject: [PATCH 11/73] feat(auto-routing-benchmark): queue-driven benchmark runs with aggregation and table publish --- .../auto-routing-benchmark/src/config.test.ts | 41 +++ services/auto-routing-benchmark/src/config.ts | 52 ++++ services/auto-routing-benchmark/src/index.ts | 20 +- .../src/routing-table-builder.test.ts | 227 ++++++++++++++ .../src/routing-table-builder.ts | 52 ++++ .../auto-routing-benchmark/src/run.test.ts | 191 ++++++++++++ services/auto-routing-benchmark/src/run.ts | 283 ++++++++++++++++++ 7 files changed, 859 insertions(+), 7 deletions(-) create mode 100644 services/auto-routing-benchmark/src/config.test.ts create mode 100644 services/auto-routing-benchmark/src/config.ts create mode 100644 services/auto-routing-benchmark/src/routing-table-builder.test.ts create mode 100644 services/auto-routing-benchmark/src/routing-table-builder.ts create mode 100644 services/auto-routing-benchmark/src/run.test.ts create mode 100644 services/auto-routing-benchmark/src/run.ts diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts new file mode 100644 index 0000000000..32c04e3a86 --- /dev/null +++ b/services/auto-routing-benchmark/src/config.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, it } from 'vitest'; +import { DEFAULT_BENCHMARK_CONFIG, parseConfigJson } from './config'; + +describe('parseConfigJson', () => { + it('returns 
import { BenchmarkConfigSchema, type BenchmarkConfig } from '@kilocode/auto-routing-contracts';
import { getConfigRow, saveConfigRow } from './db';

// Configuration used whenever no stored config exists or the stored one cannot
// be parsed/validated (see parseConfigJson).
export const DEFAULT_BENCHMARK_CONFIG: BenchmarkConfig = {
  classifierModels: [
    'google/gemini-2.5-flash-lite',
    'google/gemini-2.5-flash',
    'openai/gpt-5-mini',
    'qwen/qwen3.7-plus',
  ],
  deciderModels: [
    { id: 'google/gemini-2.5-flash-lite', supportedApiKinds: ['chat_completions'] },
    { id: 'google/gemini-2.5-flash', supportedApiKinds: ['chat_completions'] },
    { id: 'qwen/qwen3.7-plus', supportedApiKinds: ['chat_completions'] },
    { id: 'openai/gpt-5.5', supportedApiKinds: ['chat_completions', 'responses'] },
    {
      id: 'anthropic/claude-sonnet-4.6',
      supportedApiKinds: ['chat_completions', 'messages', 'responses'],
    },
  ],
  minAccuracy: 0.7,
  maxConcurrency: 4,
  updatedAt: null,
  updatedBy: null,
};

// Pure so the fallback path is unit-testable without D1.
//
// Returns DEFAULT_BENCHMARK_CONFIG when `raw` is null, is not valid JSON, or
// does not satisfy BenchmarkConfigSchema; otherwise returns the validated data.
// Never throws.
export function parseConfigJson(raw: string | null): BenchmarkConfig {
  if (raw === null) return DEFAULT_BENCHMARK_CONFIG;
  try {
    const parsed = BenchmarkConfigSchema.safeParse(JSON.parse(raw));
    return parsed.success ? parsed.data : DEFAULT_BENCHMARK_CONFIG;
  } catch {
    // JSON.parse rejected the stored string; fall back rather than fail the run.
    return DEFAULT_BENCHMARK_CONFIG;
  }
}

// Loads the stored benchmark config, falling back to the defaults when no row
// exists or the stored JSON is unusable.
export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig> {
  const row = await getConfigRow(db);
  return parseConfigJson(row?.config_json ?? null);
}

// Persists `config` stamped with the current time and `updatedBy`, and returns
// the stamped config that was written. Any updatedAt/updatedBy already present
// on `config` is overwritten.
export async function saveBenchmarkConfig(
  db: D1Database,
  config: BenchmarkConfig,
  updatedBy: string | null
): Promise<BenchmarkConfig> {
  const updatedAt = new Date().toISOString();
  const stamped: BenchmarkConfig = { ...config, updatedAt, updatedBy };
  await saveConfigRow(db, JSON.stringify(stamped), updatedAt, updatedBy);
  return stamped;
}
// Cron expression that selects the decider benchmark; any other cron trigger
// firing on this worker starts a classifier run (see `scheduled`).
const DECIDER_CRON = '10 5 * * 1';

export default {
  fetch: app.fetch,

  // Starts a benchmark run for the kind implied by the firing cron expression.
  // The run is handed to waitUntil so the handler can return immediately.
  async scheduled(controller: ScheduledController, env: Env, ctx: ExecutionContext): Promise<void> {
    const kind = controller.cron === DECIDER_CRON ? 'decider' : 'classifier';
    ctx.waitUntil(startRun(env, kind));
  },

  // Processes benchmark jobs one at a time, acknowledging each on success.
  // A failing job is logged and explicitly marked for retry instead of
  // throwing, so one bad message does not abort the loop and force every
  // later, unprocessed message in the batch onto the retry path with it.
  async queue(batch: MessageBatch<BenchmarkJobMessage>, env: Env): Promise<void> {
    for (const message of batch.messages) {
      try {
        await processJob(env, message.body);
        message.ack();
      } catch (error: unknown) {
        console.error('auto-routing-benchmark: job failed, scheduling retry', error);
        message.retry();
      }
    }
  },
};
summary('model/cheap', 'low', 0.9, 0.001), + summary('model/expensive', 'low', 0.95, 0.01), + summary('model/mid', 'low', 0.8, 0.005), + summary('model/cheap', 'medium', 0.75, 0.001), + summary('model/expensive', 'medium', 0.85, 0.01), + summary('model/mid', 'medium', 0.72, 0.005), + summary('model/cheap', 'high', 0.6, 0.001), + summary('model/expensive', 'high', 0.9, 0.01), + summary('model/mid', 'high', 0.75, 0.005), +]; + +describe('buildRoutingTable', () => { + it('cheapest above-threshold model comes first per tier', () => { + const table = buildRoutingTable({ + runId: 'test-run-1', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries: ALL_TIERS_SUMMARIES, + }); + + // low tier: cheap (0.001) and mid (0.005) and expensive (0.01) all meet threshold (0.7) + // cheapest first + expect(table.tiers.low[0].model).toBe('model/cheap'); + expect(table.tiers.low[1].model).toBe('model/mid'); + expect(table.tiers.low[2].model).toBe('model/expensive'); + + // medium tier: all meet threshold, cheapest first + expect(table.tiers.medium[0].model).toBe('model/cheap'); + expect(table.tiers.medium[1].model).toBe('model/mid'); + expect(table.tiers.medium[2].model).toBe('model/expensive'); + + // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not + // meeting threshold first, then by cost; cheap last (below threshold) + expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper + expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive + expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold + }); + + it('marks meetsThreshold correctly', () => { + const table = buildRoutingTable({ + runId: 'test-run-2', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries: ALL_TIERS_SUMMARIES, + }); + + for (const candidate of table.tiers.low) { + expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7); + } + }); + + it('excludes a 
model absent from a tier summaries', () => { + // model/cheap has no 'high' summary entry + const summaries: BenchmarkModelSummary[] = [ + summary('model/cheap', 'low', 0.9), + summary('model/cheap', 'medium', 0.8), + // no 'high' entry for model/cheap + summary('model/expensive', 'low', 0.9), + summary('model/expensive', 'medium', 0.8), + summary('model/expensive', 'high', 0.9), + summary('model/mid', 'low', 0.8), + summary('model/mid', 'medium', 0.75), + summary('model/mid', 'high', 0.75), + ]; + + const table = buildRoutingTable({ + runId: 'test-run-3', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries, + }); + + const highModels = table.tiers.high.map(c => c.model); + expect(highModels).not.toContain('model/cheap'); + expect(highModels).toContain('model/expensive'); + expect(highModels).toContain('model/mid'); + }); + + it('carries supportedApiKinds from config', () => { + const table = buildRoutingTable({ + runId: 'test-run-4', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries: ALL_TIERS_SUMMARIES, + }); + + const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive'); + expect(expensiveInLow?.supportedApiKinds).toEqual(['chat_completions', 'responses']); + + const midInLow = table.tiers.low.find(c => c.model === 'model/mid'); + expect(midInLow?.supportedApiKinds).toEqual(['chat_completions', 'messages']); + }); + + it('defaults supportedApiKinds to chat_completions when model missing from config', () => { + const summaries: BenchmarkModelSummary[] = [ + summary('model/unknown', 'low', 0.9), + summary('model/cheap', 'low', 0.8), + summary('model/cheap', 'medium', 0.8), + summary('model/cheap', 'high', 0.8), + summary('model/unknown', 'medium', 0.9), + summary('model/unknown', 'high', 0.9), + ]; + + // Add a model that isn't in deciderModels + const config = { ...BASE_CONFIG }; + + const table = buildRoutingTable({ + runId: 'test-run-5', + generatedAt: '2026-01-01T00:00:00.000Z', + 
config, + summaries, + }); + + const unknown = table.tiers.low.find(c => c.model === 'model/unknown'); + expect(unknown?.supportedApiKinds).toEqual(['chat_completions']); + }); + + it('throws when a tier has no candidates', () => { + // Only low and medium summaries — high is missing entirely + const summaries: BenchmarkModelSummary[] = [ + summary('model/cheap', 'low', 0.9), + summary('model/expensive', 'low', 0.9), + summary('model/mid', 'low', 0.9), + summary('model/cheap', 'medium', 0.9), + summary('model/expensive', 'medium', 0.9), + summary('model/mid', 'medium', 0.9), + ]; + + expect(() => + buildRoutingTable({ + runId: 'test-run-6', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries, + }) + ).toThrow(); + }); + + it('throws when a tier has only zero-case entries', () => { + const summaries: BenchmarkModelSummary[] = [ + ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'), + // high tier entries with 0 cases — should be excluded + { ...summary('model/cheap', 'high', 0.9), cases: 0 }, + { ...summary('model/expensive', 'high', 0.9), cases: 0 }, + { ...summary('model/mid', 'high', 0.9), cases: 0 }, + ]; + + expect(() => + buildRoutingTable({ + runId: 'test-run-7', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries, + }) + ).toThrow(); + }); + + it('ignores classifier-style * tier summaries', () => { + const summaries: BenchmarkModelSummary[] = [ + ...ALL_TIERS_SUMMARIES, + // classifier summaries with '*' tier — should be ignored + summary('model/cheap', '*', 0.95), + summary('model/expensive', '*', 0.95), + ]; + + // Should not throw and * tier entries should not affect output + const table = buildRoutingTable({ + runId: 'test-run-8', + generatedAt: '2026-01-01T00:00:00.000Z', + config: BASE_CONFIG, + summaries, + }); + + expect(table.tiers.low.length).toBe(3); + expect(table.tiers.medium.length).toBe(3); + }); + + it('sets version and generatedAt from params', () => { + const table = buildRoutingTable({ 
import {
  rankCandidates,
  RoutingTableSchema,
  type BenchmarkConfig,
  type BenchmarkModelSummary,
  type DifficultyTier,
  type RoutingTable,
} from '@kilocode/auto-routing-contracts';

// Builds the routing table from per-(model, tier) decider summaries. Models
// with zero graded cases in a tier are excluded from that tier. Throws when
// any tier ends up empty so the caller keeps the previous published table.
//
// - runId becomes the table `version`; generatedAt is copied through.
// - config supplies minAccuracy and each model's supportedApiKinds; a model
//   missing from config.deciderModels defaults to ['chat_completions'].
// - summaries whose tier is not low/medium/high (e.g. classifier '*' rows)
//   are ignored because no tier filter selects them.
export function buildRoutingTable(params: {
  runId: string;
  generatedAt: string;
  config: BenchmarkConfig;
  summaries: BenchmarkModelSummary[];
}): RoutingTable {
  const { runId, generatedAt, config, summaries } = params;
  const apiKindsByModel = new Map(config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const));

  const tierCandidates = (t: DifficultyTier) => {
    const graded = summaries.filter(s => s.tier === t && s.cases > 0);

    // A summary has no avgCostUsd when none of its rows reported a cost.
    // Treating that as 0 would make the model look free. Fall back to the
    // highest known cost in the tier instead, so an unpriced model is never
    // preferred on price over a priced one; 0 only when no model is priced.
    const knownCosts = graded.flatMap(s => (s.avgCostUsd == null ? [] : [s.avgCostUsd]));
    const unknownCostFallback = knownCosts.length > 0 ? Math.max(...knownCosts) : 0;

    return rankCandidates(
      graded.map(s => ({
        model: s.model,
        accuracy: s.accuracy,
        avgCostUsd: s.avgCostUsd ?? unknownCostFallback,
        // Spread into a mutable array so tsgo is happy with the readonly type.
        supportedApiKinds: [...(apiKindsByModel.get(s.model) ?? (['chat_completions'] as const))],
      })),
      config.minAccuracy
    );
  };

  const table: RoutingTable = {
    version: runId,
    generatedAt,
    minAccuracy: config.minAccuracy,
    source: 'benchmark',
    tiers: {
      low: tierCandidates('low'),
      medium: tierCandidates('medium'),
      high: tierCandidates('high'),
    },
  };

  // RoutingTableSchema enforces .min(1) on each tier array; throws ZodError
  // when a tier is empty — caller logs and skips publish, keeping the previous
  // live table intact.
  return RoutingTableSchema.parse(table);
}
'classifier'); + // (1.0 + 0.5 + 0.0) / 3 = 0.5 + expect(s.accuracy).toBe(0.5); + }); + + it('computes avgCostUsd excluding null cost rows', () => { + const rows: CaseResultRow[] = [ + makeRow({ case_id: 'c1', cost_usd: 0.002 }), + makeRow({ case_id: 'c2', cost_usd: null }), + makeRow({ case_id: 'c3', cost_usd: 0.004 }), + ]; + + const [s] = summarize(rows, 'classifier'); + // (0.002 + 0.004) / 2 = 0.003 + expect(s.avgCostUsd).toBe(0.003); + }); + + it('returns null avgCostUsd when all cost_usd are null', () => { + const rows: CaseResultRow[] = [ + makeRow({ case_id: 'c1', cost_usd: null }), + makeRow({ case_id: 'c2', cost_usd: null }), + ]; + + const [s] = summarize(rows, 'classifier'); + expect(s.avgCostUsd).toBeNull(); + }); + + it('computes p50LatencyMs', () => { + const rows: CaseResultRow[] = [ + makeRow({ case_id: 'c1', latency_ms: 100 }), + makeRow({ case_id: 'c2', latency_ms: 300 }), + makeRow({ case_id: 'c3', latency_ms: 200 }), + ]; + + const [s] = summarize(rows, 'classifier'); + // sorted: [100, 200, 300], floor(3/2) = 1 → 200 + expect(s.p50LatencyMs).toBe(200); + }); + + it('counts errors correctly', () => { + const rows: CaseResultRow[] = [ + makeRow({ case_id: 'c1', score: 0, error: 'timeout' }), + makeRow({ case_id: 'c2', score: 1, error: null }), + makeRow({ case_id: 'c3', score: 0, error: 'rate_limit' }), + ]; + + const [s] = summarize(rows, 'classifier'); + expect(s.errors).toBe(2); + // error rows have score 0 which drags accuracy down + expect(s.accuracy).toBe(Number((1 / 3).toFixed(4))); + }); +}); + +describe('summarize — decider kind', () => { + it('groups by tier', () => { + const rows: CaseResultRow[] = [ + makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }), + makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }), + makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }), + makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }), + ]; + + const summaries = 
summarize(rows, 'decider'); + expect(summaries).toHaveLength(3); + + const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low'); + expect(aLow?.cases).toBe(2); + expect(aLow?.accuracy).toBe(0.5); + + const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium'); + expect(aMed?.cases).toBe(1); + expect(aMed?.accuracy).toBe(1); + + const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low'); + expect(bLow?.cases).toBe(1); + }); + + it('uses * fallback when tier is null', () => { + const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })]; + const [s] = summarize(rows, 'decider'); + expect(s.tier).toBe('*'); + }); + + it('computes avgLatencyMs as rounded mean', () => { + const rows: CaseResultRow[] = [ + makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }), + makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }), + ]; + + const [s] = summarize(rows, 'decider'); + expect(s.avgLatencyMs).toBe(Math.round((100 + 301) / 2)); + }); + + it('handles single-element groups for p50', () => { + const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })]; + const [s] = summarize(rows, 'decider'); + expect(s.p50LatencyMs).toBe(500); + }); +}); + +describe('runCasesWithConcurrency', () => { + it('processes all items exactly once', async () => { + const processed: number[] = []; + await runCasesWithConcurrency([1, 2, 3, 4, 5], 2, async item => { + processed.push(item); + }); + expect(processed.sort((a, b) => a - b)).toEqual([1, 2, 3, 4, 5]); + }); + + it('processes empty array without error', async () => { + await expect(runCasesWithConcurrency([], 4, async () => {})).resolves.toBeUndefined(); + }); + + it('respects the concurrency cap', async () => { + let inFlight = 0; + let maxInFlight = 0; + const concurrency = 3; + + await runCasesWithConcurrency( + Array.from({ length: 10 }, (_, i) => i), + concurrency, + async () => { + inFlight++; + maxInFlight = Math.max(maxInFlight, inFlight); + // Yield to 
allow other workers to start + await new Promise(resolve => setTimeout(resolve, 0)); + inFlight--; + } + ); + + expect(maxInFlight).toBeLessThanOrEqual(concurrency); + expect(maxInFlight).toBeGreaterThan(0); + }); + + it('works when concurrency exceeds item count', async () => { + const processed: number[] = []; + await runCasesWithConcurrency([1, 2], 10, async item => { + processed.push(item); + }); + expect(processed.sort((a, b) => a - b)).toEqual([1, 2]); + }); + + it('propagates errors from the callback', async () => { + await expect( + runCasesWithConcurrency([1], 1, async () => { + throw new Error('test error'); + }) + ).rejects.toThrow('test error'); + }); +}); diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts new file mode 100644 index 0000000000..519944d3a9 --- /dev/null +++ b/services/auto-routing-benchmark/src/run.ts @@ -0,0 +1,283 @@ +import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier'; +import { + BenchmarkConfigSchema, + ROUTING_TABLE_KV_KEY, + type BenchmarkConfig, + type BenchmarkKind, + type BenchmarkModelSummary, +} from '@kilocode/auto-routing-contracts'; +import { formatError } from '@kilocode/worker-utils'; +import * as z from 'zod'; +import { getBenchmarkConfig } from './config'; +import { CLASSIFIER_CASES } from './datasets/classifier-cases'; +import { DECIDER_CASES } from './datasets/decider-cases'; +import { + countCaseResults, + getCaseResults, + getRun, + insertRun, + markRunCompleted, + markStaleRunsFailed, + replaceModelSummaries, + saveRoutingTable, + upsertCaseResult, + type CaseResultRow, +} from './db'; +import { gradeClassifierOutput, runDeciderCheck } from './grading'; +import { createOpenRouterClient } from './openrouter'; +import { buildRoutingTable } from './routing-table-builder'; + +export type BenchmarkJobMessage = { runId: string; kind: BenchmarkKind; model: string }; + +export const BenchmarkJobMessageSchema = z.object({ + runId: 
/**
 * Starts a benchmark run of the given kind and enqueues one job per model.
 *
 * Steps, in order:
 *  1. Mark runs older than `STALE_RUN_MAX_AGE_MS` as failed (stale-run sweep).
 *  2. Load the current benchmark config and pick the model list for `kind`.
 *  3. Insert the run row, storing the config as a JSON snapshot.
 *  4. Send one queue message per model and log `benchmark_run_started`.
 *
 * @param env - Worker bindings; uses `BENCH_DB` and `BENCH_QUEUE`.
 * @param kind - Which benchmark to run (`'classifier'` or a decider run).
 * @returns The generated run id and the number of models enqueued.
 */
export async function startRun(
  env: Env,
  kind: BenchmarkKind
): Promise<{ runId: string; enqueuedModels: number }> {
  // Stale-run sweeper: a run still 'running' past the cutoff is dead (queue
  // retries exhausted); fail it so the admin panel shows the truth.
  const staleBefore = new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString();
  await markStaleRunsFailed(env.BENCH_DB, staleBefore);

  const config = await getBenchmarkConfig(env.BENCH_DB);
  const models =
    kind !== 'classifier' ? config.deciderModels.map(entry => entry.id) : config.classifierModels;

  // Run id = kind + ISO timestamp with ':' and '.' replaced by '-'.
  const stamp = new Date().toISOString().replace(/[:.]/g, '-');
  const runId = `${kind}-${stamp}`;

  const startedAt = new Date().toISOString();
  const configJson = JSON.stringify(config);
  await insertRun(env.BENCH_DB, { id: runId, kind, startedAt, configJson });

  const jobs = models.map(model => {
    const body = { runId, kind, model } satisfies BenchmarkJobMessage;
    return { body };
  });
  await env.BENCH_QUEUE.sendBatch(jobs);

  console.log(JSON.stringify({ event: 'benchmark_run_started', runId, kind, models }));
  return { runId, enqueuedModels: models.length };
}
+ const client = await createOpenRouterClient(env); + + if (message.kind === 'classifier') { + await runCasesWithConcurrency(CLASSIFIER_CASES, config.maxConcurrency, async benchCase => { + const startedAt = performance.now(); + try { + const result = await classifyWithOpenRouter(client, benchCase.input, message.model); + const score = result.fallback ? 0 : gradeClassifierOutput(benchCase.expected, result.classification); + await upsertCaseResult(env.BENCH_DB, { + run_id: message.runId, + model: message.model, + case_id: benchCase.id, + tier: null, + score, + latency_ms: Math.round(performance.now() - startedAt), + cost_usd: result.cost, + detail_json: JSON.stringify({ + classification: result.fallback ? null : result.classification, + fallback: result.fallback?.reason ?? null, + retried: result.retried ?? false, + }), + error: null, + }); + } catch (error) { + await upsertCaseResult(env.BENCH_DB, failedRow(message, benchCase.id, null, startedAt, error)); + } + }); + } else { + // Determinism note: temperature 0, fixed maxTokens, pinned prompts, mechanical checks. + // Provider-side nondeterminism can't be fully eliminated, which is why grading is + // binary on a single canonical answer. + await runCasesWithConcurrency(DECIDER_CASES, config.maxConcurrency, async benchCase => { + const startedAt = performance.now(); + try { + const result = await client.chat.send({ + chatRequest: { + model: message.model, + messages: [ + { role: 'system', content: benchCase.systemPrompt }, + { role: 'user', content: benchCase.userPrompt }, + ], + stream: false, + temperature: 0, + maxTokens: benchCase.maxTokens, + }, + }); + const content: unknown = result.choices[0]?.message.content; + const text = typeof content === 'string' ? content : ''; + const passed = text.length > 0 && runDeciderCheck(benchCase.check, text); + await upsertCaseResult(env.BENCH_DB, { + run_id: message.runId, + model: message.model, + case_id: benchCase.id, + tier: benchCase.tier, + score: passed ? 
/**
 * Runs `fn` once for every element of `cases`, with at most `concurrency`
 * invocations in flight at any time.
 *
 * A fixed pool of workers pulls from one shared iterator over a snapshot of
 * `cases`, so each element is handed to exactly one worker and later mutation
 * of the caller's array has no effect.
 *
 * Worker-count rules:
 *  - never more workers than there are cases;
 *  - fractional values are truncated;
 *  - zero, negative or NaN `concurrency` falls back to a single worker instead
 *    of silently processing nothing.
 *
 * Elements that are themselves `undefined` are processed like any other value.
 *
 * @param cases - Items to process; may be empty.
 * @param concurrency - Requested maximum number of concurrent `fn` calls.
 * @param fn - Async callback invoked once per item.
 * @returns Resolves when every item has been processed.
 * @throws Rejects with the first error thrown by `fn`. Workers that are still
 *   running are not cancelled and keep draining remaining items.
 */
export async function runCasesWithConcurrency<T>(
  cases: readonly T[],
  concurrency: number,
  fn: (item: T) => Promise<void>
): Promise<void> {
  if (cases.length === 0) return;

  // `|| 0` maps NaN to 0; Math.max then guarantees at least one worker.
  const requested = Math.floor(concurrency) || 0;
  const workerCount = Math.max(1, Math.min(requested, cases.length));

  // One iterator shared by all workers: each `next()` hands out a distinct item.
  const pending = cases.slice().values();
  const worker = async (): Promise<void> => {
    for (const item of pending) {
      await fn(item);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, worker));
}
/**
 * Collapses per-case result rows into one summary per (model, tier) group.
 *
 * Grouping: classifier rows always use the tier `'*'`; decider rows use
 * `row.tier`, falling back to `'*'` when it is null. Groups are returned in
 * the order their first row appears.
 *
 * Per group:
 *  - `accuracy`     mean score, rounded to 4 decimals
 *  - `avgCostUsd`   mean of the non-null costs, rounded to 8 decimals, or null
 *                   when no row has a cost
 *  - `avgLatencyMs` mean latency, rounded to an integer
 *  - `p50LatencyMs` element at index floor(n / 2) of the ascending latencies
 *  - `cases`        number of rows
 *  - `errors`       number of rows whose `error` is not null
 *
 * @param rows - Case results for a run; an empty array yields an empty result.
 * @param kind - Benchmark kind, which decides how the tier key is derived.
 * @returns One summary per group.
 */
export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
  // Bucket rows under a "model NUL tier" key; Map keeps first-seen order.
  const grouped = new Map<string, CaseResultRow[]>();
  rows.forEach(row => {
    const tierKey = kind === 'classifier' ? '*' : (row.tier ?? '*');
    const groupKey = `${row.model}\0${tierKey}`;
    const bucket = grouped.get(groupKey);
    if (bucket) {
      bucket.push(row);
    } else {
      grouped.set(groupKey, [row]);
    }
  });

  const summaries: BenchmarkModelSummary[] = [];
  for (const [groupKey, members] of grouped) {
    const parts = groupKey.split('\0');

    // Sort a copy so the rows themselves are left untouched.
    const ascendingLatencies = members.map(member => member.latency_ms);
    ascendingLatencies.sort((x, y) => x - y);

    let scoreTotal = 0;
    let latencyTotal = 0;
    let errorCount = 0;
    for (const member of members) {
      scoreTotal += member.score;
      latencyTotal += member.latency_ms;
      if (member.error !== null) errorCount += 1;
    }

    // Rows without a cost are excluded from the cost average entirely.
    const priced = members.filter(member => member.cost_usd !== null);
    let avgCostUsd: number | null = null;
    if (priced.length) {
      let costTotal = 0;
      for (const member of priced) {
        costTotal += member.cost_usd ?? 0;
      }
      avgCostUsd = Number((costTotal / priced.length).toFixed(8));
    }

    summaries.push({
      model: parts[0],
      tier: parts[1] as BenchmarkModelSummary['tier'],
      accuracy: Number((scoreTotal / members.length).toFixed(4)),
      avgCostUsd,
      avgLatencyMs: Math.round(latencyTotal / members.length),
      p50LatencyMs: ascendingLatencies[Math.floor(ascendingLatencies.length / 2)] ?? null,
      cases: members.length,
      errors: errorCount,
    });
  }
  return summaries;
}
/**
 * Builds a minimal chainable D1 stub for the admin endpoint tests.
 *
 * Shape: `prepare(sql)` returns a statement whose `bind(...)` returns the same
 * statement, and whose `first` / `all` / `run` are the module-level mocks
 * `dbFirst` / `dbAll` / `dbRun`. Calls to `prepare` and `bind` are recorded on
 * `dbPrepare` and `dbBind` so tests can inspect the SQL and bound values.
 * `batch` resolves to an empty array.
 */
function makeD1Stub() {
  const statement = {
    first: dbFirst,
    all: dbAll,
    run: dbRun,
    // Record the bound values, then stay chainable.
    bind: (...bound: unknown[]) => {
      dbBind(...bound);
      return statement;
    },
  };
  dbPrepare.mockReturnValue(statement);

  const database = {
    // Record the SQL text so tests can assert which table was touched.
    prepare: (sql: string) => {
      dbPrepare(sql);
      return statement;
    },
    batch: vi.fn().mockResolvedValue([]),
  };
  return database as unknown as D1Database;
}
...extraHeaders, + }, + body: JSON.stringify(body), + }); +} + +// --------------------------------------------------------------------------- +// Setup +// --------------------------------------------------------------------------- + +beforeEach(() => { + tokenGet.mockResolvedValue('bench-token'); + dbFirst.mockResolvedValue(null); + dbAll.mockResolvedValue({ results: [] }); + dbRun.mockResolvedValue({ meta: { changes: 0 } }); + queueSendBatch.mockResolvedValue(undefined); + + // Rebuild the D1 stub each test so prepare/bind point to fresh mocks. + (env as unknown as Record).BENCH_DB = makeD1Stub(); +}); + +// --------------------------------------------------------------------------- +// Auth guard +// --------------------------------------------------------------------------- + +describe('auth middleware', () => { + it('rejects requests without a bearer token', async () => { + const res = await request('/admin/config'); + expect(res.status).toBe(401); + await expect(res.json()).resolves.toEqual({ error: 'Unauthorized' }); + }); + + it('rejects requests with the wrong bearer token', async () => { + const res = await request('/admin/config', { + headers: { authorization: 'Bearer wrong-token' }, + }); + expect(res.status).toBe(401); + }); +}); + +// --------------------------------------------------------------------------- +// GET /admin/config +// --------------------------------------------------------------------------- + +describe('GET /admin/config', () => { + it('returns defaults when the DB row is absent', async () => { + // dbFirst already returns null by default + const res = await authedGet('/admin/config'); + expect(res.status).toBe(200); + await expect(res.json()).resolves.toEqual({ + config: DEFAULT_BENCHMARK_CONFIG, + defaults: DEFAULT_BENCHMARK_CONFIG, + }); + }); + + it('returns the stored config when a DB row exists', async () => { + const storedConfig = { + ...DEFAULT_BENCHMARK_CONFIG, + minAccuracy: 0.9, + updatedAt: '2026-06-01T00:00:00.000Z', 
+ updatedBy: 'admin@example.com', + }; + dbFirst.mockResolvedValueOnce({ config_json: JSON.stringify(storedConfig) }); + + const res = await authedGet('/admin/config'); + expect(res.status).toBe(200); + const body = (await res.json()) as { config: typeof storedConfig }; + expect(body.config.minAccuracy).toBe(0.9); + expect(body.config.updatedBy).toBe('admin@example.com'); + }); +}); + +// --------------------------------------------------------------------------- +// PUT /admin/config +// --------------------------------------------------------------------------- + +describe('PUT /admin/config', () => { + it('returns 400 for a non-JSON body', async () => { + const res = await request('/admin/config', { + method: 'PUT', + headers: { + authorization: 'Bearer bench-token', + 'content-type': 'application/json', + }, + body: 'not json {{{', + }); + expect(res.status).toBe(400); + await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' }); + }); + + it('returns 400 for a schema-invalid config', async () => { + const res = await authedPut('/admin/config', { classifierModels: 'oops' }); + expect(res.status).toBe(400); + await expect(res.json()).resolves.toEqual({ error: 'Invalid benchmark config' }); + expect(dbRun).not.toHaveBeenCalled(); + }); + + it('persists a valid config and returns it with defaults', async () => { + const validConfig = { + ...DEFAULT_BENCHMARK_CONFIG, + minAccuracy: 0.85, + updatedAt: null, + updatedBy: null, + }; + + const res = await authedPut('/admin/config', validConfig, { + 'x-updated-by': 'igor@kilocode.ai', + }); + + expect(res.status).toBe(200); + const body = (await res.json()) as { + config: { minAccuracy: number; updatedBy: string | null; updatedAt: string | null }; + defaults: typeof DEFAULT_BENCHMARK_CONFIG; + }; + // Returned config carries the stamped fields. 
+ expect(body.config.minAccuracy).toBe(0.85); + expect(body.config.updatedBy).toBe('igor@kilocode.ai'); + expect(typeof body.config.updatedAt).toBe('string'); + expect(body.defaults).toEqual(DEFAULT_BENCHMARK_CONFIG); + + // The INSERT was actually executed (dbRun was called on the saveConfigRow stmt). + expect(dbRun).toHaveBeenCalled(); + // The SQL should be an INSERT OR REPLACE into benchmark_config. + const insertCall = dbPrepare.mock.calls.find( + (args: unknown[]) => typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config') + ); + expect(insertCall).toBeDefined(); + // The updatedBy value was forwarded via bind. + const bindCalls: unknown[][] = dbBind.mock.calls; + const foundUpdatedBy = bindCalls.some(args => args.includes('igor@kilocode.ai')); + expect(foundUpdatedBy).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// GET /admin/runs +// --------------------------------------------------------------------------- + +describe('GET /admin/runs', () => { + it('returns an empty runs array when the table is empty', async () => { + // dbAll returns { results: [] } by default + const res = await authedGet('/admin/runs'); + expect(res.status).toBe(200); + await expect(res.json()).resolves.toEqual({ runs: [] }); + }); +}); + +// --------------------------------------------------------------------------- +// POST /admin/runs +// --------------------------------------------------------------------------- + +describe('POST /admin/runs', () => { + it('returns 400 for a non-JSON body', async () => { + const res = await request('/admin/runs', { + method: 'POST', + headers: { + authorization: 'Bearer bench-token', + 'content-type': 'application/json', + }, + body: '<<<', + }); + expect(res.status).toBe(400); + await expect(res.json()).resolves.toEqual({ error: 'Invalid JSON body' }); + }); + + it('returns 400 for an invalid kind', async () => { + const res = await authedPost('/admin/runs', { 
kind: 'turbo' }); + expect(res.status).toBe(400); + await expect(res.json()).resolves.toEqual({ error: 'Invalid run request' }); + expect(queueSendBatch).not.toHaveBeenCalled(); + }); + + it('starts a classifier run and returns runId + enqueuedModels', async () => { + // markStaleRunsFailed → run (UPDATE), getBenchmarkConfig → first (null → defaults), + // insertRun → run, then sendBatch. + const res = await authedPost('/admin/runs', { kind: 'classifier' }); + expect(res.status).toBe(200); + const body = (await res.json()) as { runId: string; enqueuedModels: number }; + expect(body.runId).toMatch(/^classifier-/); + expect(body.enqueuedModels).toBe(DEFAULT_BENCHMARK_CONFIG.classifierModels.length); + expect(queueSendBatch).toHaveBeenCalledOnce(); + }); +}); + +// --------------------------------------------------------------------------- +// GET /admin/routing-table +// --------------------------------------------------------------------------- + +describe('GET /admin/routing-table', () => { + it('returns {table: null, publishedAt: null} when no rows exist', async () => { + // dbFirst already returns null by default + const res = await authedGet('/admin/routing-table'); + expect(res.status).toBe(200); + await expect(res.json()).resolves.toEqual({ table: null, publishedAt: null }); + }); + + it('returns the parsed table and publishedAt when a row exists', async () => { + const tableData = { version: 'test-v1', tiers: {} }; + dbFirst.mockResolvedValueOnce({ + run_id: 'run-123', + published_at: '2026-06-01T10:00:00.000Z', + table_json: JSON.stringify(tableData), + }); + + const res = await authedGet('/admin/routing-table'); + expect(res.status).toBe(200); + await expect(res.json()).resolves.toEqual({ + table: tableData, + publishedAt: '2026-06-01T10:00:00.000Z', + }); + }); +}); diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts new file mode 100644 index 0000000000..5bb6649a69 --- /dev/null +++ 
/**
 * `GET /admin/runs` — responds with `{ runs }`, the benchmark runs returned by
 * `listRuns`.
 *
 * The optional `limit` query parameter is normalised to an integer in
 * [1, 100] before it reaches the database layer:
 *  - missing, non-numeric, zero or negative values fall back to 20;
 *  - fractional values are truncated;
 *  - values above 100 are capped at 100.
 *
 * A negative value used to pass through unchanged, because `Math.min` only
 * enforced the upper bound.
 * NOTE(review): `listRuns` is not visible here; if it binds the value to a
 * SQLite `LIMIT`, a negative limit means "no upper bound" — confirm.
 */
export const listRunsHandler: Handler<HonoEnv> = async c => {
  const defaultLimit = 20;
  const maxLimit = 100;

  const requested = Number(c.req.query('limit') ?? defaultLimit);
  // `requested >= 1` is false for NaN, 0 and negatives, so all of them take the default.
  const limit = requested >= 1 ? Math.min(Math.floor(requested), maxLimit) : defaultLimit;

  const runs: BenchmarkRun[] = await listRuns(c.env.BENCH_DB, limit);
  return c.json({ runs });
};
(JSON.parse(latest.table_json) as unknown) : null, + publishedAt: latest?.published_at ?? null, + }); +}; diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts index 542e9e3ce6..e78437b9dd 100644 --- a/services/auto-routing-benchmark/src/index.ts +++ b/services/auto-routing-benchmark/src/index.ts @@ -1,12 +1,26 @@ import { Hono } from 'hono'; import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils'; import { authMiddleware } from './auth'; +import { + getConfigHandler, + putConfigHandler, + listRunsHandler, + startRunHandler, + getRoutingTableHandler, +} from './admin'; import type { HonoEnv } from './hono-env'; import { processJob, startRun, type BenchmarkJobMessage } from './run'; export const app = new Hono(); app.use('*', authMiddleware); app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' })); + +app.get('/admin/config', getConfigHandler); +app.put('/admin/config', putConfigHandler); +app.get('/admin/runs', listRunsHandler); +app.post('/admin/runs', startRunHandler); +app.get('/admin/routing-table', getRoutingTableHandler); + app.notFound(createNotFoundHandler()); app.onError(createErrorHandler()); diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts index 16bc21a9e8..71bfa772d3 100644 --- a/services/auto-routing-benchmark/src/routing-table-builder.ts +++ b/services/auto-routing-benchmark/src/routing-table-builder.ts @@ -17,7 +17,9 @@ export function buildRoutingTable(params: { summaries: BenchmarkModelSummary[]; }): RoutingTable { const { runId, generatedAt, config, summaries } = params; - const apiKindsByModel = new Map(config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const)); + const apiKindsByModel = new Map( + config.deciderModels.map(m => [m.id, m.supportedApiKinds] as const) + ); const tierCandidates = (t: DifficultyTier) => rankCandidates( diff --git 
a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts index 3797f74adb..1c9826c640 100644 --- a/services/auto-routing-benchmark/src/run.test.ts +++ b/services/auto-routing-benchmark/src/run.test.ts @@ -20,8 +20,22 @@ function makeRow(overrides: Partial = {}): CaseResultRow { describe('summarize — classifier kind', () => { it('groups all classifier rows under * tier', () => { const rows: CaseResultRow[] = [ - makeRow({ model: 'model/a', case_id: 'c1', tier: null, score: 1, latency_ms: 100, cost_usd: 0.001 }), - makeRow({ model: 'model/a', case_id: 'c2', tier: null, score: 0.5, latency_ms: 200, cost_usd: 0.002 }), + makeRow({ + model: 'model/a', + case_id: 'c1', + tier: null, + score: 1, + latency_ms: 100, + cost_usd: 0.001, + }), + makeRow({ + model: 'model/a', + case_id: 'c2', + tier: null, + score: 0.5, + latency_ms: 200, + cost_usd: 0.002, + }), ]; const summaries = summarize(rows, 'classifier'); diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts index 519944d3a9..4efd91e93f 100644 --- a/services/auto-routing-benchmark/src/run.ts +++ b/services/auto-routing-benchmark/src/run.ts @@ -43,10 +43,14 @@ export async function startRun( ): Promise<{ runId: string; enqueuedModels: number }> { // Stale-run sweeper: anything still 'running' after 6h is dead (queue // retries exhausted); fail it so the admin panel shows the truth. - await markStaleRunsFailed(env.BENCH_DB, new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString()); + await markStaleRunsFailed( + env.BENCH_DB, + new Date(Date.now() - STALE_RUN_MAX_AGE_MS).toISOString() + ); const config = await getBenchmarkConfig(env.BENCH_DB); - const models = kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id); + const models = + kind === 'classifier' ? 
config.classifierModels : config.deciderModels.map(m => m.id); const runId = `${kind}-${new Date().toISOString().replace(/[:.]/g, '-')}`; await insertRun(env.BENCH_DB, { id: runId, @@ -86,7 +90,9 @@ export async function processJob(env: Env, rawMessage: unknown): Promise { const startedAt = performance.now(); try { const result = await classifyWithOpenRouter(client, benchCase.input, message.model); - const score = result.fallback ? 0 : gradeClassifierOutput(benchCase.expected, result.classification); + const score = result.fallback + ? 0 + : gradeClassifierOutput(benchCase.expected, result.classification); await upsertCaseResult(env.BENCH_DB, { run_id: message.runId, model: message.model, @@ -103,7 +109,10 @@ export async function processJob(env: Env, rawMessage: unknown): Promise { error: null, }); } catch (error) { - await upsertCaseResult(env.BENCH_DB, failedRow(message, benchCase.id, null, startedAt, error)); + await upsertCaseResult( + env.BENCH_DB, + failedRow(message, benchCase.id, null, startedAt, error) + ); } }); } else { @@ -270,9 +279,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark tier: tier as BenchmarkModelSummary['tier'], accuracy: Number((group.reduce((a, r) => a + r.score, 0) / group.length).toFixed(4)), avgCostUsd: costs.length - ? Number( - (costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8) - ) + ? Number((costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8)) : null, avgLatencyMs: Math.round(group.reduce((a, r) => a + r.latency_ms, 0) / group.length), p50LatencyMs: latencies[Math.floor(latencies.length / 2)] ?? 
null, From c749be26d4257cc9e7639b931961ff571e779907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 22:53:14 +0200 Subject: [PATCH 13/73] feat(admin): proxy routes for auto-routing benchmark service --- .../auto-routing/benchmark-config/route.ts | 37 ++++ .../benchmark-routing-table/route.ts | 11 + .../api/auto-routing/benchmark-runs/route.ts | 36 +++ ...uto-routing-benchmark-admin-client.test.ts | 207 ++++++++++++++++++ .../auto-routing-benchmark-admin-client.ts | 108 +++++++++ apps/web/src/lib/config.server.ts | 5 + .../auto-routing-benchmark/src/admin.test.ts | 3 +- 7 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts create mode 100644 apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts create mode 100644 apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts new file mode 100644 index 0000000000..d81cc4f69c --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts @@ -0,0 +1,37 @@ +import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts'; +import type { NextRequest } from 'next/server'; +import { NextResponse } from 'next/server'; +import { + getBenchmarkConfig, + updateBenchmarkConfig, +} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await getBenchmarkConfig(); + return 
NextResponse.json(result.body, { status: result.status }); +} + +export async function PUT(request: NextRequest) { + const { authFailedResponse, user } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + let rawBody: unknown; + try { + rawBody = await request.json(); + } catch { + return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }); + } + + const parsed = BenchmarkConfigSchema.safeParse(rawBody); + if (!parsed.success) { + return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 }); + } + + const email = user?.google_user_email ?? ''; + const result = await updateBenchmarkConfig(parsed.data, email); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts new file mode 100644 index 0000000000..26fdc8eef1 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts @@ -0,0 +1,11 @@ +import { NextResponse } from 'next/server'; +import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await getBenchmarkRoutingTable(); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts new file mode 100644 index 0000000000..afb3f47f65 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts @@ -0,0 +1,36 @@ +import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts'; +import type { NextRequest } from 
'next/server'; +import { NextResponse } from 'next/server'; +import { + listBenchmarkRuns, + startBenchmarkRun, +} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await listBenchmarkRuns(); + return NextResponse.json(result.body, { status: result.status }); +} + +export async function POST(request: NextRequest) { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + let rawBody: unknown; + try { + rawBody = await request.json(); + } catch { + return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }); + } + + const parsed = StartBenchmarkRunRequestSchema.safeParse(rawBody); + if (!parsed.success) { + return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 }); + } + + const result = await startBenchmarkRun(parsed.data.kind); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts new file mode 100644 index 0000000000..f0e62f1d80 --- /dev/null +++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts @@ -0,0 +1,207 @@ +import { + getBenchmarkConfig, + updateBenchmarkConfig, + listBenchmarkRuns, + startBenchmarkRun, + getBenchmarkRoutingTable, +} from './auto-routing-benchmark-admin-client'; + +jest.mock('@/lib/config.server', () => ({ + AUTO_ROUTING_BENCHMARK_WORKER_URL: 'https://benchmark-worker.example.com', + INTERNAL_API_SECRET: 'test-internal-secret', +})); + +const mockFetch = jest.fn(); +global.fetch = mockFetch; + +const configResponse = { + config: { + classifierModels: 
['anthropic/claude-haiku-4'], + deciderModels: [ + { + id: 'anthropic/claude-sonnet-4', + supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[], + }, + ], + minAccuracy: 0.8, + maxConcurrency: 4, + updatedAt: null, + updatedBy: null, + }, + defaults: { + classifierModels: ['anthropic/claude-haiku-4'], + deciderModels: [ + { + id: 'anthropic/claude-sonnet-4', + supportedApiKinds: ['chat_completions' as const] as ('chat_completions' | 'responses' | 'messages')[], + }, + ], + minAccuracy: 0.8, + maxConcurrency: 4, + updatedAt: null, + updatedBy: null, + }, +}; + +const runsResponse = { + runs: [ + { + id: 'run-1', + kind: 'classifier', + status: 'completed', + startedAt: '2026-06-01T00:00:00Z', + completedAt: '2026-06-01T01:00:00Z', + error: null, + summaries: [], + }, + ], +}; + +describe('auto routing benchmark admin client', () => { + beforeEach(() => { + mockFetch.mockReset(); + }); + + it('gets the benchmark config and sends bearer auth header', async () => { + mockFetch.mockResolvedValue({ + status: 200, + ok: true, + json: () => Promise.resolve(configResponse), + }); + + await expect(getBenchmarkConfig()).resolves.toEqual({ + status: 200, + body: configResponse, + }); + + expect(mockFetch).toHaveBeenCalledWith( + 'https://benchmark-worker.example.com/admin/config', + { + method: 'GET', + headers: { + authorization: 'Bearer test-internal-secret', + }, + } + ); + }); + + it('propagates error body when upstream responds with a non-OK status', async () => { + mockFetch.mockResolvedValue({ + status: 404, + ok: false, + json: () => Promise.resolve({ error: 'not found' }), + }); + + await expect(getBenchmarkConfig()).resolves.toEqual({ + status: 404, + body: { error: 'not found' }, + }); + }); + + it('updates the benchmark config and sends x-updated-by header', async () => { + mockFetch.mockResolvedValue({ + status: 200, + ok: true, + json: () => Promise.resolve(configResponse), + }); + + await 
updateBenchmarkConfig(configResponse.config, 'admin@kilocode.ai'); + + expect(mockFetch).toHaveBeenCalledWith( + 'https://benchmark-worker.example.com/admin/config', + { + method: 'PUT', + headers: { + authorization: 'Bearer test-internal-secret', + 'content-type': 'application/json', + 'x-updated-by': 'admin@kilocode.ai', + }, + body: JSON.stringify(configResponse.config), + } + ); + }); + + it('lists benchmark runs', async () => { + mockFetch.mockResolvedValue({ + status: 200, + ok: true, + json: () => Promise.resolve(runsResponse), + }); + + await expect(listBenchmarkRuns()).resolves.toEqual({ + status: 200, + body: runsResponse, + }); + + expect(mockFetch).toHaveBeenCalledWith( + 'https://benchmark-worker.example.com/admin/runs', + { + method: 'GET', + headers: { + authorization: 'Bearer test-internal-secret', + }, + } + ); + }); + + it('propagates error body from listBenchmarkRuns on non-OK status', async () => { + mockFetch.mockResolvedValue({ + status: 401, + ok: false, + json: () => Promise.resolve({ error: 'unauthorized' }), + }); + + await expect(listBenchmarkRuns()).resolves.toEqual({ + status: 401, + body: { error: 'unauthorized' }, + }); + }); + + it('starts a benchmark run with the given kind', async () => { + mockFetch.mockResolvedValue({ + status: 200, + ok: true, + json: () => Promise.resolve({ runId: 'run-2', enqueuedModels: 3 }), + }); + + await expect(startBenchmarkRun('classifier')).resolves.toEqual({ + status: 200, + body: { runId: 'run-2', enqueuedModels: 3 }, + }); + + expect(mockFetch).toHaveBeenCalledWith( + 'https://benchmark-worker.example.com/admin/runs', + { + method: 'POST', + headers: { + authorization: 'Bearer test-internal-secret', + 'content-type': 'application/json', + }, + body: JSON.stringify({ kind: 'classifier' }), + } + ); + }); + + it('gets the benchmark routing table', async () => { + mockFetch.mockResolvedValue({ + status: 200, + ok: true, + json: () => Promise.resolve({ table: null, publishedAt: null }), + }); + + await 
expect(getBenchmarkRoutingTable()).resolves.toEqual({ + status: 200, + body: { table: null, publishedAt: null }, + }); + + expect(mockFetch).toHaveBeenCalledWith( + 'https://benchmark-worker.example.com/admin/routing-table', + { + method: 'GET', + headers: { + authorization: 'Bearer test-internal-secret', + }, + } + ); + }); +}); diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts new file mode 100644 index 0000000000..52ebee417a --- /dev/null +++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts @@ -0,0 +1,108 @@ +import { + BenchmarkConfigResponseSchema, + BenchmarkRunsResponseSchema, + StartBenchmarkRunResponseSchema, + RoutingTableSchema, + type BenchmarkConfig, + type BenchmarkKind, +} from '@kilocode/auto-routing-contracts'; +import { AUTO_ROUTING_BENCHMARK_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server'; +import * as z from 'zod'; + +export type AutoRoutingAdminResult = { + status: number; + body: T; +}; + +type ErrorBody = { error: string }; +const ErrorBodySchema = z.object({ error: z.string() }); + +export const BenchmarkRoutingTableResponseSchema = z.object({ + table: RoutingTableSchema.nullable(), + publishedAt: z.string().nullable(), +}); +export type BenchmarkRoutingTableResponse = z.infer; + +type AutoRoutingBenchmarkAdminRequestInit = Omit & { + headers?: Record; +}; + +async function fetchBenchmarkAdmin( + path: string, + init: AutoRoutingBenchmarkAdminRequestInit, + schema: z.ZodType +): Promise> { + if (!AUTO_ROUTING_BENCHMARK_WORKER_URL || !INTERNAL_API_SECRET) { + return { + status: 500, + body: { error: 'Auto routing benchmark worker is not configured' }, + }; + } + + const response = await fetch(`${AUTO_ROUTING_BENCHMARK_WORKER_URL}${path}`, { + ...init, + headers: { + authorization: `Bearer ${INTERNAL_API_SECRET}`, + ...init.headers, + }, + }); + + const body: unknown = await response.json(); + if (!response.ok) { 
+ const parsedError = ErrorBodySchema.safeParse(body); + return { + status: response.status, + body: parsedError.success + ? parsedError.data + : { error: `Request failed: ${response.status}` }, + }; + } + + return { + status: response.status, + body: schema.parse(body), + }; +} + +export function getBenchmarkConfig() { + return fetchBenchmarkAdmin('/admin/config', { method: 'GET' }, BenchmarkConfigResponseSchema); +} + +export function updateBenchmarkConfig(config: BenchmarkConfig, updatedByEmail: string) { + return fetchBenchmarkAdmin( + '/admin/config', + { + method: 'PUT', + headers: { + 'content-type': 'application/json', + 'x-updated-by': updatedByEmail, + }, + body: JSON.stringify(config), + }, + BenchmarkConfigResponseSchema + ); +} + +export function listBenchmarkRuns() { + return fetchBenchmarkAdmin('/admin/runs', { method: 'GET' }, BenchmarkRunsResponseSchema); +} + +export function startBenchmarkRun(kind: BenchmarkKind) { + return fetchBenchmarkAdmin( + '/admin/runs', + { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ kind }), + }, + StartBenchmarkRunResponseSchema + ); +} + +export function getBenchmarkRoutingTable() { + return fetchBenchmarkAdmin( + '/admin/routing-table', + { method: 'GET' }, + BenchmarkRoutingTableResponseSchema + ); +} diff --git a/apps/web/src/lib/config.server.ts b/apps/web/src/lib/config.server.ts index 6240690665..a0812e0c36 100644 --- a/apps/web/src/lib/config.server.ts +++ b/apps/web/src/lib/config.server.ts @@ -369,6 +369,11 @@ export const SESSION_INGEST_WORKER_URL = getEnvVariable('SESSION_INGEST_WORKER_U // Auto routing worker export const AUTO_ROUTING_WORKER_URL = getEnvVariable('AUTO_ROUTING_WORKER_URL') || ''; +// Auto routing benchmark worker +export const AUTO_ROUTING_BENCHMARK_WORKER_URL = + getEnvVariable('AUTO_ROUTING_BENCHMARK_WORKER_URL') || + 'https://auto-routing-benchmark.kiloapps.io'; + // Security Agent sync Worker command ingress export const 
SECURITY_SYNC_WORKER_URL = getEnvVariable('SECURITY_SYNC_WORKER_URL') || ''; // Security Agent auto-analysis Worker command ingress diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts index 6a74bdc7e3..99da4c6a8a 100644 --- a/services/auto-routing-benchmark/src/admin.test.ts +++ b/services/auto-routing-benchmark/src/admin.test.ts @@ -194,7 +194,8 @@ describe('PUT /admin/config', () => { expect(dbRun).toHaveBeenCalled(); // The SQL should be an INSERT OR REPLACE into benchmark_config. const insertCall = dbPrepare.mock.calls.find( - (args: unknown[]) => typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config') + (args: unknown[]) => + typeof args[0] === 'string' && (args[0] as string).includes('benchmark_config') ); expect(insertCall).toBeDefined(); // The updatedBy value was forwarded via bind. From 0e34c020848573d00005d6ea74ff0bd8443dd726 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Thu, 11 Jun 2026 23:00:32 +0200 Subject: [PATCH 14/73] feat(admin): benchmark config, runs and routing table panel --- .../auto-routing/AutoRoutingAdminContent.tsx | 3 + .../auto-routing/BenchmarksSection.test.ts | 51 ++ .../admin/auto-routing/BenchmarksSection.tsx | 817 ++++++++++++++++++ .../auto-routing/BenchmarksSection.types.ts | 8 + ...uto-routing-benchmark-admin-client.test.ts | 82 +- 5 files changed, 918 insertions(+), 43 deletions(-) create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx create mode 100644 apps/web/src/app/admin/auto-routing/BenchmarksSection.types.ts diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx index d893f27382..f55d1bccdc 100644 --- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx +++ 
b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx @@ -31,6 +31,7 @@ import { type OpenRouterModelsResponse, } from '@/lib/organizations/organization-types'; import { cn } from '@/lib/utils'; +import { BenchmarksSection } from './BenchmarksSection'; const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [ { value: '1h', label: '1h' }, @@ -600,6 +601,8 @@ export function AutoRoutingAdminContent() { /> )} + + ); } diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts new file mode 100644 index 0000000000..768545f81f --- /dev/null +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from '@jest/globals'; +import { formatAccuracy, formatUsd } from './BenchmarksSection'; + +describe('formatAccuracy', () => { + it('formats 0.8542 as 85.4%', () => { + expect(formatAccuracy(0.8542)).toBe('85.4%'); + }); + + it('formats 1.0 as 100.0%', () => { + expect(formatAccuracy(1.0)).toBe('100.0%'); + }); + + it('formats 0 as 0.0%', () => { + expect(formatAccuracy(0)).toBe('0.0%'); + }); + + it('formats 0.5 as 50.0%', () => { + expect(formatAccuracy(0.5)).toBe('50.0%'); + }); + + it('rounds to one decimal place', () => { + expect(formatAccuracy(0.9999)).toBe('100.0%'); + expect(formatAccuracy(0.9994)).toBe('99.9%'); + }); +}); + +describe('formatUsd', () => { + it('returns em dash for null', () => { + expect(formatUsd(null)).toBe('—'); + }); + + it('formats a small cost with 6 decimal places', () => { + expect(formatUsd(0.000123)).toBe('$0.000123'); + }); + + it('trims trailing zeros', () => { + expect(formatUsd(0.1)).toBe('$0.1'); + }); + + it('formats zero as $0.0', () => { + expect(formatUsd(0)).toBe('$0.0'); + }); + + it('formats a typical cost', () => { + expect(formatUsd(0.001234)).toBe('$0.001234'); + }); + + it('formats a cost that fits exactly at 6dp', () => { + 
expect(formatUsd(0.000001)).toBe('$0.000001'); + }); +}); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx new file mode 100644 index 0000000000..4d13898548 --- /dev/null +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -0,0 +1,817 @@ +'use client'; + +import { + BenchmarkConfigResponseSchema, + BenchmarkRunsResponseSchema, + StartBenchmarkRunResponseSchema, + type BenchmarkConfig, + type BenchmarkRun, + type BenchmarkModelSummary, +} from '@kilocode/auto-routing-contracts'; +import React, { useCallback, useEffect, useRef, useState } from 'react'; +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { toast } from 'sonner'; +import { ChevronDown, ChevronRight, Play, Plus, RotateCcw, Save, Trash2 } from 'lucide-react'; +import * as z from 'zod'; +import { Badge } from '@/components/ui/badge'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; +import { Checkbox } from '@/components/ui/checkbox'; +import { Input } from '@/components/ui/input'; +import { Label } from '@/components/ui/label'; +import { Skeleton } from '@/components/ui/skeleton'; +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from '@/components/ui/table'; +import { Textarea } from '@/components/ui/textarea'; +import { + BenchmarkRoutingTableResponseSchema, + type BenchmarkRoutingTableResponse, +} from './BenchmarksSection.types'; + +// --------------------------------------------------------------------------- +// Pure helpers (exported for unit tests) +// --------------------------------------------------------------------------- + +export function formatAccuracy(n: number): string { + return `${(n * 100).toFixed(1)}%`; +} + +export function formatUsd(n: number | null): string { + if (n === null) return '—'; + // 6 dp, remove trailing zeros, but keep 
at least $0.000001 precision + const fixed = n.toFixed(6); + // Trim trailing zeros after decimal, but leave at least one digit after dot + const trimmed = fixed.replace(/(\.\d*?)0+$/, '$1').replace(/\.$/, '.0'); + return `$${trimmed}`; +} + +// --------------------------------------------------------------------------- +// API error helper (mirrors the one in AutoRoutingAdminContent.tsx) +// --------------------------------------------------------------------------- + +const AdminApiErrorSchema = z.object({ error: z.string().optional() }); + +async function parseAdminResponse( + response: Response, + schema: z.ZodType +): Promise { + const body: unknown = await response.json(); + if (!response.ok) { + const parsedError = AdminApiErrorSchema.safeParse(body); + throw new Error( + parsedError.success && parsedError.data.error + ? parsedError.data.error + : `Request failed: ${response.status}` + ); + } + return schema.parse(body); +} + +// --------------------------------------------------------------------------- +// Fetch helpers +// --------------------------------------------------------------------------- + +async function fetchBenchmarkConfig() { + const response = await fetch('/admin/api/auto-routing/benchmark-config'); + return parseAdminResponse(response, BenchmarkConfigResponseSchema); +} + +async function saveBenchmarkConfig(config: BenchmarkConfig) { + const response = await fetch('/admin/api/auto-routing/benchmark-config', { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(config), + }); + return parseAdminResponse(response, BenchmarkConfigResponseSchema); +} + +async function fetchBenchmarkRuns() { + const response = await fetch('/admin/api/auto-routing/benchmark-runs'); + return parseAdminResponse(response, BenchmarkRunsResponseSchema); +} + +async function startBenchmarkRun(kind: 'classifier' | 'decider') { + const response = await fetch('/admin/api/auto-routing/benchmark-runs', { + method: 'POST', + headers: { 
'content-type': 'application/json' }, + body: JSON.stringify({ kind }), + }); + return parseAdminResponse(response, StartBenchmarkRunResponseSchema); +} + +async function fetchBenchmarkRoutingTable() { + const response = await fetch('/admin/api/auto-routing/benchmark-routing-table'); + return parseAdminResponse( + response, + BenchmarkRoutingTableResponseSchema + ); +} + +// --------------------------------------------------------------------------- +// Local form state type for decider model rows +// --------------------------------------------------------------------------- + +type DeciderModelRow = { + id: string; + chat_completions: boolean; + responses: boolean; + messages: boolean; +}; + +function configToFormState(config: BenchmarkConfig): { + classifierModels: string; + deciderModels: DeciderModelRow[]; + minAccuracy: number; + maxConcurrency: number; +} { + return { + classifierModels: config.classifierModels.join('\n'), + deciderModels: config.deciderModels.map(m => ({ + id: m.id, + chat_completions: m.supportedApiKinds.includes('chat_completions'), + responses: m.supportedApiKinds.includes('responses'), + messages: m.supportedApiKinds.includes('messages'), + })), + minAccuracy: config.minAccuracy, + maxConcurrency: config.maxConcurrency, + }; +} + +function formStateToConfig( + state: ReturnType, + base: BenchmarkConfig +): BenchmarkConfig { + const classifierModels = state.classifierModels + .split('\n') + .map(s => s.trim()) + .filter(s => s.length > 0); + const deciderModels = state.deciderModels + .filter(row => row.id.trim().length > 0) + .map(row => { + const kinds: Array<'chat_completions' | 'responses' | 'messages'> = []; + if (row.chat_completions) kinds.push('chat_completions'); + if (row.responses) kinds.push('responses'); + if (row.messages) kinds.push('messages'); + return { id: row.id.trim(), supportedApiKinds: kinds.length ? 
kinds : ['chat_completions' as const] }; + }); + return { + classifierModels, + deciderModels, + minAccuracy: state.minAccuracy, + maxConcurrency: state.maxConcurrency, + updatedAt: base.updatedAt, + updatedBy: base.updatedBy, + }; +} + +// --------------------------------------------------------------------------- +// Config editor sub-component +// --------------------------------------------------------------------------- + +function BenchmarkConfigEditor({ + config, + defaults, + onSaved, +}: { + config: BenchmarkConfig; + defaults: BenchmarkConfig; + onSaved: (next: { config: BenchmarkConfig; defaults: BenchmarkConfig }) => void; +}) { + const [form, setForm] = useState(() => configToFormState(config)); + + // Sync when config changes from outside (initial load / after save) + const prevConfigRef = useRef(config); + useEffect(() => { + if (prevConfigRef.current !== config) { + prevConfigRef.current = config; + setForm(configToFormState(config)); + } + }, [config]); + + const saveMutation = useMutation({ + mutationFn: saveBenchmarkConfig, + onSuccess: data => { + onSaved(data); + toast.success('Benchmark config saved'); + }, + onError: (error: unknown) => { + toast.error(error instanceof Error ? error.message : 'Failed to save benchmark config'); + }, + }); + + const handleResetToDefaults = useCallback(() => { + setForm(configToFormState(defaults)); + }, [defaults]); + + const handleAddDeciderRow = useCallback(() => { + setForm(prev => ({ + ...prev, + deciderModels: [ + ...prev.deciderModels, + { id: '', chat_completions: true, responses: false, messages: false }, + ], + })); + }, []); + + const handleRemoveDeciderRow = useCallback((index: number) => { + setForm(prev => ({ + ...prev, + deciderModels: prev.deciderModels.filter((_, i) => i !== index), + })); + }, []); + + const handleDeciderRowChange = useCallback( + (index: number, patch: Partial) => { + setForm(prev => ({ + ...prev, + deciderModels: prev.deciderModels.map((row, i) => + i === index ? 
{ ...row, ...patch } : row + ), + })); + }, + [] + ); + + const handleSave = useCallback(() => { + saveMutation.mutate(formStateToConfig(form, config)); + }, [form, config, saveMutation]); + + return ( + + + Benchmark Config + + + {/* Classifier models */} +
+ +