Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
b41e58e
refactor(auto-routing): move classifier core into contracts package
iscekic Jun 11, 2026
1fb85f5
feat(auto-routing): add tier, routing-table, decision and benchmark c…
iscekic Jun 11, 2026
39acfdb
feat(auto-routing): add benchmark-driven decision engine and KV routi…
iscekic Jun 11, 2026
bd83fdc
feat(auto-routing): return routing decisions from /decide
iscekic Jun 11, 2026
9621d62
fix(auto-routing): log unparseable routing table JSON before falling …
iscekic Jun 11, 2026
7af1b6d
feat(auto-routing-benchmark): scaffold benchmark worker with D1 schema
iscekic Jun 11, 2026
22de713
feat(auto-routing-benchmark): classifier golden dataset and grading
iscekic Jun 11, 2026
878e49b
style(auto-routing-benchmark): apply oxfmt formatting
iscekic Jun 11, 2026
662717c
feat(auto-routing-benchmark): decider golden dataset with determinist…
iscekic Jun 11, 2026
110cbd9
fix(auto-routing-benchmark): unambiguous whitespace instruction in of…
iscekic Jun 11, 2026
5ce8621
feat(auto-routing-benchmark): queue-driven benchmark runs with aggreg…
iscekic Jun 11, 2026
0c763ce
feat(auto-routing-benchmark): admin config, runs and routing-table en…
iscekic Jun 11, 2026
c749be2
feat(admin): proxy routes for auto-routing benchmark service
iscekic Jun 11, 2026
0e34c02
feat(admin): benchmark config, runs and routing table panel
iscekic Jun 11, 2026
fb084c3
fix(admin): stabilize benchmark runs polling interval dependencies
iscekic Jun 11, 2026
9f2d876
feat(web): internal token mint endpoint for auto-routing benchmark
iscekic Jun 11, 2026
7a31d4a
feat(auto-routing-benchmark): run decider cases through kilo CLI in a…
iscekic Jun 11, 2026
d0f13b0
feat(admin): benchmark user id config field
iscekic Jun 11, 2026
fdc6520
feat(gateway): add kilo-auto/efficient with blocking auto-routing dec…
iscekic Jun 11, 2026
813ea0e
chore(auto-routing): drop unused import in routing-table contracts
iscekic Jun 11, 2026
9b69edf
fix(auto-routing-benchmark): harden decider CLI parsing, grading and …
iscekic Jun 11, 2026
5ff4b08
fix(auto-routing-benchmark): warm up CLI container before concurrent …
iscekic Jun 11, 2026
06836cc
fix(auto-routing-benchmark): faster container turnover to avoid insta…
iscekic Jun 11, 2026
2faee13
fix(auto-routing-benchmark): address review findings
iscekic Jun 11, 2026
cac57b7
style(auto-routing-benchmark): format wrangler.jsonc
iscekic Jun 11, 2026
ccc9c9d
fix(auto-routing-benchmark): guard against double finish on spawn fai…
iscekic Jun 11, 2026
ba3b3be
fix(auto-routing): break contracts module cycle and keep response sch…
iscekic Jun 11, 2026
6776db0
chore(admin): drop unused import after schema move
iscekic Jun 11, 2026
c0320c7
feat(auto-routing): classifier model becomes an admin override over t…
iscekic Jun 11, 2026
7bb5048
feat(auto-routing): manual benchmark runs, classifier override, decid…
iscekic Jun 12, 2026
f3c0128
refactor(auto-routing): simplification pass
iscekic Jun 12, 2026
641f6ef
refactor(auto-routing-benchmark): use drizzle for all D1 access
iscekic Jun 12, 2026
2d2691f
refactor(auto-routing-benchmark): normalize D1 schema and adopt drizz…
iscekic Jun 12, 2026
86e2fdc
fix(auto-routing-benchmark): preserve null candidate cost and type dr…
iscekic Jun 12, 2026
0241d47
refactor(auto-routing-benchmark): make candidate cost non-null to mat…
iscekic Jun 12, 2026
8244676
feat(auto-routing): read-through KV cache backed by the benchmark ser…
iscekic Jun 12, 2026
36f32a7
fix(auto-routing): await read-through cache writes and surface origin…
iscekic Jun 12, 2026
aa14657
ci(workers): run worker predeploy scripts (D1 migrations) before deploy
iscekic Jun 12, 2026
82aef0b
fix(auto-routing-benchmark): reuse loaded run state in finalize and b…
iscekic Jun 12, 2026
4a7478b
refactor(auto-routing): share ttl cache, single-source schemas and dr…
iscekic Jun 12, 2026
a449c26
docs(gateway): drop stale keep-in-sync comment on DecideBaseParams
iscekic Jun 12, 2026
4caa4f8
feat(gateway): bill classifier cost to the user for kilo-auto/efficient
iscekic Jun 12, 2026
ec5dc3f
fix(gateway): fix type error and remove dead guard in classifier billing
iscekic Jun 12, 2026
0141b71
fix(auto-routing): apply decision reasoningEffort to efficient routing
iscekic Jun 12, 2026
6960e1a
feat(auto-routing): align kilo-auto/efficient catalog with balanced, …
iscekic Jun 12, 2026
debdd03
fix(admin): correct run-summaries colspan in benchmarks section
iscekic Jun 12, 2026
a016310
feat(admin): derive decider model API kinds from gateway provider def…
iscekic Jun 12, 2026
fc427e5
feat(auto-routing): drop default routing table; no table means no dec…
iscekic Jun 12, 2026
01e4bd9
fix(auto-routing): keep classifier override when benchmark origin is …
iscekic Jun 12, 2026
0828e47
docs(contracts): fix stale classifier-winner comment
iscekic Jun 12, 2026
71222ca
fix(benchmark): exclude no-cost-signal summaries from routing table r…
iscekic Jun 12, 2026
6f5fd38
test(benchmark): fix expected ranking order in no-cost-signal test
iscekic Jun 12, 2026
2cd53f9
feat(benchmark): remove fabricated default config; runs require a sav…
iscekic Jun 12, 2026
354054d
chore(benchmark): drop redundant case_results index, regenerate basel…
iscekic Jun 12, 2026
6aba145
docs(benchmark): fix stale KV comment in wrangler config
iscekic Jun 12, 2026
8955269
feat(auto-routing-benchmark): grade subtaskType and riskLevel, expand…
iscekic Jun 12, 2026
ae707f3
feat(auto-routing-benchmark): expand decider dataset to per-pair taxo…
iscekic Jun 12, 2026
adb49f5
feat(auto-routing): session-sticky decisions with switch-cost factor
iscekic Jun 12, 2026
a24dc4d
feat(auto-routing-benchmark): plumb switchCostFactor through config, …
iscekic Jun 12, 2026
1d424c5
Merge remote-tracking branch 'origin/main' into feat/auto-routing-eff…
iscekic Jun 12, 2026
3d50441
fix(ai-gateway): align efficient fallback with Qwen-for-all-APIs afte…
iscekic Jun 12, 2026
d922d92
refactor(auto-routing): drop per-candidate API-kind plumbing, validat…
iscekic Jun 12, 2026
427dcc2
fix(auto-routing): review-pass fixes
iscekic Jun 12, 2026
053373b
test(ai-gateway): add sticky field to decision fixture
iscekic Jun 12, 2026
b8a5892
feat(dev): move auto-routing workers into their own opt-in dev group
iscekic Jun 12, 2026
2f39419
fix(auto-routing): make the decider benchmark runnable in local dev
iscekic Jun 12, 2026
ae0cec5
fix(auto-routing): kill the whole CLI process tree on decider case ti…
iscekic Jun 12, 2026
4f04e0a
feat(auto-routing): benchmark repetitions, p95 latency, and classifie…
iscekic Jun 12, 2026
1eae06f
fix(auto-routing): correct case_results migration backfill and close …
iscekic Jun 13, 2026
7151256
feat(admin): benchmark repetitions, latency budget, and p95/timeout c…
iscekic Jun 13, 2026
17a8c01
fix(admin): correct runs-table colSpan and cover config form round-trip
iscekic Jun 13, 2026
1a5d858
chore(auto-routing): squash benchmark D1 migrations into one baseline
iscekic Jun 13, 2026
c9db589
Merge remote-tracking branch 'origin/main' into feat/auto-routing-eff…
iscekic Jun 13, 2026
9eaae60
test(ai-gateway): stop depending on removed morph model in API-kind t…
iscekic Jun 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/deploy-workers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ jobs:
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
workingDirectory: ${{ inputs.worker }}
# Workers that define a `predeploy` script (e.g. D1 migrations) run it
# right before deploy; all other workers are unaffected.
preCommands: |
if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
command: deploy

detect-changes:
Expand Down Expand Up @@ -150,4 +154,8 @@ jobs:
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
workingDirectory: ${{ matrix.worker }}
# Workers that define a `predeploy` script (e.g. D1 migrations) run it
# right before deploy; all other workers are unaffected.
preCommands: |
if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
command: deploy
3 changes: 3 additions & 0 deletions apps/web/.env.development.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791
# @url auto-routing
AUTO_ROUTING_WORKER_URL=http://localhost:8810

# @url auto-routing-benchmark
AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814

# @url cloudflare-security-sync
SECURITY_SYNC_WORKER_URL=http://localhost:8812

Expand Down
123 changes: 123 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import { NextRequest } from 'next/server';
import type { User } from '@kilocode/db';
import {
getBenchmarkConfig,
updateBenchmarkConfig,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';
import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
import type * as ModelsModule from '@/lib/ai-gateway/models';

// Replace the auth helper with a bare mock so each test decides who is signed in.
jest.mock('@/lib/user/server', () => ({
getUserFromAuth: jest.fn(),
}));

// Replace the benchmark admin client so the route's outbound calls can be
// observed and given canned responses instead of being executed for real.
jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
getBenchmarkConfig: jest.fn(),
updateBenchmarkConfig: jest.fn(),
}));

// Stub the catalog so tests don't depend on any specific provider file.
// 'test-exclusive/alibaba-only' maps to the alibaba gateway (chat_completions only).
// Only `findKiloExclusiveModel` is overridden; every other export of the real
// module is passed through via the spread of `actual`.
jest.mock('@/lib/ai-gateway/models', () => {
const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
// Synthetic catalog entry returned only for the stub id below.
const stubModel: KiloExclusiveModel = {
public_id: 'test-exclusive/alibaba-only',
display_name: 'Test Alibaba-only',
description: 'stub for unit tests',
context_length: 8192,
max_completion_tokens: 4096,
status: 'public',
flags: [],
gateway: 'alibaba',
internal_id: 'stub-internal',
pricing: null,
exclusive_to: [],
inference_provider_restriction: [],
};
return {
...actual,
// Any id other than the stub id falls through to the real lookup.
findKiloExclusiveModel: (id: string) =>
id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
};
});

import { PUT } from './route';

// Typed handles on the mocked functions, used below to set resolved values
// and to assert on calls.
const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig);
const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig);

// Fixture boundary for tests: a deliberately partial user holding just the
// fields the route reads, widened to `User` at this single point.
function adminUserFixture(): User {
  const partialAdmin: Partial<User> = {
    id: 'admin_123',
    google_user_email: 'admin@kilocode.ai',
  };
  return partialAdmin as User;
}

/** Builds a JSON PUT request aimed at the benchmark-config admin route. */
function putRequest(body: unknown) {
  const url = 'http://localhost:3000/admin/api/auto-routing/benchmark-config';
  const init = {
    method: 'PUT',
    body: JSON.stringify(body),
    headers: { 'content-type': 'application/json' },
  };
  return new NextRequest(url, init);
}

// Baseline payload shared by the tests: the happy-path case expects the route
// to accept it and forward it unchanged, and the rejection case spreads it and
// overrides `deciderModels`.
const validConfig = {
classifierModels: ['google/gemini-2.5-flash-lite'],
deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }],
minAccuracy: 0.7,
switchCostFactor: 3,
maxConcurrency: 4,
benchmarkUserId: null,
classifierRepetitions: 1,
deciderRepetitions: 1,
classifierMaxP95LatencyMs: 1000,
updatedAt: null,
updatedBy: null,
};

describe('PUT /admin/api/auto-routing/benchmark-config', () => {
  // Each case starts from a clean slate: an authenticated admin and a
  // benchmark client whose read and write calls both resolve with 200.
  beforeEach(() => {
    jest.clearAllMocks();

    mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } });
    mockUpdateBenchmarkConfig.mockResolvedValue({
      status: 200,
      body: { config: validConfig },
    });
    mockGetUserFromAuth.mockResolvedValue({
      user: adminUserFixture(),
      authFailedResponse: null,
    });
  });

  it('forwards a config whose decider models all serve every gateway chat API', async () => {
    const res = await PUT(putRequest(validConfig));

    expect(res.status).toBe(200);
    // The admin's email from the fixture travels along as the second argument.
    expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai');
  });

  it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => {
    const mixedDeciders = [
      { id: 'openai/gpt-5-mini', reasoningEffort: null },
      { id: 'test-exclusive/alibaba-only', reasoningEffort: null },
    ];
    const payload = { ...validConfig, deciderModels: mixedDeciders };

    const res = await PUT(putRequest(payload));
    expect(res.status).toBe(400);

    const { error } = (await res.json()) as { error: string };
    // Only the stubbed model may be itemised in the message.
    expect(error).toContain('test-exclusive/alibaba-only');
    expect(error).toContain('chat_completions');
    expect(error).not.toContain('openai/gpt-5-mini (');
    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
  });

  it('rejects a schema-invalid config with 400', async () => {
    const res = await PUT(putRequest({ classifierModels: 'oops' }));

    expect(res.status).toBe(400);
    expect(await res.json()).toEqual({ error: 'Invalid benchmark config' });
    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
  });
});
57 changes: 57 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
import type { NextRequest } from 'next/server';
import { NextResponse } from 'next/server';
import {
getBenchmarkConfig,
updateBenchmarkConfig,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import {
gatewayChatApisForModel,
modelServesAllGatewayChatApis,
} from '@/lib/ai-gateway/model-api-kinds';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only read of the benchmark config.
 *
 * Relays the body and status returned by `getBenchmarkConfig` as-is.
 */
export async function GET() {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) {
    return auth.authFailedResponse;
  }

  const { body, status } = await getBenchmarkConfig();
  return NextResponse.json(body, { status });
}

/**
 * Admin-only update of the benchmark config.
 *
 * Responds with 400 when the body is not JSON, when it fails
 * `BenchmarkConfigSchema`, or when any decider model is rejected by
 * `modelServesAllGatewayChatApis`. Otherwise the parsed config and the
 * admin's email are handed to `updateBenchmarkConfig`, whose body and status
 * are relayed as-is.
 */
export async function PUT(request: NextRequest) {
  const { authFailedResponse: denied, user: admin } = await getUserFromAuth({ adminOnly: true });
  if (denied) {
    return denied;
  }

  let payload: unknown;
  try {
    payload = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const validation = BenchmarkConfigSchema.safeParse(payload);
  if (!validation.success) {
    return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
  }
  const config = validation.data;

  // Candidates in the routing table have no per-protocol metadata, so a
  // decider model is only acceptable if the provider the gateway would route
  // it to can serve it on every gateway chat API kind.
  const rejected = config.deciderModels.filter(model => !modelServesAllGatewayChatApis(model.id));
  const details = rejected.map(({ id }) => {
    const supported = gatewayChatApisForModel(id).join(', ');
    return `${id} (supports: ${supported || 'none'})`;
  });
  if (details.length > 0) {
    const error = `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${details.join('; ')}`;
    return NextResponse.json({ error }, { status: 400 });
  }

  // Falls back to an empty string when no email is present on the user.
  const updatedBy = admin?.google_user_email ?? '';
  const { body, status } = await updateBenchmarkConfig(config, updatedBy);
  return NextResponse.json(body, { status });
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { NextResponse } from 'next/server';
import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only read of the benchmark routing table.
 *
 * Relays the body and status returned by `getBenchmarkRoutingTable` as-is.
 */
export async function GET() {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) {
    return auth.authFailedResponse;
  }

  const { body, status } = await getBenchmarkRoutingTable();
  return NextResponse.json(body, { status });
}
36 changes: 36 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts';
import type { NextRequest } from 'next/server';
import { NextResponse } from 'next/server';
import {
listBenchmarkRuns,
startBenchmarkRun,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only listing of benchmark runs.
 *
 * Relays the body and status returned by `listBenchmarkRuns` as-is.
 */
export async function GET() {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) {
    return auth.authFailedResponse;
  }

  const { body, status } = await listBenchmarkRuns();
  return NextResponse.json(body, { status });
}

/**
 * Admin-only trigger for a benchmark run.
 *
 * Responds with 400 when the body is not JSON or fails
 * `StartBenchmarkRunRequestSchema`. Otherwise passes `kind` and `force` to
 * `startBenchmarkRun` and relays its body and status as-is.
 */
export async function POST(request: NextRequest) {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) {
    return auth.authFailedResponse;
  }

  let payload: unknown;
  try {
    payload = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const validation = StartBenchmarkRunRequestSchema.safeParse(payload);
  if (!validation.success) {
    return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
  }

  const { kind, force } = validation.data;
  const { body, status } = await startBenchmarkRun(kind, force);
  return NextResponse.json(body, { status });
}
Loading