diff --git a/packages/backend/src/services/inspectors/usage-logging.ts b/packages/backend/src/services/inspectors/usage-logging.ts index 1c504529..2931ea91 100644 --- a/packages/backend/src/services/inspectors/usage-logging.ts +++ b/packages/backend/src/services/inspectors/usage-logging.ts @@ -148,6 +148,15 @@ export class UsageInspector extends PassThrough { // Some providers emit `: cost {"request_cost_usd": ...}` as SSE comments if (reconstructed?.providerReportedCost) { applyProviderReportedCost(this.usageRecord, reconstructed.providerReportedCost); + if (reconstructed?.usage) { + const usageCostDetails = extractUsageCostDetails(reconstructed.usage); + if (usageCostDetails) { + logger.warn( + `[ProviderCost] Both SSE :cost and usage.cost_details present for ${this.usageRecord.requestId}; ` + + `SSE value ($${this.usageRecord.providerReportedCost}) takes priority over cost_details total ($${usageCostDetails.total_cost})` + ); + } + } } // Override with provider-reported cost from usage.cost_details if available diff --git a/packages/backend/src/services/response-handler.ts b/packages/backend/src/services/response-handler.ts index 7532e0f1..9ba53625 100644 --- a/packages/backend/src/services/response-handler.ts +++ b/packages/backend/src/services/response-handler.ts @@ -502,6 +502,15 @@ async function finalizeUsage( const reconstructed = debugManager.getReconstructedRawResponse(usageRecord.requestId!); if (reconstructed?.providerReportedCost) { applyProviderReportedCost(usageRecord, reconstructed.providerReportedCost); + if (reconstructed?.usage) { + const usageCostDetails = extractUsageCostDetails(reconstructed.usage); + if (usageCostDetails) { + logger.warn( + `[ProviderCost] Both SSE :cost and usage.cost_details present for ${usageRecord.requestId}; ` + + `SSE value ($${usageRecord.providerReportedCost}) takes priority over cost_details total ($${usageCostDetails.total_cost})` + ); + } + } } // Also check for cost_details in the usage block (some providers embed costs there) diff --git a/packages/backend/src/utils/__tests__/provider-cost.test.ts b/packages/backend/src/utils/__tests__/provider-cost.test.ts index f314d77b..c738bbb4 100644 --- a/packages/backend/src/utils/__tests__/provider-cost.test.ts +++ b/packages/backend/src/utils/__tests__/provider-cost.test.ts @@ -124,42 +124,43 @@ describe('applyProviderReportedCost', () => { describe('extractUsageCostDetails', () => { test('extracts cost_details from the new usage format', () => { + // Real response: glm-5.1 via LLM Gateway (has both gateway and upstream fields) const usage = { - prompt_tokens: 23, - total_tokens: 66, - completion_tokens: 43, - estimated_cost: 0.00017465, + prompt_tokens: 90122, + completion_tokens: 104, + total_tokens: 90226, + cost: 0.022101624, prompt_tokens_details: { - cached_tokens: 0, + cached_tokens: 89536, cache_write_tokens: 0, + audio_tokens: 0, + video_tokens: 0, + image_tokens: 0, }, - cost: 0.00017465, cost_details: { - upstream_inference_cost: 0.00017465, - upstream_inference_prompt_cost: 0.00002415, - upstream_inference_completions_cost: 0.0001505, - total_cost: 0.00017465, - input_cost: 0.00002415, - output_cost: 0.0001505, - cached_input_cost: 0, + upstream_inference_cost: 0.022101624, + upstream_inference_prompt_cost: 0.021689784, + upstream_inference_completions_cost: 0.00041184, + total_cost: 0.022101624, + input_cost: 0.00073836, + output_cost: 0.00041184, + cached_input_cost: 0.020951424, cache_write_input_cost: 0, request_cost: 0, web_search_cost: 0, image_input_cost: null, image_output_cost: null, audio_input_cost: null, - data_storage_cost: 0.00000106, }, }; const result = extractUsageCostDetails(usage); expect(result).not.toBeNull(); - expect(result!.total_cost).toBe(0.00017465); - expect(result!.input_cost).toBe(0.00002415); - expect(result!.output_cost).toBe(0.0001505); - expect(result!.cached_input_cost).toBe(0); + expect(result!.total_cost).toBe(0.022101624); + expect(result!.input_cost).toBe(0.00073836); + expect(result!.output_cost).toBe(0.00041184); + expect(result!.cached_input_cost).toBe(0.020951424); expect(result!.cache_write_input_cost).toBe(0); - expect(result!.data_storage_cost).toBe(0.00000106); }); test('falls back to usage.cost when cost_details.total_cost is missing', () => { @@ -223,25 +224,40 @@ describe('extractUsageCostDetails', () => { expect(extractUsageCostDetails(undefined)).toBeNull(); }); - test('maps upstream_inference_prompt_cost as fallback for input_cost', () => { + test('keeps upstream prompt/completions fields separate from input_cost/output_cost', () => { + // Real response: normal-tier (no gateway input_cost/output_cost fields) const usage = { - cost: 0.01, + completion_tokens: 2177, + cost: 0.00435825, cost_details: { - upstream_inference_prompt_cost: 0.003, - upstream_inference_completions_cost: 0.007, + upstream_inference_completions_cost: 0.004354, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 4.25e-6, }, + is_byok: false, + prompt_tokens: 17, + prompt_tokens_details: { cached_tokens: 0 }, }; const result = extractUsageCostDetails(usage); - expect(result!.input_cost).toBe(0.003); - expect(result!.output_cost).toBe(0.007); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.00435825); + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBe(4.25e-6); + expect(result!.upstream_inference_completions_cost).toBe(0.004354); }); test('preserves null values for optional cost fields', () => { + // Real response: LLM Gateway — image/audio costs null for text-only models const usage = { - cost: 0.01, + cost: 0.022101624, cost_details: { - total_cost: 0.01, + total_cost: 0.022101624, + input_cost: 0.00073836, + output_cost: 0.00041184, + cached_input_cost: 0.020951424, + cache_write_input_cost: 0, image_input_cost: null, image_output_cost: null, audio_input_cost: null, @@ -254,6 +270,159 @@ describe('extractUsageCostDetails', () => { expect(result!.audio_input_cost).toBeNull(); }); + test('uses upstream_inference_cost as total when usage.cost is 0 (BYOK)', () => { + // Real response: BYOK — Plexus charges $0, actual cost reported in upstream_inference_cost + const usage = { + completion_tokens: 91, + cost: 0, + cost_details: { + upstream_inference_completions_cost: 0.0002275, + upstream_inference_cost: 0.0003253, + upstream_inference_prompt_cost: 9.78e-5, + }, + is_byok: true, + prompt_tokens: 326, + prompt_tokens_details: { cached_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0003253); + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBe(9.78e-5); + expect(result!.upstream_inference_completions_cost).toBe(0.0002275); + }); + + test('aliases upstream_inference_input/output_cost to prompt/completions (Responses API)', () => { + // Real response: OpenAI Responses API uses _input/_output suffix rather than _prompt/_completions + const usage = { + input_tokens: 78, + input_tokens_details: { cached_tokens: 0 }, + output_tokens: 37, + total_tokens: 115, + cost: 0.0000113, + is_byok: false, + cost_details: { + upstream_inference_cost: null, + upstream_inference_input_cost: 0.0000039, + upstream_inference_output_cost: 0.0000074, + }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0000113); + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBe(0.0000039); + expect(result!.upstream_inference_completions_cost).toBe(0.0000074); + }); + + test('uses input_cost/output_cost directly when present alongside upstream fields', () => { + // Real response: LLM Gateway includes both gateway fields (input_cost/output_cost/cached_input_cost) + // and upstream fields (upstream_inference_prompt/completions_cost); gateway fields take priority + const usage = { + cost: 0.022101624, + cost_details: { + total_cost: 0.022101624, + input_cost: 0.00073836, + output_cost: 0.00041184, + cached_input_cost: 0.020951424, + upstream_inference_prompt_cost: 0.021689784, + upstream_inference_completions_cost: 0.00041184, + }, + }; + + const result = extractUsageCostDetails(usage); + expect(result!.input_cost).toBe(0.00073836); + expect(result!.output_cost).toBe(0.00041184); + expect(result!.cached_input_cost).toBe(0.020951424); + }); + + test('returns null when cost is 0 and upstream_inference_cost is null (non-BYOK zero-cost)', () => { + // Real response: stream_error — non-BYOK request that genuinely cost $0. + // The || fallback in total cost detection causes 0 || null → null, so extract + // returns null. This is acceptable: zero-cost requests have nothing to report. + const usage = { + prompt_tokens: 43, + completion_tokens: 10, + total_tokens: 53, + cost: 0, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, + cost_details: { + upstream_inference_cost: null, + upstream_inference_prompt_cost: 0, + upstream_inference_completions_cost: 0, + }, + completion_tokens_details: { reasoning_tokens: 11, image_tokens: 0 }, + }; + + expect(extractUsageCostDetails(usage)).toBeNull(); + }); + + test('handles cost much larger than upstream sum (OpenRouter markup)', () => { + // Real response: file_annotation — OpenRouter's cost includes provider overhead/markup + // that is not reflected in the upstream_inference_prompt/completions_cost fields. + // cost ($0.00216775) is ~13x the upstream sum ($0.00016775). + const usage = { + completion_tokens: 80, + completion_tokens_details: { image_tokens: 0, reasoning_tokens: 64 }, + cost: 0.00216775, + cost_details: { + upstream_inference_completions_cost: 0.00016, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 7.75e-6, + }, + is_byok: false, + prompt_tokens: 31, + prompt_tokens_details: { audio_tokens: 0, cached_tokens: 0, video_tokens: 0 }, + total_tokens: 111, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + // total_cost comes from usage.cost (not upstream sum) + expect(result!.total_cost).toBe(0.00216775); + // upstream fields preserved separately + expect(result!.upstream_inference_prompt_cost).toBe(7.75e-6); + expect(result!.upstream_inference_completions_cost).toBe(0.00016); + // no gateway fields + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + }); + + test('handles zero prompt tokens with all cost on completions', () => { + // Real response: video_url_public_api — prompt_tokens=0, all cost on output side. + // upstream_inference_prompt_cost=0, upstream_inference_cost equals cost. + const usage = { + completion_tokens: 180, + completion_tokens_details: { image_tokens: 0, reasoning_tokens: 0 }, + cost: 0.00045, + cost_details: { + upstream_inference_completions_cost: 0.00045, + upstream_inference_cost: 0.00045, + upstream_inference_prompt_cost: 0, + }, + is_byok: false, + prompt_tokens: 0, + prompt_tokens_details: { + audio_tokens: 0, + cache_write_tokens: 0, + cached_tokens: 0, + video_tokens: 0, + }, + total_tokens: 180, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.00045); + expect(result!.upstream_inference_prompt_cost).toBe(0); + expect(result!.upstream_inference_completions_cost).toBe(0.00045); + }); + test('returns null for negative total_cost', () => { const usage = { cost_details: { @@ -263,40 +432,89 @@ describe('extractUsageCostDetails', () => { expect(extractUsageCostDetails(usage)).toBeNull(); }); + + test('captures usage.cost when cost_details block is absent (Kimi/Avian shape)', () => { + // Real response: Kimi-k2.5 via OpenRouter — usage.cost present but no cost_details block. + const usage = { + prompt_tokens: 154, + completion_tokens: 131, + total_tokens: 285, + cost: 0.0003287, + prompt_tokens_details: { cached_tokens: 128, cache_write_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + completion_tokens_details: { reasoning_tokens: 87, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0003287); + expect(result!.input_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBeNull(); + }); + + test('captures cost_in_usd_ticks when cost_details block is absent (xAI grok shape)', () => { + // Real response: xai-grok-4-fast — cost reported as integer ticks, no cost_details block. + // 1 USD = 10^10 ticks per xAI API docs. + const usage = { + prompt_tokens: 165, + completion_tokens: 2, + total_tokens: 296, + prompt_tokens_details: { text_tokens: 165, audio_tokens: 0, image_tokens: 0, cached_tokens: 164 }, + completion_tokens_details: { reasoning_tokens: 129, audio_tokens: 0, accepted_prediction_tokens: 0, rejected_prediction_tokens: 0 }, + num_sources_used: 0, + cost_in_usd_ticks: 739000, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBeCloseTo(739000 / 10_000_000_000, 10); + expect(result!.input_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBeNull(); + }); + + test('returns null when neither cost_details nor top-level cost fields are present', () => { + const usage = { + prompt_tokens: 100, + completion_tokens: 50, + total_tokens: 150, + }; + + expect(extractUsageCostDetails(usage)).toBeNull(); + }); }); describe('applyUsageCostDetails', () => { - test('overrides costs with provider cost_details breakdown', () => { + test('applies gateway input/output/cached costs directly when full breakdown is present', () => { const record = createUsageRecord(); + // Extracted from: glm-5.1 via LLM Gateway const costDetails: ProviderCostDetails = { - total_cost: 0.00017465, - input_cost: 0.00002415, - output_cost: 0.0001505, - cached_input_cost: 0, + total_cost: 0.022101624, + input_cost: 0.00073836, + output_cost: 0.00041184, + cached_input_cost: 0.020951424, cache_write_input_cost: 0, - upstream_inference_cost: 0.00017465, - upstream_inference_prompt_cost: 0.00002415, - upstream_inference_completions_cost: 0.0001505, + upstream_inference_cost: 0.022101624, + upstream_inference_prompt_cost: 0.021689784, + upstream_inference_completions_cost: 0.00041184, request_cost: 0, web_search_cost: 0, image_input_cost: null, image_output_cost: null, audio_input_cost: null, - data_storage_cost: 0.00000106, + data_storage_cost: null, }; applyUsageCostDetails(record, costDetails); - expect(record.costTotal).toBe(0.00017465); + expect(record.costTotal).toBeCloseTo(0.022101624, 8); expect(record.costSource).toBe('provider_reported'); - expect(record.providerReportedCost).toBe(0.00017465); - expect(record.costInput).toBe(0.00002415); - expect(record.costOutput).toBe(0.0001505); - expect(record.costCached).toBe(0); + expect(record.providerReportedCost).toBe(0.022101624); + expect(record.costInput).toBe(0.00073836); + expect(record.costOutput).toBe(0.00041184); + expect(record.costCached).toBeCloseTo(0.020951424, 8); expect(record.costCacheWrite).toBe(0); }); - test('falls back to proportional distribution when no breakdown available', () => { + test('falls back to proportional distribution when no cost breakdown available', () => { const record = createUsageRecord(); // costInput=0.001, costOutput=0.002, costCached=0.0005, total=0.0035 const costDetails: ProviderCostDetails = { @@ -325,7 +543,7 @@ describe('applyUsageCostDetails', () => { expect(record.costCached).toBeCloseTo((0.0005 / 0.0035) * 0.007, 8); }); - test('attributes full cost to input when no breakdown and no prior costs', () => { + test('attributes full cost to input when no cost breakdown and no prior costs', () => { const record = createUsageRecord({ costInput: 0, costOutput: 0, @@ -359,7 +577,184 @@ describe('applyUsageCostDetails', () => { expect(record.costCacheWrite).toBe(0); }); - test('uses partial breakdown — only input_cost provided', () => { + test('splits upstream prompt cost between input and cached using existing cost ratio', () => { + const record = createUsageRecord(); + // createUsageRecord defaults: costInput=0.001, costCached=0.0005 + // Prompt ratio: input=0.001/(0.001+0.0005)=2/3, cached=0.0005/(0.001+0.0005)=1/3 + // Extracted from: z-ai/glm-5-turbo-20260315 (cached_tokens=128/173 prompt tokens) + const costDetails: ProviderCostDetails = { + total_cost: 0.00021672, + input_cost: null, + output_cost: null, + cached_input_cost: null, + cache_write_input_cost: null, + upstream_inference_cost: 0.00021672, + upstream_inference_prompt_cost: 0.00008472, + upstream_inference_completions_cost: 0.000132, + request_cost: null, + web_search_cost: null, + image_input_cost: null, + image_output_cost: null, + audio_input_cost: null, + data_storage_cost: null, + }; + + applyUsageCostDetails(record, costDetails); + + expect(record.costTotal).toBe(0.00021672); + expect(record.costSource).toBe('provider_reported'); + expect(record.costOutput).toBe(0.000132); + // Prompt (0.00008472) split by record ratio: input=2/3, cached=1/3 + expect(record.costInput).toBeCloseTo((2 / 3) * 0.00008472, 8); + expect(record.costCached).toBeCloseTo((1 / 3) * 0.00008472, 8); + expect(record.costCacheWrite).toBe(0); + }); + + test('splits upstream prompt cost by ratio when upstream_inference_cost is null (heavy cache hit)', () => { + // Real response: x-ai/grok-4 via OpenRouter — 679/687 prompt tokens cached. + // upstream_inference_cost is null; total comes from usage.cost instead. + // Prior costs use token-proportional amounts: costInput=0.00008 (8 tokens), + // costCached=0.00679 (679 tokens), prevPromptTotal=0.00687. + const record = createUsageRecord({ + costInput: 0.00008, + costCached: 0.00679, + costCacheWrite: 0, + costTotal: 0.00687, + }); + const costDetails: ProviderCostDetails = { + total_cost: 0.00333825, + input_cost: null, + output_cost: null, + cached_input_cost: null, + cache_write_input_cost: null, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 0.00053325, + upstream_inference_completions_cost: 0.002805, + request_cost: null, + web_search_cost: null, + image_input_cost: null, + image_output_cost: null, + audio_input_cost: null, + data_storage_cost: null, + }; + + applyUsageCostDetails(record, costDetails); + + expect(record.costTotal).toBe(0.00333825); + expect(record.costSource).toBe('provider_reported'); + expect(record.costOutput).toBe(0.002805); + // Prompt (0.00053325) split by prior ratio: input=0.00008/0.00687, cached=0.00679/0.00687 + expect(record.costInput).toBeCloseTo((0.00008 / 0.00687) * 0.00053325, 8); + expect(record.costCached).toBeCloseTo((0.00679 / 0.00687) * 0.00053325, 8); + expect(record.costCacheWrite).toBe(0); + }); + + test('attributes full upstream prompt cost to input when no cached tokens', () => { + const record = createUsageRecord({ costCached: 0, costCacheWrite: 0, costTotal: 0.003 }); + // Extracted from: normal-tier (cached_tokens=0) + const costDetails: ProviderCostDetails = { + total_cost: 0.00435825, + input_cost: null, + output_cost: null, + cached_input_cost: null, + cache_write_input_cost: null, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 4.25e-6, + upstream_inference_completions_cost: 0.004354, + request_cost: null, + web_search_cost: null, + image_input_cost: null, + image_output_cost: null, + audio_input_cost: null, + data_storage_cost: null, + }; + + applyUsageCostDetails(record, costDetails); + + expect(record.costTotal).toBe(0.00435825); + expect(record.costOutput).toBe(0.004354); + expect(record.costInput).toBe(4.25e-6); + expect(record.costCached).toBe(0); + expect(record.costCacheWrite).toBe(0); + }); + + test('end-to-end BYOK: extract + apply uses upstream cost when usage.cost is 0', () => { + // Real response: google_nested_schema BYOK — cost=0, real cost in upstream_inference_cost. + // extractUsageCostDetails picks upstream_inference_cost as total; + // applyUsageCostDetails hits the normal-tier branch (no gateway fields, only upstream). + const usage = { + completion_tokens: 91, + cost: 0, + cost_details: { + upstream_inference_completions_cost: 0.0002275, + upstream_inference_cost: 0.0003253, + upstream_inference_prompt_cost: 9.78e-5, + }, + is_byok: true, + prompt_tokens: 326, + prompt_tokens_details: { cached_tokens: 0 }, + }; + + const extracted = extractUsageCostDetails(usage); + expect(extracted).not.toBeNull(); + expect(extracted!.total_cost).toBe(0.0003253); + + // Record has no prior cost breakdown (fresh record from a BYOK provider) + const record = createUsageRecord({ + costInput: 0, + costOutput: 0, + costCached: 0, + costCacheWrite: 0, + costTotal: 0, + }); + applyUsageCostDetails(record, extracted!); + + expect(record.costTotal).toBe(0.0003253); + expect(record.costSource).toBe('provider_reported'); + // Normal-tier: output from upstream, full prompt portion to input (no cached tokens in record) + expect(record.costOutput).toBe(0.0002275); + expect(record.costInput).toBe(9.78e-5); + expect(record.costCached).toBe(0); + expect(record.costCacheWrite).toBe(0); + }); + + test('end-to-end non-BYOK normal-tier: extract + apply', () => { + // Real response: usage.yaml second interaction — cost=0.00435825, only upstream fields. + // upstream_inference_cost is null (not BYOK), total comes from usage.cost. + const usage = { + completion_tokens: 2177, + cost: 0.00435825, + cost_details: { + upstream_inference_completions_cost: 0.004354, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 4.25e-6, + }, + is_byok: false, + prompt_tokens: 17, + prompt_tokens_details: { cached_tokens: 0 }, + }; + + const extracted = extractUsageCostDetails(usage); + expect(extracted).not.toBeNull(); + expect(extracted!.total_cost).toBe(0.00435825); + + // Record with no prior breakdown + const record = createUsageRecord({ + costInput: 0, + costOutput: 0, + costCached: 0, + costCacheWrite: 0, + costTotal: 0, + }); + applyUsageCostDetails(record, extracted!); + + expect(record.costTotal).toBe(0.00435825); + expect(record.costOutput).toBe(0.004354); + expect(record.costInput).toBe(4.25e-6); + expect(record.costCached).toBe(0); + }); + + test('uses partial gateway breakdown when only some per-bucket costs are available', () => { const record = createUsageRecord(); const costDetails: ProviderCostDetails = { total_cost: 0.005, @@ -395,15 +790,15 @@ describe('applyUsageCostDetails', () => { output_cost: 0.002, cached_input_cost: null, cache_write_input_cost: null, - upstream_inference_cost: null, - upstream_inference_prompt_cost: null, - upstream_inference_completions_cost: null, request_cost: null, web_search_cost: null, image_input_cost: null, image_output_cost: null, audio_input_cost: null, data_storage_cost: null, + upstream_inference_cost: null, + upstream_inference_prompt_cost: null, + upstream_inference_completions_cost: null, }; applyUsageCostDetails(record, costDetails); @@ -428,15 +823,15 @@ describe('applyUsageCostDetails', () => { output_cost: 0.0001505, cached_input_cost: 0, cache_write_input_cost: 0, - upstream_inference_cost: 0.00017465, - upstream_inference_prompt_cost: 0.00002415, - upstream_inference_completions_cost: 0.0001505, request_cost: 0, web_search_cost: 0, - image_input_cost: null, - image_output_cost: null, - audio_input_cost: null, - data_storage_cost: 0.00000106, + image_input_cost: 0, + image_output_cost: 0, + audio_input_cost: 0, + data_storage_cost: 0, + upstream_inference_cost: null, + upstream_inference_prompt_cost: null, + upstream_inference_completions_cost: null, }; applyUsageCostDetails(record, costDetails); @@ -456,15 +851,15 @@ describe('applyUsageCostDetails', () => { output_cost: 0, cached_input_cost: 0, cache_write_input_cost: 0, - upstream_inference_cost: 0, - upstream_inference_prompt_cost: 0, - upstream_inference_completions_cost: 0, request_cost: 0, web_search_cost: 0, - image_input_cost: null, - image_output_cost: null, - audio_input_cost: null, + image_input_cost: 0, + image_output_cost: 0, + audio_input_cost: 0, data_storage_cost: 0, + upstream_inference_cost: null, + upstream_inference_prompt_cost: null, + upstream_inference_completions_cost: null, }; applyUsageCostDetails(record, costDetails); @@ -475,6 +870,43 @@ describe('applyUsageCostDetails', () => { expect(record.costOutput).toBe(0); }); + test('falls back to proportional distribution when upstream costs are all zero (Vercel shape)', () => { + // Real response: Vercel AI Gateway — cost is non-zero but upstream_inference_* fields are + // all 0 (gateway doesn't pass through upstream cost breakdown). Without the > 0 guard, + // the Normal tier would fire and produce zero sub-costs despite total_cost being correct. + const record = createUsageRecord(); + // costInput=0.001, costOutput=0.002, costCached=0.0005, total=0.0035 + const costDetails: ProviderCostDetails = { + total_cost: 0.003561, + input_cost: null, + output_cost: null, + cached_input_cost: null, + cache_write_input_cost: null, + upstream_inference_cost: null, + upstream_inference_prompt_cost: 0, + upstream_inference_completions_cost: 0, + request_cost: null, + web_search_cost: null, + image_input_cost: null, + image_output_cost: null, + audio_input_cost: null, + data_storage_cost: null, + }; + + applyUsageCostDetails(record, costDetails); + + expect(record.costTotal).toBe(0.003561); + expect(record.costSource).toBe('provider_reported'); + // Should use Minimal tier (proportional distribution), not Normal tier (which would zero everything) + expect(record.costInput).toBeGreaterThan(0); + expect(record.costOutput).toBeGreaterThan(0); + expect(record.costCached).toBeGreaterThan(0); + // Proportional: input=1/3.5, output=2/3.5, cached=0.5/3.5 + expect(record.costInput).toBeCloseTo((0.001 / 0.0035) * 0.003561, 8); + expect(record.costOutput).toBeCloseTo((0.002 / 0.0035) * 0.003561, 8); + expect(record.costCached).toBeCloseTo((0.0005 / 0.0035) * 0.003561, 8); + }); + test('SSE : cost comments take precedence over cost_details', () => { const record = createUsageRecord(); // SSE comment cost applied first @@ -683,3 +1115,235 @@ describe('extractProviderEnergyFromSSEComments (via DebugLoggingInspector)', () expect(lastEnergy.energy_kwh).toBe(5.2904e-5); }); }); + +describe('extractUsageCostDetails - real-world cassette shapes', () => { + test('Vercel market_cost field does not interfere with cost extraction', () => { + // Vercel AI Gateway adds market_cost alongside cost and cost_details. + // The normalizer should extract cost as total and ignore market_cost. + const usage = { + prompt_tokens: 16, + completion_tokens: 33, + total_tokens: 49, + cost: 0.000543, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: null, + upstream_inference_prompt_cost: 0, + upstream_inference_completions_cost: 0, + }, + completion_tokens_details: { reasoning_tokens: 0, image_tokens: 0 }, + cache_creation_input_tokens: 0, + market_cost: 0.000543, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.000543); + // upstream_inference fields are both 0, so normal tier guard prevents zeroing + // Falls back to minimal tier (proportional). But there are no prior calculated costs. + }); + + test('Vercel GPT-5 with non-zero cost and zero upstream breakdown', () => { + const usage = { + prompt_tokens: 113, + completion_tokens: 327, + total_tokens: 440, + cost: 0.00597125, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: null, + upstream_inference_prompt_cost: 0, + upstream_inference_completions_cost: 0, + }, + completion_tokens_details: { reasoning_tokens: 256, image_tokens: 0 }, + cache_creation_input_tokens: 0, + market_cost: 0.00597125, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.00597125); + // upstream fields are 0, should NOT be used as breakdown (Vercel shape) + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBe(0); + expect(result!.upstream_inference_completions_cost).toBe(0); + }); + + test('OpenRouter Grok with cached tokens in prompt_tokens_details', () => { + // OpenRouter passes cached_tokens in prompt_tokens_details alongside cost_details. + const usage = { + prompt_tokens: 445, + completion_tokens: 278, + total_tokens: 723, + cost: 0.00020535, + is_byok: false, + prompt_tokens_details: { + cached_tokens: 151, + cache_write_tokens: 0, + audio_tokens: 0, + video_tokens: 0, + }, + cost_details: { + upstream_inference_cost: 0.00020535, + upstream_inference_prompt_cost: 0.00006635, + upstream_inference_completions_cost: 0.000139, + }, + completion_tokens_details: { reasoning_tokens: 210, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.00020535); + // upstream fields preserved separately (normal tier) + expect(result!.upstream_inference_prompt_cost).toBe(0.00006635); + expect(result!.upstream_inference_completions_cost).toBe(0.000139); + // No gateway-level input_cost/output_cost on OpenRouter + expect(result!.input_cost).toBeNull(); + expect(result!.output_cost).toBeNull(); + }); + + test('xAI grok-4-fast cost_in_usd_ticks with cached tokens', () => { + // xAI reports cost as cost_in_usd_ticks (no cost_details block). + const usage = { + prompt_tokens: 468, + completion_tokens: 82, + total_tokens: 870, + prompt_tokens_details: { + text_tokens: 468, + audio_tokens: 0, + image_tokens: 0, + cached_tokens: 305, + }, + completion_tokens_details: { + reasoning_tokens: 320, + audio_tokens: 0, + accepted_prediction_tokens: 0, + rejected_prediction_tokens: 0, + }, + num_sources_used: 0, + cost_in_usd_ticks: 2488500, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + // 2488500 / 10_000_000_000 = 0.00024885 + expect(result!.total_cost).toBeCloseTo(2488500 / 10_000_000_000, 10); + expect(result!.input_cost).toBeNull(); + }); + + test('Avian Kimi (via OpenRouter) with top-level cost and no cost_details', () => { + // Avian/Kimi reports cost at the top level but has no cost_details block. + const usage = { + prompt_tokens: 154, + completion_tokens: 131, + total_tokens: 285, + cost: 0.0003287, + prompt_tokens_details: { + cached_tokens: 128, + cache_write_tokens: 0, + audio_tokens: 0, + video_tokens: 0, + }, + completion_tokens_details: { reasoning_tokens: 87, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0003287); + expect(result!.input_cost).toBeNull(); + expect(result!.upstream_inference_prompt_cost).toBeNull(); + }); + + test('OpenRouter Anthropic Thinking with reasoning tokens', () => { + const usage = { + prompt_tokens: 607, + completion_tokens: 143, + total_tokens: 750, + cost: 0.001322, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, cache_write_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: 0.001322, + upstream_inference_prompt_cost: 0.000607, + upstream_inference_completions_cost: 0.000715, + }, + completion_tokens_details: { reasoning_tokens: 99, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.001322); + expect(result!.upstream_inference_cost).toBe(0.001322); + expect(result!.upstream_inference_prompt_cost).toBe(0.000607); + expect(result!.upstream_inference_completions_cost).toBe(0.000715); + }); + + test('OpenRouter Gemini with upstream fields matching total', () => { + const usage = { + prompt_tokens: 161, + completion_tokens: 32, + total_tokens: 193, + cost: 0.00008825, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, cache_write_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: 0.00008825, + upstream_inference_prompt_cost: 0.00004025, + upstream_inference_completions_cost: 0.000048, + }, + completion_tokens_details: { reasoning_tokens: 0, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.00008825); + expect(result!.upstream_inference_cost).toBe(0.00008825); + expect(result!.upstream_inference_prompt_cost).toBe(0.00004025); + expect(result!.upstream_inference_completions_cost).toBe(0.000048); + }); + + test('OpenRouter GLM with reasoning tokens', () => { + const usage = { + prompt_tokens: 279, + completion_tokens: 72, + total_tokens: 351, + cost: 0.0006228, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, cache_write_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: 0.0006228, + upstream_inference_prompt_cost: 0.0003348, + upstream_inference_completions_cost: 0.000288, + }, + completion_tokens_details: { reasoning_tokens: 25, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0006228); + }); + + test('OpenRouter OpenAI model with cached tokens and reasoning tokens', () => { + const usage = { + prompt_tokens: 113, + completion_tokens: 54, + total_tokens: 167, + cost: 0.0000901, + is_byok: false, + prompt_tokens_details: { cached_tokens: 0, cache_write_tokens: 0, audio_tokens: 0, video_tokens: 0 }, + cost_details: { + upstream_inference_cost: 0.0000901, + upstream_inference_prompt_cost: 0.0000226, + upstream_inference_completions_cost: 0.0000675, + }, + completion_tokens_details: { reasoning_tokens: 0, image_tokens: 0, audio_tokens: 0 }, + }; + + const result = extractUsageCostDetails(usage); + expect(result).not.toBeNull(); + expect(result!.total_cost).toBe(0.0000901); + }); +}); diff --git a/packages/backend/src/utils/__tests__/usage-normalizer.test.ts b/packages/backend/src/utils/__tests__/usage-normalizer.test.ts index a6668b63..7931a51f 100644 --- a/packages/backend/src/utils/__tests__/usage-normalizer.test.ts +++ b/packages/backend/src/utils/__tests__/usage-normalizer.test.ts @@ -3,10 +3,32 @@ import { normalizeGeminiUsage, normalizeOpenAIChatUsage, normalizeOpenAIResponsesUsage, + normalizeAnthropicUsage, extractUsageCostDetails, } from '../usage-normalizer'; describe('usage-normalizer - OpenAI Responses usage', () => { + test('normalizes multi-turn response with heavy cache hits and reasoning tokens', () => { + const normalized = normalizeOpenAIResponsesUsage({ + input_tokens: 9299, + output_tokens: 577, + total_tokens: 9876, + input_tokens_details: { + cached_tokens: 8448, + }, + output_tokens_details: { + reasoning_tokens: 512, + }, + }); + + expect(normalized.input_tokens).toBe(851); // 9299 - 8448 + expect(normalized.cached_tokens).toBe(8448); + expect(normalized.output_tokens).toBe(577); + expect(normalized.total_tokens).toBe(9876); + expect(normalized.reasoning_tokens).toBe(512); + expect(normalized.cache_creation_tokens).toBe(0); + }); + test('normalizes when input_tokens includes cached tokens', () => { const normalized = normalizeOpenAIResponsesUsage({ input_tokens: 2006, @@ -28,6 +50,29 @@ describe('usage-normalizer - OpenAI Responses usage', () => { expect(normalized.cache_creation_tokens).toBe(0); }); + test('extracts cache_write_tokens from input_tokens_details and subtracts from input', () => { + // OpenAI Responses API includes cache_write_tokens inside input_tokens. + // input_tokens = uncached_input + cached_tokens + cache_write_tokens + const normalized = normalizeOpenAIResponsesUsage({ + input_tokens: 1200, + output_tokens: 800, + total_tokens: 2000, + input_tokens_details: { + cached_tokens: 500, + cache_write_tokens: 200, + }, + output_tokens_details: { + reasoning_tokens: 0, + }, + }); + + expect(normalized.input_tokens).toBe(500); // 1200 - 500 - 200 + expect(normalized.cached_tokens).toBe(500); + expect(normalized.cache_creation_tokens).toBe(200); + expect(normalized.output_tokens).toBe(800); + expect(normalized.total_tokens).toBe(2000); + }); + test('preserves uncached input when cached_tokens exceeds input_tokens', () => { const normalized = normalizeOpenAIResponsesUsage({ input_tokens: 5233, @@ -127,6 +172,24 @@ describe('usage-normalizer - OpenAI Chat usage', () => { expect(normalized.reasoning_tokens).toBe(10); }); + test('normalizes DeepSeek top-level prompt_cache_hit_tokens / prompt_cache_miss_tokens', () => { + // DeepSeek reports cache at the top level instead of under prompt_tokens_details. + // prompt_tokens = hit + miss; input_tokens should be the miss (uncached) portion. + const normalized = normalizeOpenAIChatUsage({ + prompt_tokens: 1000, + completion_tokens: 200, + total_tokens: 1200, + prompt_cache_hit_tokens: 800, + prompt_cache_miss_tokens: 200, + }); + + expect(normalized.cached_tokens).toBe(800); + expect(normalized.input_tokens).toBe(200); + expect(normalized.output_tokens).toBe(200); + expect(normalized.total_tokens).toBe(1200); + expect(normalized.cache_creation_tokens).toBe(0); + }); + test('defaults cache_write_tokens to 0 when not present', () => { const normalized = normalizeOpenAIChatUsage({ prompt_tokens: 100, @@ -174,3 +237,286 @@ describe('usage-normalizer - OpenAI Chat usage', () => { expect(normalized.total_tokens).toBe(66); }); }); + +describe('usage-normalizer - Anthropic usage', () => { + test('normalizes basic Anthropic non-streaming usage', () => { + const normalized = normalizeAnthropicUsage({ + input_tokens: 16, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + output_tokens: 34, + }); + + expect(normalized.input_tokens).toBe(16); + expect(normalized.output_tokens).toBe(34); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + expect(normalized.total_tokens).toBe(50); + expect(normalized.reasoning_tokens).toBe(0); + }); + + test('tolerates extra Anthropic non-streaming fields (cache_creation, service_tier, inference_geo)', () => { + // Non-streaming Anthropic responses include cache_creation (nested), service_tier, inference_geo. + // The normalizer must ignore these without erroring. + const normalized = normalizeAnthropicUsage({ + input_tokens: 424, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + cache_creation: { ephemeral_5m_input_tokens: 0, ephemeral_1h_input_tokens: 0 }, + output_tokens: 118, + service_tier: 'standard', + inference_geo: 'not_available', + }); + + expect(normalized.input_tokens).toBe(424); + expect(normalized.output_tokens).toBe(118); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + expect(normalized.total_tokens).toBe(542); + expect(normalized.reasoning_tokens).toBe(0); + }); + + test('tolerates server_tool_use in streaming usage (web search)', () => { + // Anthropic web search adds server_tool_use to usage. Must not break normalization. + const normalized = normalizeAnthropicUsage({ + input_tokens: 13520, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + output_tokens: 415, + server_tool_use: { web_search_requests: 1, web_fetch_requests: 0 }, + }); + + expect(normalized.input_tokens).toBe(13520); + expect(normalized.output_tokens).toBe(415); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + expect(normalized.total_tokens).toBe(13935); + }); + + test('tolerates iterations array from compaction feature', () => { + // New Anthropic compaction feature adds iterations array with per-step breakdowns. + const normalized = normalizeAnthropicUsage({ + input_tokens: 172, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + output_tokens: 5, + server_tool_use: { web_search_requests: 0, web_fetch_requests: 0 }, + iterations: [ + { + input_tokens: 100, + output_tokens: 71, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 55096, + cache_creation: { ephemeral_5m_input_tokens: 55096, ephemeral_1h_input_tokens: 0 }, + type: 'compaction', + }, + { + input_tokens: 172, + output_tokens: 5, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + cache_creation: { ephemeral_5m_input_tokens: 0, ephemeral_1h_input_tokens: 0 }, + type: 'message', + }, + ], + }); + + expect(normalized.input_tokens).toBe(172); + expect(normalized.output_tokens).toBe(5); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + expect(normalized.total_tokens).toBe(177); + }); + + test('normalizes non-zero cache tokens from Anthropic', () => { + // Validates cache_read_input_tokens and cache_creation_input_tokens are both extracted. + const normalized = normalizeAnthropicUsage({ + input_tokens: 22397, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + output_tokens: 637, + server_tool_use: { web_search_requests: 2, web_fetch_requests: 0 }, + }); + + expect(normalized.input_tokens).toBe(22397); + expect(normalized.output_tokens).toBe(637); + expect(normalized.total_tokens).toBe(23034); + expect(normalized.reasoning_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + }); +}); + +describe('usage-normalizer - Gemini usage (additional real-world shapes)', () => { + test('normalizes usage with toolUsePromptTokenCount (web fetch tool)', () => { + // toolUsePromptTokenCount is added to totalTokenCount by Gemini. + const normalized = normalizeGeminiUsage({ + promptTokenCount: 32, + candidatesTokenCount: 41, + totalTokenCount: 2515, + promptTokensDetails: [{ modality: 'TEXT', tokenCount: 32 }], + toolUsePromptTokenCount: 2395, + toolUsePromptTokensDetails: [{ modality: 'TEXT', tokenCount: 2395 }], + thoughtsTokenCount: 47, + }); + + expect(normalized.input_tokens).toBe(32); + expect(normalized.output_tokens).toBe(41); + expect(normalized.reasoning_tokens).toBe(47); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.cache_creation_tokens).toBe(0); + // totalTokenCount is used directly, not recomputed + expect(normalized.total_tokens).toBe(2515); + }); + + test('tolerates candidatesTokensDetails array (image generation)', () => { + // promptTokensDetails/candidatesTokensDetails are modality arrays the normalizer ignores. + const normalized = normalizeGeminiUsage({ + promptTokenCount: 3362, + candidatesTokenCount: 9, + totalTokenCount: 3371, + promptTokensDetails: [ + { modality: 'IMAGE', tokenCount: 3354 }, + { modality: 'TEXT', tokenCount: 8 }, + ], + candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }], + }); + + expect(normalized.input_tokens).toBe(3362); + expect(normalized.output_tokens).toBe(9); + expect(normalized.reasoning_tokens).toBe(0); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.total_tokens).toBe(3371); + }); + + test('tolerates trafficType from Vertex AI (flex tier)', () => { + // Vertex AI adds trafficType (ON_DEMAND, ON_DEMAND_FLEX) to usageMetadata. + const normalized = normalizeGeminiUsage({ + promptTokenCount: 155, + candidatesTokenCount: 13, + totalTokenCount: 168, + trafficType: 'ON_DEMAND', + promptTokensDetails: [{ modality: 'TEXT', tokenCount: 155 }], + candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 13 }], + }); + + expect(normalized.input_tokens).toBe(155); + expect(normalized.output_tokens).toBe(13); + expect(normalized.total_tokens).toBe(168); + expect(normalized.reasoning_tokens).toBe(0); + }); +}); + +describe('usage-normalizer - OpenAI Chat usage (additional real-world shapes)', () => { + test('tolerates HuggingFace top-level cached_tokens outside prompt_tokens_details', () => { + // HuggingFace Inference Providers report cached_tokens at the top level, + // not nested under prompt_tokens_details. + const normalized = normalizeOpenAIChatUsage({ + prompt_tokens: 10, + completion_tokens: 955, + total_tokens: 965, + cached_tokens: 0, + }); + + expect(normalized.input_tokens).toBe(10); + expect(normalized.output_tokens).toBe(955); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.total_tokens).toBe(965); + }); + + test('tolerates Groq timing fields in usage', () => { + // Groq adds queue_time, prompt_time, completion_time, total_time to usage. + const normalized = normalizeOpenAIChatUsage({ + queue_time: 0.200019293, + prompt_tokens: 201, + prompt_time: 0.022569048, + completion_tokens: 58, + completion_time: 0.168140587, + total_tokens: 259, + total_time: 0.190709635, + }); + + expect(normalized.input_tokens).toBe(201); + expect(normalized.output_tokens).toBe(58); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.total_tokens).toBe(259); + }); + + test('tolerates DeepSeek prompt_cache_hit_tokens alongside prompt_tokens_details', () => { + // DeepSeek provides prompt_cache_hit_tokens both at top-level AND under prompt_tokens_details. + // prompt_tokens_details.cached_tokens takes priority (first in the ?? chain). + const normalized = normalizeOpenAIChatUsage({ + prompt_tokens: 6, + completion_tokens: 212, + total_tokens: 218, + prompt_tokens_details: { cached_tokens: 0 }, + completion_tokens_details: { reasoning_tokens: 198 }, + prompt_cache_hit_tokens: 0, + prompt_cache_miss_tokens: 6, + }); + + expect(normalized.input_tokens).toBe(6); + expect(normalized.output_tokens).toBe(212); + expect(normalized.reasoning_tokens).toBe(198); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.total_tokens).toBe(218); + }); + + test('tolerates Azure Grok extra fields (audio_prompt_tokens, num_sources_used, image_tokens)', () => { + // Azure-hosted Grok adds audio_prompt_tokens, num_sources_used, and + // image_tokens/text_tokens subfields in prompt_tokens_details. + const normalized = normalizeOpenAIChatUsage({ + audio_prompt_tokens: 0, + completion_tokens: 27, + completion_tokens_details: { + accepted_prediction_tokens: 0, + audio_tokens: 0, + reasoning_tokens: 379, + rejected_prediction_tokens: 0, + }, + num_sources_used: 0, + prompt_tokens: 288, + prompt_tokens_details: { + audio_tokens: 0, + cached_tokens: 0, + image_tokens: 0, + text_tokens: 288, + }, + total_tokens: 694, + }); + + expect(normalized.input_tokens).toBe(288); + expect(normalized.output_tokens).toBe(27); + expect(normalized.cached_tokens).toBe(0); + expect(normalized.reasoning_tokens).toBe(379); + expect(normalized.total_tokens).toBe(694); + }); + + test('tolerates xAI cost_in_usd_ticks in usage (ignored for normalization)', () => { + // xAI adds cost_in_usd_ticks (pricing data) — normalizer should ignore it. + const normalized = normalizeOpenAIChatUsage({ + prompt_tokens: 436, + completion_tokens: 68, + total_tokens: 652, + prompt_tokens_details: { + text_tokens: 436, + audio_tokens: 0, + image_tokens: 0, + cached_tokens: 152, + }, + completion_tokens_details: { + reasoning_tokens: 148, + audio_tokens: 0, + accepted_prediction_tokens: 0, + rejected_prediction_tokens: 0, + }, + num_sources_used: 0, + cost_in_usd_ticks: 1724000, + }); + + expect(normalized.input_tokens).toBe(284); // 436 - 152 + expect(normalized.cached_tokens).toBe(152); + expect(normalized.output_tokens).toBe(68); + expect(normalized.reasoning_tokens).toBe(148); + expect(normalized.total_tokens).toBe(652); + }); +}); diff --git a/packages/backend/src/utils/provider-cost.ts b/packages/backend/src/utils/provider-cost.ts index e98b74b1..ac6f7eb7 100644 --- a/packages/backend/src/utils/provider-cost.ts +++ b/packages/backend/src/utils/provider-cost.ts @@ -90,20 +90,57 @@ export function applyUsageCostDetails( usageRecord.costSource = 'provider_reported'; usageRecord.providerReportedCost = totalCost; - // Use the detailed cost breakdown when available + // Three tiers of provider cost reporting: + // 1. Superset: explicit per-bucket breakdown (input_cost, output_cost, cached_input_cost, cache_write_input_cost) + // 2. Normal: upstream_inference_prompt_cost/completions_cost split, but no cache granularity + // 3. Minimal: no breakdown at all — distribute proportionally from previously calculated costs + const inputCost = costDetails.input_cost; const outputCost = costDetails.output_cost; const cachedCost = costDetails.cached_input_cost; const cacheWriteCost = costDetails.cache_write_input_cost; - if (inputCost !== null || outputCost !== null || cachedCost !== null || cacheWriteCost !== null) { - // Provider gave us an explicit per-bucket breakdown — use it directly + if (inputCost !== null || cachedCost !== null || cacheWriteCost !== null) { + // Superset: provider gave us an explicit per-bucket breakdown — use it directly + // Note: output_cost alone being non-null is not enough to identify superset; + // it's also reported by normal-tier as upstream_inference_completions_cost. + // Check the input-side fields (which normal-tier does not report separately). usageRecord.costInput = Number((inputCost ?? 0).toFixed(8)); usageRecord.costOutput = Number((outputCost ?? 0).toFixed(8)); usageRecord.costCached = Number((cachedCost ?? 0).toFixed(8)); usageRecord.costCacheWrite = Number((cacheWriteCost ?? 0).toFixed(8)); + } else if ( + (costDetails.upstream_inference_prompt_cost != null && + costDetails.upstream_inference_prompt_cost > 0) || + (costDetails.upstream_inference_completions_cost != null && + costDetails.upstream_inference_completions_cost > 0) + ) { + // Normal: upstream gave us prompt vs completions split, but no cache granularity. + // Use the upstream split for the input vs output totals, then preserve Plexus's + // own calculated ratio within the prompt portion for cache/non-cache distribution. + const promptTotal = costDetails.upstream_inference_prompt_cost ?? 0; + const completionsTotal = costDetails.upstream_inference_completions_cost ?? 0; + + usageRecord.costOutput = Number((completionsTotal ?? 0).toFixed(8)); + + // Split the prompt portion by Plexus's own input/cached/cacheWrite ratio + const prevInput = usageRecord.costInput || 0; + const prevCached = usageRecord.costCached || 0; + const prevCacheWrite = usageRecord.costCacheWrite || 0; + const prevPromptTotal = prevInput + prevCached + prevCacheWrite; + + if (prevPromptTotal > 0) { + usageRecord.costInput = Number(((prevInput / prevPromptTotal) * promptTotal).toFixed(8)); + usageRecord.costCached = Number(((prevCached / prevPromptTotal) * promptTotal).toFixed(8)); + usageRecord.costCacheWrite = Number(((prevCacheWrite / prevPromptTotal) * promptTotal).toFixed(8)); + } else { + // No prior breakdown — attribute full prompt cost to input + usageRecord.costInput = Number(promptTotal.toFixed(8)); + usageRecord.costCached = 0; + usageRecord.costCacheWrite = 0; + } } else { - // No breakdown — distribute proportionally like we do for SSE `: cost` comments + // Minimal: no breakdown — distribute proportionally from previously calculated costs const prevInputCost = usageRecord.costInput || 0; const prevOutputCost = usageRecord.costOutput || 0; const prevCachedCost = usageRecord.costCached || 0; diff --git a/packages/backend/src/utils/usage-normalizer.ts b/packages/backend/src/utils/usage-normalizer.ts index 714bcde8..c686f5e6 100644 --- a/packages/backend/src/utils/usage-normalizer.ts +++ b/packages/backend/src/utils/usage-normalizer.ts @@ -51,21 +51,70 @@ export interface UsageWithCostDetails extends UsageSubset { */ export function extractUsageCostDetails(usage: any): ProviderCostDetails | null { const details = usage?.cost_details; - if (!details || typeof details !== 'object') return null; - // Validate that at least one cost field is a valid number - const totalCost = safeCost(details.total_cost ?? usage?.cost ?? usage?.estimated_cost); + if (!details || typeof details !== 'object') { + // No cost_details block — check top-level cost fields from providers that omit cost_details: + // - usage.cost: OpenRouter-routed providers (e.g. Kimi/Avian) that don't surface cost_details + // - usage.cost_in_usd_ticks: xAI grok models; 1 USD = 10^10 ticks per xAI API docs. + const topLevelCost = safeCost(usage?.cost ?? usage?.estimated_cost); + const xaiTicks = + typeof usage?.cost_in_usd_ticks === 'number' + ? safeCost(usage.cost_in_usd_ticks / 10_000_000_000) + : null; + const totalCost = topLevelCost || xaiTicks; + if (totalCost === null) return null; + + return { + total_cost: totalCost, + input_cost: null, + output_cost: null, + cached_input_cost: null, + cache_write_input_cost: null, + upstream_inference_cost: null, + upstream_inference_prompt_cost: null, + upstream_inference_completions_cost: null, + request_cost: null, + web_search_cost: null, + image_input_cost: null, + image_output_cost: null, + audio_input_cost: null, + data_storage_cost: null, + }; + } + + // Determine total cost: + // 1. cost_details.total_cost + // 2. usage.cost or usage.estimated_cost (standard path) + // 3. cost_details.upstream_inference_cost (OpenRouter quirk) + let totalCost = safeCost(details.total_cost); + + const costFromUsage = safeCost(usage?.cost ?? usage?.estimated_cost); + const upstreamInferenceCost = safeCost(details.upstream_inference_cost); + + if (totalCost === null) { + // || not ?? — BYOK keys report usage.cost=0 (Plexus charges nothing), so a + // falsy 0 should fall through to upstreamInferenceCost which carries the + // actual provider cost. + totalCost = costFromUsage || upstreamInferenceCost; + } if (totalCost === null) return null; return { total_cost: totalCost, - input_cost: safeCost(details.input_cost ?? details.upstream_inference_prompt_cost), - output_cost: safeCost(details.output_cost ?? details.upstream_inference_completions_cost), + // upstream_inference_prompt_cost includes cached tokens (input_cost + cached_input_cost), + // so it can't be mapped directly to input_cost. The upstream fields are preserved + // here and dispatched separately in applyUsageCostDetails(). + input_cost: safeCost(details.input_cost), + output_cost: safeCost(details.output_cost), cached_input_cost: safeCost(details.cached_input_cost), cache_write_input_cost: safeCost(details.cache_write_input_cost), upstream_inference_cost: safeCost(details.upstream_inference_cost), - upstream_inference_prompt_cost: safeCost(details.upstream_inference_prompt_cost), - upstream_inference_completions_cost: safeCost(details.upstream_inference_completions_cost), + upstream_inference_prompt_cost: safeCost( + details.upstream_inference_prompt_cost ?? details.upstream_inference_input_cost + ), + upstream_inference_completions_cost: safeCost( + details.upstream_inference_completions_cost ?? details.upstream_inference_output_cost + ), request_cost: safeCost(details.request_cost), web_search_cost: safeCost(details.web_search_cost), image_input_cost: safeCost(details.image_input_cost), @@ -78,7 +127,9 @@ export function extractUsageCostDetails(usage: any): ProviderCostDetails | null export function normalizeOpenAIChatUsage(usage: any): UsageSubset { const promptTokens = safeToken(usage?.prompt_tokens); const cachedTokens = safeToken( - usage?.prompt_tokens_details?.cached_tokens ?? usage?.cached_tokens + usage?.prompt_tokens_details?.cached_tokens ?? + usage?.cached_tokens ?? + usage?.prompt_cache_hit_tokens ); const cacheWriteTokens = safeToken(usage?.prompt_tokens_details?.cache_write_tokens); const outputTokens = safeToken(usage?.completion_tokens); @@ -101,24 +152,28 @@ export function normalizeOpenAIChatUsage(usage: any): UsageSubset { export function normalizeOpenAIResponsesUsage(usage: any): UsageSubset { const reportedInputTokens = safeToken(usage?.input_tokens); const cachedTokens = safeToken(usage?.input_tokens_details?.cached_tokens); + const cacheWriteTokens = safeToken(usage?.input_tokens_details?.cache_write_tokens); const outputTokens = safeToken(usage?.output_tokens); const reasoningTokens = safeToken(usage?.output_tokens_details?.reasoning_tokens); + // Responses API input_tokens includes cached reads and cache writes. // Responses payloads may appear in two shapes depending on source: - // - total input tokens with cached included - // - uncached input tokens with cached reported separately + // - total input tokens with cached/write included → subtract both + // - uncached input tokens with cached/write reported separately → keep as-is + const combinedNonNew = cachedTokens + cacheWriteTokens; const inputTokens = - cachedTokens > reportedInputTokens + combinedNonNew > reportedInputTokens ? reportedInputTokens - : Math.max(0, reportedInputTokens - cachedTokens); + : Math.max(0, reportedInputTokens - combinedNonNew); return { input_tokens: inputTokens, output_tokens: outputTokens, - total_tokens: safeToken(usage?.total_tokens) || inputTokens + cachedTokens + outputTokens, + total_tokens: + safeToken(usage?.total_tokens) || inputTokens + cachedTokens + cacheWriteTokens + outputTokens, reasoning_tokens: reasoningTokens, cached_tokens: cachedTokens, - cache_creation_tokens: 0, + cache_creation_tokens: cacheWriteTokens, }; }