From df35b0884411c0dbf723e8e958dcbfe80d861fcc Mon Sep 17 00:00:00 2001 From: Fsocietyhhh <1211904451@qq.com> Date: Mon, 11 May 2026 17:38:15 -0700 Subject: [PATCH] fix(tokens): image blocks as flat ~1500 tokens; compute real contextUsagePct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related issues that combined to make /context and the renderer's context-window ring wildly inaccurate on image-bearing sessions: 1. estimateContentPartTokens: when tool_result.content was an array containing an image block, the whole array was JSON.stringify-ed and tokenized as text. That counted the base64 `data` field literally, so a single ~100KB normalized image was estimated at ~70k tokens (Anthropic actually bills (w*h)/750 ≈ 1100-1500). One image read pushed a fresh session's /context to 35%+ and could trigger spurious /compact loops. Fix: walk the content array block-by-block. Text blocks count as text; image blocks count as a flat 1500 tokens (close enough to Anthropic's real billing for normalized images — Read tool caps long edge so most images land near 1024x768 ≈ 1050 tokens). Unknown block types still stringify, but with `source.data` redacted to '' to prevent the same blow-up. 2. getAnchoredTokenCount: returned `contextUsagePct: 0` hardcoded on both return paths. The agent loop emits this verbatim via `kind: 'usage'` events, so the desktop/extension renderer's context ring sat at 0% regardless of how full the model's context actually was. `/context` was unaffected because the CLI command re-derives pct from `estimated` itself. Fix: compute (estimated / contextWindow) * 100, using the current model's window from getContextWindow(_currentModel). 3. loop.ts emitted contextPct rounded to an integer. A fresh session at 0.4% rounded to 0 and froze the renderer's ring. Match /context's `.toFixed(1)` fidelity by keeping one decimal place. 
Verified locally: a 4-message session with one ~100KB image read now shows ~1.9k / 200k (1.0%) on /context, matching Anthropic's true input token count within ~5%. Pre-fix the same session showed ~75k / 200k (37.8%). --- src/agent/loop.ts | 6 +++++- src/agent/tokens.ts | 49 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/agent/loop.ts b/src/agent/loop.ts index a69604a0..32e0e63c 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1732,7 +1732,11 @@ export async function interactiveSession( tier: routingTier, confidence: routingConfidence, savings: routingSavings, - contextPct: Math.round(contextUsagePct), + // Preserve sub-1% precision: a fresh session at 0.4% would + // round to 0 and freeze the renderer's context ring until the + // conversation grows past ~1k tokens. Match `/context`'s + // `.toFixed(1)` fidelity. + contextPct: Math.round(contextUsagePct * 10) / 10, }); // Record usage for stats tracking (franklin stats command). diff --git a/src/agent/tokens.ts b/src/agent/tokens.ts index 88b8bd83..06aed5ae 100644 --- a/src/agent/tokens.ts +++ b/src/agent/tokens.ts @@ -58,6 +58,12 @@ export function getAnchoredTokenCount(history: Dialogue[]): { apiAnchored: boolean; contextUsagePct: number; } { + // The model that just billed input — used as the denominator below. + // _currentModel is set per-turn by setEstimationModel(), so it reflects + // whatever the router actually resolved (not just config.model, which + // may be a routing profile like blockrun/auto). + const contextWindow = _currentModel ? getContextWindow(_currentModel) : 200_000; + if (lastApiInputTokens > 0 && lastApiMessageCount > 0 && history.length >= lastApiMessageCount) { // Sanity check: if history was mutated (compaction, micro-compact), anchor may be stale. // Detect by checking if new messages were only appended (length grew), not if content changed. 
@@ -73,7 +79,7 @@ export function getAnchoredTokenCount(history: Dialogue[]): { return { estimated: total, apiAnchored: true, - contextUsagePct: 0, + contextUsagePct: (total / contextWindow) * 100, }; } // Too much growth — anchor is unreliable, fall through to estimation @@ -81,10 +87,11 @@ export function getAnchoredTokenCount(history: Dialogue[]): { } // No anchor — pure estimation + const est = estimateHistoryTokens(history); return { - estimated: estimateHistoryTokens(history), + estimated: est, apiAnchored: false, - contextUsagePct: 0, + contextUsagePct: (est / contextWindow) * 100, }; } @@ -133,10 +140,38 @@ function estimateContentPartTokens(part: ContentPart | UserContentPart): number // +16 tokens for tool_use framing (type, id, name fields, JSON structure) return 16 + estimateTokens(part.name) + estimateTokens(JSON.stringify(part.input), 2); case 'tool_result': { - const content = typeof part.content === 'string' - ? part.content - : JSON.stringify(part.content); - return estimateTokens(content, 2); + // String content: count as text directly. + if (typeof part.content === 'string') { + return estimateTokens(part.content, 2); + } + // Array content: sum block-by-block. CRITICAL: image blocks must + // NOT go through JSON.stringify — their base64 `data` field would + // be tokenized as text (a 100KB image → ~70k phantom tokens), + // which is what made the context ring read ~86% on a 2-image chat + // and triggered premature /compact loops. Anthropic actually + // bills (w*h)/750 per image, ≈1100-1500 for typical sizes; a flat + // 1500-token estimate is close enough without needing to decode + // the image dimensions client-side. + let total = 0; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const blocks = part.content as any[]; + for (const block of blocks) { + const blockType = block?.type; + if (blockType === 'text') { + total += estimateTokens((block?.text as string) ?? 
'', 2); + } else if (blockType === 'image') { + total += 1500; + } else { + // Unknown block — stringify minus any nested base64 data field + // to avoid the same blow-up for future block kinds. + const sanitized = { ...block }; + if (sanitized?.source && typeof sanitized.source === 'object' && sanitized.source.data) { + sanitized.source = { ...sanitized.source, data: '' }; + } + total += estimateTokens(JSON.stringify(sanitized), 2); + } + } + return total; } case 'thinking': return estimateTokens(part.thinking);