From df35b0884411c0dbf723e8e958dcbfe80d861fcc Mon Sep 17 00:00:00 2001 From: Fsocietyhhh <1211904451@qq.com> Date: Mon, 11 May 2026 17:38:15 -0700 Subject: [PATCH] fix(tokens): image blocks as flat ~1500 tokens; compute real contextUsagePct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related issues that combined to make /context and the renderer's context-window ring wildly inaccurate on image-bearing sessions: 1. estimateContentPartTokens: when tool_result.content was an array containing an image block, the whole array was JSON.stringify-ed and tokenized as text. That counted the base64 `data` field literally, so a single ~100KB normalized image was estimated at ~70k tokens (Anthropic actually bills (w*h)/750 ≈ 1100-1500). One image read pushed a fresh session's /context to 35%+ and could trigger spurious /compact loops. Fix: walk the content array block-by-block. Text blocks count as text; image blocks count as a flat 1500 tokens (close enough to Anthropic's real billing for normalized images — Read tool caps long edge so most images land near 1024x768 ≈ 1050 tokens). Unknown block types still stringify, but with `source.data` redacted to '' to prevent the same blow-up. 2. getAnchoredTokenCount: returned `contextUsagePct: 0` hardcoded on both return paths. The agent loop emits this verbatim via `kind: 'usage'` events, so the desktop/extension renderer's context ring sat at 0% regardless of how full the model's context actually was. `/context` was unaffected because the CLI command re-derives pct from `estimated` itself. Fix: compute (estimated / contextWindow) * 100, using the current model's window from getContextWindow(_currentModel). 3. loop.ts emitted contextPct rounded to an integer. A fresh session at 0.4% rounded to 0 and froze the renderer's ring. Match /context's `.toFixed(1)` fidelity by keeping one decimal place. 
Verified locally: a 4-message session with one ~100KB image read now shows ~1.9k / 200k (1.0%) on /context, matching Anthropic's true input token count within ~5%. Pre-fix the same session showed ~75k / 200k (37.8%). --- src/agent/loop.ts | 6 +++++- src/agent/tokens.ts | 49 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/agent/loop.ts b/src/agent/loop.ts index a69604a0..32e0e63c 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -1732,7 +1732,11 @@ export async function interactiveSession( tier: routingTier, confidence: routingConfidence, savings: routingSavings, - contextPct: Math.round(contextUsagePct), + // Preserve sub-1% precision: a fresh session at 0.4% would + // round to 0 and freeze the renderer's context ring until the + // conversation grows past ~1k tokens. Match `/context`'s + // `.toFixed(1)` fidelity. + contextPct: Math.round(contextUsagePct * 10) / 10, }); // Record usage for stats tracking (franklin stats command). diff --git a/src/agent/tokens.ts b/src/agent/tokens.ts index 88b8bd83..06aed5ae 100644 --- a/src/agent/tokens.ts +++ b/src/agent/tokens.ts @@ -58,6 +58,12 @@ export function getAnchoredTokenCount(history: Dialogue[]): { apiAnchored: boolean; contextUsagePct: number; } { + // The model that just billed input — used as the denominator below. + // _currentModel is set per-turn by setEstimationModel(), so it reflects + // whatever the router actually resolved (not just config.model, which + // may be a routing profile like blockrun/auto). + const contextWindow = _currentModel ? getContextWindow(_currentModel) : 200_000; + if (lastApiInputTokens > 0 && lastApiMessageCount > 0 && history.length >= lastApiMessageCount) { // Sanity check: if history was mutated (compaction, micro-compact), anchor may be stale. // Detect by checking if new messages were only appended (length grew), not if content changed. 
@@ -73,7 +79,7 @@ export function getAnchoredTokenCount(history: Dialogue[]): { return { estimated: total, apiAnchored: true, - contextUsagePct: 0, + contextUsagePct: (total / contextWindow) * 100, }; } // Too much growth — anchor is unreliable, fall through to estimation @@ -81,10 +87,11 @@ export function getAnchoredTokenCount(history: Dialogue[]): { } // No anchor — pure estimation + const est = estimateHistoryTokens(history); return { - estimated: estimateHistoryTokens(history), + estimated: est, apiAnchored: false, - contextUsagePct: 0, + contextUsagePct: (est / contextWindow) * 100, }; } @@ -133,10 +140,38 @@ function estimateContentPartTokens(part: ContentPart | UserContentPart): number // +16 tokens for tool_use framing (type, id, name fields, JSON structure) return 16 + estimateTokens(part.name) + estimateTokens(JSON.stringify(part.input), 2); case 'tool_result': { - const content = typeof part.content === 'string' - ? part.content - : JSON.stringify(part.content); - return estimateTokens(content, 2); + // String content: count as text directly. + if (typeof part.content === 'string') { + return estimateTokens(part.content, 2); + } + // Array content: sum block-by-block. CRITICAL: image blocks must + // NOT go through JSON.stringify — their base64 `data` field would + // be tokenized as text (a 100KB image → ~70k phantom tokens), + // which is what made the context ring read ~86% on a 2-image chat + // and triggered premature /compact loops. Anthropic actually + // bills (w*h)/750 per image, ≈1100-1500 for typical sizes; a flat + // 1500-token estimate is close enough without needing to decode + // the image dimensions client-side. + let total = 0; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const blocks = part.content as any[]; + for (const block of blocks) { + const blockType = block?.type; + if (blockType === 'text') { + total += estimateTokens((block?.text as string) ?? 
'', 2); + } else if (blockType === 'image') { + total += 1500; + } else { + // Unknown block — stringify minus any nested base64 data field + // to avoid the same blow-up for future block kinds. + const sanitized = { ...block }; + if (sanitized?.source && typeof sanitized.source === 'object' && sanitized.source.data) { + sanitized.source = { ...sanitized.source, data: '' }; + } + total += estimateTokens(JSON.stringify(sanitized), 2); + } + } + return total; } case 'thinking': return estimateTokens(part.thinking);