From d9b83c1c70c7afc06d90146fddddd35f03bbc68c Mon Sep 17 00:00:00 2001 From: Aaron Yourk Date: Sat, 11 Apr 2026 16:59:53 -0600 Subject: [PATCH] Fix shell execution degradation and session lock corruption Long conversations cause context window overflow, making the model output tool calls as text instead of using the tool API. This adds token-aware context limiting, output streaming, fake tool call sanitization, and improved context pruning to prevent the feedback loop. Session locks left behind by crashes leave sessions permanently stuck in "running" state. This adds automatic stale lock cleanup on startup and a force-release endpoint for the UI. Shell execution: - Validate working directory before spawning (bash.ts) - Stream tool output via SSE as it arrives (bash.ts) - Reduce message history when context exceeds 75% of window (agent-loop.ts) - Fix doom loop detection with proper counting (agent-loop.ts) - Keep 16 recent turns instead of 6, preserve tool excerpts (context-pruner.ts) - Sanitize fake tool call patterns from compacted history (message-builder.ts) - Merge consecutive assistant messages instead of dropping (message-builder.ts) - Distinguish real summaries from hallucinated calls (message-normalizer.ts) - Harden fallback context instruction (claude-client.ts) Session lock recovery: - Clean up expired locks and reset stuck sessions on startup (db.ts) - Force-release lock endpoint for UI recovery (sessions/[id]/route.ts) - Validate working directory before starting session (chat/route.ts) --- src/app/api/chat/route.ts | 18 ++++++- src/app/api/chat/sessions/[id]/route.ts | 5 +- src/lib/agent-loop.ts | 51 +++++++++++++----- src/lib/claude-client.ts | 8 +-- src/lib/context-pruner.ts | 10 ++-- src/lib/db.ts | 32 +++++++++++ src/lib/message-builder.ts | 71 +++++++++++++++++++++++-- src/lib/message-normalizer.ts | 3 +- src/lib/tools/bash.ts | 38 +++++++++++-- 9 files changed, 202 insertions(+), 34 deletions(-) diff --git a/src/app/api/chat/route.ts b/src/app/api/chat/route.ts index dc819c5c..51eb06b2 100644 --- a/src/app/api/chat/route.ts +++ b/src/app/api/chat/route.ts @@ -1,6 +1,6 @@ import { NextRequest } from 'next/server'; import { streamClaude } from '@/lib/claude-client'; -import { addMessage, getMessages, getSession, getSessionSummary, updateSessionTitle, updateSdkSessionId, updateSessionModel, updateSessionProvider, updateSessionProviderId, getSetting, acquireSessionLock, renewSessionLock, releaseSessionLock, setSessionRuntimeStatus, syncSdkTasks } from '@/lib/db'; +import { addMessage, getMessages, getSession, getSessionSummary, updateSessionTitle, updateSdkSessionId, updateSessionModel, updateSessionProvider, updateSessionProviderId, getSetting, acquireSessionLock, renewSessionLock, releaseSessionLock, setSessionRuntimeStatus, syncSdkTasks, cleanupStaleLocks } from '@/lib/db'; import { resolveProvider as resolveProviderUnified } from '@/lib/provider-resolver'; import { notifySessionStart, notifySessionComplete, notifySessionError } from '@/lib/telegram-bot'; import { extractCompletion } from '@/lib/onboarding-completion'; @@ -12,6 +12,8 @@ import { ensureSchedulerRunning } from '@/lib/task-scheduler'; // Start the task scheduler on first API call ensureSchedulerRunning(); +// Clean up any session locks left behind by crashes +try { const cleaned = cleanupStaleLocks(); if (cleaned > 0) console.log(`[chat API] Cleaned up ${cleaned} stale session lock(s)`); } catch { /* best effort */ } import crypto from 'crypto'; import fs from 'fs'; import path from 'path'; @@ -351,13 +353,25 @@ export async function POST(request: NextRequest) { systemPromptLength: finalSystemPrompt?.length || 0, systemPromptFirst200: finalSystemPrompt?.slice(0, 200) || 'none', }); + // Validate working directory exists — stale sessions may reference deleted directories + const sessionCwd = session.sdk_cwd || session.working_directory || undefined; + if (sessionCwd && !fs.existsSync(sessionCwd)) { + console.warn(`[chat API] Working directory does not exist: ${sessionCwd}`); + releaseSessionLock(session_id, lockId); + setSessionRuntimeStatus(session_id, 'idle'); + return new Response( + JSON.stringify({ error: `Working directory no longer exists: ${sessionCwd}`, code: 'INVALID_CWD' }), + { status: 400, headers: { 'Content-Type': 'application/json' } }, + ); + } + const stream = streamClaude({ prompt: content, sessionId: session_id, sdkSessionId: session.sdk_session_id || undefined, model: resolved.upstreamModel || resolved.model || effectiveModel, systemPrompt: finalSystemPrompt, - workingDirectory: session.sdk_cwd || session.working_directory || undefined, + workingDirectory: sessionCwd, abortController, permissionMode, files: fileAttachments, diff --git a/src/app/api/chat/sessions/[id]/route.ts b/src/app/api/chat/sessions/[id]/route.ts index d9322255..7564d721 100644 --- a/src/app/api/chat/sessions/[id]/route.ts +++ b/src/app/api/chat/sessions/[id]/route.ts @@ -1,5 +1,5 @@ import { NextRequest } from 'next/server'; -import { deleteSession, getSession, updateSessionWorkingDirectory, updateSessionTitle, updateSessionMode, updateSessionModel, updateSessionProviderId, clearSessionMessages, updateSdkSessionId, updateSessionPermissionProfile } from '@/lib/db'; +import { deleteSession, getSession, updateSessionWorkingDirectory, updateSessionTitle, updateSessionMode, updateSessionModel, updateSessionProviderId, clearSessionMessages, updateSdkSessionId, updateSessionPermissionProfile, forceReleaseSessionLock } from '@/lib/db'; import { autoApprovePendingForSession } from '@/lib/bridge/permission-broker'; export async function GET( @@ -88,6 +88,9 @@ export async function PATCH( if (body.clear_messages) { clearSessionMessages(id); } + if (body.force_unlock) { + forceReleaseSessionLock(id); + } const updated = getSession(id); return Response.json({ session: updated }); diff --git a/src/lib/agent-loop.ts b/src/lib/agent-loop.ts index 24f689bc..e60bd3cc 100644 --- a/src/lib/agent-loop.ts +++ b/src/lib/agent-loop.ts @@ -15,7 +15,7 @@ import type { SSEEvent, TokenUsage } from '@/types'; import { createModel } from './ai-provider'; import { assembleTools, READ_ONLY_TOOLS } from './agent-tools'; import { reportNativeError } from './error-classifier'; -import { pruneOldToolResults } from './context-pruner'; +import { pruneOldToolResults, estimateTokens } from './context-pruner'; import { emit as emitEvent } from './runtime/event-bus'; import { createCheckpoint } from './file-checkpoint'; import type { PermissionMode } from './permission-checker'; @@ -168,9 +168,25 @@ export function runAgentLoop(options: AgentLoopOptions): ReadableStream sessionModel, }); - // 2. Load conversation history from DB - const { messages: dbMessages } = getMessages(sessionId, { limit: 200, excludeHeartbeatAck: true }); - const historyMessages = buildCoreMessages(dbMessages); + // 2. Load conversation history from DB with token-aware limiting. + // Start with 200 messages, then reduce if the context exceeds the model's window. + // This prevents context overflow that causes the model to degrade and output + // tool calls as text instead of using the tool API. + const contextWindowTokens = context1m ? 1_000_000 : 200_000; + const maxContextTokens = Math.floor(contextWindowTokens * 0.75); // Reserve 25% for response + let msgLimit = 200; + const { messages: dbMessages } = getMessages(sessionId, { limit: msgLimit, excludeHeartbeatAck: true }); + let historyMessages = buildCoreMessages(dbMessages); + { + let estimatedTokens = estimateTokens(historyMessages); + while (estimatedTokens > maxContextTokens && msgLimit > 20) { + msgLimit = Math.max(20, Math.floor(msgLimit * 0.6)); + console.log(`[agent-loop] Context too large (${estimatedTokens} tokens est.), reducing to ${msgLimit} messages`); + const { messages: reducedMsgs } = getMessages(sessionId, { limit: msgLimit, excludeHeartbeatAck: true }); + historyMessages = buildCoreMessages(reducedMsgs); + estimatedTokens = estimateTokens(historyMessages); + } + } // The chat route persists the user message to DB BEFORE calling us, // so for normal messages it's already the last entry in historyMessages. @@ -219,7 +235,8 @@ export function runAgentLoop(options: AgentLoopOptions): ReadableStream onRuntimeStatusChange?.('streaming'); let step = 0; const totalUsage: TokenUsage = { input_tokens: 0, output_tokens: 0 }; - let lastToolNames: string[] = []; // for doom loop detection + let lastToolKey = ''; // for doom loop detection + let doomLoopCount = 0; let messages = historyMessages; while (step < maxSteps) { @@ -419,15 +436,25 @@ export function runAgentLoop(options: AgentLoopOptions): ReadableStream break; } - // Doom loop detection: same tool(s) called 3 times in a row + // Doom loop detection: same tool(s) called N times in a row → break const toolKey = stepToolNames.sort().join(','); - const lastKey = lastToolNames.sort().join(','); - if (toolKey === lastKey) { - const repeatCount = (step > 1) ? DOOM_LOOP_THRESHOLD : 1; - // Simple heuristic: track repeats via a counter we'd need to add - // For now, just detect immediate repeats and break after threshold + if (toolKey === lastToolKey) { + doomLoopCount++; + if (doomLoopCount >= DOOM_LOOP_THRESHOLD) { + console.warn(`[agent-loop] Doom loop detected: "${toolKey}" called ${doomLoopCount + 1} times in a row — breaking`); + controller.enqueue(formatSSE({ + type: 'error', + data: JSON.stringify({ + category: 'DOOM_LOOP', + userMessage: `Agent stopped: tool "${toolKey}" was called ${doomLoopCount + 1} times in a row with no progress.`, + }), + })); + break; + } + } else { + doomLoopCount = 0; } - lastToolNames = stepToolNames; + lastToolKey = toolKey; // Update messages for next iteration. // streamText returns the full message list including our input + model response. diff --git a/src/lib/claude-client.ts b/src/lib/claude-client.ts index 3ae4d048..cb7573da 100644 --- a/src/lib/claude-client.ts +++ b/src/lib/claude-client.ts @@ -300,7 +300,7 @@ function buildFallbackContext(params: { } lines.push(''); - lines.push('(This is a summary of earlier conversation turns for context. Tool calls shown here were already executed — do not repeat them or output their markers as text.)'); + lines.push('(This is a summary of earlier conversation turns for context. Tool calls shown here were already executed — do not repeat them or output their markers as text. NEVER output "[Tool call: ...]" brackets as text. Use the actual tool API to execute tools.)'); for (const msg of selected) { lines.push(`${msg.role === 'user' ? 'Human' : 'Assistant'}: ${msg.content}`); } @@ -1181,9 +1181,9 @@ export function streamClaudeSdk(options: ClaudeStreamOptions): ReadableStream c.type === 'text') - .map((c: { text?: string }) => c.text) + ? (block.content as Array<{ type: string; text?: string }>) + .filter((c) => c.type === 'text') + .map((c) => c.text) .join('\n') : String(block.content ?? ''); diff --git a/src/lib/context-pruner.ts b/src/lib/context-pruner.ts index d3b82e02..b7717873 100644 --- a/src/lib/context-pruner.ts +++ b/src/lib/context-pruner.ts @@ -9,8 +9,7 @@ import type { ModelMessage } from 'ai'; -const RECENT_TURNS_TO_KEEP = 6; // Keep last N messages fully intact -const TRUNCATED_RESULT_MARKER = '[Tool result truncated — see earlier in conversation]'; +const RECENT_TURNS_TO_KEEP = 16; // Keep last N messages fully intact (~8 exchanges) /** * Prune old tool results from message history to reduce token usage. @@ -37,9 +36,14 @@ export function pruneOldToolResults(messages: ModelMessage[]): ModelMessage[] { ...msg, content: (msg.content as Array<{ type: string; [k: string]: unknown }>).map((part) => { if (part.type === 'tool-result') { + const toolName = ('toolName' in part && typeof part.toolName === 'string') ? part.toolName : 'unknown'; + const original = ('output' in part && part.output && typeof part.output === 'object' && 'value' in (part.output as Record)) + ? String((part.output as Record).value) : ''; + const excerpt = original.slice(0, 200); + const marker = `[Pruned ${toolName} result${excerpt ? ': ' + excerpt + (original.length > 200 ? '...' : '') : ''}]`; return { ...part, - output: { type: 'text' as const, value: TRUNCATED_RESULT_MARKER }, + output: { type: 'text' as const, value: marker }, }; } return part; diff --git a/src/lib/db.ts b/src/lib/db.ts index e9bebf40..ce1af918 100644 --- a/src/lib/db.ts +++ b/src/lib/db.ts @@ -1925,6 +1925,38 @@ export function releaseSessionLock(sessionId: string, lockId: string): boolean { return result.changes > 0; } +/** + * Clean up all expired or stale session locks. + * Call on startup to recover from crashes that left locks behind. + */ +export function cleanupStaleLocks(): number { + const db = getDb(); + const now = new Date().toISOString().replace('T', ' ').split('.')[0]; + const result = db.prepare('DELETE FROM session_runtime_locks WHERE expires_at < ?').run(now); + if (result.changes > 0) { + // Reset runtime status for sessions that were left in 'running' state + db.prepare( + "UPDATE chat_sessions SET runtime_status = 'idle', runtime_error = 'Recovered from stale lock' WHERE runtime_status = 'running'" + ).run(); + } + return result.changes; +} + +/** + * Force-release a session lock regardless of lock_id. + * Use when the UI needs to break a stuck session. + */ +export function forceReleaseSessionLock(sessionId: string): boolean { + const db = getDb(); + const result = db.prepare( + 'DELETE FROM session_runtime_locks WHERE session_id = ?' + ).run(sessionId); + if (result.changes > 0) { + setSessionRuntimeStatus(sessionId, 'idle', 'Force-released by user'); + } + return result.changes > 0; +} + /** * Update the runtime status of a session. */ diff --git a/src/lib/message-builder.ts b/src/lib/message-builder.ts index ac6cdb84..1f5ca20e 100644 --- a/src/lib/message-builder.ts +++ b/src/lib/message-builder.ts @@ -67,7 +67,8 @@ export function buildCoreMessages(dbMessages: Message[]): ModelMessage[] { /** * Ensure messages alternate between user and assistant/tool roles. - * Consecutive user messages are merged. Consecutive assistant messages keep the last. + * Consecutive user messages are merged. Consecutive assistant messages are merged + * (preserving all content parts) to avoid silently dropping tool call history. */ function enforceAlternation(messages: ModelMessage[]): ModelMessage[] { if (messages.length <= 1) return messages; @@ -79,11 +80,12 @@ function enforceAlternation(messages: ModelMessage[]): ModelMessage[] { const curr = messages[i]; if (curr.role === prev.role && curr.role === 'user') { - // Merge consecutive user messages, preserving multi-part content result[result.length - 1] = { role: 'user', content: mergeUserContent(prev.content, curr.content) }; } else if (curr.role === prev.role && curr.role === 'assistant') { - // Keep the later assistant message (more recent) - result[result.length - 1] = curr; + result[result.length - 1] = { + role: 'assistant', + content: mergeAssistantContent(prev.content, curr.content), + } as AssistantModelMessage; } else { result.push(curr); } @@ -92,6 +94,51 @@ function enforceAlternation(messages: ModelMessage[]): ModelMessage[] { return result; } +// ── Sanitization ──────────────────────────────────────────────── + +/** + * Detect and clean assistant text blocks that contain fake tool call syntax. + * When the model outputs "(used Bash: {...})" as text instead of making a real + * tool_use API call, those patterns pollute the conversation history and cause + * the model to imitate them on future turns (feedback loop). + * + * Also strips leaked thinking tags like "(antml:thinking>..." which indicate + * protocol confusion from context overload. + */ +function sanitizeAssistantText(text: string): string { + // Strip fake tool call patterns: (used ToolName: {json...}) and [Tool call: Name — ...] + let cleaned = text.replace(/\(used\s+\w+:\s*\{[^}]*\}[^)]*\)/g, ''); + cleaned = cleaned.replace(/\[Tool call:\s+\w+\s*—[^\]]*\]/g, ''); + // Strip leaked thinking tags: (antml:thinking>...) or + cleaned = cleaned.replace(/\(antml:thinking>[\s\S]*?(?:<\/antml:thinking>|\))/g, ''); + cleaned = cleaned.replace(/<\/?antml:thinking>/g, ''); + // Collapse multiple blank lines left by stripping + cleaned = cleaned.replace(/\n{3,}/g, '\n\n').trim(); + return cleaned; +} + +/** + * Check if a text block consists entirely (or almost entirely) of fake tool calls. + * If so, replace with a brief marker instead of sending garbage to the model. + * Only sanitize if >80% of the content appears to be fake tool calls. + */ +function cleanAssistantTextBlock(text: string): string { + const originalLength = text.trim().length; + if (originalLength < 50) return text; // Don't sanitize short texts + + const sanitized = sanitizeAssistantText(text); + const removedLength = originalLength - sanitized.length; + const removalRatio = removedLength / originalLength; + + // Only apply sanitization if >80% was fake tool calls + if (removalRatio > 0.8) { + return sanitized || '[Tool calls in this turn were not executed]'; + } + + // Otherwise return original text untouched + return text; +} + // ── Internal ──────────────────────────────────────────────────── /** @@ -111,6 +158,19 @@ function mergeUserContent(a: any, b: any): any { return merged; } +/** + * Merge two assistant message contents, preserving all content parts (text, tool-call, etc.). + */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function mergeAssistantContent(a: any, b: any): Exclude { + const toArray = (c: unknown): Exclude => { + if (typeof c === 'string') return c.trim() ? [{ type: 'text' as const, text: c }] : []; + if (Array.isArray(c)) return c as Exclude; + return []; + }; + return [...toArray(a), ...toArray(b)]; +} + /** * Parse user message content, rebuilding file attachments as multi-modal content parts. * File metadata is stored as `text`. @@ -229,7 +289,8 @@ function convertAssistantBlocks(blocks: MessageContentBlock[]): ModelMessage[] { flushToolResults(); } if (block.text.trim()) { - assistantParts.push({ type: 'text', text: block.text }); + const cleaned = cleanAssistantTextBlock(block.text); + assistantParts.push({ type: 'text', text: cleaned }); } break; diff --git a/src/lib/message-normalizer.ts b/src/lib/message-normalizer.ts index 00a113a3..215c9da1 100644 --- a/src/lib/message-normalizer.ts +++ b/src/lib/message-normalizer.ts @@ -43,11 +43,10 @@ export function normalizeMessageContent(role: string, raw: string): string { } else if (b.type === 'text' && b.text) { parts.push(b.text); } else if (b.type === 'tool_use') { - // Keep a brief summary of tool usage (name + truncated input) const name = b.name || 'unknown_tool'; const inputStr = typeof b.input === 'object' ? JSON.stringify(b.input) : String(b.input || ''); const truncated = inputStr.length > 80 ? inputStr.slice(0, 80) + '...' : inputStr; - parts.push(`(used ${name}: ${truncated})`); + parts.push(`[Tool call: ${name} — ${truncated}]`); } // tool_result blocks are skipped — the summary above captures intent } diff --git a/src/lib/tools/bash.ts b/src/lib/tools/bash.ts index 5c3f9b52..3044bc4a 100644 --- a/src/lib/tools/bash.ts +++ b/src/lib/tools/bash.ts @@ -5,6 +5,7 @@ import { tool } from 'ai'; import { z } from 'zod'; import { spawn } from 'child_process'; +import fs from 'fs'; import type { ToolContext } from './index'; const MAX_OUTPUT_BYTES = 1024 * 1024; // 1MB @@ -24,14 +25,21 @@ export function createBashTool(ctx: ToolContext) { }), execute: async ({ command, timeout }, { abortSignal }) => { const timeoutMs = timeout ?? DEFAULT_TIMEOUT_MS; + const cwd = ctx.workingDirectory; + + // Validate working directory exists before spawning + if (!fs.existsSync(cwd)) { + return `Error: working directory does not exist: ${cwd}`; + } return new Promise((resolve) => { const chunks: Buffer[] = []; let totalBytes = 0; let truncated = false; + let killTimer: ReturnType | null = null; const proc = spawn('bash', ['-c', command], { - cwd: ctx.workingDirectory, + cwd, env: { ...process.env, TERM: 'dumb' }, stdio: ['ignore', 'pipe', 'pipe'], timeout: timeoutMs, @@ -48,22 +56,41 @@ export function createBashTool(ctx: ToolContext) { } }; - proc.stdout?.on('data', collect); - proc.stderr?.on('data', collect); + // Stream tool output to the client as it arrives + const streamOutput = (data: Buffer) => { + if (ctx.emitSSE) { + try { + ctx.emitSSE({ type: 'tool_output', data: data.toString('utf-8') }); + } catch { /* stream closed */ } + } + }; + + proc.stdout?.on('data', (data: Buffer) => { + collect(data); + streamOutput(data); + }); + proc.stderr?.on('data', (data: Buffer) => { + collect(data); + streamOutput(data); + }); // Handle abort const onAbort = () => { proc.kill('SIGTERM'); - setTimeout(() => proc.kill('SIGKILL'), 3000); + killTimer = setTimeout(() => { + try { proc.kill('SIGKILL'); } catch { /* already dead */ } + }, 3000); }; abortSignal?.addEventListener('abort', onAbort, { once: true }); proc.on('close', (code, signal) => { abortSignal?.removeEventListener('abort', onAbort); + if (killTimer) clearTimeout(killTimer); let output = Buffer.concat(chunks).toString('utf-8'); if (truncated) { - output += '\n\n[Output truncated — exceeded 1MB limit]'; + const droppedBytes = totalBytes - MAX_OUTPUT_BYTES; + output += `\n\n[Output truncated at 1MB — ${droppedBytes} bytes dropped]`; } if (signal === 'SIGTERM' || signal === 'SIGKILL') { @@ -78,6 +105,7 @@ export function createBashTool(ctx: ToolContext) { }); proc.on('error', (err) => { + if (killTimer) clearTimeout(killTimer); resolve(`Error executing command: ${err.message}`); }); });