From ede595683a65962625806587293aa1cc54309d80 Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 11:41:28 +0200 Subject: [PATCH 1/7] feat(agents): drive coder via Claude/Codex SDKs instead of CLI binaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The coder entity used to spawn the `claude` / `codex` binaries and mirror events by tailing JSONL files on disk. Replace that with two SDK-driven runners that iterate the official SDK async generators and forward each event to the entity as an `agent-session-protocol` `NormalizedEvent`, so the entity no longer depends on global CLI installs and stops touching the filesystem to discover sessions. - Claude runner adapts each `SDKMessage` to `ClaudeEntry` and routes it through `normalizeClaudeEvent`. - Codex runner translates each `ThreadItem` directly into `NormalizedEvent`s as it starts and completes — Codex's SDK envelope is higher-level than the rollout JSONL, so the JSONL normaliser doesn't apply. - `CodingSessionCliRunner.run()` gains `onEvent` and `onSessionId` callbacks; the orchestrator loses its file-watcher / pre+post diff plumbing and just forwards events live. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/agents/package.json | 4 +- packages/agents/src/agents/coding-session.ts | 458 +++--------------- .../agents/src/agents/runners/claude-sdk.ts | 179 +++++++ .../agents/src/agents/runners/codex-sdk.ts | 329 +++++++++++++ packages/agents/test/coding-session.test.ts | 86 ++-- .../test/find-new-session-after-run.test.ts | 165 ------- pnpm-lock.yaml | 222 ++++++++- 7 files changed, 829 insertions(+), 614 deletions(-) create mode 100644 packages/agents/src/agents/runners/claude-sdk.ts create mode 100644 packages/agents/src/agents/runners/codex-sdk.ts delete mode 100644 packages/agents/test/find-new-session-after-run.test.ts diff --git a/packages/agents/package.json b/packages/agents/package.json index 5c7bf66967..f436c02b57 100644 --- a/packages/agents/package.json +++ b/packages/agents/package.json @@ -40,13 +40,15 @@ "./package.json": "./package.json" }, "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.2.123", "@anthropic-ai/sdk": "^0.78.0", "@durable-streams/state": "npm:@electric-ax/durable-streams-state-beta@^0.3.1", "@electric-ax/agents-runtime": "workspace:*", "@mariozechner/pi-agent-core": "^0.70.2", "@mariozechner/pi-ai": "^0.70.2", + "@openai/codex-sdk": "^0.125.0", "@sinclair/typebox": "^0.34.48", - "agent-session-protocol": "^0.0.2", + "agent-session-protocol": "^0.0.8", "better-sqlite3": "^11.10.0", "nanoid": "^3.3.11", "pino": "^10.3.1", diff --git a/packages/agents/src/agents/coding-session.ts b/packages/agents/src/agents/coding-session.ts index 447e09e5fd..9249c0b9b5 100644 --- a/packages/agents/src/agents/coding-session.ts +++ b/packages/agents/src/agents/coding-session.ts @@ -1,22 +1,10 @@ -import { spawn } from 'node:child_process' -import { watch, promises as fsp } from 'node:fs' -import { homedir } from 'node:os' -import path from 'node:path' import { z } from 'zod' import { - deserializeCursor, - discoverSessions, importLocalSession, loadSession, - resolveSession, serializeCursor, - 
tailSession, -} from 'agent-session-protocol' -import type { - NormalizedEvent, - SerializedSessionCursor, - SessionCursor, } from 'agent-session-protocol' +import type { NormalizedEvent } from 'agent-session-protocol' import { CODING_SESSION_CURSOR_COLLECTION_TYPE, CODING_SESSION_EVENT_COLLECTION_TYPE, @@ -30,13 +18,22 @@ import type { WakeEvent, } from '@electric-ax/agents-runtime' +import { claudeSdkRunner } from './runners/claude-sdk.js' +import { codexSdkRunner } from './runners/codex-sdk.js' + /** - * Abstraction over the claude/codex CLI. Default implementation spawns - * the real binary; tests can inject a fake. + * Abstraction over a coding-agent runner. The default implementations + * drive `@anthropic-ai/claude-agent-sdk` and `@openai/codex-sdk` + * directly; tests can inject a fake. + * + * Runners stream `NormalizedEvent`s via `onEvent` as the agent makes + * progress, and call `onSessionId` once with the new (or resumed) + * session id so the orchestrator can persist it on the entity. * * `sessionId` is undefined for the first prompt on a fresh session — - * the runner should then let the CLI generate its own id. For every - * subsequent prompt, pass the id so the CLI resumes that conversation. + * the runner should then let the SDK generate its own id and emit it + * via `onSessionId`. For every subsequent prompt, pass the id so the + * SDK resumes that conversation. */ export interface CodingSessionCliRunner { run(opts: { @@ -44,165 +41,18 @@ export interface CodingSessionCliRunner { sessionId?: string cwd: string prompt: string + onEvent?: (ev: NormalizedEvent) => void + onSessionId?: (id: string) => void }): Promise<{ exitCode: number; stdout: string; stderr: string }> } const defaultCliRunner: CodingSessionCliRunner = { async run(opts) { - return new Promise((resolve, reject) => { - // Claude Code: prompt goes in on stdin (not argv). 
Needs - // --dangerously-skip-permissions because the session runs - // autonomously — any tool call would otherwise block on an - // interactive approval prompt and exit 1. - // Codex: prompt is an argv; stdin is ignored. Needs - // --skip-git-repo-check because `codex exec` refuses to run in - // a directory that isn't a trusted-dir and isn't a git repo, - // and we can't assume callers have configured trust for the - // cwd they pointed the entity at. - const isClaude = opts.agent === `claude` - const bin = isClaude ? `claude` : `codex` - const args = isClaude - ? opts.sessionId - ? [`-r`, opts.sessionId, `--dangerously-skip-permissions`, `-p`] - : [`--dangerously-skip-permissions`, `-p`] - : opts.sessionId - ? [ - `exec`, - `--skip-git-repo-check`, - `resume`, - opts.sessionId, - opts.prompt, - ] - : [`exec`, `--skip-git-repo-check`, opts.prompt] - const child = spawn(bin, args, { - cwd: opts.cwd, - stdio: [isClaude ? `pipe` : `ignore`, `pipe`, `pipe`], - }) - // Cap how much output we hold on the heap. Only the first ~800 - // chars of each stream show up in error messages, but a verbose - // CLI session can produce megabytes — keep just enough for a - // meaningful diagnostic and discard the rest. - const MAX_BUF_CHARS = 4096 - let stdout = `` - let stderr = `` - child.stdout?.on(`data`, (d: Buffer) => { - if (stdout.length < MAX_BUF_CHARS) { - stdout += d.toString().slice(0, MAX_BUF_CHARS - stdout.length) - } - }) - child.stderr?.on(`data`, (d: Buffer) => { - if (stderr.length < MAX_BUF_CHARS) { - stderr += d.toString().slice(0, MAX_BUF_CHARS - stderr.length) - } - }) - child.on(`error`, reject) - child.on(`exit`, (code) => { - resolve({ exitCode: code ?? -1, stdout, stderr }) - }) - if (isClaude && child.stdin) { - child.stdin.write(opts.prompt) - child.stdin.end() - } - }) + const runner = opts.agent === `claude` ? 
claudeSdkRunner : codexSdkRunner + return runner.run(opts) }, } -export async function discoverNewestSession( - agent: CodingAgentType, - cwd: string, - excludeIds: ReadonlySet -): Promise { - const all = await discoverSessions(agent) - const candidates = all.filter( - (s) => !excludeIds.has(s.sessionId) && (!s.cwd || s.cwd === cwd) - ) - if (candidates.length === 0) return null - // discoverSessions returns most-recent-first for each agent, so - // the first match is what the CLI just wrote. - return candidates[0]!.sessionId -} - -/** - * Compute the candidate directories where Claude Code stores per-cwd - * session JSONL files. Claude resolves the cwd to its realpath when - * choosing the directory name (so /tmp/foo on macOS lands under - * `-private-tmp-foo`), but the entity may have been spawned with the - * non-realpath form. Return both candidates so the caller can union - * their contents. - */ -export async function getClaudeProjectDirs( - cwd: string -): Promise> { - const home = homedir() - const make = (c: string): string => - path.join(home, `.claude`, `projects`, c.replace(/\//g, `-`)) - const dirs = [make(cwd)] - try { - const real = await fsp.realpath(cwd) - if (real !== cwd) dirs.push(make(real)) - } catch { - // cwd may not exist on disk yet — skip realpath - } - return dirs -} - -export async function listClaudeJsonlIdsByCwd( - cwd: string -): Promise> { - const ids = new Set() - for (const dir of await getClaudeProjectDirs(cwd)) { - try { - const files = await fsp.readdir(dir) - for (const f of files) { - if (f.endsWith(`.jsonl`)) ids.add(f.slice(0, -`.jsonl`.length)) - } - } catch { - // dir may not exist (no prior runs in this cwd) - } - } - return ids -} - -/** - * Deterministic-path discovery for a freshly created session. 
After the - * Claude CLI runs in `-p` mode it writes the new JSONL straight into - * `~/.claude/projects//.jsonl` *without* leaving a - * `~/.claude/sessions/.json` lock file (those are interactive-only), - * so `discoverSessions` can miss it. Compute the expected dir directly - * and diff its contents against a pre-run snapshot. Returns the newest - * fresh sessionId or null. Codex falls back to discoverNewestSession. - */ -export async function findNewSessionAfterRun( - agent: CodingAgentType, - cwd: string, - preDirectIds: ReadonlySet, - preDiscoveredIds: ReadonlySet -): Promise { - if (agent === `claude`) { - const dirs = await getClaudeProjectDirs(cwd) - let best: { id: string; mtime: number } | null = null - for (const dir of dirs) { - try { - const files = await fsp.readdir(dir) - for (const f of files) { - if (!f.endsWith(`.jsonl`)) continue - const id = f.slice(0, -`.jsonl`.length) - if (preDirectIds.has(id)) continue - const st = await fsp.stat(path.join(dir, f)).catch(() => null) - if (!st) continue - if (!best || st.mtimeMs > best.mtime) { - best = { id, mtime: st.mtimeMs } - } - } - } catch { - // dir may not exist - } - } - if (best) return best.id - } - return discoverNewestSession(agent, cwd, preDiscoveredIds) -} - const sessionMetaRowSchema = z.object({ key: z.literal(`current`), electricSessionId: z.string(), @@ -216,7 +66,13 @@ const sessionMetaRowSchema = z.object({ const cursorStateRowSchema = z.object({ key: z.literal(`current`), - /** JSON-serialized SerializedSessionCursor, or empty string if none yet. */ + /** + * JSON-serialized SerializedSessionCursor or empty string. Used as a + * "have I seeded the events collection from the JSONL yet?" marker for + * imported / attached sessions — once non-empty, we don't reseed. + * The SDK runners stream events live, so this is no longer used for + * tail/cursor state past first wake. 
+ */ cursor: z.string(), lastProcessedInboxKey: z.string().optional(), }) @@ -261,9 +117,9 @@ interface InboxRow { } export interface RegisterCodingSessionOptions { - /** Working directory the CLI runs in when `args.cwd` is not provided. Defaults to `process.cwd()`. */ + /** Working directory the runner uses when `args.cwd` is not provided. Defaults to `process.cwd()`. */ defaultWorkingDirectory?: string - /** Override the CLI runner (for tests or alternate backends). */ + /** Override the runner (for tests or alternate backends). */ cliRunner?: CodingSessionCliRunner } @@ -317,120 +173,6 @@ function appendIfNew(ctx: LiveMirrorCtx, event: NormalizedEvent): void { ctx.actions.events_insert({ row }) } -/** - * Mirror every event that lands in the JSONL file while `runWork` is - * executing (i.e. while the CLI is running). Returns the advanced cursor - * and the `runWork` result once everything has settled and every append - * has been persisted to the entity's durable stream. - * - * If setup fails (e.g. the session file can't be resolved), `runWork` - * still runs — but nothing is mirrored and `setupError` is populated so - * the caller can surface the condition. If `runWork` throws, the error - * propagates after the watcher has been cleaned up. - */ -async function runWithLiveMirror(opts: { - agent: CodingAgentType - nativeSessionId: string - serializedCursor: SerializedSessionCursor | null - ctx: LiveMirrorCtx - runWork: () => Promise -}): Promise<{ - cursor: SerializedSessionCursor | null - setupError?: unknown - result: T -}> { - let cursor: SessionCursor | null = null - let setupError: unknown = undefined - - try { - const session = await resolveSession(opts.nativeSessionId, opts.agent) - if (opts.serializedCursor) { - cursor = deserializeCursor({ - ...opts.serializedCursor, - path: session.path, - }) - } else { - // First real tail — absorb whatever's already on disk (e.g. 
the - // pre-existing user turn for an imported session, or nothing for - // a freshly-created empty file). - const initial = await loadSession({ - sessionId: opts.nativeSessionId, - agent: opts.agent, - }) - for (const ev of initial.events) appendIfNew(opts.ctx, ev) - cursor = initial.cursor - } - } catch (e) { - setupError = e - } - - if (!cursor) { - // Setup failed — just run and surface the error to the caller. - const result = await opts.runWork() - return { cursor: opts.serializedCursor, setupError, result } - } - - let activeCursor: SessionCursor = cursor - let busy = false - let pending = false - let stopped = false - - const drainOnce = async (): Promise => { - if (stopped && busy) return - if (busy) { - pending = true - return - } - busy = true - try { - const res = await tailSession({ cursor: activeCursor }) - activeCursor = res.cursor - for (const ev of res.newEvents) appendIfNew(opts.ctx, ev) - } catch { - // Transient read errors (truncation, rename during rotation) — - // the final tail after runWork settles will catch up. - } finally { - busy = false - if (pending && !stopped) { - pending = false - void drainOnce() - } - } - } - - const fileWatcher = watch(activeCursor.path, () => { - void drainOnce() - }) - const pollHandle = setInterval(() => { - void drainOnce() - }, 1500) - - let result: T - try { - result = await opts.runWork() - } finally { - stopped = true - clearInterval(pollHandle) - fileWatcher.close() - // Wait for any in-flight drain to settle before doing the final tail. - while (busy) { - await new Promise((r) => setTimeout(r, 10)) - } - // Final tail — catches anything written between the last watcher - // tick and the watcher shutdown. - try { - const final = await tailSession({ cursor: activeCursor }) - activeCursor = final.cursor - for (const ev of final.newEvents) appendIfNew(opts.ctx, ev) - } catch { - // Swallow; the caller's own post-run tail/persistence will - // surface the condition if it matters. 
- } - } - - return { cursor: serializeCursor(activeCursor), setupError, result } -} - export function registerCodingSession( registry: EntityRegistry, options: RegisterCodingSessionOptions = {} @@ -439,7 +181,7 @@ export function registerCodingSession( const defaultCwd = options.defaultWorkingDirectory ?? process.cwd() registry.define(`coder`, { - description: `Runs a Claude Code / Codex CLI session and mirrors its normalized event stream into a durable store. Prompts arrive via message_received (type: "prompt") and are executed serially.`, + description: `Runs a Claude Code / Codex SDK session and mirrors its normalized event stream into a durable store. Prompts arrive via message_received (type: "prompt") and are executed serially.`, creationSchema: creationArgsSchema, inboxSchemas: { prompt: promptMessageSchema, @@ -623,13 +365,13 @@ export function registerCodingSession( }, }) - // Record the CLI invocation as a `runs` collection event so - // observers waking on `runFinished` are notified when the turn - // ends. Without this the parent (e.g. Horton via spawn_coder) - // would never be woken because the coder bypasses useAgent. + // Record the run as a `runs` collection event so observers + // waking on `runFinished` are notified when the turn ends. + // Without this the parent (e.g. Horton via spawn_coder) would + // never be woken because the coder bypasses useAgent. const recordedRun = ctx.recordRun() // Snapshot the existing event keys so we can identify which - // events are appended during this CLI run and surface their + // events are appended during this run and surface their // assistant text as the run's response payload. const eventKeysBefore = new Set( ( @@ -649,123 +391,49 @@ export function registerCodingSession( }, } - let nextCursorJson = runningCursor.cursor - - if (!runningMeta.nativeSessionId) { - // First real prompt on a fresh session. 
Let the CLI create - // its own jsonl (writing an empty one ourselves breaks - // `claude -r ` — claude can't resume an empty file). - // After it exits, diff the on-disk sessions to find the - // new id, then load and mirror in one shot. Snapshot both - // the deterministic per-cwd directory (works for Claude - // `-p` runs that don't drop a metadata lock file) and - // discoverSessions (covers Codex + interactive Claude - // sessions) before the run so either path can spot the - // freshly written session. - const preDirectIds = - runningMeta.agent === `claude` - ? await listClaudeJsonlIdsByCwd(runningMeta.cwd) - : new Set() - const preDiscoveredIds = new Set( - (await discoverSessions(runningMeta.agent)).map( - (s) => s.sessionId - ) - ) - const cliResult = await runner.run({ - agent: runningMeta.agent, - cwd: runningMeta.cwd, - prompt, - }) - if (cliResult.exitCode !== 0) { - throw new Error( - `[coding-session] ${runningMeta.agent} CLI exited ${cliResult.exitCode}. stderr=${cliResult.stderr.slice(0, 800) || ``} stdout=${cliResult.stdout.slice(0, 800) || ``}` - ) - } - const foundId = await findNewSessionAfterRun( - runningMeta.agent, - runningMeta.cwd, - preDirectIds, - preDiscoveredIds - ) - if (!foundId) { - throw new Error( - `[coding-session] ${runningMeta.agent} CLI succeeded but no new session file was found` - ) - } - ctx.db.actions.sessionMeta_update({ - key: `current`, - updater: (d: SessionMetaRow) => { - d.nativeSessionId = foundId - }, - }) - runningMeta = { ...runningMeta, nativeSessionId: foundId } - - // Post-run full load. No live streaming on the first prompt - // since the file didn't exist when we started. 
- const initial = await loadSession({ - sessionId: foundId, - agent: runningMeta.agent, - }) - for (const ev of initial.events) appendIfNew(mirrorCtx, ev) - nextCursorJson = JSON.stringify(serializeCursor(initial.cursor)) - } else { - // Existing session: stream events into the DS while the CLI - // runs, so the UI sees the prompt turn, assistant tokens, - // and tool calls as they land. - const serializedCursor = runningCursor.cursor - ? (JSON.parse(runningCursor.cursor) as SerializedSessionCursor) - : null - - const { - cursor: nextSerialized, - setupError, - result: cliResult, - } = await runWithLiveMirror({ - agent: runningMeta.agent, - nativeSessionId: runningMeta.nativeSessionId, - serializedCursor, - ctx: mirrorCtx, - runWork: () => - runner.run({ - agent: runningMeta.agent, - sessionId: runningMeta.nativeSessionId, - cwd: runningMeta.cwd, - prompt, - }), - }) - - if (setupError) { - throw setupError instanceof Error - ? setupError - : new Error(String(setupError)) - } - if (cliResult.exitCode !== 0) { - throw new Error( - `[coding-session] ${runningMeta.agent} CLI exited ${cliResult.exitCode}. stderr=${cliResult.stderr.slice(0, 800) || ``} stdout=${cliResult.stdout.slice(0, 800) || ``}` - ) - } + const cliResult = await runner.run({ + agent: runningMeta.agent, + ...(runningMeta.nativeSessionId + ? { sessionId: runningMeta.nativeSessionId } + : {}), + cwd: runningMeta.cwd, + prompt, + onEvent: (ev) => appendIfNew(mirrorCtx, ev), + onSessionId: (id) => { + if (runningMeta.nativeSessionId === id) return + ctx.db.actions.sessionMeta_update({ + key: `current`, + updater: (d: SessionMetaRow) => { + d.nativeSessionId = id + }, + }) + runningMeta = { ...runningMeta, nativeSessionId: id } + }, + }) - const persistedCursor = nextSerialized ?? serializedCursor - nextCursorJson = persistedCursor - ? JSON.stringify(persistedCursor) - : `` + if (cliResult.exitCode !== 0) { + throw new Error( + `[coding-session] ${runningMeta.agent} runner exited ${cliResult.exitCode}. 
stderr=${cliResult.stderr.slice(0, 800) || ``} stdout=${cliResult.stdout.slice(0, 800) || ``}` + ) } ctx.db.actions.cursorState_update({ key: `current`, updater: (d: CursorStateRow) => { - d.cursor = nextCursorJson + // Cursor is now just a "have we seeded?" marker — set to + // any non-empty string after the first successful run. + if (!d.cursor) d.cursor = `sdk-stream` d.lastProcessedInboxKey = inboxMsg.key }, }) runningCursor = { ...runningCursor, - cursor: nextCursorJson, + cursor: runningCursor.cursor || `sdk-stream`, lastProcessedInboxKey: inboxMsg.key, } - // Pipe assistant_message text from this run into text_delta - // events linked to recordedRun so the runFinished wake's - // `includeResponse` payload carries the coder's reply. + // Pipe assistant_message text from this run into recordedRun + // so the runFinished wake's `includeResponse` payload carries + // the coder's reply. for (const row of ctx.db.collections.events .toArray as unknown as Array<{ key: string diff --git a/packages/agents/src/agents/runners/claude-sdk.ts b/packages/agents/src/agents/runners/claude-sdk.ts new file mode 100644 index 0000000000..7cbfcb4f78 --- /dev/null +++ b/packages/agents/src/agents/runners/claude-sdk.ts @@ -0,0 +1,179 @@ +import { query } from '@anthropic-ai/claude-agent-sdk' +import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk' +import { normalizeClaudeEvent } from 'agent-session-protocol' +import type { ClaudeEntry } from 'agent-session-protocol' + +import type { CodingSessionCliRunner } from '../coding-session.js' + +/** + * SDK-backed runner for Claude. Drives `query()` from + * `@anthropic-ai/claude-agent-sdk`, iterates the resulting async + * generator, adapts each `SDKMessage` to the `ClaudeEntry` shape + * `normalizeClaudeEvent` expects, and forwards each emitted normalized + * event via the `onEvent` callback. 
+ * + * The Claude SDK ships its own subprocess binary as an optional + * platform-specific dep, so this no longer requires a globally + * installed `claude` CLI on PATH. + */ +export const claudeSdkRunner: CodingSessionCliRunner = { + async run(opts) { + const q = query({ + prompt: opts.prompt, + options: { + cwd: opts.cwd, + ...(opts.sessionId ? { resume: opts.sessionId } : {}), + permissionMode: `bypassPermissions`, + allowDangerouslySkipPermissions: true, + }, + }) + + let capturedSessionId: string | null = opts.sessionId ?? null + let resultMessage: { + is_error: boolean + result?: string + error?: string + } | null = null + + try { + for await (const msg of q) { + const sid = (msg as { session_id?: string }).session_id + if (sid && sid !== capturedSessionId) { + capturedSessionId = sid + opts.onSessionId?.(sid) + } + + if (msg.type === `result`) { + resultMessage = { + is_error: msg.is_error, + result: `result` in msg ? msg.result : undefined, + error: + `subtype` in msg && msg.subtype !== `success` + ? msg.subtype + : undefined, + } + } + + const entry = sdkMessageToClaudeEntry(msg) + if (!entry) continue + for (const ev of normalizeClaudeEvent(entry)) opts.onEvent?.(ev) + } + } catch (e) { + const message = e instanceof Error ? e.message : String(e) + return { exitCode: -1, stdout: ``, stderr: message } + } + + if (resultMessage?.is_error) { + return { + exitCode: 1, + stdout: resultMessage.result ?? ``, + stderr: resultMessage.error ?? `claude SDK reported is_error`, + } + } + return { exitCode: 0, stdout: resultMessage?.result ?? ``, stderr: `` } + }, +} + +/** + * Adapt one `SDKMessage` to the `ClaudeEntry` shape the JSONL + * normaliser expects. The SDK and the JSONL share *most* fields but + * differ in casing on a few keys (`session_id` vs `sessionId`, + * `claude_code_version` vs `version`, `duration_ms` vs `durationMs`). + * Everything else is structurally compatible. 
+ * + * Returns null for SDK-only message types (status pings, retries, hook + * lifecycle, etc.) that have no JSONL counterpart. + */ +function sdkMessageToClaudeEntry(msg: SDKMessage): ClaudeEntry | null { + const ts = + (msg as { timestamp?: string }).timestamp ?? new Date().toISOString() + const sessionId = (msg as { session_id?: string }).session_id + + if (msg.type === `system`) { + if (`subtype` in msg && msg.subtype === `init`) { + return { + type: `system`, + subtype: `init`, + timestamp: ts, + sessionId, + cwd: msg.cwd, + version: msg.claude_code_version, + message: { model: msg.model }, + } + } + if (`subtype` in msg && msg.subtype === `compact_boundary`) { + return { + type: `system`, + subtype: `compact_boundary`, + timestamp: ts, + sessionId, + } + } + return null + } + + if (msg.type === `user`) { + const inner = msg.message as + | { role?: string; content?: unknown } + | undefined + return { + type: `user`, + timestamp: ts, + sessionId, + message: { + role: `user`, + content: inner?.content, + }, + } + } + + if (msg.type === `assistant`) { + const inner = msg.message as { + role?: string + model?: string + content?: unknown + stop_reason?: string + usage?: { + input_tokens?: number + output_tokens?: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number + } + } + return { + type: `assistant`, + timestamp: ts, + sessionId, + message: { + role: `assistant`, + model: inner.model, + content: inner.content, + stop_reason: inner.stop_reason, + usage: inner.usage, + }, + } + } + + if (msg.type === `result`) { + return { + type: `result`, + timestamp: ts, + sessionId, + subtype: msg.subtype, + durationMs: msg.duration_ms, + message: msg.usage + ? 
{ + usage: { + input_tokens: msg.usage.input_tokens, + output_tokens: msg.usage.output_tokens, + cache_read_input_tokens: msg.usage.cache_read_input_tokens, + cache_creation_input_tokens: + msg.usage.cache_creation_input_tokens, + }, + } + : undefined, + } + } + + return null +} diff --git a/packages/agents/src/agents/runners/codex-sdk.ts b/packages/agents/src/agents/runners/codex-sdk.ts new file mode 100644 index 0000000000..f8f6928e12 --- /dev/null +++ b/packages/agents/src/agents/runners/codex-sdk.ts @@ -0,0 +1,329 @@ +import { Codex } from '@openai/codex-sdk' +import type { + AgentMessageItem, + CommandExecutionItem, + ErrorItem, + FileChangeItem, + McpToolCallItem, + ReasoningItem, + ThreadItem, + WebSearchItem, +} from '@openai/codex-sdk' +import { normalizeToolName } from 'agent-session-protocol' +import type { NormalizedEvent } from 'agent-session-protocol' + +import type { CodingSessionCliRunner } from '../coding-session.js' + +/** + * SDK-backed runner for Codex. Codex's SDK exposes ThreadEvents that + * wrap higher-level UI items (CommandExecutionItem, FileChangeItem, + * etc.) — these are NOT the same shape as the lower-level + * `response_item` payloads that land in the rollout JSONL, so we can't + * route them through `normalizeCodexEvent`. Instead this runner + * synthesises `NormalizedEvent`s directly from each completed + * ThreadItem. + * + * Each tool-style item is emitted as a tool_call when it starts and a + * matching tool_result when it completes, so the UI shows the same + * lifecycle it would for a CLI-driven session. + */ +export const codexSdkRunner: CodingSessionCliRunner = { + async run(opts) { + const codex = new Codex() + // Mirror what the CLI runner did: write access in the cwd and no + // interactive approval prompts. Without these the SDK defaults to + // `read-only` + `on-request` and the agent fails the moment it + // tries to edit a file. 
+ const threadOptions = { + workingDirectory: opts.cwd, + skipGitRepoCheck: true, + sandboxMode: `workspace-write` as const, + approvalPolicy: `never` as const, + } + const thread = opts.sessionId + ? codex.resumeThread(opts.sessionId, threadOptions) + : codex.startThread(threadOptions) + + const startedItems = new Set() + let turnFailed: { message: string } | null = null + let assistantText = `` + let capturedSessionId: string | null = opts.sessionId ?? null + + try { + const { events } = await thread.runStreamed(opts.prompt) + for await (const ev of events) { + if (!capturedSessionId && thread.id) { + capturedSessionId = thread.id + opts.onSessionId?.(thread.id) + } + + switch (ev.type) { + case `thread.started`: { + if (!capturedSessionId) { + capturedSessionId = ev.thread_id + opts.onSessionId?.(ev.thread_id) + } + opts.onEvent?.({ + v: 1, + ts: Date.now(), + type: `session_init`, + sessionId: ev.thread_id, + cwd: opts.cwd, + agent: `codex`, + }) + break + } + case `item.started`: { + const startEvents = threadItemStartedToEvents(ev.item) + for (const e of startEvents) opts.onEvent?.(e) + startedItems.add(ev.item.id) + break + } + case `item.completed`: { + const completeEvents = threadItemCompletedToEvents(ev.item) + for (const e of completeEvents) opts.onEvent?.(e) + if (ev.item.type === `agent_message`) { + assistantText += (assistantText ? 
`\n` : ``) + ev.item.text + } + break + } + case `turn.completed`: { + opts.onEvent?.({ + v: 1, + ts: Date.now(), + type: `turn_complete`, + success: true, + usage: { + inputTokens: ev.usage.input_tokens, + outputTokens: ev.usage.output_tokens, + cachedInputTokens: ev.usage.cached_input_tokens, + reasoningOutputTokens: ev.usage.reasoning_output_tokens, + }, + }) + break + } + case `turn.failed`: { + turnFailed = { message: ev.error.message } + opts.onEvent?.({ + v: 1, + ts: Date.now(), + type: `turn_aborted`, + reason: ev.error.message, + }) + break + } + case `error`: { + turnFailed = { message: ev.message } + opts.onEvent?.({ + v: 1, + ts: Date.now(), + type: `error`, + message: ev.message, + }) + break + } + case `item.updated`: + case `turn.started`: + break + } + } + } catch (e) { + const message = e instanceof Error ? e.message : String(e) + return { exitCode: -1, stdout: ``, stderr: message } + } + + if (turnFailed) { + return { exitCode: 1, stdout: assistantText, stderr: turnFailed.message } + } + return { exitCode: 0, stdout: assistantText, stderr: `` } + }, +} + +function threadItemStartedToEvents(item: ThreadItem): Array { + const ts = Date.now() + switch (item.type) { + case `command_execution`: + return [commandExecutionToToolCall(item, ts)] + case `mcp_tool_call`: + return [mcpToolCallToToolCall(item, ts)] + case `web_search`: + return [webSearchToToolCall(item, ts)] + case `file_change`: + case `agent_message`: + case `reasoning`: + case `todo_list`: + case `error`: + return [] + } +} + +function threadItemCompletedToEvents(item: ThreadItem): Array { + const ts = Date.now() + switch (item.type) { + case `agent_message`: + return [agentMessageToEvent(item, ts)] + case `reasoning`: + return [reasoningToEvent(item, ts)] + case `command_execution`: + return [commandExecutionToToolResult(item, ts)] + case `file_change`: + return fileChangeToEvents(item, ts) + case `mcp_tool_call`: + return [mcpToolCallToToolResult(item, ts)] + case `web_search`: + return 
[] + case `todo_list`: + return [] + case `error`: + return [errorItemToEvent(item, ts)] + } +} + +function agentMessageToEvent( + item: AgentMessageItem, + ts: number +): NormalizedEvent { + return { + v: 1, + ts, + type: `assistant_message`, + text: item.text, + phase: `final`, + } +} + +function reasoningToEvent(item: ReasoningItem, ts: number): NormalizedEvent { + return { + v: 1, + ts, + type: `thinking`, + summary: item.text.slice(0, 200) || `(thinking)`, + text: item.text || null, + } +} + +function commandExecutionToToolCall( + item: CommandExecutionItem, + ts: number +): NormalizedEvent { + const mapping = normalizeToolName(`exec_command`, `codex`, { + command: item.command, + }) + return { + v: 1, + ts, + type: `tool_call`, + callId: item.id, + tool: mapping.normalized, + originalTool: mapping.originalTool, + originalAgent: `codex`, + input: { command: item.command }, + } +} + +function commandExecutionToToolResult( + item: CommandExecutionItem, + ts: number +): NormalizedEvent { + const isError = item.status === `failed` || (item.exit_code ?? 0) !== 0 + return { + v: 1, + ts, + type: `tool_result`, + callId: item.id, + output: item.aggregated_output, + isError, + ...(item.exit_code !== undefined ? { exitCode: item.exit_code } : {}), + } +} + +function fileChangeToEvents( + item: FileChangeItem, + ts: number +): Array { + const isError = item.status === `failed` + // Synthesise a tool_call + tool_result pair for the patch as a whole. + // Codex doesn't expose per-file ids, so we use the FileChangeItem's id + // for both events. + const summary = item.changes.map((c) => `${c.kind} ${c.path}`).join(`\n`) + const allAdds = item.changes.every((c) => c.kind === `add`) + const tool = allAdds ? 
`file_write` : `file_edit` + return [ + { + v: 1, + ts, + type: `tool_call`, + callId: item.id, + tool, + originalTool: `apply_patch`, + originalAgent: `codex`, + input: { changes: item.changes }, + }, + { + v: 1, + ts, + type: `tool_result`, + callId: item.id, + output: summary, + isError, + }, + ] +} + +function mcpToolCallToToolCall( + item: McpToolCallItem, + ts: number +): NormalizedEvent { + return { + v: 1, + ts, + type: `tool_call`, + callId: item.id, + tool: item.tool, + originalTool: item.tool, + originalAgent: `codex`, + input: (item.arguments as Record) ?? {}, + } +} + +function mcpToolCallToToolResult( + item: McpToolCallItem, + ts: number +): NormalizedEvent { + const isError = item.status === `failed` + const output = item.error + ? item.error.message + : item.result + ? JSON.stringify(item.result.structured_content ?? item.result.content) + : `` + return { + v: 1, + ts, + type: `tool_result`, + callId: item.id, + output, + isError, + } +} + +function webSearchToToolCall(item: WebSearchItem, ts: number): NormalizedEvent { + return { + v: 1, + ts, + type: `tool_call`, + callId: item.id, + tool: `web_search`, + originalTool: `web_search`, + originalAgent: `codex`, + input: { query: item.query }, + } +} + +function errorItemToEvent(item: ErrorItem, ts: number): NormalizedEvent { + return { + v: 1, + ts, + type: `error`, + message: item.message, + } +} diff --git a/packages/agents/test/coding-session.test.ts b/packages/agents/test/coding-session.test.ts index 42452512bd..b479acb45a 100644 --- a/packages/agents/test/coding-session.test.ts +++ b/packages/agents/test/coding-session.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest' import { createEntityRegistry } from '@electric-ax/agents-runtime' import { registerCodingSession } from '../src/agents/coding-session' +import type { NormalizedEvent } from 'agent-session-protocol' function makeFakeCtx(opts: { firstWake: boolean @@ -81,6 +82,7 @@ function makeFakeCtx(opts: { sessionMeta: { 
get: (k: string) => state.sessionMeta!.get(k) }, cursorState: { get: (k: string) => state.cursorState!.get(k) }, events: { + get: (k: string) => state.events!.get(k), get toArray() { return Array.from(state.events!.values()) }, @@ -178,16 +180,26 @@ describe(`registerCodingSession`, () => { }) it(`invokes the injected cliRunner for a queued prompt and mirrors normalized events`, async () => { - // Inject a fake runner + fake agent-session-protocol pullEvents path - // by pre-populating the cursorState (so pullNewEvents takes the tail - // branch) and attaching to an existing nativeSessionId (so the - // lazy-create path that hits the filesystem is bypassed). - // - // This still exercises the handler's queue-drain logic without - // touching ~/.claude or ~/.codex. + // Pre-populate the cursorState with a non-empty seeded marker so + // the initial-mirror path is skipped (no filesystem touch). The + // injected runner streams events and the orchestrator should + // append them to the events collection and complete cleanly. const runner = { - run: vi.fn(async () => ({ exitCode: 0, stdout: ``, stderr: `` })), + run: vi.fn( + async (callArgs: { + onEvent?: (ev: NormalizedEvent) => void + onSessionId?: (id: string) => void + }) => { + callArgs.onEvent?.({ + v: 1, + ts: 1714000000000, + type: `assistant_message`, + text: `hi back`, + }) + return { exitCode: 0, stdout: `hi back`, stderr: `` } + } + ), } const registry = createEntityRegistry() registerCodingSession(registry, { @@ -196,7 +208,7 @@ describe(`registerCodingSession`, () => { }) const def = registry.get(`coder`)! 
- const { ctx, state, calls } = makeFakeCtx({ + const { ctx, state } = makeFakeCtx({ firstWake: false, args: { agent: `claude`, nativeSessionId: `existing-uuid` }, inbox: [ @@ -217,22 +229,20 @@ describe(`registerCodingSession`, () => { cwd: `/tmp/x`, status: `idle`, }, - cursorState: { key: `current`, cursor: ``, eventCounter: 0 }, + cursorState: { + key: `current`, + cursor: `sdk-stream`, + eventCounter: 0, + }, }, }) - // The handler will call resolveSession + loadSession under the hood, - // which hit the filesystem. Expect this call to throw — we're - // asserting the error surfaces cleanly as a failed prompt rather - // than a hang. - await expect( - def.definition.handler( - ctx as unknown as Parameters[0], - { type: `message_received` } as unknown as Parameters< - typeof def.definition.handler - >[1] - ) - ).rejects.toThrow() + await def.definition.handler( + ctx as unknown as Parameters[0], + { type: `message_received` } as unknown as Parameters< + typeof def.definition.handler + >[1] + ) // Runner was invoked with the prompt expect(runner.run).toHaveBeenCalledTimes(1) @@ -247,14 +257,16 @@ describe(`registerCodingSession`, () => { expect(call.prompt).toBe(`say hi`) expect(call.sessionId).toBe(`existing-uuid`) - // Meta was flipped to error with a diagnostic message + // Streamed event made it into the events collection + expect(state.events!.size).toBe(1) + const event = Array.from(state.events!.values())[0]! + expect(event.type).toBe(`assistant_message`) + + // Meta is back to idle and the inbox key is marked processed const meta = state.sessionMeta!.get(`current`)! - expect(meta.status).toBe(`error`) - expect(typeof meta.error).toBe(`string`) - // The prompt is marked as processed so it won't be retried on the next wake + expect(meta.status).toBe(`idle`) const cursor = state.cursorState!.get(`current`)! 
expect(cursor.lastProcessedInboxKey).toBe(`m-001`) - void calls // reserved for future assertions }) it(`accepts inbox messages without message_type (bare /send from generic UI)`, async () => { @@ -289,18 +301,20 @@ describe(`registerCodingSession`, () => { cwd: `/tmp/x`, status: `idle`, }, - cursorState: { key: `current`, cursor: ``, eventCounter: 0 }, + cursorState: { + key: `current`, + cursor: `sdk-stream`, + eventCounter: 0, + }, }, }) - await expect( - def.definition.handler( - ctx as unknown as Parameters[0], - { type: `message_received` } as unknown as Parameters< - typeof def.definition.handler - >[1] - ) - ).rejects.toThrow() // resolveSession fails for a synthetic id — same as the other test + await def.definition.handler( + ctx as unknown as Parameters[0], + { type: `message_received` } as unknown as Parameters< + typeof def.definition.handler + >[1] + ) expect(runner.run).toHaveBeenCalledTimes(1) const call = ( diff --git a/packages/agents/test/find-new-session-after-run.test.ts b/packages/agents/test/find-new-session-after-run.test.ts deleted file mode 100644 index 73d6aee3ba..0000000000 --- a/packages/agents/test/find-new-session-after-run.test.ts +++ /dev/null @@ -1,165 +0,0 @@ -import * as fs from 'node:fs' -import * as fsp from 'node:fs/promises' -import * as path from 'node:path' -import { tmpdir } from 'node:os' -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -import { - findNewSessionAfterRun, - getClaudeProjectDirs, - listClaudeJsonlIdsByCwd, -} from '../src/agents/coding-session' - -// Each test runs against a private fake $HOME so the real -// `~/.claude/projects/` is never touched. `homedir()` reads HOME, so -// stubbing it via vitest is enough to redirect every path-derivation -// helper inside coding-session.ts. 
-let fakeHome: string - -beforeEach(() => { - fakeHome = fs.mkdtempSync(path.join(tmpdir(), `coder-test-`)) - vi.stubEnv(`HOME`, fakeHome) -}) - -afterEach(() => { - vi.unstubAllEnvs() - fs.rmSync(fakeHome, { recursive: true, force: true }) -}) - -function projectsDirFor(cwd: string): string { - return path.join(fakeHome, `.claude`, `projects`, cwd.replace(/\//g, `-`)) -} - -async function writeJsonl( - cwd: string, - sessionId: string, - opts: { mtimeOffsetMs?: number } = {} -): Promise { - const dir = projectsDirFor(cwd) - await fsp.mkdir(dir, { recursive: true }) - const file = path.join(dir, `${sessionId}.jsonl`) - await fsp.writeFile(file, ``) - if (opts.mtimeOffsetMs !== undefined) { - const t = new Date(Date.now() + opts.mtimeOffsetMs) - await fsp.utimes(file, t, t) - } -} - -describe(`findNewSessionAfterRun (claude)`, () => { - it(`returns null when the per-cwd projects directory doesn't exist`, async () => { - const result = await findNewSessionAfterRun( - `claude`, - `/tmp/nope`, - new Set(), - new Set() - ) - expect(result).toBeNull() - }) - - it(`returns the sessionId of the only new jsonl in the cwd dir`, async () => { - const cwd = `/tmp/cwd-a` - await writeJsonl(cwd, `aaa-111`) - - const result = await findNewSessionAfterRun( - `claude`, - cwd, - new Set(), - new Set() - ) - expect(result).toBe(`aaa-111`) - }) - - it(`picks the newest by mtime when multiple new jsonls are present`, async () => { - const cwd = `/tmp/cwd-b` - await writeJsonl(cwd, `older`, { mtimeOffsetMs: -10_000 }) - await writeJsonl(cwd, `newest`, { mtimeOffsetMs: 0 }) - await writeJsonl(cwd, `middle`, { mtimeOffsetMs: -5_000 }) - - const result = await findNewSessionAfterRun( - `claude`, - cwd, - new Set(), - new Set() - ) - expect(result).toBe(`newest`) - }) - - it(`filters out sessionIds that were already present before the run`, async () => { - const cwd = `/tmp/cwd-c` - await writeJsonl(cwd, `pre-1`, { mtimeOffsetMs: 0 }) - await writeJsonl(cwd, `post-1`, { mtimeOffsetMs: 
-1_000 }) - - const result = await findNewSessionAfterRun( - `claude`, - cwd, - new Set([`pre-1`]), - new Set() - ) - expect(result).toBe(`post-1`) - }) - - it(`falls back to discoverNewestSession (returning null here, since no real ~/.claude/sessions lock files exist) when nothing is found in the deterministic dir`, async () => { - const result = await findNewSessionAfterRun( - `claude`, - `/tmp/cwd-empty`, - new Set(), - new Set() - ) - expect(result).toBeNull() - }) -}) - -describe(`getClaudeProjectDirs`, () => { - it(`returns the sanitized-cwd directory under fake $HOME`, async () => { - const dirs = await getClaudeProjectDirs(`/private/tmp/foo`) - // realpath resolution may produce a second candidate when the path - // exists on disk; in this test the path doesn't exist, so we get - // exactly the raw-form candidate. - expect(dirs[0]).toBe( - path.join(fakeHome, `.claude`, `projects`, `-private-tmp-foo`) - ) - }) - - it(`also returns the realpath-resolved candidate when the cwd is a symlink`, async () => { - // /tmp on macOS is a symlink to /private/tmp; we replicate that - // shape inside the fake home so the test is portable. - const target = path.join(fakeHome, `realdir`) - const link = path.join(fakeHome, `linkdir`) - fs.mkdirSync(target, { recursive: true }) - fs.symlinkSync(target, link) - - const dirs = await getClaudeProjectDirs(link) - expect(dirs.length).toBe(2) - expect(dirs[0]).toContain(link.replace(/\//g, `-`)) - expect(dirs[1]).toContain(target.replace(/\//g, `-`)) - }) -}) - -describe(`listClaudeJsonlIdsByCwd`, () => { - it(`unions ids across realpath and raw-form dirs and ignores non-jsonl files`, async () => { - const cwd = `/tmp/cwd-list` - await writeJsonl(cwd, `id-1`) - await writeJsonl(cwd, `id-2`) - // Drop a non-jsonl into the same dir to confirm it's ignored. 
- await fsp.writeFile(path.join(projectsDirFor(cwd), `notes.txt`), `x`) - - const ids = await listClaudeJsonlIdsByCwd(cwd) - expect(Array.from(ids).sort()).toEqual([`id-1`, `id-2`]) - }) - - it(`returns an empty set when the cwd has no projects directory`, async () => { - const ids = await listClaudeJsonlIdsByCwd(`/tmp/cwd-absent`) - expect(ids.size).toBe(0) - }) -}) - -describe(`findNewSessionAfterRun (codex)`, () => { - it(`falls through to discoverNewestSession (no codex sessions on the fake $HOME → null)`, async () => { - const result = await findNewSessionAfterRun( - `codex`, - `/tmp/cwd-codex`, - new Set(), - new Set() - ) - expect(result).toBeNull() - }) -}) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c25aaa3b3b..1edf5f68b0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1502,6 +1502,9 @@ importers: packages/agents: dependencies: + '@anthropic-ai/claude-agent-sdk': + specifier: ^0.2.123 + version: 0.2.123(zod@4.3.6) '@anthropic-ai/sdk': specifier: ^0.78.0 version: 0.78.0(zod@4.3.6) @@ -1517,12 +1520,15 @@ importers: '@mariozechner/pi-ai': specifier: ^0.70.2 version: 0.70.2(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) + '@openai/codex-sdk': + specifier: ^0.125.0 + version: 0.125.0 '@sinclair/typebox': specifier: ^0.34.48 version: 0.34.49 agent-session-protocol: - specifier: ^0.0.2 - version: 0.0.2 + specifier: ^0.0.8 + version: 0.0.8 better-sqlite3: specifier: ^11.10.0 version: 11.10.0 @@ -2398,6 +2404,52 @@ packages: '@antfu/install-pkg@1.1.0': resolution: {integrity: sha512-MGQsmw10ZyI+EJo45CdSER4zEb+p31LpDAFp2Z3gkSd1yqVZGi0Ebx++YTEMonJy4oChEMLsxZ64j8FH6sSqtQ==} + '@anthropic-ai/claude-agent-sdk-darwin-arm64@0.2.123': + resolution: {integrity: sha512-tYAXCjlXZQklsUs0J//gip3fZQRzhlH5OCgvNXV70qe7A1iiwHqO2KPGvEHV1L+deEKQoMZmTaCOrQpN6zju3w==} + cpu: [arm64] + os: [darwin] + + '@anthropic-ai/claude-agent-sdk-darwin-x64@0.2.123': + resolution: {integrity: 
sha512-AcUC6sTon6z6HculP87KsAOeTMRLBwpovdhcXUTjXUpo/8nplJ7lBEzWjZCHt8FF1KuN/WBy1Z4bDg/59TQDmA==} + cpu: [x64] + os: [darwin] + + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl@0.2.123': + resolution: {integrity: sha512-bYgRiaf2q+yVbGAoUluuhqrEW1zexL34+3HDmK9DneKXa2K2EJpw4M6Sq4XoBD/JezGaemoAP78Xv/M/QUS1OQ==} + cpu: [arm64] + os: [linux] + + '@anthropic-ai/claude-agent-sdk-linux-arm64@0.2.123': + resolution: {integrity: sha512-7+GnbcF3/aZ8RJ1WmU/ogtPsOpknBAoUPer90MvZuFYBLPT9iI/U7f24gjrOHuYdcbDA5n7jFlhcfIO26F5DJQ==} + cpu: [arm64] + os: [linux] + + '@anthropic-ai/claude-agent-sdk-linux-x64-musl@0.2.123': + resolution: {integrity: sha512-IX95lFKhmmndY/YPfWPsVV+C3rLYJmuuq5wCS53p6jYIkCMxH1iGfhBGF1EUWcXO4Uc8yqXFmQ3aaxMzOOPrwA==} + cpu: [x64] + os: [linux] + + '@anthropic-ai/claude-agent-sdk-linux-x64@0.2.123': + resolution: {integrity: sha512-Xi+Rwk8uP5vWEnawJOlsk179fr0ATLl5J90MlbLj+puKaX5svEq8ljS+P3zq6zHTJeKh9GKLzPf7bc5YJKwcew==} + cpu: [x64] + os: [linux] + + '@anthropic-ai/claude-agent-sdk-win32-arm64@0.2.123': + resolution: {integrity: sha512-WDZmAQG1rOiqNLZlSXaCjSWmqJvLk2io+vFQWWqSy2b5HCk9pa3PadLiaLztiihyk81wPhH9Q/44kOxdyfEGMw==} + cpu: [arm64] + os: [win32] + + '@anthropic-ai/claude-agent-sdk-win32-x64@0.2.123': + resolution: {integrity: sha512-588xrd1i6d4kXQ6FqwL+cgBiN4evRQSi5DCtPa02CZ3VEbuVQBeFlyPlD8tfWtNNeGZ4NM8kjPNNzZz5omezPA==} + cpu: [x64] + os: [win32] + + '@anthropic-ai/claude-agent-sdk@0.2.123': + resolution: {integrity: sha512-a4TysYoR9DBdkM9Uwh4J5ub7TwKmRPe5hFiWh4En+IKC+qkk5UFkxFM22c//cZjYZKynHX0ah2t6LUqb+najYA==} + engines: {node: '>=18.0.0'} + peerDependencies: + zod: ^4.0.0 + '@anthropic-ai/sdk@0.73.0': resolution: {integrity: sha512-URURVzhxXGJDGUGFunIOtBlSl7KWvZiAAKY/ttTkZAkXT9bTPqdk2eK0b8qqSxXpikh3QKPnPYpiyX98zf5ebw==} hasBin: true @@ -2416,6 +2468,15 @@ packages: zod: optional: true + '@anthropic-ai/sdk@0.81.0': + resolution: {integrity: sha512-D4K5PvEV6wPiRtVlVsJHIUhHAmOZ6IT/I9rKlTf84gR7GyyAurPJK7z9BOf/AZqC5d1DhYQGJNKRmV+q8dGhgw==} + hasBin: 
true + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + peerDependenciesMeta: + zod: + optional: true + '@anthropic-ai/sdk@0.90.0': resolution: {integrity: sha512-MzZtPabJF1b0FTDl6Z6H5ljphPwACLGP13lu8MTiB8jXaW/YXlpOp+Po2cVou3MPM5+f5toyLnul9whKCy7fBg==} hasBin: true @@ -5982,6 +6043,51 @@ packages: resolution: {integrity: sha512-hAX0pT/73190NLqBPPWSdBVGtbY6VOhWYK3qqHqtXQ1gK7kS2yz4+ivsN07hpJ6I3aeMtKP6J6npsEKOAzuTLA==} engines: {node: '>=20.0'} + '@openai/codex-sdk@0.125.0': + resolution: {integrity: sha512-1xCIHdSbQVF880nJ2aVWdPIsWZbSpKODwuP9y/gvtChDYhYfYEW0DKp2H8ZlctkzIjlzS/WzYmP6ZZPHIvs2Dg==} + engines: {node: '>=18'} + + '@openai/codex@0.125.0': + resolution: {integrity: sha512-GiE9wlgL95u/5BRirY5d3EaRLU1tu7Y1R09R8lCHHVmcQdSmhS809FdPDWH3gIYHS7ZriAPqXwJ3aLA0WKl40Q==} + engines: {node: '>=16'} + hasBin: true + + '@openai/codex@0.125.0-darwin-arm64': + resolution: {integrity: sha512-Gn2fHiSO0XgyHp1OSd5DWUTm66Bv9UEuipW5pVEj1E+hWZCOrdqnYttllKFWtRGj5yiKefNX3JIxONgh/ZwlOQ==} + engines: {node: '>=16'} + cpu: [arm64] + os: [darwin] + + '@openai/codex@0.125.0-darwin-x64': + resolution: {integrity: sha512-TZ5Lek2X/UXTI9LXFxzarvQaJeuTrqVh4POc7soO/8RclVnCxADnCf15sivxLd5eiFW4t0myGoeVoM4lciRiRg==} + engines: {node: '>=16'} + cpu: [x64] + os: [darwin] + + '@openai/codex@0.125.0-linux-arm64': + resolution: {integrity: sha512-pPnJoJD6rZ2Iin0zNt/up36bO2/EOp2B+1/rPHu/lSq3PJbT3Fmnfut2kJy5LylXb7bGA2XQbtqOogZzIbnlkA==} + engines: {node: '>=16'} + cpu: [arm64] + os: [linux] + + '@openai/codex@0.125.0-linux-x64': + resolution: {integrity: sha512-K2NTTEeBpz/G+N2x17UGWfauRt3So+ir4f+U/60l5PPnYEJB/w3YZrlXo2G9og8Dm9BqtoBAjoPV74sRv9tWWQ==} + engines: {node: '>=16'} + cpu: [x64] + os: [linux] + + '@openai/codex@0.125.0-win32-arm64': + resolution: {integrity: sha512-zxoUakw9oIHIFrAyk400XkkLBJFA6nOym0NDq6sQ/jhdcYraKqNSRCII2nsBwZHk+/4zgUvuk52iuutgysY/rQ==} + engines: {node: '>=16'} + cpu: [arm64] + os: [win32] + + '@openai/codex@0.125.0-win32-x64': + resolution: {integrity: 
sha512-ofpOK+OWH5QFuUZ9pTM0d/PcXUXiIP5z5DpRcE9MlucJoyOl4Zy4Nu3NcuHF4YzCkZMQb6x3j0tjDEPHKqNQzw==} + engines: {node: '>=16'} + cpu: [x64] + os: [win32] + '@opentelemetry/api@1.9.1': resolution: {integrity: sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==} engines: {node: '>=8.0.0'} @@ -10162,8 +10268,8 @@ packages: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} - agent-session-protocol@0.0.2: - resolution: {integrity: sha512-mGGQKUB9RrOCl8y22uF1AJlcGV+v1Z+SII/wEDgUp4p0Emx83w8s1C+4Mtjn18ksnr3ia5eI9jwwLguNlZ5RDw==} + agent-session-protocol@0.0.8: + resolution: {integrity: sha512-1LHPvWzole19D+Iv8vj6ktMy5epw5tYtq9g35si3t0YHLHPSXuWL7mSqZb9oIt2ZeXtwO7vJE9hTrTM7gckjvQ==} engines: {node: '>=18.0.0'} hasBin: true @@ -18057,6 +18163,7 @@ packages: uuid@10.0.0: resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true uuid@11.1.0: @@ -18065,14 +18172,17 @@ packages: uuid@7.0.3: resolution: {integrity: sha512-DPSke0pXhTZgoF/d+WSt2QaKMCFSfx7QegxEWT+JOuHF5aWrKEn0G+ztjuJg/gG8/ItK+rbPCD/yNv8yyih6Cg==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true uuid@8.0.0: resolution: {integrity: sha512-jOXGuXZAWdsTH7eZLtyXMqUb9EcWMGZNbL9YcGBJl4MH4nrxHmZJhEHvyLFrkxo+28uLb/NYRcStH48fnD0Vzw==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). 
hasBin: true uuid@9.0.1: resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true valibot@1.0.0: @@ -19064,6 +19174,48 @@ snapshots: package-manager-detector: 1.6.0 tinyexec: 1.0.2 + '@anthropic-ai/claude-agent-sdk-darwin-arm64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-darwin-x64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-arm64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-x64-musl@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-linux-x64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-win32-arm64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk-win32-x64@0.2.123': + optional: true + + '@anthropic-ai/claude-agent-sdk@0.2.123(zod@4.3.6)': + dependencies: + '@anthropic-ai/sdk': 0.81.0(zod@4.3.6) + '@modelcontextprotocol/sdk': 1.29.0(zod@4.3.6) + zod: 4.3.6 + optionalDependencies: + '@anthropic-ai/claude-agent-sdk-darwin-arm64': 0.2.123 + '@anthropic-ai/claude-agent-sdk-darwin-x64': 0.2.123 + '@anthropic-ai/claude-agent-sdk-linux-arm64': 0.2.123 + '@anthropic-ai/claude-agent-sdk-linux-arm64-musl': 0.2.123 + '@anthropic-ai/claude-agent-sdk-linux-x64': 0.2.123 + '@anthropic-ai/claude-agent-sdk-linux-x64-musl': 0.2.123 + '@anthropic-ai/claude-agent-sdk-win32-arm64': 0.2.123 + '@anthropic-ai/claude-agent-sdk-win32-x64': 0.2.123 + transitivePeerDependencies: + - '@cfworker/json-schema' + - supports-color + '@anthropic-ai/sdk@0.73.0(zod@4.3.6)': dependencies: json-schema-to-ts: 3.1.1 @@ -19076,6 +19228,12 @@ snapshots: optionalDependencies: zod: 4.3.6 + '@anthropic-ai/sdk@0.81.0(zod@4.3.6)': + dependencies: + 
json-schema-to-ts: 3.1.1 + optionalDependencies: + zod: 4.3.6 + '@anthropic-ai/sdk@0.90.0(zod@4.3.6)': dependencies: json-schema-to-ts: 3.1.1 @@ -23494,7 +23652,7 @@ snapshots: jose: 6.2.3 json-schema-typed: 8.0.2 pkce-challenge: 5.0.1 - raw-body: 3.0.0 + raw-body: 3.0.2 zod: 3.25.76 zod-to-json-schema: 3.25.2(zod@3.25.76) transitivePeerDependencies: @@ -23516,22 +23674,21 @@ snapshots: jose: 6.2.3 json-schema-typed: 8.0.2 pkce-challenge: 5.0.1 - raw-body: 3.0.0 + raw-body: 3.0.2 zod: 4.3.6 zod-to-json-schema: 3.25.2(zod@4.3.6) transitivePeerDependencies: - supports-color - optional: true '@modelcontextprotocol/sdk@1.6.1': dependencies: content-type: 1.0.5 cors: 2.8.5 eventsource: 3.0.7 - express: 5.1.0 - express-rate-limit: 7.5.1(express@5.1.0) + express: 5.2.1 + express-rate-limit: 7.5.1(express@5.2.1) pkce-challenge: 4.1.0 - raw-body: 3.0.0 + raw-body: 3.0.2 zod: 3.25.76 zod-to-json-schema: 3.25.2(zod@3.25.76) transitivePeerDependencies: @@ -23651,6 +23808,37 @@ snapshots: '@oozcitak/util@10.0.0': {} + '@openai/codex-sdk@0.125.0': + dependencies: + '@openai/codex': 0.125.0 + + '@openai/codex@0.125.0': + optionalDependencies: + '@openai/codex-darwin-arm64': '@openai/codex@0.125.0-darwin-arm64' + '@openai/codex-darwin-x64': '@openai/codex@0.125.0-darwin-x64' + '@openai/codex-linux-arm64': '@openai/codex@0.125.0-linux-arm64' + '@openai/codex-linux-x64': '@openai/codex@0.125.0-linux-x64' + '@openai/codex-win32-arm64': '@openai/codex@0.125.0-win32-arm64' + '@openai/codex-win32-x64': '@openai/codex@0.125.0-win32-x64' + + '@openai/codex@0.125.0-darwin-arm64': + optional: true + + '@openai/codex@0.125.0-darwin-x64': + optional: true + + '@openai/codex@0.125.0-linux-arm64': + optional: true + + '@openai/codex@0.125.0-linux-x64': + optional: true + + '@openai/codex@0.125.0-win32-arm64': + optional: true + + '@openai/codex@0.125.0-win32-x64': + optional: true + '@opentelemetry/api@1.9.1': {} '@oxc-minify/binding-android-arm64@0.96.0': @@ -29587,7 +29775,7 @@ snapshots: 
agent-base@7.1.4: {} - agent-session-protocol@0.0.2: + agent-session-protocol@0.0.8: dependencies: '@durable-streams/client': 0.2.3 '@modelcontextprotocol/sdk': 1.29.0(zod@3.25.76) @@ -30150,7 +30338,7 @@ snapshots: bytes: 3.1.2 content-type: 1.0.5 debug: 4.4.3 - http-errors: 2.0.0 + http-errors: 2.0.1 iconv-lite: 0.7.2 on-finished: 2.4.1 qs: 6.15.1 @@ -32506,9 +32694,9 @@ snapshots: exponential-backoff@3.1.2: {} - express-rate-limit@7.5.1(express@5.1.0): + express-rate-limit@7.5.1(express@5.2.1): dependencies: - express: 5.1.0 + express: 5.2.1 express-rate-limit@8.4.1(express@5.2.1): dependencies: @@ -32598,19 +32786,19 @@ snapshots: etag: 1.8.1 finalhandler: 2.1.0 fresh: 2.0.0 - http-errors: 2.0.0 + http-errors: 2.0.1 merge-descriptors: 2.0.0 mime-types: 3.0.1 on-finished: 2.4.1 once: 1.4.0 parseurl: 1.3.3 proxy-addr: 2.0.7 - qs: 6.14.0 + qs: 6.15.1 range-parser: 1.2.1 router: 2.2.0 send: 1.2.0 serve-static: 2.2.0 - statuses: 2.0.1 + statuses: 2.0.2 type-is: 2.0.1 vary: 1.1.2 transitivePeerDependencies: From 0c3e6f6247122a8b2da0d85447637141e3c18908 Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 11:47:22 +0200 Subject: [PATCH 2/7] changeset: coder entity now uses SDKs instead of CLI binaries Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/coder-sdk-runners.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changeset/coder-sdk-runners.md diff --git a/.changeset/coder-sdk-runners.md b/.changeset/coder-sdk-runners.md new file mode 100644 index 0000000000..31c995b513 --- /dev/null +++ b/.changeset/coder-sdk-runners.md @@ -0,0 +1,7 @@ +--- +'@electric-ax/agents': minor +--- + +feat: drive the coder entity via Claude Code and Codex SDKs instead of the `claude` / `codex` CLI binaries + +The `coder` entity now invokes `@anthropic-ai/claude-agent-sdk` and `@openai/codex-sdk` directly, so the host no longer needs `claude` or `codex` installed on PATH — both SDKs ship their own platform-specific subprocess binaries as optional 
dependencies. Events stream from the SDK iterators into the entity's durable event collection live, replacing the previous JSONL file-watcher and post-run discovery plumbing. From cb07580d976b18bea652474b16b8ee91a460fce1 Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 12:02:07 +0200 Subject: [PATCH 3/7] refactor(agents): remove dead startedItems Set in codex SDK runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The set was populated on item.started but never read — neither guarded duplicate processing nor drove downstream logic, just leaked memory across long sessions. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/agents/src/agents/runners/codex-sdk.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/agents/src/agents/runners/codex-sdk.ts b/packages/agents/src/agents/runners/codex-sdk.ts index f8f6928e12..64587bd16a 100644 --- a/packages/agents/src/agents/runners/codex-sdk.ts +++ b/packages/agents/src/agents/runners/codex-sdk.ts @@ -44,7 +44,6 @@ export const codexSdkRunner: CodingSessionCliRunner = { ? codex.resumeThread(opts.sessionId, threadOptions) : codex.startThread(threadOptions) - const startedItems = new Set() let turnFailed: { message: string } | null = null let assistantText = `` let capturedSessionId: string | null = opts.sessionId ?? 
null @@ -76,7 +75,6 @@ export const codexSdkRunner: CodingSessionCliRunner = { case `item.started`: { const startEvents = threadItemStartedToEvents(ev.item) for (const e of startEvents) opts.onEvent?.(e) - startedItems.add(ev.item.id) break } case `item.completed`: { From 26c6dbbb37dc22c6dcec4e7d7ea81ce18230d2ca Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 12:02:51 +0200 Subject: [PATCH 4/7] fix(agents): emit tool_result for completed Codex web_search items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously web_search items emitted a tool_call on item.started but nothing on item.completed, leaving the lifecycle dangling — UIs that render tool calls would show a perpetually-pending web search. Codex's WebSearchItem doesn't expose result content (only `query`), so emit an empty tool_result to honour the contract every other tool-style item already follows. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agents/src/agents/runners/codex-sdk.ts | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/packages/agents/src/agents/runners/codex-sdk.ts b/packages/agents/src/agents/runners/codex-sdk.ts index 64587bd16a..37cd1dc63e 100644 --- a/packages/agents/src/agents/runners/codex-sdk.ts +++ b/packages/agents/src/agents/runners/codex-sdk.ts @@ -169,7 +169,12 @@ function threadItemCompletedToEvents(item: ThreadItem): Array { case `mcp_tool_call`: return [mcpToolCallToToolResult(item, ts)] case `web_search`: - return [] + // Codex's WebSearchItem doesn't expose the search results to the + // SDK consumer (only `query`), so we can't produce a meaningful + // tool_result payload. Emit an empty one anyway to honor the + // tool_call→tool_result contract — without it any UI rendering + // tool lifecycles would show a perpetually-pending web search. 
+ return [webSearchToToolResult(item, ts)] case `todo_list`: return [] case `error`: @@ -317,6 +322,20 @@ function webSearchToToolCall(item: WebSearchItem, ts: number): NormalizedEvent { } } +function webSearchToToolResult( + item: WebSearchItem, + ts: number +): NormalizedEvent { + return { + v: 1, + ts, + type: `tool_result`, + callId: item.id, + output: ``, + isError: false, + } +} + function errorItemToEvent(item: ErrorItem, ts: number): NormalizedEvent { return { v: 1, From d917b225af7e14aebafb19f806c7780a5aaae133 Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 12:03:07 +0200 Subject: [PATCH 5/7] docs(agents): explain why both Claude permission flags are required MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer flagged permissionMode + allowDangerouslySkipPermissions as possibly redundant. They aren't — the SDK requires both: the mode selects bypass and the boolean is the explicit acknowledgement gate (SDK throws if missing when the bypass mode is used). Document that in the call site so the next reader doesn't try to drop one. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/agents/src/agents/runners/claude-sdk.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/agents/src/agents/runners/claude-sdk.ts b/packages/agents/src/agents/runners/claude-sdk.ts index 7cbfcb4f78..c5657a1721 100644 --- a/packages/agents/src/agents/runners/claude-sdk.ts +++ b/packages/agents/src/agents/runners/claude-sdk.ts @@ -23,6 +23,12 @@ export const claudeSdkRunner: CodingSessionCliRunner = { options: { cwd: opts.cwd, ...(opts.sessionId ? { resume: opts.sessionId } : {}), + // The Claude SDK requires *both* of these to skip approvals: + // `permissionMode: 'bypassPermissions'` selects the bypass + // policy and `allowDangerouslySkipPermissions: true` is the + // explicit acknowledgement gate (the SDK throws unless that + // boolean is set when the bypass mode is used). 
They are not + // redundant despite the name overlap. permissionMode: `bypassPermissions`, allowDangerouslySkipPermissions: true, }, From fe192aca4d49c748acd838c5045c997feb661a11 Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 12:04:57 +0200 Subject: [PATCH 6/7] test(agents): unit-test SDK message and ThreadItem conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer flagged the SDK→NormalizedEvent conversion helpers as the most likely place for silent regressions when the underlying SDKs evolve. They are pure functions and don't need the SDK to be mocked, so cover each branch directly: - `sdkMessageToClaudeEntry` for system/init, system/compact_boundary, user, assistant (with usage + stop_reason), result (renames `duration_ms`), and SDK-only types that should return null. - `threadItemStartedToEvents` and `threadItemCompletedToEvents` for every ThreadItem variant — including the new web_search lifecycle pairing, file_change kind classification (add → file_write, mixed → file_edit), MCP success/failure, and command exit-code → isError. Helpers are exported with a doc comment marking them as test-only entry points so it's clear they're not part of the runner's public surface. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agents/src/agents/runners/claude-sdk.ts | 4 +- .../agents/src/agents/runners/codex-sdk.ts | 10 +- .../agents/test/runners/claude-sdk.test.ts | 150 ++++++++++ .../agents/test/runners/codex-sdk.test.ts | 270 ++++++++++++++++++ 4 files changed, 431 insertions(+), 3 deletions(-) create mode 100644 packages/agents/test/runners/claude-sdk.test.ts create mode 100644 packages/agents/test/runners/codex-sdk.test.ts diff --git a/packages/agents/src/agents/runners/claude-sdk.ts b/packages/agents/src/agents/runners/claude-sdk.ts index c5657a1721..c9c260fa4c 100644 --- a/packages/agents/src/agents/runners/claude-sdk.ts +++ b/packages/agents/src/agents/runners/claude-sdk.ts @@ -89,8 +89,10 @@ export const claudeSdkRunner: CodingSessionCliRunner = { * * Returns null for SDK-only message types (status pings, retries, hook * lifecycle, etc.) that have no JSONL counterpart. + * + * Exported for unit testing — the runner is the only production caller. */ -function sdkMessageToClaudeEntry(msg: SDKMessage): ClaudeEntry | null { +export function sdkMessageToClaudeEntry(msg: SDKMessage): ClaudeEntry | null { const ts = (msg as { timestamp?: string }).timestamp ?? new Date().toISOString() const sessionId = (msg as { session_id?: string }).session_id diff --git a/packages/agents/src/agents/runners/codex-sdk.ts b/packages/agents/src/agents/runners/codex-sdk.ts index 37cd1dc63e..c736755cd9 100644 --- a/packages/agents/src/agents/runners/codex-sdk.ts +++ b/packages/agents/src/agents/runners/codex-sdk.ts @@ -137,7 +137,10 @@ export const codexSdkRunner: CodingSessionCliRunner = { }, } -function threadItemStartedToEvents(item: ThreadItem): Array { +/** Exported for unit testing — the runner is the only production caller. 
*/ +export function threadItemStartedToEvents( + item: ThreadItem +): Array<NormalizedEvent> { const ts = Date.now() switch (item.type) { case `command_execution`: @@ -155,7 +158,10 @@ function threadItemStartedToEvents(item: ThreadItem): Array<NormalizedEvent> { } } -function threadItemCompletedToEvents(item: ThreadItem): Array<NormalizedEvent> { +/** Exported for unit testing — the runner is the only production caller. */ +export function threadItemCompletedToEvents( + item: ThreadItem +): Array<NormalizedEvent> { const ts = Date.now() switch (item.type) { case `agent_message`: diff --git a/packages/agents/test/runners/claude-sdk.test.ts b/packages/agents/test/runners/claude-sdk.test.ts new file mode 100644 index 0000000000..947702eada --- /dev/null +++ b/packages/agents/test/runners/claude-sdk.test.ts @@ -0,0 +1,150 @@ +import { describe, expect, it } from 'vitest' +import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk' +import { sdkMessageToClaudeEntry } from '../../src/agents/runners/claude-sdk' + +describe(`sdkMessageToClaudeEntry`, () => { + it(`maps a system/init message to a ClaudeEntry init entry`, () => { + const msg = { + type: `system`, + subtype: `init`, + session_id: `s-1`, + cwd: `/tmp/x`, + claude_code_version: `2.1.83`, + model: `claude-sonnet-4-5`, + tools: [], + mcp_servers: [], + slash_commands: [], + output_style: ``, + skills: [], + plugins: [], + apiKeySource: `user`, + permissionMode: `default`, + uuid: `u-1`, + } as unknown as SDKMessage + const entry = sdkMessageToClaudeEntry(msg) + expect(entry).toMatchObject({ + type: `system`, + subtype: `init`, + sessionId: `s-1`, + cwd: `/tmp/x`, + version: `2.1.83`, + message: { model: `claude-sonnet-4-5` }, + }) + }) + + it(`maps a system/compact_boundary message`, () => { + const msg = { + type: `system`, + subtype: `compact_boundary`, + session_id: `s-1`, + compact_metadata: { trigger: `auto`, pre_tokens: 100 }, + uuid: `u-2`, + } as unknown as SDKMessage + const entry = sdkMessageToClaudeEntry(msg) + expect(entry).toMatchObject({ + type: `system`, + 
subtype: `compact_boundary`, + sessionId: `s-1`, + }) + }) + + it(`maps a user message`, () => { + const msg = { + type: `user`, + session_id: `s-1`, + message: { role: `user`, content: `hello` }, + parent_tool_use_id: null, + } as unknown as SDKMessage + const entry = sdkMessageToClaudeEntry(msg) + expect(entry).toMatchObject({ + type: `user`, + sessionId: `s-1`, + message: { role: `user`, content: `hello` }, + }) + }) + + it(`maps an assistant message and preserves usage + stop_reason`, () => { + const msg = { + type: `assistant`, + session_id: `s-1`, + message: { + id: `m-1`, + type: `message`, + role: `assistant`, + model: `claude-sonnet-4-5`, + content: [{ type: `text`, text: `hi` }], + stop_reason: `end_turn`, + stop_sequence: null, + usage: { + input_tokens: 10, + output_tokens: 5, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + }, + parent_tool_use_id: null, + uuid: `u-3`, + } as unknown as SDKMessage + const entry = sdkMessageToClaudeEntry(msg) + expect(entry).toMatchObject({ + type: `assistant`, + sessionId: `s-1`, + message: { + role: `assistant`, + model: `claude-sonnet-4-5`, + stop_reason: `end_turn`, + usage: { input_tokens: 10, output_tokens: 5 }, + }, + }) + // content is forwarded through so normalizeClaudeEvent can iterate it + expect((entry!.message!.content as Array)[0]).toMatchObject({ + type: `text`, + text: `hi`, + }) + }) + + it(`maps a result message and renames duration_ms to durationMs`, () => { + const msg = { + type: `result`, + subtype: `success`, + session_id: `s-1`, + duration_ms: 1234, + duration_api_ms: 1000, + is_error: false, + num_turns: 1, + result: `done`, + stop_reason: `end_turn`, + total_cost_usd: 0.01, + usage: { + input_tokens: 10, + output_tokens: 5, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + modelUsage: {}, + permission_denials: [], + uuid: `u-4`, + } as unknown as SDKMessage + const entry = sdkMessageToClaudeEntry(msg) + expect(entry).toMatchObject({ + type: `result`, 
+ subtype: `success`, + sessionId: `s-1`, + durationMs: 1234, + message: { + usage: { input_tokens: 10, output_tokens: 5 }, + }, + }) + }) + + it(`returns null for SDK-only message types`, () => { + const msg = { + type: `auth_status`, + session_id: `s-1`, + isAuthenticating: false, + output: [], + uuid: `u-5`, + } as unknown as SDKMessage + expect(sdkMessageToClaudeEntry(msg)).toBeNull() + }) +}) diff --git a/packages/agents/test/runners/codex-sdk.test.ts b/packages/agents/test/runners/codex-sdk.test.ts new file mode 100644 index 0000000000..c056863c09 --- /dev/null +++ b/packages/agents/test/runners/codex-sdk.test.ts @@ -0,0 +1,270 @@ +import { describe, expect, it } from 'vitest' +import type { ThreadItem } from '@openai/codex-sdk' +import { + threadItemCompletedToEvents, + threadItemStartedToEvents, +} from '../../src/agents/runners/codex-sdk' + +describe(`threadItemStartedToEvents`, () => { + it(`maps command_execution to a tool_call (terminal for an unclassified cmd)`, () => { + const item: ThreadItem = { + id: `i-1`, + type: `command_execution`, + command: `pwd`, + aggregated_output: ``, + status: `in_progress`, + } + const events = threadItemStartedToEvents(item) + expect(events).toHaveLength(1) + const ev = events[0]! + expect(ev).toMatchObject({ + type: `tool_call`, + callId: `i-1`, + tool: `terminal`, + originalTool: `exec_command`, + originalAgent: `codex`, + input: { command: `pwd` }, + }) + }) + + it(`classifies cat as file_read`, () => { + const item: ThreadItem = { + id: `i-2`, + type: `command_execution`, + command: `cat /tmp/x.txt`, + aggregated_output: ``, + status: `in_progress`, + } + const ev = threadItemStartedToEvents(item)[0]! 
+ if (ev.type !== `tool_call`) throw new Error(`unexpected`) + expect(ev.tool).toBe(`file_read`) + }) + + it(`maps mcp_tool_call to a tool_call carrying the MCP tool name`, () => { + const item: ThreadItem = { + id: `i-3`, + type: `mcp_tool_call`, + server: `my-server`, + tool: `my_tool`, + arguments: { foo: 1 }, + status: `in_progress`, + } + const ev = threadItemStartedToEvents(item)[0]! + expect(ev).toMatchObject({ + type: `tool_call`, + callId: `i-3`, + tool: `my_tool`, + originalTool: `my_tool`, + input: { foo: 1 }, + }) + }) + + it(`maps web_search to a tool_call`, () => { + const item: ThreadItem = { + id: `i-4`, + type: `web_search`, + query: `electric sql`, + } + const ev = threadItemStartedToEvents(item)[0]! + expect(ev).toMatchObject({ + type: `tool_call`, + callId: `i-4`, + tool: `web_search`, + input: { query: `electric sql` }, + }) + }) + + it(`emits nothing on start for items completed in one event`, () => { + const items: Array<ThreadItem> = [ + { id: `m-1`, type: `agent_message`, text: `hello` }, + { id: `r-1`, type: `reasoning`, text: `step 1` }, + { + id: `f-1`, + type: `file_change`, + changes: [{ path: `a.txt`, kind: `add` }], + status: `completed`, + }, + ] + for (const item of items) { + expect(threadItemStartedToEvents(item)).toEqual([]) + } + }) +}) + +describe(`threadItemCompletedToEvents`, () => { + it(`maps agent_message to assistant_message with phase=final`, () => { + const item: ThreadItem = { + id: `m-1`, + type: `agent_message`, + text: `done`, + } + expect(threadItemCompletedToEvents(item)[0]).toMatchObject({ + type: `assistant_message`, + text: `done`, + phase: `final`, + }) + }) + + it(`maps reasoning to thinking`, () => { + const item: ThreadItem = { + id: `r-1`, + type: `reasoning`, + text: `the user asked for X so I'll do Y`, + } + const ev = threadItemCompletedToEvents(item)[0]! 
+ expect(ev).toMatchObject({ + type: `thinking`, + text: `the user asked for X so I'll do Y`, + }) + if (ev.type === `thinking`) { + expect(ev.summary.length).toBeLessThanOrEqual(200) + } + }) + + it(`maps a successful command_execution to tool_result`, () => { + const item: ThreadItem = { + id: `i-1`, + type: `command_execution`, + command: `ls /`, + aggregated_output: `bin\netc`, + exit_code: 0, + status: `completed`, + } + expect(threadItemCompletedToEvents(item)[0]).toMatchObject({ + type: `tool_result`, + callId: `i-1`, + output: `bin\netc`, + isError: false, + exitCode: 0, + }) + }) + + it(`marks a non-zero exit as error`, () => { + const item: ThreadItem = { + id: `i-2`, + type: `command_execution`, + command: `false`, + aggregated_output: ``, + exit_code: 1, + status: `completed`, + } + const ev = threadItemCompletedToEvents(item)[0]! + if (ev.type !== `tool_result`) throw new Error(`unexpected`) + expect(ev.isError).toBe(true) + }) + + it(`emits paired tool_call+tool_result for file_change`, () => { + const item: ThreadItem = { + id: `f-1`, + type: `file_change`, + changes: [ + { path: `a.txt`, kind: `add` }, + { path: `b.txt`, kind: `update` }, + ], + status: `completed`, + } + const events = threadItemCompletedToEvents(item) + expect(events).toHaveLength(2) + expect(events[0]).toMatchObject({ + type: `tool_call`, + callId: `f-1`, + tool: `file_edit`, // mixed adds + updates → file_edit + originalTool: `apply_patch`, + }) + expect(events[1]).toMatchObject({ + type: `tool_result`, + callId: `f-1`, + isError: false, + }) + }) + + it(`maps an all-add file_change to file_write`, () => { + const item: ThreadItem = { + id: `f-2`, + type: `file_change`, + changes: [{ path: `new.txt`, kind: `add` }], + status: `completed`, + } + const ev = threadItemCompletedToEvents(item)[0]! 
+ if (ev.type !== `tool_call`) throw new Error(`unexpected`) + expect(ev.tool).toBe(`file_write`) + }) + + it(`maps mcp_tool_call success to tool_result with structured content`, () => { + const item: ThreadItem = { + id: `i-3`, + type: `mcp_tool_call`, + server: `s`, + tool: `t`, + arguments: {}, + result: { + content: [], + structured_content: { ok: true }, + }, + status: `completed`, + } + const ev = threadItemCompletedToEvents(item)[0]! + expect(ev).toMatchObject({ + type: `tool_result`, + callId: `i-3`, + isError: false, + }) + if (ev.type === `tool_result`) { + expect(JSON.parse(ev.output)).toEqual({ ok: true }) + } + }) + + it(`maps mcp_tool_call failure to error tool_result`, () => { + const item: ThreadItem = { + id: `i-4`, + type: `mcp_tool_call`, + server: `s`, + tool: `t`, + arguments: {}, + error: { message: `boom` }, + status: `failed`, + } + const ev = threadItemCompletedToEvents(item)[0]! + expect(ev).toMatchObject({ + type: `tool_result`, + callId: `i-4`, + output: `boom`, + isError: true, + }) + }) + + it(`closes the web_search lifecycle with an empty tool_result`, () => { + const item: ThreadItem = { + id: `i-5`, + type: `web_search`, + query: `q`, + } + expect(threadItemCompletedToEvents(item)[0]).toMatchObject({ + type: `tool_result`, + callId: `i-5`, + output: ``, + isError: false, + }) + }) + + it(`maps error items to an error event`, () => { + const item: ThreadItem = { + id: `e-1`, + type: `error`, + message: `something broke`, + } + expect(threadItemCompletedToEvents(item)[0]).toMatchObject({ + type: `error`, + message: `something broke`, + }) + }) + + it(`returns empty for todo_list (no normalized counterpart yet)`, () => { + const item: ThreadItem = { + id: `t-1`, + type: `todo_list`, + items: [{ text: `do x`, completed: false }], + } + expect(threadItemCompletedToEvents(item)).toEqual([]) + }) +}) From b9d4702d461941f61e5a590d2054b45e3503654c Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Mon, 4 May 2026 13:56:05 +0200 Subject: 
[PATCH 7/7] feat(agents): coder runners use user-configured CLI auth, not API keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strip ANTHROPIC_API_KEY / OPENAI_API_KEY from the env passed to the spawned `claude` / `codex` subprocesses so they fall back to whatever the user has set up via `claude login` / `codex login` (OAuth tokens in ~/.claude/.credentials.json and ~/.codex/auth.json). The parent process can still hold those keys for other uses — Horton talks to the Anthropic API directly via @anthropic-ai/sdk and depends on the key being in scope — they just don't leak into the coder's CLI subprocess. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agents/src/agents/runners/claude-sdk.ts | 8 +++++++ .../agents/src/agents/runners/codex-sdk.ts | 9 +++++++- packages/agents/src/agents/runners/env.ts | 22 +++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 packages/agents/src/agents/runners/env.ts diff --git a/packages/agents/src/agents/runners/claude-sdk.ts b/packages/agents/src/agents/runners/claude-sdk.ts index c9c260fa4c..d6295e0615 100644 --- a/packages/agents/src/agents/runners/claude-sdk.ts +++ b/packages/agents/src/agents/runners/claude-sdk.ts @@ -4,6 +4,7 @@ import { normalizeClaudeEvent } from 'agent-session-protocol' import type { ClaudeEntry } from 'agent-session-protocol' import type { CodingSessionCliRunner } from '../coding-session.js' +import { subprocessEnvWithoutKey } from './env.js' /** * SDK-backed runner for Claude. Drives `query()` from @@ -22,6 +23,13 @@ export const claudeSdkRunner: CodingSessionCliRunner = { prompt: opts.prompt, options: { cwd: opts.cwd, + // Hide ANTHROPIC_API_KEY from the spawned `claude` subprocess + // so it falls back to whatever auth the user has configured + // (typically OAuth tokens written by `claude login`). 
Horton + // still needs the key in its own process.env to talk to the + // Anthropic API directly, so we strip it here rather than + // leaving it unset everywhere. + env: subprocessEnvWithoutKey(`ANTHROPIC_API_KEY`), ...(opts.sessionId ? { resume: opts.sessionId } : {}), // The Claude SDK requires *both* of these to skip approvals: // `permissionMode: 'bypassPermissions'` selects the bypass diff --git a/packages/agents/src/agents/runners/codex-sdk.ts b/packages/agents/src/agents/runners/codex-sdk.ts index c736755cd9..ca09f7a17c 100644 --- a/packages/agents/src/agents/runners/codex-sdk.ts +++ b/packages/agents/src/agents/runners/codex-sdk.ts @@ -13,6 +13,7 @@ import { normalizeToolName } from 'agent-session-protocol' import type { NormalizedEvent } from 'agent-session-protocol' import type { CodingSessionCliRunner } from '../coding-session.js' +import { subprocessEnvWithoutKey } from './env.js' /** * SDK-backed runner for Codex. Codex's SDK exposes ThreadEvents that @@ -29,7 +30,13 @@ import type { CodingSessionCliRunner } from '../coding-session.js' */ export const codexSdkRunner: CodingSessionCliRunner = { async run(opts) { - const codex = new Codex() + // Hide OPENAI_API_KEY from the spawned `codex` subprocess so it + // falls back to user-configured credentials (`codex login` writes + // tokens to `~/.codex/auth.json`). Symmetric with the Claude + // runner — neither coder runner consumes the parent process's API + // keys so a Horton+coder co-tenant can keep the keys in scope for + // direct API calls without leaking them into the CLI subprocesses. + const codex = new Codex({ env: subprocessEnvWithoutKey(`OPENAI_API_KEY`) }) // Mirror what the CLI runner did: write access in the cwd and no // interactive approval prompts. 
Without these the SDK defaults to + // `read-only` + `on-request` and the agent fails the moment it diff --git a/packages/agents/src/agents/runners/env.ts b/packages/agents/src/agents/runners/env.ts new file mode 100644 index 0000000000..41a989d412 --- /dev/null +++ b/packages/agents/src/agents/runners/env.ts @@ -0,0 +1,22 @@ +/** + * Build a subprocess `env` derived from `process.env` minus a single + * variable. Used by the SDK runners to hide a parent-process API key + * (e.g. `ANTHROPIC_API_KEY`) from the spawned `claude` / `codex` + * subprocess so the binary falls back to user-configured credentials + * (`claude login` OAuth, `~/.codex/auth.json`, etc.) instead of using + * the API key. + * + * Both SDKs replace the subprocess env with this object when provided + * — they don't merge — so we have to spread `process.env` first to + * preserve `HOME`, `PATH`, and everything else the binary needs. + */ +export function subprocessEnvWithoutKey( + keyName: string +): Record<string, string> { + const out: Record<string, string> = {} + for (const [k, v] of Object.entries(process.env)) { + if (k === keyName) continue + if (typeof v === `string`) out[k] = v + } + return out +}