From f80aa4e9a0a6881d78d11422e0f17361ef8dd3ff Mon Sep 17 00:00:00 2001 From: haikaljeh Date: Wed, 6 May 2026 22:52:09 -0700 Subject: [PATCH] fix(runtime): add wall-clock timeouts to JSON-RPC request and captureTurn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two unbounded `await` sites in the runtime could hang indefinitely if the app-server stopped responding mid-flight. A real-world user lost ~25 minutes when `codex app-server` accepted a connection and never replied, with no timeout to surface the failure. The two sites: 1. `AppServerClientBase.request()` in `plugins/codex/scripts/lib/app-server.mjs` — the returned Promise resolves only when a matching `id` is delivered to the message handler. If the app-server never sends a response for that id, the Promise never settles. 2. `captureTurn()` in `plugins/codex/scripts/lib/codex.mjs` — awaits `state.completion`, which only settles when `turn/completed` (or an inferred completion) fires. If the app-server stops emitting events mid-turn, the await never resolves. This adds two narrow timeouts: * Per-request RPC wall-clock timeout, 30s default, override via `CODEX_APP_SERVER_RPC_TIMEOUT_MS` (set to `0` to disable). On timeout, the pending entry is removed and the Promise rejects with a clear message naming the method and elapsed window. Resolve and reject are wrapped to clear the timer on either path so the timer cannot fire after a normal response and reach into a recycled id. * Per-turn idle (no-progress) timeout, 240s default, override via `CODEX_TURN_IDLE_TIMEOUT_MS` (set to `0` to disable). Any notification arrival resets the idle timer; a successful or failed turn-completed clears it via the existing `finally`. On expiry the turn rejects with the thread/turn ids in the message. Healthy turns and RPCs are unaffected — the timers are unrefed and cleared on every settle path. Tests: two fake-codex behaviors (`hang-rpc-thread-start` and `hang-after-turn-start`) drive the new code paths via the existing fixture/spawn pattern. With short overrides, both reject in well under 2 seconds. No source refactors outside the two timeout sites; no changes to broker, lifecycle, or transport. Diagnosed by an external user via codex's own diagnose flow against `f4d65d9`. --- plugins/codex/scripts/lib/app-server.mjs | 35 +++++++++++++++- plugins/codex/scripts/lib/codex.mjs | 47 +++++++++++++++++++++ tests/fake-codex-fixture.mjs | 20 +++++++++ tests/runtime.test.mjs | 52 ++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 1 deletion(-) diff --git a/plugins/codex/scripts/lib/app-server.mjs b/plugins/codex/scripts/lib/app-server.mjs index 127c8376..ce803ff1 100644 --- a/plugins/codex/scripts/lib/app-server.mjs +++ b/plugins/codex/scripts/lib/app-server.mjs @@ -22,6 +22,20 @@ const PLUGIN_MANIFEST = JSON.parse(fs.readFileSync(PLUGIN_MANIFEST_URL, "utf8")) export const BROKER_ENDPOINT_ENV = "CODEX_COMPANION_APP_SERVER_ENDPOINT"; export const BROKER_BUSY_RPC_CODE = -32001; +const DEFAULT_RPC_TIMEOUT_MS = 30_000; + +function resolveRpcTimeoutMs() { + const raw = process.env.CODEX_APP_SERVER_RPC_TIMEOUT_MS; + if (raw === undefined || raw === "") { + return DEFAULT_RPC_TIMEOUT_MS; + } + const parsed = Number(raw); + if (!Number.isFinite(parsed) || parsed < 0) { + return DEFAULT_RPC_TIMEOUT_MS; + } + return parsed; +} + /** @type {ClientInfo} */ const DEFAULT_CLIENT_INFO = { title: "Codex Plugin", @@ -91,7 +105,26 @@ class AppServerClientBase { this.nextId += 1; return new Promise((resolve, reject) => { - this.pending.set(id, { resolve, reject, method }); + const timeoutMs = resolveRpcTimeoutMs(); + let timer = null; + const wrappedResolve = (value) => { + if (timer) clearTimeout(timer); + resolve(value); + }; + const wrappedReject = (err) => { + if (timer) clearTimeout(timer); + reject(err); + }; + if (timeoutMs > 0) { + timer = setTimeout(() => { + if (this.pending.get(id)?.reject === wrappedReject) { + this.pending.delete(id); + } + reject(new Error(`codex app-server ${method} request timed out after ${timeoutMs}ms`)); + }, timeoutMs); + timer.unref?.(); + } + this.pending.set(id, { resolve: wrappedResolve, reject: wrappedReject, method }); this.sendMessage({ id, method, params }); }); } diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..902aff93 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -550,11 +550,54 @@ function applyTurnNotification(state, message) { } } +const DEFAULT_TURN_IDLE_TIMEOUT_MS = 240_000; + +function resolveTurnIdleTimeoutMs() { + const raw = process.env.CODEX_TURN_IDLE_TIMEOUT_MS; + if (raw === undefined || raw === "") { + return DEFAULT_TURN_IDLE_TIMEOUT_MS; + } + const parsed = Number(raw); + if (!Number.isFinite(parsed) || parsed < 0) { + return DEFAULT_TURN_IDLE_TIMEOUT_MS; + } + return parsed; +} + async function captureTurn(client, threadId, startRequest, options = {}) { const state = createTurnCaptureState(threadId, options); const previousHandler = client.notificationHandler; + const idleTimeoutMs = resolveTurnIdleTimeoutMs(); + let idleTimer = null; + const clearIdle = () => { + if (idleTimer) { + clearTimeout(idleTimer); + idleTimer = null; + } + }; + const armIdle = () => { + if (idleTimeoutMs <= 0 || state.completed) { + return; + } + clearIdle(); + idleTimer = setTimeout(() => { + idleTimer = null; + if (state.completed) { + return; + } + state.completed = true; + clearCompletionTimer(state); + const tid = state.turnId ?? "(pending)"; + state.rejectCompletion( + new Error(`codex turn idle timeout after ${idleTimeoutMs}ms (thread=${state.threadId}, turn=${tid})`) + ); + }, idleTimeoutMs); + idleTimer.unref?.(); + }; + client.setNotificationHandler((message) => { + armIdle(); if (!state.turnId) { state.bufferedNotifications.push(message); return; @@ -575,8 +618,11 @@ async function captureTurn(client, threadId, startRequest, options = {}) { applyTurnNotification(state, message); }); + armIdle(); + try { const response = await startRequest(); + armIdle(); options.onResponse?.(response, state); state.turnId = response.turn?.id ?? null; if (state.turnId) { @@ -599,6 +645,7 @@ async function captureTurn(client, threadId, startRequest, options = {}) { return await state.completion; } finally { + clearIdle(); clearCompletionTimer(state); client.setNotificationHandler(previousHandler ?? null); } diff --git a/tests/fake-codex-fixture.mjs b/tests/fake-codex-fixture.mjs index debcadce..b739a713 100644 --- a/tests/fake-codex-fixture.mjs +++ b/tests/fake-codex-fixture.mjs @@ -290,6 +290,10 @@ rl.on("line", (line) => { break; case "thread/start": { + if (BEHAVIOR === "hang-rpc-thread-start") { + // Intentionally drop the request: never respond, never error. + break; + } if (BEHAVIOR === "auth-run-fails") { throw new Error("authentication expired; run codex login"); } @@ -368,6 +372,22 @@ rl.on("line", (line) => { } case "turn/start": { + if (BEHAVIOR === "hang-after-turn-start") { + const thread = ensureThread(state, message.params.threadId); + const hangTurnId = nextTurnId(state); + thread.updatedAt = now(); + state.lastTurnStart = { + threadId: message.params.threadId, + turnId: hangTurnId, + model: message.params.model ?? null, + effort: message.params.effort ?? null, + prompt: "" + }; + saveState(state); + send({ id: message.id, result: { turn: buildTurn(hangTurnId) } }); + // Never emit turn/completed or any further notifications for this turn. + break; + } const thread = ensureThread(state, message.params.threadId); const prompt = (message.params.input || []) .filter((item) => item.type === "text") diff --git a/tests/runtime.test.mjs b/tests/runtime.test.mjs index 90408372..b3dac9ae 100644 --- a/tests/runtime.test.mjs +++ b/tests/runtime.test.mjs @@ -2121,3 +2121,55 @@ test("setup and status honor --cwd when reading shared session runtime", () => { assert.equal(payload.sessionRuntime.mode, "shared"); assert.equal(payload.sessionRuntime.endpoint, "unix:/tmp/fake-broker.sock"); }); + +test("task fails fast when an app-server JSON-RPC request never receives a response", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir, "hang-rpc-thread-start"); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const env = { + ...buildEnv(binDir), + CODEX_APP_SERVER_RPC_TIMEOUT_MS: "200" + }; + const startedAt = Date.now(); + const result = run("node", [SCRIPT, "task", "rpc hang check"], { + cwd: repo, + env + }); + const elapsed = Date.now() - startedAt; + + assert.notEqual(result.status, 0); + assert.match(result.stderr + result.stdout, /timed out after 200ms/); + // Should fail well before any default 30s timeout — give plenty of slack for + // node startup + IPC, but assert we did not silently wait minutes. + assert.ok(elapsed < 15000, `expected fast timeout, took ${elapsed}ms`); +}); + +test("task fails fast when the app-server stops emitting notifications mid-turn", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir, "hang-after-turn-start"); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const env = { + ...buildEnv(binDir), + CODEX_TURN_IDLE_TIMEOUT_MS: "200" + }; + const startedAt = Date.now(); + const result = run("node", [SCRIPT, "task", "turn idle hang check"], { + cwd: repo, + env + }); + const elapsed = Date.now() - startedAt; + + assert.notEqual(result.status, 0); + assert.match(result.stderr + result.stdout, /turn idle timeout after 200ms/); + assert.ok(elapsed < 15000, `expected fast turn-idle timeout, took ${elapsed}ms`); +});