-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat(codex): per-turn watchdog with TurnWatchdogError + exit 124 #312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -293,6 +293,24 @@ function describeCompletedItem(state, item) { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Thrown by captureTurn when no JSON-RPC notification arrives within the | ||
| * configured watchdog window. Wrappers can detect via `instanceof` or by | ||
| * `error.code === 'TURN_WATCHDOG_TIMEOUT'` and propagate exit code 124 | ||
| * (matching `timeout(1)` convention). | ||
| */ | ||
| export class TurnWatchdogError extends Error { | ||
| constructor(message, { watchdogMs, threadId, turnId } = {}) { | ||
| super(message); | ||
| this.name = "TurnWatchdogError"; | ||
| this.code = "TURN_WATCHDOG_TIMEOUT"; | ||
| this.exitCode = 124; | ||
| this.watchdogMs = watchdogMs ?? null; | ||
| this.threadId = threadId ?? null; | ||
| this.turnId = turnId ?? null; | ||
| } | ||
| } | ||
|
|
||
| /** @returns {TurnCaptureState} */ | ||
| function createTurnCaptureState(threadId, options = {}) { | ||
| let resolveCompletion; | ||
|
|
@@ -319,6 +337,11 @@ function createTurnCaptureState(threadId, options = {}) { | |
| pendingCollaborations: new Set(), | ||
| activeSubagentTurns: new Set(), | ||
| completionTimer: null, | ||
| watchdogTimer: null, | ||
| watchdogMs: | ||
| typeof options.watchdogMs === "number" && options.watchdogMs > 0 | ||
| ? options.watchdogMs | ||
| : null, | ||
| lastAgentMessage: "", | ||
| reviewText: "", | ||
| reasoningSummary: [], | ||
|
|
@@ -337,12 +360,54 @@ function clearCompletionTimer(state) { | |
| } | ||
| } | ||
|
|
||
| function disarmWatchdog(state) { | ||
| if (state.watchdogTimer) { | ||
| clearTimeout(state.watchdogTimer); | ||
| state.watchdogTimer = null; | ||
| } | ||
| } | ||
|
|
||
| function armWatchdog(state) { | ||
| if (!state.watchdogMs || state.completed) { | ||
| return; | ||
| } | ||
| disarmWatchdog(state); | ||
| state.watchdogTimer = setTimeout(() => { | ||
| state.watchdogTimer = null; | ||
| if (state.completed) { | ||
| return; | ||
| } | ||
| state.completed = true; | ||
| clearCompletionTimer(state); | ||
| const message = | ||
| `Codex turn watchdog fired after ${state.watchdogMs}ms of silence ` + | ||
| `(thread ${state.threadId}, turn ${state.turnId ?? "pending"}). ` + | ||
| `No JSON-RPC notification arrived in that window.`; | ||
| state.rejectCompletion( | ||
| new TurnWatchdogError(message, { | ||
| watchdogMs: state.watchdogMs, | ||
| threadId: state.threadId, | ||
| turnId: state.turnId | ||
| }) | ||
| ); | ||
| }, state.watchdogMs); | ||
| state.watchdogTimer.unref?.(); | ||
| } | ||
|
|
||
| function kickWatchdog(state) { | ||
| if (!state.watchdogMs || state.completed) { | ||
| return; | ||
| } | ||
| armWatchdog(state); | ||
| } | ||
|
|
||
| function completeTurn(state, turn = null, options = {}) { | ||
| if (state.completed) { | ||
| return; | ||
| } | ||
|
|
||
| clearCompletionTimer(state); | ||
| disarmWatchdog(state); | ||
| state.completed = true; | ||
|
|
||
| if (turn) { | ||
|
|
@@ -555,6 +620,8 @@ async function captureTurn(client, threadId, startRequest, options = {}) { | |
| const previousHandler = client.notificationHandler; | ||
|
|
||
| client.setNotificationHandler((message) => { | ||
| kickWatchdog(state); | ||
|
|
||
| if (!state.turnId) { | ||
| state.bufferedNotifications.push(message); | ||
| return; | ||
|
|
@@ -576,6 +643,7 @@ async function captureTurn(client, threadId, startRequest, options = {}) { | |
| }); | ||
|
|
||
| try { | ||
| armWatchdog(state); | ||
| const response = await startRequest(); | ||
| options.onResponse?.(response, state); | ||
| state.turnId = response.turn?.id ?? null; | ||
|
|
@@ -600,6 +668,7 @@ async function captureTurn(client, threadId, startRequest, options = {}) { | |
| return await state.completion; | ||
| } finally { | ||
| clearCompletionTimer(state); | ||
| disarmWatchdog(state); | ||
| client.setNotificationHandler(previousHandler ?? null); | ||
| } | ||
| } | ||
|
|
@@ -1009,7 +1078,13 @@ export async function runAppServerTurn(cwd, options = {}) { | |
| effort: options.effort ?? null, | ||
| outputSchema: options.outputSchema ?? null | ||
| }), | ||
| { onProgress: options.onProgress } | ||
| { | ||
| onProgress: options.onProgress, | ||
| watchdogMs: | ||
| typeof options.watchdogMs === "number" | ||
| ? options.watchdogMs | ||
| : Number(process.env.CODEX_TURN_WATCHDOG_MS) || null | ||
|
Comment on lines
+1083
to
+1086
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The env-backed watchdog is only forwarded from Useful? React with 👍 / 👎. |
||
| } | ||
| ); | ||
|
|
||
| return { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| import test from "node:test"; | ||
| import assert from "node:assert/strict"; | ||
|
|
||
| import { TurnWatchdogError } from "../plugins/codex/scripts/lib/codex.mjs"; | ||
|
|
||
| test("TurnWatchdogError carries code, exitCode, and metadata", () => { | ||
| const err = new TurnWatchdogError("watchdog fired after 600000ms", { | ||
| watchdogMs: 600000, | ||
| threadId: "thr_123", | ||
| turnId: "turn_456" | ||
| }); | ||
|
|
||
| assert.equal(err.name, "TurnWatchdogError"); | ||
| assert.equal(err.code, "TURN_WATCHDOG_TIMEOUT"); | ||
| assert.equal(err.exitCode, 124); | ||
| assert.equal(err.watchdogMs, 600000); | ||
| assert.equal(err.threadId, "thr_123"); | ||
| assert.equal(err.turnId, "turn_456"); | ||
| assert.equal(err.message, "watchdog fired after 600000ms"); | ||
| assert.ok(err instanceof Error); | ||
| }); | ||
|
|
||
| test("TurnWatchdogError defaults metadata to null when omitted", () => { | ||
| const err = new TurnWatchdogError("silent"); | ||
|
|
||
| assert.equal(err.code, "TURN_WATCHDOG_TIMEOUT"); | ||
| assert.equal(err.exitCode, 124); | ||
| assert.equal(err.watchdogMs, null); | ||
| assert.equal(err.threadId, null); | ||
| assert.equal(err.turnId, null); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When
turn/startitself stops responding after the timer is armed, this only rejectsstate.completion;captureTurnis still blocked onawait startRequest()and never observes that rejection, so the watchdog either leaves the call hung or surfaces as an unhandled rejection instead of returning the structured exit-124 error. This affects the same stalled JSON-RPC connection scenario before the start response is delivered; the timer needs to be raced withstartRequest()or otherwise abort the pending request path too.Useful? React with 👍 / 👎.