From 42c37978798241fcc2d31f478c3e60a5a7467013 Mon Sep 17 00:00:00 2001
From: yoshitarof
Date: Tue, 5 May 2026 18:54:42 +0900
Subject: [PATCH] fix(stop-review-gate): treat job as inactive when its pid is dead

Codex plugin job state (state.json / jobs/task-*.json) records
status: "running" while a job is in flight. The dying codex.exe is
expected to update this field to "completed" / "cancelled" on exit.
But that responsibility cannot be honored if the process is killed by
something external (SIGKILL, OS reboot, power loss, a cleanup script,
or `taskkill /F`).

Result: zombie entries with status: "running" accumulate in state.json.
stop-review-gate-hook.mjs reads them, finds a "running" entry, and emits
"Codex task task-XYZ is still running." on every session end -- even
when no Codex process is actually alive.

Fix: add isPidAlive(pid) and isJobActive(job) helpers in lib/state.mjs.
isPidAlive uses process.kill(pid, 0) -- cross-platform: ESRCH = dead,
EPERM = exists but unsignalable (still alive). isJobActive combines the
existing status check with the pid liveness probe.

Replace the inline predicate in stop-review-gate-hook.mjs with
jobs.find(isJobActive). The hook now only reports a running task when
it is actually running.

This is read-only -- the hook does not mutate state.json. Persistent
zombie cleanup is a separate concern that deserves its own PR.

Smoke-tested locally on Windows (Node v24): isPidAlive correctly returns
true for self pid, false for unused pid 99999999, false for null.
isJobActive correctly returns true for {running, alive pid}, false for
{running, dead pid}, false for {completed, alive pid}, false for
{running, no pid}.
---
 plugins/codex/scripts/lib/state.mjs           | 45 +++++++++++++++++++
 .../codex/scripts/stop-review-gate-hook.mjs   |  4 +--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/plugins/codex/scripts/lib/state.mjs b/plugins/codex/scripts/lib/state.mjs
index 2da23498..11e12663 100644
--- a/plugins/codex/scripts/lib/state.mjs
+++ b/plugins/codex/scripts/lib/state.mjs
@@ -150,6 +150,51 @@ export function listJobs(cwd) {
   return loadState(cwd).jobs;
 }
 
+/**
+ * Check whether a process with the given pid is currently alive.
+ *
+ * Uses `process.kill(pid, 0)`: signal 0 delivers nothing and only tests
+ * whether the target exists. ESRCH means the process is gone; EPERM means
+ * it exists but we lack permission to signal it (still alive).
+ *
+ * Only positive integers are probed. On POSIX a zero or negative pid
+ * addresses a process group (-1: every process we may signal), so a
+ * successful probe would not prove that one specific process is alive.
+ *
+ * @param {unknown} pid - Process id recorded for a job.
+ * @returns {boolean} True only if a process with that pid exists.
+ */
+export function isPidAlive(pid) {
+  if (!Number.isInteger(pid) || pid <= 0) return false;
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch (err) {
+    // Always return a strict boolean, even if a non-Error value was thrown.
+    return Boolean(err) && err.code === "EPERM";
+  }
+}
+
+/**
+ * A job counts as "active" only if its status says queued/running AND
+ * the recorded pid is still alive. This prevents zombie state from
+ * causing false alarms when the codex process was SIGKILL'd
+ * (e.g., by a cleanup script or an OS reboot) without getting a chance
+ * to write status="completed" to disk.
+ *
+ * A job without a usable pid is treated as inactive, because there is
+ * no process whose liveness could be checked.
+ *
+ * @param {{ status?: string, pid?: number } | null | undefined} job
+ * @returns {boolean} True if the job should be reported as still running.
+ */
+export function isJobActive(job) {
+  if (!job) return false;
+  if (job.status !== "queued" && job.status !== "running") return false;
+  // isPidAlive rejects missing, zero, negative, and non-integer pids.
+  return isPidAlive(job.pid);
+}
+
 export function setConfig(cwd, key, value) {
   return updateState(cwd, (state) => {
     state.config = {
diff --git a/plugins/codex/scripts/stop-review-gate-hook.mjs b/plugins/codex/scripts/stop-review-gate-hook.mjs
index 2346bdcf..5bec0f75 100644
--- a/plugins/codex/scripts/stop-review-gate-hook.mjs
+++ b/plugins/codex/scripts/stop-review-gate-hook.mjs
@@ -8,7 +8,7 @@ import { fileURLToPath } from "node:url";
 
 import { getCodexAvailability } from "./lib/codex.mjs";
 import { loadPromptTemplate, interpolateTemplate } from "./lib/prompts.mjs";
-import { getConfig, listJobs } from "./lib/state.mjs";
+import { getConfig, isJobActive, listJobs } from "./lib/state.mjs";
 import { sortJobsNewestFirst } from "./lib/job-control.mjs";
 import { SESSION_ID_ENV } from "./lib/tracked-jobs.mjs";
 import { resolveWorkspaceRoot } from "./lib/workspace.mjs";
@@ -146,7 +146,7 @@ function main() {
   const config = getConfig(workspaceRoot);
 
   const jobs = sortJobsNewestFirst(filterJobsForCurrentSession(listJobs(workspaceRoot), input));
-  const runningJob = jobs.find((job) => job.status === "queued" || job.status === "running");
+  const runningJob = jobs.find(isJobActive);
   const runningTaskNote = runningJob
     ? `Codex task ${runningJob.id} is still running. Check /codex:status and use /codex:cancel ${runningJob.id} if you want to stop it before ending the session.`
     : null;