From ac8690d862498aa7c4872a58a5393d5491f39a20 Mon Sep 17 00:00:00 2001 From: Friende <35026241+pengyou200902@users.noreply.github.com> Date: Mon, 4 May 2026 18:56:56 -0400 Subject: [PATCH] fix: harden rescue task handoff failures --- plugins/codex/agents/codex-rescue.md | 7 +- plugins/codex/commands/rescue.md | 2 + plugins/codex/scripts/codex-companion.mjs | 24 +++- plugins/codex/scripts/lib/codex.mjs | 3 +- .../codex/skills/codex-cli-runtime/SKILL.md | 7 +- tests/commands.test.mjs | 19 ++- tests/runtime.test.mjs | 110 ++++++++++++++++++ 7 files changed, 158 insertions(+), 14 deletions(-) diff --git a/plugins/codex/agents/codex-rescue.md b/plugins/codex/agents/codex-rescue.md index 7009ec86..900bb8f7 100644 --- a/plugins/codex/agents/codex-rescue.md +++ b/plugins/codex/agents/codex-rescue.md @@ -30,6 +30,7 @@ Forwarding rules: - Leave model unset by default. Only add `--model` when the user explicitly asks for a specific model. - If the user asks for `spark`, map that to `--model gpt-5.3-codex-spark`. - If the user asks for a concrete model name such as `gpt-5.4-mini`, pass it through with `--model`. +- Copy the user's requested model string into `--model` character-for-character, except for the documented `spark` alias. Do not substitute, shorten, normalize, or guess model names. Never emit a model name the user did not literally type or whose alias they did not literally type. - Treat `--effort ` and `--model ` as runtime controls and do not include them in the task text you pass through. - Default to a write-capable Codex run by adding `--write` unless the user explicitly asks for read-only behavior or only wants review, diagnosis, or research without edits. - Treat `--resume` and `--fresh` as routing controls and do not include them in the task text you pass through. @@ -38,8 +39,10 @@ Forwarding rules: - If the user is clearly asking to continue prior Codex work in this repository, such as "continue", "keep going", "resume", "apply the top fix", or "dig deeper", add `--resume-last` unless `--fresh` is present. - Otherwise forward the task as a fresh `task` run. - Preserve the user's task text as-is apart from stripping routing flags. -- Return the stdout of the `codex-companion` command exactly as-is. -- If the Bash call fails or Codex cannot be invoked, return nothing. +- Pass the forwarded task text on stdin with `task --stdin`, not as a shell-quoted CLI argument. +- Do not inline multi-line prompts as a quoted argument or as `"$(cat ...)"`; that re-expands prompt content through shell quoting. +- Return the stdout of the `codex-companion` command exactly as-is when it succeeds. +- If the Bash call fails or Codex cannot be invoked, return the failure output exactly once and stop. Do not retry, do not make another `task` invocation, and do not return an empty response. Response style: diff --git a/plugins/codex/commands/rescue.md b/plugins/codex/commands/rescue.md index 56de9555..f9e18e0f 100644 --- a/plugins/codex/commands/rescue.md +++ b/plugins/codex/commands/rescue.md @@ -39,6 +39,8 @@ node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" task-resume-candidate - Operating rules: - The subagent is a thin forwarder only. It should use one `Bash` call to invoke `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" task ...` and return that command's stdout as-is. +- The subagent must pass forwarded task text on stdin with `task --stdin`, not as a shell-quoted prompt argument. +- If the subagent's Bash call fails, it must return the failure output once and stop rather than retrying or returning an empty response. - Return the Codex companion stdout verbatim to the user. - Do not paraphrase, summarize, rewrite, or add commentary before or after it. - Do not ask the subagent to inspect files, monitor progress, poll `/codex:status`, fetch `/codex:result`, call `/codex:cancel`, summarize output, or do follow-up work of its own. diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..37132cb9 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -77,7 +77,7 @@ function printUsage() { " node scripts/codex-companion.mjs setup [--enable-review-gate|--disable-review-gate] [--json]", " node scripts/codex-companion.mjs review [--wait|--background] [--base ] [--scope ]", " node scripts/codex-companion.mjs adversarial-review [--wait|--background] [--base ] [--scope ] [focus text]", - " node scripts/codex-companion.mjs task [--background] [--write] [--resume-last|--resume|--fresh] [--model ] [--effort ] [prompt]", + " node scripts/codex-companion.mjs task [--background] [--write] [--resume-last|--resume|--fresh] [--model ] [--effort ] [--stdin|--prompt-file |prompt]", " node scripts/codex-companion.mjs status [job-id] [--all] [--json]", " node scripts/codex-companion.mjs result [job-id] [--json]", " node scripts/codex-companion.mjs cancel [job-id] [--json]" @@ -610,7 +610,20 @@ function buildTaskRequest({ cwd, model, effort, prompt, write, resumeLast, jobId }; } -function readTaskPrompt(cwd, options, positionals) { +function readTaskPrompt(cwd, options, positionals, context = {}) { + if (options.stdin && options["prompt-file"]) { + throw new Error("Choose either --stdin or --prompt-file."); + } + if (options.stdin && positionals.length > 0) { + throw new Error("Do not pass positional prompt text when using --stdin."); + } + if (options.stdin) { + const prompt = readStdinIfPiped(); + if (!prompt && !context.resumeLast) { + throw new Error("Expected prompt text on stdin."); + } + return prompt; + } if (options["prompt-file"]) { return fs.readFileSync(path.resolve(cwd, options["prompt-file"]), "utf8"); } @@ -732,7 +745,7 @@ async function handleReview(argv) { async function handleTask(argv) { const { options, positionals } = parseCommandInput(argv, { valueOptions: ["model", "effort", "cwd", "prompt-file"], - booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background"], + booleanOptions: ["json", "write", "resume-last", "resume", "fresh", "background", "stdin"], aliasMap: { m: "model" } @@ -742,13 +755,14 @@ async function handleTask(argv) { const workspaceRoot = resolveCommandWorkspace(options); const model = normalizeRequestedModel(options.model); const effort = normalizeReasoningEffort(options.effort); - const prompt = readTaskPrompt(cwd, options, positionals); - const resumeLast = Boolean(options["resume-last"] || options.resume); const fresh = Boolean(options.fresh); if (resumeLast && fresh) { throw new Error("Choose either --resume/--resume-last or --fresh."); } + const prompt = readTaskPrompt(cwd, options, positionals, { + resumeLast + }); const write = Boolean(options.write); const taskMetadata = buildTaskRunMetadata({ prompt, diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..2db9d1fb 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -60,8 +60,7 @@ function buildThreadParams(cwd, options = {}) { approvalPolicy: options.approvalPolicy ?? "never", sandbox: options.sandbox ?? "read-only", serviceName: SERVICE_NAME, - ephemeral: options.ephemeral ?? true, - experimentalRawEvents: false + ephemeral: options.ephemeral ?? true }; } diff --git a/plugins/codex/skills/codex-cli-runtime/SKILL.md b/plugins/codex/skills/codex-cli-runtime/SKILL.md index 0e91bfb5..c7348fce 100644 --- a/plugins/codex/skills/codex-cli-runtime/SKILL.md +++ b/plugins/codex/skills/codex-cli-runtime/SKILL.md @@ -9,7 +9,7 @@ user-invocable: false Use this skill only inside the `codex:codex-rescue` subagent. Primary helper: -- `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" task ""` +- `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" task --stdin ...` Execution rules: - The rescue subagent is a forwarder, not an orchestrator. Its only job is to invoke `task` once and return that stdout unchanged. @@ -21,6 +21,7 @@ Execution rules: - Leave `--effort` unset unless the user explicitly requests a specific effort. - Leave model unset by default. Add `--model` only when the user explicitly asks for one. - Map `spark` to `--model gpt-5.3-codex-spark`. +- Copy the user's requested model string into `--model` character-for-character, except for the documented `spark` alias. Do not substitute, shorten, normalize, or guess model names. - Default to a write-capable Codex run by adding `--write` unless the user explicitly asks for read-only behavior or only wants review, diagnosis, or research without edits. Command selection: @@ -34,10 +35,12 @@ Command selection: - `--fresh`: always use a fresh `task` run, even if the request sounds like a follow-up. - `--effort`: accepted values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh`. - `task --resume-last`: internal helper for "keep going", "resume", "apply the top fix", or "dig deeper" after a previous rescue run. +- Pass the forwarded task text on stdin with `task --stdin`. +- Do not pass raw task text as a shell-quoted CLI argument, and do not use `"$(cat ...)"` to inline file contents. Safety rules: - Default to write-capable Codex work in `codex:codex-rescue` unless the user explicitly asks for read-only behavior. - Preserve the user's task text as-is apart from stripping routing flags. - Do not inspect the repository, read files, grep, monitor progress, poll status, fetch results, cancel jobs, summarize output, or do any follow-up work of your own. - Return the stdout of the `task` command exactly as-is. -- If the Bash call fails or Codex cannot be invoked, return nothing. +- If the Bash call fails or Codex cannot be invoked, return the failure output exactly once and stop. Do not retry, do not make another `task` invocation, and do not return an empty response. diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 3724ffa4..d2bb8bf3 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -135,11 +135,19 @@ test("rescue command absorbs continue semantics", () => { assert.match(agent, /Leave model unset by default/i); assert.match(agent, /If the user asks for `spark`, map that to `--model gpt-5\.3-codex-spark`/i); assert.match(agent, /If the user asks for a concrete model name such as `gpt-5\.4-mini`, pass it through with `--model`/i); + assert.match(agent, /Copy the user's requested model string into `--model` character-for-character, except for the documented `spark` alias/i); + assert.match(agent, /Never emit a model name the user did not literally type or whose alias they did not literally type/i); + assert.match(agent, /Pass the forwarded task text on stdin with `task --stdin`, not as a shell-quoted CLI argument/i); + assert.doesNotMatch(agent, /mktemp|chmod 600|trap/); + assert.match(agent, /Do not inline multi-line prompts as a quoted argument or as `"\$\(cat \.\.\.\)"`/i); assert.match(agent, /Return the stdout of the `codex-companion` command exactly as-is/i); - assert.match(agent, /If the Bash call fails or Codex cannot be invoked, return nothing/i); + assert.match(agent, /If the Bash call fails or Codex cannot be invoked, return the failure output exactly once and stop/i); + assert.match(agent, /Do not retry, do not make another `task` invocation, and do not return an empty response/i); + assert.doesNotMatch(agent, /return nothing/i); assert.match(agent, /gpt-5-4-prompting/); assert.match(agent, /only to tighten the user's request into a better Codex prompt/i); assert.match(agent, /Do not use that skill to inspect the repository, reason through the problem yourself, draft a solution, or do any independent work/i); + assert.match(runtimeSkill, /codex-companion\.mjs" task --stdin \.\.\./); assert.match(runtimeSkill, /only job is to invoke `task` once and return that stdout unchanged/i); assert.match(runtimeSkill, /Do not call `setup`, `review`, `adversarial-review`, `status`, `result`, or `cancel`/i); assert.match(runtimeSkill, /use the `gpt-5-4-prompting` skill to rewrite the user's request into a tighter Codex prompt/i); @@ -147,11 +155,16 @@ test("rescue command absorbs continue semantics", () => { assert.match(runtimeSkill, /Leave `--effort` unset unless the user explicitly requests a specific effort/i); assert.match(runtimeSkill, /Leave model unset by default/i); assert.match(runtimeSkill, /Map `spark` to `--model gpt-5\.3-codex-spark`/i); + assert.match(runtimeSkill, /Copy the user's requested model string into `--model` character-for-character, except for the documented `spark` alias/i); assert.match(runtimeSkill, /If the forwarded request includes `--background` or `--wait`, treat that as Claude-side execution control only/i); assert.match(runtimeSkill, /Strip it before calling `task`/i); assert.match(runtimeSkill, /`--effort`: accepted values are `none`, `minimal`, `low`, `medium`, `high`, `xhigh`/i); + assert.match(runtimeSkill, /Pass the forwarded task text on stdin with `task --stdin`/i); + assert.match(runtimeSkill, /Do not pass raw task text as a shell-quoted CLI argument/i); assert.match(runtimeSkill, /Do not inspect the repository, read files, grep, monitor progress, poll status, fetch results, cancel jobs, summarize output, or do any follow-up work of your own/i); - assert.match(runtimeSkill, /If the Bash call fails or Codex cannot be invoked, return nothing/i); + assert.match(runtimeSkill, /If the Bash call fails or Codex cannot be invoked, return the failure output exactly once and stop/i); + assert.match(runtimeSkill, /Do not retry, do not make another `task` invocation, and do not return an empty response/i); + assert.doesNotMatch(runtimeSkill, /return nothing/i); assert.match(readme, /`codex:codex-rescue` subagent/i); assert.match(readme, /if you do not pass `--model` or `--effort`, Codex chooses its own defaults/i); assert.match(readme, /--model gpt-5\.4-mini --effort medium/i); @@ -186,7 +199,7 @@ test("internal docs use task terminology for rescue runs", () => { const promptingSkill = read("skills/gpt-5-4-prompting/SKILL.md"); const promptRecipes = read("skills/gpt-5-4-prompting/references/codex-prompt-recipes.md"); - assert.match(runtimeSkill, /codex-companion\.mjs" task ""/); + assert.match(runtimeSkill, /codex-companion\.mjs" task --stdin \.\.\./); assert.match(runtimeSkill, /Use `task` for every rescue request/i); assert.match(runtimeSkill, /task --resume-last/i); assert.match(promptingSkill, /Use `task` when the task is diagnosis/i); diff --git a/tests/runtime.test.mjs b/tests/runtime.test.mjs index 90408372..c9678270 100644 --- a/tests/runtime.test.mjs +++ b/tests/runtime.test.mjs @@ -627,6 +627,116 @@ test("task --fresh is treated as routing control and does not leak into the prom assert.equal(fakeState.lastTurnStart.prompt, "diagnose the flaky test"); }); +test("task --prompt-file preserves multiline shell-sensitive prompt text", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + const statePath = path.join(binDir, "fake-codex-state.json"); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const prompt = "diagnose quoting\nliteral $(echo should-not-run)\nquote ' and double \" and backslash \\"; + const promptFile = path.join(repo, "prompt.txt"); + fs.writeFileSync(promptFile, prompt, "utf8"); + + const result = run("node", [SCRIPT, "task", "--prompt-file", promptFile], { + cwd: repo, + env: buildEnv(binDir) + }); + + assert.equal(result.status, 0, result.stderr); + const fakeState = JSON.parse(fs.readFileSync(statePath, "utf8")); + assert.equal(fakeState.lastTurnStart.prompt, prompt); +}); + +test("task --stdin preserves multiline shell-sensitive prompt text", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + const statePath = path.join(binDir, "fake-codex-state.json"); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const prompt = "diagnose quoting\nliteral $(echo should-not-run)\nquote ' and double \" and backslash \\"; + const result = run("node", [SCRIPT, "task", "--stdin"], { + cwd: repo, + env: buildEnv(binDir), + input: prompt + }); + + assert.equal(result.status, 0, result.stderr); + const fakeState = JSON.parse(fs.readFileSync(statePath, "utf8")); + assert.equal(fakeState.lastTurnStart.prompt, prompt); +}); + +test("task --stdin rejects mixed prompt sources", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const withPositional = run("node", [SCRIPT, "task", "--stdin", "diagnose"], { + cwd: repo, + env: buildEnv(binDir), + input: "from stdin" + }); + assert.notEqual(withPositional.status, 0); + assert.match(withPositional.stderr, /Do not pass positional prompt text when using --stdin/); + + const promptFile = path.join(repo, "prompt.txt"); + fs.writeFileSync(promptFile, "from file", "utf8"); + const withPromptFile = run("node", [SCRIPT, "task", "--stdin", "--prompt-file", promptFile], { + cwd: repo, + env: buildEnv(binDir), + input: "from stdin" + }); + assert.notEqual(withPromptFile.status, 0); + assert.match(withPromptFile.stderr, /Choose either --stdin or --prompt-file/); +}); + +test("task --stdin allows empty prompt only when resuming", () => { + const repo = makeTempDir(); + const binDir = makeTempDir(); + const statePath = path.join(binDir, "fake-codex-state.json"); + installFakeCodex(binDir); + initGitRepo(repo); + fs.writeFileSync(path.join(repo, "README.md"), "hello\n"); + run("git", ["add", "README.md"], { cwd: repo }); + run("git", ["commit", "-m", "init"], { cwd: repo }); + + const firstRun = run("node", [SCRIPT, "task", "initial task"], { + cwd: repo, + env: buildEnv(binDir) + }); + assert.equal(firstRun.status, 0, firstRun.stderr); + + const resume = run("node", [SCRIPT, "task", "--stdin", "--resume-last"], { + cwd: repo, + env: buildEnv(binDir), + input: "" + }); + + assert.equal(resume.status, 0, resume.stderr); + const fakeState = JSON.parse(fs.readFileSync(statePath, "utf8")); + assert.equal(fakeState.lastTurnStart.threadId, "thr_1"); + assert.equal(fakeState.lastTurnStart.prompt, "Continue from the current thread state. Pick the next highest-value step and follow through until the task is resolved."); + + const fresh = run("node", [SCRIPT, "task", "--stdin"], { + cwd: repo, + env: buildEnv(binDir), + input: "" + }); + assert.notEqual(fresh.status, 0); + assert.match(fresh.stderr, /Expected prompt text on stdin/); +}); + test("task forwards model selection and reasoning effort to app-server turn/start", () => { const repo = makeTempDir(); const binDir = makeTempDir();