From 5b9414454a421d8cde3e8f74ed07e4cf1d37c2ef Mon Sep 17 00:00:00 2001 From: LASK Date: Sun, 10 May 2026 02:54:54 +0800 Subject: [PATCH 1/2] fix: drop non-JSONL garbage on codex app-server stdout (#310, #23) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The JSONL parser in `lib/app-server.mjs` `handleLine()` previously called `JSON.parse()` on every line read from the codex app-server stdout pipe and, on failure, called `handleExit()` — rejecting every in-flight request and tearing the connection down. In practice the codex CLI occasionally emits non-JSON bytes onto stdout that have nothing to do with the protocol. Two known sources: 1. Terminal/shell init noise (issue #310 sibling-bug #23) — e.g. zsh writes the bracketed-paste marker `\x1b[?2004h` when a subprocess inherits the parent's TTY. 2. Localized OS messages on Windows non-English locales (issue #310 + openai/codex#21957). On zh-TW (CP-950 / Big5), when codex.exe runs `taskkill /T /F` to clean up a failed MCP child and inadvertently routes taskkill's stdout to its own stdout, the JSONL stream receives the bytes `A6 A8 A5 5C 3A 20 50 49 44 ...` ("成功: PID 為 xxxx ..."), which decode under UTF-8 as `���\: PID ...` — the exact substring in the original error. Both bug classes share the same correct shape of fix at the plugin layer: lines that cannot possibly be JSONL records should be dropped, not treated as protocol violations. A JSONL record always begins with `{` or `[`. Any line whose first non-whitespace, post-strip-ANSI character is something else is definitively garbage from outside the protocol. This commit factors a `cleanProtocolLine()` helper into `lib/jsonl.mjs` that: - strips CSI / OSC ANSI escape sequences, - trims whitespace, - returns `null` if the result is empty or does not start with `{` / `[`, - otherwise returns the cleaned candidate string. `handleLine()` skips the line on `null` and parses the candidate otherwise. Lines that pass the guard but are still malformed JSON continue to surface as real protocol errors via the existing `handleExit()` path — that branch is preserved. The root corruption — codex.exe leaking taskkill's stdout into its own stdout on Windows zh-TW — is filed upstream as openai/codex#21957. This PR is the plugin-side guard; it is also a strict superset of the ANSI-escape strip approach explored in #24, #97, and #171, so any of those can be closed in favour of this if the maintainers prefer. Test plan: - `node --test tests/jsonl.test.mjs` — 11/11 pass, covers bracketed-paste prefix, OSC sequences, SGR colours, the exact CP-950 mojibake from #310, plain JSON passthrough, malformed-but- JSON-shaped fall-through to the parser, and non-string input. - The pre-existing test failures on this branch (`state`, `runtime`, `process` Windows tmpdir / MSYS quirks) are unrelated and reproduce on `main`. Closes #310 Refs #23 Refs openai/codex#21957 --- plugins/codex/scripts/lib/app-server.mjs | 13 ++-- plugins/codex/scripts/lib/jsonl.mjs | 60 ++++++++++++++++ tests/jsonl.test.mjs | 87 ++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 plugins/codex/scripts/lib/jsonl.mjs create mode 100644 tests/jsonl.test.mjs diff --git a/plugins/codex/scripts/lib/app-server.mjs b/plugins/codex/scripts/lib/app-server.mjs index 127c8376..4df53aee 100644 --- a/plugins/codex/scripts/lib/app-server.mjs +++ b/plugins/codex/scripts/lib/app-server.mjs @@ -14,6 +14,7 @@ import { spawn } from "node:child_process"; import readline from "node:readline"; import { parseBrokerEndpoint } from "./broker-endpoint.mjs"; import { ensureBrokerSession, loadBrokerSession } from "./broker-lifecycle.mjs"; +import { cleanProtocolLine } from "./jsonl.mjs"; import { terminateProcessTree } from "./process.mjs"; const PLUGIN_MANIFEST_URL = new URL("../../.claude-plugin/plugin.json", import.meta.url); @@ -114,16 +115,20 @@ class AppServerClientBase { } } - handleLine(line) { - if (!line.trim()) { + handleLine(rawLine) { + const candidate = cleanProtocolLine(rawLine); + if (candidate === null) { + // Empty line, terminal noise (issue #23), or localized OS messages + // (e.g. CP-950 mojibaked taskkill SUCCESS on Windows zh-TW). Drop + // it instead of tearing the connection down — see lib/jsonl.mjs. return; } let message; try { - message = JSON.parse(line); + message = JSON.parse(candidate); } catch (error) { - this.handleExit(createProtocolError(`Failed to parse codex app-server JSONL: ${error.message}`, { line })); + this.handleExit(createProtocolError(`Failed to parse codex app-server JSONL: ${error.message}`, { line: rawLine })); return; } diff --git a/plugins/codex/scripts/lib/jsonl.mjs b/plugins/codex/scripts/lib/jsonl.mjs new file mode 100644 index 00000000..29e4c580 --- /dev/null +++ b/plugins/codex/scripts/lib/jsonl.mjs @@ -0,0 +1,60 @@ +/** + * Helpers for reading the Codex app-server JSONL protocol. + * + * The codex CLI's app-server occasionally emits non-JSON bytes onto stdout + * that have nothing to do with the protocol. Two known sources: + * + * 1. Terminal/shell init noise — e.g. zsh writes the bracketed-paste + * marker `\x1b[?2004h` when a subprocess inherits the parent's TTY + * (issue #23). + * + * 2. Localized OS messages on Windows non-English locales. On zh-TW + * (CP-950 / Big5), when codex.exe runs `taskkill /T /F` to clean up + * a failed MCP child and inadvertently routes taskkill's stdout to + * its own stdout, we see the bytes `A6 A8 A5 5C 3A 20 50 49 44 ...` + * ("成功: PID 為 xxxx ...") which decode under UTF-8 as + * `���\\: PID ...`. + * + * Both cases historically tore the broker connection down with + * `Failed to parse codex app-server JSONL: Unexpected token …`. The + * client never recovers because `handleExit` rejects every in-flight + * request before any subsequent valid record arrives. + * + * `cleanProtocolLine` is the conservative guard: + * + * - Strip CSI / OSC ANSI escape sequences from the raw line. + * - Trim whitespace. + * - Require the first remaining character to be `{` or `[`. JSONL + * records cannot start with anything else, so any other prefix is + * definitively garbage and is dropped. + * + * Lines that pass these checks are still parsed with `JSON.parse`; if + * that fails the caller surfaces a real protocol error as before. + */ + +// CSI: ESC [ … final-byte (e.g. \x1b[?2004h, \x1b[31m) +// OSC: ESC ] … (BEL | ESC \) (e.g. window-title sets) +// eslint-disable-next-line no-control-regex +const ANSI_ESCAPE_RE = /\x1b\[[0-9;?]*[a-zA-Z]|\x1b\][^\x07]*(?:\x07|\x1b\\)/g; + +/** + * Returns a JSON-shaped candidate string for the given raw line, or + * `null` if the line should be skipped because it cannot be valid JSONL. + * + * @param {string} rawLine + * @returns {string | null} + */ +export function cleanProtocolLine(rawLine) { + if (typeof rawLine !== "string") { + return null; + } + const cleaned = rawLine.replace(ANSI_ESCAPE_RE, "").trim(); + if (!cleaned) { + return null; + } + const firstChar = cleaned.charCodeAt(0); + if (firstChar !== 0x7B /* { */ && firstChar !== 0x5B /* [ */) { + return null; + } + return cleaned; +} diff --git a/tests/jsonl.test.mjs b/tests/jsonl.test.mjs new file mode 100644 index 00000000..ee086534 --- /dev/null +++ b/tests/jsonl.test.mjs @@ -0,0 +1,87 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { cleanProtocolLine } from "../plugins/codex/scripts/lib/jsonl.mjs"; + +test("cleanProtocolLine returns null for empty / whitespace lines", () => { + assert.equal(cleanProtocolLine(""), null); + assert.equal(cleanProtocolLine(" "), null); + assert.equal(cleanProtocolLine("\t\r\n"), null); +}); + +test("cleanProtocolLine returns null for non-string input", () => { + assert.equal(cleanProtocolLine(undefined), null); + assert.equal(cleanProtocolLine(null), null); + assert.equal(cleanProtocolLine(42), null); +}); + +test("cleanProtocolLine passes through plain JSON object lines unchanged", () => { + assert.equal(cleanProtocolLine('{"id":1,"result":{}}'), '{"id":1,"result":{}}'); + assert.equal(cleanProtocolLine(' {"a":1} '), '{"a":1}'); +}); + +test("cleanProtocolLine passes through plain JSON array lines unchanged", () => { + assert.equal(cleanProtocolLine('[1,2,3]'), '[1,2,3]'); +}); + +test("cleanProtocolLine strips bracketed-paste-mode ANSI prefix (issue #23)", () => { + assert.equal( + cleanProtocolLine('\x1b[?2004h{"id":1}'), + '{"id":1}' + ); + assert.equal( + cleanProtocolLine('{"id":1}\x1b[?2004l'), + '{"id":1}' + ); +}); + +test("cleanProtocolLine strips OSC window-title sequences", () => { + assert.equal( + cleanProtocolLine('\x1b]0;some title\x07{"id":1}'), + '{"id":1}' + ); +}); + +test("cleanProtocolLine strips SGR color codes", () => { + assert.equal( + cleanProtocolLine('\x1b[1;31m{"id":1}\x1b[0m'), + '{"id":1}' + ); +}); + +test("cleanProtocolLine drops CP-950 mojibake of Windows taskkill SUCCESS", () => { + // Raw bytes from `taskkill /T /F` on a zh-TW (Big5 / CP-950) Windows + // system: `成功: PID 為 1234 (PID 為 5678 的子處理程序) 的處理程序已終止。` + // When read off codex.exe's stdout pipe under Node's UTF-8 decoder, the + // invalid CP-950 high bytes become U+FFFD replacement characters. + const mojibake = "���\\: PID �� 1234 (PID �� 5678 ���l�B�z�{��)"; + assert.equal(cleanProtocolLine(mojibake), null); +}); + +test("cleanProtocolLine drops the literal U+FFFD prefix that triggered the original bug", () => { + // This is the smallest reproducer of the production failure: the parser + // saw `Unexpected token '�', "���\: PID "...` and tore the connection + // down. After this guard the line is silently skipped. + assert.equal(cleanProtocolLine("���\\: PID 1234"), null); +}); + +test("cleanProtocolLine drops other non-JSON garbage (shell prompts, plain text)", () => { + assert.equal(cleanProtocolLine("PS C:\\> "), null); + assert.equal(cleanProtocolLine("user@host:~$ "), null); + assert.equal(cleanProtocolLine("warning: something happened"), null); + // Note: anything whose first non-whitespace character is `[` (e.g. an + // ANSI-stripped bracket-log timestamp like `[2026-05-09T...]`) is NOT + // dropped by this guard — it looks like a JSON array. Such lines still + // reach JSON.parse and surface as a real protocol error, which is the + // desired behaviour: the guard only drops lines that cannot possibly + // be JSONL. +}); + +test("cleanProtocolLine forwards a malformed but JSON-shaped line for the parser to reject", () => { + // Lines that DO start with `{` or `[` are returned even if they are not + // strictly valid JSON. The caller will then surface a real protocol + // error via JSON.parse. This is the desired behaviour: the guard only + // drops lines that cannot possibly be JSONL. + assert.equal(cleanProtocolLine('{"id":1,'), '{"id":1,'); + assert.equal(cleanProtocolLine('[1,2,'), '[1,2,'); +}); From 2dc20bf2a2620cc72a5dbc7dca87ff814691cb07 Mon Sep 17 00:00:00 2001 From: LASK Date: Sun, 10 May 2026 05:16:51 +0800 Subject: [PATCH 2/2] fix(jsonl): broaden CSI final-byte range; correct cleanProtocolLine JSDoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both addresses Copilot inline review on PR #311. 1. The CSI regex previously matched final bytes only in `[a-zA-Z]`, which misses valid CSI sequences that end on a non-letter. The most relevant missing case in this codebase is `\x1b[200~` / `\x1b[201~` — bracketed-paste content markers — which a TTY-attached subprocess emits *around* pasted content, not just the on/off `\x1b[?2004h` / `\x1b[?2004l` toggles. With the old regex, a JSON record sandwiched between `200~` and `201~` would survive `replace`, then fail the first-char `{`/`[` guard, then be silently dropped — the worst kind of regression for this fix's stated goal. Broaden to the ECMA-48 CSI grammar: parameter bytes in 0x30..0x3F, optional intermediate bytes in 0x20..0x2F, final byte in 0x40..0x7E. Adds a regression test for `\x1b[200~{...}\x1b[201~` and the boundary finals `@` (0x40) and `` ` `` (0x60). 2. The JSDoc declared `@param {string} rawLine` even though the function explicitly handles non-string input by short-circuiting to `null` (and a unit test asserts that). Update to `@param {unknown}` so editors and tsc-via-checkJs don't mislead callers into thinking they must type-narrow first. --- plugins/codex/scripts/lib/jsonl.mjs | 17 +++++++++++++---- tests/jsonl.test.mjs | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/plugins/codex/scripts/lib/jsonl.mjs b/plugins/codex/scripts/lib/jsonl.mjs index 29e4c580..1e6dccf3 100644 --- a/plugins/codex/scripts/lib/jsonl.mjs +++ b/plugins/codex/scripts/lib/jsonl.mjs @@ -32,16 +32,25 @@ * that fails the caller surfaces a real protocol error as before. */ -// CSI: ESC [ … final-byte (e.g. \x1b[?2004h, \x1b[31m) -// OSC: ESC ] … (BEL | ESC \) (e.g. window-title sets) +// CSI: ESC [ +// params = 0x30..0x3F (digits, ?, etc.) +// intermediates= 0x20..0x2F (space, !, ", $, %, &, ', (, ), *, +, ,, -, ., /) +// final = 0x40..0x7E (any byte in @, A..Z, [, \, ], ^, _, `, a..z, {, |, }, ~) +// This grammar matches the original ECMA-48 spec and covers cases like +// bracketed-paste *content* markers `\x1b[200~` / `\x1b[201~` (final +// byte `~`) which a letter-only `[a-zA-Z]` final misses. +// OSC: ESC ] (BEL | ESC \) e.g. window-title sets // eslint-disable-next-line no-control-regex -const ANSI_ESCAPE_RE = /\x1b\[[0-9;?]*[a-zA-Z]|\x1b\][^\x07]*(?:\x07|\x1b\\)/g; +const ANSI_ESCAPE_RE = /\x1b\[[\x30-\x3F]*[\x20-\x2F]*[\x40-\x7E]|\x1b\][^\x07]*(?:\x07|\x1b\\)/g; /** * Returns a JSON-shaped candidate string for the given raw line, or * `null` if the line should be skipped because it cannot be valid JSONL. * - * @param {string} rawLine + * Accepts arbitrary input — non-string values short-circuit to `null` so + * callers don't have to type-check before invoking. + * + * @param {unknown} rawLine * @returns {string | null} */ export function cleanProtocolLine(rawLine) { diff --git a/tests/jsonl.test.mjs b/tests/jsonl.test.mjs index ee086534..e7333d6e 100644 --- a/tests/jsonl.test.mjs +++ b/tests/jsonl.test.mjs @@ -35,6 +35,20 @@ test("cleanProtocolLine strips bracketed-paste-mode ANSI prefix (issue #23)", () ); }); +test("cleanProtocolLine strips CSI sequences whose final byte is not a letter", () => { + // Bracketed-paste content markers use `~` as the final byte. A + // letter-only CSI final pattern (`[a-zA-Z]`) misses them and the + // surrounding JSON would be incorrectly dropped by the first-char + // guard. ECMA-48's CSI grammar allows any final byte in 0x40..0x7E. + assert.equal( + cleanProtocolLine('\x1b[200~{"id":1}\x1b[201~'), + '{"id":1}' + ); + // Other non-letter finals at the boundaries of the 0x40..0x7E range. + assert.equal(cleanProtocolLine('\x1b[@{"id":1}'), '{"id":1}'); // 0x40 + assert.equal(cleanProtocolLine('\x1b[`{"id":1}'), '{"id":1}'); // 0x60 +}); + test("cleanProtocolLine strips OSC window-title sequences", () => { assert.equal( cleanProtocolLine('\x1b]0;some title\x07{"id":1}'),