diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e35df64..3cfadd1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,18 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.9.4] - 2026-05-22 +### Fixed +- **Orphaned `codegraph serve --mcp` processes after a parent SIGKILL.** When + the MCP host (Claude Code, opencode, …) was force-killed — OOM killer, a + `kill -9`, a container teardown — the child kept running indefinitely on + Linux, holding inotify watches, file descriptors, and the SQLite WAL. The + kernel doesn't propagate parent death to children, and the stdin + `end`/`close` handlers we relied on don't always fire. The MCP server now + polls `process.ppid` and shuts down the moment it changes from the value + observed at startup; the poll interval is `CODEGRAPH_PPID_POLL_MS` (default + `5000`, `0` disables). Resolves + [#277](https://github.com/colbymchenry/codegraph/issues/277). + ### Added - **Release archives now ship with a `SHA256SUMS` file**, and the npm launcher verifies the bundle it downloads against it — a mismatch aborts before diff --git a/__tests__/mcp-ppid-watchdog.test.ts b/__tests__/mcp-ppid-watchdog.test.ts new file mode 100644 index 00000000..0e3dc188 --- /dev/null +++ b/__tests__/mcp-ppid-watchdog.test.ts @@ -0,0 +1,168 @@ +/** + * PPID watchdog regression test (#277). + * + * On Linux, when an MCP host (Claude Code, opencode, …) is SIGKILL'd by the + * OOM killer / a force-quit / a container teardown, the kernel does NOT + * propagate the death to its `codegraph serve --mcp` child. The child gets + * reparented to init/systemd, its stdin stays half-open in some + * configurations, and the existing `stdin.on('end' | 'close')` handlers + * never fire — the server lingers indefinitely, holding inotify watches, + * file descriptors, and the SQLite WAL. + * + * `src/mcp/index.ts` polls `process.ppid` and shuts down the moment it + * diverges from the value observed at startup. This test stands up a + * four-tier process tree (vitest → wrapper → {stdin-holder, codegraph}) and + * SIGKILL's the wrapper. The stdin-holder is a long-lived sibling whose + * `stdout` pipe is dup'd into codegraph's `stdin`. After the wrapper dies + * the pipe stays open (stdin-holder still owns the write-end), so the + * existing stdin close handlers do **not** fire — the only thing that can + * terminate codegraph then is the PPID watchdog. + * + * Windows is excluded — `process.kill(pid, 'SIGKILL')` does not actually + * deliver SIGKILL there, and the per-OS reparenting semantics the watchdog + * relies on are POSIX-specific. + */ +import { describe, it, expect, afterEach } from 'vitest'; +import { spawn, ChildProcessWithoutNullStreams } from 'child_process'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js'); + +function isAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} + +function waitForExit(pid: number, timeoutMs: number): Promise { + return new Promise((resolve) => { + const start = Date.now(); + const tick = () => { + if (!isAlive(pid)) return resolve(true); + if (Date.now() - start > timeoutMs) return resolve(false); + setTimeout(tick, 100); + }; + tick(); + }); +} + +describe.skipIf(process.platform === 'win32')('MCP PPID watchdog (#277)', () => { + let wrapper: ChildProcessWithoutNullStreams | null = null; + let childPid: number | null = null; + let stdinHolderPid: number | null = null; + + afterEach(() => { + if (wrapper && !wrapper.killed) { + try { wrapper.kill('SIGKILL'); } catch { /* already gone */ } + } + // Belt and suspenders — don't leak processes if an assertion failed. + for (const pid of [childPid, stdinHolderPid]) { + if (pid !== null && isAlive(pid)) { + try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ } + } + } + wrapper = null; + childPid = null; + stdinHolderPid = null; + }); + + it("shuts down when its parent is SIGKILL'd and stdin stays open", async () => { + // The wrapper: + // 1. Spawns a "stdin-holder" — a tiny long-lived node process whose + // `stdout` pipe is dup'd into codegraph's `stdin`. As long as the + // stdin-holder is alive (it is — it's an orphan after the wrapper + // dies), codegraph's stdin never sees EOF. + // 2. Spawns codegraph with that pipe as fd 0 and its stderr redirected + // to a tmp file that survives the wrapper, then reports both PIDs. + // 3. Idles until SIGKILL'd from the test. + // + // CODEGRAPH_PPID_POLL_MS=200 keeps the watchdog responsive in test; the + // production default is 5000ms. + const stderrLog = path.join( + fs.mkdtempSync(path.join(os.tmpdir(), 'cg-ppid-watchdog-')), + 'codegraph.stderr.log', + ); + // The wrapper waits 800ms before reporting the PIDs so the codegraph + // child has time to finish its async start() (dynamic import + transport + // setup + watchdog registration). Otherwise the test races: it + // SIGKILL's the wrapper before the watchdog interval is installed, and + // nothing terminates codegraph. + const wrapperSrc = ` + const { spawn } = require('child_process'); + const fs = require('fs'); + const stderrFd = fs.openSync(${JSON.stringify(stderrLog)}, 'a'); + const stdinHolder = spawn(process.execPath, ['-e', 'setInterval(() => {}, 60000)'], { + stdio: ['ignore', 'pipe', 'ignore'], + detached: true, + }); + stdinHolder.unref(); + const child = spawn(process.execPath, [${JSON.stringify(BIN)}, 'serve', '--mcp'], { + stdio: [stdinHolder.stdout, 'ignore', stderrFd], + env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200' }, + detached: true, + }); + child.unref(); + setTimeout(() => { + process.stdout.write(JSON.stringify({ pid: child.pid, stdinHolderPid: stdinHolder.pid }) + '\\n'); + }, 800); + setInterval(() => {}, 60000); + `; + wrapper = spawn(process.execPath, ['-e', wrapperSrc], { + stdio: ['pipe', 'pipe', 'pipe'], + }) as ChildProcessWithoutNullStreams; + + const pids = await new Promise<{ pid: number; stdinHolderPid: number }>((resolve, reject) => { + let buf = ''; + const timer = setTimeout( + () => reject(new Error('wrapper did not report PIDs in time')), + 10000, + ); + wrapper!.stdout.on('data', (chunk: Buffer) => { + buf += chunk.toString('utf8'); + const m = buf.match(/\{"pid":(\d+),"stdinHolderPid":(\d+)\}/); + if (m) { + clearTimeout(timer); + resolve({ pid: parseInt(m[1], 10), stdinHolderPid: parseInt(m[2], 10) }); + } + }); + wrapper!.on('exit', () => { + clearTimeout(timer); + reject(new Error('wrapper exited before reporting PIDs')); + }); + }); + childPid = pids.pid; + stdinHolderPid = pids.stdinHolderPid; + + expect(isAlive(childPid)).toBe(true); + expect(isAlive(stdinHolderPid)).toBe(true); + + // SIGKILL the wrapper — no cleanup runs, just like a real OOM kill. + // codegraph and the stdin-holder both get reparented to init/systemd. + // Crucially, the pipe between them stays open, so codegraph's stdin + // doesn't close: only the watchdog can take it down. + wrapper.kill('SIGKILL'); + + // Watchdog runs every 200ms in this test → 5s gives ~25 polls of headroom. + const exited = await waitForExit(childPid, 5000); + const stderrContent = fs.existsSync(stderrLog) ? fs.readFileSync(stderrLog, 'utf-8') : ''; + expect( + exited, + `codegraph child (pid=${childPid}) did not exit within 5s after wrapper was SIGKILL'd.\nstderr:\n${stderrContent}`, + ).toBe(true); + // The watchdog announces itself before tearing down — assert that the + // shutdown came from the parent-death path, not from any other signal. + expect(stderrContent).toMatch(/Parent process exited.*shutting down/); + + // The stdin-holder is now an orphan — kill it explicitly so it doesn't + // outlive the test. It's still tracked in `stdinHolderPid` for the + // afterEach safety net, but we tidy up proactively here too. + if (isAlive(stdinHolderPid)) { + try { process.kill(stdinHolderPid, 'SIGKILL'); } catch { /* race */ } + } + }, 20000); +}); diff --git a/src/extraction/wasm-runtime-flags.ts b/src/extraction/wasm-runtime-flags.ts index f33a19ff..e44c84d8 100644 --- a/src/extraction/wasm-runtime-flags.ts +++ b/src/extraction/wasm-runtime-flags.ts @@ -46,6 +46,19 @@ export const WASM_RUNTIME_FLAGS: readonly string[] = ['--liftoff-only']; */ const RELAUNCH_GUARD_ENV = 'CODEGRAPH_WASM_RELAUNCHED'; +/** + * Env var carrying the *host* PID (the relauncher's own parent) across the + * re-exec. Without `--liftoff-only` the CLI re-execs itself once, inserting an + * intermediate process between the MCP host and the server. That intermediate + * stays alive (blocked in spawnSync) even after the host is killed, so the + * server's PPID watchdog can't detect the host's death by watching its own + * `process.ppid`. Passing the host PID through lets the watchdog poll it + * directly. Unset on the no-re-exec path (bundled launcher / flag already + * present), where the server is already a direct child of the host. See + * src/mcp/index.ts (#277). + */ +export const HOST_PPID_ENV = 'CODEGRAPH_HOST_PPID'; + /** True when every required WASM runtime flag is already present in `execArgv`. */ export function processHasWasmRuntimeFlags( execArgv: readonly string[] = process.execArgv @@ -84,7 +97,7 @@ export function relaunchWithWasmRuntimeFlagsIfNeeded(scriptPath: string): void { const argv = buildRelaunchArgv(scriptPath, process.argv.slice(2)); const result = spawnSync(process.execPath, argv, { stdio: 'inherit', - env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1' }, + env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1', [HOST_PPID_ENV]: String(process.ppid) }, }); if (result.error) { diff --git a/src/mcp/index.ts b/src/mcp/index.ts index c790a4bc..8d0e35d7 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -21,6 +21,7 @@ import { watchDisabledReason } from '../sync'; import { StdioTransport, JsonRpcRequest, JsonRpcNotification, ErrorCodes } from './transport'; import { tools, ToolHandler } from './tools'; import { SERVER_INSTRUCTIONS } from './server-instructions'; +import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; /** * Convert a file:// URI to a filesystem path. @@ -60,6 +61,51 @@ const PROTOCOL_VERSION = '2024-11-05'; */ const ROOTS_LIST_TIMEOUT_MS = 5000; +/** + * How often to poll `process.ppid` to detect parent process death (see #277). + * 5s is a deliberate trade-off: the failure mode being guarded against is rare + * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle. + */ +const DEFAULT_PPID_POLL_MS = 5000; + +/** + * Resolve the PPID watchdog poll interval from an env override. A value of + * `0` disables the watchdog entirely (escape hatch for embedded scenarios + * where the parent legitimately re-parents the server on purpose). Anything + * non-numeric or negative falls back to the default. + */ +function parsePpidPollMs(raw: string | undefined): number { + if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS; + const parsed = Number(raw); + if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS; + if (parsed < 0) return DEFAULT_PPID_POLL_MS; + return Math.floor(parsed); +} + +/** + * Parse the host PID propagated across the `--liftoff-only` re-exec + * ({@link HOST_PPID_ENV}). Returns a positive integer PID, or null when + * unset/invalid — the direct-launch path, where the watchdog falls back to + * `process.ppid` divergence. PIDs of 0/1 are rejected (0 = unknown, 1 = init, + * i.e. already orphaned), so the watchdog doesn't latch onto init. + */ +function parseHostPpid(raw: string | undefined): number | null { + if (raw === undefined || raw === '') return null; + const parsed = Number(raw); + if (!Number.isInteger(parsed) || parsed <= 1) return null; + return parsed; +} + +/** True if a process with `pid` currently exists (signal-0 probe). */ +function isProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} + /** * Extract the first usable filesystem path from a `roots/list` result. * Shape per MCP spec: `{ roots: [{ uri: "file:///path", name?: string }] }`. @@ -95,6 +141,19 @@ export class MCPServer { // Guards the one-shot deferred resolution (roots/list or cwd) so we don't // re-issue roots/list on every tool call. private rootsAttempted = false; + // PPID watchdog — see start(). Captured at construction so we always have a + // baseline, even if start() runs after a fork-style reparent. + private originalPpid: number = process.ppid; + // The MCP host's PID, propagated across the `--liftoff-only` re-exec (see + // HOST_PPID_ENV). When set, the watchdog polls it directly: the re-exec + // inserts an intermediate process whose *death* — not just our reparenting — + // is what we'd otherwise miss. null on the direct (bundled) launch path. + private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]); + private ppidWatchdog: ReturnType | null = null; + // Idempotency guard for stop(). Without it, the watchdog can race with the + // stdin `end`/`close` handlers (or SIGTERM/SIGINT) and double-close cg and + // the transport before process.exit() lands. + private stopped = false; constructor(projectPath?: string) { this.projectPath = projectPath || null; @@ -122,6 +181,38 @@ export class MCPServer { // Detect this and shut down gracefully to prevent orphaned processes. process.stdin.on('end', () => this.stop()); process.stdin.on('close', () => this.stop()); + + // PPID watchdog (#277). Linux doesn't propagate parent death to children, + // so when the MCP host (Claude Code, opencode, …) is SIGKILL'd by the OOM + // killer / a force-quit / a container teardown, the child is reparented to + // init/systemd and the stdin `end`/`close` events don't always fire. The + // server would then linger indefinitely, holding inotify watches, file + // descriptors, and the SQLite WAL. Poll `process.ppid` and shut down the + // moment it changes from what we observed at startup. Cross-platform: + // reparenting changes ppid on Linux *and* macOS; on Windows the value can + // also drop to 0 once the parent is gone. When the CLI re-execs itself for + // `--liftoff-only`, an intermediate process sits between us and the host and + // outlives it, so our own ppid wouldn't change — in that case we poll the + // host PID (propagated via HOST_PPID_ENV) for liveness instead. The watchdog + // is `.unref()`'d so it never holds the event loop open on its own. + const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS); + if (pollMs > 0) { + this.ppidWatchdog = setInterval(() => { + const current = process.ppid; + const ppidChanged = current !== this.originalPpid; + const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid); + if (ppidChanged || hostGone) { + const reason = ppidChanged + ? `ppid ${this.originalPpid} -> ${current}` + : `host pid ${this.hostPpid} exited`; + process.stderr.write( + `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n` + ); + this.stop(); + } + }, pollMs); + this.ppidWatchdog.unref(); + } } /** @@ -283,6 +374,12 @@ export class MCPServer { * Stop the server */ stop(): void { + if (this.stopped) return; + this.stopped = true; + if (this.ppidWatchdog) { + clearInterval(this.ppidWatchdog); + this.ppidWatchdog = null; + } // Close all cached cross-project connections first this.toolHandler.closeAll(); // Close the main CodeGraph instance