Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.9.4] - 2026-05-22

### Fixed
- **Orphaned `codegraph serve --mcp` processes after a parent SIGKILL.** When
the MCP host (Claude Code, opencode, …) was force-killed — OOM killer, a
`kill -9`, a container teardown — the child kept running indefinitely on
Linux, holding inotify watches, file descriptors, and the SQLite WAL. The
kernel doesn't propagate parent death to children, and the stdin
`end`/`close` handlers we relied on don't always fire. The MCP server now
polls `process.ppid` and shuts down the moment it changes from the value
observed at startup; the poll interval is `CODEGRAPH_PPID_POLL_MS` (default
`5000`, `0` disables). Resolves
[#277](https://github.com/colbymchenry/codegraph/issues/277).

### Added
- **Release archives now ship with a `SHA256SUMS` file**, and the npm launcher
verifies the bundle it downloads against it — a mismatch aborts before
Expand Down
168 changes: 168 additions & 0 deletions __tests__/mcp-ppid-watchdog.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/**
* PPID watchdog regression test (#277).
*
* On Linux, when an MCP host (Claude Code, opencode, …) is SIGKILL'd by the
* OOM killer / a force-quit / a container teardown, the kernel does NOT
* propagate the death to its `codegraph serve --mcp` child. The child gets
* reparented to init/systemd, its stdin stays half-open in some
* configurations, and the existing `stdin.on('end' | 'close')` handlers
* never fire — the server lingers indefinitely, holding inotify watches,
* file descriptors, and the SQLite WAL.
*
* `src/mcp/index.ts` polls `process.ppid` and shuts down the moment it
* diverges from the value observed at startup. This test stands up a
* four-tier process tree (vitest → wrapper → {stdin-holder, codegraph}) and
* SIGKILL's the wrapper. The stdin-holder is a long-lived sibling whose
* `stdout` pipe is dup'd into codegraph's `stdin`. After the wrapper dies
* the pipe stays open (stdin-holder still owns the write-end), so the
* existing stdin close handlers do **not** fire — the only thing that can
* terminate codegraph then is the PPID watchdog.
*
* Windows is excluded — `process.kill(pid, 'SIGKILL')` does not actually
* deliver SIGKILL there, and the per-OS reparenting semantics the watchdog
* relies on are POSIX-specific.
*/
import { describe, it, expect, afterEach } from 'vitest';
import { spawn, ChildProcessWithoutNullStreams } from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';

const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js');

function isAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch {
return false;
}
}

function waitForExit(pid: number, timeoutMs: number): Promise<boolean> {
return new Promise((resolve) => {
const start = Date.now();
const tick = () => {
if (!isAlive(pid)) return resolve(true);
if (Date.now() - start > timeoutMs) return resolve(false);
setTimeout(tick, 100);
};
tick();
});
}

describe.skipIf(process.platform === 'win32')('MCP PPID watchdog (#277)', () => {
let wrapper: ChildProcessWithoutNullStreams | null = null;
let childPid: number | null = null;
let stdinHolderPid: number | null = null;

afterEach(() => {
if (wrapper && !wrapper.killed) {
try { wrapper.kill('SIGKILL'); } catch { /* already gone */ }
}
// Belt and suspenders — don't leak processes if an assertion failed.
for (const pid of [childPid, stdinHolderPid]) {
if (pid !== null && isAlive(pid)) {
try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ }
}
}
wrapper = null;
childPid = null;
stdinHolderPid = null;
});

it("shuts down when its parent is SIGKILL'd and stdin stays open", async () => {
// The wrapper:
// 1. Spawns a "stdin-holder" — a tiny long-lived node process whose
// `stdout` pipe is dup'd into codegraph's `stdin`. As long as the
// stdin-holder is alive (it is — it's an orphan after the wrapper
// dies), codegraph's stdin never sees EOF.
// 2. Spawns codegraph with that pipe as fd 0 and its stderr redirected
// to a tmp file that survives the wrapper, then reports both PIDs.
// 3. Idles until SIGKILL'd from the test.
//
// CODEGRAPH_PPID_POLL_MS=200 keeps the watchdog responsive in test; the
// production default is 5000ms.
const stderrLog = path.join(
fs.mkdtempSync(path.join(os.tmpdir(), 'cg-ppid-watchdog-')),
'codegraph.stderr.log',
);
// The wrapper waits 800ms before reporting the PIDs so the codegraph
// child has time to finish its async start() (dynamic import + transport
// setup + watchdog registration). Otherwise the test races: it
// SIGKILL's the wrapper before the watchdog interval is installed, and
// nothing terminates codegraph.
const wrapperSrc = `
const { spawn } = require('child_process');
const fs = require('fs');
const stderrFd = fs.openSync(${JSON.stringify(stderrLog)}, 'a');
const stdinHolder = spawn(process.execPath, ['-e', 'setInterval(() => {}, 60000)'], {
stdio: ['ignore', 'pipe', 'ignore'],
detached: true,
});
stdinHolder.unref();
const child = spawn(process.execPath, [${JSON.stringify(BIN)}, 'serve', '--mcp'], {
stdio: [stdinHolder.stdout, 'ignore', stderrFd],
env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200' },
detached: true,
});
child.unref();
setTimeout(() => {
process.stdout.write(JSON.stringify({ pid: child.pid, stdinHolderPid: stdinHolder.pid }) + '\\n');
}, 800);
setInterval(() => {}, 60000);
`;
wrapper = spawn(process.execPath, ['-e', wrapperSrc], {
stdio: ['pipe', 'pipe', 'pipe'],
}) as ChildProcessWithoutNullStreams;

const pids = await new Promise<{ pid: number; stdinHolderPid: number }>((resolve, reject) => {
let buf = '';
const timer = setTimeout(
() => reject(new Error('wrapper did not report PIDs in time')),
10000,
);
wrapper!.stdout.on('data', (chunk: Buffer) => {
buf += chunk.toString('utf8');
const m = buf.match(/\{"pid":(\d+),"stdinHolderPid":(\d+)\}/);
if (m) {
clearTimeout(timer);
resolve({ pid: parseInt(m[1], 10), stdinHolderPid: parseInt(m[2], 10) });
}
});
wrapper!.on('exit', () => {
clearTimeout(timer);
reject(new Error('wrapper exited before reporting PIDs'));
});
});
childPid = pids.pid;
stdinHolderPid = pids.stdinHolderPid;

expect(isAlive(childPid)).toBe(true);
expect(isAlive(stdinHolderPid)).toBe(true);

// SIGKILL the wrapper — no cleanup runs, just like a real OOM kill.
// codegraph and the stdin-holder both get reparented to init/systemd.
// Crucially, the pipe between them stays open, so codegraph's stdin
// doesn't close: only the watchdog can take it down.
wrapper.kill('SIGKILL');

// Watchdog runs every 200ms in this test → 5s gives ~25 polls of headroom.
const exited = await waitForExit(childPid, 5000);
const stderrContent = fs.existsSync(stderrLog) ? fs.readFileSync(stderrLog, 'utf-8') : '<no stderr captured>';
expect(
exited,
`codegraph child (pid=${childPid}) did not exit within 5s after wrapper was SIGKILL'd.\nstderr:\n${stderrContent}`,
).toBe(true);
// The watchdog announces itself before tearing down — assert that the
// shutdown came from the parent-death path, not from any other signal.
expect(stderrContent).toMatch(/Parent process exited.*shutting down/);

// The stdin-holder is now an orphan — kill it explicitly so it doesn't
// outlive the test. It's still tracked in `stdinHolderPid` for the
// afterEach safety net, but we tidy up proactively here too.
if (isAlive(stdinHolderPid)) {
try { process.kill(stdinHolderPid, 'SIGKILL'); } catch { /* race */ }
}
}, 20000);
});
15 changes: 14 additions & 1 deletion src/extraction/wasm-runtime-flags.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,19 @@ export const WASM_RUNTIME_FLAGS: readonly string[] = ['--liftoff-only'];
*/
const RELAUNCH_GUARD_ENV = 'CODEGRAPH_WASM_RELAUNCHED';

/**
* Env var carrying the *host* PID (the relauncher's own parent) across the
* re-exec. Without `--liftoff-only` the CLI re-execs itself once, inserting an
* intermediate process between the MCP host and the server. That intermediate
* stays alive (blocked in spawnSync) even after the host is killed, so the
* server's PPID watchdog can't detect the host's death by watching its own
* `process.ppid`. Passing the host PID through lets the watchdog poll it
* directly. Unset on the no-re-exec path (bundled launcher / flag already
* present), where the server is already a direct child of the host. See
* src/mcp/index.ts (#277).
*/
export const HOST_PPID_ENV = 'CODEGRAPH_HOST_PPID';

/** True when every required WASM runtime flag is already present in `execArgv`. */
export function processHasWasmRuntimeFlags(
execArgv: readonly string[] = process.execArgv
Expand Down Expand Up @@ -84,7 +97,7 @@ export function relaunchWithWasmRuntimeFlagsIfNeeded(scriptPath: string): void {
const argv = buildRelaunchArgv(scriptPath, process.argv.slice(2));
const result = spawnSync(process.execPath, argv, {
stdio: 'inherit',
env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1' },
env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1', [HOST_PPID_ENV]: String(process.ppid) },
});

if (result.error) {
Expand Down
97 changes: 97 additions & 0 deletions src/mcp/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { watchDisabledReason } from '../sync';
import { StdioTransport, JsonRpcRequest, JsonRpcNotification, ErrorCodes } from './transport';
import { tools, ToolHandler } from './tools';
import { SERVER_INSTRUCTIONS } from './server-instructions';
import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags';

/**
* Convert a file:// URI to a filesystem path.
Expand Down Expand Up @@ -60,6 +61,51 @@ const PROTOCOL_VERSION = '2024-11-05';
*/
const ROOTS_LIST_TIMEOUT_MS = 5000;

/**
* How often to poll `process.ppid` to detect parent process death (see #277).
* 5s is a deliberate trade-off: the failure mode being guarded against is rare
* (parent SIGKILL'd), and longer poll = less wakeup overhead while idle.
*/
const DEFAULT_PPID_POLL_MS = 5000;

/**
* Resolve the PPID watchdog poll interval from an env override. A value of
* `0` disables the watchdog entirely (escape hatch for embedded scenarios
* where the parent legitimately re-parents the server on purpose). Anything
* non-numeric or negative falls back to the default.
*/
function parsePpidPollMs(raw: string | undefined): number {
if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS;
const parsed = Number(raw);
if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS;
if (parsed < 0) return DEFAULT_PPID_POLL_MS;
return Math.floor(parsed);
}

/**
* Parse the host PID propagated across the `--liftoff-only` re-exec
* ({@link HOST_PPID_ENV}). Returns a positive integer PID, or null when
* unset/invalid — the direct-launch path, where the watchdog falls back to
* `process.ppid` divergence. PIDs of 0/1 are rejected (0 = unknown, 1 = init,
* i.e. already orphaned), so the watchdog doesn't latch onto init.
*/
function parseHostPpid(raw: string | undefined): number | null {
if (raw === undefined || raw === '') return null;
const parsed = Number(raw);
if (!Number.isInteger(parsed) || parsed <= 1) return null;
return parsed;
}

/** True if a process with `pid` currently exists (signal-0 probe). */
function isProcessAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch {
return false;
}
}

/**
* Extract the first usable filesystem path from a `roots/list` result.
* Shape per MCP spec: `{ roots: [{ uri: "file:///path", name?: string }] }`.
Expand Down Expand Up @@ -95,6 +141,19 @@ export class MCPServer {
// Guards the one-shot deferred resolution (roots/list or cwd) so we don't
// re-issue roots/list on every tool call.
private rootsAttempted = false;
// PPID watchdog — see start(). Captured at construction so we always have a
// baseline, even if start() runs after a fork-style reparent.
private originalPpid: number = process.ppid;
// The MCP host's PID, propagated across the `--liftoff-only` re-exec (see
// HOST_PPID_ENV). When set, the watchdog polls it directly: the re-exec
// inserts an intermediate process whose *death* — not just our reparenting —
// is what we'd otherwise miss. null on the direct (bundled) launch path.
private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]);
private ppidWatchdog: ReturnType<typeof setInterval> | null = null;
// Idempotency guard for stop(). Without it, the watchdog can race with the
// stdin `end`/`close` handlers (or SIGTERM/SIGINT) and double-close cg and
// the transport before process.exit() lands.
private stopped = false;

constructor(projectPath?: string) {
this.projectPath = projectPath || null;
Expand Down Expand Up @@ -122,6 +181,38 @@ export class MCPServer {
// Detect this and shut down gracefully to prevent orphaned processes.
process.stdin.on('end', () => this.stop());
process.stdin.on('close', () => this.stop());

// PPID watchdog (#277). Linux doesn't propagate parent death to children,
// so when the MCP host (Claude Code, opencode, …) is SIGKILL'd by the OOM
// killer / a force-quit / a container teardown, the child is reparented to
// init/systemd and the stdin `end`/`close` events don't always fire. The
// server would then linger indefinitely, holding inotify watches, file
// descriptors, and the SQLite WAL. Poll `process.ppid` and shut down the
// moment it changes from what we observed at startup. Cross-platform:
// reparenting changes ppid on Linux *and* macOS; on Windows the value can
// also drop to 0 once the parent is gone. When the CLI re-execs itself for
// `--liftoff-only`, an intermediate process sits between us and the host and
// outlives it, so our own ppid wouldn't change — in that case we poll the
// host PID (propagated via HOST_PPID_ENV) for liveness instead. The watchdog
// is `.unref()`'d so it never holds the event loop open on its own.
const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS);
if (pollMs > 0) {
this.ppidWatchdog = setInterval(() => {
const current = process.ppid;
const ppidChanged = current !== this.originalPpid;
const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid);
if (ppidChanged || hostGone) {
const reason = ppidChanged
? `ppid ${this.originalPpid} -> ${current}`
: `host pid ${this.hostPpid} exited`;
process.stderr.write(
`[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`
);
this.stop();
}
}, pollMs);
this.ppidWatchdog.unref();
}
}

/**
Expand Down Expand Up @@ -283,6 +374,12 @@ export class MCPServer {
* Stop the server
*/
stop(): void {
if (this.stopped) return;
this.stopped = true;
if (this.ppidWatchdog) {
clearInterval(this.ppidWatchdog);
this.ppidWatchdog = null;
}
// Close all cached cross-project connections first
this.toolHandler.closeAll();
// Close the main CodeGraph instance
Expand Down