From 344139989b4e037be0f0a18107d320e555549105 Mon Sep 17 00:00:00 2001 From: Conal <33135619+Conalh@users.noreply.github.com> Date: Fri, 29 May 2026 09:38:39 -0700 Subject: [PATCH 1/3] fix: run CLI bin through symlinks and stop false Codex baseline drift Two trust-critical fixes flagged in external review. 1. npm installs the `scopetrail` bin as a symlink, so the entrypoint's `import.meta.url === process.argv[1]` main-module check was false when launched via `npx`/global install: main() never ran and the CLI exited 0 with no output -- a CI gate that silently passed every PR. Resolve both sides through realpathSync. Add a test that runs the built bin through a symlink (the existing CLI tests only ran `node dist/index.js` directly). 2. Codex sandbox/approval drift ranked a missing base value at -1, so a brand-new .codex/config.toml that merely set the narrowest posture (read-only sandbox, untrusted approval) was reported as a high-severity widening/weakening -- a false positive on the safest possible config. Anchor the missing baseline at Codex's safe default so only settings genuinely wider than that default surface; a brand-new danger-full-access sandbox or `never` approval still fires. Add unit tests for both Codex directions and a benign benchmark fixture (codex-baseline-narrowest); regenerate RESULTS.md (29 cases, precision/recall unchanged at 100%/0% FP) and sync the README corpus counts. Co-Authored-By: Claude Opus 4.8 --- README.md | 8 +-- benchmark/RESULTS.md | 17 ++--- .../new/.codex/config.toml | 2 + .../codex-baseline-narrowest/old/.gitkeep | 0 benchmark/labels.json | 1 + dist/detectors/codex-config.js | 22 ++++++- dist/index.js | 22 ++++++- src/detectors/codex-config.ts | 24 ++++++- src/index.ts | 21 ++++++- test/bin-invocation.test.mjs | 62 +++++++++++++++++++ test/codex-config-drift.test.mjs | 61 ++++++++++++++++++ 11 files changed, 220 insertions(+), 20 deletions(-) create mode 100644 benchmark/fixtures/codex-baseline-narrowest/new/.codex/config.toml create mode 100644 benchmark/fixtures/codex-baseline-narrowest/old/.gitkeep create mode 100644 test/bin-invocation.test.mjs diff --git a/README.md b/README.md index 907c178..25a8662 100644 --- a/README.md +++ b/README.md @@ -146,18 +146,18 @@ Findings carry a `severity` (`low` / `medium` / `high` / `critical`) and the rep ## How well it catches it -ScopeTrail ships a labeled precision/recall benchmark over **28 fixture PRs** (22 with planted drift, 6 benign) spanning **19 detector kinds**. Each fixture is an `old/`+`new/` config snapshot pair; ground truth is fixed by fixture design and the harness diffs the pair and scores the drift engine against it. Reproduce with `npm run build && node benchmark/run-benchmark.mjs`. +ScopeTrail ships a labeled precision/recall benchmark over **29 fixture PRs** (22 with planted drift, 7 benign) spanning **19 detector kinds**. Each fixture is an `old/`+`new/` config snapshot pair; ground truth is fixed by fixture design and the harness diffs the pair and scores the drift engine against it. Reproduce with `npm run build && node benchmark/run-benchmark.mjs`. | Metric | Result | | --- | --- | | Detection (any finding) — recall | **100%** (22/22 rogue PRs flagged) | -| Detection — false-positive rate | **0%** (0/6 benign PRs flagged) | +| Detection — false-positive rate | **0%** (0/7 benign PRs flagged) | | Detection — precision | **100%** | | Correct primary finding kind | **22/22** rogue PRs | | All expected finding kinds | **22/22** rogue PRs | -| Exact consolidated rating | **28/28** PRs | +| Exact consolidated rating | **29/29** PRs | -The 6 benign cases include five engineered **false-positive traps** — narrowly-scoped Claude grants (a textual diff sees new `allow` lines), an all-tightening Codex posture, network access that was *already* on, a removed MCP server, and a `.mcp.json` with reordered keys but an identical launch command — plus one byte-identical snapshot. None produce a finding, because the detectors compare semantics and flag only *widening*. +The 7 benign cases include six engineered **false-positive traps** — narrowly-scoped Claude grants (a textual diff sees new `allow` lines), an all-tightening Codex posture, network access that was *already* on, a brand-new Codex config pinned to the narrowest posture, a removed MCP server, and a `.mcp.json` with reordered keys but an identical launch command — plus one byte-identical snapshot. None produce a finding, because the detectors compare semantics and flag only *widening*. **Severity is calibrated, not maximized.** At a strict `fail-on: high` gate, recall is 82% — by design: sample/template MCP additions, pinned version bumps, broad `Read` allows, and newly-enabled Codex network access sit at `low`/`medium` because they widen the surface without being directly exploitable. The `high`/`critical` band is reserved for executable or secret-facing changes — a bare `Bash` grant, a removed `Read(.env)` deny, a `danger-full-access` sandbox, an unencrypted remote MCP endpoint. Full confusion matrix at every gate, per-category and per-case breakdowns: [benchmark/RESULTS.md](benchmark/RESULTS.md). Methodology and labels: [benchmark/labels.json](benchmark/labels.json). diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md index f42ba9b..b4e7057 100644 --- a/benchmark/RESULTS.md +++ b/benchmark/RESULTS.md @@ -8,11 +8,11 @@ the pair and scores the drift engine against it. Benign cases include deliberate false-positive traps a naive textual diff would flag (tightened posture, removed servers, reordered JSON keys). -- Cases: **28** (22 rogue, 6 benign) across **19** detector kinds +- Cases: **29** (22 rogue, 7 benign) across **19** detector kinds - Detection (any finding): recall **100.0%**, false-positive rate **0.0%**, precision **100.0%** - At a `fail-on: high` CI gate: recall **81.8%**, false-positive rate **0.0%**, precision **100.0%** - Correct primary finding kind identified on **22/22** rogue cases; all expected kinds on **22/22** -- Exact rating match where the label pins one: **28/28** +- Exact rating match where the label pins one: **29/29** ## Confusion matrix by CI gate threshold @@ -20,10 +20,10 @@ A PR is predicted "drift" when its overall rating meets the threshold. `low` = a | Gate (`fail-on`) | TP | FP | FN | TN | Precision | Recall | FP rate | F1 | Accuracy | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| low | 22 | 0 | 0 | 6 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | -| medium | 21 | 0 | 1 | 6 | 100.0% | 95.5% | 0.0% | 97.7% | 96.4% | -| high | 18 | 0 | 4 | 6 | 100.0% | 81.8% | 0.0% | 90.0% | 85.7% | -| critical | 4 | 0 | 18 | 6 | 100.0% | 18.2% | 0.0% | 30.8% | 35.7% | +| low | 22 | 0 | 0 | 7 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | +| medium | 21 | 0 | 1 | 7 | 100.0% | 95.5% | 0.0% | 97.7% | 96.6% | +| high | 18 | 0 | 4 | 7 | 100.0% | 81.8% | 0.0% | 90.0% | 86.2% | +| critical | 4 | 0 | 18 | 7 | 100.0% | 18.2% | 0.0% | 30.8% | 37.9% | ## Detector-kind identification (rogue cases) @@ -33,7 +33,7 @@ A PR is predicted "drift" when its overall rating meets the threshold. `low` = a ## Rating agreement -Of the **28** cases whose label pins an exact consolidated rating, the diff matched **28**. +Of the **29** cases whose label pins an exact consolidated rating, the diff matched **29**. ## Results by category @@ -42,7 +42,7 @@ Of the **28** cases whose label pins an exact consolidated rating, the diff matc | claude-drift | 5 | 5/5 | — | | clean | 1 | — | 1/1 | | codex-drift | 6 | 6/6 | — | -| fp-guard | 5 | — | 5/5 | +| fp-guard | 6 | — | 6/6 | | mcp-drift | 5 | 5/5 | — | | mcp-sample | 2 | 2/2 | — | | multi-class | 1 | 1/1 | — | @@ -82,6 +82,7 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas | claude-scoped-grants | benign | fp-guard | none | 0 | — | — | yes | | codex-narrowed | benign | fp-guard | none | 0 | — | — | yes | | codex-network-already-on | benign | fp-guard | none | 0 | — | — | yes | +| codex-baseline-narrowest | benign | fp-guard | none | 0 | — | — | yes | | mcp-server-removed | benign | fp-guard | none | 0 | — | — | yes | | mcp-reformatted | benign | fp-guard | none | 0 | — | — | yes | diff --git a/benchmark/fixtures/codex-baseline-narrowest/new/.codex/config.toml b/benchmark/fixtures/codex-baseline-narrowest/new/.codex/config.toml new file mode 100644 index 0000000..5fc59c3 --- /dev/null +++ b/benchmark/fixtures/codex-baseline-narrowest/new/.codex/config.toml @@ -0,0 +1,2 @@ +sandbox_mode = "read-only" +approval_policy = "untrusted" diff --git a/benchmark/fixtures/codex-baseline-narrowest/old/.gitkeep b/benchmark/fixtures/codex-baseline-narrowest/old/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmark/labels.json b/benchmark/labels.json index f5608ae..9bfdaab 100644 --- a/benchmark/labels.json +++ b/benchmark/labels.json @@ -31,6 +31,7 @@ { "fixture": "claude-scoped-grants", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "Every added allow is narrowly scoped (Read(src/...), WebFetch(domain:...), mcp__github__get_issue) and an added deny tightens — none is broad." }, { "fixture": "codex-narrowed", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "Every Codex setting moves to a tighter posture (full-access -> read-only, never -> on-request, network true -> false); drift detection ignores tightening." }, { "fixture": "codex-network-already-on", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "network_access was already true in old; it is not newly enabled here, only an unrelated model line is added." }, + { "fixture": "codex-baseline-narrowest", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "A brand-new .codex/config.toml pins the narrowest posture (read-only sandbox, untrusted approval) where the base had no Codex config; a missing baseline must anchor at Codex's safe default, not rank below every value, so the safest possible config is not reported as a widening/weakening." }, { "fixture": "mcp-server-removed", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "A server is removed between old and new — removing scope is not drift; only additions are flagged." }, { "fixture": "mcp-reformatted", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "Same launch command, JSON keys reordered — a semantic comparison must see no change where a textual diff would." } ], diff --git a/dist/detectors/codex-config.js b/dist/detectors/codex-config.js index 5c871bd..df8458e 100644 --- a/dist/detectors/codex-config.js +++ b/dist/detectors/codex-config.js @@ -29,7 +29,7 @@ export async function detectCodexConfigDrift(oldRoot, newRoot) { for (const key of ['sandbox_mode', 'sandbox', 'windows.sandbox']) { const oldEntry = oldConfig.get(key); const newEntry = newConfig.get(key); - if (newEntry && sandboxRank(newEntry.value) > sandboxRank(oldEntry?.value)) { + if (newEntry && sandboxRank(newEntry.value) > baselineSandboxRank(oldEntry?.value)) { findings.push({ kind: 'scope_trail.codex_sandbox_widened', severity: sandboxRank(newEntry.value) >= 3 ? 'critical' : 'high', @@ -43,7 +43,7 @@ export async function detectCodexConfigDrift(oldRoot, newRoot) { } const oldApproval = oldConfig.get('approval_policy'); const newApproval = newConfig.get('approval_policy'); - if (newApproval && approvalRank(newApproval.value) > approvalRank(oldApproval?.value)) { + if (newApproval && approvalRank(newApproval.value) > baselineApprovalRank(oldApproval?.value)) { findings.push({ kind: 'scope_trail.codex_approval_weakened', severity: newApproval.value === 'never' ? 'high' : 'medium', @@ -328,6 +328,24 @@ function stringifyScalar(value) { } return String(value).toLowerCase(); } +// When a sandbox/approval key is absent in the base config (or carries an +// unrecognized value), Codex falls back to its safe default posture: a +// read-only sandbox and untrusted approval — the narrowest settings. The +// previous comparison ranked a missing baseline at -1, so a brand-new config +// that merely set `sandbox_mode = "read-only"` or `approval_policy = +// "untrusted"` reported the *narrowest* values as a widening/weakening — a +// high-severity false positive on the safest possible config. Anchoring the +// baseline at the safe default means only settings genuinely wider than that +// default surface, while a brand-new `danger-full-access` sandbox or `never` +// approval introduced from no prior config still fires. +function baselineSandboxRank(value) { + const rank = sandboxRank(value); + return rank === -1 ? sandboxRank('read-only') : rank; +} +function baselineApprovalRank(value) { + const rank = approvalRank(value); + return rank === -1 ? approvalRank('untrusted') : rank; +} function sandboxRank(value) { if (!value) { return -1; diff --git a/dist/index.js b/dist/index.js index e49c8ac..ac4315a 100644 --- a/dist/index.js +++ b/dist/index.js @@ -1,4 +1,5 @@ #!/usr/bin/env node +import { realpathSync } from 'node:fs'; import { writeFile } from 'node:fs/promises'; import { fileURLToPath } from 'node:url'; import { detectClaudeSettingsDrift } from './detectors/claude-settings.js'; @@ -166,8 +167,25 @@ function parseDiffArgs(argv) { function isReportFormat(value) { return value === 'text' || value === 'markdown' || value === 'json' || value === 'github'; } -const invokedPath = process.argv[1] ? fileURLToPath(import.meta.url) === process.argv[1] : false; -if (invokedPath) { +// npm installs the `scopetrail` bin as a symlink (in node_modules/.bin and +// the global bin dir). When launched through that symlink, `process.argv[1]` +// is the symlink path while `import.meta.url` resolves to the real +// dist/index.js — so a bare `===` is false, `main()` never runs, and the CLI +// exits 0 with no output. For a CI gate that means silently passing every PR. +// Resolve both sides through realpath before comparing so the bin runs whether +// it is invoked directly (`node dist/index.js`) or via a symlink (`npx`/global). +function isMainModule() { + if (!process.argv[1]) { + return false; + } + try { + return realpathSync(fileURLToPath(import.meta.url)) === realpathSync(process.argv[1]); + } + catch { + return false; + } +} +if (isMainModule()) { process.exitCode = await main(); } function usage() { diff --git a/src/detectors/codex-config.ts b/src/detectors/codex-config.ts index 04f74d9..35be1a2 100644 --- a/src/detectors/codex-config.ts +++ b/src/detectors/codex-config.ts @@ -50,7 +50,7 @@ export async function detectCodexConfigDrift(oldRoot: string, newRoot: string): for (const key of ['sandbox_mode', 'sandbox', 'windows.sandbox']) { const oldEntry = oldConfig.get(key); const newEntry = newConfig.get(key); - if (newEntry && sandboxRank(newEntry.value) > sandboxRank(oldEntry?.value)) { + if (newEntry && sandboxRank(newEntry.value) > baselineSandboxRank(oldEntry?.value)) { findings.push({ kind: 'scope_trail.codex_sandbox_widened', severity: sandboxRank(newEntry.value) >= 3 ? 'critical' : 'high', @@ -65,7 +65,7 @@ export async function detectCodexConfigDrift(oldRoot: string, newRoot: string): const oldApproval = oldConfig.get('approval_policy'); const newApproval = newConfig.get('approval_policy'); - if (newApproval && approvalRank(newApproval.value) > approvalRank(oldApproval?.value)) { + if (newApproval && approvalRank(newApproval.value) > baselineApprovalRank(oldApproval?.value)) { findings.push({ kind: 'scope_trail.codex_approval_weakened', severity: newApproval.value === 'never' ? 'high' : 'medium', @@ -376,6 +376,26 @@ function stringifyScalar(value: unknown): string { return String(value).toLowerCase(); } +// When a sandbox/approval key is absent in the base config (or carries an +// unrecognized value), Codex falls back to its safe default posture: a +// read-only sandbox and untrusted approval — the narrowest settings. The +// previous comparison ranked a missing baseline at -1, so a brand-new config +// that merely set `sandbox_mode = "read-only"` or `approval_policy = +// "untrusted"` reported the *narrowest* values as a widening/weakening — a +// high-severity false positive on the safest possible config. Anchoring the +// baseline at the safe default means only settings genuinely wider than that +// default surface, while a brand-new `danger-full-access` sandbox or `never` +// approval introduced from no prior config still fires. +function baselineSandboxRank(value: string | undefined): number { + const rank = sandboxRank(value); + return rank === -1 ? sandboxRank('read-only') : rank; +} + +function baselineApprovalRank(value: string | undefined): number { + const rank = approvalRank(value); + return rank === -1 ? approvalRank('untrusted') : rank; +} + function sandboxRank(value: string | undefined): number { if (!value) { return -1; diff --git a/src/index.ts b/src/index.ts index 753807d..98a47c2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,6 @@ #!/usr/bin/env node +import { realpathSync } from 'node:fs'; import { writeFile } from 'node:fs/promises'; import { fileURLToPath } from 'node:url'; import { detectClaudeSettingsDrift } from './detectors/claude-settings.js'; @@ -197,9 +198,25 @@ function isReportFormat(value: string | undefined): value is ReportFormat { return value === 'text' || value === 'markdown' || value === 'json' || value === 'github'; } -const invokedPath = process.argv[1] ? fileURLToPath(import.meta.url) === process.argv[1] : false; +// npm installs the `scopetrail` bin as a symlink (in node_modules/.bin and +// the global bin dir). When launched through that symlink, `process.argv[1]` +// is the symlink path while `import.meta.url` resolves to the real +// dist/index.js — so a bare `===` is false, `main()` never runs, and the CLI +// exits 0 with no output. For a CI gate that means silently passing every PR. +// Resolve both sides through realpath before comparing so the bin runs whether +// it is invoked directly (`node dist/index.js`) or via a symlink (`npx`/global). +function isMainModule(): boolean { + if (!process.argv[1]) { + return false; + } + try { + return realpathSync(fileURLToPath(import.meta.url)) === realpathSync(process.argv[1]); + } catch { + return false; + } +} -if (invokedPath) { +if (isMainModule()) { process.exitCode = await main(); } diff --git a/test/bin-invocation.test.mjs b/test/bin-invocation.test.mjs new file mode 100644 index 0000000..ed004cf --- /dev/null +++ b/test/bin-invocation.test.mjs @@ -0,0 +1,62 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { execFile } from 'node:child_process'; +import { existsSync, mkdtempSync, rmSync, symlinkSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { promisify } from 'node:util'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const execFileAsync = promisify(execFile); +const testDir = dirname(fileURLToPath(import.meta.url)); +const packageRoot = join(testDir, '..'); +const cli = join(packageRoot, 'dist', 'index.js'); + +// npm installs the `scopetrail` bin as a symlink (node_modules/.bin and the +// global bin dir), so the entrypoint's "am I the main module?" check has to +// survive being launched through one. Every other CLI test runs +// `node dist/index.js` directly, which never exercises the symlink path that +// broke real `npm i -g` / `npx` installs: `main()` was silently skipped, so the +// process exited 0 with no output — a CI gate that passes every PR. This test +// runs the built bin through a symlink and asserts it actually produces a report. +test('built bin runs when invoked through a symlink (npm-install path)', async (t) => { + assert.ok(existsSync(cli), 'dist/index.js must be built before this test (npm run build)'); + + const linkDir = mkdtempSync(join(tmpdir(), 'scopetrail-bin-link-')); + const link = join(linkDir, 'scopetrail'); + try { + try { + symlinkSync(cli, link, 'file'); + } catch (error) { + // Windows without Developer Mode / admin rights cannot create symlinks. + // Linux CI (the platform npm consumers actually install on) provides the + // real regression coverage; skip rather than fail on a restricted host. + if (['EPERM', 'ENOSYS', 'EEXIST'].includes(error.code)) { + t.skip(`symlink creation unsupported here (${error.code})`); + return; + } + throw error; + } + + const oldDir = join(testDir, 'fixtures', 'combined', 'old'); + const newDir = join(testDir, 'fixtures', 'combined', 'new'); + const { stdout } = await execFileAsync(process.execPath, [ + link, + 'diff', + '--old', + oldDir, + '--new', + newDir, + '--format', + 'json' + ]); + + // Before the fix this stdout was empty (main() never ran through the + // symlink), so JSON.parse and the assertions below guard the regression. + const report = JSON.parse(stdout); + assert.equal(report.rating, 'critical'); + assert.equal(report.findings.length, 6); + } finally { + rmSync(linkDir, { recursive: true, force: true }); + } +}); diff --git a/test/codex-config-drift.test.mjs b/test/codex-config-drift.test.mjs index 2ea3ec2..6b9c047 100644 --- a/test/codex-config-drift.test.mjs +++ b/test/codex-config-drift.test.mjs @@ -126,6 +126,67 @@ test('inline-table sandbox/network keys are detected (parsed TOML, not regex)', } }); +test('codex baseline: a brand-new config at the narrowest posture is not flagged as widening', async () => { + // Pre-fix gap: a missing base value ranked at -1, so adding a fresh + // .codex/config.toml whose sandbox/approval were the *narrowest* + // settings (read-only sandbox, untrusted approval) reported them as + // widened/weakened — a high-severity false positive on the safest + // possible config. The baseline now anchors at Codex's safe default. + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); + const { tmpdir } = await import('node:os'); + + const root = mkdtempSync(join(tmpdir(), 'scopetrail-codex-baseline-')); + try { + const oldDir = join(root, 'old'); + const newDir = join(root, 'new'); + // No .codex/config.toml in the base at all. + mkdirSync(oldDir, { recursive: true }); + mkdirSync(join(newDir, '.codex'), { recursive: true }); + writeFileSync( + join(newDir, '.codex', 'config.toml'), + 'sandbox_mode = "read-only"\napproval_policy = "untrusted"\n' + ); + + const findings = await detectCodexConfigDrift(oldDir, newDir); + assert.deepEqual(findings, [], 'narrowest settings in a brand-new config must not flag'); + } finally { + rmSync(root, { recursive: true, force: true }); + } +}); + +test('codex baseline: a brand-new config that introduces a wide posture is still flagged', async () => { + // The baseline anchor must not over-suppress: introducing a + // danger-full-access sandbox or `never` approval where the base had no + // .codex/config.toml is a genuine permission event, not a false positive. + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); + const { tmpdir } = await import('node:os'); + + const root = mkdtempSync(join(tmpdir(), 'scopetrail-codex-baseline-wide-')); + try { + const oldDir = join(root, 'old'); + const newDir = join(root, 'new'); + mkdirSync(oldDir, { recursive: true }); + mkdirSync(join(newDir, '.codex'), { recursive: true }); + writeFileSync( + join(newDir, '.codex', 'config.toml'), + 'sandbox_mode = "danger-full-access"\napproval_policy = "never"\n' + ); + + const findings = await detectCodexConfigDrift(oldDir, newDir); + const byKind = (kind) => findings.find((f) => f.kind === kind); + + const sandbox = byKind('scope_trail.codex_sandbox_widened'); + assert.ok(sandbox, 'full-access sandbox introduced from no baseline must still flag'); + assert.equal(sandbox.severity, 'critical'); + assert.ok( + byKind('scope_trail.codex_approval_weakened'), + 'never approval introduced from no baseline must still flag' + ); + } finally { + rmSync(root, { recursive: true, force: true }); + } +}); + test('detects Codex config permission drift', async () => { const oldDir = join(testDir, 'fixtures', 'codex-config-drift', 'old'); const newDir = join(testDir, 'fixtures', 'codex-config-drift', 'new'); From b9f07ffdc9f0876394c1d70af51fb08e57c2c3fb Mon Sep 17 00:00:00 2001 From: Conal <33135619+Conalh@users.noreply.github.com> Date: Fri, 29 May 2026 09:56:53 -0700 Subject: [PATCH 2/3] feat: close remaining permission-drift blind spots and tighten docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the rest of the external review (the trust-critical bin/Codex pair landed in the previous commit). Detector blind spots: - #4 Merge all recognized MCP server maps instead of first-map-wins, so an empty `mcpServers: {}` can no longer shadow a populated `servers: {}` in Cursor/VS Code configs. - #3 Diff sensitive MCP fields (env, headers, cwd) on existing servers, which serverCommand()-based comparison ignored. A server keeping the same command but gaining a secret env var or auth header now surfaces; secret-bearing key names escalate to high. Applies to .mcp.json and Codex [mcp_servers.NAME]. Removals stay silent (narrowing, not widening). New kinds: scope_trail.mcp_server_sensitive_field_changed and scope_trail.codex_mcp_sensitive_field_changed. - #5 Model Claude hook entries by (matcher, type, command) rather than command alone, so rebinding a guard from one tool to another (same command) is caught. NOTE: this reverses the prior "matcher change is noise" decision and its test — a PreToolUse guard moved from Bash to Read stops guarding Bash, which is a real enforcement-surface change. - #7 Expand the critical removed-deny pattern set beyond .env/secret/ credential/.pem to SSH keys, *.key, cloud credentials, registry tokens, and kube configs. Docs & supply-chain: - #2/#8 Qualify the README "local-only" claim: the scanner uploads nothing, but the Action's setup step runs `npm ci` from the registry. - #9 Pin agent-gov-core to exact 1.3.0 (lockfile kept in sync) so npm consumers do not silently resolve a newer parser/schema. - #10 Note that the benchmark figures bound regressions, not real-world field accuracy. Each change ships unit tests and a labeled benchmark fixture. Corpus grows to 35 cases (27 rogue, 8 benign) across 21 detector kinds; precision/recall hold at 100%/0% false-positive rate, RESULTS.md regenerated and README counts synced. Co-Authored-By: Claude Opus 4.8 --- README.md | 18 +-- benchmark/RESULTS.md | 38 +++--- .../new/.claude/settings.json | 8 ++ .../old/.claude/settings.json | 10 ++ .../new/.claude/settings.json | 23 ++++ .../old/.claude/settings.json | 23 ++++ .../new/.codex/config.toml | 6 + .../old/.codex/config.toml | 3 + .../fixtures/mcp-env-removed/new/.mcp.json | 8 ++ .../fixtures/mcp-env-removed/old/.mcp.json | 11 ++ .../mcp-env-secret-added/new/.mcp.json | 11 ++ .../mcp-env-secret-added/old/.mcp.json | 8 ++ .../new/.cursor/mcp.json | 9 ++ .../old/.cursor/mcp.json | 4 + benchmark/labels.json | 6 + dist/detectors/claude-settings.js | 54 +++++++-- dist/detectors/codex-config.js | 21 +++- dist/detectors/mcp.js | 44 +++++-- dist/mcp-risk.js | 72 ++++++++++++ package-lock.json | 2 +- package.json | 2 +- src/detectors/claude-settings.ts | 57 ++++++--- src/detectors/codex-config.ts | 29 ++++- src/detectors/mcp.ts | 51 +++++++-- src/mcp-risk.ts | 108 ++++++++++++++++++ src/types.ts | 3 + test/codex-mcp-drift.test.mjs | 35 ++++++ test/heuristics-and-snapshot.test.mjs | 72 +++++++++++- test/mcp-drift.test.mjs | 99 ++++++++++++++++ 29 files changed, 755 insertions(+), 80 deletions(-) create mode 100644 benchmark/fixtures/claude-deny-removed-ssh/new/.claude/settings.json create mode 100644 benchmark/fixtures/claude-deny-removed-ssh/old/.claude/settings.json create mode 100644 benchmark/fixtures/claude-hook-matcher-rebound/new/.claude/settings.json create mode 100644 benchmark/fixtures/claude-hook-matcher-rebound/old/.claude/settings.json create mode 100644 benchmark/fixtures/codex-mcp-env-added/new/.codex/config.toml create mode 100644 benchmark/fixtures/codex-mcp-env-added/old/.codex/config.toml create mode 100644 benchmark/fixtures/mcp-env-removed/new/.mcp.json create mode 100644 benchmark/fixtures/mcp-env-removed/old/.mcp.json create mode 100644 benchmark/fixtures/mcp-env-secret-added/new/.mcp.json create mode 100644 benchmark/fixtures/mcp-env-secret-added/old/.mcp.json create mode 100644 benchmark/fixtures/mcp-second-map-server-added/new/.cursor/mcp.json create mode 100644 benchmark/fixtures/mcp-second-map-server-added/old/.cursor/mcp.json diff --git a/README.md b/README.md index 25a8662..8931965 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ ScopeTrail permission drift: CRITICAL ## How it works -ScopeTrail is **local-only**. It reads the checked-out repository, materializes the two git refs into temp directories, runs detectors over them, and prints the result. It uploads nothing, calls no external services, and has no required API keys. +ScopeTrail is **local-only**. It reads the checked-out repository, materializes the two git refs into temp directories, runs detectors over them, and prints the result. The scanner uploads nothing — no repository contents, findings, or telemetry leave your machine — and it needs no API keys. (The GitHub Action's setup step installs ScopeTrail's one runtime dependency with `npm ci` from the npm registry; the analysis itself makes no network calls.) The detectors cover the surfaces an AI agent can actually escalate through: @@ -146,20 +146,20 @@ Findings carry a `severity` (`low` / `medium` / `high` / `critical`) and the rep ## How well it catches it -ScopeTrail ships a labeled precision/recall benchmark over **29 fixture PRs** (22 with planted drift, 7 benign) spanning **19 detector kinds**. Each fixture is an `old/`+`new/` config snapshot pair; ground truth is fixed by fixture design and the harness diffs the pair and scores the drift engine against it. Reproduce with `npm run build && node benchmark/run-benchmark.mjs`. +ScopeTrail ships a labeled precision/recall benchmark over **35 fixture PRs** (27 with planted drift, 8 benign) spanning **21 detector kinds**. Each fixture is an `old/`+`new/` config snapshot pair; ground truth is fixed by fixture design and the harness diffs the pair and scores the drift engine against it. Reproduce with `npm run build && node benchmark/run-benchmark.mjs`. These figures score the engine against its own labeled fixtures — they bound regressions, not a claim of real-world field accuracy across every config a PR might contain. | Metric | Result | | --- | --- | -| Detection (any finding) — recall | **100%** (22/22 rogue PRs flagged) | -| Detection — false-positive rate | **0%** (0/7 benign PRs flagged) | +| Detection (any finding) — recall | **100%** (27/27 rogue PRs flagged) | +| Detection — false-positive rate | **0%** (0/8 benign PRs flagged) | | Detection — precision | **100%** | -| Correct primary finding kind | **22/22** rogue PRs | -| All expected finding kinds | **22/22** rogue PRs | -| Exact consolidated rating | **29/29** PRs | +| Correct primary finding kind | **27/27** rogue PRs | +| All expected finding kinds | **27/27** rogue PRs | +| Exact consolidated rating | **35/35** PRs | -The 7 benign cases include six engineered **false-positive traps** — narrowly-scoped Claude grants (a textual diff sees new `allow` lines), an all-tightening Codex posture, network access that was *already* on, a brand-new Codex config pinned to the narrowest posture, a removed MCP server, and a `.mcp.json` with reordered keys but an identical launch command — plus one byte-identical snapshot. None produce a finding, because the detectors compare semantics and flag only *widening*. +The 8 benign cases include seven engineered **false-positive traps** — narrowly-scoped Claude grants (a textual diff sees new `allow` lines), an all-tightening Codex posture, network access that was *already* on, a brand-new Codex config pinned to the narrowest posture, a dropped MCP `env` var, a removed MCP server, and a `.mcp.json` with reordered keys but an identical launch command — plus one byte-identical snapshot. None produce a finding, because the detectors compare semantics and flag only *widening*. -**Severity is calibrated, not maximized.** At a strict `fail-on: high` gate, recall is 82% — by design: sample/template MCP additions, pinned version bumps, broad `Read` allows, and newly-enabled Codex network access sit at `low`/`medium` because they widen the surface without being directly exploitable. The `high`/`critical` band is reserved for executable or secret-facing changes — a bare `Bash` grant, a removed `Read(.env)` deny, a `danger-full-access` sandbox, an unencrypted remote MCP endpoint. Full confusion matrix at every gate, per-category and per-case breakdowns: [benchmark/RESULTS.md](benchmark/RESULTS.md). Methodology and labels: [benchmark/labels.json](benchmark/labels.json). +**Severity is calibrated, not maximized.** At a strict `fail-on: high` gate, recall is 85% — by design: sample/template MCP additions, pinned version bumps, broad `Read` allows, and newly-enabled Codex network access sit at `low`/`medium` because they widen the surface without being directly exploitable. The `high`/`critical` band is reserved for executable or secret-facing changes — a bare `Bash` grant, a removed `Read(.env)` deny, a `danger-full-access` sandbox, an unencrypted remote MCP endpoint. Full confusion matrix at every gate, per-category and per-case breakdowns: [benchmark/RESULTS.md](benchmark/RESULTS.md). Methodology and labels: [benchmark/labels.json](benchmark/labels.json). ## Design choices worth flagging diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md index b4e7057..0903d6c 100644 --- a/benchmark/RESULTS.md +++ b/benchmark/RESULTS.md @@ -8,11 +8,11 @@ the pair and scores the drift engine against it. Benign cases include deliberate false-positive traps a naive textual diff would flag (tightened posture, removed servers, reordered JSON keys). -- Cases: **29** (22 rogue, 7 benign) across **19** detector kinds +- Cases: **35** (27 rogue, 8 benign) across **21** detector kinds - Detection (any finding): recall **100.0%**, false-positive rate **0.0%**, precision **100.0%** -- At a `fail-on: high` CI gate: recall **81.8%**, false-positive rate **0.0%**, precision **100.0%** -- Correct primary finding kind identified on **22/22** rogue cases; all expected kinds on **22/22** -- Exact rating match where the label pins one: **29/29** +- At a `fail-on: high` CI gate: recall **85.2%**, false-positive rate **0.0%**, precision **100.0%** +- Correct primary finding kind identified on **27/27** rogue cases; all expected kinds on **27/27** +- Exact rating match where the label pins one: **35/35** ## Confusion matrix by CI gate threshold @@ -20,30 +20,30 @@ A PR is predicted "drift" when its overall rating meets the threshold. `low` = a | Gate (`fail-on`) | TP | FP | FN | TN | Precision | Recall | FP rate | F1 | Accuracy | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| low | 22 | 0 | 0 | 7 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | -| medium | 21 | 0 | 1 | 7 | 100.0% | 95.5% | 0.0% | 97.7% | 96.6% | -| high | 18 | 0 | 4 | 7 | 100.0% | 81.8% | 0.0% | 90.0% | 86.2% | -| critical | 4 | 0 | 18 | 7 | 100.0% | 18.2% | 0.0% | 30.8% | 37.9% | +| low | 27 | 0 | 0 | 8 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | +| medium | 26 | 0 | 1 | 8 | 100.0% | 96.3% | 0.0% | 98.1% | 97.1% | +| high | 23 | 0 | 4 | 8 | 100.0% | 85.2% | 0.0% | 92.0% | 88.6% | +| critical | 5 | 0 | 22 | 8 | 100.0% | 18.5% | 0.0% | 31.3% | 37.1% | ## Detector-kind identification (rogue cases) -- Primary expected kind detected: **22/22** -- All expected kinds detected: **22/22** -- Distinct detector kinds exercised: **19** +- Primary expected kind detected: **27/27** +- All expected kinds detected: **27/27** +- Distinct detector kinds exercised: **21** ## Rating agreement -Of the **29** cases whose label pins an exact consolidated rating, the diff matched **29**. +Of the **35** cases whose label pins an exact consolidated rating, the diff matched **35**. ## Results by category | Category | Cases | Rogue detected | Benign clean | | --- | ---: | :---: | :---: | -| claude-drift | 5 | 5/5 | — | +| claude-drift | 7 | 7/7 | — | | clean | 1 | — | 1/1 | -| codex-drift | 6 | 6/6 | — | -| fp-guard | 6 | — | 6/6 | -| mcp-drift | 5 | 5/5 | — | +| codex-drift | 7 | 7/7 | — | +| fp-guard | 7 | — | 7/7 | +| mcp-drift | 7 | 7/7 | — | | mcp-sample | 2 | 2/2 | — | | multi-class | 1 | 1/1 | — | | robustness | 3 | 3/3 | — | @@ -58,17 +58,21 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas | --- | --- | --- | --- | ---: | :---: | :---: | :---: | | mcp-server-added | rogue | mcp-drift | high | 1 | yes | yes | yes | | mcp-unpinned-added | rogue | mcp-drift | high | 2 | yes | yes | yes | +| mcp-env-secret-added | rogue | mcp-drift | high | 1 | yes | yes | yes | | mcp-command-changed | rogue | mcp-drift | medium | 1 | yes | yes | yes | | mcp-remote-http | rogue | mcp-drift | critical | 2 | yes | yes | yes | | mcp-remote-https | rogue | mcp-drift | high | 2 | yes | yes | yes | | mcp-syntax-error | rogue | robustness | high | 1 | yes | yes | yes | +| mcp-second-map-server-added | rogue | mcp-drift | high | 2 | yes | yes | yes | | mcp-sample-added | rogue | mcp-sample | low | 1 | yes | yes | yes | | mcp-sample-pipe-to-shell | rogue | mcp-sample | high | 2 | yes | yes | yes | | claude-broad-allow-bash | rogue | claude-drift | high | 1 | yes | yes | yes | | claude-broad-allow-read | rogue | claude-drift | medium | 1 | yes | yes | yes | | claude-deny-removed-env | rogue | claude-drift | critical | 1 | yes | yes | yes | +| claude-deny-removed-ssh | rogue | claude-drift | critical | 1 | yes | yes | yes | | claude-hook-removed | rogue | claude-drift | high | 1 | yes | yes | yes | | claude-hook-command-changed | rogue | claude-drift | high | 1 | yes | yes | yes | +| claude-hook-matcher-rebound | rogue | claude-drift | high | 1 | yes | yes | yes | | claude-syntax-error | rogue | robustness | high | 1 | yes | yes | yes | | codex-sandbox-danger | rogue | codex-drift | critical | 1 | yes | yes | yes | | codex-sandbox-workspace | rogue | codex-drift | high | 1 | yes | yes | yes | @@ -76,6 +80,7 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas | codex-network-enabled | rogue | codex-drift | medium | 1 | yes | yes | yes | | codex-project-trusted | rogue | codex-drift | high | 1 | yes | yes | yes | | codex-mcp-added | rogue | codex-drift | high | 2 | yes | yes | yes | +| codex-mcp-env-added | rogue | codex-drift | high | 1 | yes | yes | yes | | codex-syntax-error | rogue | robustness | high | 1 | yes | yes | yes | | multi-class-drift | rogue | multi-class | critical | 5 | yes | yes | yes | | clean-identical | benign | clean | none | 0 | — | — | yes | @@ -83,6 +88,7 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas | codex-narrowed | benign | fp-guard | none | 0 | — | — | yes | | codex-network-already-on | benign | fp-guard | none | 0 | — | — | yes | | codex-baseline-narrowest | benign | fp-guard | none | 0 | — | — | yes | +| mcp-env-removed | benign | fp-guard | none | 0 | — | — | yes | | mcp-server-removed | benign | fp-guard | none | 0 | — | — | yes | | mcp-reformatted | benign | fp-guard | none | 0 | — | — | yes | diff --git a/benchmark/fixtures/claude-deny-removed-ssh/new/.claude/settings.json b/benchmark/fixtures/claude-deny-removed-ssh/new/.claude/settings.json new file mode 100644 index 0000000..7822a04 --- /dev/null +++ b/benchmark/fixtures/claude-deny-removed-ssh/new/.claude/settings.json @@ -0,0 +1,8 @@ +{ + "permissions": { + "allow": [ + "mcp__github__get_issue" + ], + "deny": [] + } +} diff --git a/benchmark/fixtures/claude-deny-removed-ssh/old/.claude/settings.json b/benchmark/fixtures/claude-deny-removed-ssh/old/.claude/settings.json new file mode 100644 index 0000000..4013950 --- /dev/null +++ b/benchmark/fixtures/claude-deny-removed-ssh/old/.claude/settings.json @@ -0,0 +1,10 @@ +{ + "permissions": { + "allow": [ + "mcp__github__get_issue" + ], + "deny": [ + "Read(~/.ssh/**)" + ] + } +} diff --git a/benchmark/fixtures/claude-hook-matcher-rebound/new/.claude/settings.json b/benchmark/fixtures/claude-hook-matcher-rebound/new/.claude/settings.json new file mode 100644 index 0000000..b767856 --- /dev/null +++ b/benchmark/fixtures/claude-hook-matcher-rebound/new/.claude/settings.json @@ -0,0 +1,23 @@ +{ + "permissions": { + "allow": [ + "mcp__github__get_issue" + ], + "deny": [ + "Read(.env)" + ] + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Read", + "hooks": [ + { + "type": "command", + "command": "./guard.sh" + } + ] + } + ] + } +} diff --git a/benchmark/fixtures/claude-hook-matcher-rebound/old/.claude/settings.json b/benchmark/fixtures/claude-hook-matcher-rebound/old/.claude/settings.json new file mode 100644 index 0000000..306d7cd --- /dev/null +++ b/benchmark/fixtures/claude-hook-matcher-rebound/old/.claude/settings.json @@ -0,0 +1,23 @@ +{ + "permissions": { + "allow": [ + "mcp__github__get_issue" + ], + "deny": [ + "Read(.env)" + ] + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": "./guard.sh" + } + ] + } + ] + } +} diff --git a/benchmark/fixtures/codex-mcp-env-added/new/.codex/config.toml b/benchmark/fixtures/codex-mcp-env-added/new/.codex/config.toml new file mode 100644 index 0000000..16008b8 --- /dev/null +++ b/benchmark/fixtures/codex-mcp-env-added/new/.codex/config.toml @@ -0,0 +1,6 @@ +[mcp_servers.billing] +command = "npx" +args = ["-y", "@vendor/billing-mcp@1.4.0"] + +[mcp_servers.billing.env] +STRIPE_SECRET_KEY = "${STRIPE_SECRET_KEY}" diff --git a/benchmark/fixtures/codex-mcp-env-added/old/.codex/config.toml b/benchmark/fixtures/codex-mcp-env-added/old/.codex/config.toml new file mode 100644 index 0000000..7a8c8a4 --- /dev/null +++ b/benchmark/fixtures/codex-mcp-env-added/old/.codex/config.toml @@ -0,0 +1,3 @@ +[mcp_servers.billing] +command = "npx" +args = ["-y", "@vendor/billing-mcp@1.4.0"] diff --git a/benchmark/fixtures/mcp-env-removed/new/.mcp.json b/benchmark/fixtures/mcp-env-removed/new/.mcp.json new file mode 100644 index 0000000..e91fac7 --- /dev/null +++ b/benchmark/fixtures/mcp-env-removed/new/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "billing": { + "command": "npx", + "args": ["-y", "@vendor/billing-mcp@1.4.0"] + } + } +} diff --git a/benchmark/fixtures/mcp-env-removed/old/.mcp.json b/benchmark/fixtures/mcp-env-removed/old/.mcp.json new file mode 100644 index 0000000..276be1f --- /dev/null +++ b/benchmark/fixtures/mcp-env-removed/old/.mcp.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "billing": { + "command": "npx", + "args": ["-y", "@vendor/billing-mcp@1.4.0"], + "env": { + "STRIPE_SECRET_KEY": "${STRIPE_SECRET_KEY}" + } + } + } +} diff --git a/benchmark/fixtures/mcp-env-secret-added/new/.mcp.json b/benchmark/fixtures/mcp-env-secret-added/new/.mcp.json new file mode 100644 index 0000000..276be1f --- /dev/null +++ b/benchmark/fixtures/mcp-env-secret-added/new/.mcp.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "billing": { + "command": "npx", + "args": ["-y", "@vendor/billing-mcp@1.4.0"], + "env": { + "STRIPE_SECRET_KEY": "${STRIPE_SECRET_KEY}" + } + } + } +} diff --git a/benchmark/fixtures/mcp-env-secret-added/old/.mcp.json b/benchmark/fixtures/mcp-env-secret-added/old/.mcp.json new file mode 100644 index 0000000..e91fac7 --- /dev/null +++ b/benchmark/fixtures/mcp-env-secret-added/old/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "billing": { + "command": "npx", + "args": ["-y", "@vendor/billing-mcp@1.4.0"] + } + } +} diff --git a/benchmark/fixtures/mcp-second-map-server-added/new/.cursor/mcp.json b/benchmark/fixtures/mcp-second-map-server-added/new/.cursor/mcp.json new file mode 100644 index 0000000..4f1fad0 --- /dev/null +++ b/benchmark/fixtures/mcp-second-map-server-added/new/.cursor/mcp.json @@ -0,0 +1,9 @@ +{ + "mcpServers": {}, + "servers": { + "new-risky-server": { + "command": "npx", + "args": ["@bad/server@latest"] + } + } +} diff --git a/benchmark/fixtures/mcp-second-map-server-added/old/.cursor/mcp.json b/benchmark/fixtures/mcp-second-map-server-added/old/.cursor/mcp.json new file mode 100644 index 0000000..281739c --- /dev/null +++ b/benchmark/fixtures/mcp-second-map-server-added/old/.cursor/mcp.json @@ -0,0 +1,4 @@ +{ + "mcpServers": {}, + "servers": {} +} diff --git a/benchmark/labels.json b/benchmark/labels.json index 9bfdaab..3f5610e 100644 --- a/benchmark/labels.json +++ b/benchmark/labels.json @@ -6,17 +6,21 @@ "cases": [ { "fixture": "mcp-server-added", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added"], "expectRating": "high", "note": "A new pinned MCP server appears in .mcp.json; addition alone is reportable." }, { "fixture": "mcp-unpinned-added", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added", "scope_trail.unpinned_mcp_command"], "expectRating": "high", "note": "New server launched via npx @vendor/...@latest — added and unpinned." }, + { "fixture": "mcp-env-secret-added", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_sensitive_field_changed"], "expectRating": "high", "note": "An existing server keeps the same pinned launch command but gains a secret-bearing env var (STRIPE_SECRET_KEY); the sensitive-field diff must catch the added credential the command-based diff ignores." }, { "fixture": "mcp-command-changed", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_command_changed"], "expectRating": "medium", "note": "Same server name, pinned version bumped @1.0.0 -> @2.0.0." }, { "fixture": "mcp-remote-http", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added", "scope_trail.mcp_remote_endpoint"], "expectRating": "critical", "note": "New server points at an unencrypted http:// endpoint — remote endpoint is critical over plaintext." }, { "fixture": "mcp-remote-https", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added", "scope_trail.mcp_remote_endpoint"], "expectRating": "high", "note": "New server points at an https:// endpoint — remote but encrypted, so high not critical." }, { "fixture": "mcp-syntax-error", "intent": "rogue", "category": "robustness", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_config_syntax_error"], "expectRating": "high", "note": "Malformed new .mcp.json is reported, not silently dropped or crashed on." }, + { "fixture": "mcp-second-map-server-added", "intent": "rogue", "category": "mcp-drift", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added", "scope_trail.unpinned_mcp_command"], "expectRating": "high", "note": "A .cursor/mcp.json declares servers under the `servers` key while `mcpServers` is present but empty; the detector must merge all recognized maps so the `servers` entry is not shadowed by the empty `mcpServers`." }, { "fixture": "mcp-sample-added", "intent": "rogue", "category": "mcp-sample", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_sample_server_added"], "expectRating": "low", "note": "A pinned server added to an example/template MCP config — informational, low." }, { "fixture": "mcp-sample-pipe-to-shell", "intent": "rogue", "category": "mcp-sample", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_sample_server_added", "scope_trail.mcp_sample_unpinned_command"], "expectRating": "high", "note": "Sample server installs via curl | bash; pipe-to-shell escalates the sample-unpinned finding to high." }, { "fixture": "claude-broad-allow-bash", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.permission_allow_widened"], "expectRating": "high", "note": "Bare `Bash` verb added to the allowlist — broad, and bash escalates to high." }, { "fixture": "claude-broad-allow-read", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.permission_allow_widened"], "expectRating": "medium", "note": "Bare `Read` verb added — broad but not bash/write/edit, so medium." }, { "fixture": "claude-deny-removed-env", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.permission_deny_removed"], "expectRating": "critical", "note": "Read(.env) deny rule removed — secret-guarding deny removal is critical." }, + { "fixture": "claude-deny-removed-ssh", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.permission_deny_removed"], "expectRating": "critical", "note": "A Read(~/.ssh/**) deny rule is removed; SSH private keys are secret-bearing, so the removal escalates to critical alongside the existing .env case." }, { "fixture": "claude-hook-removed", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.hook_removed"], "expectRating": "high", "note": "PreToolUse hook removed — a high-impact lifecycle hook." }, { "fixture": "claude-hook-command-changed", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.hook_command_changed"], "expectRating": "high", "note": "PreToolUse hook command swapped to add --skip; high-impact hook." }, + { "fixture": "claude-hook-matcher-rebound", "intent": "rogue", "category": "claude-drift", "expectFlagged": true, "expectKinds": ["scope_trail.hook_command_changed"], "expectRating": "high", "note": "A PreToolUse guard keeps the same command but its matcher is rebound from Bash to Read, so it stops guarding Bash; the matcher is part of the hook's enforcement identity, so the rebinding is reportable." }, { "fixture": "claude-syntax-error", "intent": "rogue", "category": "robustness", "expectFlagged": true, "expectKinds": ["scope_trail.claude_settings_syntax_error"], "expectRating": "high", "note": "Malformed new .claude/settings.json is reported, not hidden." }, { "fixture": "codex-sandbox-danger", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_sandbox_widened"], "expectRating": "critical", "note": "Codex sandbox widened to danger-full-access — top sandbox rank, critical." }, { "fixture": "codex-sandbox-workspace", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_sandbox_widened"], "expectRating": "high", "note": "Codex sandbox widened read-only -> workspace-write; elevation below full-access, so high." }, @@ -24,6 +28,7 @@ { "fixture": "codex-network-enabled", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_network_enabled"], "expectRating": "medium", "note": "Codex network access newly enabled (false -> true)." }, { "fixture": "codex-project-trusted", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_project_trusted"], "expectRating": "high", "note": "A project entry newly marked trust_level = trusted." }, { "fixture": "codex-mcp-added", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_mcp_server_added", "scope_trail.codex_unpinned_mcp_command"], "expectRating": "high", "note": "New [mcp_servers.NAME] block added in Codex config, launched via npx -y @vendor/...@latest." }, + { "fixture": "codex-mcp-env-added", "intent": "rogue", "category": "codex-drift", "expectFlagged": true, "expectKinds": ["scope_trail.codex_mcp_sensitive_field_changed"], "expectRating": "high", "note": "An existing Codex [mcp_servers.billing] keeps the same pinned command but gains a [mcp_servers.billing.env] secret (STRIPE_SECRET_KEY); the sensitive-field diff applies to Codex MCP tables too." }, { "fixture": "codex-syntax-error", "intent": "rogue", "category": "robustness", "expectFlagged": true, "expectKinds": ["scope_trail.codex_config_syntax_error"], "expectRating": "high", "note": "Malformed new .codex/config.toml is reported, not hidden." }, { "fixture": "multi-class-drift", "intent": "rogue", "category": "multi-class", "expectFlagged": true, "expectKinds": ["scope_trail.mcp_server_added", "scope_trail.unpinned_mcp_command", "scope_trail.permission_allow_widened", "scope_trail.permission_deny_removed", "scope_trail.codex_sandbox_widened"], "expectRating": "critical", "note": "One PR touching MCP, Claude, and Codex at once — five distinct drift classes." }, @@ -32,6 +37,7 @@ { "fixture": "codex-narrowed", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "Every Codex setting moves to a tighter posture (full-access -> read-only, never -> on-request, network true -> false); drift detection ignores tightening." }, { "fixture": "codex-network-already-on", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "network_access was already true in old; it is not newly enabled here, only an unrelated model line is added." }, { "fixture": "codex-baseline-narrowest", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "A brand-new .codex/config.toml pins the narrowest posture (read-only sandbox, untrusted approval) where the base had no Codex config; a missing baseline must anchor at Codex's safe default, not rank below every value, so the safest possible config is not reported as a widening/weakening." }, + { "fixture": "mcp-env-removed", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "An existing server drops an env var while keeping the same command; removing capability is a narrowing, not a widening, so the sensitive-field diff must stay silent." }, { "fixture": "mcp-server-removed", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "A server is removed between old and new — removing scope is not drift; only additions are flagged." }, { "fixture": "mcp-reformatted", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "note": "Same launch command, JSON keys reordered — a semantic comparison must see no change where a textual diff would." } ], diff --git a/dist/detectors/claude-settings.js b/dist/detectors/claude-settings.js index 5e8d3a6..81373bc 100644 --- a/dist/detectors/claude-settings.js +++ b/dist/detectors/claude-settings.js @@ -71,7 +71,7 @@ export async function detectClaudeSettingsDrift(oldRoot, newRoot) { file: CLAUDE_SETTINGS_FILE, subject: hookName, message: hookCommandChangeMessage(hookName, added, removed), - recommendation: 'Review the change — a removed guard, a no-op appended next to a strict one, or any rewrite of a hook command can all weaken policy.' + recommendation: 'Review the change — a removed guard, a no-op appended next to a strict one, a matcher rebound to a different tool, or any rewrite of a hook command can all weaken policy.' }); } } @@ -109,36 +109,55 @@ async function readClaudeSettings(root) { // { "PreToolUse": [{ "matcher": "Bash", "hooks": [{ "type": "command", // "command": "/path/guard.sh" }] }] } // -// We collect every command string per hook name so a PR can be checked -// for both presence and content changes. +// The matcher is part of a hook's enforcement identity: a guard bound to +// `Bash` that is rebound to `Read` keeps the same command but stops guarding +// Bash — a real change in enforcement surface. So we key each entry by a +// (matcher, type, command) signature rather than the command string alone; a +// matcher/type rebinding then surfaces as a removed entry plus an added entry, +// exactly as a command swap does. Reordering an identical set still produces +// nothing, because set membership is order-independent. function readHookCommands(hooks) { const result = new Map(); for (const [name, value] of Object.entries(hooks)) { if (!hookHasEntries(value)) { continue; } - const commands = new Set(); + const signatures = new Set(); const entries = Array.isArray(value) ? value : Object.values(value); for (const entry of entries) { if (!isRecord(entry)) { continue; } + const matcher = typeof entry.matcher === 'string' ? entry.matcher : ''; const innerList = entry.hooks; if (Array.isArray(innerList)) { for (const inner of innerList) { if (isRecord(inner) && typeof inner.command === 'string') { - commands.add(inner.command); + signatures.add(hookEntrySignature(matcher, typeof inner.type === 'string' ? inner.type : '', inner.command)); } } } if (typeof entry.command === 'string') { - commands.add(entry.command); + signatures.add(hookEntrySignature(matcher, typeof entry.type === 'string' ? entry.type : '', entry.command)); } } - result.set(name, commands); + result.set(name, signatures); } return result; } +// Stable, human-readable identity for a hook entry. The command leads so the +// added/removed message stays scannable (`added: /noop.sh [matcher=Bash, ...]`); +// the matcher/type qualifiers are what make a rebinding visible in the diff. +function hookEntrySignature(matcher, type, command) { + const qualifiers = []; + if (matcher) { + qualifiers.push(`matcher=${matcher}`); + } + if (type) { + qualifiers.push(`type=${type}`); + } + return qualifiers.length > 0 ? `${command} [${qualifiers.join(', ')}]` : command; +} function readStringArray(value) { return Array.isArray(value) ? value.filter((entry) => typeof entry === 'string') : []; } @@ -234,12 +253,23 @@ function severityForAllow(permission) { } return 'medium'; } +// Removing a deny rule that guards secrets or keys is critical — the PR is +// re-opening agent access to material a reviewer almost never wants read or +// exfiltrated. The previous set only caught .env/secret/credential/.pem, so +// removing a deny on SSH keys, cloud credentials, package-registry tokens, or +// kube configs silently downgraded to medium. Substring matching is +// intentional so glob forms (`Read(~/.ssh/**)`, `Read(*.key)`) still match. +const CRITICAL_DENY_PATTERNS = [ + '.env', 'secret', 'credential', '.pem', + '.ssh', 'id_rsa', 'id_dsa', 'id_ecdsa', 'id_ed25519', '.key', + '.npmrc', '.pypirc', '.netrc', + 'kubeconfig', '.kube', + '.aws', '.gcp', '.azure', + '.p12', '.pfx' +]; function severityForRemovedDeny(permission) { const normalized = permission.toLowerCase(); - if (normalized.includes('.env') || normalized.includes('secret') || normalized.includes('credential') || normalized.includes('.pem')) { - return 'critical'; - } - return 'medium'; + return CRITICAL_DENY_PATTERNS.some((pattern) => normalized.includes(pattern)) ? 'critical' : 'medium'; } function isHighImpactHook(hookName) { return ['pretooluse', 'posttooluse', 'permissionrequest', 'sessionend'].includes(hookName.toLowerCase()); @@ -252,5 +282,5 @@ function hookCommandChangeMessage(hookName, added, removed) { if (removed.length > 0) { parts.push(`removed: ${removed.join(', ')}`); } - return `Claude hook "${hookName}" command(s) changed (${parts.join('; ')}).`; + return `Claude hook "${hookName}" entries changed (${parts.join('; ')}).`; } diff --git a/dist/detectors/codex-config.js b/dist/detectors/codex-config.js index df8458e..ef0ffd2 100644 --- a/dist/detectors/codex-config.js +++ b/dist/detectors/codex-config.js @@ -1,7 +1,7 @@ import { readFile } from 'node:fs/promises'; import { lineOfTomlKey, parseToml } from 'agent-gov-core'; import { configPath } from '../discovery.js'; -import { isUnpinnedCommand, serverCommand, remoteEndpoint, isUnencryptedEndpoint } from '../mcp-risk.js'; +import { isUnpinnedCommand, serverCommand, remoteEndpoint, isUnencryptedEndpoint, readSensitiveFields, sensitiveFieldChanges, describeSensitiveFieldChange, recommendationForSensitiveFieldChange } from '../mcp-risk.js'; export const CODEX_CONFIG_FILE = '.codex/config.toml'; export const CODEX_TARGET_PATHS = [CODEX_CONFIG_FILE]; export async function detectCodexConfigDrift(oldRoot, newRoot) { @@ -205,6 +205,22 @@ async function detectCodexMcpDrift(oldRoot, newRoot) { : 'Confirm the endpoint is trusted and does not expose unexpected data or tools to external hosts.' }); } + if (oldServer) { + // Codex [mcp_servers.NAME] blocks carry the same env/headers/cwd + // capability as .mcp.json; an existing server gaining a secret-bearing + // env var with an unchanged command must still surface. + for (const change of sensitiveFieldChanges(oldServer, newServer)) { + findings.push({ + kind: 'scope_trail.codex_mcp_sensitive_field_changed', + severity: change.secretLike ? 'high' : 'medium', + file: CODEX_CONFIG_FILE, + line: lineForServer(newServer), + subject: name, + message: describeSensitiveFieldChange(name, change), + recommendation: recommendationForSensitiveFieldChange(change) + }); + } + } } return findings; } @@ -246,7 +262,8 @@ async function readCodexMcpServers(root) { url: typeof entry.url === 'string' ? entry.url : undefined, serverUrl: typeof entry.serverUrl === 'string' ? entry.serverUrl - : (typeof entry.server_url === 'string' ? entry.server_url : undefined) + : (typeof entry.server_url === 'string' ? entry.server_url : undefined), + ...readSensitiveFields(entry) }); } return servers; diff --git a/dist/detectors/mcp.js b/dist/detectors/mcp.js index 3cf340f..e308ea3 100644 --- a/dist/detectors/mcp.js +++ b/dist/detectors/mcp.js @@ -1,6 +1,6 @@ import { readdir } from 'node:fs/promises'; import { configPath, isRecord, lineOfJsonKey, lineOfJsonStringValue, readJsonObjectWithSource } from '../discovery.js'; -import { isPipeToShellCommand, isUnpinnedCommand, serverCommand, remoteEndpoint, isRemoteEndpoint, isUnencryptedEndpoint } from '../mcp-risk.js'; +import { isPipeToShellCommand, isUnpinnedCommand, serverCommand, remoteEndpoint, isRemoteEndpoint, isUnencryptedEndpoint, readSensitiveFields, sensitiveFieldChanges, describeSensitiveFieldChange, recommendationForSensitiveFieldChange } from '../mcp-risk.js'; const MCP_CONFIGS = [ { path: '.mcp.json', serverKeys: ['mcpServers'] }, { path: '.cursor/mcp.json', serverKeys: ['mcpServers', 'servers'] }, @@ -121,6 +121,22 @@ export async function detectMcpDrift(oldRoot, newRoot) { : 'Confirm the endpoint is trusted and does not expose unexpected data or tools to external hosts.' }); } + if (oldServer) { + // An existing server keeping the same launch command but gaining + // secret-bearing env, auth headers, or a redirected cwd is a real + // permission change that serverCommand()-based diffing misses. + for (const change of sensitiveFieldChanges(oldServer, newServer)) { + findings.push({ + kind: 'scope_trail.mcp_server_sensitive_field_changed', + severity: change.secretLike ? 'high' : 'medium', + file: config.path, + line: newServer.line, + subject: name, + message: describeSensitiveFieldChange(name, change), + recommendation: recommendationForSensitiveFieldChange(change) + }); + } + } } } for (const path of await listMcpSampleConfigPaths(oldRoot, newRoot)) { @@ -208,9 +224,6 @@ async function readMcpServers(root, config) { const source = await readJsonObjectWithSource(configPath(root, config.path)); const json = source.json; const rawServers = readServerMap(json, config.serverKeys); - if (!isRecord(rawServers)) { - return {}; - } const servers = {}; for (const [name, value] of Object.entries(rawServers)) { if (!isRecord(value)) { @@ -222,7 +235,8 @@ async function readMcpServers(root, config) { command: typeof value.command === 'string' ? value.command : undefined, args: Array.isArray(value.args) ? value.args.filter((arg) => typeof arg === 'string') : undefined, url: typeof value.url === 'string' ? value.url : undefined, - serverUrl: typeof value.serverUrl === 'string' ? value.serverUrl : undefined + serverUrl: typeof value.serverUrl === 'string' ? value.serverUrl : undefined, + ...readSensitiveFields(value) }; } return servers; @@ -288,13 +302,27 @@ async function collectMcpSampleConfigPaths(root, relativeDir, paths) { } })); } +// Some MCP config schemas expose servers under more than one key — Cursor +// and VS Code both accept `mcpServers` and `servers`. The previous +// first-recognized-map-wins logic let an empty `mcpServers: {}` shadow a +// populated `servers: {}`, so every server under the second key was +// invisible to the diff. Merge all recognized maps instead; on a name +// collision the earlier key in `serverKeys` wins, since that order encodes +// the schema's documented precedence. function readServerMap(json, serverKeys) { + const merged = {}; for (const key of serverKeys) { - if (isRecord(json[key])) { - return json[key]; + const map = json[key]; + if (!isRecord(map)) { + continue; + } + for (const [name, value] of Object.entries(map)) { + if (!(name in merged)) { + merged[name] = value; + } } } - return undefined; + return merged; } function severityForSampleCommandRisk(server) { return isPipeToShellCommand(server) ? 'high' : 'medium'; diff --git a/dist/mcp-risk.js b/dist/mcp-risk.js index 06969d0..d602e5b 100644 --- a/dist/mcp-risk.js +++ b/dist/mcp-risk.js @@ -109,3 +109,75 @@ export function isUnencryptedEndpoint(value) { return false; } } +const SECRET_NAME_SUBSTRINGS = [ + 'secret', 'token', 'password', 'passwd', 'credential', + 'authorization', 'bearer', 'cookie', 'session', + 'apikey', 'api_key', 'api-key', + 'access_key', 'access-key', 'accesskey', + 'private_key', 'private-key' +]; +// True for names that carry a credential: explicit terms above, plus any name +// that *ends* in a `key` token (`API_KEY`, `x-key`, bare `key`) — but not an +// incidental substring like "monkey". +export function looksLikeSecretName(name) { + const lower = name.toLowerCase(); + return SECRET_NAME_SUBSTRINGS.some((substring) => lower.includes(substring)) || /(^|[_-])key$/.test(lower); +} +export function readSensitiveFields(value) { + const fields = {}; + if (isPlainRecord(value.env)) { + fields.env = value.env; + } + if (isPlainRecord(value.headers)) { + fields.headers = value.headers; + } + if (typeof value.cwd === 'string') { + fields.cwd = value.cwd; + } + return fields; +} +// Only additions and value changes count — removing an env var or header is a +// narrowing, not a widening, and the tool flags widening only. A cwd change in +// either direction is reported (it redirects what the server can reach). +export function sensitiveFieldChanges(oldFields, newFields) { + const changes = []; + const envKeys = changedRecordKeys(oldFields.env, newFields.env); + if (envKeys.length > 0) { + changes.push({ field: 'env', keys: envKeys, secretLike: envKeys.some(looksLikeSecretName) }); + } + const headerKeys = changedRecordKeys(oldFields.headers, newFields.headers); + if (headerKeys.length > 0) { + changes.push({ field: 'headers', keys: headerKeys, secretLike: headerKeys.some(looksLikeSecretName) }); + } + if (newFields.cwd !== undefined && newFields.cwd !== oldFields.cwd) { + changes.push({ field: 'cwd', keys: [], secretLike: false }); + } + return changes; +} +export function describeSensitiveFieldChange(serverName, change) { + if (change.field === 'cwd') { + return `MCP server "${serverName}" changed its working directory (cwd).`; + } + const label = change.field === 'env' ? 'environment variable(s)' : 'request header(s)'; + return `MCP server "${serverName}" added or changed ${label}: ${change.keys.join(', ')}.`; +} +export function recommendationForSensitiveFieldChange(change) { + if (change.field === 'cwd') { + return 'Confirm the working-directory change does not point the server at unexpected files.'; + } + return 'Confirm these credentials/headers are intended — an existing server gaining secret-bearing env or auth headers is a permission change even when the launch command is unchanged.'; +} +function changedRecordKeys(oldRecord, newRecord) { + const previous = oldRecord ?? {}; + const next = newRecord ?? {}; + const changed = []; + for (const [key, value] of Object.entries(next)) { + if (!(key in previous) || String(previous[key]) !== String(value)) { + changed.push(key); + } + } + return changed; +} +function isPlainRecord(value) { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} diff --git a/package-lock.json b/package-lock.json index 46d41ca..14ce766 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "0.3.1", "license": "MIT", "dependencies": { - "agent-gov-core": "^1.3.0" + "agent-gov-core": "1.3.0" }, "bin": { "scopetrail": "dist/index.js" diff --git a/package.json b/package.json index 45560c8..bf8e2d8 100644 --- a/package.json +++ b/package.json @@ -23,7 +23,7 @@ "benchmark": "node benchmark/run-benchmark.mjs" }, "dependencies": { - "agent-gov-core": "^1.3.0" + "agent-gov-core": "1.3.0" }, "devDependencies": { "@types/node": "^24.0.0", diff --git a/src/detectors/claude-settings.ts b/src/detectors/claude-settings.ts index c649663..05723d0 100644 --- a/src/detectors/claude-settings.ts +++ b/src/detectors/claude-settings.ts @@ -79,7 +79,7 @@ export async function detectClaudeSettingsDrift(oldRoot: string, newRoot: string file: CLAUDE_SETTINGS_FILE, subject: hookName, message: hookCommandChangeMessage(hookName, added, removed), - recommendation: 'Review the change — a removed guard, a no-op appended next to a strict one, or any rewrite of a hook command can all weaken policy.' + recommendation: 'Review the change — a removed guard, a no-op appended next to a strict one, a matcher rebound to a different tool, or any rewrite of a hook command can all weaken policy.' }); } } @@ -128,8 +128,13 @@ async function readClaudeSettings(root: string): Promise { // { "PreToolUse": [{ "matcher": "Bash", "hooks": [{ "type": "command", // "command": "/path/guard.sh" }] }] } // -// We collect every command string per hook name so a PR can be checked -// for both presence and content changes. +// The matcher is part of a hook's enforcement identity: a guard bound to +// `Bash` that is rebound to `Read` keeps the same command but stops guarding +// Bash — a real change in enforcement surface. So we key each entry by a +// (matcher, type, command) signature rather than the command string alone; a +// matcher/type rebinding then surfaces as a removed entry plus an added entry, +// exactly as a command swap does. Reordering an identical set still produces +// nothing, because set membership is order-independent. function readHookCommands(hooks: Record): Map> { const result = new Map>(); @@ -138,32 +143,47 @@ function readHookCommands(hooks: Record): Map(); + const signatures = new Set(); const entries = Array.isArray(value) ? value : Object.values(value as Record); for (const entry of entries) { if (!isRecord(entry)) { continue; } + const matcher = typeof entry.matcher === 'string' ? entry.matcher : ''; const innerList = entry.hooks; if (Array.isArray(innerList)) { for (const inner of innerList) { if (isRecord(inner) && typeof inner.command === 'string') { - commands.add(inner.command); + signatures.add(hookEntrySignature(matcher, typeof inner.type === 'string' ? inner.type : '', inner.command)); } } } if (typeof entry.command === 'string') { - commands.add(entry.command); + signatures.add(hookEntrySignature(matcher, typeof entry.type === 'string' ? entry.type : '', entry.command)); } } - result.set(name, commands); + result.set(name, signatures); } return result; } +// Stable, human-readable identity for a hook entry. The command leads so the +// added/removed message stays scannable (`added: /noop.sh [matcher=Bash, ...]`); +// the matcher/type qualifiers are what make a rebinding visible in the diff. +function hookEntrySignature(matcher: string, type: string, command: string): string { + const qualifiers: string[] = []; + if (matcher) { + qualifiers.push(`matcher=${matcher}`); + } + if (type) { + qualifiers.push(`type=${type}`); + } + return qualifiers.length > 0 ? `${command} [${qualifiers.join(', ')}]` : command; +} + function readStringArray(value: unknown): string[] { return Array.isArray(value) ? value.filter((entry): entry is string => typeof entry === 'string') : []; } @@ -272,13 +292,24 @@ function severityForAllow(permission: string): Severity { return 'medium'; } +// Removing a deny rule that guards secrets or keys is critical — the PR is +// re-opening agent access to material a reviewer almost never wants read or +// exfiltrated. The previous set only caught .env/secret/credential/.pem, so +// removing a deny on SSH keys, cloud credentials, package-registry tokens, or +// kube configs silently downgraded to medium. Substring matching is +// intentional so glob forms (`Read(~/.ssh/**)`, `Read(*.key)`) still match. +const CRITICAL_DENY_PATTERNS = [ + '.env', 'secret', 'credential', '.pem', + '.ssh', 'id_rsa', 'id_dsa', 'id_ecdsa', 'id_ed25519', '.key', + '.npmrc', '.pypirc', '.netrc', + 'kubeconfig', '.kube', + '.aws', '.gcp', '.azure', + '.p12', '.pfx' +] as const; + function severityForRemovedDeny(permission: string): Severity { const normalized = permission.toLowerCase(); - if (normalized.includes('.env') || normalized.includes('secret') || normalized.includes('credential') || normalized.includes('.pem')) { - return 'critical'; - } - - return 'medium'; + return CRITICAL_DENY_PATTERNS.some((pattern) => normalized.includes(pattern)) ? 'critical' : 'medium'; } function isHighImpactHook(hookName: string): boolean { @@ -293,5 +324,5 @@ function hookCommandChangeMessage(hookName: string, added: string[], removed: st if (removed.length > 0) { parts.push(`removed: ${removed.join(', ')}`); } - return `Claude hook "${hookName}" command(s) changed (${parts.join('; ')}).`; + return `Claude hook "${hookName}" entries changed (${parts.join('; ')}).`; } diff --git a/src/detectors/codex-config.ts b/src/detectors/codex-config.ts index 35be1a2..5f53fab 100644 --- a/src/detectors/codex-config.ts +++ b/src/detectors/codex-config.ts @@ -6,7 +6,12 @@ import { serverCommand, remoteEndpoint, isUnencryptedEndpoint, - type McpCommandShape + readSensitiveFields, + sensitiveFieldChanges, + describeSensitiveFieldChange, + recommendationForSensitiveFieldChange, + type McpCommandShape, + type McpSensitiveFields } from '../mcp-risk.js'; import type { Finding } from '../types.js'; @@ -18,7 +23,7 @@ interface TomlEntry { value: string; } -interface CodexMcpServer extends McpCommandShape { +interface CodexMcpServer extends McpCommandShape, McpSensitiveFields { text: string; name: string; } @@ -235,6 +240,23 @@ async function detectCodexMcpDrift(oldRoot: string, newRoot: string): Promise = {}; for (const [name, value] of Object.entries(rawServers)) { @@ -261,7 +279,8 @@ async function readMcpServers( command: typeof value.command === 'string' ? value.command : undefined, args: Array.isArray(value.args) ? value.args.filter((arg): arg is string => typeof arg === 'string') : undefined, url: typeof value.url === 'string' ? value.url : undefined, - serverUrl: typeof value.serverUrl === 'string' ? value.serverUrl : undefined + serverUrl: typeof value.serverUrl === 'string' ? value.serverUrl : undefined, + ...readSensitiveFields(value) }; } @@ -341,14 +360,28 @@ async function collectMcpSampleConfigPaths(root: string, relativeDir: string, pa ); } -function readServerMap(json: Record, serverKeys: readonly string[]): unknown { +// Some MCP config schemas expose servers under more than one key — Cursor +// and VS Code both accept `mcpServers` and `servers`. The previous +// first-recognized-map-wins logic let an empty `mcpServers: {}` shadow a +// populated `servers: {}`, so every server under the second key was +// invisible to the diff. Merge all recognized maps instead; on a name +// collision the earlier key in `serverKeys` wins, since that order encodes +// the schema's documented precedence. +function readServerMap(json: Record, serverKeys: readonly string[]): Record { + const merged: Record = {}; for (const key of serverKeys) { - if (isRecord(json[key])) { - return json[key]; + const map = json[key]; + if (!isRecord(map)) { + continue; + } + for (const [name, value] of Object.entries(map)) { + if (!(name in merged)) { + merged[name] = value; + } } } - return undefined; + return merged; } function severityForSampleCommandRisk(server: McpServerModel): Severity { diff --git a/src/mcp-risk.ts b/src/mcp-risk.ts index 4ee5782..4f01463 100644 --- a/src/mcp-risk.ts +++ b/src/mcp-risk.ts @@ -134,3 +134,111 @@ export function isUnencryptedEndpoint(value: string): boolean { } } +// Beyond the launch command, an MCP server config carries fields that grant +// it ambient capability: `env` (often secrets like API keys), `headers` (auth +// tokens on remote transports), and `cwd` (what the server can reach on disk). +// serverCommand() deliberately ignores these, so a server that keeps the same +// command but *gains* an `env` secret or an `Authorization` header would +// otherwise produce no finding. This model captures them for diffing. +export interface McpSensitiveFields { + env?: Record; + headers?: Record; + cwd?: string; +} + +export interface SensitiveFieldChange { + field: 'env' | 'headers' | 'cwd'; + keys: string[]; // added/changed keys (empty for cwd) + secretLike: boolean; // a changed key name looks like it carries a secret +} + +const SECRET_NAME_SUBSTRINGS = [ + 'secret', 'token', 'password', 'passwd', 'credential', + 'authorization', 'bearer', 'cookie', 'session', + 'apikey', 'api_key', 'api-key', + 'access_key', 'access-key', 'accesskey', + 'private_key', 'private-key' +]; + +// True for names that carry a credential: explicit terms above, plus any name +// that *ends* in a `key` token (`API_KEY`, `x-key`, bare `key`) — but not an +// incidental substring like "monkey". +export function looksLikeSecretName(name: string): boolean { + const lower = name.toLowerCase(); + return SECRET_NAME_SUBSTRINGS.some((substring) => lower.includes(substring)) || /(^|[_-])key$/.test(lower); +} + +export function readSensitiveFields(value: Record): McpSensitiveFields { + const fields: McpSensitiveFields = {}; + if (isPlainRecord(value.env)) { + fields.env = value.env; + } + if (isPlainRecord(value.headers)) { + fields.headers = value.headers; + } + if (typeof value.cwd === 'string') { + fields.cwd = value.cwd; + } + return fields; +} + +// Only additions and value changes count — removing an env var or header is a +// narrowing, not a widening, and the tool flags widening only. A cwd change in +// either direction is reported (it redirects what the server can reach). +export function sensitiveFieldChanges( + oldFields: McpSensitiveFields, + newFields: McpSensitiveFields +): SensitiveFieldChange[] { + const changes: SensitiveFieldChange[] = []; + + const envKeys = changedRecordKeys(oldFields.env, newFields.env); + if (envKeys.length > 0) { + changes.push({ field: 'env', keys: envKeys, secretLike: envKeys.some(looksLikeSecretName) }); + } + + const headerKeys = changedRecordKeys(oldFields.headers, newFields.headers); + if (headerKeys.length > 0) { + changes.push({ field: 'headers', keys: headerKeys, secretLike: headerKeys.some(looksLikeSecretName) }); + } + + if (newFields.cwd !== undefined && newFields.cwd !== oldFields.cwd) { + changes.push({ field: 'cwd', keys: [], secretLike: false }); + } + + return changes; +} + +export function describeSensitiveFieldChange(serverName: string, change: SensitiveFieldChange): string { + if (change.field === 'cwd') { + return `MCP server "${serverName}" changed its working directory (cwd).`; + } + const label = change.field === 'env' ? 'environment variable(s)' : 'request header(s)'; + return `MCP server "${serverName}" added or changed ${label}: ${change.keys.join(', ')}.`; +} + +export function recommendationForSensitiveFieldChange(change: SensitiveFieldChange): string { + if (change.field === 'cwd') { + return 'Confirm the working-directory change does not point the server at unexpected files.'; + } + return 'Confirm these credentials/headers are intended — an existing server gaining secret-bearing env or auth headers is a permission change even when the launch command is unchanged.'; +} + +function changedRecordKeys( + oldRecord: Record | undefined, + newRecord: Record | undefined +): string[] { + const previous = oldRecord ?? {}; + const next = newRecord ?? {}; + const changed: string[] = []; + for (const [key, value] of Object.entries(next)) { + if (!(key in previous) || String(previous[key]) !== String(value)) { + changed.push(key); + } + } + return changed; +} + +function isPlainRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + diff --git a/src/types.ts b/src/types.ts index 7e91098..765c35b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -15,4 +15,7 @@ export interface McpServerConfig { args?: string[]; url?: string; serverUrl?: string; + env?: Record; + headers?: Record; + cwd?: string; } diff --git a/test/codex-mcp-drift.test.mjs b/test/codex-mcp-drift.test.mjs index 1b41a60..def0f4c 100644 --- a/test/codex-mcp-drift.test.mjs +++ b/test/codex-mcp-drift.test.mjs @@ -82,6 +82,41 @@ test('codex_mcp_server_command_changed fires when an existing server changes its } }); +test('codex_mcp_sensitive_field_changed: an existing server gaining a secret env var is flagged', async () => { + // Codex [mcp_servers.NAME] tables carry the same env/headers/cwd capability + // as .mcp.json. A server keeping its command but gaining a secret env var + // must surface even though serverCommand() is unchanged. + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); + const { tmpdir } = await import('node:os'); + + const root = mkdtempSync(join(tmpdir(), 'scopetrail-codex-env-')); + try { + const oldDir = join(root, 'old'); + const newDir = join(root, 'new'); + mkdirSync(join(oldDir, '.codex'), { recursive: true }); + mkdirSync(join(newDir, '.codex'), { recursive: true }); + + writeFileSync( + join(oldDir, '.codex', 'config.toml'), + '[mcp_servers.helper]\ncommand = "npx"\nargs = ["-y", "@vendor/helper-mcp@1.2.3"]\n' + ); + writeFileSync( + join(newDir, '.codex', 'config.toml'), + '[mcp_servers.helper]\ncommand = "npx"\nargs = ["-y", "@vendor/helper-mcp@1.2.3"]\nenv = { STRIPE_SECRET_KEY = "${STRIPE_SECRET_KEY}" }\n' + ); + + const findings = await detectCodexConfigDrift(oldDir, newDir); + const sensitive = findings.filter((f) => f.kind === 'scope_trail.codex_mcp_sensitive_field_changed'); + assert.equal(sensitive.length, 1); + assert.equal(sensitive[0].subject, 'helper'); + assert.equal(sensitive[0].severity, 'high'); + assert.match(sensitive[0].message, /STRIPE_SECRET_KEY/); + assert.equal(findings.some((f) => f.kind === 'scope_trail.codex_mcp_server_command_changed'), false); + } finally { + rmSync(root, { recursive: true, force: true }); + } +}); + test('codex_mcp_remote_endpoint: http:// fires critical severity, https:// fires high', async () => { const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); const { tmpdir } = await import('node:os'); diff --git a/test/heuristics-and-snapshot.test.mjs b/test/heuristics-and-snapshot.test.mjs index 781fbca..c3fda95 100644 --- a/test/heuristics-and-snapshot.test.mjs +++ b/test/heuristics-and-snapshot.test.mjs @@ -116,6 +116,32 @@ test('Claude detector: bare Bash/Write/Edit get high severity (not medium)', asy } }); +test('Claude detector: removing a deny on SSH keys / cloud creds / registry tokens is critical', async () => { + // Pre-fix gap: only .env/secret/credential/.pem escalated to critical, so + // removing a deny on ~/.ssh, *.key, .npmrc, or kubeconfig downgraded the + // re-opened secret access to a mere medium. + const dir = await makeClaudeFixture( + { + permissions: { + allow: [], + deny: ['Read(~/.ssh/**)', 'Read(*.key)', 'Read(.npmrc)', 'Read(kubeconfig)', 'Read(.aws/credentials)'] + }, + hooks: {} + }, + { permissions: { allow: [], deny: [] }, hooks: {} } + ); + try { + const findings = await detectClaudeSettingsDrift(dir.oldRoot, dir.newRoot); + const removed = findings.filter((f) => f.kind === 'scope_trail.permission_deny_removed'); + assert.equal(removed.length, 5); + for (const f of removed) { + assert.equal(f.severity, 'critical', `removed deny ${f.subject} should be critical`); + } + } finally { + await rm(dir.root, { recursive: true, force: true }); + } +}); + test('isBroadAllow: narrowly-scoped Bash/Read/Write/Edit stay narrow', () => { // The bare-verb fix must not over-fire on legitimate narrow scopes. assert.equal(isBroadAllow('Bash(npm test)'), false); @@ -238,18 +264,23 @@ test('Claude detector: hook_command_changed fires when one guard is removed from } }); -test('Claude detector: hook with unchanged command set produces no hook_command_changed finding', async () => { - // Regression guard for the symmetric-difference logic — same set of - // commands in a different order or wrapped under a different matcher - // should not generate noise. +test('Claude detector: reordered but identical hook entries produce no hook_command_changed finding', async () => { + // Regression guard for the symmetric-difference logic — the same set of + // (matcher, type, command) entries in a different array order is not drift. const dir = await makeClaudeFixture( { permissions: { allow: [], deny: [] }, - hooks: { PreToolUse: [{ matcher: 'Bash', hooks: [{ type: 'command', command: '/guard.sh' }] }] } + hooks: { PreToolUse: [ + { matcher: 'Bash', hooks: [{ type: 'command', command: '/guard.sh' }] }, + { matcher: 'Edit', hooks: [{ type: 'command', command: '/audit.sh' }] } + ] } }, { permissions: { allow: [], deny: [] }, - hooks: { PreToolUse: [{ matcher: 'Edit', hooks: [{ type: 'command', command: '/guard.sh' }] }] } + hooks: { PreToolUse: [ + { matcher: 'Edit', hooks: [{ type: 'command', command: '/audit.sh' }] }, + { matcher: 'Bash', hooks: [{ type: 'command', command: '/guard.sh' }] } + ] } } ); try { @@ -260,6 +291,35 @@ test('Claude detector: hook with unchanged command set produces no hook_command_ } }); +test('Claude detector: hook_command_changed fires when a matcher rebinds the same guard to a different tool', async () => { + // A PreToolUse guard bound to `Bash` that is rebound to `Read` keeps the + // same command but stops guarding Bash — a real change in enforcement + // surface, not noise. Because the entry identity includes the matcher, the + // rebinding surfaces as a removed Bash-bound entry plus an added Read-bound + // one. (This reverses the earlier "matcher change is noise" behavior.) + const dir = await makeClaudeFixture( + { + permissions: { allow: [], deny: [] }, + hooks: { PreToolUse: [{ matcher: 'Bash', hooks: [{ type: 'command', command: '/guard.sh' }] }] } + }, + { + permissions: { allow: [], deny: [] }, + hooks: { PreToolUse: [{ matcher: 'Read', hooks: [{ type: 'command', command: '/guard.sh' }] }] } + } + ); + try { + const findings = await detectClaudeSettingsDrift(dir.oldRoot, dir.newRoot); + const changed = findings.find((f) => f.kind === 'scope_trail.hook_command_changed'); + assert.ok(changed, 'matcher rebinding (Bash -> Read) must be flagged'); + assert.equal(changed.subject, 'PreToolUse'); + assert.equal(changed.severity, 'high'); // PreToolUse is high-impact + assert.match(changed.message, /matcher=Bash/); + assert.match(changed.message, /matcher=Read/); + } finally { + await rm(dir.root, { recursive: true, force: true }); + } +}); + test('Claude detector: scoped MCP grant does not trip the broad-allow finding', async () => { const dir = await makeClaudeFixture( { permissions: { allow: [], deny: [] }, hooks: {} }, diff --git a/test/mcp-drift.test.mjs b/test/mcp-drift.test.mjs index c65b0b2..370af4f 100644 --- a/test/mcp-drift.test.mjs +++ b/test/mcp-drift.test.mjs @@ -326,6 +326,45 @@ test('detects MCP drift in Cursor and VS Code config files', async () => { ); }); +test('merges all recognized server maps — an empty mcpServers must not shadow a populated servers', async () => { + // Pre-fix gap: readServerMap returned the FIRST recognized key only. + // For .cursor/mcp.json (serverKeys: mcpServers, servers) an empty + // `mcpServers: {}` shadowed a populated `servers: {}`, so every server + // declared under `servers` was invisible to the diff. + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); + const { tmpdir } = await import('node:os'); + + const root = mkdtempSync(join(tmpdir(), 'scopetrail-merge-maps-')); + try { + const oldDir = join(root, 'old'); + const newDir = join(root, 'new'); + mkdirSync(join(oldDir, '.cursor'), { recursive: true }); + mkdirSync(join(newDir, '.cursor'), { recursive: true }); + writeFileSync(join(oldDir, '.cursor', 'mcp.json'), JSON.stringify({ mcpServers: {}, servers: {} })); + writeFileSync( + join(newDir, '.cursor', 'mcp.json'), + JSON.stringify({ + mcpServers: {}, + servers: { + 'new-risky-server': { command: 'npx', args: ['@bad/server@latest'] } + } + }, null, 2) + ); + + const findings = await detectMcpDrift(oldDir, newDir); + assert.ok( + findings.some((f) => f.kind === 'scope_trail.mcp_server_added' && f.subject === 'new-risky-server'), + 'server under `servers` must be detected even when an empty `mcpServers` exists' + ); + assert.ok( + findings.some((f) => f.kind === 'scope_trail.unpinned_mcp_command' && f.subject === 'new-risky-server'), + 'unpinned command under `servers` must also be flagged' + ); + } finally { + rmSync(root, { recursive: true, force: true }); + } +}); + test('detects MCP drift in Windsurf config files', async () => { const oldDir = join(testDir, 'fixtures', 'windsurf-mcp', 'old'); const newDir = join(testDir, 'fixtures', 'windsurf-mcp', 'new'); @@ -418,6 +457,66 @@ test('detects prefixed MCP config example drift without treating it as active se ); }); +test('mcp_server_sensitive_field_changed: an existing server gaining env/headers/cwd is flagged; removals are not', async () => { + // serverCommand() ignores env/headers/cwd, so a server keeping the same + // launch command but gaining a secret env var, an auth header, or a + // redirected cwd produced no finding. Now flagged — secret-bearing keys + // escalate to high. Removing a key is a narrowing and stays silent. + const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); + const { tmpdir } = await import('node:os'); + + const root = mkdtempSync(join(tmpdir(), 'scopetrail-sensitive-')); + try { + const oldDir = join(root, 'old'); + const newDir = join(root, 'new'); + mkdirSync(oldDir, { recursive: true }); + mkdirSync(newDir, { recursive: true }); + writeFileSync( + join(oldDir, '.mcp.json'), + JSON.stringify({ + mcpServers: { + stripe: { command: 'npx', args: ['-y', '@vendor/stripe-mcp@1.2.3'], env: { LOG_LEVEL: 'info' } }, + docs: { command: 'npx', args: ['-y', '@vendor/docs@1.0.0'] } + } + }) + ); + writeFileSync( + join(newDir, '.mcp.json'), + JSON.stringify({ + mcpServers: { + // Same command: gains a secret env var + a benign one; LOG_LEVEL removed. + stripe: { command: 'npx', args: ['-y', '@vendor/stripe-mcp@1.2.3'], env: { STRIPE_SECRET_KEY: '${STRIPE_SECRET_KEY}', REGION: 'us' } }, + // Same command: gains an Authorization header and a redirected cwd. + docs: { command: 'npx', args: ['-y', '@vendor/docs@1.0.0'], headers: { Authorization: 'Bearer x' }, cwd: '/etc' } + } + }, null, 2) + ); + + const findings = await detectMcpDrift(oldDir, newDir); + const sensitive = findings.filter((f) => f.kind === 'scope_trail.mcp_server_sensitive_field_changed'); + + const stripeEnv = sensitive.find((f) => f.subject === 'stripe' && /environment variable/.test(f.message)); + assert.ok(stripeEnv, 'expected env change finding for stripe'); + assert.equal(stripeEnv.severity, 'high', 'STRIPE_SECRET_KEY is secret-like -> high'); + assert.match(stripeEnv.message, /STRIPE_SECRET_KEY/); + assert.doesNotMatch(stripeEnv.message, /LOG_LEVEL/, 'a removed env var must not be reported'); + + const docsHeader = sensitive.find((f) => f.subject === 'docs' && /header/.test(f.message)); + assert.ok(docsHeader, 'expected header change finding for docs'); + assert.equal(docsHeader.severity, 'high', 'Authorization header is secret-like -> high'); + + const docsCwd = sensitive.find((f) => f.subject === 'docs' && /working directory/.test(f.message)); + assert.ok(docsCwd, 'expected cwd change finding for docs'); + assert.equal(docsCwd.severity, 'medium'); + + // No added/command-changed noise — both servers existed and kept their command. + assert.equal(findings.some((f) => f.kind === 'scope_trail.mcp_server_added'), false); + assert.equal(findings.some((f) => f.kind === 'scope_trail.mcp_server_command_changed'), false); + } finally { + rmSync(root, { recursive: true, force: true }); + } +}); + test('mcp_remote_endpoint: http:// fires critical severity, https:// fires high', async () => { const { mkdtempSync, writeFileSync, mkdirSync, rmSync } = await import('node:fs'); const { tmpdir } = await import('node:os'); From fd549d3026ce5efe33512941b0892babf8d6c82b Mon Sep 17 00:00:00 2001 From: Conal <33135619+Conalh@users.noreply.github.com> Date: Fri, 29 May 2026 10:25:06 -0700 Subject: [PATCH 3/3] fix: clean up base snapshot when head materialization fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runDiff materialized the base snapshot, then the head snapshot, and only assigned `cleanup` after both succeeded. If head materialization threw — an unresolvable head ref (a shallow checkout missing it), or a max-buffer error on an oversized config — the base snapshot's temp dir was already on disk and leaked, since cleanup was never wired up. Clean the base snapshot explicitly before the error propagates. Found during a fresh correctness pass over the codebase (not part of the external review). The existing unresolvable-ref test fails on the *base* ref, before any dir exists, so it never exercised this path. Adds a regression test that asserts no scopetrail-snapshot temp dir survives a head-side failure. Co-Authored-By: Claude Opus 4.8 --- dist/index.js | 9 ++++++++- src/index.ts | 11 ++++++++++- test/git-diff.test.mjs | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/dist/index.js b/dist/index.js index ac4315a..6ec1037 100644 --- a/dist/index.js +++ b/dist/index.js @@ -34,7 +34,14 @@ async function runDiff(argv) { else { try { const baseSnapshot = await materializeGitSnapshot(parsed.repo, parsed.base); - const headSnapshot = await materializeGitSnapshot(parsed.repo, parsed.head); + // `cleanup` is only assigned once BOTH snapshots exist, so if head + // materialization fails (an unresolvable head ref, a max-buffer error) + // the base snapshot's temp dir would leak. Clean it explicitly before + // the error propagates to the handler below. + const headSnapshot = await materializeGitSnapshot(parsed.repo, parsed.head).catch(async (headError) => { + await baseSnapshot.cleanup(); + throw headError; + }); oldRoot = baseSnapshot.root; newRoot = headSnapshot.root; cleanup = async () => { diff --git a/src/index.ts b/src/index.ts index 98a47c2..66bd2ed 100644 --- a/src/index.ts +++ b/src/index.ts @@ -47,7 +47,16 @@ async function runDiff(argv: string[]): Promise { } else { try { const baseSnapshot = await materializeGitSnapshot(parsed.repo, parsed.base); - const headSnapshot = await materializeGitSnapshot(parsed.repo, parsed.head); + // `cleanup` is only assigned once BOTH snapshots exist, so if head + // materialization fails (an unresolvable head ref, a max-buffer error) + // the base snapshot's temp dir would leak. Clean it explicitly before + // the error propagates to the handler below. + const headSnapshot = await materializeGitSnapshot(parsed.repo, parsed.head).catch( + async (headError: unknown) => { + await baseSnapshot.cleanup(); + throw headError; + } + ); oldRoot = baseSnapshot.root; newRoot = headSnapshot.root; cleanup = async () => { diff --git a/test/git-diff.test.mjs b/test/git-diff.test.mjs index ba38d17..0f3aae6 100644 --- a/test/git-diff.test.mjs +++ b/test/git-diff.test.mjs @@ -207,6 +207,47 @@ test('CLI surfaces a friendly error when a git ref cannot be resolved', async () } }); +test('CLI cleans up the base snapshot when head materialization fails (no temp-dir leak)', async () => { + // Pre-fix gap: runDiff materialized base, then head, and only assigned + // `cleanup` after BOTH succeeded. If head failed (e.g. an unresolvable + // head ref — a shallow checkout missing it), the base snapshot's temp dir + // was already on disk and leaked. The existing "does-not-exist" test fails + // on the *base* ref, before any dir exists, so it never caught this. + const { readdir } = await import('node:fs/promises'); + const { tmpdir } = await import('node:os'); + + const snapshotDirs = async () => + new Set((await readdir(tmpdir())).filter((name) => name.startsWith('scopetrail-snapshot-'))); + + const fx = await makeGitRepo({ + prefix: 'scopetrail-git-leak-', + initialFiles: { '.mcp.json': '{"mcpServers":{}}\n' }, + initialMessage: 'base', + }); + try { + const before = await snapshotDirs(); + + let exitCode = 0; + try { + await execFileAsync( + process.execPath, + ['dist/index.js', 'diff', '--repo', fx.repo, '--base', 'HEAD', '--head', 'does-not-exist', '--format', 'json'], + { cwd: packageRoot } + ); + } catch (error) { + exitCode = error.code ?? 0; + } + assert.equal(exitCode, 2, 'unresolvable head ref should exit 2'); + + // Only count dirs created by this run that survived it — robust to any + // snapshot dirs a sibling test left mid-flight. + const leaked = [...(await snapshotDirs())].filter((name) => !before.has(name)); + assert.deepEqual(leaked, [], 'base snapshot temp dir must not leak when head materialization fails'); + } finally { + await fx.cleanup(); + } +}); + test('CLI rejects git refs that could be parsed as git CLI flags', async () => { // Hardening: `execFile` blocks shell injection, but `git` re-parses // each positional arg against its own option table. A ref like