From 523030549fca064904982b0b4431c8a04750c6a7 Mon Sep 17 00:00:00 2001 From: Chris Lee Date: Sat, 20 Jun 2026 22:25:18 +1000 Subject: [PATCH] fix(daemon): sweep full workspace triple on startup and crash exit The daemon orphaned three per-job paths under CLONE_BASE_DIR on SIGKILL (OOM/eviction/kubelet kill): the clone dir, the token-bearing ${workDir}.cred.sh, and ${workDir}-artifacts. registerExitCleanup and handleJobCancel removed only the first two even on graceful exit, and the daemon had no startup sweep (the only reaper lived in app.ts, ran in the wrong process, and matched *.cred.sh only). Add src/core/workspace-sweep.ts: removeWorkspaceTripleSync (sync, for exit/cancel handlers) and sweepStaleWorkspaces (startup TTL reaper, emits a workspace.sweep log). Wire into both job-executor cleanup sites, daemon main() startup, and app.ts (replacing its divergent cred-only sweep). New WORKSPACE_STALE_TTL_MS env (default 1h). Closes #221 Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_015v2bspPF1ZTTG9ZZrbBgA1 --- docs/operate/configuration.md | 3 +- docs/operate/observability.md | 8 ++ docs/operate/runbooks/daemon-fleet.md | 2 + src/app.ts | 37 ++----- src/config.ts | 10 ++ src/core/workspace-sweep.ts | 108 ++++++++++++++++++ src/daemon/job-executor.ts | 32 ++---- src/daemon/main.ts | 6 + test/core/workspace-sweep.test.ts | 153 ++++++++++++++++++++++++++ 9 files changed, 303 insertions(+), 56 deletions(-) create mode 100644 src/core/workspace-sweep.ts create mode 100644 test/core/workspace-sweep.test.ts diff --git a/docs/operate/configuration.md b/docs/operate/configuration.md index 764a2134..f0aad970 100644 --- a/docs/operate/configuration.md +++ b/docs/operate/configuration.md @@ -52,6 +52,7 @@ Server mode only. If `ORCHESTRATOR_URL` is set, the process runs in daemon mode | `CLAUDE_CODE_PATH` | resolved from `node_modules` | Absolute path to the Claude Code CLI `cli.js`. | | `CLONE_BASE_DIR` | `/tmp/bot-workspaces` | Parent directory for per-delivery clones. | | `CLONE_DEPTH` | `50` | Shallow-clone depth. Increase for deeply-diverged PRs. | +| `WORKSPACE_STALE_TTL_MS` | `3600000` | TTL before an orphaned per-job workspace triple (clone dir + `.cred.sh` + `-artifacts`) under `CLONE_BASE_DIR` is swept at startup. Reclaims SIGKILL/OOM/eviction orphans. Lower only if you understand the risk. | | `CONTEXT7_API_KEY` | unset | Lifts Context7 MCP rate limiting. No other effect. | ## Postgres @@ -189,7 +190,7 @@ before committing: ## Prompt cache layout -Selects the system/user prompt split the agent executor passes to the Claude Agent SDK. See `src/config.ts:573#promptCacheLayout` for the Zod definition and `src/core/executor.ts:208` for the runtime guard. +Selects the system/user prompt split the agent executor passes to the Claude Agent SDK. See `src/config.ts:582#promptCacheLayout` for the Zod definition and `src/core/executor.ts:208` for the runtime guard. | Variable | Default | Notes | | --------------------- | -------- | ------------------------------------------------------------------------------------------------------------ | diff --git a/docs/operate/observability.md b/docs/operate/observability.md index 3216a06d..90539a34 100644 --- a/docs/operate/observability.md +++ b/docs/operate/observability.md @@ -68,6 +68,14 @@ cache_read_input_tokens / (input_tokens + cache_read_input_tokens + cache_creati (The ratio is undefined when the denominator is zero, e.g. a dry-run that never called the model; guard against that in the query.) A high ratio on the second+ run of the same prompt shape confirms `PROMPT_CACHE_LAYOUT=cacheable` is working; a sudden drop in the per-installation cache-read share is the signature of a prompt-cache stability regression. Alert on `sum(cache_read_input_tokens) / sum(input_tokens + cache_read_input_tokens + cache_creation_input_tokens)` falling below its established baseline. +### Workspace sweep + +`sweepStaleWorkspaces` (`src/core/workspace-sweep.ts`) emits one line per startup sweep when the daemon and the webhook server reclaim stale per-job workspace triples under `CLONE_BASE_DIR` (orphans left by SIGKILL/OOM/eviction). A `swept` count climbing across restarts means jobs are being killed mid-run before their own cleanup runs. + +| `event` | Level | Fields | +| ----------------- | ----- | ------------------------------------------------------------------------------------------------------------ | +| `workspace.sweep` | info | `swept` (entries removed), `retained` (entries kept as fresh), `durationMs` (wall-clock time for the sweep). | + ## GitHub API rate-limit fields The `App` is constructed with `ObservableOctokit` (`src/utils/octokit-observability.ts`), an `Octokit.plugin` subclass shared by `app.octokit` and every installation octokit. It logs GitHub's per-installation rate-limit headers via `octokit.hook.after` / `hook.error`. The `pipeline.stage`-style strict Zod schema (`GithubApiLogFieldsSchema`) pins the field shape. diff --git a/docs/operate/runbooks/daemon-fleet.md b/docs/operate/runbooks/daemon-fleet.md index e581f03a..68d17c88 100644 --- a/docs/operate/runbooks/daemon-fleet.md +++ b/docs/operate/runbooks/daemon-fleet.md @@ -46,6 +46,8 @@ flowchart LR classDef done fill:#2a6f2a,stroke:#1a4d1a,color:#ffffff ``` +At boot, before connecting to the orchestrator, the daemon sweeps stale workspace triples (clone dir + `.cred.sh` token helper + `-artifacts`) older than `WORKSPACE_STALE_TTL_MS` under `CLONE_BASE_DIR`, reclaiming SIGKILL/OOM/eviction orphans left behind when a prior run skipped its own cleanup. Each sweep emits a single `workspace.sweep` log line with `swept` / `retained` / `durationMs`. + ## Operational knobs The full list lives at [`../configuration.md`](../configuration.md#orchestrator-and-daemon). The handful you'll actually touch: diff --git a/src/app.ts b/src/app.ts index fe794335..075a7ef9 100644 --- a/src/app.ts +++ b/src/app.ts @@ -1,5 +1,5 @@ import { randomUUID } from "node:crypto"; -import { access, constants, mkdir, readdir, rm, stat } from "node:fs/promises"; +import { access, constants } from "node:fs/promises"; import http from "node:http"; import { join } from "node:path"; @@ -17,6 +17,7 @@ import type { import { App, Octokit } from "octokit"; import { config } from "./config"; +import { sweepStaleWorkspaces } from "./core/workspace-sweep"; import { closeDb, getDb } from "./db"; import { runMigrations } from "./db/migrate"; import { installFatalHandlers, logger } from "./logger"; @@ -368,35 +369,11 @@ async function runStartupChecks(): Promise { } } - // *.cred.sh files accumulate in cloneBaseDir when the pod is SIGKILL-ed mid-checkout. - // Remove files older than 1 hour to avoid leaking installation tokens across restarts. - const STALE_CRED_TTL_MS = 60 * 60 * 1000; - const staleCutoff = Date.now() - STALE_CRED_TTL_MS; - - try { - // eslint-disable-next-line security/detect-non-literal-fs-filename - await mkdir(config.cloneBaseDir, { recursive: true }); - // eslint-disable-next-line security/detect-non-literal-fs-filename - const entries = await readdir(config.cloneBaseDir); - const credFiles = entries.filter((f) => f.endsWith(".cred.sh")); - - for (const credFile of credFiles) { - const fullPath = join(config.cloneBaseDir, credFile); - try { - // eslint-disable-next-line no-await-in-loop, security/detect-non-literal-fs-filename - const { mtimeMs } = await stat(fullPath); - if (mtimeMs < staleCutoff) { - // eslint-disable-next-line no-await-in-loop -- fullPath is join()-constructed, not user input - await rm(fullPath, { force: true }); - logger.info({ credFile }, "Removed stale credential helper script"); - } - } catch { - // Non-fatal: file may have been removed concurrently - } - } - } catch { - // Non-fatal: cloneBaseDir may not exist yet on a fresh pod - } + // Reclaim workspace triples (clone dir + `.cred.sh` token helper + + // `-artifacts`) orphaned in cloneBaseDir when a pod was SIGKILL-ed + // mid-run, to avoid leaking installation tokens and disk across restarts + // (issue #221). + await sweepStaleWorkspaces(config.cloneBaseDir, config.workspaceStaleTtlMs, logger); const db = getDb(); if (db !== null) { diff --git a/src/config.ts b/src/config.ts index 51fe1140..ea9d7217 100644 --- a/src/config.ts +++ b/src/config.ts @@ -238,6 +238,15 @@ const configSchema = z // for testing or smaller-scope deployments. agentTimeoutMs: z.coerce.number().int().positive().default(3_600_000), + // TTL before an orphaned per-job workspace triple (clone dir + + // `.cred.sh` + `-artifacts`) under cloneBaseDir is swept + // at daemon/server startup. Orphans are left behind when a pod is + // SIGKILL-ed / OOM-killed / evicted mid-run, skipping the pipeline's own + // cleanup. Default 1 hour, long enough that an in-flight job's fresh + // workspace is never reaped, short enough that a leaked install token does + // not linger across restarts. Set via WORKSPACE_STALE_TTL_MS (issue #221). + workspaceStaleTtlMs: z.coerce.number().int().positive().default(3_600_000), + // Override max turns for the Claude Agent SDK, used as a FALLBACK ONLY on // src/core/executor.ts when invoked without an explicit `maxTurns` // argument. Since the dispatch-collapse, the orchestrator always passes @@ -913,6 +922,7 @@ function loadConfig(): Config { // Group 5, App runtime / behaviour context7ApiKey: process.env["CONTEXT7_API_KEY"], cloneBaseDir: process.env["CLONE_BASE_DIR"], + workspaceStaleTtlMs: process.env["WORKSPACE_STALE_TTL_MS"], cloneDepth: process.env["CLONE_DEPTH"], triggerPhrase: process.env["TRIGGER_PHRASE"], botAppLogin: process.env["BOT_APP_LOGIN"], diff --git a/src/core/workspace-sweep.ts b/src/core/workspace-sweep.ts new file mode 100644 index 00000000..cef40083 --- /dev/null +++ b/src/core/workspace-sweep.ts @@ -0,0 +1,108 @@ +/** + * Per-job workspace cleanup helpers (issue #221). + * + * A daemon job owns a "workspace triple" rooted at its workDir: + * - `` the cloned repo + * - `.cred.sh` the git credential helper holding the install token + * - `-artifacts` the sibling summary dir (IMPLEMENT.md / REVIEW.md / ...) + * + * The pipeline removes all three on a clean run. A SIGKILL / OOM / eviction + * skips that path and orphans the triple, leaking disk and a short-lived token. + * `removeWorkspaceTripleSync` is the synchronous last-resort path used by the + * daemon's exit and cancel handlers; `sweepStaleWorkspaces` is the TTL reaper + * run once at daemon startup to reclaim orphans from a prior lifetime. + */ + +import { rmSync } from "node:fs"; +import { mkdir, readdir, rm, stat } from "node:fs/promises"; +import { join } from "node:path"; + +/** Minimal structural logger; only `.info(obj, msg)` is used. */ +interface Logger { + info: (obj: object, msg: string) => void; +} + +/** + * Synchronously remove a workspace triple, best-effort. Each removal is + * isolated so one failure (e.g. a busy file) does not skip the others. Safe + * to call when paths are absent (`force: true`). Used on the process-exit and + * cancel paths where async cleanup is not an option. + */ +export function removeWorkspaceTripleSync(workDir: string): void { + // Self-enforce the scoped-job invariant: an empty workDir would make the + // calls below target CWD-relative `.cred.sh` / `-artifacts`. Callers also + // guard, but a force-rm helper must not depend on every caller remembering. + if (workDir === "") return; + try { + rmSync(workDir, { recursive: true, force: true }); + } catch { + // Best effort: leak is acceptable, blocking exit is not. + } + try { + rmSync(`${workDir}.cred.sh`, { force: true }); + } catch { + // Best effort. + } + try { + rmSync(`${workDir}-artifacts`, { recursive: true, force: true }); + } catch { + // Best effort. + } +} + +/** + * Sweep stale workspace entries under `cloneBaseDir` older than `ttlMs`, + * by entry mtime. Tolerant of a missing base dir and of concurrent removal + * (a live job deleting its own workspace mid-sweep). Emits one structured + * log line and returns counts for observability. + * + * Each entry is removed via `rm(..., { recursive: true })`, so a stale clone + * dir, its sibling `.cred.sh`, and its `-artifacts` dir are reaped as three + * independent entries (each carries its own mtime). This per-entry signal is + * safe because the sweep runs once at process startup, before any job of this + * lifetime exists, over a process-local `cloneBaseDir` (pod-local ephemeral + * storage, never a shared volume), so no in-flight job's workspace is reaped. + */ +export async function sweepStaleWorkspaces( + cloneBaseDir: string, + ttlMs: number, + log: Logger, +): Promise<{ swept: number; retained: number; durationMs: number }> { + const startedAt = Date.now(); + let swept = 0; + let retained = 0; + + let entries: string[]; + try { + // eslint-disable-next-line security/detect-non-literal-fs-filename -- cloneBaseDir is config, not user input + await mkdir(cloneBaseDir, { recursive: true }); + // eslint-disable-next-line security/detect-non-literal-fs-filename -- cloneBaseDir is config, not user input + entries = await readdir(cloneBaseDir); + } catch { + // Base dir unreadable / uncreatable: nothing to sweep. + return { swept: 0, retained: 0, durationMs: Date.now() - startedAt }; + } + + const cutoff = Date.now() - ttlMs; + + for (const entry of entries) { + const full = join(cloneBaseDir, entry); + try { + // eslint-disable-next-line no-await-in-loop, security/detect-non-literal-fs-filename -- full is join()-constructed + const { mtimeMs } = await stat(full); + if (mtimeMs < cutoff) { + // eslint-disable-next-line no-await-in-loop -- sequential rm bounds peak fd/IO; full is join()-constructed + await rm(full, { recursive: true, force: true }); + swept++; + } else { + retained++; + } + } catch { + // Concurrent removal or transient stat/rm error: skip this entry. + } + } + + const durationMs = Date.now() - startedAt; + log.info({ event: "workspace.sweep", swept, retained, durationMs }, "Swept stale workspaces"); + return { swept, retained, durationMs }; +} diff --git a/src/daemon/job-executor.ts b/src/daemon/job-executor.ts index be74a492..06704ba3 100644 --- a/src/daemon/job-executor.ts +++ b/src/daemon/job-executor.ts @@ -1,9 +1,8 @@ -import { rmSync } from "node:fs"; - import { Octokit } from "octokit"; import { config } from "../config"; import { runPipeline } from "../core/pipeline"; +import { removeWorkspaceTripleSync } from "../core/workspace-sweep"; import { createChildLogger, logger } from "../logger"; import type { ActiveJob, DaemonCapabilities, SerializableBotContext } from "../shared/daemon-types"; import { @@ -42,19 +41,9 @@ export function registerExitCleanup(): void { // Scoped jobs never own a workspace path; skip the rm so we do not // accidentally target `.cred.sh` in the daemon's CWD. if (job.workDir === "") continue; - try { - rmSync(job.workDir, { recursive: true, force: true }); - } catch { - // Best effort on exit - } - // Credential helper (${workDir}.cred.sh) is written by checkoutRepo beside - // the workspace and contains the installation token, remove it too so a - // SIGKILL / crash does not leak it for the app.ts stale-cred sweep window. - try { - rmSync(`${job.workDir}.cred.sh`, { force: true }); - } catch { - // Best effort on exit - } + // Removes the clone, the `.cred.sh` token helper, and the `-artifacts` + // sibling. The artifacts dir was previously leaked on crash exit. + removeWorkspaceTripleSync(job.workDir); } }); } @@ -732,16 +721,9 @@ export function handleJobCancel(cancel: JobCancelMessage, send: (msg: unknown) = } if (job.workDir !== "") { - try { - rmSync(job.workDir, { recursive: true, force: true }); - } catch { - // Best effort - } - try { - rmSync(`${job.workDir}.cred.sh`, { force: true }); - } catch { - // Best effort - } + // Clone + `.cred.sh` + `-artifacts` sibling; the artifacts dir was + // previously leaked on cancel. + removeWorkspaceTripleSync(job.workDir); } activeJobs.delete(offerId); diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 8002b42f..f793fb58 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -1,6 +1,7 @@ import { platform } from "node:os"; import { config } from "../config"; +import { sweepStaleWorkspaces } from "../core/workspace-sweep"; import { installFatalHandlers, logger } from "../logger"; import type { DaemonCapabilities } from "../shared/daemon-types"; import { @@ -366,6 +367,11 @@ async function main(): Promise { registerExitCleanup(); + // Reclaim workspace triples orphaned by a prior SIGKILL/OOM/eviction (issue + // #221). Best-effort, runs before connecting so a cluttered base dir does + // not accumulate clones + install-token `.cred.sh` files across restarts. + await sweepStaleWorkspaces(config.cloneBaseDir, config.workspaceStaleTtlMs, logger); + capabilities = await discoverCapabilities(config.cloneBaseDir); logger.info( { diff --git a/test/core/workspace-sweep.test.ts b/test/core/workspace-sweep.test.ts new file mode 100644 index 00000000..339f63a9 --- /dev/null +++ b/test/core/workspace-sweep.test.ts @@ -0,0 +1,153 @@ +/** + * Unit tests for src/core/workspace-sweep.ts (issue #221). + * + * RED phase: the module under test does not exist yet, so the import below + * fails at resolution time and every case errors. Production code is written + * in the GREEN phase. + * + * Covers the two exports the daemon workspace-leak fix introduces: + * - removeWorkspaceTripleSync: synchronous best-effort removal of a workDir, + * its sibling `.cred.sh`, and `-artifacts` dir. + * - sweepStaleWorkspaces: TTL-based reaper over a clone base dir. + */ + +import { existsSync, mkdirSync, mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { afterEach, describe, expect, it } from "bun:test"; + +import { removeWorkspaceTripleSync, sweepStaleWorkspaces } from "../../src/core/workspace-sweep"; + +/** No-op logger; the helper only needs `.info`. */ +const log = { info: () => {}, warn: () => {}, error: () => {} }; + +/** Base dirs created per test; torn down in afterEach. */ +const createdBases: string[] = []; + +function makeBase(): string { + const base = mkdtempSync(join(tmpdir(), "workspace-sweep-test-")); + createdBases.push(base); + return base; +} + +/** Create a workspace triple rooted at `wd`: dir + `.cred.sh` + `-artifacts/`. */ +function makeTriple(wd: string): void { + mkdirSync(wd, { recursive: true }); + writeFileSync(join(wd, "file.txt"), "x"); + writeFileSync(`${wd}.cred.sh`, "#!/bin/sh\n"); + mkdirSync(`${wd}-artifacts`, { recursive: true }); + writeFileSync(join(`${wd}-artifacts`, "REVIEW.md"), "y"); +} + +afterEach(() => { + while (createdBases.length > 0) { + const base = createdBases.pop(); + if (base !== undefined) { + rmSync(base, { recursive: true, force: true }); + } + } +}); + +describe("removeWorkspaceTripleSync", () => { + it("removes workDir, cred.sh, and artifacts", () => { + const base = makeBase(); + const wd = join(base, "deliv-abc123"); + makeTriple(wd); + + expect(existsSync(wd)).toBe(true); + expect(existsSync(`${wd}.cred.sh`)).toBe(true); + expect(existsSync(`${wd}-artifacts`)).toBe(true); + + removeWorkspaceTripleSync(wd); + + expect(existsSync(wd)).toBe(false); + expect(existsSync(`${wd}.cred.sh`)).toBe(false); + expect(existsSync(`${wd}-artifacts`)).toBe(false); + }); + + it("is a no-op when paths are absent", () => { + const base = makeBase(); + const wd = join(base, "does-not-exist"); + + expect(() => { + removeWorkspaceTripleSync(wd); + }).not.toThrow(); + }); + + it("is a no-op for an empty workDir (scoped job invariant)", () => { + // A "" workDir must never make rmSync target CWD-relative `.cred.sh`. + const before = existsSync(join(process.cwd(), ".cred.sh")); + expect(() => { + removeWorkspaceTripleSync(""); + }).not.toThrow(); + expect(existsSync(join(process.cwd(), ".cred.sh"))).toBe(before); + }); +}); + +describe("sweepStaleWorkspaces", () => { + it("removes a stale triple and retains a fresh one", async () => { + const base = makeBase(); + const aged = join(base, "aged"); + const fresh = join(base, "fresh"); + makeTriple(aged); + makeTriple(fresh); + + // Backdate the aged triple's three entries to ~2h ago. + const oldDate = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(aged, oldDate, oldDate); + utimesSync(`${aged}.cred.sh`, oldDate, oldDate); + utimesSync(`${aged}-artifacts`, oldDate, oldDate); + + // Capture the structured log so the observability contract is pinned. + const calls: { obj: Record; msg: string }[] = []; + const recLog = { + info: (obj: Record, msg: string) => calls.push({ obj, msg }), + }; + + const result = await sweepStaleWorkspaces(base, 60 * 60 * 1000, recLog); + + expect(existsSync(aged)).toBe(false); + expect(existsSync(`${aged}.cred.sh`)).toBe(false); + expect(existsSync(`${aged}-artifacts`)).toBe(false); + + expect(existsSync(fresh)).toBe(true); + expect(existsSync(`${fresh}.cred.sh`)).toBe(true); + expect(existsSync(`${fresh}-artifacts`)).toBe(true); + + // Exactly six entries: three aged (swept) + three fresh (retained). + expect(result.swept).toBe(3); + expect(result.retained).toBe(3); + expect(result.durationMs).toBeGreaterThanOrEqual(0); + expect(calls).toHaveLength(1); + expect(calls[0]?.obj).toMatchObject({ event: "workspace.sweep", swept: 3, retained: 3 }); + }); + + it("reaps a stale cred.sh independently of a fresh clone dir (partial orphan)", async () => { + // The token-bearing cred.sh is written once at clone time and never touched, + // so a long checkout can leave it stale while the clone dir still looks fresh. + // Each entry must be judged on its own mtime so the token is reclaimed. + const base = makeBase(); + const wd = join(base, "partial"); + makeTriple(wd); + + const oldDate = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(`${wd}.cred.sh`, oldDate, oldDate); + + const result = await sweepStaleWorkspaces(base, 60 * 60 * 1000, log); + + expect(existsSync(`${wd}.cred.sh`)).toBe(false); // token reclaimed + expect(existsSync(wd)).toBe(true); // fresh clone dir retained + expect(existsSync(`${wd}-artifacts`)).toBe(true); // fresh artifacts retained + expect(result.swept).toBe(1); + expect(result.retained).toBe(2); + }); + + it("tolerates a missing base dir", async () => { + const base = makeBase(); + const missing = join(base, "nope"); + + const result = await sweepStaleWorkspaces(missing, 60 * 60 * 1000, log); + expect(result.swept).toBe(0); + }); +});