diff --git a/docs/operate/configuration.md b/docs/operate/configuration.md index 764a213..f0aad97 100644 --- a/docs/operate/configuration.md +++ b/docs/operate/configuration.md @@ -52,6 +52,7 @@ Server mode only. If `ORCHESTRATOR_URL` is set, the process runs in daemon mode | `CLAUDE_CODE_PATH` | resolved from `node_modules` | Absolute path to the Claude Code CLI `cli.js`. | | `CLONE_BASE_DIR` | `/tmp/bot-workspaces` | Parent directory for per-delivery clones. | | `CLONE_DEPTH` | `50` | Shallow-clone depth. Increase for deeply-diverged PRs. | +| `WORKSPACE_STALE_TTL_MS` | `3600000` | TTL before an orphaned per-job workspace triple (clone dir + `.cred.sh` + `-artifacts`) under `CLONE_BASE_DIR` is swept at startup. Reclaims SIGKILL/OOM/eviction orphans. Lower only if you understand the risk. | | `CONTEXT7_API_KEY` | unset | Lifts Context7 MCP rate limiting. No other effect. | ## Postgres @@ -189,7 +190,7 @@ before committing: ## Prompt cache layout -Selects the system/user prompt split the agent executor passes to the Claude Agent SDK. See `src/config.ts:573#promptCacheLayout` for the Zod definition and `src/core/executor.ts:208` for the runtime guard. +Selects the system/user prompt split the agent executor passes to the Claude Agent SDK. See `src/config.ts:582#promptCacheLayout` for the Zod definition and `src/core/executor.ts:208` for the runtime guard. | Variable | Default | Notes | | --------------------- | -------- | ------------------------------------------------------------------------------------------------------------ | diff --git a/docs/operate/observability.md b/docs/operate/observability.md index 3216a06..90539a3 100644 --- a/docs/operate/observability.md +++ b/docs/operate/observability.md @@ -68,6 +68,14 @@ cache_read_input_tokens / (input_tokens + cache_read_input_tokens + cache_creati (The ratio is undefined when the denominator is zero, e.g. a dry-run that never called the model; guard against that in the query.) A high ratio on the second+ run of the same prompt shape confirms `PROMPT_CACHE_LAYOUT=cacheable` is working; a sudden drop in the per-installation cache-read share is the signature of a prompt-cache stability regression. Alert on `sum(cache_read_input_tokens) / sum(input_tokens + cache_read_input_tokens + cache_creation_input_tokens)` falling below its established baseline. +### Workspace sweep + +`sweepStaleWorkspaces` (`src/core/workspace-sweep.ts`) emits one line per startup sweep when the daemon and the webhook server reclaim stale per-job workspace triples under `CLONE_BASE_DIR` (orphans left by SIGKILL/OOM/eviction). A `swept` count climbing across restarts means jobs are being killed mid-run before their own cleanup runs. + +| `event` | Level | Fields | +| ----------------- | ----- | ------------------------------------------------------------------------------------------------------------ | +| `workspace.sweep` | info | `swept` (entries removed), `retained` (entries kept as fresh), `durationMs` (wall-clock time for the sweep). | + ## GitHub API rate-limit fields The `App` is constructed with `ObservableOctokit` (`src/utils/octokit-observability.ts`), an `Octokit.plugin` subclass shared by `app.octokit` and every installation octokit. It logs GitHub's per-installation rate-limit headers via `octokit.hook.after` / `hook.error`. The `pipeline.stage`-style strict Zod schema (`GithubApiLogFieldsSchema`) pins the field shape. diff --git a/docs/operate/runbooks/daemon-fleet.md b/docs/operate/runbooks/daemon-fleet.md index e581f03..68d17c8 100644 --- a/docs/operate/runbooks/daemon-fleet.md +++ b/docs/operate/runbooks/daemon-fleet.md @@ -46,6 +46,8 @@ flowchart LR classDef done fill:#2a6f2a,stroke:#1a4d1a,color:#ffffff ``` +At boot, before connecting to the orchestrator, the daemon sweeps stale workspace triples (clone dir + `.cred.sh` token helper + `-artifacts`) older than `WORKSPACE_STALE_TTL_MS` under `CLONE_BASE_DIR`, reclaiming SIGKILL/OOM/eviction orphans left behind when a prior run skipped its own cleanup. Each sweep emits a single `workspace.sweep` log line with `swept` / `retained` / `durationMs`. + ## Operational knobs The full list lives at [`../configuration.md`](../configuration.md#orchestrator-and-daemon). The handful you'll actually touch: diff --git a/src/app.ts b/src/app.ts index fe79433..075a7ef 100644 --- a/src/app.ts +++ b/src/app.ts @@ -1,5 +1,5 @@ import { randomUUID } from "node:crypto"; -import { access, constants, mkdir, readdir, rm, stat } from "node:fs/promises"; +import { access, constants } from "node:fs/promises"; import http from "node:http"; import { join } from "node:path"; @@ -17,6 +17,7 @@ import type { import { App, Octokit } from "octokit"; import { config } from "./config"; +import { sweepStaleWorkspaces } from "./core/workspace-sweep"; import { closeDb, getDb } from "./db"; import { runMigrations } from "./db/migrate"; import { installFatalHandlers, logger } from "./logger"; @@ -368,35 +369,11 @@ async function runStartupChecks(): Promise { } } - // *.cred.sh files accumulate in cloneBaseDir when the pod is SIGKILL-ed mid-checkout. - // Remove files older than 1 hour to avoid leaking installation tokens across restarts. - const STALE_CRED_TTL_MS = 60 * 60 * 1000; - const staleCutoff = Date.now() - STALE_CRED_TTL_MS; - - try { - // eslint-disable-next-line security/detect-non-literal-fs-filename - await mkdir(config.cloneBaseDir, { recursive: true }); - // eslint-disable-next-line security/detect-non-literal-fs-filename - const entries = await readdir(config.cloneBaseDir); - const credFiles = entries.filter((f) => f.endsWith(".cred.sh")); - - for (const credFile of credFiles) { - const fullPath = join(config.cloneBaseDir, credFile); - try { - // eslint-disable-next-line no-await-in-loop, security/detect-non-literal-fs-filename - const { mtimeMs } = await stat(fullPath); - if (mtimeMs < staleCutoff) { - // eslint-disable-next-line no-await-in-loop -- fullPath is join()-constructed, not user input - await rm(fullPath, { force: true }); - logger.info({ credFile }, "Removed stale credential helper script"); - } - } catch { - // Non-fatal: file may have been removed concurrently - } - } - } catch { - // Non-fatal: cloneBaseDir may not exist yet on a fresh pod - } + // Reclaim workspace triples (clone dir + `.cred.sh` token helper + + // `-artifacts`) orphaned in cloneBaseDir when a pod was SIGKILL-ed + // mid-run, to avoid leaking installation tokens and disk across restarts + // (issue #221). + await sweepStaleWorkspaces(config.cloneBaseDir, config.workspaceStaleTtlMs, logger); const db = getDb(); if (db !== null) { diff --git a/src/config.ts b/src/config.ts index 51fe114..ea9d721 100644 --- a/src/config.ts +++ b/src/config.ts @@ -238,6 +238,15 @@ const configSchema = z // for testing or smaller-scope deployments. agentTimeoutMs: z.coerce.number().int().positive().default(3_600_000), + // TTL before an orphaned per-job workspace triple (clone dir + + // `.cred.sh` + `-artifacts`) under cloneBaseDir is swept + // at daemon/server startup. Orphans are left behind when a pod is + // SIGKILL-ed / OOM-killed / evicted mid-run, skipping the pipeline's own + // cleanup. Default 1 hour, long enough that an in-flight job's fresh + // workspace is never reaped, short enough that a leaked install token does + // not linger across restarts. Set via WORKSPACE_STALE_TTL_MS (issue #221). + workspaceStaleTtlMs: z.coerce.number().int().positive().default(3_600_000), + // Override max turns for the Claude Agent SDK, used as a FALLBACK ONLY on // src/core/executor.ts when invoked without an explicit `maxTurns` // argument. Since the dispatch-collapse, the orchestrator always passes @@ -913,6 +922,7 @@ function loadConfig(): Config { // Group 5, App runtime / behaviour context7ApiKey: process.env["CONTEXT7_API_KEY"], cloneBaseDir: process.env["CLONE_BASE_DIR"], + workspaceStaleTtlMs: process.env["WORKSPACE_STALE_TTL_MS"], cloneDepth: process.env["CLONE_DEPTH"], triggerPhrase: process.env["TRIGGER_PHRASE"], botAppLogin: process.env["BOT_APP_LOGIN"], diff --git a/src/core/workspace-sweep.ts b/src/core/workspace-sweep.ts new file mode 100644 index 0000000..cef4008 --- /dev/null +++ b/src/core/workspace-sweep.ts @@ -0,0 +1,108 @@ +/** + * Per-job workspace cleanup helpers (issue #221). + * + * A daemon job owns a "workspace triple" rooted at its workDir: + * - `` the cloned repo + * - `.cred.sh` the git credential helper holding the install token + * - `-artifacts` the sibling summary dir (IMPLEMENT.md / REVIEW.md / ...) + * + * The pipeline removes all three on a clean run. A SIGKILL / OOM / eviction + * skips that path and orphans the triple, leaking disk and a short-lived token. + * `removeWorkspaceTripleSync` is the synchronous last-resort path used by the + * daemon's exit and cancel handlers; `sweepStaleWorkspaces` is the TTL reaper + * run once at daemon startup to reclaim orphans from a prior lifetime. + */ + +import { rmSync } from "node:fs"; +import { mkdir, readdir, rm, stat } from "node:fs/promises"; +import { join } from "node:path"; + +/** Minimal structural logger; only `.info(obj, msg)` is used. */ +interface Logger { + info: (obj: object, msg: string) => void; +} + +/** + * Synchronously remove a workspace triple, best-effort. Each removal is + * isolated so one failure (e.g. a busy file) does not skip the others. Safe + * to call when paths are absent (`force: true`). Used on the process-exit and + * cancel paths where async cleanup is not an option. + */ +export function removeWorkspaceTripleSync(workDir: string): void { + // Self-enforce the scoped-job invariant: an empty workDir would make the + // calls below target CWD-relative `.cred.sh` / `-artifacts`. Callers also + // guard, but a force-rm helper must not depend on every caller remembering. + if (workDir === "") return; + try { + rmSync(workDir, { recursive: true, force: true }); + } catch { + // Best effort: leak is acceptable, blocking exit is not. + } + try { + rmSync(`${workDir}.cred.sh`, { force: true }); + } catch { + // Best effort. + } + try { + rmSync(`${workDir}-artifacts`, { recursive: true, force: true }); + } catch { + // Best effort. + } +} + +/** + * Sweep stale workspace entries under `cloneBaseDir` older than `ttlMs`, + * by entry mtime. Tolerant of a missing base dir and of concurrent removal + * (a live job deleting its own workspace mid-sweep). Emits one structured + * log line and returns counts for observability. + * + * Each entry is removed via `rm(..., { recursive: true })`, so a stale clone + * dir, its sibling `.cred.sh`, and its `-artifacts` dir are reaped as three + * independent entries (each carries its own mtime). This per-entry signal is + * safe because the sweep runs once at process startup, before any job of this + * lifetime exists, over a process-local `cloneBaseDir` (pod-local ephemeral + * storage, never a shared volume), so no in-flight job's workspace is reaped. + */ +export async function sweepStaleWorkspaces( + cloneBaseDir: string, + ttlMs: number, + log: Logger, +): Promise<{ swept: number; retained: number; durationMs: number }> { + const startedAt = Date.now(); + let swept = 0; + let retained = 0; + + let entries: string[]; + try { + // eslint-disable-next-line security/detect-non-literal-fs-filename -- cloneBaseDir is config, not user input + await mkdir(cloneBaseDir, { recursive: true }); + // eslint-disable-next-line security/detect-non-literal-fs-filename -- cloneBaseDir is config, not user input + entries = await readdir(cloneBaseDir); + } catch { + // Base dir unreadable / uncreatable: nothing to sweep. + return { swept: 0, retained: 0, durationMs: Date.now() - startedAt }; + } + + const cutoff = Date.now() - ttlMs; + + for (const entry of entries) { + const full = join(cloneBaseDir, entry); + try { + // eslint-disable-next-line no-await-in-loop, security/detect-non-literal-fs-filename -- full is join()-constructed + const { mtimeMs } = await stat(full); + if (mtimeMs < cutoff) { + // eslint-disable-next-line no-await-in-loop -- sequential rm bounds peak fd/IO; full is join()-constructed + await rm(full, { recursive: true, force: true }); + swept++; + } else { + retained++; + } + } catch { + // Concurrent removal or transient stat/rm error: skip this entry. + } + } + + const durationMs = Date.now() - startedAt; + log.info({ event: "workspace.sweep", swept, retained, durationMs }, "Swept stale workspaces"); + return { swept, retained, durationMs }; +} diff --git a/src/daemon/job-executor.ts b/src/daemon/job-executor.ts index be74a49..06704ba 100644 --- a/src/daemon/job-executor.ts +++ b/src/daemon/job-executor.ts @@ -1,9 +1,8 @@ -import { rmSync } from "node:fs"; - import { Octokit } from "octokit"; import { config } from "../config"; import { runPipeline } from "../core/pipeline"; +import { removeWorkspaceTripleSync } from "../core/workspace-sweep"; import { createChildLogger, logger } from "../logger"; import type { ActiveJob, DaemonCapabilities, SerializableBotContext } from "../shared/daemon-types"; import { @@ -42,19 +41,9 @@ export function registerExitCleanup(): void { // Scoped jobs never own a workspace path; skip the rm so we do not // accidentally target `.cred.sh` in the daemon's CWD. if (job.workDir === "") continue; - try { - rmSync(job.workDir, { recursive: true, force: true }); - } catch { - // Best effort on exit - } - // Credential helper (${workDir}.cred.sh) is written by checkoutRepo beside - // the workspace and contains the installation token, remove it too so a - // SIGKILL / crash does not leak it for the app.ts stale-cred sweep window. - try { - rmSync(`${job.workDir}.cred.sh`, { force: true }); - } catch { - // Best effort on exit - } + // Removes the clone, the `.cred.sh` token helper, and the `-artifacts` + // sibling. The artifacts dir was previously leaked on crash exit. + removeWorkspaceTripleSync(job.workDir); } }); } @@ -732,16 +721,9 @@ export function handleJobCancel(cancel: JobCancelMessage, send: (msg: unknown) = } if (job.workDir !== "") { - try { - rmSync(job.workDir, { recursive: true, force: true }); - } catch { - // Best effort - } - try { - rmSync(`${job.workDir}.cred.sh`, { force: true }); - } catch { - // Best effort - } + // Clone + `.cred.sh` + `-artifacts` sibling; the artifacts dir was + // previously leaked on cancel. + removeWorkspaceTripleSync(job.workDir); } activeJobs.delete(offerId); diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 8002b42..f793fb5 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -1,6 +1,7 @@ import { platform } from "node:os"; import { config } from "../config"; +import { sweepStaleWorkspaces } from "../core/workspace-sweep"; import { installFatalHandlers, logger } from "../logger"; import type { DaemonCapabilities } from "../shared/daemon-types"; import { @@ -366,6 +367,11 @@ async function main(): Promise { registerExitCleanup(); + // Reclaim workspace triples orphaned by a prior SIGKILL/OOM/eviction (issue + // #221). Best-effort, runs before connecting so a cluttered base dir does + // not accumulate clones + install-token `.cred.sh` files across restarts. + await sweepStaleWorkspaces(config.cloneBaseDir, config.workspaceStaleTtlMs, logger); + capabilities = await discoverCapabilities(config.cloneBaseDir); logger.info( { diff --git a/test/core/workspace-sweep.test.ts b/test/core/workspace-sweep.test.ts new file mode 100644 index 0000000..339f63a --- /dev/null +++ b/test/core/workspace-sweep.test.ts @@ -0,0 +1,153 @@ +/** + * Unit tests for src/core/workspace-sweep.ts (issue #221). + * + * RED phase: the module under test does not exist yet, so the import below + * fails at resolution time and every case errors. Production code is written + * in the GREEN phase. + * + * Covers the two exports the daemon workspace-leak fix introduces: + * - removeWorkspaceTripleSync: synchronous best-effort removal of a workDir, + * its sibling `.cred.sh`, and `-artifacts` dir. + * - sweepStaleWorkspaces: TTL-based reaper over a clone base dir. + */ + +import { existsSync, mkdirSync, mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { afterEach, describe, expect, it } from "bun:test"; + +import { removeWorkspaceTripleSync, sweepStaleWorkspaces } from "../../src/core/workspace-sweep"; + +/** No-op logger; the helper only needs `.info`. */ +const log = { info: () => {}, warn: () => {}, error: () => {} }; + +/** Base dirs created per test; torn down in afterEach. */ +const createdBases: string[] = []; + +function makeBase(): string { + const base = mkdtempSync(join(tmpdir(), "workspace-sweep-test-")); + createdBases.push(base); + return base; +} + +/** Create a workspace triple rooted at `wd`: dir + `.cred.sh` + `-artifacts/`. */ +function makeTriple(wd: string): void { + mkdirSync(wd, { recursive: true }); + writeFileSync(join(wd, "file.txt"), "x"); + writeFileSync(`${wd}.cred.sh`, "#!/bin/sh\n"); + mkdirSync(`${wd}-artifacts`, { recursive: true }); + writeFileSync(join(`${wd}-artifacts`, "REVIEW.md"), "y"); +} + +afterEach(() => { + while (createdBases.length > 0) { + const base = createdBases.pop(); + if (base !== undefined) { + rmSync(base, { recursive: true, force: true }); + } + } +}); + +describe("removeWorkspaceTripleSync", () => { + it("removes workDir, cred.sh, and artifacts", () => { + const base = makeBase(); + const wd = join(base, "deliv-abc123"); + makeTriple(wd); + + expect(existsSync(wd)).toBe(true); + expect(existsSync(`${wd}.cred.sh`)).toBe(true); + expect(existsSync(`${wd}-artifacts`)).toBe(true); + + removeWorkspaceTripleSync(wd); + + expect(existsSync(wd)).toBe(false); + expect(existsSync(`${wd}.cred.sh`)).toBe(false); + expect(existsSync(`${wd}-artifacts`)).toBe(false); + }); + + it("is a no-op when paths are absent", () => { + const base = makeBase(); + const wd = join(base, "does-not-exist"); + + expect(() => { + removeWorkspaceTripleSync(wd); + }).not.toThrow(); + }); + + it("is a no-op for an empty workDir (scoped job invariant)", () => { + // A "" workDir must never make rmSync target CWD-relative `.cred.sh`. + const before = existsSync(join(process.cwd(), ".cred.sh")); + expect(() => { + removeWorkspaceTripleSync(""); + }).not.toThrow(); + expect(existsSync(join(process.cwd(), ".cred.sh"))).toBe(before); + }); +}); + +describe("sweepStaleWorkspaces", () => { + it("removes a stale triple and retains a fresh one", async () => { + const base = makeBase(); + const aged = join(base, "aged"); + const fresh = join(base, "fresh"); + makeTriple(aged); + makeTriple(fresh); + + // Backdate the aged triple's three entries to ~2h ago. + const oldDate = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(aged, oldDate, oldDate); + utimesSync(`${aged}.cred.sh`, oldDate, oldDate); + utimesSync(`${aged}-artifacts`, oldDate, oldDate); + + // Capture the structured log so the observability contract is pinned. + const calls: { obj: Record; msg: string }[] = []; + const recLog = { + info: (obj: Record, msg: string) => calls.push({ obj, msg }), + }; + + const result = await sweepStaleWorkspaces(base, 60 * 60 * 1000, recLog); + + expect(existsSync(aged)).toBe(false); + expect(existsSync(`${aged}.cred.sh`)).toBe(false); + expect(existsSync(`${aged}-artifacts`)).toBe(false); + + expect(existsSync(fresh)).toBe(true); + expect(existsSync(`${fresh}.cred.sh`)).toBe(true); + expect(existsSync(`${fresh}-artifacts`)).toBe(true); + + // Exactly six entries: three aged (swept) + three fresh (retained). + expect(result.swept).toBe(3); + expect(result.retained).toBe(3); + expect(result.durationMs).toBeGreaterThanOrEqual(0); + expect(calls).toHaveLength(1); + expect(calls[0]?.obj).toMatchObject({ event: "workspace.sweep", swept: 3, retained: 3 }); + }); + + it("reaps a stale cred.sh independently of a fresh clone dir (partial orphan)", async () => { + // The token-bearing cred.sh is written once at clone time and never touched, + // so a long checkout can leave it stale while the clone dir still looks fresh. + // Each entry must be judged on its own mtime so the token is reclaimed. + const base = makeBase(); + const wd = join(base, "partial"); + makeTriple(wd); + + const oldDate = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(`${wd}.cred.sh`, oldDate, oldDate); + + const result = await sweepStaleWorkspaces(base, 60 * 60 * 1000, log); + + expect(existsSync(`${wd}.cred.sh`)).toBe(false); // token reclaimed + expect(existsSync(wd)).toBe(true); // fresh clone dir retained + expect(existsSync(`${wd}-artifacts`)).toBe(true); // fresh artifacts retained + expect(result.swept).toBe(1); + expect(result.retained).toBe(2); + }); + + it("tolerates a missing base dir", async () => { + const base = makeBase(); + const missing = join(base, "nope"); + + const result = await sweepStaleWorkspaces(missing, 60 * 60 * 1000, log); + expect(result.swept).toBe(0); + }); +});