From c7e28123689e07e0de0424196b0bea2576b50e4c Mon Sep 17 00:00:00 2001 From: Cho Young-Hwi Date: Thu, 11 Jun 2026 21:02:13 +0900 Subject: [PATCH] [#957] Lifecycle-aware cleanup of agent backend temp dirs Co-Authored-By: Claude Fable 5 --- docs/install-vps.md | 9 +++ docs/troubleshooting.md | 26 ++++++++ server/index.js | 37 +++++++++++ server/temp-cleanup.js | 129 ++++++++++++++++++++++++++++++++++++ server/temp-cleanup.test.js | 107 ++++++++++++++++++++++++++++++ 5 files changed, 308 insertions(+) create mode 100644 server/temp-cleanup.js create mode 100644 server/temp-cleanup.test.js diff --git a/docs/install-vps.md b/docs/install-vps.md index 41080fd..9ed6157 100644 --- a/docs/install-vps.md +++ b/docs/install-vps.md @@ -362,3 +362,12 @@ chmod 600 ~/.quadwork/.env 14. Configure nginx reverse proxy + SSL 15. Add HTTP basic auth 16. Verify reboot survival: `sudo reboot`, then check `pm2 list` + +## Note: /tmp quotas and Claude temp + +Some VPS images mount `/tmp` with a per-user quota (`usrquota`). Claude Code +accumulates temp under `/tmp/claude-{uid}`; if the quota fills up, every +Claude bash command starts failing silently with exit 1 (see +[troubleshooting](troubleshooting.md#every-claude-bash-command-fails-silently-exit-1-no-output)). +QuadWork sweeps stale entries automatically (hourly + on agent teardown, +72h age) — configurable via `temp_cleanup` in `~/.quadwork/config.json`. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 76c7d1b..749db81 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -124,3 +124,29 @@ chmod 440 /etc/sudoers.d/quadwork See the [VPS Installation Guide](install-vps.md#step-2-create-non-root-user-critical) for full setup. + +## Every Claude bash command fails silently (exit 1, no output) + +**Symptom:** every bash command a `claude` agent runs — even `true` — returns +exit 1 with empty stdout/stderr. The Read tool still works, a plain shell on +the host works, and `codex` agents are unaffected. 
Easily mistaken for a +bwrap/AppArmor/sandbox failure. + +**Cause:** Claude Code keeps its temp under `/tmp/claude-{uid}` and never +cleans it up. On hosts where `/tmp` is mounted with a per-user quota +(`usrquota`), that dir grows until the quota is exhausted — after which Claude +can't write the temp files it needs before executing any command (#957). + +**Check:** `du -sh /tmp/claude-$(id -u)` and try `dd if=/dev/zero +of=/tmp/probe bs=1M count=10` — a "Disk quota exceeded" error confirms it. + +**Fix:** QuadWork sweeps stale backend temp automatically (hourly, at boot, +and on agent teardown; entries older than 72h). If you hit the quota *before* +a sweep (e.g. the server was down), clear it manually: +`find /tmp/claude-$(id -u)/* -maxdepth 0 -mmin +60 -exec rm -rf {} +` + +Tune or disable via `~/.quadwork/config.json`: + +```json +{ "temp_cleanup": { "enabled": true, "max_age_hours": 72 } } +``` diff --git a/server/index.js b/server/index.js index 881c967..4d921f0 100644 --- a/server/index.js +++ b/server/index.js @@ -12,6 +12,7 @@ const fileChat = require("./file-chat"); const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher"); const { runAcMigration } = require("./migrate-ac"); const selfHeal = require("./self-heal"); +const tempCleanup = require("./temp-cleanup"); const { injectModeForCommand } = require("../src/lib/injectMode.js"); const net = require("net"); @@ -623,6 +624,34 @@ async function stopAgentSession(key, { clearSelfHeal = false } = {}) { session.exitedUnexpectedly = false; const [projectId, agentId] = key.split("/"); if (projectId && agentId) stopMcpProxy(projectId, agentId); + // #957: teardown is a known-safe moment to sweep stale backend temp. + // Deferred + stale-only, so it never blocks the stop path and never touches + // files another live agent (same shared /tmp/claude-{uid}) still uses. 
+ setImmediate(backendTempSweepTick); +} + +// #957: sweep stale backend temp entries (/tmp/claude-{uid}, stray gemini +// crash dumps). On hosts where /tmp has a per-user quota, unbounded Claude +// temp eventually exhausts it and every Claude bash call fails silently with +// exit 1 — see the issue for the full post-mortem. Runs on agent teardown and +// hourly; opt-out / age via config.json `temp_cleanup: {enabled, max_age_hours}`. +let _tempSweepRunning = false; +function backendTempSweepTick() { + if (_tempSweepRunning) return; + _tempSweepRunning = true; + try { + const settings = tempCleanup.cleanupSettings(readConfig()); + if (!settings.enabled) return; + const r = tempCleanup.sweepBackendTemp({ maxAgeHours: settings.maxAgeHours }); + if (r.removed.length > 0) { + console.log(`[temp-cleanup] removed ${r.removed.length} stale entr${r.removed.length === 1 ? "y" : "ies"} (kept ${r.kept})`); + } + for (const e of r.errors) console.error(`[temp-cleanup] ${e}`); + } catch (err) { + console.error(`[temp-cleanup] sweep failed: ${err.message}`); + } finally { + _tempSweepRunning = false; + } } app.get("/api/agents", (_req, res) => { @@ -1816,6 +1845,14 @@ if (!process.env.QUADWORK_SKIP_LISTEN) { setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS); } +// #957: hourly stale-temp sweep (plus one at boot — a server that was down +// for days should reclaim quota immediately, not an hour after start). +const TEMP_SWEEP_INTERVAL_MS = 60 * 60 * 1000; +if (!process.env.QUADWORK_SKIP_LISTEN) { + setImmediate(backendTempSweepTick); + setInterval(backendTempSweepTick, TEMP_SWEEP_INTERVAL_MS); +} + // #915: retry deferred reseeds without a server restart. A reseed deferred on // boot because a project's batch was active used to wait for the next startup — // stranding a busy project on old seeds indefinitely. 
autoReseedOnStartup is diff --git a/server/temp-cleanup.js b/server/temp-cleanup.js new file mode 100644 index 0000000..98b0ed0 --- /dev/null +++ b/server/temp-cleanup.js @@ -0,0 +1,129 @@ +"use strict"; + +// #957: lifecycle-aware cleanup of agent backend temp dirs. +// +// Claude Code overrides TMPDIR to /tmp/claude-{uid} and never cleans it up. +// On Linux hosts where /tmp carries a per-user quota (usrquota), that dir +// accumulates until the quota is exhausted — at which point EVERY Claude bash +// call fails silently (exit 1, no output) because Claude can't write its +// temp/sandbox-setup files before exec. Gemini similarly strands +// /tmp/gemini-client-error-*.json crash dumps. Codex keeps its state under +// ~/.codex (not /tmp) so it needs no sweep here. +// +// Design constraints: +// - /tmp/claude-{uid} is SHARED by every claude agent of this user, so a +// teardown of one agent must never delete another live agent's files. +// Therefore deletion is strictly age-based ("stale entries only") — the +// same sweep is safe from any call site, teardown or timer. +// - Age uses max(mtime, atime, ctime) like systemd-tmpfiles, so anything a +// live session still touches stays "fresh" and is spared. +// - Never throws: a cleanup bug must not break agent teardown or the server +// boot path. Errors are collected and reported in the result. +// - Cross-platform no-op where it doesn't apply: Windows has no +// process.getuid → the claude dir is skipped; the gemini glob simply +// matches nothing where those files don't exist. + +const fs = require("fs"); +const os = require("os"); +const path = require("path"); + +const DEFAULT_MAX_AGE_HOURS = 72; + +// Newest of mtime/atime/ctime — "last time anything observed this entry". +function newestTimeMs(st) { + return Math.max(st.mtimeMs || 0, st.atimeMs || 0, st.ctimeMs || 0); +} + +// Remove `entry` (file or dir) if its newest timestamp is older than the +// cutoff. Returns "removed" | "kept" | "error". 
+function removeIfStale(entry, cutoffMs, result) { + let st; + try { + st = fs.lstatSync(entry); + } catch { + return "kept"; // raced away — already gone + } + if (newestTimeMs(st) >= cutoffMs) { + result.kept += 1; + return "kept"; + } + try { + fs.rmSync(entry, { recursive: true, force: true }); + result.removed.push(entry); + return "removed"; + } catch (err) { + result.errors.push(`${entry}: ${err.message}`); + return "error"; + } +} + +// Sweep stale backend temp entries. Options (all injectable for tests): +// maxAgeHours - entries with no mtime/atime/ctime newer than this are removed +// tmpRoot - temp root (default os.tmpdir()) +// uid - numeric uid for the claude-{uid} dir (default process.getuid) +// now - epoch ms (default Date.now()) +// Returns { removed: string[], kept: number, errors: string[] }. +function sweepBackendTemp(opts = {}) { + const result = { removed: [], kept: 0, errors: [] }; + try { + const maxAgeHours = Number.isFinite(opts.maxAgeHours) && opts.maxAgeHours > 0 + ? opts.maxAgeHours + : DEFAULT_MAX_AGE_HOURS; + const tmpRoot = opts.tmpRoot || os.tmpdir(); + const now = typeof opts.now === "number" ? opts.now : Date.now(); + const cutoffMs = now - maxAgeHours * 60 * 60 * 1000; + + // Claude: entries directly under /tmp/claude-{uid}. The dir itself is + // kept (claude recreates it anyway; deleting it while a session spawns + // would be a race for no benefit). + const uid = typeof opts.uid === "number" + ? opts.uid + : (typeof process.getuid === "function" ? process.getuid() : null); + if (uid !== null) { + const claudeDir = path.join(tmpRoot, `claude-${uid}`); + let entries = []; + try { + entries = fs.readdirSync(claudeDir); + } catch { + entries = []; // no dir → nothing to do + } + for (const name of entries) { + removeIfStale(path.join(claudeDir, name), cutoffMs, result); + } + } + + // Gemini: stray crash dumps written directly to the temp root. 
+ let rootEntries = []; + try { + rootEntries = fs.readdirSync(tmpRoot); + } catch { + rootEntries = []; + } + for (const name of rootEntries) { + if (name.startsWith("gemini-client-error-") && name.endsWith(".json")) { + removeIfStale(path.join(tmpRoot, name), cutoffMs, result); + } + } + } catch (err) { + result.errors.push(String(err && err.message ? err.message : err)); + } + return result; +} + +// Resolve {enabled, maxAgeHours} from config.json's optional top-level +// `temp_cleanup` block. Defaults: enabled, 72h. `enabled: false` opts out. +function cleanupSettings(cfg) { + const tc = (cfg && cfg.temp_cleanup) || {}; + return { + enabled: tc.enabled !== false, + maxAgeHours: Number.isFinite(tc.max_age_hours) && tc.max_age_hours > 0 + ? tc.max_age_hours + : DEFAULT_MAX_AGE_HOURS, + }; +} + +module.exports = { + sweepBackendTemp, + cleanupSettings, + DEFAULT_MAX_AGE_HOURS, +}; diff --git a/server/temp-cleanup.test.js b/server/temp-cleanup.test.js new file mode 100644 index 0000000..226ad16 --- /dev/null +++ b/server/temp-cleanup.test.js @@ -0,0 +1,107 @@ +"use strict"; + +// #957: unit tests for the stale backend-temp sweep. All filesystem work +// happens inside a throwaway fixture dir; uid/now/tmpRoot are injected so +// nothing touches the real /tmp/claude-{uid}. 
"use strict";

// #957: unit tests for the stale backend-temp sweep. Every filesystem
// operation happens inside a throwaway fixture dir; uid/now/tmpRoot are
// injected so the real /tmp/claude-{uid} is never touched.

const fs = require("fs");
const os = require("os");
const path = require("path");
const { sweepBackendTemp, cleanupSettings, DEFAULT_MAX_AGE_HOURS } = require("./temp-cleanup");

// Minimal assertion helper: logs PASS/FAIL and counts failures so the whole
// script runs to completion before exiting non-zero.
let failures = 0;
function ok(cond, msg) {
  if (cond) {
    console.log(`  PASS: ${msg}`);
  } else {
    failures += 1;
    console.error(`  FAIL: ${msg}`);
  }
}

const HOUR = 60 * 60 * 1000;
// The injected "now" must always lie well AFTER the real clock: ctime cannot
// be set from userland, so fixtures only count as stale while their real
// ctime is older than NOW - maxAge. A hard-coded epoch would turn into a
// time bomb once the wall clock catches up with it, so derive it from
// Date.now() with a one-year lead instead.
const NOW = Date.now() + 365 * 24 * HOUR;

// Create a throwaway temp root containing an empty claude-1234 dir.
function makeFixture() {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), "qw-tempclean-test-"));
  const claudeDir = path.join(root, "claude-1234");
  fs.mkdirSync(claudeDir);
  return { root, claudeDir };
}

// Create a file (or a dir holding inner.txt) whose atime/mtime are
// `ageHours` before NOW.
function touch(p, ageHours, { dir = false, content = "x" } = {}) {
  if (dir) {
    fs.mkdirSync(p, { recursive: true });
    fs.writeFileSync(path.join(p, "inner.txt"), content);
  } else {
    fs.writeFileSync(p, content);
  }
  const t = new Date(NOW - ageHours * HOUR);
  // utimes sets atime+mtime; ctime can't be set, but the ctime of a fresh
  // fixture is the REAL current time, a year before our fake NOW — so the
  // injected-now cutoff is still decided by atime/mtime here.
  fs.utimesSync(p, t, t);
}

// ── stale vs fresh entries under claude-{uid} ──
{
  const { root, claudeDir } = makeFixture();
  touch(path.join(claudeDir, "old-file"), 100);
  touch(path.join(claudeDir, "old-dir"), 100, { dir: true });
  touch(path.join(claudeDir, "fresh-file"), 1);
  // utimes on the dir AFTER writing inner.txt so the dir's own mtime is old
  fs.utimesSync(path.join(claudeDir, "old-dir"), new Date(NOW - 100 * HOUR), new Date(NOW - 100 * HOUR));

  const r = sweepBackendTemp({ tmpRoot: root, uid: 1234, now: NOW, maxAgeHours: 72 });
  ok(!fs.existsSync(path.join(claudeDir, "old-file")), "stale file under claude-{uid} is removed");
  ok(!fs.existsSync(path.join(claudeDir, "old-dir")), "stale directory is removed recursively");
  ok(fs.existsSync(path.join(claudeDir, "fresh-file")), "fresh file is spared");
  ok(fs.existsSync(claudeDir), "the claude-{uid} dir itself is kept");
  ok(r.removed.length === 2 && r.kept === 1 && r.errors.length === 0, "result counts removed=2 kept=1 errors=0");
  fs.rmSync(root, { recursive: true, force: true });
}

// ── gemini crash dumps at the temp root ──
{
  const { root } = makeFixture();
  touch(path.join(root, "gemini-client-error-Turn.run-2026-06-03T00-28-55-388Z.json"), 100);
  touch(path.join(root, "gemini-client-error-fresh.json"), 1);
  touch(path.join(root, "unrelated-old-file.json"), 100);

  const r = sweepBackendTemp({ tmpRoot: root, uid: 1234, now: NOW, maxAgeHours: 72 });
  ok(!fs.existsSync(path.join(root, "gemini-client-error-Turn.run-2026-06-03T00-28-55-388Z.json")), "stale gemini crash dump is removed");
  ok(fs.existsSync(path.join(root, "gemini-client-error-fresh.json")), "fresh gemini crash dump is spared");
  ok(fs.existsSync(path.join(root, "unrelated-old-file.json")), "non-gemini file at temp root is NEVER touched, even when old");
  ok(r.removed.length === 1, "only the stale gemini dump counts as removed");
  fs.rmSync(root, { recursive: true, force: true });
}

// ── missing claude dir / no uid → graceful no-op ──
{
  const root = fs.mkdtempSync(path.join(os.tmpdir(), "qw-tempclean-test-"));
  const r1 = sweepBackendTemp({ tmpRoot: root, uid: 9999, now: NOW });
  ok(r1.removed.length === 0 && r1.errors.length === 0, "missing claude-{uid} dir → clean no-op");
  const r2 = sweepBackendTemp({ tmpRoot: root, uid: null, now: NOW });
  ok(r2.removed.length === 0 && r2.errors.length === 0, "uid null (Windows) → claude sweep skipped without error");
  fs.rmSync(root, { recursive: true, force: true });
}

// ── sweep never throws even on a bogus tmpRoot ──
{
  const r = sweepBackendTemp({ tmpRoot: "/nonexistent/qw-nope", uid: 1, now: NOW });
  ok(Array.isArray(r.removed) && r.removed.length === 0, "bogus tmpRoot → no-op result, no throw");
}

// ── cleanupSettings: defaults + opt-out + custom age ──
ok(cleanupSettings({}).enabled === true, "settings default: enabled");
ok(cleanupSettings({}).maxAgeHours === DEFAULT_MAX_AGE_HOURS, `settings default: ${DEFAULT_MAX_AGE_HOURS}h`);
ok(cleanupSettings({ temp_cleanup: { enabled: false } }).enabled === false, "temp_cleanup.enabled:false opts out");
ok(cleanupSettings({ temp_cleanup: { max_age_hours: 24 } }).maxAgeHours === 24, "custom max_age_hours respected");
ok(cleanupSettings({ temp_cleanup: { max_age_hours: -5 } }).maxAgeHours === DEFAULT_MAX_AGE_HOURS, "invalid max_age_hours falls back to default");
ok(cleanupSettings(null).enabled === true, "null config → defaults, no throw");

if (failures > 0) {
  console.error(`\n${failures} assertion(s) failed`);
  process.exit(1);
}
console.log("\nall temp-cleanup assertions passed");