Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/install-vps.md
Original file line number Diff line number Diff line change
Expand Up @@ -362,3 +362,12 @@ chmod 600 ~/.quadwork/.env
14. Configure nginx reverse proxy + SSL
15. Add HTTP basic auth
16. Verify reboot survival: `sudo reboot`, then check `pm2 list`

## Note: /tmp quotas and Claude temp

Some VPS images mount `/tmp` with a per-user quota (`usrquota`). Claude Code
accumulates temp under `/tmp/claude-{uid}`; if the quota fills up, every
Claude bash command starts failing silently with exit 1 (see
[troubleshooting](troubleshooting.md#every-claude-bash-command-fails-silently-exit-1-no-output)).
QuadWork sweeps stale entries automatically (hourly, at boot, and on agent
teardown; entries older than 72h) — configurable via `temp_cleanup` in
`~/.quadwork/config.json`.
26 changes: 26 additions & 0 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,29 @@ chmod 440 /etc/sudoers.d/quadwork

See the [VPS Installation Guide](install-vps.md#step-2-create-non-root-user-critical) for full setup.


## Every Claude bash command fails silently (exit 1, no output)

**Symptom:** every bash command a `claude` agent runs — even `true` — returns
exit 1 with empty stdout/stderr. The Read tool still works, a plain shell on
the host works, and `codex` agents are unaffected. Easily mistaken for a
bwrap/AppArmor/sandbox failure.

**Cause:** Claude Code keeps its temp under `/tmp/claude-{uid}` and never
cleans it up. On hosts where `/tmp` is mounted with a per-user quota
(`usrquota`), that dir grows until the quota is exhausted — after which Claude
can't write the temp files it needs before executing any command (#957).

**Check:** `du -sh /tmp/claude-$(id -u)` and try `dd if=/dev/zero
of=/tmp/probe bs=1M count=10` — a "Disk quota exceeded" error confirms it.
Remove the probe afterwards (`rm -f /tmp/probe`) so it does not count against
the quota itself.

**Fix:** QuadWork sweeps stale backend temp automatically (hourly, at boot,
and on agent teardown; entries older than 72h). If you hit the quota *before*
a sweep (e.g. the server was down), clear it manually:
`find /tmp/claude-$(id -u) -mindepth 1 -maxdepth 1 -mmin +60 -exec rm -rf {} +`

Tune or disable via `~/.quadwork/config.json`:

```json
{ "temp_cleanup": { "enabled": true, "max_age_hours": 72 } }
```
37 changes: 37 additions & 0 deletions server/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const fileChat = require("./file-chat");
const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher");
const { runAcMigration } = require("./migrate-ac");
const selfHeal = require("./self-heal");
const tempCleanup = require("./temp-cleanup");
const { injectModeForCommand } = require("../src/lib/injectMode.js");

const net = require("net");
Expand Down Expand Up @@ -623,6 +624,34 @@ async function stopAgentSession(key, { clearSelfHeal = false } = {}) {
session.exitedUnexpectedly = false;
const [projectId, agentId] = key.split("/");
if (projectId && agentId) stopMcpProxy(projectId, agentId);
// #957: teardown is a known-safe moment to sweep stale backend temp.
// Deferred + stale-only, so it never blocks the stop path and never touches
// files another live agent (same shared /tmp/claude-{uid}) still uses.
setImmediate(backendTempSweepTick);
}

// #957: sweep stale backend temp entries (/tmp/claude-{uid}, stray gemini
// crash dumps). On hosts where /tmp has a per-user quota, unbounded Claude
// temp eventually exhausts it and every Claude bash call fails silently with
// exit 1 — see the issue for the full post-mortem. Runs at boot, hourly, and
// on agent teardown; opt-out / age via config.json
// `temp_cleanup: {enabled, max_age_hours}`.
// Guard flag: true while a sweep is executing, so overlapping triggers
// (timer + teardown) collapse into a single run.
let _tempSweepRunning = false;

// Run one stale-temp sweep using the current config. Logs a summary when
// anything was removed and logs every collected error; never throws.
function backendTempSweepTick() {
  if (_tempSweepRunning) return;
  _tempSweepRunning = true;
  try {
    const { enabled, maxAgeHours } = tempCleanup.cleanupSettings(readConfig());
    if (enabled) {
      const outcome = tempCleanup.sweepBackendTemp({ maxAgeHours });
      const removedCount = outcome.removed.length;
      if (removedCount > 0) {
        const noun = removedCount === 1 ? "entry" : "entries";
        console.log(`[temp-cleanup] removed ${removedCount} stale ${noun} (kept ${outcome.kept})`);
      }
      outcome.errors.forEach((detail) => {
        console.error(`[temp-cleanup] ${detail}`);
      });
    }
  } catch (err) {
    console.error(`[temp-cleanup] sweep failed: ${err.message}`);
  } finally {
    // Always release the guard, including on the disabled/early-exit path.
    _tempSweepRunning = false;
  }
}

app.get("/api/agents", (_req, res) => {
Expand Down Expand Up @@ -1816,6 +1845,14 @@ if (!process.env.QUADWORK_SKIP_LISTEN) {
setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS);
}

// #957: stale-temp sweep schedule — once right after startup (a server that
// was down for days should reclaim quota immediately, not an hour later) and
// then every hour. Skipped entirely when QUADWORK_SKIP_LISTEN is set.
const TEMP_SWEEP_INTERVAL_MS = 1000 * 60 * 60;
if (!process.env.QUADWORK_SKIP_LISTEN) {
  setInterval(backendTempSweepTick, TEMP_SWEEP_INTERVAL_MS);
  setImmediate(backendTempSweepTick);
}

// #915: retry deferred reseeds without a server restart. A reseed deferred on
// boot because a project's batch was active used to wait for the next startup —
// stranding a busy project on old seeds indefinitely. autoReseedOnStartup is
Expand Down
129 changes: 129 additions & 0 deletions server/temp-cleanup.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"use strict";

// #957: lifecycle-aware cleanup of agent backend temp dirs.
//
// Claude Code overrides TMPDIR to /tmp/claude-{uid} and never cleans it up.
// On Linux hosts where /tmp carries a per-user quota (usrquota), that dir
// accumulates until the quota is exhausted — at which point EVERY Claude bash
// call fails silently (exit 1, no output) because Claude can't write its
// temp/sandbox-setup files before exec. Gemini similarly strands
// /tmp/gemini-client-error-*.json crash dumps. Codex keeps its state under
// ~/.codex (not /tmp) so it needs no sweep here.
//
// Design constraints:
// - /tmp/claude-{uid} is SHARED by every claude agent of this user, so a
// teardown of one agent must never delete another live agent's files.
// Therefore deletion is strictly age-based ("stale entries only") — the
// same sweep is safe from any call site, teardown or timer.
// - Age uses max(mtime, atime, ctime) like systemd-tmpfiles, so anything a
// live session still touches stays "fresh" and is spared.
// - Never throws: a cleanup bug must not break agent teardown or the server
// boot path. Errors are collected and reported in the result.
// - Cross-platform no-op where it doesn't apply: Windows has no
// process.getuid → the claude dir is skipped; the gemini glob simply
// matches nothing where those files don't exist.

const fs = require("fs");
const os = require("os");
const path = require("path");

// Fallback staleness threshold in hours, used whenever the caller or config
// does not supply a finite, positive max age.
const DEFAULT_MAX_AGE_HOURS = 72;

// Newest of mtime/atime/ctime — "last time anything observed this entry".
// A missing timestamp counts as 0, i.e. as old as possible.
function newestTimeMs(st) {
  return Math.max(st.mtimeMs || 0, st.atimeMs || 0, st.ctimeMs || 0);
}

// Report whether anything below directory `dir` has a timestamp at or after
// `cutoffMs`. Needed because a directory's own timestamps do not change when
// a file nested inside it is rewritten, so the top-level stat alone cannot
// tell whether a session is still using the tree. Symlinks are not followed
// (lstat). Stops at the first fresh item found.
function hasFreshDescendant(dir, cutoffMs) {
  let names;
  try {
    names = fs.readdirSync(dir);
  } catch {
    return false; // unreadable or raced away — nothing provably fresh
  }
  for (const name of names) {
    const child = path.join(dir, name);
    let st;
    try {
      st = fs.lstatSync(child);
    } catch {
      continue; // raced away mid-walk
    }
    if (newestTimeMs(st) >= cutoffMs) return true;
    if (st.isDirectory() && hasFreshDescendant(child, cutoffMs)) return true;
  }
  return false;
}

// Remove `entry` (file or dir) if it is stale: its own newest timestamp is
// older than the cutoff and, for a directory, so is everything inside it.
//
//   entry    - absolute path to inspect
//   cutoffMs - epoch ms; timestamps >= this are "fresh"
//   result   - accumulator { removed: string[], kept: number, errors: string[] }
//              mutated in place
//
// Returns "removed" | "kept" | "error". Never throws: failures are appended
// to result.errors. An entry that vanished before it could be inspected
// returns "kept" without touching the counters.
function removeIfStale(entry, cutoffMs, result) {
  let st;
  try {
    st = fs.lstatSync(entry);
  } catch (err) {
    if (err && err.code === "ENOENT") return "kept"; // raced away — already gone
    // Anything else (e.g. EACCES) is a real problem worth reporting.
    result.errors.push(`${entry}: ${err && err.message ? err.message : err}`);
    return "error";
  }
  const fresh = newestTimeMs(st) >= cutoffMs
    || (st.isDirectory() && hasFreshDescendant(entry, cutoffMs));
  if (fresh) {
    result.kept += 1;
    return "kept";
  }
  try {
    fs.rmSync(entry, { recursive: true, force: true });
    result.removed.push(entry);
    return "removed";
  } catch (err) {
    result.errors.push(`${entry}: ${err.message}`);
    return "error";
  }
}

// Sweep stale backend temp entries: everything directly under
// <tmpRoot>/claude-<uid>, plus gemini-client-error-*.json files directly in
// <tmpRoot>. Options (all injectable for tests):
//   maxAgeHours - staleness threshold in hours; non-finite or <= 0 falls back
//                 to DEFAULT_MAX_AGE_HOURS
//   tmpRoot     - temp root (default os.tmpdir())
//   uid         - numeric uid for the claude-{uid} dir. Omitted → process.getuid()
//                 where available; explicit null → skip the claude sweep
//   now         - epoch ms (default Date.now())
// Returns { removed: string[], kept: number, errors: string[] }.
// Never throws: unexpected failures are appended to `errors`.
function sweepBackendTemp(opts = {}) {
  const result = { removed: [], kept: 0, errors: [] };

  // List a directory. A missing path (or a non-directory in its place) just
  // means there is nothing to sweep; any other failure is reported.
  const listDir = (dir) => {
    try {
      return fs.readdirSync(dir);
    } catch (err) {
      const code = err && err.code;
      if (code !== "ENOENT" && code !== "ENOTDIR") {
        result.errors.push(`${dir}: ${err && err.message ? err.message : err}`);
      }
      return [];
    }
  };

  try {
    const maxAgeHours = Number.isFinite(opts.maxAgeHours) && opts.maxAgeHours > 0
      ? opts.maxAgeHours
      : DEFAULT_MAX_AGE_HOURS;
    const tmpRoot = opts.tmpRoot || os.tmpdir();
    const now = typeof opts.now === "number" ? opts.now : Date.now();
    const cutoffMs = now - maxAgeHours * 60 * 60 * 1000;

    // Resolve the uid. An explicit null means "no uid" (e.g. simulating a
    // platform without process.getuid) and must not fall back to the real one.
    let uid = null;
    if (typeof opts.uid === "number") {
      uid = opts.uid;
    } else if (opts.uid !== null && typeof process.getuid === "function") {
      uid = process.getuid();
    }

    // Claude: entries directly under /tmp/claude-{uid}. The dir itself is
    // kept (claude recreates it anyway; deleting it while a session spawns
    // would be a race for no benefit).
    if (uid !== null) {
      const claudeDir = path.join(tmpRoot, `claude-${uid}`);
      for (const name of listDir(claudeDir)) {
        removeIfStale(path.join(claudeDir, name), cutoffMs, result);
      }
    }

    // Gemini: stray crash dumps written directly to the temp root. Only
    // names matching the crash-dump pattern are ever considered.
    for (const name of listDir(tmpRoot)) {
      if (name.startsWith("gemini-client-error-") && name.endsWith(".json")) {
        removeIfStale(path.join(tmpRoot, name), cutoffMs, result);
      }
    }
  } catch (err) {
    result.errors.push(String(err && err.message ? err.message : err));
  }
  return result;
}

// Derive the effective sweep settings from a parsed config.json object.
// Reads the optional top-level `temp_cleanup` block; a missing config or
// block yields the defaults. Only a literal `enabled: false` disables the
// sweep, and `max_age_hours` is honoured only when finite and positive
// (otherwise DEFAULT_MAX_AGE_HOURS applies).
// Returns { enabled: boolean, maxAgeHours: number }.
function cleanupSettings(cfg) {
  const block = (cfg && cfg.temp_cleanup) || {};
  const { enabled, max_age_hours: hours } = block;
  const hasValidAge = Number.isFinite(hours) && hours > 0;
  return {
    enabled: enabled !== false,
    maxAgeHours: hasValidAge ? hours : DEFAULT_MAX_AGE_HOURS,
  };
}

module.exports = {
sweepBackendTemp,
cleanupSettings,
DEFAULT_MAX_AGE_HOURS,
};
107 changes: 107 additions & 0 deletions server/temp-cleanup.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"use strict";

// #957: unit tests for the stale backend-temp sweep. All filesystem work
// happens inside a throwaway fixture dir; uid/now/tmpRoot are injected so
// nothing touches the real /tmp/claude-{uid}.

const fs = require("fs");
const os = require("os");
const path = require("path");
const { sweepBackendTemp, cleanupSettings, DEFAULT_MAX_AGE_HOURS } = require("./temp-cleanup");

// Running count of failed assertions; checked once at the end of the script.
let failures = 0;

// Minimal assertion helper: prints PASS/FAIL with the message and bumps the
// failure counter on a falsy condition. Never throws, so later checks still run.
function ok(cond, msg) {
  if (!cond) {
    failures += 1;
    console.error(` FAIL: ${msg}`);
    return;
  }
  console.log(` PASS: ${msg}`);
}

const HOUR = 60 * 60 * 1000;
// Injected "now" for the sweep, placed 30 days ahead of the real clock.
// ctime cannot be set by utimes and is always the real wall-clock time of
// fixture creation, so "now" must stay far enough ahead that ctime is always
// older than the 72h cutoff. A hard-coded epoch would stop satisfying that
// once real time caught up with it; anchoring to Date.now() never expires.
// The tests only depend on ages relative to NOW, so they stay deterministic.
const NOW = Date.now() + 30 * 24 * HOUR;

// Create a throwaway temp root containing an empty claude-1234 directory.
// Returns { root, claudeDir }; the caller is responsible for removing root.
function makeFixture() {
  const prefix = path.join(os.tmpdir(), "qw-tempclean-test-");
  const root = fs.mkdtempSync(prefix);
  const claudeDir = path.join(root, "claude-1234");
  fs.mkdirSync(claudeDir);
  return { root, claudeDir };
}

// Create a fixture entry at `p` and backdate it to `ageHours` before NOW.
//   dir: true  → `p` becomes a directory holding a single inner.txt
//   dir: false → `p` becomes a plain file
// `content` is what gets written to the file (or to inner.txt).
function touch(p, ageHours, { dir = false, content = "x" } = {}) {
  if (dir) fs.mkdirSync(p, { recursive: true });
  const payloadPath = dir ? path.join(p, "inner.txt") : p;
  fs.writeFileSync(payloadPath, content);
  // utimes sets atime+mtime only. ctime cannot be set and stays at the real
  // creation time, which is far in the past relative to the fake NOW — so
  // the injected-now cutoff is still decided by atime/mtime here.
  const stamp = new Date(NOW - ageHours * HOUR);
  fs.utimesSync(p, stamp, stamp);
}

// ── stale vs fresh entries under claude-{uid} ──
{
  const { root, claudeDir } = makeFixture();
  const oldFile = path.join(claudeDir, "old-file");
  const oldDir = path.join(claudeDir, "old-dir");
  const freshFile = path.join(claudeDir, "fresh-file");

  touch(oldFile, 100);
  touch(oldDir, 100, { dir: true });
  touch(freshFile, 1);
  // Backdate the directory again now that inner.txt exists, so the
  // directory's own mtime is old too.
  const staleStamp = new Date(NOW - 100 * HOUR);
  fs.utimesSync(oldDir, staleStamp, staleStamp);

  const r = sweepBackendTemp({ tmpRoot: root, uid: 1234, now: NOW, maxAgeHours: 72 });
  ok(!fs.existsSync(oldFile), "stale file under claude-{uid} is removed");
  ok(!fs.existsSync(oldDir), "stale directory is removed recursively");
  ok(fs.existsSync(freshFile), "fresh file is spared");
  ok(fs.existsSync(claudeDir), "the claude-{uid} dir itself is kept");
  const countsMatch = r.removed.length === 2 && r.kept === 1 && r.errors.length === 0;
  ok(countsMatch, "result counts removed=2 kept=1 errors=0");
  fs.rmSync(root, { recursive: true, force: true });
}

// ── gemini crash dumps at the temp root ──
{
  const { root } = makeFixture();
  const staleDump = path.join(root, "gemini-client-error-Turn.run-2026-06-03T00-28-55-388Z.json");
  const freshDump = path.join(root, "gemini-client-error-fresh.json");
  const bystander = path.join(root, "unrelated-old-file.json");

  touch(staleDump, 100);
  touch(freshDump, 1);
  touch(bystander, 100);

  const r = sweepBackendTemp({ tmpRoot: root, uid: 1234, now: NOW, maxAgeHours: 72 });
  ok(!fs.existsSync(staleDump), "stale gemini crash dump is removed");
  ok(fs.existsSync(freshDump), "fresh gemini crash dump is spared");
  ok(fs.existsSync(bystander), "non-gemini file at temp root is NEVER touched, even when old");
  ok(r.removed.length === 1, "only the stale gemini dump counts as removed");
  fs.rmSync(root, { recursive: true, force: true });
}

// ── missing claude dir / no uid → graceful no-op ──
{
  const prefix = path.join(os.tmpdir(), "qw-tempclean-test-");
  const root = fs.mkdtempSync(prefix);

  const r1 = sweepBackendTemp({ tmpRoot: root, uid: 9999, now: NOW });
  const r1Clean = r1.removed.length === 0 && r1.errors.length === 0;
  ok(r1Clean, "missing claude-{uid} dir → clean no-op");

  const r2 = sweepBackendTemp({ tmpRoot: root, uid: null, now: NOW });
  const r2Clean = r2.removed.length === 0 && r2.errors.length === 0;
  ok(r2Clean, "uid null (Windows) → claude sweep skipped without error");

  fs.rmSync(root, { recursive: true, force: true });
}

// ── sweep never throws even on a bogus tmpRoot ──
{
  const bogusRoot = "/nonexistent/qw-nope";
  const r = sweepBackendTemp({ tmpRoot: bogusRoot, uid: 1, now: NOW });
  const isNoop = Array.isArray(r.removed) && r.removed.length === 0;
  ok(isNoop, "bogus tmpRoot → no-op result, no throw");
}

// ── cleanupSettings: defaults + opt-out + custom age ──
{
  const defaults = cleanupSettings({});
  ok(defaults.enabled === true, "settings default: enabled");
  ok(defaults.maxAgeHours === DEFAULT_MAX_AGE_HOURS, `settings default: ${DEFAULT_MAX_AGE_HOURS}h`);

  const optedOut = cleanupSettings({ temp_cleanup: { enabled: false } });
  ok(optedOut.enabled === false, "temp_cleanup.enabled:false opts out");

  const customAge = cleanupSettings({ temp_cleanup: { max_age_hours: 24 } });
  ok(customAge.maxAgeHours === 24, "custom max_age_hours respected");

  const invalidAge = cleanupSettings({ temp_cleanup: { max_age_hours: -5 } });
  ok(invalidAge.maxAgeHours === DEFAULT_MAX_AGE_HOURS, "invalid max_age_hours falls back to default");

  const fromNull = cleanupSettings(null);
  ok(fromNull.enabled === true, "null config → defaults, no throw");
}

// Final verdict: a non-zero exit code when any assertion failed, otherwise
// a success line.
const anyFailed = failures > 0;
if (anyFailed) {
  console.error(`\n${failures} assertion(s) failed`);
  process.exit(1);
}
console.log("\nall temp-cleanup assertions passed");