Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions src/adapters/backend/session-backend-selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,55 @@ import { TmuxPipeBackend } from './tmux-pipe-backend.js';
import { ZellijBackend } from './zellij-backend.js';
import type { BackendType, SessionBackend } from './types.js';

/**
 * Outcome of the backend gate: either the session may be spawned, or it is
 * refused ("gated") together with a human-readable reason.
 */
export type BackendGateDecision = { action: 'spawn' } | { action: 'gate'; reason: string };

/**
 * Hard gate introduced with the PTY retirement.
 *
 * When a *persistent* backend (tmux/herdr/zellij) is requested but is not
 * functional on this host, the session is no longer silently downgraded to raw
 * PTY. That quiet downgrade caused the "secretly running on PTY, then hitting
 * every PTY limitation (no survival across daemon restart, etc.)" class of
 * bugs. The worker now refuses to spawn and posts an actionable card instead.
 *
 * PTY is still reachable, but ONLY as an explicit opt-in (`BACKEND_TYPE=pty`
 * or a per-bot `backendType: 'pty'`). Both arrive here as
 * `requested === 'pty'` and always pass straight through.
 *
 * `hasExistingSession` allows an already-running persistent session to be
 * reattached even if the probe failed transiently: a disposable "can we start
 * a new server?" probe is far less authoritative than a live session (see
 * PR#249), and abandoning that session would spawn a duplicate CLI and orphan
 * the real conversation. Callers compute it only for backends whose probe is a
 * disposable session (tmux, zellij). herdr's probe is a cheap, non-destructive
 * `herdr --version`, so herdr callers pass `hasExistingSession: false`.
 *
 * @param opts.requested - Backend type the session asked for.
 * @param opts.available - Result of the caller's availability probe.
 * @param opts.hasExistingSession - Whether a live session already exists.
 * @returns `{ action: 'spawn' }` when the session may start, otherwise
 *   `{ action: 'gate', reason }`.
 */
export function decideBackendGate(opts: {
  requested: BackendType;
  available: boolean;
  hasExistingSession: boolean;
}): BackendGateDecision {
  const { requested, available, hasExistingSession } = opts;

  // Any one of these is sufficient: explicit PTY opt-in, a live session to
  // reattach to, or a backend whose probe succeeded.
  const mayStart = requested === 'pty' || hasExistingSession || available;
  if (!mayStart) {
    return { action: 'gate', reason: `${requested} 后端在本机不可用` };
  }
  return { action: 'spawn' };
}

/**
 * Builds the user-facing card text shown when {@link decideBackendGate} gates
 * a session.
 *
 * The card has four newline-separated lines: a headline naming the unavailable
 * backend, the failure reason, an install/repair hint (concrete package-manager
 * commands for tmux, a generic hint for any other backend), and a note about
 * the explicit `BACKEND_TYPE=pty` escape hatch.
 *
 * @param backend - Backend that was requested but found unavailable.
 * @param reason - Human-readable cause, embedded verbatim in the card.
 * @returns The card text.
 */
export function backendGateUserMessage(backend: BackendType, reason: string): string {
  let installHint: string;
  if (backend === 'tmux') {
    installHint =
      'macOS: brew install tmux | Debian/Ubuntu: sudo apt-get install -y tmux | 其它发行版用对应包管理器安装 tmux';
  } else {
    installHint = `请确认 ${backend} 已正确安装并可用`;
  }

  const headline = `⚠️ 本机 ${backend} 不可用,无法启动会话。`;
  const cause = `原因:${reason}`;
  const remedy = `请安装/修复后重试 —— ${installHint}`;
  // Kept as two concatenated pieces so the escape-hatch note stays on ONE card line.
  const escapeHatch =
    `(如确需在没有 ${backend} 的环境运行,可显式设置环境变量 BACKEND_TYPE=pty 用 PTY 后端兜底;` +
    `但 PTY 会话不跨 daemon 重启存活,仅作应急。)`;

  return `${headline}\n${cause}\n${remedy}\n${escapeHatch}`;
}

export interface SelectedSessionBackend {
backend: SessionBackend;
isTmuxMode: boolean;
Expand Down
19 changes: 11 additions & 8 deletions src/config.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { networkInterfaces } from 'node:os';
import type { BackendType } from './adapters/backend/types.js';
import { probeTmuxFunctional } from './setup/ensure-tmux.js';
import { resolveWorkerHttpHost } from './utils/worker-http.js';

/** Get the first non-loopback IPv4 address, fallback to localhost. */
Expand All @@ -26,15 +25,19 @@ export function getDashboardExternalHost(): string {
}

/**
* Pick the session backend. tmux is preferred (enables /adopt + per-client
* Web terminal attach) but only if it can actually start a server. The old
* check was `tmux -V`, which passes on machines where tmux is installed but
* broken (perms / config / linkage) and leaves the worker spamming "error
* connecting to /tmp/tmux-UID/default" forever. The functional probe filters
* those out so we silently fall back to PTY.
* Default session backend: always tmux (PTY 退役).
*
* PTY is no longer an automatic fallback. It used to be picked here whenever
* the tmux functional probe failed, which meant hosts with a broken/missing
* tmux silently ran on PTY — and then hit PTY's whole problem set (no survival
* across daemon restart, scrollback-relay quirks, …) without the user ever
* knowing they'd been downgraded. Now the default is tmux unconditionally; if
* tmux isn't functional the worker hard-gates the session with an actionable
* card (see decideBackendGate). PTY remains reachable only via an explicit
* BACKEND_TYPE=pty (or per-bot backendType:'pty') opt-in.
*/
function detectDefaultBackend(): Exclude<BackendType, 'herdr'> {
return probeTmuxFunctional().ok ? 'tmux' : 'pty';
return 'tmux';
}

// Computed once: the packaged fallback data dir. The effective dir is read
Expand Down
31 changes: 21 additions & 10 deletions src/core/persistent-backend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
* worker-pool and session-manager import it, and those two already form an
* import cycle with each other.
*/
import { config } from '../config.js';
import { getBot } from '../bot-registry.js';
import { TmuxBackend } from '../adapters/backend/tmux-backend.js';
import { HerdrBackend } from '../adapters/backend/herdr-backend.js';
Expand All @@ -28,19 +27,31 @@ export function isSuspendableBackendType(
}

/**
* Resolve which persistent backend (if any) backs a session: prefer the
* worker's stored init config — the per-session truth captured at spawn time,
* which survives idle-suspend and tracks bot-config drift — then the bot
* config, then the daemon default (covers lazy-restored sessions that never
* forked a worker, where initConfig is unset).
* Resolve which persistent backend (if any) backs a session.
*
* Precedence, most authoritative first:
* 1. `ds.initConfig?.backendType` — the live worker's resolved backend this run.
* 2. `ds.session.backendType` — the backend stamped on the persisted session
* at spawn time (survives daemon restart; see Session.backendType).
* 3. An explicit per-bot `backendType` — authoritative even for legacy
* sessions, since the bot's choice didn't change across the PTY退役 flip.
*
* If NONE of those resolve, the session predates backendType stamping AND its
* bot pins no backend, so it ran on the OLD probe-based daemon default — which
* could have been PTY on a tmux-less host. We deliberately do NOT fall back to
* the current `config.daemon.backendType` (now always tmux): doing so would
* make `restoreActiveSessions` probe for a `bmx-<sid>` pane that never existed,
* find it 'missing', and zombie-close a perfectly recoverable session. Treating
* it as non-persistent keeps the worker-less active record for lazy resume; a
* genuinely surviving tmux pane still reattaches lazily on the next message
* (and gets stamped then).
*/
export function getSessionPersistentBackendType(ds: DaemonSession): PersistentBackendType | undefined {
let backendType: BackendType | undefined = ds.initConfig?.backendType;
let backendType: BackendType | undefined = ds.initConfig?.backendType ?? ds.session.backendType;
if (!backendType) {
backendType = config.daemon.backendType;
try {
backendType = getBot(ds.larkAppId).config.backendType ?? backendType;
} catch { /* bot deregistered — keep daemon default */ }
backendType = getBot(ds.larkAppId).config.backendType;
} catch { /* bot deregistered */ }
}
return isSuspendableBackendType(backendType) ? backendType : undefined;
}
Expand Down
12 changes: 12 additions & 0 deletions src/core/worker-pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1679,6 +1679,18 @@ export function forkWorker(ds: DaemonSession, prompt: string, resume = false): v
sessionStore.updateSession(ds.session);
}

// Stamp the resolved backend on the persisted session. Since PTY退役, the
// worker no longer silently downgrades an unavailable backend (it hard-gates
// instead), so the requested backend here IS the effective one for any
// session that actually runs. Restore reads this back (see
// getSessionPersistentBackendType) so an upgraded daemon doesn't re-derive a
// session's backend from the now-always-tmux default and misclassify a legacy
// PTY session as a tmux zombie.
if (ds.session.backendType !== initMsg.backendType) {
ds.session.backendType = initMsg.backendType;
sessionStore.updateSession(ds.session);
}

// Use shared handler for IPC messages and exit
setupWorkerHandlers(ds, worker);

Expand Down
7 changes: 4 additions & 3 deletions src/setup/ensure-zellij.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
*
* Zellij is an OPT-IN backend (BACKEND_TYPE=zellij) — there's no auto-install
* here (tmux stays the default). We only need a functional probe so the worker
* can fall back to PTY when zellij is requested but unusable, and an env
* sanitiser so nested-session vars don't make our `zellij` calls target the
* wrong server.
* can hard-gate the session (post an actionable card, refuse to start) when
* zellij is requested but unusable — it no longer silently falls back to PTY —
* and an env sanitiser so nested-session vars don't make our `zellij` calls
* target the wrong server.
*
* The driveable automation surface (action write/dump-screen/list-panes --json,
* headless `attach --create-background`) landed in zellij 0.40–0.44; we require
Expand Down
24 changes: 18 additions & 6 deletions src/setup/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,30 @@ function herdrCliIds(): CliId[] {
export async function ensureDependencies(): Promise<DependenciesReport> {
const platform = detectPlatform();

// tmux: nice-to-have (enables /adopt + multi-pane Web terminal). Daemon
// still works on PTY backend without it, so failure is a warning, not fatal.
// tmux: REQUIRED (PTY 退役). PTY is no longer an automatic fallback, so a
// host without functional tmux can't start sessions unless the operator
// explicitly opts into the PTY escape hatch (BACKEND_TYPE=pty). Surface this
// loudly instead of pretending "常规对话不受影响" — that was true under the
// old silent-fallback behavior and is now misleading.
const tmux = await ensureTmux(platform);
const ptyOptIn = (process.env.BACKEND_TYPE ?? '').toLowerCase() === 'pty';
if (tmux.installed) {
if (!tmux.freshInstall) console.log(`✓ tmux ${tmux.version} (existing)`);
} else {
} else if (ptyOptIn) {
console.warn('');
console.warn('⚠️ tmux 不可用,已退回到 PTY backend');
console.warn('⚠️ tmux 不可用,但已显式设置 BACKEND_TYPE=pty —— 将使用 PTY 后端兜底。');
console.warn(` 原因:${tmux.reason ?? '未知'}`);
if (tmux.manualCommand) console.warn(` 手动尝试:${tmux.manualCommand}`);
console.warn(' 影响:/adopt(接管已有 CLI 会话)和多人 Web 终端不可用;常规对话不受影响。');
console.warn(' 注意:PTY 会话不跨 daemon 重启存活,/adopt 与多人 Web 终端不可用。');
console.warn('');
} else {
console.error('');
console.error('❌ tmux 不可用,botmux 会话将无法启动。');
console.error(` 原因:${tmux.reason ?? '未知'}`);
if (tmux.manualCommand) console.error(` 请安装:${tmux.manualCommand}`);
console.error(' 安装好 tmux 后重试即可。');
console.error(' (如确需在没有 tmux 的环境运行,可显式设置环境变量 BACKEND_TYPE=pty 用 PTY 后端兜底,');
console.error(' 但 PTY 会话不跨 daemon 重启存活,仅作应急。)');
console.error('');
}

// Fonts second — best-effort.
Expand Down
12 changes: 12 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,18 @@ export interface Session {
suspendedColdResume?: boolean;
/** CLI used to spawn this session — stamped on every save so closed sessions retain it. */
cliId?: import('./adapters/cli/types.js').CliId;
/**
* Session backend resolved AT SPAWN TIME (tmux/herdr/zellij/pty). Stamped on
* fork so restore can resolve the backend authoritatively from the session
* itself instead of re-deriving it from the live daemon default — which
* changed when PTY stopped being an automatic fallback (default is now always
* tmux). Without this, a session that was created under the old probe-based
* default (e.g. implicit PTY on a tmux-less host) would, after upgrade +
* restart, be misread as tmux and zombie-closed because no `bmx-<sid>` pane
* exists. Undefined on sessions persisted before this field existed → treated
* conservatively (see getSessionPersistentBackendType).
*/
backendType?: BackendType;
/**
* Sandbox decision RECORDED AT SESSION CREATION (overlay file-isolation). The
* live bot flag (BotConfig.sandbox) can be toggled later, but a session's
Expand Down
73 changes: 51 additions & 22 deletions src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,10 @@ import { ZellijBackend, ZELLIJ_CONFIG_KDL } from './adapters/backend/zellij-back
import { ZellijObserveBackend } from './adapters/backend/zellij-observe-backend.js';
import { zellijEnv } from './setup/ensure-zellij.js';
import { isObserveBackend, type ObserveBackend } from './adapters/backend/types.js';
import { selectSessionBackend } from './adapters/backend/session-backend-selector.js';
import { selectSessionBackend, decideBackendGate, backendGateUserMessage } from './adapters/backend/session-backend-selector.js';
import { prepareSandbox, attachSandboxOutbox, startOutboxWatcher, sandboxEnabled, sandboxedClaudeDataDir } from './adapters/backend/sandbox.js';
import type { BackendType, SessionBackend } from './adapters/backend/types.js';
import { tmuxEnv } from './setup/ensure-tmux.js';
import { tmuxEnv, probeTmuxFunctional } from './setup/ensure-tmux.js';
import { IdleDetector } from './utils/idle-detector.js';
import { ScreenAnalyzer } from './utils/screen-analyzer.js';
import { captureToPng } from './utils/screenshot-renderer.js';
Expand Down Expand Up @@ -3765,28 +3765,57 @@ function spawnCli(cfg: Extract<DaemonToWorker, { type: 'init' }>): void {
}

cliAdapter = createCliAdapterSync(cfg.cliId as any, cfg.cliPathOverride);
// backendType=tmux trust-but-verify: an explicit per-bot config (or
// BACKEND_TYPE=tmux env override) bypasses config.ts's auto-detect, so
// the worker re-probes here. Existing botmux tmux sessions are more
// authoritative than the disposable "can we create a new server?" probe:
// abandoning one after a transient probe failure would spawn a duplicate CLI
// under raw PTY and make later submits invisible to the real Codex history.
// backendType trust-but-verify + HARD GATE (PTY 退役): an explicit per-bot
// config (or BACKEND_TYPE env override) bypasses config.ts's default, so the
// worker re-probes the requested persistent backend here. A requested
// tmux/herdr/zellij backend that isn't functional NO LONGER silently
// degrades to raw PTY — that silent fallback was the root of the "secretly
// running on PTY, then hitting all of PTY's problems" bug class. Instead we
// refuse to spawn and post an actionable card (user_notify) telling the user
// to install the backend, or to explicitly opt into PTY with BACKEND_TYPE=pty.
//
// Existing botmux sessions stay authoritative over the disposable "can we
// create a new server?" probe: a live session reattaches regardless of a
// transient probe failure (PR#249), so it's exempt from the gate.
let effectiveBackend = cfg.backendType;
if (effectiveBackend === 'tmux') {
const existingSessionName = TmuxBackend.sessionName(cfg.sessionId);
const hasExistingSession = TmuxBackend.hasSession(existingSessionName);
if (!hasExistingSession && !TmuxBackend.isAvailable()) {
log('tmux backend requested but functional probe failed and no existing session is available — falling back to PTY backend');
effectiveBackend = 'pty';
{
let available = true;
let reason = '';
let hasExistingSession = false;
if (effectiveBackend === 'tmux') {
hasExistingSession = TmuxBackend.hasSession(TmuxBackend.sessionName(cfg.sessionId));
if (!hasExistingSession) {
const probe = probeTmuxFunctional();
available = probe.ok;
if (!probe.ok) reason = probe.reason;
}
} else if (effectiveBackend === 'zellij') {
// Like tmux, zellij's probe is a disposable background session, so a
// live named session is more authoritative than a transient probe
// failure (PR#249 semantics) — check it first so we reattach, not gate.
hasExistingSession = ZellijBackend.hasSession(ZellijBackend.sessionName(cfg.sessionId));
if (!hasExistingSession) {
available = ZellijBackend.isAvailable();
reason = 'zellij 功能性探针失败(需 zellij >= 0.44)';
}
} else if (effectiveBackend === 'herdr') {
// herdr's isAvailable() is a cheap, non-destructive `herdr --version`
// (not a disposable session probe), so it has no PR#249 false-negative
// risk and needs no existing-session exemption.
available = HerdrBackend.isAvailable();
reason = 'herdr 功能性探针失败';
}
const decision = decideBackendGate({ requested: effectiveBackend, available, hasExistingSession });
if (decision.action === 'gate') {
const detail = reason || decision.reason;
log(`${effectiveBackend} backend unavailable and silent PTY fallback is disabled (set BACKEND_TYPE=pty to opt in): ${detail}`);
// user_notify is delivered to the Lark thread by the daemon (type:'error'
// is log-only); send it BEFORE throwing so the card lands. The throw is
// caught by the init handler, which sends type:'error' and exits — the
// IPC channel flushes these small messages before exit.
send({ type: 'user_notify', message: backendGateUserMessage(effectiveBackend, detail), turnId: cfg.turnId });
throw new Error(`${effectiveBackend} backend unavailable; refusing silent PTY fallback (set BACKEND_TYPE=pty to opt in): ${detail}`);
}
}
if (effectiveBackend === 'herdr' && !HerdrBackend.isAvailable()) {
log('herdr backend requested but probe failed — falling back to PTY backend');
effectiveBackend = 'pty';
}
if (effectiveBackend === 'zellij' && !ZellijBackend.isAvailable()) {
log('zellij backend requested but functional probe failed (need zellij >= 0.44) — falling back to PTY backend');
effectiveBackend = 'pty';
}
effectiveBackendType = effectiveBackend;
const selectedBackend = selectSessionBackend({ sessionId: cfg.sessionId, backendType: effectiveBackend });
Expand Down
45 changes: 45 additions & 0 deletions test/backend-gate.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { describe, it, expect } from 'vitest';
import {
decideBackendGate,
backendGateUserMessage,
} from '../src/adapters/backend/session-backend-selector.js';

// Covers the hard gate's decision table: the explicit PTY opt-in, a healthy
// probe, a failed probe with and without a live session, and the non-tmux
// persistent backends.
describe('decideBackendGate (PTY 退役 hard gate)', () => {
  const spawn = { action: 'spawn' };

  it('always spawns when PTY is explicitly requested (escape hatch), even if "unavailable"', () => {
    const decision = decideBackendGate({ requested: 'pty', available: false, hasExistingSession: false });
    expect(decision).toEqual(spawn);
  });

  it('spawns tmux when the functional probe passes', () => {
    const decision = decideBackendGate({ requested: 'tmux', available: true, hasExistingSession: false });
    expect(decision).toEqual(spawn);
  });

  it('GATES tmux when probe fails and no live session exists (no silent PTY fallback)', () => {
    const { action } = decideBackendGate({ requested: 'tmux', available: false, hasExistingSession: false });
    expect(action).toBe('gate');
  });

  it('reattaches a live tmux session despite a transient probe failure (PR#249 exemption)', () => {
    const decision = decideBackendGate({ requested: 'tmux', available: false, hasExistingSession: true });
    expect(decision).toEqual(spawn);
  });

  it('gates herdr / zellij when unavailable instead of degrading to PTY', () => {
    // Same order as before: herdr is checked first, then zellij.
    for (const requested of ['herdr', 'zellij'] as const) {
      const decision = decideBackendGate({ requested, available: false, hasExistingSession: false });
      expect(decision.action).toBe('gate');
    }
  });
});

// Checks that the gate card carries everything a user needs to act on it.
describe('backendGateUserMessage', () => {
  it('includes the reason, an install hint, and the explicit PTY escape hatch', () => {
    const reason = 'tmux 二进制不在 PATH 上';
    const card = backendGateUserMessage('tmux', reason);

    // Headline, the verbatim reason, the tmux install hint, the PTY opt-in note.
    const expectedFragments = ['tmux 不可用', reason, 'brew install tmux', 'BACKEND_TYPE=pty'];
    for (const fragment of expectedFragments) {
      expect(card).toContain(fragment);
    }
  });
});
Loading