From 736f343f287a5f1c8aa4129f2564b3214520d26e Mon Sep 17 00:00:00 2001 From: almogdepaz Date: Wed, 13 May 2026 22:20:33 +0300 Subject: [PATCH] chore: remove broker_stall.md, move content to issue #141 The file was a forensic snapshot of the 2026-05-11 wedge plus a list of unfinished broker investigations. Converted to issue #141 so the action items live in the issue tracker instead of as floating root-level markdown. - broker_stall.md deleted - src/server/backend.ts comment updated to point at issue #141 instead of the file --- broker_stall.md | 79 ------------------------------------------- src/server/backend.ts | 2 +- 2 files changed, 1 insertion(+), 80 deletions(-) delete mode 100644 broker_stall.md diff --git a/broker_stall.md b/broker_stall.md deleted file mode 100644 index 3a93834..0000000 --- a/broker_stall.md +++ /dev/null @@ -1,79 +0,0 @@ -# Broker stall — wedged-but-handshaking state - -## Symptom - -`wolfpack-broker` keeps the unix socket open and answers the initial protocol -handshake, but every subsequent RPC (`list_sessions`, `subscribe`, etc.) hangs -until the 10s client timeout. From the server's point of view the broker looks -"reachable" but is functionally dead. 
- -Server log (snippet): - -``` -"msg":"broker handshake ok" -"msg":"broker reachable" -… -"error":"BrokerRequestTimeoutError: broker request 'list_sessions' timed out after 10000ms" -"error":"BrokerRequestTimeoutError: broker request 'subscribe' timed out after 10000ms" -``` - -## Observed on this machine - -- broker PID 66555, uptime **5 days**, RSS **686MB**, CPU **~24% steady** -- two specific session UUIDs (`9bf96cf8-…`, `423de1e8-…`) trigger - `subscription forwarder lagged broadcast` warnings continuously for 3+ days -- `broker writer failed; closing connection — Broken pipe (os error 32)` repeats - every few hours from 2026-05-08 onward -- launchd-managed `wolfpack` server can't tell broker is wedged because the - handshake succeeds, so it never restarts the connection — instead it loops on - request timeouts forever - -## Likely root causes (Rust, `broker/src/`) - -1. **Handshake and request paths don't share fate.** Broker passes liveness - checks (handshake reply) while the request handler task is stuck or starved. -2. **Peers keep re-entering the EPIPE state.** Connection eviction on write - error already happens — `broker/src/server.rs:307` breaks the writer loop - on the first failure and the connection unwinds. The "every few hours" - cadence in the log is therefore a *new* peer reaching this state each - time, not one wedged forever. Open question is why: candidates include - the RPC dispatcher task blocking long enough that the kernel buffer - drains and the next `write` races a peer close (yielding EPIPE), or the - TS `BrokerClient` reconnecting after a circuit-breaker trip - (`f27d436`). -3. **Subscription forwarder lag loop.** When a subscriber falls behind on the - broadcast channel, broker logs `lagged broadcast` and tries to re-notify, - but nothing actually drains or evicts the slow subscriber — so it lags - again immediately. Plausible source of steady CPU + RSS growth. 
- -## Recovery (manual, no code change) - -```sh -launchctl kickstart -k gui/$(id -u)/com.wolfpack.broker -launchctl kickstart -k gui/$(id -u)/com.wolfpack.server # server's socket is now dead, must reattach -``` - -After kickstart the new broker is at PID 23926 (3MB RSS, 0% CPU), server -re-handshakes cleanly, `/api/sessions` returns 200. - -## Not yet investigated - -- Whether a single misbehaving subscriber can wedge the broker for all clients, - or whether the wedge is per-connection. -- Whether the lag-broadcast warnings precede the wedge or are independent. -- Whether the leaked RSS is in the ring buffer, the broadcast channel backlog, - or somewhere else. - -## Action items - -- [ ] Add a request-path liveness probe (current handshake-only check is - insufficient). -- [ ] Investigate why peers re-enter the EPIPE state every few hours. - Eviction on write error already happens (`server.rs:307`); the gap is - diagnostic, not mechanical. Cross-reference timestamps with TS-side - circuit-breaker trips from `f27d436`. -- [ ] Decide a policy for chronically-lagging subscribers: force-disconnect or - drop their backlog. Continuous "notify to re-subscribe" with no - enforcement is a busy loop. -- [ ] Add a metric / log line on broker startup so we can correlate wedges - against broker uptime. diff --git a/src/server/backend.ts b/src/server/backend.ts index cf9b9ec..9b04493 100644 --- a/src/server/backend.ts +++ b/src/server/backend.ts @@ -179,7 +179,7 @@ export class BackendRouter implements SessionBackend { * that flips `_brokerAvailable` to false at startup or after a circuit- * breaker trip keeps the server returning "broker backend unavailable" * until the next manual restart — the 8-hour zombie state observed - * 2026-05-11. See broker_stall.md. + * 2026-05-11. See issue #141 for the investigation of the wedge itself. */ private startWatchdog(intervalMs: number = BROKER_WATCHDOG_INTERVAL_MS): void { if (this.watchdogTimer) return;