From aec12f16bfe807b66780aa47212a9de879c335e8 Mon Sep 17 00:00:00 2001 From: Vladimir Rogojin Date: Sun, 24 May 2026 13:47:58 +0200 Subject: [PATCH] fix(cli)(sphere-sdk#247): refuse CLI when a daemon holds the OrbitDB lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The daemon parks the event loop forever with OrbitDB / Helia open; LevelDB takes a POSIX advisory file lock (fcntl(F_SETLK)) on <dataDir>/orbitdb/<db>/_index/LOCK and on <dataDir>/datastore/LOCK. A sibling CLI in the same dataDir hits LEVEL_LOCKED -> 'Database is not open', and the bounded retry from sphere-sdk PR #246 can never succeed (the contention isn't transient). This short-term gate detects the live-daemon case in getSphere() and exits with EX_TEMPFAIL, telling the operator to 'sphere daemon stop' first. Skipped when our own PID owns the PID file (daemon-start calling back into getSphere is the legitimate owner). Bypassed for daemon stop/status (which don't go through getSphere). The proper fix is a daemon-as-broker IPC surface (sphere-sdk #247 long-term: Unix domain socket at <dataDir>/.sphere-cli/daemon.sock, RemoteOrbitDbAdapter mirroring the OrbitDbAdapter interface). Until then, this stops the script-level cascade observed at §C.4 in manual-test-full-recovery.sh. Exports readPidFile and isDaemonProcessAlive from daemon.ts so legacy-cli.ts can reuse them without duplication. --- src/legacy/daemon.ts | 4 ++-- src/legacy/legacy-cli.ts | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/legacy/daemon.ts b/src/legacy/daemon.ts index 09c8bd2..fc589e4 100644 --- a/src/legacy/daemon.ts +++ b/src/legacy/daemon.ts @@ -42,7 +42,7 @@ interface PidFileData { * Parse a PID file. Handles both the new JSON format and the legacy plain-text * format (just a number). Returns null on parse failure or missing file. 
 */ -function readPidFile(pidFile: string): PidFileData | null { +export function readPidFile(pidFile: string): PidFileData | null { let raw: string; try { raw = fs.readFileSync(pidFile, 'utf8').trim(); @@ -81,7 +81,7 @@ function readPidFile(pidFile: string): PidFileData | null { * Returns false for dead PIDs and for PIDs that are alive but clearly not ours * (i.e. PID reuse case). */ -function isDaemonProcessAlive(pid: number): boolean { +export function isDaemonProcessAlive(pid: number): boolean { if (!isProcessAlive(pid)) return false; // Best-effort PID reuse detection via /proc/<pid>/comm (Linux only). try { diff --git a/src/legacy/legacy-cli.ts b/src/legacy/legacy-cli.ts index 22a852d..16fc81c 100644 --- a/src/legacy/legacy-cli.ts +++ b/src/legacy/legacy-cli.ts @@ -7,6 +7,8 @@ import * as fs from 'fs'; import * as path from 'path'; import * as readline from 'readline'; +import { readPidFile, isDaemonProcessAlive } from './daemon.js'; +import { getDefaultPidFile } from './daemon-config.js'; // `encrypt`, `decrypt`, `hexToWIF`, `generatePrivateKey`, and // `generateAddressFromMasterKey` are no longer top-level exports of // @unicitylabs/sphere-sdk — they live in the L1 (alpha-chain) namespace @@ -254,9 +256,52 @@ function createNoopTransport(): TransportProvider { }; } +/** + * Issue #247 short-term gate — refuse to open a Sphere instance when a + * sphere daemon is running against the same wallet directory. + * + * Background: the daemon at `daemon.ts:711` parks the event loop forever + * with OrbitDB / Helia open. LevelDB takes a POSIX advisory file lock + * (`fcntl(F_SETLK)`) on `<dataDir>/orbitdb/<db>/_index/LOCK` and + * on `<dataDir>/datastore/LOCK`. The lock is held until SIGTERM. A + * second process opening the same directory hits `LEVEL_LOCKED` → + * `Database is not open`, and the PR #245/#246 3-attempt retry can + * never succeed because the contention isn't transient. + * + * The proper fix is a daemon-as-broker IPC surface (#247 long-term). 
+ * This short-term gate detects the live-daemon case at CLI entry, exits + * with EX_TEMPFAIL, and tells the operator how to proceed. + * + * Skipped when the current process IS the daemon (PID match) — `daemon + * start` itself calls getSphere via the runDaemon callback to acquire + * its OrbitDB handle, and that path is the legitimate owner. + */ +function checkNoLiveDaemonOrExit(): void { + const pidFile = getDefaultPidFile(); + const pidData = readPidFile(pidFile); + if (!pidData) return; + if (pidData.pid === process.pid) return; // we ARE the daemon + if (!isDaemonProcessAlive(pidData.pid)) return; // stale PID file + process.stderr.write( + `\nA sphere daemon is running (pid=${pidData.pid}) and holds the wallet's\n` + + `OrbitDB / Helia directory lock. CLI commands that open the wallet would\n` + + `fail with "Database is not open" after the bounded retry budget.\n\n` + + `Stop the daemon first:\n` + + ` sphere daemon stop\n\n` + + `Then re-run your command. (#247 follow-up will add a daemon-broker IPC\n` + + `surface so CLI commands can coexist with a running daemon.)\n`, + ); + process.exit(75); // EX_TEMPFAIL — caller can retry after stopping the daemon. +} + async function getSphere(options?: { autoGenerate?: boolean; mnemonic?: string; nametag?: string }): Promise { if (sphereInstance) return sphereInstance; + // Issue #247 — refuse to open Sphere when a daemon already holds the + // OrbitDB / Helia directory lock. Skipped when our own PID owns the + // PID file (i.e. `daemon start` calling back into getSphere). + checkNoLiveDaemonOrExit(); + const config = loadConfig(); // Issue #23 — guard data-mutating bootstrap against legacy file-storage