From d932fbeb71103b44aec330b0a1bb861e72a04d6e Mon Sep 17 00:00:00 2001 From: burrows99 Date: Fri, 19 Jun 2026 08:03:54 +0100 Subject: [PATCH] fix(collector): make trace failures self-explanatory in the JSON envelope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a trace run streamed to the dashboard, every failure mode was lossy in exactly the channel an agent consumes (--json + exit code), so a real debugging session spent minutes chasing the wrong cause. Each fix puts the signal where the agent looks. - Collector route: separate fault domains. A dead/misconfigured store now returns 503 STORE_UNAVAILABLE (internal error logged, not leaked) instead of collapsing into the same 400 as a malformed envelope. Every response carries a machine `code` from the shared vocabulary (INGEST / INGEST_INVALID / INGEST_NO_SESSION / STORE), so a caller never has to string-match. A Postgres auth failure is no longer indistinguishable from a schema error. - Emit failures surface. Collector.emit returns a rich EmitResult ({ok,status,body,error}) and logs rejections at error with the reason, instead of logging a 400 at info and swallowing a bare false. The run folds any rejected emit into the printed/--json envelope as an EMIT diagnostic. - 0-hit runs explain themselves. A bound-but-never-hit breakpoint now emits a BP_BOUND_UNHIT diagnostic into the envelope (the JSON counterpart of the renderer's "no breakpoints hit" line), so a --json reader can tell "trigger didn't run" from "wrong line" without grepping server logs. - --concise help states it trims the PRINTED envelope only and does NOT affect --emit (the collector always gets the full envelope) — closing the false lead that toggling it changes the wire payload. - serve prints the exact --emit URL to copy; fix the stale :4747 compose comment (host port is 14747). Adds Code.BP_BOUND_UNHIT, re-exports Code from trace-cli/server for the route, and covers the new emit-result shape + code in output.test.js. CLI + UI typecheck clean; 41/41 tests pass (1 DB round-trip skipped). Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-compose.yml | 2 +- src/cli/Cli.ts | 22 ++++++++++++++++---- src/cli/commands/DynamicCommand.ts | 7 +++++++ src/cli/commands/ServeCommand.ts | 6 +++++- src/collector/Collector.ts | 22 ++++++++++++++++---- src/server.ts | 1 + src/shared/codes.ts | 3 +++ test/output.test.js | 11 ++++++++++ ui/app/v1/traces/route.ts | 33 +++++++++++++++++++++--------- 9 files changed, 87 insertions(+), 20 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6a2135d..9847596 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ # # Bringing the stack up runs the whole end-to-end demo too (Node + React + Chrome targets, all in # one demo container), no host setup — `docker compose up --build` starts everything; `docker compose down` stops it all: -# docker compose up --build # then watch all 3 traces (incl. the React video) at :4747 +# docker compose up --build # then watch all 3 traces (incl. the React video) at :14747 services: dashboard: build: . diff --git a/src/cli/Cli.ts b/src/cli/Cli.ts index 6050bac..75a8016 100644 --- a/src/cli/Cli.ts +++ b/src/cli/Cli.ts @@ -15,7 +15,7 @@ import { ManifestCommand } from "./commands/ManifestCommand.js"; import { SchemaCommand } from "./commands/SchemaCommand.js"; import { ServeCommand } from "./commands/ServeCommand.js"; import { Tracer } from "../engine/Tracer.js"; -import { Collector } from "../collector/Collector.js"; +import { Collector, type EmitResult } from "../collector/Collector.js"; import { S3ArtifactStore } from "../storage/S3ArtifactStore.js"; import { VERSION } from "../shared/version.js"; import { TargetKind } from "../domain/Target.js"; @@ -120,7 +120,10 @@ export class Cli { // dashboard updates live as it runs. const collector = await Collector.resolve(options.emit); let emitChain: Promise = Promise.resolve(); - const emitToCollector = collector ? (envelope: unknown) => { emitChain = emitChain.then(() => Collector.emit(collector, envelope).catch(() => false)); } : undefined; + const emitFailures: EmitResult[] = []; + const emitToCollector = collector + ? (envelope: unknown) => { emitChain = emitChain.then(async () => { const result = await Collector.emit(collector, envelope); if (!result.ok) emitFailures.push(result); }); } + : undefined; const { trace } = await this.#dynamic.run({ target, port, launch, @@ -132,8 +135,19 @@ export class Cli { ...(emitToCollector ? { onProgress: (intermediateTrace: Trace) => emitToCollector(intermediateTrace.toJSON()) } : {}), }); + // Flush the final (complete) envelope and all pending emits BEFORE rendering, so a rejected emit + // (a 400 schema error, a 503 dead store) becomes a visible diagnostic in the printed/--json envelope + // instead of vanishing into an info log — the gap that sent a debugging loop chasing the wrong cause. + if (emitToCollector) { + emitToCollector(trace.toJSON()); + await emitChain; + if (emitFailures.length) { + const last = emitFailures[emitFailures.length - 1]; + const reason = last.status ? `HTTP ${last.status}${last.body ? ` — ${last.body.slice(0, 200)}` : ""}` : (last.error ?? "unknown error"); + trace.diagnostics.push(Diagnostic.warn(Code.EMIT, `collector ${collector} rejected ${emitFailures.length} emit(s): ${reason}`)); + } + } emit(trace, () => this.#dynamic.render(trace), options); - if (emitToCollector) { emitToCollector(trace.toJSON()); await emitChain; } // final, complete envelope; then flush all pending emits process.exit(trace.hasErrors() ? 1 : 0); } @@ -233,7 +247,7 @@ export class Cli { .option("--output ", "chrome: output path for the screen + trace-panel recording (default: a temp file)") .option("--emit ", "stream the trace to a collector (POST /v1/traces) — the session appears live as it runs (default: env TRACE_COLLECTOR_URL, else a locally-running collector is auto-detected)") .option("--json [path]", "envelope as JSON: to a file if a path is given, else to stdout") - .option("--concise", "trim the --json envelope for token-tight agent reads: per hit, locals collapse to key names and the call stack keeps its top 2 frames (watched --expression values, location & timing kept). Re-run --detailed for everything.") + .option("--concise", "trim the PRINTED --json envelope (stdout/file) for token-tight agent reads: per hit, locals collapse to key names and the call stack keeps its top 2 frames (watched --expression values, location & timing kept). Does NOT affect --emit — the collector always receives the full envelope. Re-run --detailed for everything.") .option("--detailed", "full --json envelope: every local's value and the complete call stack at each hit (the default)") .action((options) => this.#runDynamic(options)); diff --git a/src/cli/commands/DynamicCommand.ts b/src/cli/commands/DynamicCommand.ts index 74a3089..3f16464 100644 --- a/src/cli/commands/DynamicCommand.ts +++ b/src/cli/commands/DynamicCommand.ts @@ -110,6 +110,13 @@ export class DynamicCommand extends TraceCommand for (const breakpoint of capture.breakpoints.filter((breakpoint) => !breakpoint.bound)) { diagnostics.push(Diagnostic.warn(Code.BP_UNBOUND, `${breakpoint.file}:${breakpoint.line} did not bind${breakpoint.note ? " — " + breakpoint.note : ""}`)); } + // Bound-but-unhit: the breakpoint attached but no event fired. Without this an agent reading `--json` + // sees only `bound:true, events:[]` with no diagnostic and has to guess "no trigger" vs "wrong line" — + // exactly the fork that stalls a debugging loop. Mirror the human renderer's "no breakpoints hit" line. + const boundCount = capture.breakpoints.filter((breakpoint) => breakpoint.bound).length; + if (boundCount > 0 && capture.events.length === 0) { + diagnostics.push(Diagnostic.warn(Code.BP_BOUND_UNHIT, `${boundCount} breakpoint(s) bound but never hit — the trigger may not have exercised this path (wrong route/branch, or the trigger didn't run).`)); + } const lineage = LineageAnalyzer.compute(capture.events); const data = new TraceData({ breakpoints: capture.breakpoints, diff --git a/src/cli/commands/ServeCommand.ts b/src/cli/commands/ServeCommand.ts index 93d5ba0..d31a1b7 100644 --- a/src/cli/commands/ServeCommand.ts +++ b/src/cli/commands/ServeCommand.ts @@ -35,7 +35,11 @@ export class ServeCommand extends CliCommand { throw new Error(`dashboard not built — run \`npm run build\` (expected ${SERVER_ENTRY}).`); } - log.info("starting dashboard", { url: `http://localhost:${port}`, host }); + // Print the exact emit URL to copy. Auto-discovery only probes the well-known ports (14747 / the 4000 + // default); on any other --port a run must point at this URL explicitly, so don't make the user guess it. + const collectorUrl = `http://localhost:${port}`; + log.info("starting dashboard", { url: collectorUrl, host }); + log.info("stream traces here", { hint: `trace dynamic … --emit ${collectorUrl} (or: export TRACE_COLLECTOR_URL=${collectorUrl})` }); const child = spawn(process.execPath, [SERVER_ENTRY], { stdio: "inherit", env: { ...process.env, PORT: String(port), HOSTNAME: host, DATABASE_URL: databaseUrl }, diff --git a/src/collector/Collector.ts b/src/collector/Collector.ts index c15bba4..7d04cb7 100644 --- a/src/collector/Collector.ts +++ b/src/collector/Collector.ts @@ -4,6 +4,13 @@ import { Code } from "../shared/codes.js"; const log = logger.child({ component: "collector" }); const CT_JSON = "application/json"; +/** + * The outcome of one emit. `ok` mirrors the old boolean; `status`/`body` (on an HTTP rejection) and `error` + * (on a network/throw failure) let the caller surface *why* an emit failed — into the envelope's diagnostics + * and the logs — instead of silently dropping it (a 400 used to log at `info` and the bool was swallowed). + */ +export interface EmitResult { ok: boolean; status?: number; body?: string; error?: string } + /** Well-known local collector URLs probed for auto-discovery, in priority order: the docker/dashboard port * (14747, the compose-published host port from README/compose/scenarios), then the native `trace serve` default (4000). */ const DEFAULT_CANDIDATES = ["http://localhost:14747", "http://localhost:4000"]; @@ -19,15 +26,22 @@ const PROBE_TIMEOUT_MS = 500; */ export class Collector { /** POST an envelope to a collector's /v1/traces (used when TRACE_COLLECTOR_URL / --emit is set). */ - static async emit(url: string, envelope: unknown): Promise { + static async emit(url: string, envelope: unknown): Promise { const endpoint = url.replace(/\/+$/, "") + "/v1/traces"; try { const response = await fetch(endpoint, { method: "POST", headers: { "content-type": CT_JSON }, body: JSON.stringify(envelope) }); - log.info("emitted envelope", { endpoint, status: response.status }); - return response.ok; + if (response.ok) { + log.info("emitted envelope", { endpoint, status: response.status }); + return { ok: true, status: response.status }; + } + // A rejection is a real failure — log it at error with the collector's reason (e.g. "invalid envelope", + // "trace store unavailable"), not at info. The caller folds this into the envelope's diagnostics. + const body = await response.text().catch(() => ""); + log.error("emit rejected", { code: Code.EMIT, endpoint, status: response.status, body: body.slice(0, 500) }); + return { ok: false, status: response.status, body }; } catch (error: any) { log.error("emit failed", { code: Code.EMIT, endpoint, err: error }); - return false; + return { ok: false, error: String(error?.message ?? error) }; } } diff --git a/src/server.ts b/src/server.ts index c507d22..84af963 100644 --- a/src/server.ts +++ b/src/server.ts @@ -15,3 +15,4 @@ export type { StoreOptions } from "./collector/createSessionStore.js"; export { summarize } from "./collector/SessionStore.js"; export type { SessionStore, SessionSummary, EnvelopePlain } from "./collector/SessionStore.js"; export { Trace } from "./domain/Trace.js"; +export { Code } from "./shared/codes.js"; // shared issue-code vocabulary, so ingest responses carry the same codes as CLI diagnostics diff --git a/src/shared/codes.ts b/src/shared/codes.ts index 92bf9d9..0d3aa49 100644 --- a/src/shared/codes.ts +++ b/src/shared/codes.ts @@ -21,6 +21,9 @@ export const Code = { STEP_FAILED: "STEP_FAILED", /** a requested breakpoint never bound at the target (bad file:line, or that line never loads) */ BP_UNBOUND: "BP_UNBOUND", + /** breakpoint(s) bound but no event fired — the trigger likely didn't exercise this path (the JSON + * counterpart of the renderer's "no breakpoints hit" line, so a `--json` reader sees the why too) */ + BP_BOUND_UNHIT: "BP_BOUND_UNHIT", /** writing the Chrome screen + trace-panel recording failed */ RECORD: "RECORD_FAILED", /** nothing was captured to record (no frames) — usually no breakpoint hit during the journey */ diff --git a/test/output.test.js b/test/output.test.js index 9989e1c..c0008df 100644 --- a/test/output.test.js +++ b/test/output.test.js @@ -6,6 +6,7 @@ import assert from "node:assert/strict"; import { condense } from "../dist/cli/Cli.js"; import { Code } from "../dist/shared/codes.js"; +import { Collector } from "../dist/collector/Collector.js"; /** A trace-shaped plain envelope with two breakpoint hits (full locals + a 5-frame stack + a watched expr). */ function fullEnvelope() { @@ -68,5 +69,15 @@ test("Code registry: stable, unique, greppable values shared by both channels", // the envelope's existing diagnostic codes are part of the one vocabulary assert.equal(Code.CODEGRAPH_FAILED, "CODEGRAPH_FAILED"); assert.equal(Code.BP_UNBOUND, "BP_UNBOUND"); + assert.equal(Code.BP_BOUND_UNHIT, "BP_BOUND_UNHIT"); assert.equal(Code.SCHEMA, "E_SCHEMA"); }); + +test("Collector.emit: a failed POST resolves to a rich result (never throws, never a bare bool)", async () => { + // Port 1 is unbound → fetch rejects (connection refused). emit must catch it and return a structured + // result so the caller can surface *why* an emit failed instead of swallowing a `false`. + const result = await Collector.emit("http://127.0.0.1:1", { tool: "trace" }); + assert.equal(result.ok, false, "a refused emit is not ok"); + assert.equal(typeof result.error, "string", "the failure reason is carried back, not dropped"); + assert.ok(result.error.length > 0, "error message is non-empty"); +}); diff --git a/ui/app/v1/traces/route.ts b/ui/app/v1/traces/route.ts index 20a4d4d..b8c4654 100644 --- a/ui/app/v1/traces/route.ts +++ b/ui/app/v1/traces/route.ts @@ -1,6 +1,6 @@ import { NextResponse } from "next/server"; import { getStore } from "@/lib/store"; -import { Trace } from "trace-cli/server"; +import { Trace, Code } from "trace-cli/server"; export const runtime = "nodejs"; export const dynamic = "force-dynamic"; @@ -13,29 +13,42 @@ const MAX_BYTES = 64 * 1024 * 1024; * contract the CLI emits to (`Collector.emit` → `/v1/traces`). The ingress boundary: hydrate untrusted * JSON into a Trace and enforce the envelope contract (`validate()`) before it touches the store. Reuses the * CLI's domain so validation + persistence are identical to a direct `trace … --emit` against the old collector. + * + * Status codes separate fault domains so a caller can tell *whose* problem it is — and never has to string-match: + * • 4xx + a `code` ⇒ the CALLER's envelope is wrong (too large / malformed JSON / schema-invalid / no sessionId) + * • 503 STORE_UNAVAILABLE ⇒ the SERVER's store (Postgres) is down or misconfigured — a transient infra fault, + * NOT a bad envelope. Previously every failure (including a dead DB) collapsed to one 400, which made an + * infra outage indistinguishable from a schema error. The internal error is logged, never returned. */ export async function POST(request: Request) { const declared = Number(request.headers.get("content-length") ?? 0); - if (declared > MAX_BYTES) return NextResponse.json({ error: "envelope too large" }, { status: 413 }); + if (declared > MAX_BYTES) return NextResponse.json({ error: "envelope too large", code: Code.INGEST }, { status: 413 }); let body: string; try { body = await request.text(); } catch (error) { - return NextResponse.json({ error: (error as Error).message }, { status: 400 }); + return NextResponse.json({ error: (error as Error).message, code: Code.INGEST }, { status: 400 }); } - if (body.length > MAX_BYTES) return NextResponse.json({ error: "envelope too large" }, { status: 413 }); + if (body.length > MAX_BYTES) return NextResponse.json({ error: "envelope too large", code: Code.INGEST }, { status: 413 }); + // Parse + validate the untrusted envelope. Any failure here is the caller's fault → 4xx. + let envelope: Trace; try { - const envelope = Trace.fromPlain(JSON.parse(body)); - const problems = envelope.validate(); - if (problems.length) return NextResponse.json({ error: "invalid envelope", problems }, { status: 400 }); + envelope = Trace.fromPlain(JSON.parse(body)); + } catch (error) { + return NextResponse.json({ error: (error as Error).message, code: Code.INGEST }, { status: 400 }); + } + const problems = envelope.validate(); + if (problems.length) return NextResponse.json({ error: "invalid envelope", code: Code.INGEST_INVALID, problems }, { status: 400 }); + // Persist. A failure here is the server's fault (store/DB) → 503, with the real cause logged, not leaked. + try { const summary = await getStore().ingest(envelope.toJSON()); - if (!summary) return NextResponse.json({ error: "envelope has no meta.sessionId" }, { status: 400 }); - + if (!summary) return NextResponse.json({ error: "envelope has no meta.sessionId", code: Code.INGEST_NO_SESSION }, { status: 400 }); return NextResponse.json({ ok: true, sessionId: summary.sessionId }); } catch (error) { - return NextResponse.json({ error: (error as Error).message }, { status: 400 }); + console.error("[collector] ingest failed — trace store unavailable", error); + return NextResponse.json({ error: "trace store unavailable", code: Code.STORE }, { status: 503 }); } }