From 6e39cedc8b80fac2162c12aadfdf47b20abe3495 Mon Sep 17 00:00:00 2001 From: Yuhan Lei Date: Tue, 23 Jun 2026 01:50:11 +0800 Subject: [PATCH 1/3] fix(opencode): broaden transport errno coverage and stop retrying permanent DNS failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stream failure classifier only recognized 4 errno codes (ECONNRESET/ECONNREFUSED/ETIMEDOUT/UND_ERR_SOCKET) and hardcoded every transport disconnect as retryable. Two gaps: common transient codes (DNS, undici timeouts, broken pipe) fell through to UnknownError, and a permanent failure like ENOTFOUND (an unresolved host — usually a misconfigured base URL) was retried up to the cap, stalling the turn instead of surfacing the cause. - stream-failure-classifier.ts: broaden TRANSPORT_CODES (EAI_AGAIN, EPIPE, ECONNABORTED, EHOSTUNREACH, ENETUNREACH, ENOTFOUND, UND_ERR_CONNECT_TIMEOUT, UND_ERR_HEADERS_TIMEOUT, UND_ERR_BODY_TIMEOUT) and add a tiny message fallback (socket hang up / premature close) for disconnects that arrive with no errno code. An error carrying an HTTP statusCode is short-circuited up front: an HTTP response was received, so it is an API error classified by status downstream and must not be reclassified as transport — neither by a transport-coded cause nor by a transport phrase in its message (fromError runs this classifier before the APICallError branch). "terminated" is intentionally NOT in the fallback — on its own it is undici's generic wrapper and is only transport when its cause carries a transport code. - Make retryability per-errno: TransportDisconnect.retryable is now a boolean. ENOTFOUND is non-retryable (permanent DNS / misconfigured base URL); EAI_AGAIN (transient DNS) and all others stay retryable. - message-v2.ts fromError: propagate transport.retryable to APIError.isRetryable instead of hardcoding true. - retry.ts classifyRetry: transport_disconnect now honors isRetryable (removed from RETRY_TRANSIENT_KINDS, given its own branch) so a non-retryable transport error takes the existing "not retryable -> stop" path. retrySignal.retryable also feeds the recorded transport failure, so run-observability's retry_safety reports provider_terminal_failure for ENOTFOUND via the existing #1118 branch. Tests: classifier (ENOTFOUND non-retryable incl. nested cause, EAI_AGAIN / UND_ERR_HEADERS_TIMEOUT retryable, bare "socket hang up" message fallback, HTTP error with a transport phrase or transport-coded cause stays unclassified); classifyRetry (transport_disconnect honors isRetryable both ways); fromError (ENOTFOUND -> non-retryable APIError, EAI_AGAIN -> retryable, APICallError with "socket hang up" message + transport cause stays invalid_request). Refs #1105 #1123 Claude-Session: https://claude.ai/code/session_015bW9JQSkuB156gkNQdxCzi --- packages/opencode/src/session/message-v2.ts | 5 +- packages/opencode/src/session/retry.test.ts | 18 ++++- packages/opencode/src/session/retry.ts | 26 +++---- .../session/stream-failure-classifier.test.ts | 71 +++++++++++++++++++ .../src/session/stream-failure-classifier.ts | 58 +++++++++++++-- .../opencode/test/session/message-v2.test.ts | 52 ++++++++++++++ 6 files changed, 211 insertions(+), 19 deletions(-) diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts index ec45707ff..c47412f2b 100644 --- a/packages/opencode/src/session/message-v2.ts +++ b/packages/opencode/src/session/message-v2.ts @@ -1371,7 +1371,10 @@ export function fromError( return new APIError( { message: (e as Error).message || "Connection interrupted", - isRetryable: true, + // Per-errno: most transport disconnects are retryable, but a permanent + // one (e.g. ENOTFOUND, an unresolved host) is not — classifyRetry reads + // this so it stops instead of retrying into a stall. + isRetryable: transport.retryable, metadata: { code: transport.code, message: (e as Error).message || "", diff --git a/packages/opencode/src/session/retry.test.ts b/packages/opencode/src/session/retry.test.ts index ba25fb131..0502cc2ba 100644 --- a/packages/opencode/src/session/retry.test.ts +++ b/packages/opencode/src/session/retry.test.ts @@ -128,7 +128,7 @@ describe("classifyRetry — reads providerFailure.kind (slice ④)", () => { // Transient kinds always retry, even if isRetryable is false and the status is // not 5xx — the classification, not the SDK flag, is the source of truth. - for (const kind of ["rate_limit", "server_overload", "transport_disconnect", "decompression"] as const) { + for (const kind of ["rate_limit", "server_overload", "decompression"] as const) { test(`transient kind ${kind} retries even when isRetryable is false and status is not 5xx`, () => { expect( classifyRetry(makeAPIError({ isRetryable: false, statusCode: 400, providerFailure: { kind } }))?.kind, @@ -136,6 +136,22 @@ describe("classifyRetry — reads providerFailure.kind (slice ④)", () => { }) } + // transport_disconnect honors the per-errno isRetryable the stream classifier + // sets (#1105b): most transport errnos are transient, but a permanent one + // (e.g. ENOTFOUND — unresolved host) is marked isRetryable=false and must not + // auto-retry into a stall. + test("transport_disconnect retries when the classifier marked it retryable", () => { + expect( + classifyRetry(makeAPIError({ isRetryable: true, providerFailure: { kind: "transport_disconnect" } }))?.kind, + ).toBe("unknown") + }) + + test("transport_disconnect does not retry when the classifier marked it non-retryable", () => { + expect( + classifyRetry(makeAPIError({ isRetryable: false, providerFailure: { kind: "transport_disconnect" } })), + ).toBeUndefined() + }) + test("unknown kind falls back to the legacy isRetryable + 5xx gate", () => { expect( classifyRetry(makeAPIError({ isRetryable: true, statusCode: 404, providerFailure: { kind: "unknown" } }))?.kind, diff --git a/packages/opencode/src/session/retry.ts b/packages/opencode/src/session/retry.ts index 6758888d7..909d39915 100644 --- a/packages/opencode/src/session/retry.ts +++ b/packages/opencode/src/session/retry.ts @@ -16,13 +16,10 @@ export type { RetryAction } from "./retry-classification" // always retry. classifyRetry reads these so the retry decision follows the // canonical classification instead of re-deriving it from the provider SDK's // isRetryable flag; `unknown` or absent kinds fall back to that legacy signal. +// transport_disconnect is handled separately: it honors the per-errno +// isRetryable the stream classifier set (most errnos transient, ENOTFOUND not). const RETRY_TERMINAL_KINDS = new Set(["auth", "invalid_request", "quota_exhausted"]) -const RETRY_TRANSIENT_KINDS = new Set([ - "rate_limit", - "server_overload", - "transport_disconnect", - "decompression", -]) +const RETRY_TRANSIENT_KINDS = new Set(["rate_limit", "server_overload", "decompression"]) export const RETRY_INITIAL_DELAY = 2000 export const RETRY_BACKOFF_FACTOR = 2 @@ -85,16 +82,19 @@ export function classifyRetry(error: Err): RetryClassification | undefined { const status = error.data.statusCode const kind = error.data.providerFailure?.kind // Retry/stop gate. Prefer the canonical providerFailure.kind: terminal kinds - // never retry, transient kinds always do. When the kind is `unknown` or the - // row predates providerFailure, fall back to the legacy signal — isRetryable, - // or a 5xx the provider SDK didn't explicitly mark retryable. The two agree - // for every classified kind today. + // never retry, transient kinds always do. transport_disconnect honors the + // per-errno isRetryable the stream classifier set (most errnos transient, + // ENOTFOUND not). When the kind is `unknown` or the row predates + // providerFailure, fall back to the legacy signal — isRetryable, or a 5xx the + // provider SDK didn't explicitly mark retryable. const retryable = kind && RETRY_TERMINAL_KINDS.has(kind) ? false - : kind && RETRY_TRANSIENT_KINDS.has(kind) - ? true - : error.data.isRetryable || (status !== undefined && status >= 500) + : kind === "transport_disconnect" + ? error.data.isRetryable + : kind && RETRY_TRANSIENT_KINDS.has(kind) + ? true + : error.data.isRetryable || (status !== undefined && status >= 500) if (!retryable) return undefined // Strict 3-way AND: opencode provider + FreeUsageLimitError marker in body diff --git a/packages/opencode/src/session/stream-failure-classifier.test.ts b/packages/opencode/src/session/stream-failure-classifier.test.ts index b7a6a7e7d..33e346d53 100644 --- a/packages/opencode/src/session/stream-failure-classifier.test.ts +++ b/packages/opencode/src/session/stream-failure-classifier.test.ts @@ -55,6 +55,60 @@ describe("classifyStreamFailure", () => { code: "UND_ERR_SOCKET", }) }) + + test("EAI_AGAIN (transient DNS) stays retryable", () => { + const error = Object.assign(new Error("getaddrinfo EAI_AGAIN api.example.com"), { + code: "EAI_AGAIN", + syscall: "getaddrinfo", + }) + expect(classifyStreamFailure(error)).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "EAI_AGAIN", + }) + }) + + test("UND_ERR_HEADERS_TIMEOUT is a retryable transport disconnect", () => { + const error = Object.assign(new Error("Headers Timeout Error"), { code: "UND_ERR_HEADERS_TIMEOUT" }) + expect(classifyStreamFailure(error)).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "UND_ERR_HEADERS_TIMEOUT", + }) + }) + + test("bare 'socket hang up' message (no errno code) classifies as retryable transport", () => { + const error = new Error("socket hang up") + expect(classifyStreamFailure(error)).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "SOCKET_HANG_UP", + }) + }) + }) + + describe("transport disconnect — non-retryable", () => { + test("ENOTFOUND (unresolved host, likely misconfigured base URL) is not retryable", () => { + const error = Object.assign(new Error("getaddrinfo ENOTFOUND api.wrong-host.invalid"), { + code: "ENOTFOUND", + syscall: "getaddrinfo", + }) + expect(classifyStreamFailure(error)).toEqual({ + kind: "provider_transport_disconnect", + retryable: false, + code: "ENOTFOUND", + }) + }) + + test("ENOTFOUND nested in cause chain is not retryable", () => { + const cause = Object.assign(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" }) + const error = new TypeError("fetch failed", { cause }) + expect(classifyStreamFailure(error)).toEqual({ + kind: "provider_transport_disconnect", + retryable: false, + code: "ENOTFOUND", + }) + }) }) describe("non-transport errors — not classified", () => { @@ -81,5 +135,22 @@ describe("classifyStreamFailure", () => { const error = new TypeError("terminated", { cause: new Error("unrelated") }) expect(classifyStreamFailure(error)).toBeUndefined() }) + + test("message fallback does not hijack an HTTP error that mentions 'socket hang up'", () => { + // An error carrying an HTTP statusCode is an API error classified by status + // downstream; a transport phrase in its message must not reclassify it. + const error = Object.assign(new Error("400 invalid request: socket hang up is not allowed"), { + statusCode: 400, + }) + expect(classifyStreamFailure(error)).toBeUndefined() + }) + + test("HTTP error with a transport-coded cause is still classified by status, not transport", () => { + // statusCode present = HTTP response received = API error. A transport code + // on its cause must not reclassify it as a transport disconnect. + const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) + const error = Object.assign(new Error("400 invalid request"), { statusCode: 400, cause }) + expect(classifyStreamFailure(error)).toBeUndefined() + }) }) }) diff --git a/packages/opencode/src/session/stream-failure-classifier.ts b/packages/opencode/src/session/stream-failure-classifier.ts index dcfab658a..767ba4a35 100644 --- a/packages/opencode/src/session/stream-failure-classifier.ts +++ b/packages/opencode/src/session/stream-failure-classifier.ts @@ -1,22 +1,72 @@ export type TransportDisconnect = { kind: "provider_transport_disconnect" - retryable: true + retryable: boolean code: string } -const TRANSPORT_CODES = new Set(["ECONNRESET", "ECONNREFUSED", "ETIMEDOUT", "UND_ERR_SOCKET"]) +// errno / undici codes that mean the connection dropped, timed out, or could +// not be established. All are transient (worth retrying) except the ones in +// NON_RETRYABLE_TRANSPORT_CODES below. +const TRANSPORT_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ETIMEDOUT", + "ECONNABORTED", + "EPIPE", + "EHOSTUNREACH", + "ENETUNREACH", + "EAI_AGAIN", // transient DNS lookup failure — retry + "ENOTFOUND", // permanent DNS failure (see NON_RETRYABLE_TRANSPORT_CODES) + "UND_ERR_SOCKET", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_HEADERS_TIMEOUT", + "UND_ERR_BODY_TIMEOUT", +]) + +// ENOTFOUND means the hostname did not resolve — usually a misconfigured base +// URL that will never resolve, so auto-retrying just stalls the turn. Kept +// distinct from EAI_AGAIN (transient DNS), which stays retryable. +const NON_RETRYABLE_TRANSPORT_CODES = new Set(["ENOTFOUND"]) + +// Connection-dropped signals that can arrive as a bare message with no errno +// code (undici/Node throws these in some paths). Only checked when no code +// matched. "terminated" is intentionally excluded — on its own it is undici's +// generic fetch wrapper and is only a transport failure when its cause carries +// a transport code (handled by the cause walk above). +const TRANSIENT_MESSAGE_CODES: ReadonlyArray<[RegExp, string]> = [ + [/socket hang up/i, "SOCKET_HANG_UP"], + [/premature close/i, "PREMATURE_CLOSE"], +] + +function disconnect(code: string): TransportDisconnect { + return { kind: "provider_transport_disconnect", retryable: !NON_RETRYABLE_TRANSPORT_CODES.has(code), code } +} export function classifyStreamFailure(error: unknown): TransportDisconnect | undefined { if (!(error instanceof Error)) return undefined + // An error carrying an HTTP statusCode means an HTTP response was received, so + // it is an API error classified by status downstream (the APICallError branch + // in fromError, which runs after this). It must not be reclassified as a + // transport disconnect — not by a transport-coded cause, and not by a + // transport phrase in its message. + if (typeof (error as { statusCode?: unknown }).statusCode === "number") return undefined + const topCode = (error as { code?: string }).code if (typeof topCode === "string" && TRANSPORT_CODES.has(topCode)) { - return { kind: "provider_transport_disconnect", retryable: true, code: topCode } + return disconnect(topCode) } const code = findTransportCodeInCause(error.cause) if (code) { - return { kind: "provider_transport_disconnect", retryable: true, code } + return disconnect(code) + } + + // Message fallback is a last resort for raw connection errors that carry no + // errno code. + const message = typeof error.message === "string" ? error.message : "" + for (const [pattern, fallbackCode] of TRANSIENT_MESSAGE_CODES) { + if (pattern.test(message)) return { kind: "provider_transport_disconnect", retryable: true, code: fallbackCode } } return undefined diff --git a/packages/opencode/test/session/message-v2.test.ts b/packages/opencode/test/session/message-v2.test.ts index cf8585e95..e210917f7 100644 --- a/packages/opencode/test/session/message-v2.test.ts +++ b/packages/opencode/test/session/message-v2.test.ts @@ -2029,6 +2029,58 @@ describe("session.message-v2.fromError", () => { }) }) + test("classifies ENOTFOUND as a non-retryable transport disconnect", () => { + const error = Object.assign(new Error("getaddrinfo ENOTFOUND api.wrong-host.invalid"), { + code: "ENOTFOUND", + syscall: "getaddrinfo", + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect(MessageV2.APIError.isInstance(result)).toBe(true) + expect((result as MessageV2.APIError).data.isRetryable).toBe(false) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "ENOTFOUND", + }) + }) + + test("classifies EAI_AGAIN as a retryable transport disconnect", () => { + const error = Object.assign(new Error("getaddrinfo EAI_AGAIN api.example.com"), { + code: "EAI_AGAIN", + syscall: "getaddrinfo", + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.isRetryable).toBe(true) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "EAI_AGAIN", + }) + }) + + test("does not let a transport phrase or transport cause on an APICallError hijack status classification", () => { + // A 400 invalid_request whose message echoes "socket hang up" AND whose cause + // carries a transport code must stay invalid_request, not be reclassified as + // a retryable transport disconnect (statusCode present = HTTP response). + const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) + const error = new APICallError({ + message: "400 invalid request: socket hang up is not allowed", + url: "https://example.com", + requestBodyValues: {}, + statusCode: 400, + responseHeaders: { "content-type": "application/json" }, + responseBody: JSON.stringify({ error: { code: "invalid_prompt", message: "socket hang up is not allowed" } }), + isRetryable: false, + cause, + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.providerFailure?.kind).toBe("invalid_request") + }) + test("populates providerFailure for decompression failures", () => { const zlibError = Object.assign( new Error('ZlibError fetching "https://opencode.cloudflare.dev/anthropic/messages".'), From fa32c5e02920d853681106bc1fcd285055fe1ed1 Mon Sep 17 00:00:00 2001 From: Yuhan Lei Date: Tue, 23 Jun 2026 17:02:57 +0800 Subject: [PATCH 2/3] fix(opencode): demote bare-message transport fallback below structured parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up (P2 + P3): - P2: classifyStreamFailure ran its `socket hang up` / `premature close` substring fallback before parseStreamError, so a structured stream error (e.g. invalid_prompt) carrying that phrase in its message or cause.body was mis-classified as a retryable transport disconnect — the user got a wrong retry instead of the real provider error. Split the bare-message fallback into classifyBareTransportMessage, anchor it to the whole trimmed message, and run it only AFTER parseStreamError fails. Errno-coded transport detection stays ahead of parsing (a definitive signal). - P3: drop the single-element NON_RETRYABLE_TRANSPORT_CODES set for the simpler `retryable: code !== "ENOTFOUND"`. - P3: make the classifier test a transport-code matrix (every errno asserts kind + retryability), plus anchored bare-message match / non-match cases. Adds end-to-end fromError regressions: cause.body invalid_prompt + "socket hang up" stays invalid_request (non-retryable); bare "socket hang up" stays a retryable transport disconnect. Verification: targeted suite 170 pass; session+provider 1351 pass / 0 fail; tsgo --noEmit clean. Claude-Session: https://claude.ai/code/session_012743rGkjEzqaUMKy2nvYMM --- packages/opencode/src/session/message-v2.ts | 49 ++-- .../session/stream-failure-classifier.test.ts | 223 +++++++----------- .../src/session/stream-failure-classifier.ts | 56 +++-- .../opencode/test/session/message-v2.test.ts | 25 ++ 4 files changed, 174 insertions(+), 179 deletions(-) diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts index c47412f2b..c0b12dca2 100644 --- a/packages/opencode/src/session/message-v2.ts +++ b/packages/opencode/src/session/message-v2.ts @@ -19,7 +19,7 @@ import { ModelID, ProviderID } from "@/provider/schema" import { Effect } from "effect" import { EffectLogger } from "@/effect" import { isMedia } from "@/util/media" -import { classifyStreamFailure } from "./stream-failure-classifier" +import { classifyStreamFailure, classifyBareTransportMessage, type TransportDisconnect } from "./stream-failure-classifier" import { LLMTrace } from "./llm-trace" import { RunObservability } from "./run-observability" import { RunLifecycle } from "./run-lifecycle" @@ -1344,6 +1344,25 @@ export const filterCompactedEffect = Effect.fnUntraced(function* (sessionID: Ses return filterCompacted(stream(sessionID)) }) +// Build the transport-disconnect APIError from a classified disconnect. Shared +// by the errno-code path (high priority, ahead of stream parsing) and the +// bare-message fallback (last resort, after stream parsing fails). +function transportDisconnectError(e: unknown, transport: TransportDisconnect) { + const message = (e as Error).message || "" + return new APIError( + { + // Per-errno: most transport disconnects are retryable, but a permanent one + // (e.g. ENOTFOUND, an unresolved host) is not — classifyRetry reads this so + // it stops instead of retrying into a stall. + message: message || "Connection interrupted", + isRetryable: transport.retryable, + metadata: { code: transport.code, message }, + providerFailure: { kind: "transport_disconnect", code: transport.code }, + }, + { cause: e }, + ).toObject() +} + export function fromError( e: unknown, ctx: { providerID: ProviderID; aborted?: boolean }, @@ -1366,24 +1385,12 @@ export function fromError( }, { cause: e }, ).toObject() - case classifyStreamFailure(e) !== undefined: { - const transport = classifyStreamFailure(e)! - return new APIError( - { - message: (e as Error).message || "Connection interrupted", - // Per-errno: most transport disconnects are retryable, but a permanent - // one (e.g. ENOTFOUND, an unresolved host) is not — classifyRetry reads - // this so it stops instead of retrying into a stall. - isRetryable: transport.retryable, - metadata: { - code: transport.code, - message: (e as Error).message || "", - }, - providerFailure: { kind: "transport_disconnect", code: transport.code }, - }, - { cause: e }, - ).toObject() - } + // Errno-coded transport disconnect (top-level or in the cause chain) is a + // definitive signal, so it runs ahead of stream parsing. The bare-message + // fallback does NOT run here — it is demoted below parseStreamError so a + // structured error carrying a transport phrase is not mis-grabbed. + case classifyStreamFailure(e) !== undefined: + return transportDisconnectError(e, classifyStreamFailure(e)!) case e instanceof Error && (e as FetchDecompressionError).code === "ZlibError": if (ctx.aborted) { return new AbortedError({ message: e.message }, { cause: e }).toObject() @@ -1459,6 +1466,10 @@ export function fromError( ).toObject() } } catch {} + // Last resort: a bare connection-dropped message (no errno code, not a + // structured stream error) is still a retryable transport disconnect. + const bareTransport = classifyBareTransportMessage(e) + if (bareTransport) return transportDisconnectError(e, bareTransport) return new NamedError.Unknown( { message: e instanceof Error ? errorMessage(e) : JSON.stringify(e) }, { cause: e }, diff --git a/packages/opencode/src/session/stream-failure-classifier.test.ts b/packages/opencode/src/session/stream-failure-classifier.test.ts index 33e346d53..f24a951aa 100644 --- a/packages/opencode/src/session/stream-failure-classifier.test.ts +++ b/packages/opencode/src/session/stream-failure-classifier.test.ts @@ -1,156 +1,107 @@ import { describe, expect, test } from "bun:test" -import { classifyStreamFailure } from "./stream-failure-classifier" - -describe("classifyStreamFailure", () => { - describe("transport disconnect — retryable", () => { - test("ECONNRESET SystemError", () => { - const error = Object.assign(new Error("read ECONNRESET"), { code: "ECONNRESET", syscall: "read" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ECONNRESET", - }) - }) - - test("UND_ERR_SOCKET — TypeError('terminated') with cause.code", () => { - const cause = Object.assign(new Error("other side closed"), { code: "UND_ERR_SOCKET" }) - const error = new TypeError("terminated", { cause }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "UND_ERR_SOCKET", - }) - }) - - test("ECONNREFUSED SystemError", () => { - const error = Object.assign(new Error("connect ECONNREFUSED"), { code: "ECONNREFUSED", syscall: "connect" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ECONNREFUSED", - }) - }) - - test("ETIMEDOUT SystemError", () => { - const error = Object.assign(new Error("connect ETIMEDOUT"), { code: "ETIMEDOUT", syscall: "connect" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ETIMEDOUT", - }) - }) - - test("UND_ERR_SOCKET nested in cause chain", () => { - const innerCause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) - const midError = new Error("fetch failed", { cause: innerCause }) - const outerError = new TypeError("terminated", { cause: midError }) - const result = classifyStreamFailure(outerError) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "UND_ERR_SOCKET", - }) +import { classifyStreamFailure, classifyBareTransportMessage } from "./stream-failure-classifier" + +// Every transport errno and its expected retryability. ENOTFOUND is the only +// permanent one (unresolved host); everything else is transient. Driven as a +// matrix so adding/renaming a code in TRANSPORT_CODES forces a matching row. +const TRANSPORT_CODE_MATRIX: ReadonlyArray = [ + ["ECONNRESET", true], + ["ECONNREFUSED", true], + ["ETIMEDOUT", true], + ["ECONNABORTED", true], + ["EPIPE", true], + ["EHOSTUNREACH", true], + ["ENETUNREACH", true], + ["EAI_AGAIN", true], + ["ENOTFOUND", false], + ["UND_ERR_SOCKET", true], + ["UND_ERR_CONNECT_TIMEOUT", true], + ["UND_ERR_HEADERS_TIMEOUT", true], + ["UND_ERR_BODY_TIMEOUT", true], +] + +describe("classifyStreamFailure — transport code matrix", () => { + for (const [code, retryable] of TRANSPORT_CODE_MATRIX) { + test(`top-level ${code} → transport disconnect (retryable=${retryable})`, () => { + const error = Object.assign(new Error(`failed: ${code}`), { code }) + expect(classifyStreamFailure(error)).toEqual({ kind: "provider_transport_disconnect", retryable, code }) + }) + + test(`${code} nested in the cause chain → transport disconnect (retryable=${retryable})`, () => { + const cause = Object.assign(new Error("inner"), { code }) + const error = new TypeError("fetch failed", { cause }) + expect(classifyStreamFailure(error)).toEqual({ kind: "provider_transport_disconnect", retryable, code }) }) + } +}) - test("EAI_AGAIN (transient DNS) stays retryable", () => { - const error = Object.assign(new Error("getaddrinfo EAI_AGAIN api.example.com"), { - code: "EAI_AGAIN", - syscall: "getaddrinfo", - }) - expect(classifyStreamFailure(error)).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "EAI_AGAIN", - }) - }) +describe("classifyStreamFailure — not a code-based transport error", () => { + test("generic Error returns undefined", () => { + expect(classifyStreamFailure(new Error("something broke"))).toBeUndefined() + }) - test("UND_ERR_HEADERS_TIMEOUT is a retryable transport disconnect", () => { - const error = Object.assign(new Error("Headers Timeout Error"), { code: "UND_ERR_HEADERS_TIMEOUT" }) - expect(classifyStreamFailure(error)).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "UND_ERR_HEADERS_TIMEOUT", - }) - }) + test("unknown errno code returns undefined", () => { + expect(classifyStreamFailure(Object.assign(new Error("x"), { code: "ESOMETHINGELSE" }))).toBeUndefined() + }) - test("bare 'socket hang up' message (no errno code) classifies as retryable transport", () => { - const error = new Error("socket hang up") - expect(classifyStreamFailure(error)).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "SOCKET_HANG_UP", - }) - }) + test("AbortError returns undefined", () => { + expect(classifyStreamFailure(new DOMException("The operation was aborted", "AbortError"))).toBeUndefined() }) - describe("transport disconnect — non-retryable", () => { - test("ENOTFOUND (unresolved host, likely misconfigured base URL) is not retryable", () => { - const error = Object.assign(new Error("getaddrinfo ENOTFOUND api.wrong-host.invalid"), { - code: "ENOTFOUND", - syscall: "getaddrinfo", - }) - expect(classifyStreamFailure(error)).toEqual({ - kind: "provider_transport_disconnect", - retryable: false, - code: "ENOTFOUND", - }) - }) + test("non-Error values return undefined", () => { + expect(classifyStreamFailure("string error")).toBeUndefined() + expect(classifyStreamFailure(null)).toBeUndefined() + expect(classifyStreamFailure(42)).toBeUndefined() + }) - test("ENOTFOUND nested in cause chain is not retryable", () => { - const cause = Object.assign(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" }) - const error = new TypeError("fetch failed", { cause }) - expect(classifyStreamFailure(error)).toEqual({ - kind: "provider_transport_disconnect", - retryable: false, - code: "ENOTFOUND", - }) - }) + test("TypeError('terminated') without a transport-coded cause returns undefined", () => { + expect(classifyStreamFailure(new TypeError("terminated", { cause: new Error("unrelated") }))).toBeUndefined() }) - describe("non-transport errors — not classified", () => { - test("generic Error returns undefined", () => { - expect(classifyStreamFailure(new Error("something broke"))).toBeUndefined() - }) + test("a bare 'socket hang up' message is NOT code-classified (the bare-message fallback owns it)", () => { + // The message fallback was moved out of classifyStreamFailure so structured + // stream parsing can run first; the code path only matches errno codes. + expect(classifyStreamFailure(new Error("socket hang up"))).toBeUndefined() + }) - test("TypeError without transport cause returns undefined", () => { - expect(classifyStreamFailure(new TypeError("Cannot read properties of undefined"))).toBeUndefined() - }) + test("HTTP error (statusCode) with a transport-coded cause stays an API error, not transport", () => { + const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) + const error = Object.assign(new Error("400 invalid request"), { statusCode: 400, cause }) + expect(classifyStreamFailure(error)).toBeUndefined() + }) +}) - test("AbortError returns undefined", () => { - const error = new DOMException("The operation was aborted", "AbortError") - expect(classifyStreamFailure(error)).toBeUndefined() +describe("classifyBareTransportMessage — anchored bare connection messages", () => { + test("exact 'socket hang up' → retryable transport", () => { + expect(classifyBareTransportMessage(new Error("socket hang up"))).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "SOCKET_HANG_UP", }) + }) - test("non-Error values return undefined", () => { - expect(classifyStreamFailure("string error")).toBeUndefined() - expect(classifyStreamFailure(null)).toBeUndefined() - expect(classifyStreamFailure(42)).toBeUndefined() + test("exact 'premature close' (surrounding whitespace trimmed) → retryable transport", () => { + expect(classifyBareTransportMessage(new Error(" premature close\n"))).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "PREMATURE_CLOSE", }) + }) - test("TypeError('terminated') without UND_ERR_SOCKET cause returns undefined", () => { - const error = new TypeError("terminated", { cause: new Error("unrelated") }) - expect(classifyStreamFailure(error)).toBeUndefined() - }) + test("a longer message that merely CONTAINS the phrase is not a bare transport error", () => { + expect( + classifyBareTransportMessage(new Error("invalid request: socket hang up is not allowed")), + ).toBeUndefined() + }) - test("message fallback does not hijack an HTTP error that mentions 'socket hang up'", () => { - // An error carrying an HTTP statusCode is an API error classified by status - // downstream; a transport phrase in its message must not reclassify it. - const error = Object.assign(new Error("400 invalid request: socket hang up is not allowed"), { - statusCode: 400, - }) - expect(classifyStreamFailure(error)).toBeUndefined() - }) + test("an HTTP error (statusCode) whose message is exactly 'socket hang up' is not bare transport", () => { + expect( + classifyBareTransportMessage(Object.assign(new Error("socket hang up"), { statusCode: 502 })), + ).toBeUndefined() + }) - test("HTTP error with a transport-coded cause is still classified by status, not transport", () => { - // statusCode present = HTTP response received = API error. A transport code - // on its cause must not reclassify it as a transport disconnect. - const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) - const error = Object.assign(new Error("400 invalid request"), { statusCode: 400, cause }) - expect(classifyStreamFailure(error)).toBeUndefined() - }) + test("non-Error value and unrelated message return undefined", () => { + expect(classifyBareTransportMessage("socket hang up")).toBeUndefined() + expect(classifyBareTransportMessage(new Error("something else"))).toBeUndefined() }) }) diff --git a/packages/opencode/src/session/stream-failure-classifier.ts b/packages/opencode/src/session/stream-failure-classifier.ts index 767ba4a35..58dae8c30 100644 --- a/packages/opencode/src/session/stream-failure-classifier.ts +++ b/packages/opencode/src/session/stream-failure-classifier.ts @@ -5,8 +5,8 @@ export type TransportDisconnect = { } // errno / undici codes that mean the connection dropped, timed out, or could -// not be established. All are transient (worth retrying) except the ones in -// NON_RETRYABLE_TRANSPORT_CODES below. +// not be established. All are transient (worth retrying) except ENOTFOUND +// (see disconnect()). const TRANSPORT_CODES = new Set([ "ECONNRESET", "ECONNREFUSED", @@ -16,30 +16,31 @@ const TRANSPORT_CODES = new Set([ "EHOSTUNREACH", "ENETUNREACH", "EAI_AGAIN", // transient DNS lookup failure — retry - "ENOTFOUND", // permanent DNS failure (see NON_RETRYABLE_TRANSPORT_CODES) + "ENOTFOUND", // permanent DNS failure — non-retryable (see disconnect()) "UND_ERR_SOCKET", "UND_ERR_CONNECT_TIMEOUT", "UND_ERR_HEADERS_TIMEOUT", "UND_ERR_BODY_TIMEOUT", ]) -// ENOTFOUND means the hostname did not resolve — usually a misconfigured base -// URL that will never resolve, so auto-retrying just stalls the turn. Kept -// distinct from EAI_AGAIN (transient DNS), which stays retryable. -const NON_RETRYABLE_TRANSPORT_CODES = new Set(["ENOTFOUND"]) - // Connection-dropped signals that can arrive as a bare message with no errno -// code (undici/Node throws these in some paths). Only checked when no code -// matched. "terminated" is intentionally excluded — on its own it is undici's -// generic fetch wrapper and is only a transport failure when its cause carries -// a transport code (handled by the cause walk above). -const TRANSIENT_MESSAGE_CODES: ReadonlyArray<[RegExp, string]> = [ - [/socket hang up/i, "SOCKET_HANG_UP"], - [/premature close/i, "PREMATURE_CLOSE"], +// code (undici/Node throw these in some paths). Anchored to the whole trimmed +// message: Node/undici set the entire message to exactly "socket hang up" / +// "premature close", so a longer message that merely *contains* the phrase +// (e.g. a structured provider error) is NOT a bare transport failure. +// "terminated" is intentionally excluded — on its own it is undici's generic +// fetch wrapper and is only a transport failure when its cause carries a code. +const BARE_TRANSPORT_MESSAGES: ReadonlyArray<[RegExp, string]> = [ + [/^socket hang up$/i, "SOCKET_HANG_UP"], + [/^premature close$/i, "PREMATURE_CLOSE"], ] function disconnect(code: string): TransportDisconnect { - return { kind: "provider_transport_disconnect", retryable: !NON_RETRYABLE_TRANSPORT_CODES.has(code), code } + // ENOTFOUND = the hostname did not resolve (usually a misconfigured base URL + // that will never resolve), so auto-retrying just stalls the turn. Everything + // else in TRANSPORT_CODES is transient; EAI_AGAIN (transient DNS) stays + // retryable. Promote to a set if a second permanent code ever appears. + return { kind: "provider_transport_disconnect", retryable: code !== "ENOTFOUND", code } } export function classifyStreamFailure(error: unknown): TransportDisconnect | undefined { @@ -48,8 +49,7 @@ export function classifyStreamFailure(error: unknown): TransportDisconnect | und // An error carrying an HTTP statusCode means an HTTP response was received, so // it is an API error classified by status downstream (the APICallError branch // in fromError, which runs after this). It must not be reclassified as a - // transport disconnect — not by a transport-coded cause, and not by a - // transport phrase in its message. + // transport disconnect by a transport-coded cause. if (typeof (error as { statusCode?: unknown }).statusCode === "number") return undefined const topCode = (error as { code?: string }).code @@ -62,13 +62,21 @@ export function classifyStreamFailure(error: unknown): TransportDisconnect | und return disconnect(code) } - // Message fallback is a last resort for raw connection errors that carry no - // errno code. - const message = typeof error.message === "string" ? error.message : "" - for (const [pattern, fallbackCode] of TRANSIENT_MESSAGE_CODES) { - if (pattern.test(message)) return { kind: "provider_transport_disconnect", retryable: true, code: fallbackCode } - } + return undefined +} +// Bare connection-dropped message with no errno code — the true last resort, +// applied by fromError only AFTER structured stream/provider parsing fails, so a +// structured error (e.g. invalid_prompt) whose text merely contains a transport +// phrase, or carries it in cause.body, is classified by its real kind instead of +// being mis-read as a retryable disconnect. +export function classifyBareTransportMessage(error: unknown): TransportDisconnect | undefined { + if (!(error instanceof Error)) return undefined + if (typeof (error as { statusCode?: unknown }).statusCode === "number") return undefined + const message = typeof error.message === "string" ? error.message.trim() : "" + for (const [pattern, code] of BARE_TRANSPORT_MESSAGES) { + if (pattern.test(message)) return { kind: "provider_transport_disconnect", retryable: true, code } + } return undefined } diff --git a/packages/opencode/test/session/message-v2.test.ts b/packages/opencode/test/session/message-v2.test.ts index e210917f7..8a5073055 100644 --- a/packages/opencode/test/session/message-v2.test.ts +++ b/packages/opencode/test/session/message-v2.test.ts @@ -2060,6 +2060,31 @@ describe("session.message-v2.fromError", () => { }) }) + test("a structured stream error in cause.body wins over a bare transport message", () => { + // Regression (review P2): the bare-message transport fallback runs only after + // structured stream parsing. An Error whose message is "socket hang up" but + // whose cause.body is a structured invalid_prompt must classify as + // invalid_request (non-retryable), not a retryable transport disconnect. + const error = Object.assign(new Error("socket hang up"), { + cause: { body: { type: "error", error: { code: "invalid_prompt", message: "bad prompt" } } }, + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.providerFailure?.kind).toBe("invalid_request") + expect((result as MessageV2.APIError).data.isRetryable).toBe(false) + }) + + test("a bare 'socket hang up' (no code, no structure) is still a retryable transport disconnect", () => { + const result = MessageV2.fromError(new Error("socket hang up"), { providerID }) + + expect((result as MessageV2.APIError).data.isRetryable).toBe(true) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "SOCKET_HANG_UP", + }) + }) + test("does not let a transport phrase or transport cause on an APICallError hijack status classification", () => { // A 400 invalid_request whose message echoes "socket hang up" AND whose cause // carries a transport code must stay invalid_request, not be reclassified as From 772714a4144801c0394a7e8169097ff2dc02c395 Mon Sep 17 00:00:00 2001 From: Yuhan Lei Date: Tue, 23 Jun 2026 17:31:27 +0800 Subject: [PATCH 3/3] style(opencode): scope fromError default clause in a block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default clause now declares `const bareTransport` directly at clause scope (a lexical declaration introduced by the bare-transport fallback). Wrap the clause in braces so the binding is scoped to the clause rather than the whole switch — addresses CodeRabbit's noSwitchDeclarations nit. Pure scoping change, no behavior difference. Claude-Session: https://claude.ai/code/session_012743rGkjEzqaUMKy2nvYMM --- packages/opencode/src/session/message-v2.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts index c0b12dca2..c13fa7ae6 100644 --- a/packages/opencode/src/session/message-v2.ts +++ b/packages/opencode/src/session/message-v2.ts @@ -1435,7 +1435,7 @@ export function fromError( }, { cause: e }, ).toObject() - default: + default: { // A provider error can arrive raw or wrapped in an Error (the stream // "error" part throws value.error; the iterator-throw mapper hands back a // value). Run the stream parser for both before falling back to Unknown so @@ -1474,6 +1474,7 @@ export function fromError( { message: e instanceof Error ? errorMessage(e) : JSON.stringify(e) }, { cause: e }, ).toObject() + } } }