diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts index ec45707ff..c13fa7ae6 100644 --- a/packages/opencode/src/session/message-v2.ts +++ b/packages/opencode/src/session/message-v2.ts @@ -19,7 +19,7 @@ import { ModelID, ProviderID } from "@/provider/schema" import { Effect } from "effect" import { EffectLogger } from "@/effect" import { isMedia } from "@/util/media" -import { classifyStreamFailure } from "./stream-failure-classifier" +import { classifyStreamFailure, classifyBareTransportMessage, type TransportDisconnect } from "./stream-failure-classifier" import { LLMTrace } from "./llm-trace" import { RunObservability } from "./run-observability" import { RunLifecycle } from "./run-lifecycle" @@ -1344,6 +1344,25 @@ export const filterCompactedEffect = Effect.fnUntraced(function* (sessionID: Ses return filterCompacted(stream(sessionID)) }) +// Build the transport-disconnect APIError from a classified disconnect. Shared +// by the errno-code path (high priority, ahead of stream parsing) and the +// bare-message fallback (last resort, after stream parsing fails). +function transportDisconnectError(e: unknown, transport: TransportDisconnect) { + const message = (e as Error).message || "" + return new APIError( + { + // Per-errno: most transport disconnects are retryable, but a permanent one + // (e.g. ENOTFOUND, an unresolved host) is not — classifyRetry reads this so + // it stops instead of retrying into a stall. + message: message || "Connection interrupted", + isRetryable: transport.retryable, + metadata: { code: transport.code, message }, + providerFailure: { kind: "transport_disconnect", code: transport.code }, + }, + { cause: e }, + ).toObject() +} + export function fromError( e: unknown, ctx: { providerID: ProviderID; aborted?: boolean }, @@ -1366,21 +1385,12 @@ export function fromError( }, { cause: e }, ).toObject() - case classifyStreamFailure(e) !== undefined: { - const transport = classifyStreamFailure(e)! - return new APIError( - { - message: (e as Error).message || "Connection interrupted", - isRetryable: true, - metadata: { - code: transport.code, - message: (e as Error).message || "", - }, - providerFailure: { kind: "transport_disconnect", code: transport.code }, - }, - { cause: e }, - ).toObject() - } + // Errno-coded transport disconnect (top-level or in the cause chain) is a + // definitive signal, so it runs ahead of stream parsing. The bare-message + // fallback does NOT run here — it is demoted below parseStreamError so a + // structured error carrying a transport phrase is not mis-grabbed. + case classifyStreamFailure(e) !== undefined: + return transportDisconnectError(e, classifyStreamFailure(e)!) case e instanceof Error && (e as FetchDecompressionError).code === "ZlibError": if (ctx.aborted) { return new AbortedError({ message: e.message }, { cause: e }).toObject() @@ -1425,7 +1435,7 @@ export function fromError( }, { cause: e }, ).toObject() - default: + default: { // A provider error can arrive raw or wrapped in an Error (the stream // "error" part throws value.error; the iterator-throw mapper hands back a // value). Run the stream parser for both before falling back to Unknown so @@ -1456,10 +1466,15 @@ export function fromError( ).toObject() } } catch {} + // Last resort: a bare connection-dropped message (no errno code, not a + // structured stream error) is still a retryable transport disconnect. + const bareTransport = classifyBareTransportMessage(e) + if (bareTransport) return transportDisconnectError(e, bareTransport) return new NamedError.Unknown( { message: e instanceof Error ? errorMessage(e) : JSON.stringify(e) }, { cause: e }, ).toObject() + } } } diff --git a/packages/opencode/src/session/retry.test.ts b/packages/opencode/src/session/retry.test.ts index ba25fb131..0502cc2ba 100644 --- a/packages/opencode/src/session/retry.test.ts +++ b/packages/opencode/src/session/retry.test.ts @@ -128,7 +128,7 @@ describe("classifyRetry — reads providerFailure.kind (slice ④)", () => { // Transient kinds always retry, even if isRetryable is false and the status is // not 5xx — the classification, not the SDK flag, is the source of truth. - for (const kind of ["rate_limit", "server_overload", "transport_disconnect", "decompression"] as const) { + for (const kind of ["rate_limit", "server_overload", "decompression"] as const) { test(`transient kind ${kind} retries even when isRetryable is false and status is not 5xx`, () => { expect( classifyRetry(makeAPIError({ isRetryable: false, statusCode: 400, providerFailure: { kind } }))?.kind, @@ -136,6 +136,22 @@ describe("classifyRetry — reads providerFailure.kind (slice ④)", () => { }) } + // transport_disconnect honors the per-errno isRetryable the stream classifier + // sets (#1105b): most transport errnos are transient, but a permanent one + // (e.g. ENOTFOUND — unresolved host) is marked isRetryable=false and must not + // auto-retry into a stall. + test("transport_disconnect retries when the classifier marked it retryable", () => { + expect( + classifyRetry(makeAPIError({ isRetryable: true, providerFailure: { kind: "transport_disconnect" } }))?.kind, + ).toBe("unknown") + }) + + test("transport_disconnect does not retry when the classifier marked it non-retryable", () => { + expect( + classifyRetry(makeAPIError({ isRetryable: false, providerFailure: { kind: "transport_disconnect" } })), + ).toBeUndefined() + }) + test("unknown kind falls back to the legacy isRetryable + 5xx gate", () => { expect( classifyRetry(makeAPIError({ isRetryable: true, statusCode: 404, providerFailure: { kind: "unknown" } }))?.kind, diff --git a/packages/opencode/src/session/retry.ts b/packages/opencode/src/session/retry.ts index 6758888d7..909d39915 100644 --- a/packages/opencode/src/session/retry.ts +++ b/packages/opencode/src/session/retry.ts @@ -16,13 +16,10 @@ export type { RetryAction } from "./retry-classification" // always retry. classifyRetry reads these so the retry decision follows the // canonical classification instead of re-deriving it from the provider SDK's // isRetryable flag; `unknown` or absent kinds fall back to that legacy signal. +// transport_disconnect is handled separately: it honors the per-errno +// isRetryable the stream classifier set (most errnos transient, ENOTFOUND not). const RETRY_TERMINAL_KINDS = new Set(["auth", "invalid_request", "quota_exhausted"]) -const RETRY_TRANSIENT_KINDS = new Set([ - "rate_limit", - "server_overload", - "transport_disconnect", - "decompression", -]) +const RETRY_TRANSIENT_KINDS = new Set(["rate_limit", "server_overload", "decompression"]) export const RETRY_INITIAL_DELAY = 2000 export const RETRY_BACKOFF_FACTOR = 2 @@ -85,16 +82,19 @@ export function classifyRetry(error: Err): RetryClassification | undefined { const status = error.data.statusCode const kind = error.data.providerFailure?.kind // Retry/stop gate. Prefer the canonical providerFailure.kind: terminal kinds - // never retry, transient kinds always do. When the kind is `unknown` or the - // row predates providerFailure, fall back to the legacy signal — isRetryable, - // or a 5xx the provider SDK didn't explicitly mark retryable. The two agree - // for every classified kind today. + // never retry, transient kinds always do. transport_disconnect honors the + // per-errno isRetryable the stream classifier set (most errnos transient, + // ENOTFOUND not). When the kind is `unknown` or the row predates + // providerFailure, fall back to the legacy signal — isRetryable, or a 5xx the + // provider SDK didn't explicitly mark retryable. const retryable = kind && RETRY_TERMINAL_KINDS.has(kind) ? false - : kind && RETRY_TRANSIENT_KINDS.has(kind) - ? true - : error.data.isRetryable || (status !== undefined && status >= 500) + : kind === "transport_disconnect" + ? error.data.isRetryable + : kind && RETRY_TRANSIENT_KINDS.has(kind) + ? true + : error.data.isRetryable || (status !== undefined && status >= 500) if (!retryable) return undefined // Strict 3-way AND: opencode provider + FreeUsageLimitError marker in body diff --git a/packages/opencode/src/session/stream-failure-classifier.test.ts b/packages/opencode/src/session/stream-failure-classifier.test.ts index b7a6a7e7d..f24a951aa 100644 --- a/packages/opencode/src/session/stream-failure-classifier.test.ts +++ b/packages/opencode/src/session/stream-failure-classifier.test.ts @@ -1,85 +1,107 @@ import { describe, expect, test } from "bun:test" -import { classifyStreamFailure } from "./stream-failure-classifier" +import { classifyStreamFailure, classifyBareTransportMessage } from "./stream-failure-classifier" -describe("classifyStreamFailure", () => { - describe("transport disconnect — retryable", () => { - test("ECONNRESET SystemError", () => { - const error = Object.assign(new Error("read ECONNRESET"), { code: "ECONNRESET", syscall: "read" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ECONNRESET", - }) - }) +// Every transport errno and its expected retryability. ENOTFOUND is the only +// permanent one (unresolved host); everything else is transient. Driven as a +// matrix so adding/renaming a code in TRANSPORT_CODES forces a matching row. +const TRANSPORT_CODE_MATRIX: ReadonlyArray = [ + ["ECONNRESET", true], + ["ECONNREFUSED", true], + ["ETIMEDOUT", true], + ["ECONNABORTED", true], + ["EPIPE", true], + ["EHOSTUNREACH", true], + ["ENETUNREACH", true], + ["EAI_AGAIN", true], + ["ENOTFOUND", false], + ["UND_ERR_SOCKET", true], + ["UND_ERR_CONNECT_TIMEOUT", true], + ["UND_ERR_HEADERS_TIMEOUT", true], + ["UND_ERR_BODY_TIMEOUT", true], +] - test("UND_ERR_SOCKET — TypeError('terminated') with cause.code", () => { - const cause = Object.assign(new Error("other side closed"), { code: "UND_ERR_SOCKET" }) - const error = new TypeError("terminated", { cause }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "UND_ERR_SOCKET", - }) +describe("classifyStreamFailure — transport code matrix", () => { + for (const [code, retryable] of TRANSPORT_CODE_MATRIX) { + test(`top-level ${code} → transport disconnect (retryable=${retryable})`, () => { + const error = Object.assign(new Error(`failed: ${code}`), { code }) + expect(classifyStreamFailure(error)).toEqual({ kind: "provider_transport_disconnect", retryable, code }) }) - test("ECONNREFUSED SystemError", () => { - const error = Object.assign(new Error("connect ECONNREFUSED"), { code: "ECONNREFUSED", syscall: "connect" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ECONNREFUSED", - }) + test(`${code} nested in the cause chain → transport disconnect (retryable=${retryable})`, () => { + const cause = Object.assign(new Error("inner"), { code }) + const error = new TypeError("fetch failed", { cause }) + expect(classifyStreamFailure(error)).toEqual({ kind: "provider_transport_disconnect", retryable, code }) }) + } +}) - test("ETIMEDOUT SystemError", () => { - const error = Object.assign(new Error("connect ETIMEDOUT"), { code: "ETIMEDOUT", syscall: "connect" }) - const result = classifyStreamFailure(error) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "ETIMEDOUT", - }) - }) +describe("classifyStreamFailure — not a code-based transport error", () => { + test("generic Error returns undefined", () => { + expect(classifyStreamFailure(new Error("something broke"))).toBeUndefined() + }) - test("UND_ERR_SOCKET nested in cause chain", () => { - const innerCause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) - const midError = new Error("fetch failed", { cause: innerCause }) - const outerError = new TypeError("terminated", { cause: midError }) - const result = classifyStreamFailure(outerError) - expect(result).toEqual({ - kind: "provider_transport_disconnect", - retryable: true, - code: "UND_ERR_SOCKET", - }) - }) + test("unknown errno code returns undefined", () => { + expect(classifyStreamFailure(Object.assign(new Error("x"), { code: "ESOMETHINGELSE" }))).toBeUndefined() }) - describe("non-transport errors — not classified", () => { - test("generic Error returns undefined", () => { - expect(classifyStreamFailure(new Error("something broke"))).toBeUndefined() - }) + test("AbortError returns undefined", () => { + expect(classifyStreamFailure(new DOMException("The operation was aborted", "AbortError"))).toBeUndefined() + }) - test("TypeError without transport cause returns undefined", () => { - expect(classifyStreamFailure(new TypeError("Cannot read properties of undefined"))).toBeUndefined() - }) + test("non-Error values return undefined", () => { + expect(classifyStreamFailure("string error")).toBeUndefined() + expect(classifyStreamFailure(null)).toBeUndefined() + expect(classifyStreamFailure(42)).toBeUndefined() + }) - test("AbortError returns undefined", () => { - const error = new DOMException("The operation was aborted", "AbortError") - expect(classifyStreamFailure(error)).toBeUndefined() - }) + test("TypeError('terminated') without a transport-coded cause returns undefined", () => { + expect(classifyStreamFailure(new TypeError("terminated", { cause: new Error("unrelated") }))).toBeUndefined() + }) - test("non-Error values return undefined", () => { - expect(classifyStreamFailure("string error")).toBeUndefined() - expect(classifyStreamFailure(null)).toBeUndefined() - expect(classifyStreamFailure(42)).toBeUndefined() + test("a bare 'socket hang up' message is NOT code-classified (the bare-message fallback owns it)", () => { + // The message fallback was moved out of classifyStreamFailure so structured + // stream parsing can run first; the code path only matches errno codes. + expect(classifyStreamFailure(new Error("socket hang up"))).toBeUndefined() + }) + + test("HTTP error (statusCode) with a transport-coded cause stays an API error, not transport", () => { + const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) + const error = Object.assign(new Error("400 invalid request"), { statusCode: 400, cause }) + expect(classifyStreamFailure(error)).toBeUndefined() + }) +}) + +describe("classifyBareTransportMessage — anchored bare connection messages", () => { + test("exact 'socket hang up' → retryable transport", () => { + expect(classifyBareTransportMessage(new Error("socket hang up"))).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "SOCKET_HANG_UP", }) + }) - test("TypeError('terminated') without UND_ERR_SOCKET cause returns undefined", () => { - const error = new TypeError("terminated", { cause: new Error("unrelated") }) - expect(classifyStreamFailure(error)).toBeUndefined() + test("exact 'premature close' (surrounding whitespace trimmed) → retryable transport", () => { + expect(classifyBareTransportMessage(new Error(" premature close\n"))).toEqual({ + kind: "provider_transport_disconnect", + retryable: true, + code: "PREMATURE_CLOSE", }) }) + + test("a longer message that merely CONTAINS the phrase is not a bare transport error", () => { + expect( + classifyBareTransportMessage(new Error("invalid request: socket hang up is not allowed")), + ).toBeUndefined() + }) + + test("an HTTP error (statusCode) whose message is exactly 'socket hang up' is not bare transport", () => { + expect( + classifyBareTransportMessage(Object.assign(new Error("socket hang up"), { statusCode: 502 })), + ).toBeUndefined() + }) + + test("non-Error value and unrelated message return undefined", () => { + expect(classifyBareTransportMessage("socket hang up")).toBeUndefined() + expect(classifyBareTransportMessage(new Error("something else"))).toBeUndefined() + }) }) diff --git a/packages/opencode/src/session/stream-failure-classifier.ts b/packages/opencode/src/session/stream-failure-classifier.ts index dcfab658a..58dae8c30 100644 --- a/packages/opencode/src/session/stream-failure-classifier.ts +++ b/packages/opencode/src/session/stream-failure-classifier.ts @@ -1,27 +1,85 @@ export type TransportDisconnect = { kind: "provider_transport_disconnect" - retryable: true + retryable: boolean code: string } -const TRANSPORT_CODES = new Set(["ECONNRESET", "ECONNREFUSED", "ETIMEDOUT", "UND_ERR_SOCKET"]) +// errno / undici codes that mean the connection dropped, timed out, or could +// not be established. All are transient (worth retrying) except ENOTFOUND +// (see disconnect()). +const TRANSPORT_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ETIMEDOUT", + "ECONNABORTED", + "EPIPE", + "EHOSTUNREACH", + "ENETUNREACH", + "EAI_AGAIN", // transient DNS lookup failure — retry + "ENOTFOUND", // permanent DNS failure — non-retryable (see disconnect()) + "UND_ERR_SOCKET", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_HEADERS_TIMEOUT", + "UND_ERR_BODY_TIMEOUT", +]) + +// Connection-dropped signals that can arrive as a bare message with no errno +// code (undici/Node throw these in some paths). Anchored to the whole trimmed +// message: Node/undici set the entire message to exactly "socket hang up" / +// "premature close", so a longer message that merely *contains* the phrase +// (e.g. a structured provider error) is NOT a bare transport failure. +// "terminated" is intentionally excluded — on its own it is undici's generic +// fetch wrapper and is only a transport failure when its cause carries a code. +const BARE_TRANSPORT_MESSAGES: ReadonlyArray<[RegExp, string]> = [ + [/^socket hang up$/i, "SOCKET_HANG_UP"], + [/^premature close$/i, "PREMATURE_CLOSE"], +] + +function disconnect(code: string): TransportDisconnect { + // ENOTFOUND = the hostname did not resolve (usually a misconfigured base URL + // that will never resolve), so auto-retrying just stalls the turn. Everything + // else in TRANSPORT_CODES is transient; EAI_AGAIN (transient DNS) stays + // retryable. Promote to a set if a second permanent code ever appears. + return { kind: "provider_transport_disconnect", retryable: code !== "ENOTFOUND", code } +} export function classifyStreamFailure(error: unknown): TransportDisconnect | undefined { if (!(error instanceof Error)) return undefined + // An error carrying an HTTP statusCode means an HTTP response was received, so + // it is an API error classified by status downstream (the APICallError branch + // in fromError, which runs after this). It must not be reclassified as a + // transport disconnect by a transport-coded cause. + if (typeof (error as { statusCode?: unknown }).statusCode === "number") return undefined + const topCode = (error as { code?: string }).code if (typeof topCode === "string" && TRANSPORT_CODES.has(topCode)) { - return { kind: "provider_transport_disconnect", retryable: true, code: topCode } + return disconnect(topCode) } const code = findTransportCodeInCause(error.cause) if (code) { - return { kind: "provider_transport_disconnect", retryable: true, code } + return disconnect(code) } return undefined } +// Bare connection-dropped message with no errno code — the true last resort, +// applied by fromError only AFTER structured stream/provider parsing fails, so a +// structured error (e.g. invalid_prompt) whose text merely contains a transport +// phrase, or carries it in cause.body, is classified by its real kind instead of +// being mis-read as a retryable disconnect. +export function classifyBareTransportMessage(error: unknown): TransportDisconnect | undefined { + if (!(error instanceof Error)) return undefined + if (typeof (error as { statusCode?: unknown }).statusCode === "number") return undefined + const message = typeof error.message === "string" ? error.message.trim() : "" + for (const [pattern, code] of BARE_TRANSPORT_MESSAGES) { + if (pattern.test(message)) return { kind: "provider_transport_disconnect", retryable: true, code } + } + return undefined +} + function findTransportCodeInCause(cause: unknown, depth = 0): string | undefined { if (depth > 4 || !cause || typeof cause !== "object") return undefined try { diff --git a/packages/opencode/test/session/message-v2.test.ts b/packages/opencode/test/session/message-v2.test.ts index cf8585e95..8a5073055 100644 --- a/packages/opencode/test/session/message-v2.test.ts +++ b/packages/opencode/test/session/message-v2.test.ts @@ -2029,6 +2029,83 @@ describe("session.message-v2.fromError", () => { }) }) + test("classifies ENOTFOUND as a non-retryable transport disconnect", () => { + const error = Object.assign(new Error("getaddrinfo ENOTFOUND api.wrong-host.invalid"), { + code: "ENOTFOUND", + syscall: "getaddrinfo", + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect(MessageV2.APIError.isInstance(result)).toBe(true) + expect((result as MessageV2.APIError).data.isRetryable).toBe(false) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "ENOTFOUND", + }) + }) + + test("classifies EAI_AGAIN as a retryable transport disconnect", () => { + const error = Object.assign(new Error("getaddrinfo EAI_AGAIN api.example.com"), { + code: "EAI_AGAIN", + syscall: "getaddrinfo", + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.isRetryable).toBe(true) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "EAI_AGAIN", + }) + }) + + test("a structured stream error in cause.body wins over a bare transport message", () => { + // Regression (review P2): the bare-message transport fallback runs only after + // structured stream parsing. An Error whose message is "socket hang up" but + // whose cause.body is a structured invalid_prompt must classify as + // invalid_request (non-retryable), not a retryable transport disconnect. + const error = Object.assign(new Error("socket hang up"), { + cause: { body: { type: "error", error: { code: "invalid_prompt", message: "bad prompt" } } }, + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.providerFailure?.kind).toBe("invalid_request") + expect((result as MessageV2.APIError).data.isRetryable).toBe(false) + }) + + test("a bare 'socket hang up' (no code, no structure) is still a retryable transport disconnect", () => { + const result = MessageV2.fromError(new Error("socket hang up"), { providerID }) + + expect((result as MessageV2.APIError).data.isRetryable).toBe(true) + expect((result as MessageV2.APIError).data.providerFailure).toStrictEqual({ + kind: "transport_disconnect", + code: "SOCKET_HANG_UP", + }) + }) + + test("does not let a transport phrase or transport cause on an APICallError hijack status classification", () => { + // A 400 invalid_request whose message echoes "socket hang up" AND whose cause + // carries a transport code must stay invalid_request, not be reclassified as + // a retryable transport disconnect (statusCode present = HTTP response). + const cause = Object.assign(new Error("socket hang up"), { code: "UND_ERR_SOCKET" }) + const error = new APICallError({ + message: "400 invalid request: socket hang up is not allowed", + url: "https://example.com", + requestBodyValues: {}, + statusCode: 400, + responseHeaders: { "content-type": "application/json" }, + responseBody: JSON.stringify({ error: { code: "invalid_prompt", message: "socket hang up is not allowed" } }), + isRetryable: false, + cause, + }) + + const result = MessageV2.fromError(error, { providerID }) + + expect((result as MessageV2.APIError).data.providerFailure?.kind).toBe("invalid_request") + }) + test("populates providerFailure for decompression failures", () => { const zlibError = Object.assign( new Error('ZlibError fetching "https://opencode.cloudflare.dev/anthropic/messages".'),