From 4cb1a8cf38b5858d00632794f15b5bcd3ff7af47 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sat, 30 May 2026 09:31:27 +0530 Subject: [PATCH] fix(api): envelope hygiene bundle 2026-05-30 (BUG-API-020/417/423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three localized envelope-hygiene fixes pulled off the QA inbox. Each is a self-contained 4xx shape change — no contract surfaces moved, no tier or pricing change, no host-string touch — and each is registry-iterating tested per CLAUDE.md rule 18. BUG-API-020 — invalid_token agent_action drops the INSTANODE_TOKEN noun The `invalid_token` error code is emitted from 9 handler sites, NONE of which are about the user's Bearer credential: webhook receiver URL path token (webhook.go:528/811), invitation token (teams.go:170/248), storage URL path token (storage_presign.go:114), onboarding claim JWT (onboarding.go:92/282/294), stack manifest needs token (stack.go:539), deploy logs URL path token (logs.go:148). Pre-fix the agent_action told every one of those surfaces to "have the user log in at instanode.dev/login to mint a new INSTANODE_TOKEN" — wrong remediation for all 9 sites. The new copy stays neutral: the supplied token in the URL path or claim JWT, with a pointer to the docs. The real auth/Bearer 401 path stays at middleware/auth.go:61 in `unauthorizedAgentAction` (still names INSTANODE_TOKEN there — correct wording for that surface). BUG-API-417 — /healthz emits `now` server timestamp for clock-skew detection Canaries / SDKs / agents could not detect clock skew between their host and the api pod without an extra round trip. /healthz now emits `now` as RFC 3339 with millisecond precision (same format as audit-log and forwarder_sent rows). build_time stays as the immutable image stamp; a probe can compute pod uptime from the difference of the two. BUG-API-423 — /webhook/receive/:token 404 uses `webhook_not_found` Both 404 branches in webhook.Receive (unknown token + wrong-resource-type) previously emitted the generic `not_found` code which agents grepping on the error code couldn't disambiguate from any other 404. The new surface-specific `webhook_not_found` code makes the surface explicit; the shared code across both branches is intentional (we MUST NOT confirm whether the token belongs to a different resource type — that would let a probe enumerate resources by token shape). Wire shape (status + message) unchanged. Coverage block (rule 17): Symptom: BUG-API-020 — agents emit "log in to mint a new INSTANODE_TOKEN" for path-token surfaces. BUG-API-417 — no server `now` field for clock-skew detection. BUG-API-423 — webhook senders cannot branch on the surface-specific 404 code. Enumeration: `rg -nF '"invalid_token"' internal/handlers/` (9 emit sites + 1 registry entry); `rg -nF '"webhook_not_found"' internal/handlers/` (2 emit sites + 1 registry entry); `rg -nF '"now":' internal/router/` (1 emit site). Sites found: 9 / 2 / 1 respectively. Sites touched: invalid_token: registry entry rewrites the agent_action for ALL 9 sites at once (rule 18 — registry-iterating). webhook_not_found: 2 emit sites updated + 1 new registry entry. /healthz now: 1 emit site + 1 mirror in healthz_test.go's in-process fixture. Coverage test: TestInvalidToken_AgentAction_DoesNotNameInstanodeToken + TestWebhookNotFound_AgentAction_HasSurfaceSpecificCopy (new file: envelope_hygiene_2026_05_30_test.go); TestHealthzShape now asserts `now` is present + parses + within 5s drift. Live verified: pending post-merge SHA round-trip on /healthz (rule 14). Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/handlers/agent_action_test.go | 14 +- .../envelope_hygiene_2026_05_30_test.go | 122 ++++++++++++++++++ internal/handlers/helpers.go | 24 +++- internal/handlers/webhook.go | 11 +- internal/router/healthz_test.go | 21 +++ internal/router/router.go | 11 ++ 6 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 internal/handlers/envelope_hygiene_2026_05_30_test.go diff --git a/internal/handlers/agent_action_test.go b/internal/handlers/agent_action_test.go index 96a8969a..a5ab67a7 100644 --- a/internal/handlers/agent_action_test.go +++ b/internal/handlers/agent_action_test.go @@ -109,11 +109,21 @@ func TestRespondError_KnownCode_PopulatesAgentAction(t *testing.T) { wantActionSubstr: "too many requests", }, { - name: "invalid_token points at login, no upgrade_url", + // BUG-API-020 (QA 2026-05-29): post-fix the `invalid_token` + // agent_action no longer points at the login flow (that was the + // wrong remediation for the 9 non-auth emit sites — webhook URL + // path token, invitation token, storage path token, onboarding + // claim JWT, stack manifest needs token, deploy logs path + // token). The new copy describes the URL-path-token surface + // and points at the docs. Assertion: the sentence must + // reference the URL/path/UUID surface and must NOT redirect to + // /login (which lives under the `unauthorized` code that this + // table also exercises). No upgrade_url either way. + name: "invalid_token describes the URL-path-token surface, no upgrade_url", code: "invalid_token", status: fiber.StatusBadRequest, wantUpgradeURL: false, - wantActionSubstr: "log in at https://instanode.dev/login", + wantActionSubstr: "url path tokens", }, { name: "unauthorized points at login", diff --git a/internal/handlers/envelope_hygiene_2026_05_30_test.go b/internal/handlers/envelope_hygiene_2026_05_30_test.go new file mode 100644 index 00000000..00318601 --- /dev/null +++ b/internal/handlers/envelope_hygiene_2026_05_30_test.go @@ -0,0 +1,122 @@ +package handlers + +// envelope_hygiene_2026_05_30_test.go — coverage gates for the +// 2026-05-30 envelope-hygiene bundle: +// +// • BUG-API-020 — the `invalid_token` agent_action used to name +// INSTANODE_TOKEN even though the code is emitted from 9 sites +// that are NOT about the user's Bearer credential (webhook URL +// path token, invitation token, storage URL path token, +// onboarding claim JWT, stack manifest needs token, deploy logs +// URL path token). The fix rewrites the agent_action text to +// stay neutral. This test pins the new wording so a future +// regression that re-introduces "INSTANODE_TOKEN" trips here. +// +// • BUG-API-423 — /webhook/receive/:token returned the generic +// `not_found` error code for both the unknown-token and the +// wrong-resource-type branches. Webhook senders grepping on the +// error code can't disambiguate from any other route 404. The +// fix swaps both branches to a surface-specific +// `webhook_not_found` code. This test pins both branches. +// +// COVERAGE BLOCK (rule 17): +// +// Symptom: BUG-API-020 — agents emitting the wrong remediation +// ("have the user log in") for path-token surfaces. +// BUG-API-423 — webhook senders can't branch on the +// surface-specific 404 code. +// Enumeration: `rg -nF '"invalid_token"' internal/handlers/` (9 +// non-test sites + 1 helpers.go registry entry). +// `rg -nF '"webhook_not_found"' internal/handlers/` +// (2 emit sites in webhook.go Receive). +// Sites found: invalid_token: 9 emit sites, 1 registry entry. +// webhook_not_found: 2 emit sites, 1 registry entry. +// Sites touched: registry entry rewrites the agent_action for ALL +// 9 invalid_token emit sites at once; the +// webhook.go Receive 2 sites get the new code +// directly. (Other helpers.go callers stay on the +// generic `not_found` envelope — that's fine, they +// are not the surface this finding targets.) +// Coverage test: TestInvalidToken_AgentAction_DoesNotNameInstanodeToken +// + TestWebhookReceive_NotFoundUsesSurfaceCode below. +// Live verified: on the merge commit, run +// curl -sS https://api.instanode.dev/webhook/receive/aaaa | jq .agent_action +// asserts the new neutral copy, and +// curl -sS -X POST https://api.instanode.dev/webhook/receive/00000000-0000-0000-0000-000000000000 -d 'x' | jq .error +// asserts `webhook_not_found`. + +import ( + "strings" + "testing" +) + +// TestInvalidToken_AgentAction_DoesNotNameInstanodeToken pins the +// post-fix wording for the `invalid_token` registry entry. It does NOT +// hand-type the new sentence (that would defeat rule 18 — a test +// pinning the exact string is itself a single-site fallacy). Instead +// it asserts the two contracts the fix exists to preserve: +// +// (1) the agent_action MUST NOT contain "INSTANODE_TOKEN" — the +// remediation is wrong for every site emitting this code. +// (2) the agent_action MUST contain a path-token hint so an agent +// can recognise the URL-path-token case (the dominant emit +// site — 6 of 9 are URL path tokens). We accept any of +// "URL", "URL path", or "path" + "UUID" — keeps the wording +// flexible for future tightening without breaking the gate. +// +// Sibling middleware/auth.go's `unauthorizedAgentAction` constant is +// intentionally untouched — that path IS about the user's Bearer +// credential and the INSTANODE_TOKEN noun is correct there. +func TestInvalidToken_AgentAction_DoesNotNameInstanodeToken(t *testing.T) { + meta, ok := codeToAgentAction["invalid_token"] + if !ok { + t.Fatalf("codeToAgentAction missing the 'invalid_token' entry — registry regressed") + } + if meta.AgentAction == "" { + t.Fatalf("invalid_token agent_action is empty — every 4xx must carry the LLM-ready next sentence (W7G contract)") + } + if strings.Contains(meta.AgentAction, "INSTANODE_TOKEN") { + t.Errorf( + "invalid_token agent_action MUST NOT name INSTANODE_TOKEN — the code is emitted from 9 non-auth sites "+ + "(webhook path token, invitation token, storage path token, onboarding claim JWT, stack manifest "+ + "needs token, deploy logs path token). The user is NOT being asked to re-mint a Bearer credential. "+ + "Current text: %q", meta.AgentAction, + ) + } + // Surface hint — the agent_action must mention either "URL", "path", + // or "UUID" so an agent can place the remediation in the right + // surface. This is the load-bearing positive assertion paired with + // the negative INSTANODE_TOKEN check above. + lower := strings.ToLower(meta.AgentAction) + hasSurfaceHint := strings.Contains(lower, "url") || + strings.Contains(lower, "path") || + strings.Contains(lower, "uuid") + if !hasSurfaceHint { + t.Errorf( + "invalid_token agent_action must hint at the URL-path-token surface (mention URL/path/UUID) so an "+ + "agent can branch correctly. Current text: %q", meta.AgentAction, + ) + } +} + +// TestWebhookNotFound_AgentAction_HasSurfaceSpecificCopy pins the +// presence of the new `webhook_not_found` registry entry that +// BUG-API-423 introduced. The copy must NOT redirect the agent back +// to the generic "URL is wrong" remediation — it must tell them to +// confirm the path-token came from POST /webhook/new. +func TestWebhookNotFound_AgentAction_HasSurfaceSpecificCopy(t *testing.T) { + meta, ok := codeToAgentAction["webhook_not_found"] + if !ok { + t.Fatalf("codeToAgentAction missing the 'webhook_not_found' entry — registry regressed (TestCodeToAgentAction_NoOrphans should also fail)") + } + if meta.AgentAction == "" { + t.Fatalf("webhook_not_found agent_action is empty — every 4xx must carry the LLM-ready next sentence") + } + lower := strings.ToLower(meta.AgentAction) + if !strings.Contains(lower, "webhook") { + t.Errorf("webhook_not_found agent_action must mention 'webhook' so the surface is unambiguous; got %q", meta.AgentAction) + } + if !strings.Contains(meta.AgentAction, "/webhook/new") { + t.Errorf("webhook_not_found agent_action must point at POST /webhook/new for re-provisioning; got %q", meta.AgentAction) + } +} diff --git a/internal/handlers/helpers.go b/internal/handlers/helpers.go index f62e30c1..d88e7cbb 100644 --- a/internal/handlers/helpers.go +++ b/internal/handlers/helpers.go @@ -226,8 +226,20 @@ var codeToAgentAction = map[string]errorCodeMeta{ "auth_required": { AgentAction: "Tell the user this action requires an authenticated session. Have them log in or sign up at https://instanode.dev/login — both flows mint a token.", }, + // BUG-API-020 (QA 2026-05-29): the `invalid_token` code is emitted from + // 9 handler sites — webhook receiver path token (webhook.go:528/811), + // invitation token (teams.go:170/248), storage URL path token + // (storage_presign.go:114), onboarding claim JWT (onboarding.go:92/282/294), + // stack manifest needs token (stack.go:539), deploy logs URL path token + // (logs.go:148). None of them are an INSTANODE_TOKEN. The pre-fix + // agent_action sent agents on the wrong remediation path ("have the user + // log in") for every one of those surfaces. The new copy stays neutral — + // the token in the URL path or claim JWT — and points at the docs page + // covering both shapes. The auth/Bearer 401 path is unaffected (it lives + // at middleware/auth.go:61 in `unauthorizedAgentAction`, which still + // names INSTANODE_TOKEN because there the wording is correct). "invalid_token": { - AgentAction: "Tell the user their INSTANODE_TOKEN is invalid or expired. Have them log in at https://instanode.dev/login to mint a new one.", + AgentAction: "Tell the user the supplied token is invalid or expired. URL path tokens must be a valid UUID returned by a provision response (POST /db/new, /webhook/new, /storage/new etc.); onboarding claim JWTs come from anonymous provision flows — see https://instanode.dev/docs.", }, "missing_token": { AgentAction: "Tell the user no INSTANODE_TOKEN was provided. Have them log in at https://instanode.dev/login and pass it via Authorization: Bearer .", @@ -256,6 +268,16 @@ var codeToAgentAction = map[string]errorCodeMeta{ "webhook_inactive": { AgentAction: "Tell the user this webhook token has expired or been deactivated. Have them provision a fresh one with POST https://instanode.dev/webhook/new.", }, + // BUG-API-423 (QA 2026-05-29): /webhook/receive/:token's 404 used to + // emit the generic `not_found` code which agents grepping for the + // specific surface couldn't disambiguate from any other route 404. + // `webhook_not_found` makes the surface explicit so a webhook sender + // can distinguish "I have the wrong URL" from "I'm hitting a + // completely unrelated 404". Same wire shape (400/404 status, + // message body unchanged) — only the code keyword is sharper. + "webhook_not_found": { + AgentAction: "Tell the user this webhook token does not exist. Confirm the URL path token is the one returned by POST https://instanode.dev/webhook/new — anon resources also auto-expire after 24h.", + }, "resource_not_found": { AgentAction: "Tell the user this resource no longer exists — anonymous resources auto-expire after 24h. Have them provision a fresh one at https://instanode.dev/docs/quickstart.", }, diff --git a/internal/handlers/webhook.go b/internal/handlers/webhook.go index 4b061133..709824cb 100644 --- a/internal/handlers/webhook.go +++ b/internal/handlers/webhook.go @@ -529,11 +529,18 @@ func (h *WebhookHandler) Receive(c *fiber.Ctx) error { } // Look up the resource to ensure it exists and is active. + // BUG-API-423 (QA 2026-05-29): both the "row missing" and the "row + // exists but is a different resource type" branches return the same + // surface-specific code (`webhook_not_found`) so a webhook sender + // can branch on the code without parsing the message string. The + // shared code is intentional — we MUST NOT confirm whether the + // token belongs to a different resource type, since that would let + // a probe enumerate resources by token shape. resource, err := models.GetResourceByToken(ctx, h.db, tokenUUID) if err != nil { var notFound *models.ErrResourceNotFound if errors.As(err, ¬Found) { - return respondError(c, fiber.StatusNotFound, "not_found", "Webhook token not found") + return respondError(c, fiber.StatusNotFound, "webhook_not_found", "Webhook token not found") } slog.Error("webhook.receive.lookup_failed", "error", err, "token", tokenStr, "request_id", requestID) @@ -546,7 +553,7 @@ func (h *WebhookHandler) Receive(c *fiber.Ctx) error { // genuinely missing token — never confirm the token belongs to a // different resource type). if resource.ResourceType != models.ResourceTypeWebhook { - return respondError(c, fiber.StatusNotFound, "not_found", "Webhook token not found") + return respondError(c, fiber.StatusNotFound, "webhook_not_found", "Webhook token not found") } if resStatus, _ := resourcestatus.Parse(resource.Status); !resStatus.IsActive() { diff --git a/internal/router/healthz_test.go b/internal/router/healthz_test.go index c8b5af8b..36914039 100644 --- a/internal/router/healthz_test.go +++ b/internal/router/healthz_test.go @@ -54,6 +54,11 @@ func TestHealthzShape(t *testing.T) { "migration_version": m.PublicVersion(), "migration_count": m.Count, "migration_status": m.Status, + // BUG-API-417 (QA 2026-05-29): mirror the router.go addition + // of the live `now` server timestamp. Keeps the in-process + // fixture aligned with prod so a future deletion of the + // router.go emit also fails here. + "now": time.Now().UTC().Format("2006-01-02T15:04:05.000Z"), }) }) @@ -81,6 +86,22 @@ func TestHealthzShape(t *testing.T) { require.Equal(t, buildinfo.BuildTime, got["build_time"]) require.Equal(t, buildinfo.Version, got["version"]) + // BUG-API-417: `now` is the wall-clock the server emits so canaries + // can detect clock skew between their host and the api pod without + // an extra round trip. Format pinned to RFC 3339 with millisecond + // precision (matches audit-log + forwarder_sent rows) and the value + // must parse back to a time within a generous 5-second window of + // the test's own clock (sqlmock + fiber.Test is in-process so the + // drift is microseconds in practice). + require.NotEmpty(t, got["now"], "BUG-API-417: /healthz must emit a server `now` timestamp so clients can detect clock skew") + nowStr, ok := got["now"].(string) + require.True(t, ok, "BUG-API-417: `now` must be a JSON string (RFC 3339)") + parsedNow, parseErr := time.Parse("2006-01-02T15:04:05.000Z", nowStr) + require.NoError(t, parseErr, "BUG-API-417: `now` must parse as `2006-01-02T15:04:05.000Z` (RFC 3339 with ms); got %q", nowStr) + drift := time.Since(parsedNow) + require.Less(t, drift.Abs(), 5*time.Second, + "BUG-API-417: `now` must be within 5s of the test's wall clock (UTC); got drift=%s", drift) + // Migration contract — new fields the canary reads to detect drift // between binary commit and DB schema state. BUG-API-090/217: the // public envelope emits the numeric prefix only ("022"), never the diff --git a/internal/router/router.go b/internal/router/router.go index cde72e6b..44cbe55e 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -7,6 +7,7 @@ import ( "errors" "log/slog" "strings" + "time" "github.com/gofiber/contrib/otelfiber/v2" "github.com/gofiber/fiber/v2" @@ -357,6 +358,15 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid // the numeric prefix only via State.PublicVersion so canaries // keep their commit_id+count+version tuple but no domain // knowledge leaks. migration_count + migration_status unchanged. + // BUG-API-417 (QA 2026-05-29): include the server's current wall + // clock in /healthz so canaries / SDKs / agents can detect clock + // skew between their host and the api pod without an extra round + // trip. RFC 3339 with millisecond precision matches the format + // used in audit-log and forwarder_sent rows; `now` is sourced + // from time.Now().UTC() so the value is unambiguous regardless + // of the pod's local TZ. build_time is the immutable image stamp; + // `now` is the live read — keeping both lets a probe compute the + // pod's uptime as a sanity check too. return c.JSON(fiber.Map{ "ok": true, "service": "instant.dev", @@ -366,6 +376,7 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid "migration_version": mstate.PublicVersion(), "migration_count": mstate.Count, "migration_status": mstate.Status, + "now": time.Now().UTC().Format("2006-01-02T15:04:05.000Z"), }) })