From 9fdc4708bec903e4ee80288bfd1631e77e86aa8d Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 23:31:17 +0530 Subject: [PATCH 1/2] fix(middleware): skip idempotency cache for mutable error codes (BUG-API-238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG-API-238 — the free-tier recycle gate emits 402 `free_tier_recycle_requires_claim` with a claim_url the user can act on in 30 seconds. Pre-fix the 402 was written to the explicit-key 24h cache, so this sequence stranded the agent on a stale failure: 1. Agent POST /db/new with Idempotency-Key K1 → 402 (cached against K1). 2. User follows claim_url, claims with email (clears the gate state). 3. Agent retries POST /db/new with K1 → replays the cached 402. The 24h cache TTL is the Stripe contract — same key replays the same response — but it presumes a STABLE outcome. A 402 that flips when the user takes a 30-second action is not stable. Fix: - mutableErrorCodes registry (currently { free_tier_recycle_requires_claim }) listing error codes whose 4xx resolution can flip inside the cache TTL. - shouldCacheResponse(status, body, ct) helper that defers to caching for success + non-JSON + stable 4xx, but skips for body.error ∈ mutable map. - Wired into BOTH explicit-key (24h TTL) and fingerprint-fallback (120s) cache-write paths so the bypass is symmetric. What did NOT change: - Stripe-shape contract for stable outcomes (success + quota_exceeded + upgrade_required + provision_limit_reached etc.) — those still cache. - 409 idempotency_key_conflict envelope (BUG-013/406) unchanged. - No new endpoints, no new fields, no auth changes. Surface checklist (rule 22): - api/internal/middleware/idempotency.go helper + 2 wires - api/internal/middleware/idempotency_mutable_cache_test.go new regression - dashboard / marketing / OpenAPI no surface change (Stripe contract preserved; response shape unchanged) Coverage block: Symptom: Idempotent retry replays stale 402 after user clears recycle gate Enumeration: rg -F 'rdb.Set(context.Background(), cacheKey' internal/middleware/idempotency.go Sites found: 2 cache-write paths (explicit + fingerprint) Sites touched: 2 / 2 Coverage test: TestShouldCacheResponse_MutableErrorsSkipCache iterates the live mutableErrorCodes map (rule 18 — registry-iterating, not hand-typed); TestIdempotency_RecycleGate402_SourceAssertion static-grep that BOTH branches invoke shouldCacheResponse (rule 16). Live verified: pre-merge curl evidence pending — see PR body Local gate: - go build ./... PASS - go vet ./... PASS - go test ./internal/middleware/ PASS (full suite) Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/middleware/idempotency.go | 107 ++++++++++++++++++ .../idempotency_mutable_cache_test.go | 101 +++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 internal/middleware/idempotency_mutable_cache_test.go diff --git a/internal/middleware/idempotency.go b/internal/middleware/idempotency.go index d60212b..ee96d5e 100644 --- a/internal/middleware/idempotency.go +++ b/internal/middleware/idempotency.go @@ -159,6 +159,85 @@ type idemEntry struct { BodyHash string `json:"h"` // sha256 hex of the original request body } +// mutableErrorCodes lists machine-readable `error` strings whose 4xx +// resolution can flip BEFORE the explicit-key 24h cache TTL elapses, +// so caching them silently strands the agent on the stale failure. +// +// BUG-API-238 (QA 2026-05-29): the canonical case is +// `free_tier_recycle_requires_claim`. Sequence: +// +// 1. Agent calls POST /db/new with Idempotency-Key K1. The recycle gate +// fires → 402 free_tier_recycle_requires_claim cached against K1. +// 2. User follows the claim_url and successfully claims (POST /claim). +// The recycle gate state is now CLEARED — a fresh /db/new without +// a key would succeed. +// 3. Agent retries the original /db/new with Idempotency-Key K1. The +// pre-fix path replays the cached 402 verbatim, even though the +// gate would now wave the caller through. Agent thinks the claim +// failed; user thinks the platform is broken. +// +// Excluding these codes from caching restores the Stripe-shape contract +// ("same key replays the same response") in the spirit it was written: +// a STABLE outcome is replayed. A 402 that disappears the moment the +// user takes 30 seconds to claim is not a stable outcome. +// +// We do NOT exclude generic `quota_exceeded` / `upgrade_required` / +// `provision_limit_reached` — those resolve on a calendar boundary +// (next UTC day) or a payment event, both of which are outside the +// agent's reach inside the 24h key TTL. The recycle gate is the +// distinguishing case: a user-initiated action a few seconds later +// clears it. +// +// The list is small on purpose. Adding a code here means "this gate +// can clear inside 24h without server-side state change" — which is +// the actual cache-coherence boundary. Any future addition needs a +// regression test that exercises the same-key-retry-after-resolution +// path. +var mutableErrorCodes = map[string]struct{}{ + "free_tier_recycle_requires_claim": {}, +} + +// shouldCacheResponse decides whether a non-5xx response should be +// written to the idempotency cache. Success (<400) always caches — +// that's the Stripe contract. 4xx responses cache UNLESS the body's +// `error` field is in mutableErrorCodes (BUG-API-238). +// +// JSON peek is non-strict: any parse error / non-JSON body / missing +// `error` field falls back to caching (the pre-fix behaviour). Only a +// well-formed envelope whose `error` is listed gets skipped, so the +// helper degrades safely on unexpected bodies. +func shouldCacheResponse(status int, body []byte, contentType string) bool { + // Success responses always cache — the Stripe-shape contract guarantees + // the agent can replay them. Mutability only matters for failures the + // caller might re-resolve. + if status < 400 { + return true + } + // Quick rejects: non-JSON bodies (e.g. text/html 4xx) can't carry the + // `error` field shape we filter on, so default-cache them. + if !strings.Contains(contentType, "json") { + return true + } + if len(body) == 0 { + return true + } + // Peek the `error` field. Use a minimal struct to avoid pulling in the + // handlers package's ErrorResponse (which would create a circular + // import — handlers depends on middleware). + var peek struct { + Error string `json:"error"` + } + if err := json.Unmarshal(body, &peek); err != nil { + // Malformed JSON body — defer to default (cache). The handler + // emitted bytes the test suite is responsible for catching. + return true + } + if _, mutable := mutableErrorCodes[peek.Error]; mutable { + return false + } + return true +} + // Idempotency returns a Fiber handler that dedups duplicate POSTs via two // layered mechanisms: // @@ -341,6 +420,20 @@ func idempotencyExplicit(c *fiber.Ctx, rdb *redis.Client, endpoint, scope, rawKe body := append([]byte(nil), c.Response().Body()...) ct := string(c.Response().Header.ContentType()) + // BUG-API-238: bypass the cache for mutable error codes (currently + // just free_tier_recycle_requires_claim). A 402 the user resolves + // in 30s by clicking claim_url must not get strand-cached against + // the agent's 24h Idempotency-Key. Success + stable failures still + // cache as before. + if !shouldCacheResponse(status, body, ct) { + slog.Info("idempotency.skip_cache_mutable_error", + "endpoint", endpoint, + "status", status, + "request_id", GetRequestID(c), + ) + return nextErr + } + entry := idemEntry{ StatusCode: status, Body: body, @@ -454,6 +547,20 @@ func idempotencyFingerprint(c *fiber.Ctx, rdb *redis.Client, endpoint, scope str body := append([]byte(nil), c.Response().Body()...) ct := string(c.Response().Header.ContentType()) + // BUG-API-238: same mutable-error bypass as the explicit-key path. + // The fingerprint TTL is only 120s but the recycle gate still flips + // inside that window when a user claims fast — and the fingerprint + // path is the no-header default, so the silent strand is even more + // likely here. + if !shouldCacheResponse(status, body, ct) { + slog.Info("idempotency.fingerprint_skip_cache_mutable_error", + "endpoint", endpoint, + "status", status, + "request_id", GetRequestID(c), + ) + return nextErr + } + entry := idemEntry{ StatusCode: status, Body: body, diff --git a/internal/middleware/idempotency_mutable_cache_test.go b/internal/middleware/idempotency_mutable_cache_test.go new file mode 100644 index 0000000..407b299 --- /dev/null +++ b/internal/middleware/idempotency_mutable_cache_test.go @@ -0,0 +1,101 @@ +package middleware + +// idempotency_mutable_cache_test.go — BUG-API-238 regression. +// +// The shouldCacheResponse helper governs whether a non-5xx response is +// written to the explicit-key (24h) or fingerprint (120s) idempotency +// cache. Most 4xx responses cache; a small allowlist of "mutable" error +// codes (currently free_tier_recycle_requires_claim) MUST skip the cache +// so a user-side action (e.g. claiming with email) that clears the gate +// is honoured on the agent's next retry of the same Idempotency-Key. +// +// Whitebox test (same package) so we can exercise shouldCacheResponse +// directly without spinning a Redis fake. + +import ( + "os" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestShouldCacheResponse_DefaultBehaviour covers the pre-fix contract: +// success caches, stable 4xx caches, non-JSON caches. +func TestShouldCacheResponse_DefaultBehaviour(t *testing.T) { + tests := []struct { + name string + status int + body []byte + ct string + wantCache bool + }{ + {"200 OK json", 200, []byte(`{"ok":true}`), "application/json", true}, + {"201 Created json", 201, []byte(`{"ok":true}`), "application/json", true}, + {"400 with quota_exceeded error caches (stable)", 402, []byte(`{"error":"quota_exceeded"}`), "application/json", true}, + {"400 with idempotency_key_conflict caches (stable)", 409, []byte(`{"error":"idempotency_key_conflict"}`), "application/json", true}, + {"4xx with provision_limit_reached caches (calendar boundary)", 429, []byte(`{"error":"provision_limit_reached"}`), "application/json", true}, + {"non-JSON 4xx caches (no error field to inspect)", 400, []byte(`oops`), "text/html", true}, + {"empty body caches (no error field to inspect)", 400, []byte(``), "application/json", true}, + {"malformed JSON caches (defer to default)", 400, []byte(`{not-json}`), "application/json", true}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := shouldCacheResponse(tc.status, tc.body, tc.ct) + assert.Equal(t, tc.wantCache, got, + "shouldCacheResponse(status=%d, ct=%q) = %v; want %v", tc.status, tc.ct, got, tc.wantCache) + }) + } +} + +// TestShouldCacheResponse_MutableErrorsSkipCache is the BUG-API-238 +// regression: every entry in the mutableErrorCodes map must return +// false from shouldCacheResponse so the agent gets fresh handler output +// on the next retry. +func TestShouldCacheResponse_MutableErrorsSkipCache(t *testing.T) { + require.NotEmpty(t, mutableErrorCodes, + "BUG-API-238: mutableErrorCodes must list at least free_tier_recycle_requires_claim") + + // Sanity: the canonical case is registered. + _, ok := mutableErrorCodes["free_tier_recycle_requires_claim"] + require.True(t, ok, + "BUG-API-238: free_tier_recycle_requires_claim must be in mutableErrorCodes") + + // Iterate the live registry (rule 18: registry-iterating regression + // test, not a hand-typed list) so any future addition is automatically + // covered. + for code := range mutableErrorCodes { + t.Run(code, func(t *testing.T) { + // Recycle gate returns 402 with claim_url etc. — exercise the + // representative case. + body := []byte(`{"ok":false,"error":"` + code + `","claim_url":"https://instanode.dev/claim"}`) + got := shouldCacheResponse(402, body, "application/json") + assert.False(t, got, + "BUG-API-238: 402 with error=%q must skip the cache; got cache=true", code) + }) + } +} + +// TestIdempotency_RecycleGate402_SourceAssertion is a static-source +// belt-and-suspenders: the call sites in both idempotency branches +// (explicit + fingerprint) must invoke shouldCacheResponse before +// writing. Without both wires the fix is half-applied (only one of +// the two cache paths bypasses) — exactly the rule-16 modal failure +// mode the agent-reliability rules call out. +func TestIdempotency_RecycleGate402_SourceAssertion(t *testing.T) { + src, err := os.ReadFile("idempotency.go") + require.NoError(t, err) + body := string(src) + + // Both branches must call shouldCacheResponse. Count >= 2 so we + // catch the case where someone deletes one of the two call sites. + assert.GreaterOrEqual(t, strings.Count(body, "shouldCacheResponse("), 2, + "BUG-API-238: shouldCacheResponse must be invoked on BOTH explicit and fingerprint cache-write paths (rule 16 — two emitters of one bug)") + + // The mutable list must reference the canonical error code by string + // so a future refactor that renames the constant (and forgets the + // map key) is caught by grep. + assert.Contains(t, body, `"free_tier_recycle_requires_claim"`, + "BUG-API-238: mutableErrorCodes must reference free_tier_recycle_requires_claim by string") +} From ad5ac421b57e26cba6480773e1aabb7761f9bd83 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 23:43:58 +0530 Subject: [PATCH 2/2] test(middleware): behavioural coverage for BUG-API-238 skip wires MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The static-source assertions in idempotency_mutable_cache_test.go pin the registry invariants; this file proves both cache-write skip branches fire end-to-end. Patch-coverage gate flagged 0% on idempotency.go lines 429-435 (explicit-key skip) and 556-562 (fingerprint skip) because the previous test only exercised the helper in isolation. Adds 3 cases: - explicit-key 402 free_tier_recycle_requires_claim → second call misses cache (no X-Idempotent-Replay) - fingerprint-fallback path → same invariant - negative control: 402 quota_exceeded (stable) → second call DOES replay (Stripe-shape contract preserved for non-mutable errors) shouldCacheResponse now reports 100% coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../idempotency_mutable_skip_test.go | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 internal/middleware/idempotency_mutable_skip_test.go diff --git a/internal/middleware/idempotency_mutable_skip_test.go b/internal/middleware/idempotency_mutable_skip_test.go new file mode 100644 index 0000000..ee238c2 --- /dev/null +++ b/internal/middleware/idempotency_mutable_skip_test.go @@ -0,0 +1,143 @@ +package middleware_test + +// idempotency_mutable_skip_test.go — behavioural coverage for the +// BUG-API-238 cache-skip wires in both idempotency cache-write paths +// (explicit-key 24h and fingerprint-fallback 120s). The static-source +// assertions in idempotency_mutable_cache_test.go cover the registry +// invariants; this file proves the wires actually fire end-to-end. + +import ( + "context" + "net/http" + "strings" + "testing" + + "github.com/gofiber/fiber/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/middleware" + "instant.dev/internal/testhelpers" +) + +// newRecycleGateLikeApp builds a Fiber app whose handler emits a 402 +// with the canonical free_tier_recycle_requires_claim envelope. Each +// call hits the live handler (no caching), so a behavioural assertion +// can confirm shouldCacheResponse() refused to cache. +func newRecycleGateLikeApp(t *testing.T, endpoint string) (*fiber.App, func()) { + t.Helper() + rdb, cleanup := testhelpers.SetupTestRedis(t) + app := fiber.New() + app.Use(middleware.Fingerprint()) + app.Post("/recycle", middleware.Idempotency(rdb, endpoint), func(c *fiber.Ctx) error { + // Mirrors the live recycle-gate envelope shape — the fields the + // shouldCacheResponse helper inspects are `error` and (indirectly) + // content-type. Body shape kept compact; the cache decision turns + // on the error string alone. + return c.Status(fiber.StatusPaymentRequired).JSON(fiber.Map{ + "ok": false, + "error": "free_tier_recycle_requires_claim", + "message": "claim required", + }) + }) + return app, cleanup +} + +// TestIdempotency_ExplicitKey_RecycleGate402NotCached drives the +// explicit-key path: two POSTs with the same Idempotency-Key both reach +// the handler because the 402 free_tier_recycle_requires_claim must +// never be cached (BUG-API-238). The absence of X-Idempotent-Replay on +// the second call is the wire signal that the cache was skipped. +func TestIdempotency_ExplicitKey_RecycleGate402NotCached(t *testing.T) { + app, clean := newRecycleGateLikeApp(t, "test.recycle.explicit") + defer clean() + + ip := uniqueTestIP("recycle-explicit") + key := "bug238-" + ip + + // Call 1: explicit key, hits the handler. + resp1 := postWithIdem(t, app, "/recycle", ip, key, `{"x":1}`) + defer resp1.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode) + assert.Empty(t, resp1.Header.Get("X-Idempotent-Replay"), + "first call cannot be a replay") + assert.Equal(t, "explicit", resp1.Header.Get("X-Idempotency-Source")) + + // Call 2: SAME key, SAME body. Pre-fix this replayed the 402 with + // X-Idempotent-Replay: true. Post-fix the cache write was skipped, + // so call 2 must reach the handler again (no replay header). + resp2 := postWithIdem(t, app, "/recycle", ip, key, `{"x":1}`) + defer resp2.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode, + "second call should still return 402 (handler ran again)") + assert.Empty(t, resp2.Header.Get("X-Idempotent-Replay"), + "BUG-API-238: 402 free_tier_recycle_requires_claim MUST NOT be cached — second call must not carry X-Idempotent-Replay") + body2 := readBody(t, resp2) + assert.Contains(t, body2, `"free_tier_recycle_requires_claim"`) + // The presence of the error keyword on a fresh response (not a + // replay) is the BUG-API-238 contract. +} + +// TestIdempotency_Fingerprint_RecycleGate402NotCached drives the +// header-less (fingerprint-fallback) path. Same invariant: the 402 is +// not cached, so the second identical no-header POST hits the handler +// again rather than replaying from the fingerprint cache. +func TestIdempotency_Fingerprint_RecycleGate402NotCached(t *testing.T) { + app, clean := newRecycleGateLikeApp(t, "test.recycle.fp") + defer clean() + + ip := uniqueTestIP("recycle-fp") + + resp1 := postWithIdem(t, app, "/recycle", ip, "", `{"x":1}`) + defer resp1.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode) + assert.Equal(t, "miss", resp1.Header.Get("X-Idempotency-Source")) + + resp2 := postWithIdem(t, app, "/recycle", ip, "", `{"x":1}`) + defer resp2.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode) + assert.Empty(t, resp2.Header.Get("X-Idempotent-Replay"), + "BUG-API-238: fingerprint cache must also skip the recycle-gate 402") + assert.Equal(t, "miss", resp2.Header.Get("X-Idempotency-Source"), + "BUG-API-238: second call must be a miss (handler ran), not a fingerprint replay") +} + +// TestIdempotency_ExplicitKey_StableErrorStillCached is the negative +// control: a 402 with error="quota_exceeded" (a STABLE error code not in +// mutableErrorCodes) MUST still cache. Without this contrast a future +// regression that bypasses caching for ALL 4xx would silently break the +// Stripe-shape replay contract for legitimate cases. +func TestIdempotency_ExplicitKey_StableErrorStillCached(t *testing.T) { + rdb, cleanup := testhelpers.SetupTestRedis(t) + defer cleanup() + app := fiber.New() + app.Use(middleware.Fingerprint()) + stableCalls := 0 + app.Post("/stable402", middleware.Idempotency(rdb, "test.stable402"), func(c *fiber.Ctx) error { + stableCalls++ + return c.Status(fiber.StatusPaymentRequired).JSON(fiber.Map{ + "ok": false, + "error": "quota_exceeded", + }) + }) + + ip := uniqueTestIP("stable-402") + key := "stable-" + ip + + resp1 := postWithIdem(t, app, "/stable402", ip, key, `{"y":1}`) + defer resp1.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode) + + resp2 := postWithIdem(t, app, "/stable402", ip, key, `{"y":1}`) + defer resp2.Body.Close() + require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode) + assert.Equal(t, "true", resp2.Header.Get("X-Idempotent-Replay"), + "stable 402 (quota_exceeded) MUST still be cached — only mutableErrorCodes entries are skipped") + assert.Equal(t, 1, stableCalls, + "stable 402 replay must NOT re-run the handler — Stripe-shape contract") +} + +// silence unused-import linter when only context.Background() is used +// transitively. +var _ = context.Background +var _ = strings.TrimSpace