Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions internal/middleware/idempotency.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,85 @@ type idemEntry struct {
BodyHash string `json:"h"` // sha256 hex of the original request body
}

// mutableErrorCodes lists machine-readable `error` strings whose 4xx
// resolution can flip BEFORE the explicit-key 24h cache TTL elapses,
// so caching them silently strands the agent on the stale failure.
//
// BUG-API-238 (QA 2026-05-29): the canonical case is
// `free_tier_recycle_requires_claim`. Sequence:
//
// 1. Agent calls POST /db/new with Idempotency-Key K1. The recycle gate
// fires → 402 free_tier_recycle_requires_claim cached against K1.
// 2. User follows the claim_url and successfully claims (POST /claim).
// The recycle gate state is now CLEARED — a fresh /db/new without
// a key would succeed.
// 3. Agent retries the original /db/new with Idempotency-Key K1. The
// pre-fix path replays the cached 402 verbatim, even though the
// gate would now wave the caller through. Agent thinks the claim
// failed; user thinks the platform is broken.
//
// Excluding these codes from caching restores the Stripe-shape contract
// ("same key replays the same response") in the spirit it was written:
// a STABLE outcome is replayed. A 402 that disappears the moment the
// user takes 30 seconds to claim is not a stable outcome.
//
// We do NOT exclude generic `quota_exceeded` / `upgrade_required` /
// `provision_limit_reached` — those resolve on a calendar boundary
// (next UTC day) or a payment event, both of which are outside the
// agent's reach inside the 24h key TTL. The recycle gate is the
// distinguishing case: a user-initiated action a few seconds later
// clears it.
//
// The list is small on purpose. Adding a code here means "this gate
// can clear inside 24h without server-side state change" — which is
// the actual cache-coherence boundary. Any future addition needs a
// regression test that exercises the same-key-retry-after-resolution
// path.
var mutableErrorCodes = map[string]struct{}{
"free_tier_recycle_requires_claim": {},
}

// shouldCacheResponse decides whether a non-5xx response should be
// written to the idempotency cache. Success (<400) always caches —
// that's the Stripe contract. 4xx responses cache UNLESS the body's
// `error` field is in mutableErrorCodes (BUG-API-238).
//
// JSON peek is non-strict: any parse error / non-JSON body / missing
// `error` field falls back to caching (the pre-fix behaviour). Only a
// well-formed envelope whose `error` is listed gets skipped, so the
// helper degrades safely on unexpected bodies.
func shouldCacheResponse(status int, body []byte, contentType string) bool {
// Success responses always cache — the Stripe-shape contract guarantees
// the agent can replay them. Mutability only matters for failures the
// caller might re-resolve.
if status < 400 {
return true
}
// Quick rejects: non-JSON bodies (e.g. text/html 4xx) can't carry the
// `error` field shape we filter on, so default-cache them.
if !strings.Contains(contentType, "json") {
return true
}
if len(body) == 0 {
return true
}
// Peek the `error` field. Use a minimal struct to avoid pulling in the
// handlers package's ErrorResponse (which would create a circular
// import — handlers depends on middleware).
var peek struct {
Error string `json:"error"`
}
if err := json.Unmarshal(body, &peek); err != nil {
// Malformed JSON body — defer to default (cache). The handler
// emitted bytes the test suite is responsible for catching.
return true
}
if _, mutable := mutableErrorCodes[peek.Error]; mutable {
return false
}
return true
}

// Idempotency returns a Fiber handler that dedups duplicate POSTs via two
// layered mechanisms:
//
Expand Down Expand Up @@ -341,6 +420,20 @@ func idempotencyExplicit(c *fiber.Ctx, rdb *redis.Client, endpoint, scope, rawKe
body := append([]byte(nil), c.Response().Body()...)
ct := string(c.Response().Header.ContentType())

// BUG-API-238: bypass the cache for mutable error codes (currently
// just free_tier_recycle_requires_claim). A 402 the user resolves
// in 30s by clicking claim_url must not get strand-cached against
// the agent's 24h Idempotency-Key. Success + stable failures still
// cache as before.
if !shouldCacheResponse(status, body, ct) {
slog.Info("idempotency.skip_cache_mutable_error",
"endpoint", endpoint,
"status", status,
"request_id", GetRequestID(c),
)
return nextErr
}

entry := idemEntry{
StatusCode: status,
Body: body,
Expand Down Expand Up @@ -454,6 +547,20 @@ func idempotencyFingerprint(c *fiber.Ctx, rdb *redis.Client, endpoint, scope str
body := append([]byte(nil), c.Response().Body()...)
ct := string(c.Response().Header.ContentType())

// BUG-API-238: same mutable-error bypass as the explicit-key path.
// The fingerprint TTL is only 120s but the recycle gate still flips
// inside that window when a user claims fast — and the fingerprint
// path is the no-header default, so the silent strand is even more
// likely here.
if !shouldCacheResponse(status, body, ct) {
slog.Info("idempotency.fingerprint_skip_cache_mutable_error",
"endpoint", endpoint,
"status", status,
"request_id", GetRequestID(c),
)
return nextErr
}

entry := idemEntry{
StatusCode: status,
Body: body,
Expand Down
101 changes: 101 additions & 0 deletions internal/middleware/idempotency_mutable_cache_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package middleware

// idempotency_mutable_cache_test.go — BUG-API-238 regression.
//
// The shouldCacheResponse helper governs whether a non-5xx response is
// written to the explicit-key (24h) or fingerprint (120s) idempotency
// cache. Most 4xx responses cache; a small allowlist of "mutable" error
// codes (currently free_tier_recycle_requires_claim) MUST skip the cache
// so a user-side action (e.g. claiming with email) that clears the gate
// is honoured on the agent's next retry of the same Idempotency-Key.
//
// Whitebox test (same package) so we can exercise shouldCacheResponse
// directly without spinning a Redis fake.

import (
"os"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestShouldCacheResponse_DefaultBehaviour covers the pre-fix contract:
// success caches, stable 4xx caches, non-JSON caches.
func TestShouldCacheResponse_DefaultBehaviour(t *testing.T) {
tests := []struct {
name string
status int
body []byte
ct string
wantCache bool
}{
{"200 OK json", 200, []byte(`{"ok":true}`), "application/json", true},
{"201 Created json", 201, []byte(`{"ok":true}`), "application/json", true},
{"400 with quota_exceeded error caches (stable)", 402, []byte(`{"error":"quota_exceeded"}`), "application/json", true},
{"400 with idempotency_key_conflict caches (stable)", 409, []byte(`{"error":"idempotency_key_conflict"}`), "application/json", true},
{"4xx with provision_limit_reached caches (calendar boundary)", 429, []byte(`{"error":"provision_limit_reached"}`), "application/json", true},
{"non-JSON 4xx caches (no error field to inspect)", 400, []byte(`<html>oops</html>`), "text/html", true},
{"empty body caches (no error field to inspect)", 400, []byte(``), "application/json", true},
{"malformed JSON caches (defer to default)", 400, []byte(`{not-json}`), "application/json", true},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got := shouldCacheResponse(tc.status, tc.body, tc.ct)
assert.Equal(t, tc.wantCache, got,
"shouldCacheResponse(status=%d, ct=%q) = %v; want %v", tc.status, tc.ct, got, tc.wantCache)
})
}
}

// TestShouldCacheResponse_MutableErrorsSkipCache is the BUG-API-238
// regression: every entry in the mutableErrorCodes map must return
// false from shouldCacheResponse so the agent gets fresh handler output
// on the next retry.
func TestShouldCacheResponse_MutableErrorsSkipCache(t *testing.T) {
require.NotEmpty(t, mutableErrorCodes,
"BUG-API-238: mutableErrorCodes must list at least free_tier_recycle_requires_claim")

// Sanity: the canonical case is registered.
_, ok := mutableErrorCodes["free_tier_recycle_requires_claim"]
require.True(t, ok,
"BUG-API-238: free_tier_recycle_requires_claim must be in mutableErrorCodes")

// Iterate the live registry (rule 18: registry-iterating regression
// test, not a hand-typed list) so any future addition is automatically
// covered.
for code := range mutableErrorCodes {
t.Run(code, func(t *testing.T) {
// Recycle gate returns 402 with claim_url etc. — exercise the
// representative case.
body := []byte(`{"ok":false,"error":"` + code + `","claim_url":"https://instanode.dev/claim"}`)
got := shouldCacheResponse(402, body, "application/json")
assert.False(t, got,
"BUG-API-238: 402 with error=%q must skip the cache; got cache=true", code)
})
}
}

// TestIdempotency_RecycleGate402_SourceAssertion is a static-source
// belt-and-suspenders: the call sites in both idempotency branches
// (explicit + fingerprint) must invoke shouldCacheResponse before
// writing. Without both wires the fix is half-applied (only one of
// the two cache paths bypasses) — exactly the rule-16 modal failure
// mode the agent-reliability rules call out.
func TestIdempotency_RecycleGate402_SourceAssertion(t *testing.T) {
src, err := os.ReadFile("idempotency.go")
require.NoError(t, err)
body := string(src)

// Both branches must call shouldCacheResponse. Count >= 2 so we
// catch the case where someone deletes one of the two call sites.
assert.GreaterOrEqual(t, strings.Count(body, "shouldCacheResponse("), 2,
"BUG-API-238: shouldCacheResponse must be invoked on BOTH explicit and fingerprint cache-write paths (rule 16 — two emitters of one bug)")

// The mutable list must reference the canonical error code by string
// so a future refactor that renames the constant (and forgets the
// map key) is caught by grep.
assert.Contains(t, body, `"free_tier_recycle_requires_claim"`,
"BUG-API-238: mutableErrorCodes must reference free_tier_recycle_requires_claim by string")
}
143 changes: 143 additions & 0 deletions internal/middleware/idempotency_mutable_skip_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package middleware_test

// idempotency_mutable_skip_test.go — behavioural coverage for the
// BUG-API-238 cache-skip wires in both idempotency cache-write paths
// (explicit-key 24h and fingerprint-fallback 120s). The static-source
// assertions in idempotency_mutable_cache_test.go cover the registry
// invariants; this file proves the wires actually fire end-to-end.

import (
"context"
"net/http"
"strings"
"testing"

"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"instant.dev/internal/middleware"
"instant.dev/internal/testhelpers"
)

// newRecycleGateLikeApp builds a Fiber app whose handler emits a 402
// with the canonical free_tier_recycle_requires_claim envelope. Each
// call hits the live handler (no caching), so a behavioural assertion
// can confirm shouldCacheResponse() refused to cache.
func newRecycleGateLikeApp(t *testing.T, endpoint string) (*fiber.App, func()) {
t.Helper()
rdb, cleanup := testhelpers.SetupTestRedis(t)
app := fiber.New()
app.Use(middleware.Fingerprint())
app.Post("/recycle", middleware.Idempotency(rdb, endpoint), func(c *fiber.Ctx) error {
// Mirrors the live recycle-gate envelope shape — the fields the
// shouldCacheResponse helper inspects are `error` and (indirectly)
// content-type. Body shape kept compact; the cache decision turns
// on the error string alone.
return c.Status(fiber.StatusPaymentRequired).JSON(fiber.Map{
"ok": false,
"error": "free_tier_recycle_requires_claim",
"message": "claim required",
})
})
return app, cleanup
}

// TestIdempotency_ExplicitKey_RecycleGate402NotCached drives the
// explicit-key path: two POSTs with the same Idempotency-Key both reach
// the handler because the 402 free_tier_recycle_requires_claim must
// never be cached (BUG-API-238). The absence of X-Idempotent-Replay on
// the second call is the wire signal that the cache was skipped.
func TestIdempotency_ExplicitKey_RecycleGate402NotCached(t *testing.T) {
app, clean := newRecycleGateLikeApp(t, "test.recycle.explicit")
defer clean()

ip := uniqueTestIP("recycle-explicit")
key := "bug238-" + ip

// Call 1: explicit key, hits the handler.
resp1 := postWithIdem(t, app, "/recycle", ip, key, `{"x":1}`)
defer resp1.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode)
assert.Empty(t, resp1.Header.Get("X-Idempotent-Replay"),
"first call cannot be a replay")
assert.Equal(t, "explicit", resp1.Header.Get("X-Idempotency-Source"))

// Call 2: SAME key, SAME body. Pre-fix this replayed the 402 with
// X-Idempotent-Replay: true. Post-fix the cache write was skipped,
// so call 2 must reach the handler again (no replay header).
resp2 := postWithIdem(t, app, "/recycle", ip, key, `{"x":1}`)
defer resp2.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode,
"second call should still return 402 (handler ran again)")
assert.Empty(t, resp2.Header.Get("X-Idempotent-Replay"),
"BUG-API-238: 402 free_tier_recycle_requires_claim MUST NOT be cached — second call must not carry X-Idempotent-Replay")
body2 := readBody(t, resp2)
assert.Contains(t, body2, `"free_tier_recycle_requires_claim"`)
// The presence of the error keyword on a fresh response (not a
// replay) is the BUG-API-238 contract.
}

// TestIdempotency_Fingerprint_RecycleGate402NotCached drives the
// header-less (fingerprint-fallback) path. Same invariant: the 402 is
// not cached, so the second identical no-header POST hits the handler
// again rather than replaying from the fingerprint cache.
func TestIdempotency_Fingerprint_RecycleGate402NotCached(t *testing.T) {
app, clean := newRecycleGateLikeApp(t, "test.recycle.fp")
defer clean()

ip := uniqueTestIP("recycle-fp")

resp1 := postWithIdem(t, app, "/recycle", ip, "", `{"x":1}`)
defer resp1.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode)
assert.Equal(t, "miss", resp1.Header.Get("X-Idempotency-Source"))

resp2 := postWithIdem(t, app, "/recycle", ip, "", `{"x":1}`)
defer resp2.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode)
assert.Empty(t, resp2.Header.Get("X-Idempotent-Replay"),
"BUG-API-238: fingerprint cache must also skip the recycle-gate 402")
assert.Equal(t, "miss", resp2.Header.Get("X-Idempotency-Source"),
"BUG-API-238: second call must be a miss (handler ran), not a fingerprint replay")
}

// TestIdempotency_ExplicitKey_StableErrorStillCached is the negative
// control: a 402 with error="quota_exceeded" (a STABLE error code not in
// mutableErrorCodes) MUST still cache. Without this contrast a future
// regression that bypasses caching for ALL 4xx would silently break the
// Stripe-shape replay contract for legitimate cases.
func TestIdempotency_ExplicitKey_StableErrorStillCached(t *testing.T) {
rdb, cleanup := testhelpers.SetupTestRedis(t)
defer cleanup()
app := fiber.New()
app.Use(middleware.Fingerprint())
stableCalls := 0
app.Post("/stable402", middleware.Idempotency(rdb, "test.stable402"), func(c *fiber.Ctx) error {
stableCalls++
return c.Status(fiber.StatusPaymentRequired).JSON(fiber.Map{
"ok": false,
"error": "quota_exceeded",
})
})

ip := uniqueTestIP("stable-402")
key := "stable-" + ip

resp1 := postWithIdem(t, app, "/stable402", ip, key, `{"y":1}`)
defer resp1.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp1.StatusCode)

resp2 := postWithIdem(t, app, "/stable402", ip, key, `{"y":1}`)
defer resp2.Body.Close()
require.Equal(t, http.StatusPaymentRequired, resp2.StatusCode)
assert.Equal(t, "true", resp2.Header.Get("X-Idempotent-Replay"),
"stable 402 (quota_exceeded) MUST still be cached — only mutableErrorCodes entries are skipped")
assert.Equal(t, 1, stableCalls,
"stable 402 replay must NOT re-run the handler — Stripe-shape contract")
}

// silence unused-import linter when only context.Background() is used
// transitively.
var _ = context.Background
var _ = strings.TrimSpace
Loading