diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml
index 795d3d9..a7d0c84 100644
--- a/k8s/prometheus-rules.yaml
+++ b/k8s/prometheus-rules.yaml
@@ -280,6 +280,71 @@ spec:
               are fine; a sustained rate means the reap path itself is broken. Check
               instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines.
 
+    # instant-* — code-defect signals (BugBash 2026-05-20).
+    # Both counters are incremented by the safego.Go wrapper's deferred
+    # recover() when a panic would otherwise crash a background goroutine.
+    # Recovered panics keep the pod up, but they ALMOST ALWAYS indicate a
+    # real code defect that escaped the test suite. Page on any occurrence.
+    #
+    # Each operand carries `or vector(0)` because PromQL `+` yields nothing
+    # when either side has no series; without it a missing api or worker
+    # counter would silence the alert. `for: 0m` because a single panic keeps
+    # rate(...[5m]) above zero for just under 5m, so a 5m hold never fires.
+    - name: instant-code-defects
+      rules:
+        - alert: GoroutinePanicsRecovered
+          expr: |
+            (sum(rate(instant_goroutine_panics_total[5m])) or vector(0))
+            + (sum(rate(instant_worker_goroutine_panics_recovered_total[5m])) or vector(0)) > 0
+          for: 0m
+          labels:
+            severity: critical
+            service: platform
+          annotations:
+            summary: "instant-* recovered a goroutine panic — code defect shipped to prod"
+            description: |
+              instant_goroutine_panics_total (api) + instant_worker_goroutine_panics_recovered_total (worker)
+              > 0 over the last 5m. Some goroutine panicked and the safego.Go wrapper caught it. The
+              pod stayed up, but the panic almost certainly indicates a missed error path or
+              nil-deref shipped past the test gates. Grep NR Logs for `safego.panic_recovered`
+              within the same time window to find the stack trace; fix the root cause and ship.
+
+    # instant-worker — entitlement_regrade_failed > 0 (BugBash 2026-05-20).
+    # The entitlement_reconciler job calls provisioner.RegradeResource() to
+    # raise a tier-drifted resource's backend limits to the team's current
+    # plan tier. A failure here = a paying customer is still on lower-tier
+    # backend limits despite paying for the higher tier. Pair with the
+    # billing-charge-undeliverable alert (inverse failure mode: tier-not-
+    # translated-to-DB).
+    #
+    # The counter's per-backend `service` label is copied to `backend` before
+    # aggregating. The static `service: worker` alert label overwrites any
+    # `service` label on the result, so grouping by `service` would collapse
+    # every backend into one labelset and fail rule evaluation as soon as two
+    # backends fail in the same window.
+    - name: instant-worker-entitlements
+      rules:
+        - alert: EntitlementRegradeFailed
+          expr: |
+            sum by (backend) (
+              label_replace(
+                rate(instant_entitlement_regrade_failed_total[10m]),
+                "backend", "$1", "service", "(.*)"
+              )
+            ) > 0
+          for: 10m
+          labels:
+            severity: critical
+            service: worker
+          annotations:
+            summary: "entitlement_regrade_failed > 0 — paying customer on wrong tier limits"
+            description: |
+              instant_entitlement_regrade_failed_total > 0 for >10m. The entitlement_reconciler
+              failed to call provisioner.RegradeResource() to raise a resource's backend limits
+              to match the team's current paid tier. A paying customer is getting lower-tier
+              infrastructure. Grep worker logs for `jobs.entitlement_reconciler.regrade_failed`;
+              pair with billing-charge-undeliverable (inverse: tier not translated to DB at all).
+
     # CHAOS F1 (2026-05-20) — propagation_runner used to silently mark APPLIED on
     # any row whose target resource was missing/in an unexpected state. The fix
     # added unexpected_skip counting AND treats those rows as Failure (counts
diff --git a/k8s/website.yaml b/k8s/website.yaml
index 7507b1f..962d46a 100644
--- a/k8s/website.yaml
+++ b/k8s/website.yaml
@@ -8,10 +8,13 @@ metadata:
   name: instant-website-config
   namespace: instant
 data:
-  # Override this for production: https://instant.dev
-  # For local k8s (Rancher Desktop / minikube): use the API's NodePort
-  # The API is exposed on NodePort 30080 by default (see app.yaml)
-  API_BASE_URL: "http://localhost:30080"
+  # Override this for production: https://api.instanode.dev
+  # For local k8s (Rancher Desktop / minikube): port-forward the API:
+  #   kubectl port-forward -n instant svc/instant-api 8080:8080
+  # NOTE: the legacy NodePort 30080 was retired 2026-05-11 — instant-api
+  # Service is now ClusterIP only (see app.yaml). Older ConfigMap values
+  # of http://localhost:30080 will fail to connect after re-apply.
+ API_BASE_URL: "http://localhost:8080" --- apiVersion: apps/v1 diff --git a/newrelic/alerts/entitlement-regrade-failed.json b/newrelic/alerts/entitlement-regrade-failed.json new file mode 100644 index 0000000..2d8f66e --- /dev/null +++ b/newrelic/alerts/entitlement-regrade-failed.json @@ -0,0 +1,31 @@ +{ + "name": "instant-worker — entitlement_regrade_failed > 0 [tier change not delivered to backend]", + "type": "NRQL", + "description": "Fires on ANY increment of `instant_entitlement_regrade_failed_total`. The entitlement_reconciler job notices when a resource.tier is below the team's current plan tier (post-upgrade drift) and calls provisioner.RegradeResource() to bring the backend (Postgres dedicated pod / Redis ACL / Mongo user / NATS account) up to the paid limits. A failure here means a CUSTOMER who already paid for a higher tier is still running on the lower-tier backend — they have NOT received what they paid for. This is a paying-customer-impact signal. Pair with billing-charge-undeliverable (which catches the inverse: tier-not-translated-to-DB) — both surface the same problem class. Source: worker/internal/jobs/entitlement_reconciler.go; counter labelled by service (postgres/redis/mongo/queue). 
BugBash 2026-05-20.", + "enabled": true, + "nrql": { + "query": "SELECT sum(instant_entitlement_regrade_failed_total) FROM Metric FACET service" + }, + "terms": [ + { + "priority": "CRITICAL", + "operator": "ABOVE", + "threshold": 0, + "thresholdDuration": 300, + "thresholdOccurrences": "AT_LEAST_ONCE" + } + ], + "signal": { + "aggregationWindow": 60, + "aggregationMethod": "EVENT_FLOW", + "aggregationDelay": 120, + "fillOption": "STATIC", + "fillValue": 0 + }, + "expiration": { + "expirationDuration": 3600, + "openViolationOnExpiration": false, + "closeViolationsOnExpiration": true + }, + "violationTimeLimitSeconds": 86400 +} diff --git a/newrelic/alerts/goroutine-panics-recovered.json b/newrelic/alerts/goroutine-panics-recovered.json new file mode 100644 index 0000000..0b32b58 --- /dev/null +++ b/newrelic/alerts/goroutine-panics-recovered.json @@ -0,0 +1,31 @@ +{ + "name": "instant-* — goroutine_panics_recovered > 0 [code defect, panic recovered but flagged]", + "type": "NRQL", + "description": "Page on ANY increment of `instant_goroutine_panics_total` (api) or `instant_worker_goroutine_panics_recovered_total` (worker). Both counters are incremented by the safego.Go() wrapper when a deferred recover() catches a panic that would otherwise crash a background goroutine. Recovered panics keep the pod up but ALMOST ALWAYS indicate a real code defect — a nil map access, a divide-by-zero, or a missed error path that should have been a regular return. Each occurrence is a signal that the code shipped with a bug the test suite didn't catch. Threshold is ABOVE 0 (any single panic pages) — but with thresholdDuration=300s + aggregationWindow=60s so a one-off panic with no recurrence still notifies the operator without alert-storming. Source: common/safego package; counters in api/internal/metrics/metrics.go + worker/internal/metrics/metrics.go. 
BugBash 2026-05-20.",
+  "enabled": true,
+  "nrql": {
+    "query": "SELECT sum(instant_goroutine_panics_total) + sum(instant_worker_goroutine_panics_recovered_total) FROM Metric"
+  },
+  "terms": [
+    {
+      "priority": "CRITICAL",
+      "operator": "ABOVE",
+      "threshold": 0,
+      "thresholdDuration": 300,
+      "thresholdOccurrences": "AT_LEAST_ONCE"
+    }
+  ],
+  "signal": {
+    "aggregationWindow": 60,
+    "aggregationMethod": "EVENT_FLOW",
+    "aggregationDelay": 120,
+    "fillOption": "STATIC",
+    "fillValue": 0
+  },
+  "expiration": {
+    "expirationDuration": 3600,
+    "openViolationOnExpiration": false,
+    "closeViolationsOnExpiration": true
+  },
+  "violationTimeLimitSeconds": 86400
+}
diff --git a/prometheus/alert-rules.yml b/prometheus/alert-rules.yml
index 710b54b..0d0421e 100644
--- a/prometheus/alert-rules.yml
+++ b/prometheus/alert-rules.yml
@@ -193,3 +193,70 @@ groups:
           key, 401 from Brevo/Razorpay) for 10 consecutive minutes. Pod stays
           in rotation but the upstream call path is broken. Curl the pod's
           /readyz body for last_error and rotate the relevant secret.
+
+      # ── Code defects (BugBash 2026-05-20) ──────────────────────────────────────
+      #
+      # Either counter ticks when safego.Go's deferred recover() catches a
+      # panic that would otherwise crash a background goroutine. Pod stays
+      # up, but a recovered panic almost always indicates a real code defect
+      # — nil-deref, missing error path, divide-by-zero — that escaped the
+      # test gates. Page on any occurrence.
+      #
+      # Each operand carries `or vector(0)` because PromQL `+` yields nothing
+      # when either side has no series; without it a missing api or worker
+      # counter would silence the alert. `for: 0m` because a single panic keeps
+      # rate(...[5m]) above zero for just under 5m, so a 5m hold never fires.
+      - alert: GoroutinePanicsRecovered
+        expr: |
+          (sum(rate(instant_goroutine_panics_total[5m])) or vector(0))
+          + (sum(rate(instant_worker_goroutine_panics_recovered_total[5m])) or vector(0)) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "instant-* recovered a goroutine panic — code defect shipped"
+          description: >
+            A goroutine panicked and the safego.Go wrapper caught it; the pod
+            stayed up but a panic almost always indicates a real bug. Grep
+            NR Logs for `safego.panic_recovered` within the same window to
+            find the stack trace.
+
+      # ── Brevo send-side error spike (BugBash 2026-05-20) ──────────────────────
+      #
+      # SEND-side counter for the email pipeline (worker → Brevo API). When
+      # this spikes, the messages NEVER queue at Brevo — so the receive-side
+      # delivery-ratio alert is silent (no provider_message_id to track).
+      # The two alerts together bound the pipeline.
+      - alert: BrevoSendErrorsSpike
+        expr: sum(rate(brevo_send_errors_total[5m])) * 300 > 5
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "brevo_send_errors_total > 5 in 5m — outbound email failing"
+          description: >
+            worker → Brevo /v3/smtp/email POSTs are returning non-2xx at
+            elevated rate. Common causes: API-key revoked, sender domain
+            dropped from validated list, IP blocklisted, sustained 429.
+            Check worker logs for `brevo.send_failed` lines for the
+            upstream HTTP status + body.
+
+      # ── Entitlement regrade failures (BugBash 2026-05-20) ─────────────────────
+      #
+      # A failure here means a CUSTOMER WHO PAID is still on the lower-tier
+      # backend. Pair with billing-charge-undeliverable (inverse failure:
+      # tier-not-translated-to-DB).
+      - alert: EntitlementRegradeFailed
+        expr: sum by (service) (rate(instant_entitlement_regrade_failed_total[10m])) > 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "entitlement_regrade_failed > 0 for {{ $labels.service }} — paying customer on under-tier infra"
+          description: >
+            entitlement_reconciler detected resource.tier < team.plan_tier
+            and called provisioner.RegradeResource(), which failed. A
+            customer paid for higher-tier limits but their backend is still
+            capped at the lower tier. Check provisioner logs + the tenant's
+            resource.provider_resource_id; replay the regrade after the
+            root issue is resolved.