InstaNode-dev · mastermanas805 · May 21, 2026 · May 20, 2026
diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml
@@ -280,6 +280,55 @@ spec:
               are fine; a sustained rate means the reap path itself is broken. Check
               instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines.
 
+    # instant-* — code-defect signals (BugBash 2026-05-20).
+    # Both counters tick when safego.Go's deferred recover() catches a panic
+    # that would otherwise crash a background goroutine. The pod stays up, but
+    # it ALMOST ALWAYS means a real code defect escaped the test suite, so page
+    # on any occurrence. `or vector(0)` covers a counter that exposes no series.
+    - name: instant-code-defects
+      rules:
+        - alert: GoroutinePanicsRecovered
+          expr: |
+            (sum(increase(instant_goroutine_panics_total[10m])) or vector(0))
+              + (sum(increase(instant_worker_goroutine_panics_recovered_total[10m])) or vector(0)) > 0
+          for: 0m  # no hold — a single recovered panic must page
+          labels:
+            severity: critical
+            service: platform
+          annotations:
+            summary: "instant-* recovered a goroutine panic — code defect shipped to prod"
+            description: |
+              instant_goroutine_panics_total (api) + instant_worker_goroutine_panics_recovered_total (worker)
+              increased within the last 10m. Some goroutine panicked and the safego.Go wrapper caught it. The
+              pod stayed up, but the panic almost certainly indicates a missed error path or
+              nil-deref shipped past the test gates. Grep NR Logs for `safego.panic_recovered`
+              within the same time window to find the stack trace; fix the root cause and ship.
+
+    # instant-worker — entitlement_regrade_failed > 0 (BugBash 2026-05-20).
+    # entitlement_reconciler calls provisioner.RegradeResource() to raise a
+    # tier-drifted resource's backend limits to the team's plan tier; failure =
+    # paying customer on lower-tier limits. Pair with billing-charge-undeliverable
+    # (inverse: tier-not-translated-to-DB). The counter's `service` label is copied
+    # to `backend` because the static `service: worker` label would overwrite it.
+    - name: instant-worker-entitlements
+      rules:
+        - alert: EntitlementRegradeFailed
+          expr: |
+            sum by (backend) (label_replace(
+              rate(instant_entitlement_regrade_failed_total[10m]), "backend", "$1", "service", "(.*)")) > 0
+          for: 10m
+          labels:
+            severity: critical
+            service: worker
+          annotations:
+            summary: "entitlement_regrade_failed > 0 — paying customer on wrong tier limits"
+            description: |
+              instant_entitlement_regrade_failed_total > 0 for >10m. The entitlement_reconciler
+              failed to call provisioner.RegradeResource() to raise a resource's backend limits
+              to match the team's current paid tier. A paying customer is getting lower-tier
+              infrastructure. Grep worker logs for `jobs.entitlement_reconciler.regrade_failed`;
+              pair with billing-charge-undeliverable (inverse: tier not translated to DB at all).
+
     # CHAOS F1 (2026-05-20) — propagation_runner used to silently mark APPLIED on
     # any row whose target resource was missing/in an unexpected state. The fix
     # added unexpected_skip counting AND treats those rows as Failure (counts

diff --git a/k8s/website.yaml b/k8s/website.yaml
@@ -8,10 +8,13 @@ metadata:
   name: instant-website-config
   namespace: instant
 data:
-  # Override this for production: https://instant.dev
-  # For local k8s (Rancher Desktop / minikube): use the API's NodePort
-  # The API is exposed on NodePort 30080 by default (see app.yaml)
-  API_BASE_URL: "http://localhost:30080"
+  # Override this for production: https://api.instanode.dev
+  # For local k8s (Rancher Desktop / minikube): port-forward the API:
+  #   kubectl port-forward -n instant svc/instant-api 8080:8080
+  # NOTE: the legacy NodePort 30080 was retired 2026-05-11 — instant-api
+  # Service is now ClusterIP only (see app.yaml). Older ConfigMap values
+  # of http://localhost:30080 will fail to connect after re-apply.
+  API_BASE_URL: "http://localhost:8080"
 
 ---
 apiVersion: apps/v1

diff --git a/newrelic/alerts/entitlement-regrade-failed.json b/newrelic/alerts/entitlement-regrade-failed.json
@@ -0,0 +1,31 @@
+{
+  "name": "instant-worker — entitlement_regrade_failed > 0 [tier change not delivered to backend]",
+  "type": "NRQL",
+  "description": "Fires on ANY increment of `instant_entitlement_regrade_failed_total`. The entitlement_reconciler job notices when a resource.tier is below the team's current plan tier (post-upgrade drift) and calls provisioner.RegradeResource() to bring the backend (Postgres dedicated pod / Redis ACL / Mongo user / NATS account) up to the paid limits. A failure here means a CUSTOMER who already paid for a higher tier is still running on the lower-tier backend — they have NOT received what they paid for. This is a paying-customer-impact signal. Pair with billing-charge-undeliverable (which catches the inverse: tier-not-translated-to-DB) — both surface the same problem class. Source: worker/internal/jobs/entitlement_reconciler.go; counter labelled by service (postgres/redis/mongo/queue). BugBash 2026-05-20.",
+  "enabled": true,
+  "nrql": {
+    "query": "SELECT sum(instant_entitlement_regrade_failed_total) FROM Metric FACET service"
+  },
+  "terms": [
+    {
+      "priority": "CRITICAL",
+      "operator": "ABOVE",
+      "threshold": 0,
+      "thresholdDuration": 300,
+      "thresholdOccurrences": "AT_LEAST_ONCE"
+    }
+  ],
+  "signal": {
+    "aggregationWindow": 60,
+    "aggregationMethod": "EVENT_FLOW",
+    "aggregationDelay": 120,
+    "fillOption": "STATIC",
+    "fillValue": 0
+  },
+  "expiration": {
+    "expirationDuration": 3600,
+    "openViolationOnExpiration": false,
+    "closeViolationsOnExpiration": true
+  },
+  "violationTimeLimitSeconds": 86400
+}
diff --git a/newrelic/alerts/goroutine-panics-recovered.json b/newrelic/alerts/goroutine-panics-recovered.json
@@ -0,0 +1,31 @@
+{
+  "name": "instant-* — goroutine_panics_recovered > 0 [code defect, panic recovered but flagged]",
+  "type": "NRQL",
+  "description": "Page on ANY increment of `instant_goroutine_panics_total` (api) or `instant_worker_goroutine_panics_recovered_total` (worker). Both counters are incremented by the safego.Go() wrapper when a deferred recover() catches a panic that would otherwise crash a background goroutine. Recovered panics keep the pod up but ALMOST ALWAYS indicate a real code defect — a nil map access, a divide-by-zero, or a missed error path that should have been a regular return. Each occurrence is a signal that the code shipped with a bug the test suite didn't catch. Threshold is ABOVE 0 (any single panic pages) — but with thresholdDuration=300s + aggregationWindow=60s so a one-off panic with no recurrence still notifies the operator without alert-storming. Source: common/safego package; counters in api/internal/metrics/metrics.go + worker/internal/metrics/metrics.go. BugBash 2026-05-20.",
+  "enabled": true,
+  "nrql": {
+    "query": "SELECT sum(instant_goroutine_panics_total) + sum(instant_worker_goroutine_panics_recovered_total) FROM Metric"
+  },
+  "terms": [
+    {
+      "priority": "CRITICAL",
+      "operator": "ABOVE",
+      "threshold": 0,
+      "thresholdDuration": 300,
+      "thresholdOccurrences": "AT_LEAST_ONCE"
+    }
+  ],
+  "signal": {
+    "aggregationWindow": 60,
+    "aggregationMethod": "EVENT_FLOW",
+    "aggregationDelay": 120,
+    "fillOption": "STATIC",
+    "fillValue": 0
+  },
+  "expiration": {
+    "expirationDuration": 3600,
+    "openViolationOnExpiration": false,
+    "closeViolationsOnExpiration": true
+  },
+  "violationTimeLimitSeconds": 86400
+}
diff --git a/prometheus/alert-rules.yml b/prometheus/alert-rules.yml
@@ -193,3 +193,65 @@ groups:
             key, 401 from Brevo/Razorpay) for 10 consecutive minutes. Pod stays
             in rotation but the upstream call path is broken. Curl the pod's
             /readyz body for last_error and rotate the relevant secret.
+
+      # ── Code defects (BugBash 2026-05-20) ──────────────────────────────────────
+      #
+      # Either counter ticks when safego.Go's deferred recover() catches a
+      # panic that would otherwise crash a background goroutine. Pod stays up,
+      # but that almost always indicates a real code defect — nil-deref, missing
+      # error path, divide-by-zero — that escaped the test gates. Page on any
+      # occurrence; `or vector(0)` covers a counter that exposes no series.
+      - alert: GoroutinePanicsRecovered
+        expr: |
+          (sum(increase(instant_goroutine_panics_total[10m])) or vector(0))
+            + (sum(increase(instant_worker_goroutine_panics_recovered_total[10m])) or vector(0)) > 0
+        for: 0m  # no hold — a single recovered panic must page
+        labels:
+          severity: critical
+        annotations:
+          summary: "instant-* recovered a goroutine panic — code defect shipped"
+          description: >
+            A goroutine panicked and the safego.Go wrapper caught it; the pod
+            stayed up but a panic almost always indicates a real bug. Grep
+            NR Logs for `safego.panic_recovered` within the same window to
+            find the stack trace.
+
+      # ── Brevo send-side error spike (BugBash 2026-05-20) ──────────────────────
+      #
+      # Watches the SEND side of the email pipeline (worker → Brevo API). A
+      # spike here means messages NEVER queue at Brevo, which leaves the
+      # receive-side delivery-ratio alert silent (no provider_message_id to
+      # track). Together the two alerts bound the pipeline.
+      - alert: BrevoSendErrorsSpike
+        expr: sum(rate(brevo_send_errors_total[5m])) * 300 > 5
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "brevo_send_errors_total > 5 in 5m — outbound email failing"
+          description: >
+            worker → Brevo /v3/smtp/email POSTs are returning non-2xx
+            at elevated rate. Common causes: API-key revoked, sender
+            domain dropped from validated list, IP blocklisted,
+            sustained 429. Check worker logs for `brevo.send_failed`
+            lines for the upstream HTTP status + body.
+
+      # ── Entitlement regrade failures (BugBash 2026-05-20) ─────────────────────
+      #
+      # When this fires, a CUSTOMER WHO PAID is still on the lower-tier
+      # backend. Companion to billing-charge-undeliverable, which covers the
+      # inverse failure (tier-not-translated-to-DB).
+      - alert: EntitlementRegradeFailed
+        expr: sum by (service) (rate(instant_entitlement_regrade_failed_total[10m])) > 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "entitlement_regrade_failed > 0 for {{ $labels.service }} — paying customer on under-tier infra"
+          description: >
+            entitlement_reconciler detected resource.tier < team.plan_tier and
+            called provisioner.RegradeResource(), which failed. A customer paid
+            for higher-tier limits but their backend is still capped at the
+            lower tier. Check provisioner logs + the tenant's
+            resource.provider_resource_id; replay the regrade after the root
+            issue is resolved.