From 318c5b3027ac53dfaa48937e887feb04ce864da8 Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Wed, 20 May 2026 16:26:17 +0530
Subject: [PATCH 1/2] feat(k8s/app.yaml): terminationGracePeriodSeconds=35 +
 preStop hook (MR-P0-7)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 k8s/app.yaml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/k8s/app.yaml b/k8s/app.yaml
index 1f31381..3a48b05 100644
--- a/k8s/app.yaml
+++ b/k8s/app.yaml
@@ -29,6 +29,18 @@ spec:
         app: instant-api
     spec:
       serviceAccountName: instant-api   # grants deploy-manager ClusterRole (see deploy-rbac.yaml)
+      # terminationGracePeriodSeconds — 35s from the start of pod termination
+      # (preStop included) until kubelet escalates to SIGKILL. Budget (matches
+      # runServerWithGracefulShutdown in api/main.go):
+      #   preStop sleep             5s  ← endpoint removal propagates to LB
+      #   readinessDrainGrace       3s  ← in-process probe-tick window
+      #   gracefulShutdownTimeout  25s  ← Fiber drain in-flight handlers
+      #   safety margin             2s  ← buffer before SIGKILL
+      #   ──                       ────
+      #   total                    35s
+      # Default is 30s, which collides with our 25+3+5=33s drain.
+      # MR-P0-7 (BugBash 2026-05-20). Keep in sync with main.go consts.
+      terminationGracePeriodSeconds: 35
       # Spread replicas across nodes when possible — preferred (not required)
       # so a 1-node dev cluster still schedules both pods.
       affinity:
@@ -54,6 +66,16 @@ spec:
       containers:
         - name: api
           image: instant-api:local   # built with: docker build -t instant-api:local .
+          # preStop — sleep 5s before SIGTERM is delivered. The pod is
+          # already Terminating, so the control plane removes it from the
+          # Service endpoints while the api is still serving normally
+          # (/readyz presumably flips only after SIGTERM, inside
+          # runServerWithGracefulShutdown via hooks.Readyz.MarkDraining).
+          # Without this, the LB may route new traffic to a stopping pod. MR-P0-7.
+          lifecycle:
+            preStop:
+              exec:
+                command: ["/bin/sh", "-c", "sleep 5"]
           ports:
             - containerPort: 8080
           envFrom:

From 2933b9fa6d640f5bcfc1aead6a3e161878147c9e Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Wed, 20 May 2026 16:27:10 +0530
Subject: [PATCH 2/2] alerts(prom): orphan_sweep reaped/failed alerts (PASS
 3/6, 2026-05-20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three new Prometheus alerts tied to the worker repo's PASS 3 enhanced
reasons + PASS 6 stuck-build counters:

- OrphanSweepNoDBRowReap (CRITICAL, 1h): a k8s namespace had no backing
  deployments row — the P0-3 atomic-provision symptom. Pages on first
  occurrence over 1h.

- OrphanSweepStuckBuildSpike (WARNING, 15m): >5 stuck-build flips in 15m
  means the kaniko/GHCR build pipeline is degraded for many customers
  at once.

- OrphanSweepReapFailureRate (WARNING, 30m): the reconciler detected
  orphans it cannot reap (k8s/DB write failure sustained).

The counters land in worker master commit 7d2ff0d; the alerts go live
once the deploy lands + scrape picks them up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 k8s/prometheus-rules.yaml | 61 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml
index c2ad5ef..07c5ca3 100644
--- a/k8s/prometheus-rules.yaml
+++ b/k8s/prometheus-rules.yaml
@@ -218,3 +218,64 @@ spec:
           annotations:
             summary: "instant-worker saw a pending_propagations kind it doesn't recognise (kind={{ $labels.kind }})"
             description: "instant_propagation_unknown_kind_total{kind=\"{{ $labels.kind }}\"} > 0 for >5m. A worker pod is running an older image than the api enqueued (kind={{ $labels.kind }} is not in propagationHandlers). Finish the rollout — `kubectl rollout status deploy/instant-worker -n instant-infra` and confirm pods are on the same image as instant-api. The row will dead-letter after propagationMaxAttempts (10) attempts (~24h cumulative backoff) which will fire PropagationDeadLettered above; this is the early warning."
+
+    # instant-worker — orphan_sweep PASS 3/4/5/6 reap alerts (2026-05-20).
+    # Fires on the worker's instant_orphan_sweep_reaped_total counter
+    # (introduced in worker repo, see metrics.go::OrphanSweepReapedTotal).
+    # Each reap is labelled by `reason`; the alerts here key on the
+    # reasons that imply a distinct upstream bug worth paging on.
+    - name: instant-worker-orphan-sweep
+      rules:
+        - alert: OrphanSweepNoDBRowReap
+          expr: |
+            sum(rate(instant_orphan_sweep_reaped_total{reason="no_db_row"}[1h])) > 0
+          for: 0m  # page on first reap; a lone event leaves rate[1h] before a 1h hold could elapse
+          labels:
+            severity: critical
+            service: worker
+          annotations:
+            summary: "orphan_sweep reaped an instant-deploy-* namespace with NO backing deployments row (P0-3 atomic-provision bug)"
+            description: |
+              instant_orphan_sweep_reaped_total{reason="no_db_row"} increased within the last 1h.
+              A no_db_row event means a k8s namespace was provisioned (instant-deploy-<appID>)
+              but no deployments row exists for that app_id — the api created the namespace
+              but the INSERT into deployments never landed. This is the P0-3 atomic-provision
+              symptom surfacing in prod.
+              Investigate same hour: search NR Logs for `jobs.orphan_sweep.proposed_reap`
+              with reason=no_db_row, capture the app_id, then trace back through the api
+              POST /deploy/new logs for the same time window to find the partial-commit
+              path that needs the atomic-rollback fix.
+
+        - alert: OrphanSweepStuckBuildSpike
+          expr: |
+            sum(increase(instant_orphan_sweep_reaped_total{reason="failed_build"}[15m])) > 5
+          for: 0m  # the 15m window is the debounce; holding 15m more would miss a one-sweep burst
+          labels:
+            severity: warning
+            service: worker
+          annotations:
+            summary: "orphan_sweep PASS 6 flipped >5 stuck builds to failed in 15m (build pipeline degraded)"
+            description: |
+              instant_orphan_sweep_reaped_total{reason="failed_build"} > 5 events in 15m.
+              PASS 6 catches deployments stuck in 'building'/'deploying' for >30min whose
+              pod is in ImagePullBackOff/ErrImagePull/CrashLoopBackOff. A burst means many
+              customers' builds are wedged at once — the most likely cause is a ghcr.io
+              outage, a Kaniko image-push 403 (worker-rbac.yaml GHCR_PUSH_TOKEN scope), or
+              an upstream registry auth failure. Check ghcr.io status, the deploy.yml CI
+              push step, and the kaniko build pod logs in instant-deploy-* namespaces.
+
+        - alert: OrphanSweepReapFailureRate
+          expr: |
+            sum by (reason) (rate(instant_orphan_sweep_reap_failed_total[15m])) > 0
+          for: 30m  # a one-off failure ages out of rate[15m] long before this hold elapses
+          labels:
+            severity: warning
+            service: worker
+          annotations:
+            summary: "orphan_sweep reap_failed > 0 sustained for 30m (reason={{ $labels.reason }})"
+            description: |
+              instant_orphan_sweep_reap_failed_total{reason="{{ $labels.reason }}"} > 0
+              sustained for >30 minutes. The reconciler detected an orphan but could not
+              clean it — a k8s API outage or a DB write failure. Single transient events
+              are fine; a sustained rate means the reap path itself is broken. Check
+              instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines.