From 318c5b3027ac53dfaa48937e887feb04ce864da8 Mon Sep 17 00:00:00 2001
From: Manas Srivastava
Date: Wed, 20 May 2026 16:26:17 +0530
Subject: [PATCH 1/2] feat(k8s/app.yaml): terminationGracePeriodSeconds=35 + preStop hook (MR-P0-7)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 k8s/app.yaml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/k8s/app.yaml b/k8s/app.yaml
index 1f31381..3a48b05 100644
--- a/k8s/app.yaml
+++ b/k8s/app.yaml
@@ -29,6 +29,18 @@ spec:
         app: instant-api
     spec:
       serviceAccountName: instant-api  # grants deploy-manager ClusterRole (see deploy-rbac.yaml)
+      # terminationGracePeriodSeconds — 35s from the start of pod termination
+      # (the countdown includes the preStop hook) until the kubelet sends
+      # SIGKILL. Budget (matches runServerWithGracefulShutdown in api/main.go):
+      #   preStop sleep            5s   ← endpoint removal propagates to the LB
+      #   readinessDrainGrace      3s   ← in-process probe-tick window
+      #   gracefulShutdownTimeout 25s   ← Fiber drain in-flight handlers
+      #   safety margin            2s   ← buffer before SIGKILL
+      #                            ──   ────
+      #   total                   35s
+      # Default is 30s, which collides with our 25+3+5=33s drain.
+      # MR-P0-7 (BugBash 2026-05-20). Keep in sync with main.go consts.
+      terminationGracePeriodSeconds: 35
       # Spread replicas across nodes when possible — preferred (not required)
       # so a 1-node dev cluster still schedules both pods.
       affinity:
@@ -54,6 +66,16 @@ spec:
       containers:
         - name: api
           image: instant-api:local  # built with: docker build -t instant-api:local .
+          # preStop — the kubelet runs this hook to completion before it sends
+          # SIGTERM. The 5s sleep keeps the api serving while the Terminating
+          # pod is removed from Service endpoints; without it the LB can keep
+          # routing new traffic to a pod that has stopped accepting connections.
+          # NOTE(review): the /readyz flip (hooks.Readyz.MarkDraining) presumably
+          # happens after SIGTERM, not during this sleep — confirm. MR-P0-7.
+          lifecycle:
+            preStop:
+              exec:
+                command: ["/bin/sh", "-c", "sleep 5"]  # requires /bin/sh and sleep in the image
           ports:
             - containerPort: 8080
           envFrom:

From 2933b9fa6d640f5bcfc1aead6a3e161878147c9e Mon Sep 17 00:00:00 2001
From: Manas Srivastava
Date: Wed, 20 May 2026 16:27:10 +0530
Subject: [PATCH 2/2] alerts(prom): orphan_sweep reaped/failed alerts (PASS 3/6, 2026-05-20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three new Prometheus alerts tied to the worker repo's PASS 3 enhanced
reasons + PASS 6 stuck-build counters:

- OrphanSweepNoDBRowReap (CRITICAL, 1h): a k8s namespace had no backing
  deployments row — the P0-3 atomic-provision symptom. Pages on first
  occurrence over 1h.
- OrphanSweepStuckBuildSpike (WARNING, 15m): >5 stuck-build flips in 15m
  means the kaniko/GHCR build pipeline is degraded for many customers
  at once.
- OrphanSweepReapFailureRate (WARNING, 30m): the reconciler detected
  orphans it cannot reap (k8s/DB write failure sustained).

The counters land in worker master commit 7d2ff0d; the alerts go live
once the deploy lands + scrape picks them up.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 k8s/prometheus-rules.yaml | 61 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml
index c2ad5ef..07c5ca3 100644
--- a/k8s/prometheus-rules.yaml
+++ b/k8s/prometheus-rules.yaml
@@ -218,3 +218,64 @@ spec:
           annotations:
             summary: "instant-worker saw a pending_propagations kind it doesn't recognise (kind={{ $labels.kind }})"
             description: "instant_propagation_unknown_kind_total{kind=\"{{ $labels.kind }}\"} > 0 for >5m. A worker pod is running an older image than the api enqueued (kind={{ $labels.kind }} is not in propagationHandlers). Finish the rollout — `kubectl rollout status deploy/instant-worker -n instant-infra` and confirm pods are on the same image as instant-api. The row will dead-letter after propagationMaxAttempts (10) attempts (~24h cumulative backoff) which will fire PropagationDeadLettered above; this is the early warning."
+
+    # instant-worker — orphan_sweep PASS 3/4/5/6 reap alerts (2026-05-20).
+    # Fires on the worker's instant_orphan_sweep_reaped_total counter
+    # (introduced in worker repo, see metrics.go::OrphanSweepReapedTotal).
+    # Each reap is labelled by `reason`; the alerts here key on the
+    # reasons that imply a distinct upstream bug worth paging on.
+    - name: instant-worker-orphan-sweep
+      rules:
+        - alert: OrphanSweepNoDBRowReap
+          expr: |
+            sum(increase(instant_orphan_sweep_reaped_total{reason="no_db_row"}[1h])) > 0
+          for: 0m  # page on the first reap; the 1h increase() window keeps the alert firing ~1h
+          labels:
+            severity: critical
+            service: worker
+          annotations:
+            summary: "orphan_sweep reaped an instant-deploy-* namespace with NO backing deployments row (P0-3 atomic-provision bug)"
+            description: |
+              instant_orphan_sweep_reaped_total{reason="no_db_row"} increased within the last 1h.
+              A no_db_row event means a k8s namespace was provisioned (instant-deploy-<app_id>)
+              but no deployments row exists for that app_id — the api created the namespace
+              but the INSERT into deployments never landed. This is the P0-3 atomic-provision
+              symptom surfacing in prod.
+              Investigate same hour: search NR Logs for `jobs.orphan_sweep.proposed_reap`
+              with reason=no_db_row, capture the app_id, then trace back through the api
+              POST /deploy/new logs for the same time window to find the partial-commit
+              path that needs the atomic-rollback fix.
+
+        - alert: OrphanSweepStuckBuildSpike
+          expr: |
+            sum(increase(instant_orphan_sweep_reaped_total{reason="failed_build"}[15m])) > 5
+          for: 0m  # a one-sweep burst holds >5 for under 15m, so `for: 15m` could never fire on it
+          labels:
+            severity: warning
+            service: worker
+          annotations:
+            summary: "orphan_sweep PASS 6 flipped >5 stuck builds to failed in 15m (build pipeline degraded)"
+            description: |
+              instant_orphan_sweep_reaped_total{reason="failed_build"} > 5 events in 15m.
+              PASS 6 catches deployments stuck in 'building'/'deploying' for >30min whose
+              pod is in ImagePullBackOff/ErrImagePull/CrashLoopBackOff. A burst means many
+              customers' builds are wedged at once — the most likely cause is a ghcr.io
+              outage, a Kaniko image-push 403 (worker-rbac.yaml GHCR_PUSH_TOKEN scope), or
+              an upstream registry auth failure. Check ghcr.io status, the deploy.yml CI
+              push step, and the kaniko build pod logs in instant-deploy-* namespaces.
+
+        - alert: OrphanSweepReapFailureRate
+          expr: |
+            sum(rate(instant_orphan_sweep_reap_failed_total[15m])) by (reason) > 0
+          for: 30m
+          labels:
+            severity: warning
+            service: worker
+          annotations:
+            summary: "orphan_sweep reap_failed > 0 sustained for 30m (reason={{ $labels.reason }})"
+            description: |
+              instant_orphan_sweep_reap_failed_total{reason="{{ $labels.reason }}"} > 0
+              sustained for >30 minutes. The reconciler detected an orphan but could not
+              clean it — a k8s API outage or a DB write failure. Single transient events
+              are fine; a sustained rate means the reap path itself is broken. Check
+              instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines.