From 0284e2cd05c0024e7928a2b9188efe78676d14e1 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Sun, 14 Jun 2026 21:27:23 -0700 Subject: [PATCH] observability: add the kube-state-metrics addon (+ operator CR-state metrics) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit grafana-agent statically scrapes kube-state-metrics.kube-system.svc:8080, but no addon ever shipped kube-state-metrics — so that scrape hit nothing and every kube_* / kube_customresource_* panel silently no-data'd: the cilium/cert-manager/eso dashboards on the standard metrics, and the seven agent persona dashboards + operator-slo CR-status alerts on the custom-resource ones. Add the addon (prometheus-community/kube-state-metrics 7.5.1, app 2.19.1) in kube-system with fullnameOverride: kube-state-metrics so the existing scrape target resolves — grafana-agent unchanged. customResourceState.config carries the eks-agent-platform operator's CR-state definitions (Platform / Tenant / BudgetPolicy / AgentFleet / EvalSuite status_phase + status_field + condition), and rbac.extraRules grants KSM list/watch on those CRDs so kube_customresource_* series actually emit. The CR-state config is inlined here rather than mounted from the operator chart's ConfigMap on purpose: the operator runs at a later sync wave in a different namespace, so mounting it would couple KSM's startup to the operator and need a restart once the ConfigMap appeared. Observability scrape config belongs in the observability repo; keep it in step with the operator chart's files/slo/customresourcestatemetrics.yaml when its CRD status surface changes. Part of #33 — the second half (flip slo.alerting in production) stays blocked on the pagerduty-platform + slack-webhook-* Secrets the AlertmanagerConfig receivers reference. 
--- .../kube-state-metrics/values-dev.yaml | 1 + .../kube-state-metrics/values-production.yaml | 1 + .../kube-state-metrics/values-staging.yaml | 1 + .../kube-state-metrics/values.yaml | 193 ++++++++++++++++++ applicationsets/addons-observability.yaml | 7 + dashboards/base/kustomization.yaml | 4 +- 6 files changed, 205 insertions(+), 2 deletions(-) create mode 100644 addons/observability/kube-state-metrics/values-dev.yaml create mode 100644 addons/observability/kube-state-metrics/values-production.yaml create mode 100644 addons/observability/kube-state-metrics/values-staging.yaml create mode 100644 addons/observability/kube-state-metrics/values.yaml diff --git a/addons/observability/kube-state-metrics/values-dev.yaml b/addons/observability/kube-state-metrics/values-dev.yaml new file mode 100644 index 0000000..17ebbbe --- /dev/null +++ b/addons/observability/kube-state-metrics/values-dev.yaml @@ -0,0 +1 @@ +# kube-state-metrics — dev overrides (none; base values apply to all environments) diff --git a/addons/observability/kube-state-metrics/values-production.yaml b/addons/observability/kube-state-metrics/values-production.yaml new file mode 100644 index 0000000..f026d86 --- /dev/null +++ b/addons/observability/kube-state-metrics/values-production.yaml @@ -0,0 +1 @@ +# kube-state-metrics — production overrides (none; base values apply to all environments) diff --git a/addons/observability/kube-state-metrics/values-staging.yaml b/addons/observability/kube-state-metrics/values-staging.yaml new file mode 100644 index 0000000..1234a41 --- /dev/null +++ b/addons/observability/kube-state-metrics/values-staging.yaml @@ -0,0 +1 @@ +# kube-state-metrics — staging overrides (none; base values apply to all environments) diff --git a/addons/observability/kube-state-metrics/values.yaml b/addons/observability/kube-state-metrics/values.yaml new file mode 100644 index 0000000..95af1d5 --- /dev/null +++ b/addons/observability/kube-state-metrics/values.yaml @@ -0,0 +1,193 @@ +# -- 
kube-state-metrics: cluster-state metrics (kube_*) + the eks-agent-platform +# operator's custom-resource-state metrics (kube_customresource_*). +# +# Runs in kube-system as the service `kube-state-metrics` so the grafana-agent +# static scrape (kube-state-metrics.kube-system.svc:8080) finds it — without this +# addon that scrape hit nothing and every kube_* / kube_customresource_* panel +# silently no-data'd (the cilium/cert-manager/eso dashboards and, via the +# customResourceState config below, the seven agent persona dashboards + the +# operator-slo CR-status alerts). +# +# The customResourceState config mirrors the operator chart's +# files/slo/customresourcestatemetrics.yaml. It lives here, not mounted from the +# operator's ConfigMap, on purpose: the operator runs at a later sync wave and in +# a different namespace, so mounting its ConfigMap would couple KSM's start to it +# and need a restart once it appeared. Observability scrape config belongs in the +# observability repo; keep this in step with the operator chart when its CRD +# status surface changes. +fullnameOverride: kube-state-metrics + +# RBAC for KSM: create is on, and extraRules adds read access to the operator CRDs. +rbac: + create: true + # Read access to the operator CRDs whose status the customResourceState config + # projects into metrics. Without these, KSM logs forbidden and emits no + # kube_customresource_* series. + extraRules: + - apiGroups: ["platform.nanohype.dev"] + resources: ["platforms", "tenants"] + verbs: ["list", "watch"] + - apiGroups: ["governance.nanohype.dev"] + resources: ["budgetpolicies", "evalsuites"] + verbs: ["list", "watch"] + - apiGroups: ["agents.nanohype.dev"] + resources: ["agentfleets"] + verbs: ["list", "watch"] + +# Scraped by the grafana-agent static target. NOTE(review): these values set no +# scrape annotation; annotation-based discovery would rely on chart defaults — confirm. 
+selfMonitor: + enabled: false + +resources: + requests: + cpu: 10m + memory: 64Mi + limits: + cpu: 100m + memory: 256Mi + +customResourceState: + enabled: true + config: + kind: CustomResourceStateMetrics + spec: + resources: + # Platform + - groupVersionKind: + group: platform.nanohype.dev + version: v1alpha1 + kind: Platform + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "Platform.status.phase by name + phase value" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Ready, Suspended, Failed] + commonLabels: + customresource_kind: Platform + - name: status_field + help: "Platform.status.suspendedAt as unix ts (0 when unset)" + each: + type: Gauge + gauge: + path: [status, suspendedAt] + nilIsZero: true + commonLabels: + customresource_kind: Platform + field: suspendedAt + + # Tenant (status_field exports percentOfBudget only; aggregateSpendUsd is not projected) + - groupVersionKind: + group: platform.nanohype.dev + version: v1alpha1 + kind: Tenant + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "Tenant.status.phase by name + phase value" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Active, Suspended, Failed] + commonLabels: + customresource_kind: Tenant + - name: status_field + help: "Tenant.status.percentOfBudget as gauge" + each: + type: Gauge + gauge: + path: [status, percentOfBudget] + commonLabels: + customresource_kind: Tenant + field: percentOfBudget + - name: condition + help: "Tenant.status.conditions by type + status" + each: + type: Gauge + gauge: + path: [status, conditions] + labelsFromPath: + condition_type: [type] + condition_status: [status] + valueFrom: [status] + commonLabels: + customresource_kind: Tenant + + # BudgetPolicy (status_field exports lastReconciled only; killSwitchFiredAt/percentOfBudget are not projected) + - groupVersionKind: + group: governance.nanohype.dev + version: v1alpha1 + kind: BudgetPolicy + metricNamePrefix: kube_customresource + metrics: + - name: 
status_field + help: "BudgetPolicy.status.lastReconciled as gauge (0 when unset)" + each: + type: Gauge + gauge: + path: [status, lastReconciled] + nilIsZero: true + commonLabels: + customresource_kind: BudgetPolicy + field: lastReconciled + + # AgentFleet + - groupVersionKind: + group: agents.nanohype.dev + version: v1alpha1 + kind: AgentFleet + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "AgentFleet.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Ready, ScaledToZero, Suspended, Failed] + commonLabels: + customresource_kind: AgentFleet + - name: status_field + help: "AgentFleet.status.readyAgents count" + each: + type: Gauge + gauge: + path: [status, readyAgents] + commonLabels: + customresource_kind: AgentFleet + field: readyAgents + + # EvalSuite + - groupVersionKind: + group: governance.nanohype.dev + version: v1alpha1 + kind: EvalSuite + metricNamePrefix: kube_customresource + metrics: + - name: status_phase + help: "EvalSuite.status.phase" + each: + type: StateSet + stateSet: + labelName: customresource_phase + path: [status, phase] + list: [Pending, Provisioning, Ready, Running, Passed, Failed] + commonLabels: + customresource_kind: EvalSuite + - name: status_field + help: "EvalSuite.status.lastScore (0..1) parsed as gauge" + each: + type: Gauge + gauge: + path: [status, lastScore] + commonLabels: + customresource_kind: EvalSuite + field: lastScore diff --git a/applicationsets/addons-observability.yaml b/applicationsets/addons-observability.yaml index cf8fc45..e9fb4bb 100644 --- a/applicationsets/addons-observability.yaml +++ b/applicationsets/addons-observability.yaml @@ -17,6 +17,13 @@ spec: argocd.argoproj.io/secret-type: cluster - list: elements: + - appName: kube-state-metrics + namespace: kube-system + chartRepo: https://prometheus-community.github.io/helm-charts + chart: kube-state-metrics + 
chartVersion: 7.5.1 + path: addons/observability/kube-state-metrics + syncWave: "29" - appName: loki namespace: monitoring chartRepo: https://grafana.github.io/helm-charts diff --git a/dashboards/base/kustomization.yaml b/dashboards/base/kustomization.yaml index 683bbc7..65dd503 100644 --- a/dashboards/base/kustomization.yaml +++ b/dashboards/base/kustomization.yaml @@ -24,8 +24,8 @@ resources: - platform/external-dns.yaml - platform/eso.yaml # Agent-platform persona dashboards (operator CR-state + agentgateway + spend). - # Their kube_customresource_* panels need the operator-slo CR-state ConfigMap - # mounted into kube-state-metrics (see addons/observability/kube-state-metrics). + # Their kube_customresource_* panels are fed by the kube-state-metrics addon's + # customResourceState config (addons/observability/kube-state-metrics). - platform/agent-tenants.yaml - platform/agent-eval-quality.yaml - platform/agent-kill-switch.yaml