runwhen-contrib · rw-codebundle-agent · Apr 16, 2026
@@ -0,0 +1,22 @@
+# Generation rule: for every `service` resource whose name or label values
+# contain the substring "litellm", emit an SLX, an SLI and a runbook.
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  generationRules:
+    - resourceTypes:
+        - service
+      matchRules:
+        - type: pattern
+          pattern: litellm
+          mode: substring
+          properties:
+            - name
+            - label-values
+      slxs:
+        - baseName: litellm-spend-gov
+          shortenedBaseName: llm-spend-gov
+          baseTemplateName: k8s-litellm-spend-governance
+          levelOfDetail: basic
+          qualifiers:
+            - resource
+            - namespace
+            - cluster
+          outputItems:
+            - type: slx
+            - type: sli
+            # The runbook output item names its template file explicitly.
+            - type: runbook
+              templateName: k8s-litellm-spend-governance-taskset.yaml
@@ -0,0 +1,53 @@
+# ServiceLevelIndicator template for the LiteLLM spend-governance codebundle.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  description: Scores LiteLLM proxy reachability, global spend versus threshold, and spend-log failure heuristics.
+  displayUnitsLong: OK
+  displayUnitsShort: ok
+  locations:
+    - {{default_location}}
+  intervalStrategy: intermezzo
+  intervalSeconds: 300
+  codeBundle:
+    # Use the supplied repo_url / ref when set; otherwise fall back to the
+    # public code collection on the main branch.
+    repoUrl: {{ repo_url if repo_url else 'https://github.com/runwhen-contrib/rw-cli-codecollection.git' }}
+    ref: {{ ref if ref else 'main' }}
+    pathToRobot: codebundles/k8s-litellm-spend-governance/sli.robot
+  configProvided:
+    - name: PROXY_BASE_URL
+      value: "{{ custom.litellm_proxy_base_url | default('') }}"
+    - name: LITELLM_SERVICE_NAME
+      value: "{{match_resource.resource.metadata.name}}"
+    - name: RW_LOOKBACK_WINDOW
+      value: "24h"
+    - name: LITELLM_SPEND_THRESHOLD_USD
+      value: "0"
+  secretsProvided:
+  # Cluster credentials differ by branch; the LiteLLM key is appended in both.
+  {% if wb_version %}
+    {% include "kubernetes-auth.yaml" ignore missing %}
+  {% else %}
+    - name: kubeconfig
+      workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }}
+  {% endif %}
+    - name: litellm_master_key
+      workspaceKey: {{ custom.litellm_master_key_secret_name | default("litellm_master_key") }}
+
+  alertConfig:
+    tasks:
+      persona: eager-edgar
+      sessionTTL: 10m
@@ -0,0 +1,25 @@
+# ServiceLevelX template for the LiteLLM spend-governance codebundle.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  # Human-facing identity of the SLX.
+  alias: {{match_resource.resource.metadata.name}} LiteLLM Spend Governance
+  statement: Monitors LiteLLM proxy spend, budgets, and blocked-request signals for this service.
+  asMeasuredBy: LiteLLM Admin API checks for spend logs, global spend, and budget pressure.
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/svc.svg
+  owners:
+    - {{workspace.owner_email}}
+  configProvided:
+    - name: PROXY_BASE_URL
+      value: "{{ custom.litellm_proxy_base_url | default('') }}"
+  # Entries from the shared includes come first, then the local additions.
+  additionalContext:
+    {% include "kubernetes-hierarchy.yaml" ignore missing %}
+    qualified_name: "{{ match_resource.qualified_name }}"
+  tags:
+    {% include "kubernetes-tags.yaml" ignore missing %}
+    - name: access
+      value: read-only
@@ -0,0 +1,53 @@
+# Runbook template for the LiteLLM spend-governance codebundle.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  codeBundle:
+    # Use the supplied repo_url / ref when set; otherwise fall back to the
+    # public code collection on the main branch.
+    repoUrl: {{ repo_url if repo_url else 'https://github.com/runwhen-contrib/rw-cli-codecollection.git' }}
+    ref: {{ ref if ref else 'main' }}
+    pathToRobot: codebundles/k8s-litellm-spend-governance/runbook.robot
+  configProvided:
+    # Cluster targeting.
+    - name: KUBERNETES_DISTRIBUTION_BINARY
+      value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}"
+    - name: NAMESPACE
+      value: "{{match_resource.resource.metadata.namespace}}"
+    - name: CONTEXT
+      value: "{{context}}"
+    # LiteLLM proxy targeting and thresholds.
+    - name: LITELLM_SERVICE_NAME
+      value: "{{match_resource.resource.metadata.name}}"
+    - name: PROXY_BASE_URL
+      value: "{{ custom.litellm_proxy_base_url | default('') }}"
+    - name: RW_LOOKBACK_WINDOW
+      value: "24h"
+    - name: LITELLM_SPEND_THRESHOLD_USD
+      value: "0"
+    - name: LITELLM_USER_IDS
+      value: ""
+    - name: LITELLM_TEAM_IDS
+      value: ""
+  secretsProvided:
+  # Cluster credentials differ by branch; the LiteLLM key is appended in both.
+  {% if wb_version %}
+    {% include "kubernetes-auth.yaml" ignore missing %}
+  {% else %}
+    - name: kubeconfig
+      workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }}
+  {% endif %}
+    - name: litellm_master_key
+      workspaceKey: {{ custom.litellm_master_key_secret_name | default("litellm_master_key") }}
+
@@ -0,0 +1,110 @@
+# Taskfile for exercising this codebundle against test infrastructure.
+version: '3'
+
+tasks:
+  # Entry point: check that work is pushed, then regenerate config and run discovery.
+  default:
+    desc: 'Run/refresh config'
+    cmds:
+      - task: check-unpushed-commits
+      - task: generate-rwl-config
+      - task: run-rwl-discovery
+
+  # Tear down the test objects and remove generated discovery files.
+  clean:
+    desc: 'Run cleanup tasks'
+    cmds:
+      - task: remove-kubernetes-objects
+      - task: clean-rwl-discovery
+
+  # Stand up the test infrastructure.
+  build-infra:
+    desc: 'Build test infrastructure'
+    cmds:
+      - task: create-kubernetes-objects
+
+  # Applies the test manifest to the current kubectl context.
+  create-kubernetes-objects:
+    desc: 'Apply manifests from kubernetes directory using kubectl'
+    silent: true
+    cmds:
+      - cmd: kubectl apply -f kubernetes/manifest.yaml
+
+  # Deletes the objects from the test manifest; objects already gone are ignored.
+  remove-kubernetes-objects:
+    desc: 'Delete kubernetes objects'
+    silent: true
+    cmds:
+      - cmd: kubectl delete -f kubernetes/manifest.yaml --ignore-not-found=true
+
+  # Fails when files under BASE_DIR (excluding any `.test` directory) have
+  # uncommitted changes or commits that are not yet on the remote branch.
+  check-unpushed-commits:
+    desc: Check if outstanding commits or file updates need to be pushed before testing.
+    vars:
+      BASE_DIR: "../"
+    # Task `vars` are template-only; the scripts below read $BASE_DIR from the
+    # shell, so it must also be exported through `env`.
+    env:
+      BASE_DIR: '{{.BASE_DIR}}'
+    cmds:
+      - |
+        echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..."
+        # The pathspec limits the diff to BASE_DIR; git prints repo-root-relative
+        # paths, so anything under a `.test` directory is filtered out by name.
+        UNCOMMITTED_FILES=$(git diff --name-only HEAD -- "$BASE_DIR" | grep -v "/\.test/" || true)
+        if [ -n "$UNCOMMITTED_FILES" ]; then
+          echo "Uncommitted changes found:"
+          echo "$UNCOMMITTED_FILES"
+          exit 1
+        fi
+      - |
+        git fetch origin
+        # Compare the local branch with its same-named branch on origin.
+        UNPUSHED_FILES=$(git diff --name-only "origin/$(git rev-parse --abbrev-ref HEAD)" HEAD -- "$BASE_DIR" | grep -v "/\.test/" || true)
+        if [ -n "$UNPUSHED_FILES" ]; then
+          echo "Unpushed commits found:"
+          echo "$UNPUSHED_FILES"
+          exit 1
+        fi
+    silent: true
+
+  # Writes workspaceInfo.yaml from the git remote, the current branch, the parent
+  # directory name and the Namespace declared in kubernetes/manifest.yaml.
+  generate-rwl-config:
+    desc: 'Generate RunWhen Local configuration (workspaceInfo.yaml)'
+    silent: true
+    env:
+      RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}'
+    cmds:
+      - |
+        # Namespace name comes from the Namespace document in the test manifest.
+        test_namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N)
+        # The codebundle name is the directory that contains this test directory.
+        bundle_name=$(basename "$(dirname "$PWD")")
+        origin_url=$(git config --get remote.origin.url)
+        current_branch=$(git rev-parse --abbrev-ref HEAD)
+        cat > workspaceInfo.yaml <<EOF
+        workspaceName: "$RW_WORKSPACE"
+        workspaceOwnerEmail: authors@runwhen.com
+        defaultLocation: location-01
+        defaultLOD: none
+        cloudConfig:
+          kubernetes:
+            kubeconfigFile: /shared/kubeconfig
+            namespaceLODs:
+              ${test_namespace}: detailed
+            namespaces:
+              - ${test_namespace}
+        codeCollections:
+        - repoURL: "${origin_url}"
+          branch: "${current_branch}"
+          codeBundles: ["${bundle_name}"]
+        custom:
+          kubeconfig_secret_name: "kubeconfig"
+          kubernetes_distribution_binary: kubectl
+          litellm_proxy_base_url: "http://127.0.0.1:4000"
+        EOF
+
+  # Placeholder: only prints a pointer to the manual discovery instructions.
+  run-rwl-discovery:
+    desc: 'Run RunWhen Local Discovery on test infrastructure (requires Docker + kubeconfig)'
+    silent: true
+    cmds:
+      - cmd: 'echo "See test-infra-kubernetes.md — start runwhen-local with kubeconfig mounted at /shared/kubeconfig when ready."'
+
+  # Parses every generation-rules file with a safe YAML loader and fails the
+  # task if any file does not parse, not only the last one checked.
+  validate-generation-rules:
+    desc: "Validate YAML files in .runwhen/generation-rules"
+    cmds:
+      - |
+        status=0
+        for yaml_file in ../.runwhen/generation-rules/*.yaml; do
+          if python3 -c "import yaml,sys; yaml.safe_load(open(sys.argv[1]))" "$yaml_file"; then
+            echo "OK $yaml_file"
+          else
+            # Remember the failure but keep going so every bad file is reported.
+            echo "FAILED $yaml_file" >&2
+            status=1
+          fi
+        done
+        exit "$status"
+    silent: true
+
+  # Removes the discovery output directory and the generated workspaceInfo.yaml.
+  clean-rwl-discovery:
+    desc: 'Clean discovery output'
+    silent: true
+    cmds:
+      - cmd: rm -rf output workspaceInfo.yaml
@@ -0,0 +1,19 @@
+---
+# Namespace that holds the test fixtures.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: test-litellm-spend-governance
+---
+# Service named and labelled with "litellm", exposing port 4000.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: test-litellm-spend-governance
+  name: litellm-proxy
+  labels:
+    app.kubernetes.io/name: litellm
+spec:
+  selector:
+    app.kubernetes.io/name: litellm
+  ports:
+    - name: http
+      port: 4000
@@ -0,0 +1,69 @@
+# Kubernetes LiteLLM Spend and Governance
+
+This CodeBundle queries the LiteLLM proxy Admin and spend APIs (rather than relying on container logs alone) to surface cost pressure, budget blocks, rate limits, and provider-side failures. It uses cluster access (`kubectl`) for context and reaches the proxy through `PROXY_BASE_URL`, which can be an in-cluster URL or a port-forward URL.
+
+## Overview
+
+- **Spend logs**: Scans `/spend/logs` for budget, rate-limit, and HTTP error heuristics in the lookback window.
+- **Global spend**: Reads `/global/spend/report` and optionally compares estimated USD spend to `LITELLM_SPEND_THRESHOLD_USD`.
+- **Keys**: When `/key/list` is available, flags keys near `max_budget` or past `expires`.
+- **Users / teams**: Optional `/user/info` and `/team/info` checks for cooldowns and budget risk.
+- **Aggregate triage**: Summarizes failure-mode counts from spend logs for quick review.
+
+## Configuration
+
+### Required variables
+
+- `CONTEXT`: Kubernetes context for `kubectl` correlation and cluster verification.
+- `NAMESPACE`: Namespace where the LiteLLM `Service` runs.
+- `PROXY_BASE_URL`: Reachable LiteLLM base URL (for example `http://litellm.default.svc.cluster.local:4000` or a port-forward URL).
+- `LITELLM_SERVICE_NAME`: Kubernetes `Service` name used in titles and reports.
+
+### Optional variables
+
+- `RW_LOOKBACK_WINDOW`: Lookback window used to compute the date range for spend-log and spend-report queries (default: `24h`). Supports forms like `24h`, `7d`, `30m`.
+- `LITELLM_SPEND_THRESHOLD_USD`: Alert when estimated global spend in the window exceeds this USD amount; `0` disables (default: `0`).
+- `LITELLM_USER_IDS`: Comma-separated internal `user_id` values for `/user/info`; empty skips user checks.
+- `LITELLM_TEAM_IDS`: Comma-separated team ids for `/team/info`; empty skips team checks.
+- `KUBERNETES_DISTRIBUTION_BINARY`: `kubectl` or `oc` (default: `kubectl`).
+
+### Secrets
+
+- `litellm_master_key`: Bearer token with permission to call spend and admin routes (often the proxy master key or an admin key with spend scope).
+- `kubeconfig`: Kubeconfig used only for optional cluster connectivity verification and standard RunWhen Kubernetes wiring.
+
+## Tasks
+
+### Review Recent Spend Logs for Failures
+
+Calls `/spend/logs` with `summarize=false` for the computed date window and raises issues when heuristics match budget blocks, rate limits, or provider/HTTP failures.
+
+### Check Global Spend Report Against Threshold
+
+Calls `/global/spend/report` and, when `LITELLM_SPEND_THRESHOLD_USD` is greater than zero, compares estimated spend to the threshold.
+
+### Inspect Virtual Key Spend and Remaining Budget
+
+Uses `/key/list` when available to find keys near `max_budget` or expired keys.
+
+### Review User Budget and Rate Limit Status
+
+For each entry in `LITELLM_USER_IDS`, calls `/user/info` and surfaces `soft_budget_cooldown` when true.
+
+### Summarize Team Budgets and Limits
+
+For each entry in `LITELLM_TEAM_IDS`, calls `/team/info` and flags teams at or above 90% of `max_budget`.
+
+### Aggregate Error and Blocked Request Signals
+
+Produces triage counts (for example `budget_exceeded`, rate-limit, 429, 5xx patterns) from spend logs and raises an issue when the combined signal volume is high.
+
+## SLI
+
+`sli.robot` publishes a 0–1 score from three dimensions: proxy reachability (`/health` or `/`), global spend versus threshold, and spend-log failure heuristics. Generation rules emit an SLI template alongside the runbook.
+
+## Notes
+
+- Some routes are Enterprise-only or require specific key permissions; scripts emit clear issues on HTTP 403.
+- Database-backed spend logs must be enabled on the proxy for full `/spend/logs` results.
+- Set `custom.litellm_proxy_base_url` in workspace configuration when using discovery templates, or override `PROXY_BASE_URL` per SLX.