Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generation rules for the k8s-pgbouncer-prometheus-health CodeBundle.
# A Kubernetes service whose name contains "pgbouncer" produces one SLX and
# one runbook, each qualified by resource, namespace and cluster.
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
  generationRules:
    - resourceTypes:
        - service
      matchRules:
        # Substring match against the resource's "name" property.
        - type: pattern
          pattern: pgbouncer
          properties:
            - name
          mode: substring
      slxs:
        - baseName: pgb-pgb-hc
          shortenedBaseName: pgb-pgb-hc
          qualifiers:
            - resource
            - namespace
            - cluster
          baseTemplateName: k8s-pgbouncer-prometheus-health
          levelOfDetail: basic
          outputItems:
            - type: slx
            # The runbook is rendered from the taskset template named below.
            - type: runbook
              templateName: k8s-pgbouncer-prometheus-health-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# ServiceLevelX template for the PgBouncer Prometheus health SLX.
# This file is a Jinja-style template: expressions and include tags are
# expanded first and the result is then read as YAML. The included fragments
# live outside this file.
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
  name: {{ slx_name }}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/svc.svg
  alias: {{ match_resource.resource.metadata.name }} Kubernetes PgBouncer Prometheus Health
  asMeasuredBy: >-
    PgBouncer pool saturation, wait queues, exporter availability,
    and capacity signals from Prometheus.
  configProvided:
    - name: SLX_PLACEHOLDER
      value: SLX_PLACEHOLDER
  owners:
    - {{ workspace.owner_email }}
  statement: >-
    Monitor PgBouncer connection pool health using Prometheus metrics
    from the pgbouncer exporter for this service scope.
  additionalContext:
    # Optional fragment; "ignore missing" lets rendering continue without it.
    {% include "kubernetes-hierarchy.yaml" ignore missing %}
    qualified_name: "{{ match_resource.qualified_name }}"
  tags:
    # Optional fragment; the static tags below are always emitted.
    {% include "kubernetes-tags.yaml" ignore missing %}
    - name: platform
      value: kubernetes
    - name: resource
      value: service
    - name: scope
      value: namespace
    - name: access
      value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Runbook template for the k8s-pgbouncer-prometheus-health CodeBundle.
# This file is a Jinja-style template: expressions and tags are expanded first
# and the result is then read as YAML, so every rendered value must still be a
# valid YAML scalar.
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  location: {{default_location}}
  description: Evaluates PgBouncer pool health via Prometheus metrics for service {{ match_resource.resource.metadata.name }} in namespace {{ match_resource.resource.metadata.namespace }}.
  codeBundle:
    # Fall back to the public collection and the main branch when the
    # renderer does not supply repo_url / ref.
    {% if repo_url %}
    repoUrl: {{repo_url}}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    {% if ref %}
    ref: {{ref}}
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/k8s-pgbouncer-prometheus-health/runbook.robot
  configProvided:
    - name: PROMETHEUS_URL
      value: "{{ custom.prometheus_url | default('https://prometheus.example/api/v1') }}"
    # The rendered matcher contains double quotes (job="..."), so the YAML
    # scalar is single-quoted; a double-quoted scalar would be terminated by
    # the first embedded quote and the rendered document would not parse.
    - name: PGBOUNCER_JOB_LABEL
      value: '{{ custom.pgbouncer_job_label | default('job="pgbouncer-exporter"') }}'
    - name: METRIC_NAMESPACE_FILTER
      value: 'kubernetes_namespace="{{ match_resource.resource.metadata.namespace }}"'
    - name: EXPECTED_POOL_MODE
      value: "{{ custom.pgbouncer_expected_pool_mode | default('transaction') }}"
    - name: CONTEXT
      value: "{{ context }}"
    - name: KUBERNETES_DISTRIBUTION_BINARY
      value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}"
    - name: PGBOUNCER_NAMESPACE
      value: "{{ match_resource.resource.metadata.namespace }}"
    - name: PGBOUNCER_POD_LABEL_SELECTOR
      value: "{{ custom.pgbouncer_pod_label_selector | default('app.kubernetes.io/name=pgbouncer-exporter') }}"
    # Numeric thresholds are quoted so they reach the runbook as strings.
    - name: CLIENT_SATURATION_PERCENT_THRESHOLD
      value: "{{ custom.pgbouncer_client_saturation_percent | default('80') }}"
    - name: MAX_WAIT_SECONDS_THRESHOLD
      value: "{{ custom.pgbouncer_max_wait_seconds | default('1') }}"
  secretsProvided:
  # When wb_version is set the Kubernetes auth entries come from the optional
  # include; otherwise a single kubeconfig secret is referenced.
  {% if wb_version %}
    {% include "kubernetes-auth.yaml" ignore missing %}
  {% else %}
    - name: kubeconfig
      workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }}
  {% endif %}
    - name: prometheus_bearer_token
      workspaceKey: {{ custom.prometheus_bearer_token_secret | default("AUTH DETAILS NOT FOUND") }}
95 changes: 95 additions & 0 deletions codebundles/k8s-pgbouncer-prometheus-health/.test/Taskfile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Test harness tasks for the k8s-pgbouncer-prometheus-health CodeBundle.
# Paths are relative to this .test directory.
version: "3"

tasks:
  default:
    desc: "Run/refresh config"
    cmds:
      - task: check-unpushed-commits
      - task: generate-rwl-config
      - task: run-rwl-discovery

  clean:
    desc: "Run cleanup tasks"
    cmds:
      - task: remove-kubernetes-objects
      - task: clean-rwl-discovery

  build-infra:
    desc: "Build test infrastructure"
    cmds:
      - task: create-kubernetes-objects

  create-kubernetes-objects:
    desc: "Apply manifests from kubernetes directory using kubectl"
    cmds:
      # Pass the directory itself: a shell glob would expand to several paths
      # and only the first would be bound to -f.
      - kubectl apply -f kubernetes/
    silent: true

  remove-kubernetes-objects:
    desc: "Delete kubernetes objects"
    cmds:
      # Best effort: a failed delete must not abort the clean task.
      - kubectl delete -f kubernetes/ || true
    silent: true

  check-unpushed-commits:
    desc: Check if outstanding commits or file updates need to be pushed before testing.
    vars:
      BASE_DIR: "../"
    cmds:
      # NOTE(review): BASE_DIR is declared under vars (template scope) but is
      # read below as a shell variable; confirm whether it is actually set in
      # the shell, otherwise the grep prefix is empty and every changed path
      # outside .test is reported.
      - |
        echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..."
        UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true)
        if [ -n "$UNCOMMITTED_FILES" ]; then
          echo "Uncommitted changes found:"
          echo "$UNCOMMITTED_FILES"
          exit 1
        fi
        echo "No uncommitted changes in specified directories."
    silent: true

  generate-rwl-config:
    desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)"
    env:
      RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}'
    cmds:
      # Derives repo URL, branch and CodeBundle name from git and the parent
      # directory, reads the test namespace from kubernetes/manifest.yaml, and
      # writes workspaceInfo.yaml into the current directory.
      - |
        repo_url=$(git config --get remote.origin.url)
        branch_name=$(git rev-parse --abbrev-ref HEAD)
        codebundle=$(basename "$(dirname "$PWD")")
        namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N)
        cat <<EOF > workspaceInfo.yaml
        workspaceName: "$RW_WORKSPACE"
        workspaceOwnerEmail: authors@runwhen.com
        defaultLocation: location-01
        defaultLOD: none
        cloudConfig:
          kubernetes:
            kubeconfigFile: /shared/kubeconfig
            namespaceLODs:
              $namespace: detailed
            namespaces:
              - $namespace
        codeCollections:
        - repoURL: "$repo_url"
          branch: "$branch_name"
          codeBundles: ["$codebundle"]
        custom:
          kubeconfig_secret_name: "kubeconfig"
          kubernetes_distribution_binary: kubectl
        EOF
    silent: true

  run-rwl-discovery:
    desc: "Run RunWhen Local Discovery on test infrastructure"
    cmds:
      # Placeholder: only prints guidance, no discovery is started here.
      - |
        echo "Discovery requires docker and a valid kubeconfig; see README in parent CodeBundle."
    silent: true

  clean-rwl-discovery:
    desc: "Check and clean up RunWhen Local discovery output"
    cmds:
      - |
        rm -rf output
        rm -f workspaceInfo.yaml
    silent: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Test fixture: a dedicated namespace plus one Service whose name contains
# "pgbouncer".
---
apiVersion: v1
kind: Namespace
metadata:
  name: test-pgbouncer-hc
---
apiVersion: v1
kind: Service
metadata:
  name: pgbouncer-proxy
  namespace: test-pgbouncer-hc
  labels:
    app.kubernetes.io/name: pgbouncer
spec:
  # Routes to pods carrying the same app.kubernetes.io/name label.
  selector:
    app.kubernetes.io/name: pgbouncer
  ports:
    - port: 5432
      targetPort: 5432
85 changes: 85 additions & 0 deletions codebundles/k8s-pgbouncer-prometheus-health/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Kubernetes PgBouncer Prometheus Health

This CodeBundle evaluates PgBouncer connection pool health using Prometheus metrics from the [prometheus-community/pgbouncer_exporter](https://github.com/prometheus-community/pgbouncer_exporter) (or compatible scrapes), with optional kubectl validation of `pool_mode` when a kubeconfig and namespace are supplied.

## Overview

- **Exporter and process availability**: detects `pgbouncer_up` failures per scrape target.
- **Saturation and waits**: compares active and waiting clients to `pgbouncer_config_max_client_connections`, flags wait queues, max wait time, and server-side balance patterns.
- **Configuration drift**: optional pool mode validation via metric labels or `pgbouncer.ini` read through kubectl.
- **Distribution and outliers**: ranks per-database load, highlights pod-level skew, and estimates capacity when app and pooler replica inputs are provided.
- **Growth**: uses Prometheus `rate()` over a configurable window to spot sustained connection growth.

## Configuration

### Required variables

- `PROMETHEUS_URL`: Base URL for the Prometheus or Thanos querier API (for example `https://prometheus.example/api/v1` or `https://thanos.example/api/v1`).
- `PGBOUNCER_JOB_LABEL`: Prometheus label matchers inside `{...}` for the exporter scrape, for example `job="pgbouncer-exporter"`.
- `EXPECTED_POOL_MODE`: Expected pool mode string: `transaction`, `session`, or `statement`.

### Optional variables

- `CONTEXT`: Kubernetes context for kubectl when kubeconfig is configured.
- `METRIC_NAMESPACE_FILTER`: Extra label matchers (comma-separated) such as `kubernetes_namespace="my-namespace"` to narrow series.
- `CLIENT_SATURATION_PERCENT_THRESHOLD`: Percent of `max_client_conn` above which saturation is raised (default `80`).
- `MAX_WAIT_SECONDS_THRESHOLD`: Maximum acceptable `pgbouncer_pools_client_maxwait_seconds` (default `1`).
- `CLIENT_WAITING_THRESHOLD`: Raise when the sum of waiting connections is greater than this value (default `0`).
- `DATABASE_HOTSPOT_PERCENT_THRESHOLD`: Flag databases whose share of connections exceeds this percent of the total (default `50`).
- `POD_OUTLIER_RATIO`: Flag pods whose per-pod client active sum exceeds the fleet mean times this ratio (default `2.0`).
- `GROWTH_RATE_WINDOW_MINUTES`: Lookback for Prometheus range queries used in growth detection (default `15`).
- `CONNECTION_GROWTH_RATE_THRESHOLD`: Average `rate()` of client active connections (per second) that triggers growth issues (default `0.1`).
- `KUBERNETES_DISTRIBUTION_BINARY`: CLI binary for kubectl (default `kubectl`).
- `PGBOUNCER_NAMESPACE`: Namespace used to locate a pod for optional pool mode inspection (often the same as the PgBouncer workload namespace).
- `PGBOUNCER_POD_LABEL_SELECTOR`: Label selector for the pod that mounts `pgbouncer.ini` (default `app.kubernetes.io/name=pgbouncer-exporter`; change to your PgBouncer pod labels if the exporter runs as a sidecar elsewhere).
- `PGBOUNCER_PGBOUNCER_CONTAINER`: Optional container name for `kubectl exec` when the pod is multi-container.
- `APP_REPLICAS`: Application replica count for the capacity SLI (optional).
- `APP_DB_POOL_SIZE`: Per-replica application DB pool size for the capacity SLI (optional).
- `PGBOUNCER_REPLICAS`: PgBouncer replica count for the capacity SLI (optional).

### Secrets

- `prometheus_bearer_token`: Bearer token for authenticated Prometheus read APIs when required (plain text or OAuth token).
- `kubeconfig`: Standard kubeconfig used for optional kubectl-based pool mode checks.

## Tasks overview

### Check PgBouncer Exporter and Process Availability

Fails when `pgbouncer_up` is `0` for any filtered target or when no series are returned.

### Check Client Connection Saturation vs max_client_conn

Compares `(sum(client_active) + sum(client_waiting)) / max(max_client_conn)` to the percent threshold.

### Check Client Wait Queue Buildup

Raises when the sum of `pgbouncer_pools_client_waiting_connections` is above `CLIENT_WAITING_THRESHOLD`.

### Check Max Client Wait Time Spikes

Compares `max(pgbouncer_pools_client_maxwait_seconds)` to `MAX_WAIT_SECONDS_THRESHOLD`.

### Check Server Pool Balance vs Client Waits

Flags clients waiting while server idle connections exist, and clients waiting alongside elevated `server_used` counts.

### Validate Pool Mode from Metrics or Config

Prefers a `pool_mode` label on metrics if present; otherwise attempts to read `pool_mode` from common `pgbouncer.ini` paths via kubectl when `PGBOUNCER_NAMESPACE` and kubeconfig are set.

### Analyze Per-Database Connection Distribution

Uses `pgbouncer_databases_current_connections` when available, otherwise `pgbouncer_pools_client_active_connections` by `database` label, to find hotspots.

### Aggregate Health Across PgBouncer Pods and Flag Outliers

Compares per-pod `sum(client_active)` against the fleet mean using `POD_OUTLIER_RATIO`.

### Detect Abnormal Client Connection Growth Rate

Runs a range query on `rate(pgbouncer_pools_client_active_connections[5m])` over the `GROWTH_RATE_WINDOW_MINUTES` lookback and raises an issue when the average rate exceeds `CONNECTION_GROWTH_RATE_THRESHOLD`.

### Compute Capacity Planning SLI (App Demand vs PgBouncer Capacity)

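When `APP_REPLICAS`, `APP_DB_POOL_SIZE`, and `PGBOUNCER_REPLICAS` are all set, compares `APP_REPLICAS * APP_DB_POOL_SIZE` to `max(pgbouncer_config_max_client_connections) * PGBOUNCER_REPLICAS`. A demand/capacity ratio at or above `1.0` raises a capacity issue, and a ratio from `0.85` up to (but not including) `1.0` raises an approaching-limit issue. The task is skipped when any of the three inputs is unset.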
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Capacity planning SLI for PgBouncer.
#
# Compares estimated application demand (APP_REPLICAS * APP_DB_POOL_SIZE) with
# pooler capacity (max(pgbouncer_config_max_client_connections) *
# PGBOUNCER_REPLICAS) and writes a JSON array of issues to
# check_capacity_sli_output.json in the current directory.
#
# Required env: PROMETHEUS_URL, PGBOUNCER_JOB_LABEL.
# Optional env: APP_REPLICAS, APP_DB_POOL_SIZE, PGBOUNCER_REPLICAS. The check
#               is skipped (empty issue list) unless all three are set.
# Exit status:  0 on every handled path, including Prometheus query failures,
#               which are reported as an issue instead.
set -euo pipefail
# NOTE(review): xtrace also covers the sourced helper functions; confirm they
# never place credentials on a traced command line.
set -x

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib/prometheus-common.sh
source "${SCRIPT_DIR}/lib/prometheus-common.sh"

: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}"
: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}"

OUTPUT_FILE="check_capacity_sli_output.json"
issues_json='[]'

# Append one issue object to issues_json.
# Args: title, details, severity (numeric string), next_steps.
_capacity_add_issue() {
    issues_json=$(echo "$issues_json" | jq \
        --arg title "$1" \
        --arg details "$2" \
        --arg severity "$3" \
        --arg next_steps "$4" \
        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
}

# Persist issues_json, echo it for the task report, and stop successfully.
_capacity_finish() {
    echo "$issues_json" > "$OUTPUT_FILE"
    jq '.' "$OUTPUT_FILE"
    exit 0
}

if [ -z "${APP_REPLICAS:-}" ] || [ -z "${APP_DB_POOL_SIZE:-}" ] || [ -z "${PGBOUNCER_REPLICAS:-}" ]; then
    echo "Capacity SLI skipped (set APP_REPLICAS, APP_DB_POOL_SIZE, PGBOUNCER_REPLICAS)."
    _capacity_finish
fi

# wrap_metric / prometheus_instant_query / prometheus_query_status_ok are
# presumably provided by lib/prometheus-common.sh (not visible from here).
wm=$(wrap_metric pgbouncer_config_max_client_connections)
q="max(${wm})"
echo "Instant query: $q"

raw=$(prometheus_instant_query "$q" || true)
if ! prometheus_query_status_ok "${raw:-}" 2>/dev/null; then
    _capacity_add_issue \
        "Prometheus Error for Capacity SLI" \
        "Could not read max client connections from metrics." \
        "2" \
        "Confirm pgbouncer_config_max_client_connections is scraped."
    _capacity_finish
fi

# An empty result set falls back to "0", which is handled as zero capacity.
maxc=$(echo "$raw" | jq -r '.data.result[0].value[1] // "0"')
demand=$(awk -v r="$APP_REPLICAS" -v p="$APP_DB_POOL_SIZE" 'BEGIN {printf "%.0f", r * p}')
cap=$(awk -v m="$maxc" -v pr="$PGBOUNCER_REPLICAS" 'BEGIN {printf "%.0f", m * pr}')

if [ "${cap:-0}" -le 0 ]; then
    # Nothing to divide by: no metric sample, or replicas resolved to zero.
    echo "Capacity SLI skipped: computed PgBouncer capacity is ${cap:-0} (no pgbouncer_config_max_client_connections sample, or PGBOUNCER_REPLICAS is 0)."
    _capacity_finish
fi

# Rounded ratio is for display only.
ratio=$(awk -v d="$demand" -v c="$cap" 'BEGIN {printf "%.4f", d / c}')

# Classify on the unrounded values so a ratio such as 0.99996 is not rounded
# up to 1.0000 and misreported as exceeding capacity.
band=$(awk -v d="$demand" -v c="$cap" 'BEGIN {
    d += 0; c += 0
    if (d >= c) print "exceeded"
    else if (d / c >= 0.85) print "approaching"
    else print "ok"
}')

case "$band" in
    exceeded)
        _capacity_add_issue \
            "Capacity SLI: App Demand Meets or Exceeds PgBouncer Capacity" \
            "Estimated demand is ${demand} (APP_REPLICAS * APP_DB_POOL_SIZE) vs capacity ${cap} (max_client_conn * PGBOUNCER_REPLICAS). Ratio ${ratio}." \
            "2" \
            "Increase PgBouncer replicas or max_client_conn, reduce per-app pool size or app replicas, or add pooler shards."
        ;;
    approaching)
        # Lower numbers are treated as more severe here, so the early warning
        # ranks below the actual breach (2).
        # NOTE(review): confirm the platform's severity scale direction.
        _capacity_add_issue \
            "Capacity SLI: Approaching PgBouncer Limit" \
            "Demand/capacity ratio is ${ratio} (warning band >= 0.85). Demand ${demand}, capacity ${cap}." \
            "3" \
            "Plan capacity increases before saturation causes client waits and errors."
        ;;
esac

_capacity_finish
Loading