diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..39863bb4c --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +GRAFANA_ADMIN_USER=replace-with-local-admin-user +GRAFANA_ADMIN_PASSWORD=replace-with-a-strong-local-password diff --git a/.github/workflows/lab8-external-monitor.yml b/.github/workflows/lab8-external-monitor.yml new file mode 100644 index 000000000..9b9d3506a --- /dev/null +++ b/.github/workflows/lab8-external-monitor.yml @@ -0,0 +1,277 @@ +name: Lab 8 External Monitoring Window + +on: + workflow_dispatch: + push: + branches: + - feature/lab8 + paths: + - ".github/workflows/lab8-external-monitor.yml" + - "app/*.go" + - "monitoring/grafana/dashboards/golden-signals.json" + +permissions: + contents: read + +jobs: + external-monitoring: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Create runtime-only Grafana credentials + shell: bash + run: | + echo "GRAFANA_ADMIN_USER=lab8admin" > .env + echo "GRAFANA_ADMIN_PASSWORD=$(openssl rand -hex 24)" >> .env + chmod 600 .env + + - name: Start Lab 8 stack + shell: bash + run: | + docker compose up -d --build + + for attempt in $(seq 1 120); do + if curl -fsS http://127.0.0.1:8080/health >/dev/null && + curl -fsS http://127.0.0.1:9090/-/ready >/dev/null && + curl -fsS http://127.0.0.1:3000/api/health >/dev/null; then + echo "QuickNotes, Prometheus, and Grafana are ready." + docker compose ps + exit 0 + fi + + echo "Waiting for the Lab 8 stack: attempt ${attempt}/120" + sleep 2 + done + + echo "The Compose stack did not become ready." 
+ docker compose ps + docker compose logs --no-color + exit 1 + + - name: Create tunnel and maintain Checkly monitoring window + shell: bash + run: | + curl -fsSL \ + --retry 5 \ + --retry-delay 2 \ + --retry-all-errors \ + https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \ + -o /tmp/cloudflared + + chmod +x /tmp/cloudflared + /tmp/cloudflared --version + + /tmp/cloudflared tunnel \ + --no-autoupdate \ + --protocol http2 \ + --url http://127.0.0.1:8080 \ + > /tmp/cloudflared.log 2>&1 & + + CLOUDFLARED_PID=$! + PUBLIC_URL="" + + for attempt in $(seq 1 120); do + PUBLIC_URL="$( + grep -oE 'https://[a-z0-9-]+\.trycloudflare\.com' \ + /tmp/cloudflared.log | + tail -n 1 || true + )" + + if [ -n "$PUBLIC_URL" ]; then + break + fi + + if ! kill -0 "$CLOUDFLARED_PID" 2>/dev/null; then + echo "Cloudflare Tunnel exited before creating a URL." + cat /tmp/cloudflared.log + exit 1 + fi + + echo "Waiting for Cloudflare URL: attempt ${attempt}/120" + sleep 2 + done + + if [ -z "$PUBLIC_URL" ]; then + echo "No public Cloudflare URL was created." + cat /tmp/cloudflared.log + exit 1 + fi + + CHECKLY_URL="${PUBLIC_URL}/health" + + echo + echo "============================================================" + echo "CHECKLY_URL=${CHECKLY_URL}" + echo "============================================================" + echo + + echo "::notice title=Lab 8 Checkly URL::${CHECKLY_URL}" + + { + echo "## Lab 8 Checkly URL" + echo + echo "\`${CHECKLY_URL}\`" + echo + echo "The public tunnel is scheduled to remain active for 45 minutes." + } >> "$GITHUB_STEP_SUMMARY" + + echo "Waiting for DNS propagation and HTTP 200 response..." + HEALTH_STATUS="" + + for attempt in $(seq 1 120); do + if ! kill -0 "$CLOUDFLARED_PID" 2>/dev/null; then + echo "Cloudflare Tunnel stopped during endpoint readiness checks." 
+ cat /tmp/cloudflared.log + exit 1 + fi + + HEALTH_STATUS="$( + curl -sS \ + --connect-timeout 5 \ + --max-time 15 \ + -o /tmp/public-health-body.txt \ + -w '%{http_code}' \ + "$CHECKLY_URL" 2>/dev/null || true + )" + + if [ "$HEALTH_STATUS" = "200" ]; then + echo "Public endpoint is ready." + cat /tmp/public-health-body.txt + echo + break + fi + + echo "Readiness attempt=${attempt}/120 status=${HEALTH_STATUS:-unresolved}" + sleep 5 + done + + if [ "$HEALTH_STATUS" != "200" ]; then + echo "The public endpoint did not become reachable." + cat /tmp/cloudflared.log + exit 1 + fi + + echo + echo "=== 45-MINUTE CHECKLY MONITORING WINDOW ===" + + for minute in $(seq 1 45); do + if ! kill -0 "$CLOUDFLARED_PID" 2>/dev/null; then + echo "Cloudflare Tunnel stopped at minute ${minute}." + cat /tmp/cloudflared.log + exit 1 + fi + + RESULT="$( + curl -sS \ + --connect-timeout 5 \ + --max-time 15 \ + -o /tmp/window-health-body.txt \ + -w 'status=%{http_code} time=%{time_total}s' \ + "$CHECKLY_URL" 2>/dev/null || true + )" + + echo "minute=$(printf '%02d' "$minute") ${RESULT:-request_failed}" + sleep 60 + done + + echo + echo "=== FINAL 30-MINUTE PROMETHEUS COMPARISON ===" + + WINDOW_END_UTC="$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + WINDOW_START_UTC="$(date -u -d '30 minutes ago' '+%Y-%m-%dT%H:%M:%SZ')" + + PROM_P50_MS="$( + curl -fsS --get --data-urlencode 'query=1000 * histogram_quantile(0.50, sum by (le) (rate(quicknotes_http_request_duration_seconds_bucket[30m])))' http://127.0.0.1:9090/api/v1/query | + python3 -c 'import json,sys; d=json.load(sys.stdin); r=d["data"]["result"]; print(r[0]["value"][1] if r else "NaN")' + )" + + PROM_P95_MS="$( + curl -fsS --get --data-urlencode 'query=1000 * histogram_quantile(0.95, sum by (le) (rate(quicknotes_http_request_duration_seconds_bucket[30m])))' http://127.0.0.1:9090/api/v1/query | + python3 -c 'import json,sys; d=json.load(sys.stdin); r=d["data"]["result"]; print(r[0]["value"][1] if r else "NaN")' + )" + + PROM_ERRORS="$( + curl -fsS --get --data-urlencode 
'query=sum(increase(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[30m]))' http://127.0.0.1:9090/api/v1/query | + python3 -c 'import json,sys; d=json.load(sys.stdin); r=d["data"]["result"]; print(r[0]["value"][1] if r else "0")' + )" + + PROM_REQUESTS="$( + curl -fsS --get --data-urlencode 'query=sum(increase(quicknotes_http_requests_total[30m]))' http://127.0.0.1:9090/api/v1/query | + python3 -c 'import json,sys; d=json.load(sys.stdin); r=d["data"]["result"]; print(r[0]["value"][1] if r else "0")' + )" + + echo "window_start_utc=${WINDOW_START_UTC}" + echo "window_end_utc=${WINDOW_END_UTC}" + echo "prometheus_p50_ms=${PROM_P50_MS}" + echo "prometheus_p95_ms=${PROM_P95_MS}" + echo "prometheus_errors=${PROM_ERRORS}" + echo "prometheus_requests=${PROM_REQUESTS}" + + { + echo + echo "## Final 30-minute Prometheus comparison" + echo + echo "| Measurement | Value |" + echo "|---|---:|" + echo "| Window start UTC | ${WINDOW_START_UTC} |" + echo "| Window end UTC | ${WINDOW_END_UTC} |" + echo "| Internal P50 latency | ${PROM_P50_MS} ms |" + echo "| Internal P95 latency | ${PROM_P95_MS} ms |" + echo "| HTTP 4xx/5xx errors | ${PROM_ERRORS} |" + echo "| HTTP requests | ${PROM_REQUESTS} |" + } >> "$GITHUB_STEP_SUMMARY" + + echo + echo "=== FINAL PUBLIC HEALTH CHECK ===" + + FINAL_STATUS="$( + curl -sS \ + --connect-timeout 5 \ + --max-time 15 \ + -o /tmp/final-health-body.txt \ + -w '%{http_code}' \ + "$CHECKLY_URL" 2>/dev/null || true + )" + + echo "status=${FINAL_STATUS:-request_failed}" + + if [ "$FINAL_STATUS" = "200" ]; then + cat /tmp/final-health-body.txt + echo + else + echo "Final public health check was not HTTP 200." 
+ fi + + echo + echo "=== PROMETHEUS QUICKNOTES TARGET ===" + + curl -fsS \ + --get \ + --data-urlencode 'query=up{job="quicknotes"}' \ + http://127.0.0.1:9090/api/v1/query | + python3 -m json.tool + + - name: Show failure diagnostics + if: failure() + shell: bash + run: | + echo "=== CLOUDFLARED LOG ===" + cat /tmp/cloudflared.log 2>/dev/null || true + + echo + echo "=== COMPOSE STATUS ===" + docker compose ps || true + + echo + echo "=== COMPOSE LOGS ===" + docker compose logs --no-color || true + + - name: Clean up + if: always() + shell: bash + run: | + docker compose down -v diff --git a/app/Dockerfile b/app/Dockerfile new file mode 100644 index 000000000..f971e63ed --- /dev/null +++ b/app/Dockerfile @@ -0,0 +1,75 @@ +# syntax=docker/dockerfile:1.7 + +FROM golang:1.24.13-alpine AS builder + +WORKDIR /src + +# Copy dependency metadata first to preserve the module-cache layer. +COPY go.mod ./ +RUN go mod download + +# Copy application source after dependencies. +COPY *.go ./ + +RUN mkdir -p /out/data && \ + CGO_ENABLED=0 GOOS=linux go test ./... && \ + CGO_ENABLED=0 GOOS=linux go build \ + -trimpath \ + -ldflags="-s -w" \ + -o /out/quicknotes \ + . + +# Build a static healthcheck executable because distroless has no shell, +# curl, wget, or package manager. 
+RUN <<'BUILD_HEALTHCHECK' +cat > /tmp/healthcheck.go <<'GO' +package main + +import ( + "net/http" + "os" + "time" +) + +func main() { + client := http.Client{ + Timeout: 2 * time.Second, + } + + response, err := client.Get("http://127.0.0.1:8080/health") + if err != nil { + os.Exit(1) + } + defer response.Body.Close() + + if response.StatusCode != http.StatusOK { + os.Exit(1) + } +} +GO + +CGO_ENABLED=0 GOOS=linux go build \ + -trimpath \ + -ldflags="-s -w" \ + -o /out/healthcheck \ + /tmp/healthcheck.go +BUILD_HEALTHCHECK + +FROM gcr.io/distroless/static-debian12:nonroot AS runtime + +WORKDIR / + +COPY --from=builder --chown=65532:65532 /out/quicknotes /quicknotes +COPY --from=builder --chown=65532:65532 /out/healthcheck /healthcheck +COPY --from=builder --chown=65532:65532 /out/data /data +COPY --chown=65532:65532 seed.json /seed.json + +ENV ADDR=:8080 \ + DATA_PATH=/data/notes.json \ + SEED_PATH=/seed.json + +EXPOSE 8080 + +USER 65532:65532 + +ENTRYPOINT ["/quicknotes"] diff --git a/app/handlers.go b/app/handlers.go index c534979c5..7659426e2 100644 --- a/app/handlers.go +++ b/app/handlers.go @@ -7,14 +7,16 @@ import ( "sort" "strconv" "sync/atomic" + "time" ) type Server struct { - store *Store - notesCreated atomic.Uint64 - notesDeleted atomic.Uint64 - requestsTotal atomic.Uint64 - requestsByCode map[int]*atomic.Uint64 + store *Store + notesCreated atomic.Uint64 + notesDeleted atomic.Uint64 + requestsTotal atomic.Uint64 + requestsByCode map[int]*atomic.Uint64 + requestDuration *requestDurationMetrics } func NewServer(store *Store) *Server { @@ -23,7 +25,11 @@ func NewServer(store *Store) *Server { for _, c := range codes { by[c] = new(atomic.Uint64) } - return &Server{store: store, requestsByCode: by} + return &Server{ + store: store, + requestsByCode: by, + requestDuration: newRequestDurationMetrics(), + } } func (s *Server) Routes() *http.ServeMux { @@ -49,12 +55,19 @@ func (sw *statusWriter) WriteHeader(code int) { func (s *Server) wrap(h 
http.HandlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { + startedAt := time.Now() sw := &statusWriter{ResponseWriter: w, code: 200} + h(sw, r) + s.requestsTotal.Add(1) if c, ok := s.requestsByCode[sw.code]; ok { c.Add(1) } + + if r.URL.Path != "/metrics" { + s.requestDuration.observe(time.Since(startedAt)) + } } } @@ -90,6 +103,7 @@ func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) { for _, code := range codes { _, _ = w.Write([]byte(byCodeName + `{code="` + strconv.Itoa(code) + `"} ` + strconv.FormatUint(s.requestsByCode[code].Load(), 10) + "\n")) } + s.requestDuration.writePrometheus(w) } func (s *Server) handleListNotes(w http.ResponseWriter, r *http.Request) { diff --git a/app/request_duration_metrics.go b/app/request_duration_metrics.go new file mode 100644 index 000000000..b946866ba --- /dev/null +++ b/app/request_duration_metrics.go @@ -0,0 +1,112 @@ +package main + +import ( + "fmt" + "io" + "strconv" + "sync/atomic" + "time" +) + +const requestDurationMetricName = "quicknotes_http_request_duration_seconds" + +var requestDurationBuckets = []float64{ + 0.00005, + 0.0001, + 0.00025, + 0.0005, + 0.001, + 0.0025, + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 1, + 2, + 5, +} + +type requestDurationMetrics struct { + bucketCounts []atomic.Uint64 + count atomic.Uint64 + sumNanoseconds atomic.Uint64 +} + +func newRequestDurationMetrics() *requestDurationMetrics { + return &requestDurationMetrics{ + bucketCounts: make( + []atomic.Uint64, + len(requestDurationBuckets), + ), + } +} + +func (m *requestDurationMetrics) observe(duration time.Duration) { + seconds := duration.Seconds() + + m.count.Add(1) + m.sumNanoseconds.Add(uint64(duration.Nanoseconds())) + + for index, upperBound := range requestDurationBuckets { + if seconds <= upperBound { + m.bucketCounts[index].Add(1) + } + } +} + +func (m *requestDurationMetrics) writePrometheus(writer io.Writer) { + _, _ = fmt.Fprintf( + writer, + "# HELP %s 
HTTP request duration in seconds.\n", + requestDurationMetricName, + ) + _, _ = fmt.Fprintf( + writer, + "# TYPE %s histogram\n", + requestDurationMetricName, + ) + + for index, upperBound := range requestDurationBuckets { + label := strconv.FormatFloat( + upperBound, + 'g', + -1, + 64, + ) + + _, _ = fmt.Fprintf( + writer, + "%s_bucket{le=\"%s\"} %d\n", + requestDurationMetricName, + label, + m.bucketCounts[index].Load(), + ) + } + + count := m.count.Load() + sumSeconds := + float64(m.sumNanoseconds.Load()) / + float64(time.Second) + + _, _ = fmt.Fprintf( + writer, + "%s_bucket{le=\"+Inf\"} %d\n", + requestDurationMetricName, + count, + ) + _, _ = fmt.Fprintf( + writer, + "%s_sum %s\n", + requestDurationMetricName, + strconv.FormatFloat(sumSeconds, 'g', -1, 64), + ) + _, _ = fmt.Fprintf( + writer, + "%s_count %d\n", + requestDurationMetricName, + count, + ) +} diff --git a/app/request_duration_metrics_test.go b/app/request_duration_metrics_test.go new file mode 100644 index 000000000..084e26ef1 --- /dev/null +++ b/app/request_duration_metrics_test.go @@ -0,0 +1,30 @@ +package main + +import ( + "net/http" + "strings" + "testing" +) + +func TestMetrics_ExposesRequestDurationHistogram(t *testing.T) { + srv := newTestServer(t) + + _ = do(t, srv, http.MethodGet, "/health", nil) + recorder := do(t, srv, http.MethodGet, "/metrics", nil) + + if recorder.Code != http.StatusOK { + t.Fatalf("metrics status: %d", recorder.Code) + } + + body := recorder.Body.String() + + for _, expected := range []string{ + "# TYPE quicknotes_http_request_duration_seconds histogram", + `quicknotes_http_request_duration_seconds_bucket{le="+Inf"} 1`, + "quicknotes_http_request_duration_seconds_count 1", + } { + if !strings.Contains(body, expected) { + t.Errorf("metrics missing %q", expected) + } + } +} diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 000000000..47e685698 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,60 @@ +services: + quicknotes: + image: quicknotes:lab8 + 
build: + context: ./app + dockerfile: Dockerfile + ports: + - "8080:8080" + environment: + ADDR: ":8080" + DATA_PATH: "/data/notes.json" + SEED_PATH: "/seed.json" + volumes: + - quicknotes-data:/data + healthcheck: + test: ["CMD", "/healthcheck"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 3s + restart: unless-stopped + user: "65532:65532" + cap_drop: + - ALL + read_only: true + tmpfs: + - /tmp:rw,noexec,nosuid,nodev,size=16m + security_opt: + - no-new-privileges:true + + prometheus: + image: prom/prometheus:v3.12.0 + ports: + - "9090:9090" + volumes: + - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + depends_on: + quicknotes: + condition: service_healthy + restart: unless-stopped + + grafana: + image: grafana/grafana:13.0.1 + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + depends_on: + prometheus: + condition: service_started + restart: unless-stopped + +volumes: + quicknotes-data: diff --git a/docs/runbook/high-error-rate.md b/docs/runbook/high-error-rate.md new file mode 100644 index 000000000..e969203f1 --- /dev/null +++ b/docs/runbook/high-error-rate.md @@ -0,0 +1,44 @@ +# QuickNotes High Error Rate Runbook + +## What this alert means + +More than 5% of QuickNotes requests have returned HTTP 4xx or 5xx responses continuously for at least five minutes, indicating sustained user-visible failures. + +## Triage steps + +1. Confirm that the alert is still firing and inspect the current error ratio in Prometheus or the Grafana Golden Signals dashboard. + +2. Check whether all containers are running: + + docker compose ps + +3. 
Inspect recent QuickNotes logs: + + docker compose logs --tail=200 quicknotes + +4. Identify which response codes dominate using PromQL: + + sum by (code) ( + rate(quicknotes_http_responses_by_code_total[5m]) + ) + +5. Test the service directly: + + curl -i http://localhost:8080/health + curl -i http://localhost:8080/notes + +## Mitigations + +1. Stop or rate-limit any client generating malformed or excessive requests. + +2. Restart QuickNotes if it is unhealthy or stuck: + + docker compose restart quicknotes + +3. Roll back the latest application or configuration change if errors began after deployment. + +4. Temporarily reduce non-essential traffic while preserving health checks and normal reads. + +## Post-incident + +After recovery, preserve logs and monitoring evidence, identify the root cause, and complete a blameless postmortem using the Lecture 1 postmortem structure. Assign an owner and deadline to every preventive action. diff --git a/monitoring/grafana/dashboards/golden-signals.json b/monitoring/grafana/dashboards/golden-signals.json new file mode 100644 index 000000000..ab2036c1d --- /dev/null +++ b/monitoring/grafana/dashboards/golden-signals.json @@ -0,0 +1,211 @@ +{ + "annotations": { + "list": [] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Internal QuickNotes HTTP request latency calculated from the Prometheus duration histogram.", + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) 
(rate(quicknotes_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "P50 latency", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(quicknotes_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "P95 latency", + "range": true, + "refId": "B" + } + ], + "title": "Latency — P50 and P95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Average QuickNotes request rate over five minutes.", + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "rate(quicknotes_http_requests_total[5m])", + "legendFormat": "requests per second", + "range": true, + "refId": "A" + } + ], + "title": "Traffic — request rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Percentage of responses returning a 4xx or 5xx status over five minutes.", + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "100 * sum(rate(quicknotes_http_responses_by_code_total{code=~\"4..|5..\"}[5m])) / clamp_min(sum(rate(quicknotes_http_requests_total[5m])), 0.001)", + "legendFormat": "error percentage", + "range": true, + "refId": "A" + } + ], + "title": "Errors — 4xx and 5xx ratio", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current number of stored notes, used as the available saturation signal.", + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "quicknotes_notes_total", + "legendFormat": "stored notes", + "range": true, + "refId": "A" + } + ], + "title": "Saturation — stored notes", + "type": "timeseries" + } + ], + "refresh": "15s", + "schemaVersion": 41, + "tags": [ + "quicknotes", + "golden-signals", + "lab8" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "QuickNotes Golden Signals", + "uid": "quicknotes-golden-signals", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboard.yml b/monitoring/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 000000000..2d9079e84 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: QuickNotes + orgId: 1 + folder: QuickNotes + type: file + disableDeletion: false + editable: false + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 000000000..d6f26fa15 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + jsonData: + 
timeInterval: 15s diff --git a/monitoring/prometheus/alerts.yml b/monitoring/prometheus/alerts.yml new file mode 100644 index 000000000..52ef5b835 --- /dev/null +++ b/monitoring/prometheus/alerts.yml @@ -0,0 +1,17 @@ +groups: + - name: quicknotes.rules + rules: + - alert: QuickNotesHighErrorRate + expr: | + ( + sum(rate(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[1m])) + / + clamp_min(sum(rate(quicknotes_http_requests_total[1m])), 0.001) + ) > 0.05 + for: 5m + labels: + severity: page + annotations: + summary: QuickNotes error ratio is above 5% + description: More than 5% of QuickNotes requests have returned 4xx or 5xx responses for at least five minutes. + runbook_url: https://github.com/tivdzualubem/DevOps-Intro/blob/feature/lab8/docs/runbook/high-error-rate.md diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 000000000..783514dd9 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + +rule_files: + - /etc/prometheus/alerts.yml + +scrape_configs: + - job_name: quicknotes + metrics_path: /metrics + static_configs: + - targets: + - quicknotes:8080 diff --git a/monitoring/scripts/generate-high-error-traffic.sh b/monitoring/scripts/generate-high-error-traffic.sh new file mode 100755 index 000000000..d5b6124ce --- /dev/null +++ b/monitoring/scripts/generate-high-error-traffic.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -u + +while true; do + curl -sS -o /dev/null http://localhost:8080/notes + + curl -sS -o /dev/null \ + -X POST \ + -H "Content-Type: application/json" \ + --data '{"title":' \ + http://localhost:8080/notes + + sleep 1 +done diff --git a/submissions/lab8.md b/submissions/lab8.md new file mode 100644 index 000000000..b420e90ec --- /dev/null +++ b/submissions/lab8.md @@ -0,0 +1,422 @@ +# Lab 8 — SRE and Monitoring + +## Overview + +This submission extends the Lab 6 QuickNotes Compose stack with Prometheus, Grafana, a 
provisioned four-panel Golden Signals dashboard, a sustained high-error-rate alert, an operational runbook, and an external Checkly synthetic monitor. + +The completed stack contains: + +- QuickNotes on port `8080` +- Prometheus `v3.12.0` on port `9090` +- Grafana `v13.0.1` on port `3000` +- A provisioned Prometheus data source +- A provisioned QuickNotes Golden Signals dashboard +- A Prometheus alert for a sustained error ratio above 5% +- A Checkly API check from Frankfurt and Singapore +- A QuickNotes request-duration histogram for real internal P50 and P95 latency + +--- + +# Task 1 — Prometheus and Grafana + +## Configuration Files + +The implementation is stored in the following files: + +- [Compose stack](../compose.yaml) +- [Prometheus configuration](../monitoring/prometheus/prometheus.yml) +- [Grafana data-source provisioning](../monitoring/grafana/provisioning/datasources/datasource.yml) +- [Grafana dashboard-provider provisioning](../monitoring/grafana/provisioning/dashboards/dashboard.yml) +- [Golden Signals dashboard JSON](../monitoring/grafana/dashboards/golden-signals.json) +- [Request-duration histogram implementation](../app/request_duration_metrics.go) +- [Request-duration histogram tests](../app/request_duration_metrics_test.go) +- [Environment-variable example](../.env.example) + +The actual local `.env` file contains generated Grafana credentials, is excluded from Git, and is not committed. + +## Prometheus Configuration + +Prometheus uses a global scrape interval of 15 seconds and one scrape job for QuickNotes. The target uses the Compose service name and internal application port: + +```yaml +global: + scrape_interval: 15s + +rule_files: + - /etc/prometheus/alerts.yml + +scrape_configs: + - job_name: quicknotes + static_configs: + - targets: + - quicknotes:8080 +``` + +Using `quicknotes:8080` allows Docker Compose DNS to resolve the application inside the shared Compose network. 
+ +## Prometheus Target Verification + +The target API was queried with: + +```bash +curl -s http://localhost:9090/api/v1/targets | +jq -r '.data.activeTargets[] | +select(.labels.job == "quicknotes") | +.health' +``` + +Output: + +```text +up +``` + +Prometheus target evidence: + +![Prometheus target showing QuickNotes as UP](screenshots/lab8/01-prometheus-target-up.png) + +## Grafana Provisioning + +Grafana automatically loads: + +1. A default Prometheus data source using `http://prometheus:9090`. +2. The dashboard-provider configuration. +3. The Golden Signals dashboard JSON from the mounted dashboard directory. + +No manual dashboard creation is required when the stack is recreated. + +## Golden Signals Dashboard + +The dashboard contains exactly four panels: + +| Golden signal | Panel query or metric | Purpose | +|---|---|---| +| Latency | `histogram_quantile()` over `quicknotes_http_request_duration_seconds_bucket` | Internal HTTP request P50 and P95 latency | +| Traffic | `rate(quicknotes_http_requests_total[5m])` | Requests processed per second | +| Errors | Ratio of 4xx and 5xx responses to all responses | Percentage of requests that fail | +| Saturation | `quicknotes_notes_total` | Current number of stored notes | + +QuickNotes was extended with the Prometheus histogram `quicknotes_http_request_duration_seconds`. The Latency panel now calculates real P50 and P95 values using `histogram_quantile()`. + +Mixed successful and failing traffic was used to validate the Traffic and Errors panels. After adding the request-duration histogram, additional successful requests were generated to populate the P50 and P95 latency series. + +Dashboard evidence: + +![Provisioned QuickNotes Golden Signals dashboard](screenshots/lab8/02-grafana-golden-signals-dashboard.png) + +## Task 1 Design Questions + +### a) Pull versus push + +Prometheus uses a pull model, so Prometheus must be able to initiate a connection to the QuickNotes `/metrics` endpoint. 
QuickNotes does not need to know the Prometheus address or push metrics to it. + +If Prometheus cannot reach QuickNotes, the target becomes unavailable and the `up` metric becomes `0`. Prometheus stops receiving fresh application samples, and previously collected series eventually become stale. This may indicate application failure, DNS failure, network isolation, an incorrect target address, or a blocked port. + +### b) Effects of changing the scrape interval + +Changing the scrape interval from 15 seconds to 5 seconds triples the number of stored samples, network requests, disk writes, and query-processing work. It may also produce noisy short-window rates without adding useful operational information. + +Changing it to 5 minutes creates severe under-sampling. Short incidents may begin and end between scrapes, dashboards update slowly, and alerts may take several minutes to detect a problem. Rate calculations also become unreliable when a selected range contains too few samples. + +A 15-second interval gives a reasonable balance between detection speed, resolution, and resource cost for this lab. + +### c) `rate()` versus `irate()` versus `delta()` + +`rate()` is appropriate for the Traffic panel because `quicknotes_http_requests_total` is a monotonically increasing counter. It calculates the average per-second increase across the selected range and handles counter resets. + +`irate()` uses only the final two samples and is more sensitive to short spikes, making a dashboard visually unstable. It is useful for highly responsive troubleshooting but less suitable for a general operational traffic graph. + +`delta()` calculates the change in a gauge over a range. It is not the correct function for request counters and does not provide the same counter-reset handling as `rate()`. + +### d) Why provision Grafana from files? + +File-based provisioning makes the monitoring environment reproducible. 
A new stack automatically receives the same data source, dashboard, queries, titles, and panel layout. + +The files can be version-controlled, code-reviewed, tested, compared through Git diffs, and restored after failure. Manual UI configuration is difficult to reproduce and can drift between developers or environments. + +--- + +# Task 2 — High Error Rate Alert + +## Alert Rule + +The complete Prometheus rule is stored at: + +[monitoring/prometheus/alerts.yml](../monitoring/prometheus/alerts.yml) + +The rule: + +- Calculates the ratio of 4xx and 5xx responses to all responses. +- Fires when the error ratio exceeds `0.05`. +- Requires the condition to remain true for five minutes. +- Uses the label `severity: page`. +- Contains an annotation linking to the runbook. + +The principal rule structure is: + +```yaml +- alert: QuickNotesHighErrorRate + expr: | + ( + sum(rate(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[1m])) + / + clamp_min(sum(rate(quicknotes_http_requests_total[1m])), 0.001) + ) > 0.05 + for: 5m + labels: + severity: page +``` + +## Alert Trigger Procedure + +The executable script below generated one healthy request and one malformed request every second: + +[monitoring/scripts/generate-high-error-traffic.sh](../monitoring/scripts/generate-high-error-traffic.sh) + +The malformed requests produced a sustained error ratio greater than 5%. + +The alert correctly transitioned through: + +```text +Inactive → Pending → Firing +``` + +Pending-state evidence: + +![QuickNotes high-error-rate alert pending](screenshots/lab8/04-high-error-rate-pending.png) + +Firing-state evidence: + +![QuickNotes high-error-rate alert firing](screenshots/lab8/05-high-error-rate-firing.png) + +After the error-generating process was stopped, the alert returned to the inactive state. 
+ +## Runbook + +The repository runbook is available at: + +[docs/runbook/high-error-rate.md](../docs/runbook/high-error-rate.md) + +### What this alert means + +More than 5% of QuickNotes HTTP responses have been 4xx or 5xx responses continuously for at least five minutes, indicating sustained user-visible request failures. + +### Triage Steps + +1. Confirm the alert state, error ratio, start time, severity label, and five-minute duration in Prometheus. + +2. Verify the service state and health: + + docker compose ps + curl -i http://localhost:8080/health + docker compose logs --tail=100 quicknotes + +3. Inspect the response-code counters and determine whether failures are primarily 4xx or 5xx: + + curl -s http://localhost:8080/metrics | + grep quicknotes_http_responses_by_code_total + +4. Check for recent application, Compose, configuration, or traffic changes that coincide with the beginning of the alert. + +5. Determine whether a test or load-generation process is intentionally producing malformed requests and stop it when appropriate. + +### Mitigations + +1. Roll back the latest application or configuration change and redeploy the last known-good QuickNotes image. + +2. Block, rate-limit, or disable a malfunctioning client that is continuously sending invalid requests. + +3. Restart the QuickNotes service only when evidence indicates that the process is unhealthy or stuck: + + docker compose restart quicknotes + +4. Restore any unavailable dependency or persistent-data path identified during triage. + +### Post-Incident + +After mitigation: + +1. Confirm that `/health` returns HTTP 200. +2. Confirm the error ratio returns below 5%. +3. Confirm the alert returns to inactive. +4. Record the incident timeline, user impact, detection method, root cause, contributing factors, and mitigation. +5. Create a blameless postmortem with assigned action items, owners, and completion dates following the Lecture 1 postmortem structure. 
+ +## Task 2 Design Questions + +### e) Why require a five-minute sustained breach? + +A single malformed request or short traffic burst does not necessarily indicate an incident. Requiring five minutes filters temporary noise, client mistakes, brief deployments, and isolated failures. + +The delay reduces false pages and prevents alert flapping while still detecting a sustained user-visible problem. A paging alert should represent a condition that requires human action rather than every individual error. + +### f) Symptom alerts versus cause alerts + +A possible cause alert would be: + +```text +QuickNotes CPU usage is greater than 80%. +``` + +This is worse as a paging condition because high CPU may occur during legitimate work while users continue receiving successful responses. It may therefore wake an operator when there is no user impact. + +It can also miss incidents caused by network failure, bad requests, storage failure, configuration errors, or application bugs. The error-ratio alert directly measures the symptom users experience regardless of the underlying cause. + +### g) Quantitative alert-fatigue threshold + +The alert should be considered too noisy if more than 10% of its pages occur when users are not measurably affected. This corresponds to alert precision below 90%. + +At that point, the threshold, evaluation window, traffic filters, or severity should be reviewed. Repeated false pages train operators to ignore alerts and increase the risk that a genuine incident will be missed. + +--- + +# Bonus — External Synthetic Monitoring + +## Public Deployment + +A temporary Cloudflare Quick Tunnel exposed the QuickNotes health endpoint publicly. The tunnel and the real Lab 8 Compose stack ran on a GitHub-hosted Actions runner because the local network blocked tunnel registration. + +The implementation is stored at: + +[.github/workflows/lab8-external-monitor.yml](../.github/workflows/lab8-external-monitor.yml) + +The workflow: + +1. 
Checks out `feature/lab8`. +2. Generates runtime-only Grafana credentials. +3. Starts QuickNotes, Prometheus, and Grafana with Docker Compose. +4. Creates an accountless Cloudflare Quick Tunnel. +5. Waits for DNS propagation and HTTP 200 readiness. +6. Maintains the tunnel for 45 minutes. +7. Performs one external health request every minute. +8. Queries Prometheus for the final 30-minute P50, P95, request count, and error count. +9. Cleans up the Compose stack after completion. + +Initial successful GitHub Actions evidence: + +![Successful 45-minute external monitoring workflow](screenshots/lab8/10-github-actions-external-monitoring-success.png) + +## Checkly Configuration + +The Checkly API check was configured with: + +| Setting | Value | +|---|---| +| Check name | QuickNotes External Health | +| Method | GET | +| Path | `/health` | +| Frequency | Every 1 minute | +| Scheduling | Parallel runs | +| Locations | Frankfurt and Singapore | +| Status assertion | Status code equals 200 | +| Latency assertion | Response time less than 2000 ms | +| Alerting | Enabled | + +Initial assertion evidence: + +![Checkly status and response-time assertions](screenshots/lab8/06-checkly-assertions-success.png) + +Frequency evidence: + +![Checkly one-minute frequency](screenshots/lab8/07-checkly-frequency.png) + +Location evidence: + +![Checkly Frankfurt and Singapore locations](screenshots/lab8/08-checkly-locations.png) + +Same-window assertion evidence: + +![Checkly same-window assertions](screenshots/lab8/11-checkly-same-window-assertions.png) + +## Checkly Results + +The definitive comparison used the same 30-minute interval in Prometheus and Checkly: + +```text +2026-06-24 20:44:56 UTC to 2026-06-24 21:14:56 UTC +``` + +This corresponds to `23:44:56` on June 24 through `00:14:56` on June 25 in the local UTC+3 time zone.
+ +The selected Checkly window produced: + +- Availability: `100%` +- Failure alerts: `0` +- Retry ratio: `0%` +- Median latency, P50: `374 ms` +- P95 latency: `1.09 s` +- Successful parallel runs from Frankfurt and Singapore +- No assertion failures within the selected comparison window + +Initial 31-minute monitoring evidence: + +![Checkly initial 31-minute successful window](screenshots/lab8/09-checkly-results-30-minutes.png) + +Definitive same-window evidence: + +![Checkly same-window 30-minute results](screenshots/lab8/13-checkly-same-window-30-minutes.png) + +## Internal versus External Comparison + +Prometheus and Checkly were compared over the same final 30-minute monitoring window. Prometheus measured QuickNotes handler execution inside the Compose environment, while Checkly measured the complete public path from Frankfurt and Singapore. + +| Measurement | Prometheus inside Compose | Checkly external monitoring | +|---|---:|---:| +| Window start | 2026-06-24 20:44:56 UTC | 2026-06-24 20:44:56 UTC | +| Window end | 2026-06-24 21:14:56 UTC | 2026-06-24 21:14:56 UTC | +| Request latency P50 | 0.0252 ms | 374 ms | +| Request latency P95 | 0.0478 ms | 1.09 s | +| HTTP 4xx/5xx errors | 0 | 0 | +| Requests observed | Approximately 565 | Parallel checks every minute | +| Availability | Prometheus target `up = 1` | 100% | + +The workflow reported `564.71` requests, which the table rounds to 565. The value is fractional because Prometheus `increase()` extrapolates the counter samples to the exact time-range boundaries instead of counting individual requests. + +The external values are higher because Checkly includes DNS resolution, TCP and TLS establishment, Cloudflare Tunnel processing, internet routing, and regional network latency. Prometheus measures only the application handler execution inside the Compose environment. + +The histogram excludes `/metrics` requests so that Prometheus scraping does not distort the application-request latency distribution.
+ +Prometheus same-window evidence: + +![GitHub Actions Prometheus comparison](screenshots/lab8/12-github-actions-same-window-prometheus.png) + +## Failure-Mode Analysis + +Checkly can detect failures outside the Compose network, including DNS failure, TLS problems, Cloudflare Tunnel failure, internet routing problems, regional connectivity problems, and a public endpoint that is unreachable even though the containers are internally healthy. + +Prometheus can detect detailed internal application conditions that a simple external `/health` check cannot see, including increasing error counters, stored-note saturation, scrape failure, unusual traffic rates, and problems affecting endpoints other than `/health`. + +Checkly measures the experience of an external client across the complete network path. Prometheus provides higher-resolution service telemetry and explains what is happening inside the application environment. + +Using both provides stronger coverage than either system alone. + +--- + +# Final Verification Summary + +| Requirement | Result | +|---|---| +| Prometheus target `up == 1` | Passed | +| Prometheus scrape interval is 15 seconds | Passed | +| Grafana data source provisioned | Passed | +| Grafana dashboard provisioned | Passed | +| Four Golden Signals panels present | Passed | +| Real P50 and P95 latency panel present | Passed | +| Dashboard contains non-trivial data | Passed | +| Error-ratio rule fires above the 5% threshold | Passed | +| Five-minute sustained gate | Passed | +| `severity: page` label | Passed | +| Runbook annotation | Passed | +| Alert observed pending | Passed | +| Alert observed firing | Passed | +| Runbook contains all required sections | Passed | +| Checkly frequency is one minute | Passed | +| Two external regions | Passed | +| Status assertion equals 200 | Passed | +| Response-time assertion below two seconds | Passed | +| Monitoring duration at least 30 minutes | Passed | +| Prometheus and Checkly used the same comparison window | 
Passed | +| Real Prometheus P50 and P95 values recorded | Passed | +| Checkly P50 and P95 recorded | Passed | +| All commits signed | Passed | +| Secrets excluded from Git | Passed | diff --git a/submissions/screenshots/lab8/01-prometheus-target-up.png b/submissions/screenshots/lab8/01-prometheus-target-up.png new file mode 100644 index 000000000..370f2ed50 Binary files /dev/null and b/submissions/screenshots/lab8/01-prometheus-target-up.png differ diff --git a/submissions/screenshots/lab8/02-grafana-golden-signals-dashboard.png b/submissions/screenshots/lab8/02-grafana-golden-signals-dashboard.png new file mode 100644 index 000000000..1709e114e Binary files /dev/null and b/submissions/screenshots/lab8/02-grafana-golden-signals-dashboard.png differ diff --git a/submissions/screenshots/lab8/04-high-error-rate-pending.png b/submissions/screenshots/lab8/04-high-error-rate-pending.png new file mode 100644 index 000000000..94bf3a5f8 Binary files /dev/null and b/submissions/screenshots/lab8/04-high-error-rate-pending.png differ diff --git a/submissions/screenshots/lab8/05-high-error-rate-firing.png b/submissions/screenshots/lab8/05-high-error-rate-firing.png new file mode 100644 index 000000000..58426c819 Binary files /dev/null and b/submissions/screenshots/lab8/05-high-error-rate-firing.png differ diff --git a/submissions/screenshots/lab8/06-checkly-assertions-success.png b/submissions/screenshots/lab8/06-checkly-assertions-success.png new file mode 100644 index 000000000..1c12d9feb Binary files /dev/null and b/submissions/screenshots/lab8/06-checkly-assertions-success.png differ diff --git a/submissions/screenshots/lab8/07-checkly-frequency.png b/submissions/screenshots/lab8/07-checkly-frequency.png new file mode 100644 index 000000000..84db84527 Binary files /dev/null and b/submissions/screenshots/lab8/07-checkly-frequency.png differ diff --git a/submissions/screenshots/lab8/08-checkly-locations.png b/submissions/screenshots/lab8/08-checkly-locations.png new file mode 
100644 index 000000000..28fd39ab9 Binary files /dev/null and b/submissions/screenshots/lab8/08-checkly-locations.png differ diff --git a/submissions/screenshots/lab8/09-checkly-results-30-minutes.png b/submissions/screenshots/lab8/09-checkly-results-30-minutes.png new file mode 100644 index 000000000..bcf2b9bb3 Binary files /dev/null and b/submissions/screenshots/lab8/09-checkly-results-30-minutes.png differ diff --git a/submissions/screenshots/lab8/10-github-actions-external-monitoring-success.png b/submissions/screenshots/lab8/10-github-actions-external-monitoring-success.png new file mode 100644 index 000000000..8c8b0ca75 Binary files /dev/null and b/submissions/screenshots/lab8/10-github-actions-external-monitoring-success.png differ diff --git a/submissions/screenshots/lab8/11-checkly-same-window-assertions.png b/submissions/screenshots/lab8/11-checkly-same-window-assertions.png new file mode 100644 index 000000000..f506318b2 Binary files /dev/null and b/submissions/screenshots/lab8/11-checkly-same-window-assertions.png differ diff --git a/submissions/screenshots/lab8/12-github-actions-same-window-prometheus.png b/submissions/screenshots/lab8/12-github-actions-same-window-prometheus.png new file mode 100644 index 000000000..aae829b2c Binary files /dev/null and b/submissions/screenshots/lab8/12-github-actions-same-window-prometheus.png differ diff --git a/submissions/screenshots/lab8/13-checkly-same-window-30-minutes.png b/submissions/screenshots/lab8/13-checkly-same-window-30-minutes.png new file mode 100644 index 000000000..1594a164e Binary files /dev/null and b/submissions/screenshots/lab8/13-checkly-same-window-30-minutes.png differ