From 13eef0e61be99c96cdfecc2fb49ba72f9a172816 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jul 2026 10:28:02 -0400 Subject: [PATCH 1/7] feat(bench): add hot-path benchmark gate for recurring sync perf regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit agentsview has repeatedly shipped the same performance regression shape: sync work that should scale with new data silently starts scaling with archive size, and nothing in CI notices until a user's machine spends minutes in a sync pass (#912 discovery recomputing root project info per source, the provider-migration loss of pre-parse freshness skips, #954's O(history) work per streamed append, #309's per-row json_extract scan). The fixes landed, but each class had no gate, so the next refactor could reintroduce it. This adds a two-layer gate keyed to those historical classes: - Deterministic work-count invariants that run in the normal test suite and are immune to runner noise: a warm full sync over an unchanged Claude archive must skip every session and run zero bulk-write batches (asserted via SyncStats + PhaseStats). This complements the existing Vibe freshness test, which covers the generic providerSourceUnchangedInDB path but not Claude's own pre-parse skip. - Hot-path benchmarks (warm no-op full sync, incremental append into a 1k-message session, cold-archive ingest, streaming chunk-merge replace, batched message insert) plus the existing GetDailyUsage and secrets benchmarks, compared on every PR against the merge base by a new bench.yml workflow. cmd/benchgate gates hard on allocs/op and B/op — deterministic on a fixed runner, and an O(archive)-vs-O(delta) regression always inflates them — while ns/op gets a loose 2x limit so runner noise cannot flake a PR but algorithmic blowups still fail. Benchmarks present on only one side are reported, never gated, so the first PR (empty baseline) and future benchmark additions/removals cannot wedge the gate. Both sides run in one job on one runner to keep the comparison apples to apples; a broken merge base degrades to an empty baseline instead of blocking. The warm-noop and append benchmarks self-assert the invariant they protect (Synced==0, appends absorbed), so a refactor that silently changes what the benchmark measures fails loudly instead of producing a meaningless number. docs/internal/performance-gates.md records the regression history, which gate covers which class, and how to add a benchmark to the pattern (kept in sync between bench.yml and the Makefile bench-gate target). Cross-backend query benchmarks stay in internal/backendbench: they need Docker/testcontainers and answer a different question (backend parity), so they are not part of the per-PR gate. --- .github/workflows/bench.yml | 83 +++++++++ Makefile | 18 +- cmd/benchgate/main.go | 253 +++++++++++++++++++++++++++ cmd/benchgate/main_test.go | 198 +++++++++++++++++++++ docs/internal/performance-gates.md | 89 ++++++++++ internal/db/messages_bench_test.go | 148 ++++++++++++++++ internal/sync/engine_bench_test.go | 247 ++++++++++++++++++++++++++ internal/sync/perf_invariant_test.go | 66 +++++++ 8 files changed, 1101 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/bench.yml create mode 100644 cmd/benchgate/main.go create mode 100644 cmd/benchgate/main_test.go create mode 100644 docs/internal/performance-gates.md create mode 100644 internal/db/messages_bench_test.go create mode 100644 internal/sync/engine_bench_test.go create mode 100644 internal/sync/perf_invariant_test.go diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 000000000..4322236bb --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,83 @@ +name: Bench Gate + +# Compares hot-path benchmarks between the PR head and its merge +# base, so performance regressions in the sync engine and DB write +# paths fail the PR instead of shipping. The gated regression classes +# have all shipped before: +# +# - discovery/skip work scaling with archive size instead of new +# data (#912, the providerSourceUnchangedInDB gap) +# - O(session history) work per incremental append (#954) +# - bulk ingest throughput (#411) +# - per-row query-shape regressions in usage aggregation (#309) +# +# cmd/benchgate gates on allocs/op and B/op (deterministic on a given +# machine, tight thresholds) and on ns/op with a loose 2x threshold +# that only catches algorithmic blowups; both sides run on the same +# runner within one job, so the comparison is apples to apples. +# Benchmarks that only exist on one side are reported but never fail +# the gate. Keep BENCH_PATTERN in sync with the Makefile's +# bench-gate target. + +on: + pull_request: + +concurrency: + group: bench-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +env: + BENCH_PACKAGES: ./internal/sync ./internal/db ./internal/secrets + BENCH_PATTERN: ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$ + BENCH_COUNT: "6" + BENCH_TIME: 500ms + +jobs: + bench-gate: + name: Benchmark Gate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + persist-credentials: false + fetch-depth: 0 + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Restore pricing snapshot + run: go run ./internal/pricing/cmd/litellm-snapshot -restore + + - name: Run benchmarks (PR head) + env: + CGO_ENABLED: "1" + run: | + set -o pipefail + go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ + -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ + -timeout 25m $BENCH_PACKAGES | tee /tmp/bench-new.txt + + - name: Run benchmarks (merge base) + env: + CGO_ENABLED: "1" + BASE_REF: ${{ github.base_ref }} + # A failing merge-base run (or one with none of the gated + # benchmarks yet) degrades to an empty baseline: benchgate + # then reports every benchmark as new and passes, so a broken + # base never blocks a PR. + run: | + base=$(git merge-base HEAD "origin/$BASE_REF") + git worktree add /tmp/bench-base "$base" + cd /tmp/bench-base + go run ./internal/pricing/cmd/litellm-snapshot -restore || true + go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ + -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ + -timeout 25m $BENCH_PACKAGES > /tmp/bench-old.txt \ + || echo "merge-base benchmarks failed; comparing against empty baseline" + + - name: Compare against merge base + run: go run ./cmd/benchgate -old /tmp/bench-old.txt -new /tmp/bench-new.txt diff --git a/Makefile b/Makefile index 2d827c6de..3a664faf0 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ AIR_BIN := $(shell if command -v air >/dev/null 2>&1; then command -v air; \ elif [ -x "$(GOPATH_FIRST)/bin/air" ]; then printf "%s" "$(GOPATH_FIRST)/bin/air"; \ fi) -.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help +.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends bench-gate test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help # Ensure go:embed has at least one file (no-op if frontend is built) ensure-embed-dir: @@ -266,6 +266,21 @@ bench-backends: pricing-snapshot ensure-embed-dir AGENTSVIEW_BENCH_MESSAGES_PER_SESSION=$(BENCH_BACKENDS_MESSAGES_PER_SESSION) \ CGO_ENABLED=1 go test -tags "fts5,benchdb" ./internal/backendbench $(BENCH_BACKENDS_FLAGS) +# Hot-path benchmark gate. Runs the same benchmark set CI's bench.yml +# compares against a PR's merge base (sync engine warm/cold/append, +# message write paths, usage aggregation, secret scanning). Run it +# before and after touching a sync or DB hot path, then compare with +# `go run ./cmd/benchgate -old old.txt -new new.txt`. Keep the +# pattern in sync with .github/workflows/bench.yml. +BENCH_GATE_PATTERN ?= ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$$ +BENCH_GATE_COUNT ?= 6 +BENCH_GATE_TIME ?= 500ms +bench-gate: pricing-snapshot ensure-embed-dir + CGO_ENABLED=1 go test -tags "fts5" -run '^$$' \ + -bench '$(BENCH_GATE_PATTERN)' -benchmem \ + -count $(BENCH_GATE_COUNT) -benchtime $(BENCH_GATE_TIME) \ + -timeout 25m ./internal/sync ./internal/db ./internal/secrets + # Start test PostgreSQL container postgres-up: docker compose -f docker-compose.test.yml up -d --wait @@ -484,6 +499,7 @@ help: @echo " test - Run all tests" @echo " test-short - Run fast tests only" @echo " bench-backends - Benchmark SQLite, DuckDB, and PostgreSQL stores" + @echo " bench-gate - Run the hot-path benchmarks CI gates PRs on" @echo " test-postgres - Run PostgreSQL integration tests" @echo " test-s3 - Run S3 discovery integration tests (Docker)" @echo " postgres-up - Start test PostgreSQL container" diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go new file mode 100644 index 000000000..04030068d --- /dev/null +++ b/cmd/benchgate/main.go @@ -0,0 +1,253 @@ +// Command benchgate compares two `go test -bench` outputs (baseline +// vs candidate) and exits non-zero when a benchmark regresses beyond +// configured thresholds. +// +// It is the comparison step of the bench-gate CI workflow: allocs/op +// and B/op are deterministic for the same code on the same machine, +// so they get tight thresholds that catch O(archive)-instead-of- +// O(delta) work regressions; ns/op is noisy on shared runners, so it +// gets a loose threshold that only catches algorithmic blowups. +// Small baselines below the per-metric floors are skipped entirely, +// since a few extra allocations on a tiny benchmark is noise, not a +// regression. +// +// Multiple runs of the same benchmark (-count=N) are aggregated by +// taking the minimum per metric, the standard way to strip scheduler +// and GC noise. Benchmarks present on only one side are reported but +// never fail the gate, so adding or removing benchmarks in a PR does +// not wedge it. +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "os" + "sort" + "strconv" + "strings" +) + +// benchResult maps a metric unit (ns/op, B/op, allocs/op, or any +// custom -benchmem/ReportMetric unit) to the best (lowest) value +// observed across runs. +type benchResult map[string]float64 + +// gate is one metric's regression rule: fail when +// candidate/baseline exceeds maxRatio, unless the baseline is below +// floor (too small to compare meaningfully). +type gate struct { + unit string + maxRatio float64 + floor float64 +} + +// violation describes one gate failure. +type violation struct { + name string + unit string + old, new float64 + ratio float64 + maxRatio float64 +} + +func (v violation) String() string { + return fmt.Sprintf( + "%s: %s regressed %.2fx (%.0f -> %.0f, limit %.2fx)", + v.name, v.unit, v.ratio, v.old, v.new, v.maxRatio, + ) +} + +// parseBench extracts benchmark results from `go test -bench` +// output. Lines that do not look like benchmark result lines +// (package headers, logs, PASS/ok trailers) are ignored. Repeated +// names keep the minimum value per metric. +func parseBench(r io.Reader) (map[string]benchResult, error) { + out := make(map[string]benchResult) + scanner := bufio.NewScanner(r) + scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) + for scanner.Scan() { + fields := strings.Fields(scanner.Text()) + if len(fields) < 4 || + !strings.HasPrefix(fields[0], "Benchmark") { + continue + } + // The second column is the iteration count; anything else + // (e.g. a log line that happens to start with "Benchmark") + // is not a result line. + if _, err := strconv.Atoi(fields[1]); err != nil { + continue + } + name := fields[0] + res := out[name] + if res == nil { + res = make(benchResult) + out[name] = res + } + for i := 2; i+1 < len(fields); i += 2 { + value, err := strconv.ParseFloat(fields[i], 64) + if err != nil { + break + } + unit := fields[i+1] + if prev, ok := res[unit]; !ok || value < prev { + res[unit] = value + } + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + return out, nil +} + +// compare applies the gates to every benchmark present in both maps +// and returns the violations plus a human-readable report. +func compare( + oldRes, newRes map[string]benchResult, gates []gate, +) (report []string, violations []violation) { + names := make([]string, 0, len(newRes)) + for name := range newRes { + names = append(names, name) + } + sort.Strings(names) + + for _, name := range names { + oldBench, ok := oldRes[name] + if !ok { + report = append(report, fmt.Sprintf( + "%s: new benchmark, no baseline to compare", name, + )) + continue + } + var parts []string + for _, g := range gates { + oldV, okOld := oldBench[g.unit] + newV, okNew := newRes[name][g.unit] + if !okOld || !okNew { + continue + } + if oldV < g.floor { + parts = append(parts, fmt.Sprintf( + "%s %.0f -> %.0f (below %.0f floor, not gated)", + g.unit, oldV, newV, g.floor, + )) + continue + } + ratio := newV / oldV + parts = append(parts, fmt.Sprintf( + "%s %.0f -> %.0f (%.2fx, limit %.2fx)", + g.unit, oldV, newV, ratio, g.maxRatio, + )) + if ratio > g.maxRatio { + violations = append(violations, violation{ + name: name, unit: g.unit, + old: oldV, new: newV, + ratio: ratio, maxRatio: g.maxRatio, + }) + } + } + report = append(report, fmt.Sprintf( + "%s: %s", name, strings.Join(parts, ", "), + )) + } + + var removed []string + for name := range oldRes { + if _, ok := newRes[name]; !ok { + removed = append(removed, name) + } + } + sort.Strings(removed) + for _, name := range removed { + report = append(report, fmt.Sprintf( + "%s: present in baseline but missing from candidate", + name, + )) + } + return report, violations +} + +func parseFile(path string) (map[string]benchResult, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + return parseBench(f) +} + +func main() { + oldPath := flag.String( + "old", "", "baseline `go test -bench` output file", + ) + newPath := flag.String( + "new", "", "candidate `go test -bench` output file", + ) + maxTimeRatio := flag.Float64( + "max-time-ratio", 2.0, + "fail when ns/op exceeds baseline by this factor", + ) + maxAllocRatio := flag.Float64( + "max-alloc-ratio", 1.25, + "fail when allocs/op exceeds baseline by this factor", + ) + maxBytesRatio := flag.Float64( + "max-bytes-ratio", 1.35, + "fail when B/op exceeds baseline by this factor", + ) + timeFloorNs := flag.Float64( + "time-floor-ns", 100_000, + "skip the ns/op gate when the baseline is below this", + ) + allocFloor := flag.Float64( + "alloc-floor", 64, + "skip the allocs/op gate when the baseline is below this", + ) + bytesFloor := flag.Float64( + "bytes-floor", 16_384, + "skip the B/op gate when the baseline is below this", + ) + flag.Parse() + + if *oldPath == "" || *newPath == "" { + fmt.Fprintln(os.Stderr, "benchgate: -old and -new are required") + flag.Usage() + os.Exit(2) + } + + oldRes, err := parseFile(*oldPath) + if err != nil { + fmt.Fprintf(os.Stderr, "benchgate: reading baseline: %v\n", err) + os.Exit(2) + } + newRes, err := parseFile(*newPath) + if err != nil { + fmt.Fprintf(os.Stderr, "benchgate: reading candidate: %v\n", err) + os.Exit(2) + } + + gates := []gate{ + {unit: "allocs/op", maxRatio: *maxAllocRatio, floor: *allocFloor}, + {unit: "B/op", maxRatio: *maxBytesRatio, floor: *bytesFloor}, + {unit: "ns/op", maxRatio: *maxTimeRatio, floor: *timeFloorNs}, + } + report, violations := compare(oldRes, newRes, gates) + for _, line := range report { + fmt.Println(line) + } + if len(newRes) == 0 { + fmt.Println("benchgate: candidate output contains no benchmarks") + os.Exit(2) + } + if len(violations) > 0 { + fmt.Println() + fmt.Printf("benchgate: %d regression(s):\n", len(violations)) + for _, v := range violations { + fmt.Printf(" %s\n", v) + } + os.Exit(1) + } + fmt.Println("benchgate: no regressions beyond thresholds") +} diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go new file mode 100644 index 000000000..a979ecf98 --- /dev/null +++ b/cmd/benchgate/main_test.go @@ -0,0 +1,198 @@ +package main + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseBench(t *testing.T) { + tests := []struct { + name string + input string + want map[string]benchResult + }{ + { + name: "full benchmem line", + input: "goos: linux\n" + + "BenchmarkFoo-8 \t 100\t 1234567 ns/op\t 2345 B/op\t 67 allocs/op\n" + + "PASS\nok \tpkg\t1.2s\n", + want: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 1234567, "B/op": 2345, "allocs/op": 67, + }, + }, + }, + { + name: "multiple counts keep the minimum per metric", + input: "BenchmarkFoo-8 100 200 ns/op 50 B/op 9 allocs/op\n" + + "BenchmarkFoo-8 100 150 ns/op 60 B/op 8 allocs/op\n" + + "BenchmarkFoo-8 100 180 ns/op 40 B/op 10 allocs/op\n", + want: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 150, "B/op": 40, "allocs/op": 8, + }, + }, + }, + { + name: "time-only line without -benchmem", + input: "BenchmarkBar-4 500 0.5 ns/op\n", + want: map[string]benchResult{ + "BenchmarkBar-4": {"ns/op": 0.5}, + }, + }, + { + name: "log lines and headers are ignored", + input: "2026/07/03 10:20:36 discovered 40 files in 0s\n" + + "BenchmarkSync says hello\n" + + "cpu: Apple M5 Max\n", + want: map[string]benchResult{}, + }, + { + name: "custom ReportMetric units are kept", + input: "BenchmarkBaz-2 10 900 ns/op 3 sessions/op\n", + want: map[string]benchResult{ + "BenchmarkBaz-2": {"ns/op": 900, "sessions/op": 3}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseBench(strings.NewReader(tt.input)) + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func testGates() []gate { + return []gate{ + {unit: "allocs/op", maxRatio: 1.25, floor: 64}, + {unit: "B/op", maxRatio: 1.35, floor: 16_384}, + {unit: "ns/op", maxRatio: 2.0, floor: 100_000}, + } +} + +func TestCompare(t *testing.T) { + tests := []struct { + name string + old, new map[string]benchResult + wantUnits []string // units of expected violations, in order + wantReport []string // substrings that must appear in the report + }{ + { + name: "within thresholds passes", + old: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 1_000_000, "B/op": 100_000, "allocs/op": 1000, + }, + }, + new: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 1_500_000, "B/op": 120_000, "allocs/op": 1100, + }, + }, + wantUnits: nil, + }, + { + name: "alloc regression fails", + old: map[string]benchResult{ + "BenchmarkFoo-8": {"ns/op": 1_000_000, "allocs/op": 1000}, + }, + new: map[string]benchResult{ + "BenchmarkFoo-8": {"ns/op": 1_000_000, "allocs/op": 2000}, + }, + wantUnits: []string{"allocs/op"}, + }, + { + name: "time blowup fails", + old: map[string]benchResult{ + "BenchmarkFoo-8": {"ns/op": 1_000_000}, + }, + new: map[string]benchResult{ + "BenchmarkFoo-8": {"ns/op": 5_000_000}, + }, + wantUnits: []string{"ns/op"}, + }, + { + name: "tiny baseline below floor is not gated", + old: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 500, "B/op": 128, "allocs/op": 3, + }, + }, + new: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 5000, "B/op": 1280, "allocs/op": 30, + }, + }, + wantUnits: nil, + wantReport: []string{"not gated"}, + }, + { + name: "new benchmark without baseline is reported, not gated", + old: map[string]benchResult{}, + new: map[string]benchResult{ + "BenchmarkNew-8": {"ns/op": 1_000_000, "allocs/op": 99999}, + }, + wantUnits: nil, + wantReport: []string{"no baseline to compare"}, + }, + { + name: "removed benchmark is reported, not gated", + old: map[string]benchResult{ + "BenchmarkGone-8": {"ns/op": 1_000_000}, + }, + new: map[string]benchResult{}, + wantUnits: nil, + wantReport: []string{"missing from candidate"}, + }, + { + name: "multiple regressions are all reported", + old: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 1_000_000, "B/op": 100_000, "allocs/op": 1000, + }, + }, + new: map[string]benchResult{ + "BenchmarkFoo-8": { + "ns/op": 9_000_000, "B/op": 900_000, "allocs/op": 9000, + }, + }, + wantUnits: []string{"allocs/op", "B/op", "ns/op"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + report, violations := compare(tt.old, tt.new, testGates()) + units := make([]string, 0, len(violations)) + for _, v := range violations { + units = append(units, v.unit) + } + if len(tt.wantUnits) == 0 { + assert.Empty(t, violations) + } else { + assert.Equal(t, tt.wantUnits, units) + } + joined := strings.Join(report, "\n") + for _, want := range tt.wantReport { + assert.Contains(t, joined, want) + } + }) + } +} + +func TestViolationString(t *testing.T) { + v := violation{ + name: "BenchmarkFoo-8", unit: "allocs/op", + old: 1000, new: 2000, ratio: 2.0, maxRatio: 1.25, + } + assert.Equal( + t, + "BenchmarkFoo-8: allocs/op regressed 2.00x "+ + "(1000 -> 2000, limit 1.25x)", + v.String(), + ) +} diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md new file mode 100644 index 000000000..e7eeb2d34 --- /dev/null +++ b/docs/internal/performance-gates.md @@ -0,0 +1,89 @@ +# Performance Gates + +agentsview has repeatedly shipped performance regressions where sync work +stopped scaling with *new data* and started scaling with *archive size*. This +document records the regression classes we have actually hit, and the gates that +now guard each one. When you touch a sync or DB hot path, know which gate covers +you; when you fix a new class of regression, add a gate here. + +## Regression history + +| Class | What happened | Fixed in | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| Discovery O(sources) root work | Gemini rebuilt its project map per session; positron/vscode-copilot re-read `workspace.json` per session. A large store spent 2m47s in discovery. | #912 | +| Unchanged sources reparsed | The provider migration dropped pre-parse DB-freshness skips; every full sync reparsed and rewrote untouched sessions. | `providerSourceUnchangedInDB` (#883 follow-up) | +| O(history) incremental appends | Every streamed line ran a full signal recompute (reload all messages, secret regex scan) and chunk merges delete+reinserted every message row. ~4,700 session updates/day each paid O(session history). | #954 | +| Bulk ingest throughput | Full resync ran per-row inserts and rebuilt FTS incrementally; 26.7k sessions took 1m17s. | #411 | +| Event storms | One SSE emit per watcher flush drove ~1/s dashboard refetch; SQLite WAL sidecar events fanned out to every session in a shared DB. | #367, #956 | +| Per-row query shape | `GetDailyUsage` ran 1.2M `json_extract` calls per scan and had no date pushdown. | #309 | + +## Two layers of gates + +### 1. Deterministic work-count invariants (run in `make test`) + +These count work units instead of measuring time, so they are immune to CI +runner noise and fail loudly: + +- `TestWarmFullSyncDoesNoBulkWriteWork` (`internal/sync/perf_invariant_test.go`) + — a second full sync over an unchanged Claude archive must skip everything + and run zero bulk-write batches (`Engine.PhaseStats`). +- `TestProviderAuthoritativeUnchangedSessionSkipsOnResync` + (`internal/sync/provider_freshness_integration_test.go`) — the generic + freshness skip for provider-authoritative agents, Vibe as representative. +- `TestWriteIncrementalDebouncesSignalRecompute` and the rest of + `internal/sync/signal_schedule_test.go` — streaming appends must debounce + the O(history) signal recompute. +- The count-based seam tests in `internal/parser` + (`discovery_workspace_manifest_test.go`, gemini/antigravity provider tests) + — root-derived project info is built once per root, not once per source. +- `internal/server/broadcaster_test.go` — SSE emits coalesce to at most one + broadcast per window. + +When you fix a performance bug, prefer adding a gate at this layer: expose or +reuse a counter (`SyncStats`, `PhaseStats`, `AnomalyStats`, a swappable +package-var seam) and assert the invariant, e.g. "second sync parses zero +sessions" or "the manifest is read once per root regardless of session count". + +### 2. Benchmark gate (runs on every PR via `bench.yml`) + +`.github/workflows/bench.yml` runs a focused benchmark set on the PR head and +its merge base on the same runner, then compares with `cmd/benchgate`: + +- `BenchmarkSyncAllWarmNoop` — full sync over an already-synced archive (stat + + skip work only; also self-asserts nothing is reparsed). +- `BenchmarkSyncPathsIncrementalAppend` — absorb one appended line into a + 1,000-message session. +- `BenchmarkSyncAllColdArchive` — first-sync ingest throughput. +- `BenchmarkReplaceSessionMessagesStreamingMerge` — the streaming chunk-merge + diff path (one UPDATE, not a full delete+reinsert). +- `BenchmarkInsertMessagesBatch` — multi-row batched ingest. +- `BenchmarkGetDailyUsage` — usage aggregation over 100k message rows. +- `BenchmarkScan` / `BenchmarkScanDefinite` — secret-scan regex throughput. + +`benchgate` gates hard on `allocs/op` (limit 1.25x) and `B/op` (1.35x), which +are deterministic for the same code on the same machine — an +O(archive)-instead-of-O(delta) regression always blows them up. `ns/op` gets a +loose 2.0x limit that catches algorithmic blowups while tolerating runner noise +(each metric uses the minimum across `-count` runs). Baselines below a +per-metric floor are not gated. Benchmarks that exist on only one side are +reported but never fail, so adding or removing benchmarks cannot wedge a PR. + +Run locally: + +```bash +make bench-gate # current tree +git stash && make bench-gate > old.txt # baseline, however you produce it +go run ./cmd/benchgate -old old.txt -new new.txt +``` + +Cross-backend query benchmarks live separately in `internal/backendbench` +(`make bench-backends`, requires Docker) and are not part of the PR gate. + +## Adding a benchmark to the gate + +1. Write the benchmark next to the code it guards (`*_bench_test.go`, + `b.ReportAllocs()`, self-assert the invariant it protects where possible). +1. Add its name to `BENCH_PATTERN` in `.github/workflows/bench.yml` **and** + `BENCH_GATE_PATTERN` in the Makefile. +1. Keep per-op cost roughly in the 100µs–100ms band: below the benchgate floors + nothing is gated, and far above it the job gets slow. diff --git a/internal/db/messages_bench_test.go b/internal/db/messages_bench_test.go new file mode 100644 index 000000000..3db4bc525 --- /dev/null +++ b/internal/db/messages_bench_test.go @@ -0,0 +1,148 @@ +package db + +import ( + "encoding/json" + "fmt" + "path/filepath" + "testing" +) + +// Hot-path benchmarks for the message write and usage aggregation +// paths that have regressed in the past. CI's bench-gate workflow +// runs them on every PR and compares allocs/op, B/op, and ns/op +// against the merge base, so a reintroduced O(session-history) +// rewrite or per-row JSON parse fails the PR instead of shipping: +// +// - BenchmarkReplaceSessionMessagesStreamingMerge: a one-row tail +// change must take the in-place diff path (one UPDATE) rather +// than delete+reinserting every row and rewriting the FTS index +// (regressed pre-#954: streaming chunk merges rewrote whole +// sessions on every appended chunk). +// - BenchmarkInsertMessagesBatch: bulk ingest must keep multi-row +// batched inserts (#411). +// +// BenchmarkGetDailyUsage in usage_test.go covers the usage +// aggregation scan (#309) and is part of the same CI gate. + +func openBenchDB(b *testing.B) *DB { + b.Helper() + d, err := Open(filepath.Join(b.TempDir(), "bench.db")) + if err != nil { + b.Fatalf("open bench db: %v", err) + } + b.Cleanup(func() { + if err := d.Close(); err != nil { + b.Errorf("close bench db: %v", err) + } + }) + return d +} + +// benchSessionMessages builds n alternating user/assistant messages. +// Assistant messages carry a model and token_usage payload so writes +// exercise the same columns real ingest does. +func benchSessionMessages(sessionID string, n int) []Message { + msgs := make([]Message, 0, n) + for i := range n { + role := "user" + content := fmt.Sprintf( + "user message %d with enough text to look real", i, + ) + if i%2 == 1 { + role = "assistant" + content = fmt.Sprintf( + "assistant reply %d with enough text to look real", i, + ) + } + m := Message{ + SessionID: sessionID, + Ordinal: i, + Role: role, + Content: content, + Timestamp: fmt.Sprintf("2026-06-%02dT10:00:00Z", 1+i%28), + ContentLength: len(content), + } + if role == "assistant" { + m.Model = "claude-bench-model" + m.TokenUsage = json.RawMessage( + `{"input_tokens":120,"output_tokens":45,` + + `"cache_creation_input_tokens":10,` + + `"cache_read_input_tokens":200}`, + ) + } + msgs = append(msgs, m) + } + return msgs +} + +func seedBenchSession( + b *testing.B, d *DB, sessionID string, n int, +) []Message { + b.Helper() + if err := d.UpsertSession(Session{ + ID: sessionID, + Project: "bench", + Machine: "local", + Agent: "claude", + }); err != nil { + b.Fatalf("seed session %s: %v", sessionID, err) + } + msgs := benchSessionMessages(sessionID, n) + if err := d.InsertMessages(msgs); err != nil { + b.Fatalf("seed messages for %s: %v", sessionID, err) + } + return msgs +} + +// BenchmarkReplaceSessionMessagesStreamingMerge measures the +// streaming chunk-merge shape: replacing a stored session where only +// the tail message's content changed. The diff planner must apply a +// single in-place UPDATE; cost must not scale with the number of +// unchanged stored rows being rewritten. +func BenchmarkReplaceSessionMessagesStreamingMerge(b *testing.B) { + const stored = 1000 + d := openBenchDB(b) + msgs := seedBenchSession(b, d, "bench-replace", stored) + last := len(msgs) - 1 + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + content := fmt.Sprintf( + "assistant reply %d merged streaming tail variant %d", + last, i, + ) + msgs[last].Content = content + msgs[last].ContentLength = len(content) + if err := d.ReplaceSessionMessages("bench-replace", msgs); err != nil { + b.Fatalf("replace: %v", err) + } + } +} + +// BenchmarkInsertMessagesBatch measures bulk session ingest: one +// session row plus a batch insert of its messages, the unit of work +// the full-sync write pipeline performs per session. +func BenchmarkInsertMessagesBatch(b *testing.B) { + const batch = 200 + d := openBenchDB(b) + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + sid := fmt.Sprintf("bench-insert-%06d", i) + if err := d.UpsertSession(Session{ + ID: sid, + Project: "bench", + Machine: "local", + Agent: "claude", + }); err != nil { + b.Fatalf("upsert session: %v", err) + } + if err := d.InsertMessages( + benchSessionMessages(sid, batch), + ); err != nil { + b.Fatalf("insert messages: %v", err) + } + } +} diff --git a/internal/sync/engine_bench_test.go b/internal/sync/engine_bench_test.go new file mode 100644 index 000000000..bc8a628ed --- /dev/null +++ b/internal/sync/engine_bench_test.go @@ -0,0 +1,247 @@ +package sync + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + + "go.kenn.io/agentsview/internal/db" + "go.kenn.io/agentsview/internal/parser" + "go.kenn.io/agentsview/internal/testjsonl" +) + +// Hot-path benchmarks for the sync engine, covering the regression +// classes that have shipped before. CI's bench-gate workflow runs +// them on every PR and compares allocs/op, B/op, and ns/op against +// the merge base: +// +// - BenchmarkSyncAllWarmNoop: a full sync over an already-synced, +// unchanged archive must do stat+skip work only. Regressed when +// the provider migration dropped pre-parse DB-freshness skips +// and every full sync reparsed and rewrote unchanged sessions +// (fixed by providerSourceUnchangedInDB), and when discovery +// recomputed root-derived project info per source (#912). +// - BenchmarkSyncPathsIncrementalAppend: absorbing one appended +// line into a large session must scale with the appended data, +// not the stored history (#954). +// - BenchmarkSyncAllColdArchive: first-sync ingest throughput for +// a fresh archive (the #411 bulk-write path). +// +// Fixture sizes scale via AGENTSVIEW_BENCH_SYNC_SESSIONS and +// AGENTSVIEW_BENCH_SYNC_MESSAGES for larger local runs. + +const ( + defaultBenchSyncSessions = 40 + defaultBenchSyncMessages = 30 + benchLargeSessionLines = 1000 +) + +func benchIntFromEnv(name string, def int) int { + raw := os.Getenv(name) + if raw == "" { + return def + } + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + return def + } + return n +} + +// writeBenchClaudeArchive lays out `sessions` Claude JSONL session +// files of `perSession` alternating user/assistant messages under +// dir/bench-project, mirroring the on-disk shape SyncAll discovers. +func writeBenchClaudeArchive( + b *testing.B, dir string, sessions, perSession int, +) { + b.Helper() + proj := filepath.Join(dir, "bench-project") + if err := os.MkdirAll(proj, 0o755); err != nil { + b.Fatalf("MkdirAll: %v", err) + } + for s := range sessions { + builder := testjsonl.NewSessionBuilder() + for m := 0; m < perSession; m += 2 { + ts := fmt.Sprintf( + "2026-06-20T10:%02d:%02dZ", (m/2/60)%60, (m/2)%60, + ) + builder.AddClaudeUser(ts, fmt.Sprintf( + "user message %d in session %d", m, s, + )) + builder.AddClaudeAssistant(ts, fmt.Sprintf( + "assistant reply %d in session %d", m, s, + )) + } + path := filepath.Join( + proj, fmt.Sprintf("bench-%04d.jsonl", s), + ) + if err := os.WriteFile( + path, []byte(builder.String()), 0o644, + ); err != nil { + b.Fatalf("WriteFile %s: %v", path, err) + } + } +} + +// openBenchEngine opens a fresh SQLite DB and an engine watching dir +// as a Claude root. Cleanup closes the engine before the DB so any +// pending debounced signal recompute drains first. +func openBenchEngine(b *testing.B, dir string) (*Engine, *db.DB) { + b.Helper() + database, err := db.Open(filepath.Join(b.TempDir(), "bench.db")) + if err != nil { + b.Fatalf("open bench db: %v", err) + } + engine := NewEngine(database, EngineConfig{ + AgentDirs: map[parser.AgentType][]string{ + parser.AgentClaude: {dir}, + }, + Machine: "local", + }) + b.Cleanup(func() { + engine.Close() + if err := database.Close(); err != nil { + b.Errorf("close bench db: %v", err) + } + }) + return engine, database +} + +// BenchmarkSyncAllWarmNoop measures a full sync pass over an archive +// that is already fully synced and unchanged on disk: discovery plus +// per-source freshness skips. It also asserts the invariant the +// benchmark exists to protect — a warm no-op pass must not reparse +// or rewrite anything. +func BenchmarkSyncAllWarmNoop(b *testing.B) { + sessions := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, + ) + perSession := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_MESSAGES", defaultBenchSyncMessages, + ) + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, sessions, perSession) + engine, _ := openBenchEngine(b, dir) + ctx := context.Background() + + first := engine.SyncAll(ctx, nil) + if first.Synced != sessions { + b.Fatalf( + "initial sync stored %d of %d sessions (failed=%d)", + first.Synced, sessions, first.Failed, + ) + } + + b.ReportAllocs() + b.ResetTimer() + for range b.N { + stats := engine.SyncAll(ctx, nil) + if stats.Synced != 0 { + b.Fatalf( + "warm no-op sync reparsed %d sessions", stats.Synced, + ) + } + } +} + +// BenchmarkSyncPathsIncrementalAppend measures absorbing a single +// appended JSONL line into a session that already stores +// benchLargeSessionLines messages, the streaming write the serve +// daemon performs thousands of times per day. +func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, 1, benchLargeSessionLines) + engine, database := openBenchEngine(b, dir) + ctx := context.Background() + + first := engine.SyncAll(ctx, nil) + if first.Synced != 1 { + b.Fatalf( + "initial sync stored %d sessions (failed=%d)", + first.Synced, first.Failed, + ) + } + + path := filepath.Join(dir, "bench-project", "bench-0000.jsonl") + f, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + b.Fatalf("OpenFile: %v", err) + } + defer f.Close() + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + line := testjsonl.NewSessionBuilder().AddClaudeUser( + "2026-06-20T11:00:00Z", + fmt.Sprintf("streamed line %d", i), + ).String() + if _, err := f.WriteString(line); err != nil { + b.Fatalf("append: %v", err) + } + engine.SyncPaths([]string{path}) + } + b.StopTimer() + + msgs, err := database.GetAllMessages(ctx, "bench-0000") + if err != nil { + b.Fatalf("GetAllMessages: %v", err) + } + if len(msgs) < benchLargeSessionLines+b.N { + b.Fatalf( + "appends were not absorbed: stored %d messages, want >= %d", + len(msgs), benchLargeSessionLines+b.N, + ) + } +} + +// BenchmarkSyncAllColdArchive measures first-sync ingest throughput: +// parse plus bulk write of a whole archive into a fresh database. +func BenchmarkSyncAllColdArchive(b *testing.B) { + sessions := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, + ) + perSession := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_MESSAGES", defaultBenchSyncMessages, + ) + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, sessions, perSession) + ctx := context.Background() + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + b.StopTimer() + database, err := db.Open( + filepath.Join(b.TempDir(), fmt.Sprintf("cold-%d.db", i)), + ) + if err != nil { + b.Fatalf("open bench db: %v", err) + } + engine := NewEngine(database, EngineConfig{ + AgentDirs: map[parser.AgentType][]string{ + parser.AgentClaude: {dir}, + }, + Machine: "local", + }) + b.StartTimer() + + stats := engine.SyncAll(ctx, nil) + + b.StopTimer() + if stats.Synced != sessions { + b.Fatalf( + "cold sync stored %d of %d sessions (failed=%d)", + stats.Synced, sessions, stats.Failed, + ) + } + engine.Close() + if err := database.Close(); err != nil { + b.Fatalf("close bench db: %v", err) + } + b.StartTimer() + } +} diff --git a/internal/sync/perf_invariant_test.go b/internal/sync/perf_invariant_test.go new file mode 100644 index 000000000..7f4ff8d43 --- /dev/null +++ b/internal/sync/perf_invariant_test.go @@ -0,0 +1,66 @@ +package sync + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Deterministic work-count gates for the "sync work scales with new +// data, not archive size" invariant. Unlike the wall-clock +// benchmarks in engine_bench_test.go, these run in the regular test +// suite and count work units, so they fail loudly in CI regardless +// of runner noise. Companion gates elsewhere: +// +// - TestProviderAuthoritativeUnchangedSessionSkipsOnResync covers +// the generic providerSourceUnchangedInDB skip for the +// provider.Parse fallthrough group (Vibe as representative). +// - TestWriteIncrementalDebouncesSignalRecompute pins the #954 +// debounce of the O(history) signal recompute. +// - The count-based seam tests in internal/parser +// (discovery_workspace_manifest_test.go, antigravity/gemini +// provider tests) pin O(roots) discovery work (#912). + +// TestWarmFullSyncDoesNoBulkWriteWork verifies that a second full +// sync over an unchanged Claude archive skips every session before +// the parse and never enters the bulk-write pipeline. Claude has its +// own pre-parse freshness path (shouldSkipProviderSourceByDB), +// distinct from the generic check the Vibe test covers; both have +// regressed independently in the past. +func TestWarmFullSyncDoesNoBulkWriteWork(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test") + } + + fx := newEngineFixture(t) + const n = 5 + for i := range n { + fx.writeClaudeSession( + t, "proj", fmt.Sprintf("warm-%d.jsonl", i), + fmt.Sprintf("hello %d", i), + ) + } + + ctx := context.Background() + first := fx.engine.SyncAll(ctx, nil) + require.Equal(t, n, first.Synced, + "first sync parses and stores every session") + + second := fx.engine.SyncAll(ctx, nil) + assert.Equal(t, 0, second.Synced, + "unchanged sessions must not be re-synced on a warm pass") + assert.GreaterOrEqual(t, second.Skipped, n, + "every unchanged session must be counted as skipped") + + // PhaseStats resets at the start of each pass, so after the + // second pass it reflects only that pass: a warm no-op sync + // must not have run a single bulk-write batch. + stats := fx.engine.PhaseStats() + assert.Zero(t, stats.Batches.Load(), + "warm no-op sync must not run any bulk-write batch") + assert.Zero(t, stats.BatchedWrites.Load(), + "warm no-op sync must not rewrite any session") +} From 09d2b9bfed54972b5dab1d5ebcf6abc8c111f240 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jul 2026 11:05:07 -0400 Subject: [PATCH 2/7] refactor(benchgate): build on x/perf benchfmt and benchmath instead of hand-rolled parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first benchgate cut reimplemented two things the Go performance tooling already provides: a parser for the benchmark output format and a comparison statistic. Swap both for golang.org/x/perf — benchfmt for parsing (handles the full format spec: config lines, unit tidying, sub-benchmarks, stray non-benchmark output) and benchmath, the statistics engine behind the rewritten benchstat, for summaries and significance testing. benchgate keeps only the policy benchstat deliberately does not provide: thresholds, floors, and a failing exit code for CI. Two behavioral changes fall out, both improvements: - Repeated -count runs are summarized by their median (benchstat semantics) instead of the minimum, and the time gate now also requires the difference to be statistically significant (Mann-Whitney U, p < 0.05) before failing — so a single noisy run cannot fail a PR, while allocs/op and B/op still gate on ratio alone and catch deterministic regressions even from a single run. - Benchmark keys include the package path (pkg config line), so same-named benchmarks in different packages can never merge into one sample. CLI flags, the workflow, and the Makefile target are unchanged; -time-floor-ns is converted internally to benchfmt's tidied sec/op unit. The x/net, x/oauth2, x/crypto, x/exp, and x/text bumps are forced by x/perf's module requirements via MVS. Validation: full -short suite green repo-wide; benchgate verified end-to-end against real bench output for the identical, regressed, and no-baseline cases. --- cmd/benchgate/main.go | 194 ++++++++++++++++----------- cmd/benchgate/main_test.go | 205 ++++++++++++++++++++--------- docs/internal/performance-gates.md | 19 ++- go.mod | 12 +- go.sum | 24 ++-- 5 files changed, 291 insertions(+), 163 deletions(-) diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go index 04030068d..fe820d4a7 100644 --- a/cmd/benchgate/main.go +++ b/cmd/benchgate/main.go @@ -2,45 +2,60 @@ // vs candidate) and exits non-zero when a benchmark regresses beyond // configured thresholds. // +// Parsing and statistics come from golang.org/x/perf — benchfmt for +// the benchmark format and benchmath (the engine behind benchstat) +// for summaries and significance tests. benchgate only adds the +// policy benchstat deliberately does not provide: thresholds, floors, +// and a failing exit code for CI. +// // It is the comparison step of the bench-gate CI workflow: allocs/op // and B/op are deterministic for the same code on the same machine, -// so they get tight thresholds that catch O(archive)-instead-of- -// O(delta) work regressions; ns/op is noisy on shared runners, so it -// gets a loose threshold that only catches algorithmic blowups. -// Small baselines below the per-metric floors are skipped entirely, -// since a few extra allocations on a tiny benchmark is noise, not a +// so they get tight ratio thresholds that catch O(archive)-instead- +// of-O(delta) work regressions regardless of sample count; time +// (sec/op) is noisy on shared runners, so it gets a loose threshold +// and additionally must be a statistically significant difference +// (Mann-Whitney U, as in benchstat) before it fails the gate. +// Baselines below a per-metric floor are skipped entirely, since a +// few extra allocations on a tiny benchmark is noise, not a // regression. // -// Multiple runs of the same benchmark (-count=N) are aggregated by -// taking the minimum per metric, the standard way to strip scheduler -// and GC noise. Benchmarks present on only one side are reported but -// never fail the gate, so adding or removing benchmarks in a PR does -// not wedge it. +// Multiple runs of the same benchmark (-count=N) are kept as a +// sample and summarized by their median. Benchmarks present on only +// one side are reported but never fail the gate, so adding or +// removing benchmarks in a PR does not wedge it. package main import ( - "bufio" "flag" "fmt" - "io" + "math" "os" "sort" - "strconv" "strings" + + "golang.org/x/perf/benchfmt" + "golang.org/x/perf/benchmath" + "golang.org/x/perf/benchunit" ) -// benchResult maps a metric unit (ns/op, B/op, allocs/op, or any -// custom -benchmem/ReportMetric unit) to the best (lowest) value -// observed across runs. -type benchResult map[string]float64 +// benchSamples collects every measured value per benchmark and unit: +// benchmark key -> tidied unit (sec/op, B/op, allocs/op, ...) -> +// samples across -count runs. The key includes the package path when +// the output carries one, so same-named benchmarks in different +// packages never merge. +type benchSamples map[string]map[string][]float64 -// gate is one metric's regression rule: fail when -// candidate/baseline exceeds maxRatio, unless the baseline is below -// floor (too small to compare meaningfully). +// gate is one metric's regression rule: fail when the candidate +// median exceeds the baseline median by more than maxRatio, unless +// the baseline is below floor (too small to compare meaningfully). +// With needSignificance set, the samples must also differ +// significantly under the benchmath comparison test — the benchstat +// noise guard, used for wall-clock time. type gate struct { - unit string - maxRatio float64 - floor float64 + unit string + maxRatio float64 + floor float64 + needSignificance bool } // violation describes one gate failure. @@ -53,60 +68,51 @@ type violation struct { } func (v violation) String() string { + cls := benchunit.ClassOf(v.unit) return fmt.Sprintf( - "%s: %s regressed %.2fx (%.0f -> %.0f, limit %.2fx)", - v.name, v.unit, v.ratio, v.old, v.new, v.maxRatio, + "%s: %s regressed %.2fx (%s -> %s, limit %.2fx)", + v.name, v.unit, v.ratio, + benchunit.Scale(v.old, cls), benchunit.Scale(v.new, cls), + v.maxRatio, ) } -// parseBench extracts benchmark results from `go test -bench` -// output. Lines that do not look like benchmark result lines -// (package headers, logs, PASS/ok trailers) are ignored. Repeated -// names keep the minimum value per metric. -func parseBench(r io.Reader) (map[string]benchResult, error) { - out := make(map[string]benchResult) - scanner := bufio.NewScanner(r) - scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) - for scanner.Scan() { - fields := strings.Fields(scanner.Text()) - if len(fields) < 4 || - !strings.HasPrefix(fields[0], "Benchmark") { +// parseBench extracts benchmark samples from `go test -bench` output +// using the official format parser. Non-result records (unit +// metadata, syntax errors from stray output) are skipped. Values +// arrive tidied by benchfmt: ns/op becomes sec/op, MB/s becomes B/s. +func parseBench(reader *benchfmt.Reader) (benchSamples, error) { + out := make(benchSamples) + for reader.Scan() { + res, ok := reader.Result().(*benchfmt.Result) + if !ok { continue } - // The second column is the iteration count; anything else - // (e.g. a log line that happens to start with "Benchmark") - // is not a result line. - if _, err := strconv.Atoi(fields[1]); err != nil { - continue + name := string(res.Name.Full()) + if pkg := res.GetConfig("pkg"); pkg != "" { + name = pkg + "." + name } - name := fields[0] - res := out[name] - if res == nil { - res = make(benchResult) - out[name] = res + units := out[name] + if units == nil { + units = make(map[string][]float64) + out[name] = units } - for i := 2; i+1 < len(fields); i += 2 { - value, err := strconv.ParseFloat(fields[i], 64) - if err != nil { - break - } - unit := fields[i+1] - if prev, ok := res[unit]; !ok || value < prev { - res[unit] = value - } + for _, v := range res.Values { + units[v.Unit] = append(units[v.Unit], v.Value) } } - if err := scanner.Err(); err != nil { + if err := reader.Err(); err != nil { return nil, err } return out, nil } // compare applies the gates to every benchmark present in both maps -// and returns the violations plus a human-readable report. +// and returns a human-readable report plus the violations. func compare( - oldRes, newRes map[string]benchResult, gates []gate, + oldRes, newRes benchSamples, gates []gate, ) (report []string, violations []violation) { + thresholds := benchmath.DefaultThresholds names := make([]string, 0, len(newRes)) for name := range newRes { names = append(names, name) @@ -114,7 +120,7 @@ func compare( sort.Strings(names) for _, name := range names { - oldBench, ok := oldRes[name] + oldUnits, ok := oldRes[name] if !ok { report = append(report, fmt.Sprintf( "%s: new benchmark, no baseline to compare", name, @@ -123,27 +129,51 @@ func compare( } var parts []string for _, g := range gates { - oldV, okOld := oldBench[g.unit] - newV, okNew := newRes[name][g.unit] + oldVals, okOld := oldUnits[g.unit] + newVals, okNew := newRes[name][g.unit] if !okOld || !okNew { continue } - if oldV < g.floor { + oldSample := benchmath.NewSample(oldVals, &thresholds) + newSample := benchmath.NewSample(newVals, &thresholds) + oldCenter := benchmath.AssumeNothing. + Summary(oldSample, 0.95).Center + newCenter := benchmath.AssumeNothing. + Summary(newSample, 0.95).Center + cls := benchunit.ClassOf(g.unit) + + if oldCenter <= 0 || oldCenter < g.floor { parts = append(parts, fmt.Sprintf( - "%s %.0f -> %.0f (below %.0f floor, not gated)", - g.unit, oldV, newV, g.floor, + "%s %s -> %s (below %s floor, not gated)", + g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + benchunit.Scale(g.floor, cls), )) continue } - ratio := newV / oldV - parts = append(parts, fmt.Sprintf( - "%s %.0f -> %.0f (%.2fx, limit %.2fx)", - g.unit, oldV, newV, ratio, g.maxRatio, - )) - if ratio > g.maxRatio { + + ratio := newCenter / oldCenter + cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) + significant := cmp.P < cmp.Alpha + detail := fmt.Sprintf( + "%s %s -> %s (%.2fx, limit %.2fx, %s)", + g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + ratio, g.maxRatio, cmp, + ) + if g.needSignificance && !significant { + detail += " [not significant, not gated]" + } + parts = append(parts, detail) + + if ratio > g.maxRatio && + (!g.needSignificance || significant) && + !math.IsNaN(ratio) { violations = append(violations, violation{ name: name, unit: g.unit, - old: oldV, new: newV, + old: oldCenter, new: newCenter, ratio: ratio, maxRatio: g.maxRatio, }) } @@ -169,13 +199,13 @@ func compare( return report, violations } -func parseFile(path string) (map[string]benchResult, error) { +func parseFile(path string) (benchSamples, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() - return parseBench(f) + return parseBench(benchfmt.NewReader(f, path)) } func main() { @@ -187,19 +217,20 @@ func main() { ) maxTimeRatio := flag.Float64( "max-time-ratio", 2.0, - "fail when ns/op exceeds baseline by this factor", + "fail when median sec/op exceeds baseline by this factor "+ + "(only when the difference is statistically significant)", ) maxAllocRatio := flag.Float64( "max-alloc-ratio", 1.25, - "fail when allocs/op exceeds baseline by this factor", + "fail when median allocs/op exceeds baseline by this factor", ) maxBytesRatio := flag.Float64( "max-bytes-ratio", 1.35, - "fail when B/op exceeds baseline by this factor", + "fail when median B/op exceeds baseline by this factor", ) timeFloorNs := flag.Float64( "time-floor-ns", 100_000, - "skip the ns/op gate when the baseline is below this", + "skip the time gate when the baseline is below this many ns", ) allocFloor := flag.Float64( "alloc-floor", 64, @@ -231,7 +262,12 @@ func main() { gates := []gate{ {unit: "allocs/op", maxRatio: *maxAllocRatio, floor: *allocFloor}, {unit: "B/op", maxRatio: *maxBytesRatio, floor: *bytesFloor}, - {unit: "ns/op", maxRatio: *maxTimeRatio, floor: *timeFloorNs}, + { + unit: "sec/op", + maxRatio: *maxTimeRatio, + floor: *timeFloorNs / 1e9, + needSignificance: true, + }, } report, violations := compare(oldRes, newRes, gates) for _, line := range report { diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go index a979ecf98..4210d8ba7 100644 --- a/cmd/benchgate/main_test.go +++ b/cmd/benchgate/main_test.go @@ -6,41 +6,60 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "golang.org/x/perf/benchfmt" ) +func parseString(t *testing.T, input string) benchSamples { + t.Helper() + got, err := parseBench( + benchfmt.NewReader(strings.NewReader(input), "test"), + ) + require.NoError(t, err) + return got +} + func TestParseBench(t *testing.T) { tests := []struct { name string input string - want map[string]benchResult + want benchSamples }{ { - name: "full benchmem line", + name: "full benchmem line with tidied units", input: "goos: linux\n" + + "pkg: example.com/x\n" + "BenchmarkFoo-8 \t 100\t 1234567 ns/op\t 2345 B/op\t 67 allocs/op\n" + - "PASS\nok \tpkg\t1.2s\n", - want: map[string]benchResult{ - "BenchmarkFoo-8": { - "ns/op": 1234567, "B/op": 2345, "allocs/op": 67, + "PASS\nok \texample.com/x\t1.2s\n", + want: benchSamples{ + "example.com/x.Foo-8": { + "sec/op": {1234567e-9}, + "B/op": {2345}, + "allocs/op": {67}, }, }, }, { - name: "multiple counts keep the minimum per metric", - input: "BenchmarkFoo-8 100 200 ns/op 50 B/op 9 allocs/op\n" + - "BenchmarkFoo-8 100 150 ns/op 60 B/op 8 allocs/op\n" + - "BenchmarkFoo-8 100 180 ns/op 40 B/op 10 allocs/op\n", - want: map[string]benchResult{ - "BenchmarkFoo-8": { - "ns/op": 150, "B/op": 40, "allocs/op": 8, + name: "multiple counts collect all samples", + input: "BenchmarkFoo-8 100 200 ns/op 9 allocs/op\n" + + "BenchmarkFoo-8 100 150 ns/op 8 allocs/op\n" + + "BenchmarkFoo-8 100 180 ns/op 10 allocs/op\n", + want: benchSamples{ + "Foo-8": { + "sec/op": {200e-9, 150e-9, 180e-9}, + "allocs/op": {9, 8, 10}, }, }, }, { - name: "time-only line without -benchmem", - input: "BenchmarkBar-4 500 0.5 ns/op\n", - want: map[string]benchResult{ - "BenchmarkBar-4": {"ns/op": 0.5}, + name: "same benchmark name in different packages stays separate", + input: "pkg: example.com/a\n" + + "BenchmarkScan-4 10 100 ns/op\n" + + "pkg: example.com/b\n" + + "BenchmarkScan-4 10 900 ns/op\n", + want: benchSamples{ + "example.com/a.Scan-4": {"sec/op": {100e-9}}, + "example.com/b.Scan-4": {"sec/op": {900e-9}}, }, }, { @@ -48,21 +67,33 @@ func TestParseBench(t *testing.T) { input: "2026/07/03 10:20:36 discovered 40 files in 0s\n" + "BenchmarkSync says hello\n" + "cpu: Apple M5 Max\n", - want: map[string]benchResult{}, + want: benchSamples{}, }, { name: "custom ReportMetric units are kept", - input: "BenchmarkBaz-2 10 900 ns/op 3 sessions/op\n", - want: map[string]benchResult{ - "BenchmarkBaz-2": {"ns/op": 900, "sessions/op": 3}, + input: "BenchmarkBaz-2 10 900 ns/op 3 things/op\n", + want: benchSamples{ + "Baz-2": { + "sec/op": {900e-9}, + "things/op": {3}, + }, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := parseBench(strings.NewReader(tt.input)) - require.NoError(t, err) - assert.Equal(t, tt.want, got) + got := parseString(t, tt.input) + require.Len(t, got, len(tt.want)) + for name, wantUnits := range tt.want { + gotUnits, ok := got[name] + require.True(t, ok, "missing benchmark %s", name) + for unit, wantVals := range wantUnits { + assert.InDeltaSlice( + t, wantVals, gotUnits[unit], 1e-15, + "%s %s", name, unit, + ) + } + } }) } } @@ -71,61 +102,93 @@ func testGates() []gate { return []gate{ {unit: "allocs/op", maxRatio: 1.25, floor: 64}, {unit: "B/op", maxRatio: 1.35, floor: 16_384}, - {unit: "ns/op", maxRatio: 2.0, floor: 100_000}, + { + unit: "sec/op", maxRatio: 2.0, floor: 100_000e-9, + needSignificance: true, + }, } } +// noisy fabricates a benchmark sample of n values spread ±2% around +// center, so significance tests have a realistic distribution. +func noisy(center float64, n int) []float64 { + out := make([]float64, n) + for i := range out { + out[i] = center * (0.98 + 0.01*float64(i%5)) + } + return out +} + func TestCompare(t *testing.T) { tests := []struct { name string - old, new map[string]benchResult + old, new benchSamples wantUnits []string // units of expected violations, in order wantReport []string // substrings that must appear in the report }{ { name: "within thresholds passes", - old: map[string]benchResult{ + old: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 1_000_000, "B/op": 100_000, "allocs/op": 1000, + "sec/op": noisy(1e-3, 6), + "B/op": {100_000}, + "allocs/op": {1000}, }, }, - new: map[string]benchResult{ + new: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 1_500_000, "B/op": 120_000, "allocs/op": 1100, + "sec/op": noisy(1.5e-3, 6), + "B/op": {120_000}, + "allocs/op": {1100}, }, }, wantUnits: nil, }, { - name: "alloc regression fails", - old: map[string]benchResult{ - "BenchmarkFoo-8": {"ns/op": 1_000_000, "allocs/op": 1000}, + name: "alloc regression fails even with a single run", + old: benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000}}, }, - new: map[string]benchResult{ - "BenchmarkFoo-8": {"ns/op": 1_000_000, "allocs/op": 2000}, + new: benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {2000}}, }, wantUnits: []string{"allocs/op"}, }, { - name: "time blowup fails", - old: map[string]benchResult{ - "BenchmarkFoo-8": {"ns/op": 1_000_000}, + name: "significant time blowup fails", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + }, + new: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(5e-3, 6)}, + }, + wantUnits: []string{"sec/op"}, + }, + { + name: "time blowup without significance is reported, not gated", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": {1e-3}}, }, - new: map[string]benchResult{ - "BenchmarkFoo-8": {"ns/op": 5_000_000}, + new: benchSamples{ + "BenchmarkFoo-8": {"sec/op": {5e-3}}, }, - wantUnits: []string{"ns/op"}, + wantUnits: nil, + wantReport: []string{"not significant, not gated"}, }, { name: "tiny baseline below floor is not gated", - old: map[string]benchResult{ + old: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 500, "B/op": 128, "allocs/op": 3, + "sec/op": {500e-9}, + "B/op": {128}, + "allocs/op": {3}, }, }, - new: map[string]benchResult{ + new: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 5000, "B/op": 1280, "allocs/op": 30, + "sec/op": {5000e-9}, + "B/op": {1280}, + "allocs/op": {30}, }, }, wantUnits: nil, @@ -133,35 +196,39 @@ func TestCompare(t *testing.T) { }, { name: "new benchmark without baseline is reported, not gated", - old: map[string]benchResult{}, - new: map[string]benchResult{ - "BenchmarkNew-8": {"ns/op": 1_000_000, "allocs/op": 99999}, + old: benchSamples{}, + new: benchSamples{ + "BenchmarkNew-8": {"allocs/op": {99999}}, }, wantUnits: nil, wantReport: []string{"no baseline to compare"}, }, { name: "removed benchmark is reported, not gated", - old: map[string]benchResult{ - "BenchmarkGone-8": {"ns/op": 1_000_000}, + old: benchSamples{ + "BenchmarkGone-8": {"sec/op": {1e-3}}, }, - new: map[string]benchResult{}, + new: benchSamples{}, wantUnits: nil, wantReport: []string{"missing from candidate"}, }, { name: "multiple regressions are all reported", - old: map[string]benchResult{ + old: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 1_000_000, "B/op": 100_000, "allocs/op": 1000, + "sec/op": noisy(1e-3, 6), + "B/op": {100_000}, + "allocs/op": {1000}, }, }, - new: map[string]benchResult{ + new: benchSamples{ "BenchmarkFoo-8": { - "ns/op": 9_000_000, "B/op": 900_000, "allocs/op": 9000, + "sec/op": noisy(9e-3, 6), + "B/op": {900_000}, + "allocs/op": {9000}, }, }, - wantUnits: []string{"allocs/op", "B/op", "ns/op"}, + wantUnits: []string{"allocs/op", "B/op", "sec/op"}, }, } for _, tt := range tests { @@ -184,15 +251,29 @@ func TestCompare(t *testing.T) { } } +// TestCompareMedianAcrossCounts pins that repeated -count runs are +// summarized by their median, so one outlier run cannot fail the +// gate on its own. +func TestCompareMedianAcrossCounts(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 1000, 1000, 1000}}, + } + // One wild outlier among otherwise-unchanged runs: the median + // stays 1000 and the gate must pass. + next := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 9000, 1000, 1000}}, + } + _, violations := compare(old, next, testGates()) + assert.Empty(t, violations) +} + func TestViolationString(t *testing.T) { v := violation{ name: "BenchmarkFoo-8", unit: "allocs/op", old: 1000, new: 2000, ratio: 2.0, maxRatio: 1.25, } - assert.Equal( - t, - "BenchmarkFoo-8: allocs/op regressed 2.00x "+ - "(1000 -> 2000, limit 1.25x)", - v.String(), - ) + got := v.String() + assert.Contains(t, got, "BenchmarkFoo-8") + assert.Contains(t, got, "allocs/op regressed 2.00x") + assert.Contains(t, got, "limit 1.25x") } diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md index e7eeb2d34..f9526afb2 100644 --- a/docs/internal/performance-gates.md +++ b/docs/internal/performance-gates.md @@ -60,13 +60,18 @@ its merge base on the same runner, then compares with `cmd/benchgate`: - `BenchmarkGetDailyUsage` — usage aggregation over 100k message rows. - `BenchmarkScan` / `BenchmarkScanDefinite` — secret-scan regex throughput. -`benchgate` gates hard on `allocs/op` (limit 1.25x) and `B/op` (1.35x), which -are deterministic for the same code on the same machine — an -O(archive)-instead-of-O(delta) regression always blows them up. `ns/op` gets a -loose 2.0x limit that catches algorithmic blowups while tolerating runner noise -(each metric uses the minimum across `-count` runs). Baselines below a -per-metric floor are not gated. Benchmarks that exist on only one side are -reported but never fail, so adding or removing benchmarks cannot wedge a PR. +`benchgate` builds on `golang.org/x/perf`: `benchfmt` parses the output and +`benchmath` — the statistics engine behind `benchstat` — summarizes each +benchmark's `-count` runs by their median and tests significance (Mann-Whitney +U). benchgate adds only the policy benchstat does not provide: thresholds, +floors, and a failing exit code. It gates hard on `allocs/op` (limit 1.25x) and +`B/op` (1.35x), which are deterministic for the same code on the same machine — +an O(archive)-instead-of-O(delta) regression always blows them up. Time +(`sec/op`) gets a loose 2.0x limit and must additionally be a statistically +significant difference before it fails, so runner noise cannot flake a PR but +algorithmic blowups still do. Baselines below a per-metric floor are not gated. +Benchmarks that exist on only one side are reported but never fail, so adding or +removing benchmarks cannot wedge a PR. Run locally: diff --git a/go.mod b/go.mod index 236d14d8f..d32a5084e 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/tidwall/gjson v1.19.0 go.kenn.io/kit v0.1.7 golang.org/x/mod v0.37.0 + golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6 golang.org/x/sync v0.21.0 golang.org/x/sys v0.46.0 golang.org/x/term v0.44.0 @@ -35,6 +36,7 @@ require ( dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 // indirect github.com/apache/arrow-go/v18 v18.5.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -109,12 +111,12 @@ require ( go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/otel/trace v1.43.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.51.0 // indirect - golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect - golang.org/x/net v0.54.0 // indirect - golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/crypto v0.53.0 // indirect + golang.org/x/exp v0.0.0-20260603202125-055de637280b // indirect + golang.org/x/net v0.56.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6 // indirect - golang.org/x/text v0.37.0 // indirect + golang.org/x/text v0.38.0 // indirect golang.org/x/tools v0.45.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect gopkg.in/ini.v1 v1.67.2 // indirect diff --git a/go.sum b/go.sum index 07b890a25..d347c22a3 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY= github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro= github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/apache/arrow-go/v18 v18.5.1 h1:yaQ6zxMGgf9YCYw4/oaeOU3AULySDlAYDOcnr4LdHdI= @@ -254,16 +256,18 @@ go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09 go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= -golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU= -golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= +golang.org/x/exp v0.0.0-20260603202125-055de637280b h1:v1uXiEBHo8QA0LiGCo7UgHMzHT4Kdfpl2zmtH5vaP1Q= +golang.org/x/exp v0.0.0-20260603202125-055de637280b/go.mod h1:d2fgXJLVs4dYDHUk5lwMIfzRzSrWCfGZb0ZqeLa/Vcw= golang.org/x/mod v0.37.0 h1:vF1DjpVEshcIqoEaauuHebaLk1O1forxjxBaVn884JQ= golang.org/x/mod v0.37.0/go.mod h1:m8S8VeM9r4dzDwjrKO0a1sZP3YjeMamRRlD+fmR2Q/0= -golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= -golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= -golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= -golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6 h1:YTGOochus7nQXtT5KnsVfJ3/1yslyOyP3m9vMG1qMuQ= +golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6/go.mod h1:FisaKCtzcRx02gCop3DILSnoD7CMnqwmde2yVxo4meM= golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -275,8 +279,8 @@ golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6 h1:HjU6IWBiAgRIdAJ9/y1 golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6/go.mod h1:Eqhaxk/wZsWEH8CRxLwj6xzEJbz7k1EFGqx7nyCoabE= golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8= golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From b47c47400a744816f95ebc95ad9245bdba0261fd Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jul 2026 11:11:33 -0400 Subject: [PATCH 3/7] fix(bench): truncate failed baselines, fix iteration counts, strengthen warm-noop assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-ups on the bench gate (roborev 4813/4814): - A failing merge-base benchmark run could leave partial output from packages that finished before the failure, so the gate would compare against incomplete stale data instead of the documented empty baseline. The workflow now truncates the baseline file on failure. - The gate now runs with a fixed -benchtime=20x iteration count instead of a 500ms duration. Two gated benchmarks grow their fixture as they iterate (appending to a session, adding sessions to a DB), so a duration-based benchtime let a slower side run fewer iterations against a different final size — comparing different workloads. Fixed iterations keep both sides identical; the growth is now documented on both benchmarks (it is deliberate for the append benchmark: flat per-append cost as the session grows is the invariant it protects). - BenchmarkSyncAllWarmNoop now also asserts zero bulk writes via PhaseStats, matching what the docs claimed it self-asserted. - Docs: adding a gated benchmark now includes the BENCH_PACKAGES/Makefile package-list step (a name in the pattern whose package is not in the list silently never runs), and the local run instructions are a complete before/after sequence. --- .github/workflows/bench.yml | 19 ++++++++---- Makefile | 5 +++- docs/internal/performance-gates.md | 48 ++++++++++++++++++++---------- internal/db/messages_bench_test.go | 6 ++++ internal/sync/engine_bench_test.go | 15 +++++++++- 5 files changed, 70 insertions(+), 23 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 4322236bb..4fbdffd98 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -33,7 +33,12 @@ env: BENCH_PACKAGES: ./internal/sync ./internal/db ./internal/secrets BENCH_PATTERN: ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$ BENCH_COUNT: "6" - BENCH_TIME: 500ms + # Fixed iterations (not a duration) so the baseline and candidate + # run identical workloads: two of the gated benchmarks grow their + # fixture as they iterate (appending to a session, adding sessions + # to a DB), and a duration-based -benchtime would let a slower side + # run fewer iterations against a different final size. + BENCH_TIME: 20x jobs: bench-gate: @@ -68,16 +73,20 @@ jobs: # A failing merge-base run (or one with none of the gated # benchmarks yet) degrades to an empty baseline: benchgate # then reports every benchmark as new and passes, so a broken - # base never blocks a PR. + # base never blocks a PR. The baseline file is truncated on + # failure so partial output from packages that ran before the + # failure cannot gate against incomplete data. run: | base=$(git merge-base HEAD "origin/$BASE_REF") git worktree add /tmp/bench-base "$base" cd /tmp/bench-base go run ./internal/pricing/cmd/litellm-snapshot -restore || true - go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ + if ! go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ - -timeout 25m $BENCH_PACKAGES > /tmp/bench-old.txt \ - || echo "merge-base benchmarks failed; comparing against empty baseline" + -timeout 25m $BENCH_PACKAGES > /tmp/bench-old.txt; then + echo "merge-base benchmarks failed; comparing against empty baseline" + : > /tmp/bench-old.txt + fi - name: Compare against merge base run: go run ./cmd/benchgate -old /tmp/bench-old.txt -new /tmp/bench-new.txt diff --git a/Makefile b/Makefile index 3a664faf0..f81dafcb0 100644 --- a/Makefile +++ b/Makefile @@ -274,7 +274,10 @@ bench-backends: pricing-snapshot ensure-embed-dir # pattern in sync with .github/workflows/bench.yml. BENCH_GATE_PATTERN ?= ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$$ BENCH_GATE_COUNT ?= 6 -BENCH_GATE_TIME ?= 500ms +# Fixed iterations, not a duration: some gated benchmarks grow their +# fixture as they iterate, so baseline and candidate must run the +# same iteration count to measure identical workloads. +BENCH_GATE_TIME ?= 20x bench-gate: pricing-snapshot ensure-embed-dir CGO_ENABLED=1 go test -tags "fts5" -run '^$$' \ -bench '$(BENCH_GATE_PATTERN)' -benchmem \ diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md index f9526afb2..92f7b39e0 100644 --- a/docs/internal/performance-gates.md +++ b/docs/internal/performance-gates.md @@ -50,7 +50,7 @@ sessions" or "the manifest is read once per root regardless of session count". its merge base on the same runner, then compares with `cmd/benchgate`: - `BenchmarkSyncAllWarmNoop` — full sync over an already-synced archive (stat + - skip work only; also self-asserts nothing is reparsed). + skip work only; also self-asserts nothing is re-synced or bulk-rewritten). - `BenchmarkSyncPathsIncrementalAppend` — absorb one appended line into a 1,000-message session. - `BenchmarkSyncAllColdArchive` — first-sync ingest throughput. @@ -61,23 +61,33 @@ its merge base on the same runner, then compares with `cmd/benchgate`: - `BenchmarkScan` / `BenchmarkScanDefinite` — secret-scan regex throughput. `benchgate` builds on `golang.org/x/perf`: `benchfmt` parses the output and -`benchmath` — the statistics engine behind `benchstat` — summarizes each -benchmark's `-count` runs by their median and tests significance (Mann-Whitney -U). benchgate adds only the policy benchstat does not provide: thresholds, -floors, and a failing exit code. It gates hard on `allocs/op` (limit 1.25x) and -`B/op` (1.35x), which are deterministic for the same code on the same machine — -an O(archive)-instead-of-O(delta) regression always blows them up. Time -(`sec/op`) gets a loose 2.0x limit and must additionally be a statistically -significant difference before it fails, so runner noise cannot flake a PR but -algorithmic blowups still do. Baselines below a per-metric floor are not gated. -Benchmarks that exist on only one side are reported but never fail, so adding or -removing benchmarks cannot wedge a PR. - -Run locally: +`benchmath` — the statistics engine behind `benchstat` — summarizes samples and +tests significance (Mann-Whitney U). benchgate adds only the policy benchstat +does not provide: thresholds, floors, and a failing exit code. Gating is per +benchmark — any single benchmark over its threshold fails the PR; nothing is +averaged across benchmarks. It gates hard on `allocs/op` (limit 1.25x) and +`B/op` (1.35x), which are deterministic for the same code and iteration count — +an O(archive)-instead-of-O(delta) regression always blows them up. Those two +compare the candidate's *worst* `-count` run against the baseline median, so +even an intermittent extra-allocation path fails. Time (`sec/op`) compares +medians with a loose 2.0x limit and must additionally be a statistically +significant difference before it fails, so a single slow run on a noisy runner +cannot flake a PR but algorithmic blowups still do. Baselines below a per-metric +floor are not gated. Benchmarks that exist on only one side are reported but +never fail, so adding or removing benchmarks cannot wedge a PR. + +The gate always runs with a fixed `-benchtime=Nx` iteration count (not a +duration): two of the benchmarks grow their fixture as they iterate, so the +baseline and candidate must run the same number of iterations to measure +identical workloads. + +Run locally, comparing your working tree against a baseline commit: ```bash -make bench-gate # current tree -git stash && make bench-gate > old.txt # baseline, however you produce it +make bench-gate > new.txt # candidate: current tree +git stash # or: git switch --detach origin/main +make bench-gate > old.txt # baseline +git stash pop # restore the working tree go run ./cmd/benchgate -old old.txt -new new.txt ``` @@ -90,5 +100,11 @@ Cross-backend query benchmarks live separately in `internal/backendbench` `b.ReportAllocs()`, self-assert the invariant it protects where possible). 1. Add its name to `BENCH_PATTERN` in `.github/workflows/bench.yml` **and** `BENCH_GATE_PATTERN` in the Makefile. +1. Make sure its package is in `BENCH_PACKAGES` (bench.yml) and the Makefile + `bench-gate` package list — a name in the pattern whose package is not in + the list is silently never run, so it looks gated while measuring nothing. 1. Keep per-op cost roughly in the 100µs–100ms band: below the benchgate floors nothing is gated, and far above it the job gets slow. +1. If the benchmark's fixture grows across iterations, say so in its comment; + the fixed `-benchtime=Nx` keeps both sides comparable, but readers need to + know per-op cost depends on the iteration count. diff --git a/internal/db/messages_bench_test.go b/internal/db/messages_bench_test.go index 3db4bc525..7699c090b 100644 --- a/internal/db/messages_bench_test.go +++ b/internal/db/messages_bench_test.go @@ -123,6 +123,12 @@ func BenchmarkReplaceSessionMessagesStreamingMerge(b *testing.B) { // BenchmarkInsertMessagesBatch measures bulk session ingest: one // session row plus a batch insert of its messages, the unit of work // the full-sync write pipeline performs per session. +// +// Each iteration adds a new session, so the database grows with the +// iteration count and per-op cost is only comparable between runs +// with the same count: the bench gate always runs with a fixed +// -benchtime=Nx (see bench.yml and the Makefile) so baseline and +// candidate insert into identically sized databases. func BenchmarkInsertMessagesBatch(b *testing.B) { const batch = 200 d := openBenchDB(b) diff --git a/internal/sync/engine_bench_test.go b/internal/sync/engine_bench_test.go index bc8a628ed..aca1f9ebf 100644 --- a/internal/sync/engine_bench_test.go +++ b/internal/sync/engine_bench_test.go @@ -141,7 +141,12 @@ func BenchmarkSyncAllWarmNoop(b *testing.B) { stats := engine.SyncAll(ctx, nil) if stats.Synced != 0 { b.Fatalf( - "warm no-op sync reparsed %d sessions", stats.Synced, + "warm no-op sync re-synced %d sessions", stats.Synced, + ) + } + if writes := engine.PhaseStats().BatchedWrites.Load(); writes != 0 { + b.Fatalf( + "warm no-op sync bulk-wrote %d sessions", writes, ) } } @@ -151,6 +156,14 @@ func BenchmarkSyncAllWarmNoop(b *testing.B) { // appended JSONL line into a session that already stores // benchLargeSessionLines messages, the streaming write the serve // daemon performs thousands of times per day. +// +// The session grows by one message per iteration, so per-op cost is +// only comparable between runs with the same iteration count: the +// bench gate always runs with a fixed -benchtime=Nx (see bench.yml +// and the Makefile) so baseline and candidate absorb appends into +// identically sized sessions. Growth is deliberate — per-append cost +// staying flat as the session grows is exactly the invariant this +// benchmark protects. func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { dir := b.TempDir() writeBenchClaudeArchive(b, dir, 1, benchLargeSessionLines) From 9917817794b493a2063822c4442ae28fdc55ddc4 Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jul 2026 11:11:47 -0400 Subject: [PATCH 4/7] feat(benchgate): gate deterministic metrics on the candidate's worst run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gating was per-benchmark already — any single benchmark over its threshold fails the PR — but within one benchmark's repeated -count runs, every metric was summarized by its median, so a regression that only manifests in some runs (an intermittent allocation path, a data-dependent branch) could hide behind five clean runs. For allocs/op and B/op that leniency buys nothing: with identical code and a fixed iteration count they are deterministic, so an outlier run is signal, not noise. Those gates now compare the candidate's worst run against the baseline median — one bad run out of six fails. Wall-clock time keeps median-plus-significance, because a single slow run on a shared runner genuinely is noisy-neighbor noise and gating on it would make the workflow flaky. --- cmd/benchgate/main.go | 73 ++++++++++++++++++++++++++++---------- cmd/benchgate/main_test.go | 50 +++++++++++++++++--------- 2 files changed, 88 insertions(+), 35 deletions(-) diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go index fe820d4a7..7ad7d1fd7 100644 --- a/cmd/benchgate/main.go +++ b/cmd/benchgate/main.go @@ -20,9 +20,14 @@ // regression. // // Multiple runs of the same benchmark (-count=N) are kept as a -// sample and summarized by their median. Benchmarks present on only -// one side are reported but never fail the gate, so adding or -// removing benchmarks in a PR does not wedge it. +// sample. The baseline is summarized by its median; the candidate is +// gated on its median for time but on its WORST run for allocs/op +// and B/op — those are deterministic, so a single outlier run there +// is a real intermittent allocation path, not noise, and must fail. +// Gating is per benchmark: any one benchmark over its threshold +// fails the gate; there is no cross-benchmark averaging. Benchmarks +// present on only one side are reported but never fail the gate, so +// adding or removing benchmarks in a PR does not wedge it. package main import ( @@ -46,15 +51,19 @@ import ( type benchSamples map[string]map[string][]float64 // gate is one metric's regression rule: fail when the candidate -// median exceeds the baseline median by more than maxRatio, unless -// the baseline is below floor (too small to compare meaningfully). -// With needSignificance set, the samples must also differ -// significantly under the benchmath comparison test — the benchstat -// noise guard, used for wall-clock time. +// exceeds the baseline median by more than maxRatio, unless the +// baseline is below floor (too small to compare meaningfully). With +// worstCase set, the candidate is judged by its worst (highest) run +// rather than its median — for deterministic metrics, where any +// outlier run is a real intermittent code path. With +// needSignificance set, the samples must also differ significantly +// under the benchmath comparison test — the benchstat noise guard, +// used for wall-clock time. type gate struct { unit string maxRatio float64 floor float64 + worstCase bool needSignificance bool } @@ -140,6 +149,11 @@ func compare( Summary(oldSample, 0.95).Center newCenter := benchmath.AssumeNothing. Summary(newSample, 0.95).Center + if g.worstCase { + // Samples are sorted ascending; the worst + // candidate run is the last one. + newCenter = newSample.Values[len(newSample.Values)-1] + } cls := benchunit.ClassOf(g.unit) if oldCenter <= 0 || oldCenter < g.floor { @@ -156,15 +170,26 @@ func compare( ratio := newCenter / oldCenter cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) significant := cmp.P < cmp.Alpha - detail := fmt.Sprintf( - "%s %s -> %s (%.2fx, limit %.2fx, %s)", - g.unit, - benchunit.Scale(oldCenter, cls), - benchunit.Scale(newCenter, cls), - ratio, g.maxRatio, cmp, - ) - if g.needSignificance && !significant { - detail += " [not significant, not gated]" + var detail string + if g.worstCase { + detail = fmt.Sprintf( + "%s %s -> %s (%.2fx, limit %.2fx, worst of %d run(s))", + g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + ratio, g.maxRatio, len(newSample.Values), + ) + } else { + detail = fmt.Sprintf( + "%s %s -> %s (%.2fx, limit %.2fx, %s)", + g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + ratio, g.maxRatio, cmp, + ) + if g.needSignificance && !significant { + detail += " [not significant, not gated]" + } } parts = append(parts, detail) @@ -260,8 +285,18 @@ func main() { } gates := []gate{ - {unit: "allocs/op", maxRatio: *maxAllocRatio, floor: *allocFloor}, - {unit: "B/op", maxRatio: *maxBytesRatio, floor: *bytesFloor}, + { + unit: "allocs/op", + maxRatio: *maxAllocRatio, + floor: *allocFloor, + worstCase: true, + }, + { + unit: "B/op", + maxRatio: *maxBytesRatio, + floor: *bytesFloor, + worstCase: true, + }, { unit: "sec/op", maxRatio: *maxTimeRatio, diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go index 4210d8ba7..9a06f1898 100644 --- a/cmd/benchgate/main_test.go +++ b/cmd/benchgate/main_test.go @@ -100,8 +100,8 @@ func TestParseBench(t *testing.T) { func testGates() []gate { return []gate{ - {unit: "allocs/op", maxRatio: 1.25, floor: 64}, - {unit: "B/op", maxRatio: 1.35, floor: 16_384}, + {unit: "allocs/op", maxRatio: 1.25, floor: 64, worstCase: true}, + {unit: "B/op", maxRatio: 1.35, floor: 16_384, worstCase: true}, { unit: "sec/op", maxRatio: 2.0, floor: 100_000e-9, needSignificance: true, @@ -251,20 +251,38 @@ func TestCompare(t *testing.T) { } } -// TestCompareMedianAcrossCounts pins that repeated -count runs are -// summarized by their median, so one outlier run cannot fail the -// gate on its own. -func TestCompareMedianAcrossCounts(t *testing.T) { - old := benchSamples{ - "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 1000, 1000, 1000}}, - } - // One wild outlier among otherwise-unchanged runs: the median - // stays 1000 and the gate must pass. - next := benchSamples{ - "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 9000, 1000, 1000}}, - } - _, violations := compare(old, next, testGates()) - assert.Empty(t, violations) +// TestCompareOutlierRunPolicy pins the split policy for a single +// outlier among repeated -count runs of one benchmark. allocs/op is +// deterministic for identical code and iteration counts, so one +// outlier run means a real intermittent allocation path and must +// fail even though the median is unchanged. Wall-clock time is +// summarized by its median, so one slow run on a noisy runner cannot +// fail the gate on its own. +func TestCompareOutlierRunPolicy(t *testing.T) { + t.Run("alloc outlier run fails", func(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 1000, 1000, 1000}}, + } + next := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 9000, 1000, 1000}}, + } + _, violations := compare(old, next, testGates()) + require.Len(t, violations, 1) + assert.Equal(t, "allocs/op", violations[0].unit) + assert.InDelta(t, 9000, violations[0].new, 1e-9, + "the worst run is what gets gated") + }) + + t.Run("time outlier run does not fail", func(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + } + next := benchSamples{ + "BenchmarkFoo-8": {"sec/op": append(noisy(1e-3, 5), 9e-3)}, + } + _, violations := compare(old, next, testGates()) + assert.Empty(t, violations) + }) } func TestViolationString(t *testing.T) { From 6e01b26213d2029ee7fbb9406c9c44134600f7de Mon Sep 17 00:00:00 2001 From: Marius van Niekerk Date: Fri, 3 Jul 2026 12:35:22 -0400 Subject: [PATCH 5/7] fix(bench): enforce time-gate sample counts, document gate policy, drop the benchmark allowlist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-ups (roborev 4826/4827/4829/4830) plus a maintenance simplification: - The sec/op gate uses Mann-Whitney U, which can never reach significance with fewer than 5 samples per side — so a small -count silently disabled the time gate while the docs claimed coverage. benchgate now treats that as an explicit configuration error, and the workflow/Makefile document the >= 5 requirement next to the count settings. - The candidate-worst vs baseline-median asymmetry for deterministic metrics is now documented as an intentional candidate-must-be-clean gate, and worst-case report lines include the baseline's worst run so pre-existing instability is visible when reading a failure. Both stale flag help texts (median wording) now describe the worst-run policy. - The local baseline recipe used git stash, which neither moves committed branch work back to a baseline nor removes untracked candidate files — the baseline run could measure candidate code. It now mirrors CI's worktree approach. - Docs note that report identifiers are package-qualified only when the capture carries pkg: metadata (bare-name fallback otherwise), and that qualified and unqualified captures must not be mixed. - Dropped BENCH_PATTERN/BENCH_GATE_PATTERN entirely: the per-name allowlist duplicated information and could silently rot. The gate now runs every benchmark in the gated packages; this is safe because a benchmark with no baseline is reported without gating, so new benchmarks auto-join the gate only once they exist on both sides. BENCH_PACKAGES is the single gate boundary. --- .github/workflows/bench.yml | 15 ++++++--- Makefile | 16 +++++---- cmd/benchgate/main.go | 52 +++++++++++++++++++++++++----- cmd/benchgate/main_test.go | 24 +++++++++++--- docs/internal/performance-gates.md | 46 +++++++++++++++++--------- 5 files changed, 113 insertions(+), 40 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 4fbdffd98..6c7a669db 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -16,8 +16,11 @@ name: Bench Gate # that only catches algorithmic blowups; both sides run on the same # runner within one job, so the comparison is apples to apples. # Benchmarks that only exist on one side are reported but never fail -# the gate. Keep BENCH_PATTERN in sync with the Makefile's -# bench-gate target. +# the gate — which is what makes running every benchmark in the +# gated packages safe: a benchmark added by a PR has no baseline and +# is reported without gating, then gates automatically once merged. +# BENCH_PACKAGES is the gate boundary; keep it in sync with the +# Makefile's bench-gate target. on: pull_request: @@ -31,7 +34,9 @@ permissions: env: BENCH_PACKAGES: ./internal/sync ./internal/db ./internal/secrets - BENCH_PATTERN: ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$ + # Must stay >= 5: benchgate's time gate needs at least 5 samples + # per side for the significance test and treats fewer as a + # configuration error. BENCH_COUNT: "6" # Fixed iterations (not a duration) so the baseline and candidate # run identical workloads: two of the gated benchmarks grow their @@ -62,7 +67,7 @@ jobs: CGO_ENABLED: "1" run: | set -o pipefail - go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ + go test -tags "fts5" -run '^$' -bench . \ -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ -timeout 25m $BENCH_PACKAGES | tee /tmp/bench-new.txt @@ -81,7 +86,7 @@ jobs: git worktree add /tmp/bench-base "$base" cd /tmp/bench-base go run ./internal/pricing/cmd/litellm-snapshot -restore || true - if ! go test -tags "fts5" -run '^$' -bench "$BENCH_PATTERN" \ + if ! go test -tags "fts5" -run '^$' -bench . \ -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ -timeout 25m $BENCH_PACKAGES > /tmp/bench-old.txt; then echo "merge-base benchmarks failed; comparing against empty baseline" diff --git a/Makefile b/Makefile index f81dafcb0..0e2cecd11 100644 --- a/Makefile +++ b/Makefile @@ -266,13 +266,15 @@ bench-backends: pricing-snapshot ensure-embed-dir AGENTSVIEW_BENCH_MESSAGES_PER_SESSION=$(BENCH_BACKENDS_MESSAGES_PER_SESSION) \ CGO_ENABLED=1 go test -tags "fts5,benchdb" ./internal/backendbench $(BENCH_BACKENDS_FLAGS) -# Hot-path benchmark gate. Runs the same benchmark set CI's bench.yml -# compares against a PR's merge base (sync engine warm/cold/append, -# message write paths, usage aggregation, secret scanning). Run it -# before and after touching a sync or DB hot path, then compare with +# Hot-path benchmark gate. Runs every benchmark in the gated packages +# — the same set CI's bench.yml compares against a PR's merge base +# (sync engine warm/cold/append, message write paths, usage +# aggregation, secret scanning). Run it before and after touching a +# sync or DB hot path, then compare with # `go run ./cmd/benchgate -old old.txt -new new.txt`. Keep the -# pattern in sync with .github/workflows/bench.yml. -BENCH_GATE_PATTERN ?= ^(BenchmarkSyncAllColdArchive|BenchmarkSyncAllWarmNoop|BenchmarkSyncPathsIncrementalAppend|BenchmarkReplaceSessionMessagesStreamingMerge|BenchmarkInsertMessagesBatch|BenchmarkGetDailyUsage|BenchmarkScan|BenchmarkScanDefinite)$$ +# package list in sync with BENCH_PACKAGES in bench.yml. +# Count must stay >= 5: benchgate's time gate needs at least 5 +# samples per side for its significance test. BENCH_GATE_COUNT ?= 6 # Fixed iterations, not a duration: some gated benchmarks grow their # fixture as they iterate, so baseline and candidate must run the @@ -280,7 +282,7 @@ BENCH_GATE_COUNT ?= 6 BENCH_GATE_TIME ?= 20x bench-gate: pricing-snapshot ensure-embed-dir CGO_ENABLED=1 go test -tags "fts5" -run '^$$' \ - -bench '$(BENCH_GATE_PATTERN)' -benchmem \ + -bench . -benchmem \ -count $(BENCH_GATE_COUNT) -benchtime $(BENCH_GATE_TIME) \ -timeout 25m ./internal/sync ./internal/db ./internal/secrets diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go index 7ad7d1fd7..bd09523cb 100644 --- a/cmd/benchgate/main.go +++ b/cmd/benchgate/main.go @@ -76,6 +76,11 @@ type violation struct { maxRatio float64 } +type configIssue struct { + name string + msg string +} + func (v violation) String() string { cls := benchunit.ClassOf(v.unit) return fmt.Sprintf( @@ -120,7 +125,7 @@ func parseBench(reader *benchfmt.Reader) (benchSamples, error) { // and returns a human-readable report plus the violations. func compare( oldRes, newRes benchSamples, gates []gate, -) (report []string, violations []violation) { +) (report []string, violations []violation, issues []configIssue) { thresholds := benchmath.DefaultThresholds names := make([]string, 0, len(newRes)) for name := range newRes { @@ -166,18 +171,40 @@ func compare( )) continue } + if g.needSignificance && (len(oldSample.Values) < 5 || len(newSample.Values) < 5) { + issues = append(issues, configIssue{ + name: name, + msg: fmt.Sprintf( + "%s needs at least 5 samples on both sides for significance gating (old=%d, new=%d)", + g.unit, len(oldSample.Values), len(newSample.Values), + ), + }) + parts = append(parts, fmt.Sprintf( + "%s %s -> %s (sample count too small for significance gating, not gated)", + g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + )) + continue + } ratio := newCenter / oldCenter cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) significant := cmp.P < cmp.Alpha var detail string if g.worstCase { + // Also surface the baseline's worst run: the gate is + // deliberately candidate-worst vs baseline-median, so + // pre-existing baseline instability should at least + // be visible when reading a failure. + oldWorst := oldSample.Values[len(oldSample.Values)-1] detail = fmt.Sprintf( - "%s %s -> %s (%.2fx, limit %.2fx, worst of %d run(s))", + "%s %s -> %s (%.2fx, limit %.2fx, worst of %d run(s), baseline worst %s)", g.unit, benchunit.Scale(oldCenter, cls), benchunit.Scale(newCenter, cls), ratio, g.maxRatio, len(newSample.Values), + benchunit.Scale(oldWorst, cls), ) } else { detail = fmt.Sprintf( @@ -221,7 +248,7 @@ func compare( name, )) } - return report, violations + return report, violations, issues } func parseFile(path string) (benchSamples, error) { @@ -242,16 +269,17 @@ func main() { ) maxTimeRatio := flag.Float64( "max-time-ratio", 2.0, - "fail when median sec/op exceeds baseline by this factor "+ - "(only when the difference is statistically significant)", + "fail when candidate median sec/op exceeds baseline by this factor "+ + "(only when the difference is statistically significant; needs at "+ + "least 5 samples per side)", ) maxAllocRatio := flag.Float64( "max-alloc-ratio", 1.25, - "fail when median allocs/op exceeds baseline by this factor", + "fail when candidate worst-run allocs/op exceeds baseline median by this factor", ) maxBytesRatio := flag.Float64( "max-bytes-ratio", 1.35, - "fail when median B/op exceeds baseline by this factor", + "fail when candidate worst-run B/op exceeds baseline median by this factor", ) timeFloorNs := flag.Float64( "time-floor-ns", 100_000, @@ -304,10 +332,18 @@ func main() { needSignificance: true, }, } - report, violations := compare(oldRes, newRes, gates) + report, violations, issues := compare(oldRes, newRes, gates) for _, line := range report { fmt.Println(line) } + if len(issues) > 0 { + fmt.Println() + fmt.Println("benchgate: invalid benchmark configuration:") + for _, issue := range issues { + fmt.Printf(" %s: %s\n", issue.name, issue.msg) + } + os.Exit(2) + } if len(newRes) == 0 { fmt.Println("benchgate: candidate output contains no benchmarks") os.Exit(2) diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go index 9a06f1898..aaf55c567 100644 --- a/cmd/benchgate/main_test.go +++ b/cmd/benchgate/main_test.go @@ -167,10 +167,12 @@ func TestCompare(t *testing.T) { { name: "time blowup without significance is reported, not gated", old: benchSamples{ - "BenchmarkFoo-8": {"sec/op": {1e-3}}, + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 5)}, }, new: benchSamples{ - "BenchmarkFoo-8": {"sec/op": {5e-3}}, + "BenchmarkFoo-8": { + "sec/op": append(noisy(1e-3, 4), 5e-3), + }, }, wantUnits: nil, wantReport: []string{"not significant, not gated"}, @@ -233,11 +235,12 @@ func TestCompare(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - report, violations := compare(tt.old, tt.new, testGates()) + report, violations, issues := compare(tt.old, tt.new, testGates()) units := make([]string, 0, len(violations)) for _, v := range violations { units = append(units, v.unit) } + assert.Empty(t, issues) if len(tt.wantUnits) == 0 { assert.Empty(t, violations) } else { @@ -266,8 +269,9 @@ func TestCompareOutlierRunPolicy(t *testing.T) { next := benchSamples{ "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 9000, 1000, 1000}}, } - _, violations := compare(old, next, testGates()) + _, violations, issues := compare(old, next, testGates()) require.Len(t, violations, 1) + assert.Empty(t, issues) assert.Equal(t, "allocs/op", violations[0].unit) assert.InDelta(t, 9000, violations[0].new, 1e-9, "the worst run is what gets gated") @@ -280,11 +284,21 @@ func TestCompareOutlierRunPolicy(t *testing.T) { next := benchSamples{ "BenchmarkFoo-8": {"sec/op": append(noisy(1e-3, 5), 9e-3)}, } - _, violations := compare(old, next, testGates()) + _, violations, issues := compare(old, next, testGates()) + assert.Empty(t, issues) assert.Empty(t, violations) }) } +func TestCompareRequiresEnoughSamplesForTimeGate(t *testing.T) { + old := benchSamples{"BenchmarkFoo-8": {"sec/op": {1e-3}}} + next := benchSamples{"BenchmarkFoo-8": {"sec/op": {2e-3}}} + _, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + require.Len(t, issues, 1) + assert.Contains(t, issues[0].msg, "at least 5 samples") +} + func TestViolationString(t *testing.T) { v := violation{ name: "BenchmarkFoo-8", unit: "allocs/op", diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md index 92f7b39e0..a5bddf009 100644 --- a/docs/internal/performance-gates.md +++ b/docs/internal/performance-gates.md @@ -69,25 +69,38 @@ averaged across benchmarks. It gates hard on `allocs/op` (limit 1.25x) and `B/op` (1.35x), which are deterministic for the same code and iteration count — an O(archive)-instead-of-O(delta) regression always blows them up. Those two compare the candidate's *worst* `-count` run against the baseline median, so -even an intermittent extra-allocation path fails. Time (`sec/op`) compares -medians with a loose 2.0x limit and must additionally be a statistically -significant difference before it fails, so a single slow run on a noisy runner -cannot flake a PR but algorithmic blowups still do. Baselines below a per-metric -floor are not gated. Benchmarks that exist on only one side are reported but -never fail, so adding or removing benchmarks cannot wedge a PR. +even an intermittent extra-allocation path fails. That is intentionally +asymmetric: the baseline is treated as the historical reference, and candidate +instability is what blocks the PR (failure lines include the baseline's worst +run so pre-existing instability is visible). Time (`sec/op`) compares medians +with a loose 2.0x limit and must additionally be a statistically significant +difference before it fails, so a single slow run on a noisy runner cannot flake +a PR but algorithmic blowups still do. Time gating requires at least 5 samples +on both sides; smaller captures are reported as misconfigured rather than +silently disabling the gate. Baselines below a per-metric floor are not gated. +Benchmarks that exist on only one side are reported but never fail, so adding or +removing benchmarks cannot wedge a PR. The gate always runs with a fixed `-benchtime=Nx` iteration count (not a duration): two of the benchmarks grow their fixture as they iterate, so the baseline and candidate must run the same number of iterations to measure identical workloads. -Run locally, comparing your working tree against a baseline commit: +Report identifiers are package-qualified benchmark names +(`go.kenn.io/agentsview/internal/db.InsertMessagesBatch-18`) when the captured +output carries `pkg:` metadata, falling back to the bare name when it does not +(e.g. hand-trimmed captures). Do not mix captures with and without `pkg:` lines: +the same benchmark would key differently and be treated as removed/new. + +Run locally, comparing your working tree against a baseline commit. Like CI, use +a worktree for the baseline — checking out or stashing in place can leave +candidate files (or your commits) in the baseline run: ```bash make bench-gate > new.txt # candidate: current tree -git stash # or: git switch --detach origin/main -make bench-gate > old.txt # baseline -git stash pop # restore the working tree +git worktree add /tmp/bench-base "$(git merge-base HEAD origin/main)" +make -C /tmp/bench-base bench-gate > old.txt +git worktree remove /tmp/bench-base go run ./cmd/benchgate -old old.txt -new new.txt ``` @@ -96,13 +109,16 @@ Cross-backend query benchmarks live separately in `internal/backendbench` ## Adding a benchmark to the gate +Every benchmark in a gated package is gated — there is no per-name allowlist to +maintain. A benchmark added by a PR has no baseline, so its first run is +reported without gating; it gates automatically once merged. + 1. Write the benchmark next to the code it guards (`*_bench_test.go`, `b.ReportAllocs()`, self-assert the invariant it protects where possible). -1. Add its name to `BENCH_PATTERN` in `.github/workflows/bench.yml` **and** - `BENCH_GATE_PATTERN` in the Makefile. -1. Make sure its package is in `BENCH_PACKAGES` (bench.yml) and the Makefile - `bench-gate` package list — a name in the pattern whose package is not in - the list is silently never run, so it looks gated while measuring nothing. +1. If its package is not already gated, add it to `BENCH_PACKAGES` in + `.github/workflows/bench.yml` **and** the Makefile `bench-gate` package + list — a benchmark outside the gated packages silently never runs, so it + looks gated while measuring nothing. 1. Keep per-op cost roughly in the 100µs–100ms band: below the benchgate floors nothing is gated, and far above it the job gets slow. 1. If the benchmark's fixture grows across iterations, say so in its comment; From 31091789499663daa4669f7c5cfa11751ab064aa Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 3 Jul 2026 16:11:41 -0500 Subject: [PATCH 6/7] fix(bench): make the gate loud on dropped data and the Makefile its single source Review fixes for the bench-gate PR: - Silence the sync engine's global logger inside the sync benchmarks: interleaved log.Printf output corrupted the 'BenchmarkX' result lines, so benchfmt parsed zero sync results and all three sync benchmarks silently dropped out of the gate on both sides. - benchgate now treats unparseable result lines as a corrupted capture (report + exit 2) instead of skipping benchfmt syntax errors, and reports a gated unit missing from one side ('missing from baseline/candidate, not gated') instead of a bare continue. Custom ReportMetric units are reported as ungated. - Split the sample-count policy: too few candidate samples stays a config error, but a short baseline (a legitimately partial base run) is reported and not gated. - Print detected regressions even when a config issue forces exit 2, so a real violation is never hidden behind a configuration error. - bench.yml now runs 'make bench-gate' on both sides; the Makefile's BENCH_GATE_PACKAGES/COUNT/TIME are the single source of truth, and each side benchmarks its own commit's package list, so growing the gate cannot empty the baseline. A failing base run degrades to its partial output plus a workflow warning instead of truncating the whole baseline into a vacuous pass. Run steps use set -euo pipefail and the workflow gains a Go-paths filter. - Absorb the leading-edge O(history) signal recompute before the timer in BenchmarkSyncPathsIncrementalAppend (it was ~58% of the measured allocs, masking steady-state regressions), stretch the debounce window so the flush timer cannot fire nondeterministically inside a worst-run-gated loop, and pre-build appended lines. - Build BenchmarkInsertMessagesBatch's 200-message fixture once outside the timed loop; only the SessionID is rewritten per iteration. - Widen testDB to testing.TB, drop the bench-only openBenchDB copy, and fix BenchmarkGetDailyUsage's detached testing.T hack (its require failures and cleanups were wired to nothing). - Split benchgate's compare()/main() into evalGate/compareUnits/ render/parseFlags to meet function-size and complexity limits, and drop the dead NaN guard. - Document the new mechanics in docs/internal/performance-gates.md and reformat it with mdformat --wrap 80. --- .github/workflows/bench.yml | 79 +++-- Makefile | 15 +- cmd/benchgate/main.go | 450 +++++++++++++++++++---------- cmd/benchgate/main_test.go | 155 +++++++++- docs/internal/performance-gates.md | 41 ++- internal/db/db_test.go | 10 +- internal/db/messages_bench_test.go | 32 +- internal/db/usage_test.go | 2 +- internal/sync/engine_bench_test.go | 83 +++++- 9 files changed, 604 insertions(+), 263 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 6c7a669db..e9db6da3d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -11,19 +11,33 @@ name: Bench Gate # - bulk ingest throughput (#411) # - per-row query-shape regressions in usage aggregation (#309) # -# cmd/benchgate gates on allocs/op and B/op (deterministic on a given -# machine, tight thresholds) and on ns/op with a loose 2x threshold -# that only catches algorithmic blowups; both sides run on the same -# runner within one job, so the comparison is apples to apples. +# Both sides run `make bench-gate` — the Makefile is the single +# source of truth for the gated package list, sample count, and +# iteration count — and cmd/benchgate compares the outputs. It gates +# on allocs/op and B/op (deterministic on a given machine, tight +# thresholds) and on ns/op with a loose 2x threshold that only +# catches algorithmic blowups; both sides run on the same runner +# within one job, so the comparison is apples to apples. +# # Benchmarks that only exist on one side are reported but never fail -# the gate — which is what makes running every benchmark in the -# gated packages safe: a benchmark added by a PR has no baseline and -# is reported without gating, then gates automatically once merged. -# BENCH_PACKAGES is the gate boundary; keep it in sync with the -# Makefile's bench-gate target. +# the gate: a benchmark added by a PR has no baseline and is reported +# without gating, then gates automatically once merged. Because each +# side benchmarks its own Makefile's package list, a PR that adds a +# package to the gate cannot break the base run. A partially failing +# base run degrades to a partial baseline (whatever benchmarks it +# produced still gate) rather than silently disabling the whole gate. on: pull_request: + # Docs/frontend-only PRs cannot change the gated Go paths. If + # this check is ever made required on branch protection, pair it + # with a no-op sibling workflow on the inverse paths. + paths: + - "**.go" + - "go.mod" + - "go.sum" + - "Makefile" + - ".github/workflows/bench.yml" concurrency: group: bench-${{ github.head_ref || github.ref }} @@ -32,19 +46,6 @@ concurrency: permissions: contents: read -env: - BENCH_PACKAGES: ./internal/sync ./internal/db ./internal/secrets - # Must stay >= 5: benchgate's time gate needs at least 5 samples - # per side for the significance test and treats fewer as a - # configuration error. - BENCH_COUNT: "6" - # Fixed iterations (not a duration) so the baseline and candidate - # run identical workloads: two of the gated benchmarks grow their - # fixture as they iterate (appending to a session, adding sessions - # to a DB), and a duration-based -benchtime would let a slower side - # run fewer iterations against a different final size. - BENCH_TIME: 20x - jobs: bench-gate: name: Benchmark Gate @@ -59,38 +60,26 @@ jobs: with: go-version-file: go.mod - - name: Restore pricing snapshot - run: go run ./internal/pricing/cmd/litellm-snapshot -restore - - name: Run benchmarks (PR head) - env: - CGO_ENABLED: "1" run: | - set -o pipefail - go test -tags "fts5" -run '^$' -bench . \ - -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ - -timeout 25m $BENCH_PACKAGES | tee /tmp/bench-new.txt + set -euo pipefail + make -s bench-gate | tee /tmp/bench-new.txt - name: Run benchmarks (merge base) env: - CGO_ENABLED: "1" BASE_REF: ${{ github.base_ref }} - # A failing merge-base run (or one with none of the gated - # benchmarks yet) degrades to an empty baseline: benchgate - # then reports every benchmark as new and passes, so a broken - # base never blocks a PR. The baseline file is truncated on - # failure so partial output from packages that ran before the - # failure cannot gate against incomplete data. + # A failing merge-base run keeps whatever benchmark output it + # produced: go test emits results per package, so one broken + # package (or a base predating the bench-gate target) leaves + # a partial or empty baseline and benchgate gates only what + # exists on both sides. The warning makes the degraded run + # visible instead of a silently green vacuous pass. run: | + set -euo pipefail base=$(git merge-base HEAD "origin/$BASE_REF") git worktree add /tmp/bench-base "$base" - cd /tmp/bench-base - go run ./internal/pricing/cmd/litellm-snapshot -restore || true - if ! go test -tags "fts5" -run '^$' -bench . \ - -benchmem -count "$BENCH_COUNT" -benchtime "$BENCH_TIME" \ - -timeout 25m $BENCH_PACKAGES > /tmp/bench-old.txt; then - echo "merge-base benchmarks failed; comparing against empty baseline" - : > /tmp/bench-old.txt + if ! make -s -C /tmp/bench-base bench-gate > /tmp/bench-old.txt; then + echo "::warning title=Bench Gate::merge-base benchmark run exited non-zero; gating against its partial output" fi - name: Compare against merge base diff --git a/Makefile b/Makefile index 0e2cecd11..8d5d52b96 100644 --- a/Makefile +++ b/Makefile @@ -267,14 +267,15 @@ bench-backends: pricing-snapshot ensure-embed-dir CGO_ENABLED=1 go test -tags "fts5,benchdb" ./internal/backendbench $(BENCH_BACKENDS_FLAGS) # Hot-path benchmark gate. Runs every benchmark in the gated packages -# — the same set CI's bench.yml compares against a PR's merge base # (sync engine warm/cold/append, message write paths, usage -# aggregation, secret scanning). Run it before and after touching a -# sync or DB hot path, then compare with -# `go run ./cmd/benchgate -old old.txt -new new.txt`. Keep the -# package list in sync with BENCH_PACKAGES in bench.yml. +# aggregation, secret scanning). This target is the single source of +# truth for the gate configuration: CI's bench.yml runs it on both +# the PR head and the merge base, then compares the outputs with +# `go run ./cmd/benchgate -old old.txt -new new.txt`. Run it before +# and after touching a sync or DB hot path. +BENCH_GATE_PACKAGES ?= ./internal/sync ./internal/db ./internal/secrets # Count must stay >= 5: benchgate's time gate needs at least 5 -# samples per side for its significance test. +# candidate samples for its significance test. BENCH_GATE_COUNT ?= 6 # Fixed iterations, not a duration: some gated benchmarks grow their # fixture as they iterate, so baseline and candidate must run the @@ -284,7 +285,7 @@ bench-gate: pricing-snapshot ensure-embed-dir CGO_ENABLED=1 go test -tags "fts5" -run '^$$' \ -bench . -benchmem \ -count $(BENCH_GATE_COUNT) -benchtime $(BENCH_GATE_TIME) \ - -timeout 25m ./internal/sync ./internal/db ./internal/secrets + -timeout 25m $(BENCH_GATE_PACKAGES) # Start test PostgreSQL container postgres-up: diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go index bd09523cb..e81a250d8 100644 --- a/cmd/benchgate/main.go +++ b/cmd/benchgate/main.go @@ -27,13 +27,20 @@ // Gating is per benchmark: any one benchmark over its threshold // fails the gate; there is no cross-benchmark averaging. Benchmarks // present on only one side are reported but never fail the gate, so -// adding or removing benchmarks in a PR does not wedge it. +// adding or removing benchmarks in a PR does not wedge it. A gated +// unit missing from only one side (e.g. a capture taken without +// -benchmem) is reported as not gated rather than skipped silently. +// +// Lines that look like benchmark results but fail to parse (for +// example test log output interleaved into a result line) are a +// corrupted capture: they are reported and the gate exits 2, because +// the corrupted benchmark would otherwise silently vanish from both +// sides and never gate again. package main import ( "flag" "fmt" - "math" "os" "sort" "strings" @@ -43,6 +50,10 @@ import ( "golang.org/x/perf/benchunit" ) +// minTimeSamples is the per-side sample count the sec/op +// significance test needs before its verdict means anything. +const minTimeSamples = 5 + // benchSamples collects every measured value per benchmark and unit: // benchmark key -> tidied unit (sec/op, B/op, allocs/op, ...) -> // samples across -count runs. The key includes the package path when @@ -76,6 +87,9 @@ type violation struct { maxRatio float64 } +// configIssue describes a capture that cannot support the gate it +// was given (e.g. too few candidate samples for significance +// testing) — a CI configuration error, not a regression. type configIssue struct { name string msg string @@ -92,14 +106,21 @@ func (v violation) String() string { } // parseBench extracts benchmark samples from `go test -bench` output -// using the official format parser. Non-result records (unit -// metadata, syntax errors from stray output) are skipped. Values -// arrive tidied by benchfmt: ns/op becomes sec/op, MB/s becomes B/s. -func parseBench(reader *benchfmt.Reader) (benchSamples, error) { +// using the official format parser. Values arrive tidied by +// benchfmt: ns/op becomes sec/op, MB/s becomes B/s. Lines that look +// like results but fail to parse are returned as syntax errors so a +// corrupted capture is loud instead of silently missing benchmarks. +func parseBench( + reader *benchfmt.Reader, +) (benchSamples, []string, error) { out := make(benchSamples) + var syntaxErrs []string for reader.Scan() { res, ok := reader.Result().(*benchfmt.Result) if !ok { + if serr, isSyntax := reader.Result().(*benchfmt.SyntaxError); isSyntax { + syntaxErrs = append(syntaxErrs, serr.Error()) + } continue } name := string(res.Name.Full()) @@ -116,17 +137,113 @@ func parseBench(reader *benchfmt.Reader) (benchSamples, error) { } } if err := reader.Err(); err != nil { - return nil, err + return nil, nil, err + } + return out, syntaxErrs, nil +} + +// evalGate applies one gate to one benchmark's samples and returns +// the report fragment plus an optional violation or config issue +// (their name fields are filled in by the caller). +func evalGate( + g gate, oldVals, newVals []float64, +) (string, *violation, *configIssue) { + thresholds := benchmath.DefaultThresholds + oldSample := benchmath.NewSample(oldVals, &thresholds) + newSample := benchmath.NewSample(newVals, &thresholds) + oldCenter := benchmath.AssumeNothing. + Summary(oldSample, 0.95).Center + var newCenter float64 + if g.worstCase { + // Samples are sorted ascending; the worst candidate run + // is the last one. + newCenter = newSample.Values[len(newSample.Values)-1] + } else { + newCenter = benchmath.AssumeNothing. + Summary(newSample, 0.95).Center + } + cls := benchunit.ClassOf(g.unit) + span := fmt.Sprintf( + "%s %s -> %s", g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + ) + + if oldCenter <= 0 || oldCenter < g.floor { + return fmt.Sprintf( + "%s (below %s floor, not gated)", + span, benchunit.Scale(g.floor, cls), + ), nil, nil + } + if g.needSignificance && len(newVals) < minTimeSamples { + issue := &configIssue{msg: fmt.Sprintf( + "%s needs at least %d candidate samples for significance gating, got %d", + g.unit, minTimeSamples, len(newVals), + )} + return span + " (too few candidate samples, not gated)", + nil, issue + } + if g.needSignificance && len(oldVals) < minTimeSamples { + // A short baseline is not a configuration error: the base + // run may legitimately be partial (e.g. it failed part-way + // and the workflow gates against what it produced). + return fmt.Sprintf( + "%s (baseline has only %d sample(s), significance needs %d, not gated)", + span, len(oldVals), minTimeSamples, + ), nil, nil + } + + ratio := newCenter / oldCenter + detail, significant := gateDetail( + g, oldSample, newSample, span, ratio, + ) + var v *violation + if ratio > g.maxRatio && (!g.needSignificance || significant) { + v = &violation{ + unit: g.unit, + old: oldCenter, new: newCenter, + ratio: ratio, maxRatio: g.maxRatio, + } + } + return detail, v, nil +} + +// gateDetail renders the gated report fragment and, for +// significance-gated units, runs the benchmath comparison. +func gateDetail( + g gate, oldSample, newSample *benchmath.Sample, + span string, ratio float64, +) (string, bool) { + if g.worstCase { + // Also surface the baseline's worst run: the gate is + // deliberately candidate-worst vs baseline-median, so + // pre-existing baseline instability should at least be + // visible when reading a failure. + cls := benchunit.ClassOf(g.unit) + oldWorst := oldSample.Values[len(oldSample.Values)-1] + return fmt.Sprintf( + "%s (%.2fx, limit %.2fx, worst of %d run(s), baseline worst %s)", + span, ratio, g.maxRatio, len(newSample.Values), + benchunit.Scale(oldWorst, cls), + ), false + } + cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) + significant := cmp.P < cmp.Alpha + detail := fmt.Sprintf( + "%s (%.2fx, limit %.2fx, %s)", span, ratio, g.maxRatio, cmp, + ) + if g.needSignificance && !significant { + detail += " [not significant, not gated]" } - return out, nil + return detail, significant } // compare applies the gates to every benchmark present in both maps -// and returns a human-readable report plus the violations. +// and returns a human-readable report plus the violations and +// config issues. func compare( oldRes, newRes benchSamples, gates []gate, ) (report []string, violations []violation, issues []configIssue) { - thresholds := benchmath.DefaultThresholds names := make([]string, 0, len(newRes)) for name := range newRes { names = append(names, name) @@ -141,95 +258,15 @@ func compare( )) continue } - var parts []string - for _, g := range gates { - oldVals, okOld := oldUnits[g.unit] - newVals, okNew := newRes[name][g.unit] - if !okOld || !okNew { - continue - } - oldSample := benchmath.NewSample(oldVals, &thresholds) - newSample := benchmath.NewSample(newVals, &thresholds) - oldCenter := benchmath.AssumeNothing. - Summary(oldSample, 0.95).Center - newCenter := benchmath.AssumeNothing. - Summary(newSample, 0.95).Center - if g.worstCase { - // Samples are sorted ascending; the worst - // candidate run is the last one. - newCenter = newSample.Values[len(newSample.Values)-1] - } - cls := benchunit.ClassOf(g.unit) - - if oldCenter <= 0 || oldCenter < g.floor { - parts = append(parts, fmt.Sprintf( - "%s %s -> %s (below %s floor, not gated)", - g.unit, - benchunit.Scale(oldCenter, cls), - benchunit.Scale(newCenter, cls), - benchunit.Scale(g.floor, cls), - )) - continue - } - if g.needSignificance && (len(oldSample.Values) < 5 || len(newSample.Values) < 5) { - issues = append(issues, configIssue{ - name: name, - msg: fmt.Sprintf( - "%s needs at least 5 samples on both sides for significance gating (old=%d, new=%d)", - g.unit, len(oldSample.Values), len(newSample.Values), - ), - }) - parts = append(parts, fmt.Sprintf( - "%s %s -> %s (sample count too small for significance gating, not gated)", - g.unit, - benchunit.Scale(oldCenter, cls), - benchunit.Scale(newCenter, cls), - )) - continue - } - - ratio := newCenter / oldCenter - cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) - significant := cmp.P < cmp.Alpha - var detail string - if g.worstCase { - // Also surface the baseline's worst run: the gate is - // deliberately candidate-worst vs baseline-median, so - // pre-existing baseline instability should at least - // be visible when reading a failure. - oldWorst := oldSample.Values[len(oldSample.Values)-1] - detail = fmt.Sprintf( - "%s %s -> %s (%.2fx, limit %.2fx, worst of %d run(s), baseline worst %s)", - g.unit, - benchunit.Scale(oldCenter, cls), - benchunit.Scale(newCenter, cls), - ratio, g.maxRatio, len(newSample.Values), - benchunit.Scale(oldWorst, cls), - ) - } else { - detail = fmt.Sprintf( - "%s %s -> %s (%.2fx, limit %.2fx, %s)", - g.unit, - benchunit.Scale(oldCenter, cls), - benchunit.Scale(newCenter, cls), - ratio, g.maxRatio, cmp, - ) - if g.needSignificance && !significant { - detail += " [not significant, not gated]" - } - } - parts = append(parts, detail) - - if ratio > g.maxRatio && - (!g.needSignificance || significant) && - !math.IsNaN(ratio) { - violations = append(violations, violation{ - name: name, unit: g.unit, - old: oldCenter, new: newCenter, - ratio: ratio, maxRatio: g.maxRatio, - }) - } + parts, vs, is := compareUnits(gates, oldUnits, newRes[name]) + for i := range vs { + vs[i].name = name } + for i := range is { + is[i].name = name + } + violations = append(violations, vs...) + issues = append(issues, is...) report = append(report, fmt.Sprintf( "%s: %s", name, strings.Join(parts, ", "), )) @@ -251,16 +288,138 @@ func compare( return report, violations, issues } -func parseFile(path string) (benchSamples, error) { +// compareUnits evaluates every gate for one benchmark, plus a note +// for candidate units no gate covers (custom b.ReportMetric units +// are collected but deliberately never gated). +func compareUnits( + gates []gate, oldUnits, newUnits map[string][]float64, +) (parts []string, vs []violation, is []configIssue) { + gated := make(map[string]bool, len(gates)) + for _, g := range gates { + gated[g.unit] = true + oldVals, okOld := oldUnits[g.unit] + newVals, okNew := newUnits[g.unit] + switch { + case !okOld && !okNew: + // The benchmark doesn't emit this unit at all. + continue + case !okOld: + parts = append(parts, fmt.Sprintf( + "%s missing from baseline, not gated", g.unit, + )) + continue + case !okNew: + parts = append(parts, fmt.Sprintf( + "%s missing from candidate, not gated", g.unit, + )) + continue + } + part, v, issue := evalGate(g, oldVals, newVals) + parts = append(parts, part) + if v != nil { + vs = append(vs, *v) + } + if issue != nil { + is = append(is, *issue) + } + } + var custom []string + for unit := range newUnits { + if !gated[unit] { + custom = append(custom, unit) + } + } + sort.Strings(custom) + for _, unit := range custom { + parts = append(parts, fmt.Sprintf( + "%s has no gate, not gated", unit, + )) + } + return parts, vs, is +} + +func parseFile(path string) (benchSamples, []string, error) { f, err := os.Open(path) if err != nil { - return nil, err + return nil, nil, err } defer f.Close() return parseBench(benchfmt.NewReader(f, path)) } -func main() { +// results bundles everything render needs to produce the final +// output and exit code. +type results struct { + report []string + violations []violation + issues []configIssue + newCount int + oldSyntax, newSyntax []string +} + +// render formats the human-readable outcome and picks the exit +// code: 2 for unusable input or configuration errors, 1 for +// regressions, 0 otherwise. Violations always print, even when a +// config issue or corrupted capture also occurred, so a detected +// regression is never hidden behind an exit-2. +func render(r results) (string, int) { + var b strings.Builder + for _, line := range r.report { + fmt.Fprintln(&b, line) + } + if len(r.violations) > 0 { + fmt.Fprintf(&b, "\nbenchgate: %d regression(s):\n", + len(r.violations)) + for _, v := range r.violations { + fmt.Fprintf(&b, " %s\n", v) + } + } + renderSyntax(&b, "baseline", r.oldSyntax) + renderSyntax(&b, "candidate", r.newSyntax) + if len(r.issues) > 0 { + fmt.Fprintln(&b, "\nbenchgate: invalid benchmark configuration:") + for _, issue := range r.issues { + fmt.Fprintf(&b, " %s: %s\n", issue.name, issue.msg) + } + } + switch { + case len(r.oldSyntax)+len(r.newSyntax) > 0 || len(r.issues) > 0: + return b.String(), 2 + case r.newCount == 0: + fmt.Fprintln(&b, "benchgate: candidate output contains no benchmarks") + return b.String(), 2 + case len(r.violations) > 0: + return b.String(), 1 + } + fmt.Fprintln(&b, "benchgate: no regressions beyond thresholds") + return b.String(), 0 +} + +// renderSyntax reports unparseable result lines in one capture. A +// benchmark whose result line is corrupted (e.g. by interleaved log +// output) parses on neither side and would otherwise vanish from +// the gate without a trace. +func renderSyntax(b *strings.Builder, side string, errs []string) { + if len(errs) == 0 { + return + } + fmt.Fprintf( + b, + "\nbenchgate: %s capture is corrupted (%d unparseable result line(s); benchmarks on those lines are not gated):\n", + side, len(errs), + ) + for _, e := range errs { + fmt.Fprintf(b, " %s\n", e) + } +} + +// flags holds the parsed command line. +type flags struct { + oldPath, newPath string + gates []gate +} + +func parseFlags() flags { oldPath := flag.String( "old", "", "baseline `go test -bench` output file", ) @@ -271,7 +430,7 @@ func main() { "max-time-ratio", 2.0, "fail when candidate median sec/op exceeds baseline by this factor "+ "(only when the difference is statistically significant; needs at "+ - "least 5 samples per side)", + "least 5 candidate samples)", ) maxAllocRatio := flag.Float64( "max-alloc-ratio", 1.25, @@ -300,61 +459,54 @@ func main() { flag.Usage() os.Exit(2) } + return flags{ + oldPath: *oldPath, + newPath: *newPath, + gates: []gate{ + { + unit: "allocs/op", + maxRatio: *maxAllocRatio, + floor: *allocFloor, + worstCase: true, + }, + { + unit: "B/op", + maxRatio: *maxBytesRatio, + floor: *bytesFloor, + worstCase: true, + }, + { + unit: "sec/op", + maxRatio: *maxTimeRatio, + floor: *timeFloorNs / 1e9, + needSignificance: true, + }, + }, + } +} - oldRes, err := parseFile(*oldPath) +func main() { + cfg := parseFlags() + oldRes, oldSyntax, err := parseFile(cfg.oldPath) if err != nil { fmt.Fprintf(os.Stderr, "benchgate: reading baseline: %v\n", err) os.Exit(2) } - newRes, err := parseFile(*newPath) + newRes, newSyntax, err := parseFile(cfg.newPath) if err != nil { fmt.Fprintf(os.Stderr, "benchgate: reading candidate: %v\n", err) os.Exit(2) } - gates := []gate{ - { - unit: "allocs/op", - maxRatio: *maxAllocRatio, - floor: *allocFloor, - worstCase: true, - }, - { - unit: "B/op", - maxRatio: *maxBytesRatio, - floor: *bytesFloor, - worstCase: true, - }, - { - unit: "sec/op", - maxRatio: *maxTimeRatio, - floor: *timeFloorNs / 1e9, - needSignificance: true, - }, - } - report, violations, issues := compare(oldRes, newRes, gates) - for _, line := range report { - fmt.Println(line) - } - if len(issues) > 0 { - fmt.Println() - fmt.Println("benchgate: invalid benchmark configuration:") - for _, issue := range issues { - fmt.Printf(" %s: %s\n", issue.name, issue.msg) - } - os.Exit(2) - } - if len(newRes) == 0 { - fmt.Println("benchgate: candidate output contains no benchmarks") - os.Exit(2) - } - if len(violations) > 0 { - fmt.Println() - fmt.Printf("benchgate: %d regression(s):\n", len(violations)) - for _, v := range violations { - fmt.Printf(" %s\n", v) - } - os.Exit(1) - } - fmt.Println("benchgate: no regressions beyond thresholds") + report, violations, issues := compare(oldRes, newRes, cfg.gates) + out, code := render(results{ + report: report, + violations: violations, + issues: issues, + newCount: len(newRes), + oldSyntax: oldSyntax, + newSyntax: newSyntax, + }) + fmt.Print(out) + os.Exit(code) } diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go index aaf55c567..52ff4226d 100644 --- a/cmd/benchgate/main_test.go +++ b/cmd/benchgate/main_test.go @@ -10,20 +10,21 @@ import ( "golang.org/x/perf/benchfmt" ) -func parseString(t *testing.T, input string) benchSamples { +func parseString(t *testing.T, input string) (benchSamples, []string) { t.Helper() - got, err := parseBench( + got, syntaxErrs, err := parseBench( benchfmt.NewReader(strings.NewReader(input), "test"), ) require.NoError(t, err) - return got + return got, syntaxErrs } func TestParseBench(t *testing.T) { tests := []struct { - name string - input string - want benchSamples + name string + input string + want benchSamples + wantSyntax int }{ { name: "full benchmem line with tidied units", @@ -65,10 +66,16 @@ func TestParseBench(t *testing.T) { { name: "log lines and headers are ignored", input: "2026/07/03 10:20:36 discovered 40 files in 0s\n" + - "BenchmarkSync says hello\n" + "cpu: Apple M5 Max\n", want: benchSamples{}, }, + { + name: "result line corrupted by interleaved log output is a syntax error", + input: "BenchmarkSyncAllWarmNoop-18 \t2026/07/03 15:39:07 discovered 40 files in 0s\n" + + " 5\t 3581975 ns/op\t 212433 B/op\t 2760 allocs/op\n", + want: benchSamples{}, + wantSyntax: 1, + }, { name: "custom ReportMetric units are kept", input: "BenchmarkBaz-2 10 900 ns/op 3 things/op\n", @@ -82,7 +89,8 @@ func TestParseBench(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := parseString(t, tt.input) + got, syntaxErrs := parseString(t, tt.input) + assert.Len(t, syntaxErrs, tt.wantSyntax) require.Len(t, got, len(tt.want)) for name, wantUnits := range tt.want { gotUnits, ok := got[name] @@ -214,6 +222,37 @@ func TestCompare(t *testing.T) { wantUnits: nil, wantReport: []string{"missing from candidate"}, }, + { + name: "gated unit missing from one side is reported, not gated", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "allocs/op": {90000}, + }, + }, + wantUnits: nil, + wantReport: []string{"allocs/op missing from baseline, not gated"}, + }, + { + name: "custom ReportMetric unit is reported as ungated", + old: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "things/op": {3}, + }, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "things/op": {900}, + }, + }, + wantUnits: nil, + wantReport: []string{"things/op has no gate, not gated"}, + }, { name: "multiple regressions are all reported", old: benchSamples{ @@ -290,13 +329,99 @@ func TestCompareOutlierRunPolicy(t *testing.T) { }) } -func TestCompareRequiresEnoughSamplesForTimeGate(t *testing.T) { - old := benchSamples{"BenchmarkFoo-8": {"sec/op": {1e-3}}} - next := benchSamples{"BenchmarkFoo-8": {"sec/op": {2e-3}}} - _, violations, issues := compare(old, next, testGates()) - assert.Empty(t, violations) - require.Len(t, issues, 1) - assert.Contains(t, issues[0].msg, "at least 5 samples") +// TestCompareTimeGateSampleCounts pins the asymmetric sample-count +// policy: too few candidate samples is a configuration error (the +// candidate run is under the workflow's control), while a short +// baseline is merely reported and not gated (the base run may +// legitimately be partial). +func TestCompareTimeGateSampleCounts(t *testing.T) { + t.Run("too few candidate samples is a config issue", func(t *testing.T) { + old := benchSamples{"BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}} + next := benchSamples{"BenchmarkFoo-8": {"sec/op": {2e-3}}} + _, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + require.Len(t, issues, 1) + assert.Contains(t, issues[0].msg, "at least 5 candidate samples") + }) + + t.Run("short baseline is reported, not a config issue", func(t *testing.T) { + old := benchSamples{"BenchmarkFoo-8": {"sec/op": {1e-3, 1e-3}}} + next := benchSamples{"BenchmarkFoo-8": {"sec/op": noisy(9e-3, 6)}} + report, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + assert.Empty(t, issues) + assert.Contains(t, strings.Join(report, "\n"), + "baseline has only 2 sample(s)") + }) +} + +func TestRender(t *testing.T) { + sampleViolation := violation{ + name: "BenchmarkFoo-8", unit: "allocs/op", + old: 1000, new: 2000, ratio: 2.0, maxRatio: 1.25, + } + tests := []struct { + name string + r results + wantCode int + wantOut []string + }{ + { + name: "clean run passes", + r: results{report: []string{"BenchmarkFoo-8: ok"}, newCount: 1}, + wantCode: 0, + wantOut: []string{"no regressions beyond thresholds"}, + }, + { + name: "violations exit 1", + r: results{ + violations: []violation{sampleViolation}, + newCount: 1, + }, + wantCode: 1, + wantOut: []string{"1 regression(s)", "allocs/op regressed 2.00x"}, + }, + { + name: "violations still print when a config issue forces exit 2", + r: results{ + violations: []violation{sampleViolation}, + issues: []configIssue{{name: "BenchmarkFoo-8", msg: "too few"}}, + newCount: 1, + }, + wantCode: 2, + wantOut: []string{ + "allocs/op regressed 2.00x", + "invalid benchmark configuration", + }, + }, + { + name: "corrupted capture exits 2 and is described", + r: results{ + newCount: 1, + newSyntax: []string{"test:3: no iteration count"}, + }, + wantCode: 2, + wantOut: []string{ + "candidate capture is corrupted", + "no iteration count", + }, + }, + { + name: "empty candidate exits 2", + r: results{newCount: 0}, + wantCode: 2, + wantOut: []string{"candidate output contains no benchmarks"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + out, code := render(tt.r) + assert.Equal(t, tt.wantCode, code) + for _, want := range tt.wantOut { + assert.Contains(t, out, want) + } + }) + } } func TestViolationString(t *testing.T) { diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md index a5bddf009..47d14913f 100644 --- a/docs/internal/performance-gates.md +++ b/docs/internal/performance-gates.md @@ -46,8 +46,10 @@ sessions" or "the manifest is read once per root regardless of session count". ### 2. Benchmark gate (runs on every PR via `bench.yml`) -`.github/workflows/bench.yml` runs a focused benchmark set on the PR head and -its merge base on the same runner, then compares with `cmd/benchgate`: +`.github/workflows/bench.yml` runs `make bench-gate` — the single source of +truth for the gated package list, sample count, and iteration count — on the PR +head and its merge base on the same runner, then compares the outputs with +`cmd/benchgate`: - `BenchmarkSyncAllWarmNoop` — full sync over an already-synced archive (stat + skip work only; also self-asserts nothing is re-synced or bulk-rewritten). @@ -75,11 +77,21 @@ instability is what blocks the PR (failure lines include the baseline's worst run so pre-existing instability is visible). Time (`sec/op`) compares medians with a loose 2.0x limit and must additionally be a statistically significant difference before it fails, so a single slow run on a noisy runner cannot flake -a PR but algorithmic blowups still do. Time gating requires at least 5 samples -on both sides; smaller captures are reported as misconfigured rather than -silently disabling the gate. Baselines below a per-metric floor are not gated. -Benchmarks that exist on only one side are reported but never fail, so adding or -removing benchmarks cannot wedge a PR. +a PR but algorithmic blowups still do. Time gating requires at least 5 candidate +samples; fewer is reported as a configuration error (the candidate run is under +the workflow's control), while a baseline with fewer than 5 samples — a +legitimately partial base run — is reported and not gated. Baselines below a +per-metric floor are not gated. Benchmarks that exist on only one side are +reported but never fail, so adding or removing benchmarks cannot wedge a PR. +Only `allocs/op`, `B/op`, and `sec/op` are gated: custom `b.ReportMetric` units +are collected and reported as ungated, never enforced. + +Two failure modes are treated as loud configuration errors (exit 2) rather than +silent gaps: a capture whose result lines fail to parse (for example test log +output interleaved into a `Benchmark...` line — the sync benchmarks silence the +engine's logger for exactly this reason), and a gated unit missing from one side +only (for example a baseline captured without `-benchmem`) is reported as not +gated instead of skipped invisibly. The gate always runs with a fixed `-benchtime=Nx` iteration count (not a duration): two of the benchmarks grow their fixture as they iterate, so the @@ -115,12 +127,19 @@ reported without gating; it gates automatically once merged. 1. Write the benchmark next to the code it guards (`*_bench_test.go`, `b.ReportAllocs()`, self-assert the invariant it protects where possible). -1. If its package is not already gated, add it to `BENCH_PACKAGES` in - `.github/workflows/bench.yml` **and** the Makefile `bench-gate` package - list — a benchmark outside the gated packages silently never runs, so it - looks gated while measuring nothing. + If the code under test logs, silence the logger in the benchmark (see + `silenceBenchLogs` in `internal/sync/engine_bench_test.go`): interleaved + log output corrupts result lines and benchgate fails on the corruption. +1. If its package is not already gated, add it to `BENCH_GATE_PACKAGES` in the + Makefile — a benchmark outside the gated packages silently never runs, so + it looks gated while measuring nothing. CI picks the list up from the + Makefile; each side of the comparison benchmarks its own commit's list, so + growing the gate cannot break the base run. 1. Keep per-op cost roughly in the 100µs–100ms band: below the benchgate floors nothing is gated, and far above it the job gets slow. +1. Keep per-iteration setup out of the timed region (`b.ResetTimer`, pre-built + fixtures): helper allocations inside the loop are gated as if they were + product cost and dilute or distort the ratio. 1. If the benchmark's fixture grows across iterations, say so in its comment; the fixed `-benchtime=Nx` keeps both sides comparable, but readers need to know per-op cost depends on the iteration count. diff --git a/internal/db/db_test.go b/internal/db/db_test.go index acbb3b311..4ced0f394 100644 --- a/internal/db/db_test.go +++ b/internal/db/db_test.go @@ -300,13 +300,13 @@ const ( tsMidYear = "2024-06-01T10:00:00Z" ) -func testDB(t *testing.T) *DB { - t.Helper() - dir := t.TempDir() +func testDB(tb testing.TB) *DB { + tb.Helper() + dir := tb.TempDir() path := filepath.Join(dir, "test.db") d, err := openCopiedTestDB(path) - require.NoError(t, err, "opening test db") - t.Cleanup(func() { require.NoError(t, d.Close()) }) + require.NoError(tb, err, "opening test db") + tb.Cleanup(func() { require.NoError(tb, d.Close()) }) return d } diff --git a/internal/db/messages_bench_test.go b/internal/db/messages_bench_test.go index 7699c090b..73a9eebfb 100644 --- a/internal/db/messages_bench_test.go +++ b/internal/db/messages_bench_test.go @@ -3,7 +3,6 @@ package db import ( "encoding/json" "fmt" - "path/filepath" "testing" ) @@ -24,20 +23,6 @@ import ( // BenchmarkGetDailyUsage in usage_test.go covers the usage // aggregation scan (#309) and is part of the same CI gate. -func openBenchDB(b *testing.B) *DB { - b.Helper() - d, err := Open(filepath.Join(b.TempDir(), "bench.db")) - if err != nil { - b.Fatalf("open bench db: %v", err) - } - b.Cleanup(func() { - if err := d.Close(); err != nil { - b.Errorf("close bench db: %v", err) - } - }) - return d -} - // benchSessionMessages builds n alternating user/assistant messages. // Assistant messages carry a model and token_usage payload so writes // exercise the same columns real ingest does. @@ -101,7 +86,7 @@ func seedBenchSession( // unchanged stored rows being rewritten. func BenchmarkReplaceSessionMessagesStreamingMerge(b *testing.B) { const stored = 1000 - d := openBenchDB(b) + d := testDB(b) msgs := seedBenchSession(b, d, "bench-replace", stored) last := len(msgs) - 1 @@ -131,7 +116,13 @@ func BenchmarkReplaceSessionMessagesStreamingMerge(b *testing.B) { // candidate insert into identically sized databases. func BenchmarkInsertMessagesBatch(b *testing.B) { const batch = 200 - d := openBenchDB(b) + d := testDB(b) + + // Build the message fixture once: constructing 200 Message + // structs (~400 fmt.Sprintf calls) inside the timed loop would + // be gated as if it were ingest cost. Only the SessionID is + // rewritten per iteration, which allocates nothing. + msgs := benchSessionMessages("", batch) b.ReportAllocs() b.ResetTimer() @@ -145,9 +136,10 @@ func BenchmarkInsertMessagesBatch(b *testing.B) { }); err != nil { b.Fatalf("upsert session: %v", err) } - if err := d.InsertMessages( - benchSessionMessages(sid, batch), - ); err != nil { + for j := range msgs { + msgs[j].SessionID = sid + } + if err := d.InsertMessages(msgs); err != nil { b.Fatalf("insert messages: %v", err) } } diff --git a/internal/db/usage_test.go b/internal/db/usage_test.go index c0ae860e8..194d117d9 100644 --- a/internal/db/usage_test.go +++ b/internal/db/usage_test.go @@ -2910,7 +2910,7 @@ func TestExcludeModelFilter(t *testing.T) { } func BenchmarkGetDailyUsage(b *testing.B) { - d := testDB(&testing.T{}) + d := testDB(b) ctx := context.Background() if err := d.UpsertModelPricing([]ModelPricing{ diff --git a/internal/sync/engine_bench_test.go b/internal/sync/engine_bench_test.go index aca1f9ebf..297ea5517 100644 --- a/internal/sync/engine_bench_test.go +++ b/internal/sync/engine_bench_test.go @@ -3,10 +3,13 @@ package sync import ( "context" "fmt" + "io" + "log" "os" "path/filepath" "strconv" "testing" + "time" "go.kenn.io/agentsview/internal/db" "go.kenn.io/agentsview/internal/parser" @@ -39,6 +42,19 @@ const ( benchLargeSessionLines = 1000 ) +// silenceBenchLogs discards the engine's global log output for the +// duration of the benchmark. Log lines interleave with `go test +// -bench` result lines on stdout, which corrupts them so benchfmt +// cannot parse the benchmark — benchgate fails on such corruption, +// and before it did, the corrupted benchmarks silently vanished +// from the gate on both sides. +func silenceBenchLogs(b *testing.B) { + b.Helper() + prev := log.Writer() + log.SetOutput(io.Discard) + b.Cleanup(func() { log.SetOutput(prev) }) +} + func benchIntFromEnv(name string, def int) int { raw := os.Getenv(name) if raw == "" { @@ -116,6 +132,7 @@ func openBenchEngine(b *testing.B, dir string) (*Engine, *db.DB) { // benchmark exists to protect — a warm no-op pass must not reparse // or rewrite anything. func BenchmarkSyncAllWarmNoop(b *testing.B) { + silenceBenchLogs(b) sessions := benchIntFromEnv( "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, ) @@ -165,6 +182,7 @@ func BenchmarkSyncAllWarmNoop(b *testing.B) { // staying flat as the session grows is exactly the invariant this // benchmark protects. func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { + silenceBenchLogs(b) dir := b.TempDir() writeBenchClaudeArchive(b, dir, 1, benchLargeSessionLines) engine, database := openBenchEngine(b, dir) @@ -185,14 +203,45 @@ func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { } defer f.Close() - b.ReportAllocs() - b.ResetTimer() - for i := range b.N { - line := testjsonl.NewSessionBuilder().AddClaudeUser( + // The first append after a full sync triggers the leading-edge + // inline signal recompute — an O(stored history) message reload + // plus secret scan whose one-time cost would otherwise dominate + // the fixed 20-iteration measurement and mask steady-state + // per-append regressions. Absorb it before the timer starts so + // the timed loop measures the debounced steady state. + warmup := testjsonl.NewSessionBuilder().AddClaudeUser( + "2026-06-20T11:00:00Z", "warm-up append", + ).String() + if _, err := f.WriteString(warmup); err != nil { + b.Fatalf("warm-up append: %v", err) + } + engine.SyncPaths([]string{path}) + + // Stretch the debounce window so the flush timer cannot fire + // inside the timed loop on a slow run: allocs/op is gated on the + // candidate's worst -count run, and one timer-driven O(history) + // recompute landing mid-loop would flake the gate. Engine.Close + // drains the deferred recompute during cleanup. + engine.signalSched.mu.Lock() + engine.signalSched.interval = time.Hour + engine.signalSched.quiet = time.Hour + engine.signalSched.mu.Unlock() + + // Pre-build the appended lines: constructing Claude JSONL via + // testjsonl allocates (map + json.Marshal), and inside the timed + // loop that helper cost would be gated as if it were sync work. + lines := make([]string, b.N) + for i := range lines { + lines[i] = testjsonl.NewSessionBuilder().AddClaudeUser( "2026-06-20T11:00:00Z", fmt.Sprintf("streamed line %d", i), ).String() - if _, err := f.WriteString(line); err != nil { + } + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + if _, err := f.WriteString(lines[i]); err != nil { b.Fatalf("append: %v", err) } engine.SyncPaths([]string{path}) @@ -203,10 +252,11 @@ func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { if err != nil { b.Fatalf("GetAllMessages: %v", err) } - if len(msgs) < benchLargeSessionLines+b.N { + want := benchLargeSessionLines + 1 + b.N // +1 for the warm-up append + if len(msgs) < want { b.Fatalf( "appends were not absorbed: stored %d messages, want >= %d", - len(msgs), benchLargeSessionLines+b.N, + len(msgs), want, ) } } @@ -214,6 +264,7 @@ func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { // BenchmarkSyncAllColdArchive measures first-sync ingest throughput: // parse plus bulk write of a whole archive into a fresh database. func BenchmarkSyncAllColdArchive(b *testing.B) { + silenceBenchLogs(b) sessions := benchIntFromEnv( "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, ) @@ -222,15 +273,15 @@ func BenchmarkSyncAllColdArchive(b *testing.B) { ) dir := b.TempDir() writeBenchClaudeArchive(b, dir, sessions, perSession) + dbDir := b.TempDir() ctx := context.Background() b.ReportAllocs() b.ResetTimer() for i := range b.N { b.StopTimer() - database, err := db.Open( - filepath.Join(b.TempDir(), fmt.Sprintf("cold-%d.db", i)), - ) + dbPath := filepath.Join(dbDir, fmt.Sprintf("cold-%d.db", i)) + database, err := db.Open(dbPath) if err != nil { b.Fatalf("open bench db: %v", err) } @@ -255,6 +306,18 @@ func BenchmarkSyncAllColdArchive(b *testing.B) { if err := database.Close(); err != nil { b.Fatalf("close bench db: %v", err) } + // Drop this iteration's DB (and WAL/SHM sidecars) so disk + // usage stays O(1) instead of retaining b.N populated + // databases until the function returns. + stale, err := filepath.Glob(dbPath + "*") + if err != nil { + b.Fatalf("glob %s: %v", dbPath, err) + } + for _, p := range stale { + if err := os.Remove(p); err != nil { + b.Fatalf("remove %s: %v", p, err) + } + } b.StartTimer() } } From 6342b2b614217b544e54d598fd32d9a90a3cffb8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 3 Jul 2026 22:10:01 -0500 Subject: [PATCH 7/7] fix(bench): gate candidate capture gaps, pin gate config, benchmark the bulk-write path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the roborev findings on 31091789: - A gated unit present in the baseline but missing from the candidate (e.g. -benchmem dropped) is now a configuration error (exit 2) instead of a report-only line, so a PR cannot silently disable the deterministic gates for itself and every PR after it. A unit missing from the baseline stays reported-and-not-gated, since a partial or older base run is legitimate. - bench.yml evaluates 'make bench-gate-config' on the PR head and passes BENCH_GATE_COUNT/TIME into the merge-base invocation, so a PR that changes those defaults still compares identical workloads (two benchmarks grow their fixture per iteration). The package list intentionally stays per-side. - Add BenchmarkResyncBulkIngest: SyncAll uses the default per-session write path, so the cold-archive benchmark never exercised writeBatchBulk/DB.WriteSessionBatch — the #411 bulk ingest class was ungated. The new benchmark drives syncAllLocked with syncWriteBulk and self-asserts every session went through the batch pipeline; the cold-archive benchmark is documented as covering the default first-sync write path. Both share one benchColdArchive loop. - Update docs/internal/performance-gates.md for the unit-gap policy, the pinned gate config, and the new benchmark. --- .github/workflows/bench.yml | 13 ++++- Makefile | 10 +++- cmd/benchgate/main.go | 18 +++++-- cmd/benchgate/main_test.go | 25 ++++++++- docs/internal/performance-gates.md | 19 +++++-- internal/sync/engine_bench_test.go | 82 +++++++++++++++++++++++++----- 6 files changed, 144 insertions(+), 23 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index e9db6da3d..d1b11eed2 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -74,11 +74,22 @@ jobs: # a partial or empty baseline and benchgate gates only what # exists on both sides. The warning makes the degraded run # visible instead of a silently green vacuous pass. + # + # The sample count and fixed iteration count are evaluated + # from the PR head's Makefile and passed into the base run: + # two benchmarks grow their fixture per iteration, so a PR + # that changes BENCH_GATE_COUNT/TIME must not compare against + # a baseline measured with the old values. The package list + # intentionally stays per-side, so growing the gate cannot + # break the base run. run: | set -euo pipefail + eval "$(make -s bench-gate-config)" base=$(git merge-base HEAD "origin/$BASE_REF") git worktree add /tmp/bench-base "$base" - if ! make -s -C /tmp/bench-base bench-gate > /tmp/bench-old.txt; then + if ! make -s -C /tmp/bench-base bench-gate \ + BENCH_GATE_COUNT="$BENCH_GATE_COUNT" \ + BENCH_GATE_TIME="$BENCH_GATE_TIME" > /tmp/bench-old.txt; then echo "::warning title=Bench Gate::merge-base benchmark run exited non-zero; gating against its partial output" fi diff --git a/Makefile b/Makefile index 8d5d52b96..e53699989 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ AIR_BIN := $(shell if command -v air >/dev/null 2>&1; then command -v air; \ elif [ -x "$(GOPATH_FIRST)/bin/air" ]; then printf "%s" "$(GOPATH_FIRST)/bin/air"; \ fi) -.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends bench-gate test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help +.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends bench-gate bench-gate-config test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help # Ensure go:embed has at least one file (no-op if frontend is built) ensure-embed-dir: @@ -287,6 +287,14 @@ bench-gate: pricing-snapshot ensure-embed-dir -count $(BENCH_GATE_COUNT) -benchtime $(BENCH_GATE_TIME) \ -timeout 25m $(BENCH_GATE_PACKAGES) +# Prints the gate's sample/iteration configuration in shell-evalable +# form. CI evaluates this on the PR head and passes the values into +# the merge-base `make bench-gate` invocation, so both sides measure +# identical workloads even when a PR changes the defaults above (the +# package list intentionally stays per-side). +bench-gate-config: + @echo "BENCH_GATE_COUNT=$(BENCH_GATE_COUNT) BENCH_GATE_TIME=$(BENCH_GATE_TIME)" + # Start test PostgreSQL container postgres-up: docker compose -f docker-compose.test.yml up -d --wait diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go index e81a250d8..6b3003ff0 100644 --- a/cmd/benchgate/main.go +++ b/cmd/benchgate/main.go @@ -28,8 +28,10 @@ // fails the gate; there is no cross-benchmark averaging. Benchmarks // present on only one side are reported but never fail the gate, so // adding or removing benchmarks in a PR does not wedge it. A gated -// unit missing from only one side (e.g. a capture taken without -// -benchmem) is reported as not gated rather than skipped silently. +// unit missing from the baseline (a legitimately older or partial +// base run) is reported as not gated; a gated unit missing from the +// candidate (e.g. -benchmem dropped) is a configuration error, since +// it would otherwise silently disable that gate for good. // // Lines that look like benchmark results but fail to parse (for // example test log output interleaved into a result line) are a @@ -304,14 +306,24 @@ func compareUnits( // The benchmark doesn't emit this unit at all. continue case !okOld: + // A baseline may legitimately lack a unit (older or + // partial base run): report, don't gate. parts = append(parts, fmt.Sprintf( "%s missing from baseline, not gated", g.unit, )) continue case !okNew: + // The candidate capture is under the workflow's + // control; losing a gated unit the baseline has (e.g. + // -benchmem dropped) would silently disable this gate, + // so it is a configuration error. parts = append(parts, fmt.Sprintf( - "%s missing from candidate, not gated", g.unit, + "%s missing from candidate", g.unit, )) + is = append(is, configIssue{msg: fmt.Sprintf( + "%s present in baseline but missing from candidate capture (was -benchmem dropped?)", + g.unit, + )}) continue } part, v, issue := evalGate(g, oldVals, newVals) diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go index 52ff4226d..b56043ddd 100644 --- a/cmd/benchgate/main_test.go +++ b/cmd/benchgate/main_test.go @@ -223,7 +223,7 @@ func TestCompare(t *testing.T) { wantReport: []string{"missing from candidate"}, }, { - name: "gated unit missing from one side is reported, not gated", + name: "gated unit missing from the baseline is reported, not gated", old: benchSamples{ "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, }, @@ -329,6 +329,29 @@ func TestCompareOutlierRunPolicy(t *testing.T) { }) } +// TestCompareMissingCandidateUnit pins the asymmetric missing-unit +// policy: a gated unit the baseline has but the candidate lost +// (e.g. -benchmem dropped from the candidate run) is a config error +// so the gate exits 2 instead of silently disabling that metric. +func TestCompareMissingCandidateUnit(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "allocs/op": {1000}, + }, + } + next := benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + } + report, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + require.Len(t, issues, 1) + assert.Contains(t, issues[0].msg, + "allocs/op present in baseline but missing from candidate") + assert.Contains(t, strings.Join(report, "\n"), + "allocs/op missing from candidate") +} + // TestCompareTimeGateSampleCounts pins the asymmetric sample-count // policy: too few candidate samples is a configuration error (the // candidate run is under the workflow's control), while a short diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md index 47d14913f..094530484 100644 --- a/docs/internal/performance-gates.md +++ b/docs/internal/performance-gates.md @@ -55,7 +55,11 @@ head and its merge base on the same runner, then compares the outputs with skip work only; also self-asserts nothing is re-synced or bulk-rewritten). - `BenchmarkSyncPathsIncrementalAppend` — absorb one appended line into a 1,000-message session. -- `BenchmarkSyncAllColdArchive` — first-sync ingest throughput. +- `BenchmarkSyncAllColdArchive` — first-sync ingest throughput through the + default per-session write path. +- `BenchmarkResyncBulkIngest` — the same archive through the resync bulk-write + pipeline (`writeBatchBulk` / `DB.WriteSessionBatch`, the #411 regression + class); self-asserts every session took the batch path. - `BenchmarkReplaceSessionMessagesStreamingMerge` — the streaming chunk-merge diff path (one UPDATE, not a full delete+reinsert). - `BenchmarkInsertMessagesBatch` — multi-row batched ingest. @@ -89,14 +93,19 @@ are collected and reported as ungated, never enforced. Two failure modes are treated as loud configuration errors (exit 2) rather than silent gaps: a capture whose result lines fail to parse (for example test log output interleaved into a `Benchmark...` line — the sync benchmarks silence the -engine's logger for exactly this reason), and a gated unit missing from one side -only (for example a baseline captured without `-benchmem`) is reported as not -gated instead of skipped invisibly. +engine's logger for exactly this reason), and a gated unit present in the +baseline but missing from the candidate (for example a candidate captured +without `-benchmem`), which would otherwise silently disable that gate for good. +The reverse — a gated unit missing from the baseline, which may legitimately be +older or partial — is reported as not gated. The gate always runs with a fixed `-benchtime=Nx` iteration count (not a duration): two of the benchmarks grow their fixture as they iterate, so the baseline and candidate must run the same number of iterations to measure -identical workloads. +identical workloads. CI evaluates `make bench-gate-config` on the PR head and +passes the count and benchtime into the merge-base run, so a PR that changes +those defaults still compares identical workloads; do the same locally if you +override them. Report identifiers are package-qualified benchmark names (`go.kenn.io/agentsview/internal/db.InsertMessagesBatch-18`) when the captured diff --git a/internal/sync/engine_bench_test.go b/internal/sync/engine_bench_test.go index 297ea5517..9259e3ff4 100644 --- a/internal/sync/engine_bench_test.go +++ b/internal/sync/engine_bench_test.go @@ -31,7 +31,10 @@ import ( // line into a large session must scale with the appended data, // not the stored history (#954). // - BenchmarkSyncAllColdArchive: first-sync ingest throughput for -// a fresh archive (the #411 bulk-write path). +// a fresh archive through the default per-session write path. +// - BenchmarkResyncBulkIngest: the same archive through the +// resync bulk-write pipeline (writeBatchBulk / +// DB.WriteSessionBatch, the #411 regression class). // // Fixture sizes scale via AGENTSVIEW_BENCH_SYNC_SESSIONS and // AGENTSVIEW_BENCH_SYNC_MESSAGES for larger local runs. @@ -261,9 +264,15 @@ func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { } } -// BenchmarkSyncAllColdArchive measures first-sync ingest throughput: -// parse plus bulk write of a whole archive into a fresh database. -func BenchmarkSyncAllColdArchive(b *testing.B) { +// benchColdArchive is the shared cold-ingest loop: each iteration +// syncs the same archive into a fresh database via syncOnce, then +// verify checks the iteration's outcome with the timer stopped. +func benchColdArchive( + b *testing.B, + syncOnce func(*Engine) SyncStats, + verify func(*Engine, SyncStats, int), +) { + b.Helper() silenceBenchLogs(b) sessions := benchIntFromEnv( "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, @@ -274,7 +283,6 @@ func BenchmarkSyncAllColdArchive(b *testing.B) { dir := b.TempDir() writeBenchClaudeArchive(b, dir, sessions, perSession) dbDir := b.TempDir() - ctx := context.Background() b.ReportAllocs() b.ResetTimer() @@ -293,15 +301,10 @@ func BenchmarkSyncAllColdArchive(b *testing.B) { }) b.StartTimer() - stats := engine.SyncAll(ctx, nil) + stats := syncOnce(engine) b.StopTimer() - if stats.Synced != sessions { - b.Fatalf( - "cold sync stored %d of %d sessions (failed=%d)", - stats.Synced, sessions, stats.Failed, - ) - } + verify(engine, stats, sessions) engine.Close() if err := database.Close(); err != nil { b.Fatalf("close bench db: %v", err) @@ -321,3 +324,58 @@ func BenchmarkSyncAllColdArchive(b *testing.B) { b.StartTimer() } } + +// BenchmarkSyncAllColdArchive measures first-sync ingest throughput +// through the public SyncAll path: parse plus the default +// per-session writes a user's first sync performs. +func BenchmarkSyncAllColdArchive(b *testing.B) { + ctx := context.Background() + benchColdArchive(b, + func(engine *Engine) SyncStats { + return engine.SyncAll(ctx, nil) + }, + func(_ *Engine, stats SyncStats, sessions int) { + if stats.Synced != sessions { + b.Fatalf( + "cold sync stored %d of %d sessions (failed=%d)", + stats.Synced, sessions, stats.Failed, + ) + } + }, + ) +} + +// BenchmarkResyncBulkIngest measures the bulk-write ingest path a +// full resync uses: syncWriteBulk routes every parsed session +// through writeBatchBulk and DB.WriteSessionBatch — the #411 +// regression class. This is a different write path from the default +// per-session writes BenchmarkSyncAllColdArchive covers, and it +// self-asserts that every session really went through the batch +// pipeline so the benchmark cannot silently measure the wrong path. +func BenchmarkResyncBulkIngest(b *testing.B) { + ctx := context.Background() + benchColdArchive(b, + func(engine *Engine) SyncStats { + engine.syncMu.Lock() + defer engine.syncMu.Unlock() + return engine.syncAllLocked( + ctx, nil, time.Time{}, nil, syncWriteBulk, true, + ) + }, + func(engine *Engine, stats SyncStats, sessions int) { + if stats.Synced != sessions { + b.Fatalf( + "bulk ingest stored %d of %d sessions (failed=%d)", + stats.Synced, sessions, stats.Failed, + ) + } + writes := engine.PhaseStats().BatchedWrites.Load() + if writes != int64(sessions) { + b.Fatalf( + "bulk ingest wrote %d of %d sessions via the batch pipeline", + writes, sessions, + ) + } + }, + ) +}