diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 000000000..d1b11eed2 --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,97 @@ +name: Bench Gate + +# Compares hot-path benchmarks between the PR head and its merge +# base, so performance regressions in the sync engine and DB write +# paths fail the PR instead of shipping. The gated regression classes +# have all shipped before: +# +# - discovery/skip work scaling with archive size instead of new +# data (#912, the providerSourceUnchangedInDB gap) +# - O(session history) work per incremental append (#954) +# - bulk ingest throughput (#411) +# - per-row query-shape regressions in usage aggregation (#309) +# +# Both sides run `make bench-gate` — the Makefile is the single +# source of truth for the gated package list, sample count, and +# iteration count — and cmd/benchgate compares the outputs. It gates +# on allocs/op and B/op (deterministic on a given machine, tight +# thresholds) and on ns/op with a loose 2x threshold that only +# catches algorithmic blowups; both sides run on the same runner +# within one job, so the comparison is apples to apples. +# +# Benchmarks that only exist on one side are reported but never fail +# the gate: a benchmark added by a PR has no baseline and is reported +# without gating, then gates automatically once merged. Because each +# side benchmarks its own Makefile's package list, a PR that adds a +# package to the gate cannot break the base run. A partially failing +# base run degrades to a partial baseline (whatever benchmarks it +# produced still gate) rather than silently disabling the whole gate. + +on: + pull_request: + # Docs/frontend-only PRs cannot change the gated Go paths. If + # this check is ever made required on branch protection, pair it + # with a no-op sibling workflow on the inverse paths. + paths: + - "**.go" + - "go.mod" + - "go.sum" + - "Makefile" + - ".github/workflows/bench.yml" + +concurrency: + group: bench-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + bench-gate: + name: Benchmark Gate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + persist-credentials: false + fetch-depth: 0 + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Run benchmarks (PR head) + run: | + set -euo pipefail + make -s bench-gate | tee /tmp/bench-new.txt + + - name: Run benchmarks (merge base) + env: + BASE_REF: ${{ github.base_ref }} + # A failing merge-base run keeps whatever benchmark output it + # produced: go test emits results per package, so one broken + # package (or a base predating the bench-gate target) leaves + # a partial or empty baseline and benchgate gates only what + # exists on both sides. The warning makes the degraded run + # visible instead of a silently green vacuous pass. + # + # The sample count and fixed iteration count are evaluated + # from the PR head's Makefile and passed into the base run: + # two benchmarks grow their fixture per iteration, so a PR + # that changes BENCH_GATE_COUNT/TIME must not compare against + # a baseline measured with the old values. The package list + # intentionally stays per-side, so growing the gate cannot + # break the base run. + run: | + set -euo pipefail + eval "$(make -s bench-gate-config)" + base=$(git merge-base HEAD "origin/$BASE_REF") + git worktree add /tmp/bench-base "$base" + if ! make -s -C /tmp/bench-base bench-gate \ + BENCH_GATE_COUNT="$BENCH_GATE_COUNT" \ + BENCH_GATE_TIME="$BENCH_GATE_TIME" > /tmp/bench-old.txt; then + echo "::warning title=Bench Gate::merge-base benchmark run exited non-zero; gating against its partial output" + fi + + - name: Compare against merge base + run: go run ./cmd/benchgate -old /tmp/bench-old.txt -new /tmp/bench-new.txt diff --git a/Makefile b/Makefile index 2d827c6de..e53699989 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ AIR_BIN := $(shell if command -v air >/dev/null 2>&1; then command -v air; \ elif [ -x "$(GOPATH_FIRST)/bin/air" ]; then printf "%s" "$(GOPATH_FIRST)/bin/air"; \ fi) -.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help +.PHONY: build build-release install frontend frontend-dev dev check-air air-install desktop-dev desktop-build desktop-macos-app desktop-macos-dmg desktop-windows-installer desktop-linux-appimage desktop-app docs-install docs-build docs-serve docs-check docs-screenshots docs-assets-branch docs-generated-assets-branch docs-deploy-staging docs-deploy test test-short bench-backends bench-gate bench-gate-config test-postgres test-postgres-ci test-s3 postgres-up postgres-down test-ssh test-ssh-ci ssh-up ssh-down e2e e2e-duckdb vet lint lint-ci lint-golangci lint-golangci-ci nilaway nilaway-golangci-build lint-tools tidy clean release release-darwin-arm64 release-darwin-amd64 release-linux-amd64 install-hooks ensure-embed-dir pricing-snapshot dev-snapshot help # Ensure go:embed has at least one file (no-op if frontend is built) ensure-embed-dir: @@ -266,6 +266,35 @@ bench-backends: pricing-snapshot ensure-embed-dir AGENTSVIEW_BENCH_MESSAGES_PER_SESSION=$(BENCH_BACKENDS_MESSAGES_PER_SESSION) \ CGO_ENABLED=1 go test -tags "fts5,benchdb" ./internal/backendbench $(BENCH_BACKENDS_FLAGS) +# Hot-path benchmark gate. Runs every benchmark in the gated packages +# (sync engine warm/cold/append, message write paths, usage +# aggregation, secret scanning). This target is the single source of +# truth for the gate configuration: CI's bench.yml runs it on both +# the PR head and the merge base, then compares the outputs with +# `go run ./cmd/benchgate -old old.txt -new new.txt`. Run it before +# and after touching a sync or DB hot path. +BENCH_GATE_PACKAGES ?= ./internal/sync ./internal/db ./internal/secrets +# Count must stay >= 5: benchgate's time gate needs at least 5 +# candidate samples for its significance test. +BENCH_GATE_COUNT ?= 6 +# Fixed iterations, not a duration: some gated benchmarks grow their +# fixture as they iterate, so baseline and candidate must run the +# same iteration count to measure identical workloads. +BENCH_GATE_TIME ?= 20x +bench-gate: pricing-snapshot ensure-embed-dir + CGO_ENABLED=1 go test -tags "fts5" -run '^$$' \ + -bench . -benchmem \ + -count $(BENCH_GATE_COUNT) -benchtime $(BENCH_GATE_TIME) \ + -timeout 25m $(BENCH_GATE_PACKAGES) + +# Prints the gate's sample/iteration configuration in shell-evalable +# form. CI evaluates this on the PR head and passes the values into +# the merge-base `make bench-gate` invocation, so both sides measure +# identical workloads even when a PR changes the defaults above (the +# package list intentionally stays per-side). +bench-gate-config: + @echo "BENCH_GATE_COUNT=$(BENCH_GATE_COUNT) BENCH_GATE_TIME=$(BENCH_GATE_TIME)" + # Start test PostgreSQL container postgres-up: docker compose -f docker-compose.test.yml up -d --wait @@ -484,6 +513,7 @@ help: @echo " test - Run all tests" @echo " test-short - Run fast tests only" @echo " bench-backends - Benchmark SQLite, DuckDB, and PostgreSQL stores" + @echo " bench-gate - Run the hot-path benchmarks CI gates PRs on" @echo " test-postgres - Run PostgreSQL integration tests" @echo " test-s3 - Run S3 discovery integration tests (Docker)" @echo " postgres-up - Start test PostgreSQL container" diff --git a/cmd/benchgate/main.go b/cmd/benchgate/main.go new file mode 100644 index 000000000..6b3003ff0 --- /dev/null +++ b/cmd/benchgate/main.go @@ -0,0 +1,524 @@ +// Command benchgate compares two `go test -bench` outputs (baseline +// vs candidate) and exits non-zero when a benchmark regresses beyond +// configured thresholds. +// +// Parsing and statistics come from golang.org/x/perf — benchfmt for +// the benchmark format and benchmath (the engine behind benchstat) +// for summaries and significance tests. benchgate only adds the +// policy benchstat deliberately does not provide: thresholds, floors, +// and a failing exit code for CI. +// +// It is the comparison step of the bench-gate CI workflow: allocs/op +// and B/op are deterministic for the same code on the same machine, +// so they get tight ratio thresholds that catch O(archive)-instead- +// of-O(delta) work regressions regardless of sample count; time +// (sec/op) is noisy on shared runners, so it gets a loose threshold +// and additionally must be a statistically significant difference +// (Mann-Whitney U, as in benchstat) before it fails the gate. +// Baselines below a per-metric floor are skipped entirely, since a +// few extra allocations on a tiny benchmark is noise, not a +// regression. +// +// Multiple runs of the same benchmark (-count=N) are kept as a +// sample. The baseline is summarized by its median; the candidate is +// gated on its median for time but on its WORST run for allocs/op +// and B/op — those are deterministic, so a single outlier run there +// is a real intermittent allocation path, not noise, and must fail. +// Gating is per benchmark: any one benchmark over its threshold +// fails the gate; there is no cross-benchmark averaging. Benchmarks +// present on only one side are reported but never fail the gate, so +// adding or removing benchmarks in a PR does not wedge it. A gated +// unit missing from the baseline (a legitimately older or partial +// base run) is reported as not gated; a gated unit missing from the +// candidate (e.g. -benchmem dropped) is a configuration error, since +// it would otherwise silently disable that gate for good. +// +// Lines that look like benchmark results but fail to parse (for +// example test log output interleaved into a result line) are a +// corrupted capture: they are reported and the gate exits 2, because +// the corrupted benchmark would otherwise silently vanish from both +// sides and never gate again. +package main + +import ( + "flag" + "fmt" + "os" + "sort" + "strings" + + "golang.org/x/perf/benchfmt" + "golang.org/x/perf/benchmath" + "golang.org/x/perf/benchunit" +) + +// minTimeSamples is the per-side sample count the sec/op +// significance test needs before its verdict means anything. +const minTimeSamples = 5 + +// benchSamples collects every measured value per benchmark and unit: +// benchmark key -> tidied unit (sec/op, B/op, allocs/op, ...) -> +// samples across -count runs. The key includes the package path when +// the output carries one, so same-named benchmarks in different +// packages never merge. +type benchSamples map[string]map[string][]float64 + +// gate is one metric's regression rule: fail when the candidate +// exceeds the baseline median by more than maxRatio, unless the +// baseline is below floor (too small to compare meaningfully). With +// worstCase set, the candidate is judged by its worst (highest) run +// rather than its median — for deterministic metrics, where any +// outlier run is a real intermittent code path. With +// needSignificance set, the samples must also differ significantly +// under the benchmath comparison test — the benchstat noise guard, +// used for wall-clock time. +type gate struct { + unit string + maxRatio float64 + floor float64 + worstCase bool + needSignificance bool +} + +// violation describes one gate failure. +type violation struct { + name string + unit string + old, new float64 + ratio float64 + maxRatio float64 +} + +// configIssue describes a capture that cannot support the gate it +// was given (e.g. too few candidate samples for significance +// testing) — a CI configuration error, not a regression. +type configIssue struct { + name string + msg string +} + +func (v violation) String() string { + cls := benchunit.ClassOf(v.unit) + return fmt.Sprintf( + "%s: %s regressed %.2fx (%s -> %s, limit %.2fx)", + v.name, v.unit, v.ratio, + benchunit.Scale(v.old, cls), benchunit.Scale(v.new, cls), + v.maxRatio, + ) +} + +// parseBench extracts benchmark samples from `go test -bench` output +// using the official format parser. Values arrive tidied by +// benchfmt: ns/op becomes sec/op, MB/s becomes B/s. Lines that look +// like results but fail to parse are returned as syntax errors so a +// corrupted capture is loud instead of silently missing benchmarks. +func parseBench( + reader *benchfmt.Reader, +) (benchSamples, []string, error) { + out := make(benchSamples) + var syntaxErrs []string + for reader.Scan() { + res, ok := reader.Result().(*benchfmt.Result) + if !ok { + if serr, isSyntax := reader.Result().(*benchfmt.SyntaxError); isSyntax { + syntaxErrs = append(syntaxErrs, serr.Error()) + } + continue + } + name := string(res.Name.Full()) + if pkg := res.GetConfig("pkg"); pkg != "" { + name = pkg + "." + name + } + units := out[name] + if units == nil { + units = make(map[string][]float64) + out[name] = units + } + for _, v := range res.Values { + units[v.Unit] = append(units[v.Unit], v.Value) + } + } + if err := reader.Err(); err != nil { + return nil, nil, err + } + return out, syntaxErrs, nil +} + +// evalGate applies one gate to one benchmark's samples and returns +// the report fragment plus an optional violation or config issue +// (their name fields are filled in by the caller). +func evalGate( + g gate, oldVals, newVals []float64, +) (string, *violation, *configIssue) { + thresholds := benchmath.DefaultThresholds + oldSample := benchmath.NewSample(oldVals, &thresholds) + newSample := benchmath.NewSample(newVals, &thresholds) + oldCenter := benchmath.AssumeNothing. + Summary(oldSample, 0.95).Center + var newCenter float64 + if g.worstCase { + // Samples are sorted ascending; the worst candidate run + // is the last one. + newCenter = newSample.Values[len(newSample.Values)-1] + } else { + newCenter = benchmath.AssumeNothing. + Summary(newSample, 0.95).Center + } + cls := benchunit.ClassOf(g.unit) + span := fmt.Sprintf( + "%s %s -> %s", g.unit, + benchunit.Scale(oldCenter, cls), + benchunit.Scale(newCenter, cls), + ) + + if oldCenter <= 0 || oldCenter < g.floor { + return fmt.Sprintf( + "%s (below %s floor, not gated)", + span, benchunit.Scale(g.floor, cls), + ), nil, nil + } + if g.needSignificance && len(newVals) < minTimeSamples { + issue := &configIssue{msg: fmt.Sprintf( + "%s needs at least %d candidate samples for significance gating, got %d", + g.unit, minTimeSamples, len(newVals), + )} + return span + " (too few candidate samples, not gated)", + nil, issue + } + if g.needSignificance && len(oldVals) < minTimeSamples { + // A short baseline is not a configuration error: the base + // run may legitimately be partial (e.g. it failed part-way + // and the workflow gates against what it produced). + return fmt.Sprintf( + "%s (baseline has only %d sample(s), significance needs %d, not gated)", + span, len(oldVals), minTimeSamples, + ), nil, nil + } + + ratio := newCenter / oldCenter + detail, significant := gateDetail( + g, oldSample, newSample, span, ratio, + ) + var v *violation + if ratio > g.maxRatio && (!g.needSignificance || significant) { + v = &violation{ + unit: g.unit, + old: oldCenter, new: newCenter, + ratio: ratio, maxRatio: g.maxRatio, + } + } + return detail, v, nil +} + +// gateDetail renders the gated report fragment and, for +// significance-gated units, runs the benchmath comparison. +func gateDetail( + g gate, oldSample, newSample *benchmath.Sample, + span string, ratio float64, +) (string, bool) { + if g.worstCase { + // Also surface the baseline's worst run: the gate is + // deliberately candidate-worst vs baseline-median, so + // pre-existing baseline instability should at least be + // visible when reading a failure. + cls := benchunit.ClassOf(g.unit) + oldWorst := oldSample.Values[len(oldSample.Values)-1] + return fmt.Sprintf( + "%s (%.2fx, limit %.2fx, worst of %d run(s), baseline worst %s)", + span, ratio, g.maxRatio, len(newSample.Values), + benchunit.Scale(oldWorst, cls), + ), false + } + cmp := benchmath.AssumeNothing.Compare(oldSample, newSample) + significant := cmp.P < cmp.Alpha + detail := fmt.Sprintf( + "%s (%.2fx, limit %.2fx, %s)", span, ratio, g.maxRatio, cmp, + ) + if g.needSignificance && !significant { + detail += " [not significant, not gated]" + } + return detail, significant +} + +// compare applies the gates to every benchmark present in both maps +// and returns a human-readable report plus the violations and +// config issues. +func compare( + oldRes, newRes benchSamples, gates []gate, +) (report []string, violations []violation, issues []configIssue) { + names := make([]string, 0, len(newRes)) + for name := range newRes { + names = append(names, name) + } + sort.Strings(names) + + for _, name := range names { + oldUnits, ok := oldRes[name] + if !ok { + report = append(report, fmt.Sprintf( + "%s: new benchmark, no baseline to compare", name, + )) + continue + } + parts, vs, is := compareUnits(gates, oldUnits, newRes[name]) + for i := range vs { + vs[i].name = name + } + for i := range is { + is[i].name = name + } + violations = append(violations, vs...) + issues = append(issues, is...) + report = append(report, fmt.Sprintf( + "%s: %s", name, strings.Join(parts, ", "), + )) + } + + var removed []string + for name := range oldRes { + if _, ok := newRes[name]; !ok { + removed = append(removed, name) + } + } + sort.Strings(removed) + for _, name := range removed { + report = append(report, fmt.Sprintf( + "%s: present in baseline but missing from candidate", + name, + )) + } + return report, violations, issues +} + +// compareUnits evaluates every gate for one benchmark, plus a note +// for candidate units no gate covers (custom b.ReportMetric units +// are collected but deliberately never gated). +func compareUnits( + gates []gate, oldUnits, newUnits map[string][]float64, +) (parts []string, vs []violation, is []configIssue) { + gated := make(map[string]bool, len(gates)) + for _, g := range gates { + gated[g.unit] = true + oldVals, okOld := oldUnits[g.unit] + newVals, okNew := newUnits[g.unit] + switch { + case !okOld && !okNew: + // The benchmark doesn't emit this unit at all. + continue + case !okOld: + // A baseline may legitimately lack a unit (older or + // partial base run): report, don't gate. + parts = append(parts, fmt.Sprintf( + "%s missing from baseline, not gated", g.unit, + )) + continue + case !okNew: + // The candidate capture is under the workflow's + // control; losing a gated unit the baseline has (e.g. + // -benchmem dropped) would silently disable this gate, + // so it is a configuration error. + parts = append(parts, fmt.Sprintf( + "%s missing from candidate", g.unit, + )) + is = append(is, configIssue{msg: fmt.Sprintf( + "%s present in baseline but missing from candidate capture (was -benchmem dropped?)", + g.unit, + )}) + continue + } + part, v, issue := evalGate(g, oldVals, newVals) + parts = append(parts, part) + if v != nil { + vs = append(vs, *v) + } + if issue != nil { + is = append(is, *issue) + } + } + var custom []string + for unit := range newUnits { + if !gated[unit] { + custom = append(custom, unit) + } + } + sort.Strings(custom) + for _, unit := range custom { + parts = append(parts, fmt.Sprintf( + "%s has no gate, not gated", unit, + )) + } + return parts, vs, is +} + +func parseFile(path string) (benchSamples, []string, error) { + f, err := os.Open(path) + if err != nil { + return nil, nil, err + } + defer f.Close() + return parseBench(benchfmt.NewReader(f, path)) +} + +// results bundles everything render needs to produce the final +// output and exit code. +type results struct { + report []string + violations []violation + issues []configIssue + newCount int + oldSyntax, newSyntax []string +} + +// render formats the human-readable outcome and picks the exit +// code: 2 for unusable input or configuration errors, 1 for +// regressions, 0 otherwise. Violations always print, even when a +// config issue or corrupted capture also occurred, so a detected +// regression is never hidden behind an exit-2. +func render(r results) (string, int) { + var b strings.Builder + for _, line := range r.report { + fmt.Fprintln(&b, line) + } + if len(r.violations) > 0 { + fmt.Fprintf(&b, "\nbenchgate: %d regression(s):\n", + len(r.violations)) + for _, v := range r.violations { + fmt.Fprintf(&b, " %s\n", v) + } + } + renderSyntax(&b, "baseline", r.oldSyntax) + renderSyntax(&b, "candidate", r.newSyntax) + if len(r.issues) > 0 { + fmt.Fprintln(&b, "\nbenchgate: invalid benchmark configuration:") + for _, issue := range r.issues { + fmt.Fprintf(&b, " %s: %s\n", issue.name, issue.msg) + } + } + switch { + case len(r.oldSyntax)+len(r.newSyntax) > 0 || len(r.issues) > 0: + return b.String(), 2 + case r.newCount == 0: + fmt.Fprintln(&b, "benchgate: candidate output contains no benchmarks") + return b.String(), 2 + case len(r.violations) > 0: + return b.String(), 1 + } + fmt.Fprintln(&b, "benchgate: no regressions beyond thresholds") + return b.String(), 0 +} + +// renderSyntax reports unparseable result lines in one capture. A +// benchmark whose result line is corrupted (e.g. by interleaved log +// output) parses on neither side and would otherwise vanish from +// the gate without a trace. +func renderSyntax(b *strings.Builder, side string, errs []string) { + if len(errs) == 0 { + return + } + fmt.Fprintf( + b, + "\nbenchgate: %s capture is corrupted (%d unparseable result line(s); benchmarks on those lines are not gated):\n", + side, len(errs), + ) + for _, e := range errs { + fmt.Fprintf(b, " %s\n", e) + } +} + +// flags holds the parsed command line. +type flags struct { + oldPath, newPath string + gates []gate +} + +func parseFlags() flags { + oldPath := flag.String( + "old", "", "baseline `go test -bench` output file", + ) + newPath := flag.String( + "new", "", "candidate `go test -bench` output file", + ) + maxTimeRatio := flag.Float64( + "max-time-ratio", 2.0, + "fail when candidate median sec/op exceeds baseline by this factor "+ + "(only when the difference is statistically significant; needs at "+ + "least 5 candidate samples)", + ) + maxAllocRatio := flag.Float64( + "max-alloc-ratio", 1.25, + "fail when candidate worst-run allocs/op exceeds baseline median by this factor", + ) + maxBytesRatio := flag.Float64( + "max-bytes-ratio", 1.35, + "fail when candidate worst-run B/op exceeds baseline median by this factor", + ) + timeFloorNs := flag.Float64( + "time-floor-ns", 100_000, + "skip the time gate when the baseline is below this many ns", + ) + allocFloor := flag.Float64( + "alloc-floor", 64, + "skip the allocs/op gate when the baseline is below this", + ) + bytesFloor := flag.Float64( + "bytes-floor", 16_384, + "skip the B/op gate when the baseline is below this", + ) + flag.Parse() + + if *oldPath == "" || *newPath == "" { + fmt.Fprintln(os.Stderr, "benchgate: -old and -new are required") + flag.Usage() + os.Exit(2) + } + return flags{ + oldPath: *oldPath, + newPath: *newPath, + gates: []gate{ + { + unit: "allocs/op", + maxRatio: *maxAllocRatio, + floor: *allocFloor, + worstCase: true, + }, + { + unit: "B/op", + maxRatio: *maxBytesRatio, + floor: *bytesFloor, + worstCase: true, + }, + { + unit: "sec/op", + maxRatio: *maxTimeRatio, + floor: *timeFloorNs / 1e9, + needSignificance: true, + }, + }, + } +} + +func main() { + cfg := parseFlags() + oldRes, oldSyntax, err := parseFile(cfg.oldPath) + if err != nil { + fmt.Fprintf(os.Stderr, "benchgate: reading baseline: %v\n", err) + os.Exit(2) + } + newRes, newSyntax, err := parseFile(cfg.newPath) + if err != nil { + fmt.Fprintf(os.Stderr, "benchgate: reading candidate: %v\n", err) + os.Exit(2) + } + + report, violations, issues := compare(oldRes, newRes, cfg.gates) + out, code := render(results{ + report: report, + violations: violations, + issues: issues, + newCount: len(newRes), + oldSyntax: oldSyntax, + newSyntax: newSyntax, + }) + fmt.Print(out) + os.Exit(code) +} diff --git a/cmd/benchgate/main_test.go b/cmd/benchgate/main_test.go new file mode 100644 index 000000000..b56043ddd --- /dev/null +++ b/cmd/benchgate/main_test.go @@ -0,0 +1,459 @@ +package main + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "golang.org/x/perf/benchfmt" +) + +func parseString(t *testing.T, input string) (benchSamples, []string) { + t.Helper() + got, syntaxErrs, err := parseBench( + benchfmt.NewReader(strings.NewReader(input), "test"), + ) + require.NoError(t, err) + return got, syntaxErrs +} + +func TestParseBench(t *testing.T) { + tests := []struct { + name string + input string + want benchSamples + wantSyntax int + }{ + { + name: "full benchmem line with tidied units", + input: "goos: linux\n" + + "pkg: example.com/x\n" + + "BenchmarkFoo-8 \t 100\t 1234567 ns/op\t 2345 B/op\t 67 allocs/op\n" + + "PASS\nok \texample.com/x\t1.2s\n", + want: benchSamples{ + "example.com/x.Foo-8": { + "sec/op": {1234567e-9}, + "B/op": {2345}, + "allocs/op": {67}, + }, + }, + }, + { + name: "multiple counts collect all samples", + input: "BenchmarkFoo-8 100 200 ns/op 9 allocs/op\n" + + "BenchmarkFoo-8 100 150 ns/op 8 allocs/op\n" + + "BenchmarkFoo-8 100 180 ns/op 10 allocs/op\n", + want: benchSamples{ + "Foo-8": { + "sec/op": {200e-9, 150e-9, 180e-9}, + "allocs/op": {9, 8, 10}, + }, + }, + }, + { + name: "same benchmark name in different packages stays separate", + input: "pkg: example.com/a\n" + + "BenchmarkScan-4 10 100 ns/op\n" + + "pkg: example.com/b\n" + + "BenchmarkScan-4 10 900 ns/op\n", + want: benchSamples{ + "example.com/a.Scan-4": {"sec/op": {100e-9}}, + "example.com/b.Scan-4": {"sec/op": {900e-9}}, + }, + }, + { + name: "log lines and headers are ignored", + input: "2026/07/03 10:20:36 discovered 40 files in 0s\n" + + "cpu: Apple M5 Max\n", + want: benchSamples{}, + }, + { + name: "result line corrupted by interleaved log output is a syntax error", + input: "BenchmarkSyncAllWarmNoop-18 \t2026/07/03 15:39:07 discovered 40 files in 0s\n" + + " 5\t 3581975 ns/op\t 212433 B/op\t 2760 allocs/op\n", + want: benchSamples{}, + wantSyntax: 1, + }, + { + name: "custom ReportMetric units are kept", + input: "BenchmarkBaz-2 10 900 ns/op 3 things/op\n", + want: benchSamples{ + "Baz-2": { + "sec/op": {900e-9}, + "things/op": {3}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, syntaxErrs := parseString(t, tt.input) + assert.Len(t, syntaxErrs, tt.wantSyntax) + require.Len(t, got, len(tt.want)) + for name, wantUnits := range tt.want { + gotUnits, ok := got[name] + require.True(t, ok, "missing benchmark %s", name) + for unit, wantVals := range wantUnits { + assert.InDeltaSlice( + t, wantVals, gotUnits[unit], 1e-15, + "%s %s", name, unit, + ) + } + } + }) + } +} + +func testGates() []gate { + return []gate{ + {unit: "allocs/op", maxRatio: 1.25, floor: 64, worstCase: true}, + {unit: "B/op", maxRatio: 1.35, floor: 16_384, worstCase: true}, + { + unit: "sec/op", maxRatio: 2.0, floor: 100_000e-9, + needSignificance: true, + }, + } +} + +// noisy fabricates a benchmark sample of n values spread ±2% around +// center, so significance tests have a realistic distribution. +func noisy(center float64, n int) []float64 { + out := make([]float64, n) + for i := range out { + out[i] = center * (0.98 + 0.01*float64(i%5)) + } + return out +} + +func TestCompare(t *testing.T) { + tests := []struct { + name string + old, new benchSamples + wantUnits []string // units of expected violations, in order + wantReport []string // substrings that must appear in the report + }{ + { + name: "within thresholds passes", + old: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "B/op": {100_000}, + "allocs/op": {1000}, + }, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1.5e-3, 6), + "B/op": {120_000}, + "allocs/op": {1100}, + }, + }, + wantUnits: nil, + }, + { + name: "alloc regression fails even with a single run", + old: benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000}}, + }, + new: benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {2000}}, + }, + wantUnits: []string{"allocs/op"}, + }, + { + name: "significant time blowup fails", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + }, + new: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(5e-3, 6)}, + }, + wantUnits: []string{"sec/op"}, + }, + { + name: "time blowup without significance is reported, not gated", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 5)}, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": append(noisy(1e-3, 4), 5e-3), + }, + }, + wantUnits: nil, + wantReport: []string{"not significant, not gated"}, + }, + { + name: "tiny baseline below floor is not gated", + old: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": {500e-9}, + "B/op": {128}, + "allocs/op": {3}, + }, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": {5000e-9}, + "B/op": {1280}, + "allocs/op": {30}, + }, + }, + wantUnits: nil, + wantReport: []string{"not gated"}, + }, + { + name: "new benchmark without baseline is reported, not gated", + old: benchSamples{}, + new: benchSamples{ + "BenchmarkNew-8": {"allocs/op": {99999}}, + }, + wantUnits: nil, + wantReport: []string{"no baseline to compare"}, + }, + { + name: "removed benchmark is reported, not gated", + old: benchSamples{ + "BenchmarkGone-8": {"sec/op": {1e-3}}, + }, + new: benchSamples{}, + wantUnits: nil, + wantReport: []string{"missing from candidate"}, + }, + { + name: "gated unit missing from the baseline is reported, not gated", + old: benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "allocs/op": {90000}, + }, + }, + wantUnits: nil, + wantReport: []string{"allocs/op missing from baseline, not gated"}, + }, + { + name: "custom ReportMetric unit is reported as ungated", + old: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "things/op": {3}, + }, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "things/op": {900}, + }, + }, + wantUnits: nil, + wantReport: []string{"things/op has no gate, not gated"}, + }, + { + name: "multiple regressions are all reported", + old: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "B/op": {100_000}, + "allocs/op": {1000}, + }, + }, + new: benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(9e-3, 6), + "B/op": {900_000}, + "allocs/op": {9000}, + }, + }, + wantUnits: []string{"allocs/op", "B/op", "sec/op"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + report, violations, issues := compare(tt.old, tt.new, testGates()) + units := make([]string, 0, len(violations)) + for _, v := range violations { + units = append(units, v.unit) + } + assert.Empty(t, issues) + if len(tt.wantUnits) == 0 { + assert.Empty(t, violations) + } else { + assert.Equal(t, tt.wantUnits, units) + } + joined := strings.Join(report, "\n") + for _, want := range tt.wantReport { + assert.Contains(t, joined, want) + } + }) + } +} + +// TestCompareOutlierRunPolicy pins the split policy for a single +// outlier among repeated -count runs of one benchmark. allocs/op is +// deterministic for identical code and iteration counts, so one +// outlier run means a real intermittent allocation path and must +// fail even though the median is unchanged. Wall-clock time is +// summarized by its median, so one slow run on a noisy runner cannot +// fail the gate on its own. +func TestCompareOutlierRunPolicy(t *testing.T) { + t.Run("alloc outlier run fails", func(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 1000, 1000, 1000}}, + } + next := benchSamples{ + "BenchmarkFoo-8": {"allocs/op": {1000, 1000, 9000, 1000, 1000}}, + } + _, violations, issues := compare(old, next, testGates()) + require.Len(t, violations, 1) + assert.Empty(t, issues) + assert.Equal(t, "allocs/op", violations[0].unit) + assert.InDelta(t, 9000, violations[0].new, 1e-9, + "the worst run is what gets gated") + }) + + t.Run("time outlier run does not fail", func(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + } + next := benchSamples{ + "BenchmarkFoo-8": {"sec/op": append(noisy(1e-3, 5), 9e-3)}, + } + _, violations, issues := compare(old, next, testGates()) + assert.Empty(t, issues) + assert.Empty(t, violations) + }) +} + +// TestCompareMissingCandidateUnit pins the asymmetric missing-unit +// policy: a gated unit the baseline has but the candidate lost +// (e.g. -benchmem dropped from the candidate run) is a config error +// so the gate exits 2 instead of silently disabling that metric. +func TestCompareMissingCandidateUnit(t *testing.T) { + old := benchSamples{ + "BenchmarkFoo-8": { + "sec/op": noisy(1e-3, 6), + "allocs/op": {1000}, + }, + } + next := benchSamples{ + "BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}, + } + report, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + require.Len(t, issues, 1) + assert.Contains(t, issues[0].msg, + "allocs/op present in baseline but missing from candidate") + assert.Contains(t, strings.Join(report, "\n"), + "allocs/op missing from candidate") +} + +// TestCompareTimeGateSampleCounts pins the asymmetric sample-count +// policy: too few candidate samples is a configuration error (the +// candidate run is under the workflow's control), while a short +// baseline is merely reported and not gated (the base run may +// legitimately be partial). +func TestCompareTimeGateSampleCounts(t *testing.T) { + t.Run("too few candidate samples is a config issue", func(t *testing.T) { + old := benchSamples{"BenchmarkFoo-8": {"sec/op": noisy(1e-3, 6)}} + next := benchSamples{"BenchmarkFoo-8": {"sec/op": {2e-3}}} + _, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + require.Len(t, issues, 1) + assert.Contains(t, issues[0].msg, "at least 5 candidate samples") + }) + + t.Run("short baseline is reported, not a config issue", func(t *testing.T) { + old := benchSamples{"BenchmarkFoo-8": {"sec/op": {1e-3, 1e-3}}} + next := benchSamples{"BenchmarkFoo-8": {"sec/op": noisy(9e-3, 6)}} + report, violations, issues := compare(old, next, testGates()) + assert.Empty(t, violations) + assert.Empty(t, issues) + assert.Contains(t, strings.Join(report, "\n"), + "baseline has only 2 sample(s)") + }) +} + +func TestRender(t *testing.T) { + sampleViolation := violation{ + name: "BenchmarkFoo-8", unit: "allocs/op", + old: 1000, new: 2000, ratio: 2.0, maxRatio: 1.25, + } + tests := []struct { + name string + r results + wantCode int + wantOut []string + }{ + { + name: "clean run passes", + r: results{report: []string{"BenchmarkFoo-8: ok"}, newCount: 1}, + wantCode: 0, + wantOut: []string{"no regressions beyond thresholds"}, + }, + { + name: "violations exit 1", + r: results{ + violations: []violation{sampleViolation}, + newCount: 1, + }, + wantCode: 1, + wantOut: []string{"1 regression(s)", "allocs/op regressed 2.00x"}, + }, + { + name: "violations still print when a config issue forces exit 2", + r: results{ + violations: []violation{sampleViolation}, + issues: []configIssue{{name: "BenchmarkFoo-8", msg: "too few"}}, + newCount: 1, + }, + wantCode: 2, + wantOut: []string{ + "allocs/op regressed 2.00x", + "invalid benchmark configuration", + }, + }, + { + name: "corrupted capture exits 2 and is described", + r: results{ + newCount: 1, + newSyntax: []string{"test:3: no iteration count"}, + }, + wantCode: 2, + wantOut: []string{ + "candidate capture is corrupted", + "no iteration count", + }, + }, + { + name: "empty candidate exits 2", + r: results{newCount: 0}, + wantCode: 2, + wantOut: []string{"candidate output contains no benchmarks"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + out, code := render(tt.r) + assert.Equal(t, tt.wantCode, code) + for _, want := range tt.wantOut { + assert.Contains(t, out, want) + } + }) + } +} + +func TestViolationString(t *testing.T) { + v := violation{ + name: "BenchmarkFoo-8", unit: "allocs/op", + old: 1000, new: 2000, ratio: 2.0, maxRatio: 1.25, + } + got := v.String() + assert.Contains(t, got, "BenchmarkFoo-8") + assert.Contains(t, got, "allocs/op regressed 2.00x") + assert.Contains(t, got, "limit 1.25x") +} diff --git a/docs/internal/performance-gates.md b/docs/internal/performance-gates.md new file mode 100644 index 000000000..094530484 --- /dev/null +++ b/docs/internal/performance-gates.md @@ -0,0 +1,154 @@ +# Performance Gates + +agentsview has repeatedly shipped performance regressions where sync work +stopped scaling with *new data* and started scaling with *archive size*. This +document records the regression classes we have actually hit, and the gates that +now guard each one. When you touch a sync or DB hot path, know which gate covers +you; when you fix a new class of regression, add a gate here. + +## Regression history + +| Class | What happened | Fixed in | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| Discovery O(sources) root work | Gemini rebuilt its project map per session; positron/vscode-copilot re-read `workspace.json` per session. A large store spent 2m47s in discovery. | #912 | +| Unchanged sources reparsed | The provider migration dropped pre-parse DB-freshness skips; every full sync reparsed and rewrote untouched sessions. | `providerSourceUnchangedInDB` (#883 follow-up) | +| O(history) incremental appends | Every streamed line ran a full signal recompute (reload all messages, secret regex scan) and chunk merges delete+reinserted every message row. ~4,700 session updates/day each paid O(session history). | #954 | +| Bulk ingest throughput | Full resync ran per-row inserts and rebuilt FTS incrementally; 26.7k sessions took 1m17s. | #411 | +| Event storms | One SSE emit per watcher flush drove ~1/s dashboard refetch; SQLite WAL sidecar events fanned out to every session in a shared DB. | #367, #956 | +| Per-row query shape | `GetDailyUsage` ran 1.2M `json_extract` calls per scan and had no date pushdown. | #309 | + +## Two layers of gates + +### 1. Deterministic work-count invariants (run in `make test`) + +These count work units instead of measuring time, so they are immune to CI +runner noise and fail loudly: + +- `TestWarmFullSyncDoesNoBulkWriteWork` (`internal/sync/perf_invariant_test.go`) + — a second full sync over an unchanged Claude archive must skip everything + and run zero bulk-write batches (`Engine.PhaseStats`). +- `TestProviderAuthoritativeUnchangedSessionSkipsOnResync` + (`internal/sync/provider_freshness_integration_test.go`) — the generic + freshness skip for provider-authoritative agents, Vibe as representative. +- `TestWriteIncrementalDebouncesSignalRecompute` and the rest of + `internal/sync/signal_schedule_test.go` — streaming appends must debounce + the O(history) signal recompute. +- The count-based seam tests in `internal/parser` + (`discovery_workspace_manifest_test.go`, gemini/antigravity provider tests) + — root-derived project info is built once per root, not once per source. +- `internal/server/broadcaster_test.go` — SSE emits coalesce to at most one + broadcast per window. + +When you fix a performance bug, prefer adding a gate at this layer: expose or +reuse a counter (`SyncStats`, `PhaseStats`, `AnomalyStats`, a swappable +package-var seam) and assert the invariant, e.g. "second sync parses zero +sessions" or "the manifest is read once per root regardless of session count". + +### 2. Benchmark gate (runs on every PR via `bench.yml`) + +`.github/workflows/bench.yml` runs `make bench-gate` — the single source of +truth for the gated package list, sample count, and iteration count — on the PR +head and its merge base on the same runner, then compares the outputs with +`cmd/benchgate`: + +- `BenchmarkSyncAllWarmNoop` — full sync over an already-synced archive (stat + + skip work only; also self-asserts nothing is re-synced or bulk-rewritten). +- `BenchmarkSyncPathsIncrementalAppend` — absorb one appended line into a + 1,000-message session. +- `BenchmarkSyncAllColdArchive` — first-sync ingest throughput through the + default per-session write path. +- `BenchmarkResyncBulkIngest` — the same archive through the resync bulk-write + pipeline (`writeBatchBulk` / `DB.WriteSessionBatch`, the #411 regression + class); self-asserts every session took the batch path. +- `BenchmarkReplaceSessionMessagesStreamingMerge` — the streaming chunk-merge + diff path (one UPDATE, not a full delete+reinsert). +- `BenchmarkInsertMessagesBatch` — multi-row batched ingest. +- `BenchmarkGetDailyUsage` — usage aggregation over 100k message rows. +- `BenchmarkScan` / `BenchmarkScanDefinite` — secret-scan regex throughput. + +`benchgate` builds on `golang.org/x/perf`: `benchfmt` parses the output and +`benchmath` — the statistics engine behind `benchstat` — summarizes samples and +tests significance (Mann-Whitney U). benchgate adds only the policy benchstat +does not provide: thresholds, floors, and a failing exit code. Gating is per +benchmark — any single benchmark over its threshold fails the PR; nothing is +averaged across benchmarks. It gates hard on `allocs/op` (limit 1.25x) and +`B/op` (1.35x), which are deterministic for the same code and iteration count — +an O(archive)-instead-of-O(delta) regression always blows them up. Those two +compare the candidate's *worst* `-count` run against the baseline median, so +even an intermittent extra-allocation path fails. That is intentionally +asymmetric: the baseline is treated as the historical reference, and candidate +instability is what blocks the PR (failure lines include the baseline's worst +run so pre-existing instability is visible). Time (`sec/op`) compares medians +with a loose 2.0x limit and must additionally be a statistically significant +difference before it fails, so a single slow run on a noisy runner cannot flake +a PR but algorithmic blowups still do. Time gating requires at least 5 candidate +samples; fewer is reported as a configuration error (the candidate run is under +the workflow's control), while a baseline with fewer than 5 samples — a +legitimately partial base run — is reported and not gated. Baselines below a +per-metric floor are not gated. Benchmarks that exist on only one side are +reported but never fail, so adding or removing benchmarks cannot wedge a PR. +Only `allocs/op`, `B/op`, and `sec/op` are gated: custom `b.ReportMetric` units +are collected and reported as ungated, never enforced. + +Two failure modes are treated as loud configuration errors (exit 2) rather than +silent gaps: a capture whose result lines fail to parse (for example test log +output interleaved into a `Benchmark...` line — the sync benchmarks silence the +engine's logger for exactly this reason), and a gated unit present in the +baseline but missing from the candidate (for example a candidate captured +without `-benchmem`), which would otherwise silently disable that gate for good. +The reverse — a gated unit missing from the baseline, which may legitimately be +older or partial — is reported as not gated. + +The gate always runs with a fixed `-benchtime=Nx` iteration count (not a +duration): two of the benchmarks grow their fixture as they iterate, so the +baseline and candidate must run the same number of iterations to measure +identical workloads. CI evaluates `make bench-gate-config` on the PR head and +passes the count and benchtime into the merge-base run, so a PR that changes +those defaults still compares identical workloads; do the same locally if you +override them. + +Report identifiers are package-qualified benchmark names +(`go.kenn.io/agentsview/internal/db.InsertMessagesBatch-18`) when the captured +output carries `pkg:` metadata, falling back to the bare name when it does not +(e.g. hand-trimmed captures). Do not mix captures with and without `pkg:` lines: +the same benchmark would key differently and be treated as removed/new. + +Run locally, comparing your working tree against a baseline commit. Like CI, use +a worktree for the baseline — checking out or stashing in place can leave +candidate files (or your commits) in the baseline run: + +```bash +make bench-gate > new.txt # candidate: current tree +git worktree add /tmp/bench-base "$(git merge-base HEAD origin/main)" +make -C /tmp/bench-base bench-gate > old.txt +git worktree remove /tmp/bench-base +go run ./cmd/benchgate -old old.txt -new new.txt +``` + +Cross-backend query benchmarks live separately in `internal/backendbench` +(`make bench-backends`, requires Docker) and are not part of the PR gate. + +## Adding a benchmark to the gate + +Every benchmark in a gated package is gated — there is no per-name allowlist to +maintain. A benchmark added by a PR has no baseline, so its first run is +reported without gating; it gates automatically once merged. + +1. Write the benchmark next to the code it guards (`*_bench_test.go`, + `b.ReportAllocs()`, self-assert the invariant it protects where possible). + If the code under test logs, silence the logger in the benchmark (see + `silenceBenchLogs` in `internal/sync/engine_bench_test.go`): interleaved + log output corrupts result lines and benchgate fails on the corruption. +1. If its package is not already gated, add it to `BENCH_GATE_PACKAGES` in the + Makefile — a benchmark outside the gated packages silently never runs, so + it looks gated while measuring nothing. CI picks the list up from the + Makefile; each side of the comparison benchmarks its own commit's list, so + growing the gate cannot break the base run. +1. Keep per-op cost roughly in the 100µs–100ms band: below the benchgate floors + nothing is gated, and far above it the job gets slow. +1. Keep per-iteration setup out of the timed region (`b.ResetTimer`, pre-built + fixtures): helper allocations inside the loop are gated as if they were + product cost and dilute or distort the ratio. +1. If the benchmark's fixture grows across iterations, say so in its comment; + the fixed `-benchtime=Nx` keeps both sides comparable, but readers need to + know per-op cost depends on the iteration count. diff --git a/go.mod b/go.mod index 236d14d8f..d32a5084e 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/tidwall/gjson v1.19.0 go.kenn.io/kit v0.1.7 golang.org/x/mod v0.37.0 + golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6 golang.org/x/sync v0.21.0 golang.org/x/sys v0.46.0 golang.org/x/term v0.44.0 @@ -35,6 +36,7 @@ require ( dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 // indirect github.com/apache/arrow-go/v18 v18.5.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -109,12 +111,12 @@ require ( go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/otel/trace v1.43.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.51.0 // indirect - golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect - golang.org/x/net v0.54.0 // indirect - golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/crypto v0.53.0 // indirect + golang.org/x/exp v0.0.0-20260603202125-055de637280b // indirect + golang.org/x/net v0.56.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6 // indirect - golang.org/x/text v0.37.0 // indirect + golang.org/x/text v0.38.0 // indirect golang.org/x/tools v0.45.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect gopkg.in/ini.v1 v1.67.2 // indirect diff --git a/go.sum b/go.sum index 07b890a25..d347c22a3 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY= github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro= github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/apache/arrow-go/v18 v18.5.1 h1:yaQ6zxMGgf9YCYw4/oaeOU3AULySDlAYDOcnr4LdHdI= @@ -254,16 +256,18 @@ go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09 go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= -golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU= -golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= +golang.org/x/exp v0.0.0-20260603202125-055de637280b h1:v1uXiEBHo8QA0LiGCo7UgHMzHT4Kdfpl2zmtH5vaP1Q= +golang.org/x/exp v0.0.0-20260603202125-055de637280b/go.mod h1:d2fgXJLVs4dYDHUk5lwMIfzRzSrWCfGZb0ZqeLa/Vcw= golang.org/x/mod v0.37.0 h1:vF1DjpVEshcIqoEaauuHebaLk1O1forxjxBaVn884JQ= golang.org/x/mod v0.37.0/go.mod h1:m8S8VeM9r4dzDwjrKO0a1sZP3YjeMamRRlD+fmR2Q/0= -golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= -golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= -golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= -golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6 h1:YTGOochus7nQXtT5KnsVfJ3/1yslyOyP3m9vMG1qMuQ= +golang.org/x/perf v0.0.0-20260615155930-9e4b9ddef5b6/go.mod h1:FisaKCtzcRx02gCop3DILSnoD7CMnqwmde2yVxo4meM= golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -275,8 +279,8 @@ golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6 h1:HjU6IWBiAgRIdAJ9/y1 golang.org/x/telemetry v0.0.0-20260508192327-42602be52be6/go.mod h1:Eqhaxk/wZsWEH8CRxLwj6xzEJbz7k1EFGqx7nyCoabE= golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8= golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/internal/db/db_test.go b/internal/db/db_test.go index acbb3b311..4ced0f394 100644 --- a/internal/db/db_test.go +++ b/internal/db/db_test.go @@ -300,13 +300,13 @@ const ( tsMidYear = "2024-06-01T10:00:00Z" ) -func testDB(t *testing.T) *DB { - t.Helper() - dir := t.TempDir() +func testDB(tb testing.TB) *DB { + tb.Helper() + dir := tb.TempDir() path := filepath.Join(dir, "test.db") d, err := openCopiedTestDB(path) - require.NoError(t, err, "opening test db") - t.Cleanup(func() { require.NoError(t, d.Close()) }) + require.NoError(tb, err, "opening test db") + tb.Cleanup(func() { require.NoError(tb, d.Close()) }) return d } diff --git a/internal/db/messages_bench_test.go b/internal/db/messages_bench_test.go new file mode 100644 index 000000000..73a9eebfb --- /dev/null +++ b/internal/db/messages_bench_test.go @@ -0,0 +1,146 @@ +package db + +import ( + "encoding/json" + "fmt" + "testing" +) + +// Hot-path benchmarks for the message write and usage aggregation +// paths that have regressed in the past. CI's bench-gate workflow +// runs them on every PR and compares allocs/op, B/op, and ns/op +// against the merge base, so a reintroduced O(session-history) +// rewrite or per-row JSON parse fails the PR instead of shipping: +// +// - BenchmarkReplaceSessionMessagesStreamingMerge: a one-row tail +// change must take the in-place diff path (one UPDATE) rather +// than delete+reinserting every row and rewriting the FTS index +// (regressed pre-#954: streaming chunk merges rewrote whole +// sessions on every appended chunk). +// - BenchmarkInsertMessagesBatch: bulk ingest must keep multi-row +// batched inserts (#411). +// +// BenchmarkGetDailyUsage in usage_test.go covers the usage +// aggregation scan (#309) and is part of the same CI gate. + +// benchSessionMessages builds n alternating user/assistant messages. +// Assistant messages carry a model and token_usage payload so writes +// exercise the same columns real ingest does. +func benchSessionMessages(sessionID string, n int) []Message { + msgs := make([]Message, 0, n) + for i := range n { + role := "user" + content := fmt.Sprintf( + "user message %d with enough text to look real", i, + ) + if i%2 == 1 { + role = "assistant" + content = fmt.Sprintf( + "assistant reply %d with enough text to look real", i, + ) + } + m := Message{ + SessionID: sessionID, + Ordinal: i, + Role: role, + Content: content, + Timestamp: fmt.Sprintf("2026-06-%02dT10:00:00Z", 1+i%28), + ContentLength: len(content), + } + if role == "assistant" { + m.Model = "claude-bench-model" + m.TokenUsage = json.RawMessage( + `{"input_tokens":120,"output_tokens":45,` + + `"cache_creation_input_tokens":10,` + + `"cache_read_input_tokens":200}`, + ) + } + msgs = append(msgs, m) + } + return msgs +} + +func seedBenchSession( + b *testing.B, d *DB, sessionID string, n int, +) []Message { + b.Helper() + if err := d.UpsertSession(Session{ + ID: sessionID, + Project: "bench", + Machine: "local", + Agent: "claude", + }); err != nil { + b.Fatalf("seed session %s: %v", sessionID, err) + } + msgs := benchSessionMessages(sessionID, n) + if err := d.InsertMessages(msgs); err != nil { + b.Fatalf("seed messages for %s: %v", sessionID, err) + } + return msgs +} + +// BenchmarkReplaceSessionMessagesStreamingMerge measures the +// streaming chunk-merge shape: replacing a stored session where only +// the tail message's content changed. The diff planner must apply a +// single in-place UPDATE; cost must not scale with the number of +// unchanged stored rows being rewritten. +func BenchmarkReplaceSessionMessagesStreamingMerge(b *testing.B) { + const stored = 1000 + d := testDB(b) + msgs := seedBenchSession(b, d, "bench-replace", stored) + last := len(msgs) - 1 + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + content := fmt.Sprintf( + "assistant reply %d merged streaming tail variant %d", + last, i, + ) + msgs[last].Content = content + msgs[last].ContentLength = len(content) + if err := d.ReplaceSessionMessages("bench-replace", msgs); err != nil { + b.Fatalf("replace: %v", err) + } + } +} + +// BenchmarkInsertMessagesBatch measures bulk session ingest: one +// session row plus a batch insert of its messages, the unit of work +// the full-sync write pipeline performs per session. +// +// Each iteration adds a new session, so the database grows with the +// iteration count and per-op cost is only comparable between runs +// with the same count: the bench gate always runs with a fixed +// -benchtime=Nx (see bench.yml and the Makefile) so baseline and +// candidate insert into identically sized databases. +func BenchmarkInsertMessagesBatch(b *testing.B) { + const batch = 200 + d := testDB(b) + + // Build the message fixture once: constructing 200 Message + // structs (~400 fmt.Sprintf calls) inside the timed loop would + // be gated as if it were ingest cost. Only the SessionID is + // rewritten per iteration, which allocates nothing. + msgs := benchSessionMessages("", batch) + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + sid := fmt.Sprintf("bench-insert-%06d", i) + if err := d.UpsertSession(Session{ + ID: sid, + Project: "bench", + Machine: "local", + Agent: "claude", + }); err != nil { + b.Fatalf("upsert session: %v", err) + } + for j := range msgs { + msgs[j].SessionID = sid + } + if err := d.InsertMessages(msgs); err != nil { + b.Fatalf("insert messages: %v", err) + } + } +} diff --git a/internal/db/usage_test.go b/internal/db/usage_test.go index c0ae860e8..194d117d9 100644 --- a/internal/db/usage_test.go +++ b/internal/db/usage_test.go @@ -2910,7 +2910,7 @@ func TestExcludeModelFilter(t *testing.T) { } func BenchmarkGetDailyUsage(b *testing.B) { - d := testDB(&testing.T{}) + d := testDB(b) ctx := context.Background() if err := d.UpsertModelPricing([]ModelPricing{ diff --git a/internal/sync/engine_bench_test.go b/internal/sync/engine_bench_test.go new file mode 100644 index 000000000..9259e3ff4 --- /dev/null +++ b/internal/sync/engine_bench_test.go @@ -0,0 +1,381 @@ +package sync + +import ( + "context" + "fmt" + "io" + "log" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + "go.kenn.io/agentsview/internal/db" + "go.kenn.io/agentsview/internal/parser" + "go.kenn.io/agentsview/internal/testjsonl" +) + +// Hot-path benchmarks for the sync engine, covering the regression +// classes that have shipped before. CI's bench-gate workflow runs +// them on every PR and compares allocs/op, B/op, and ns/op against +// the merge base: +// +// - BenchmarkSyncAllWarmNoop: a full sync over an already-synced, +// unchanged archive must do stat+skip work only. Regressed when +// the provider migration dropped pre-parse DB-freshness skips +// and every full sync reparsed and rewrote unchanged sessions +// (fixed by providerSourceUnchangedInDB), and when discovery +// recomputed root-derived project info per source (#912). +// - BenchmarkSyncPathsIncrementalAppend: absorbing one appended +// line into a large session must scale with the appended data, +// not the stored history (#954). +// - BenchmarkSyncAllColdArchive: first-sync ingest throughput for +// a fresh archive through the default per-session write path. +// - BenchmarkResyncBulkIngest: the same archive through the +// resync bulk-write pipeline (writeBatchBulk / +// DB.WriteSessionBatch, the #411 regression class). +// +// Fixture sizes scale via AGENTSVIEW_BENCH_SYNC_SESSIONS and +// AGENTSVIEW_BENCH_SYNC_MESSAGES for larger local runs. + +const ( + defaultBenchSyncSessions = 40 + defaultBenchSyncMessages = 30 + benchLargeSessionLines = 1000 +) + +// silenceBenchLogs discards the engine's global log output for the +// duration of the benchmark. Log lines interleave with `go test +// -bench` result lines on stdout, which corrupts them so benchfmt +// cannot parse the benchmark — benchgate fails on such corruption, +// and before it did, the corrupted benchmarks silently vanished +// from the gate on both sides. +func silenceBenchLogs(b *testing.B) { + b.Helper() + prev := log.Writer() + log.SetOutput(io.Discard) + b.Cleanup(func() { log.SetOutput(prev) }) +} + +func benchIntFromEnv(name string, def int) int { + raw := os.Getenv(name) + if raw == "" { + return def + } + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + return def + } + return n +} + +// writeBenchClaudeArchive lays out `sessions` Claude JSONL session +// files of `perSession` alternating user/assistant messages under +// dir/bench-project, mirroring the on-disk shape SyncAll discovers. +func writeBenchClaudeArchive( + b *testing.B, dir string, sessions, perSession int, +) { + b.Helper() + proj := filepath.Join(dir, "bench-project") + if err := os.MkdirAll(proj, 0o755); err != nil { + b.Fatalf("MkdirAll: %v", err) + } + for s := range sessions { + builder := testjsonl.NewSessionBuilder() + for m := 0; m < perSession; m += 2 { + ts := fmt.Sprintf( + "2026-06-20T10:%02d:%02dZ", (m/2/60)%60, (m/2)%60, + ) + builder.AddClaudeUser(ts, fmt.Sprintf( + "user message %d in session %d", m, s, + )) + builder.AddClaudeAssistant(ts, fmt.Sprintf( + "assistant reply %d in session %d", m, s, + )) + } + path := filepath.Join( + proj, fmt.Sprintf("bench-%04d.jsonl", s), + ) + if err := os.WriteFile( + path, []byte(builder.String()), 0o644, + ); err != nil { + b.Fatalf("WriteFile %s: %v", path, err) + } + } +} + +// openBenchEngine opens a fresh SQLite DB and an engine watching dir +// as a Claude root. Cleanup closes the engine before the DB so any +// pending debounced signal recompute drains first. +func openBenchEngine(b *testing.B, dir string) (*Engine, *db.DB) { + b.Helper() + database, err := db.Open(filepath.Join(b.TempDir(), "bench.db")) + if err != nil { + b.Fatalf("open bench db: %v", err) + } + engine := NewEngine(database, EngineConfig{ + AgentDirs: map[parser.AgentType][]string{ + parser.AgentClaude: {dir}, + }, + Machine: "local", + }) + b.Cleanup(func() { + engine.Close() + if err := database.Close(); err != nil { + b.Errorf("close bench db: %v", err) + } + }) + return engine, database +} + +// BenchmarkSyncAllWarmNoop measures a full sync pass over an archive +// that is already fully synced and unchanged on disk: discovery plus +// per-source freshness skips. It also asserts the invariant the +// benchmark exists to protect — a warm no-op pass must not reparse +// or rewrite anything. +func BenchmarkSyncAllWarmNoop(b *testing.B) { + silenceBenchLogs(b) + sessions := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, + ) + perSession := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_MESSAGES", defaultBenchSyncMessages, + ) + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, sessions, perSession) + engine, _ := openBenchEngine(b, dir) + ctx := context.Background() + + first := engine.SyncAll(ctx, nil) + if first.Synced != sessions { + b.Fatalf( + "initial sync stored %d of %d sessions (failed=%d)", + first.Synced, sessions, first.Failed, + ) + } + + b.ReportAllocs() + b.ResetTimer() + for range b.N { + stats := engine.SyncAll(ctx, nil) + if stats.Synced != 0 { + b.Fatalf( + "warm no-op sync re-synced %d sessions", stats.Synced, + ) + } + if writes := engine.PhaseStats().BatchedWrites.Load(); writes != 0 { + b.Fatalf( + "warm no-op sync bulk-wrote %d sessions", writes, + ) + } + } +} + +// BenchmarkSyncPathsIncrementalAppend measures absorbing a single +// appended JSONL line into a session that already stores +// benchLargeSessionLines messages, the streaming write the serve +// daemon performs thousands of times per day. +// +// The session grows by one message per iteration, so per-op cost is +// only comparable between runs with the same iteration count: the +// bench gate always runs with a fixed -benchtime=Nx (see bench.yml +// and the Makefile) so baseline and candidate absorb appends into +// identically sized sessions. Growth is deliberate — per-append cost +// staying flat as the session grows is exactly the invariant this +// benchmark protects. +func BenchmarkSyncPathsIncrementalAppend(b *testing.B) { + silenceBenchLogs(b) + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, 1, benchLargeSessionLines) + engine, database := openBenchEngine(b, dir) + ctx := context.Background() + + first := engine.SyncAll(ctx, nil) + if first.Synced != 1 { + b.Fatalf( + "initial sync stored %d sessions (failed=%d)", + first.Synced, first.Failed, + ) + } + + path := filepath.Join(dir, "bench-project", "bench-0000.jsonl") + f, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + b.Fatalf("OpenFile: %v", err) + } + defer f.Close() + + // The first append after a full sync triggers the leading-edge + // inline signal recompute — an O(stored history) message reload + // plus secret scan whose one-time cost would otherwise dominate + // the fixed 20-iteration measurement and mask steady-state + // per-append regressions. Absorb it before the timer starts so + // the timed loop measures the debounced steady state. + warmup := testjsonl.NewSessionBuilder().AddClaudeUser( + "2026-06-20T11:00:00Z", "warm-up append", + ).String() + if _, err := f.WriteString(warmup); err != nil { + b.Fatalf("warm-up append: %v", err) + } + engine.SyncPaths([]string{path}) + + // Stretch the debounce window so the flush timer cannot fire + // inside the timed loop on a slow run: allocs/op is gated on the + // candidate's worst -count run, and one timer-driven O(history) + // recompute landing mid-loop would flake the gate. Engine.Close + // drains the deferred recompute during cleanup. + engine.signalSched.mu.Lock() + engine.signalSched.interval = time.Hour + engine.signalSched.quiet = time.Hour + engine.signalSched.mu.Unlock() + + // Pre-build the appended lines: constructing Claude JSONL via + // testjsonl allocates (map + json.Marshal), and inside the timed + // loop that helper cost would be gated as if it were sync work. + lines := make([]string, b.N) + for i := range lines { + lines[i] = testjsonl.NewSessionBuilder().AddClaudeUser( + "2026-06-20T11:00:00Z", + fmt.Sprintf("streamed line %d", i), + ).String() + } + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + if _, err := f.WriteString(lines[i]); err != nil { + b.Fatalf("append: %v", err) + } + engine.SyncPaths([]string{path}) + } + b.StopTimer() + + msgs, err := database.GetAllMessages(ctx, "bench-0000") + if err != nil { + b.Fatalf("GetAllMessages: %v", err) + } + want := benchLargeSessionLines + 1 + b.N // +1 for the warm-up append + if len(msgs) < want { + b.Fatalf( + "appends were not absorbed: stored %d messages, want >= %d", + len(msgs), want, + ) + } +} + +// benchColdArchive is the shared cold-ingest loop: each iteration +// syncs the same archive into a fresh database via syncOnce, then +// verify checks the iteration's outcome with the timer stopped. +func benchColdArchive( + b *testing.B, + syncOnce func(*Engine) SyncStats, + verify func(*Engine, SyncStats, int), +) { + b.Helper() + silenceBenchLogs(b) + sessions := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_SESSIONS", defaultBenchSyncSessions, + ) + perSession := benchIntFromEnv( + "AGENTSVIEW_BENCH_SYNC_MESSAGES", defaultBenchSyncMessages, + ) + dir := b.TempDir() + writeBenchClaudeArchive(b, dir, sessions, perSession) + dbDir := b.TempDir() + + b.ReportAllocs() + b.ResetTimer() + for i := range b.N { + b.StopTimer() + dbPath := filepath.Join(dbDir, fmt.Sprintf("cold-%d.db", i)) + database, err := db.Open(dbPath) + if err != nil { + b.Fatalf("open bench db: %v", err) + } + engine := NewEngine(database, EngineConfig{ + AgentDirs: map[parser.AgentType][]string{ + parser.AgentClaude: {dir}, + }, + Machine: "local", + }) + b.StartTimer() + + stats := syncOnce(engine) + + b.StopTimer() + verify(engine, stats, sessions) + engine.Close() + if err := database.Close(); err != nil { + b.Fatalf("close bench db: %v", err) + } + // Drop this iteration's DB (and WAL/SHM sidecars) so disk + // usage stays O(1) instead of retaining b.N populated + // databases until the function returns. + stale, err := filepath.Glob(dbPath + "*") + if err != nil { + b.Fatalf("glob %s: %v", dbPath, err) + } + for _, p := range stale { + if err := os.Remove(p); err != nil { + b.Fatalf("remove %s: %v", p, err) + } + } + b.StartTimer() + } +} + +// BenchmarkSyncAllColdArchive measures first-sync ingest throughput +// through the public SyncAll path: parse plus the default +// per-session writes a user's first sync performs. +func BenchmarkSyncAllColdArchive(b *testing.B) { + ctx := context.Background() + benchColdArchive(b, + func(engine *Engine) SyncStats { + return engine.SyncAll(ctx, nil) + }, + func(_ *Engine, stats SyncStats, sessions int) { + if stats.Synced != sessions { + b.Fatalf( + "cold sync stored %d of %d sessions (failed=%d)", + stats.Synced, sessions, stats.Failed, + ) + } + }, + ) +} + +// BenchmarkResyncBulkIngest measures the bulk-write ingest path a +// full resync uses: syncWriteBulk routes every parsed session +// through writeBatchBulk and DB.WriteSessionBatch — the #411 +// regression class. This is a different write path from the default +// per-session writes BenchmarkSyncAllColdArchive covers, and it +// self-asserts that every session really went through the batch +// pipeline so the benchmark cannot silently measure the wrong path. +func BenchmarkResyncBulkIngest(b *testing.B) { + ctx := context.Background() + benchColdArchive(b, + func(engine *Engine) SyncStats { + engine.syncMu.Lock() + defer engine.syncMu.Unlock() + return engine.syncAllLocked( + ctx, nil, time.Time{}, nil, syncWriteBulk, true, + ) + }, + func(engine *Engine, stats SyncStats, sessions int) { + if stats.Synced != sessions { + b.Fatalf( + "bulk ingest stored %d of %d sessions (failed=%d)", + stats.Synced, sessions, stats.Failed, + ) + } + writes := engine.PhaseStats().BatchedWrites.Load() + if writes != int64(sessions) { + b.Fatalf( + "bulk ingest wrote %d of %d sessions via the batch pipeline", + writes, sessions, + ) + } + }, + ) +} diff --git a/internal/sync/perf_invariant_test.go b/internal/sync/perf_invariant_test.go new file mode 100644 index 000000000..7f4ff8d43 --- /dev/null +++ b/internal/sync/perf_invariant_test.go @@ -0,0 +1,66 @@ +package sync + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Deterministic work-count gates for the "sync work scales with new +// data, not archive size" invariant. Unlike the wall-clock +// benchmarks in engine_bench_test.go, these run in the regular test +// suite and count work units, so they fail loudly in CI regardless +// of runner noise. Companion gates elsewhere: +// +// - TestProviderAuthoritativeUnchangedSessionSkipsOnResync covers +// the generic providerSourceUnchangedInDB skip for the +// provider.Parse fallthrough group (Vibe as representative). +// - TestWriteIncrementalDebouncesSignalRecompute pins the #954 +// debounce of the O(history) signal recompute. +// - The count-based seam tests in internal/parser +// (discovery_workspace_manifest_test.go, antigravity/gemini +// provider tests) pin O(roots) discovery work (#912). + +// TestWarmFullSyncDoesNoBulkWriteWork verifies that a second full +// sync over an unchanged Claude archive skips every session before +// the parse and never enters the bulk-write pipeline. Claude has its +// own pre-parse freshness path (shouldSkipProviderSourceByDB), +// distinct from the generic check the Vibe test covers; both have +// regressed independently in the past. +func TestWarmFullSyncDoesNoBulkWriteWork(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test") + } + + fx := newEngineFixture(t) + const n = 5 + for i := range n { + fx.writeClaudeSession( + t, "proj", fmt.Sprintf("warm-%d.jsonl", i), + fmt.Sprintf("hello %d", i), + ) + } + + ctx := context.Background() + first := fx.engine.SyncAll(ctx, nil) + require.Equal(t, n, first.Synced, + "first sync parses and stores every session") + + second := fx.engine.SyncAll(ctx, nil) + assert.Equal(t, 0, second.Synced, + "unchanged sessions must not be re-synced on a warm pass") + assert.GreaterOrEqual(t, second.Skipped, n, + "every unchanged session must be counted as skipped") + + // PhaseStats resets at the start of each pass, so after the + // second pass it reflects only that pass: a warm no-op sync + // must not have run a single bulk-write batch. + stats := fx.engine.PhaseStats() + assert.Zero(t, stats.Batches.Load(), + "warm no-op sync must not run any bulk-write batch") + assert.Zero(t, stats.BatchedWrites.Load(), + "warm no-op sync must not rewrite any session") +}