diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 04049a330..29e872fae 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -82,7 +82,7 @@ jobs:
- name: Checkout bpfilter
uses: actions/checkout@v2
- name: Configure the build
- run: cmake -S $GITHUB_WORKSPACE -B $GITHUB_WORKSPACE/build -DNO_BENCHMARKS=1
+ run: cmake -S $GITHUB_WORKSPACE -B $GITHUB_WORKSPACE/build
- name: Build all
run: make -C $GITHUB_WORKSPACE/build -j `nproc`
@@ -177,7 +177,8 @@ jobs:
BENCH_INCLUDE=""
BENCH_FAIL_ON=""
fi
- tools/benchmarks/bfbencher \
+ tests/benchmarks/bfbencher \
+ history \
--since 30bd49f \
--until $BENCH_UNTIL \
$BENCH_INCLUDE \
diff --git a/CLAUDE.md b/CLAUDE.md
index fe4f209b1..ecf74b6c2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -68,7 +68,7 @@ make -C build doc
```
**Build options:**
-- `-DNO_DOCS=1`, `-DNO_TESTS=1`, `-DNO_CHECKS=1`, `-DNO_BENCHMARKS=1`
+- `-DNO_DOCS=1`, `-DNO_TESTS=1`, `-DNO_CHECKS=1`
## Code style
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e787ed8a..d3b5f48c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,6 @@ include(GNUInstallDirs)
option(NO_DOCS "Disable documentation generation" 0)
option(NO_TESTS "Disable unit, end-to-end, and integration tests" 0)
option(NO_CHECKS "Disable the check target (clang-tidy and clang-format" 0)
-option(NO_BENCHMARKS "Disable the benchmark" 0)
option(WITH_COVERAGE "Build with code coverage support. Disabled by default" 0)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -100,7 +99,3 @@ if (NOT ${NO_TESTS})
enable_testing()
add_subdirectory(tests)
endif ()
-
-if (NOT ${NO_BENCHMARKS})
- add_subdirectory(tools/benchmarks)
-endif ()
diff --git a/README.md b/README.md
index 4705845a1..feab00460 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ sudo dnf install -y bpfilter bpfilter-devel
sudo dnf install -y clang cmake gcc libbpf-devel bison flex sed xxd
# Configure the project and build bpfilter
-cmake -S $SOURCES_DIR -B $BUILD_DIR -DNO_DOCS=ON -DNO_TESTS=ON -DNO_CHECKS=ON -DNO_BENCHMARKS=ON
+cmake -S $SOURCES_DIR -B $BUILD_DIR -DNO_DOCS=ON -DNO_TESTS=ON -DNO_CHECKS=ON
make -C $BUILD_DIR
```
diff --git a/derivation.nix b/derivation.nix
index 02da8cdb6..82b736e2b 100644
--- a/derivation.nix
+++ b/derivation.nix
@@ -66,7 +66,6 @@ in
"-DNO_DOCS=1"
"-DNO_TESTS=1"
"-DNO_CHECKS=1"
- "-DNO_BENCHMARKS=1"
];
# We do not run the unit tests because the nix build sandbox doesn't
diff --git a/doc/developers/build.rst b/doc/developers/build.rst
index f979cac32..25e4fa1d6 100644
--- a/doc/developers/build.rst
+++ b/doc/developers/build.rst
@@ -95,9 +95,8 @@ You can then use CMake to generate the build system:
The usual CMake options are allowed (e.g. ``CMAKE_BUILD_TYPE``, ``CMAKE_INSTALL_PREFIX``...). The build configuration is modular, so you're free to enable/disable some parts of the projects according to your needs:
- ``-DNO_DOCS``: disable the documentation, including the coverage and benchmarks report.
-- ``-DNO_TESTS``: disable unit tests, end-to-end tests, and integration tests.
+- ``-DNO_TESTS``: disable all tests.
- ``-DNO_CHECKS``: disable style check and static analyzer.
-- ``-DNO_BENCHMARKS``: disable benchmarks.
A full configuration (without any part disabled) will provide the following targets:
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b3efbb8f3..4c8d0d6f9 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory(e2e)
add_subdirectory(fuzz)
add_subdirectory(integration)
add_subdirectory(check)
+add_subdirectory(benchmarks)
add_custom_target(test_bin
DEPENDS unit_bin e2e_bin fuzz_parser
diff --git a/tools/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
similarity index 98%
rename from tools/benchmarks/CMakeLists.txt
rename to tests/benchmarks/CMakeLists.txt
index a9fad4f2a..e5d3b54a3 100644
--- a/tools/benchmarks/CMakeLists.txt
+++ b/tests/benchmarks/CMakeLists.txt
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
# Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
-enable_language(CXX)
-
find_package(benchmark REQUIRED)
find_package(PkgConfig REQUIRED)
pkg_check_modules(bpf REQUIRED IMPORTED_TARGET libbpf)
diff --git a/tools/benchmarks/benchmark.cpp b/tests/benchmarks/benchmark.cpp
similarity index 100%
rename from tools/benchmarks/benchmark.cpp
rename to tests/benchmarks/benchmark.cpp
diff --git a/tools/benchmarks/benchmark.hpp b/tests/benchmarks/benchmark.hpp
similarity index 100%
rename from tools/benchmarks/benchmark.hpp
rename to tests/benchmarks/benchmark.hpp
diff --git a/tools/benchmarks/bfbencher b/tests/benchmarks/bfbencher
similarity index 76%
rename from tools/benchmarks/bfbencher
rename to tests/benchmarks/bfbencher
index 08f62939a..9fe089403 100755
--- a/tools/benchmarks/bfbencher
+++ b/tests/benchmarks/bfbencher
@@ -36,8 +36,8 @@ DEFAULT_LAST_COMMIT_REF = "wip"
DEFAULT_SOURCE_PATH = pathlib.Path(".")
DEFAULT_CACHE_PATH = pathlib.Path(".cache/bfbencher")
DEFAULT_USERNAME = getpass.getuser()
-DEFAULT_REPORT_TEMPLATE_PATH = pathlib.Path("tools/benchmarks/results.html.j2")
-DEFAULT_PR_REPORT_TEMPLATE_PATH = pathlib.Path("tools/benchmarks/summary.html.j2")
+DEFAULT_REPORT_TEMPLATE_PATH = pathlib.Path("tests/benchmarks/results.html.j2")
+DEFAULT_PR_REPORT_TEMPLATE_PATH = pathlib.Path("tests/benchmarks/summary.html.j2")
DEFAULT_HOST = [socket.gethostname(), "localhost"]
SHORT_SHA_LEN = 7
@@ -53,6 +53,7 @@ class Stats:
self.n_failures = 0
self.n_cache_hits = 0
self.n_cache_misses = 0
+ self.n_benchmark_errors = 0
def success(self, from_cache: bool = False) -> None:
if from_cache:
@@ -111,6 +112,47 @@ class Renderer:
self.console.print(table)
+ def print_compare_report(
+ self,
+ rows: list[Report.CompareRow],
+ base_sha: str,
+ ref_sha: str,
+ ) -> None:
+ def format_pct(pct: float) -> str:
+ color = "green" if pct < 0 else ("red" if pct > 0 else "white")
+ return f"[{color}]{pct:+.1f}%[/{color}]"
+
+ table = rich.table.Table(
+ title=f"{base_sha[:SHORT_SHA_LEN]} → {ref_sha[:SHORT_SHA_LEN]}",
+ show_header=True,
+ )
+ table.add_column("Benchmark", style="cyan")
+ table.add_column("Base", justify="right")
+ table.add_column("Ref", justify="right")
+ table.add_column("ΔTime", justify="right")
+ table.add_column("ΔTime%", justify="right")
+ table.add_column("Base Insn", justify="right")
+ table.add_column("Ref Insn", justify="right")
+ table.add_column("ΔInsn", justify="right")
+ table.add_column("ΔInsn%", justify="right")
+
+ for row in rows:
+ table.add_row(
+ row.name,
+ row.base_time_str,
+ row.ref_time_str,
+ row.delta_time_str,
+ format_pct(row.delta_time_pct),
+ str(row.base_insn) if row.base_insn is not None else "-",
+ str(row.ref_insn) if row.ref_insn is not None else "-",
+ f"{row.delta_insn:+d}" if row.delta_insn is not None else "-",
+ format_pct(row.delta_insn_pct)
+ if row.delta_insn_pct is not None
+ else "-",
+ )
+
+ self.console.print(table)
+
renderer: Renderer = Renderer()
@@ -221,6 +263,10 @@ class Benchmark:
"nInsn": Analyzer(nInsns[-1], nInsns[-n - 1 : -1]) if nInsns else None,
}
+ @property
+ def results(self) -> list[Result]:
+ return list(self._results)
+
@property
def last(self) -> Result | None:
return self._results[-1] if self._results else None
@@ -548,10 +594,56 @@ class FilesystemSource:
self._local = local_src_dir
shutil.copytree(self._path, self._local, dirs_exist_ok=True)
+ self._detach_if_worktree()
self._repo: git.Repo = git.Repo(self._local)
self._retry_all: bool = False
self._retry_failed: bool = False
+ def _detach_if_worktree(self) -> None:
+ """Convert a copied git worktree into a standalone repository.
+
+ In a git worktree the .git entry is a file containing a gitdir pointer
+ to the original repo's worktree-specific state. shutil.copytree copies
+ that file verbatim, so all git operations on the copy would modify the
+ original worktree's HEAD and index. This method detects that case and
+ replaces the .git file with a self-contained .git directory built from
+ the worktree-specific state (HEAD, index) and the shared commondir
+ (objects, refs, config, …).
+ """
+ git_entry = self._local / ".git"
+ if not git_entry.is_file():
+ return
+
+ content = git_entry.read_text().strip()
+ if not content.startswith("gitdir:"):
+ return
+
+ wt_gitdir = pathlib.Path(content.split(":", 1)[1].strip())
+ if not wt_gitdir.is_absolute():
+ wt_gitdir = (self._local / wt_gitdir).resolve()
+
+ commondir_file = wt_gitdir / "commondir"
+ if commondir_file.exists():
+ commondir = (wt_gitdir / commondir_file.read_text().strip()).resolve()
+ else:
+ commondir = wt_gitdir
+
+ # Replace the .git pointer file with a full standalone git directory.
+ git_entry.unlink()
+ shutil.copytree(commondir, self._local / ".git")
+
+ # Drop worktrees/ — those entries are specific to the original repo.
+ worktrees_dir = self._local / ".git" / "worktrees"
+ if worktrees_dir.exists():
+ shutil.rmtree(worktrees_dir)
+
+ # Apply the worktree-specific HEAD and index, which track the state of
+ # this worktree and differ from the main worktree's equivalents.
+ for fname in ("HEAD", "index"):
+ src = wt_gitdir / fname
+ if src.exists():
+ shutil.copy2(src, self._local / ".git" / fname)
+
@property
def local(self) -> pathlib.Path:
"""Local path to the source repository copy."""
@@ -749,7 +841,7 @@ class BenchmarkContext:
return self.build_dir / "output/sbin/bfcli"
def configure(
- self, doc: bool = False, tests: bool = False, checks: bool = False
+ self, doc: bool = False, checks: bool = False
) -> bool:
cmd: list[str] = [
"cmake",
@@ -761,8 +853,6 @@ class BenchmarkContext:
if not doc:
cmd += ["-DNO_DOCS=1"]
- if not tests:
- cmd += ["-DNO_TESTS=1"]
if not checks:
cmd += ["-DNO_CHECKS=1"]
@@ -884,6 +974,23 @@ class Report:
runtime_ns: float = 0 # Runtime in nanoseconds for sorting
insn_count: int = 0 # Instruction count for sorting
+ @dataclasses.dataclass
+ class CompareRow:
+ """Prepared data for a single benchmark row in compare mode."""
+
+ name: str
+ label: str
+ base_time_str: str
+ ref_time_str: str
+ delta_time_str: str
+ delta_time_pct: float
+ base_insn: int | None
+ ref_insn: int | None
+ delta_insn: int | None
+ delta_insn_pct: float | None
+ base_time_ns: float
+ ref_time_ns: float
+
def __init__(self, history: History):
self._history = history
@@ -1023,6 +1130,120 @@ class Report:
rows = self._get_benchmark_rows(terms)
renderer.print_report(rows, terms)
+ def _get_compare_rows(self, base_sha: str, ref_sha: str) -> list[CompareRow]:
+ rows = []
+ for benchmark in self._history.sorted_benchmarks():
+ base_result = next(
+ (r for r in benchmark.results if r.commit_sha == base_sha), None
+ )
+ ref_result = next(
+ (r for r in benchmark.results if r.commit_sha == ref_sha), None
+ )
+ if not base_result or not ref_result:
+ continue
+
+ base_ns: float = float(base_result.time.to("ns").magnitude) # type: ignore[union-attr]
+ ref_ns: float = float(ref_result.time.to("ns").magnitude) # type: ignore[union-attr]
+ delta_ns: float = ref_ns - base_ns
+ delta_pct: float = (delta_ns / base_ns * 100) if base_ns else 0.0
+
+ base_insn = int(base_result.nInsn) if base_result.nInsn else None
+ ref_insn = int(ref_result.nInsn) if ref_result.nInsn else None
+ if base_insn is not None and ref_insn is not None:
+ delta_insn: int | None = int(ref_insn) - int(base_insn)
+ delta_insn_pct: float | None = (
+ (delta_insn / base_insn * 100) if base_insn else 0
+ )
+ else:
+ delta_insn = None
+ delta_insn_pct = None
+
+ rows.append(
+ Report.CompareRow(
+ name=benchmark.name,
+ label=benchmark.label,
+ base_time_str=f"{base_result.time:~.2f}", # type: ignore[union-attr]
+ ref_time_str=f"{ref_result.time:~.2f}", # type: ignore[union-attr]
+ delta_time_str=f"{delta_ns:+.2f} ns",
+ delta_time_pct=delta_pct,
+ base_insn=base_insn,
+ ref_insn=ref_insn,
+ delta_insn=delta_insn,
+ delta_insn_pct=delta_insn_pct,
+ base_time_ns=base_ns,
+ ref_time_ns=ref_ns,
+ )
+ )
+
+ return rows
+
+ def print_compare_report(self, base_sha: str, ref_sha: str) -> None:
+ rows = self._get_compare_rows(base_sha, ref_sha)
+ renderer.print_compare_report(rows, base_sha, ref_sha)
+
+ def write_compare_json(
+ self,
+ path: pathlib.Path,
+ base_sha: str,
+ ref_sha: str,
+ host: str,
+ ) -> None:
+ rows = self._get_compare_rows(base_sha, ref_sha)
+ data = {
+ "base": base_sha,
+ "ref": ref_sha,
+ "host": host,
+ "benchmarks": [
+ {
+ "name": r.name,
+ "base_time_ns": r.base_time_ns,
+ "ref_time_ns": r.ref_time_ns,
+ "delta_time_ns": r.ref_time_ns - r.base_time_ns,
+ "delta_time_pct": r.delta_time_pct,
+ "base_insn": r.base_insn,
+ "ref_insn": r.ref_insn,
+ "delta_insn": r.delta_insn,
+ "delta_insn_pct": r.delta_insn_pct,
+ }
+ for r in rows
+ ],
+ }
+
+ with open(path, "w") as f:
+ json.dump(data, f, indent=2)
+
+
+def _benchmark_commits(executor: Executor, args: argparse.Namespace) -> None:
+ for ctx in BenchmarkContext.commits(executor):
+ if not ctx.configure():
+ continue
+
+ if not ctx.make("bfcli"):
+ continue
+
+ if not ctx.make("benchmark_bin"):
+ continue
+
+ if not ctx.run_benchmark(
+ args.bind_node, args.no_preempt, args.cpu_pin, args.slice
+ ):
+ continue
+
+ results = ctx.results
+ if not results:
+ executor.log(f"could not find {ctx.results_path}")
+ continue
+
+ for r in results:
+ if r.get("error_occurred"):
+ executor.stats.n_benchmark_errors += 1
+ executor.log(
+ f"[red bold]benchmark error: {r['name']}: {r.get('error_message', '')}[/]"
+ )
+
+ executor.add_results(ctx.commit, results)
+ executor.log("Done!")
+
def run_benchmarks(args: argparse.Namespace):
executor = (
@@ -1072,6 +1293,9 @@ def run_benchmarks(args: argparse.Namespace):
)
report.print_report([20])
+ if args.fail_on_benchmark_error and executor.stats.n_benchmark_errors > 0:
+ raise SystemExit(1)
+
if args.fail_on_significant_change:
terms = [20]
for benchmark in executor.results.sorted_benchmarks():
@@ -1081,72 +1305,56 @@ def run_benchmarks(args: argparse.Namespace):
raise SystemExit(1)
-def main():
- parser = argparse.ArgumentParser(
- prog="bfbencher",
- description="Benchmark bpfilter performance across git commits.",
- )
+def run_compare(args: argparse.Namespace):
+ source_repo = git.Repo(args.sources)
+ base_sha: str = source_repo.git.rev_parse(args.base)
+ ref_sha: str = source_repo.git.rev_parse(args.ref)
- parser.add_argument(
- "--since",
- type=str,
- help=f'oldest commit to benchmark. Use "wip" to start from the uncommitted changes (committed as "bfbencher: WIP"). Must be older than --until, or the same. Defaults to "{DEFAULT_FIRST_COMMIT_REF}"',
- default=DEFAULT_FIRST_COMMIT_REF,
- )
- parser.add_argument(
- "--include",
- type=str,
- action="append",
- default=[],
- help='include an extra commit outside the range. Can be specified multiple times. Use "wip" to include uncommitted changes. Commits are sorted in git order with the range commits.',
- )
- parser.add_argument(
- "--until",
- type=str,
- help=f'newest commit to benchmark. Use "wip" to include uncommitted changes (committed as "bfbencher: WIP"). Must be newer than --since, or the same. Defaults to "{DEFAULT_LAST_COMMIT_REF}"',
- default=DEFAULT_LAST_COMMIT_REF,
+ args.since = args.base
+ args.until = args.base
+ args.include = [args.ref]
+
+ executor = (
+ LocalExecutor(args) if args.host in DEFAULT_HOST else RemoteExecutor(args)
)
- parser.add_argument(
+
+ with executor:
+ _benchmark_commits(executor, args)
+
+ report = Report(executor.results)
+ report.print_compare_report(base_sha, ref_sha)
+
+ if args.json_output:
+ report.write_compare_json(
+ args.json_output, base_sha, ref_sha, executor.host
+ )
+
+ if args.fail_on_benchmark_error and executor.stats.n_benchmark_errors > 0:
+ raise SystemExit(1)
+
+
+def main():
+ # Options shared across all subcommands.
+ shared = argparse.ArgumentParser(add_help=False)
+ shared.add_argument(
"--sources",
type=pathlib.Path,
help=f'path to the bpfilter sources directory. Defaults to "{DEFAULT_SOURCE_PATH}".',
default=DEFAULT_SOURCE_PATH,
)
- parser.add_argument(
+ shared.add_argument(
"--host",
type=str,
help=f'host to run the benchmark on. bfbencher will connect to the host using SSH, copy the project sources on it, and run the benchmarks. Defaults to "{DEFAULT_HOST[0]}" (current host).',
default=DEFAULT_HOST[0],
)
- parser.add_argument(
+ shared.add_argument(
"--cache-dir",
type=pathlib.Path,
help=f"path to the directory containing the cached results. The cache is used to store benchmark results based on the hostname and the commit SHA, it is stored on the host running bfbencher. Defaults to {DEFAULT_CACHE_PATH}.",
default=DEFAULT_CACHE_PATH,
)
- parser.add_argument(
- "--report-template-path",
- type=pathlib.Path,
- help=f'path to the Jinja2 template use to generate the HTML report. Defaults to "{DEFAULT_REPORT_TEMPLATE_PATH}"',
- default=DEFAULT_REPORT_TEMPLATE_PATH,
- )
- parser.add_argument(
- "--report-path",
- type=pathlib.Path,
- help="path of the final HTML report.",
- )
- parser.add_argument(
- "--pr-report-template-path",
- type=pathlib.Path,
- help=f'path to the Jinja2 template use to generate the HTML pull-request report. Defaults to "{DEFAULT_PR_REPORT_TEMPLATE_PATH}"',
- default=DEFAULT_PR_REPORT_TEMPLATE_PATH,
- )
- parser.add_argument(
- "--pr-report-path",
- type=pathlib.Path,
- help="path of the HTML summary report for pull requests (shows only significant changes).",
- )
- parser.add_argument(
+ shared.add_argument(
"--retry",
"-r",
type=str,
@@ -1154,41 +1362,130 @@ def main():
default=[],
help='retry benchmarks for specific commits, ignoring cached results. Use "failed" to retry all failed commits, "all" to retry everything, or a commit ref to retry a specific commit. Can be specified multiple times.',
)
- parser.add_argument(
+ shared.add_argument(
+ "--fail-on-benchmark-error",
+ action="store_true",
+ help="exit with non-zero status if any benchmark reports an error during execution",
+ default=False,
+ )
+ shared.add_argument(
"--fail-on-significant-change",
choices=["better", "worse", "any"],
help="exit with non-zero status if any benchmark has a statistically significant change (better=improvement, worse=regression, any=either)",
default=None,
)
- parser.add_argument(
+ shared.add_argument(
"--bind-node",
type=int,
help="CPU and memory node to bind the benchmark to.",
default=None,
)
- parser.add_argument(
+ shared.add_argument(
"--no-preempt",
action="store_true",
- help="if set, use chrt to run the bechmark with real-time scheduling policy at the highest priority. This option should reduce jitter as only kernel threads could preempt it.",
+ help="if set, use chrt to run the benchmark with real-time scheduling policy at the highest priority. This option should reduce jitter as only kernel threads could preempt it.",
default=False,
)
- parser.add_argument(
+ shared.add_argument(
"--cpu-pin",
type=int,
help="if set, defines the CPU to pin the benchmark to. If the CPU is isolated, it will reduce variability between runs.",
default=None,
)
- parser.add_argument(
+ shared.add_argument(
"--slice",
type=str,
help="systemd slice to run the benchmark into. Required if --cpu-pin is isolated at the systemd level.",
default=None,
)
+ parser = argparse.ArgumentParser(
+ prog="bfbencher",
+ description="Benchmark bpfilter performance across git commits.",
+ )
+ subparsers = parser.add_subparsers(dest="command")
+
+ history_parser = subparsers.add_parser(
+ "history",
+ parents=[shared],
+ help="benchmark performance across a range of commits",
+ description="Benchmark bpfilter performance across a range of commits and report changes over time.",
+ )
+ history_parser.add_argument(
+ "--report-template-path",
+ type=pathlib.Path,
+        help=f'path to the Jinja2 template used to generate the HTML report. Defaults to "{DEFAULT_REPORT_TEMPLATE_PATH}"',
+ default=DEFAULT_REPORT_TEMPLATE_PATH,
+ )
+ history_parser.add_argument(
+ "--report-path",
+ type=pathlib.Path,
+ help="path of the final HTML report.",
+ )
+ history_parser.add_argument(
+ "--pr-report-template-path",
+ type=pathlib.Path,
+        help=f'path to the Jinja2 template used to generate the HTML pull-request report. Defaults to "{DEFAULT_PR_REPORT_TEMPLATE_PATH}"',
+ default=DEFAULT_PR_REPORT_TEMPLATE_PATH,
+ )
+ history_parser.add_argument(
+ "--pr-report-path",
+ type=pathlib.Path,
+ help="path of the HTML summary report for pull requests (shows only significant changes).",
+ )
+ history_parser.add_argument(
+ "--since",
+ type=str,
+ help=f'oldest commit to benchmark. Use "wip" to start from the uncommitted changes (committed as "bfbencher: WIP"). Must be older than --until, or the same. Defaults to "{DEFAULT_FIRST_COMMIT_REF}"',
+ default=DEFAULT_FIRST_COMMIT_REF,
+ )
+ history_parser.add_argument(
+ "--until",
+ type=str,
+ help=f'newest commit to benchmark. Use "wip" to include uncommitted changes (committed as "bfbencher: WIP"). Must be newer than --since, or the same. Defaults to "{DEFAULT_LAST_COMMIT_REF}"',
+ default=DEFAULT_LAST_COMMIT_REF,
+ )
+ history_parser.add_argument(
+ "--include",
+ type=str,
+ action="append",
+ default=[],
+ help='include an extra commit outside the range. Can be specified multiple times. Use "wip" to include uncommitted changes. Commits are sorted in git order with the range commits.',
+ )
+
+ compare_parser = subparsers.add_parser(
+ "compare",
+ parents=[shared],
+ help="compare performance between two specific commits",
+ description="Benchmark two specific commits and report the performance difference.",
+ )
+ compare_parser.add_argument(
+ "base",
+ type=str,
+ help='baseline commit ref. Use "wip" for uncommitted changes.',
+ )
+ compare_parser.add_argument(
+ "ref",
+ type=str,
+ help='commit ref to compare against the baseline. Use "wip" for uncommitted changes.',
+ )
+ compare_parser.add_argument(
+ "--json-output",
+ type=pathlib.Path,
+ help="write comparison results to a JSON file.",
+ )
+
args = parser.parse_args()
+ if args.command is None:
+ parser.print_help()
+ raise SystemExit(1)
+
try:
- run_benchmarks(args)
+ if args.command == "history":
+ run_benchmarks(args)
+ elif args.command == "compare":
+ run_compare(args)
except KeyboardInterrupt:
renderer.log("Command interrupted by user")
raise SystemExit(1)
diff --git a/tools/benchmarks/main.cpp b/tests/benchmarks/main.cpp
similarity index 86%
rename from tools/benchmarks/main.cpp
rename to tests/benchmarks/main.cpp
index c8f634aba..7517ef68b 100644
--- a/tools/benchmarks/main.cpp
+++ b/tests/benchmarks/main.cpp
@@ -94,6 +94,108 @@ void chain_policy_c(::benchmark::State &state)
BENCHMARK(chain_policy_c);
+void xdp_prologue_c(::benchmark::State &state)
+{
+ Chain chain("bf_benchmark", BF_HOOK_XDP, BF_VERDICT_ACCEPT);
+
+ auto chainp = chain.get();
+ int ret = bf_chain_set(chainp.get(), nullptr);
+ if (ret < 0)
+ throw std::runtime_error("failed to load chain");
+
+ auto prog = bft::Program(chain.name());
+
+ while (state.KeepRunningBatch(::bft::progRunRepeat)) {
+ auto stats = prog.run(::bft::pkt_local_ip6_tcp);
+ if (stats.retval != XDP_PASS)
+ state.SkipWithError("benchmark run failed");
+
+ state.SetIterationTime((double)stats.duration * stats.repeat);
+ }
+
+ state.counters["nInsn"] = prog.nInsn();
+ state.SetLabel("XDP prologue, accept policy");
+}
+
+BENCHMARK(xdp_prologue_c);
+
+void tc_ingress_prologue_c(::benchmark::State &state)
+{
+ Chain chain("bf_benchmark", BF_HOOK_TC_INGRESS, BF_VERDICT_ACCEPT);
+
+ auto chainp = chain.get();
+ int ret = bf_chain_set(chainp.get(), nullptr);
+ if (ret < 0)
+ throw std::runtime_error("failed to load chain");
+
+ auto prog = bft::Program(chain.name());
+
+ // TC_ACT_OK = 0
+ while (state.KeepRunningBatch(::bft::progRunRepeat)) {
+ auto stats = prog.run(::bft::pkt_local_ip6_tcp);
+ if (stats.retval != 0)
+ state.SkipWithError("benchmark run failed");
+
+ state.SetIterationTime((double)stats.duration * stats.repeat);
+ }
+
+ state.counters["nInsn"] = prog.nInsn();
+ state.SetLabel("TC_INGRESS prologue, accept policy");
+}
+
+BENCHMARK(tc_ingress_prologue_c);
+
+void cgroup_skb_ingress_prologue_c(::benchmark::State &state)
+{
+ Chain chain("bf_benchmark", BF_HOOK_CGROUP_SKB_INGRESS, BF_VERDICT_ACCEPT);
+
+ auto chainp = chain.get();
+ int ret = bf_chain_set(chainp.get(), nullptr);
+ if (ret < 0)
+ throw std::runtime_error("failed to load chain");
+
+ auto prog = bft::Program(chain.name());
+
+ while (state.KeepRunningBatch(::bft::progRunRepeat)) {
+ auto stats = prog.run(::bft::pkt_local_ip6_tcp);
+ if (stats.retval != ::bft::CGROUP_SKB_ACCEPT)
+ state.SkipWithError("benchmark run failed");
+
+ state.SetIterationTime((double)stats.duration * stats.repeat);
+ }
+
+ state.counters["nInsn"] = prog.nInsn();
+ state.SetLabel("CGROUP_SKB_INGRESS prologue, accept policy");
+}
+
+BENCHMARK(cgroup_skb_ingress_prologue_c);
+
+void nf_local_in_prologue_c(::benchmark::State &state)
+{
+ Chain chain("bf_benchmark", BF_HOOK_NF_LOCAL_IN, BF_VERDICT_ACCEPT);
+
+ auto chainp = chain.get();
+ int ret = bf_chain_set(chainp.get(), nullptr);
+ if (ret < 0)
+ throw std::runtime_error("failed to load chain");
+
+ auto prog = bft::Program(chain.name());
+
+ // NF_ACCEPT = 1
+ while (state.KeepRunningBatch(::bft::progRunRepeat)) {
+ auto stats = prog.run(::bft::pkt_local_ip6_tcp);
+ if (stats.retval != 1)
+ state.SkipWithError("benchmark run failed");
+
+ state.SetIterationTime((double)stats.duration * stats.repeat);
+ }
+
+ state.counters["nInsn"] = prog.nInsn();
+ state.SetLabel("NF_LOCAL_IN prologue, accept policy");
+}
+
+BENCHMARK(nf_local_in_prologue_c);
+
void single_rule__ip4_saddr(::benchmark::State &state)
{
Chain chain("bf_benchmark", BF_HOOK_XDP, BF_VERDICT_ACCEPT);
diff --git a/tools/benchmarks/summary.html.j2 b/tests/benchmarks/summary.html.j2
similarity index 100%
rename from tools/benchmarks/summary.html.j2
rename to tests/benchmarks/summary.html.j2
diff --git a/tests/check/CMakeLists.txt b/tests/check/CMakeLists.txt
index e26f9baf1..d62632af0 100644
--- a/tests/check/CMakeLists.txt
+++ b/tests/check/CMakeLists.txt
@@ -13,7 +13,7 @@ file(GLOB_RECURSE bf_srcs
file(GLOB_RECURSE bf_test_srcs
${CMAKE_SOURCE_DIR}/tests/*.h ${CMAKE_SOURCE_DIR}/tests/*.c
${CMAKE_SOURCE_DIR}/tests/*.hpp ${CMAKE_SOURCE_DIR}/tests/*.cpp
- ${CMAKE_SOURCE_DIR}/tools/benchmarks/*.hpp ${CMAKE_SOURCE_DIR}/tools/benchmarks/*.cpp
+ ${CMAKE_SOURCE_DIR}/tests/benchmarks/*.hpp ${CMAKE_SOURCE_DIR}/tests/benchmarks/*.cpp
)
set(bf_all_srcs ${bf_srcs} ${bf_test_srcs})
diff --git a/tools/benchmarks/results.html.j2 b/tools/benchmarks/results.html.j2
deleted file mode 100644
index 60d54a0c1..000000000
--- a/tools/benchmarks/results.html.j2
+++ /dev/null
@@ -1,819 +0,0 @@
-
-
-
-
-
-
- Benchmark Results
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Commit range
-
{{ first_commit_sha[:7] }}..{{ last_commit_sha[:7] }}
-
-
-
Host
-
{{ hostname }}
-
-
-
Commits
-
{{ n_commits }}
-
-
-
Results
- {% if stats.n_failures %}
-
{{ stats.n_successes }} ({{ stats.n_failures }} failures)
- {% else %}
-
{{ stats.n_successes }}
- {% endif %}
-
-
-
-
-
-
-
-
-
-
-
-
- Each row shows performance data for a single benchmark. Runtime is the CPU time
- measured for the most recent commit, and Instructions is the BPF instruction count
- of the generated program. The Δ (delta) columns show the percentage change compared
- to the mean of the previous N commits (e.g., "5 commits" compares against the average of
- commits 2-6). Only statistically significant changes (z-score > 2.5) are colored:
- green for improvements (faster or fewer instructions),
- red for regressions. Uncolored values indicate changes within
- normal variance. Click a benchmark name to jump to its historical chart below.
-
-
- {%- from "summary.html.j2" import render_table -%}
-
- {{ render_table(rows, terms, none, "bootstrap", true, ureg, none, get_class) }}
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/tools/bfoptimize b/tools/bfoptimize
new file mode 100755
index 000000000..77d716b36
--- /dev/null
+++ b/tools/bfoptimize
@@ -0,0 +1,672 @@
+#!/usr/bin/env python3
+"""bfoptimize — LLM-driven BPF bytecode optimization loop for bpfilter."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import datetime
+import json
+import multiprocessing
+import os
+import pathlib
+import shutil
+import subprocess
+from typing import Any
+
+import anthropic
+import diskcache # type: ignore[import-untyped]
+import git
+import numpy
+import rich.console
+import rich.table
+from claude_agent_sdk import (
+ AssistantMessage,
+ ClaudeAgentOptions,
+ ResultMessage,
+ TextBlock,
+ query,
+)
+
+DEFAULT_SOURCES = pathlib.Path(".")
+DEFAULT_BUILD_DIR = pathlib.Path("build")
+DEFAULT_CACHE_DIR = pathlib.Path(".cache/bfoptimize")
+DEFAULT_ITERATIONS = 10
+DEFAULT_MODEL = "claude-opus-4-6"
+DEFAULT_EFFORT = "high"
+CGEN_DIR = "src/libbpfilter/cgen"
+SHORT_SHA_LEN = 7
+
+console = rich.console.Console(log_path=False)
+
+
+# ---------------------------------------------------------------------------
+# History
+# ---------------------------------------------------------------------------
+
+
+class History:
+ """Persists attempt records and the current baseline SHA."""
+
+ def __init__(self, cache_dir: pathlib.Path) -> None:
+ self._path = cache_dir / "history.json"
+ self._data: dict[str, Any] = self._load()
+
+ def _load(self) -> dict[str, Any]:
+ if self._path.exists():
+ return json.loads(self._path.read_text())
+ return {"baseline_sha": None, "attempts": []}
+
+ def save(self) -> None:
+ self._path.parent.mkdir(parents=True, exist_ok=True)
+ self._path.write_text(json.dumps(self._data, indent=2))
+
+ @property
+ def baseline_sha(self) -> str | None:
+ return self._data.get("baseline_sha")
+
+ @baseline_sha.setter
+ def baseline_sha(self, sha: str) -> None:
+ self._data["baseline_sha"] = sha
+
+ @property
+ def attempts(self) -> list[dict[str, Any]]:
+ return self._data.get("attempts", [])
+
+ def next_id(self) -> int:
+ attempts = self.attempts
+ return (attempts[-1]["id"] + 1) if attempts else 1
+
+ def add_attempt(self, attempt: dict[str, Any]) -> None:
+ self._data.setdefault("attempts", []).append(attempt)
+ self.save()
+
+ def summary(self) -> str:
+ attempts = self.attempts
+ if not attempts:
+ return "No previous attempts."
+ lines = ["Previous optimization attempts (do not repeat these):"]
+ for a in attempts:
+ delta = (
+ f"{a['delta_time_pct']:+.1f}%"
+ if a.get("delta_time_pct") is not None
+ else "N/A"
+ )
+ lines.append(
+ f" #{a['id']} [{a['status']}] {a['description']} (weighted runtime delta: {delta})"
+ )
+ return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Source loading
+# ---------------------------------------------------------------------------
+
+
def load_cgen_sources(sources_dir: pathlib.Path) -> str:
    """Concatenate every .c/.h file under the cgen directory into one string.

    Each file is prefixed with a ``=== <relative path> ===`` header so the
    model can attribute code to files. Files are visited in sorted order for
    deterministic prompts; unreadable entries are skipped.
    """
    cgen_path = sources_dir / CGEN_DIR
    parts: list[str] = []
    for f in sorted(cgen_path.rglob("*.[ch]")):
        # rglob matches directories too if their names happen to end in .c/.h.
        if not f.is_file():
            continue
        rel = f.relative_to(sources_dir)
        try:
            content = f.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Skip binary or unreadable files rather than aborting planning.
            continue
        parts.append(f"=== {rel} ===\n{content}")
    return "\n\n".join(parts)
+
+
def load_last_benchmark(cache_dir: pathlib.Path) -> str:
    """Summarise the most recent bfbencher comparison as plain text.

    Reads ``<cache_dir>/last_bench.json``; returns a human-readable fallback
    string when the file is missing or malformed.
    """
    bench_path = cache_dir / "last_bench.json"
    if not bench_path.exists():
        return "No benchmark data available yet."
    try:
        data = json.loads(bench_path.read_text())
        base = data.get("base", "?")[:SHORT_SHA_LEN]
        ref = data.get("ref", "?")[:SHORT_SHA_LEN]
        lines = [f"Last benchmark results ({base} → {ref}):"]
        lines += [
            f" {b['name']}: base={b['base_time_ns']:.1f}ns"
            f" ref={b['ref_time_ns']:.1f}ns delta={b.get('delta_time_pct', 0):+.1f}%"
            for b in data.get("benchmarks", [])
        ]
        return "\n".join(lines)
    except Exception:
        return "Benchmark data unavailable."
+
+
+# ---------------------------------------------------------------------------
+# Planning phase (extended thinking)
+# ---------------------------------------------------------------------------
+
+
def plan_optimization(
    client: anthropic.Anthropic,
    cgen_sources: str,
    history_summary: str,
    benchmark_results: str,
    cache_dir: pathlib.Path,
    attempt_id: int,
    model: str = DEFAULT_MODEL,
    thinking: bool = True,
    effort: str = DEFAULT_EFFORT,
    context_1m: bool = False,
    hint: str | None = None,
) -> str:
    """Ask the model for a single optimization proposal over the cgen sources.

    Builds a prompt from the cgen sources, the latest benchmark summary and
    the attempt history, streams the response (printing thinking/text deltas
    to the console as they arrive), persists any thinking blocks to
    ``<cache_dir>/<attempt_id>-thinking.txt``, and returns the first text
    block of the final message.

    Raises StopIteration if the response contains no text block, and
    propagates any Anthropic SDK error; the caller catches and skips the
    iteration.
    """
    prompt = f"""You are optimizing the BPF bytecode generation logic in the bpfilter project.

The cgen directory generates BPF programs that run in the Linux kernel for packet filtering.
Every nanosecond saved matters — these programs execute for every packet received by the host.

## Source files in {CGEN_DIR}/

{cgen_sources}

## Current benchmark results

{benchmark_results}

## Optimization history

{history_summary}

## Task

Propose exactly ONE concrete optimization to the cgen code. Describe:
1. Which file(s) you will change and what specifically you will change
2. Why this will reduce the runtime of the generated BPF programs
3. Any risks or tricky edge cases to handle

Be specific and actionable. Do not repeat any previously attempted optimization.
Output only the optimization proposal — no code yet."""

    if hint:
        # The hint is advisory: appended as context, never enforced.
        prompt += (
            f"\n\n## Hint\n\n{hint}\n\n"
            "This is a direction to consider, not a constraint — "
            "you may propose a different optimization if you judge it more impactful."
        )

    # NOTE(review): the "output_config"/"thinking" payload shapes and the
    # context-1m beta header should be confirmed against the installed
    # anthropic SDK version.
    stream_kwargs: dict[str, Any] = {
        "model": model,
        "max_tokens": 128000,
        "messages": [{"role": "user", "content": prompt}],
        "output_config": {"effort": effort},
    }
    if thinking:
        stream_kwargs["thinking"] = {"type": "adaptive"}
    if context_1m:
        stream_kwargs["extra_headers"] = {"anthropic-beta": "context-1m-2025-08-07"}

    # Echo deltas live so the operator can follow the planning in real time.
    with client.messages.stream(**stream_kwargs) as stream:
        for event in stream:
            if event.type == "content_block_delta":
                if event.delta.type == "thinking_delta":
                    console.print(event.delta.thinking, end="")
                elif event.delta.type == "text_delta":
                    console.print(event.delta.text, end="")
        console.print("")
        response = stream.get_final_message()

    # Persist thinking blocks for post-hoc inspection
    thinking_texts = [b.thinking for b in response.content if b.type == "thinking"]
    if thinking_texts:
        thinking_path = cache_dir / f"{attempt_id}-thinking.txt"
        thinking_path.write_text("\n\n---\n\n".join(thinking_texts))  # type: ignore[arg-type]

    # Return only the text block: the proposal the executor will implement.
    return next(b.text for b in response.content if b.type == "text")  # type: ignore[union-attr]
+
+
+# ---------------------------------------------------------------------------
+# Execution phase (Agent SDK)
+# ---------------------------------------------------------------------------
+
+
async def execute_optimization(
    sources_dir: pathlib.Path,
    build_dir: pathlib.Path,
    optimization_plan: str,
    baseline_sha: str,
) -> str | None:
    """Run the agent to implement the optimization. Returns new HEAD sha if committed.

    Drives a Claude agent session (claude_agent_sdk.query) with a prompt that
    restricts edits to the cgen directory and requires green tests before
    committing. Success is detected purely by comparing the repository HEAD
    against the baseline once the session ends — the agent's own messages are
    only logged, never parsed.
    """
    ncpus = multiprocessing.cpu_count()
    abs_sources = sources_dir.resolve()
    abs_build = build_dir.resolve()

    prompt = f"""You are implementing a performance optimization to the bpfilter BPF bytecode generator.

## Optimization to implement

{optimization_plan}

## Rules

- Modify ONLY files under `{abs_sources}/{CGEN_DIR}/`. Do not touch any other files.
- Build: cmake -S {abs_sources} -B {abs_build} -DNO_DOCS=1 -DNO_TESTS=1 -DNO_CHECKS=1 \
-DCMAKE_BUILD_TYPE=release && make -C {abs_build} -j{ncpus} bpfilter
- Test: make -C {abs_build} -j{ncpus} unit e2e integration
- If tests pass: commit with `git -C {abs_sources} commit -am "daemon: cgen: "`
- If build or tests fail: diagnose and fix. If you cannot make tests pass, revert ALL your \
changes with `git -C {abs_sources} checkout -- {abs_sources}/{CGEN_DIR}/` and exit without \
committing.
- The current baseline is {baseline_sha[:SHORT_SHA_LEN]}. Only commit when tests are green."""

    # bypassPermissions: the agent builds, tests and commits non-interactively.
    async for message in query(
        prompt=prompt,
        options=ClaudeAgentOptions(
            cwd=str(abs_sources),
            permission_mode="bypassPermissions",
            allowed_tools=["Read", "Edit", "Write", "Bash", "Glob", "Grep"],
        ),
    ):
        # Log a truncated trace of the session for the operator.
        if isinstance(message, AssistantMessage):
            for block in message.content:
                if isinstance(block, TextBlock) and block.text.strip():
                    console.log(f"[dim]{block.text.strip()[:200]}[/dim]")
        elif isinstance(message, ResultMessage) and message.result:
            console.log(f"Agent done: {message.result[:200]}")

    # HEAD moved → the agent committed; unchanged HEAD → no commit was made.
    repo = git.Repo(sources_dir)
    new_sha = repo.head.commit.hexsha
    return new_sha if new_sha != baseline_sha else None
+
+
+# ---------------------------------------------------------------------------
+# Benchmark step
+# ---------------------------------------------------------------------------
+
+
def _benchmark_noise(bfbencher_cache_dir: pathlib.Path) -> dict[str, float]:
    """Estimate per-benchmark noise (CV) from bfbencher's accumulated cache.

    Iterates all cached commit results and computes the coefficient of variation
    (MAD/median, normalised via the 1.4826 consistency factor) for each
    benchmark. Benchmarks with fewer than 3 data points are excluded so that
    noise estimates are not based on a single outlier.

    Returns a mapping from benchmark name to CV. An empty dict is returned
    when the cache is absent or unreadable.
    """
    series: dict[str, list[float]] = {}
    try:
        cache = diskcache.Cache(bfbencher_cache_dir)
        try:
            for key in cache:
                val = cache.get(key)
                if not isinstance(val, dict) or not val.get("success"):
                    continue
                for r in val.get("results", []):
                    name = r.get("name", "")
                    t = float(r.get("cpu_time", 0))
                    if t > 0:
                        series.setdefault(name, []).append(t)
        finally:
            # Always release the cache handle — the previous code leaked it
            # whenever iteration raised.
            cache.close()
    except Exception:
        return {}

    noise: dict[str, float] = {}
    for name, times in series.items():
        if len(times) < 3:
            continue
        arr = numpy.array(times)
        median = float(numpy.median(arr))
        if median == 0:
            continue
        mad = float(numpy.median(numpy.abs(arr - median)))
        # 1.4826 makes MAD a consistent estimator of sigma for normal data.
        noise[name] = (mad * 1.4826) / median
    return noise
+
+
def run_benchmark(
    sources_dir: pathlib.Path,
    cache_dir: pathlib.Path,
    bfbencher: pathlib.Path,
    baseline_sha: str,
    result_sha: str,
    extra_args: list[str],
    attempt_id: int,
) -> float | None:
    """Compare baseline vs. result with bfbencher and return the noise-weighted
    mean runtime delta in percent, or None when the run or parsing fails."""
    json_path = cache_dir / "last_bench.json"
    cmd = [
        str(bfbencher),
        "compare",
        baseline_sha,
        result_sha,
        "--sources",
        str(sources_dir),
        "--cache-dir",
        str(cache_dir / "bfbencher"),
        "--json-output",
        str(json_path),
        "--fail-on-benchmark-error",
        *extra_args,
    ]

    console.log(
        f"Running bfbencher compare {baseline_sha[:SHORT_SHA_LEN]} → {result_sha[:SHORT_SHA_LEN]}"
    )
    env = {**os.environ, "PYTHONUNBUFFERED": "1"}
    if subprocess.run(cmd, text=True, env=env).returncode != 0:
        console.log("[red]bfbencher compare failed[/red]")
        return None

    # Keep a per-attempt copy of the raw comparison for the web UI.
    shutil.copy(json_path, cache_dir / f"bench-{attempt_id}.json")

    try:
        data = json.loads(json_path.read_text())
        noise = _benchmark_noise(cache_dir / "bfbencher")

        weighted: list[tuple[float, float]] = []
        for bench in data.get("benchmarks", []):
            delta = bench.get("delta_time_pct")
            if delta is None:
                continue
            cv = noise.get(bench["name"], 0.0)
            # Weight = 1 / (1 + CV): high-noise benchmarks contribute less.
            weighted.append((delta, 1.0 / (1.0 + cv)))

        if not weighted:
            return None

        volatile = sorted(
            ((name, cv) for name, cv in noise.items() if cv > 0.01),
            key=lambda item: -item[1],
        )
        if volatile:
            noisy_str = ", ".join(f"{name} (CV={cv:.1%})" for name, cv in volatile)
            console.log(f"[dim]Volatile benchmarks (down-weighted): {noisy_str}[/dim]")

        total_weight = sum(w for _, w in weighted)
        return sum(d * w for d, w in weighted) / total_weight
    except Exception as e:
        console.log(f"[red]Failed to parse benchmark output: {e}[/red]")
        return None
+
+
+# ---------------------------------------------------------------------------
+# Summary table
+# ---------------------------------------------------------------------------
+
+
def print_summary(history: History) -> None:
    """Render every recorded attempt as a rich table on the console."""
    # accepted → green, rejected_bench → red, anything else → yellow.
    status_colors = {"accepted": "green", "rejected_bench": "red"}

    table = rich.table.Table(title="bfoptimize summary", show_header=True)
    table.add_column("#", justify="right")
    table.add_column("Status", justify="center")
    table.add_column("Δ Runtime", justify="right")
    table.add_column("Description", style="cyan")

    for attempt in history.attempts:
        status = attempt["status"]
        color = status_colors.get(status, "yellow")
        pct = attempt.get("delta_time_pct")
        table.add_row(
            str(attempt["id"]),
            f"[{color}]{status}[/{color}]",
            "-" if pct is None else f"{pct:+.1f}%",
            attempt["description"][:80],
        )

    console.print(table)
+
+
+# ---------------------------------------------------------------------------
+# Main loop
+# ---------------------------------------------------------------------------
+
+
async def run_optimize(args: argparse.Namespace) -> None:
    """Main loop: plan → execute → benchmark → accept/reject, N times.

    Accepted attempts advance the baseline SHA; rejected ones hard-reset the
    repository back to the baseline. Every attempt (except planning failures,
    which are skipped without a record) is appended to the persisted History.
    """
    args.cache_dir.mkdir(parents=True, exist_ok=True)
    history = History(args.cache_dir)
    client = anthropic.Anthropic()
    repo = git.Repo(args.sources)

    # First run: the current HEAD becomes the baseline.
    if history.baseline_sha is None:
        history.baseline_sha = repo.head.commit.hexsha
        history.save()

    console.log(f"Baseline: {history.baseline_sha[:SHORT_SHA_LEN]}")

    bfbencher = (args.sources / "tests/benchmarks/bfbencher").resolve()

    # Pass-through flags forwarded verbatim to every bfbencher invocation.
    bench_extra: list[str] = []
    if args.host:
        bench_extra += ["--host", args.host]
    if args.bind_node is not None:
        bench_extra += ["--bind-node", str(args.bind_node)]
    if args.no_preempt:
        bench_extra += ["--no-preempt"]
    if args.cpu_pin is not None:
        bench_extra += ["--cpu-pin", str(args.cpu_pin)]
    if args.slice:
        bench_extra += ["--slice", args.slice]

    for i in range(args.iterations):
        attempt_id = history.next_id()
        console.log(
            f"\n[bold cyan]─── Iteration {i + 1}/{args.iterations}"
            f" (attempt #{attempt_id}) ───[/bold cyan]"
        )

        # Re-read each iteration: an accepted attempt moves the baseline.
        baseline_sha = history.baseline_sha
        assert baseline_sha is not None

        # ── Plan ──────────────────────────────────────────────────────────
        console.log("[bold]Planning...[/bold]")
        cgen_sources = load_cgen_sources(args.sources)
        benchmark_results = load_last_benchmark(args.cache_dir)
        try:
            plan = plan_optimization(
                client,
                cgen_sources,
                history.summary(),
                benchmark_results,
                args.cache_dir,
                attempt_id,
                model=args.model,
                thinking=args.thinking,
                effort=args.effort,
                context_1m=args.context_1m,
                hint=args.hint,
            )
        except Exception as e:
            # Planning failures are skipped without recording an attempt.
            console.log(f"[red]Planning failed: {e}[/red]")
            continue

        # First line of the proposal doubles as the attempt description.
        # NOTE(review): IndexError if the plan is empty/whitespace — confirm
        # plan_optimization can never return an empty string.
        description = plan.strip().splitlines()[0][:120]
        console.log(f"Proposal: {description}")

        # ── Execute ───────────────────────────────────────────────────────
        console.log("[bold]Executing...[/bold]")
        try:
            result_sha = await execute_optimization(
                args.sources,
                args.build_dir,
                plan,
                baseline_sha,
            )
        except Exception as e:
            console.log(f"[red]Agent execution failed: {e}[/red]")
            history.add_attempt(
                {
                    "id": attempt_id,
                    "description": description,
                    "status": "rejected_tests",
                    "baseline_sha": baseline_sha,
                    "result_sha": None,
                    "delta_time_pct": None,
                    "timestamp": datetime.datetime.now().isoformat(),
                }
            )
            continue

        # No commit means the agent could not make the tests pass.
        if result_sha is None:
            console.log("[yellow]Agent did not commit — rejected_tests[/yellow]")
            history.add_attempt(
                {
                    "id": attempt_id,
                    "description": description,
                    "status": "rejected_tests",
                    "baseline_sha": baseline_sha,
                    "result_sha": None,
                    "delta_time_pct": None,
                    "timestamp": datetime.datetime.now().isoformat(),
                }
            )
            continue

        console.log(f"Agent committed: {result_sha[:SHORT_SHA_LEN]}")

        # ── Benchmark ─────────────────────────────────────────────────────
        console.log("[bold]Benchmarking...[/bold]")
        delta = run_benchmark(
            args.sources,
            args.cache_dir,
            bfbencher,
            baseline_sha,
            result_sha,
            bench_extra,
            attempt_id,
        )

        # Accept only a strictly negative (faster) weighted mean delta;
        # a failed or neutral benchmark discards the commit entirely.
        if delta is None or delta >= 0:
            delta_str = f"{delta:+.1f}%" if delta is not None else "N/A"
            console.log(f"[red]Rejected (bench): mean delta {delta_str}[/red]")
            repo.git.reset("--hard", baseline_sha)
            status = "rejected_bench"
        else:
            console.log(f"[green]Accepted: mean delta {delta:+.1f}%[/green]")
            history.baseline_sha = result_sha
            status = "accepted"

        history.add_attempt(
            {
                "id": attempt_id,
                "description": description,
                "status": status,
                "baseline_sha": baseline_sha,
                "result_sha": result_sha,
                "delta_time_pct": delta,
                "timestamp": datetime.datetime.now().isoformat(),
            }
        )

    print_summary(history)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
def main() -> None:
    """CLI entry point: parse options and run the optimization loop."""
    cli = argparse.ArgumentParser(
        prog="bfoptimize",
        description="LLM-driven BPF bytecode optimization loop for bpfilter.",
    )

    # Declarative option table: (flags, add_argument keyword arguments).
    option_table: list[tuple[tuple[str, ...], dict[str, Any]]] = [
        (
            ("--iterations", "-n"),
            dict(
                type=int,
                default=DEFAULT_ITERATIONS,
                help=f"number of optimization iterations to run (default: {DEFAULT_ITERATIONS})",
            ),
        ),
        (
            ("--sources",),
            dict(
                type=pathlib.Path,
                default=DEFAULT_SOURCES,
                help=f'path to the bpfilter source directory (default: "{DEFAULT_SOURCES}")',
            ),
        ),
        (
            ("--build-dir",),
            dict(
                type=pathlib.Path,
                default=DEFAULT_BUILD_DIR,
                help=f'cmake build directory (default: "{DEFAULT_BUILD_DIR}")',
            ),
        ),
        (
            ("--cache-dir",),
            dict(
                type=pathlib.Path,
                default=DEFAULT_CACHE_DIR,
                help=f'directory for history and benchmark cache (default: "{DEFAULT_CACHE_DIR}")',
            ),
        ),
        (
            ("--host",),
            dict(
                type=str,
                default=None,
                help="remote host for benchmarking (passed through to bfbencher)",
            ),
        ),
        (
            ("--bind-node",),
            dict(
                type=int,
                default=None,
                help="CPU/memory NUMA node to bind benchmarks to",
            ),
        ),
        (
            ("--no-preempt",),
            dict(
                action="store_true",
                default=False,
                help="run benchmarks with real-time scheduling (chrt -f 99)",
            ),
        ),
        (
            ("--cpu-pin",),
            dict(
                type=int,
                default=None,
                help="CPU to pin benchmark to",
            ),
        ),
        (
            ("--slice",),
            dict(
                type=str,
                default=None,
                help="systemd slice for benchmark execution",
            ),
        ),
        (
            ("--model",),
            dict(
                type=str,
                default=DEFAULT_MODEL,
                choices=["claude-opus-4-6", "claude-sonnet-4-6"],
                help=f"Claude model to use for planning (default: {DEFAULT_MODEL})",
            ),
        ),
        (
            ("--thinking",),
            dict(
                action=argparse.BooleanOptionalAction,
                default=True,
                help="enable adaptive thinking during planning (default: enabled)",
            ),
        ),
        (
            ("--effort",),
            dict(
                type=str,
                default=DEFAULT_EFFORT,
                choices=["low", "medium", "high", "max"],
                help=f"effort level for the planning call (default: {DEFAULT_EFFORT}; max is Opus only)",
            ),
        ),
        (
            ("--context-1m",),
            dict(
                action="store_true",
                default=False,
                help="enable 1M context window beta (claude-opus-4-6 and claude-sonnet-4-6 only)",
            ),
        ),
        (
            ("--hint",),
            dict(
                type=str,
                default=None,
                help="optional direction for the model (e.g. 'look into XXX'); it is provided as context but not enforced",
            ),
        ),
    ]
    for flags, spec in option_table:
        cli.add_argument(*flags, **spec)

    args = cli.parse_args()

    try:
        asyncio.run(run_optimize(args))
    except KeyboardInterrupt:
        console.log("Interrupted by user")
        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/bfoptimize-web b/tools/bfoptimize-web
new file mode 100755
index 000000000..fefdb952a
--- /dev/null
+++ b/tools/bfoptimize-web
@@ -0,0 +1,1174 @@
+#!/usr/bin/env python3
+"""bfoptimize-web — Local web UI for bfoptimize."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import pathlib
+import signal
+from typing import Any, AsyncGenerator
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse, PlainTextResponse, StreamingResponse
+from pydantic import BaseModel
+
+# ---------------------------------------------------------------------------
+# Global state
+# ---------------------------------------------------------------------------
+
+_process: asyncio.subprocess.Process | None = None
+_log_lines: list[str] = []
+_subscribers: list[asyncio.Queue[str | None]] = []
+
+# Updated at startup from CLI args; overridden per-run from POST /run body
+_cache_dir: pathlib.Path = pathlib.Path(".cache/bfoptimize")
+_sources_dir: pathlib.Path = pathlib.Path(".")
+_bfoptimize: pathlib.Path = pathlib.Path(__file__).parent / "bfoptimize"
+
+app = FastAPI()
+
+# ---------------------------------------------------------------------------
+# Request models
+# ---------------------------------------------------------------------------
+
+
class RunRequest(BaseModel):
    """Body of POST /run — mirrors the bfoptimize CLI options one-to-one."""

    iterations: int = 10  # number of optimization iterations
    sources: str = "."  # bpfilter source checkout
    build_dir: str = "build"  # cmake build directory
    cache_dir: str = ".cache/bfoptimize"  # history/benchmark cache location
    host: str | None = None  # remote benchmark host (bfbencher --host)
    bind_node: int | None = None  # NUMA node to bind benchmarks to
    no_preempt: bool = False  # run benchmarks under real-time scheduling
    cpu_pin: int | None = None  # CPU to pin the benchmark to
    slice: str | None = None  # systemd slice for benchmark execution
    model: str = "claude-opus-4-6"  # planning model name
    thinking: bool = True  # enable adaptive thinking during planning
    effort: str = "high"  # planning effort level
    context_1m: bool = False  # enable the 1M-context beta
    hint: str | None = None  # optional direction hint for the planner
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
def _build_cmd(body: RunRequest) -> list[str]:
    """Translate a RunRequest into the bfoptimize argv (flag order preserved)."""
    cmd = [
        str(_bfoptimize),
        "--iterations",
        str(body.iterations),
        "--sources",
        body.sources,
        "--build-dir",
        body.build_dir,
        "--cache-dir",
        body.cache_dir,
    ]
    if body.host:
        cmd.extend(("--host", body.host))
    if body.bind_node is not None:
        cmd.extend(("--bind-node", str(body.bind_node)))
    if body.no_preempt:
        cmd.append("--no-preempt")
    if body.cpu_pin is not None:
        cmd.extend(("--cpu-pin", str(body.cpu_pin)))
    if body.slice:
        cmd.extend(("--slice", body.slice))
    cmd.extend(("--model", body.model))
    cmd.append("--thinking" if body.thinking else "--no-thinking")
    cmd.extend(("--effort", body.effort))
    if body.context_1m:
        cmd.append("--context-1m")
    if body.hint:
        cmd.extend(("--hint", body.hint))
    return cmd
+
+
async def _broadcast_output(proc: asyncio.subprocess.Process) -> None:
    """Fan the child's stdout out to the log buffer and all SSE subscribers.

    A trailing ``None`` is pushed to every queue once the process exits so
    streaming generators know to stop.
    """
    assert proc.stdout is not None
    while True:
        raw = await proc.stdout.readline()
        if not raw:  # EOF: child closed stdout
            break
        line = raw.decode(errors="replace").rstrip()
        _log_lines.append(line)
        for queue in list(_subscribers):
            await queue.put(line)
    await proc.wait()
    for queue in list(_subscribers):
        await queue.put(None)
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+
@app.get("/", response_class=HTMLResponse)
async def index() -> str:
    """Serve the embedded single-page frontend (module-level HTML constant)."""
    return HTML
+
+
@app.post("/run", status_code=202)
async def start_run(body: RunRequest) -> dict[str, str]:
    """Launch a bfoptimize subprocess; 409 if one is already running.

    Returns 202 immediately; output is streamed via GET /stream.
    """
    global _process, _log_lines, _cache_dir, _sources_dir
    if _process is not None and _process.returncode is None:
        raise HTTPException(409, "Already running")
    _cache_dir = pathlib.Path(body.cache_dir)
    _sources_dir = pathlib.Path(body.sources)
    _log_lines = []
    cmd = _build_cmd(body)
    _process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,  # interleave stderr into one stream
    )
    # Keep a strong reference to the broadcaster task: the event loop only
    # holds a weak reference to tasks, so a bare create_task() result can be
    # garbage-collected mid-flight and silently stop streaming output.
    start_run._broadcast_task = asyncio.create_task(  # type: ignore[attr-defined]
        _broadcast_output(_process)
    )
    return {"status": "started"}
+
+
@app.delete("/run")
async def stop_run() -> dict[str, str]:
    """Ask the running bfoptimize process to terminate (SIGTERM)."""
    if _process is None or _process.returncode is not None:
        raise HTTPException(404, "No running process")
    try:
        _process.send_signal(signal.SIGTERM)
    except ProcessLookupError:
        # The process exited between the check above and the signal.
        raise HTTPException(404, "No running process")
    return {"status": "stopping"}
+
+
@app.get("/stream")
async def stream() -> StreamingResponse:
    """SSE endpoint: replay the log backlog, then follow live output."""

    async def generator() -> AsyncGenerator[str, None]:
        q: asyncio.Queue[str | None] = asyncio.Queue()
        _subscribers.append(q)
        try:
            # Subscribe first, then snapshot the backlog. There is no await
            # between append() and list(), so on a single-threaded event loop
            # no line can land in both the snapshot and the queue (no dupes);
            # lines produced during the replay yields go only to the queue.
            for line in list(_log_lines):
                yield f"data: {line}\n\n"
            # No active run: the backlog is all there is.
            if _process is None or _process.returncode is not None:
                return
            while True:
                try:
                    item = await asyncio.wait_for(q.get(), timeout=15.0)
                except asyncio.TimeoutError:
                    # SSE comment line — stops proxies closing an idle stream.
                    yield ": keepalive\n\n"
                    continue
                if item is None:
                    break  # producer signalled process exit
                yield f"data: {item}\n\n"
        finally:
            try:
                _subscribers.remove(q)
            except ValueError:
                pass  # already removed
    return StreamingResponse(generator(), media_type="text/event-stream")
+
+
@app.get("/history")
async def get_history() -> Any:
    """Return the persisted bfoptimize history, or an empty skeleton."""
    path = _cache_dir / "history.json"
    if path.exists():
        return json.loads(path.read_text())
    return {"baseline_sha": None, "attempts": []}
+
+
@app.delete("/history")
async def reset_history() -> dict[str, str]:
    """Delete every cached artifact (history, bench JSONs, thinking logs)."""
    import shutil

    # The cache directory may not exist yet (no run started): iterating it
    # unconditionally raised FileNotFoundError. Treat "nothing there" as
    # already reset.
    if _cache_dir.exists():
        for entry in _cache_dir.iterdir():
            if entry.is_dir():
                shutil.rmtree(entry)
            else:
                entry.unlink()
    return {"status": "reset"}
+
+
@app.get("/status")
async def get_status() -> dict[str, Any]:
    """Report whether a bfoptimize run is idle, running, or stopped."""
    proc = _process
    if proc is None:
        state, rc = "idle", None
    elif proc.returncode is None:
        state, rc = "running", None
    else:
        state, rc = "stopped", proc.returncode
    return {"state": state, "returncode": rc}
+
+
@app.get("/bench/{bench_id}")
async def get_bench(bench_id: int) -> Any:
    """Return the stored benchmark comparison JSON for one attempt."""
    bench_file = _cache_dir / f"bench-{bench_id}.json"
    if not bench_file.exists():
        raise HTTPException(404, "Bench data not found")
    return json.loads(bench_file.read_text())
+
+
@app.get("/diff/{bench_id}")
async def get_diff(bench_id: int) -> PlainTextResponse:
    """Return the git diff of one attempt, restricted to the cgen directory."""
    history_path = _cache_dir / "history.json"
    if not history_path.exists():
        raise HTTPException(404, "No history")
    history = json.loads(history_path.read_text())
    attempt = next(
        (a for a in history.get("attempts", []) if a["id"] == bench_id), None
    )
    if attempt is None or not attempt.get("result_sha"):
        raise HTTPException(404, "Attempt not found or has no result")
    proc = await asyncio.create_subprocess_exec(
        "git",
        "-C",
        str(_sources_dir.resolve()),
        "diff",
        attempt["baseline_sha"],
        attempt["result_sha"],
        "--",
        "src/libbpfilter/cgen/",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Previously a git failure (bad SHA, not a repo, ...) came back as an
        # empty 200 body; surface it as an explicit server error instead.
        raise HTTPException(
            500, f"git diff failed: {stderr.decode(errors='replace').strip()}"
        )
    return PlainTextResponse(stdout.decode(errors="replace"))
+
+
+# ---------------------------------------------------------------------------
+# Embedded frontend
+# ---------------------------------------------------------------------------
+
+HTML = """
+
+
+
+
+bfoptimize
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Stop
+
+
+
+
+
+
+ Idle
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Runtime delta per attempt
+ negative = faster than baseline
+
+
+
+
+
+
+
+
+
+
Benchmark history
+
+
+
+ ◕
+ No results yet — start a run to populate this section
+
+
+
+
Benchmark comparison
+
+
+
+ Benchmark
+ Base (ns) Ref (ns)
+ ΔTime (ns) ΔTime %
+ Base Insn Ref Insn
+ ΔInsn ΔInsn %
+
+
+
+
+
+
+
+ Code changes
+ src/libbpfilter/cgen/
+
+
+
+
+
+
+
+
+
+"""
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
def main() -> None:
    """Parse CLI options and serve the web UI on localhost."""
    global _cache_dir, _sources_dir

    cli = argparse.ArgumentParser(
        prog="bfoptimize-web",
        description="Local web UI for bfoptimize.",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=8080,
        help="TCP port to listen on (default: 8080)",
    )
    cli.add_argument(
        "--sources",
        type=pathlib.Path,
        default=pathlib.Path("."),
        help='bpfilter source directory for git diff (default: ".")',
    )
    cli.add_argument(
        "--cache-dir",
        type=pathlib.Path,
        default=pathlib.Path(".cache/bfoptimize"),
        help='bfoptimize cache directory (default: ".cache/bfoptimize")',
    )
    opts = cli.parse_args()

    # Publish the chosen paths to the module globals the endpoints read.
    _cache_dir = opts.cache_dir
    _sources_dir = opts.sources

    # Bind to loopback only: this UI can start arbitrary subprocesses.
    uvicorn.run(app, host="127.0.0.1", port=opts.port)
+
+
+if __name__ == "__main__":
+ main()