From ec7b5da613e63407a24bb8ea014bce14b540f1b2 Mon Sep 17 00:00:00 2001
From: holo <helebest@gmail.com>
Date: Mon, 29 Jun 2026 22:04:07 +0800
Subject: [PATCH] feat(ci): reward-hacking integrity gate (delivery-loop
 hardening #1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turn the prose "don't weaken the tests / don't lower thresholds" rule into a
deterministic check the in-loop agent cannot fool. scripts/check-gate-integrity.mjs
diffs the branch against its merge base and fails if the verification itself was
weakened — lowered coverage thresholds, raised bundle budgets, raised e2e retries,
deleted/disabled tests or stripped assertions, or any edit to the gate/CI machinery
(the script, .github/workflows/**, fixer.md's forbidden list). A maintainer allows a
deliberate change with a visible `gate-change` PR label. Added as a PR-scoped required
CI job; mirrored locally via `npm run check:gate`.
---
 .claude/agents/fixer.md                       |   2 +
 .../dikw-web-delivery-workflow/SKILL.md       |   5 +-
 .github/workflows/ci.yml                      |  29 ++
 CHANGELOG.md                                  |  19 +
 CLAUDE.md                                     |   3 +-
 docs/adr/0005-delivery-loop-hardening.md      |  88 +++++
 docs/review-rubric.md                         |   6 +-
 docs/tdd.md                                   |   2 +
 package.json                                  |   3 +-
 scripts/check-gate-integrity.mjs              | 334 ++++++++++++++++++
 scripts/check-gate-integrity.test.mjs         | 332 +++++++++++++++++
 vite.config.ts                                |   6 +-
 12 files changed, 824 insertions(+), 5 deletions(-)
 create mode 100644 docs/adr/0005-delivery-loop-hardening.md
 create mode 100644 scripts/check-gate-integrity.mjs
 create mode 100644 scripts/check-gate-integrity.test.mjs
diff --git a/.claude/agents/fixer.md b/.claude/agents/fixer.md
index f9abee2..1f038d4 100644
--- a/.claude/agents/fixer.md
+++ b/.claude/agents/fixer.md
@@ -20,3 +20,5 @@ Forbidden — these "fix the test, not the code" and are never allowed:
 - Lowering the `vite.config.ts` coverage thresholds.
 
 If the only path to green is one of the forbidden moves, STOP: report the real root cause and why it can't be fixed cleanly. Do not weaken the check — escalate instead.
+
+This list is not honor-system only: the `gate-integrity` CI job (`npm run check:gate`) mechanically fails any PR that lowers a coverage threshold, raises a bundle budget, raises e2e retries, or deletes/disables a test / strips its assertions — so a forbidden move won't merge even if attempted. Fix the cause; don't route around the gate.
diff --git a/.claude/skills/dikw-web-delivery-workflow/SKILL.md b/.claude/skills/dikw-web-delivery-workflow/SKILL.md
index 62703d4..ac01464 100644
--- a/.claude/skills/dikw-web-delivery-workflow/SKILL.md
+++ b/.claude/skills/dikw-web-delivery-workflow/SKILL.md
@@ -52,7 +52,10 @@ a fresh agent that didn't write the code).
    the **same** change — not "later". (Disk `.md` is English-only in this repo.)
 
 8. **Final gate + PR.** `npm.cmd run verify` (lint + format:check + typecheck + coverage + build + e2e)
-   green, then `npm.cmd run check:bundle` (gzip budget; also runs in CI). Bump
+   green, then `npm.cmd run check:bundle` (gzip budget) and `npm.cmd run check:gate`
+   (reward-hacking gate; both also run in CI — the latter as the required
+   `gate-integrity` job). If `check:gate` flags a *deliberate* weakening, a maintainer
+   adds the `gate-change` label to the PR; never route around it. Bump
    `package.json` version (3-digit SemVer) and add a `CHANGELOG.md` entry when
    warranted. Branch with a descriptive name, commit `<type>(<scope>): <subject>`,
    push, `gh pr create`.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 29f9aec..1a02db5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -165,6 +165,35 @@ jobs:
           severity: HIGH,CRITICAL
           version: ${{ env.TRIVY_VERSION }}
 
+  gate-integrity:
+    # Reward-hacking gate: fails a PR that weakens the verification ITSELF — lowers a
+    # coverage threshold (vite.config.ts), raises a bundle budget (check-bundle.mjs),
+    # raises e2e retries (playwright.config.ts), deletes/disables a test or strips its
+    # assertions, or edits the gate/CI machinery (this script, .github/workflows/**,
+    # fixer.md's forbidden list). A maintainer can allow a deliberate change with the
+    # visible, auditable `gate-change` PR label. PR-scoped on purpose: it is a required
+    # status check, so nothing merges to main without it, and the label context only
+    # exists on a PR. Pure Node script (no deps) — no `npm ci` needed.
+    name: Gate integrity (no reward-hacking)
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@v7
+        with:
+          # Full history so `<base>...HEAD` can resolve the merge base.
+          fetch-depth: 0
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+
+      - name: Check gate integrity
+        env:
+          GATE_BASE_REF: ${{ github.event.pull_request.base.sha }}
+          GATE_HAS_OVERRIDE: ${{ contains(github.event.pull_request.labels.*.name, 'gate-change') }}
+        run: node scripts/check-gate-integrity.mjs
+
   release:
     name: Release tag (dikw-web-v*)
     # Cuts a GitHub Release tagged dikw-web-v<package.json version> once the whole
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e40b86..57ee876 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,25 @@ file format introduced in `[0.0.1.0]` was dropped.
 
 ## [Unreleased]
 
+## [0.8.7] - 2026-06-29
+
+### Added
+
+- **Reward-hacking gate (`gate-integrity` CI job / `npm run check:gate`).** A
+  deterministic check the in-loop agent cannot fool, hardening the delivery loop
+  against an agent weakening the verification to make a check go green.
+  `scripts/check-gate-integrity.mjs` diffs the branch against its merge base and
+  fails the PR if a coverage threshold was lowered, the coverage `exclude` list
+  grew, a bundle budget was raised, e2e `retries` were raised, a test was
+  deleted/disabled or stripped of assertions, or the gate/CI machinery itself
+  (the script, `.github/workflows/**`, `fixer.md`'s forbidden list) was edited.
+  The good direction (raising a threshold, adding tests) is always allowed; a
+  deliberate weakening is allowed only when a maintainer adds the visible
+  `gate-change` PR label. Until now "don't weaken the tests" was prose only in
+  `CLAUDE.md` / `docs/review-rubric.md` / `fixer.md` — the weakest defense. The job
+  is PR-scoped and a required status check. See
+  `docs/adr/0005-delivery-loop-hardening.md`.
+
 ## [0.8.6] - 2026-06-29
 
 ### Fixed
diff --git a/CLAUDE.md b/CLAUDE.md
index 11a3212..5dd68fb 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -72,7 +72,7 @@ End-to-end loop from request to landed PR. Run autonomously for behavior changes
 4. **Final pass.** Run `/code-review`, scored against `docs/review-rubric.md` (the project-specific principles), and resolve every finding before continuing.
 5. **Verify in the browser.** For UI changes, invoke the `dikw-web-verify-frontend` skill: navigate the changed routes via Chrome MCP, confirm a clean runtime console on real data, exercise the affected interactions, and run the `docs/ui-checklist.md` rubric in light + dark — confirm the change actually rendered as intended, not just that unit tests pass.
 6. **Update markdown docs.** Walk `CLAUDE.md`, `README.md`, and the relevant `docs/*.md` against the diff; any contract, behavior, command, or doc index that drifted must be updated in the same change. Don't leave docs to "catch up later".
-7. **Create the PR.** Branch with a descriptive name, commit with `<type>(<scope>): <subject>` matching the project's existing convention (see recent `git log`), push, then `gh pr create`. CI auto-runs lint + format:check + typecheck + coverage + build + e2e + bundle budget + security scans (npm audit, gitleaks, Trivy, CodeQL). Bump `package.json.version` manually (standard 3-digit SemVer) when the change warrants it, and add an entry to `CHANGELOG.md` under the matching version heading. On merge to `main`, CI's `release` job auto-cuts a GitHub Release tagged `dikw-web-v<version>` from `package.json.version` (idempotent — only a version bump creates a new tag; notes come from the matching CHANGELOG section via `scripts/changelog-notes.mjs`), so a deliberate version bump is what publishes a release.
+7. **Create the PR.** Branch with a descriptive name, commit with `<type>(<scope>): <subject>` matching the project's existing convention (see recent `git log`), push, then `gh pr create`. CI auto-runs lint + format:check + typecheck + coverage + build + e2e + bundle budget + the `gate-integrity` reward-hacking gate (`check:gate`) + security scans (npm audit, gitleaks, Trivy, CodeQL). Bump `package.json.version` manually (standard 3-digit SemVer) when the change warrants it, and add an entry to `CHANGELOG.md` under the matching version heading. On merge to `main`, CI's `release` job auto-cuts a GitHub Release tagged `dikw-web-v<version>` from `package.json.version` (idempotent — only a version bump creates a new tag; notes come from the matching CHANGELOG section via `scripts/changelog-notes.mjs`), so a deliberate version bump is what publishes a release.
 8. **Monitor CI and PR comments; resolve as they surface, then merge.** After pushing, actively watch both signals — don't passively wait, and don't batch resolution to merge time.
    - **CI rollup**: `gh pr checks <N>` (or `--watch` to block until terminal). Failing job logs: `gh run view <run-id> --log-failed`. Flaky e2e gets **one** rerun, not five (see [[project_flaky_graph_e2e]] in memory for which test).
    - **PR review prose**: `gh api repos/{owner}/{repo}/pulls/{N}/reviews` for review bodies, `.../pulls/{N}/comments` for inline threads, `.../issues/{N}/comments` for top-level CodeRabbit summaries. `gh pr checks` shows pass/fail only, not the prose.
@@ -98,6 +98,7 @@ Windows shell: use `npm.cmd` (not `npm`) when invoking from PowerShell.
 - `npm.cmd run build` — typecheck, `vite build` (browser bundle to `dist/`), then `build:server` (esbuild bundles `server/agent/standalone.ts` to `dist-server/standalone.mjs` with `--packages=external`, since ADK + MikroORM + native sqlite3 can't be bundled — so the sidecar imports its deps from a production `node_modules` at runtime). `npm.cmd start` runs that standalone sidecar.
 - `npm.cmd run verify` — full gate: lint + format:check + typecheck + coverage + build + e2e. Run before committing behavior changes.
 - `npm.cmd run check:bundle` — gzip bundle budget (entry JS / total JS / CSS) against `dist/`; runs in CI after the verify gate. Raise the budgets in `scripts/check-bundle.mjs` deliberately, like the coverage thresholds — don't bump to pass.
+- `npm.cmd run check:gate` — **reward-hacking gate** (`scripts/check-gate-integrity.mjs`): diffs the branch against its merge base (`origin/main` locally; the PR base in CI) and fails if the verification *itself* was weakened — a lowered coverage threshold, a grown coverage `exclude`, a raised bundle budget, raised e2e `retries`, a deleted/disabled test or removed assertions, or any edit to the gate/CI machinery (the script, `.github/workflows/**`, `fixer.md`'s forbidden list). The good direction (raising a threshold, adding a test) is always allowed; a deliberate weakening is allowed only when a maintainer adds the visible `gate-change` label to the PR (`GATE_HAS_OVERRIDE`). Runs as the **PR-scoped required CI job `gate-integrity`** — it is the deterministic backstop for the prose "don't weaken the tests" rule in `fixer.md` / `docs/review-rubric.md`. See `docs/adr/0005-delivery-loop-hardening.md`.
 - `npm.cmd run smoke:core` — live-core `/v1` contract smoke (`scripts/smoke-core.mjs`, the `dikw-web-smoke-core` skill). Not a CI gate; needs a reachable core. Run after a `dikw-core` bump or before a demo.
 - `npm.cmd run live:verify` — full live integration verification of the working tree against a **real `dikw-core`** (GHCR image, Postgres backend) on dynamic ports: boot → seed the write pipeline (import→ingest→synth→lint, reusing `buildImportBundle` + `DikwClient`) → read-contract smoke → browser read-route e2e (Playwright `live` project) → agent↔core check → teardown. Needs Docker + `.env.core` (LLM/embedding keys, git-ignored; copy `.env.core.example`). Not a CI gate (boots a container, calls live LLMs); `live-integration.yml` runs it on dispatch/nightly/label. Sub-commands: `live:up` / `live:seed` / `live:smoke` / `live:down` (`-- --volumes` to drop data). `-- --keep` leaves the stack up. See `docs/integration-verification.md`.
 
diff --git a/docs/adr/0005-delivery-loop-hardening.md b/docs/adr/0005-delivery-loop-hardening.md
new file mode 100644
index 0000000..56e670d
--- /dev/null
+++ b/docs/adr/0005-delivery-loop-hardening.md
@@ -0,0 +1,88 @@
+# 5. Delivery-loop hardening
+
+This ADR records the decision to harden dikw-web's request→merged-PR **delivery
+loop** against the failure modes of agentic, low-supervision development. It is the
+umbrella record for a set of changes landing as separate PRs; this first revision
+covers the **reward-hacking gate** (`gate-integrity` / `npm run check:gate`).
+
+## Status
+
+Accepted (2026-06-29). Living document — later sections are added as the remaining
+hardening items land (trustworthy green signal, measured perf/a11y, automated
+CI-watch-and-fix, loop observability).
+
+## Context
+
+The loop already encodes most manual + pre-merge checks as skills (the
+`dikw-web-delivery-workflow`, `dikw-web-verify-frontend`, `dikw-web-smoke-core`
+skills; the `docs/ui-checklist.md` + `docs/review-rubric.md` rubrics) and runs an
+independent review on a different model (`/codex:review`) before merge. The CI
+`verify` gate (lint / format / typecheck / coverage / build / e2e) plus
+`check:bundle` is deterministic and blocks merge.
+
+The gap is the one an autonomous agent exploits under pressure to make a check go
+green: **reward hacking**. The verification's own knobs live in editable source —
+coverage thresholds in `vite.config.ts`, bundle budgets in `scripts/check-bundle.mjs`,
+e2e retries in `playwright.config.ts` — and a test can be deleted, `.skip`-ped, or
+stripped of assertions. The only thing standing in the way was prose: "don't weaken
+the tests" in `CLAUDE.md`, `docs/review-rubric.md`, and the `fixer` agent's forbidden
+list. Prose is the weakest possible defense: the agent that edits the threshold also
+passes the check that reads it.
+
+## Decision
+
+Add a deterministic gate the in-loop agent **cannot fool**:
+`scripts/check-gate-integrity.mjs`, run as the PR-scoped required CI job
+`gate-integrity` and mirrored locally by `npm run check:gate`.
+
+It diffs the branch against its merge base (`<base>...HEAD`) and fails if the
+verification itself was weakened:
+
+- **coverage-threshold-lowered** — any of statements/branches/functions/lines in
+  `vite.config.ts` dropped.
+- **coverage-exclude-grown** — the coverage `exclude` array gained entries (drops
+  files out of the denominator).
+- **bundle-budget-raised** — any `*GzipKB` budget in `check-bundle.mjs` rose.
+- **e2e-retries-raised** — the CI branch of `retries` in `playwright.config.ts` rose
+  (a higher retry count masks new flakes).
+- **test-file-deleted** / **test-skip-added** / **test-assertions-removed** — a test
+  file was removed, gained a `.skip`/`.only`/`.todo`/`xit`/`xdescribe` marker, or lost
+  `expect()` calls. Only *in-place* modifications are diffed; a new file is new
+  coverage, and a rename-that-guts shows up as a separate delete (still caught).
+- **gate-machinery-modified** — any edit to the gate script itself,
+  `.github/workflows/**`, or `fixer.md`'s forbidden list. This is the **self-guard**:
+  an agent cannot quietly delete a check or drop the CI job, because doing so trips the
+  gate.
+
+The "good direction" — raising a threshold, tightening a budget, adding tests — is
+always allowed without ceremony. A *deliberate* weakening is allowed only when a
+maintainer attaches the visible, auditable **`gate-change`** label to the PR
+(`GATE_HAS_OVERRIDE=true`); the job then passes but prints what it allowed, for the
+audit trail. The reviewer's remaining job is to judge whether a labelled change is
+justified.
+
+This mirrors the "a second check the agent does not control" pattern: the gate is the
+mechanical backstop for the prose rule, and it guards its own machinery so it cannot
+be edited away.
+
+## Consequences
+
+- The PR that *introduces or changes* the gate machinery (including this one) trips
+  `gate-machinery-modified` by design and must carry the `gate-change` label. That is
+  the intended, visible audit point, not a bug.
+- The skip/assertion heuristics are deliberately dumb (regex counts), so they can have
+  false positives (e.g. a test file that contains skip markers as string fixtures —
+  handled by only diffing in-place modifications with a non-null base). The
+  `gate-change` label is the escape hatch; the gate is a backstop, not a proof system.
+- The job is PR-scoped (`if: github.event_name == 'pull_request'`) because the label
+  context only exists on a PR, and a required status check already blocks every merge
+  to `main`. It is therefore **not** added to `release.needs` (a skipped dependency
+  would skip `release`).
+
+## Excluded (deliberately, for "simplicity first")
+
+The fuller autonomous-loop machinery — an on-disk `STATUS.md` + `loop_state.json`
+state protocol, container `--network none` isolation, per-token cost accounting — is
+out of scope. dikw-web's loop is interactive Claude Code with worktree isolation
+already available for background jobs and no destructive operations; that machinery
+would add complexity without proportional value here.
diff --git a/docs/review-rubric.md b/docs/review-rubric.md
index 0f825df..7ce07c7 100644
--- a/docs/review-rubric.md
+++ b/docs/review-rubric.md
@@ -19,7 +19,11 @@ Each item is pass/fail against the **diff under review**.
   orphaned imports/vars are removed.
 - [ ] **Goal-driven / TDD.** Behavior changes land test-first (see `docs/tdd.md`).
   Coverage thresholds in `vite.config.ts` (60/45/55/60) are **not lowered** to
-  pass — tests are added/repaired instead.
+  pass — tests are added/repaired instead. This is now machine-enforced by the
+  `gate-integrity` CI job (`npm run check:gate`): lowering a threshold, raising a
+  bundle budget, raising e2e retries, or deleting/disabling a test fails the PR
+  unless a maintainer attaches the `gate-change` label. The reviewer's job here is
+  to judge whether a labelled `gate-change` is actually justified.
 
 ## Repo-specific traps (these don't trip generic reviewers)
 
diff --git a/docs/tdd.md b/docs/tdd.md
index bf3e530..52b01f2 100644
--- a/docs/tdd.md
+++ b/docs/tdd.md
@@ -217,6 +217,8 @@ Locale and theme regressions should be caught at the browser boundary:
 
 Initial thresholds are intentionally modest: statements/lines 60%, functions 55%, branches 45%. Raise thresholds only when the suite gains durable behavior coverage. Do not lower thresholds to merge a feature; add or repair tests instead.
 
+"Do not lower thresholds" is no longer just a rule of discipline — the `gate-integrity` CI job (`npm run check:gate`, `scripts/check-gate-integrity.mjs`) enforces it mechanically. It diffs the PR against its merge base and fails if the verification itself was weakened: a lowered coverage threshold, a grown coverage `exclude` list, a raised bundle budget, raised e2e retries, a deleted/disabled test, removed assertions, or any edit to the gate/CI machinery. A deliberate, reviewed change is allowed only when a maintainer attaches the visible `gate-change` label to the PR. See `docs/adr/0005-delivery-loop-hardening.md`.
+
 ## Real Core Smoke Testing
 
 Mocked E2E is the default gate. Manual smoke against a real local `dikw-core` remains useful before demos, but it should not block normal TDD work because local data and providers vary.
diff --git a/package.json b/package.json
index df28245..1bd5529 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "dikw-web",
-  "version": "0.8.6",
+  "version": "0.8.7",
   "private": true,
   "type": "module",
   "engines": {
@@ -27,6 +27,7 @@
     "live:smoke": "node scripts/live-core/smoke.mjs",
     "live:verify": "node scripts/live-core/run.mjs",
     "check:bundle": "node scripts/check-bundle.mjs",
+    "check:gate": "node scripts/check-gate-integrity.mjs",
     "verify": "npm run lint && npm run format:check && npm run typecheck && npm run test:coverage && npm run build && npm run test:e2e"
   },
   "dependencies": {
diff --git a/scripts/check-gate-integrity.mjs b/scripts/check-gate-integrity.mjs
new file mode 100644
index 0000000..b3ee0ea
--- /dev/null
+++ b/scripts/check-gate-integrity.mjs
@@ -0,0 +1,334 @@
+#!/usr/bin/env node
+// Reward-hacking gate. The deterministic check the in-loop agent CANNOT fool.
+//
+// dikw-web's verification (coverage thresholds, bundle budgets, e2e retries) lives
+// in editable source files, and the "don't weaken the tests" rule is otherwise just
+// prose in CLAUDE.md / fixer.md / review-rubric.md — the weakest possible defense.
+// This gate turns that prose into an exit code: it diffs the branch against its merge
+// base and FAILS if the verification itself was weakened, unless a human has attached
+// a visible, auditable `gate-change` label to the PR.
+//
+// It guards itself: any edit to this script, the CI workflows, or fixer's forbidden
+// list trips `gate-machinery-modified`, so an agent cannot quietly delete a check.
+//
+// Usage:
+//   node scripts/check-gate-integrity.mjs           # compares HEAD against origin/main
+//   GATE_BASE_REF=<sha> node scripts/check-gate-integrity.mjs
+//   GATE_HAS_OVERRIDE=true ...                       # honor the gate-change label
+//
+// Pure helpers + evaluateGate() are exported for unit testing; the git plumbing only
+// runs when invoked as a CLI.
+
+import { execFileSync } from "node:child_process";
+import { pathToFileURL } from "node:url";
+
+// --- pure parsers ---------------------------------------------------------------
+
+/**
+ * Extract the balanced `{ ... }` block that follows `key` (e.g. `coverage:`).
+ * Brace-counting, so it survives nested blocks like `coverage.thresholds`. Returns
+ * the block text incl. its braces, or null. Scoping to the coverage block is what
+ * stops the `exclude:`/`thresholds:` parsers from matching a *preceding* `test.*`.
+ */
+export function extractBlock(src, key) {
+  if (!src) return null;
+  const keyAt = src.indexOf(key);
+  if (keyAt === -1) return null;
+  const open = src.indexOf("{", keyAt);
+  if (open === -1) return null;
+  let depth = 0;
+  for (let i = open; i < src.length; i++) {
+    if (src[i] === "{") depth++;
+    else if (src[i] === "}" && --depth === 0) return src.slice(open, i + 1);
+  }
+  return null;
+}
+
+/** @returns {{statements:number,branches:number,functions:number,lines:number}|null} */
+export function parseCoverageThresholds(src) {
+  const cov = extractBlock(src, "coverage:");
+  if (!cov) return null;
+  const block = cov.match(/thresholds:\s*\{([\s\S]*?)\}/);
+  if (!block) return null;
+  const pick = (key) => {
+    const m = block[1].match(new RegExp(`${key}:\\s*(\\d+(?:\\.\\d+)?)`));
+    return m ? Number(m[1]) : null;
+  };
+  const out = {
+    statements: pick("statements"),
+    branches: pick("branches"),
+    functions: pick("functions"),
+    lines: pick("lines"),
+  };
+  return Object.values(out).every((v) => v !== null) ? out : null;
+}
+
+/** Number of entries in the **coverage** `exclude: [...]` array (not `test.exclude`). */
+export function parseCoverageExcludeCount(src) {
+  const cov = extractBlock(src, "coverage:");
+  if (!cov) return null;
+  const m = cov.match(/exclude:\s*\[([\s\S]*?)\]/);
+  if (!m) return null;
+  const entries = m[1].match(/['"`][^'"`]*['"`]/g);
+  return entries ? entries.length : 0;
+}
+
+/** @returns {{entryJsGzipKB:number,totalJsGzipKB:number,cssGzipKB:number}|null} */
+export function parseBundleBudgets(src) {
+  if (!src) return null;
+  const pick = (key) => {
+    const m = src.match(new RegExp(`${key}:\\s*(\\d+(?:\\.\\d+)?)`));
+    return m ? Number(m[1]) : null;
+  };
+  const out = {
+    entryJsGzipKB: pick("entryJsGzipKB"),
+    totalJsGzipKB: pick("totalJsGzipKB"),
+    cssGzipKB: pick("cssGzipKB"),
+  };
+  return Object.values(out).every((v) => v !== null) ? out : null;
+}
+
+/** The CI branch of `retries: process.env.CI ? N : M` (falls back to a bare `retries: N`). */
+export function parsePlaywrightCiRetries(src) {
+  if (!src) return null;
+  const ternary = src.match(/retries:\s*process\.env\.CI\s*\?\s*(\d+)\s*:\s*\d+/);
+  if (ternary) return Number(ternary[1]);
+  const bare = src.match(/retries:\s*(\d+)/);
+  return bare ? Number(bare[1]) : null;
+}
+
+const SKIP_MARKER_RE =
+  /(?:\b(?:it|test|describe)\.(?:skip|only|todo)\b)|(?:\b(?:xit|xdescribe|xtest)\b)/g;
+
+/** Count of `.skip`/`.only`/`.todo` and `xit`/`xdescribe` test-disabling markers. */
+export function countSkipMarkers(src) {
+  if (!src) return 0;
+  return (src.match(SKIP_MARKER_RE) || []).length;
+}
+
+// Strip line and block comments so commenting code out is not a way to hide an assertion.
+function stripComments(src) {
+  return src.replace(/\/\*[\s\S]*?\*\//g, "").replace(/\/\/[^\n]*/g, "");
+}
+
+/** Count of `expect(` assertion calls, ignoring commented-out ones. */
+export function countAssertions(src) {
+  if (!src) return 0;
+  return (stripComments(src).match(/\bexpect\s*\(/g) || []).length;
+}
+
+const MACHINERY_PATHS = new Set(["scripts/check-gate-integrity.mjs", ".claude/agents/fixer.md"]);
+
+/**
+ * Is `path` part of the gate's own enforcement machinery (any edit needs the label)?
+ * Note `scripts/check-bundle.mjs` is deliberately NOT here: it is a *checked* file
+ * guarded directionally (a raised budget is flagged, a lowered one allowed), exactly
+ * like `vite.config.ts` — so tightening a budget doesn't require the gate-change label.
+ */
+export function isMachineryPath(path) {
+  return MACHINERY_PATHS.has(path) || path.startsWith(".github/workflows/");
+}
+
+// --- pure evaluation ------------------------------------------------------------
+
+/**
+ * Decide whether a change weakens the verification.
+ * @param {{
+ *   coverage: {base: string|null, head: string|null},
+ *   bundle: {base: string|null, head: string|null},
+ *   retries: {base: string|null, head: string|null},
+ *   modifiedTests: Array<{path: string, base: string|null, head: string|null}>,
+ *   deletedTests: string[],
+ *   machineryTouched: string[],
+ *   hasOverrideLabel: boolean,
+ * }} input
+ * @returns {{ok: boolean, overridden: boolean, violations: Array<{code: string, file: string, detail: string}>}}
+ */
+export function evaluateGate(input) {
+  const violations = [];
+
+  // (a) coverage thresholds must not drop; the exclude list must not grow.
+  const baseCov = parseCoverageThresholds(input.coverage?.base);
+  const headCov = parseCoverageThresholds(input.coverage?.head);
+  if (baseCov && !headCov) {
+    // Removing the thresholds block (or any key) disables Vitest coverage
+    // enforcement entirely — the exact weakening this gate exists to catch.
+    violations.push({
+      code: "coverage-threshold-lowered",
+      file: "vite.config.ts",
+      detail: "coverage thresholds block removed or no longer parseable",
+    });
+  } else if (baseCov && headCov) {
+    for (const key of Object.keys(baseCov)) {
+      if (headCov[key] < baseCov[key]) {
+        violations.push({
+          code: "coverage-threshold-lowered",
+          file: "vite.config.ts",
+          detail: `${key}: ${baseCov[key]} → ${headCov[key]}`,
+        });
+      }
+    }
+  }
+  const baseExcl = parseCoverageExcludeCount(input.coverage?.base);
+  const headExcl = parseCoverageExcludeCount(input.coverage?.head);
+  if (baseExcl !== null && headExcl !== null && headExcl > baseExcl) {
+    violations.push({
+      code: "coverage-exclude-grown",
+      file: "vite.config.ts",
+      detail: `coverage exclude entries: ${baseExcl} → ${headExcl}`,
+    });
+  }
+
+  // (b) bundle budgets must not rise.
+  const baseBudget = parseBundleBudgets(input.bundle?.base);
+  const headBudget = parseBundleBudgets(input.bundle?.head);
+  if (baseBudget && !headBudget) {
+    violations.push({
+      code: "bundle-budget-raised",
+      file: "scripts/check-bundle.mjs",
+      detail: "bundle budget block removed or no longer parseable",
+    });
+  } else if (baseBudget && headBudget) {
+    for (const key of Object.keys(baseBudget)) {
+      if (headBudget[key] > baseBudget[key]) {
+        violations.push({
+          code: "bundle-budget-raised",
+          file: "scripts/check-bundle.mjs",
+          detail: `${key}: ${baseBudget[key]} → ${headBudget[key]}`,
+        });
+      }
+    }
+  }
+
+  // (c) e2e retries must not rise (a higher retry count masks new flakes).
+  const baseRetries = parsePlaywrightCiRetries(input.retries?.base);
+  const headRetries = parsePlaywrightCiRetries(input.retries?.head);
+  if (baseRetries !== null && headRetries !== null && headRetries > baseRetries) {
+    violations.push({
+      code: "e2e-retries-raised",
+      file: "playwright.config.ts",
+      detail: `CI retries: ${baseRetries} → ${headRetries}`,
+    });
+  }
+
+  // (d) tests must not be deleted, disabled, or stripped of assertions.
+  for (const path of input.deletedTests || []) {
+    violations.push({ code: "test-file-deleted", file: path, detail: "test file removed" });
+  }
+  for (const t of input.modifiedTests || []) {
+    // Only an EXISTING test can be weakened; a brand-new file (no base) is new
+    // coverage, and its skip markers / assertion count have nothing to compare against.
+    if (t.base == null) continue;
+    if (countSkipMarkers(t.head) > countSkipMarkers(t.base)) {
+      violations.push({
+        code: "test-skip-added",
+        file: t.path,
+        detail: "a skip/only/todo marker was added",
+      });
+    }
+    if (countAssertions(t.head) < countAssertions(t.base)) {
+      violations.push({
+        code: "test-assertions-removed",
+        file: t.path,
+        detail: `expect() calls: ${countAssertions(t.base)} → ${countAssertions(t.head)}`,
+      });
+    }
+  }
+
+  // (e) the gate machinery itself must not be edited (direction is undefinable here).
+  for (const path of input.machineryTouched || []) {
+    violations.push({
+      code: "gate-machinery-modified",
+      file: path,
+      detail: "gate/CI machinery changed",
+    });
+  }
+
+  const overridden = violations.length > 0 && !!input.hasOverrideLabel;
+  return { ok: violations.length === 0 || overridden, overridden, violations };
+}
+
+// --- git plumbing (CLI only) ----------------------------------------------------
+
+const TEST_FILE_RE = /\.(test|spec)\.(ts|tsx|mjs|js)$/;
+
+function git(args) {
+  return execFileSync("git", args, { encoding: "utf8" });
+}
+
+function showAtRef(ref, path) {
+  try {
+    return execFileSync("git", ["show", `${ref}:${path}`], {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "ignore"],
+    });
+  } catch {
+    return null; // file absent at that ref
+  }
+}
+
+function main() {
+  const baseRef = process.env.GATE_BASE_REF || "origin/main";
+  const hasOverrideLabel = process.env.GATE_HAS_OVERRIDE === "true";
+
+  // `<base>...HEAD` diffs against the merge base, so unrelated base movement is ignored.
+  const range = `${baseRef}...HEAD`;
+  const nameStatus = git(["diff", "--name-status", range]).trim();
+
+  const deletedTests = [];
+  const modifiedTestPaths = [];
+  const machineryTouched = [];
+  for (const line of nameStatus ? nameStatus.split("\n") : []) {
+    const [status, ...rest] = line.split(/\s+/);
+    const path = rest[rest.length - 1];
+    if (!path) continue;
+    if (isMachineryPath(path)) machineryTouched.push(path);
+    if (TEST_FILE_RE.test(path)) {
+      // Deleted tests are always a violation. Only in-place MODIFY can weaken an
+      // existing test; added (A) files are new coverage, and a rename that also guts a
+      // test shows up as a separate D+A (the D trips test-file-deleted).
+      if (status.startsWith("D")) deletedTests.push(path);
+      else if (status.startsWith("M")) modifiedTestPaths.push(path);
+    }
+  }
+
+  const mergeBase = git(["merge-base", baseRef, "HEAD"]).trim();
+  const pair = (path) => ({ base: showAtRef(mergeBase, path), head: showAtRef("HEAD", path) });
+  const modifiedTests = modifiedTestPaths.map((path) => ({ path, ...pair(path) }));
+
+  const result = evaluateGate({
+    coverage: pair("vite.config.ts"),
+    bundle: pair("scripts/check-bundle.mjs"),
+    retries: pair("playwright.config.ts"),
+    modifiedTests,
+    deletedTests,
+    machineryTouched,
+    hasOverrideLabel,
+  });
+
+  if (result.violations.length === 0) {
+    console.log("✓ Gate integrity: no verification weakened.");
+    process.exit(0);
+  }
+
+  const lines = result.violations.map((v) => `  ✖ [${v.code}] ${v.file} — ${v.detail}`);
+  if (result.overridden) {
+    console.log(
+      "⚠ Gate integrity: the following weakenings were ALLOWED by the `gate-change` label:",
+    );
+    console.log(lines.join("\n"));
+    console.log("\nProceeding (override present). This is recorded for audit.");
+    process.exit(0);
+  }
+
+  console.error("✖ Gate integrity FAILED — the verification itself was weakened:");
+  console.error(lines.join("\n"));
+  console.error(
+    "\nFix the code so the existing checks pass, instead of weakening them. If the change is a" +
+      "\ndeliberate, reviewed decision, a maintainer can add the `gate-change` label to the PR.",
+  );
+  process.exit(1);
+}
+
+const invokedDirectly = process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href;
+if (invokedDirectly) main();
diff --git a/scripts/check-gate-integrity.test.mjs b/scripts/check-gate-integrity.test.mjs
new file mode 100644
index 0000000..f48105c
--- /dev/null
+++ b/scripts/check-gate-integrity.test.mjs
@@ -0,0 +1,332 @@
+import { describe, expect, it } from "vitest";
+import {
+  countAssertions,
+  countSkipMarkers,
+  evaluateGate,
+  isMachineryPath,
+  parseBundleBudgets,
+  parseCoverageExcludeCount,
+  parseCoverageThresholds,
+  parsePlaywrightCiRetries,
+} from "./check-gate-integrity.mjs";
+
+// Mirrors the real vite.config.ts layout: a top-level `test.exclude` (3 entries)
+// appears BEFORE the `coverage` block's own `exclude` (5 entries). The parsers must
+// read the coverage block's values, not the first `exclude:`/`thresholds:` they see.
+const VITE_CONFIG_SRC = `
+    test: {
+      include: ["src/**/*.{test,spec}.{ts,tsx}"],
+      exclude: ["tests/e2e/**", "node_modules/**", "dist/**"],
+      coverage: {
+        provider: "v8",
+        include: ["src/**/*.{ts,tsx}"],
+        exclude: [
+          "src/**/*.test.{ts,tsx}",
+          "src/test/**",
+          "src/main.tsx",
+          "src/types.ts",
+          "src/vite-env.d.ts",
+        ],
+        thresholds: {
+          statements: 60,
+          branches: 45,
+          functions: 55,
+          lines: 60,
+        },
+      },
+    },
+`;
+
+const COVERAGE_SRC = `
+      coverage: {
+        provider: "v8",
+        include: ["src/**/*.{ts,tsx}"],
+        exclude: [
+          "src/**/*.test.{ts,tsx}",
+          "src/test/**",
+          "src/main.tsx",
+          "src/types.ts",
+          "src/vite-env.d.ts",
+        ],
+        thresholds: {
+          statements: 60,
+          branches: 45,
+          functions: 55,
+          lines: 60,
+        },
+      },
+`;
+
+const BUNDLE_SRC = `
+const BUDGET = {
+  entryJsGzipKB: 280,
+  totalJsGzipKB: 1950,
+  cssGzipKB: 35,
+};
+`;
+
+const PLAYWRIGHT_SRC = `
+  retries: process.env.CI ? 2 : 0,
+`;
+
+describe("parsers", () => {
+  it("parses coverage thresholds", () => {
+    expect(parseCoverageThresholds(COVERAGE_SRC)).toEqual({
+      statements: 60,
+      branches: 45,
+      functions: 55,
+      lines: 60,
+    });
+  });
+
+  it("counts coverage exclude entries", () => {
+    expect(parseCoverageExcludeCount(COVERAGE_SRC)).toBe(5);
+  });
+
+  it("parses bundle budgets", () => {
+    expect(parseBundleBudgets(BUNDLE_SRC)).toEqual({
+      entryJsGzipKB: 280,
+      totalJsGzipKB: 1950,
+      cssGzipKB: 35,
+    });
+  });
+
+  it("parses the CI branch of playwright retries", () => {
+    expect(parsePlaywrightCiRetries(PLAYWRIGHT_SRC)).toBe(2);
+  });
+
+  it("counts skip/only/todo markers", () => {
+    const src = `test.skip("a", () => {}); it.only("b", () => {}); xit("c", () => {}); describe.todo("d");`;
+    expect(countSkipMarkers(src)).toBe(4);
+  });
+
+  it("counts expect() assertions", () => {
+    expect(countAssertions(`expect(a).toBe(1); expect(b).toEqual(2);`)).toBe(2);
+  });
+
+  it("ignores commented-out assertions", () => {
+    expect(countAssertions(`expect(a).toBe(1); // expect(b).toBe(2);`)).toBe(1);
+    expect(countAssertions(`expect(a).toBe(1); /* expect(b).toBe(2); */`)).toBe(1);
+  });
+
+  it("parses the coverage block's exclude, not a preceding test.exclude", () => {
+    // test.exclude (3 entries) precedes coverage.exclude (5 entries) in the real file.
+    expect(parseCoverageExcludeCount(VITE_CONFIG_SRC)).toBe(5);
+  });
+
+  it("parses coverage thresholds even with a preceding test block", () => {
+    expect(parseCoverageThresholds(VITE_CONFIG_SRC)).toEqual({
+      statements: 60,
+      branches: 45,
+      functions: 55,
+      lines: 60,
+    });
+  });
+
+  it("classifies gate machinery paths", () => {
+    expect(isMachineryPath("scripts/check-gate-integrity.mjs")).toBe(true);
+    expect(isMachineryPath(".github/workflows/ci.yml")).toBe(true);
+    expect(isMachineryPath(".claude/agents/fixer.md")).toBe(true);
+    // check-bundle.mjs is a CHECKED file (guarded directionally), not machinery —
+    // so lowering a budget there does not require the gate-change label.
+    expect(isMachineryPath("scripts/check-bundle.mjs")).toBe(false);
+    expect(isMachineryPath("src/App.tsx")).toBe(false);
+  });
+});
+
+describe("evaluateGate", () => {
+  const clean = {
+    coverage: { base: COVERAGE_SRC, head: COVERAGE_SRC },
+    bundle: { base: BUNDLE_SRC, head: BUNDLE_SRC },
+    retries: { base: PLAYWRIGHT_SRC, head: PLAYWRIGHT_SRC },
+    modifiedTests: [],
+    deletedTests: [],
+    machineryTouched: [],
+    hasOverrideLabel: false,
+  };
+
+  it("passes an unrelated change", () => {
+    const result = evaluateGate(clean);
+    expect(result.ok).toBe(true);
+    expect(result.violations).toEqual([]);
+  });
+
+  it("flags a lowered coverage threshold", () => {
+    const result = evaluateGate({
+      ...clean,
+      coverage: {
+        base: COVERAGE_SRC,
+        head: COVERAGE_SRC.replace("statements: 60", "statements: 50"),
+      },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered");
+  });
+
+  it("flags removing the whole coverage thresholds block (disables enforcement)", () => {
+    const noThresholds = COVERAGE_SRC.replace(/thresholds:\s*\{[\s\S]*?\},/, "");
+    const result = evaluateGate({
+      ...clean,
+      coverage: { base: COVERAGE_SRC, head: noThresholds },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered");
+  });
+
+  it("flags removing a single coverage threshold key", () => {
+    const noBranches = COVERAGE_SRC.replace("branches: 45,\n", "");
+    const result = evaluateGate({
+      ...clean,
+      coverage: { base: COVERAGE_SRC, head: noBranches },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered");
+  });
+
+  it("flags removing the bundle budget block", () => {
+    const result = evaluateGate({
+      ...clean,
+      bundle: { base: BUNDLE_SRC, head: "const BUDGET = {};\n" },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("bundle-budget-raised");
+  });
+
+  it("allows raising a coverage threshold", () => {
+    const result = evaluateGate({
+      ...clean,
+      coverage: {
+        base: COVERAGE_SRC,
+        head: COVERAGE_SRC.replace("statements: 60", "statements: 70"),
+      },
+    });
+    expect(result.ok).toBe(true);
+  });
+
+  it("flags a grown coverage exclude list", () => {
+    const grown = COVERAGE_SRC.replace(
+      `"src/vite-env.d.ts",`,
+      `"src/vite-env.d.ts",\n          "src/big-untested.ts",`,
+    );
+    const result = evaluateGate({ ...clean, coverage: { base: COVERAGE_SRC, head: grown } });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("coverage-exclude-grown");
+  });
+
+  it("flags a raised bundle budget", () => {
+    const result = evaluateGate({
+      ...clean,
+      bundle: {
+        base: BUNDLE_SRC,
+        head: BUNDLE_SRC.replace("entryJsGzipKB: 280", "entryJsGzipKB: 400"),
+      },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("bundle-budget-raised");
+  });
+
+  it("allows lowering a bundle budget", () => {
+    const result = evaluateGate({
+      ...clean,
+      bundle: {
+        base: BUNDLE_SRC,
+        head: BUNDLE_SRC.replace("entryJsGzipKB: 280", "entryJsGzipKB: 250"),
+      },
+    });
+    expect(result.ok).toBe(true);
+  });
+
+  it("flags raised e2e retries", () => {
+    const result = evaluateGate({
+      ...clean,
+      retries: { base: PLAYWRIGHT_SRC, head: PLAYWRIGHT_SRC.replace("? 2", "? 5") },
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("e2e-retries-raised");
+  });
+
+  it("flags a deleted test file", () => {
+    const result = evaluateGate({ ...clean, deletedTests: ["src/foo.test.ts"] });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("test-file-deleted");
+  });
+
+  it("flags an added skip marker", () => {
+    const result = evaluateGate({
+      ...clean,
+      modifiedTests: [
+        {
+          path: "src/a.test.ts",
+          base: `it("x", () => { expect(1).toBe(1); });`,
+          head: `it.skip("x", () => { expect(1).toBe(1); });`,
+        },
+      ],
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("test-skip-added");
+  });
+
+  it("flags removed assertions", () => {
+    const result = evaluateGate({
+      ...clean,
+      modifiedTests: [
+        {
+          path: "src/a.test.ts",
+          base: `expect(1).toBe(1); expect(2).toBe(2);`,
+          head: `expect(1).toBe(1);`,
+        },
+      ],
+    });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("test-assertions-removed");
+  });
+
+  it("allows adding assertions", () => {
+    const result = evaluateGate({
+      ...clean,
+      modifiedTests: [
+        {
+          path: "src/a.test.ts",
+          base: `expect(1).toBe(1);`,
+          head: `expect(1).toBe(1); expect(2).toBe(2);`,
+        },
+      ],
+    });
+    expect(result.ok).toBe(true);
+  });
+
+  it("does not flag a brand-new test file that contains skip markers (no base to weaken)", () => {
+    const result = evaluateGate({
+      ...clean,
+      modifiedTests: [
+        {
+          path: "scripts/gate.test.mjs",
+          base: null,
+          head: `it.skip("x", () => {}); xit("y", () => {});`,
+        },
+      ],
+    });
+    expect(result.ok).toBe(true);
+  });
+
+  it("flags edits to the gate machinery", () => {
+    const result = evaluateGate({ ...clean, machineryTouched: [".github/workflows/ci.yml"] });
+    expect(result.ok).toBe(false);
+    expect(result.violations.map((v) => v.code)).toContain("gate-machinery-modified");
+  });
+
+  it("allows any violation when the override label is present, marking it overridden", () => {
+    const result = evaluateGate({
+      ...clean,
+      coverage: {
+        base: COVERAGE_SRC,
+        head: COVERAGE_SRC.replace("statements: 60", "statements: 50"),
+      },
+      machineryTouched: [".github/workflows/ci.yml"],
+      hasOverrideLabel: true,
+    });
+    expect(result.ok).toBe(true);
+    expect(result.overridden).toBe(true);
+    expect(result.violations.length).toBeGreaterThan(0);
+  });
+});
diff --git a/vite.config.ts b/vite.config.ts
index ff30375..b184ea4 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -11,7 +11,11 @@ export default defineConfig(({ mode }) => {
   return {
     plugins: [react(), agentSidecarPlugin(), webApiPlugin()],
     test: {
-      include: ["src/**/*.{test,spec}.{ts,tsx}", "server/**/*.{test,spec}.ts"],
+      include: [
+        "src/**/*.{test,spec}.{ts,tsx}",
+        "server/**/*.{test,spec}.ts",
+        "scripts/**/*.{test,spec}.mjs",
+      ],
       exclude: ["tests/e2e/**", "node_modules/**", "dist/**"],
       environment: "jsdom",
       setupFiles: ["./src/test/setup.ts"],