From ec7b5da613e63407a24bb8ea014bce14b540f1b2 Mon Sep 17 00:00:00 2001 From: holo Date: Mon, 29 Jun 2026 22:04:07 +0800 Subject: [PATCH] feat(ci): reward-hacking integrity gate (delivery-loop hardening #1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turn the prose "don't weaken the tests / don't lower thresholds" rule into a deterministic check the in-loop agent cannot fool. scripts/check-gate-integrity.mjs diffs the branch against its merge base and fails if the verification itself was weakened — lowered coverage thresholds, raised bundle budgets, raised e2e retries, deleted/disabled tests or stripped assertions, or any edit to the gate/CI machinery (the script, .github/workflows/**, fixer.md's forbidden list). A maintainer allows a deliberate change with a visible `gate-change` PR label. Added as a PR-scoped required CI job; mirrored locally via `npm run check:gate`. --- .claude/agents/fixer.md | 2 + .../dikw-web-delivery-workflow/SKILL.md | 5 +- .github/workflows/ci.yml | 29 ++ CHANGELOG.md | 19 + CLAUDE.md | 3 +- docs/adr/0005-delivery-loop-hardening.md | 88 +++++ docs/review-rubric.md | 6 +- docs/tdd.md | 2 + package.json | 3 +- scripts/check-gate-integrity.mjs | 334 ++++++++++++++++++ scripts/check-gate-integrity.test.mjs | 332 +++++++++++++++++ vite.config.ts | 6 +- 12 files changed, 824 insertions(+), 5 deletions(-) create mode 100644 docs/adr/0005-delivery-loop-hardening.md create mode 100644 scripts/check-gate-integrity.mjs create mode 100644 scripts/check-gate-integrity.test.mjs diff --git a/.claude/agents/fixer.md b/.claude/agents/fixer.md index f9abee2..1f038d4 100644 --- a/.claude/agents/fixer.md +++ b/.claude/agents/fixer.md @@ -20,3 +20,5 @@ Forbidden — these "fix the test, not the code" and are never allowed: - Lowering the `vite.config.ts` coverage thresholds. If the only path to green is one of the forbidden moves, STOP: report the real root cause and why it can't be fixed cleanly. Do not weaken the check — escalate instead. + +This list is not honor-system only: the `gate-integrity` CI job (`npm run check:gate`) mechanically fails any PR that lowers a coverage threshold, raises a bundle budget, raises e2e retries, or deletes/disables a test / strips its assertions — so a forbidden move won't merge even if attempted. Fix the cause; don't route around the gate. diff --git a/.claude/skills/dikw-web-delivery-workflow/SKILL.md b/.claude/skills/dikw-web-delivery-workflow/SKILL.md index 62703d4..ac01464 100644 --- a/.claude/skills/dikw-web-delivery-workflow/SKILL.md +++ b/.claude/skills/dikw-web-delivery-workflow/SKILL.md @@ -52,7 +52,10 @@ a fresh agent that didn't write the code). the **same** change — not "later". (Disk `.md` is English-only in this repo.) 8. **Final gate + PR.** `npm.cmd run verify` (lint + format:check + typecheck + coverage + build + e2e) - green, then `npm.cmd run check:bundle` (gzip budget; also runs in CI). Bump + green, then `npm.cmd run check:bundle` (gzip budget) and `npm.cmd run check:gate` + (reward-hacking gate; both also run in CI — the latter as the required + `gate-integrity` job). If `check:gate` flags a *deliberate* weakening, a maintainer + adds the `gate-change` label to the PR; never route around it. Bump `package.json` version (3-digit SemVer) and add a `CHANGELOG.md` entry when warranted. Branch with a descriptive name, commit `(): `, push, `gh pr create`. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 29f9aec..1a02db5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -165,6 +165,35 @@ jobs: severity: HIGH,CRITICAL version: ${{ env.TRIVY_VERSION }} + gate-integrity: + # Reward-hacking gate: fails a PR that weakens the verification ITSELF — lowers a + # coverage threshold (vite.config.ts), raises a bundle budget (check-bundle.mjs), + # raises e2e retries (playwright.config.ts), deletes/disables a test or strips its + # assertions, or edits the gate/CI machinery (this script, .github/workflows/**, + # fixer.md's forbidden list). A maintainer can allow a deliberate change with the + # visible, auditable `gate-change` PR label. PR-scoped on purpose: it is a required + # status check, so nothing merges to main without it, and the label context only + # exists on a PR. Pure Node script (no deps) — no `npm ci` needed. + name: Gate integrity (no reward-hacking) + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@v7 + with: + # Full history so `...HEAD` can resolve the merge base. + fetch-depth: 0 + + - uses: actions/setup-node@v6 + with: + node-version: "24" + + - name: Check gate integrity + env: + GATE_BASE_REF: ${{ github.event.pull_request.base.sha }} + GATE_HAS_OVERRIDE: ${{ contains(github.event.pull_request.labels.*.name, 'gate-change') }} + run: node scripts/check-gate-integrity.mjs + release: name: Release tag (dikw-web-v*) # Cuts a GitHub Release tagged dikw-web-v once the whole diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e40b86..57ee876 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,25 @@ file format introduced in `[0.0.1.0]` was dropped. ## [Unreleased] +## [0.8.7] - 2026-06-29 + +### Added + +- **Reward-hacking gate (`gate-integrity` CI job / `npm run check:gate`).** A + deterministic check the in-loop agent cannot fool, hardening the delivery loop + against an agent weakening the verification to make a check go green. + `scripts/check-gate-integrity.mjs` diffs the branch against its merge base and + fails the PR if a coverage threshold was lowered, the coverage `exclude` list + grew, a bundle budget was raised, e2e `retries` were raised, a test was + deleted/disabled or stripped of assertions, or the gate/CI machinery itself + (the script, `.github/workflows/**`, `fixer.md`'s forbidden list) was edited. + The good direction (raising a threshold, adding tests) is always allowed; a + deliberate weakening is allowed only when a maintainer adds the visible + `gate-change` PR label. Until now "don't weaken the tests" was prose only in + `CLAUDE.md` / `docs/review-rubric.md` / `fixer.md` — the weakest defense. The job + is PR-scoped and a required status check. See + `docs/adr/0005-delivery-loop-hardening.md`. + ## [0.8.6] - 2026-06-29 ### Fixed diff --git a/CLAUDE.md b/CLAUDE.md index 11a3212..5dd68fb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -72,7 +72,7 @@ End-to-end loop from request to landed PR. Run autonomously for behavior changes 4. **Final pass.** Run `/code-review`, scored against `docs/review-rubric.md` (the project-specific principles), and resolve every finding before continuing. 5. **Verify in the browser.** For UI changes, invoke the `dikw-web-verify-frontend` skill: navigate the changed routes via Chrome MCP, confirm a clean runtime console on real data, exercise the affected interactions, and run the `docs/ui-checklist.md` rubric in light + dark — confirm the change actually rendered as intended, not just that unit tests pass. 6. **Update markdown docs.** Walk `CLAUDE.md`, `README.md`, and the relevant `docs/*.md` against the diff; any contract, behavior, command, or doc index that drifted must be updated in the same change. Don't leave docs to "catch up later". -7. **Create the PR.** Branch with a descriptive name, commit with `(): ` matching the project's existing convention (see recent `git log`), push, then `gh pr create`. CI auto-runs lint + format:check + typecheck + coverage + build + e2e + bundle budget + security scans (npm audit, gitleaks, Trivy, CodeQL). Bump `package.json.version` manually (standard 3-digit SemVer) when the change warrants it, and add an entry to `CHANGELOG.md` under the matching version heading. On merge to `main`, CI's `release` job auto-cuts a GitHub Release tagged `dikw-web-v` from `package.json.version` (idempotent — only a version bump creates a new tag; notes come from the matching CHANGELOG section via `scripts/changelog-notes.mjs`), so a deliberate version bump is what publishes a release. +7. **Create the PR.** Branch with a descriptive name, commit with `(): ` matching the project's existing convention (see recent `git log`), push, then `gh pr create`. CI auto-runs lint + format:check + typecheck + coverage + build + e2e + bundle budget + the `gate-integrity` reward-hacking gate (`check:gate`) + security scans (npm audit, gitleaks, Trivy, CodeQL). Bump `package.json.version` manually (standard 3-digit SemVer) when the change warrants it, and add an entry to `CHANGELOG.md` under the matching version heading. On merge to `main`, CI's `release` job auto-cuts a GitHub Release tagged `dikw-web-v` from `package.json.version` (idempotent — only a version bump creates a new tag; notes come from the matching CHANGELOG section via `scripts/changelog-notes.mjs`), so a deliberate version bump is what publishes a release. 8. **Monitor CI and PR comments; resolve as they surface, then merge.** After pushing, actively watch both signals — don't passively wait, and don't batch resolution to merge time. - **CI rollup**: `gh pr checks ` (or `--watch` to block until terminal). Failing job logs: `gh run view --log-failed`. Flaky e2e gets **one** rerun, not five (see [[project_flaky_graph_e2e]] in memory for which test). - **PR review prose**: `gh api repos/{owner}/{repo}/pulls/{N}/reviews` for review bodies, `.../pulls/{N}/comments` for inline threads, `.../issues/{N}/comments` for top-level CodeRabbit summaries. `gh pr checks` shows pass/fail only, not the prose. @@ -98,6 +98,7 @@ Windows shell: use `npm.cmd` (not `npm`) when invoking from PowerShell. - `npm.cmd run build` — typecheck, `vite build` (browser bundle to `dist/`), then `build:server` (esbuild bundles `server/agent/standalone.ts` to `dist-server/standalone.mjs` with `--packages=external`, since ADK + MikroORM + native sqlite3 can't be bundled — so the sidecar imports its deps from a production `node_modules` at runtime). `npm.cmd start` runs that standalone sidecar. - `npm.cmd run verify` — full gate: lint + format:check + typecheck + coverage + build + e2e. Run before committing behavior changes. - `npm.cmd run check:bundle` — gzip bundle budget (entry JS / total JS / CSS) against `dist/`; runs in CI after the verify gate. Raise the budgets in `scripts/check-bundle.mjs` deliberately, like the coverage thresholds — don't bump to pass. +- `npm.cmd run check:gate` — **reward-hacking gate** (`scripts/check-gate-integrity.mjs`): diffs the branch against its merge base (`origin/main` locally; the PR base in CI) and fails if the verification *itself* was weakened — a lowered coverage threshold, a grown coverage `exclude`, a raised bundle budget, raised e2e `retries`, a deleted/disabled test or removed assertions, or any edit to the gate/CI machinery (the script, `.github/workflows/**`, `fixer.md`'s forbidden list). The good direction (raising a threshold, adding a test) is always allowed; a deliberate weakening is allowed only when a maintainer adds the visible `gate-change` label to the PR (`GATE_HAS_OVERRIDE`). Runs as the **PR-scoped required CI job `gate-integrity`** — it is the deterministic backstop for the prose "don't weaken the tests" rule in `fixer.md` / `docs/review-rubric.md`. See `docs/adr/0005-delivery-loop-hardening.md`. - `npm.cmd run smoke:core` — live-core `/v1` contract smoke (`scripts/smoke-core.mjs`, the `dikw-web-smoke-core` skill). Not a CI gate; needs a reachable core. Run after a `dikw-core` bump or before a demo. - `npm.cmd run live:verify` — full live integration verification of the working tree against a **real `dikw-core`** (GHCR image, Postgres backend) on dynamic ports: boot → seed the write pipeline (import→ingest→synth→lint, reusing `buildImportBundle` + `DikwClient`) → read-contract smoke → browser read-route e2e (Playwright `live` project) → agent↔core check → teardown. Needs Docker + `.env.core` (LLM/embedding keys, git-ignored; copy `.env.core.example`). Not a CI gate (boots a container, calls live LLMs); `live-integration.yml` runs it on dispatch/nightly/label. Sub-commands: `live:up` / `live:seed` / `live:smoke` / `live:down` (`-- --volumes` to drop data). `-- --keep` leaves the stack up. See `docs/integration-verification.md`. diff --git a/docs/adr/0005-delivery-loop-hardening.md b/docs/adr/0005-delivery-loop-hardening.md new file mode 100644 index 0000000..56e670d --- /dev/null +++ b/docs/adr/0005-delivery-loop-hardening.md @@ -0,0 +1,88 @@ +# 5. Delivery-loop hardening + +This ADR records the decision to harden dikw-web's request→merged-PR **delivery +loop** against the failure modes of agentic, low-supervision development. It is the +umbrella record for a set of changes landing as separate PRs; this first revision +covers the **reward-hacking gate** (`gate-integrity` / `npm run check:gate`). + +## Status + +Accepted (2026-06-29). Living document — later sections are added as the remaining +hardening items land (trustworthy green signal, measured perf/a11y, automated +CI-watch-and-fix, loop observability). + +## Context + +The loop already encodes most manual + pre-merge checks as skills (the +`dikw-web-delivery-workflow`, `dikw-web-verify-frontend`, `dikw-web-smoke-core` +skills; the `docs/ui-checklist.md` + `docs/review-rubric.md` rubrics) and runs an +independent review on a different model (`/codex:review`) before merge. The CI +`verify` gate (lint / format / typecheck / coverage / build / e2e) plus +`check:bundle` is deterministic and blocks merge. + +The gap is the one an autonomous agent exploits under pressure to make a check go +green: **reward hacking**. The verification's own knobs live in editable source — +coverage thresholds in `vite.config.ts`, bundle budgets in `scripts/check-bundle.mjs`, +e2e retries in `playwright.config.ts` — and a test can be deleted, `.skip`-ped, or +stripped of assertions. The only thing standing in the way was prose: "don't weaken +the tests" in `CLAUDE.md`, `docs/review-rubric.md`, and the `fixer` agent's forbidden +list. Prose is the weakest possible defense: the agent that edits the threshold also +passes the check that reads it. + +## Decision + +Add a deterministic gate the in-loop agent **cannot fool**: +`scripts/check-gate-integrity.mjs`, run as the PR-scoped required CI job +`gate-integrity` and mirrored locally by `npm run check:gate`. + +It diffs the branch against its merge base (`...HEAD`) and fails if the +verification itself was weakened: + +- **coverage-threshold-lowered** — any of statements/branches/functions/lines in + `vite.config.ts` dropped. +- **coverage-exclude-grown** — the coverage `exclude` array gained entries (drops + files out of the denominator). +- **bundle-budget-raised** — any `*GzipKB` budget in `check-bundle.mjs` rose. +- **e2e-retries-raised** — the CI branch of `retries` in `playwright.config.ts` rose + (a higher retry count masks new flakes). +- **test-file-deleted** / **test-skip-added** / **test-assertions-removed** — a test + file was removed, gained a `.skip`/`.only`/`.todo`/`xit`/`xdescribe` marker, or lost + `expect()` calls. Only *in-place* modifications are diffed; a new file is new + coverage, and a rename-that-guts shows up as a separate delete (still caught). +- **gate-machinery-modified** — any edit to the gate script itself, + `.github/workflows/**`, or `fixer.md`'s forbidden list. This is the **self-guard**: + an agent cannot quietly delete a check or drop the CI job, because doing so trips the + gate. + +The "good direction" — raising a threshold, tightening a budget, adding tests — is +always allowed without ceremony. A *deliberate* weakening is allowed only when a +maintainer attaches the visible, auditable **`gate-change`** label to the PR +(`GATE_HAS_OVERRIDE=true`); the job then passes but prints what it allowed, for the +audit trail. The reviewer's remaining job is to judge whether a labelled change is +justified. + +This mirrors the "a second check the agent does not control" pattern: the gate is the +mechanical backstop for the prose rule, and it guards its own machinery so it cannot +be edited away. + +## Consequences + +- The PR that *introduces or changes* the gate machinery (including this one) trips + `gate-machinery-modified` by design and must carry the `gate-change` label. That is + the intended, visible audit point, not a bug. +- The skip/assertion heuristics are deliberately dumb (regex counts), so they can have + false positives (e.g. a test file that contains skip markers as string fixtures — + handled by only diffing in-place modifications with a non-null base). The + `gate-change` label is the escape hatch; the gate is a backstop, not a proof system. +- The job is PR-scoped (`if: github.event_name == 'pull_request'`) because the label + context only exists on a PR, and a required status check already blocks every merge + to `main`. It is therefore **not** added to `release.needs` (a skipped dependency + would skip `release`). + +## Excluded (deliberately, for "simplicity first") + +The fuller autonomous-loop machinery — an on-disk `STATUS.md` + `loop_state.json` +state protocol, container `--network none` isolation, per-token cost accounting — is +out of scope. dikw-web's loop is interactive Claude Code with worktree isolation +already available for background jobs and no destructive operations; that machinery +would add complexity without proportional value here. diff --git a/docs/review-rubric.md b/docs/review-rubric.md index 0f825df..7ce07c7 100644 --- a/docs/review-rubric.md +++ b/docs/review-rubric.md @@ -19,7 +19,11 @@ Each item is pass/fail against the **diff under review**. orphaned imports/vars are removed. - [ ] **Goal-driven / TDD.** Behavior changes land test-first (see `docs/tdd.md`). Coverage thresholds in `vite.config.ts` (60/45/55/60) are **not lowered** to - pass — tests are added/repaired instead. + pass — tests are added/repaired instead. This is now machine-enforced by the + `gate-integrity` CI job (`npm run check:gate`): lowering a threshold, raising a + bundle budget, raising e2e retries, or deleting/disabling a test fails the PR + unless a maintainer attaches the `gate-change` label. The reviewer's job here is + to judge whether a labelled `gate-change` is actually justified. ## Repo-specific traps (these don't trip generic reviewers) diff --git a/docs/tdd.md b/docs/tdd.md index bf3e530..52b01f2 100644 --- a/docs/tdd.md +++ b/docs/tdd.md @@ -217,6 +217,8 @@ Locale and theme regressions should be caught at the browser boundary: Initial thresholds are intentionally modest: statements/lines 60%, functions 55%, branches 45%. Raise thresholds only when the suite gains durable behavior coverage. Do not lower thresholds to merge a feature; add or repair tests instead. +"Do not lower thresholds" is no longer just a rule of discipline — the `gate-integrity` CI job (`npm run check:gate`, `scripts/check-gate-integrity.mjs`) enforces it mechanically. It diffs the PR against its merge base and fails if the verification itself was weakened: a lowered coverage threshold, a grown coverage `exclude` list, a raised bundle budget, raised e2e retries, a deleted/disabled test, removed assertions, or any edit to the gate/CI machinery. A deliberate, reviewed change is allowed only when a maintainer attaches the visible `gate-change` label to the PR. See `docs/adr/0005-delivery-loop-hardening.md`. + ## Real Core Smoke Testing Mocked E2E is the default gate. Manual smoke against a real local `dikw-core` remains useful before demos, but it should not block normal TDD work because local data and providers vary. diff --git a/package.json b/package.json index df28245..1bd5529 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "dikw-web", - "version": "0.8.6", + "version": "0.8.7", "private": true, "type": "module", "engines": { @@ -27,6 +27,7 @@ "live:smoke": "node scripts/live-core/smoke.mjs", "live:verify": "node scripts/live-core/run.mjs", "check:bundle": "node scripts/check-bundle.mjs", + "check:gate": "node scripts/check-gate-integrity.mjs", "verify": "npm run lint && npm run format:check && npm run typecheck && npm run test:coverage && npm run build && npm run test:e2e" }, "dependencies": { diff --git a/scripts/check-gate-integrity.mjs b/scripts/check-gate-integrity.mjs new file mode 100644 index 0000000..b3ee0ea --- /dev/null +++ b/scripts/check-gate-integrity.mjs @@ -0,0 +1,334 @@ +#!/usr/bin/env node +// Reward-hacking gate. The deterministic check the in-loop agent CANNOT fool. +// +// dikw-web's verification (coverage thresholds, bundle budgets, e2e retries) lives +// in editable source files, and the "don't weaken the tests" rule is otherwise just +// prose in CLAUDE.md / fixer.md / review-rubric.md — the weakest possible defense. +// This gate turns that prose into an exit code: it diffs the branch against its merge +// base and FAILS if the verification itself was weakened, unless a human has attached +// a visible, auditable `gate-change` label to the PR. +// +// It guards itself: any edit to this script, the CI workflows, or fixer's forbidden +// list trips `gate-machinery-modified`, so an agent cannot quietly delete a check. +// +// Usage: +// node scripts/check-gate-integrity.mjs # compares HEAD against origin/main +// GATE_BASE_REF= node scripts/check-gate-integrity.mjs +// GATE_HAS_OVERRIDE=true ... # honor the gate-change label +// +// Pure helpers + evaluateGate() are exported for unit testing; the git plumbing only +// runs when invoked as a CLI. + +import { execFileSync } from "node:child_process"; +import { pathToFileURL } from "node:url"; + +// --- pure parsers --------------------------------------------------------------- + +/** + * Extract the balanced `{ ... }` block that follows `key` (e.g. `coverage:`). + * Brace-counting, so it survives nested blocks like `coverage.thresholds`. Returns + * the block text incl. its braces, or null. Scoping to the coverage block is what + * stops the `exclude:`/`thresholds:` parsers from matching a *preceding* `test.*`. + */ +export function extractBlock(src, key) { + if (!src) return null; + const keyAt = src.indexOf(key); + if (keyAt === -1) return null; + const open = src.indexOf("{", keyAt); + if (open === -1) return null; + let depth = 0; + for (let i = open; i < src.length; i++) { + if (src[i] === "{") depth++; + else if (src[i] === "}" && --depth === 0) return src.slice(open, i + 1); + } + return null; +} + +/** @returns {{statements:number,branches:number,functions:number,lines:number}|null} */ +export function parseCoverageThresholds(src) { + const cov = extractBlock(src, "coverage:"); + if (!cov) return null; + const block = cov.match(/thresholds:\s*\{([\s\S]*?)\}/); + if (!block) return null; + const pick = (key) => { + const m = block[1].match(new RegExp(`${key}:\\s*(\\d+(?:\\.\\d+)?)`)); + return m ? Number(m[1]) : null; + }; + const out = { + statements: pick("statements"), + branches: pick("branches"), + functions: pick("functions"), + lines: pick("lines"), + }; + return Object.values(out).every((v) => v !== null) ? out : null; +} + +/** Number of entries in the **coverage** `exclude: [...]` array (not `test.exclude`). */ +export function parseCoverageExcludeCount(src) { + const cov = extractBlock(src, "coverage:"); + if (!cov) return null; + const m = cov.match(/exclude:\s*\[([\s\S]*?)\]/); + if (!m) return null; + const entries = m[1].match(/['"`][^'"`]*['"`]/g); + return entries ? entries.length : 0; +} + +/** @returns {{entryJsGzipKB:number,totalJsGzipKB:number,cssGzipKB:number}|null} */ +export function parseBundleBudgets(src) { + if (!src) return null; + const pick = (key) => { + const m = src.match(new RegExp(`${key}:\\s*(\\d+(?:\\.\\d+)?)`)); + return m ? Number(m[1]) : null; + }; + const out = { + entryJsGzipKB: pick("entryJsGzipKB"), + totalJsGzipKB: pick("totalJsGzipKB"), + cssGzipKB: pick("cssGzipKB"), + }; + return Object.values(out).every((v) => v !== null) ? out : null; +} + +/** The CI branch of `retries: process.env.CI ? N : M` (falls back to a bare `retries: N`). */ +export function parsePlaywrightCiRetries(src) { + if (!src) return null; + const ternary = src.match(/retries:\s*process\.env\.CI\s*\?\s*(\d+)\s*:\s*\d+/); + if (ternary) return Number(ternary[1]); + const bare = src.match(/retries:\s*(\d+)/); + return bare ? Number(bare[1]) : null; +} + +const SKIP_MARKER_RE = + /(?:\b(?:it|test|describe)\.(?:skip|only|todo)\b)|(?:\b(?:xit|xdescribe|xtest)\b)/g; + +/** Count of `.skip`/`.only`/`.todo` and `xit`/`xdescribe` test-disabling markers. */ +export function countSkipMarkers(src) { + if (!src) return 0; + return (src.match(SKIP_MARKER_RE) || []).length; +} + +// Strip line and block comments so commenting code out is not a way to hide an assertion. +function stripComments(src) { + return src.replace(/\/\*[\s\S]*?\*\//g, "").replace(/\/\/[^\n]*/g, ""); +} + +/** Count of `expect(` assertion calls, ignoring commented-out ones. */ +export function countAssertions(src) { + if (!src) return 0; + return (stripComments(src).match(/\bexpect\s*\(/g) || []).length; +} + +const MACHINERY_PATHS = new Set(["scripts/check-gate-integrity.mjs", ".claude/agents/fixer.md"]); + +/** + * Is `path` part of the gate's own enforcement machinery (any edit needs the label)? + * Note `scripts/check-bundle.mjs` is deliberately NOT here: it is a *checked* file + * guarded directionally (a raised budget is flagged, a lowered one allowed), exactly + * like `vite.config.ts` — so tightening a budget doesn't require the gate-change label. + */ +export function isMachineryPath(path) { + return MACHINERY_PATHS.has(path) || path.startsWith(".github/workflows/"); +} + +// --- pure evaluation ------------------------------------------------------------ + +/** + * Decide whether a change weakens the verification. + * @param {{ + * coverage: {base: string|null, head: string|null}, + * bundle: {base: string|null, head: string|null}, + * retries: {base: string|null, head: string|null}, + * modifiedTests: Array<{path: string, base: string|null, head: string|null}>, + * deletedTests: string[], + * machineryTouched: string[], + * hasOverrideLabel: boolean, + * }} input + * @returns {{ok: boolean, overridden: boolean, violations: Array<{code: string, file: string, detail: string}>}} + */ +export function evaluateGate(input) { + const violations = []; + + // (a) coverage thresholds must not drop; the exclude list must not grow. + const baseCov = parseCoverageThresholds(input.coverage?.base); + const headCov = parseCoverageThresholds(input.coverage?.head); + if (baseCov && !headCov) { + // Removing the thresholds block (or any key) disables Vitest coverage + // enforcement entirely — the exact weakening this gate exists to catch. + violations.push({ + code: "coverage-threshold-lowered", + file: "vite.config.ts", + detail: "coverage thresholds block removed or no longer parseable", + }); + } else if (baseCov && headCov) { + for (const key of Object.keys(baseCov)) { + if (headCov[key] < baseCov[key]) { + violations.push({ + code: "coverage-threshold-lowered", + file: "vite.config.ts", + detail: `${key}: ${baseCov[key]} → ${headCov[key]}`, + }); + } + } + } + const baseExcl = parseCoverageExcludeCount(input.coverage?.base); + const headExcl = parseCoverageExcludeCount(input.coverage?.head); + if (baseExcl !== null && headExcl !== null && headExcl > baseExcl) { + violations.push({ + code: "coverage-exclude-grown", + file: "vite.config.ts", + detail: `coverage exclude entries: ${baseExcl} → ${headExcl}`, + }); + } + + // (b) bundle budgets must not rise. + const baseBudget = parseBundleBudgets(input.bundle?.base); + const headBudget = parseBundleBudgets(input.bundle?.head); + if (baseBudget && !headBudget) { + violations.push({ + code: "bundle-budget-raised", + file: "scripts/check-bundle.mjs", + detail: "bundle budget block removed or no longer parseable", + }); + } else if (baseBudget && headBudget) { + for (const key of Object.keys(baseBudget)) { + if (headBudget[key] > baseBudget[key]) { + violations.push({ + code: "bundle-budget-raised", + file: "scripts/check-bundle.mjs", + detail: `${key}: ${baseBudget[key]} → ${headBudget[key]}`, + }); + } + } + } + + // (c) e2e retries must not rise (a higher retry count masks new flakes). + const baseRetries = parsePlaywrightCiRetries(input.retries?.base); + const headRetries = parsePlaywrightCiRetries(input.retries?.head); + if (baseRetries !== null && headRetries !== null && headRetries > baseRetries) { + violations.push({ + code: "e2e-retries-raised", + file: "playwright.config.ts", + detail: `CI retries: ${baseRetries} → ${headRetries}`, + }); + } + + // (d) tests must not be deleted, disabled, or stripped of assertions. + for (const path of input.deletedTests || []) { + violations.push({ code: "test-file-deleted", file: path, detail: "test file removed" }); + } + for (const t of input.modifiedTests || []) { + // Only an EXISTING test can be weakened; a brand-new file (no base) is new + // coverage, and its skip markers / assertion count have nothing to compare against. + if (t.base == null) continue; + if (countSkipMarkers(t.head) > countSkipMarkers(t.base)) { + violations.push({ + code: "test-skip-added", + file: t.path, + detail: "a skip/only/todo marker was added", + }); + } + if (countAssertions(t.head) < countAssertions(t.base)) { + violations.push({ + code: "test-assertions-removed", + file: t.path, + detail: `expect() calls: ${countAssertions(t.base)} → ${countAssertions(t.head)}`, + }); + } + } + + // (e) the gate machinery itself must not be edited (direction is undefinable here). + for (const path of input.machineryTouched || []) { + violations.push({ + code: "gate-machinery-modified", + file: path, + detail: "gate/CI machinery changed", + }); + } + + const overridden = violations.length > 0 && !!input.hasOverrideLabel; + return { ok: violations.length === 0 || overridden, overridden, violations }; +} + +// --- git plumbing (CLI only) ---------------------------------------------------- + +const TEST_FILE_RE = /\.(test|spec)\.(ts|tsx|mjs|js)$/; + +function git(args) { + return execFileSync("git", args, { encoding: "utf8" }); +} + +function showAtRef(ref, path) { + try { + return execFileSync("git", ["show", `${ref}:${path}`], { + encoding: "utf8", + stdio: ["ignore", "pipe", "ignore"], + }); + } catch { + return null; // file absent at that ref + } +} + +function main() { + const baseRef = process.env.GATE_BASE_REF || "origin/main"; + const hasOverrideLabel = process.env.GATE_HAS_OVERRIDE === "true"; + + // `...HEAD` diffs against the merge base, so unrelated base movement is ignored. + const range = `${baseRef}...HEAD`; + const nameStatus = git(["diff", "--name-status", range]).trim(); + + const deletedTests = []; + const modifiedTestPaths = []; + const machineryTouched = []; + for (const line of nameStatus ? nameStatus.split("\n") : []) { + const [status, ...rest] = line.split(/\s+/); + const path = rest[rest.length - 1]; + if (!path) continue; + if (isMachineryPath(path)) machineryTouched.push(path); + if (TEST_FILE_RE.test(path)) { + // Deleted tests are always a violation. Only in-place MODIFY can weaken an + // existing test; added (A) files are new coverage, and a rename that also guts a + // test shows up as a separate D+A (the D trips test-file-deleted). + if (status.startsWith("D")) deletedTests.push(path); + else if (status.startsWith("M")) modifiedTestPaths.push(path); + } + } + + const mergeBase = git(["merge-base", baseRef, "HEAD"]).trim(); + const pair = (path) => ({ base: showAtRef(mergeBase, path), head: showAtRef("HEAD", path) }); + const modifiedTests = modifiedTestPaths.map((path) => ({ path, ...pair(path) })); + + const result = evaluateGate({ + coverage: pair("vite.config.ts"), + bundle: pair("scripts/check-bundle.mjs"), + retries: pair("playwright.config.ts"), + modifiedTests, + deletedTests, + machineryTouched, + hasOverrideLabel, + }); + + if (result.violations.length === 0) { + console.log("✓ Gate integrity: no verification weakened."); + process.exit(0); + } + + const lines = result.violations.map((v) => ` ✖ [${v.code}] ${v.file} — ${v.detail}`); + if (result.overridden) { + console.log( + "⚠ Gate integrity: the following weakenings were ALLOWED by the `gate-change` label:", + ); + console.log(lines.join("\n")); + console.log("\nProceeding (override present). This is recorded for audit."); + process.exit(0); + } + + console.error("✖ Gate integrity FAILED — the verification itself was weakened:"); + console.error(lines.join("\n")); + console.error( + "\nFix the code so the existing checks pass, instead of weakening them. If the change is a" + + "\ndeliberate, reviewed decision, a maintainer can add the `gate-change` label to the PR.", + ); + process.exit(1); +} + +const invokedDirectly = process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href; +if (invokedDirectly) main(); diff --git a/scripts/check-gate-integrity.test.mjs b/scripts/check-gate-integrity.test.mjs new file mode 100644 index 0000000..f48105c --- /dev/null +++ b/scripts/check-gate-integrity.test.mjs @@ -0,0 +1,332 @@ +import { describe, expect, it } from "vitest"; +import { + countAssertions, + countSkipMarkers, + evaluateGate, + isMachineryPath, + parseBundleBudgets, + parseCoverageExcludeCount, + parseCoverageThresholds, + parsePlaywrightCiRetries, +} from "./check-gate-integrity.mjs"; + +// Mirrors the real vite.config.ts layout: a top-level `test.exclude` (3 entries) +// appears BEFORE the `coverage` block's own `exclude` (5 entries). The parsers must +// read the coverage block's values, not the first `exclude:`/`thresholds:` they see. +const VITE_CONFIG_SRC = ` + test: { + include: ["src/**/*.{test,spec}.{ts,tsx}"], + exclude: ["tests/e2e/**", "node_modules/**", "dist/**"], + coverage: { + provider: "v8", + include: ["src/**/*.{ts,tsx}"], + exclude: [ + "src/**/*.test.{ts,tsx}", + "src/test/**", + "src/main.tsx", + "src/types.ts", + "src/vite-env.d.ts", + ], + thresholds: { + statements: 60, + branches: 45, + functions: 55, + lines: 60, + }, + }, + }, +`; + +const COVERAGE_SRC = ` + coverage: { + provider: "v8", + include: ["src/**/*.{ts,tsx}"], + exclude: [ + "src/**/*.test.{ts,tsx}", + "src/test/**", + "src/main.tsx", + "src/types.ts", + "src/vite-env.d.ts", + ], + thresholds: { + statements: 60, + branches: 45, + functions: 55, + lines: 60, + }, + }, +`; + +const BUNDLE_SRC = ` +const BUDGET = { + entryJsGzipKB: 280, + totalJsGzipKB: 1950, + cssGzipKB: 35, +}; +`; + +const PLAYWRIGHT_SRC = ` + retries: process.env.CI ? 2 : 0, +`; + +describe("parsers", () => { + it("parses coverage thresholds", () => { + expect(parseCoverageThresholds(COVERAGE_SRC)).toEqual({ + statements: 60, + branches: 45, + functions: 55, + lines: 60, + }); + }); + + it("counts coverage exclude entries", () => { + expect(parseCoverageExcludeCount(COVERAGE_SRC)).toBe(5); + }); + + it("parses bundle budgets", () => { + expect(parseBundleBudgets(BUNDLE_SRC)).toEqual({ + entryJsGzipKB: 280, + totalJsGzipKB: 1950, + cssGzipKB: 35, + }); + }); + + it("parses the CI branch of playwright retries", () => { + expect(parsePlaywrightCiRetries(PLAYWRIGHT_SRC)).toBe(2); + }); + + it("counts skip/only/todo markers", () => { + const src = `test.skip("a", () => {}); it.only("b", () => {}); xit("c", () => {}); describe.todo("d");`; + expect(countSkipMarkers(src)).toBe(4); + }); + + it("counts expect() assertions", () => { + expect(countAssertions(`expect(a).toBe(1); expect(b).toEqual(2);`)).toBe(2); + }); + + it("ignores commented-out assertions", () => { + expect(countAssertions(`expect(a).toBe(1); // expect(b).toBe(2);`)).toBe(1); + expect(countAssertions(`expect(a).toBe(1); /* expect(b).toBe(2); */`)).toBe(1); + }); + + it("parses the coverage block's exclude, not a preceding test.exclude", () => { + // test.exclude (3 entries) precedes coverage.exclude (5 entries) in the real file. + expect(parseCoverageExcludeCount(VITE_CONFIG_SRC)).toBe(5); + }); + + it("parses coverage thresholds even with a preceding test block", () => { + expect(parseCoverageThresholds(VITE_CONFIG_SRC)).toEqual({ + statements: 60, + branches: 45, + functions: 55, + lines: 60, + }); + }); + + it("classifies gate machinery paths", () => { + expect(isMachineryPath("scripts/check-gate-integrity.mjs")).toBe(true); + expect(isMachineryPath(".github/workflows/ci.yml")).toBe(true); + expect(isMachineryPath(".claude/agents/fixer.md")).toBe(true); + // check-bundle.mjs is a CHECKED file (guarded directionally), not machinery — + // so lowering a budget there does not require the gate-change label. + expect(isMachineryPath("scripts/check-bundle.mjs")).toBe(false); + expect(isMachineryPath("src/App.tsx")).toBe(false); + }); +}); + +describe("evaluateGate", () => { + const clean = { + coverage: { base: COVERAGE_SRC, head: COVERAGE_SRC }, + bundle: { base: BUNDLE_SRC, head: BUNDLE_SRC }, + retries: { base: PLAYWRIGHT_SRC, head: PLAYWRIGHT_SRC }, + modifiedTests: [], + deletedTests: [], + machineryTouched: [], + hasOverrideLabel: false, + }; + + it("passes an unrelated change", () => { + const result = evaluateGate(clean); + expect(result.ok).toBe(true); + expect(result.violations).toEqual([]); + }); + + it("flags a lowered coverage threshold", () => { + const result = evaluateGate({ + ...clean, + coverage: { + base: COVERAGE_SRC, + head: COVERAGE_SRC.replace("statements: 60", "statements: 50"), + }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered"); + }); + + it("flags removing the whole coverage thresholds block (disables enforcement)", () => { + const noThresholds = COVERAGE_SRC.replace(/thresholds:\s*\{[\s\S]*?\},/, ""); + const result = evaluateGate({ + ...clean, + coverage: { base: COVERAGE_SRC, head: noThresholds }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered"); + }); + + it("flags removing a single coverage threshold key", () => { + const noBranches = COVERAGE_SRC.replace("branches: 45,\n", ""); + const result = evaluateGate({ + ...clean, + coverage: { base: COVERAGE_SRC, head: noBranches }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("coverage-threshold-lowered"); + }); + + it("flags removing the bundle budget block", () => { + const result = evaluateGate({ + ...clean, + bundle: { base: BUNDLE_SRC, head: "const BUDGET = {};\n" }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("bundle-budget-raised"); + }); + + it("allows raising a coverage threshold", () => { + const result = evaluateGate({ + ...clean, + coverage: { + base: COVERAGE_SRC, + head: COVERAGE_SRC.replace("statements: 60", "statements: 70"), + }, + }); + expect(result.ok).toBe(true); + }); + + it("flags a grown coverage exclude list", () => { + const grown = COVERAGE_SRC.replace( + `"src/vite-env.d.ts",`, + `"src/vite-env.d.ts",\n "src/big-untested.ts",`, + ); + const result = evaluateGate({ ...clean, coverage: { base: COVERAGE_SRC, head: grown } }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("coverage-exclude-grown"); + }); + + it("flags a raised bundle budget", () => { + const result = evaluateGate({ + ...clean, + bundle: { + base: BUNDLE_SRC, + head: BUNDLE_SRC.replace("entryJsGzipKB: 280", "entryJsGzipKB: 400"), + }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("bundle-budget-raised"); + }); + + it("allows lowering a bundle budget", () => { + const result = evaluateGate({ + ...clean, + bundle: { + base: BUNDLE_SRC, + head: BUNDLE_SRC.replace("entryJsGzipKB: 280", "entryJsGzipKB: 250"), + }, + }); + expect(result.ok).toBe(true); + }); + + it("flags raised e2e retries", () => { + const result = evaluateGate({ + ...clean, + retries: { base: PLAYWRIGHT_SRC, head: PLAYWRIGHT_SRC.replace("? 2", "? 5") }, + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("e2e-retries-raised"); + }); + + it("flags a deleted test file", () => { + const result = evaluateGate({ ...clean, deletedTests: ["src/foo.test.ts"] }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("test-file-deleted"); + }); + + it("flags an added skip marker", () => { + const result = evaluateGate({ + ...clean, + modifiedTests: [ + { + path: "src/a.test.ts", + base: `it("x", () => { expect(1).toBe(1); });`, + head: `it.skip("x", () => { expect(1).toBe(1); });`, + }, + ], + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("test-skip-added"); + }); + + it("flags removed assertions", () => { + const result = evaluateGate({ + ...clean, + modifiedTests: [ + { + path: "src/a.test.ts", + base: `expect(1).toBe(1); expect(2).toBe(2);`, + head: `expect(1).toBe(1);`, + }, + ], + }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("test-assertions-removed"); + }); + + it("allows adding assertions", () => { + const result = evaluateGate({ + ...clean, + modifiedTests: [ + { + path: "src/a.test.ts", + base: `expect(1).toBe(1);`, + head: `expect(1).toBe(1); expect(2).toBe(2);`, + }, + ], + }); + expect(result.ok).toBe(true); + }); + + it("does not flag a brand-new test file that contains skip markers (no base to weaken)", () => { + const result = evaluateGate({ + ...clean, + modifiedTests: [ + { + path: "scripts/gate.test.mjs", + base: null, + head: `it.skip("x", () => {}); xit("y", () => {});`, + }, + ], + }); + expect(result.ok).toBe(true); + }); + + it("flags edits to the gate machinery", () => { + const result = evaluateGate({ ...clean, machineryTouched: [".github/workflows/ci.yml"] }); + expect(result.ok).toBe(false); + expect(result.violations.map((v) => v.code)).toContain("gate-machinery-modified"); + }); + + it("allows any violation when the override label is present, marking it overridden", () => { + const result = evaluateGate({ + ...clean, + coverage: { + base: COVERAGE_SRC, + head: COVERAGE_SRC.replace("statements: 60", "statements: 50"), + }, + machineryTouched: [".github/workflows/ci.yml"], + hasOverrideLabel: true, + }); + expect(result.ok).toBe(true); + expect(result.overridden).toBe(true); + expect(result.violations.length).toBeGreaterThan(0); + }); +}); diff --git a/vite.config.ts b/vite.config.ts index ff30375..b184ea4 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -11,7 +11,11 @@ export default defineConfig(({ mode }) => { return { plugins: [react(), agentSidecarPlugin(), webApiPlugin()], test: { - include: ["src/**/*.{test,spec}.{ts,tsx}", "server/**/*.{test,spec}.ts"], + include: [ + "src/**/*.{test,spec}.{ts,tsx}", + "server/**/*.{test,spec}.ts", + "scripts/**/*.{test,spec}.mjs", + ], exclude: ["tests/e2e/**", "node_modules/**", "dist/**"], environment: "jsdom", setupFiles: ["./src/test/setup.ts"],