From ba4dcfcd7e0e2bd8d96e5b0ca47240ff07e625d5 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 14 Jun 2026 16:20:19 +0300 Subject: [PATCH 1/3] feat(security): three-layer gitleaks secret-scan guard (#218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add defense-in-depth secret scanning with gitleaks (the industry tool, not ad-hoc regex), the structural follow-up to the seed-time Google CSE key leak. A repo-root .gitleaks.toml (default ruleset + a no-entropy AIza Google API-key rule + an allowlist for bulk news-candidate data and captured-page test fixtures) drives all three guards: 1. pre-commit gitleaks hook at stages: [pre-push] — the general git guard. 2. scripts/state-run.sh scan_state_for_secrets() — scans the state worktree before every commit/push and fails closed when gitleaks is absent (STATE_RUN_SKIP_SECRET_SCAN=1 is the discouraged escape hatch). 3. Claude Code PreToolUse/Bash hook (.claude/settings.json -> scripts/hooks/gitleaks_prepush_guard.py) blocking any agent-issued git push when gitleaks finds secrets in tracked content. Documented under AGENTS.md "Secret Scanning". gitleaks-gated integration tests skip when the binary is absent; squash/coexistence mechanics tests opt out via STATE_RUN_SKIP_SECRET_SCAN=1. .gitignore now lets the shared .claude/settings.json through while keeping settings.local.json and worktrees ignored. Satisfies gate (a) of UNIFY-PR-06. 
Co-Authored-By: Claude Opus 4.8 --- .agent-plan.md | 24 ++++++-- .claude/settings.json | 17 ++++++ .gitignore | 6 +- .gitleaks.toml | 36 ++++++++++++ .pre-commit-config.yaml | 8 +++ AGENTS.md | 26 +++++++++ scripts/hooks/gitleaks_prepush_guard.py | 66 ++++++++++++++++++++++ scripts/state-run.sh | 39 +++++++++++++ tests/integration/test_state_run.py | 74 +++++++++++++++++++++++++ tests/integration/test_state_squash.py | 3 + 10 files changed, 292 insertions(+), 7 deletions(-) create mode 100644 .claude/settings.json create mode 100644 .gitleaks.toml create mode 100755 scripts/hooks/gitleaks_prepush_guard.py diff --git a/.agent-plan.md b/.agent-plan.md index d3fa467..850424e 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -17,8 +17,8 @@ - Next planned PR: `UNIFY-PR-06` (operational, go-live) — re-enable only the non-scraping workflows on schedules (discover ≥daily for the backstop, daily-review, monthly-report, release, backup, squash); the scraping ingest / backfill-scrape jobs stay local since GitHub never scrapes. - Deferred until **(a)** the state-push secret-scan guard (issue #218) is in place and **(b)** a - manual dispatch verifies the seeded state end to end — both prompted by the incident below. The + Deferred until a manual dispatch verifies the seeded state end to end — prompted by the incident + below. The state-push secret-scan guard (issue #218) has landed (`GUARD-PR-SECRET-SCAN`). The state repo `DataHackIL/tfht_enforce_idx_state` is **seeded** from local `data/news_items` (27,568 candidates + queues/attempts/verdicts/budget/yield + backfill_batches/runs/metrics, recovered from orphaned `.jsonl.gz` to plain JSONL; excluded: prefilter models + decision-log @@ -331,13 +331,27 @@ defers to a recent local search regardless of clock ordering). A zero-run day now finishes non-fatal. Covered by ledger + config + discover-job tests. 
+- [done] `GUARD-PR-SECRET-SCAN` (closes #218): three-layer [gitleaks](https://github.com/gitleaks/gitleaks) + secret-scan guard (the industry tool, not ad-hoc regex), the structural follow-up to the seed-time + leak incident below. A repo-root `.gitleaks.toml` (default ruleset + a no-entropy `AIza` Google + API-key rule + an allowlist for bulk news-candidate data and captured-page test fixtures) drives + all three: **(1)** a `pre-commit` `gitleaks` hook at `stages: [pre-push]` (the general git + guard); **(2)** a `scripts/state-run.sh` `scan_state_for_secrets()` that scans the state worktree + before every commit/push and **fails closed** when gitleaks is absent (`STATE_RUN_SKIP_SECRET_SCAN=1` + is the discouraged escape hatch); **(3)** a Claude Code `PreToolUse`/`Bash` hook + (`.claude/settings.json` → `scripts/hooks/gitleaks_prepush_guard.py`) that blocks any agent-issued + `git push` when gitleaks finds secrets in tracked content. Documented under AGENTS.md “Secret + Scanning”. gitleaks-gated integration tests skip when the binary is absent; squash/coexistence + mechanics tests opt out via `STATE_RUN_SKIP_SECRET_SCAN=1`. This satisfies gate (a) of + `UNIFY-PR-06`. - [next] `UNIFY-PR-06` (operational, go-live): the state repo is **seeded** (key-scrubbed after the incident below) with the recovered core state from local `data/news_items` (27,568 candidates + queues/attempts/verdicts/budget/yield + backfill_batches/runs/metrics; excluded: prefilter models + decision logs, engine_query_cache, and the 119 MB candidate_provenance which exceeds GitHub's - 100 MB file limit). Remaining: re-enable only the non-scraping workflows on schedules (discover - ≥daily, daily-review, monthly-report, release, backup, squash) — gated on the state-push - secret-scan guard (issue #218) plus a manual dispatch verifying the seeded state end to end. + 100 MB file limit). The state-push secret-scan guard (`GUARD-PR-SECRET-SCAN`, #218) is now in + place. 
Remaining: re-enable only the non-scraping workflows on schedules (discover ≥daily, + daily-review, monthly-report, release, backup, squash) — gated only on a manual dispatch + verifying the seeded state end to end. Scraping ingest / backfill-scrape stay local. Seed-time incident: a live Google CSE key was captured into a run's `errors[]` from a CSE-403 URL and seeded to the public repo; it was rotated, the public history was purged, and redaction landed (#217). (Parked scrape→classify decouple: diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..58f86ed --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,17 @@ +{ + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": "python3 \"$CLAUDE_PROJECT_DIR/scripts/hooks/gitleaks_prepush_guard.py\"", + "if": "Bash(git push:*)", + "statusMessage": "Secret-scanning before push (gitleaks)…" + } + ] + } + ] + } +} diff --git a/.gitignore b/.gitignore index 2a680be..1d0206c 100644 --- a/.gitignore +++ b/.gitignore @@ -211,8 +211,10 @@ __marimo__/ # Local agent overrides LOCAL_AGENTS.md -# Claude Code -.claude/ +# Claude Code — ignore everything under .claude/ except the shared, checked-in +# settings.json (which carries the team-wide secret-scan pre-push hook). +.claude/* +!.claude/settings.json .DS_Store diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 0000000..b16258b --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,36 @@ +# gitleaks config for denbust — used by the git pre-push hook, the Claude Code +# pre-push hook, and the state-run push guard (scripts/state-run.sh). 
+# +# Strategy: the industry default ruleset, plus a strict (no-entropy) rule for +# Google API keys (the kind that leaked — gitleaks' default rule applies an +# entropy gate that can miss them), plus an allowlist for the bulk news-candidate +# data files, which are article content (URLs/titles/snippets) — scanning them +# only yields false positives, and any secret echoed into an error string is +# already scrubbed at write time by denbust.discovery.redaction. + +title = "denbust" + +[extend] +useDefault = true + +[[rules]] +id = "google-api-key-strict" +description = "Google API key (e.g. Custom Search) — matched without an entropy gate" +regex = '''AIza[0-9A-Za-z\-_]{35}''' +keywords = ["AIza"] + +[allowlist] +description = "Skip news content + test fixtures (third-party tokens, not our secrets)" +paths = [ + # Bulk news-candidate data (article URLs/titles/snippets), scanned only as noise. + '''.*/candidates/latest_candidates\.jsonl$''', + '''.*/candidates/retry_queue\.jsonl$''', + '''.*/candidates/backfill_queue\.jsonl$''', + '''.*/candidates/candidate_provenance\.jsonl$''', + '''.*/candidates/scrape_attempts\.jsonl$''', + '''.*/candidates/triage_decisions\.jsonl$''', + '''.*/candidates/engine_query_cache/.*''', + '''.*/prefilter/.*''', + # Captured-page test fixtures carry third-party ad/analytics tokens (not ours). + '''tests/fixtures/.*''', +] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0a6ca6e..87689e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,14 @@ repos: - id: ruff - id: ruff-format + # Secret scan before push (the "general git" pre-push hook). gitleaks uses the + # repo-root .gitleaks.toml. Runs on pre-push so it gates what leaves the machine. 
+ - repo: https://github.com/gitleaks/gitleaks + rev: v8.30.1 + hooks: + - id: gitleaks + stages: [pre-push] + - repo: local hooks: - id: mypy diff --git a/AGENTS.md b/AGENTS.md index d7fcd27..d6faf3d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -144,6 +144,32 @@ scraping budget. Full protocol: `docs/batch_scraping_protocol.md`. - `DENBUST_*` - Never add secrets to fixtures, examples, docs, or workflow YAML. +## Secret Scanning (defense in depth) + +Secrets are scanned with [gitleaks](https://github.com/gitleaks/gitleaks) (the +industry tool, not ad-hoc regex). Install it before pushing: + +```bash +brew install gitleaks # or: see github.com/gitleaks/gitleaks releases +``` + +Configuration lives in the repo-root `.gitleaks.toml` (default ruleset + a +no-entropy Google API-key rule + an allowlist for bulk news-candidate data and +captured-page test fixtures, which only yield third-party false positives). + +Three independent guards run gitleaks against this config: + +1. **git pre-push hook** (`pre-commit`, `stages: [pre-push]`) — gates what leaves + the machine. Enable with `pre-commit install --install-hooks` (the repo sets + `default_install_hook_types: [pre-commit, pre-push]`). +2. **state-run push guard** (`scripts/state-run.sh`) — scans the state worktree + before each commit/push to the public state repo. It **fails closed**: if + gitleaks is not installed it refuses to push. Override only in an emergency + with `STATE_RUN_SKIP_SECRET_SCAN=1` (not recommended). +3. **Claude Code pre-push hook** (`.claude/settings.json` → `scripts/hooks/gitleaks_prepush_guard.py`) — + a `PreToolUse`/`Bash` hook that blocks any agent-issued `git push` when + gitleaks finds secrets in tracked content. + ## Testing Constraints - No live network calls in tests. 
diff --git a/scripts/hooks/gitleaks_prepush_guard.py b/scripts/hooks/gitleaks_prepush_guard.py new file mode 100755 index 0000000..5789891 --- /dev/null +++ b/scripts/hooks/gitleaks_prepush_guard.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +"""Claude Code PreToolUse hook: secret-scan before a `git push`. + +Reads the PreToolUse event JSON on stdin. If the Bash command is a ``git push``, +it runs gitleaks over the repo's tracked content (using the repo-root +``.gitleaks.toml``) and **blocks** the push (exit code 2) when secrets are found, +printing the redacted findings so Claude can fix them instead of publishing. + +This is the "Claude ability" pre-push guard, complementing the git pre-push hook +(pre-commit) and the `scripts/state-run.sh` push guard. gitleaks is run in *git* +mode, so untracked working-tree files (e.g. local `data/`) are not scanned. + +Exit codes (Claude Code hook protocol): 0 = allow, 2 = block. +""" + +from __future__ import annotations + +import json +import re +import shutil +import subprocess +import sys + +_GIT_PUSH = re.compile(r"\bgit\b(?:\s+-C\s+\S+)?(?:\s+-c\s+\S+)*\s+push\b") + + +def main() -> int: + try: + event = json.load(sys.stdin) + except (json.JSONDecodeError, ValueError): + return 0 # not parseable -> do not interfere + + if event.get("tool_name") != "Bash": + return 0 + command = (event.get("tool_input") or {}).get("command", "") + if not _GIT_PUSH.search(command): + return 0 + + if shutil.which("gitleaks") is None: + print( + "gitleaks-prepush: gitleaks not installed; skipping pre-push secret scan " + "(install: 'brew install gitleaks').", + file=sys.stderr, + ) + return 0 + + scan = subprocess.run( + ["gitleaks", "git", ".", "--no-banner", "--redact"], + capture_output=True, + text=True, + ) + if scan.returncode != 0: + details = (scan.stdout + scan.stderr).strip()[-2000:] + print( + "BLOCKED by gitleaks-prepush: potential secrets detected in tracked content; " + "refusing `git push`. 
Remove/rotate the secret (and purge it from history) " + "before pushing. Findings (redacted):\n" + details, + file=sys.stderr, + ) + return 2 # block the tool call + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/state-run.sh b/scripts/state-run.sh index fcf402a..36761a6 100755 --- a/scripts/state-run.sh +++ b/scripts/state-run.sh @@ -123,6 +123,41 @@ push_state() { done } +# Secret-scan guard: refuse to commit/push state that gitleaks flags. The leak +# that motivated this rode in a discovery error string; redaction scrubs those at +# write time, and this is the defense-in-depth net at the push boundary. Bulk +# news-candidate data is allowlisted in .gitleaks.toml to avoid false positives. +scan_state_for_secrets() { + if [[ "${STATE_RUN_SKIP_SECRET_SCAN:-0}" == "1" ]]; then + echo "state-run: WARNING — secret scan skipped (STATE_RUN_SKIP_SECRET_SCAN=1)." >&2 + return 0 + fi + if ! command -v gitleaks >/dev/null 2>&1; then + echo "state-run: gitleaks not installed — refusing to push state unscanned." >&2 + echo " install it (e.g. 'brew install gitleaks') or, only if you must," >&2 + echo " set STATE_RUN_SKIP_SECRET_SCAN=1 to override (NOT recommended)." >&2 + return 1 + fi + local cfg cfg_arg=() targets=() target + cfg="$(cd "$(dirname "$0")/.." && pwd)/.gitleaks.toml" + [[ -f "$cfg" ]] && cfg_arg=(--config "$cfg") + if [[ ${#subtrees[@]} -gt 0 ]]; then + for target in "${subtrees[@]}"; do + [[ -e "$STATE_REPO_DIR/$target" ]] && targets+=("$STATE_REPO_DIR/$target") + done + else + targets+=("$STATE_REPO_DIR") + fi + for target in "${targets[@]}"; do + if ! gitleaks dir "$target" "${cfg_arg[@]}" --no-banner --redact >/dev/null 2>&1; then + echo "state-run: SECRET DETECTED in state ($target) — refusing to commit/push." >&2 + gitleaks dir "$target" "${cfg_arg[@]}" --no-banner --redact 2>&1 \ + | grep -iE "Finding|File:|RuleID|Line:" | head -40 >&2 || true + return 1 + fi + done +} + acquire_state_repo_lock # 1. 
Bring the state repo to canonical HEAD. @@ -149,6 +184,10 @@ fi if [[ -z "$(git_state diff --cached --name-only)" ]]; then echo "state-run: no state changes to commit." else + if ! scan_state_for_secrets; then + echo "state-run: aborting before commit/push — secret scan failed." >&2 + exit 1 + fi git_state config user.name "${GIT_AUTHOR_NAME:-github-actions[bot]}" git_state config user.email "${GIT_AUTHOR_EMAIL:-41898282+github-actions[bot]@users.noreply.github.com}" git_state commit -m "${message:-chore(state): update ${subtrees[*]:-state}}" diff --git a/tests/integration/test_state_run.py b/tests/integration/test_state_run.py index eb43efd..fe89b21 100644 --- a/tests/integration/test_state_run.py +++ b/tests/integration/test_state_run.py @@ -57,6 +57,7 @@ def _run_wrapper( message: str | None = None, offline: bool = False, no_fetch: bool = False, + scan_secrets: bool = False, ) -> subprocess.CompletedProcess[str]: args: list[str] = ["bash", str(WRAPPER)] for subtree in subtrees: @@ -76,6 +77,10 @@ def _run_wrapper( "GIT_AUTHOR_NAME": "tester", "GIT_AUTHOR_EMAIL": "tester@example.com", } + # The secret-scan guard fails closed when gitleaks is absent; tests that + # exercise wrapper mechanics (not the guard) opt out so they run anywhere. + if not scan_secrets: + env["STATE_RUN_SKIP_SECRET_SCAN"] = "1" return subprocess.run(args, env=env, capture_output=True, text=True) @@ -253,3 +258,72 @@ def test_rejected_push_recovers_via_refetch_rebase(tmp_path: Path) -> None: assert _remote_commit_count(remote) == 3 assert _remote_file(remote, "news_items/discover/other.jsonl") == "other" # not clobbered assert _remote_file(remote, "news_items/discover/mine.jsonl") == "mine" + + +_GITLEAKS = shutil.which("gitleaks") is not None +# A correctly-shaped fake Google key (AIza + 35 chars) — matches the strict rule +# in .gitleaks.toml. No real key is in this file. 
+_FAKE_GOOGLE_KEY = "AIza" + "B1cD3fGh4JkLmN0pQrStUvWxYz123456789" + + +@pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") +def test_secret_in_state_is_blocked_before_push(tmp_path: Path) -> None: + """A secret written into state is caught by gitleaks; nothing is committed/pushed.""" + remote = _make_remote(tmp_path) + result = _run_wrapper( + work=tmp_path / "work", + remote=remote, + scan_secrets=True, + subtrees=["news_items/discover"], + command=[ + "bash", + "-c", + f'echo \'{{"errors":["?key={_FAKE_GOOGLE_KEY}"]}}\' ' + '> "$DENBUST_STATE_ROOT/news_items/discover/run.json"', + ], + ) + assert result.returncode != 0 + assert "SECRET DETECTED" in result.stderr + assert _remote_commit_count(remote) == 1 # push blocked + assert _FAKE_GOOGLE_KEY not in result.stdout + result.stderr # not echoed (redacted) + + +@pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") +def test_clean_state_passes_the_secret_scan(tmp_path: Path) -> None: + """Clean state passes the scan and is pushed normally.""" + remote = _make_remote(tmp_path) + result = _run_wrapper( + work=tmp_path / "work", + remote=remote, + scan_secrets=True, + subtrees=["news_items/discover"], + command=[ + "bash", + "-c", + 'echo \'{"status":"ok","errors":[]}\' ' + '> "$DENBUST_STATE_ROOT/news_items/discover/run.json"', + ], + ) + assert result.returncode == 0, result.stderr + assert _remote_commit_count(remote) == 2 # clean push succeeded + + +@pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") +def test_bulk_candidate_data_is_not_false_flagged(tmp_path: Path) -> None: + """News candidate data (allowlisted) must not trip the scan even if it looks key-ish.""" + remote = _make_remote(tmp_path) + result = _run_wrapper( + work=tmp_path / "work", + remote=remote, + scan_secrets=True, + subtrees=["news_items/discover"], + command=[ + "bash", + "-c", + 'mkdir -p "$DENBUST_STATE_ROOT/news_items/discover/candidates"; ' + f'echo 
\'{{"url":"https://x.dk/a?key={_FAKE_GOOGLE_KEY}"}}\' ' + '> "$DENBUST_STATE_ROOT/news_items/discover/candidates/latest_candidates.jsonl"', + ], + ) + assert result.returncode == 0, result.stderr # allowlisted path → not flagged + assert _remote_commit_count(remote) == 2 diff --git a/tests/integration/test_state_squash.py b/tests/integration/test_state_squash.py index 1de4638..90a6e52 100644 --- a/tests/integration/test_state_squash.py +++ b/tests/integration/test_state_squash.py @@ -58,6 +58,9 @@ def _env(work: Path, remote: Path) -> dict[str, str]: "STATE_REPO_BRANCH": "main", "GIT_AUTHOR_NAME": "tester", "GIT_AUTHOR_EMAIL": "tester@example.com", + # These tests exercise squash/coexistence mechanics, not the secret-scan + # guard (which fails closed without gitleaks); opt out so they run anywhere. + "STATE_RUN_SKIP_SECRET_SCAN": "1", } From 1605b48944ed2f2689136be12ad50f52b4ff42b3 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 14 Jun 2026 16:22:47 +0300 Subject: [PATCH 2/3] docs(plan): record PR #220 for the secret-scan guard Co-Authored-By: Claude Opus 4.8 --- .agent-plan.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 850424e..4c8db1e 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -6,14 +6,15 @@ ## Mainline Status -- Last merged PR on main: `#217` — secret redaction in persisted discovery state, the root-cause - fix for the seed-time secret-leak incident (below). `redact_secrets()` strips the literal values - of credential-named env vars (`DENBUST_*`/`ANTHROPIC_API_KEY`/Supabase/object-store/Kaggle/HF — - primary, format-agnostic) plus credential shapes (URL key params, JWTs, header tokens, JSON - secret fields, `AIza`/`Bearer`/`sk-` — backstop) from every discovery error string and from the - run/backfill-batch/metrics snapshot writers, so an API error that echoes a key never reaches - state. Threat-model tested across the project's secret types. 
This is the last step of the - search-backstop code (`UNIFY-PR-05`) plus the incident fix. +- Last merged PR on main: `#220` (`GUARD-PR-SECRET-SCAN`, closes #218) — the three-layer + [gitleaks](https://github.com/gitleaks/gitleaks) secret-scan guard (the outer defense following + the seed-time leak incident below): a shared `.gitleaks.toml`, a `pre-commit` pre-push hook, a + fail-closed `scripts/state-run.sh` scan before each state push, and a Claude Code + `PreToolUse`/`Bash` hook that blocks an agent-issued `git push` carrying a secret. Builds on the + root-cause fix `#217`, which made `redact_secrets()` strip credential values (env-var literals — + primary, format-agnostic — plus URL/JWT/header/`AIza`/`Bearer`/`sk-` shapes — backstop) from every + discovery error string and the run/backfill-batch/metrics snapshot writers, so an API error that + echoes a key never reaches state. - Next planned PR: `UNIFY-PR-06` (operational, go-live) — re-enable only the non-scraping workflows on schedules (discover ≥daily for the backstop, daily-review, monthly-report, release, backup, squash); the scraping ingest / backfill-scrape jobs stay local since GitHub never scrapes. @@ -331,7 +332,7 @@ defers to a recent local search regardless of clock ordering). A zero-run day now finishes non-fatal. Covered by ledger + config + discover-job tests. -- [done] `GUARD-PR-SECRET-SCAN` (closes #218): three-layer [gitleaks](https://github.com/gitleaks/gitleaks) +- [done] `GUARD-PR-SECRET-SCAN` (#220, closes #218): three-layer [gitleaks](https://github.com/gitleaks/gitleaks) secret-scan guard (the industry tool, not ad-hoc regex), the structural follow-up to the seed-time leak incident below. 
A repo-root `.gitleaks.toml` (default ruleset + a no-entropy `AIza` Google API-key rule + an allowlist for bulk news-candidate data and captured-page test fixtures) drives From 38065e3cda5d4b000a949a9850ebbd4f13b891e4 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Mon, 15 Jun 2026 14:22:54 +0300 Subject: [PATCH 3/3] fix(security): harden secret-scan guard after self-review (#220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review findings that the guard had a hole exactly at the incident's location and was under-tested: - .gitleaks.toml: replace the blanket path allowlist (which skipped ALL rules on candidate data — blinding the scanner where the leaked key actually rode in) with narrow per-rule allowlists. Only generic-api-key is suppressed on candidate paths (and jwt on captured fixtures); the strict Google rule, jwt, and all provider rules stay active there, so a real key/JWT in the candidate stream is still caught. - state-run.sh: fail closed when .gitleaks.toml is missing (no silent fallback to gitleaks defaults, which miss the low-entropy Google key class); scan the staged diff via `gitleaks git --staged` (not the whole working tree, so it no longer blocks on pre-existing secrets in unchanged files); single run via a JSON report; distinguish leaks from operational errors; resolve the config via BASH_SOURCE. - gitleaks_prepush_guard.py: shlex-based push detection so it no longer fails open on `git --no-pager push`, env-prefixed, or reordered-flag forms; scan the `-C <dir>` target rather than a hardcoded `.`; pass an explicit script-relative --config; distinguish leaks from gitleaks errors. - ci-test.yml: install a pinned gitleaks in the integration-tests job so the guard's blocking tests run in CI instead of silently skipping. - tests: the old test asserted a Google key in candidate data passes the scan — i.e. it encoded the hole. 
Replace it with a generic-noise test, and add a regression test that a real Google key AND a JWT in candidate data ARE blocked. Co-Authored-By: Claude Opus 4.8 --- .agent-plan.md | 24 ++-- .github/workflows/ci-test.yml | 9 ++ .gitleaks.toml | 33 +++-- AGENTS.md | 40 +++--- scripts/hooks/gitleaks_prepush_guard.py | 161 ++++++++++++++++++++---- scripts/state-run.sh | 58 ++++++--- tests/integration/test_state_run.py | 43 ++++++- 7 files changed, 286 insertions(+), 82 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 4c8db1e..fca4306 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -332,19 +332,21 @@ defers to a recent local search regardless of clock ordering). A zero-run day now finishes non-fatal. Covered by ledger + config + discover-job tests. -- [done] `GUARD-PR-SECRET-SCAN` (#220, closes #218): three-layer [gitleaks](https://github.com/gitleaks/gitleaks) +- [done] `GUARD-PR-SECRET-SCAN` (#220, closes #218): three-surface [gitleaks](https://github.com/gitleaks/gitleaks) secret-scan guard (the industry tool, not ad-hoc regex), the structural follow-up to the seed-time leak incident below. A repo-root `.gitleaks.toml` (default ruleset + a no-entropy `AIza` Google - API-key rule + an allowlist for bulk news-candidate data and captured-page test fixtures) drives - all three: **(1)** a `pre-commit` `gitleaks` hook at `stages: [pre-push]` (the general git - guard); **(2)** a `scripts/state-run.sh` `scan_state_for_secrets()` that scans the state worktree - before every commit/push and **fails closed** when gitleaks is absent (`STATE_RUN_SKIP_SECRET_SCAN=1` - is the discouraged escape hatch); **(3)** a Claude Code `PreToolUse`/`Bash` hook - (`.claude/settings.json` → `scripts/hooks/gitleaks_prepush_guard.py`) that blocks any agent-issued - `git push` when gitleaks finds secrets in tracked content. Documented under AGENTS.md “Secret - Scanning”. 
gitleaks-gated integration tests skip when the binary is absent; squash/coexistence - mechanics tests opt out via `STATE_RUN_SKIP_SECRET_SCAN=1`. This satisfies gate (a) of - `UNIFY-PR-06`. + rule) with a **narrow, per-rule allowlist**: because the leak rode in *inside* the candidate-data + JSONL, those paths are not blanket-skipped — only the catch-all `generic-api-key` rule is suppressed + there (and `jwt` too on captured fixtures), so a real key or JWT in the candidate stream is still + caught. Three guards: **(1)** the enforced `scripts/state-run.sh` `scan_state_for_secrets()`, + which scans the **staged** diff before every commit/push, **fails closed** when gitleaks or the + config is absent (`STATE_RUN_SKIP_SECRET_SCAN=1` is the discouraged escape hatch), and distinguishes + leaks from operational errors; **(2)** a best-effort `pre-commit` `gitleaks` hook at + `stages: [pre-push]`; **(3)** a best-effort Claude Code `PreToolUse`/`Bash` hook + (`.claude/settings.json` → `scripts/hooks/gitleaks_prepush_guard.py`) with shlex-based push + detection (no fail-open on `--no-pager`/`-c`/`-C` forms), per-`-C` target scanning, and an + explicit script-relative `--config`. CI installs gitleaks so the guard's blocking tests run rather + than skip. Documented under AGENTS.md “Secret Scanning”. This satisfies gate (a) of `UNIFY-PR-06`. 
- [next] `UNIFY-PR-06` (operational, go-live): the state repo is **seeded** (key-scrubbed after the incident below) with the recovered core state from local `data/news_items` (27,568 candidates + queues/attempts/verdicts/budget/yield + backfill_batches/runs/metrics; excluded: prefilter models diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index b7f1b39..e7b405f 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -144,6 +144,15 @@ jobs: python -m pip install --upgrade pip python -m pip install -e ".[dev]" + # Required so the gitleaks-gated secret-scan guard tests actually run in CI + # instead of silently skipping (they assert the guard blocks real keys). + - name: Install gitleaks + run: | + VERSION=8.30.1 + curl -fsSL "https://github.com/gitleaks/gitleaks/releases/download/v${VERSION}/gitleaks_${VERSION}_linux_x64.tar.gz" \ + | sudo tar -xz -C /usr/local/bin gitleaks + gitleaks version + - name: Run integration tests run: pytest -q tests/integration --cov --cov-report= diff --git a/.gitleaks.toml b/.gitleaks.toml index b16258b..297b032 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -3,10 +3,17 @@ # # Strategy: the industry default ruleset, plus a strict (no-entropy) rule for # Google API keys (the kind that leaked — gitleaks' default rule applies an -# entropy gate that can miss them), plus an allowlist for the bulk news-candidate -# data files, which are article content (URLs/titles/snippets) — scanning them -# only yields false positives, and any secret echoed into an error string is -# already scrubbed at write time by denbust.discovery.redaction. +# entropy gate that can miss them). +# +# Allowlisting is deliberately NARROW. The seed-time leak rode in as a Google +# key inside a discovery `errors[]` field — i.e. *inside* the candidate-data +# JSONL. So we must NOT blanket-skip those files: a blanket path allowlist would +# blind the scanner at exactly the incident's location. 
Instead we suppress only +# the catch-all `generic-api-key` rule (which false-positives on news URLs/titles/ +# snippets) on the bulk data paths, while every high-signal provider rule — our +# strict Google rule, `jwt` (the Supabase-JWT incident class), AWS/GitHub/Slack/ +# etc. — stays ACTIVE there. Captured third-party HTML fixtures additionally trip +# `jwt` (foreign ad/analytics tokens, not our secrets), so they suppress both. title = "denbust" @@ -19,10 +26,13 @@ description = "Google API key (e.g. Custom Search) — matched without an entrop regex = '''AIza[0-9A-Za-z\-_]{35}''' keywords = ["AIza"] -[allowlist] -description = "Skip news content + test fixtures (third-party tokens, not our secrets)" +# Bulk news-candidate data (article URLs/titles/snippets). Suppress ONLY the +# generic/entropy catch-all; keep every provider-key rule active so a real key or +# JWT captured into this stream — the incident vector — is still caught. +[[allowlists]] +description = "News-candidate data — suppress generic-api-key noise only" +targetRules = ["generic-api-key"] paths = [ - # Bulk news-candidate data (article URLs/titles/snippets), scanned only as noise. '''.*/candidates/latest_candidates\.jsonl$''', '''.*/candidates/retry_queue\.jsonl$''', '''.*/candidates/backfill_queue\.jsonl$''', @@ -31,6 +41,11 @@ paths = [ '''.*/candidates/triage_decisions\.jsonl$''', '''.*/candidates/engine_query_cache/.*''', '''.*/prefilter/.*''', - # Captured-page test fixtures carry third-party ad/analytics tokens (not ours). - '''tests/fixtures/.*''', ] + +# Captured-page test fixtures carry third-party ad/analytics tokens (not ours) +# that trip both the generic catch-all and the JWT rule. 
+[[allowlists]] +description = "Captured-page test fixtures — third-party tokens, not our secrets" +targetRules = ["generic-api-key", "jwt"] +paths = ['''tests/fixtures/.*'''] diff --git a/AGENTS.md b/AGENTS.md index d6faf3d..0dc2d98 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -153,22 +153,32 @@ industry tool, not ad-hoc regex). Install it before pushing: brew install gitleaks # or: see github.com/gitleaks/gitleaks releases ``` -Configuration lives in the repo-root `.gitleaks.toml` (default ruleset + a -no-entropy Google API-key rule + an allowlist for bulk news-candidate data and -captured-page test fixtures, which only yield third-party false positives). - -Three independent guards run gitleaks against this config: - -1. **git pre-push hook** (`pre-commit`, `stages: [pre-push]`) — gates what leaves - the machine. Enable with `pre-commit install --install-hooks` (the repo sets - `default_install_hook_types: [pre-commit, pre-push]`). -2. **state-run push guard** (`scripts/state-run.sh`) — scans the state worktree - before each commit/push to the public state repo. It **fails closed**: if - gitleaks is not installed it refuses to push. Override only in an emergency - with `STATE_RUN_SKIP_SECRET_SCAN=1` (not recommended). +Configuration lives in the repo-root `.gitleaks.toml`: the default ruleset plus +a no-entropy Google API-key rule. Allowlisting is **deliberately narrow** — +because the seed-time leak rode in *inside* the candidate-data JSONL, those +paths are **not** blanket-skipped. We suppress only the catch-all +`generic-api-key` rule there (it false-positives on news URLs/titles/snippets); +every provider rule (the strict Google rule, `jwt`, AWS/GitHub/Slack/…) stays +active, so a real key or JWT captured into the candidate stream is still caught. + +Three guards run gitleaks against this config: + +1. **state-run push guard** (`scripts/state-run.sh`) — the **enforced** layer + (runs locally *and* in CI). 
Scans the *staged* diff before each commit/push to + the public state repo and **fails closed**: if gitleaks is missing or + `.gitleaks.toml` cannot be found it refuses to push rather than degrade to + gitleaks' weaker defaults. Override only in an emergency with + `STATE_RUN_SKIP_SECRET_SCAN=1` (not recommended). +2. **git pre-push hook** (`pre-commit`, `stages: [pre-push]`) — best-effort, gates + ordinary `git push`. Enable with `pre-commit install --install-hooks` (the + repo sets `default_install_hook_types: [pre-commit, pre-push]`). 3. **Claude Code pre-push hook** (`.claude/settings.json` → `scripts/hooks/gitleaks_prepush_guard.py`) — - a `PreToolUse`/`Bash` hook that blocks any agent-issued `git push` when - gitleaks finds secrets in tracked content. + best-effort; a `PreToolUse`/`Bash` hook that blocks an agent-issued `git push` + when gitleaks finds secrets in the pushed repo's tracked content. + +The guard's blocking behaviour is regression-tested in +`tests/integration/test_state_run.py`; CI installs gitleaks so those tests run +rather than skip. ## Testing Constraints diff --git a/scripts/hooks/gitleaks_prepush_guard.py b/scripts/hooks/gitleaks_prepush_guard.py index 5789891..46c419f 100755 --- a/scripts/hooks/gitleaks_prepush_guard.py +++ b/scripts/hooks/gitleaks_prepush_guard.py @@ -1,14 +1,25 @@ #!/usr/bin/env python3 """Claude Code PreToolUse hook: secret-scan before a `git push`. -Reads the PreToolUse event JSON on stdin. If the Bash command is a ``git push``, -it runs gitleaks over the repo's tracked content (using the repo-root -``.gitleaks.toml``) and **blocks** the push (exit code 2) when secrets are found, -printing the redacted findings so Claude can fix them instead of publishing. +Reads the PreToolUse event JSON on stdin. 
When the Bash command is a ``git
+push``, it runs gitleaks over the target repo's tracked content (using the
+repo-root ``.gitleaks.toml``) and **blocks** the push (exit code 2) when secrets
+are found, printing the redacted findings so Claude can fix them instead of
+publishing.
 
-This is the "Claude ability" pre-push guard, complementing the git pre-push hook
-(pre-commit) and the `scripts/state-run.sh` push guard. gitleaks is run in *git*
-mode, so untracked working-tree files (e.g. local `data/`) are not scanned.
+This is the best-effort "Claude ability" pre-push guard. The *enforced* secret
+scanning lives in the ``scripts/state-run.sh`` push guard (the git pre-push hook
+is likewise best-effort); this one is a convenience net that stops an agent from
+pushing a secret in the first place.
+
+Design notes:
+  * Push detection is shlex-tokenized across ``&&``/``;``/``|``/newlines (even
+    when not space-separated) and tolerates git global options (``--no-pager``,
+    ``-c k=v``, ``-C <dir>``) and env-var prefixes, so it does not fail *open*.
+  * Each detected push is scanned in the directory named by its ``-C <dir>``
+    (default the current dir), not a hardcoded ``.``.
+  * The config is resolved relative to this script and passed explicitly, so the
+    strict ruleset is always applied rather than relying on cwd auto-discovery.
 
 Exit codes (Claude Code hook protocol): 0 = allow, 2 = block.
 """
@@ -16,12 +27,144 @@
 from __future__ import annotations
 
 import json
-import re
+import shlex
 import shutil
 import subprocess
 import sys
+import tempfile
+from pathlib import Path
+
+# .gitleaks.toml lives at the repo root: scripts/hooks/ -> parents[2].
+_CONFIG = Path(__file__).resolve().parents[2] / ".gitleaks.toml"
+
+# git global options that consume the following token as their argument.
+_OPTS_WITH_ARG = {"-C", "-c", "--git-dir", "--work-tree", "--namespace", "--exec-path"}
+
+# Characters that make up shell control operators (``&&``, ``||``, ``|``, ``;``,
+# ``&``, subshell parens) plus newline. A token built only from these ends a command.
+_SEPARATOR_CHARS = frozenset("();|&\n")
+
+
+def _shell_tokens(command: str) -> list[str]:
+    """Tokenize ``command`` with control operators and newlines as their own tokens.
+
+    ``shlex.split`` splits on whitespace only, so ``git add .;git push`` or a
+    two-line script would hide the push inside another token or segment. Raises
+    ``ValueError`` on unbalanced quotes, exactly like ``shlex.split``.
+    """
+    # The shell deletes backslash-newline (line continuation) before tokenizing.
+    joined = command.replace("\\\n", "")
+    lexer = shlex.shlex(joined, posix=True, punctuation_chars="();<>|&\n")
+    lexer.whitespace_split = True
+    lexer.commenters = ""  # same as shlex.split(comments=False)
+    lexer.whitespace = " \t\r"  # newline must surface as a separator token
+    return list(lexer)
+
+
+def _push_targets(command: str) -> list[str]:
+    """Return the scan directory for each `git push` in a shell command.
+
+    Splits on shell separators (control operators, subshell parens, newlines —
+    whether or not they are surrounded by whitespace), then for each segment
+    walks tokens: skip a leading run of ``VAR=value`` env assignments, require
+    ``git``, consume git global options (tracking ``-C <dir>``), and if the
+    resulting subcommand is ``push`` record the target dir. Returns ``["."]`` as
+    a conservative fallback when the command cannot be tokenized but clearly
+    contains a git push.
+    """
+    try:
+        tokens = _shell_tokens(command)
+    except ValueError:
+        # Unbalanced quotes etc. — be conservative: if it smells like a push,
+        # scan the current dir rather than waving it through.
+        return ["."] if ("git" in command and "push" in command) else []
+
+    targets: list[str] = []
+    segment: list[str] = []
+    for tok in (*tokens, ";"):  # sentinel flushes the final segment
+        # An empty token is a quoted empty argument (``''``), not a separator.
+        if tok and _SEPARATOR_CHARS.issuperset(tok):
+            target = _push_target_for_segment(segment)
+            if target is not None:
+                targets.append(target)
+            segment = []
+        else:
+            segment.append(tok)
+    return targets
+
+
+def _push_target_for_segment(tokens: list[str]) -> str | None:
+    """Return the directory one command's `git push` runs in, else ``None``."""
+    i = 0
+    # Skip leading env-var assignments (e.g. `GIT_DIR=… git push`).
+    while i < len(tokens) and "=" in tokens[i] and not tokens[i].startswith("-"):
+        i += 1
+    # Accept a path-qualified binary (`/usr/bin/git`) as well as bare `git`.
+    if i >= len(tokens) or tokens[i].rpartition("/")[2] != "git":
+        return None
+    i += 1
+    cwd = "."
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok == "-C" and i + 1 < len(tokens):
+            # Repeated -C is cumulative in git: a later relative path resolves
+            # against the earlier one; an absolute path replaces it.
+            cwd = tokens[i + 1] if cwd == "." else str(Path(cwd, tokens[i + 1]))
+            i += 2
+            continue
+        if tok in _OPTS_WITH_ARG and i + 1 < len(tokens):
+            i += 2
+            continue
+        if tok.startswith("-"):
+            i += 1
+            continue
+        return cwd if tok == "push" else None
+    return None
+
+
-_GIT_PUSH = re.compile(r"\bgit\b(?:\s+-C\s+\S+)?(?:\s+-c\s+\S+)*\s+push\b")
+def _scan(target: str) -> tuple[bool, str]:
+    """Scan ``target`` with gitleaks. Returns (has_leaks, redacted_details)."""
+    config_arg = ["--config", str(_CONFIG)] if _CONFIG.is_file() else []
+    if not config_arg:
+        print(
+            f"gitleaks-prepush: WARNING — {_CONFIG} not found; scanning with gitleaks "
+            "defaults, which miss the low-entropy Google key class.",
+            file=sys.stderr,
+        )
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tf:
+        report = tf.name
+    try:
+        scan = subprocess.run(
+            [
+                "gitleaks",
+                "git",
+                target,
+                *config_arg,
+                "--no-banner",
+                "--redact",
+                "--report-format",
+                "json",
+                "--report-path",
+                report,
+            ],
+            capture_output=True,
+            text=True,
+        )
+        if scan.returncode == 0:
+            return False, ""
+        body = Path(report).read_text() if Path(report).exists() else ""
+        if '"RuleID"' in body:
+            return True, (scan.stderr or body).strip()[-2000:]
+        # Non-zero without findings = gitleaks operational error (e.g. not a git
+        # repo). Do not block the agent on a tool error; warn instead.
+        print(
+            f"gitleaks-prepush: gitleaks errored on '{target}' (exit "
+            f"{scan.returncode}); not blocking. {scan.stderr.strip()[-300:]}",
+            file=sys.stderr,
+        )
+        return False, ""
+    finally:
+        Path(report).unlink(missing_ok=True)
 
 
 def main() -> int:
@@ -33,7 +176,8 @@ def main() -> int:
     if event.get("tool_name") != "Bash":
         return 0
     command = (event.get("tool_input") or {}).get("command", "")
-    if not _GIT_PUSH.search(command):
+    targets = _push_targets(command)
+    if not targets:
         return 0
 
     if shutil.which("gitleaks") is None:
@@ -44,20 +188,16 @@ def main() -> int:
         )
         return 0
 
-    scan = subprocess.run(
-        ["gitleaks", "git", ".", "--no-banner", "--redact"],
-        capture_output=True,
-        text=True,
-    )
-    if scan.returncode != 0:
-        details = (scan.stdout + scan.stderr).strip()[-2000:]
-        print(
-            "BLOCKED by gitleaks-prepush: potential secrets detected in tracked content; "
-            "refusing `git push`. Remove/rotate the secret (and purge it from history) "
-            "before pushing. 
Findings (redacted):\n" + details, - file=sys.stderr, - ) - return 2 # block the tool call + for target in targets: + has_leaks, details = _scan(target) + if has_leaks: + print( + "BLOCKED by gitleaks-prepush: potential secrets detected in tracked " + f"content of '{target}'; refusing `git push`. Remove/rotate the secret " + "(and purge it from history) before pushing. Findings (redacted):\n" + details, + file=sys.stderr, + ) + return 2 # block the tool call return 0 diff --git a/scripts/state-run.sh b/scripts/state-run.sh index 36761a6..6f2effa 100755 --- a/scripts/state-run.sh +++ b/scripts/state-run.sh @@ -124,9 +124,16 @@ push_state() { } # Secret-scan guard: refuse to commit/push state that gitleaks flags. The leak -# that motivated this rode in a discovery error string; redaction scrubs those at -# write time, and this is the defense-in-depth net at the push boundary. Bulk -# news-candidate data is allowlisted in .gitleaks.toml to avoid false positives. +# that motivated this rode in as a key inside a discovery `errors[]` field; +# redaction scrubs those at write time, and this is the enforced defense-in-depth +# net at the push boundary. It scans the *staged* diff (what is about to be +# committed), not the whole working tree, so it does not block on a pre-existing +# secret in an unchanged file and does not re-scan the full candidate store. +# +# Fails closed: if gitleaks is missing or the config cannot be found, it refuses +# to push rather than degrade to gitleaks' default ruleset (which misses the +# low-entropy Google key class our strict rule exists for). Override only in an +# emergency with STATE_RUN_SKIP_SECRET_SCAN=1. scan_state_for_secrets() { if [[ "${STATE_RUN_SKIP_SECRET_SCAN:-0}" == "1" ]]; then echo "state-run: WARNING — secret scan skipped (STATE_RUN_SKIP_SECRET_SCAN=1)." >&2 @@ -138,24 +145,37 @@ scan_state_for_secrets() { echo " set STATE_RUN_SKIP_SECRET_SCAN=1 to override (NOT recommended)." 
>&2 return 1 fi - local cfg cfg_arg=() targets=() target - cfg="$(cd "$(dirname "$0")/.." && pwd)/.gitleaks.toml" - [[ -f "$cfg" ]] && cfg_arg=(--config "$cfg") - if [[ ${#subtrees[@]} -gt 0 ]]; then - for target in "${subtrees[@]}"; do - [[ -e "$STATE_REPO_DIR/$target" ]] && targets+=("$STATE_REPO_DIR/$target") - done + local repo_root cfg + repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + cfg="$repo_root/.gitleaks.toml" + if [[ ! -f "$cfg" ]]; then + echo "state-run: .gitleaks.toml not found at $cfg — refusing to push." >&2 + echo " scanning with gitleaks defaults would miss the low-entropy Google key" >&2 + echo " class this config exists to catch. Restore the file before pushing." >&2 + return 1 + fi + + local report rc=0 + report="$(mktemp -t state-run-gitleaks.XXXXXX.json)" + # Scan only what is staged. gitleaks exit: 0 = clean, 1 = leaks found, other = + # operational error (which we also treat as fail-closed). + gitleaks git --staged "$STATE_REPO_DIR" \ + --config "$cfg" --no-banner --redact \ + --report-format json --report-path "$report" >/dev/null 2>&1 || rc=$? + + if [[ "$rc" -eq 0 ]]; then + rm -f "$report" + return 0 + fi + + if [[ -s "$report" ]] && grep -q '"RuleID"' "$report" 2>/dev/null; then + echo "state-run: SECRET DETECTED in staged state — refusing to commit/push." >&2 + grep -oE '"(RuleID|File|StartLine)": *[^,}]*' "$report" | head -40 >&2 || true else - targets+=("$STATE_REPO_DIR") + echo "state-run: gitleaks errored (exit $rc) — refusing to push (fail-closed)." >&2 fi - for target in "${targets[@]}"; do - if ! gitleaks dir "$target" "${cfg_arg[@]}" --no-banner --redact >/dev/null 2>&1; then - echo "state-run: SECRET DETECTED in state ($target) — refusing to commit/push." 
>&2 - gitleaks dir "$target" "${cfg_arg[@]}" --no-banner --redact 2>&1 \ - | grep -iE "Finding|File:|RuleID|Line:" | head -40 >&2 || true - return 1 - fi - done + rm -f "$report" + return 1 } acquire_state_repo_lock diff --git a/tests/integration/test_state_run.py b/tests/integration/test_state_run.py index fe89b21..12fafac 100644 --- a/tests/integration/test_state_run.py +++ b/tests/integration/test_state_run.py @@ -264,6 +264,14 @@ def test_rejected_push_recovers_via_refetch_rebase(tmp_path: Path) -> None: # A correctly-shaped fake Google key (AIza + 35 chars) — matches the strict rule # in .gitleaks.toml. No real key is in this file. _FAKE_GOOGLE_KEY = "AIza" + "B1cD3fGh4JkLmN0pQrStUvWxYz123456789" +# A structurally-valid but meaningless JWT (the Supabase-JWT incident class). +_FAKE_JWT = ( + "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c" +) +# A high-entropy assignment that trips gitleaks' catch-all `generic-api-key` +# rule — the kind of false positive news text produces. Suppressed on candidate +# paths, so it must NOT block a push. 
+_GENERIC_NOISE = "api_key = 8f3Hq9ZxR2bN7vKpL4wYtCgD6sErA1mU0oP" @pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") @@ -309,8 +317,9 @@ def test_clean_state_passes_the_secret_scan(tmp_path: Path) -> None: @pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") -def test_bulk_candidate_data_is_not_false_flagged(tmp_path: Path) -> None: - """News candidate data (allowlisted) must not trip the scan even if it looks key-ish.""" +def test_generic_noise_in_candidate_data_is_not_false_flagged(tmp_path: Path) -> None: + """The generic-api-key catch-all is suppressed on candidate paths, so key-ish + news text does not block a push.""" remote = _make_remote(tmp_path) result = _run_wrapper( work=tmp_path / "work", @@ -321,9 +330,35 @@ def test_bulk_candidate_data_is_not_false_flagged(tmp_path: Path) -> None: "bash", "-c", 'mkdir -p "$DENBUST_STATE_ROOT/news_items/discover/candidates"; ' - f'echo \'{{"url":"https://x.dk/a?key={_FAKE_GOOGLE_KEY}"}}\' ' + f'echo \'{{"snippet":"{_GENERIC_NOISE}"}}\' ' '> "$DENBUST_STATE_ROOT/news_items/discover/candidates/latest_candidates.jsonl"', ], ) - assert result.returncode == 0, result.stderr # allowlisted path → not flagged + assert result.returncode == 0, result.stderr # generic noise suppressed → not flagged assert _remote_commit_count(remote) == 2 + + +@pytest.mark.skipif(not _GITLEAKS, reason="gitleaks not installed") +def test_real_key_in_candidate_data_is_blocked(tmp_path: Path) -> None: + """Regression for the seed-time incident: a real key/JWT captured *into* + candidate data — the exact leak vector — must still be caught. 
Only the + generic catch-all is suppressed on these paths, not the provider rules.""" + remote = _make_remote(tmp_path) + for secret in (_FAKE_GOOGLE_KEY, _FAKE_JWT): + result = _run_wrapper( + work=tmp_path / f"work_{secret[:8]}", + remote=remote, + scan_secrets=True, + subtrees=["news_items/discover"], + command=[ + "bash", + "-c", + 'mkdir -p "$DENBUST_STATE_ROOT/news_items/discover/candidates"; ' + f'echo \'{{"url":"x","errors":["403 ?key={secret}"]}}\' ' + '> "$DENBUST_STATE_ROOT/news_items/discover/candidates/latest_candidates.jsonl"', + ], + ) + assert result.returncode != 0, f"{secret[:8]} not blocked" + assert "SECRET DETECTED" in result.stderr + assert _remote_commit_count(remote) == 1 # push blocked, nothing committed + assert secret not in result.stdout + result.stderr # redacted