Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion .agent-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@

## Mainline Status

- Last merged PR on main: `#220` (`GUARD-PR-SECRET-SCAN`, closes #218) — the three-layer
- Last merged PR on main: `#223` (`FIX-DISCOVERY-RUNS-SCHEMA`; refs #150/#152/#156/#184/#186, closed on deploy) —
add the missing `discovery_runs.skipped_query_count` column whose absence was failing every
Supabase run-metadata write and blocking candidate persistence on daily-review runs; a guard test
now asserts every `DiscoveryRun` field maps to a column. Clears the schema break on the
Comment on lines +11 to +12
`UNIFY-PR-06` go-live path. Prior: `#220` (`GUARD-PR-SECRET-SCAN`, closes #218) — the three-layer
[gitleaks](https://github.com/gitleaks/gitleaks) secret-scan guard (the outer defense following
the seed-time leak incident below): a shared `.gitleaks.toml`, a `pre-commit` pre-push hook, a
fail-closed `scripts/state-run.sh` scan before each state push, and a Claude Code
Expand Down Expand Up @@ -347,6 +351,15 @@
detection (no fail-open on `--no-pager`/`-c`/`-C` forms), per-`-C` target scanning, and an
explicit script-relative `--config`. CI installs gitleaks so the guard's blocking tests run rather
than skip. Documented under AGENTS.md “Secret Scanning”. This satisfies gate (a) of `UNIFY-PR-06`.
- [done] `FIX-DISCOVERY-RUNS-SCHEMA` (#223; refs #150/#152/#156/#184/#186, closed on deploy): add the missing
`discovery_runs.skipped_query_count` column. `DiscoveryRun` gained the field (the rolling-window
search backstop counts deliberately-skipped queries) and `DiscoveryStore.write_run()` posts the
full `model_dump()` to `discovery_runs` via PostgREST, but no migration ever added the column — so
every run-metadata write failed with “column skipped_query_count does not exist”, blocking
candidate persistence on the daily-review runs. New migration
`supabase/migrations/20260616_discovery_runs_skipped_query_count.sql` plus a regression test and a
guard test asserting every `DiscoveryRun` field maps to a `discovery_runs` column. Clears the
schema break that would otherwise fail the `UNIFY-PR-06` go-live dispatch.
- [next] `UNIFY-PR-06` (operational, go-live): the state repo is **seeded** (key-scrubbed after the
incident below) with the recovered core state from local `data/news_items` (27,568 candidates +
queues/attempts/verdicts/budget/yield + backfill_batches/runs/metrics; excluded: prefilter models
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Add the discovery_runs.skipped_query_count column.
--
-- DiscoveryRun gained a `skipped_query_count` field (the rolling-window search
-- backstop counts queries it deliberately skips), and the Supabase-backed
-- DiscoveryStore.write_run() posts the full model_dump() to discovery_runs via
-- PostgREST. The column was never added to the table, so every run-metadata
-- write failed with "column skipped_query_count does not exist", which blocked
-- candidate persistence on the daily-review runs.

alter table public.discovery_runs
add column if not exists skipped_query_count integer not null default 0;

-- Deploy note: the runtime error is PGRST204 ("not found in the schema cache"),
-- so this must be applied to the live project (supabase db push) AND PostgREST
-- must reload its schema cache before the write succeeds. PostgREST reloads
-- automatically on most managed deploys; if the 400 persists, force it with:
-- notify pgrst, 'reload schema';
87 changes: 87 additions & 0 deletions tests/unit/test_discovery_migration.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,66 @@
"""Unit tests for the discovery-layer Supabase migration scaffold."""

import re
from pathlib import Path

from denbust.discovery.models import (
BackfillBatch,
CandidateProvenance,
DiscoveryRun,
PersistentCandidate,
ScrapeAttempt,
)

MIGRATION_PATH = Path("supabase/migrations/20260410_discovery_candidacy_foundation.sql")
SKIPPED_QUERY_COUNT_MIGRATION = Path(
"supabase/migrations/20260616_discovery_runs_skipped_query_count.sql"
)

_MIGRATIONS_DIR = Path("supabase/migrations")
_ALL_MIGRATIONS = "\n".join(
p.read_text(encoding="utf-8") for p in sorted(_MIGRATIONS_DIR.glob("*.sql"))
)

# Tokens that begin a table-constraint line rather than a column definition.
_NON_COLUMN_TOKENS = {"primary", "unique", "constraint", "foreign", "check", "references"}

# Each persisted discovery model and the table its writer POSTs a full
# ``model_dump()`` to. The third element lists fields that are intentionally NOT
# stored as a same-named column (none today); adding such a field is then a
# conscious choice rather than a silent production break.
_MODEL_TABLES: list[tuple[type, str, set[str]]] = [
(DiscoveryRun, "discovery_runs", set()),
(PersistentCandidate, "persistent_candidates", set()),
(BackfillBatch, "backfill_batches", set()),
(ScrapeAttempt, "scrape_attempts", set()),
(CandidateProvenance, "candidate_provenance", set()),
]


def _table_columns(table: str) -> set[str]:
"""Columns of ``public.<table>``, parsed from its CREATE TABLE block and any
ALTER ... ADD COLUMN statements across all migrations.

Scoped to the specific table — a substring search over all migrations is
unsound, because discovery models share field names (``status``, ``errors``,
``dataset_name``, ...) with columns on *other* tables, so a column missing
from this table could be masked by a same-named column elsewhere.
"""
cols: set[str] = set()
create = re.search(
rf"create table if not exists public\.{table}\s*\((.*?)\);",
_ALL_MIGRATIONS,
re.S,
)
if create:
for line in create.group(1).splitlines():
match = re.match(r"\s*([a-z_]+)\s+\S", line)
if match and match.group(1) not in _NON_COLUMN_TOKENS:
cols.add(match.group(1))
for alter in re.finditer(rf"alter table public\.{table}\b(.*?);", _ALL_MIGRATIONS, re.S):
for col in re.finditer(r"add column if not exists\s+([a-z_]+)", alter.group(1)):
cols.add(col.group(1))
return cols


def test_discovery_migration_defines_required_tables() -> None:
Expand All @@ -25,3 +83,32 @@ def test_discovery_migration_includes_core_columns() -> None:
assert "metadata jsonb not null default '{}'::jsonb" in sql
assert "errors jsonb not null default '[]'::jsonb" in sql
assert "diagnostics jsonb not null default '{}'::jsonb" in sql


def test_every_persisted_model_field_maps_to_a_table_column() -> None:
"""Every field of every persisted discovery model must have a matching column
on its target table, so a writer's ``model_dump()`` POST never references an
unknown column (the PGRST204 class of failure that issues #150/#152/#156/#184/
#186 reported for discovery_runs.skipped_query_count).

This is the real regression guard: it covers ALL five model<->table pairs and
is scoped per table, so drift in any of them fails CI rather than waiting for
the daily-review bot to file the next ticket.
"""
failures: list[str] = []
for model, table, exceptions in _MODEL_TABLES:
columns = _table_columns(table)
# A parsing failure would yield an empty set and (correctly) fail loudly.
assert columns, f"could not parse any columns for public.{table}"
missing = set(model.model_fields) - columns - exceptions
if missing:
failures.append(f"public.{table} is missing columns for {sorted(missing)}")
assert not failures, "model<->schema drift detected:\n" + "\n".join(failures)


def test_skipped_query_count_column_is_added() -> None:
"""Regression for the missing-column persistence failure (issues #150/#152/
#156/#184/#186): the column DiscoveryRun.skipped_query_count writes must exist."""
sql = SKIPPED_QUERY_COUNT_MIGRATION.read_text(encoding="utf-8")
assert "add column if not exists skipped_query_count integer not null default 0" in sql
assert "alter table public.discovery_runs" in sql
Loading