OpenCircuitDev · OpenCircuitDev · Jun 11, 2026 · May 31, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -5,11 +5,17 @@ on:
     branches: [main]
     paths:
       - 'bench/**'
+      - 'docs/superpowers/specs/**'
+      - 'docs/coverage.md'
+      - 'docs/metrics.md'
       - '.github/workflows/bench.yml'
   pull_request:
     branches: [main]
     paths:
       - 'bench/**'
+      - 'docs/superpowers/specs/**'
+      - 'docs/coverage.md'
+      - 'docs/metrics.md'
       - '.github/workflows/bench.yml'
 
 jobs:
@@ -31,3 +37,13 @@ jobs:
       - name: Validate every sandbox structure (dry-run-all)
         working-directory: bench
         run: bench dry-run-all
+      # Rebuild both generated documents from the bench/ tree in this checkout.
+      - name: Regenerate docs/coverage.md
+        run: python -m bench.cli coverage --root bench --write docs/coverage.md
+      - name: Regenerate docs/metrics.md
+        run: python -m bench.cli dashboard --root bench --write docs/metrics.md
+      # `git diff --exit-code` exits non-zero when the regenerated files differ
+      # from the committed copies; that is turned into a job failure below.
+      - name: Fail if generated docs are out of date
+        run: |
+          if ! git diff --exit-code docs/coverage.md docs/metrics.md; then
+            echo "::error::docs/coverage.md or docs/metrics.md is stale. Run 'bench coverage' and 'bench dashboard' locally and commit the regenerated files."
+            exit 1
+          fi
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ Apache 2.0 — see [LICENSE](LICENSE). Choose your own license for derivative wo
 | `ocm-inference` | `InferenceBackend` trait + llama.cpp + vLLM adapters + supervisor. Auto-selects backend by platform. | Built |
 | `ocm-memory` | Mem0 client (search before generation, persist after). | Built |
 | `ocm-mcp` | MCP stdio JSON-RPC bridge. Lets Claude Code / Cursor / Cline / Continue.dev connect via standard MCP. | Built |
-| `ocm-models` | Curated registry (5 GGUFs across tiny / default / canonical tiers) + streaming SHA256-verified downloader. Refuses unverified hashes. | Built |
+| `ocm-models` | Curated registry (3 GGUFs across tiny / default tiers, all SHA256-verified; canonical tier returns in v0.1.1) + streaming verified downloader. Refuses unverified hashes. | Built |
 | `ocm-mesh` | Mesh transport trait + iroh / libp2p stubs. Real implementations land in v2. | Scaffolded |
 
 ### Frontend (`frontend/`, SvelteKit 2 + Svelte 5 + adapter-static)
@@ -111,7 +111,7 @@ Everything in `docs/superpowers/` informed the architecture:
 - Codesigning + auto-update (v4)
 - Sandboxing + Sybil resistance (v5)
 - Sharded inference (v6)
-- The five model SHA256 hashes in `crates/ocm-models/registry.json` — the downloader refuses empty-hash entries, so downloads no-op until hashes are committed
+- Qwen3 canonical-tier registry entries (dropped pending verified hashes; return in v0.1.1 — the shipping 3-model registry is fully SHA256-verified)
 - Live deployment on [ocm.shortcircuit.bot](https://ocm.shortcircuit.bot) — domain reserved, site not yet up
 
 ## Contributing

diff --git a/bench/bench/cli.py b/bench/bench/cli.py
@@ -10,6 +10,13 @@
 from rich.table import Table
 
 from .compare import retro_sync_report
+from .coverage import build_coverage, render_coverage_markdown
+from .dashboard import (
+    collect_dashboard_rows,
+    compute_overall_status,
+    counts_to_summary,
+    render_markdown,
+)
 from .runner import DryRunError, list_all_sandboxes, load_expected, run_sandbox
 
 console = Console()
@@ -304,6 +311,90 @@ def run_all(
         sys.exit(1)
 
 
+# Repo-relative location of the design spec. The `coverage` command resolves
+# it against the parent of the bench root when --spec is not given.
+_DEFAULT_SPEC_PATH = (
+    "docs/superpowers/specs/2026-05-08-ocm-v1-design-spec.md"
+)
+
+
+@main.command()
+@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None,
+              help="Override the bench/ root directory (default: auto-detected).")
+@click.option("--spec", "spec_path", type=click.Path(exists=True, path_type=Path),
+              default=None,
+              help=f"Path to the design spec (default: ../{_DEFAULT_SPEC_PATH}).")
+@click.option("--write", type=click.Path(path_type=Path), default=None,
+              help="Write markdown output to this path (default: print to stdout).")
+def coverage(root: Path | None, spec_path: Path | None, write: Path | None) -> None:
+    """Map spec rows to validating sandboxes.
+
+    Walks the design-spec markdown table of locked decisions, joins each row
+    with sandboxes whose `spec_row` field (or `source_for_claim` regex
+    fallback) references that row, and emits a coverage table. Orphan
+    sandboxes — those validating something the spec doesn't number — are
+    listed separately so they can be reconciled.
+    """
+    bench_root = root or _bench_root()
+    if spec_path is None:
+        # click only validates --spec when it is actually passed, so the
+        # default location has to be checked by hand.
+        spec_path = bench_root.parent / _DEFAULT_SPEC_PATH
+        if not spec_path.exists():
+            console.print(
+                f"[red]Spec not found at {spec_path}[/red]. "
+                "Pass --spec or run from a checkout with docs/superpowers/."
+            )
+            sys.exit(2)
+
+    entries, orphans = build_coverage(bench_root, spec_path)
+    md = render_coverage_markdown(entries, orphans)
+
+    if write is None:
+        # Emit the document verbatim: with Rich's defaults, bracketed text in
+        # the markdown is parsed as console markup and long table rows are
+        # hard-wrapped, either of which corrupts the output.
+        console.print(md, markup=False, highlight=False, soft_wrap=True)
+        return
+
+    write.parent.mkdir(parents=True, exist_ok=True)
+    write.write_text(md, encoding="utf-8")
+    # A spec row counts as validated once any of its sandboxes is CONFIRMED.
+    validated = sum(
+        1 for e in entries
+        if any(s.verdict and s.verdict.value == "CONFIRMED" for s in e.sandboxes)
+    )
+    console.print(
+        f"[green]coverage[/green]: {len(entries)} spec rows, "
+        f"{validated} CONFIRMED, {len(orphans)} orphan sandbox(es) "
+        f"-> wrote {write}"
+    )
+
+
+@main.command()
+@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None,
+              help="Override the bench/ root directory (default: auto-detected).")
+@click.option("--write", type=click.Path(path_type=Path), default=None,
+              help="Write markdown output to this path (default: print to stdout).")
+@click.option("--check", is_flag=True,
+              help="Exit 1 if any sandbox is REFUTED, INCONCLUSIVE, or has no run. For CI.")
+def dashboard(root: Path | None, write: Path | None, check: bool) -> None:
+    """Show latest verdict per ACTIVE sandbox as a unified markdown table.
+
+    Walks bench/isolation/ + bench/combination/, takes the most recent
+    summary.json per (hypothesis_id, hardware_class) pair, renders a single
+    markdown document with an overall PASS/FAIL badge. With --check, exits 1
+    on any non-CONFIRMED status — wire this into CI to gate merges.
+    """
+    bench_root = root or _bench_root()
+    rows = collect_dashboard_rows(bench_root)
+    status, counts = compute_overall_status(rows)
+    # Render once; both the file and the stdout path emit the same document.
+    md = render_markdown(rows, status=status, counts=counts)
+
+    if write is not None:
+        write.parent.mkdir(parents=True, exist_ok=True)
+        write.write_text(md, encoding="utf-8")
+        color = "green" if status == "PASS" else "red"
+        console.print(
+            f"[{color}]{status}[/{color}] - {counts_to_summary(counts)} -> wrote {write}"
+        )
+    else:
+        # Emit the document verbatim: with Rich's defaults, bracketed text in
+        # the markdown is parsed as console markup and long table rows are
+        # hard-wrapped, either of which corrupts the output.
+        console.print(md, markup=False, highlight=False, soft_wrap=True)
+
+    # The exit status is decided after the output so a failing run still
+    # leaves the rendered dashboard behind.
+    if check and status != "PASS":
+        sys.exit(1)
+
+
 @main.command(name="list-inactive")
 @click.option("--root", type=click.Path(exists=True, path_type=Path), default=None)
 def list_inactive(root: Path | None) -> None:

diff --git a/bench/bench/coverage.py b/bench/bench/coverage.py
@@ -0,0 +1,209 @@
+"""Spec-row coverage map: which spec decisions have validating benchmarks?
+
+Joins three sources into a single markdown table:
+1. Spec rows parsed from the OCM design-spec markdown.
+2. Sandboxes' declared spec_row field (or regex fallback on source_for_claim).
+3. Latest verdict per sandbox (via dashboard.collect_dashboard_rows).
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from .dashboard import DashboardRow, collect_dashboard_rows
+from .metrics import ExpectedJson, Verdict
+
+
+# Fallback matcher for "row <N>" mentions in free text. The lookbehind rejects
+# matches inside longer words ("arrow 3") while still accepting forms such as
+# "spec_row 7" or "spec-row 7".
+_SPEC_ROW_FALLBACK_RE = re.compile(r"(?<![A-Za-z])row\s+(\d+)", re.IGNORECASE)
+# Matches a **bold** markdown span, capturing the inner text non-greedily.
+_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")
+
+
+@dataclass
+class SpecRow:
+    """One row from the spec's locked-decisions table."""
+
+    # Numeric part of the row label, with any letter suffix dropped.
+    number: int
+    raw_number: str  # preserves "6b" etc.
+    # Text of the row's second table cell with **bold** markers removed.
+    title: str
+
+
+@dataclass
+class CoverageEntry:
+    """A spec row paired with every dashboard row that claims to validate it."""
+
+    spec_row: SpecRow
+    sandboxes: list[DashboardRow] = field(default_factory=list)
+
+    @property
+    def best_verdict(self) -> Verdict | None:
+        """Pick the highest-ranked verdict reported for this row.
+
+        Ranking is CONFIRMED > INCONCLUSIVE > REFUTED. Sandboxes without a
+        verdict are ignored, and None is returned when nothing ranks, so a
+        single CONFIRMED sandbox is enough for the row to count as validated.
+        """
+        rank = {
+            Verdict.REFUTED: 1,
+            Verdict.INCONCLUSIVE: 2,
+            Verdict.CONFIRMED: 3,
+        }
+        # None and any unranked verdict are filtered out by the membership test.
+        ranked = [sb.verdict for sb in self.sandboxes if sb.verdict in rank]
+        return max(ranked, key=rank.__getitem__, default=None)
+
+
+def parse_spec_rows(spec_path: Path) -> list[SpecRow]:
+    """Extract the numbered decision rows from a spec markdown file.
+
+    A decision row has the shape `| 9 | Title | Choice | Rationale |`: its
+    first cell is a number, optionally followed by one lowercase letter
+    ("6b"). Header and divider rows fail that test and are dropped. A
+    missing file yields an empty list.
+    """
+    if not spec_path.exists():
+        return []
+
+    parsed: list[SpecRow] = []
+    taken: set[int] = set()
+    for raw_line in spec_path.read_text(encoding="utf-8").splitlines():
+        if not raw_line.startswith("|"):
+            continue
+        cells = [cell.strip() for cell in raw_line.strip("|").split("|")]
+        if len(cells) < 2:
+            continue
+        label = cells[0]
+        hit = re.match(r"^(\d+)([a-z]?)$", label)
+        if hit is None:
+            continue
+        digits = int(hit.group(1))
+        # Only the first row seen for a given number is kept, so a later
+        # "6b" is treated as a variant of an earlier "6" and skipped.
+        if digits in taken:
+            continue
+        taken.add(digits)
+        parsed.append(
+            SpecRow(
+                number=digits,
+                raw_number=label,
+                title=_BOLD_RE.sub(r"\1", cells[1]).strip(),
+            )
+        )
+    return parsed
+
+
+def extract_spec_rows(expected: ExpectedJson) -> list[int]:
+    """List the spec row numbers a sandbox says it validates.
+
+    A non-empty `spec_row` field wins outright. Otherwise the row numbers
+    are scraped from `source_for_claim` with the fallback regex, which
+    yields an empty list when that text is missing or mentions no row.
+    """
+    declared = expected.spec_row
+    if declared:
+        return list(declared)
+    claim_text = expected.source_for_claim or ""
+    return list(map(int, _SPEC_ROW_FALLBACK_RE.findall(claim_text)))
+
+
+def build_coverage(
+    bench_root: Path,
+    spec_path: Path,
+    results_root: Path | None = None,
+) -> tuple[list[CoverageEntry], list[DashboardRow]]:
+    """Join spec rows with the sandboxes that claim them.
+
+    Returns `(entries, orphans)`. `entries` holds one CoverageEntry per
+    parsed spec row, in spec order. `orphans` holds each sandbox (once per
+    sandbox path) that either claims no row at all or claims a row number
+    that is absent from the spec.
+    """
+    spec_rows = parse_spec_rows(spec_path)
+    dashboard_rows = collect_dashboard_rows(bench_root, results_root)
+
+    by_number = {spec.number: CoverageEntry(spec_row=spec) for spec in spec_rows}
+    orphans: list[DashboardRow] = []
+    orphan_paths: set[Path] = set()
+
+    def note_orphan(candidate: DashboardRow) -> None:
+        # Record each sandbox path at most once, in first-seen order.
+        if candidate.sandbox_path in orphan_paths:
+            return
+        orphan_paths.add(candidate.sandbox_path)
+        orphans.append(candidate)
+
+    for db_row in dashboard_rows:
+        claimed = extract_spec_rows(db_row.expected)
+        if not claimed:
+            note_orphan(db_row)
+            continue
+        for row_num in claimed:
+            target = by_number.get(row_num)
+            if target is not None:
+                target.sandboxes.append(db_row)
+            else:
+                # The sandbox points at a row the spec does not contain.
+                note_orphan(db_row)
+
+    return list(by_number.values()), orphans
+
+
+# Markdown text shown in the "Best verdict" column for each verdict.
+# CONFIRMED and REFUTED are bolded; INCONCLUSIVE is left plain.
+_VERDICT_LABEL = {
+    Verdict.CONFIRMED: "**CONFIRMED**",
+    Verdict.REFUTED: "**REFUTED**",
+    Verdict.INCONCLUSIVE: "INCONCLUSIVE",
+}
+
+
+def render_coverage_markdown(
+    entries: list[CoverageEntry],
+    orphans: list[DashboardRow],
+) -> str:
+    """Render the coverage map as markdown.
+
+    The document has a title, a one-line summary of the counts, and one
+    table row per spec row. When *orphans* is non-empty a second table
+    lists them with a shortened `source_for_claim` hint. The returned
+    string always ends with a newline.
+    """
+    validated = sum(1 for e in entries if e.best_verdict == Verdict.CONFIRMED)
+    has_sandbox = sum(1 for e in entries if e.sandboxes)
+    total = len(entries)
+
+    lines: list[str] = [
+        "# OCM Bench: Spec Row Coverage",
+        "",
+        "_Auto-generated by `bench coverage`. Do not edit by hand._",
+        "",
+        f"**Spec rows:** {total} total | **with a sandbox:** {has_sandbox} "
+        f"| **CONFIRMED:** {validated}",
+        "",
+        "| Row | Decision title | Sandbox(es) | Best verdict |",
+        "|---|---|---|---|",
+    ]
+    for entry in entries:
+        row = entry.spec_row
+        if entry.sandboxes:
+            # The same sandbox path can appear in several dashboard rows;
+            # name it once, keeping first-seen order.
+            unique_paths = dict.fromkeys(db.sandbox_path for db in entry.sandboxes)
+            sandbox_str = "<br>".join(f"`{path.name}`" for path in unique_paths)
+            # A missing (None) or unlabelled verdict falls back to the default.
+            verdict_str = _VERDICT_LABEL.get(entry.best_verdict, "(no run yet)")
+        else:
+            sandbox_str = "_(none)_"
+            verdict_str = "—"
+        lines.append(
+            f"| {row.raw_number} | {row.title} | {sandbox_str} | {verdict_str} |"
+        )
+
+    if orphans:
+        lines.extend(
+            [
+                "",
+                "## Orphan sandboxes",
+                "",
+                "_ACTIVE sandboxes whose `spec_row` field or `source_for_claim` "
+                "did not resolve to a known spec row._",
+                "",
+                "| Sandbox | Hypothesis | Source-for-claim hint |",
+                "|---|---|---|",
+            ]
+        )
+        for db in orphans:
+            source = (db.expected.source_for_claim or "").replace("\n", " ").strip()
+            if len(source) > 80:
+                source = source[:77] + "..."
+            # The hint is free text: escape pipes so it cannot split the cell.
+            source = source.replace("|", "\\|")
+            lines.append(
+                f"| `{db.sandbox_path.name}` | `{db.expected.hypothesis_id}` | {source or '-'} |"
+            )
+
+    return "\n".join(lines) + "\n"