Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@ on:
branches: [main]
paths:
- 'bench/**'
- 'docs/superpowers/specs/**'
- 'docs/coverage.md'
- 'docs/metrics.md'
- '.github/workflows/bench.yml'
pull_request:
branches: [main]
paths:
- 'bench/**'
- 'docs/superpowers/specs/**'
- 'docs/coverage.md'
- 'docs/metrics.md'
- '.github/workflows/bench.yml'

jobs:
Expand All @@ -31,3 +37,13 @@ jobs:
- name: Validate every sandbox structure (dry-run-all)
working-directory: bench
run: bench dry-run-all
- name: Regenerate docs/coverage.md
run: python -m bench.cli coverage --root bench --write docs/coverage.md
- name: Regenerate docs/metrics.md
run: python -m bench.cli dashboard --root bench --write docs/metrics.md
- name: Fail if generated docs are out of date
run: |
if ! git diff --exit-code docs/coverage.md docs/metrics.md; then
echo "::error::docs/coverage.md or docs/metrics.md is stale. Run 'bench coverage' and 'bench dashboard' locally and commit the regenerated files."
exit 1
fi
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Apache 2.0 — see [LICENSE](LICENSE). Choose your own license for derivative wo
| `ocm-inference` | `InferenceBackend` trait + llama.cpp + vLLM adapters + supervisor. Auto-selects backend by platform. | Built |
| `ocm-memory` | Mem0 client (search before generation, persist after). | Built |
| `ocm-mcp` | MCP stdio JSON-RPC bridge. Lets Claude Code / Cursor / Cline / Continue.dev connect via standard MCP. | Built |
| `ocm-models` | Curated registry (5 GGUFs across tiny / default / canonical tiers) + streaming SHA256-verified downloader. Refuses unverified hashes. | Built |
| `ocm-models` | Curated registry (3 GGUFs across tiny / default tiers, all SHA256-verified; canonical tier returns in v0.1.1) + streaming verified downloader. Refuses unverified hashes. | Built |
| `ocm-mesh` | Mesh transport trait + iroh / libp2p stubs. Real implementations land in v2. | Scaffolded |

### Frontend (`frontend/`, SvelteKit 2 + Svelte 5 + adapter-static)
Expand Down Expand Up @@ -111,7 +111,7 @@ Everything in `docs/superpowers/` informed the architecture:
- Codesigning + auto-update (v4)
- Sandboxing + Sybil resistance (v5)
- Sharded inference (v6)
- The five model SHA256 hashes in `crates/ocm-models/registry.json` — the downloader refuses empty-hash entries, so downloads no-op until hashes are committed
- Qwen3 canonical-tier registry entries (dropped pending verified hashes; return in v0.1.1 — the shipping 3-model registry is fully SHA256-verified)
- Live deployment on [ocm.shortcircuit.bot](https://ocm.shortcircuit.bot) — domain reserved, site not yet up

## Contributing
Expand Down
91 changes: 91 additions & 0 deletions bench/bench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
from rich.table import Table

from .compare import retro_sync_report
from .coverage import build_coverage, render_coverage_markdown
from .dashboard import (
collect_dashboard_rows,
compute_overall_status,
counts_to_summary,
render_markdown,
)
from .runner import DryRunError, list_all_sandboxes, load_expected, run_sandbox

console = Console()
Expand Down Expand Up @@ -304,6 +311,90 @@ def run_all(
sys.exit(1)


# Default location of the design spec. The `coverage` command joins this onto
# the parent directory of the bench root when --spec is not given.
_DEFAULT_SPEC_PATH = (
    "docs/superpowers/specs/2026-05-08-ocm-v1-design-spec.md"
)


@main.command()
@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None,
              help="Override the bench/ root directory (default: auto-detected).")
@click.option("--spec", "spec_path", type=click.Path(exists=True, path_type=Path),
              default=None,
              help=f"Path to the design spec (default: ../{_DEFAULT_SPEC_PATH}).")
@click.option("--write", type=click.Path(path_type=Path), default=None,
              help="Write markdown output to this path (default: print to stdout).")
def coverage(root: Path | None, spec_path: Path | None, write: Path | None) -> None:
    """Map spec rows to validating sandboxes.

    Walks the design-spec markdown table of locked decisions, joins each row
    with sandboxes whose `spec_row` field (or `source_for_claim` regex
    fallback) references that row, and emits a coverage table. Orphan
    sandboxes — those validating something the spec doesn't number — are
    listed separately so they can be reconciled.
    """
    # NOTE: the docstring above doubles as the click --help text, so it is
    # user-facing output; edit it with that in mind.
    bench_root = root or _bench_root()
    if spec_path is None:
        # click only validates an explicitly passed --spec (exists=True), so
        # the computed default has to be checked by hand.
        spec_path = bench_root.parent / _DEFAULT_SPEC_PATH
        if not spec_path.exists():
            console.print(
                f"[red]Spec not found at {spec_path}[/red]. "
                "Pass --spec or run from a checkout with docs/superpowers/."
            )
            sys.exit(2)

    entries, orphans = build_coverage(bench_root, spec_path)
    md = render_coverage_markdown(entries, orphans)

    if write is None:
        # Emit the document verbatim. Rich would otherwise parse "[...]" as
        # console markup, substitute :emoji: codes, and hard-wrap long table
        # rows to the console width, all of which corrupt markdown that is
        # piped or redirected. `md` already ends with a newline, so no extra
        # one is appended; stdout then matches what --write puts on disk.
        console.print(
            md,
            end="",
            markup=False,
            emoji=False,
            highlight=False,
            soft_wrap=True,
        )
        return

    write.parent.mkdir(parents=True, exist_ok=True)
    write.write_text(md, encoding="utf-8")
    # A spec row counts as validated once any of its sandboxes is CONFIRMED.
    validated = sum(
        1 for e in entries
        if any(s.verdict and s.verdict.value == "CONFIRMED" for s in e.sandboxes)
    )
    console.print(
        f"[green]coverage[/green]: {len(entries)} spec rows, "
        f"{validated} CONFIRMED, {len(orphans)} orphan sandbox(es) "
        f"-> wrote {write}"
    )


@main.command()
@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None,
              help="Override the bench/ root directory (default: auto-detected).")
@click.option("--write", type=click.Path(path_type=Path), default=None,
              help="Write markdown output to this path (default: print to stdout).")
@click.option("--check", is_flag=True,
              help="Exit 1 if any sandbox is REFUTED, INCONCLUSIVE, or has no run. For CI.")
def dashboard(root: Path | None, write: Path | None, check: bool) -> None:
    """Show latest verdict per ACTIVE sandbox as a unified markdown table.

    Walks bench/isolation/ + bench/combination/, takes the most recent
    summary.json per (hypothesis_id, hardware_class) pair, renders a single
    markdown document with an overall PASS/FAIL badge. With --check, exits 1
    on any non-CONFIRMED status — wire this into CI to gate merges.
    """
    # NOTE: the docstring above doubles as the click --help text, so it is
    # user-facing output; edit it with that in mind.
    bench_root = root or _bench_root()
    rows = collect_dashboard_rows(bench_root)
    status, counts = compute_overall_status(rows)
    # Render once; both the file and stdout branches emit the same document.
    md = render_markdown(rows, status=status, counts=counts)

    if write is not None:
        write.parent.mkdir(parents=True, exist_ok=True)
        write.write_text(md, encoding="utf-8")
        color = "green" if status == "PASS" else "red"
        console.print(
            f"[{color}]{status}[/{color}] - {counts_to_summary(counts)} -> wrote {write}"
        )
    else:
        # Print the markdown untouched. Rich's markup and emoji parsing and
        # its width-based wrapping would otherwise alter the document when
        # stdout is piped or redirected.
        console.print(md, markup=False, emoji=False, highlight=False, soft_wrap=True)

    # The CI gate applies whether the output went to a file or to stdout.
    if check and status != "PASS":
        sys.exit(1)


@main.command(name="list-inactive")
@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None)
def list_inactive(root: Path | None) -> None:
Expand Down
209 changes: 209 additions & 0 deletions bench/bench/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
"""Spec-row coverage map: which spec decisions have validating benchmarks?

Joins three sources into a single markdown table:
1. Spec rows parsed from the OCM design-spec markdown.
2. Sandboxes' declared spec_row field (or regex fallback on source_for_claim).
3. Latest verdict per sandbox (via dashboard.collect_dashboard_rows).
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path

from .dashboard import DashboardRow, collect_dashboard_rows
from .metrics import ExpectedJson, Verdict


# Fallback for sandboxes without an explicit spec_row: captures N from every
# "row N" mention (case-insensitive) in the free-text source_for_claim.
_SPEC_ROW_FALLBACK_RE = re.compile(r"row\s+(\d+)", re.IGNORECASE)
# Matches a **bold** span; used to strip the markers from spec row titles.
_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")


@dataclass
class SpecRow:
    """A numbered decision taken from the design spec's markdown table."""

    # Numeric part of the row label; any letter suffix is not included.
    number: int
    # The row label exactly as written in the table, e.g. "6b".
    raw_number: str
    # Text of the row's title cell.
    title: str


@dataclass
class CoverageEntry:
    """A spec row together with the dashboard rows that claim to validate it."""

    spec_row: SpecRow
    # Each entry gets its own list; filled in while the coverage join is built.
    sandboxes: list[DashboardRow] = field(default_factory=list)

    @property
    def best_verdict(self) -> Verdict | None:
        """Pick the most favourable verdict reported for this spec row.

        Ranking is CONFIRMED > INCONCLUSIVE > REFUTED. Sandboxes without a
        verdict are ignored, and ``None`` is returned when nothing ranks.
        A single CONFIRMED sandbox is therefore enough to mark the row
        validated.
        """
        rank = {
            Verdict.REFUTED: 1,
            Verdict.INCONCLUSIVE: 2,
            Verdict.CONFIRMED: 3,
        }
        ranked = [
            sb.verdict
            for sb in self.sandboxes
            if sb.verdict is not None and sb.verdict in rank
        ]
        # max() returns the first maximal element, so ties resolve to the
        # earliest sandbox in the list.
        return max(ranked, key=rank.__getitem__, default=None)


def parse_spec_rows(spec_path: Path) -> list[SpecRow]:
    """Collect the numbered decision rows from a spec markdown file.

    A decision row is a table line such as `| 9 | Title | Choice | Rationale |`
    whose first cell is an integer, optionally followed by one lowercase
    letter ("6b"). Header and divider lines are skipped because their first
    cell does not fit that pattern. A missing file yields an empty list.

    Only the first row seen for each integer is kept, so a later "6b" is
    dropped once "6" has been recorded.
    """
    if not spec_path.exists():
        return []

    label_pattern = re.compile(r"^(\d+)([a-z]?)$")
    parsed: list[SpecRow] = []
    taken: set[int] = set()

    for raw_line in spec_path.read_text(encoding="utf-8").splitlines():
        if not raw_line.startswith("|"):
            continue
        cells = [part.strip() for part in raw_line.strip("|").split("|")]
        if len(cells) < 2:
            continue
        label = cells[0]
        hit = label_pattern.match(label)
        if hit is None:
            continue
        value = int(hit.group(1))
        # Sub-letter rows tend to be variants of the parent row, so the first
        # entry for a number is treated as canonical and later ones skipped.
        if value in taken:
            continue
        taken.add(value)
        parsed.append(
            SpecRow(
                number=value,
                raw_number=label,
                title=_BOLD_RE.sub(r"\1", cells[1]).strip(),
            )
        )
    return parsed


def extract_spec_rows(expected: ExpectedJson) -> list[int]:
    """Return the spec rows this sandbox claims to validate.

    Prefers the explicit `spec_row` field. Falls back to regex-parsing
    `source_for_claim` for sandboxes that haven't been backfilled yet.

    Each row number appears at most once, in first-seen order. Without this,
    a claim that mentions the same row twice ("row 9 ... row 9") would attach
    the sandbox to that spec row twice.
    """
    if expected.spec_row:
        claimed = expected.spec_row
    else:
        source = expected.source_for_claim or ""
        claimed = (int(m) for m in _SPEC_ROW_FALLBACK_RE.findall(source))
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(claimed))


def build_coverage(
    bench_root: Path,
    spec_path: Path,
    results_root: Path | None = None,
) -> tuple[list[CoverageEntry], list[DashboardRow]]:
    """Join spec rows with the sandboxes that claim them.

    Returns ``(entries, orphans)``. ``entries`` holds one CoverageEntry per
    parsed spec row. ``orphans`` holds dashboard rows that claim no spec row
    at all, or that claim a row number missing from the spec; each sandbox
    path is listed there at most once.
    """
    by_number: dict[int, CoverageEntry] = {}
    for spec_row in parse_spec_rows(spec_path):
        by_number[spec_row.number] = CoverageEntry(spec_row=spec_row)

    orphan_rows: list[DashboardRow] = []
    orphan_paths: set[Path] = set()

    def _mark_orphan(candidate: DashboardRow) -> None:
        # Record a sandbox as orphaned once, keyed by its path.
        if candidate.sandbox_path in orphan_paths:
            return
        orphan_paths.add(candidate.sandbox_path)
        orphan_rows.append(candidate)

    for candidate in collect_dashboard_rows(bench_root, results_root):
        claimed = extract_spec_rows(candidate.expected)
        if not claimed:
            _mark_orphan(candidate)
            continue
        for number in claimed:
            target = by_number.get(number)
            if target is None:
                # The claimed row is not in the spec, so surface the sandbox
                # as an orphan too.
                _mark_orphan(candidate)
            else:
                target.sandboxes.append(candidate)

    return list(by_number.values()), orphan_rows


# Markdown text shown in the "Best verdict" column for each verdict;
# CONFIRMED and REFUTED are bolded, INCONCLUSIVE is left plain.
_VERDICT_LABEL = {
    Verdict.CONFIRMED: "**CONFIRMED**",
    Verdict.REFUTED: "**REFUTED**",
    Verdict.INCONCLUSIVE: "INCONCLUSIVE",
}


def render_coverage_markdown(
    entries: list[CoverageEntry],
    orphans: list[DashboardRow],
) -> str:
    """Render the coverage map as a markdown document.

    The output is a summary line, one table row per spec row, and an
    "Orphan sandboxes" table when ``orphans`` is non-empty. The returned
    string always ends with a single newline.

    Free text is made table-safe: each sandbox name appears once per cell,
    and any ``|`` in an orphan's source hint is escaped so it cannot be read
    as a column separator.
    """
    validated = sum(1 for e in entries if e.best_verdict == Verdict.CONFIRMED)
    has_sandbox = sum(1 for e in entries if e.sandboxes)
    total = len(entries)

    lines: list[str] = [
        "# OCM Bench: Spec Row Coverage",
        "",
        "_Auto-generated by `bench coverage`. Do not edit by hand._",
        "",
        f"**Spec rows:** {total} total | **with a sandbox:** {has_sandbox} "
        f"| **CONFIRMED:** {validated}",
        "",
        "| Row | Decision title | Sandbox(es) | Best verdict |",
        "|---|---|---|---|",
    ]
    for entry in entries:
        row = entry.spec_row
        if entry.sandboxes:
            # The same sandbox can be attached more than once; show each
            # name a single time, in first-seen order.
            names = dict.fromkeys(db.sandbox_path.name for db in entry.sandboxes)
            sandbox_str = "<br>".join(f"`{name}`" for name in names)
            verdict = entry.best_verdict
            verdict_str = (
                _VERDICT_LABEL.get(verdict, "(no run yet)") if verdict else "(no run yet)"
            )
        else:
            sandbox_str = "_(none)_"
            verdict_str = "—"
        lines.append(
            f"| {row.raw_number} | {row.title} | {sandbox_str} | {verdict_str} |"
        )

    if orphans:
        lines.extend(
            [
                "",
                "## Orphan sandboxes",
                "",
                "_ACTIVE sandboxes whose `spec_row` field or `source_for_claim` "
                "did not resolve to a known spec row._",
                "",
                "| Sandbox | Hypothesis | Source-for-claim hint |",
                "|---|---|---|",
            ]
        )
        for db in orphans:
            source = (db.expected.source_for_claim or "").replace("\n", " ").strip()
            if len(source) > 80:
                source = source[:77] + "..."
            # Escape after truncating so a trailing backslash is never cut
            # away from the pipe it protects.
            source = source.replace("|", "\\|")
            lines.append(
                f"| `{db.sandbox_path.name}` | `{db.expected.hypothesis_id}` | {source or '-'} |"
            )

    return "\n".join(lines) + "\n"
Loading
Loading