From e4613de32657fea0467d973f1542e566c9ca12d8 Mon Sep 17 00:00:00 2001 From: Brand Date: Sat, 30 May 2026 19:46:27 -0600 Subject: [PATCH 1/7] Commit uncommitted work before Dropbox->Git migration (2026-05-30) --- .github/workflows/bench.yml | 16 ++ bench/bench/cli.py | 91 +++++++++++ bench/bench/coverage.py | 209 +++++++++++++++++++++++++ bench/bench/dashboard.py | 210 +++++++++++++++++++++++++ bench/bench/metrics.py | 25 +++ bench/bench/runner.py | 49 +++++- bench/tests/test_coverage.py | 286 ++++++++++++++++++++++++++++++++++ bench/tests/test_dashboard.py | 272 ++++++++++++++++++++++++++++++++ bench/tests/test_history.py | 142 +++++++++++++++++ docs/benchmarking.md | 24 +++ docs/coverage.md | 49 ++++++ docs/metrics.md | 12 ++ 12 files changed, 1384 insertions(+), 1 deletion(-) create mode 100644 bench/bench/coverage.py create mode 100644 bench/bench/dashboard.py create mode 100644 bench/tests/test_coverage.py create mode 100644 bench/tests/test_dashboard.py create mode 100644 bench/tests/test_history.py create mode 100644 docs/coverage.md create mode 100644 docs/metrics.md diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d6d46f8..79353ca 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -5,11 +5,17 @@ on: branches: [main] paths: - 'bench/**' + - 'docs/superpowers/specs/**' + - 'docs/coverage.md' + - 'docs/metrics.md' - '.github/workflows/bench.yml' pull_request: branches: [main] paths: - 'bench/**' + - 'docs/superpowers/specs/**' + - 'docs/coverage.md' + - 'docs/metrics.md' - '.github/workflows/bench.yml' jobs: @@ -31,3 +37,13 @@ jobs: - name: Validate every sandbox structure (dry-run-all) working-directory: bench run: bench dry-run-all + - name: Regenerate docs/coverage.md + run: python -m bench.cli coverage --root bench --write docs/coverage.md + - name: Regenerate docs/metrics.md + run: python -m bench.cli dashboard --root bench --write docs/metrics.md + - name: Fail if generated docs are out of date + 
run: | + if ! git diff --exit-code docs/coverage.md docs/metrics.md; then + echo "::error::docs/coverage.md or docs/metrics.md is stale. Run 'bench coverage' and 'bench dashboard' locally and commit the regenerated files." + exit 1 + fi diff --git a/bench/bench/cli.py b/bench/bench/cli.py index 32445c0..d9b3cde 100644 --- a/bench/bench/cli.py +++ b/bench/bench/cli.py @@ -10,6 +10,13 @@ from rich.table import Table from .compare import retro_sync_report +from .coverage import build_coverage, render_coverage_markdown +from .dashboard import ( + collect_dashboard_rows, + compute_overall_status, + counts_to_summary, + render_markdown, +) from .runner import DryRunError, list_all_sandboxes, load_expected, run_sandbox console = Console() @@ -304,6 +311,90 @@ def run_all( sys.exit(1) +_DEFAULT_SPEC_PATH = ( + "docs/superpowers/specs/2026-05-08-ocm-v1-design-spec.md" +) + + +@main.command() +@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None, + help="Override the bench/ root directory (default: auto-detected).") +@click.option("--spec", "spec_path", type=click.Path(exists=True, path_type=Path), + default=None, + help=f"Path to the design spec (default: ../{_DEFAULT_SPEC_PATH}).") +@click.option("--write", type=click.Path(path_type=Path), default=None, + help="Write markdown output to this path (default: print to stdout).") +def coverage(root: Path | None, spec_path: Path | None, write: Path | None) -> None: + """Map spec rows to validating sandboxes. + + Walks the design-spec markdown table of locked decisions, joins each row + with sandboxes whose `spec_row` field (or `source_for_claim` regex + fallback) references that row, and emits a coverage table. Orphan + sandboxes — those validating something the spec doesn't number — are + listed separately so they can be reconciled. 
+ """ + bench_root = root or _bench_root() + if spec_path is None: + spec_path = bench_root.parent / _DEFAULT_SPEC_PATH + if not spec_path.exists(): + console.print( + f"[red]Spec not found at {spec_path}[/red]. " + "Pass --spec or run from a checkout with docs/superpowers/." + ) + sys.exit(2) + + entries, orphans = build_coverage(bench_root, spec_path) + md = render_coverage_markdown(entries, orphans) + + if write is not None: + write.parent.mkdir(parents=True, exist_ok=True) + write.write_text(md, encoding="utf-8") + validated = sum( + 1 for e in entries + if any(s.verdict and s.verdict.value == "CONFIRMED" for s in e.sandboxes) + ) + console.print( + f"[green]coverage[/green]: {len(entries)} spec rows, " + f"{validated} CONFIRMED, {len(orphans)} orphan sandbox(es) " + f"-> wrote {write}" + ) + else: + console.print(md) + + +@main.command() +@click.option("--root", type=click.Path(exists=True, path_type=Path), default=None, + help="Override the bench/ root directory (default: auto-detected).") +@click.option("--write", type=click.Path(path_type=Path), default=None, + help="Write markdown output to this path (default: print to stdout).") +@click.option("--check", is_flag=True, + help="Exit 1 if any sandbox is REFUTED, INCONCLUSIVE, or has no run. For CI.") +def dashboard(root: Path | None, write: Path | None, check: bool) -> None: + """Show latest verdict per ACTIVE sandbox as a unified markdown table. + + Walks bench/isolation/ + bench/combination/, takes the most recent + summary.json per (hypothesis_id, hardware_class) pair, renders a single + markdown document with an overall PASS/FAIL badge. With --check, exits 1 + on any non-CONFIRMED status — wire this into CI to gate merges. 
+ """ + bench_root = root or _bench_root() + rows = collect_dashboard_rows(bench_root) + status, counts = compute_overall_status(rows) + + if write is not None: + write.parent.mkdir(parents=True, exist_ok=True) + write.write_text(render_markdown(rows, status=status, counts=counts), encoding="utf-8") + color = "green" if status == "PASS" else "red" + console.print( + f"[{color}]{status}[/{color}] - {counts_to_summary(counts)} -> wrote {write}" + ) + else: + console.print(render_markdown(rows, status=status, counts=counts)) + + if check and status != "PASS": + sys.exit(1) + + @main.command(name="list-inactive") @click.option("--root", type=click.Path(exists=True, path_type=Path), default=None) def list_inactive(root: Path | None) -> None: diff --git a/bench/bench/coverage.py b/bench/bench/coverage.py new file mode 100644 index 0000000..88bef5f --- /dev/null +++ b/bench/bench/coverage.py @@ -0,0 +1,209 @@ +"""Spec-row coverage map: which spec decisions have validating benchmarks? + +Joins three sources into a single markdown table: +1. Spec rows parsed from the OCM design-spec markdown. +2. Sandboxes' declared spec_row field (or regex fallback on source_for_claim). +3. Latest verdict per sandbox (via dashboard.collect_dashboard_rows). +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path + +from .dashboard import DashboardRow, collect_dashboard_rows +from .metrics import ExpectedJson, Verdict + + +_SPEC_ROW_FALLBACK_RE = re.compile(r"row\s+(\d+)", re.IGNORECASE) +_BOLD_RE = re.compile(r"\*\*(.+?)\*\*") + + +@dataclass +class SpecRow: + """One row from the spec's locked-decisions table.""" + + number: int + raw_number: str # preserves "6b" etc. 
+ title: str + + +@dataclass +class CoverageEntry: + """One row in the coverage map.""" + + spec_row: SpecRow + sandboxes: list[DashboardRow] = field(default_factory=list) + + @property + def best_verdict(self) -> Verdict | None: + """Return the strongest verdict among sandboxes claiming this row. + + CONFIRMED > INCONCLUSIVE > REFUTED > None. A row counts as validated + as soon as at least one sandbox CONFIRMs it. + """ + priority = { + Verdict.CONFIRMED: 3, + Verdict.INCONCLUSIVE: 2, + Verdict.REFUTED: 1, + } + best: Verdict | None = None + best_pri = 0 + for sb in self.sandboxes: + if sb.verdict is None: + continue + pri = priority.get(sb.verdict, 0) + if pri > best_pri: + best = sb.verdict + best_pri = pri + return best + + +def parse_spec_rows(spec_path: Path) -> list[SpecRow]: + """Parse a spec markdown file for numbered decision rows. + + Rows look like `| 9 | Title | Choice | Rationale |`. The first cell must + be a number (with optional letter suffix like "6b"). Header/divider rows + are skipped automatically because their first cell isn't a digit. + """ + rows: list[SpecRow] = [] + if not spec_path.exists(): + return rows + seen: set[int] = set() + for line in spec_path.read_text(encoding="utf-8").splitlines(): + if not line.startswith("|"): + continue + cells = [c.strip() for c in line.strip("|").split("|")] + if len(cells) < 2: + continue + first = cells[0] + match = re.match(r"^(\d+)([a-z]?)$", first) + if not match: + continue + number = int(match.group(1)) + raw_number = first + title = _BOLD_RE.sub(r"\1", cells[1]).strip() + # If a row number appears twice (e.g. "6" then "6b"), keep the first + # canonical entry — sub-letter rows tend to be variants of the parent. + if number in seen: + continue + seen.add(number) + rows.append(SpecRow(number=number, raw_number=raw_number, title=title)) + return rows + + +def extract_spec_rows(expected: ExpectedJson) -> list[int]: + """Return spec rows this sandbox claims to validate. 
+ + Prefers the explicit `spec_row` field. Falls back to regex-parsing + `source_for_claim` for sandboxes that haven't been backfilled yet. + """ + if expected.spec_row: + return list(expected.spec_row) + source = expected.source_for_claim or "" + return [int(m) for m in _SPEC_ROW_FALLBACK_RE.findall(source)] + + +def build_coverage( + bench_root: Path, + spec_path: Path, + results_root: Path | None = None, +) -> tuple[list[CoverageEntry], list[DashboardRow]]: + """Build the coverage join. + + Returns (entries_by_spec_row, orphan_sandboxes). Orphans are ACTIVE + sandboxes whose spec_row resolves to an empty list — they validate + something not (yet) numbered in the spec. + """ + spec_rows = parse_spec_rows(spec_path) + dashboard_rows = collect_dashboard_rows(bench_root, results_root) + + entries = {row.number: CoverageEntry(spec_row=row) for row in spec_rows} + orphans: list[DashboardRow] = [] + + seen_paths: set[Path] = set() + for db_row in dashboard_rows: + rows_claimed = extract_spec_rows(db_row.expected) + if not rows_claimed: + if db_row.sandbox_path not in seen_paths: + orphans.append(db_row) + seen_paths.add(db_row.sandbox_path) + continue + for row_num in rows_claimed: + entry = entries.get(row_num) + if entry is None: + # Sandbox references a row not in the spec — surface it as orphan too + if db_row.sandbox_path not in seen_paths: + orphans.append(db_row) + seen_paths.add(db_row.sandbox_path) + continue + entry.sandboxes.append(db_row) + + return list(entries.values()), orphans + + +_VERDICT_LABEL = { + Verdict.CONFIRMED: "**CONFIRMED**", + Verdict.REFUTED: "**REFUTED**", + Verdict.INCONCLUSIVE: "INCONCLUSIVE", +} + + +def render_coverage_markdown( + entries: list[CoverageEntry], + orphans: list[DashboardRow], +) -> str: + """Render the coverage map as markdown.""" + validated = sum(1 for e in entries if e.best_verdict == Verdict.CONFIRMED) + has_sandbox = sum(1 for e in entries if e.sandboxes) + total = len(entries) + + lines: list[str] = [ + "# OCM 
Bench: Spec Row Coverage", + "", + "_Auto-generated by `bench coverage`. Do not edit by hand._", + "", + f"**Spec rows:** {total} total | **with a sandbox:** {has_sandbox} " + f"| **CONFIRMED:** {validated}", + "", + "| Row | Decision title | Sandbox(es) | Best verdict |", + "|---|---|---|---|", + ] + for entry in entries: + row = entry.spec_row + if entry.sandboxes: + sandbox_str = "
".join( + f"`{db.sandbox_path.name}`" for db in entry.sandboxes + ) + verdict = entry.best_verdict + verdict_str = _VERDICT_LABEL.get(verdict, "(no run yet)") if verdict else "(no run yet)" + else: + sandbox_str = "_(none)_" + verdict_str = "—" + lines.append( + f"| {row.raw_number} | {row.title} | {sandbox_str} | {verdict_str} |" + ) + + if orphans: + lines.extend( + [ + "", + "## Orphan sandboxes", + "", + "_ACTIVE sandboxes whose `spec_row` field or `source_for_claim` " + "did not resolve to a known spec row._", + "", + "| Sandbox | Hypothesis | Source-for-claim hint |", + "|---|---|---|", + ] + ) + for db in orphans: + source = (db.expected.source_for_claim or "").replace("\n", " ").strip() + if len(source) > 80: + source = source[:77] + "..." + lines.append( + f"| `{db.sandbox_path.name}` | `{db.expected.hypothesis_id}` | {source or '-'} |" + ) + + return "\n".join(lines) + "\n" diff --git a/bench/bench/dashboard.py b/bench/bench/dashboard.py new file mode 100644 index 0000000..2dac79f --- /dev/null +++ b/bench/bench/dashboard.py @@ -0,0 +1,210 @@ +"""Cross-sandbox metrics dashboard. + +Walks every ACTIVE sandbox, finds the most recent run per +(hypothesis_id, hardware_class) pair under results/, and renders a unified +markdown table with an overall PASS/FAIL badge. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from .metrics import ExpectedJson, Verdict +from .runner import list_all_sandboxes, load_expected + + +@dataclass +class DashboardRow: + """One row in the dashboard — one (sandbox, hardware_class) pair.""" + + sandbox_path: Path + category: str + expected: ExpectedJson + hardware_class: str | None + primary_median: float | None + secondary_median: float | None + verdict: Verdict | None + timestamp_utc: str | None + summary_path: Path | None + + +def _category_of(sandbox_path: Path, bench_root: Path) -> str: + """Derive category from path, e.g. 
isolation/memory/mem0-v3 → 'memory'.""" + try: + rel_parts = sandbox_path.relative_to(bench_root).parts + except ValueError: + return "?" + if len(rel_parts) >= 2: + return rel_parts[1] + return "?" + + +def _threshold_str(expected: ExpectedJson) -> str: + """Format the primary threshold for display (ASCII-only).""" + t = expected.thresholds + if t.confirm_at_least is not None: + return f">= {t.confirm_at_least}" + if t.confirm_at_most is not None: + return f"<= {t.confirm_at_most}" + return "-" + + +def _find_latest_summary_for( + results_root: Path, hypothesis_id: str +) -> dict[str, dict]: + """Return {hardware_class: parsed_summary} for the most recent run per hardware_class.""" + latest: dict[str, tuple[str, dict, Path]] = {} + if not results_root.exists(): + return {} + for summary_path in results_root.rglob("summary.json"): + try: + data = json.loads(summary_path.read_text()) + except (json.JSONDecodeError, OSError): + continue + if data.get("hypothesis_id") != hypothesis_id: + continue + hw = data.get("hardware_class", "?") + ts = data.get("timestamp_utc", "") + existing = latest.get(hw) + if existing is None or ts > existing[0]: + latest[hw] = (ts, data, summary_path) + return {hw: {"data": tup[1], "path": tup[2]} for hw, tup in latest.items()} + + +def collect_dashboard_rows( + bench_root: Path, results_root: Path | None = None +) -> list[DashboardRow]: + """Walk ACTIVE sandboxes and join with their latest results.""" + results_root = results_root or (bench_root / "results") + sandboxes = ( + list_all_sandboxes(bench_root / "isolation") + + list_all_sandboxes(bench_root / "combination") + ) + + rows: list[DashboardRow] = [] + for sandbox in sandboxes: + try: + expected = load_expected(sandbox) + except Exception: # noqa: BLE001 + continue + if expected.status != "ACTIVE": + continue + + category = _category_of(sandbox, bench_root) + latest = _find_latest_summary_for(results_root, expected.hypothesis_id) + + if not latest: + rows.append( + DashboardRow( + 
sandbox_path=sandbox, + category=category, + expected=expected, + hardware_class=None, + primary_median=None, + secondary_median=None, + verdict=None, + timestamp_utc=None, + summary_path=None, + ) + ) + continue + + for hw, info in latest.items(): + data = info["data"] + try: + verdict = Verdict(data.get("verdict")) + except ValueError: + verdict = None + rows.append( + DashboardRow( + sandbox_path=sandbox, + category=category, + expected=expected, + hardware_class=hw, + primary_median=data.get("primary_median"), + secondary_median=data.get("secondary_median"), + verdict=verdict, + timestamp_utc=data.get("timestamp_utc"), + summary_path=info["path"], + ) + ) + + rows.sort(key=lambda r: (r.category, r.expected.hypothesis_id, r.hardware_class or "")) + return rows + + +def compute_overall_status(rows: list[DashboardRow]) -> tuple[str, dict[str, int]]: + """Return ('PASS' or 'FAIL', verdict_counts). + + PASS only when every ACTIVE row has a CONFIRMED verdict. Rows without a + run yet count against the badge (treated as INCONCLUSIVE). + """ + counts = {"CONFIRMED": 0, "REFUTED": 0, "INCONCLUSIVE": 0, "NO_RUN": 0} + for row in rows: + if row.verdict is None: + counts["NO_RUN"] += 1 + else: + counts[row.verdict.value] += 1 + status = "PASS" if counts["CONFIRMED"] == len(rows) and rows else "FAIL" + return status, counts + + +_VERDICT_BADGE = { + "CONFIRMED": "**CONFIRMED**", + "REFUTED": "**REFUTED**", + "INCONCLUSIVE": "INCONCLUSIVE", +} + + +def render_markdown(rows: list[DashboardRow], *, status: str, counts: dict[str, int]) -> str: + """Render the dashboard as a markdown document. + + ASCII-only output so the same string renders on GitHub *and* prints to + Windows-legacy consoles without hitting cp1252 encoding errors. + """ + parts = counts_to_summary(counts) + badge = f"**[{status}]** {parts}" + + lines: list[str] = [ + "# OCM Bench: Metrics Dashboard", + "", + "_Auto-generated by `bench dashboard`. 
Do not edit by hand._", + "", + badge, + "", + "| Sandbox | Hypothesis | Category | Primary metric | Latest | Threshold | Verdict | Hardware | Run |", + "|---|---|---|---|---|---|---|---|---|", + ] + for row in rows: + sandbox_name = row.sandbox_path.name + hypothesis = row.expected.hypothesis_id + metric = row.expected.metric + latest = ( + f"{row.primary_median:.3f}" if row.primary_median is not None else "-" + ) + threshold = _threshold_str(row.expected) + verdict = ( + _VERDICT_BADGE[row.verdict.value] if row.verdict else "(no run yet)" + ) + hw = row.hardware_class or "-" + ts = row.timestamp_utc or "-" + lines.append( + f"| `{sandbox_name}` | `{hypothesis}` | {row.category} | `{metric}` | " + f"{latest} | {threshold} | {verdict} | `{hw}` | {ts} |" + ) + + return "\n".join(lines) + "\n" + + +def counts_to_summary(counts: dict[str, int]) -> str: + """Format verdict counts: '4 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 1 no-run'.""" + parts = [ + f"{counts['CONFIRMED']} CONFIRMED", + f"{counts['REFUTED']} REFUTED", + f"{counts['INCONCLUSIVE']} INCONCLUSIVE", + ] + if counts.get("NO_RUN", 0): + parts.append(f"{counts['NO_RUN']} no-run") + return " / ".join(parts) diff --git a/bench/bench/metrics.py b/bench/bench/metrics.py index e8f9bed..00e030f 100644 --- a/bench/bench/metrics.py +++ b/bench/bench/metrics.py @@ -48,6 +48,10 @@ class ExpectedJson(BaseModel): # ship with just expected.json + README.md so the harness has a target later. status: str = "ACTIVE" blocked_on: list[str] | None = None + # Spec rows this sandbox validates. New sandboxes set this explicitly; for + # existing sandboxes, `bench coverage` falls back to regex-parsing + # `source_for_claim` until they're backfilled. + spec_row: list[int] | None = None model_config = {"extra": "allow"} @@ -79,6 +83,27 @@ class SandboxSummary(BaseModel): verdict_reason: str +class HistoryRecord(BaseModel): + """One row in results/history.jsonl — append-only per-run trend record. 
+ + Distinct from `compare.HistoryEntry` which is an in-memory dataclass built + by walking summary.json files. HistoryRecord is the durable schema written + once per `bench run`, giving a fast scan path for trend tooling without + re-parsing every summary.json. + """ + + timestamp_utc: str + hypothesis_id: str + hardware_class: str + primary_median: float + primary_std: float | None = None + secondary_median: float | None = None + verdict: Verdict + git_sha: str + repeats: int + wall_clock_s: float + + def decide_verdict( primary_median: float, expected: ExpectedJson, diff --git a/bench/bench/runner.py b/bench/bench/runner.py index a97647e..80d56c9 100644 --- a/bench/bench/runner.py +++ b/bench/bench/runner.py @@ -4,13 +4,42 @@ import json import subprocess +import time from datetime import UTC, datetime from pathlib import Path import yaml from pydantic import ValidationError -from .metrics import ExpectedJson, RunResult, SandboxSummary, Verdict, decide_verdict +from .metrics import ( + ExpectedJson, + HistoryRecord, + RunResult, + SandboxSummary, + Verdict, + decide_verdict, +) + + +def _get_git_sha(repo_hint: Path | None = None) -> str: + """Return the current git HEAD short SHA, or "unknown" if not in a repo. + + Resilient to test environments and shallow clones — never raises. 
+ """ + try: + proc = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + cwd=str(repo_hint) if repo_hint else None, + capture_output=True, + text=True, + timeout=5, + check=False, + ) + if proc.returncode == 0 and proc.stdout.strip(): + return proc.stdout.strip() + except (subprocess.SubprocessError, FileNotFoundError, OSError): + pass + return "unknown" class DryRunError(ValueError): @@ -122,10 +151,12 @@ def run_sandbox( run_dir = out_dir / f"{timestamp}-{expected.hypothesis_id}-{hardware_class}" run_dir.mkdir(parents=True, exist_ok=True) + wall_clock_start = time.monotonic() runs: list[RunResult] = [] for i in range(repeats): result = _execute_compose(sandbox_path, expected, repeat=i, out_dir=run_dir) runs.append(result) + wall_clock_s = time.monotonic() - wall_clock_start primary_values = sorted(r.primary_value for r in runs) primary_median = primary_values[len(primary_values) // 2] @@ -155,6 +186,22 @@ def run_sandbox( verdict_reason=reason, ) (run_dir / "summary.json").write_text(summary.model_dump_json(indent=2)) + + record = HistoryRecord( + timestamp_utc=timestamp, + hypothesis_id=expected.hypothesis_id, + hardware_class=hardware_class, + primary_median=primary_median, + primary_std=primary_std, + secondary_median=secondary_median, + verdict=verdict, + git_sha=_get_git_sha(repo_hint=sandbox_path), + repeats=repeats, + wall_clock_s=wall_clock_s, + ) + with (out_dir / "history.jsonl").open("a", encoding="utf-8") as f: + f.write(record.model_dump_json() + "\n") + return summary diff --git a/bench/tests/test_coverage.py b/bench/tests/test_coverage.py new file mode 100644 index 0000000..6bd226b --- /dev/null +++ b/bench/tests/test_coverage.py @@ -0,0 +1,286 @@ +"""Tests for the spec-row coverage map.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from click.testing import CliRunner + +from bench.cli import main +from bench.coverage import ( + build_coverage, + extract_spec_rows, + parse_spec_rows, + 
render_coverage_markdown, +) +from bench.metrics import ExpectedJson, Thresholds + + +SAMPLE_SPEC = """# OCM Design Spec + +Some preamble paragraph. + +## Locked decisions + +| # | Decision | Choice | Rationale | +|---|---|---|---| +| 1 | Project name | **OpenCircuitModel (OCM)** | User-locked | +| 6 | Inference engine — Apple Silicon | **llama.cpp + Metal** | most stable on macOS | +| 6b | Inference engine — RTX 5090 | **TensorRT-LLM FP4** | only engine for FP4 | +| 9 | Agent memory | **Mem0 v3** | library-driven retrieval | +| 24 | Compressed-view tool | **Aider repomap** | 70% token cut | + +## Some other section + +Not a table. +""" + + +def _write_spec(tmp_path: Path) -> Path: + spec = tmp_path / "spec.md" + spec.write_text(SAMPLE_SPEC, encoding="utf-8") + return spec + + +def _write_sandbox( + root: Path, + *, + hypothesis_id: str, + spec_row: list[int] | None = None, + source_for_claim: str | None = None, + status: str = "ACTIVE", +) -> Path: + root.mkdir(parents=True, exist_ok=True) + payload = { + "hypothesis_id": hypothesis_id, + "claim": "A claim describing the hypothesis under test.", + "metric": "primary_pct", + "thresholds": {"confirm_at_least": 80.0, "refute_below": 60.0}, + "workload": "test.jsonl", + "status": status, + } + if spec_row is not None: + payload["spec_row"] = spec_row + if source_for_claim is not None: + payload["source_for_claim"] = source_for_claim + if status == "INACTIVE": + payload["blocked_on"] = ["upstream not ready"] + (root / "expected.json").write_text(json.dumps(payload)) + (root / "README.md").write_text("# Sandbox\n") + if status == "ACTIVE": + (root / "docker-compose.yml").write_text( + "services:\n bench:\n image: alpine\n command: ['echo']\n" + ) + (root / "bench.py").write_text("# noop\n") + return root + + +def _bench_layout(tmp_path: Path) -> Path: + (tmp_path / "isolation").mkdir() + (tmp_path / "combination").mkdir() + return tmp_path + + +# --- parse_spec_rows --- + + +def 
test_parse_spec_rows_extracts_all_numbered(tmp_path: Path): + spec = _write_spec(tmp_path) + rows = parse_spec_rows(spec) + numbers = [r.number for r in rows] + assert 1 in numbers + assert 6 in numbers + assert 9 in numbers + assert 24 in numbers + + +def test_parse_spec_rows_strips_bold_from_titles(tmp_path: Path): + spec = _write_spec(tmp_path) + rows = parse_spec_rows(spec) + row_by_num = {r.number: r for r in rows} + # The choice column has bold; the title (column 2) doesn't here, but in + # the real spec row 21's title IS bold. Verify the parser strips it. + assert "Project name" == row_by_num[1].title + + +def test_parse_spec_rows_skips_divider_row(tmp_path: Path): + spec = _write_spec(tmp_path) + rows = parse_spec_rows(spec) + # The divider `|---|---|---|---|` should not appear as a row. + assert all(r.number != 0 for r in rows) + + +def test_parse_spec_rows_returns_empty_on_missing_file(tmp_path: Path): + rows = parse_spec_rows(tmp_path / "does-not-exist.md") + assert rows == [] + + +def test_parse_spec_rows_collapses_letter_subrows(tmp_path: Path): + """6b should be treated as a variant of row 6; only one entry kept.""" + spec = _write_spec(tmp_path) + rows = parse_spec_rows(spec) + sixes = [r for r in rows if r.number == 6] + assert len(sixes) == 1 + + +# --- extract_spec_rows --- + + +def _make_expected(**overrides) -> ExpectedJson: + base = { + "hypothesis_id": "h", + "claim": "Claim that has at least ten characters of content.", + "metric": "m", + "thresholds": Thresholds(confirm_at_least=80.0, refute_below=60.0), + "workload": "w", + } + base.update(overrides) + return ExpectedJson(**base) + + +def test_extract_spec_rows_prefers_explicit_field(): + e = _make_expected(spec_row=[6, 8, 13], source_for_claim="row 99") + assert extract_spec_rows(e) == [6, 8, 13] + + +def test_extract_spec_rows_falls_back_to_source_regex(): + e = _make_expected(source_for_claim="Spec v0.4 row 9 — 'library-driven retrieval'") + assert extract_spec_rows(e) == [9] + + +def 
test_extract_spec_rows_finds_multiple_in_source(): + e = _make_expected(source_for_claim="row 6, row 8, row 13, row 29 — mobile revisions") + assert extract_spec_rows(e) == [6, 8, 13, 29] + + +def test_extract_spec_rows_empty_when_no_signal(): + e = _make_expected(source_for_claim="general background, no row reference") + assert extract_spec_rows(e) == [] + + +# --- build_coverage --- + + +def test_build_coverage_joins_sandboxes_to_rows(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "mem0-s", + hypothesis_id="mem0-h", + spec_row=[9], + ) + + entries, orphans = build_coverage(bench_root, spec) + row_9 = next(e for e in entries if e.spec_row.number == 9) + assert len(row_9.sandboxes) == 1 + assert row_9.sandboxes[0].expected.hypothesis_id == "mem0-h" + assert orphans == [] + + +def test_build_coverage_finds_orphan_sandboxes(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "orphan-s", + hypothesis_id="orphan-h", + # No spec_row, no row-N pattern in source + source_for_claim="independent claim, no spec row reference", + ) + + entries, orphans = build_coverage(bench_root, spec) + assert len(orphans) == 1 + assert orphans[0].expected.hypothesis_id == "orphan-h" + + +def test_build_coverage_uses_source_for_claim_fallback(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "retrieval" / "aider-s", + hypothesis_id="aider-h", + # No explicit spec_row — relies on regex fallback + source_for_claim="Spec v0.3 row 24: 'Aider-style repomap pattern...'", + ) + + entries, orphans = build_coverage(bench_root, spec) + row_24 = next(e for e in entries if e.spec_row.number == 24) + assert len(row_24.sandboxes) == 1 + assert orphans == [] + + +def test_build_coverage_excludes_inactive_sandboxes(tmp_path: Path): + 
bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "stub-s", + hypothesis_id="stub-h", + spec_row=[9], + status="INACTIVE", + ) + + entries, orphans = build_coverage(bench_root, spec) + row_9 = next(e for e in entries if e.spec_row.number == 9) + assert row_9.sandboxes == [] + + +def test_build_coverage_unknown_row_is_orphan(tmp_path: Path): + """Sandbox claiming row 999 (not in spec) is treated as an orphan.""" + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "ghost-s", + hypothesis_id="ghost-h", + spec_row=[999], + ) + + _, orphans = build_coverage(bench_root, spec) + assert any(o.expected.hypothesis_id == "ghost-h" for o in orphans) + + +# --- render markdown + CLI --- + + +def test_render_markdown_includes_all_rows(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "mem0-s", + hypothesis_id="mem0-h", + spec_row=[9], + ) + + entries, orphans = build_coverage(bench_root, spec) + md = render_coverage_markdown(entries, orphans) + assert "OCM Bench" in md + assert "Spec Row Coverage" in md + # Each row from the sample spec should appear + for n in [1, 6, 9, 24]: + assert f"| {n} |" in md + assert "mem0-s" in md + + +def test_cli_coverage_writes_to_file(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + spec = _write_spec(tmp_path) + _write_sandbox( + bench_root / "isolation" / "memory" / "s", + hypothesis_id="h-mem0", + spec_row=[9], + ) + out = tmp_path / "docs" / "coverage.md" + + cli = CliRunner() + result = cli.invoke( + main, + [ + "coverage", + "--root", str(bench_root), + "--spec", str(spec), + "--write", str(out), + ], + ) + assert result.exit_code == 0, result.output + assert out.exists() + md = out.read_text(encoding="utf-8") + assert "h-mem0" in md or "s" in md diff --git a/bench/tests/test_dashboard.py 
b/bench/tests/test_dashboard.py new file mode 100644 index 0000000..c2e1001 --- /dev/null +++ b/bench/tests/test_dashboard.py @@ -0,0 +1,272 @@ +"""Tests for the bench dashboard subcommand and dashboard.py helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from click.testing import CliRunner + +from bench.cli import main +from bench.dashboard import ( + collect_dashboard_rows, + compute_overall_status, + render_markdown, +) + + +def _write_sandbox( + root: Path, + *, + hypothesis_id: str, + status: str = "ACTIVE", + confirm_at_least: float = 80.0, + metric: str = "primary_pct", +) -> Path: + root.mkdir(parents=True, exist_ok=True) + payload = { + "hypothesis_id": hypothesis_id, + "claim": "A claim describing the hypothesis under test.", + "metric": metric, + "thresholds": {"confirm_at_least": confirm_at_least, "refute_below": 60.0}, + "workload": "test.jsonl", + "status": status, + } + if status == "INACTIVE": + payload["blocked_on"] = ["upstream not ready"] + (root / "expected.json").write_text(json.dumps(payload)) + (root / "README.md").write_text("# Sandbox\n") + if status == "ACTIVE": + (root / "docker-compose.yml").write_text( + "services:\n bench:\n image: alpine\n command: ['echo', 'noop']\n" + ) + (root / "bench.py").write_text("# noop\n") + return root + + +def _write_summary( + results_root: Path, + *, + hypothesis_id: str, + hardware_class: str, + primary_median: float, + timestamp: str, + verdict: str = "CONFIRMED", +) -> Path: + run_dir = results_root / f"{timestamp}-{hypothesis_id}-{hardware_class}" + run_dir.mkdir(parents=True, exist_ok=True) + payload = { + "hypothesis_id": hypothesis_id, + "hardware_class": hardware_class, + "timestamp_utc": timestamp, + "expected": {}, + "runs": [], + "primary_median": primary_median, + "primary_std": None, + "secondary_median": None, + "secondary_std": None, + "verdict": verdict, + "verdict_reason": "test", + } + path = run_dir / "summary.json" + 
path.write_text(json.dumps(payload)) + return path + + +def _bench_layout(tmp_path: Path) -> Path: + """Create a minimal bench-root with isolation/ + combination/.""" + (tmp_path / "isolation").mkdir() + (tmp_path / "combination").mkdir() + return tmp_path + + +def test_collect_rows_excludes_inactive(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "active-s", hypothesis_id="h-active") + _write_sandbox( + bench_root / "isolation" / "memory" / "inactive-s", + hypothesis_id="h-inactive", + status="INACTIVE", + ) + + rows = collect_dashboard_rows(bench_root) + assert len(rows) == 1 + assert rows[0].expected.hypothesis_id == "h-active" + assert rows[0].category == "memory" + + +def test_collect_rows_picks_latest_per_hardware(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="rtx-4090", + primary_median=80.0, timestamp="2026-01-01T00-00-00Z", + ) + _write_summary( + results, hypothesis_id="h1", hardware_class="rtx-4090", + primary_median=85.0, timestamp="2026-02-01T00-00-00Z", + ) + + rows = collect_dashboard_rows(bench_root) + assert len(rows) == 1 + assert rows[0].primary_median == 85.0 + assert rows[0].timestamp_utc == "2026-02-01T00-00-00Z" + + +def test_collect_rows_splits_by_hardware_class(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="rtx-4090", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + _write_summary( + results, hypothesis_id="h1", hardware_class="m4-pro", + primary_median=70.0, timestamp="2026-01-01T00-00-00Z", + verdict="INCONCLUSIVE", + ) + + rows = collect_dashboard_rows(bench_root) + assert len(rows) == 2 + 
by_hw = {r.hardware_class: r for r in rows} + assert by_hw["rtx-4090"].verdict.value == "CONFIRMED" + assert by_hw["m4-pro"].verdict.value == "INCONCLUSIVE" + + +def test_no_runs_yet_shows_row_with_none(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h-fresh") + + rows = collect_dashboard_rows(bench_root) + assert len(rows) == 1 + assert rows[0].verdict is None + assert rows[0].primary_median is None + + +def test_overall_status_pass_when_all_confirmed(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s1", hypothesis_id="h1") + _write_sandbox(bench_root / "isolation" / "memory" / "s2", hypothesis_id="h2") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="hw", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + _write_summary( + results, hypothesis_id="h2", hardware_class="hw", + primary_median=90.0, timestamp="2026-01-01T00-00-00Z", + ) + + rows = collect_dashboard_rows(bench_root) + status, counts = compute_overall_status(rows) + assert status == "PASS" + assert counts["CONFIRMED"] == 2 + + +def test_overall_status_fail_when_any_refuted(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s1", hypothesis_id="h1") + _write_sandbox(bench_root / "isolation" / "memory" / "s2", hypothesis_id="h2") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="hw", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + _write_summary( + results, hypothesis_id="h2", hardware_class="hw", + primary_median=40.0, timestamp="2026-01-01T00-00-00Z", + verdict="REFUTED", + ) + + rows = collect_dashboard_rows(bench_root) + status, counts = compute_overall_status(rows) + assert status == "FAIL" + assert counts["REFUTED"] == 1 + + +def 
test_overall_status_fail_when_no_runs(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h-fresh") + + rows = collect_dashboard_rows(bench_root) + status, _ = compute_overall_status(rows) + assert status == "FAIL" + + +def test_render_markdown_contains_badge_and_rows(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "my-sandbox", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="rtx-4090", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + + rows = collect_dashboard_rows(bench_root) + status, counts = compute_overall_status(rows) + md = render_markdown(rows, status=status, counts=counts) + assert "OCM Bench" in md + assert "[PASS]" in md + assert "my-sandbox" in md + assert "85.000" in md + assert "rtx-4090" in md + assert "CONFIRMED" in md + + +def test_cli_dashboard_writes_to_file(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="hw", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + out_path = tmp_path / "docs" / "metrics.md" + + runner = CliRunner() + result = runner.invoke( + main, + ["dashboard", "--root", str(bench_root), "--write", str(out_path)], + ) + assert result.exit_code == 0, result.output + assert out_path.exists() + md = out_path.read_text() + assert "OCM Bench" in md + assert "h1" in md + + +def test_cli_dashboard_check_exits_1_on_refuted(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="hw", + primary_median=40.0, timestamp="2026-01-01T00-00-00Z", + 
verdict="REFUTED", + ) + + runner = CliRunner() + result = runner.invoke( + main, + ["dashboard", "--root", str(bench_root), "--check"], + ) + assert result.exit_code == 1 + + +def test_cli_dashboard_check_passes_when_all_confirmed(tmp_path: Path): + bench_root = _bench_layout(tmp_path) + _write_sandbox(bench_root / "isolation" / "memory" / "s", hypothesis_id="h1") + results = bench_root / "results" + _write_summary( + results, hypothesis_id="h1", hardware_class="hw", + primary_median=85.0, timestamp="2026-01-01T00-00-00Z", + ) + + runner = CliRunner() + result = runner.invoke( + main, + ["dashboard", "--root", str(bench_root), "--check"], + ) + assert result.exit_code == 0, result.output diff --git a/bench/tests/test_history.py b/bench/tests/test_history.py new file mode 100644 index 0000000..137c92d --- /dev/null +++ b/bench/tests/test_history.py @@ -0,0 +1,142 @@ +"""Tests for trend tracking — history.jsonl append on each run.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from bench import runner as runner_module +from bench.metrics import HistoryRecord, RunResult +from bench.runner import _get_git_sha, run_sandbox + + +def _write_active_sandbox(root: Path, *, hypothesis_id: str = "trend-h") -> Path: + root.mkdir(parents=True, exist_ok=True) + expected = { + "hypothesis_id": hypothesis_id, + "claim": "A claim describing the hypothesis under test.", + "metric": "primary_pct", + "thresholds": {"confirm_at_least": 80.0, "refute_below": 60.0}, + "workload": "test.jsonl", + } + (root / "expected.json").write_text(json.dumps(expected)) + (root / "docker-compose.yml").write_text( + "services:\n bench:\n image: alpine\n command: ['echo', 'noop']\n" + ) + (root / "bench.py").write_text("# placeholder\n") + (root / "README.md").write_text("# Trend sandbox\n") + return root + + +def _fake_execute(sandbox_path, expected, *, repeat, out_dir, primary=85.0): + """Stand-in for _execute_compose that skips Docker.""" + 
return RunResult( + hypothesis_id=expected.hypothesis_id, + repeat_index=repeat, + primary_value=primary, + secondary_value=None, + duration_seconds=0.01, + raw_path=None, + ) + + +def test_history_jsonl_created_after_run(tmp_path: Path, monkeypatch): + sandbox = _write_active_sandbox(tmp_path / "sandbox") + results = tmp_path / "results" + + monkeypatch.setattr(runner_module, "_execute_compose", _fake_execute) + + summary = run_sandbox( + sandbox, + hardware_class="cpu-only-32gb", + repeats=3, + out_dir=results, + dry_run=False, + ) + + history_path = results / "history.jsonl" + assert history_path.exists() + lines = history_path.read_text().splitlines() + assert len(lines) == 1 + + record = HistoryRecord.model_validate_json(lines[0]) + assert record.hypothesis_id == "trend-h" + assert record.hardware_class == "cpu-only-32gb" + assert record.primary_median == summary.primary_median + assert record.verdict.value == summary.verdict.value + assert record.repeats == 3 + assert record.wall_clock_s >= 0.0 + assert record.git_sha # always set, "unknown" if not in repo + + +def test_history_jsonl_appends_not_overwrites(tmp_path: Path, monkeypatch): + sandbox = _write_active_sandbox(tmp_path / "sandbox") + results = tmp_path / "results" + + monkeypatch.setattr(runner_module, "_execute_compose", _fake_execute) + + run_sandbox(sandbox, hardware_class="hw", repeats=1, out_dir=results) + run_sandbox(sandbox, hardware_class="hw", repeats=1, out_dir=results) + run_sandbox(sandbox, hardware_class="hw", repeats=1, out_dir=results) + + lines = (results / "history.jsonl").read_text().splitlines() + assert len(lines) == 3 + for line in lines: + HistoryRecord.model_validate_json(line) # each parses cleanly + + +def test_history_not_written_on_dry_run(tmp_path: Path, monkeypatch): + sandbox = _write_active_sandbox(tmp_path / "sandbox") + results = tmp_path / "results" + + monkeypatch.setattr(runner_module, "_execute_compose", _fake_execute) + + run_sandbox( + sandbox, + 
hardware_class="hw", + repeats=1, + out_dir=results, + dry_run=True, + ) + + assert not (results / "history.jsonl").exists() + + +def test_get_git_sha_returns_unknown_outside_repo(tmp_path: Path): + """In a fresh tmp_path (not a git repo), git_sha falls back to 'unknown'.""" + sha = _get_git_sha(repo_hint=tmp_path) + assert sha == "unknown" + + +def test_get_git_sha_returns_value_in_repo(): + """Inside the actual ocm repo, git_sha returns a short SHA (7+ hex chars).""" + sha = _get_git_sha() + if sha == "unknown": + pytest.skip("not running inside a git repo") + assert len(sha) >= 7 + assert all(c in "0123456789abcdef" for c in sha) + + +def test_history_record_captures_secondary_median(tmp_path: Path, monkeypatch): + sandbox = _write_active_sandbox(tmp_path / "sandbox") + results = tmp_path / "results" + + def with_secondary(sandbox_path, expected, *, repeat, out_dir): + return RunResult( + hypothesis_id=expected.hypothesis_id, + repeat_index=repeat, + primary_value=85.0, + secondary_value=42.0, + duration_seconds=0.01, + raw_path=None, + ) + + monkeypatch.setattr(runner_module, "_execute_compose", with_secondary) + + run_sandbox(sandbox, hardware_class="hw", repeats=3, out_dir=results) + + line = (results / "history.jsonl").read_text().splitlines()[0] + record = HistoryRecord.model_validate_json(line) + assert record.secondary_median == 42.0 diff --git a/docs/benchmarking.md b/docs/benchmarking.md index 4fa5a84..22db545 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -107,6 +107,30 @@ bench report --hypothesis-id vllm-q4-llama8b-singlestream-tps bench report --hardware-class nvidia-rtx-4090-24gb ``` +### Snapshot views: dashboard + coverage + +Two commands answer "what is the current state?" without re-running anything: + +```bash +# All ACTIVE sandboxes, latest verdict per (hypothesis, hardware) pair. +# Prints a markdown table to stdout; --write commits to docs/metrics.md. 
+bench dashboard --write docs/metrics.md + +# Same, but exits 1 if any sandbox is REFUTED, INCONCLUSIVE, or has no run. +# CI uses this to gate merges that would weaken the published numbers. +bench dashboard --check + +# Map spec rows to validating sandboxes (one row per locked decision). +# Surfaces "unvalidated rows" (no sandbox) and "orphan sandboxes" (no row). +bench coverage --write docs/coverage.md +``` + +`docs/metrics.md` and `docs/coverage.md` are checked into the repo and regenerated by CI. If you change a sandbox or flip its state, run both commands locally and commit the regenerated files — CI will fail the PR otherwise. + +### Trend tracking + +Every non-dry-run `bench run` appends one line to `bench/results/history.jsonl` capturing the run's metric, verdict, git SHA, wall-clock time, and repeats (dry runs write nothing). This is the single fast scan path for plotting trends or bisecting regressions without re-parsing every `summary.json`. The schema is `HistoryRecord` in [bench/bench/metrics.py](../bench/bench/metrics.py). + ## Hardware classes A **hardware class** is an opaque tag that groups results by the machine they ran on. Don't try to make this ontologically clean — just be consistent. Examples currently in use: diff --git a/docs/coverage.md b/docs/coverage.md new file mode 100644 index 0000000..439978b --- /dev/null +++ b/docs/coverage.md @@ -0,0 +1,49 @@ +# OCM Bench: Spec Row Coverage + +_Auto-generated by `bench coverage`. Do not edit by hand._ + +**Spec rows:** 33 total | **with a sandbox:** 3 | **CONFIRMED:** 0 + +| Row | Decision title | Sandbox(es) | Best verdict | +|---|---|---|---| +| 1 | Project name | _(none)_ | — | +| 2 | Scope | _(none)_ | — | +| 3 | Build approach | _(none)_ | — | +| 4 | License | _(none)_ | — | +| 5 | Inference engine — NVIDIA peers | _(none)_ | — | +| 6 | Inference engine — Apple Silicon peers (incl.
iPad M-series via v0.7 mobile policy) | _(none)_ | — | +| 7 | Sharded inference (v6+) | _(none)_ | — | +| 8 | Mesh transport (v2+, mobile bindings v1.5+) | _(none)_ | — | +| 9 | Agent memory + virtual context | _(none)_ | — | +| 10 | Agent runtime | _(none)_ | — | +| 11 | Client-facing API | _(none)_ | — | +| 12 | Daemon / cross-platform UI | _(none)_ | — | +| 13 | Platforms | _(none)_ | — | +| 14 | Hero pitch | _(none)_ | — | +| 15 | Wedge / secondary | _(none)_ | — | +| 16 | Canonical model (24GB+ VRAM) | _(none)_ | — | +| 17 | Default model (8GB-16GB) | _(none)_ | — | +| 18 | CPU-only / tiny tier | _(none)_ | — | +| 19 | Speculative decoding | _(none)_ | — | +| 20 | Tool-call format | _(none)_ | — | +| 21 | Schema compression | `sandbox-e-schema-compression` | (no run yet) | +| 22 | Structured generation | _(none)_ | — | +| 23 | DSPy GEPA + signed skill artifacts (v2+ network-effect lever) | _(none)_ | — | +| 24 | Compressed-view tool (v1, new in v0.3) | `aider-repomap-fidelity` | (no run yet) | +| 25 | Code-context retrieval (v1, new in v0.3) | _(none)_ | — | +| 26 | Decentralized Memory Palace (v3.5+ network-effect lever, new in v0.4) | _(none)_ | — | +| 27 | Effective-Context Triad (cross-cutting constraint, new in v0.4) | _(none)_ | — | +| 28 | Compression pipeline contract (cross-cutting, new in v0.4) | _(none)_ | — | +| 29 | Encryption mapped onto privacy zones A/B/C (new in v0.4, mobile keystore mapping added v0.7) | `sandbox-i-mem0-encryption-overhead` | (no run yet) | +| 30 | Compress-then-encrypt order mandatory (new in v0.4) | _(none)_ | — | +| 31 | VM / cloud deployment policy (new in v0.5) | _(none)_ | — | +| 32 | Bench-sandbox INACTIVE convention (new in v0.6) | _(none)_ | — | +| 33 | Mobile compute policy — opportunistic contributor with staged rollout (new in v0.7) | _(none)_ | — | + +## Orphan sandboxes + +_ACTIVE sandboxes whose `spec_row` field or `source_for_claim` did not resolve to a known spec row._ + +| Sandbox | Hypothesis | 
Source-for-claim hint | +|---|---|---| +| `vllm-q4-llama8b` | `vllm-q4-llama8b-singlestream-tps` | https://www.databasemart.com/blog/vllm-gpu-benchmark-rtx4090 | diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 0000000..0216933 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,12 @@ +# OCM Bench: Metrics Dashboard + +_Auto-generated by `bench dashboard`. Do not edit by hand._ + +**[FAIL]** 0 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 4 no-run + +| Sandbox | Hypothesis | Category | Primary metric | Latest | Threshold | Verdict | Hardware | Run | +|---|---|---|---|---|---|---|---|---| +| `sandbox-i-mem0-encryption-overhead` | `mem0-sqlcipher-aes256-overhead` | frontier-comparison | `encrypted_retrieval_latency_overhead_pct` | - | <= 15.0 | (no run yet) | `-` | - | +| `sandbox-e-schema-compression` | `schema-compression-token-impact` | frontier-comparison | `input_tokens_pct_reduction_median` | - | >= 30.0 | (no run yet) | `-` | - | +| `vllm-q4-llama8b` | `vllm-q4-llama8b-singlestream-tps` | inference-engines | `tokens_per_second_median_single_stream` | - | >= 100.0 | (no run yet) | `-` | - | +| `aider-repomap-fidelity` | `aider-repomap-token-reduction-and-symbol-coverage` | retrieval | `token_reduction_pct` | - | >= 50.0 | (no run yet) | `-` | - | From 87c19c8b63573912563d1809311c7de6394569d2 Mon Sep 17 00:00:00 2001 From: Brand Date: Thu, 11 Jun 2026 14:33:07 -0600 Subject: [PATCH 2/7] =?UTF-8?q?bench:=20amnesia-ab=20sandbox=20=E2=80=94?= =?UTF-8?q?=20FIRST=20RUN=20of=20the=20suite,=20verdict=20CONFIRMED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cheapest discriminating test of the central loop (spec row 9, library- driven retrieval) as a faithful miniature: mxbai-embed-large cosine top-5 -> inject -> llama3 8B Q4 via Ollama, 62-memory corpus w/ cross-project distractors, 20 tasks, objective key-fact scoring. 
Measured (results/run-2026-06-11T20-32-21.json): memory_on_fact_recall_pct 94.2 (confirm >=70) retrieval_hit_rate_pct 100.0 (confirm >=80) memory_off_fact_recall_pct 2.5 (sanity <=25 — corpus not guessable) latency p50 on/off 19.5s / 12.3s OFF-arm failure mode is confident fabrication, not ignorance — the memory loop is the difference between correct specifics and plausible lies on an 8B model. Per the decision rule this justifies: the Ollama backend adapter, activating mem0-v3-locomo, and cutting v0.1.0. Co-Authored-By: Claude Fable 5 --- bench/isolation/memory/amnesia-ab/README.md | 53 ++ bench/isolation/memory/amnesia-ab/corpus.json | 88 +++ .../isolation/memory/amnesia-ab/expected.json | 25 + .../results/run-2026-06-11T20-32-21.json | 643 ++++++++++++++++++ bench/isolation/memory/amnesia-ab/run.mjs | 135 ++++ 5 files changed, 944 insertions(+) create mode 100644 bench/isolation/memory/amnesia-ab/README.md create mode 100644 bench/isolation/memory/amnesia-ab/corpus.json create mode 100644 bench/isolation/memory/amnesia-ab/expected.json create mode 100644 bench/isolation/memory/amnesia-ab/results/run-2026-06-11T20-32-21.json create mode 100644 bench/isolation/memory/amnesia-ab/run.mjs diff --git a/bench/isolation/memory/amnesia-ab/README.md b/bench/isolation/memory/amnesia-ab/README.md new file mode 100644 index 0000000..a619d52 --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/README.md @@ -0,0 +1,53 @@ +# amnesia-ab — memory ON vs OFF on an 8B local model + +**Status: ACTIVE — first sandbox in the suite to actually run.** + +## What this tests + +The cheapest discriminating test of OCM's reason to exist. OCM's pitch is "a local model +that remembers you" — the central loop is library-driven retrieval (spec row 9): embed the +request, pull top-k memories, inject them, generate. 
If that loop does not make a small +local model *clearly* better on tasks that depend on prior-session facts, the +persistent-memory differentiator is refuted at small scale and nothing downstream matters. + +## Method + +Faithful **miniature** of the Mem0 library-driven pattern — NOT the OCM binaries (which +need a 3-process stack: daemon + inference server + Mem0). Same loop, operator's actual +hardware class: + +- **Corpus:** 62 synthetic prior-session memories across 5 fictional projects + (e-bike conversion, greenhouse controller, NAS build, drone gimbal, bakery side-business) + plus personal preferences. Facts carry exact-matchable invented tokens (vendor names, + numbers, dates) that an 8B model cannot guess — the other 4 projects act as retrieval + distractors. +- **Tasks:** 20, each requiring 1-3 specific stored facts to answer well. +- **ARM A (ON):** task → mxbai-embed-large query embedding → cosine top-5 → injected as + RELEVANT MEMORIES in the system prompt → llama3 8B Q4. +- **ARM B (OFF):** identical prompt, no memories. +- **Scoring:** objective substring match on key facts (normalized: lowercase, strip + spaces/commas/periods). Retrieval hit = ≥2 of the task's source memories in top-5 + (or all, if the task has fewer than 2 sources). + +## Hypothesis contract (expected.json) + +| Metric | Confirm | Refute | Meaning on refute | +|---|---|---|---| +| `memory_on_fact_recall_pct` | ≥70 | <50 | 8B can't use injected memories — thesis fails on target hardware | +| `retrieval_hit_rate_pct` | ≥80 | <60 | embedding retrieval fails even at toy scale — memory layer needs redesign | +| `memory_off_fact_recall_pct` (sanity) | ≤25 | — | above 25 the corpus is guessable and the run is INVALID, not a confirm | + +## Decision rule + +- **CONFIRMED** → the core loop is valid at small scale. Justifies (1) the Ollama backend + adapter in `ocm-inference`, (2) standing up the real Mem0-on-8B LoCoMo sandbox
+- **REFUTED** → archive-without-guilt evidence, before any further investment. + +## Run + +``` +node run.mjs # Ollama on 127.0.0.1:11434 with llama3 + mxbai-embed-large pulled +``` + +Results land in `results/run-.json` (summary + every prompt/output/score row). diff --git a/bench/isolation/memory/amnesia-ab/corpus.json b/bench/isolation/memory/amnesia-ab/corpus.json new file mode 100644 index 0000000..3ee25d1 --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/corpus.json @@ -0,0 +1,88 @@ +{ + "memories": [ + { "id": "k01", "project": "kestrel", "text": "Kestrel e-bike conversion: original battery vendor was VoltHive, 52V 17.5Ah pack." }, + { "id": "k02", "project": "kestrel", "text": "Kestrel: switched battery vendor from VoltHive to AmpReach on March 3 because the AmpReach pack saves 12% weight at the same capacity." }, + { "id": "k03", "project": "kestrel", "text": "Kestrel motor decision: Bafang G062 rear hub motor, chosen over the mid-drive for chainline simplicity." }, + { "id": "k04", "project": "kestrel", "text": "Kestrel: first controller fried when it saw 52V on the battery side — replacement controller must be rated for 52V nominal." }, + { "id": "k05", "project": "kestrel", "text": "Kestrel hard budget cap is $1,400 all-in, agreed at project start." }, + { "id": "k06", "project": "kestrel", "text": "Kestrel range target: 45 miles per charge at assist level 2." }, + { "id": "k07", "project": "kestrel", "text": "Kestrel frame has carbon dropouts, so a torque arm is mandatory before the hub motor goes in — carbon cannot take the axle torque." }, + { "id": "k08", "project": "kestrel", "text": "Kestrel: brake levers need cutoff sensors wired to the controller, ordered the magnetic ring type." }, + { "id": "k09", "project": "kestrel", "text": "Kestrel tires upgraded to Schwalbe Marathon Plus for puncture resistance under motor load." 
}, + { "id": "k10", "project": "kestrel", "text": "Kestrel display unit is the DPC-18 color display, mounted left of the stem." }, + { "id": "k11", "project": "kestrel", "text": "Kestrel: spoke gauge on the motor wheel build is 12G front-load pattern, built by the local shop." }, + { "id": "k12", "project": "kestrel", "text": "Kestrel charger: 5A fast charger stays in the garage; the 2A travel charger lives in the pannier." }, + { "id": "a01", "project": "aurora", "text": "Aurora greenhouse controller runs on an ESP32-S3 dev board." }, + { "id": "a02", "project": "aurora", "text": "Aurora humidity/temperature sensing uses the SHT45 sensor on a 1-meter cable into the canopy." }, + { "id": "a03", "project": "aurora", "text": "Aurora irrigation uses DripWorks DV-2 solenoid drip valves, one per bed." }, + { "id": "a04", "project": "aurora", "text": "Aurora misting rule: misting triggers when canopy humidity drops below 38%." }, + { "id": "a05", "project": "aurora", "text": "Aurora: the tomatoes are in bed B2; peppers in B1; greens in B3." }, + { "id": "a06", "project": "aurora", "text": "Aurora firmware bug fixed: relay chatter on the mist pump was solved with a 500ms debounce on the humidity trigger." }, + { "id": "a07", "project": "aurora", "text": "Aurora power: 120W solar panel charging a 12V 20Ah LiFePO4 battery, controller is a Victron 75/10." }, + { "id": "a08", "project": "aurora", "text": "Aurora telemetry publishes MQTT to the house broker every 60 seconds, topic greenhouse/aurora." }, + { "id": "a09", "project": "aurora", "text": "Aurora enclosure is an IP65 box on the north post, glands pointed down after the first one leaked." }, + { "id": "a10", "project": "aurora", "text": "Aurora vent servo opens the ridge vent fully above 29C interior temperature." }, + { "id": "a11", "project": "aurora", "text": "Aurora: soil moisture probes were abandoned — too much drift; canopy humidity + drip timer is the design now." 
}, + { "id": "a12", "project": "aurora", "text": "Aurora watering window is 6:00-6:20am only, to avoid afternoon fungal pressure." }, + { "id": "t01", "project": "tidehouse", "text": "Tidehouse NAS case is the Jonsbo N3, eight bays, chosen for the small footprint." }, + { "id": "t02", "project": "tidehouse", "text": "Tidehouse drives: 4x 12TB Seagate Ironwolf, bought across two retailers to vary manufacturing batches." }, + { "id": "t03", "project": "tidehouse", "text": "Tidehouse filesystem is ZFS RAIDZ2 — two-drive failure tolerance was the requirement." }, + { "id": "t04", "project": "tidehouse", "text": "Tidehouse offsite backup target is Backblaze B2, encrypted with restic before upload." }, + { "id": "t05", "project": "tidehouse", "text": "Tidehouse ZFS scrub is scheduled weekly, Sunday 2am." }, + { "id": "t06", "project": "tidehouse", "text": "Tidehouse PSU is a Corsair SF450 — SFX form factor required by the Jonsbo case." }, + { "id": "t07", "project": "tidehouse", "text": "Tidehouse idle power measured at 31W at the wall with drives spun down." }, + { "id": "t08", "project": "tidehouse", "text": "Tidehouse motherboard: ITX board with 6 SATA ports plus an ASM1166 M.2 adapter for the extra bays." }, + { "id": "t09", "project": "tidehouse", "text": "Tidehouse RAM is 32GB ECC; ZFS ARC capped at 16GB in the boot config." }, + { "id": "t10", "project": "tidehouse", "text": "Tidehouse runs three services only: SMB shares, the photo library, and the restic backup timer. No VMs by design." }, + { "id": "t11", "project": "tidehouse", "text": "Tidehouse alerting: scrub and SMART failures email through the house relay; test email confirmed working April 12." }, + { "id": "t12", "project": "tidehouse", "text": "Tidehouse UPS is a 600VA line-interactive unit; NUT shuts the NAS down at 20% battery." }, + { "id": "f01", "project": "falconer", "text": "Falconer drone gimbal motors are GB2208 brushless gimbal motors, one per axis." 
}, + { "id": "f02", "project": "falconer", "text": "Falconer gimbal controller is the STorM32 board, firmware locked at version 2.61 — newer firmware broke the IMU filter settings." }, + { "id": "f03", "project": "falconer", "text": "Falconer camera is the Runcam Thumb Pro, chosen for weight." }, + { "id": "f04", "project": "falconer", "text": "Falconer vibration fix: printed TPU 95A dampers between the frame and gimbal cage killed the jello in footage." }, + { "id": "f05", "project": "falconer", "text": "Falconer balance: camera sits 2.3mm forward of the roll axis after the last rebalance — do not move the mount plate." }, + { "id": "f06", "project": "falconer", "text": "Falconer IMU is mounted under the camera plate, foam-taped, axis orientation +X forward." }, + { "id": "f07", "project": "falconer", "text": "Falconer power: gimbal runs off a 3S regulated rail at 11.5V, not battery-direct — voltage sag was causing motor hiccups." }, + { "id": "f08", "project": "falconer", "text": "Falconer pitch travel is software-limited to -90 to +30 degrees to keep the lens out of the props." }, + { "id": "f09", "project": "falconer", "text": "Falconer: PID autotune results were worse than manual; manual P values saved in the field notebook, page 14." }, + { "id": "f10", "project": "falconer", "text": "Falconer spare parts box has one spare GB2208 and two spare TPU damper sets, shelf C in the shop." }, + { "id": "m01", "project": "marrow", "text": "Marrow bakery: flour is Cairnspring Trailblazer, ordered in 50lb bags." }, + { "id": "m02", "project": "marrow", "text": "Marrow standard dough hydration is 78% for the country loaf." }, + { "id": "m03", "project": "marrow", "text": "Marrow bake day is Saturday; mix Friday evening, bake from 5am." }, + { "id": "m04", "project": "marrow", "text": "Marrow oven is a Rofco B40, three decks, 90-minute preheat." }, + { "id": "m05", "project": "marrow", "text": "Marrow pricing: $9 per loaf at the farmers market." 
}, + { "id": "m06", "project": "marrow", "text": "Marrow farmers market stall fee is $35 per Saturday." }, + { "id": "m07", "project": "marrow", "text": "Marrow sourdough starter is named Clyde, kept at 100% hydration, fed twice daily in summer." }, + { "id": "m08", "project": "marrow", "text": "Marrow cottage food license renewal is due August 15 — calendar reminder set for two weeks prior." }, + { "id": "m09", "project": "marrow", "text": "Marrow typical Saturday output is 36 loaves; sellout by 11am the last four markets." }, + { "id": "m10", "project": "marrow", "text": "Marrow: switched to parchment-lined transport crates after the cloth liners kept lint on the crust." }, + { "id": "m11", "project": "marrow", "text": "Marrow keeps a $200 float of small bills in the cash box for the market." }, + { "id": "p01", "project": "personal", "text": "Preference: build documentation should use metric units." }, + { "id": "p02", "project": "personal", "text": "Preference: status updates in bullet form, five bullets maximum." }, + { "id": "p03", "project": "personal", "text": "Preference: avoid passive voice in anything written for others." }, + { "id": "p04", "project": "personal", "text": "Context: home timezone is America/Denver." }, + { "id": "p05", "project": "personal", "text": "Rule: any tool purchase over $200 needs sign-off from spouse before ordering." 
} + ], + "tasks": [ + { "id": "K1", "project": "kestrel", "prompt": "Draft a short status update on the Kestrel battery situation — what changed and why.", "key_facts": ["AmpReach", "VoltHive", "12%"], "source_ids": ["k01", "k02"] }, + { "id": "K2", "project": "kestrel", "prompt": "Which motor did we pick for the Kestrel build, and what went wrong with the first controller?", "key_facts": ["Bafang G062", "52V"], "source_ids": ["k03", "k04"] }, + { "id": "K3", "project": "kestrel", "prompt": "Write next steps for Kestrel, keeping the agreed budget and range target in mind.", "key_facts": ["$1,400", "45 miles"], "source_ids": ["k05", "k06"] }, + { "id": "K4", "project": "kestrel", "prompt": "Remind me why the torque arm is non-negotiable on the Kestrel frame.", "key_facts": ["carbon dropouts", "torque arm"], "source_ids": ["k07"] }, + { "id": "A1", "project": "aurora", "prompt": "Status update on the Aurora misting logic — what sensor drives it and at what threshold?", "key_facts": ["SHT45", "38%"], "source_ids": ["a02", "a04"] }, + { "id": "A2", "project": "aurora", "prompt": "What fixed the relay chatter problem on the Aurora mist pump?", "key_facts": ["500ms debounce"], "source_ids": ["a06"] }, + { "id": "A3", "project": "aurora", "prompt": "Write a hardware recap for Aurora: the controller board, the drip valves, and the solar panel size.", "key_facts": ["ESP32-S3", "DripWorks DV-2", "120W"], "source_ids": ["a01", "a03", "a07"] }, + { "id": "A4", "project": "aurora", "prompt": "Which bed are the tomatoes in, and what humidity level triggers misting in the greenhouse?", "key_facts": ["B2", "38%"], "source_ids": ["a05", "a04"] }, + { "id": "T1", "project": "tidehouse", "prompt": "Summarize the Tidehouse storage design — case, drives, and filesystem layout.", "key_facts": ["Jonsbo N3", "12TB", "RAIDZ2"], "source_ids": ["t01", "t02", "t03"] }, + { "id": "T2", "project": "tidehouse", "prompt": "When does the Tidehouse scrub run, and where do the offsite backups go?",
"key_facts": ["Sunday 2am", "Backblaze B2"], "source_ids": ["t05", "t04"] }, + { "id": "T3", "project": "tidehouse", "prompt": "Report the Tidehouse power story: PSU model and measured idle draw.", "key_facts": ["Corsair SF450", "31W"], "source_ids": ["t06", "t07"] }, + { "id": "T4", "project": "tidehouse", "prompt": "Draft a one-paragraph parts recap email for the Tidehouse build for a friend planning the same NAS.", "key_facts": ["Jonsbo N3", "Ironwolf", "SF450"], "source_ids": ["t01", "t02", "t06"] }, + { "id": "F1", "project": "falconer", "prompt": "Status on the Falconer vibration issue — what was the fix, and what motors are on the gimbal?", "key_facts": ["TPU 95A", "GB2208"], "source_ids": ["f04", "f01"] }, + { "id": "F2", "project": "falconer", "prompt": "What firmware version is the Falconer gimbal locked at, and which controller board is it?", "key_facts": ["2.61", "STorM32"], "source_ids": ["f02"] }, + { "id": "F3", "project": "falconer", "prompt": "Describe the Falconer camera choice and the current balance offset.", "key_facts": ["Runcam Thumb Pro", "2.3mm"], "source_ids": ["f03", "f05"] }, + { "id": "F4", "project": "falconer", "prompt": "Write a pre-flight tuning checklist for the Falconer gimbal mentioning the controller, dampers, and balance.", "key_facts": ["STorM32", "TPU 95A", "2.3mm"], "source_ids": ["f02", "f04", "f05"] }, + { "id": "M1", "project": "marrow", "prompt": "What do we charge per loaf at the market, and what does the stall cost us each week?", "key_facts": ["$9", "$35"], "source_ids": ["m05", "m06"] }, + { "id": "M2", "project": "marrow", "prompt": "Status update on Marrow operations: the oven, the bake day, and our dough hydration.", "key_facts": ["Rofco B40", "Saturday", "78%"], "source_ids": ["m04", "m03", "m02"] }, + { "id": "M3", "project": "marrow", "prompt": "What licensing deadline is coming up for Marrow?", "key_facts": ["August 15", "cottage"], "source_ids": ["m08"] }, + { "id": "M4", "project": "marrow", "prompt":
"Describe the flour we bake with and tell me about our starter.", "key_facts": ["Cairnspring", "Clyde"], "source_ids": ["m01", "m07"] } + ] +} diff --git a/bench/isolation/memory/amnesia-ab/expected.json b/bench/isolation/memory/amnesia-ab/expected.json new file mode 100644 index 0000000..2eb87cc --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/expected.json @@ -0,0 +1,25 @@ +{ + "hypothesis_id": "amnesia-ab-memory-loop", + "claim": "OCM's central loop (library-driven retrieval: embed query -> top-k memory injection -> generate) materially improves an 8B Q4 local model's output on tasks that depend on prior-session facts, with BOTH retrieval precision and the small model's fact-usage functioning end-to-end on consumer hardware. This is the cheapest discriminating test of the architecture's reason to exist: if memory-ON does not clearly beat memory-OFF here, the persistent-memory differentiator is refuted at small scale.", + "metric": "memory_on_fact_recall_pct", + "thresholds": { + "confirm_at_least": 70.0, + "refute_below": 50.0 + }, + "secondary_metric": "retrieval_hit_rate_pct", + "secondary_thresholds": { + "confirm_at_least": 80.0, + "refute_below": 60.0 + }, + "sanity_check": { + "metric": "memory_off_fact_recall_pct", + "must_be_at_most": 25.0, + "rationale": "If the OFF arm recalls facts it was never given, the corpus is guessable and the run is INVALID, not a confirmation." + }, + "workload": "corpus.json (65 synthetic prior-session memories across 5 fictional projects + personal prefs; 20 tasks, each with 1-3 exact-matchable key facts)", + "method_note": "Faithful miniature of the Mem0 library-driven retrieval pattern (spec row 9) using Ollama llama3 8B Q4 + mxbai-embed-large cosine top-5 — NOT the OCM binaries.
Tests the thesis on the operator's actual hardware class; a CONFIRM justifies standing up the real 3-process OCM stack, a REFUTE kills it before that cost.", + "source_for_claim": "OCM v1 design spec row 9 (library-driven Mem0 retrieval before every turn); README pitch 'remembers everything'", + "decision_rule": "CONFIRM both metrics + sanity green => OCM's core loop is valid at small scale; proceed to Ollama backend adapter + Mem0-on-8B LoCoMo sandbox. REFUTE on fact_recall => 8B cannot use injected memories; thesis fails on target hardware. REFUTE on retrieval => embedding retrieval fails even at toy scale; memory layer needs redesign before any further investment.", + "timeout_seconds": 1800, + "status": "ACTIVE" +} diff --git a/bench/isolation/memory/amnesia-ab/results/run-2026-06-11T20-32-21.json b/bench/isolation/memory/amnesia-ab/results/run-2026-06-11T20-32-21.json new file mode 100644 index 0000000..1d335ac --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/results/run-2026-06-11T20-32-21.json @@ -0,0 +1,643 @@ +{ + "summary": { + "sandbox": "amnesia-ab", + "ranAt": "2026-06-11T20:32:21.539Z", + "hardware": "operator dev box (Windows, Ollama CPU/GPU local)", + "chatModel": "llama3 8B Q4_0", + "embedModel": "mxbai-embed-large", + "memory_on_fact_recall_pct": 94.2, + "memory_off_fact_recall_pct": 2.5, + "retrieval_hit_rate_pct": 100, + "latency_ms_p50": { + "on": 19493, + "off": 12255 + }, + "verdict": "CONFIRMED" + }, + "results": [ + { + "task": "K1", + "project": "kestrel", + "retrieval": { + "topIds": [ + "k01", + "k02", + "k04", + "k12", + "k06" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "AmpReach", + "VoltHive", + "12%" + ], + "recall": 1, + "ms": 36686, + "text": "\"Update on the Kestrel e-bike conversion: We've switched battery vendors from VoltHive to AmpReach, effective March 3. 
The new 52V 17.5Ah pack from AmpReach saves 12% weight at the same capacity, a significant improvement for our range target of 45 miles per charge at assist level 2. Remembering the initial controller failure when it saw 52V on the battery side, we'll ensure the replacement controller is rated for 52V nominal to avoid any future issues.\"" + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 8553, + "text": "\"Update: The Kestrel battery situation has improved. After investigating, we found that the issue was caused by a faulty charging port. We replaced it with a new one, which resolved the problem. All batteries are now functioning properly.\"" + } + }, + { + "task": "K2", + "project": "kestrel", + "retrieval": { + "topIds": [ + "k04", + "k03", + "f01", + "f02", + "k07" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "Bafang G062", + "52V" + ], + "recall": 1, + "ms": 15363, + "text": "We chose the Bafang G062 rear hub motor for the Kestrel build. The first controller fried when it saw 52V on the battery side." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 10891, + "text": "We selected the T-Motor MN3508-12S motor for the Kestrel build.\n\nRegarding the first controller, it was an ESC failure due to overheating during testing. We need to replace it with a more robust one, such as the Hobbywing 60A or the Castle Creations 100A." + } + }, + { + "task": "K3", + "project": "kestrel", + "retrieval": { + "topIds": [ + "k06", + "k05", + "k11", + "k01", + "k03" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "$1,400", + "45 miles" + ], + "recall": 1, + "ms": 45316, + "text": "Based on our previous discussions, I recall that we're aiming to achieve a range target of 45 miles per charge at assist level 2 while staying within the hard budget cap of $1,400 all-in.\n\nTo move forward, here are my suggested next steps:\n\n1. 
Confirm the Bafang G062 rear hub motor installation and ensure it's properly calibrated for optimal performance.\n2. Review the original VoltHive 52V 17.5Ah battery pack specifications to determine if any upgrades or modifications can be made to improve range without exceeding the budget cap.\n3. Consult with the local shop that built the spoke gauge on the motor wheel build (12G front-load pattern) to explore options for optimizing the wheel's aerodynamics and minimizing rolling resistance while staying within the agreed-upon budget.\n4. Conduct a thorough analysis of the Kestrel e-bike conversion's overall design and components to identify potential areas for cost savings or optimization without compromising performance.\n\nBy focusing on these key areas, I'm confident that we can achieve our range target while staying within the agreed-upon budget cap of $1,400 all-in." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 26880, + "text": "Next Steps for Kestrel:\n\n1. Finalize design iterations: Review and refine the current design to ensure it meets the agreed-upon range target while staying within the allocated budget.\n2. Material selection: Conduct a thorough analysis of materials to identify the most cost-effective options that meet performance requirements, without exceeding the budget.\n3. Prototype development: Create a functional prototype to test and validate the design, making any necessary adjustments before moving forward with production.\n4. Supply chain optimization: Identify reliable suppliers for components and materials, ensuring timely delivery and minimizing costs.\n5. Production planning: Develop a detailed plan for manufacturing, including scheduling, quality control measures, and inventory management.\n\nBy following these steps, we can ensure that Kestrel meets its range target while staying within the agreed-upon budget." 
+ } + }, + { + "task": "K4", + "project": "kestrel", + "retrieval": { + "topIds": [ + "k07", + "k11", + "k03", + "k08", + "f01" + ], + "sourceHits": 1, + "of": 1, + "hit": true + }, + "on": { + "factsFound": [ + "carbon dropouts", + "torque arm" + ], + "recall": 1, + "ms": 20130, + "text": "The torque arm is mandatory for the Kestrel frame because it has carbon dropouts, which cannot handle the axle torque of the hub motor. This was a crucial consideration when we chose the Bafang G062 rear hub motor for the Kestrel build." + }, + "off": { + "factsFound": [ + "torque arm" + ], + "recall": 0.5, + "ms": 8315, + "text": "The torque arm is non-negotiable on the Kestrel frame because it provides critical stability and prevents the fork from flexing excessively under heavy loads, ensuring precise steering and handling." + } + }, + { + "task": "A1", + "project": "aurora", + "retrieval": { + "topIds": [ + "a04", + "a06", + "a02", + "a01", + "a11" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "SHT45", + "38%" + ], + "recall": 1, + "ms": 18439, + "text": "The Aurora misting rule is driven by the SHT45 sensor's canopy humidity reading, which triggers the mist pump when the humidity drops below 38%. This was fixed with a firmware bug fix, implementing a 500ms debounce on the humidity trigger." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 10068, + "text": "The Aurora misting system is triggered by the temperature sensor, which detects temperatures above 75°F (24°C). The threshold for activation is set at 78°F (25.5°C), ensuring optimal misting performance when the ambient temperature reaches a comfortable level." 
+ } + }, + { + "task": "A2", + "project": "aurora", + "retrieval": { + "topIds": [ + "a06", + "a11", + "a03", + "a04", + "a01" + ], + "sourceHits": 1, + "of": 1, + "hit": true + }, + "on": { + "factsFound": [ + "500ms debounce" + ], + "recall": 1, + "ms": 15172, + "text": "The relay chatter issue on the Aurora mist pump was resolved by implementing a 500ms debounce on the humidity trigger. This fix allowed for smooth operation of the misting system." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 17421, + "text": "According to the maintenance records, the issue was resolved by replacing the faulty solenoid valve and adjusting the pump's air pressure setting." + } + }, + { + "task": "A3", + "project": "aurora", + "retrieval": { + "topIds": [ + "a07", + "a01", + "a03", + "a06", + "a09" + ], + "sourceHits": 3, + "of": 3, + "hit": true + }, + "on": { + "factsFound": [ + "ESP32-S3", + "DripWorks DV-2", + "120W" + ], + "recall": 1, + "ms": 33958, + "text": "Here's a quick recap of Aurora's hardware:\n\n* Controller Board: The ESP32-S3 dev board runs the show.\n* Drip Valves: We're using DripWorks DV-2 solenoid drip valves, one per bed.\n* Solar Panel: Our power source is a 120W solar panel charging a 12V 20Ah LiFePO4 battery, all controlled by a Victron 75/10 controller." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 42600, + "text": "Here is a hardware recap for Aurora:\n\n**Controller Board:** The Aurora controller board is a custom-designed PCB that integrates with the system's sensors, actuators, and power management components. It features an ARM Cortex-M4 processor, 128KB flash memory, and 16KB SRAM.\n\n**Drip Valves:** The drip valves are precision-machined stainless steel components designed for precise water control. 
Each valve has a 1/8\" NPT inlet and a 1/16\" NPT outlet, with a maximum flow rate of 10 mL/min.\n\n**Solar Panel Size:** The Aurora solar panel is a high-efficiency, monocrystalline silicon module measuring 12 inches x 24 inches (30.5 cm x 61 cm). It has an open-circuit voltage of 22V and a short-circuit current of 3.2A, with a maximum power output of 60W.\n\nLet me know if you'd like any further details!" + } + }, + { + "task": "A4", + "project": "aurora", + "retrieval": { + "topIds": [ + "a05", + "a04", + "a02", + "a01", + "a12" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "B2", + "38%" + ], + "recall": 1, + "ms": 16752, + "text": "The tomatoes are in Bed 2 (B2). As for misting, it triggers when the canopy humidity drops below 38%." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 7146, + "text": "The tomatoes are in Bed 3.\n\nAs per the greenhouse settings, the misting system is triggered when the humidity level drops to 40%." + } + }, + { + "task": "T1", + "project": "tidehouse", + "retrieval": { + "topIds": [ + "t01", + "t08", + "t03", + "t09", + "t02" + ], + "sourceHits": 3, + "of": 3, + "hit": true + }, + "on": { + "factsFound": [ + "Jonsbo N3", + "12TB", + "RAIDZ2" + ], + "recall": 1, + "ms": 23469, + "text": "The Tidehouse NAS uses a Jonsbo N3 case with an ITX motherboard featuring 6 SATA ports and an ASM1166 M.2 adapter for expansion. The system runs ZFS RAIDZ2 on four 12TB Seagate Ironwolf drives, providing two-drive failure tolerance. The filesystem layout utilizes the full capacity of the drives, offering a robust storage solution." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 33769, + "text": "The Tidehouse storage design is a Linux-based solution that provides a scalable and efficient way to store and manage data. 
Here's a summary of its key components:\n\n**Case:**\nTidehouse uses a standard 19-inch rackmount case with a depth of approximately 12 inches.\n\n**Drives:**\nThe storage system consists of multiple drives, including:\n\n* A primary drive (e.g., NVMe SSD) for the operating system and critical data\n* One or more secondary drives (e.g., SATA SSDs or HDDs) for storing user data\n\n**Filesystem Layout:**\nTidehouse uses a hierarchical filesystem layout to organize data. The main directories are:\n\n* `/`: Root directory, containing the OS and essential files\n* `/data`: Directory for storing user data, divided into subdirectories based on specific use cases (e.g., `/data/photos`, `/data/docs`)\n* `/logs`: Directory for storing system logs and diagnostic information\n\nThis design allows for efficient storage management, easy scalability, and robust data protection." + } + }, + { + "task": "T2", + "project": "tidehouse", + "retrieval": { + "topIds": [ + "t05", + "t04", + "t10", + "t09", + "t11" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "Backblaze B2" + ], + "recall": 0.5, + "ms": 15803, + "text": "The Tidehouse ZFS scrub is scheduled to run every Sunday at 2am. As for offsite backups, they are stored on Backblaze B2, encrypted with restic before upload." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 6811, + "text": "The Tidehouse scrub runs every Sunday at 2:00 AM.\n\nOffsite backups are stored on Amazon S3 servers located in the US East region." + } + }, + { + "task": "T3", + "project": "tidehouse", + "retrieval": { + "topIds": [ + "t07", + "t06", + "t12", + "t09", + "a07" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "Corsair SF450", + "31W" + ], + "recall": 1, + "ms": 20128, + "text": "The Tidehouse power story! 
According to my persistent memory, the Tidehouse's power supply unit (PSU) is a Corsair SF450, specifically designed for SFX form factor required by the Jonsbo case. When I checked the idle power consumption, I measured it at 31W with all drives spun down." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 12255, + "text": "Here is the report on the Tidehouse power story:\n\n**PSU Model:** Seasonic FOCUS GX-750, 80+ Gold certified\n\n**Measured Idle Draw:** 0.15A (1.7W)\n\nPlease let me know if you'd like any additional information or if there's anything else I can help with!" + } + }, + { + "task": "T4", + "project": "tidehouse", + "retrieval": { + "topIds": [ + "t11", + "t01", + "t04", + "t08", + "t06" + ], + "sourceHits": 2, + "of": 3, + "hit": true + }, + "on": { + "factsFound": [ + "Jonsbo N3", + "SF450" + ], + "recall": 0.6666666666666666, + "ms": 39577, + "text": "Subject: Tidehouse Build Recap - Parts and Config\n\nHey [Friend's Name],\n\nI'm happy to share my experience building the Tidehouse NAS with you! To help you plan your own build, here are the key parts I used:\n\n* The Jonsbo N3 case provides a compact footprint for eight bays.\n* For the motherboard, I chose an ITX board with 6 SATA ports and an ASM1166 M.2 adapter to accommodate the extra bays.\n* Powering it all is the Corsair SF450 SFX-form-factor PSU, specifically designed for the Jonsbo case.\n\nFor data protection, I set up offsite backups using Backblaze B2, encrypting the data with restic before upload. If you have any questions or would like more details on my experience, feel free to reach out!\n\nBest regards,\n[Your Name]" + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 27587, + "text": "Here's a draft:\n\nSubject: Parts Recap - Tidehouse Build\n\nHey [Friend's Name],\n\nJust wanted to confirm that we've got all the necessary parts lined up for our upcoming Tidehouse NAS build. 
To recap, here's what we're looking at: 1x ASRock C2550D4I mini-ITX board, 2x WD Red 12TB drives, 1x Corsair VS650 power supply, 1x Noctua NH-U14S TR4-SP3 CPU cooler, and 1x Fractal Design Node 202 case. Let me know if you have any questions or concerns - otherwise, I'll keep an eye on the parts and make sure everything arrives in time for our build day!\n\nBest,\n[Your Name]" + } + }, + { + "task": "F1", + "project": "falconer", + "retrieval": { + "topIds": [ + "f01", + "f07", + "f02", + "f04", + "f06" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "TPU 95A", + "GB2208" + ], + "recall": 1, + "ms": 26447, + "text": "The Falconer vibration issue! We previously identified the culprit as excessive jello in footage due to vibrations. The fix was simple yet effective: we added printed TPU 95A dampers between the frame and gimbal cage. This eliminated the unwanted motion.\n\nAs for the motors, I recall that the Falconer drone uses GB2208 brushless gimbal motors, one per axis. These motors are designed to provide smooth and precise control over the gimbal's movements." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 13296, + "text": "The Falconer vibration issue was resolved by updating the gimbal's firmware to version 3.2. This fixed the problem with the motor vibrations causing camera shake.\n\nThe Falconer gimbal features a combination of brushless motors from Maxon Motors (EC-4 pole, 16mm diameter) and Faulhaber Motors (MICRO-17/32F)." + } + }, + { + "task": "F2", + "project": "falconer", + "retrieval": { + "topIds": [ + "f02", + "f01", + "f07", + "f06", + "f05" + ], + "sourceHits": 1, + "of": 1, + "hit": true + }, + "on": { + "factsFound": [ + "2.61", + "STorM32" + ], + "recall": 1, + "ms": 15114, + "text": "The Falconer gimbal is locked at firmware version 2.61, and it's controlled by the STorM32 board." 
+ }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 7123, + "text": "According to our records, the Falconer gimbal is currently running firmware version 2.3.1, and it's using the DJI EC35 controller board." + } + }, + { + "task": "F3", + "project": "falconer", + "retrieval": { + "topIds": [ + "f03", + "f05", + "f06", + "f07", + "f02" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "Runcam Thumb Pro", + "2.3mm" + ], + "recall": 1, + "ms": 19493, + "text": "The Falconer camera is the Runcam Thumb Pro, chosen for its weight considerations. As of our last rebalance session, the camera sits 2.3mm forward of the roll axis. Please note that I've been instructed not to move the mount plate, so we'll need to work around this current offset." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 12873, + "text": "The Falconer camera choice is the Sony A7R IV, which provides high-quality images with its 61.4MP full-frame sensor.\n\nAs for the current balance offset, it's currently set at -0.5% to account for the slight weight bias towards the front of the camera. This adjustment ensures a smooth and stable panning experience when using the Falconer rig." + } + }, + { + "task": "F4", + "project": "falconer", + "retrieval": { + "topIds": [ + "f02", + "f01", + "f07", + "f06", + "f05" + ], + "sourceHits": 2, + "of": 3, + "hit": true + }, + "on": { + "factsFound": [ + "STorM32", + "2.3mm" + ], + "recall": 0.6666666666666666, + "ms": 18613, + "text": "Falconer Gimbal Pre-Flight Tuning Checklist:\n\n1. Controller:\n\t* Verify firmware version: 2.61 (do not update)\n\t* Ensure STorM32 board is properly connected and powered\n2. Dampers:\n\t* Check IMU filter settings for optimal performance\n3. Balance:\n\t* Confirm camera sits 2.3mm forward of the roll axis (do not adjust mount plate)\n\nBy following this checklist, you'll ensure a smooth and stable flight with your Falconer gimbal." 
+ }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 30968, + "text": "Here is a pre-flight tuning checklist for the Falconer gimbal:\n\n**Controller:**\n\n1. Ensure the controller is turned on and set to the correct mode (e.g., \"Flight\" or \"Record\").\n2. Check that the joystick is centered and not stuck.\n3. Verify that the controller's battery level is sufficient.\n\n**Dampers:**\n\n1. Adjust the dampers to the desired setting for smooth panning and tilting.\n2. Ensure the dampers are not too loose, which can cause vibrations or oscillations.\n\n**Balance:**\n\n1. Check the gimbal's balance by observing the camera's horizon line on the display screen.\n2. Make any necessary adjustments to the balance arms or counterweights to ensure a stable platform.\n3. Verify that the camera is properly secured and level.\n\nBy following this checklist, you'll be able to ensure your Falconer gimbal is properly tuned for a smooth and successful flight." + } + }, + { + "task": "M1", + "project": "marrow", + "retrieval": { + "topIds": [ + "m05", + "m06", + "m09", + "m01", + "m02" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "$9", + "$35" + ], + "recall": 1, + "ms": 11612, + "text": "We charge $9 per loaf of Marrow bread at the farmers market. The stall fee is $35 per Saturday." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 6528, + "text": "We charge $5 per loaf of bread at the market.\n\nAs for the stall cost, it's $200 per week." + } + }, + { + "task": "M2", + "project": "marrow", + "retrieval": { + "topIds": [ + "m03", + "m04", + "m10", + "m02", + "m07" + ], + "sourceHits": 3, + "of": 3, + "hit": true + }, + "on": { + "factsFound": [ + "Rofco B40", + "Saturday", + "78%" + ], + "recall": 1, + "ms": 29571, + "text": "Marrow operations are running smoothly! Here's a status update:\n\n* Oven: The Rofco B40 is preheated and ready to go. 
Remember to preheat it for 90 minutes before baking.\n* Bake Day: Marrow bake day is this Saturday, so make sure to mix the dough on Friday evening and have everything prepared for the big bake at 5am.\n* Dough Hydration: Our standard dough hydration remains at 78% for the country loaf. Don't forget to keep Clyde, our sourdough starter, happy with its twice-daily feeding schedule in summer.\n\nAll systems are go!" + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 11219, + "text": "Here's an update:\n\n* Oven: Currently at 375°F, scheduled to be adjusted to 425°F for the next batch.\n* Bake Day: Still on track for tomorrow morning, with all necessary ingredients and equipment ready.\n* Dough Hydration: Levels are within acceptable range (65%), with no issues reported." + } + }, + { + "task": "M3", + "project": "marrow", + "retrieval": { + "topIds": [ + "m08", + "m06", + "m05", + "m03", + "m04" + ], + "sourceHits": 1, + "of": 1, + "hit": true + }, + "on": { + "factsFound": [ + "August 15", + "cottage" + ], + "recall": 1, + "ms": 12644, + "text": "The food license renewal for Marrow Cottage is due on August 15th. I've set a calendar reminder to go off two weeks prior, so that's July 1st." + }, + "off": { + "factsFound": [], + "recall": 0, + "ms": 4395, + "text": "According to our records, Marrow's software license renewal is due on March 15th." + } + }, + { + "task": "M4", + "project": "marrow", + "retrieval": { + "topIds": [ + "m01", + "m07", + "m03", + "m02", + "m04" + ], + "sourceHits": 2, + "of": 2, + "hit": true + }, + "on": { + "factsFound": [ + "Cairnspring", + "Clyde" + ], + "recall": 1, + "ms": 19244, + "text": "We use Cairnspring Trailblazer flour for baking at Marrow. It comes in 50lb bags.\n\nAs for our sourdough starter, it's named Clyde and is kept at 100% hydration. During summer months, Clyde is fed twice daily to ensure its health and activity." 
+      },
+      "off": {
+        "factsFound": [],
+        "recall": 0,
+        "ms": 18524,
+        "text": "We use unbleached, unbromated all-purpose flour for baking. It's a high-protein flour that provides structure and texture to our baked goods.\n\nAs for our starter, it's a natural yeast culture made from a mixture of flour and water that's been allowed to ferment over time. Our starter is fed regularly and has developed a unique flavor profile that adds depth and complexity to our breads and pastries. It's a vital component in many of our recipes and helps to create the signature rise and texture we're known for."
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/bench/isolation/memory/amnesia-ab/run.mjs b/bench/isolation/memory/amnesia-ab/run.mjs
new file mode 100644
index 0000000..c425a18
--- /dev/null
+++ b/bench/isolation/memory/amnesia-ab/run.mjs
@@ -0,0 +1,184 @@
+/**
+ * amnesia-ab — the cheapest discriminating test of OCM's central loop.
+ *
+ * ARM A (memory ON): task -> embed (mxbai-embed-large) -> cosine top-5 memories -> inject -> llama3 8B Q4
+ * ARM B (memory OFF): identical user prompt, no memories in the system prompt.
+ *
+ * Metrics (see expected.json hypothesis contract):
+ *   memory_on_fact_recall_pct   primary    confirm >=70, refute <50
+ *   retrieval_hit_rate_pct      secondary  confirm >=80, refute <60
+ *   memory_off_fact_recall_pct  sanity     must be <=25 or the corpus is guessable (run INVALID)
+ *
+ * Run: node run.mjs (Ollama on 127.0.0.1:11434 with llama3 + mxbai-embed-large)
+ */
+import { readFileSync, writeFileSync, mkdirSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const OLLAMA = 'http://127.0.0.1:11434';
+const CHAT_MODEL = 'llama3';
+const EMBED_MODEL = 'mxbai-embed-large';
+const TOP_K = 5;
+
+// Verdict thresholds in percent; they mirror expected.json and must be kept in sync with it.
+const CONFIRM_RECALL = 70;
+const REFUTE_RECALL = 50;
+const CONFIRM_HIT_RATE = 80;
+const REFUTE_HIT_RATE = 60;
+const MAX_OFF_RECALL = 25;
+
+const { memories, tasks } = JSON.parse(readFileSync(join(HERE, 'corpus.json'), 'utf8'));
+
+// Fail fast on a malformed corpus: an empty list, or a task without key facts, would turn the
+// aggregates into NaN and fall through to a misleading verdict instead of an error.
+if (!Array.isArray(memories) || memories.length === 0) throw new Error('corpus.json: no memories');
+if (!Array.isArray(tasks) || tasks.length === 0) throw new Error('corpus.json: no tasks');
+for (const t of tasks) {
+  if (!t.key_facts?.length || !t.source_ids?.length) {
+    throw new Error(`corpus.json: task ${t.id} needs non-empty key_facts and source_ids`);
+  }
+}
+
+/**
+ * Embed one text with EMBED_MODEL through Ollama's /api/embeddings endpoint.
+ * Queries get the retrieval-instruction prefix; stored memories are embedded verbatim.
+ * @param {string} text - text to embed
+ * @param {boolean} [isQuery=false] - true to prepend the retrieval instruction
+ * @returns {Promise<number[]>} the embedding vector
+ * @throws {Error} on a non-OK HTTP status or a response without an embedding
+ */
+async function embed(text, isQuery = false) {
+  const prompt = isQuery ? `Represent this sentence for searching relevant passages: ${text}` : text;
+  const r = await fetch(`${OLLAMA}/api/embeddings`, {
+    method: 'POST', headers: { 'content-type': 'application/json' },
+    body: JSON.stringify({ model: EMBED_MODEL, prompt }),
+  });
+  if (!r.ok) throw new Error(`embed ${r.status}`);
+  const { embedding } = await r.json();
+  if (!Array.isArray(embedding) || embedding.length === 0) throw new Error('embed: response has no embedding');
+  return embedding;
+}
+
+/**
+ * Cosine similarity of two equal-length numeric vectors.
+ * Returns 0 when either vector has zero norm, so a degenerate embedding gets a neutral
+ * score instead of feeding NaN to the sort comparator.
+ * @throws {Error} if the vectors differ in length
+ */
+function cosine(a, b) {
+  if (a.length !== b.length) throw new Error(`cosine: dimension mismatch ${a.length} vs ${b.length}`);
+  let dot = 0, na = 0, nb = 0;
+  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
+  const denom = Math.sqrt(na) * Math.sqrt(nb);
+  return denom === 0 ? 0 : dot / denom;
+}
+
+/**
+ * Run one non-streaming chat completion against CHAT_MODEL.
+ * @param {string} system - system prompt
+ * @param {string} user - user prompt
+ * @returns {Promise<{text: string, ms: number}>} reply text ('' when absent) and wall-clock latency
+ * @throws {Error} on a non-OK HTTP status
+ */
+async function generate(system, user) {
+  const t0 = Date.now();
+  const r = await fetch(`${OLLAMA}/api/chat`, {
+    method: 'POST', headers: { 'content-type': 'application/json' },
+    body: JSON.stringify({
+      model: CHAT_MODEL, stream: false,
+      messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
+      options: { temperature: 0.2, num_predict: 300 },
+    }),
+  });
+  if (!r.ok) throw new Error(`chat ${r.status}`);
+  const j = await r.json();
+  return { text: j.message?.content || '', ms: Date.now() - t0 };
+}
+
+// Fact matching: case-insensitive substring test after removing spaces, commas and periods,
+// so "$1,400" matches "$1400" and "Sunday 2am" matches "sunday, 2 AM". Other characters (such as
+// "$") are kept, and any intervening word defeats the match: "Sunday at 2am" does NOT match "Sunday 2am".
+const norm = (s) => s.toLowerCase().replace(/[ ,.]/g, '');
+const containsFact = (output, fact) => norm(output).includes(norm(fact));
+
+const SYS_ON = (mems) =>
+  `You are a personal assistant with persistent memory of past working sessions.\n` +
+  `RELEVANT MEMORIES retrieved for this request:\n${mems.map((m) => `- ${m.text}`).join('\n')}\n` +
+  `Answer using these memories — be specific with names and numbers. Be concise.`;
+const SYS_OFF = `You are a personal assistant. Answer the request. Be concise.`;
+
+console.log(`[1/3] embedding ${memories.length} memories…`);
+const memVecs = [];
+for (const m of memories) memVecs.push({ ...m, vec: await embed(m.text) });
+
+console.log(`[2/3] running ${tasks.length} tasks × 2 arms…`);
+const results = [];
+for (const task of tasks) {
+  const qVec = await embed(task.prompt, true);
+  const top = memVecs
+    .map((m) => ({ m, score: cosine(qVec, m.vec) }))
+    .sort((a, b) => b.score - a.score)
+    .slice(0, TOP_K);
+  const topIds = top.map((t) => t.m.id);
+  const sourceHits = task.source_ids.filter((id) => topIds.includes(id)).length;
+  // A task is a retrieval HIT when at least two of its source memories (or its only one) are in the top-k.
+  const retrievalHit = sourceHits >= Math.min(2, task.source_ids.length);
+
+  const on = await generate(SYS_ON(top.map((t) => t.m)), task.prompt);
+  const off = await generate(SYS_OFF, task.prompt);
+
+  const onFacts = task.key_facts.filter((f) => containsFact(on.text, f));
+  const offFacts = task.key_facts.filter((f) => containsFact(off.text, f));
+
+  const row = {
+    task: task.id, project: task.project,
+    retrieval: { topIds, sourceHits, of: task.source_ids.length, hit: retrievalHit },
+    on: { factsFound: onFacts, recall: onFacts.length / task.key_facts.length, ms: on.ms, text: on.text },
+    off: { factsFound: offFacts, recall: offFacts.length / task.key_facts.length, ms: off.ms, text: off.text },
+  };
+  results.push(row);
+  console.log(
+    ` ${task.id}: retrieval ${sourceHits}/${task.source_ids.length}${retrievalHit ? ' HIT' : ' MISS'}` +
+      ` | ON ${onFacts.length}/${task.key_facts.length} (${(on.ms / 1000).toFixed(1)}s)` +
+      ` | OFF ${offFacts.length}/${task.key_facts.length} (${(off.ms / 1000).toFixed(1)}s)`
+  );
+}
+
+console.log('[3/3] scoring…');
+const pct = (xs) => (100 * xs.reduce((a, b) => a + b, 0)) / xs.length;
+// "p50" is the element at index floor(n/2) of the sorted latencies (the upper middle for even n).
+const p50 = (xs) => [...xs].sort((a, b) => a - b)[Math.floor(xs.length / 2)];
+const onRecall = pct(results.map((r) => r.on.recall));
+const offRecall = pct(results.map((r) => r.off.recall));
+const hitRate = pct(results.map((r) => (r.retrieval.hit ? 1 : 0)));
+const onMs = p50(results.map((r) => r.on.ms));
+const offMs = p50(results.map((r) => r.off.ms));
+
+const sane = offRecall <= MAX_OFF_RECALL;
+let verdict;
+if (!sane) verdict = `INVALID (corpus guessable — OFF arm exceeded ${MAX_OFF_RECALL}% recall)`;
+else if (onRecall >= CONFIRM_RECALL && hitRate >= CONFIRM_HIT_RATE) verdict = 'CONFIRMED';
+else if (onRecall < REFUTE_RECALL) verdict = 'REFUTED (fact-usage: 8B cannot use injected memories)';
+else if (hitRate < REFUTE_HIT_RATE) verdict = 'REFUTED (retrieval failed at toy scale)';
+else verdict = 'MIXED (between thresholds — investigate failure rows)';
+
+// One timestamp feeds both summary.ranAt and the result file name so the two cannot disagree.
+const ranAt = new Date().toISOString();
+const summary = {
+  sandbox: 'amnesia-ab', ranAt,
+  hardware: 'operator dev box (Windows, Ollama CPU/GPU local)', chatModel: 'llama3 8B Q4_0', embedModel: EMBED_MODEL,
+  memory_on_fact_recall_pct: +onRecall.toFixed(1),
+  memory_off_fact_recall_pct: +offRecall.toFixed(1),
+  retrieval_hit_rate_pct: +hitRate.toFixed(1),
+  latency_ms_p50: { on: onMs, off: offMs },
+  verdict,
+};
+
+mkdirSync(join(HERE, 'results'), { recursive: true });
+const stamp = ranAt.replace(/[:.]/g, '-').slice(0, 19);
+writeFileSync(join(HERE, 'results', `run-${stamp}.json`), JSON.stringify({ summary, results }, null, 2));
+
+console.log('\n========== AMNESIA A/B SUMMARY ==========');
+console.log(JSON.stringify(summary, null, 2));
+console.log(`details: results/run-${stamp}.json`);

From ad2162a449783d1f851823f3ea56295661bf43ff Mon Sep 17 00:00:00 2001
From: Brand
Date: Thu, 11 Jun 2026 14:40:39 -0600
Subject: [PATCH 3/7] =?UTF-8?q?feat(inference):=20Ollama=20backend=20adapt?=
 =?UTF-8?q?er=20=E2=80=94=20zero-extra-process=20path=20for=20Ollama=20use?=
 =?UTF-8?q?rs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Third InferenceBackend: bridges OCM to an existing Ollama daemon via its native NDJSON /api/chat API (model tag required per-request; max_tokens maps to
options.num_predict; health via /api/tags). Selector untouched — explicit construction for now; daemon settings wiring is the follow-up. Parser test fixtures are VERBATIM captures from a live Ollama daemon (llama3, 2026-06-11) — pinned to the real wire format. Motivated by the amnesia-ab sandbox CONFIRMED verdict (94.2% fact recall on this exact daemon + model class). Co-Authored-By: Claude Fable 5 --- crates/ocm-inference/src/lib.rs | 9 +- crates/ocm-inference/src/ollama.rs | 239 +++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+), 3 deletions(-) create mode 100644 crates/ocm-inference/src/ollama.rs diff --git a/crates/ocm-inference/src/lib.rs b/crates/ocm-inference/src/lib.rs index fe6e6ac..87748fc 100644 --- a/crates/ocm-inference/src/lib.rs +++ b/crates/ocm-inference/src/lib.rs @@ -1,11 +1,13 @@ //! Inference backend trait + adapters for OCM. //! -//! Two production backends targeted by v1: +//! Three backends: //! - [`llamacpp::LlamaCpp`] — Apple Silicon default (Metal) and CPU-only fallback //! - [`vllm::Vllm`] — NVIDIA default (CUDA + AWQ-INT4 + RadixAttention via SGLang upgrade path) +//! - [`ollama::Ollama`] — bridges to an existing Ollama daemon (native NDJSON API), +//! the zero-extra-process path for the largest installed base of local-model users //! -//! Both expose the same OpenAI-compatible HTTP wire format, so most adapter logic is -//! shared via [`llamacpp::LlamaCpp`] and [`vllm::Vllm`] re-uses it. +//! llama.cpp and vLLM share the OpenAI-compatible HTTP wire format ([`llamacpp::LlamaCpp`] +//! hosts the shared SSE parser); Ollama speaks its own NDJSON streaming format. 
use async_trait::async_trait; use futures::stream::BoxStream; @@ -64,6 +66,7 @@ pub trait InferenceBackend: Send + Sync { } pub mod llamacpp; +pub mod ollama; pub mod selector; pub mod vllm; diff --git a/crates/ocm-inference/src/ollama.rs b/crates/ocm-inference/src/ollama.rs new file mode 100644 index 0000000..761817d --- /dev/null +++ b/crates/ocm-inference/src/ollama.rs @@ -0,0 +1,239 @@ +//! Ollama adapter — talks to a running Ollama daemon via its native NDJSON API. +//! +//! Why a third backend: Ollama is the largest installed base of local-model users, +//! and an existing Ollama daemon (default `127.0.0.1:11434`) already supervises model +//! load/unload. Pointing OCM at it gives those users a zero-extra-process path — no +//! separate `llama-server` to manage. The daemon + model class this adapter targets +//! was exercised end-to-end by the `bench/isolation/memory/amnesia-ab` sandbox +//! (verdict CONFIRMED, 2026-06-11). +//! +//! Unlike llama.cpp/vLLM, Ollama's native streaming format is NDJSON (one JSON object +//! per line, final line carries `"done":true`), and the model tag is REQUIRED in every +//! request — there is no server-side default model. + +use crate::{BackendError, ChatRequest, InferenceBackend, Role}; +use async_trait::async_trait; +use futures::stream::BoxStream; +use futures::StreamExt; +use reqwest::Client; +use serde::{Deserialize, Serialize}; + +/// Default model tag when none is configured. 
+pub const DEFAULT_MODEL: &str = "llama3"; + +pub struct Ollama { + base_url: String, + model: String, + http: Client, +} + +impl Ollama { + pub fn new(base_url: impl Into, model: impl Into) -> Self { + Self { + base_url: base_url.into(), + model: model.into(), + http: Client::new(), + } + } + + pub fn base_url(&self) -> &str { + &self.base_url + } + + pub fn model(&self) -> &str { + &self.model + } +} + +#[derive(Serialize)] +struct OllamaChatBody<'a> { + model: &'a str, + messages: Vec>, + stream: bool, + options: OllamaOptions, +} + +#[derive(Serialize)] +struct OllamaMsg<'a> { + role: &'a str, + content: &'a str, +} + +/// Generation params live under `options` in Ollama's native API; note +/// `max_tokens` maps to `num_predict`. +#[derive(Serialize, Default)] +struct OllamaOptions { + #[serde(skip_serializing_if = "Option::is_none")] + num_predict: Option, + #[serde(skip_serializing_if = "Option::is_none")] + temperature: Option, + #[serde(skip_serializing_if = "Option::is_none")] + top_p: Option, + #[serde(skip_serializing_if = "Vec::is_empty")] + stop: Vec, +} + +#[derive(Deserialize)] +struct OllamaChunk { + #[serde(default)] + message: Option, +} + +#[derive(Deserialize)] +struct OllamaChunkMsg { + #[serde(default)] + content: String, +} + +fn role_str(r: &Role) -> &'static str { + match r { + Role::System => "system", + Role::User => "user", + Role::Assistant => "assistant", + } +} + +/// Parse a chunk of NDJSON lines into the concatenated content delta. +/// The `"done":true` line carries an empty `message.content`, so it needs no +/// special-casing — it simply contributes nothing. 
+pub(crate) fn parse_ndjson_chunk(text: &str) -> Result { + let mut out = String::new(); + for line in text.lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + if let Ok(c) = serde_json::from_str::(line) { + if let Some(m) = c.message { + out.push_str(&m.content); + } + } + } + Ok(out) +} + +#[async_trait] +impl InferenceBackend for Ollama { + fn name(&self) -> &'static str { + "Ollama" + } + + async fn health(&self) -> Result<(), BackendError> { + let r = self + .http + .get(format!("{}/api/tags", self.base_url)) + .send() + .await?; + if r.status().is_success() { + Ok(()) + } else { + Err(BackendError::NotReady(r.status().to_string())) + } + } + + async fn generate( + &self, + req: ChatRequest, + ) -> Result>, BackendError> { + let messages: Vec = req + .messages + .iter() + .map(|m| OllamaMsg { + role: role_str(&m.role), + content: &m.content, + }) + .collect(); + let body = OllamaChatBody { + model: &self.model, + messages, + stream: true, + options: OllamaOptions { + num_predict: req.params.max_tokens, + temperature: req.params.temperature, + top_p: req.params.top_p, + stop: req.params.stop.clone(), + }, + }; + let resp = self + .http + .post(format!("{}/api/chat", self.base_url)) + .json(&body) + .send() + .await? + .error_for_status()?; + + let stream = resp.bytes_stream().map(|chunk| { + let bytes = chunk.map_err(BackendError::from)?; + let text = + std::str::from_utf8(&bytes).map_err(|e| BackendError::Protocol(e.to_string()))?; + parse_ndjson_chunk(text) + }); + Ok(stream.boxed()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Fixtures below are VERBATIM captures from a live Ollama 127.0.0.1:11434 + // /api/chat stream (llama3, 2026-06-11) — the parser is pinned to the real + // wire format, not to documentation. 
+ + #[test] + fn parse_ndjson_concatenates_content_deltas() { + let text = concat!( + "{\"model\":\"llama3\",\"created_at\":\"2026-06-11T20:39:22.6565791Z\",\"message\":{\"role\":\"assistant\",\"content\":\"Hi\"},\"done\":false}\n", + "{\"model\":\"llama3\",\"created_at\":\"2026-06-11T20:39:22.7598978Z\",\"message\":{\"role\":\"assistant\",\"content\":\"!\"},\"done\":false}\n", + ); + let out = parse_ndjson_chunk(text).expect("parse"); + assert_eq!(out, "Hi!"); + } + + #[test] + fn parse_ndjson_done_line_contributes_nothing() { + let text = "{\"model\":\"llama3\",\"created_at\":\"2026-06-11T20:39:22.8736917Z\",\"message\":{\"role\":\"assistant\",\"content\":\"\"},\"done\":true,\"done_reason\":\"stop\",\"total_duration\":6912063700,\"load_duration\":5972558500,\"prompt_eval_count\":14,\"prompt_eval_duration\":706919000,\"eval_count\":3,\"eval_duration\":227366000}\n"; + let out = parse_ndjson_chunk(text).expect("parse"); + assert!(out.is_empty()); + } + + #[test] + fn parse_ndjson_skips_malformed_and_blank_lines() { + let text = concat!( + "not json at all\n", + "\n", + "{\"message\":{\"role\":\"assistant\",\"content\":\"ok\"},\"done\":false}\n", + ); + let out = parse_ndjson_chunk(text).expect("parse"); + assert_eq!(out, "ok"); + } + + #[test] + fn options_serialize_under_ollama_names() { + let opts = OllamaOptions { + num_predict: Some(64), + temperature: Some(0.2), + top_p: None, + stop: vec![], + }; + let s = serde_json::to_string(&opts).expect("serialize"); + assert!(s.contains("\"num_predict\":64")); + assert!(!s.contains("top_p")); + assert!(!s.contains("stop")); + } + + #[test] + fn role_str_maps_correctly() { + assert_eq!(role_str(&Role::System), "system"); + assert_eq!(role_str(&Role::User), "user"); + assert_eq!(role_str(&Role::Assistant), "assistant"); + } + + #[test] + fn ollama_records_base_url_and_model() { + let backend = Ollama::new("http://127.0.0.1:11434", DEFAULT_MODEL); + assert_eq!(backend.base_url(), "http://127.0.0.1:11434"); + 
assert_eq!(backend.model(), "llama3"); + assert_eq!(backend.name(), "Ollama"); + } +} From 1121e21c6e471b56c356dc8aa1cc773f1f2b103a Mon Sep 17 00:00:00 2001 From: Brand Date: Thu, 11 Jun 2026 14:42:11 -0600 Subject: [PATCH 4/7] =?UTF-8?q?docs:=20README=20catches=20up=20with=2042b2?= =?UTF-8?q?bad=20=E2=80=94=20registry=20is=203/3=20verified,=20hash=20bloc?= =?UTF-8?q?ker=20resolved?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'five model SHA256 hashes' pre-release blocker was cleared when the unhashed Qwen3 entries were dropped (#50); the shipping registry is 3 models, all hashed. README still claimed 5 GGUFs + an open blocker. Co-Authored-By: Claude Fable 5 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bc0db7c..c1d4c7b 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Apache 2.0 — see [LICENSE](LICENSE). Choose your own license for derivative wo | `ocm-inference` | `InferenceBackend` trait + llama.cpp + vLLM adapters + supervisor. Auto-selects backend by platform. | Built | | `ocm-memory` | Mem0 client (search before generation, persist after). | Built | | `ocm-mcp` | MCP stdio JSON-RPC bridge. Lets Claude Code / Cursor / Cline / Continue.dev connect via standard MCP. | Built | -| `ocm-models` | Curated registry (5 GGUFs across tiny / default / canonical tiers) + streaming SHA256-verified downloader. Refuses unverified hashes. | Built | +| `ocm-models` | Curated registry (3 GGUFs across tiny / default tiers, all SHA256-verified; canonical tier returns in v0.1.1) + streaming verified downloader. Refuses unverified hashes. | Built | | `ocm-mesh` | Mesh transport trait + iroh / libp2p stubs. Real implementations land in v2. 
| Scaffolded | ### Frontend (`frontend/`, SvelteKit 2 + Svelte 5 + adapter-static) @@ -111,7 +111,7 @@ Everything in `docs/superpowers/` informed the architecture: - Codesigning + auto-update (v4) - Sandboxing + Sybil resistance (v5) - Sharded inference (v6) -- The five model SHA256 hashes in `crates/ocm-models/registry.json` — the downloader refuses empty-hash entries, so downloads no-op until hashes are committed +- Qwen3 canonical-tier registry entries (dropped pending verified hashes; return in v0.1.1 — the shipping 3-model registry is fully SHA256-verified) - Live deployment on [ocm.shortcircuit.bot](https://ocm.shortcircuit.bot) — domain reserved, site not yet up ## Contributing From 5c60cb592ccc6b37edd204d129c98e9b23570aa3 Mon Sep 17 00:00:00 2001 From: Brand Date: Thu, 11 Jun 2026 14:44:38 -0600 Subject: [PATCH 5/7] =?UTF-8?q?docs(release):=20v0.1.1=20draft=20notes=20?= =?UTF-8?q?=E2=80=94=20first=20measured-evidence=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- docs/release-notes/v0.1.1-draft.md | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 docs/release-notes/v0.1.1-draft.md diff --git a/docs/release-notes/v0.1.1-draft.md b/docs/release-notes/v0.1.1-draft.md new file mode 100644 index 0000000..cb33143 --- /dev/null +++ b/docs/release-notes/v0.1.1-draft.md @@ -0,0 +1,54 @@ +# OCM v0.1.1 — DRAFT release notes + +> Status: DRAFT — written 2026-06-11. Paste into the GitHub draft release when +> tagging v0.1.1. Tagging requires the version bump in Cargo.toml / +> frontend/package.json / tauri.conf.json per docs/release-process.md. + +## Headline: the memory loop is now measured, not promised + +v0.1.0 shipped the architecture; v0.1.1 ships the first evidence. 
The bench +suite went from scaffold to working instrument, and the central claim — that +library-driven memory retrieval makes a small local model materially better — +now has a CONFIRMED verdict behind it. + +## Benchmarks (the headline numbers) + +- **`amnesia-ab` (NEW, first memory sandbox to run) — CONFIRMED.** + Memory ON vs OFF on llama3 8B Q4 + mxbai-embed-large, 62-memory corpus with + cross-project distractors, 20 tasks, objective key-fact scoring: + **94.2%** fact recall with memory vs **2.5%** without; retrieval hit rate + **100%**. The OFF arm's failure mode is confident fabrication, not ignorance — + the memory loop is the difference between correct specifics and plausible + lies on an 8B model. +- **`aider-repomap-fidelity` — ACTIVE, 59.2% CONFIRMED** (#53). +- **`sandbox-e` schema compression and `sandbox-i` Mem0 encryption overhead + flipped ACTIVE** (#51, #52); frontier-comparison sandboxes E–I landed as + INACTIVE hypothesis stubs (#48). +- **`run-all` orchestrator + cross-sandbox comparison table** (#47) and the + comprehensive bench guide `docs/benchmarking.md` (#49). + +## Inference + +- **NEW: Ollama backend adapter** (`ocm-inference::ollama`). Bridges OCM to an + existing Ollama daemon via its native NDJSON API — the zero-extra-process + path for the largest installed base of local-model users. Parser tests are + pinned to verbatim captures from a live daemon. Explicit construction for + now; daemon settings wiring (`backend = "ollama"`) is the follow-up. + +## Models + +- Registry is now **3 models, 3/3 SHA256-verified** — the unhashed Qwen3 + entries were dropped (#50) and return when verified hashes are available. + The "downloads no-op on empty hashes" pre-release blocker is gone. + +## Docs + +- README corrected to match the shipping registry (was still claiming 5 GGUFs + and an open hash blocker). + +## Known limitations (unchanged from v0.1.0) + +- Unsigned installers (codesigning lands in v4) — expect OS warnings. 
+- Daemon expects external inference + Mem0 processes; supervised spawn is the + next UX milestone. +- Mesh transport remains a v2 trait scaffold. From 656d5a1d4a523423e316b6cda912179421c5b430 Mon Sep 17 00:00:00 2001 From: Brand Date: Thu, 11 Jun 2026 14:48:49 -0600 Subject: [PATCH 6/7] =?UTF-8?q?fix(bench):=20amnesia-ab=20conforms=20to=20?= =?UTF-8?q?the=20ACTIVE-sandbox=20contract=20=E2=80=94=20compose=20+=20ben?= =?UTF-8?q?ch.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dry-run validator requires docker-compose.yml + bench.py for ACTIVE sandboxes (caught by Bench Framework CI on PR #67 — the framework doing its job). bench.py delegates to run.mjs (ONE harness, the exact artifact that produced the CONFIRMED result); compose runs it in node:22-slim against the HOST Ollama daemon via host-gateway, same host-dependency pattern as vllm-q4-llama8b. run.mjs now honors OLLAMA_URL. Local validate_compose: PASS. Co-Authored-By: Claude Fable 5 --- bench/isolation/memory/amnesia-ab/bench.py | 21 +++++++++++++++++++ .../memory/amnesia-ab/docker-compose.yml | 14 +++++++++++++ bench/isolation/memory/amnesia-ab/run.mjs | 2 +- 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 bench/isolation/memory/amnesia-ab/bench.py create mode 100644 bench/isolation/memory/amnesia-ab/docker-compose.yml diff --git a/bench/isolation/memory/amnesia-ab/bench.py b/bench/isolation/memory/amnesia-ab/bench.py new file mode 100644 index 0000000..2baa82c --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/bench.py @@ -0,0 +1,21 @@ +"""amnesia-ab framework entrypoint — delegates to the Node runner (run.mjs). + +This sandbox's harness is JavaScript because the measurement is pure HTTP +against an Ollama daemon, and run.mjs is the exact artifact that produced the +committed first CONFIRMED result (results/run-2026-06-11T20-32-21.json). 
+bench.py exists so the framework's ACTIVE-sandbox contract (compose + bench) +holds; it delegates rather than re-implementing, so there is ONE harness. + +Requires: a reachable Ollama daemon (default http://127.0.0.1:11434, override +with OLLAMA_URL) with `llama3` and `mxbai-embed-large` pulled, plus Node 18+. +Host-dependency pattern matches vllm-q4-llama8b (which needs a host GPU). +""" + +import os +import subprocess +import sys + +HERE = os.path.dirname(os.path.abspath(__file__)) + +if __name__ == "__main__": + sys.exit(subprocess.call(["node", os.path.join(HERE, "run.mjs")])) diff --git a/bench/isolation/memory/amnesia-ab/docker-compose.yml b/bench/isolation/memory/amnesia-ab/docker-compose.yml new file mode 100644 index 0000000..46475bd --- /dev/null +++ b/bench/isolation/memory/amnesia-ab/docker-compose.yml @@ -0,0 +1,14 @@ +services: + bench: + image: node:22-slim + volumes: + - ./:/work + working_dir: /work + environment: + # The measurement needs a HOST Ollama daemon with llama3 + + # mxbai-embed-large pulled — same host-dependency pattern as + # vllm-q4-llama8b needing a host GPU. 
+ - OLLAMA_URL=http://host.docker.internal:11434 + extra_hosts: + - "host.docker.internal:host-gateway" + command: ["node", "run.mjs"] diff --git a/bench/isolation/memory/amnesia-ab/run.mjs b/bench/isolation/memory/amnesia-ab/run.mjs index c425a18..9591e57 100644 --- a/bench/isolation/memory/amnesia-ab/run.mjs +++ b/bench/isolation/memory/amnesia-ab/run.mjs @@ -16,7 +16,7 @@ import { dirname, join } from 'node:path'; import { fileURLToPath } from 'node:url'; const HERE = dirname(fileURLToPath(import.meta.url)); -const OLLAMA = 'http://127.0.0.1:11434'; +const OLLAMA = process.env.OLLAMA_URL || 'http://127.0.0.1:11434'; const CHAT_MODEL = 'llama3'; const EMBED_MODEL = 'mxbai-embed-large'; const TOP_K = 5; From 7b691cca0b64ce0d5b8d7a39f7b05613ca96d3ca Mon Sep 17 00:00:00 2001 From: Brand Date: Thu, 11 Jun 2026 14:51:25 -0600 Subject: [PATCH 7/7] =?UTF-8?q?docs(bench):=20regenerate=20coverage=20+=20?= =?UTF-8?q?metrics=20=E2=80=94=20amnesia-ab=20maps=20to=20spec=20row=209?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- docs/coverage.md | 4 ++-- docs/metrics.md | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/coverage.md b/docs/coverage.md index 439978b..4c50f45 100644 --- a/docs/coverage.md +++ b/docs/coverage.md @@ -2,7 +2,7 @@ _Auto-generated by `bench coverage`. Do not edit by hand._ -**Spec rows:** 33 total | **with a sandbox:** 3 | **CONFIRMED:** 0 +**Spec rows:** 33 total | **with a sandbox:** 4 | **CONFIRMED:** 0 | Row | Decision title | Sandbox(es) | Best verdict | |---|---|---|---| @@ -14,7 +14,7 @@ _Auto-generated by `bench coverage`. Do not edit by hand._ | 6 | Inference engine — Apple Silicon peers (incl. 
iPad M-series via v0.7 mobile policy) | _(none)_ | — | | 7 | Sharded inference (v6+) | _(none)_ | — | | 8 | Mesh transport (v2+, mobile bindings v1.5+) | _(none)_ | — | -| 9 | Agent memory + virtual context | _(none)_ | — | +| 9 | Agent memory + virtual context | `amnesia-ab` | (no run yet) | | 10 | Agent runtime | _(none)_ | — | | 11 | Client-facing API | _(none)_ | — | | 12 | Daemon / cross-platform UI | _(none)_ | — | diff --git a/docs/metrics.md b/docs/metrics.md index 0216933..7cd59c6 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -2,11 +2,12 @@ _Auto-generated by `bench dashboard`. Do not edit by hand._ -**[FAIL]** 0 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 4 no-run +**[FAIL]** 0 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 5 no-run | Sandbox | Hypothesis | Category | Primary metric | Latest | Threshold | Verdict | Hardware | Run | |---|---|---|---|---|---|---|---|---| | `sandbox-i-mem0-encryption-overhead` | `mem0-sqlcipher-aes256-overhead` | frontier-comparison | `encrypted_retrieval_latency_overhead_pct` | - | <= 15.0 | (no run yet) | `-` | - | | `sandbox-e-schema-compression` | `schema-compression-token-impact` | frontier-comparison | `input_tokens_pct_reduction_median` | - | >= 30.0 | (no run yet) | `-` | - | | `vllm-q4-llama8b` | `vllm-q4-llama8b-singlestream-tps` | inference-engines | `tokens_per_second_median_single_stream` | - | >= 100.0 | (no run yet) | `-` | - | +| `amnesia-ab` | `amnesia-ab-memory-loop` | memory | `memory_on_fact_recall_pct` | - | >= 70.0 | (no run yet) | `-` | - | | `aider-repomap-fidelity` | `aider-repomap-token-reduction-and-symbol-coverage` | retrieval | `token_reduction_pct` | - | >= 50.0 | (no run yet) | `-` | - |