diff --git a/bench/isolation/frontier-comparison/sandbox-e-schema-compression/README.md b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/README.md new file mode 100644 index 0000000..b9a4ac8 --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/README.md @@ -0,0 +1,19 @@ +# Sandbox E: Schema Compression Token Impact + +**Hypothesis:** MCP tool-schema compression (strip descriptions, shorten param names, hide optional params) cuts per-request input tokens by 30-60% with ≤2pp tool-call-accuracy loss. + +**Status:** INACTIVE — workload + tool-call accuracy harness not yet wired. + +## Why this matters + +Spec v0.2 row 21 locks schema compression as v1 default. This sandbox empirically validates the claimed 30-60% token reduction on representative tool-rich workloads. If the secondary metric (tool-call accuracy delta) blows past +5pp loss, the algorithm is too aggressive and the row 21 lock needs revisiting. + +## Pair with + +- `sandbox-a-raw-vllm-baseline` (when implemented) — comparison anchor +- `sandbox-c-aider-repomap` — orthogonal compression (structural, not schema) +- `sandbox-d-dspy-compiled` — orthogonal compression (behavioral, not schema) + +## Source + +Spec v0.2 row 21. Independent benchmarks of schema compression on Hermes-trained 8B models. 
diff --git a/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json new file mode 100644 index 0000000..d61e8ca --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-e-schema-compression/expected.json @@ -0,0 +1,25 @@ +{ + "hypothesis_id": "schema-compression-token-impact", + "claim": "Schema compression on MCP tool definitions (strip descriptions, shorten param names, hide optional params) reduces per-request input tokens by 30-60% on a representative tool-rich workload (10+ tools, multi-turn chat) with <=2pp accuracy loss on tool-call selection.", + "metric": "input_tokens_pct_reduction", + "thresholds": { + "confirm_at_least": 30.0, + "refute_below": 15.0 + }, + "secondary_metric": "tool_call_accuracy_delta_pp", + "secondary_thresholds": { + "confirm_at_most": 2.0, + "refute_above": 5.0 + }, + "workload": "mcp-tool-rich-multiturn.jsonl", + "source_for_claim": "Spec v0.2 row 21: schema compression default-on for MCP tool schemas. Cited 30-60% token reduction.", + "comparison_anchor": "frontier-comparison/sandbox-a-raw-vllm-baseline", + "decision_rule": "If CONFIRMED on tokens AND secondary stays under +2pp accuracy hit, schema compression stays the v1 default. If REFUTED on tokens, compression algorithm needs revisit. 
If REFUTED on accuracy delta (>5pp loss), the algorithm is too aggressive and needs the reverse — preserve more.", + "timeout_seconds": 1800, + "status": "INACTIVE", + "blocked_on": [ + "MCP tool-rich workload not yet curated (need 10+ tools, multi-turn chat fixtures)", + "Tool-call accuracy harness not yet wired into bench/bench/metrics.py", + "Sandbox-A baseline must run first to provide comparison anchor" + ] +} diff --git a/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/README.md b/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/README.md new file mode 100644 index 0000000..2911656 --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/README.md @@ -0,0 +1,31 @@ +# Sandbox F: Memory Palace Effective-Corpus + +**Hypothesis:** OCM's full memory stack (Mem0 + Memory Palace at 5 GB scale) hits ≥92% on long-horizon QA at ≤8 000 tokens/query — comparable to or better than a frontier 1M-token-context model paying 200× more tokens per query. + +**Status:** INACTIVE — Memory Palace federation (v3.5+) not yet implemented; this is the v0.4 spec's load-bearing future measurement. + +## Why this is the most consequential sandbox + +Spec v0.4 row 27 locks the **Effective-Context Triad** (Expansion / Stratification / Quick Look-Up) as a cross-cutting design constraint. Sandbox F is its empirical test. If CONFIRMED, OCM's pitch holds: knowledge axis (palace) + retrieval axis (Mem0) compounds to beat raw context-window expansion at a fraction of the per-query cost. If REFUTED, the triad needs revising. + +## Comparison anchor + +Frontier 1M-token-context model (Gemini 1.5 Pro 1M, Claude 3.5 Sonnet 200K, or GPT-4 Turbo 128K with full corpus stuffed). Same 50-prompt suite, full corpus dropped into the prompt. Expected: ~92-98% accuracy at ~200K-1M tokens/query. OCM target: ~92% at ≤8K tokens/query. 
+ +## What CONFIRMED unlocks + +- Spec row 26 (Memory Palace federation) gets locked as a network-effect lever +- The "Effective-Context > Single-Window" pitch moves from speculation to evidence +- v3.5+ palace work has clear ROI + +## What REFUTED unlocks + +- Either the palace design needs different chunking / signing / sub-context retrieval +- Or the retrieval policy (Mem0's library-driven approach) needs tuning +- Or the triad isn't actually load-bearing and v0.4 row 27 needs softening + +## Source + +- Spec v0.4 row 27 + 26 +- Research note `docs/superpowers/research/2026-05-09-effective-context-triad-expansion-stratification-lookup.md` +- Research note `docs/superpowers/research/2026-05-09-decentralized-memory-palace-pattern.md` diff --git a/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/expected.json b/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/expected.json new file mode 100644 index 0000000..b5e3943 --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-f-memory-palace-effective-corpus/expected.json @@ -0,0 +1,26 @@ +{ + "hypothesis_id": "memory-palace-effective-corpus-vs-frontier-1m", + "claim": "OCM's full memory stack (Mem0 + Memory Palace at 5GB scale) answers a 50-prompt long-horizon QA suite with >=92% accuracy at <=8000 tokens-per-query injected — comparable to or better than a frontier 1M-token-context model that pays 200x more tokens per query for the same accuracy.", + "metric": "long_horizon_qa_accuracy_at_token_budget", + "thresholds": { + "confirm_at_least": 92.0, + "refute_below": 75.0 + }, + "secondary_metric": "tokens_injected_p50", + "secondary_thresholds": { + "confirm_at_most": 8000, + "refute_above": 20000 + }, + "workload": "long-horizon-qa-50.jsonl", + "source_for_claim": "Spec v0.4 row 27 (Effective-Context Triad) + research note 2026-05-09-effective-context-triad-expansion-stratification-lookup.md. 
The triad's load-bearing measurement.", + "comparison_anchor": "frontier-1m-context-baseline (e.g. Gemini 1.5 Pro 1M-token, Claude 3.5 Sonnet 200K, GPT-4 Turbo 128K with full corpus stuffed)", + "decision_rule": "If CONFIRMED at the token budget, OCM's Effective-Context Triad story holds — knowledge axis (palace) compounds with retrieval axis (Mem0) to beat raw window expansion. If REFUTED, either (a) palace federation isn't load-bearing yet, (b) retrieval policies need tuning, or (c) the triad's claim needs revising. The most consequential bench in the v0.4+ stack.", + "timeout_seconds": 7200, + "status": "INACTIVE", + "blocked_on": [ + "Memory Palace federation (v3.5+) not yet implemented", + "Frontier-1M baseline harness needs API keys + cost budget", + "Long-horizon QA workload (50 prompts spanning multi-week retrieval) not yet curated", + "5GB-scale palace fixture not yet generated (would seed from a real curated knowledge corpus)" + ] +} diff --git a/bench/isolation/frontier-comparison/sandbox-g-wire-compression/README.md b/bench/isolation/frontier-comparison/sandbox-g-wire-compression/README.md new file mode 100644 index 0000000..ebaa5eb --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-g-wire-compression/README.md @@ -0,0 +1,13 @@ +# Sandbox G: Wire Compression Bandwidth + +**Hypothesis:** zstd-6 on mesh payloads cuts bandwidth 60-80% with ≤2ms compress+decompress overhead. + +**Status:** INACTIVE — Phase 7+ mesh transport not yet implemented. + +## Why this matters + +Spec v0.4 row 28 locks the compression pipeline contract. Sandbox G is the bandwidth half of that contract; sandbox H is the activation-transfer half. Together they validate that OCM's mesh stays cheap on residential internet. + +## Source + +Spec v0.4 row 28 + research note `docs/superpowers/research/2026-05-09-encryption-compression-optimizations.md`. 
diff --git a/bench/isolation/frontier-comparison/sandbox-g-wire-compression/expected.json b/bench/isolation/frontier-comparison/sandbox-g-wire-compression/expected.json new file mode 100644 index 0000000..149f57e --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-g-wire-compression/expected.json @@ -0,0 +1,25 @@ +{ + "hypothesis_id": "wire-compression-zstd6-bandwidth-reduction", + "claim": "zstd-6 wire compression on mesh payloads (chat-relay, palace-gossip, skill-fetch) achieves 60-80% bandwidth reduction across a representative trace of OCM v2 mesh traffic, with <=2ms median compress+decompress overhead per payload on consumer hardware (M-series Mac / Ryzen 7).", + "metric": "bandwidth_reduction_pct", + "thresholds": { + "confirm_at_least": 60.0, + "refute_below": 40.0 + }, + "secondary_metric": "compress_decompress_overhead_ms_p50", + "secondary_thresholds": { + "confirm_at_most": 2.0, + "refute_above": 10.0 + }, + "workload": "mesh-traffic-trace-1k.jsonl", + "source_for_claim": "Spec v0.4 row 28 (Compression pipeline contract). zstd-6 / brotli for wire-level mesh payloads, ~60-80% bandwidth reduction.", + "comparison_anchor": "uncompressed-mesh-baseline (same trace, no compression layer)", + "decision_rule": "If CONFIRMED on bandwidth + secondary stays under 2ms, row 28 wire-compression lock holds. If REFUTED on bandwidth, payload structure may not be compressible enough at zstd-6 — try zstd-19 (higher CPU cost, better ratio) or brotli. 
If REFUTED on overhead, the latency budget violates the Effective-Context Triad's quick-look-up constraint.", + "timeout_seconds": 1800, + "status": "INACTIVE", + "blocked_on": [ + "Phase 7+ mesh transport (libp2p/iroh) not yet implemented", + "Mesh traffic trace fixture not yet curated (need representative chat-relay + palace-gossip + skill-fetch sample)", + "zstd-6 compression layer not yet wired into the mesh transport" + ] +} diff --git a/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/README.md b/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/README.md new file mode 100644 index 0000000..f0f02e2 --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/README.md @@ -0,0 +1,13 @@ +# Sandbox H: fp8 Activation Transfer + +**Hypothesis:** fp8 activation transfer between sharded inference layers preserves ≥99% of fp16 output quality at half the wire bandwidth. + +**Status:** INACTIVE — v6 sharded inference not yet implemented; this gates the v6 rollout. + +## Why this is a gate + +Spec v0.4 row 28 explicitly conditions the v6 fp8-default decision on Sandbox H: "Activation transfer (v6+ sharded inference): fp8 default with fp16 fallback, gated by Sandbox H confirmation." If H REFUTES on quality, fp8 is too lossy and v6 falls back to fp16 (with no bandwidth savings). If H REFUTES on bandwidth, the v6 economics story is overpromising. + +## Source + +Spec v0.4 row 28 + research note `docs/superpowers/research/2026-05-09-encryption-compression-optimizations.md`. 
diff --git a/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/expected.json b/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/expected.json new file mode 100644 index 0000000..9e62fbe --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-h-fp8-activation-transfer/expected.json @@ -0,0 +1,26 @@ +{ + "hypothesis_id": "fp8-activation-transfer-vs-fp16", + "claim": "fp8 activation transfer between sharded inference layers (v6+ Exo / Prima.cpp pattern) preserves >=99% of fp16-baseline output token quality on a 200-prompt suite, while halving wire bandwidth for cross-node activations. Gates the v6 sharded-inference rollout.", + "metric": "output_token_quality_ratio_vs_fp16", + "thresholds": { + "confirm_at_least": 0.99, + "refute_below": 0.95 + }, + "secondary_metric": "activation_bandwidth_reduction_pct", + "secondary_thresholds": { + "confirm_at_least": 45.0, + "refute_below": 30.0 + }, + "workload": "sharded-inference-200prompt.jsonl", + "source_for_claim": "Spec v0.4 row 28: 'Activation transfer (v6+ sharded inference): fp8 default with fp16 fallback, gated by Sandbox H confirmation.'", + "comparison_anchor": "fp16-activation-baseline (same prompts, same shard topology, no quantization)", + "decision_rule": "CONFIRMED gates v6 sharded-inference at fp8 default. REFUTED on quality means fp8 is too lossy for our shard topology — fall back to fp16 (no bandwidth savings) or research mixed-precision per-layer policies. 
REFUTED on bandwidth means the savings story is overpromising and v6 economics need revisit.", + "timeout_seconds": 7200, + "status": "INACTIVE", + "blocked_on": [ + "v6 sharded inference (Exo / Prima.cpp pattern) not yet implemented in OCM", + "Cross-node activation transfer protocol not yet defined", + "fp8 quantization codepath not yet wired into ocm-inference", + "200-prompt sharded-inference workload fixture not yet curated" + ] +} diff --git a/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/README.md b/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/README.md new file mode 100644 index 0000000..c63132f --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/README.md @@ -0,0 +1,15 @@ +# Sandbox I: Mem0 Encryption Overhead + +**Hypothesis:** SQLCipher AES-256 with Argon2id-derived key adds ≤15% latency overhead on Mem0 retrieval, no measurable accuracy regression. + +**Status:** INACTIVE — SQLCipher integration not yet wired into `ocm-memory`. + +## Why this matters + +Spec v0.4 row 29 names a 5-15% latency overhead range. Sandbox I empirically validates that range under our actual query patterns. If REFUTED on latency, encryption becomes opt-in (off by default) with a documented warning. If REFUTED on accuracy delta, something is structurally wrong with SQLCipher under our access pattern — row 29 needs revising. + +This sandbox becomes critical when OCM ships on remote VMs (per spec row 31) — Zone A encryption stops being defense-in-depth and becomes load-bearing. + +## Source + +Spec v0.4 row 29 + research note `docs/superpowers/research/2026-05-09-encryption-compression-optimizations.md`. 
diff --git a/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/expected.json b/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/expected.json new file mode 100644 index 0000000..aa1cc57 --- /dev/null +++ b/bench/isolation/frontier-comparison/sandbox-i-mem0-encryption-overhead/expected.json @@ -0,0 +1,25 @@ +{ + "hypothesis_id": "mem0-sqlcipher-aes256-overhead", + "claim": "SQLCipher AES-256 encryption on Mem0's at-rest SQLite store with Argon2id-derived key adds <=15% latency overhead on a 1000-query retrieval workload vs unencrypted baseline, with no measurable accuracy regression.", + "metric": "encrypted_retrieval_latency_overhead_pct", + "thresholds": { + "confirm_at_most": 15.0, + "refute_above": 30.0 + }, + "secondary_metric": "retrieval_accuracy_delta_pp", + "secondary_thresholds": { + "confirm_at_most": 1.0, + "refute_above": 3.0 + }, + "workload": "mem0-retrieval-1000q.jsonl", + "source_for_claim": "Spec v0.4 row 29 (Encryption mapped onto privacy zones A/B/C). 'SQLCipher AES-256 with Argon2id-derived key from user passphrase (~5-15% latency overhead).'", + "comparison_anchor": "mem0-unencrypted-baseline (same workload, plain SQLite)", + "decision_rule": "If CONFIRMED, Zone A encryption ships as default in v1.x. If REFUTED on latency, encryption becomes opt-in with a documented warning. If REFUTED on accuracy delta, something is structurally wrong with SQLCipher under our query patterns and the row 29 lock needs revising.", + "timeout_seconds": 1800, + "status": "INACTIVE", + "blocked_on": [ + "SQLCipher integration not yet wired into ocm-memory crate", + "Argon2id-from-passphrase key-derivation not yet implemented", + "Mem0 retrieval workload fixture (1000 queries) not yet curated" + ] +}