diff --git a/bench/bench/runner.py b/bench/bench/runner.py
index 80d56c9..e7a5b21 100644
--- a/bench/bench/runner.py
+++ b/bench/bench/runner.py
@@ -213,6 +213,10 @@ def _execute_compose(
     out_dir: Path,
 ) -> RunResult:
     """Run docker-compose for one repeat. Production implementation hook."""
+    # Resolve to absolute so -f and cwd= don't double up when callers pass
+    # relative paths (e.g. 'bench/isolation/memory/...') and subprocess.run
+    # changes cwd to the same relative prefix.
+    sandbox_path = sandbox_path.resolve()
     try:
         proc = subprocess.run(
             ["docker", "compose", "-f", str(sandbox_path / "docker-compose.yml"),
diff --git a/bench/isolation/memory/mem0-v3-locomo/.gitignore b/bench/isolation/memory/mem0-v3-locomo/.gitignore
new file mode 100644
index 0000000..511e8a2
--- /dev/null
+++ b/bench/isolation/memory/mem0-v3-locomo/.gitignore
@@ -0,0 +1,3 @@
+_locomo_workload/
+_mem0_locomo_faiss/
+outputs.json
diff --git a/bench/isolation/memory/mem0-v3-locomo/README.md b/bench/isolation/memory/mem0-v3-locomo/README.md
index 6f121f4..88a247d 100644
--- a/bench/isolation/memory/mem0-v3-locomo/README.md
+++ b/bench/isolation/memory/mem0-v3-locomo/README.md
@@ -1,29 +1,98 @@
 # Sandbox: mem0-v3-locomo
 
-**Hypothesis:** Mem0 v3 (library-driven retrieval) achieves ≥88 LoCoMo recall on small-model setup, within 4 points of the published 91.6 figure on larger models. Library-driven retrieval is the directly-aligned pattern for the small-model thesis.
+**Hypothesis:** Mem0 v3 (library-driven retrieval) achieves ≥88 LoCoMo recall
+on a small-model setup (llama3 8B Q4 + Mem0 v3 + faiss-cpu), within 4 points
+of the published 91.6 figure on larger models. Library-driven retrieval is the
+directly-aligned pattern for the small-model thesis.
 
-**Status:** INACTIVE — workload + harness not yet wired.
+**Status:** ACTIVE — verdict pending.
 
-## What this measures (once active)
+## What this measures
 
-- **LoCoMo recall**: standard memory-retrieval benchmark across multi-session conversations
-- **tokens_retrieved_p50**: median tokens injected as retrieval context per turn
+- **LoCoMo recall**: standard multi-session long-conversation memory benchmark
+  across 10 conversations (5,882 turns, 1,986 QA items). Per-QA recall =
+  |retrieved dia_ids ∩ evidence dia_ids| / |evidence dia_ids|.
+  `locomo_recall_score` = mean over conversations × 100.
+- **tokens_retrieved_p50**: median per-QA token count injected as retrieval
+  context (true flat median over all 1,986 QA items; 4 chars ≈ 1 token).
 
 ## What this does NOT measure
 
-- Multi-session interdependent task quality — that's MemoryArena (separate sandbox: `memory/mem0-v3-memoryarena`)
+- Multi-session interdependent task quality — that's MemoryArena
+  (`memory/mem0-v3-memoryarena`)
 - Generation quality on retrieved context — orthogonal; see chat-quality sandboxes
-- Agent-driven memory orchestration — that's Letta's paradigm, see `memory/letta-tool-memory`
+- Agent-driven memory orchestration — that's Letta's paradigm (`memory/letta-tool-memory`)
 
 ## How to interpret
 
 | Verdict | What it means |
 |---|---|
 | CONFIRMED | Library-driven retrieval is structurally aligned with small-model thesis; v0.4 row 9 lock holds |
-| REFUTED on recall | Investigate Qwen3 8B's retrieval-context utilization or Mem0 v3 config tuning |
+| REFUTED on recall | Investigate llama3 8B's retrieval-context utilization or Mem0 v3 config tuning |
 | REFUTED on tokens | 7000 token budget is too optimistic; revisit Effective-Context Triad quick-look-up budget |
 | INCONCLUSIVE | Variance too high; expand workload size or repeat count |
 
+## Dataset
+
+LoCoMo — Maharana et al., *Evaluating Very Long-Term Conversational Memory of
+LLM Agents*, ACL 2024 ([arXiv:2402.17753](https://arxiv.org/abs/2402.17753)).
+Repository: [snap-research/locomo](https://github.com/snap-research/locomo),
+licensed **CC BY-NC 4.0**.
+
+We download `locomo10.json` from the official snap-research/locomo repository
+at run time (SHA-pinned to commit `cbfbc1dba6bc53d00625212a0f22d55ffee7c1fc`)
+and do **not** redistribute it. Use is non-commercial benchmarking of OCM's
+library-driven retrieval pattern (spec row 9). Attribution per CC BY-NC 4.0
+requirements.
+
+## Run
+
+The bench framework runs this sandbox via `docker compose up` per the standard
+contract. Manual one-off:
+
+```bash
+cd bench/isolation/memory/mem0-v3-locomo
+docker compose run --rm bench
+```
+
+With the optional LLM-in-the-loop diagnostic (requires Ollama with `llama3`
+pulled on the host):
+
+```bash
+docker compose run --rm bench python bench.py --with-llm
+```
+
+The verdict path (default, no `--with-llm`) runs in ~6-8 minutes per repeat,
+~18-25 minutes for the standard 3 repeats. No Ollama required for the verdict.
+
 ## Source for the claim
 
-Mem0 v3 release notes (April 2026), pinned in research note `docs/superpowers/research/2026-05-09-decentralized-memory-palace-pattern.md`.
+Mem0 v3 release notes (April 2026), pinned in research note
+`docs/superpowers/research/2026-05-09-decentralized-memory-palace-pattern.md`.
+
+---
+
+## Verdicts
+
+**Run 1 — 2026-06-11 (operator dev box, Windows/Docker, CPU)**
+
+| Field | Value |
+|---|---|
+| `locomo_recall_score` | **30.79** |
+| Verdict | **REFUTED** (contract: confirm ≥88 · refute <80) |
+| Per-conversation recall | 0.293 / 0.245 / 0.282 / 0.179 / 0.552 / 0.354 / 0.366 / 0.388 / 0.266 / 0.154 |
+| tokens_p50 | 254 |
+| Elapsed | 4,364s measurement (~73 min total incl. install) |
+| Config | mem0ai 2.0.5 · MiniLM-L6-v2 · faiss (BM25 hybrid DISABLED — faiss lacks keyword search) · no spaCy · `add(infer=False)` · top_k=10 |
+| Provenance | locomo10.json SHA-pinned `cbfbc1d…` · branch feat/mem0-v3-locomo-activation |
+
+**What this refutes — and what it does not.** This REFUTES "library-driven retrieval
+in the hermetic pure-vector config reaches published-Mem0 recall at LoCoMo scale."
+It does NOT refute the memory thesis (amnesia-ab: 94.2% at small scale) or Mem0's
+production config — which adds exactly what this config strips: BM25+entity tri-signal
+rank fusion and LLM fact extraction (`infer=True`). External evidence (Hindsight,
+arXiv 2512.12818) shows hybrid-fusion + a 22M cross-encoder reranker reaches ~89.6
+LoCoMo with open models. The decision rule's "investigate config tuning" branch fires:
+next iteration = BM25 sidecar + RRF + reranker + a 2026-class embedder, re-run this
+same contract. A REFUTED first verdict on a stripped config is the framework working.
+
diff --git a/bench/isolation/memory/mem0-v3-locomo/bench.py b/bench/isolation/memory/mem0-v3-locomo/bench.py
new file mode 100644
index 0000000..dc0189c
--- /dev/null
+++ b/bench/isolation/memory/mem0-v3-locomo/bench.py
@@ -0,0 +1,381 @@
+"""Mem0 v3 LoCoMo recall measurement.
+
+Measures Mem0's library-driven retrieval (NOT LLM-driven extraction) on the
+LoCoMo long-conversation QA benchmark.  The verdict path is retrieval-only
+(fast, hermetic, no Ollama required).  Pass --with-llm to additionally run an
+end-to-end Ollama diagnostic.
+
+Dataset (fetched at run time, never bundled):
+  LoCoMo — Maharana et al., ACL 2024 (arXiv:2402.17753)
+  License: CC BY-NC 4.0  |  snap-research/locomo on GitHub
+  We download locomo10.json at run time and do NOT redistribute it.
+
+Hermetic config (mirrors mem0-library-retrieval-recall / sandbox #56):
+  embedder   : huggingface (sentence-transformers/all-MiniLM-L6-v2) — no key
+  vector_store: faiss (local file-backed) — no server needed
+  llm        : stub OpenAI config (add(infer=False) — never called)
+
+Primary output:
+  locomo_recall_score  : mean per-conversation recall × 100  (0-100 scale)
+  tokens_retrieved_p50 : median per-QA token count (true flat median)
+
+Optional diagnostic (--with-llm):
+  llm_answer_match_pct : fraction of QAs where Ollama llama3 reply contains
+                         the ground-truth answer substring
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import shutil
+import statistics
+import time
+import urllib.request as _urlreq
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Dataset constants — pinned to a specific upstream commit for reproducibility
+# ---------------------------------------------------------------------------
+
+LOCOMO_PIN = "cbfbc1dba6bc53d00625212a0f22d55ffee7c1fc"
+LOCOMO_URL = (
+    "https://raw.githubusercontent.com/snap-research/locomo"
+    f"/{LOCOMO_PIN}/data/locomo10.json"
+)
+EXPECTED_SHA256 = "79fa87e90f04081343b8c8debecb80a9a6842b76a7aa537dc9fdf651ea698ff4"
+
+_SESSION_RE = re.compile(r"^session_\d+$")
+
+# ---------------------------------------------------------------------------
+# LLM diagnostic constants (only used when --with-llm is on)
+# ---------------------------------------------------------------------------
+
+SYS_ON = (
+    "You are a personal assistant with persistent memory of past sessions.\n"
+    "RELEVANT MEMORIES retrieved for this request:\n{mems}\n"
+    "Answer using these memories — be specific. Be concise."
+)
+
+
+# ---------------------------------------------------------------------------
+# Step 1 — fetch dataset
+# ---------------------------------------------------------------------------
+
+
+def fetch_locomo(dest: Path) -> Path:
+    """Download locomo10.json to *dest*, skipping if SHA256 already matches."""
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    if dest.exists():
+        if hashlib.sha256(dest.read_bytes()).hexdigest() == EXPECTED_SHA256:
+            print(f"[fetch] cache hit: {dest}")
+            return dest
+        dest.unlink()
+    url = os.environ.get("LOCOMO_URL", LOCOMO_URL)
+    print(f"[fetch] downloading {url}")
+    _urlreq.urlretrieve(url, dest)
+    got = hashlib.sha256(dest.read_bytes()).hexdigest()
+    if got != EXPECTED_SHA256:
+        raise SystemExit(
+            f"locomo10.json SHA256 mismatch: got {got}, expected {EXPECTED_SHA256}"
+        )
+    print(f"[fetch] ok ({dest.stat().st_size:,} bytes)")
+    return dest
+
+
+# ---------------------------------------------------------------------------
+# Step 2 — hermetic Mem0 instance
+# ---------------------------------------------------------------------------
+
+
+def build_hermetic_mem0(faiss_dir: Path):
+    """Build a fresh Mem0 instance backed by a local faiss index.
+
+    Mirrors the config from mem0-library-retrieval-recall (#56).
+    """
+    from mem0 import Memory  # lazy import — loud failure if env lacks it
+
+    faiss_dir = Path(faiss_dir).resolve()
+    if faiss_dir.exists():
+        shutil.rmtree(faiss_dir)
+    faiss_dir.parent.mkdir(parents=True, exist_ok=True)
+
+    return Memory.from_config(
+        {
+            "embedder": {
+                "provider": "huggingface",
+                "config": {"model": "sentence-transformers/all-MiniLM-L6-v2"},
+            },
+            "vector_store": {
+                "provider": "faiss",
+                "config": {
+                    "collection_name": "ocm_locomo",
+                    "path": str(faiss_dir),
+                    "embedding_model_dims": 384,
+                },
+            },
+            "llm": {
+                "provider": "openai",
+                "config": {"api_key": "sk-stub-not-used", "model": "gpt-4o-mini"},
+            },
+        }
+    )
+
+
+# ---------------------------------------------------------------------------
+# Step 3 — ingest one conversation
+# ---------------------------------------------------------------------------
+
+
+def ingest_conversation(mem, conv: dict, *, user_id: str) -> dict:
+    """Ingest all turns of one LoCoMo conversation into Mem0.
+
+    Each turn becomes one memory (infer=False — bypasses extraction).
+    Metadata carries dia_id and session key for recall mapping.
+
+    Session keys are filtered to match `session_\\d+$` so date-time strings
+    and speaker fields inside the conversation dict are skipped.
+    """
+    n_turns = n_sessions = 0
+    for session_key, turns in conv["conversation"].items():
+        if not _SESSION_RE.match(session_key):
+            continue  # skip 'speaker_a', 'session_1_date_time', etc.
+        if not isinstance(turns, list):
+            continue
+        n_sessions += 1
+        for turn in turns:
+            mem.add(
+                turn["text"],
+                user_id=user_id,
+                infer=False,
+                metadata={"dia_id": turn["dia_id"], "session": session_key},
+            )
+            n_turns += 1
+    return {"turns": n_turns, "sessions": n_sessions}
+
+
+# ---------------------------------------------------------------------------
+# Step 4 — score one conversation (recall + token metrics)
+# ---------------------------------------------------------------------------
+
+
+def score_conversation(mem, conv: dict, *, user_id: str, top_k: int = 10) -> dict:
+    """Compute LoCoMo recall + token metrics for one conversation.
+
+    Returns per_qa_recall (list) and per_qa_tokens (list) so the caller can
+    aggregate across conversations with correct flat-list medians.
+    """
+    per_qa_recall: list[float] = []
+    per_qa_tokens: list[float] = []
+
+    for qa in conv.get("qa", []):
+        results = mem.search(
+            qa["question"],
+            filters={"user_id": user_id},
+            top_k=top_k,
+        )
+        hits = (
+            results.get("results", [])
+            if isinstance(results, dict)
+            else (results or [])
+        )
+
+        # Map retrieved hits back to dia_ids via metadata
+        retrieved_dia_ids: set[str] = set()
+        for r in hits:
+            md = r.get("metadata") or {}
+            did = md.get("dia_id")
+            if did:
+                retrieved_dia_ids.add(did)
+
+        evidence = set(qa.get("evidence", []))
+        if evidence:
+            per_qa_recall.append(
+                len(retrieved_dia_ids & evidence) / len(evidence)
+            )
+
+        # Token approximation: 4 chars ≈ 1 token (heuristic for scale check)
+        per_qa_tokens.append(
+            sum(len(r.get("memory", "")) / 4.0 for r in hits)
+        )
+
+    return {
+        "per_qa_recall": per_qa_recall,
+        "mean_recall": (
+            sum(per_qa_recall) / len(per_qa_recall) if per_qa_recall else 0.0
+        ),
+        "per_qa_tokens": per_qa_tokens,
+        "tokens_p50": (
+            statistics.median(per_qa_tokens) if per_qa_tokens else 0.0
+        ),
+        "n_qa": len(per_qa_recall),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Step 5 — optional LLM diagnostic
+# ---------------------------------------------------------------------------
+
+
+def _ollama_chat(ollama_url: str, system: str, user: str) -> str:
+    body = json.dumps(
+        {
+            "model": "llama3",
+            "stream": False,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+            "options": {"temperature": 0.2, "num_predict": 200},
+        }
+    ).encode()
+    req = _urlreq.Request(
+        f"{ollama_url}/api/chat",
+        data=body,
+        headers={"content-type": "application/json"},
+    )
+    with _urlreq.urlopen(req, timeout=60) as r:
+        return json.loads(r.read())["message"]["content"]
+
+
+def _norm(s: str) -> str:
+    return "".join(c for c in s.lower() if c.isalnum())
+
+
+def _diagnose_with_llm(
+    mem, conv: dict, user_id: str, ollama_url: str, top_k: int = 10
+) -> dict:
+    """Run optional end-to-end LLM diagnostic (not in verdict contract)."""
+    matches = total = 0
+    for qa in conv.get("qa", []):
+        results = mem.search(
+            qa["question"],
+            filters={"user_id": user_id},
+            top_k=top_k,
+        )
+        hits = (
+            results.get("results", [])
+            if isinstance(results, dict)
+            else (results or [])
+        )
+        mems_block = "\n".join(f"- {h.get('memory', '')}" for h in hits)
+        try:
+            reply = _ollama_chat(
+                ollama_url,
+                SYS_ON.format(mems=mems_block),
+                qa["question"],
+            )
+        except Exception as e:
+            print(f"[llm] error on '{qa['question'][:40]}…': {e}")
+            continue
+        total += 1
+        if _norm(qa["answer"]) in _norm(reply):
+            matches += 1
+    return {
+        "llm_n_qa": total,
+        "llm_match_pct": (100.0 * matches / total) if total else 0.0,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Mem0 v3 LoCoMo recall bench")
+    parser.add_argument(
+        "--with-llm",
+        action="store_true",
+        help="Run optional Ollama LLM diagnostic (not part of verdict contract)",
+    )
+    args = parser.parse_args()
+
+    workload = fetch_locomo(Path("_locomo_workload/locomo10.json"))
+    convs: list[dict] = json.loads(workload.read_text(encoding="utf-8"))
+
+    started = time.monotonic()
+    faiss_dir = Path("_mem0_locomo_faiss")
+    mem = build_hermetic_mem0(faiss_dir)
+
+    per_conv_recall: list[float] = []
+    all_per_qa_tokens: list[float] = []  # flat — true median over all QAs
+    breakdown: list[dict] = []
+
+    for i, conv in enumerate(convs):
+        user_id = f"locomo_user_{i}"
+        ingest_stats = ingest_conversation(mem, conv, user_id=user_id)
+        score = score_conversation(mem, conv, user_id=user_id)
+
+        per_conv_recall.append(score["mean_recall"])
+        all_per_qa_tokens.extend(score["per_qa_tokens"])
+
+        breakdown.append(
+            {
+                "sample_id": conv.get("sample_id", f"conv_{i}"),
+                "mean_recall": score["mean_recall"],
+                "n_qa": score["n_qa"],
+                "n_turns": ingest_stats["turns"],
+                "n_sessions": ingest_stats["sessions"],
+                "tokens_p50": score["tokens_p50"],
+            }
+        )
+        print(
+            f"[conv {i}] recall={score['mean_recall']:.3f} "
+            f"qa={score['n_qa']} turns={ingest_stats['turns']}"
+        )
+
+    # Primary: mean over 10 conversations × 100 (0-100 scale, matches published 91.6)
+    primary = 100.0 * sum(per_conv_recall) / len(per_conv_recall)
+    # Secondary: true flat p50 over every individual QA's token count
+    secondary = statistics.median(all_per_qa_tokens) if all_per_qa_tokens else 0.0
+    elapsed = time.monotonic() - started
+
+    output: dict = {
+        "primary_value": primary,
+        "secondary_value": secondary,
+        "duration_seconds": elapsed,
+        "n_conversations": len(convs),
+        "n_qa_total": sum(b["n_qa"] for b in breakdown),
+        "embedder": "sentence-transformers/all-MiniLM-L6-v2",
+        "vector_store": "faiss-local",
+        "per_conversation": breakdown,
+    }
+
+    if args.with_llm:
+        ollama_url = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
+        print(f"[llm] running LLM diagnostic against {ollama_url}...")
+        diags = [
+            _diagnose_with_llm(mem, c, f"locomo_user_{i}", ollama_url)
+            for i, c in enumerate(convs)
+        ]
+        total_n = sum(d["llm_n_qa"] for d in diags)
+        total_match = sum(d["llm_n_qa"] * d["llm_match_pct"] / 100.0 for d in diags)
+        output["diagnostic"] = {
+            "llm_answer_match_pct": (
+                100.0 * total_match / total_n if total_n else 0.0
+            ),
+            "llm_n_qa": total_n,
+            "llm_model": "llama3",
+        }
+
+    # Cleanup faiss index — keeps the sandbox dir clean between runs
+    if faiss_dir.exists():
+        shutil.rmtree(faiss_dir)
+
+    Path("outputs.json").write_text(
+        json.dumps(output, indent=2), encoding="utf-8"
+    )
+    print(
+        f"\n[result] locomo_recall_score={primary:.2f} "
+        f"tokens_p50={secondary:.0f} "
+        f"elapsed={elapsed:.1f}s"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/isolation/memory/mem0-v3-locomo/docker-compose.yml b/bench/isolation/memory/mem0-v3-locomo/docker-compose.yml
new file mode 100644
index 0000000..c58d8d8
--- /dev/null
+++ b/bench/isolation/memory/mem0-v3-locomo/docker-compose.yml
@@ -0,0 +1,35 @@
+services:
+  bench:
+    image: python:3.11
+    # extra_hosts lets bench.py reach the host-side Ollama at
+    # http://host.docker.internal:11434 when --with-llm is passed.
+    # The default (retrieval-only) path never contacts Ollama.
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./:/work
+      # First live run spent ~50 min re-downloading torch into a throwaway
+      # container and died with nothing persisted. The named cache volume
+      # makes repeat installs minutes, not most-of-an-hour.
+      - locomo-pip-cache:/root/.cache/pip
+    working_dir: /work
+    environment:
+      # SHA-pinned download URL — fetched once per run, cached on repeat.
+      - LOCOMO_URL=https://raw.githubusercontent.com/snap-research/locomo/cbfbc1dba6bc53d00625212a0f22d55ffee7c1fc/data/locomo10.json
+      - OLLAMA_URL=http://host.docker.internal:11434
+      - PYTHONUNBUFFERED=1
+    # mem0ai pulls pydantic + networking deps; sentence-transformers pulls
+    # torch (large, pip-cached on warm runs via the named volume). faiss-cpu
+    # is Mem0's local vector store. NOTE: pip is NOT --quiet and python is
+    # unbuffered — the first live run was 50 silent minutes, indistinguishable
+    # from a hang. Logs are cheap; silence is not.
+    command:
+      - sh
+      - -c
+      - |
+        pip install torch --index-url https://download.pytorch.org/whl/cpu
+        pip install mem0ai sentence-transformers faiss-cpu
+        python -u bench.py
+
+volumes:
+  locomo-pip-cache:
diff --git a/bench/isolation/memory/mem0-v3-locomo/expected.json b/bench/isolation/memory/mem0-v3-locomo/expected.json
index f7eddf4..5f23c7a 100644
--- a/bench/isolation/memory/mem0-v3-locomo/expected.json
+++ b/bench/isolation/memory/mem0-v3-locomo/expected.json
@@ -1,6 +1,6 @@
 {
   "hypothesis_id": "mem0-v3-locomo-recall",
-  "claim": "Mem0 v3 with library-driven retrieval (no agent-driven memory tool calls) achieves >=88 LoCoMo recall on a small-model setup (Qwen3 8B Q4 + Mem0 v3 + sqlite-vec) at <=7000 tokens retrieved per query — within 4 points of the 91.6 LoCoMo published number on larger models.",
+  "claim": "Mem0 v3 with library-driven retrieval (no agent-driven memory tool calls) achieves >=88 LoCoMo recall on a small-model setup (llama3 8B Q4 via Ollama + Mem0 v3 + faiss-cpu) at <=7000 tokens retrieved per query — within 4 points of the 91.6 LoCoMo published number on larger models.",
   "metric": "locomo_recall_score",
   "thresholds": {
     "confirm_at_least": 88.0,
@@ -11,15 +11,11 @@
     "confirm_at_most": 7000,
     "refute_above": 12000
   },
-  "workload": "locomo-conversations.jsonl",
+  "workload": "locomo10.json (fetched at run time from snap-research/locomo; CC BY-NC 4.0; not redistributed)",
   "source_for_claim": "Mem0 v3 release notes (April 2026): 91.6 LoCoMo / 93.4 LongMemEval at ~7000 tokens/retrieval on production setup",
   "comparison_anchor": "letta-tool-driven-memory-on-same-model (when implemented in sister sandbox)",
-  "decision_rule": "If CONFIRMED, library-driven retrieval is structurally aligned with the small-model thesis — proceed with v0.4 row 9 lock. If REFUTED on recall but secondary metric green, investigate Qwen3 8B's retrieval-context utilization. If REFUTED on tokens metric, the 7000 budget is overly optimistic and Effective-Context Triad's quick-look-up budget needs revisit.",
+  "decision_rule": "If CONFIRMED, library-driven retrieval is structurally aligned with the small-model thesis — proceed with v0.4 row 9 lock. If REFUTED on recall but secondary metric green, investigate llama3 8B's retrieval-context utilization. If REFUTED on tokens metric, the 7000 budget is overly optimistic and Effective-Context Triad's quick-look-up budget needs revisit.",
   "timeout_seconds": 1800,
-  "status": "INACTIVE",
-  "blocked_on": [
-    "LoCoMo conversations workload not yet downloaded into bench/workloads/",
-    "Bench harness does not yet drive Mem0 + Qwen3 8B end-to-end",
-    "Mem0 OpenMemory MCP local mode not yet packaged with the daemon"
-  ]
+  "status": "ACTIVE",
+  "blocked_on": []
 }
diff --git a/bench/pyproject.toml b/bench/pyproject.toml
index 8f7caa8..0f94d17 100644
--- a/bench/pyproject.toml
+++ b/bench/pyproject.toml
@@ -38,6 +38,9 @@ include = ["bench*"]
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 python_files = ["test_*.py"]
+markers = [
+    "requires_mem0: test requires mem0ai + sentence-transformers + faiss-cpu (Python 3.11 only locally)",
+]
 
 [tool.ruff]
 line-length = 100
diff --git a/bench/tests/test_locomo_bench_local.py b/bench/tests/test_locomo_bench_local.py
new file mode 100644
index 0000000..b16cc0d
--- /dev/null
+++ b/bench/tests/test_locomo_bench_local.py
@@ -0,0 +1,152 @@
+"""Unit tests for mem0-v3-locomo bench.py functions.
+
+Mem0-dependent tests require mem0ai + sentence-transformers + faiss-cpu and
+are marked `requires_mem0`.  They also skip on Python 3.12+ locally because
+sentence-transformers → pyarrow crashes on Python 3.13 (Windows).  The real
+verdict runs in Docker (Python 3.11) where all deps are installed cleanly.
+
+Run locally on Python 3.11 with mem0ai installed:
+    cd bench && pytest tests/test_locomo_bench_local.py -v -m requires_mem0
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Skip guard: these tests need mem0ai importable. A version gate was wrong twice
+# over — CI runs py3.11 WITHOUT mem0 installed (failed), and a future py3.12-
+# compatible mem0 would be skipped needlessly. Gate on availability, not version.
+# ---------------------------------------------------------------------------
+def _mem0_available() -> bool:
+    try:
+        import mem0  # noqa: F401
+        return True
+    except Exception:
+        return False
+
+
+_skip_py312 = pytest.mark.skipif(
+    not _mem0_available(),
+    reason="mem0ai not importable in this environment; the full path runs in the sandbox Docker (py 3.11 + pip install)",
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _load_bench():
+    modpath = (
+        Path(__file__).resolve().parents[1]
+        / "isolation" / "memory" / "mem0-v3-locomo" / "bench.py"
+    )
+    spec = importlib.util.spec_from_file_location("locomo_bench", modpath)
+    m = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(m)
+    return m
+
+
+FIXTURE_CONV = {
+    "sample_id": "fake_0",
+    "conversation": {
+        "speaker_a": "Alice",
+        "speaker_b": "Bob",
+        "session_1_date_time": "2024-01-01T10:00:00",
+        "session_1": [
+            {"speaker": "Alice", "dia_id": "D1:1", "text": "I bought a Specialized Tarmac SL7."},
+            {"speaker": "Bob",   "dia_id": "D1:2", "text": "Nice — what color?"},
+        ],
+        "session_2": [
+            {"speaker": "Alice", "dia_id": "D2:1", "text": "Matte black."},
+        ],
+    },
+    "qa": [
+        {
+            "question": "What bike did Alice buy?",
+            "answer": "Specialized Tarmac SL7",
+            "evidence": ["D1:1"],
+            "category": "single_hop",
+        }
+    ],
+}
+
+
+# ---------------------------------------------------------------------------
+# Test: session key regex — no mem0 dependency, runs on all Python versions
+# ---------------------------------------------------------------------------
+
+def test_session_regex_only_matches_pure_session_keys():
+    """Verify the _SESSION_RE pattern selects only bare session_N keys."""
+    bench = _load_bench()
+    pattern = bench._SESSION_RE
+    assert pattern.match("session_1")
+    assert pattern.match("session_10")
+    assert not pattern.match("session_1_date_time")
+    assert not pattern.match("speaker_a")
+    assert not pattern.match("session_summary")
+
+
+# ---------------------------------------------------------------------------
+# Mem0-dependent tests (require Python 3.11 + mem0ai)
+# ---------------------------------------------------------------------------
+
+@pytest.mark.requires_mem0
+@_skip_py312
+def test_ingest_skips_non_session_keys_and_counts_turns(tmp_path):
+    bench = _load_bench()
+    mem = bench.build_hermetic_mem0(faiss_dir=tmp_path / "_faiss")
+    stats = bench.ingest_conversation(mem, FIXTURE_CONV, user_id="locomo_user_0")
+    # 2 sessions (session_1 + session_2), 3 turns total
+    # speaker_a, speaker_b, session_1_date_time must be skipped
+    assert stats["turns"] == 3
+    assert stats["sessions"] == 2
+
+
+@pytest.mark.requires_mem0
+@_skip_py312
+def test_score_conversation_recall_full_when_evidence_in_top10(tmp_path):
+    bench = _load_bench()
+    mem = bench.build_hermetic_mem0(faiss_dir=tmp_path / "_faiss")
+    bench.ingest_conversation(mem, FIXTURE_CONV, user_id="t0")
+    result = bench.score_conversation(mem, FIXTURE_CONV, user_id="t0", top_k=10)
+    assert result["per_qa_recall"] == [1.0]
+    assert result["mean_recall"] == 1.0
+    assert result["tokens_p50"] > 0
+    assert len(result["per_qa_tokens"]) == 1
+
+
+@pytest.mark.requires_mem0
+@_skip_py312
+def test_token_p50_is_true_flat_list_across_conversations(tmp_path):
+    """True flat median: 1-QA conv + 3-QA conv must give 4 items, not 2."""
+    bench = _load_bench()
+    conv_a = {
+        "conversation": {"session_1": [
+            {"speaker": "A", "dia_id": "D1:1", "text": "x" * 40},
+        ]},
+        "qa": [{"question": "q", "answer": "x", "evidence": ["D1:1"], "category": "x"}],
+    }
+    conv_b = {
+        "conversation": {"session_1": [
+            {"speaker": "A", "dia_id": "D1:1", "text": "y" * 400},
+        ]},
+        "qa": [
+            {"question": "q1", "answer": "y", "evidence": ["D1:1"], "category": "x"},
+            {"question": "q2", "answer": "y", "evidence": ["D1:1"], "category": "x"},
+            {"question": "q3", "answer": "y", "evidence": ["D1:1"], "category": "x"},
+        ],
+    }
+    mem = bench.build_hermetic_mem0(faiss_dir=tmp_path / "_faiss")
+    all_tokens: list[float] = []
+    for i, c in enumerate([conv_a, conv_b]):
+        bench.ingest_conversation(mem, c, user_id=f"u{i}")
+        s = bench.score_conversation(mem, c, user_id=f"u{i}")
+        all_tokens.extend(s["per_qa_tokens"])
+    # Must have 1 + 3 = 4 items (not 2 from per-conv duplication)
+    assert len(all_tokens) == 4
diff --git a/bench/tests/test_locomo_sandbox_structure.py b/bench/tests/test_locomo_sandbox_structure.py
new file mode 100644
index 0000000..5e2e0ff
--- /dev/null
+++ b/bench/tests/test_locomo_sandbox_structure.py
@@ -0,0 +1,18 @@
+"""Structural smoke test — the mem0-v3-locomo sandbox validates correctly
+regardless of its ACTIVE/INACTIVE status.
+"""
+
+from pathlib import Path
+
+from bench.runner import load_expected
+
+
+def test_mem0_v3_locomo_sandbox_loads_expected():
+    root = Path(__file__).resolve().parents[2]
+    sandbox = root / "bench" / "isolation" / "memory" / "mem0-v3-locomo"
+    expected = load_expected(sandbox)
+    assert expected.hypothesis_id == "mem0-v3-locomo-recall"
+    # Files required for ACTIVE status must be present when status == ACTIVE
+    if expected.status == "ACTIVE":
+        assert (sandbox / "docker-compose.yml").exists()
+        assert (sandbox / "bench.py").exists()
diff --git a/docs/coverage.md b/docs/coverage.md
index 8239c3e..7100500 100644
--- a/docs/coverage.md
+++ b/docs/coverage.md
@@ -47,3 +47,4 @@ _ACTIVE sandboxes whose `spec_row` field or `source_for_claim` did not resolve t
 | Sandbox | Hypothesis | Source-for-claim hint |
 |---|---|---|
 | `vllm-q4-llama8b` | `vllm-q4-llama8b-singlestream-tps` | https://www.databasemart.com/blog/vllm-gpu-benchmark-rtx4090 |
+| `mem0-v3-locomo` | `mem0-v3-locomo-recall` | Mem0 v3 release notes (April 2026): 91.6 LoCoMo / 93.4 LongMemEval at ~7000 t... |
diff --git a/docs/metrics.md b/docs/metrics.md
index 93b54aa..db5e312 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -2,7 +2,7 @@
 
 _Auto-generated by `bench dashboard`. Do not edit by hand._
 
-**[FAIL]** 0 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 6 no-run
+**[FAIL]** 0 CONFIRMED / 0 REFUTED / 0 INCONCLUSIVE / 7 no-run
 
 | Sandbox | Hypothesis | Category | Primary metric | Latest | Threshold | Verdict | Hardware | Run |
 |---|---|---|---|---|---|---|---|---|
@@ -11,4 +11,5 @@ _Auto-generated by `bench dashboard`. Do not edit by hand._
 | `vllm-q4-llama8b` | `vllm-q4-llama8b-singlestream-tps` | inference-engines | `tokens_per_second_median_single_stream` | - | >= 100.0 | (no run yet) | `-` | - |
 | `amnesia-ab` | `amnesia-ab-memory-loop` | memory | `memory_on_fact_recall_pct` | - | >= 70.0 | (no run yet) | `-` | - |
 | `mem0-library-retrieval-recall` | `mem0-library-retrieval-recall-and-isolation` | memory | `recall_at_10` | - | >= 0.95 | (no run yet) | `-` | - |
+| `mem0-v3-locomo` | `mem0-v3-locomo-recall` | memory | `locomo_recall_score` | - | >= 88.0 | (no run yet) | `-` | - |
 | `aider-repomap-fidelity` | `aider-repomap-token-reduction-and-symbol-coverage` | retrieval | `token_reduction_pct` | - | >= 50.0 | (no run yet) | `-` | - |