TSchonleber · Velamj · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 25, 2026
@@ -130,13 +130,14 @@ jobs:
   retrieval-gate:
     runs-on: ubuntu-latest
     if: github.event_name == 'pull_request'
-    # pull-requests: write is needed by the trailing "Post bench summary
+    # pull-requests/issues write are needed by the trailing "Post bench summary
     # as PR comment" step (actions/github-script). Without them the step
     # 403s on issues.createComment and fails the job even when the bench
     # itself passed.
     permissions:
       contents: read
       pull-requests: write
+      issues: write
     # Path filter: only pay the minutes-long LongMemEval tax when we're
     # touching retrieval. The planning doc calls these the "top-heavy
     # retrieval" hot paths — keep in sync with the plan.
@@ -155,7 +156,9 @@ jobs:
               - 'src/agentmemory/rerank.py'
               - 'src/agentmemory/embeddings.py'
               - 'src/agentmemory/retrieval.py'
+              - 'src/agentmemory/retrieval/**'
               - 'bin/intent_classifier.py'
+              - 'benchmarks/**'
               - 'tests/bench/**'
 
       - name: Set up Python

diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,10 @@ db/*.backup
 logs/
 blobs/
 backups/
+benchmarks/results/
+benchmarks/training_data/
+src/agentmemory/retrieval/models/*.json
+.vs/
 .DS_Store
 /tmp/
 *.swp

diff --git a/bin/intent_classifier.py b/bin/intent_classifier.py
@@ -43,15 +43,15 @@ class IntentResult:
 # The final merged list fed to --tables is primary + secondary (de-duped).
 _TABLE_ROUTES = {
     "cross_reference":    ["events", "memories", "context"],
-    "troubleshooting":    ["events", "memories", "context"],
+    "troubleshooting":    ["events", "memories", "context", "decisions"],
     "task_status":        ["events", "context", "memories"],
-    "entity_lookup":      ["memories", "context", "events"],   # entities not in universal search pipeline
-    "historical_timeline":["events", "context", "memories"],
-    "how_to":             ["memories", "context"],
-    "decision_rationale": ["memories", "context", "events"],
+    "entity_lookup":      ["memories", "events", "context"],
+    "historical_timeline":["events", "memories", "context"],
+    "how_to":             ["memories", "context", "events", "decisions"],
+    "decision_rationale": ["decisions", "memories", "context", "events"],
     "research_concept":   ["memories", "context"],
     "orientation":        ["memories", "events", "context"],
-    "factual_lookup":     ["memories", "context", "events"],   # same as default
+    "factual_lookup":     ["memories", "entities", "decisions", "context", "events"],
 }
 
 _FORMAT_HINTS = {
@@ -81,6 +81,17 @@ class IntentResult:
 _WAVE_RE = re.compile(r'\bwave\s*\d+\b', re.IGNORECASE)
 _HOW_RE = re.compile(r'\bhow\s+(to|do|does|can|should)\b', re.IGNORECASE)
 _WHY_RE = re.compile(r'\bwhy\b', re.IGNORECASE)
+_PROCEDURAL_RE = re.compile(r'\b(runbook|playbook|rollback|roll back|procedure|workflow|steps?|migrate|deployment?|troubleshoot|debug)\b', re.IGNORECASE)
+_ENTITY_FACT_RE = re.compile(
+    r'\b('
+    r'who(?:\s+is|\s+owns?)?|'
+    r'what\s+does|'
+    r'owner|maintainer|reviewer|assignee|'
+    r'prefers?|preference|'
+    r'role|responsible'
+    r')\b',
+    re.IGNORECASE,
+)
 # First-person/identity statement (Hermes memory dumps stored as queries)
 _IDENTITY_STMT_RE = re.compile(
     r'^(I |My |The vault|Chief wakes|Continuity is|Tasks that|Learn the|'
@@ -157,12 +168,12 @@ def classify_intent(query: str) -> IntentResult:
         )
 
     # ---- Rule 4: How-to ----
-    if _HOW_RE.search(q):
+    if _HOW_RE.search(q) or _PROCEDURAL_RE.search(q):
         return IntentResult(
             intent="how_to",
             confidence=0.88,
             tables=_TABLE_ROUTES["how_to"],
-            matched_rule="how_to_regex",
+            matched_rule="how_to_regex" if _HOW_RE.search(q) else "procedural_kw_regex",
             format_hint=_FORMAT_HINTS["how_to"],
         )
 
@@ -264,14 +275,14 @@ def classify_intent(query: str) -> IntentResult:
     # Note: 'agent', 'assigned' here can be intentionally claimed earlier by
     # Rule 2 (troubleshooting) or Rule 3 (task_status); that's the richer
     # external taxonomy winning over the builtin's broader bucket.
-    _ENTITY_KW = ["who ", "person", "agent", "team", "assigned"]
+    _ENTITY_KW = ["who ", "person", "agent", "team", "assigned", "owner", "maintainer", "reviewer", "preference", "prefer"]
     hit = _kw(ql, _ENTITY_KW)
-    if hit:
+    if hit or _ENTITY_FACT_RE.search(q):
         return IntentResult(
             intent="entity_lookup",
             confidence=0.80,
             tables=_TABLE_ROUTES["entity_lookup"],
-            matched_rule=f"entity_kw:{hit.strip()}",
+            matched_rule=f"entity_kw:{(hit or 'entity_fact_regex').strip()}",
             format_hint=_FORMAT_HINTS["entity_lookup"],
         )
     if _PROPER_NOUN_ALONE_RE.match(q):

diff --git a/docs/RERANKER.md b/docs/RERANKER.md
@@ -103,6 +103,37 @@ Models load from the Hugging Face Hub on first use (cached at
 `~/.cache/huggingface/`). After the first call the model is held in
 the per-process module cache.
 
+## Second-stage tiny MLP artifact policy
+
+The local second-stage reranker can optionally load a tiny JSON MLP artifact
+from `src/agentmemory/retrieval/models/tiny_mlp_v1.json`, or from an explicit
+path passed through the internal reranker configuration. That artifact is not
+checked into git. If the file is absent, the second-stage path falls back to
+the deterministic heuristic slate scorer and search remains fully functional.
+
+The fallback is implemented in `src/agentmemory/retrieval/second_stage.py`:
+`rerank_top_candidates()` calls `TinyMLPModel.try_load(...)`; when that returns
+`None`, the MLP score vector is all zeros and `_heuristic_score()` plus
+`_rerank_slate()` produce the final deterministic listwise order. No network,
+model download, or checked-in weight file is required for the default path.
+
+This keeps the default package local-first and reviewable:
+
+- no mandatory network fetch,
+- no opaque weights bundled in source,
+- no hard dependency on numpy at import time,
+- no failure when the model artifact is unavailable.
+
+Training and calibration scripts live under `benchmarks/` and emit JSON
+artifacts into git-ignored benchmark/training output directories. If a trained
+artifact is published later, it should be attached as a release asset or LFS
+object with a short provenance record containing the source commit, training
+bundle, feature version, and held-out metrics.
+
+Benchmark numbers reported by a PR must state whether they were produced with
+an external MLP artifact present. If no artifact path is supplied and
+`tiny_mlp_v1.json` is absent, those numbers are heuristic-fallback numbers.
+
 ## Latency / quality tradeoff
 
 Measured on Apple Silicon M-series, CPU only (no MPS), Python 3.14,

diff --git a/docs/RETRIEVAL_VALIDATION.md b/docs/RETRIEVAL_VALIDATION.md
@@ -0,0 +1,47 @@
+# Retrieval Validation Slices
+
+This PR keeps benchmark headline numbers provisional until two non-benchmark
+checks are run alongside the LongMemEval/LoCoMo/MemBench comparison pack.
+
+## Held-out Non-benchmark Slice
+
+`tests/test_retrieval_validation_slices.py` seeds hand-labeled retrieval cases
+that are not copied from LongMemEval, LoCoMo, or MemBench. They use ordinary
+brainctl-style facts:
+
+- ownership of a signer-key checklist;
+- offline verification of signed exports;
+- temporal "after outage" evidence.
+
+The test compares raw candidate order against the full second-stage reranker
+and asserts that the full path does not demote the gold candidate. In the
+current deterministic slice, full reranking keeps or improves every case and
+lands `3/3` gold candidates at rank 1.
+
+## Exact / Field-aware Ablation Slice
+
+The same test module includes a non-synthetic role-fact case:
+
+```text
+query: What is Arlo's role in group alpha?
+answer evidence: Arlo is the quartermaster for group alpha.
+```
+
+Raw candidate order places a semantically similar distractor above the answer.
+The field-aware value-pattern feature promotes the answer to rank 1 without
+using synthetic IDs, benchmark fixture keys, or gold labels. This is intended
+to separate the useful exact/field-aware behavior from role IDs that are
+tightly coupled to the MemBench generator.
+
+## Current Local Validation
+
+```powershell
+$env:PYTHONPATH=(Resolve-Path .\src)
+python -m pytest tests\test_retrieval_validation_slices.py -q
+```
+
+Result: `2 passed`.
+
+These slices are small by design. They are a review-time guard against obvious
+metric-shape overfitting, not a substitute for a larger real `brain.db` query
+sample before un-drafting the retrieval PR.