Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,14 @@ jobs:
retrieval-gate:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
# pull-requests: write is needed by the trailing "Post bench summary
# pull-requests/issues write are needed by the trailing "Post bench summary
# as PR comment" step (actions/github-script). Without it the step
# 403s on issues.createComment and fails the job even when the bench
# itself passed.
permissions:
contents: read
pull-requests: write
issues: write
# Path filter: only pay the minutes-long LongMemEval tax when we're
# touching retrieval. The planning doc calls these the "top-heavy
# retrieval" hot paths — keep in sync with the plan.
Expand All @@ -155,7 +156,9 @@ jobs:
- 'src/agentmemory/rerank.py'
- 'src/agentmemory/embeddings.py'
- 'src/agentmemory/retrieval.py'
- 'src/agentmemory/retrieval/**'
- 'bin/intent_classifier.py'
- 'benchmarks/**'
- 'tests/bench/**'

- name: Set up Python
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ db/*.backup
logs/
blobs/
backups/
benchmarks/results/
benchmarks/training_data/
src/agentmemory/retrieval/models/*.json
.vs/
.DS_Store
/tmp/
*.swp
Expand Down
33 changes: 22 additions & 11 deletions bin/intent_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ class IntentResult:
# The final merged list fed to --tables is primary + secondary (de-duped).
_TABLE_ROUTES = {
"cross_reference": ["events", "memories", "context"],
"troubleshooting": ["events", "memories", "context"],
"troubleshooting": ["events", "memories", "context", "decisions"],
"task_status": ["events", "context", "memories"],
"entity_lookup": ["memories", "context", "events"], # entities not in universal search pipeline
"historical_timeline":["events", "context", "memories"],
"how_to": ["memories", "context"],
"decision_rationale": ["memories", "context", "events"],
"entity_lookup": ["memories", "events", "context"],
"historical_timeline":["events", "memories", "context"],
"how_to": ["memories", "context", "events", "decisions"],
"decision_rationale": ["decisions", "memories", "context", "events"],
"research_concept": ["memories", "context"],
"orientation": ["memories", "events", "context"],
"factual_lookup": ["memories", "context", "events"], # same as default
"factual_lookup": ["memories", "entities", "decisions", "context", "events"],
}

_FORMAT_HINTS = {
Expand Down Expand Up @@ -81,6 +81,17 @@ class IntentResult:
# Pre-compiled, case-insensitive patterns shared by the intent rules.

# "wave 3", "wave12", ... (the word "wave" followed by a number).
_WAVE_RE = re.compile(r'\bwave\s*\d+\b', re.IGNORECASE)
# "how to ...", "how do ...", "how does ...", "how can ...", "how should ...".
_HOW_RE = re.compile(r'\bhow\s+(to|do|does|can|should)\b', re.IGNORECASE)
# A standalone "why".
_WHY_RE = re.compile(r'\bwhy\b', re.IGNORECASE)
# Procedural vocabulary that routes a query to the how-to rule even without an
# explicit "how ..." phrasing.
# NOTE: "deploy(?:ments?)?" matches "deploy", "deployment" and "deployments".
# The earlier spelling "deployment?" only made the final "t" optional, so it
# matched the non-word "deploymen" and missed both "deploy" and "deployments".
_PROCEDURAL_RE = re.compile(
    r'\b('
    r'runbook|playbook|rollback|roll back|procedure|workflow|steps?|'
    r'migrate|deploy(?:ments?)?|troubleshoot|debug'
    r')\b',
    re.IGNORECASE,
)
# Ownership / role / preference phrasing that signals an entity-fact question
# ("who owns ...", "what does ...", "maintainer", "prefers", "responsible").
_ENTITY_FACT_RE = re.compile(
    r'\b('
    r'who(?:\s+is|\s+owns?)?|'
    r'what\s+does|'
    r'owner|maintainer|reviewer|assignee|'
    r'prefers?|preference|'
    r'role|responsible'
    r')\b',
    re.IGNORECASE,
)
# First-person/identity statement (Hermes memory dumps stored as queries)
_IDENTITY_STMT_RE = re.compile(
r'^(I |My |The vault|Chief wakes|Continuity is|Tasks that|Learn the|'
Expand Down Expand Up @@ -157,12 +168,12 @@ def classify_intent(query: str) -> IntentResult:
)

# ---- Rule 4: How-to ----
if _HOW_RE.search(q):
if _HOW_RE.search(q) or _PROCEDURAL_RE.search(q):
return IntentResult(
intent="how_to",
confidence=0.88,
tables=_TABLE_ROUTES["how_to"],
matched_rule="how_to_regex",
matched_rule="how_to_regex" if _HOW_RE.search(q) else "procedural_kw_regex",
format_hint=_FORMAT_HINTS["how_to"],
)

Expand Down Expand Up @@ -264,14 +275,14 @@ def classify_intent(query: str) -> IntentResult:
# Note: 'agent', 'assigned' here can be intentionally claimed earlier by
# Rule 2 (troubleshooting) or Rule 3 (task_status); that's the richer
# external taxonomy winning over the builtin's broader bucket.
_ENTITY_KW = ["who ", "person", "agent", "team", "assigned"]
_ENTITY_KW = ["who ", "person", "agent", "team", "assigned", "owner", "maintainer", "reviewer", "preference", "prefer"]
hit = _kw(ql, _ENTITY_KW)
if hit:
if hit or _ENTITY_FACT_RE.search(q):
return IntentResult(
intent="entity_lookup",
confidence=0.80,
tables=_TABLE_ROUTES["entity_lookup"],
matched_rule=f"entity_kw:{hit.strip()}",
matched_rule=f"entity_kw:{(hit or 'entity_fact_regex').strip()}",
format_hint=_FORMAT_HINTS["entity_lookup"],
)
if _PROPER_NOUN_ALONE_RE.match(q):
Expand Down
31 changes: 31 additions & 0 deletions docs/RERANKER.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,37 @@ Models load from the Hugging Face Hub on first use (cached at
`~/.cache/huggingface/`). After the first call the model is held in
the per-process module cache.

## Second-stage tiny MLP artifact policy

The local second-stage reranker can optionally load a tiny JSON MLP artifact
from `src/agentmemory/retrieval/models/tiny_mlp_v1.json`, or from an explicit
path passed through the internal reranker configuration. That artifact is not
checked into git. If the file is absent, the second-stage path falls back to
the deterministic heuristic slate scorer and search remains fully functional.

The fallback is implemented in `src/agentmemory/retrieval/second_stage.py`:
`rerank_top_candidates()` calls `TinyMLPModel.try_load(...)`; when that returns
`None`, the MLP score vector is all zeros and `_heuristic_score()` plus
`_rerank_slate()` produce the final deterministic listwise order. No network,
model download, or checked-in weight file is required for the default path.

This keeps the default package local-first and reviewable:

- no mandatory network fetch,
- no opaque weights bundled in source,
- no hard dependency on numpy at import time,
- no failure when the model artifact is unavailable.

Training and calibration scripts live under `benchmarks/` and emit JSON
artifacts into git-ignored output directories (`benchmarks/results/` and
`benchmarks/training_data/`). If a trained artifact is published later, it
should be attached as a release asset or Git LFS object with a short provenance
record containing the source commit, training bundle, feature version, and
held-out metrics.

Benchmark numbers reported by a PR must state whether they were produced with
an external MLP artifact present. If no artifact path is supplied and
`tiny_mlp_v1.json` is absent, those numbers are heuristic-fallback numbers.

## Latency / quality tradeoff

Measured on Apple Silicon M-series, CPU only (no MPS), Python 3.14,
Expand Down
47 changes: 47 additions & 0 deletions docs/RETRIEVAL_VALIDATION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Retrieval Validation Slices

This PR keeps benchmark headline numbers provisional until two non-benchmark
checks are run alongside the LongMemEval/LoCoMo/MemBench comparison pack.

## Held-out Non-benchmark Slice

`tests/test_retrieval_validation_slices.py` seeds hand-labeled retrieval cases
that are not copied from LongMemEval, LoCoMo, or MemBench. They use ordinary
brainctl-style facts:

- ownership of a signer-key checklist;
- offline verification of signed exports;
- temporal "after outage" evidence.

The test compares raw candidate order against the full second-stage reranker
and asserts that the full path does not demote the gold candidate. In the
current deterministic slice, full reranking keeps or improves every case and
lands `3/3` gold candidates at rank 1.

## Exact / Field-aware Ablation Slice

The same test module includes a non-synthetic role-fact case:

```text
query: What is Arlo's role in group alpha?
answer evidence: Arlo is the quartermaster for group alpha.
```

Raw candidate order places a semantically similar distractor above the answer.
The field-aware value-pattern feature promotes the answer to rank 1 without
using synthetic IDs, benchmark fixture keys, or gold labels. This is intended
to separate the genuinely useful exact/field-aware behavior from gains that
only come from role IDs tightly coupled to the MemBench generator.

## Current Local Validation

```powershell
$env:PYTHONPATH=(Resolve-Path .\src)
python -m pytest tests\test_retrieval_validation_slices.py -q
```

Result: `2 passed`.

These slices are small by design. They are a review-time guard against obvious
metric-shape overfitting, not a substitute for a larger real `brain.db` query
sample before un-drafting the retrieval PR.
Loading