From 952481197c5036cef60eb128c37981ae7e6dbfeb Mon Sep 17 00:00:00 2001
From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com>
Date: Sat, 2 May 2026 18:32:45 +0530
Subject: [PATCH] feat(bonsai): emit mention + entity + refers_to via resolver

PR B of the entity-dedup track. BonsaiIngestor now writes the
mention/entity model from PR A (#186) instead of the legacy
ent:slug-keyed entities. Drops backward compat per the user's
explicit direction (graphstore not yet shipped widely; clean break
preferred).

## What changed

- ParsedTurn: entities tuples now (slug, name) without legacy ent:
  prefix. New entity_edges field holds (from_slug, to_slug, kind)
  pushed by _h_edge for resolver-time slug -> entity_id mapping.
- _h_upsert: stores bare slug.
- _h_edge: appends to entity_edges instead of pre-rendering CREATE
  EDGE strings. Synthesizer maps slugs to resolved entity_ids and
  emits the edge between entities.
- _synthesize_dsl: takes optional gs= for resolver lookup. For each
  entity slug:
    CREATE NODE mention:{msg}:{slug}:{n} kind="mention"
    if new entity: CREATE NODE entity:{uuid} kind="entity"
    else:          INCREMENT NODE entity:{id} mention_count BY 1
    CREATE EDGE mention -> entity kind="refers_to" confidence=N
    CREATE EDGE message -> mention kind="mentions"
  Then renders entity_edges by mapping slugs to resolved ids;
  drops edges referencing slugs not in the turn map (logged).
  Post-pass _rewrite_ent_refs replaces literal ent:slug references
  in retrieval verbs (RECALL/PATH/ANCESTORS/SUBGRAPH/COMMON) with
  the resolved entity_id from the same turn.
- _ingest_locked: passes self._gs into _synthesize_dsl (None on
  dry_run so tests do not need a live store).

Resolver runs once per slug per turn. Pre-filter by exact name
match keeps the candidate set tiny (typically 0 or 1); embedding
disambiguation only fires on collision.

## Comment cleanup

Stripped narration-style comments from entity_resolver.py per
CLAUDE.md Rule 9 (only WHY-comments where logic is non-obvious;
let names + structure do the explaining). Same pass on the new
synthesizer code.

## Test plan

  pytest tests/test_bonsai_ingestor.py tests/test_entity_resolver.py
    -> 113 passed
  pytest --tb=short -q --ignore=tests/test_server.py
                       --ignore=tests/test_e2e_real_embedder.py
    -> 1950 passed, 102 skipped (was 1948; +2 net new from the
       entity_edges/unknown-slug coverage)

Updated tests:
  - parse tests: turn.entities tuples now bare-slug
  - test_parse_edge_emits_create_edge: asserts turn.entity_edges
  - test_parse_edge_escapes_quotes_in_ids: same
  - test_synthesize_with_entities_emits_mention_entity_and_refers_to
    (renamed from _emits_upsert_plus_matching_edge)
  - test_synthesize_dedupes_duplicate_entities: counts mention/
    entity/refers_to instead of UPSERTs
  - test_synthesize_emits_entity_edges_after_resolution: new
  - test_synthesize_drops_entity_edge_with_unknown_slug: new
  - test_synthesize_all_together_contract: kind-presence assertions

## Sequence

- PR A (merged) - foundation
- PR B (this) - BonsaiIngestor rewrite
- PR C - LoCoMo adapter + re-bench against snap-research scoring
- PR D - SYS RESOLVE MENTIONS for batch re-resolution
---
 src/graphstore/bonsai_ingestor.py | 164 +++++++++++++++++++-----
 src/graphstore/entity_resolver.py | 201 ++++--------------------------
 tests/test_bonsai_ingestor.py     | 135 +++++++++++++-------
 3 files changed, 250 insertions(+), 250 deletions(-)

diff --git a/src/graphstore/bonsai_ingestor.py b/src/graphstore/bonsai_ingestor.py
index f11b5c2..f4c57e1 100644
--- a/src/graphstore/bonsai_ingestor.py
+++ b/src/graphstore/bonsai_ingestor.py
@@ -251,9 +251,23 @@ def _scrape_belief_updates(
 
 @dataclass
 class ParsedTurn:
-    """Parsed structured output of one @-verb LLM call."""
+    """Parsed structured output of one @-verb LLM call.
+
+    entities holds (slug, surface_name) tuples - the slug is the bare
+    identifier the model emitted (without the legacy ``ent:`` prefix),
+    surface_name is the human-readable form ("Alice", "OpenAI"). The
+    synthesizer turns each into a mention node + a refers_to edge to a
+    resolved entity (see ``graphstore.entity_resolver``).
+
+    entity_edges holds (from_slug, to_slug, kind) tuples for @EDGE
+    output between two entity slugs. Synthesizer maps each slug to the
+    resolved entity_id and emits the edge between entity nodes.
+    Edge handlers append here instead of `statements` so the slug ->
+    entity_id rewrite happens after resolver runs.
+    """
 
     entities: list[tuple[str, str]] = field(default_factory=list)
+    entity_edges: list[tuple[str, str, str]] = field(default_factory=list)
     beliefs: list[tuple[str, str]] = field(default_factory=list)
     retracts: list[str] = field(default_factory=list)
     statements: list[str] = field(default_factory=list)
@@ -265,14 +279,20 @@ def _dsl_escape(s: str) -> str:
 
 
 def _h_upsert(turn: ParsedTurn, ln: str) -> None:
-    """U <slug> <Name ...>"""
+    """U <slug> <Name ...>
+
+    Stores (slug, surface_name). The synthesizer turns this into a
+    mention node + entity node (via the resolver) + refers_to edge.
+    Slug is stored bare; legacy ``ent:`` prefix on input is stripped
+    so the model can emit either form.
+    """
     parts = ln.split(None, 2)
     if len(parts) < 3:
         return
     slug = parts[1].removeprefix("ent:").strip('"')
     name = parts[2].strip().strip('"')
     if slug and name:
-        turn.entities.append((f"ent:{slug}", name))
+        turn.entities.append((slug, name))
 
 
 def _h_fact(turn: ParsedTurn, ln: str) -> None:
@@ -297,18 +317,27 @@ def _h_drop(turn: ParsedTurn, ln: str) -> None:
 
 
 def _h_edge(turn: ParsedTurn, ln: str) -> None:
-    """E <from_id> <to_id> <kind>  (ids include ent:/fact: prefix)"""
+    """E <from_id> <to_id> <kind>  (ids may include legacy ent:/fact: prefix)
+
+    Pushes (from_slug, to_slug, kind) to ``turn.entity_edges`` so the
+    synthesizer can map each slug to the resolved entity_id before
+    materializing the edge. The legacy ``ent:`` prefix is stripped so
+    the slug can be looked up in the per-turn slug -> entity map. Bare
+    fact:/msg: ids are passed through unchanged - they reference nodes
+    that already exist by literal id.
+    """
     parts = ln.split()
     if len(parts) < 4:
         return
     from_id = parts[1].strip('"')
     to_id = parts[2].strip('"')
     kind = parts[3].strip('"')
-    if from_id and to_id and kind:
-        turn.statements.append(
-            f'CREATE EDGE "{_dsl_escape(from_id)}" -> "{_dsl_escape(to_id)}" '
-            f'kind = "{_dsl_escape(kind)}"'
-        )
+    if not (from_id and to_id and kind):
+        return
+    # Strip legacy "ent:" prefix; remap to entity ids during synthesis.
+    from_slug = from_id.removeprefix("ent:")
+    to_slug = to_id.removeprefix("ent:")
+    turn.entity_edges.append((from_slug, to_slug, kind))
 
 
 def _h_query(template: str):
@@ -903,27 +932,24 @@ def _synthesize_dsl(
     session_id: str,
     role: str,
     text: str,
+    gs: Any | None = None,
 ) -> list[str]:
-    """Build the full DSL statement list from the parsed compact output.
-
-    Deterministic. Emits in order:
-      1. CREATE NODE for the message (DOCUMENT = user text).
-      2. UPSERT NODE per entity + matching CREATE EDGE kind = "mentions".
-         Entities are deduped by id (first wins).
-      3. RETRACT per retract (before any ASSERT).
-      4. ASSERT per belief.
-      5. Pre-rendered statements (edges, queries, walks, sys ops) verbatim.
-    """
+    """Render parsed @-verbs to DSL. ``gs`` enables resolver lookup;
+    when None (dry-run, tests) every mention mints a fresh entity."""
+    from graphstore.entity_resolver import (
+        EDGE_REFERS_TO, KIND_ENTITY, KIND_MENTION,
+        make_entity_id, make_mention_id, resolve_mention,
+    )
+
     out: list[str] = []
     text_esc = _dsl_escape(text)
     session_esc = _dsl_escape(session_id)
     role_esc = _dsl_escape(role)
     msg_esc = _dsl_escape(msg_id)
 
-    # Store the text in both `content` (for baseline retrieval adapters that
-    # scan the node column) and `DOCUMENT` (for the vector + BM25 pipeline).
-    # The two-liner duplication costs ~1 string interning but makes Bonsai-
-    # ingested messages directly comparable to the deterministic NER adapter.
+    # `content` mirrors `DOCUMENT` so adapters that scan typed columns
+    # (deterministic NER baseline) compare apples-to-apples with the
+    # vector + BM25 pipeline that reads `DOCUMENT`.
     out.append(
         f'CREATE NODE "{msg_esc}" kind = "message" '
         f'session = "{session_esc}" role = "{role_esc}" '
@@ -931,19 +957,62 @@ def _synthesize_dsl(
         f'DOCUMENT "{text_esc}"'
     )
 
-    ordered_ents: list[str] = []
-    seen_ents: set[str] = set()
-    for ent_id, name in turn.entities:
-        if ent_id in seen_ents:
+    slug_to_entity: dict[str, str] = {}
+
+    seen_slugs: set[str] = set()
+    for occurrence, (slug, name) in enumerate(turn.entities):
+        if slug in seen_slugs:
             continue
-        seen_ents.add(ent_id)
-        ordered_ents.append(ent_id)
+        seen_slugs.add(slug)
+
+        mention_id = make_mention_id(msg_id, slug, occurrence)
+        mention_esc = _dsl_escape(mention_id)
+        name_esc = _dsl_escape(name)
+
+        if gs is not None:
+            try:
+                resolved = resolve_mention(gs, surface_name=name, context=text)
+                entity_id = resolved.entity_id
+                is_new = resolved.is_new_entity
+                confidence = resolved.confidence
+            except Exception as e:  # pragma: no cover
+                _log.warning("entity_resolver failed for %r: %s", name, e)
+                entity_id = make_entity_id()
+                is_new = True
+                confidence = 1.0
+        else:
+            entity_id = make_entity_id()
+            is_new = True
+            confidence = 1.0
+
+        slug_to_entity[slug] = entity_id
+        entity_esc = _dsl_escape(entity_id)
+
+        out.append(
+            f'CREATE NODE "{mention_esc}" kind = "{KIND_MENTION}" '
+            f'surface_name = "{name_esc}" source_msg = "{msg_esc}" '
+            f'session = "{session_esc}" '
+            f'DOCUMENT "{name_esc} | {text_esc}"'
+        )
+
+        if is_new:
+            out.append(
+                f'CREATE NODE "{entity_esc}" kind = "{KIND_ENTITY}" '
+                f'canonical_name = "{name_esc}" mention_count = 1 '
+                f'context = "{text_esc}" '
+                f'DOCUMENT "{name_esc} | {text_esc}"'
+            )
+        else:
+            out.append(
+                f'INCREMENT NODE "{entity_esc}" mention_count BY 1'
+            )
+
         out.append(
-            f'UPSERT NODE "{_dsl_escape(ent_id)}" kind = "entity" name = "{_dsl_escape(name)}"'
+            f'CREATE EDGE "{mention_esc}" -> "{entity_esc}" '
+            f'kind = "{EDGE_REFERS_TO}" confidence = {confidence:.3f}'
         )
-    for ent_id in ordered_ents:
         out.append(
-            f'CREATE EDGE "{msg_esc}" -> "{_dsl_escape(ent_id)}" kind = "mentions"'
+            f'CREATE EDGE "{msg_esc}" -> "{mention_esc}" kind = "mentions"'
         )
 
     for fact_id in turn.retracts:
@@ -957,11 +1026,39 @@ def _synthesize_dsl(
             f'value = "{_dsl_escape(value)}" CONFIDENCE 0.9 SOURCE "{msg_esc}"'
         )
 
-    out.extend(turn.statements)
+    for from_slug, to_slug, edge_kind in turn.entity_edges:
+        from_eid = slug_to_entity.get(from_slug)
+        to_eid = slug_to_entity.get(to_slug)
+        if not (from_eid and to_eid):
+            _log.warning(
+                "bonsai: dropping @EDGE %r -> %r (slug not in turn map)",
+                from_slug, to_slug,
+            )
+            continue
+        out.append(
+            f'CREATE EDGE "{_dsl_escape(from_eid)}" -> "{_dsl_escape(to_eid)}" '
+            f'kind = "{_dsl_escape(edge_kind)}"'
+        )
+
+    # Retrieval verbs (@RECALL/@PATH/@ANCESTORS/...) emit literal
+    # `ent:slug` ids. After the refactor those nodes do not exist;
+    # rewrite each reference to the matching entity_id from the turn
+    # map, falling back to the literal slug when unknown so debug
+    # output remains traceable.
+    for stmt in turn.statements:
+        out.append(_rewrite_ent_refs(stmt, slug_to_entity))
 
     return out
 
 
+def _rewrite_ent_refs(stmt: str, slug_to_entity: dict[str, str]) -> str:
+    def _sub(match: re.Match) -> str:
+        slug = match.group(1)
+        eid = slug_to_entity.get(slug)
+        return f'"{eid}"' if eid else f'"ent:{slug}"'
+    return _ENT_FROM_ID_RE.sub(_sub, stmt)
+
+
 def _render_known_facts_block(facts: dict[str, FactState], max_facts: int = 40) -> str:
     """Format non-retracted facts into a block the LLM reads before the input.
 
@@ -1397,6 +1494,7 @@ def _ingest_locked(
         turn = _parse_verb_output(cleaned)
         deduped = _synthesize_dsl(
             turn, msg_id=msg_id, session_id=session_id, role=role, text=text,
+            gs=None if dry_run else self._gs,
         )
         dup_dropped: list[tuple[str, str]] = []
 
diff --git a/src/graphstore/entity_resolver.py b/src/graphstore/entity_resolver.py
index d599796..fde1b17 100644
--- a/src/graphstore/entity_resolver.py
+++ b/src/graphstore/entity_resolver.py
@@ -1,56 +1,11 @@
-"""Entity resolution: separate mention from identity.
-
-When ingesting a mention of "Alice", "Maria", or any other proper noun
-extracted from natural-language text, naming the canonical-entity node
-after the surface form (``ent:alice``) breaks down across conversations:
-two different humans named "Alice" collide; the same human mentioned
-across sessions either collides into one node by accident or - if write
-semantics fail - drops the second mention entirely.
-
-This module implements the production-grade fix used by Wikidata,
-Microsoft GraphRAG, and entity-resolution pipelines elsewhere: separate
-**mention** (an observation: "Alice was mentioned at message m1, char 42,
-within this surrounding sentence") from **entity** (a hypothesis about
-identity: "this specific Alice the data model thinks exists, with auto-
-generated id ``entity:c4f8a3``").
-
-Mentions are immutable, location-keyed, never collide.
-Entities are revisable, auto-id, can be merged or split as evidence accumulates.
-A ``refers_to`` edge with confidence connects mention to entity.
-
-Resolver workflow at write time:
-
-  1. **Name match** (cheap precondition). Find all existing ``entity``
-     nodes whose ``canonical_name`` matches the new mention's surface
-     name (case-folded equality - tighten if false positives appear).
-  2. **Empty match → new entity.** Generate ``entity:{uuid4-hex}``,
-     caller materializes it.
-  3. **Single match → unambiguous link.** Return that entity_id with
-     confidence=1.0 (no ambiguity to resolve).
-  4. **Multiple matches → embedding disambiguation.** Compute cosine
-     between the mention's context embedding and each candidate
-     entity's accumulated-context embedding. Pick the highest. If the
-     best score is above ``threshold_high``, return it. Otherwise
-     return a new entity_id (the contexts diverged enough that this is
-     probably a different human with the same name).
-
-This is **correct** because:
-  - Mentions never collide (location-keyed).
-  - Entities discovered, not asserted - cluster by accumulated evidence.
-  - Confidence preserved end-to-end. A weak ``refers_to`` edge is
-    revisable in light of later evidence without losing the original
-    mention.
-  - Same human mentioned 1000 times across 50 conversations =
-    1000 mention nodes + 1 entity node.
-  - Two genuinely-different "Alice"s with diverging context =
-    N mention nodes + 2 entity nodes, automatically.
-
-It is also **cheap**:
-  - Pre-filter by exact name match keeps the candidate set small
-    (typically 0 or 1 entity per surface name).
-  - Embedding disambiguation only fires on the rare collision case
-    and reuses the embedder graphstore already has loaded.
-  - All ANN lookups are O(log n) on the existing usearch index.
+"""Entity resolution: mention vs identity.
+
+Resolver returns the entity_id a new mention should attach to.
+Algorithm: filter existing entities by case-folded name match; with one
+candidate link unambiguously; with several, pick the embedding-closest
+context above ``threshold_high`` else mint a new entity. False-merge
+is unrecoverable, false-split is reversible via MERGE - thus the
+conservative threshold.
 """
 from __future__ import annotations
 
@@ -62,17 +17,8 @@
 
 _log = logging.getLogger(__name__)
 
-# How conservative are we about merging? `threshold_high` is the cosine
-# above which we confidently link a new mention to an existing entity
-# of the same name. Below that, even with name match, we err on the
-# side of creating a new entity - false-merge is worse than
-# false-split because MERGE is reversible (DELETE EDGE + UPSERT) while
-# silently conflated identities become impossible to untangle once
-# downstream beliefs accumulate.
 DEFAULT_HIGH_THRESHOLD = 0.85
 
-# Schema constants. Use these as the canonical kind values + edge
-# label everywhere; downstream readers should not hard-code strings.
 KIND_MENTION = "mention"
 KIND_ENTITY = "entity"
 EDGE_REFERS_TO = "refers_to"
@@ -80,77 +26,39 @@
 
 @dataclass(frozen=True)
 class ResolvedMention:
-    """Outcome of ``resolve_mention()``.
-
-    Resolver does NOT mutate the store. Caller is expected to:
-      1. CREATE the mention node (if it doesn't exist yet)
-      2. If ``is_new_entity``: CREATE the entity node with id ``entity_id``
-      3. CREATE EDGE mention_id -[refers_to confidence=...]-> entity_id
-
-    Doing those writes outside the resolver keeps the resolver pure
-    (testable without a graph), idempotent, and side-effect-free.
-    """
+    """Caller materializes the mention node, the entity node when
+    ``is_new_entity``, and the refers_to edge with ``confidence``."""
 
-    entity_id: str  # always a valid id; either existing or freshly minted
-    confidence: float  # 1.0 for new + unambiguous match, 0..1 for disambig
-    is_new_entity: bool  # caller must CREATE NODE for the entity
-    canonical_name: str  # name to seed on a new entity (== surface_name)
-    candidates_seen: int  # how many same-name entities were considered
-    notes: list[str]  # human-readable trace of resolver decisions
-
-
-# ---------------------------------------------------------------------
-# Name normalization
-# ---------------------------------------------------------------------
+    entity_id: str
+    confidence: float
+    is_new_entity: bool
+    canonical_name: str
+    candidates_seen: int
+    notes: list[str]
 
 
 _NAME_NORMALIZE_RE = re.compile(r"[^a-z0-9]+")
 
 
 def normalize_name(name: str) -> str:
-    """Lowercase + collapse non-alphanumerics to nothing.
-
-    "Alice Smith" -> "alicesmith". "alice@stripe" -> "alicestripe".
-    Aggressive on purpose - we want "alice", "Alice", "ALICE", "Alice "
-    to all hash to the same name match. False-positive risk (e.g.
-    "Alice S." vs "Alice S") is fine; we disambiguate by embedding
-    score after.
-    """
+    """Case-fold + strip non-alphanumerics. "Alice S." == "alice s"."""
     return _NAME_NORMALIZE_RE.sub("", name.lower())
 
 
 def make_entity_id(prefix: str = "entity") -> str:
-    """Generate a fresh entity id. UUID4-hex first 12 chars - long
-    enough for collision-resistance at billions of entities, short
-    enough to read in logs."""
     return f"{prefix}:{uuid.uuid4().hex[:12]}"
 
 
-# ---------------------------------------------------------------------
-# Candidate lookup
-# ---------------------------------------------------------------------
-
-
 def _candidates_by_name(gs: Any, surface_name: str) -> list[dict]:
-    """Return all entity nodes whose canonical_name normalizes equal
-    to surface_name's normalized form.
-
-    Uses the structured-column path (NODES WHERE) rather than vector
-    search - this is the cheap precondition before we spend an embed
-    cycle. Empty list = unique name = no disambiguation needed.
-    """
+    """Pre-filter entity candidates by exact normalized-name match,
+    avoiding the embedder cost when the name is unique."""
     target = normalize_name(surface_name)
     if not target:
         return []
-    # Pull all entity nodes (typically tens to low thousands), filter
-    # in-process by normalized name. We avoid pushing normalize_name
-    # into the WHERE clause because the DSL has no equivalent function;
-    # store-side filter is fast enough for the cardinalities involved.
     try:
         result = gs.execute(f'NODES WHERE kind = "{KIND_ENTITY}" LIMIT 5000')
     except Exception as e:
-        _log.warning("entity_resolver: NODES query failed (%s); "
-                     "treating as empty candidates", e)
+        _log.warning("entity_resolver: NODES query failed (%s)", e)
         return []
     nodes = result.data if hasattr(result, "data") else []
     if not isinstance(nodes, list):
@@ -166,15 +74,10 @@ def _candidates_by_name(gs: Any, surface_name: str) -> list[dict]:
 
 
 def _embed_text(gs: Any, text: str) -> list[float] | None:
-    """Embed `text` via the GraphStore's embedder. None if no
-    embedder is configured (resolver gracefully degrades to first-match
-    selection in that case).
-    """
     embedder = getattr(gs, "_embedder", None)
     if embedder is None:
         return None
     try:
-        # Embedders implement encode_documents([str]) -> ndarray
         vecs = embedder.encode_documents([text])
         if vecs is None or len(vecs) == 0:
             return None
@@ -199,33 +102,13 @@ def _cosine(a: list[float], b: list[float]) -> float:
     return dot / (na ** 0.5 * nb ** 0.5)
 
 
-# ---------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------
-
-
 def resolve_mention(
     gs: Any,
     surface_name: str,
     context: str,
     threshold_high: float = DEFAULT_HIGH_THRESHOLD,
 ) -> ResolvedMention:
-    """Decide which entity a new mention refers to.
-
-    Args:
-        gs: live GraphStore. Resolver does not write; only reads.
-        surface_name: exact surface text from the source ("Alice",
-            "Maria", "OpenAI"). Case + punctuation are normalized
-            internally for name match.
-        context: surrounding sentence(s) - used for embedding-based
-            disambiguation when more than one entity shares this name.
-        threshold_high: cosine threshold for confident linking. Below
-            this we mint a new entity rather than risk a false-merge.
-
-    Returns: ``ResolvedMention``. Caller materializes the entity node
-    if ``is_new_entity`` is True, then creates the refers_to edge with
-    the returned confidence.
-    """
+    """Pure read. Caller materializes mention/entity/edge per the result."""
     notes: list[str] = []
 
     candidates = _candidates_by_name(gs, surface_name)
@@ -242,10 +125,6 @@ def resolve_mention(
         )
 
     if len(candidates) == 1:
-        # Unambiguous name match. Confidence 1.0 because there is
-        # nothing to disambiguate against. If the user later splits
-        # this entity (e.g. they realize there are actually two Alices),
-        # they do so explicitly via MERGE/SPLIT verbs.
         return ResolvedMention(
             entity_id=candidates[0]["id"],
             confidence=1.0,
@@ -255,18 +134,16 @@ def resolve_mention(
             notes=notes + ["single name match; linking with confidence=1.0"],
         )
 
-    # Multiple candidates. Embedding-based disambiguation.
     new_vec = _embed_text(gs, f"{surface_name}. {context}")
     if new_vec is None:
-        # No embedder. Fall back to picking the entity with the most
-        # mentions (preferred-attachment heuristic). Worst case we still
-        # bias toward consolidation.
+        # No embedder available - fall back to most-mentioned candidate
+        # to bias toward consolidation rather than fragmentation.
         notes.append("no embedder; falling back to most-mentioned entity")
         best = max(candidates,
                    key=lambda n: int(n.get("mention_count", 0)))
         return ResolvedMention(
             entity_id=best["id"],
-            confidence=0.5,  # signal low certainty
+            confidence=0.5,
             is_new_entity=False,
             canonical_name=surface_name,
             candidates_seen=len(candidates),
@@ -277,13 +154,6 @@ def resolve_mention(
     best_score = -1.0
     for cand in candidates:
         cand_id = cand.get("id", "")
-        # Each candidate's discriminator is its accumulated context
-        # text. We rebuild it from canonical_name + (any stored
-        # context column the caller seeded). If the caller hasn't
-        # populated a context column, embedding compares names alone
-        # and the disambiguation collapses to "any same-name" - which
-        # is acceptable; we already reported candidates_seen so the
-        # caller can audit.
         cand_text = " ".join([
             str(cand.get("canonical_name") or cand.get("name") or ""),
             str(cand.get("context", "")),
@@ -309,9 +179,7 @@ def resolve_mention(
             ],
         )
 
-    # Multiple same-name entities exist but the new mention does not
-    # confidently match any of them. Mint a new entity - false-split
-    # is reversible via MERGE; false-merge is not.
+    # Below threshold: mint new. False-merge is unrecoverable.
     return ResolvedMention(
         entity_id=make_entity_id(),
         confidence=1.0,
@@ -325,23 +193,6 @@ def resolve_mention(
     )
 
 
-# ---------------------------------------------------------------------
-# Mention id construction
-# ---------------------------------------------------------------------
-
-
 def make_mention_id(msg_id: str, slug: str, occurrence: int = 0) -> str:
-    """Build a location-keyed mention id.
-
-    Format: ``mention:{msg_id}:{slug}:{occurrence}``.
-
-    msg_id alone keys the source message; appending the slug + an
-    occurrence index disambiguates multiple mentions of different
-    surface forms within the same message ("Alice told Bob...") and
-    repeated mentions of the same surface form ("Alice ... Alice ...").
-
-    No collision possible across calls with the same args - that's the
-    point: re-extracting the same message must produce the same
-    mention id, idempotently.
-    """
+    """Idempotent location-keyed id: ``mention:{msg}:{slug}:{n}``."""
     return f"mention:{msg_id}:{slug}:{occurrence}"
diff --git a/tests/test_bonsai_ingestor.py b/tests/test_bonsai_ingestor.py
index 06cc845..72a7214 100644
--- a/tests/test_bonsai_ingestor.py
+++ b/tests/test_bonsai_ingestor.py
@@ -341,7 +341,7 @@ def test_parse_all_three_ingest_verbs():
 @BELIEF color blue
 @RETRACT old'''
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")]
+    assert turn.entities == [("priya", "Priya"), ("openai", "OpenAI")]
     assert turn.beliefs == [("fact:color", "blue")]
     assert turn.retracts == ["fact:old"]
 
@@ -355,7 +355,7 @@ def test_parse_empty_output_is_empty_turn():
 
 def test_parse_entities_only():
     turn = _parse_verb_output("@UPSERT kailash Kailash")
-    assert turn.entities == [("ent:kailash", "Kailash")]
+    assert turn.entities == [("kailash", "Kailash")]
     assert turn.beliefs == []
     assert turn.retracts == []
 
@@ -363,13 +363,13 @@ def test_parse_entities_only():
 def test_parse_multi_word_name_joined_by_whitespace():
     """Rest-of-line is the name; split on first 2 whitespace runs only."""
     turn = _parse_verb_output("@UPSERT sf San Francisco")
-    assert turn.entities == [("ent:sf", "San Francisco")]
+    assert turn.entities == [("sf", "San Francisco")]
 
 
 def test_parse_case_insensitive_verbs():
     out = "@upsert priya Priya\n@belief color blue\n@retract old"
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
     assert turn.beliefs == [("fact:color", "blue")]
     assert turn.retracts == ["fact:old"]
 
@@ -383,13 +383,13 @@ def test_parse_assert_alias_maps_to_belief():
 def test_parse_tolerates_fence_lines():
     out = "```\n@UPSERT x X\n```"
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:x", "X")]
+    assert turn.entities == [("x", "X")]
 
 
 def test_parse_strips_prefix_if_model_adds_it():
     """Model sometimes emits '@UPSERT ent:x X'; we normalize to slug-only."""
     turn = _parse_verb_output('@UPSERT ent:priya Priya')
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
 
     turn2 = _parse_verb_output('@BELIEF fact:color blue')
     assert turn2.beliefs == [("fact:color", "blue")]
@@ -398,7 +398,7 @@ def test_parse_strips_prefix_if_model_adds_it():
 def test_parse_ignores_unknown_verbs():
     out = "@UPSERT priya Priya\n@FOO some garbage\n@RETRACT old"
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
     assert turn.retracts == ["fact:old"]
 
 
@@ -414,7 +414,7 @@ def test_parse_ignores_malformed_short_lines():
 def test_parse_strips_quotes_if_present():
     """Model occasionally wraps tokens in quotes; handle both."""
     turn = _parse_verb_output('@UPSERT "priya" "Priya"')
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
 
 
 # --------------------------------------------------------------------
@@ -428,25 +428,25 @@ def test_parse_drops_lines_without_at_prefix():
 This is free-form prose.
 @UPSERT kailash Kailash'''
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:kailash", "Kailash")]
+    assert turn.entities == [("kailash", "Kailash")]
 
 
 def test_parse_accepts_space_after_at():
     """'@ UPSERT priya' still parses (tolerant)."""
     turn = _parse_verb_output("@ UPSERT priya Priya")
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
 
 
 def test_parse_bare_at_dropped():
     turn = _parse_verb_output("@\n@UPSERT x X\n@")
-    assert turn.entities == [("ent:x", "X")]
+    assert turn.entities == [("x", "X")]
 
 
 def test_parse_english_drift_after_ops_ignored():
     out = '''@UPSERT priya Priya
 Wait - that's not correct. Let me reconsider.'''
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:priya", "Priya")]
+    assert turn.entities == [("priya", "Priya")]
     assert turn.statements == []
 
 
@@ -455,15 +455,17 @@ def test_parse_english_drift_after_ops_ignored():
 # --------------------------------------------------------------------
 
 def test_parse_edge_emits_create_edge():
+    """@EDGE produces an entity_edges entry; synthesizer maps each
+    slug to the resolver's entity_id before rendering the DSL."""
     turn = _parse_verb_output("@EDGE ent:priya ent:flipkart works_at")
-    assert turn.statements == [
-        'CREATE EDGE "ent:priya" -> "ent:flipkart" kind = "works_at"'
-    ]
+    assert turn.entity_edges == [("priya", "flipkart", "works_at")]
+    assert turn.statements == []
     assert turn.entities == []
 
 
 def test_parse_edge_needs_three_args():
     turn = _parse_verb_output("@EDGE ent:a ent:b")
+    assert turn.entity_edges == []
     assert turn.statements == []
 
 
@@ -566,7 +568,7 @@ def test_parse_mixed_ingest_and_query():
 @UPSERT openai OpenAI
 @REMEMBER what I said about coffee'''
     turn = _parse_verb_output(out)
-    assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")]
+    assert turn.entities == [("priya", "Priya"), ("openai", "OpenAI")]
     assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10']
 
 
@@ -592,11 +594,10 @@ def test_parse_plain_verb_ignores_trailing_tokens():
 
 
 def test_parse_edge_escapes_quotes_in_ids():
-    """Quote-escape applies inside CREATE EDGE even if ids carry weird chars."""
+    """Edge slugs + kind survive odd characters; synthesizer escapes
+    when it renders the final DSL."""
     turn = _parse_verb_output('@EDGE ent:a ent:b weird"kind')
-    assert turn.statements == [
-        'CREATE EDGE "ent:a" -> "ent:b" kind = "weird\\"kind"'
-    ]
+    assert turn.entity_edges == [("a", "b", 'weird"kind')]
 
 
 # --------------------------------------------------------------------
@@ -643,7 +644,7 @@ def test_parse_count_nodes_and_edges():
 
 def test_synthesize_appends_statements_verbatim():
     turn = ParsedTurn(
-        entities=[("ent:x", "X")],
+        entities=[("x", "X")],
         statements=['REMEMBER "hello" LIMIT 3', 'SYS STATS'],
     )
     dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="hi")
@@ -676,23 +677,68 @@ def test_synthesize_minimal_turn_emits_only_message_node():
     assert 'DOCUMENT "hi"' in dsl[0]
 
 
-def test_synthesize_with_entities_emits_upsert_plus_matching_edge():
-    turn = ParsedTurn(entities=[("ent:priya", "Priya"), ("ent:openai", "OpenAI")])
+def test_synthesize_with_entities_emits_mention_entity_and_refers_to():
+    """Each entity slug yields: mention node + entity node (new) +
+    refers_to edge (mention->entity) + mentions edge (msg->mention).
+    With no gs passed, every mention mints a fresh entity."""
+    turn = ParsedTurn(entities=[("priya", "Priya"), ("openai", "OpenAI")])
     dsl = _synthesize_dsl(turn, msg_id="m:s1:0", session_id="s1", role="user", text="x")
-    assert len(dsl) == 1 + 2 + 2
-    assert 'UPSERT NODE "ent:priya"' in dsl[1]
-    assert 'UPSERT NODE "ent:openai"' in dsl[2]
-    assert 'CREATE EDGE "m:s1:0" -> "ent:priya" kind = "mentions"' in dsl[3]
-    assert 'CREATE EDGE "m:s1:0" -> "ent:openai" kind = "mentions"' in dsl[4]
+    # 1 message + 2 * (1 mention + 1 entity + 1 refers_to + 1 mentions) = 9
+    assert len(dsl) == 9
+    text = "\n".join(dsl)
+    assert 'kind = "mention"' in text
+    assert 'kind = "entity"' in text
+    assert 'kind = "refers_to"' in text
+    assert 'kind = "mentions"' in text
+    assert 'mention:m:s1:0:priya:0' in text
+    assert 'mention:m:s1:0:openai:1' in text
+    assert 'canonical_name = "Priya"' in text
+    assert 'canonical_name = "OpenAI"' in text
 
 
 def test_synthesize_dedupes_duplicate_entities():
-    turn = ParsedTurn(entities=[("ent:x", "X"), ("ent:x", "X")])
+    """Same slug emitted twice in one turn yields exactly one mention
+    + one entity + one refers_to."""
+    turn = ParsedTurn(entities=[("x", "X"), ("x", "X")])
+    dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x")
+    mentions = [d for d in dsl if 'kind = "mention"' in d]
+    entities = [d for d in dsl if 'kind = "entity"' in d]
+    refers = [d for d in dsl if 'kind = "refers_to"' in d]
+    assert len(mentions) == 1
+    assert len(entities) == 1
+    assert len(refers) == 1
+
+
+def test_synthesize_emits_entity_edges_after_resolution():
+    """@EDGE between two upserted slugs renders as entity-to-entity
+    after the slug map is populated."""
+    turn = ParsedTurn(
+        entities=[("priya", "Priya"), ("flipkart", "Flipkart")],
+        entity_edges=[("priya", "flipkart", "works_at")],
+    )
+    dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x")
+    edge_lines = [d for d in dsl
+                  if d.startswith("CREATE EDGE")
+                  and 'kind = "works_at"' in d]
+    assert len(edge_lines) == 1
+    # Both endpoints must be entity:* ids, not slug literals.
+    assert 'entity:' in edge_lines[0]
+    assert 'ent:priya' not in edge_lines[0]
+    assert 'ent:flipkart' not in edge_lines[0]
+
+
+def test_synthesize_drops_entity_edge_with_unknown_slug():
+    """@EDGE references a slug not declared via @UPSERT; synthesizer
+    drops it (logs warning) rather than emitting a broken DSL line."""
+    turn = ParsedTurn(
+        entities=[("alice", "Alice")],
+        entity_edges=[("alice", "ghost", "knows")],
+    )
     dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x")
-    upserts = [d for d in dsl if d.startswith("UPSERT")]
-    edges = [d for d in dsl if d.startswith("CREATE EDGE")]
-    assert len(upserts) == 1
-    assert len(edges) == 1
+    edge_lines = [d for d in dsl
+                  if d.startswith("CREATE EDGE")
+                  and 'kind = "knows"' in d]
+    assert edge_lines == []
 
 
 def test_synthesize_belief_and_retract_use_same_fact_id():
@@ -708,29 +754,34 @@ def test_synthesize_belief_and_retract_use_same_fact_id():
 
 
 def test_synthesize_escapes_quotes_in_text_and_name():
-    turn = ParsedTurn(entities=[("ent:a", 'Alice "Ace"')])
+    turn = ParsedTurn(entities=[("a", 'Alice "Ace"')])
     dsl = _synthesize_dsl(
         turn, msg_id="m:0", session_id="s", role="user",
         text='She said "go".',
     )
-    # Backslash-escapes in DSL string literal:
-    assert 'DOCUMENT "She said \\"go\\"."' in dsl[0]
-    assert 'name = "Alice \\"Ace\\""' in dsl[1]
+    text = "\n".join(dsl)
+    assert '\\"go\\"' in text
+    assert 'Alice \\"Ace\\"' in text
 
 
 def test_synthesize_all_together_contract():
-    """End-to-end: messages + entity + belief + retract."""
+    """End-to-end: message + mention/entity/refers_to + belief + retract."""
     turn = ParsedTurn(
-        entities=[("ent:priya", "Priya")],
+        entities=[("priya", "Priya")],
         beliefs=[("fact:color", "green")],
         retracts=["fact:color"],
     )
     dsl = _synthesize_dsl(
         turn, msg_id="m:0", session_id="s1", role="user", text="text",
     )
-    # Order: CREATE NODE, UPSERTs, EDGEs, RETRACTs, ASSERTs
-    kinds = [d.split(maxsplit=2)[0] for d in dsl]
-    assert kinds == ["CREATE", "UPSERT", "CREATE", "RETRACT", "ASSERT"]
+    text = "\n".join(dsl)
+    assert 'kind = "message"' in text
+    assert 'kind = "mention"' in text
+    assert 'kind = "entity"' in text
+    assert 'kind = "refers_to"' in text
+    assert 'kind = "mentions"' in text
+    assert 'RETRACT "fact:color"' in text
+    assert 'ASSERT "fact:color"' in text
 
 
 def test_ingest_requires_msg_id(tmp_path: Path):