From 952481197c5036cef60eb128c37981ae7e6dbfeb Mon Sep 17 00:00:00 2001 From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com> Date: Sat, 2 May 2026 18:32:45 +0530 Subject: [PATCH] feat(bonsai): emit mention + entity + refers_to via resolver PR B of the entity-dedup track. BonsaiIngestor now writes the mention/entity model from PR A (#186) instead of the legacy ent:slug-keyed entities. Drops backward compat per the user's explicit direction (graphstore not yet shipped widely; clean break preferred). ## What changed - ParsedTurn: entities tuples now (slug, name) without legacy ent: prefix. New entity_edges field holds (from_slug, to_slug, kind) pushed by _h_edge for resolver-time slug -> entity_id mapping. - _h_upsert: stores bare slug. - _h_edge: appends to entity_edges instead of pre-rendering CREATE EDGE strings. Synthesizer maps slugs to resolved entity_ids and emits the edge between entities. - _synthesize_dsl: takes optional gs= for resolver lookup. For each entity slug: CREATE NODE mention:{msg}:{slug}:{n} kind="mention" if new entity: CREATE NODE entity:{uuid} kind="entity" else: INCREMENT NODE entity:{id} mention_count BY 1 CREATE EDGE mention -> entity kind="refers_to" confidence=N CREATE EDGE message -> mention kind="mentions" Then renders entity_edges by mapping slugs to resolved ids; drops edges referencing slugs not in the turn map (logged). Post-pass _rewrite_ent_refs replaces literal ent:slug references in retrieval verbs (RECALL/PATH/ANCESTORS/SUBGRAPH/COMMON) with the resolved entity_id from the same turn. - _ingest_locked: passes self._gs into _synthesize_dsl (None on dry_run so tests do not need a live store). Resolver runs once per slug per turn. Pre-filter by exact name match keeps the candidate set tiny (typically 0 or 1); embedding disambiguation only fires on collision. ## Comment cleanup Stripped narration-style comments from entity_resolver.py per CLAUDE.md Rule 9 (only WHY-comments where logic is non-obvious; let names + structure do the explaining). Same pass on the new synthesizer code. ## Test plan pytest tests/test_bonsai_ingestor.py tests/test_entity_resolver.py -> 113 passed pytest --tb=short -q --ignore=tests/test_server.py --ignore=tests/test_e2e_real_embedder.py -> 1950 passed, 102 skipped (was 1948; +2 net new from the entity_edges/unknown-slug coverage) Updated tests: - parse tests: turn.entities tuples now bare-slug - test_parse_edge_emits_create_edge: asserts turn.entity_edges - test_parse_edge_escapes_quotes_in_ids: same - test_synthesize_with_entities_emits_mention_entity_and_refers_to (renamed from _emits_upsert_plus_matching_edge) - test_synthesize_dedupes_duplicate_entities: counts mention/ entity/refers_to instead of UPSERTs - test_synthesize_emits_entity_edges_after_resolution: new - test_synthesize_drops_entity_edge_with_unknown_slug: new - test_synthesize_all_together_contract: kind-presence assertions ## Sequence - PR A (merged) - foundation - PR B (this) - BonsaiIngestor rewrite - PR C - LoCoMo adapter + re-bench against snap-research scoring - PR D - SYS RESOLVE MENTIONS for batch re-resolution --- src/graphstore/bonsai_ingestor.py | 164 +++++++++++++++++++----- src/graphstore/entity_resolver.py | 201 ++++-------------------------- tests/test_bonsai_ingestor.py | 135 +++++++++++++------- 3 files changed, 250 insertions(+), 250 deletions(-) diff --git a/src/graphstore/bonsai_ingestor.py b/src/graphstore/bonsai_ingestor.py index f11b5c2..f4c57e1 100644 --- a/src/graphstore/bonsai_ingestor.py +++ b/src/graphstore/bonsai_ingestor.py @@ -251,9 +251,23 @@ def _scrape_belief_updates( @dataclass class ParsedTurn: - """Parsed structured output of one @-verb LLM call.""" + """Parsed structured output of one @-verb LLM call. + + entities holds (slug, surface_name) tuples - the slug is the bare + identifier the model emitted (without the legacy ``ent:`` prefix), + surface_name is the human-readable form ("Alice", "OpenAI"). The + synthesizer turns each into a mention node + a refers_to edge to a + resolved entity (see ``graphstore.entity_resolver``). + + entity_edges holds (from_slug, to_slug, kind) tuples for @EDGE + output between two entity slugs. Synthesizer maps each slug to the + resolved entity_id and emits the edge between entity nodes. + Edge handlers append here instead of `statements` so the slug -> + entity_id rewrite happens after resolver runs. + """ entities: list[tuple[str, str]] = field(default_factory=list) + entity_edges: list[tuple[str, str, str]] = field(default_factory=list) beliefs: list[tuple[str, str]] = field(default_factory=list) retracts: list[str] = field(default_factory=list) statements: list[str] = field(default_factory=list) @@ -265,14 +279,20 @@ def _dsl_escape(s: str) -> str: def _h_upsert(turn: ParsedTurn, ln: str) -> None: - """U """ + """U + + Stores (slug, surface_name). The synthesizer turns this into a + mention node + entity node (via the resolver) + refers_to edge. + Slug is stored bare; legacy ``ent:`` prefix on input is stripped + so the model can emit either form. + """ parts = ln.split(None, 2) if len(parts) < 3: return slug = parts[1].removeprefix("ent:").strip('"') name = parts[2].strip().strip('"') if slug and name: - turn.entities.append((f"ent:{slug}", name)) + turn.entities.append((slug, name)) def _h_fact(turn: ParsedTurn, ln: str) -> None: @@ -297,18 +317,27 @@ def _h_drop(turn: ParsedTurn, ln: str) -> None: def _h_edge(turn: ParsedTurn, ln: str) -> None: - """E (ids include ent:/fact: prefix)""" + """E (ids may include legacy ent:/fact: prefix) + + Pushes (from_slug, to_slug, kind) to ``turn.entity_edges`` so the + synthesizer can map each slug to the resolved entity_id before + materializing the edge. The legacy ``ent:`` prefix is stripped so + the slug can be looked up in the per-turn slug -> entity map. Bare + fact:/msg: ids are passed through unchanged - they reference nodes + that already exist by literal id. + """ parts = ln.split() if len(parts) < 4: return from_id = parts[1].strip('"') to_id = parts[2].strip('"') kind = parts[3].strip('"') - if from_id and to_id and kind: - turn.statements.append( - f'CREATE EDGE "{_dsl_escape(from_id)}" -> "{_dsl_escape(to_id)}" ' - f'kind = "{_dsl_escape(kind)}"' - ) + if not (from_id and to_id and kind): + return + # Strip legacy "ent:" prefix; remap to entity ids during synthesis. + from_slug = from_id.removeprefix("ent:") + to_slug = to_id.removeprefix("ent:") + turn.entity_edges.append((from_slug, to_slug, kind)) def _h_query(template: str): @@ -903,27 +932,24 @@ def _synthesize_dsl( session_id: str, role: str, text: str, + gs: Any | None = None, ) -> list[str]: - """Build the full DSL statement list from the parsed compact output. - - Deterministic. Emits in order: - 1. CREATE NODE for the message (DOCUMENT = user text). - 2. UPSERT NODE per entity + matching CREATE EDGE kind = "mentions". - Entities are deduped by id (first wins). - 3. RETRACT per retract (before any ASSERT). - 4. ASSERT per belief. - 5. Pre-rendered statements (edges, queries, walks, sys ops) verbatim. - """ + """Render parsed @-verbs to DSL. ``gs`` enables resolver lookup; + when None (dry-run, tests) every mention mints a fresh entity.""" + from graphstore.entity_resolver import ( + EDGE_REFERS_TO, KIND_ENTITY, KIND_MENTION, + make_entity_id, make_mention_id, resolve_mention, + ) + out: list[str] = [] text_esc = _dsl_escape(text) session_esc = _dsl_escape(session_id) role_esc = _dsl_escape(role) msg_esc = _dsl_escape(msg_id) - # Store the text in both `content` (for baseline retrieval adapters that - # scan the node column) and `DOCUMENT` (for the vector + BM25 pipeline). - # The two-liner duplication costs ~1 string interning but makes Bonsai- - # ingested messages directly comparable to the deterministic NER adapter. + # `content` mirrors `DOCUMENT` so adapters that scan typed columns + # (deterministic NER baseline) compare apples-to-apples with the + # vector + BM25 pipeline that reads `DOCUMENT`. out.append( f'CREATE NODE "{msg_esc}" kind = "message" ' f'session = "{session_esc}" role = "{role_esc}" ' @@ -931,19 +957,62 @@ def _synthesize_dsl( f'DOCUMENT "{text_esc}"' ) - ordered_ents: list[str] = [] - seen_ents: set[str] = set() - for ent_id, name in turn.entities: - if ent_id in seen_ents: + slug_to_entity: dict[str, str] = {} + + seen_slugs: set[str] = set() + for occurrence, (slug, name) in enumerate(turn.entities): + if slug in seen_slugs: continue - seen_ents.add(ent_id) - ordered_ents.append(ent_id) + seen_slugs.add(slug) + + mention_id = make_mention_id(msg_id, slug, occurrence) + mention_esc = _dsl_escape(mention_id) + name_esc = _dsl_escape(name) + + if gs is not None: + try: + resolved = resolve_mention(gs, surface_name=name, context=text) + entity_id = resolved.entity_id + is_new = resolved.is_new_entity + confidence = resolved.confidence + except Exception as e: # pragma: no cover + _log.warning("entity_resolver failed for %r: %s", name, e) + entity_id = make_entity_id() + is_new = True + confidence = 1.0 + else: + entity_id = make_entity_id() + is_new = True + confidence = 1.0 + + slug_to_entity[slug] = entity_id + entity_esc = _dsl_escape(entity_id) + + out.append( + f'CREATE NODE "{mention_esc}" kind = "{KIND_MENTION}" ' + f'surface_name = "{name_esc}" source_msg = "{msg_esc}" ' + f'session = "{session_esc}" ' + f'DOCUMENT "{name_esc} | {text_esc}"' + ) + + if is_new: + out.append( + f'CREATE NODE "{entity_esc}" kind = "{KIND_ENTITY}" ' + f'canonical_name = "{name_esc}" mention_count = 1 ' + f'context = "{text_esc}" ' + f'DOCUMENT "{name_esc} | {text_esc}"' + ) + else: + out.append( + f'INCREMENT NODE "{entity_esc}" mention_count BY 1' + ) + out.append( - f'UPSERT NODE "{_dsl_escape(ent_id)}" kind = "entity" name = "{_dsl_escape(name)}"' + f'CREATE EDGE "{mention_esc}" -> "{entity_esc}" ' + f'kind = "{EDGE_REFERS_TO}" confidence = {confidence:.3f}' ) - for ent_id in ordered_ents: out.append( - f'CREATE EDGE "{msg_esc}" -> "{_dsl_escape(ent_id)}" kind = "mentions"' + f'CREATE EDGE "{msg_esc}" -> "{mention_esc}" kind = "mentions"' ) for fact_id in turn.retracts: @@ -957,11 +1026,39 @@ def _synthesize_dsl( f'value = "{_dsl_escape(value)}" CONFIDENCE 0.9 SOURCE "{msg_esc}"' ) - out.extend(turn.statements) + for from_slug, to_slug, edge_kind in turn.entity_edges: + from_eid = slug_to_entity.get(from_slug) + to_eid = slug_to_entity.get(to_slug) + if not (from_eid and to_eid): + _log.warning( + "bonsai: dropping @EDGE %r -> %r (slug not in turn map)", + from_slug, to_slug, + ) + continue + out.append( + f'CREATE EDGE "{_dsl_escape(from_eid)}" -> "{_dsl_escape(to_eid)}" ' + f'kind = "{_dsl_escape(edge_kind)}"' + ) + + # Retrieval verbs (@RECALL/@PATH/@ANCESTORS/...) emit literal + # `ent:slug` ids. After the refactor those nodes do not exist; + # rewrite each reference to the matching entity_id from the turn + # map, falling back to the literal slug when unknown so debug + # output remains traceable. + for stmt in turn.statements: + out.append(_rewrite_ent_refs(stmt, slug_to_entity)) return out +def _rewrite_ent_refs(stmt: str, slug_to_entity: dict[str, str]) -> str: + def _sub(match: re.Match) -> str: + slug = match.group(1) + eid = slug_to_entity.get(slug) + return f'"{eid}"' if eid else f'"ent:{slug}"' + return _ENT_FROM_ID_RE.sub(_sub, stmt) + + def _render_known_facts_block(facts: dict[str, FactState], max_facts: int = 40) -> str: """Format non-retracted facts into a block the LLM reads before the input. @@ -1397,6 +1494,7 @@ def _ingest_locked( turn = _parse_verb_output(cleaned) deduped = _synthesize_dsl( turn, msg_id=msg_id, session_id=session_id, role=role, text=text, + gs=None if dry_run else self._gs, ) dup_dropped: list[tuple[str, str]] = [] diff --git a/src/graphstore/entity_resolver.py b/src/graphstore/entity_resolver.py index d599796..fde1b17 100644 --- a/src/graphstore/entity_resolver.py +++ b/src/graphstore/entity_resolver.py @@ -1,56 +1,11 @@ -"""Entity resolution: separate mention from identity. - -When ingesting a mention of "Alice", "Maria", or any other proper noun -extracted from natural-language text, naming the canonical-entity node -after the surface form (``ent:alice``) breaks down across conversations: -two different humans named "Alice" collide; the same human mentioned -across sessions either collides into one node by accident or - if write -semantics fail - drops the second mention entirely. - -This module implements the production-grade fix used by Wikidata, -Microsoft GraphRAG, and entity-resolution pipelines elsewhere: separate -**mention** (an observation: "Alice was mentioned at message m1, char 42, -within this surrounding sentence") from **entity** (a hypothesis about -identity: "this specific Alice the data model thinks exists, with auto- -generated id ``entity:c4f8a3``"). - -Mentions are immutable, location-keyed, never collide. -Entities are revisable, auto-id, can be merged or split as evidence accumulates. -A ``refers_to`` edge with confidence connects mention to entity. - -Resolver workflow at write time: - - 1. **Name match** (cheap precondition). Find all existing ``entity`` - nodes whose ``canonical_name`` matches the new mention's surface - name (case-folded equality - tighten if false positives appear). - 2. **Empty match → new entity.** Generate ``entity:{uuid4-hex}``, - caller materializes it. - 3. **Single match → unambiguous link.** Return that entity_id with - confidence=1.0 (no ambiguity to resolve). - 4. **Multiple matches → embedding disambiguation.** Compute cosine - between the mention's context embedding and each candidate - entity's accumulated-context embedding. Pick the highest. If the - best score is above ``threshold_high``, return it. Otherwise - return a new entity_id (the contexts diverged enough that this is - probably a different human with the same name). - -This is **correct** because: - - Mentions never collide (location-keyed). - - Entities discovered, not asserted - cluster by accumulated evidence. - - Confidence preserved end-to-end. A weak ``refers_to`` edge is - revisable in light of later evidence without losing the original - mention. - - Same human mentioned 1000 times across 50 conversations = - 1000 mention nodes + 1 entity node. - - Two genuinely-different "Alice"s with diverging context = - N mention nodes + 2 entity nodes, automatically. - -It is also **cheap**: - - Pre-filter by exact name match keeps the candidate set small - (typically 0 or 1 entity per surface name). - - Embedding disambiguation only fires on the rare collision case - and reuses the embedder graphstore already has loaded. - - All ANN lookups are O(log n) on the existing usearch index. +"""Entity resolution: mention vs identity. + +Resolver returns the entity_id a new mention should attach to. +Algorithm: filter existing entities by case-folded name match; with one +candidate link unambiguously; with several, pick the embedding-closest +context above ``threshold_high`` else mint a new entity. False-merge +is unrecoverable, false-split is reversible via MERGE - thus the +conservative threshold. """ from __future__ import annotations @@ -62,17 +17,8 @@ _log = logging.getLogger(__name__) -# How conservative are we about merging? `threshold_high` is the cosine -# above which we confidently link a new mention to an existing entity -# of the same name. Below that, even with name match, we err on the -# side of creating a new entity - false-merge is worse than -# false-split because MERGE is reversible (DELETE EDGE + UPSERT) while -# silently conflated identities become impossible to untangle once -# downstream beliefs accumulate. DEFAULT_HIGH_THRESHOLD = 0.85 -# Schema constants. Use these as the canonical kind values + edge -# label everywhere; downstream readers should not hard-code strings. KIND_MENTION = "mention" KIND_ENTITY = "entity" EDGE_REFERS_TO = "refers_to" @@ -80,77 +26,39 @@ @dataclass(frozen=True) class ResolvedMention: - """Outcome of ``resolve_mention()``. - - Resolver does NOT mutate the store. Caller is expected to: - 1. CREATE the mention node (if it doesn't exist yet) - 2. If ``is_new_entity``: CREATE the entity node with id ``entity_id`` - 3. CREATE EDGE mention_id -[refers_to confidence=...]-> entity_id - - Doing those writes outside the resolver keeps the resolver pure - (testable without a graph), idempotent, and side-effect-free. - """ + """Caller materializes the mention node, the entity node when + ``is_new_entity``, and the refers_to edge with ``confidence``.""" - entity_id: str # always a valid id; either existing or freshly minted - confidence: float # 1.0 for new + unambiguous match, 0..1 for disambig - is_new_entity: bool # caller must CREATE NODE for the entity - canonical_name: str # name to seed on a new entity (== surface_name) - candidates_seen: int # how many same-name entities were considered - notes: list[str] # human-readable trace of resolver decisions - - -# --------------------------------------------------------------------- -# Name normalization -# --------------------------------------------------------------------- + entity_id: str + confidence: float + is_new_entity: bool + canonical_name: str + candidates_seen: int + notes: list[str] _NAME_NORMALIZE_RE = re.compile(r"[^a-z0-9]+") def normalize_name(name: str) -> str: - """Lowercase + collapse non-alphanumerics to nothing. - - "Alice Smith" -> "alicesmith". "alice@stripe" -> "alicestripe". - Aggressive on purpose - we want "alice", "Alice", "ALICE", "Alice " - to all hash to the same name match. False-positive risk (e.g. - "Alice S." vs "Alice S") is fine; we disambiguate by embedding - score after. - """ + """Case-fold + strip non-alphanumerics. "Alice S." == "alice s".""" return _NAME_NORMALIZE_RE.sub("", name.lower()) def make_entity_id(prefix: str = "entity") -> str: - """Generate a fresh entity id. UUID4-hex first 12 chars - long - enough for collision-resistance at billions of entities, short - enough to read in logs.""" return f"{prefix}:{uuid.uuid4().hex[:12]}" -# --------------------------------------------------------------------- -# Candidate lookup -# --------------------------------------------------------------------- - - def _candidates_by_name(gs: Any, surface_name: str) -> list[dict]: - """Return all entity nodes whose canonical_name normalizes equal - to surface_name's normalized form. - - Uses the structured-column path (NODES WHERE) rather than vector - search - this is the cheap precondition before we spend an embed - cycle. Empty list = unique name = no disambiguation needed. - """ + """Pre-filter entity candidates by exact normalized-name match, + avoiding the embedder cost when the name is unique.""" target = normalize_name(surface_name) if not target: return [] - # Pull all entity nodes (typically tens to low thousands), filter - # in-process by normalized name. We avoid pushing normalize_name - # into the WHERE clause because the DSL has no equivalent function; - # store-side filter is fast enough for the cardinalities involved. try: result = gs.execute(f'NODES WHERE kind = "{KIND_ENTITY}" LIMIT 5000') except Exception as e: - _log.warning("entity_resolver: NODES query failed (%s); " - "treating as empty candidates", e) + _log.warning("entity_resolver: NODES query failed (%s)", e) return [] nodes = result.data if hasattr(result, "data") else [] if not isinstance(nodes, list): @@ -166,15 +74,10 @@ def _candidates_by_name(gs: Any, surface_name: str) -> list[dict]: def _embed_text(gs: Any, text: str) -> list[float] | None: - """Embed `text` via the GraphStore's embedder. None if no - embedder is configured (resolver gracefully degrades to first-match - selection in that case). - """ embedder = getattr(gs, "_embedder", None) if embedder is None: return None try: - # Embedders implement encode_documents([str]) -> ndarray vecs = embedder.encode_documents([text]) if vecs is None or len(vecs) == 0: return None @@ -199,33 +102,13 @@ def _cosine(a: list[float], b: list[float]) -> float: return dot / (na ** 0.5 * nb ** 0.5) -# --------------------------------------------------------------------- -# Public API -# --------------------------------------------------------------------- - - def resolve_mention( gs: Any, surface_name: str, context: str, threshold_high: float = DEFAULT_HIGH_THRESHOLD, ) -> ResolvedMention: - """Decide which entity a new mention refers to. - - Args: - gs: live GraphStore. Resolver does not write; only reads. - surface_name: exact surface text from the source ("Alice", - "Maria", "OpenAI"). Case + punctuation are normalized - internally for name match. - context: surrounding sentence(s) - used for embedding-based - disambiguation when more than one entity shares this name. - threshold_high: cosine threshold for confident linking. Below - this we mint a new entity rather than risk a false-merge. - - Returns: ``ResolvedMention``. Caller materializes the entity node - if ``is_new_entity`` is True, then creates the refers_to edge with - the returned confidence. - """ + """Pure read. Caller materializes mention/entity/edge per the result.""" notes: list[str] = [] candidates = _candidates_by_name(gs, surface_name) @@ -242,10 +125,6 @@ def resolve_mention( ) if len(candidates) == 1: - # Unambiguous name match. Confidence 1.0 because there is - # nothing to disambiguate against. If the user later splits - # this entity (e.g. they realize there are actually two Alices), - # they do so explicitly via MERGE/SPLIT verbs. return ResolvedMention( entity_id=candidates[0]["id"], confidence=1.0, @@ -255,18 +134,16 @@ def resolve_mention( notes=notes + ["single name match; linking with confidence=1.0"], ) - # Multiple candidates. Embedding-based disambiguation. new_vec = _embed_text(gs, f"{surface_name}. {context}") if new_vec is None: - # No embedder. Fall back to picking the entity with the most - # mentions (preferred-attachment heuristic). Worst case we still - # bias toward consolidation. + # No embedder available - fall back to most-mentioned candidate + # to bias toward consolidation rather than fragmentation. notes.append("no embedder; falling back to most-mentioned entity") best = max(candidates, key=lambda n: int(n.get("mention_count", 0))) return ResolvedMention( entity_id=best["id"], - confidence=0.5, # signal low certainty + confidence=0.5, is_new_entity=False, canonical_name=surface_name, candidates_seen=len(candidates), @@ -277,13 +154,6 @@ def resolve_mention( best_score = -1.0 for cand in candidates: cand_id = cand.get("id", "") - # Each candidate's discriminator is its accumulated context - # text. We rebuild it from canonical_name + (any stored - # context column the caller seeded). If the caller hasn't - # populated a context column, embedding compares names alone - # and the disambiguation collapses to "any same-name" - which - # is acceptable; we already reported candidates_seen so the - # caller can audit. cand_text = " ".join([ str(cand.get("canonical_name") or cand.get("name") or ""), str(cand.get("context", "")), @@ -309,9 +179,7 @@ def resolve_mention( ], ) - # Multiple same-name entities exist but the new mention does not - # confidently match any of them. Mint a new entity - false-split - # is reversible via MERGE; false-merge is not. + # Below threshold: mint new. False-merge is unrecoverable. return ResolvedMention( entity_id=make_entity_id(), confidence=1.0, @@ -325,23 +193,6 @@ def resolve_mention( ) -# --------------------------------------------------------------------- -# Mention id construction -# --------------------------------------------------------------------- - - def make_mention_id(msg_id: str, slug: str, occurrence: int = 0) -> str: - """Build a location-keyed mention id. - - Format: ``mention:{msg_id}:{slug}:{occurrence}``. - - msg_id alone keys the source message; appending the slug + an - occurrence index disambiguates multiple mentions of different - surface forms within the same message ("Alice told Bob...") and - repeated mentions of the same surface form ("Alice ... Alice ..."). - - No collision possible across calls with the same args - that's the - point: re-extracting the same message must produce the same - mention id, idempotently. - """ + """Idempotent location-keyed id: ``mention:{msg}:{slug}:{n}``.""" return f"mention:{msg_id}:{slug}:{occurrence}" diff --git a/tests/test_bonsai_ingestor.py b/tests/test_bonsai_ingestor.py index 06cc845..72a7214 100644 --- a/tests/test_bonsai_ingestor.py +++ b/tests/test_bonsai_ingestor.py @@ -341,7 +341,7 @@ def test_parse_all_three_ingest_verbs(): @BELIEF color blue @RETRACT old''' turn = _parse_verb_output(out) - assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")] + assert turn.entities == [("priya", "Priya"), ("openai", "OpenAI")] assert turn.beliefs == [("fact:color", "blue")] assert turn.retracts == ["fact:old"] @@ -355,7 +355,7 @@ def test_parse_empty_output_is_empty_turn(): def test_parse_entities_only(): turn = _parse_verb_output("@UPSERT kailash Kailash") - assert turn.entities == [("ent:kailash", "Kailash")] + assert turn.entities == [("kailash", "Kailash")] assert turn.beliefs == [] assert turn.retracts == [] @@ -363,13 +363,13 @@ def test_parse_entities_only(): def test_parse_multi_word_name_joined_by_whitespace(): """Rest-of-line is the name; split on first 2 whitespace runs only.""" turn = _parse_verb_output("@UPSERT sf San Francisco") - assert turn.entities == [("ent:sf", "San Francisco")] + assert turn.entities == [("sf", "San Francisco")] def test_parse_case_insensitive_verbs(): out = "@upsert priya Priya\n@belief color blue\n@retract old" turn = _parse_verb_output(out) - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] assert turn.beliefs == [("fact:color", "blue")] assert turn.retracts == ["fact:old"] @@ -383,13 +383,13 @@ def test_parse_assert_alias_maps_to_belief(): def test_parse_tolerates_fence_lines(): out = "```\n@UPSERT x X\n```" turn = _parse_verb_output(out) - assert turn.entities == [("ent:x", "X")] + assert turn.entities == [("x", "X")] def test_parse_strips_prefix_if_model_adds_it(): """Model sometimes emits '@UPSERT ent:x X'; we normalize to slug-only.""" turn = _parse_verb_output('@UPSERT ent:priya Priya') - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] turn2 = _parse_verb_output('@BELIEF fact:color blue') assert turn2.beliefs == [("fact:color", "blue")] @@ -398,7 +398,7 @@ def test_parse_strips_prefix_if_model_adds_it(): def test_parse_ignores_unknown_verbs(): out = "@UPSERT priya Priya\n@FOO some garbage\n@RETRACT old" turn = _parse_verb_output(out) - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] assert turn.retracts == ["fact:old"] @@ -414,7 +414,7 @@ def test_parse_ignores_malformed_short_lines(): def test_parse_strips_quotes_if_present(): """Model occasionally wraps tokens in quotes; handle both.""" turn = _parse_verb_output('@UPSERT "priya" "Priya"') - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] # -------------------------------------------------------------------- @@ -428,25 +428,25 @@ def test_parse_drops_lines_without_at_prefix(): This is free-form prose. @UPSERT kailash Kailash''' turn = _parse_verb_output(out) - assert turn.entities == [("ent:kailash", "Kailash")] + assert turn.entities == [("kailash", "Kailash")] def test_parse_accepts_space_after_at(): """'@ UPSERT priya' still parses (tolerant).""" turn = _parse_verb_output("@ UPSERT priya Priya") - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] def test_parse_bare_at_dropped(): turn = _parse_verb_output("@\n@UPSERT x X\n@") - assert turn.entities == [("ent:x", "X")] + assert turn.entities == [("x", "X")] def test_parse_english_drift_after_ops_ignored(): out = '''@UPSERT priya Priya Wait - that's not correct. Let me reconsider.''' turn = _parse_verb_output(out) - assert turn.entities == [("ent:priya", "Priya")] + assert turn.entities == [("priya", "Priya")] assert turn.statements == [] @@ -455,15 +455,17 @@ def test_parse_english_drift_after_ops_ignored(): # -------------------------------------------------------------------- def test_parse_edge_emits_create_edge(): + """@EDGE produces an entity_edges entry; synthesizer maps each + slug to the resolver's entity_id before rendering the DSL.""" turn = _parse_verb_output("@EDGE ent:priya ent:flipkart works_at") - assert turn.statements == [ - 'CREATE EDGE "ent:priya" -> "ent:flipkart" kind = "works_at"' - ] + assert turn.entity_edges == [("priya", "flipkart", "works_at")] + assert turn.statements == [] assert turn.entities == [] def test_parse_edge_needs_three_args(): turn = _parse_verb_output("@EDGE ent:a ent:b") + assert turn.entity_edges == [] assert turn.statements == [] @@ -566,7 +568,7 @@ def test_parse_mixed_ingest_and_query(): @UPSERT openai OpenAI @REMEMBER what I said about coffee''' turn = _parse_verb_output(out) - assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")] + assert turn.entities == [("priya", "Priya"), ("openai", "OpenAI")] assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10'] @@ -592,11 +594,10 @@ def test_parse_plain_verb_ignores_trailing_tokens(): def test_parse_edge_escapes_quotes_in_ids(): - """Quote-escape applies inside CREATE EDGE even if ids carry weird chars.""" + """Edge slugs + kind survive odd characters; synthesizer escapes + when it renders the final DSL.""" turn = _parse_verb_output('@EDGE ent:a ent:b weird"kind') - assert turn.statements == [ - 'CREATE EDGE "ent:a" -> "ent:b" kind = "weird\\"kind"' - ] + assert turn.entity_edges == [("a", "b", 'weird"kind')] # -------------------------------------------------------------------- @@ -643,7 +644,7 @@ def test_parse_count_nodes_and_edges(): def test_synthesize_appends_statements_verbatim(): turn = ParsedTurn( - entities=[("ent:x", "X")], + entities=[("x", "X")], statements=['REMEMBER "hello" LIMIT 3', 'SYS STATS'], ) dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="hi") @@ -676,23 +677,68 @@ def test_synthesize_minimal_turn_emits_only_message_node(): assert 'DOCUMENT "hi"' in dsl[0] -def test_synthesize_with_entities_emits_upsert_plus_matching_edge(): - turn = ParsedTurn(entities=[("ent:priya", "Priya"), ("ent:openai", "OpenAI")]) +def test_synthesize_with_entities_emits_mention_entity_and_refers_to(): + """Each entity slug yields: mention node + entity node (new) + + refers_to edge (mention->entity) + mentions edge (msg->mention). + With no gs passed, every mention mints a fresh entity.""" + turn = ParsedTurn(entities=[("priya", "Priya"), ("openai", "OpenAI")]) dsl = _synthesize_dsl(turn, msg_id="m:s1:0", session_id="s1", role="user", text="x") - assert len(dsl) == 1 + 2 + 2 - assert 'UPSERT NODE "ent:priya"' in dsl[1] - assert 'UPSERT NODE "ent:openai"' in dsl[2] - assert 'CREATE EDGE "m:s1:0" -> "ent:priya" kind = "mentions"' in dsl[3] - assert 'CREATE EDGE "m:s1:0" -> "ent:openai" kind = "mentions"' in dsl[4] + # 1 message + 2 * (1 mention + 1 entity + 1 refers_to + 1 mentions) = 9 + assert len(dsl) == 9 + text = "\n".join(dsl) + assert 'kind = "mention"' in text + assert 'kind = "entity"' in text + assert 'kind = "refers_to"' in text + assert 'kind = "mentions"' in text + assert 'mention:m:s1:0:priya:0' in text + assert 'mention:m:s1:0:openai:1' in text + assert 'canonical_name = "Priya"' in text + assert 'canonical_name = "OpenAI"' in text def test_synthesize_dedupes_duplicate_entities(): - turn = ParsedTurn(entities=[("ent:x", "X"), ("ent:x", "X")]) + """Same slug emitted twice in one turn yields exactly one mention + + one entity + one refers_to.""" + turn = ParsedTurn(entities=[("x", "X"), ("x", "X")]) + dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") + mentions = [d for d in dsl if 'kind = "mention"' in d] + entities = [d for d in dsl if 'kind = "entity"' in d] + refers = [d for d in dsl if 'kind = "refers_to"' in d] + assert len(mentions) == 1 + assert len(entities) == 1 + assert len(refers) == 1 + + +def test_synthesize_emits_entity_edges_after_resolution(): + """@EDGE between two upserted slugs renders as entity-to-entity + after the slug map is populated.""" + turn = ParsedTurn( + entities=[("priya", "Priya"), ("flipkart", "Flipkart")], + entity_edges=[("priya", "flipkart", "works_at")], + ) + dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") + edge_lines = [d for d in dsl + if d.startswith("CREATE EDGE") + and 'kind = "works_at"' in d] + assert len(edge_lines) == 1 + # Both endpoints must be entity:* ids, not slug literals. + assert 'entity:' in edge_lines[0] + assert 'ent:priya' not in edge_lines[0] + assert 'ent:flipkart' not in edge_lines[0] + + +def test_synthesize_drops_entity_edge_with_unknown_slug(): + """@EDGE references a slug not declared via @UPSERT; synthesizer + drops it (logs warning) rather than emitting a broken DSL line.""" + turn = ParsedTurn( + entities=[("alice", "Alice")], + entity_edges=[("alice", "ghost", "knows")], + ) dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") - upserts = [d for d in dsl if d.startswith("UPSERT")] - edges = [d for d in dsl if d.startswith("CREATE EDGE")] - assert len(upserts) == 1 - assert len(edges) == 1 + edge_lines = [d for d in dsl + if d.startswith("CREATE EDGE") + and 'kind = "knows"' in d] + assert edge_lines == [] def test_synthesize_belief_and_retract_use_same_fact_id(): @@ -708,29 +754,34 @@ def test_synthesize_belief_and_retract_use_same_fact_id(): def test_synthesize_escapes_quotes_in_text_and_name(): - turn = ParsedTurn(entities=[("ent:a", 'Alice "Ace"')]) + turn = ParsedTurn(entities=[("a", 'Alice "Ace"')]) dsl = _synthesize_dsl( turn, msg_id="m:0", session_id="s", role="user", text='She said "go".', ) - # Backslash-escapes in DSL string literal: - assert 'DOCUMENT "She said \\"go\\"."' in dsl[0] - assert 'name = "Alice \\"Ace\\""' in dsl[1] + text = "\n".join(dsl) + assert '\\"go\\"' in text + assert 'Alice \\"Ace\\"' in text def test_synthesize_all_together_contract(): - """End-to-end: messages + entity + belief + retract.""" + """End-to-end: message + mention/entity/refers_to + belief + retract.""" turn = ParsedTurn( - entities=[("ent:priya", "Priya")], + entities=[("priya", "Priya")], beliefs=[("fact:color", "green")], retracts=["fact:color"], ) dsl = _synthesize_dsl( turn, msg_id="m:0", session_id="s1", role="user", text="text", ) - # Order: CREATE NODE, UPSERTs, EDGEs, RETRACTs, ASSERTs - kinds = [d.split(maxsplit=2)[0] for d in dsl] - assert kinds == ["CREATE", "UPSERT", "CREATE", "RETRACT", "ASSERT"] + text = "\n".join(dsl) + assert 'kind = "message"' in text + assert 'kind = "mention"' in text + assert 'kind = "entity"' in text + assert 'kind = "refers_to"' in text + assert 'kind = "mentions"' in text + assert 'RETRACT "fact:color"' in text + assert 'ASSERT "fact:color"' in text def test_ingest_requires_msg_id(tmp_path: Path):