diff --git a/internal_documentation/ENTITY_RESOLUTION.md b/internal_documentation/ENTITY_RESOLUTION.md new file mode 100644 index 0000000..eef579e --- /dev/null +++ b/internal_documentation/ENTITY_RESOLUTION.md @@ -0,0 +1,681 @@ +# Entity Resolution Flow + +## Purpose + +Entity resolution is the canonicalization layer between extraction output and stored graph identity. + +Its job is to answer: + +> Does this extracted entity already exist in scope, or should it create a new canonical entity? + +This is the system that decides whether: + +- `Adarsh` +- `Adarsh Tadimari` +- `Hi Adarsh` +- `Adarsh's` + +are all one thing, multiple things, or partly garbage. + +The resolver is conservative on purpose. A false merge corrupts the graph more severely than a missed merge. + +## Inputs and Outputs + +Input: + +- candidate name +- candidate type +- candidate aliases +- scope identity +- optional description +- optional deterministic external IDs + +Output: + +- canonical entity +- `isNew` flag + +Possible outcomes: + +1. merge into an existing entity +2. create a new entity + +## High-Level Strategy + +Resolution is a cascade: + +1. deterministic external ID checks first when available +2. cheap deterministic text checks next +3. stronger but more expensive similarity checks later +4. embeddings only when needed + +The resolver does **not** immediately jump to vector similarity. + +## Resolver Flow + +```mermaid +flowchart TD + A["Incoming entity candidate"] --> X{"External IDs provided?"} + X -->|yes| Y["Phase -1: exact external ID lookup"] + Y -->|match| M["merge into matched entity"] + Y -->|conflict| Z["throw identity conflict"] + Y -->|no match| B["Phase 0: in-memory cache"] + X -->|no| B + B -->|match| M + B -->|no match| C["Phase 1: alias match against DB candidates"] + C -->|match| M + C -->|no match| D["Phase 2: normalized exact match"] + D -->|match| M + D -->|no match| E["Phase 2.5: trigram fuzzy match"] + E -->|match| M + E -->|no match| F["Phase 3: name embedding similarity"] + F -->|match| M + F -->|no match| G["Phase 3.5: description-confirmed near miss"] + G -->|match| M + G -->|no match| H["create new entity"] +``` + +## Scope + +Resolution is always scoped by identity: + +- `tenantId` +- `groupId` +- `userId` +- `agentId` +- `conversationId` + +This prevents cross-tenant or otherwise invalid merges. + +## Entity Type Contract + +Entity types come from the central ontology registry in: + +```txt +packages/sdk/src/index-engine/ontology.ts +``` + +Active entity types are: + +```txt +person +organization +location +product +technology +concept +event +meeting +document +project +issue +role +law_regulation +time_period +creative_work +``` + +`document` is the graph entity type for authored business materials such as contracts, RFPs, specs, reports, decks, transcripts, and plans. TypeGraph ingested sources remain storage objects and chunks; they are not graph entities. + +If a developer omits an entity type, TypeGraph uses the central default `concept`. Memory subjects with external IDs whose `identityType` is `user` default to `person`, which supports external-user memory flows where only an email address is known. + +## Deterministic External IDs + +External IDs are the strongest identity evidence TypeGraph supports. 
+ +They are structured, not bare strings: + +```ts +interface ExternalId { + id: string + type: string + identityType: 'tenant' | 'group' | 'user' | 'agent' | 'conversation' | 'entity' + encoding?: 'none' | 'sha256' + metadata?: Record +} +``` + +Examples: + +```ts +[ + { id: 'pat@example.com', type: 'email', identityType: 'user' }, + { id: 'U123', type: 'slack_user_id', identityType: 'user' }, + { id: 'pat-m', type: 'github_handle', identityType: 'user' }, +] +``` + +External IDs are stored in `typegraph_entity_external_ids` and looked up by exact indexed normalized value. + +Normalization examples: + +- email-like identifiers are lowercased +- GitHub handles are lowercased +- phone numbers strip punctuation +- `encoding: 'sha256'` assumes the incoming id is already encoded and lowercases the hash + +External IDs are scoped by the same identity fields as entity resolution: + +- `tenantId` +- `groupId` +- `userId` +- `agentId` +- `conversationId` + +They are identity-resolution anchors, not authorization credentials. + +## External ID Conflict Rules + +If an incoming external ID already belongs to an entity in scope, resolution uses that entity before fuzzy matching. + +If a developer tries to attach the same scoped external ID to a different entity, the store rejects it. The resolver must not silently reassign deterministic identity. + +This is intentional: + +- deterministic identity should beat fuzzy extraction +- reassignment should be explicit +- accidental merges should fail loudly + +## Developer Seeding + +Developers can seed entities with deterministic identifiers: + +```ts +await typegraph.graph.upsertEntity({ + name: 'Pat Example', + entityType: 'person', + externalIds: [ + { id: 'pat@example.com', type: 'email', identityType: 'user' }, + { id: 'U123', type: 'slack_user_id', identityType: 'user' }, + ], +}) +``` + +They can then seed facts or edges using entity refs and external IDs. Entity resolution should prefer the external ID match before name similarity. + +Memory subjects use the same shape: + +```ts +await typegraph.remember('Prefers SMS for urgent notices', { + tenantId: 'acme', + subject: { + externalIds: [{ id: 'pat@example.com', type: 'email', identityType: 'user' }], + entityType: 'person', + }, + visibility: 'tenant', +}) +``` + +That flow resolves or upserts an entity, stores the memory, and links: + +```txt +memory --ABOUT--> entity +``` + +via `typegraph_graph_edges`. + +## Alias Safety Model + +The resolver uses two different alias standards. + +### Display-safe alias + +This is the standard for storing aliases on the entity. + +It asks: + +- is this alias safe enough to show and persist? + +### Strong alias for merge + +This is the stricter standard used for identity matching. + +It asks: + +- is this alias strong enough to trust as merge evidence? + +That distinction matters because an alias can be okay to display but still too weak to use as canonical merge evidence. + +## Alias Validation + +### `isValidAlias(...)` + +Rejects aliases that are obviously bad: + +- empty or 1-character +- over 80 chars +- greetings like `hi adarsh` +- imperatives like `inform adarsh` +- quantifiers like `both adarsh` +- possessives like `adarsh's` +- URLs/emails +- lowercase pronouns +- pure numbers +- disambiguator-style parentheticals +- bare generic noun phrases like `the team` +- generic one-word nouns like `finals`, `mvp` + +### `isDisplayAliasSafe(...)` + +Builds on `isValidAlias(...)`. 
+ +For person aliases it adds stricter rules: + +- no sentence boundaries +- 1 to 5 tokens only +- no leading fragment words + +### `isStrongAliasForMerge(...)` + +Builds on `isDisplayAliasSafe(...)`. + +For people it becomes even stricter: + +- surname-only aliases are often rejected +- weak bare first-name aliases are rejected +- aliases that only match by a fragile surname pattern are rejected + +This is one of the key protections against merging every `Adarsh`-like mention into the wrong person. + +## Phase-by-Phase Behavior + +## Phase -1: Exact external ID lookup + +When external IDs are provided, the resolver asks the store for an exact scoped match using `findEntityByExternalId(...)`. + +Behavior: + +1. normalize each external ID +2. perform indexed lookup in `typegraph_entity_external_ids` +3. if one entity matches, use that entity +4. merge safe incoming name/type/alias/description data into the matched entity +5. if external IDs resolve to conflicting entities, throw +6. if none resolve, continue through normal text-based resolution + +This phase is used by: + +- developer entity seeding +- developer fact seeding with entity refs +- memory subject resolution +- query entity-scope resolution + +It is not fuzzy and must remain cheap. + +## Phase 0: In-memory cache + +The resolver keeps a session-local map from normalized names and strong aliases to canonical entities. + +Purpose: + +- catch duplicates within the same ingest session +- avoid repeat DB lookups +- reduce timing-race duplicates across nearby triple writes + +Behavior: + +1. normalize incoming name +2. check cache by canonical name +3. check cache by incoming strong aliases +4. require type compatibility +5. if matched, merge immediately + +This is the cheapest path. + +## Phase 1: Alias match against DB candidates + +If the store supports `findEntities(...)`, the resolver fetches likely text matches from the DB. + +Then `findByAlias(...)` compares: + +- incoming name + strong aliases +- candidate name + candidate strong aliases + +All comparisons are done on normalized forms. + +If any incoming normalized name matches any candidate normalized name or strong alias, the entity merges. + +## Phase 2: Normalized exact match + +This catches case and punctuation variants that are semantically identical. + +Examples: + +- `OpenAI` vs `openai` +- `J.K. Rowling` vs `JK Rowling` + +This phase compares normalized canonical names and normalized strong aliases. + +## Phase 2.5: Fuzzy trigram match + +If exact normalized matching fails, the resolver tries trigram Jaccard similarity. + +Purpose: + +- catch abbreviations +- catch spacing/punctuation reorderings +- catch close textual variants + +Examples: + +- `NY Times` vs `New York Times` +- `JK Rowling` vs `J. K. Rowling` + +This phase is guarded. It still requires: + +- compatible types +- for people, non-weak merge evidence +- no conflicting distinguishers + +Threshold: + +- `FUZZY_THRESHOLD = 0.85` + +## Phase 3: Name embedding similarity + +If deterministic and fuzzy matching fail, the resolver embeds the incoming name and searches entity embeddings. + +This is the first real semantic merge stage. + +But it is still heavily guarded. 
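+The ordering matters: the deterministic guards described in the next few subsections all run before cosine similarity is consulted, and any one of them can veto the merge. A minimal TypeScript sketch of that shape, assuming the guards are supplied as predicates (the helper names mirror the guards documented below, but the signatures and wiring are illustrative, not the resolver's actual code):
+
+```ts
+// Illustrative sketch only. Guard names mirror the resolver helpers described in the
+// following subsections; the signatures, wiring, and types are assumptions.
+type NameGuard = (incomingName: string, candidateName: string) => boolean
+
+interface Phase3Guards {
+  typesCompatible: (incomingType: string, candidateType: string) => boolean
+  hasSharedNameToken: NameGuard
+  hasConflictingDistinguishers: NameGuard
+  hasMatchingLastToken: NameGuard
+  hasWeakPersonNameMergeEvidence: NameGuard
+}
+
+interface NamedCandidate {
+  name: string
+  entityType: string
+  nameEmbedding: number[]
+}
+
+function cosineSimilarity(a: number[], b: number[]): number {
+  let dot = 0
+  let normA = 0
+  let normB = 0
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i]
+    normA += a[i] * a[i]
+    normB += b[i] * b[i]
+  }
+  const denom = Math.sqrt(normA) * Math.sqrt(normB)
+  return denom === 0 ? 0 : dot / denom
+}
+
+function phase3WouldMerge(
+  incoming: NamedCandidate,
+  candidate: NamedCandidate,
+  guards: Phase3Guards,
+  similarityThreshold = 0.85,
+): boolean {
+  // Deterministic guards run first; the embedding score never overrules them.
+  if (!guards.typesCompatible(incoming.entityType, candidate.entityType)) return false
+  if (!guards.hasSharedNameToken(incoming.name, candidate.name)) return false
+  if (guards.hasConflictingDistinguishers(incoming.name, candidate.name)) return false
+  if (incoming.entityType === 'person' || candidate.entityType === 'person') {
+    if (!guards.hasMatchingLastToken(incoming.name, candidate.name)) return false
+    if (guards.hasWeakPersonNameMergeEvidence(incoming.name, candidate.name)) return false
+  }
+  // Only when every guard passes does cosine similarity decide the merge.
+  return cosineSimilarity(incoming.nameEmbedding, candidate.nameEmbedding) >= similarityThreshold
+}
+```
+
+Cosine similarity is the last check in the chain, not the first.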
+ +### Person-specific guards + +For `person` entities, the resolver requires: + +- matching last token when both names have 2+ tokens +- no weak person-name evidence + +That means: + +- `Adarsh Tadimari` and `Adarsh Revy` should not merge +- `Chris Mullin` and `Christopher Paul Mullin` may merge + +### Cross-type guard + +`typesCompatible(...)` prevents merges like: + +- person into location +- organization into event + +Fallback types are allowed to merge into more specific types when stronger evidence matches. This keeps developer-seeded or memory-created entities refinable without reintroducing `entity` as an extracted ontology type. + +### Shared token guard + +`hasSharedNameToken(...)` requires at least one meaningful shared token after stop-word removal. + +Examples: + +- `Chris Mullin` and `Christopher Paul Mullin` -> pass +- `United States` and `United Kingdom` -> fail because `united` is ignored + +### Conflicting distinguisher guard + +`hasConflictingDistinguishers(...)` blocks merges when names differ on: + +- years +- versions +- ordinals +- opposing qualifiers + +Examples: + +- `Python 2` vs `Python 3` -> conflict +- `2023 NBA Finals` vs `2024 NBA Finals` -> conflict +- `Eastern Conference` vs `Western Conference` -> conflict +- `Senior Team` vs `Junior Team` -> conflict + +Only if the candidate survives all those guards does cosine similarity decide the merge. + +Default threshold: + +- `similarityThreshold = 0.85` + +## Phase 3.5: Description-confirmed near miss + +This phase is used only if: + +- name similarity is suggestive but below the direct merge threshold +- an incoming description exists +- candidate descriptions and description embeddings exist + +Purpose: + +- catch legitimate near misses where name form differs but semantic identity is likely the same + +Thresholds: + +- `NEAR_MISS_NAME_THRESHOLD = 0.45` +- `DESC_SIMILARITY_THRESHOLD = 0.8` + +This phase still enforces: + +- type compatibility +- person surname/weak-name guards +- no conflicting distinguishers +- shared meaningful name token + +So it is not a free semantic merge. It is a guarded confirmation step. + +## Create-New Path + +If all phases fail: + +1. embed the name if not already embedded +2. embed the description if one exists +3. create a new entity +4. cache it +5. return `isNew: true` + +The created entity stores: + +- canonical name +- type +- aliases +- properties.description +- name embedding +- optional description embedding +- scope +- temporal metadata + +## Merge Path + +If any phase matches, the resolver merges incoming data into the existing entity. + +```mermaid +flowchart TD + A["Matched existing entity"] --> B["retain existing canonical id"] + B --> C["collect existing safe aliases"] + C --> D["add incoming name as alias if safe and different"] + D --> E["add incoming aliases if safe"] + E --> F["merge descriptions conservatively"] + F --> G["re-embed description only if it changed"] + G --> H["keep existing specific type unless current type is generic"] + H --> I["return merged entity"] +``` + +## Merge Rules In Detail + +### Canonical identity stays stable + +The existing entity id remains the canonical node. + +The resolver does not rename ids on merge. + +### Incoming canonical name may become an alias + +If the incoming name differs from the stored canonical name, the incoming name may be added as an alias. + +But only if it is display-safe. + +### Incoming aliases are filtered again + +No alias is trusted just because extraction produced it. 
+ +Incoming aliases are re-validated before they are stored. + +### Type specificity is preserved + +If the existing type is specific and the incoming type is generic, keep the existing specific type. + +If the existing type is generic and the incoming type is specific, promote to the specific type. + +### Description merge is conservative + +Descriptions are not blindly concatenated anymore. + +The merge path: + +1. split descriptions into sentences +2. remove low-value description sentences +3. dedupe normalized sentences +4. append until max length +5. trim on a word boundary if needed + +Cap: + +- `MAX_DESCRIPTION_LENGTH = 1200` + +## Low-Value Description Filtering + +`isLowValueEntityDescription(...)` removes boilerplate like: + +- creator of the task +- creator of the source +- assignee responsible +- primary contact and requester +- tagged in the +- copied on the +- for visibility + +It also rejects very short role-only descriptions dominated by words like: + +- creator +- assignee +- requester +- participant +- individual +- professional + +This is one of the reasons repeated role spam should stop dominating entity descriptions. + +## Different-Person Description Guard + +`descriptionAppearsAboutDifferentPerson(...)` tries to avoid poisoning a person entity with a description that is clearly about someone related to them. + +It looks for patterns like: + +- `father of ` +- `partner of ` +- `cousin of ` + +If such a pattern appears, the incoming description is not merged into the existing person. + +## Person-Specific Merge Defenses + +These are the main protections that keep people from collapsing incorrectly. + +### `hasMatchingLastToken(...)` + +If both names look like multi-token person names, the last token must match. + +Examples: + +- `Adarsh Tadimari` vs `Adarsh Revy` -> fail +- `Chris Mullin` vs `Christopher Paul Mullin` -> pass + +### `hasWeakPersonNameMergeEvidence(...)` + +Rejects weak patterns such as: + +- surname-only weak matches +- single-token weak matches against a full multi-token name + +This is the mechanism that prevents overly aggressive merges from names like `Adarsh`. + +## Why Bad Aliases Like `Hi Adarsh` Should Stop Appearing + +Those strings are blocked in multiple places: + +1. extractor post-processing alias filtering +2. resolver `isValidAlias(...)` +3. resolver `isDisplayAliasSafe(...)` +4. resolver `isStrongAliasForMerge(...)` + +That layered defense is intentional. Alias garbage is damaging both for: + +- public graph exploration +- internal resolution accuracy + +## Resolver Design Principles + +The resolver follows these rules: + +1. prefer deterministic evidence over embeddings +2. never silently reassign a scoped external ID +3. treat people as the highest-risk merge class +4. do not trust extracted aliases blindly +5. do not trust descriptions blindly +6. preserve canonical ids once chosen +7. allow generic -> specific type upgrades +8. block merges with distinguishers that indicate separate instances + +## Public Entity Maintenance + +Entity resolution handles automatic canonicalization at write time. Public graph maintenance handles deliberate corrections after the fact: + +```ts +await typegraph.graph.mergeEntities({ + sourceEntityId: 'ent_duplicate', + targetEntityId: 'ent_canonical', + tenantId: 'acme', +}) + +await typegraph.graph.deleteEntity('ent_bad', { + tenantId: 'acme', + mode: 'invalidate', +}) +``` + +### Merge + +`mergeEntities` is transactional at the store layer. 
+ +It: + +- moves source aliases, properties, and external IDs onto the target +- fails if a moved external ID conflicts with a third entity +- rewrites entity edges, typed graph edges, facts, entity chunk mentions, and memory/entity associations +- collapses duplicate facts and edges +- invalidates self-edges created by the merge +- marks the source entity as `status: 'merged'`, sets `mergedIntoEntityId`, and sets an invalidation timestamp + +The target canonical id stays stable. + +### Delete + +`deleteEntity` supports: + +- `mode: 'invalidate'`, the default +- `mode: 'purge'` + +`invalidate` marks the entity inactive and invalidates associated graph/fact records while preserving provenance. `purge` physically removes the entity row and graph references. Neither mode deletes chunks, ingested sources, or memory records themselves. + +Delete mode is called `invalidate`, not `tombstone`. + +### External IDs During Maintenance + +External IDs remain deterministic identity anchors. Merges may move them from source to target only when no third entity already owns the same scoped identifier. Deletes remove or invalidate entity references, but they do not convert external IDs into authorization credentials. + +## Practical Mental Model + +The resolver is not "find the nearest entity embedding". + +It is: + +1. try exact external identity evidence +2. try strong alias evidence +3. try normalized and fuzzy textual evidence +4. try semantic evidence under strict guards +5. otherwise create a new canonical node + +That is the right way to think about why the graph can improve without collapsing every person with a common first name into the same node. diff --git a/internal_documentation/GRAPH_EXTRACTION.md b/internal_documentation/GRAPH_EXTRACTION.md new file mode 100644 index 0000000..86bd92b --- /dev/null +++ b/internal_documentation/GRAPH_EXTRACTION.md @@ -0,0 +1,624 @@ +# Graph Extraction Flow + +## Purpose + +Graph extraction converts indexed text into durable graph and memory-compatible structures that support: + +- entity exploration +- fact lookup +- graph-augmented chunk retrieval +- entity-scoped query +- entity-aware memory +- profile consolidation + +The extraction system writes three classes of durable knowledge data: + +1. canonical entities +2. canonical facts and entity-entity edges +3. entity-to-chunk graph edges + +It also writes raw mention evidence: + +1. `typegraph_entity_chunk_mentions` + +Chunks are the only retrievable text unit. The graph no longer creates or manages passage nodes or passage-specific edge tables. + +## Important Clarification: Does Post-Processing Add LLM Calls? + +No. + +Post-processing is deterministic TypeScript. It runs after the extractor receives JSON from the model. + +LLM call count by extractor mode: + +- `twoPass = true`: + - 1 LLM call for entity extraction + - 1 LLM call for relationship extraction +- `twoPass = false`: + - 1 combined LLM call + +Post-processing adds: + +- zero extra LLM calls +- zero embedding calls by itself + +Embedding calls happen later when: + +- new entities need name embeddings +- descriptions need description embeddings +- new facts need fact embeddings + +## Definitions + +### Chunk + +A stored piece of source text in the vector index. + +Chunk identity is: + +- `bucketId` +- `sourceId` +- `chunkIndex` +- optional `embeddingModel` +- optional `chunkId` + +### Entity + +A canonical graph node representing a resolved typed thing from the central ontology. + +Entities can come from extraction or from developer seeding. 
+ +Active entity types are: + +```txt +person +organization +location +product +technology +concept +event +meeting +document +project +issue +role +law_regulation +time_period +creative_work +``` + +`document` is the graph entity type for authored business materials such as contracts, RFPs, specs, reports, decks, transcripts, and plans. TypeGraph ingested sources remain storage objects with `sourceId`; they are not graph entity types. + +### Fact + +A persisted, normalized relation record derived from a canonical entity-entity semantic edge. + +### Typed Graph Edge + +`typegraph_graph_edges` stores traversable associations with typed endpoints: + +- `entity -> entity` +- `entity -> chunk` +- `memory -> entity` + +Supported node types: + +- `entity` +- `chunk` +- `memory` + +### Entity-Chunk Mention Evidence + +`typegraph_entity_chunk_mentions` stores raw mention evidence: + +- exact surface text +- normalized surface text +- mention type +- confidence +- chunk location + +It is not the traversal hot path. Traversal uses aggregated typed graph edges. + +## Central Ontology Registry + +Ontology state lives in one SDK source of truth: + +```txt +packages/sdk/src/index-engine/ontology.ts +``` + +That file owns: + +- entity type specs +- canonical predicate specs +- predicate aliases +- inverse direction and swap metadata +- symmetric predicate metadata +- prompt grouping +- alias relation cues +- soft domain/range validation +- predicate normalization + +Other extraction, query, normalization, and graph modules import derived helpers from the registry. Do not add local predicate lists or duplicated entity type lists elsewhere. + +The canonical predicate vocabulary is compact on purpose: + +- Core/taxonomy: `IS_A`, `PART_OF`, `CONTAINS`, `EQUIVALENT_TO`, `RELATED_TO` +- People/roles/orgs: `WORKS_FOR`, `WORKS_AS`, `REPORTS_TO`, `MANAGES`, `FOUNDED`, `LEADS`, `ADVISES`, `MEMBER_OF`, `REPRESENTS`, `INVESTED_IN`, `MARRIED`, `DIVORCED`, `PARENT_OF`, `CHILD_OF`, `SIBLING_OF`, `MENTORED` +- Business/org: `ACQUIRED`, `MERGED_WITH`, `PARTNERED_WITH`, `COMPETES_WITH`, `FUNDED`, `SUPPLIED`, `SUED`, `REGULATED_BY`, `OWNS` +- Product/technical: `USES`, `IMPLEMENTS`, `INTEGRATES_WITH`, `REQUIRES`, `COMPATIBLE_WITH`, `MIGRATED_FROM`, `DEPLOYED_AT`, `REPLACES`, `BASED_ON` +- Work/project/issue/document: `ASSIGNED_TO`, `BLOCKS`, `DUPLICATES`, `RESOLVES`, `CREATED`, `AUTHORED`, `SIGNED`, `APPROVED`, `REFERENCES`, `DESCRIBES`, `SUPPORTS`, `OPPOSES` +- Event/location/legal: `ATTENDED`, `ORGANIZED`, `SPOKE_AT`, `OCCURRED_AT`, `OCCURRED_IN`, `LOCATED_IN`, `OPERATES_IN`, `HEADQUARTERED_IN`, `GOVERNS`, `PROHIBITS`, `PERMITS`, `AMENDS`, `REPEALS`, `CAUSED`, `PRECEDED`, `FOLLOWED` +- Historical/narrative: `KILLED`, `BETRAYED`, `RESCUED`, `EXILED_TO`, `RULED`, `CONQUERED`, `IMPRISONED_IN`, `FOUGHT_IN` + +Near-duplicates normalize into canonical predicates. For example, `WORKED_FOR` becomes `WORKS_FOR` with `temporalStatus: 'former'`, `CO_FOUNDED` becomes `FOUNDED`, and `WRITTEN_BY` becomes `AUTHORED` with subject/object swap. + +Alias relation cues such as `KNOWN_AS`, `AKA`, `ALIAS`, and `CALLED` are not graph predicates. They are routed into entity aliases during graph writes. `NAMED_AFTER` and similar cues are rejected as graph predicates rather than materialized as claims. 
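+To make that concrete, here is a minimal normalization sketch, assuming a hypothetical lookup helper (the mapping entries mirror the examples above, but `normalizeExamplePredicate` and its result shape are illustrative, not the registry's actual exports):
+
+```ts
+// Illustrative sketch: the mapping mirrors the normalization examples described above,
+// but the helper name and result shape are assumptions, not the registry's real API.
+interface NormalizedPredicate {
+  predicate: string
+  swapEndpoints: boolean
+  temporalStatus?: 'current' | 'former'
+  routeToAlias: boolean
+  rejected: boolean
+}
+
+const EXAMPLE_NORMALIZATIONS: Record<string, Partial<NormalizedPredicate>> = {
+  WORKED_FOR: { predicate: 'WORKS_FOR', temporalStatus: 'former' },
+  CO_FOUNDED: { predicate: 'FOUNDED' },
+  WRITTEN_BY: { predicate: 'AUTHORED', swapEndpoints: true },
+  KNOWN_AS: { routeToAlias: true }, // alias cue, routed to entity aliases
+  AKA: { routeToAlias: true }, // alias cue, routed to entity aliases
+  NAMED_AFTER: { rejected: true }, // rejected rather than materialized as a claim
+}
+
+function normalizeExamplePredicate(raw: string): NormalizedPredicate {
+  const key = raw.trim().toUpperCase().replace(/[\s-]+/g, '_')
+  const match = EXAMPLE_NORMALIZATIONS[key] ?? {}
+  return {
+    predicate: match.predicate ?? key,
+    swapEndpoints: match.swapEndpoints ?? false,
+    ...(match.temporalStatus ? { temporalStatus: match.temporalStatus } : {}),
+    routeToAlias: match.routeToAlias ?? false,
+    rejected: match.rejected ?? false,
+  }
+}
+
+// Tense lives in metadata, not in predicate identity:
+// normalizeExamplePredicate('worked for')
+// -> { predicate: 'WORKS_FOR', temporalStatus: 'former', swapEndpoints: false, ... }
+```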
+ +## Removed Passage Model + +The following are no longer created or managed: + +- `typegraph_passage_nodes` +- `typegraph_passage_entity_edges` +- `KnowledgeGraphBridge.upsertPassageNodes` +- `searchGraphPassages` +- `getPassagesForEntity` + +Existing legacy passage tables may remain in a database, but `typegraph.deploy` does not create them and does not drop them. + +## End-to-End Ingestion and Extraction Order + +```mermaid +sequenceDiagram + participant U as User/Caller + participant E as IndexEngine + participant V as Vector Store + participant T as TripleExtractor + participant G as KnowledgeGraphBridge + participant R as EntityResolver + participant S as MemoryStore + + U->>E: ingest / ingestWithChunks + E->>E: sanitize source and chunks + E->>E: preprocess text for embeddings + E->>E: embed chunk batch + E->>V: upsert chunk rows + loop each chunk + E->>T: extractFromChunk(chunk, entityContext, title, identity) + T->>T: LLM extraction + T->>T: deterministic post-processing + T->>G: addEntityMentions(...) + G->>R: resolve entity + R->>S: find/search/upsert entity data as needed + G->>S: upsert entity_chunk_mentions + G->>S: upsert entity->chunk graph edge + T->>G: addTriple(...) + G->>R: resolve subject/object + G->>S: upsert entity->entity graph edge + G->>S: upsert fact record + G->>S: update profile evidence + end + E->>E: carry forward entityContext across chunks +``` + +## Step-by-Step Flow + +### 1. Source and chunks are sanitized + +The engine sanitizes: + +- source fields +- chunk text +- chunk metadata + +This happens before embedding and before extraction. + +### 2. Chunk embeddings are created + +The engine computes embeddings for all chunks that will be indexed. + +These embeddings are used for: + +- semantic retrieval +- dense chunk seeding during graph query + +### 3. Chunk rows are persisted first + +The chunk rows go into the vector-backed chunk table before graph extraction starts. + +This ordering matters because graph retrieval reads chunk content from the vector adapter. The graph stores chunk refs, not chunk text. + +### 4. Extraction runs chunk by chunk + +The engine iterates chunks in order and calls: + +```ts +TripleExtractor.extractFromChunk(...) +``` + +It passes forward `entityContext` built from earlier successful chunks in the same source. + +### 5. Entity context is carried across chunks + +`entityContext` is a bounded list of previously extracted canonical entities from the same source. + +It helps later chunks avoid duplicate entity creation when they refer to earlier entities with: + +- surname-only references +- abbreviations +- shortened forms +- pseudonyms + +This is not global memory. It is per-source contextual carry-forward. + +### 6. The extractor runs one-pass or two-pass LLM extraction + +Current default: + +- `twoPass = true` + +That means: + +1. entity extraction prompt runs first +2. relationship extraction prompt runs second using the extracted entity list + +The second pass is constrained to entities from the first pass. This reduces arbitrary relationship endpoints. + +### 7. Deterministic post-processing runs + +After raw JSON returns, the extractor runs deterministic post-processing. 
+ +```mermaid +flowchart TD + A["Raw LLM JSON"] --> B["sanitize name/type/description/aliases"] + B --> C["drop invalid entity types or empty names"] + C --> D["filter aliases"] + D --> E["augment aliases from source text"] + E --> F["promote fuller alias to canonical name or reject weak mononym"] + F --> G["dedupe aliases"] + G --> H["build nameMap from original names and aliases to canonical name"] + H --> I["rewrite relationship subject/object through nameMap"] + I --> J["drop relationships with missing canonical endpoints or empty predicate"] + J --> K["processed entities + processed relationships"] +``` + +Post-processing handles: + +- Unicode/control-character cleanup +- invalid entity type removal +- unsafe alias filtering +- source-text alias augmentation +- canonical-name promotion +- relationship endpoint rewriting + +## Entity Mention Storage + +When an entity mention is accepted, `KnowledgeGraphBridge.resolveAndStoreEntity(...)` does two separate writes. + +### 1. Raw Mention Evidence + +It writes one or more `typegraph_entity_chunk_mentions` rows. + +These rows are detailed evidence: + +- `entityId` +- `bucketId` +- `sourceId` +- `chunkIndex` +- `mentionType` +- `surfaceText` +- `normalizedSurfaceText` +- `confidence` + +Use this table for: + +- debugging extraction +- alias learning +- provenance +- future backfills + +Do not use it as the online traversal table. + +### 2. Traversable Entity-to-Chunk Edge + +It writes a typed graph edge: + +```txt +entity --MENTIONED_IN--> chunk +``` + +The edge is stored in `typegraph_graph_edges` with: + +- `source_type = 'entity'` +- `source_id = entityId` +- `target_type = 'chunk'` +- `target_id = chunk node id` +- `relation = 'MENTIONED_IN'` +- chunk ref columns for the chunk endpoint +- scope identity columns +- visibility +- evidence/properties + +The properties include aggregated mention metadata such as: + +- mention count +- confidence +- surface texts +- mention types + +This edge is the durable bridge from entities to chunks for graph retrieval. + +## Triple and Fact Storage + +When a relationship is accepted, `KnowledgeGraphBridge.addTriple(...)`: + +1. resolves subject entity +2. resolves object entity +3. normalizes predicate +4. routes alias cue predicates into entity aliases instead of graph edges +5. rejects generic or invalid predicates +6. soft-validates predicate domain/range +7. rejects self-edges +8. writes an entity-to-entity graph edge +9. writes a compact fact record +10. updates entity profile evidence + +Soft domain/range validation rejects invalid predicates but does not hard-block plausible extracted or developer-seeded facts solely because entity types are imperfect. Mismatches are stored in edge properties as validation metadata and receive reduced weight. + +Tense is metadata, not predicate identity. Use canonical predicates for both current and former facts, with `temporalStatus`, `validFrom`, and `validTo` carrying temporal meaning. + +Entity-to-entity graph edges are stored in `typegraph_graph_edges` with: + +- `source_type = 'entity'` +- `target_type = 'entity'` +- `relation` +- `weight` +- `properties` +- scope identity columns +- visibility +- evidence +- temporal columns + +Fact records are stored in `typegraph_fact_records`. 
They support: + +- vector search +- keyword search through `search_vector` +- source/target entity filtering +- graph query seeding +- direct fact results for semantic/keyword query +- invalidation through `invalid_at` + +## Developer Seeding + +Developers can seed: + +- entities +- edges +- facts +- external IDs for entities + +Seeded entities can include deterministic external IDs: + +```ts +await typegraph.graph.upsertEntity({ + name: 'Pat Example', + entityType: 'person', + externalIds: [ + { id: 'pat@example.com', type: 'email', identityType: 'user' }, + { id: 'U123', type: 'slack_user_id', identityType: 'user' }, + ], +}) +``` + +These external IDs are used before fuzzy/probabilistic matching during entity resolution. + +## Storage Tables + +### `typegraph_semantic_entities` + +Canonical entities. + +Important fields: + +- `id` +- `name` +- `entity_type` +- `aliases` +- `properties` +- embeddings +- scope identity columns +- visibility +- temporal columns +- `status` +- `merged_into_entity_id` +- `deleted_at` + +### `typegraph_entity_external_ids` + +Deterministic entity identifiers. + +Important fields: + +- `entity_id` +- `identity_type` +- `type` +- `id_value` +- `normalized_value` +- `encoding` +- metadata +- scope identity columns + +Lookups are exact and indexed. Query hot paths do not fuzzy-match external IDs. + +### `typegraph_graph_edges` + +Canonical typed graph edge table. + +Important fields: + +- `source_type` +- `source_id` +- `target_type` +- `target_id` +- `relation` +- `weight` +- `properties` +- `evidence` +- source chunk ref columns +- target chunk ref columns +- scope identity columns +- visibility +- temporal columns + +Supported endpoint types: + +- `entity` +- `chunk` +- `memory` + +### `typegraph_entity_chunk_mentions` + +Raw mention evidence table. + +Important fields: + +- `entity_id` +- `bucket_id` +- `source_id` +- `chunk_index` +- `mention_type` +- `surface_text` +- `normalized_surface_text` +- `confidence` + +### `typegraph_fact_records` + +Searchable fact records derived from entity-to-entity edges. + +Important fields: + +- `edge_id` +- `source_entity_id` +- `target_entity_id` +- `relation` +- `fact_text` +- `fact_search_text` +- embedding +- `search_vector` +- scope identity columns +- visibility +- `invalid_at` + +### Legacy Orphaned Passage Tables + +These may exist in older databases: + +- `typegraph_passage_nodes` +- `typegraph_passage_entity_edges` + +They are no longer created or managed by deploy. They are left orphaned and should not be used by current graph/query paths. + +## Backfill + +Graph backfill now creates: + +- entity-to-chunk typed graph edges from `typegraph_entity_chunk_mentions` +- fact records from existing entity-to-entity semantic edges +- entity profile updates from fact evidence + +Backfill result shape: + +```ts +interface GraphBackfillResult { + entityChunkEdgesUpserted: number + factRecordsUpserted: number + entityProfilesUpdated: number + batches: number +} +``` + +Backfill no longer creates passage nodes. + +## Entity-Aware Memory Integration + +Memory records can be linked to entities through typed graph edges: + +```txt +memory --ABOUT--> entity +``` + +`remember(...)` can accept a subject: + +```ts +await typegraph.remember('Prefers SMS for urgent notices', { + tenantId: 'acme', + subject: { + externalIds: [{ id: 'pat@example.com', type: 'email', identityType: 'user' }], + entityType: 'person', + }, + visibility: 'tenant', +}) +``` + +The memory layer: + +1. resolves/upserts the subject entity by external ID +2. 
stores the memory +3. writes a `memory -> entity` `ABOUT` edge + +`forget(memoryId)` invalidates the memory and invalidates graph edges for that memory node. + +`correct(...)` and conversation memory accept the same subject shape and constrain contradiction checks to the intended entity context when possible. + +## Entity Maintenance + +Public graph APIs now expose safe entity health operations: + +```ts +await typegraph.graph.mergeEntities({ + sourceEntityId: 'ent_duplicate', + targetEntityId: 'ent_canonical', + tenantId: 'acme', +}) + +await typegraph.graph.deleteEntity('ent_bad', { + tenantId: 'acme', + mode: 'invalidate', +}) +``` + +`mergeEntities` transactionally moves source aliases, properties, external IDs, facts, entity edges, typed graph edges, entity-chunk mentions, and memory/entity associations onto the target. It collapses duplicate records, invalidates self-edges created by the merge, and marks the source entity as `status: 'merged'` with `mergedIntoEntityId`. + +`deleteEntity(..., { mode: 'invalidate' })` marks the entity invalid and invalidates associated facts/edges while preserving provenance. `mode: 'purge'` physically removes entity rows and graph references. Delete never removes chunks, ingested sources, or memory records themselves. + +## Latency Guardrails + +Extraction and query performance depend on these invariants: + +- chunk content stays in the vector adapter +- graph edges store chunk refs, not chunk content +- external ID lookup is exact and indexed +- graph traversal reads `typegraph_graph_edges`, not raw mention rows +- chunk filtering uses `(bucket_id, source_id, chunk_index)` +- direct semantic/keyword knowledge search does not fetch chunk content + +## Cloud Backend Checklist + +Cloud backend implementation needs to mirror the SDK behavior: + +- stop creating managed passage tables +- leave legacy passage tables orphaned +- create and query `typegraph_graph_edges` +- support typed endpoints for `entity`, `chunk`, and `memory` +- store chunk ref columns for chunk endpoints +- mirror the central ontology registry and avoid duplicated predicate/type lists +- expose entity external ID upsert/lookup APIs +- expose transactional `mergeEntities` and `deleteEntity` APIs +- expose `getChunksForEntity`, not `getPassagesForEntity` +- implement `resolveEntityScope` +- implement direct `searchKnowledge` +- implement `searchGraphChunks` +- link memory records to entities through `memory -> entity` graph edges +- invalidate memory-origin graph edges on forget diff --git a/internal_documentation/GRAPH_QUERY.md b/internal_documentation/GRAPH_QUERY.md new file mode 100644 index 0000000..5e496a0 --- /dev/null +++ b/internal_documentation/GRAPH_QUERY.md @@ -0,0 +1,490 @@ +# Graph Query Flow + +## Purpose + +TypeGraph has four retrieval signals that answer different questions: + +- Semantic search: "Which chunks look similar to the query embedding?" +- Keyword search: "Which chunks contain the lexical terms?" +- Memory search: "Which memories are relevant to this interaction?" +- Graph search: "Which chunks are connected to the entities and facts implied by this query?" + +The current graph query path runs over a heterogeneous graph, but chunks are now the only retrievable text unit. There are no managed passage nodes and no passage result APIs. 
+ +The graph contributes: + +- entity nodes for canonical identity and traversal +- fact records for high-precision query anchoring +- typed graph edges for entity-to-entity, entity-to-chunk, and memory-to-entity associations +- chunk refs for the final readout into the vector adapter's chunk table + +The caller receives ranked chunks, facts, entities, memories, and optional formatted context. + +## Public API Shape + +Graph-backed retrieval is part of the unified query API: + +```ts +const response = await typegraph.query('Who founded Anthropic?', { + signals: { semantic: true, keyword: true, graph: true }, + graphReinforcement: 'prefer', + context: { + format: 'xml', + sections: ['chunks', 'facts', 'entities'], + maxTotalTokens: 6000, + }, +}) + +console.log(response.results.chunks) +console.log(response.results.facts) +console.log(response.results.entities) +console.log(response.context) +``` + +Graph inspection methods are still available, but they serve a different purpose: + +```ts +await typegraph.graph.searchFacts('Anthropic founded by', { tenantId, limit: 10 }) +await typegraph.graph.searchEntities('Anthropic', { tenantId }, { limit: 5 }) +await typegraph.graph.explainQuery('Who founded Anthropic?', { tenantId }) +await typegraph.graph.explore('Who founded Anthropic?', { + tenantId, + include: { entities: true, facts: true, chunks: true }, + explain: true, +}) +await typegraph.graph.getChunksForEntity('ent_anthropic', { tenantId, limit: 10 }) +``` + +Use `query(..., { signals: { graph: true } })` when the product needs retrieval results, answer context, or the graph evidence selected for a query. + +Use `graph.explore(...)` to inspect graph state, anchors, facts, and relationship settings. It is not the graph retrieval path used by `query()`. + +## Query vs Explore + +| API | Main Question | Return Shape | Uses PPR? | +| --- | --- | --- | --- | +| `typegraph.query(..., { signals.graph: true })` | Which chunks should I give an LLM? | `results.chunks`, `results.facts`, `results.entities`, optional `context` | Yes | +| `typegraph.graph.explore(...)` | What does the graph know around this relationship question? | parsed intent, anchors, entities, facts, optional chunks, optional trace | No | + +## Core Concepts + +### Entity + +A canonical node representing a resolved typed thing from the central ontology. + +Entities can be created by extraction, seeded by developers, or resolved/upserted from deterministic external IDs. + +Active entity types are: + +```txt +person +organization +location +product +technology +concept +event +meeting +document +project +issue +role +law_regulation +time_period +creative_work +``` + +`document` is used for extracted business materials. TypeGraph ingested sources are storage records and chunks, not graph entities. + +### Chunk + +A stored piece of source text in the vector adapter. Chunks are the only text readout surface for query results. + +Chunk identity is: + +- `bucketId` +- `sourceId` +- `chunkIndex` +- optional `embeddingModel` +- optional stable `chunkId` + +### Chunk Ref + +A lightweight pointer to a chunk: + +```ts +interface ChunkRef { + bucketId: string + sourceId: string + chunkIndex: number + embeddingModel?: string + chunkId?: string +} +``` + +Graph APIs use chunk refs for filtering and readout, but the vector adapter owns chunk content retrieval. + +### Fact + +A persisted, structured record derived from a canonical semantic edge. 
+ +It is: + +- directional +- normalized +- compact +- searchable by embedding and keyword + +Example: + +```txt +Dario Amodei co-founded Anthropic +``` + +Facts link the query into the graph with higher precision than raw chunk similarity. + +### Typed Graph Edge + +`typegraph_graph_edges` stores traversable graph associations with typed endpoints: + +- `entity -> entity`, for semantic relationships +- `entity -> chunk`, for evidence-bearing mentions +- `memory -> entity`, for entity-aware memory + +Supported node types: + +- `entity` +- `chunk` +- `memory` + +Entity-to-chunk traversal is just a regular typed graph edge, for example: + +```txt +entity --MENTIONED_IN--> chunk +``` + +Memory-to-entity association is also a regular typed graph edge: + +```txt +memory --ABOUT--> entity +``` + +### Entity-Chunk Mention Evidence + +`typegraph_entity_chunk_mentions` is still written during extraction, but it is raw evidence, not the online traversal table. + +It stores: + +- exact surface text +- normalized surface text +- mention type +- confidence +- chunk location + +It is useful for alias learning, debugging, provenance, and backfill. Traversal hot paths use aggregated `typegraph_graph_edges`. + +## Ontology Contract + +The graph query path consumes the same centralized ontology registry used by extraction and graph writes: + +```txt +packages/sdk/src/index-engine/ontology.ts +``` + +The registry exports the canonical predicates, aliases, inverse direction metadata, symmetric predicates, alias cues, and soft domain/range validation helpers. Query intent parsing must not maintain its own predicate vocabulary. Deterministic query patterns may emit canonical predicates or known aliases, but every predicate is normalized through the registry before it reaches graph search. + +`IS_A` is the classification predicate. `WORKS_AS` is for employment title, job, function, or role relationships. Former/current tense is represented as metadata on stored facts and edges, not separate predicates such as `WORKED_FOR` or `LED`. + +Alias cues such as `KNOWN_AS`, `AKA`, `ALIAS`, and `CALLED` are not traversable claims. They improve entity resolution and stored aliases instead of becoming graph edges or searchable facts. + +## Entity-Scoped Querying + +Queries can be scoped to TypeGraph entity IDs or deterministic external IDs: + +```ts +await typegraph.query('urgent notices', { + entityScope: { + entityIds: ['ent_pat'], + externalIds: [ + { id: 'pat@example.com', type: 'email', identityType: 'user' }, + { id: 'U123', type: 'slack_user_id', identityType: 'user' }, + ], + mode: 'filter', + }, +}) +``` + +Shape: + +```ts +interface QueryEntityScope { + entityIds?: string[] + externalIds?: ExternalId[] + mode?: 'filter' | 'boost' +} +``` + +`ExternalId` is structured: + +```ts +interface ExternalId { + id: string + type: string + identityType: 'tenant' | 'group' | 'user' | 'agent' | 'conversation' | 'entity' + encoding?: 'none' | 'sha256' + metadata?: Record +} +``` + +Multiple `entityIds` and `externalIds` use OR semantics. The scope matches information associated with any resolved entity. + +### Filter Mode + +Default mode is `filter`. 
+ +Behavior: + +- resolves external IDs to entity IDs by exact indexed lookup +- resolves entity IDs to direct chunk refs through `entity -> chunk` graph edges +- prefilters vector/keyword chunk search by `ChunkFilter.chunkRefs` +- filters direct facts/entities to direct associations with any scoped entity +- filters memory recall to memories linked to any scoped entity + +If no external IDs resolve, scoped results are empty and a warning can be returned. + +### Boost Mode + +Boost mode keeps normal results and boosts/includes direct scoped associations. + +Behavior: + +- normal vector/keyword search still runs +- scoped chunk refs get a small score boost +- direct scoped facts/entities can still be included + +### No-Graph Behavior + +TypeGraph still works without a graph configured: + +- semantic and keyword query remain chunk-only +- basic memory still works +- memory-only `entityScope` can work if the memory store supports external-ID resolution and memory-to-entity lookup + +Indexed `entityScope` requires graph-side scope resolution because chunks need graph-derived chunk refs before vector/keyword ranking. If a caller asks for indexed scoped search without a graph bridge that implements `resolveEntityScope`, TypeGraph throws `ConfigError`. + +## Default Graph Profile + +When `signals.graph` is enabled, `GraphRunner` defaults to the `fact-filtered-narrow` profile. + +```ts +type GraphSearchProfile = 'fact-filtered-narrow' + +interface GraphSearchOpts { + profile?: GraphSearchProfile +} +``` + +The profile currently resolves to: + +```ts +{ + factFilter: true, + factCandidateLimit: 80, + factFilterInputLimit: 12, + factSeedLimit: 4, + chunkSeedLimit: 80, + maxExpansionEdgesPerEntity: 25, + factChainLimit: 2, + maxPprIterations: 40, + minPprScore: 1e-8, +} +``` + +Explicit graph options override profile values: + +```ts +await typegraph.query('relationship question', { + signals: { semantic: true, graph: true }, + graph: { + profile: 'fact-filtered-narrow', + chunkSeedLimit: 120, + maxExpansionEdgesPerEntity: 10, + }, +}) +``` + +## End-to-End Query Flow + +```mermaid +flowchart TD + A["typegraph.query(text, opts)"] --> B["Planner resolves signals, identity, buckets"] + B --> C{"entityScope?"} + C -- yes --> D["knowledgeGraph.resolveEntityScope(...) for indexed or graph search"] + C -- no --> E["continue"] + D --> F["entity IDs + chunk refs"] + F --> G["prefilter indexed search by chunk refs in filter mode"] + E --> G + G --> H["semantic/keyword chunk search"] + H --> I{"graph bridge + semantic/keyword active?"} + I -- yes --> J["knowledgeGraph.searchKnowledge(...) direct facts/entities"] + I -- no --> K["skip direct knowledge"] + J --> L{"signals.graph?"} + K --> L + L -- yes --> M["GraphRunner -> searchGraphChunks(...)"] + L -- no --> N["skip traversal"] + M --> O["intent parse, fact/entity/chunk seeds"] + O --> P["build adjacency from typed graph edges"] + P --> Q["weighted PPR"] + Q --> R["read out chunk refs, fetch chunk content through vector adapter"] + R --> S["merge semantic, keyword, graph, and memory candidates"] + N --> S + S --> T["partition chunks/facts/entities/memories and build context"] +``` + +## Direct Knowledge Search Without Traversal + +When semantic or keyword search is active and a graph bridge is configured, the planner calls: + +```ts +knowledgeGraph.searchKnowledge(query, identity, { + count, + signals, + entityScope, + resolvedEntityIds, +}) +``` + +This returns direct facts and entities for the query. It does not run PPR and does not fetch chunk content. 
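+From the caller's side, that means a query without the graph signal can still surface direct graph knowledge. A usage sketch built from the public API shape shown earlier (the query text and logged fields are illustrative):
+
+```ts
+// The graph signal is not enabled here, so traversal and PPR are skipped entirely,
+// but direct knowledge search can still populate facts and entities.
+const response = await typegraph.query('Who founded Anthropic?', {
+  signals: { semantic: true, keyword: true },
+})
+
+console.log(response.results.facts)
+console.log(response.results.entities)
+```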
+ +This is how semantic-only and keyword-only queries can return: + +- chunks from vector/keyword search +- facts from fact semantic/keyword search +- entities from entity semantic/keyword search + +Traversal remains exclusive to `signals.graph`. + +## Graph Traversal Flow + +`searchGraphChunks(...)` performs graph traversal. + +### 1. Parse Graph Intent + +The bridge parses the query to identify relationship-oriented intent. If no graph intent is found, graph traversal returns empty results. + +### 2. Build Embeddings + +The bridge builds embeddings for: + +- fact search text +- chunk search text + +When they are identical, the embedding is reused. + +### 3. Retrieve Fact Candidates + +Fact candidates are searched by embedding. If a fact relevance filter is configured, it can narrow the selected fact ids. + +### 4. Build Seeds + +Seeds can include: + +- scoped entity seeds from `entityScope` +- intent anchor entities +- selected fact source and target entities +- dense chunk seeds from chunk embedding search + +### 5. Assemble Heterogeneous Adjacency + +The in-memory traversal graph includes: + +- entity-to-entity graph edges +- entity-to-chunk graph edges +- mirrored chunk-to-entity edges + +```mermaid +flowchart LR + E1["Entity: Dario Amodei"] -->|FOUNDED| E2["Entity: Anthropic"] + E2 -->|CREATED| E3["Entity: Constitutional AI"] + E1 <--> |MENTIONED_IN| C1["Chunk: doc A / 8"] + E2 <--> |MENTIONED_IN| C1 + E3 <--> |MENTIONED_IN| C2["Chunk: doc B / 4"] +``` + +### 6. Run Weighted PPR + +The bridge runs weighted Personalized PageRank with: + +- adjacency +- seed weights +- restart probability +- iteration cap +- minimum score + +### 7. Read Out Chunks + +After PPR, the bridge keeps chunk nodes. Chunk refs are sorted by graph score and fetched through `memoryStore.getChunksByRefs(...)`, which joins against the vector adapter's chunk table. + +The graph bridge does not own chunk content. + +### 8. Merge With Other Signals + +`GraphRunner` converts graph chunks into retrieval candidates. The planner merges graph, semantic, keyword, and memory candidates by stable chunk identity: + +```txt +bucketId + sourceId + chunkIndex +``` + +## Trace Fields + +`typegraph.graph.explainQuery(query, opts)` runs the same graph chunk search used by the graph signal and returns trace metadata. 
+ +Important fields: + +- `intent` +- `parser` +- `entitySeedCount` +- `factSeedCount` +- `chunkSeedCount` +- `graphNodeCount` +- `graphEdgeCount` +- `pprNonzeroCount` +- `candidatesBeforeMerge` +- `candidatesAfterMerge` +- `topGraphScores` +- `selectedFactIds` +- `selectedEntityIds` +- `selectedChunkIds` +- `finalChunkIds` +- `selectedFactTexts` +- `selectedEntityNames` +- `selectedFactChains` + +## Latency Guardrails + +Query latency depends on keeping scope and chunk operations exact and indexed: + +- external ID resolution is exact lookup only +- external IDs are resolved once per query +- resolved entity IDs are deduped +- entity status/invalidity filters should be applied before traversal or direct fact search +- chunk filtering happens before vector/keyword ranking through `ChunkFilter.chunkRefs` +- pgvector has an index on `(bucket_id, source_id, chunk_index)` +- graph traversal reads aggregated `typegraph_graph_edges`, not raw mention rows +- graph APIs never fetch chunk content for direct knowledge search + +## Backend Integration Notes + +Cloud backend implementations need to expose/update: + +- ontology registry parity with SDK canonical entity types, predicates, aliases, symmetric metadata, and validation behavior +- `POST /v1/graph/entities/:id/chunks` for `getChunksForEntity` +- `POST /v1/graph/entities/merge` for transactional entity merges +- `DELETE /v1/graph/entities/:id` with `mode: 'invalidate' | 'purge'` +- graph query execution backed by `searchGraphChunks`, not passage APIs +- direct knowledge search for semantic/keyword query paths +- entity scope resolution from structured external IDs +- typed graph edge storage and traversal over `entity`, `chunk`, and `memory` +- status-aware entity/fact/edge filters for invalidated and merged entities + +Cloud APIs should remove or replace old passage endpoints rather than proxying them to chunks. diff --git a/packages/adapters/pgvector/README.md b/packages/adapters/pgvector/README.md index cc0ba09..7e47114 100644 --- a/packages/adapters/pgvector/README.md +++ b/packages/adapters/pgvector/README.md @@ -2,7 +2,7 @@ Postgres + [pgvector](https://github.com/pgvector/pgvector) storage for TypeGraph. -This adapter provides document/chunk storage, vector search, BM25 keyword search, hybrid retrieval, jobs, events, policies, and the memory/graph backing store used by TypeGraph graph and memory features. +This adapter provides source/chunk storage, vector search, BM25 keyword search, hybrid retrieval, jobs, events, policies, and the memory/graph backing store used by TypeGraph graph and memory features. For complete setup instructions, see [Self-Hosted Initialization](https://typegraph.ai/docs/guides/self-hosted-initialization). @@ -115,7 +115,7 @@ new PgVectorAdapter({ schema: 'public', tablePrefix: 'typegraph_chunks', hashesTable: 'typegraph_hashes', - documentsTable: 'typegraph_documents', + sourcesTable: 'typegraph_sources', bucketsTable: 'typegraph_buckets', jobsTable: 'typegraph_jobs', }) @@ -130,7 +130,7 @@ Most projects only need `sql`. 
Use `schema` or table overrides when sharing a da | `PgVectorAdapter` | Main Postgres + pgvector adapter | | `PgMemoryStoreAdapter` | Persistent memory/entity/fact/passage backing store | | `PgHashStore` | Content-hash deduplication store | -| `PgDocumentStore` | Document CRUD store | +| `PgSourceStore` | Source CRUD store | | `PgJobStore` | Job tracking store | | `PgEventSink` | Event sink for query/index telemetry | | `PgPolicyStore` | Policy storage | diff --git a/packages/adapters/pgvector/__tests__/memory-store.test.ts b/packages/adapters/pgvector/__tests__/memory-store.test.ts index 78dba5a..dbad544 100644 --- a/packages/adapters/pgvector/__tests__/memory-store.test.ts +++ b/packages/adapters/pgvector/__tests__/memory-store.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it, vi } from 'vitest' -import type { SemanticFactRecord } from '@typegraph-ai/sdk' +import type { ExternalId, SemanticFactRecord, SemanticGraphEdge } from '@typegraph-ai/sdk' import { PgMemoryStoreAdapter } from '../src/memory-store.js' function makeFact(): SemanticFactRecord { @@ -41,6 +41,78 @@ function rowFromParams(params: unknown[] = []): Record { } describe('PgMemoryStoreAdapter', () => { + it('initializes the canonical graph-edge pattern without creating legacy passage tables', async () => { + const queries: string[] = [] + const sql = vi.fn(async (query: string) => { + queries.push(query) + if (query.includes('FROM pg_constraint')) return [] + return [] + }) + const store = new PgMemoryStoreAdapter({ sql, embeddingDimensions: 4 }) + + await store.initialize() + + const ddl = queries.join('\n') + expect(ddl).toContain('typegraph_graph_edges') + expect(ddl).toContain('source_type') + expect(ddl).toContain('target_type') + expect(ddl).toContain("CHECK (source_type IN ('entity', 'chunk', 'memory'))") + expect(ddl).toContain('typegraph_entity_chunk_mentions') + expect(ddl).not.toContain('typegraph_passage_nodes') + expect(ddl).not.toContain('typegraph_passage_entity_edges') + }) + + it('upserts entity-to-chunk associations as typed graph edges with chunk refs', async () => { + let capturedQuery = '' + let capturedParams: unknown[] = [] + const sql = vi.fn(async (query: string, params?: unknown[]) => { + capturedQuery = query + capturedParams = params ?? 
[] + return [] + }) + const store = new PgMemoryStoreAdapter({ sql, embeddingDimensions: 4 }) + const edge: SemanticGraphEdge = { + id: 'edge_chunk_1', + sourceType: 'entity', + sourceId: 'ent_pat', + targetType: 'chunk', + targetId: 'chunk_pat', + relation: 'MENTIONED_IN', + weight: 1.5, + properties: { mentionCount: 1 }, + scope: { tenantId: 'tenant-1' }, + targetChunkRef: { + bucketId: 'bucket-1', + sourceId: 'doc-1', + chunkIndex: 2, + embeddingModel: 'mock-embed', + chunkId: 'chunk_pat', + }, + visibility: 'tenant', + evidence: ['chunk_pat'], + temporal: { + validAt: new Date('2026-04-16T00:00:00Z'), + createdAt: new Date('2026-04-16T00:00:00Z'), + }, + } + + await store.upsertGraphEdges([edge]) + + expect(capturedQuery).toContain('INSERT INTO typegraph_graph_edges') + expect(capturedQuery).toContain('ON CONFLICT (source_type, source_id, target_type, target_id, relation)') + expect(capturedParams[1]).toBe('entity') + expect(capturedParams[2]).toBe('ent_pat') + expect(capturedParams[3]).toBe('chunk') + expect(capturedParams[4]).toBe('chunk_pat') + expect(capturedParams[14]).toBe('bucket-1') + expect(capturedParams[15]).toBe('doc-1') + expect(capturedParams[16]).toBe(2) + expect(capturedParams[17]).toBe('mock-embed') + expect(capturedParams[18]).toBe('chunk_pat') + expect(capturedParams[19]).toBe('tenant-1') + expect(capturedParams[24]).toBe('tenant') + }) + it('retries fact record upsert on duplicate deterministic fact id', async () => { const queries: string[] = [] const sql = vi.fn(async (query: string, params?: unknown[]) => { @@ -63,4 +135,32 @@ describe('PgMemoryStoreAdapter', () => { expect(result.id).toBe('fact-stable') expect(result.edgeId).toBe('edge-new') }) + + it('stores scoped deterministic entity external IDs with normalized lookup values', async () => { + let capturedQuery = '' + let capturedParams: unknown[] = [] + const sql = vi.fn(async (query: string, params?: unknown[]) => { + capturedQuery = query + capturedParams = params ?? 
[] + return [{ id: 'xid_1' }] + }) + const store = new PgMemoryStoreAdapter({ sql, embeddingDimensions: 4 }) + const externalId: ExternalId = { + id: 'Alice@Example.com', + type: 'EMAIL', + identityType: 'user', + } + + await store.upsertEntityExternalIds('ent_alice', [externalId], { tenantId: 'tenant-1' }) + + expect(capturedQuery).toContain('ON CONFLICT') + expect(capturedQuery).toContain('WHERE typegraph_entity_external_ids.entity_id = EXCLUDED.entity_id') + expect(capturedParams[1]).toBe('ent_alice') + expect(capturedParams[2]).toBe('user') + expect(capturedParams[3]).toBe('email') + expect(capturedParams[4]).toBe('Alice@Example.com') + expect(capturedParams[5]).toBe('alice@example.com') + expect(capturedParams[6]).toBe('none') + expect(capturedParams[9]).toBe('tenant-1') + }) }) diff --git a/packages/adapters/pgvector/src/adapter.ts b/packages/adapters/pgvector/src/adapter.ts index dcf12ef..58364a3 100644 --- a/packages/adapters/pgvector/src/adapter.ts +++ b/packages/adapters/pgvector/src/adapter.ts @@ -1,16 +1,16 @@ -import type { VectorStoreAdapter, SearchOpts, ScoredChunkWithDocument, UndeployResult } from '@typegraph-ai/sdk' +import type { VectorStoreAdapter, SearchOpts, ScoredChunkWithSource, UndeployResult } from '@typegraph-ai/sdk' import type { EmbeddedChunk, ChunkFilter, ScoredChunk } from '@typegraph-ai/sdk' -import type { typegraphDocument, DocumentFilter, DocumentStatus, UpsertDocumentInput } from '@typegraph-ai/sdk' +import type { typegraphSource, SourceFilter, SourceStatus, UpsertSourceInput } from '@typegraph-ai/sdk' import type { Bucket } from '@typegraph-ai/sdk' import type { Job, JobFilter, UpsertJobInput, JobStatusPatch, PaginationOpts, PaginatedResult } from '@typegraph-ai/sdk' -import { DEFAULT_BUCKET_ID } from '@typegraph-ai/sdk' +import { ConfigError, DEFAULT_BUCKET_ID } from '@typegraph-ai/sdk' import { - REGISTRY_SQL, MODEL_TABLE_SQL, HASH_TABLE_SQL, DOCUMENTS_TABLE_SQL, + REGISTRY_SQL, MODEL_TABLE_SQL, HASH_TABLE_SQL, SOURCES_TABLE_SQL, BUCKETS_TABLE_SQL, EVENTS_TABLE_SQL, POLICIES_TABLE_SQL, JOBS_TABLE_SQL, sanitizeModelKey, } from './migrations.js' import { PgHashStore } from './hash-store.js' -import { PgDocumentStore, buildDocWhere } from './document-store.js' +import { PgSourceStore, buildSourceWhere } from './source-store.js' import { PgJobStore } from './job-store.js' /** @@ -41,6 +41,14 @@ const RELAXED_KEYWORD_STOP_WORDS = new Set([ 'when', 'where', 'which', 'who', 'whom', 'why', 'with', 'within', ]) +function requireSearchOpts(opts: SearchOpts | null | undefined, method: string): SearchOpts { + if (opts == null) throw new ConfigError(`${method} opts are required.`) + if (typeof opts !== 'object' || Array.isArray(opts)) { + throw new ConfigError(`${method} opts must be an object.`) + } + return opts +} + function buildRelaxedKeywordQuery(query: string): string { const terms: string[] = [] const seen = new Set() @@ -76,7 +84,7 @@ export interface PgVectorAdapterConfig { schema?: string | undefined tablePrefix?: string | undefined hashesTable?: string | undefined - documentsTable?: string | undefined + sourcesTable?: string | undefined bucketsTable?: string | undefined jobsTable?: string | undefined } @@ -85,11 +93,11 @@ export class PgVectorAdapter implements VectorStoreAdapter { private sql: SqlExecutor private transaction?: PgVectorAdapterConfig['transaction'] readonly hashStore: PgHashStore - readonly documentStore: PgDocumentStore + readonly sourceStore: PgSourceStore readonly jobStore: PgJobStore private tablePrefix: string private hashesTable: 
string - private documentsTable: string + private sourcesTable: string private registryTable: string private bucketsTable: string private eventsTable: string @@ -108,14 +116,14 @@ export class PgVectorAdapter implements VectorStoreAdapter { const prefix = config.schema ? `"${config.schema}".` : '' this.tablePrefix = config.tablePrefix ?? `${prefix}typegraph_chunks` this.hashesTable = config.hashesTable ?? `${prefix}typegraph_hashes` - this.documentsTable = config.documentsTable ?? `${prefix}typegraph_documents` + this.sourcesTable = config.sourcesTable ?? `${prefix}typegraph_sources` this.bucketsTable = config.bucketsTable ?? `${prefix}typegraph_buckets` this.eventsTable = `${prefix}typegraph_events` this.policiesTable = `${prefix}typegraph_policies` this.jobsTable = config.jobsTable ?? `${prefix}typegraph_jobs` this.registryTable = `${this.tablePrefix}_registry` this.hashStore = new PgHashStore(this.sql, this.hashesTable) - this.documentStore = new PgDocumentStore(this.sql, this.documentsTable) + this.sourceStore = new PgSourceStore(this.sql, this.sourcesTable) this.jobStore = new PgJobStore(this.sql, this.jobsTable) } @@ -133,7 +141,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { } await this.execStatements(REGISTRY_SQL(this.registryTable)) await this.execStatements(HASH_TABLE_SQL(this.hashesTable)) - await this.execStatements(DOCUMENTS_TABLE_SQL(this.documentsTable)) + await this.execStatements(SOURCES_TABLE_SQL(this.sourcesTable)) await this.execStatements(BUCKETS_TABLE_SQL(this.bucketsTable)) await this.execStatements(EVENTS_TABLE_SQL(this.eventsTable)) await this.execStatements(POLICIES_TABLE_SQL(this.policiesTable)) @@ -166,7 +174,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { this.registryTable, this.hashesTable, `${this.hashesTable}_run_times`, - this.documentsTable, + this.sourcesTable, this.bucketsTable, this.jobsTable, ] @@ -198,7 +206,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { await this.sql(`DROP TABLE IF EXISTS ${table}`) } await this.sql(`DROP TABLE IF EXISTS ${this.bucketsTable}`) - await this.sql(`DROP TABLE IF EXISTS ${this.documentsTable}`) + await this.sql(`DROP TABLE IF EXISTS ${this.sourcesTable}`) await this.sql(`DROP TABLE IF EXISTS ${this.hashesTable}_run_times`) await this.sql(`DROP TABLE IF EXISTS ${this.hashesTable}`) await this.sql(`DROP TABLE IF EXISTS ${this.registryTable}`) @@ -245,7 +253,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { throw new Error(`No table registered for model "${model}". 
Call ensureModel() first.`) } - async upsertDocument(model: string, chunks: EmbeddedChunk[]): Promise { + async upsertSourceChunks(model: string, chunks: EmbeddedChunk[]): Promise { if (chunks.length === 0) return const table = await this.getTable(model) @@ -271,13 +279,13 @@ export class PgVectorAdapter implements VectorStoreAdapter { private buildUpsertParams(chunks: EmbeddedChunk[]): unknown[][] { const chunkIds: string[] = [] - const sourceIds: string[] = [] + const bucketIds: string[] = [] const tenantIds: (string | null)[] = [] const groupIds: (string | null)[] = [] const userIds: (string | null)[] = [] const agentIds: (string | null)[] = [] const conversationIds: (string | null)[] = [] - const documentIds: string[] = [] + const sourceIds: string[] = [] const idempotencyKeys: string[] = [] const contents: string[] = [] const embeddings: string[] = [] @@ -290,13 +298,13 @@ export class PgVectorAdapter implements VectorStoreAdapter { for (const chunk of chunks) { chunkIds.push(chunk.id) - sourceIds.push(chunk.bucketId) + bucketIds.push(chunk.bucketId) tenantIds.push(chunk.tenantId ?? null) groupIds.push(chunk.groupId ?? null) userIds.push(chunk.userId ?? null) agentIds.push(chunk.agentId ?? null) conversationIds.push(chunk.conversationId ?? null) - documentIds.push(chunk.documentId) + sourceIds.push(chunk.sourceId) idempotencyKeys.push(chunk.idempotencyKey) contents.push(chunk.content) embeddings.push(`[${chunk.embedding.join(',')}]`) @@ -309,8 +317,8 @@ export class PgVectorAdapter implements VectorStoreAdapter { } return [ - chunkIds, sourceIds, tenantIds, groupIds, userIds, agentIds, conversationIds, - documentIds, idempotencyKeys, contents, embeddings, + chunkIds, bucketIds, tenantIds, groupIds, userIds, agentIds, conversationIds, + sourceIds, idempotencyKeys, contents, embeddings, embeddingModels, chunkIndices, totalChunks, visibilities, metadatas, indexedAts, ] } @@ -319,7 +327,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { await this.sql( `INSERT INTO ${table} (id, bucket_id, tenant_id, group_id, user_id, agent_id, conversation_id, - document_id, idempotency_key, content, embedding, + source_id, idempotency_key, content, embedding, embedding_model, chunk_index, total_chunks, visibility, metadata, indexed_at) SELECT * FROM unnest( $1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], @@ -328,7 +336,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { ) ON CONFLICT (idempotency_key, chunk_index, bucket_id) DO UPDATE SET id = EXCLUDED.id, - document_id = EXCLUDED.document_id, + source_id = EXCLUDED.source_id, content = EXCLUDED.content, embedding = EXCLUDED.embedding, embedding_model = EXCLUDED.embedding_model, @@ -340,43 +348,46 @@ export class PgVectorAdapter implements VectorStoreAdapter { ) } - async delete(model: string, filter: ChunkFilter): Promise { + async delete(model: string, filter: ChunkFilter | null): Promise { const table = await this.getTable(model) + const normalizedFilter = filter ?? 
{} const hasExplicitFilter = - filter.bucketId != null || - (filter.bucketIds != null && filter.bucketIds.length > 0) || - filter.tenantId != null || - filter.groupId != null || - filter.userId != null || - filter.agentId != null || - filter.conversationId != null || - filter.documentId != null || - filter.idempotencyKey != null - if (!hasExplicitFilter) throw new Error('delete() requires at least one filter field') - const { where, params } = buildWhere(filter) + normalizedFilter.bucketId != null || + (normalizedFilter.bucketIds != null && normalizedFilter.bucketIds.length > 0) || + normalizedFilter.chunkRefs != null || + normalizedFilter.tenantId != null || + normalizedFilter.groupId != null || + normalizedFilter.userId != null || + normalizedFilter.agentId != null || + normalizedFilter.conversationId != null || + normalizedFilter.sourceId != null || + normalizedFilter.idempotencyKey != null + if (!hasExplicitFilter) throw new ConfigError('delete() requires at least one filter field.') + const { where, params } = buildWhere(normalizedFilter) await this.sql(`DELETE FROM ${table} WHERE ${where}`, params) } - async search(model: string, embedding: number[], opts: SearchOpts): Promise { + async search(model: string, embedding: number[], opts: SearchOpts | null): Promise { + const normalizedOpts = requireSearchOpts(opts, 'search') const table = await this.getTable(model) const vectorStr = `[${embedding.join(',')}]` - const { where, params } = buildWhere(opts.filter) + const { where, params } = buildWhere(normalizedOpts.filter) // Add temporal filtering if requested const temporalConditions: string[] = where ? [where] : [] - if (opts.temporalAt) { - params.push(opts.temporalAt.toISOString()) + if (normalizedOpts.temporalAt) { + params.push(normalizedOpts.temporalAt.toISOString()) temporalConditions.push(`indexed_at <= $${params.length}`) } const filterClause = temporalConditions.length > 0 ? `WHERE ${temporalConditions.join(' AND ')}` : '' - const count = opts.count + const count = normalizedOpts.count const runQuery = async (sql: SqlExecutor, inTransaction: boolean): Promise => { - if (inTransaction && opts.iterativeScan !== false) { + if (inTransaction && normalizedOpts.iterativeScan !== false) { await sql(`SET LOCAL hnsw.iterative_scan = relaxed_order;`) } const paramOffset = params.length const rows = await sql( - `SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + `SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, 1 - (embedding <=> $${paramOffset + 1}::vector) AS similarity FROM ${table} @@ -398,21 +409,22 @@ export class PgVectorAdapter implements VectorStoreAdapter { model: string, embedding: number[], query: string, - opts: SearchOpts + opts: SearchOpts | null ): Promise { + const normalizedOpts = requireSearchOpts(opts, 'hybridSearch') const table = await this.getTable(model) const vectorStr = `[${embedding.join(',')}]` - const count = opts.count - const useSemantic = opts.signals?.semantic !== false - const useKeyword = opts.signals?.keyword ?? true + const count = normalizedOpts.count + const useSemantic = normalizedOpts.signals?.semantic !== false + const useKeyword = normalizedOpts.signals?.keyword ?? 
true if (!useSemantic && !useKeyword) return [] const relaxedQuery = buildRelaxedKeywordQuery(query) - const { where: filterWhere, params: filterParams } = buildWhere(opts.filter) + const { where: filterWhere, params: filterParams } = buildWhere(normalizedOpts.filter) // Add temporal filtering — appended to filterParams so it gets reindexed with everything else - if (opts.temporalAt) { - filterParams.push(opts.temporalAt.toISOString()) + if (normalizedOpts.temporalAt) { + filterParams.push(normalizedOpts.temporalAt.toISOString()) } - const temporalCond = opts.temporalAt ? ` AND indexed_at <= $${filterParams.length}` : '' + const temporalCond = normalizedOpts.temporalAt ? ` AND indexed_at <= $${filterParams.length}` : '' const filterClause = (filterWhere ? `AND ${filterWhere}` : '') + temporalCond // Offset param indices past filter params: $1=vectorStr, $2=strict query, @@ -424,7 +436,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { ) const runQuery = async (sql: SqlExecutor, inTransaction: boolean): Promise => { - if (inTransaction && opts.iterativeScan !== false) { + if (inTransaction && normalizedOpts.iterativeScan !== false) { await sql(`SET LOCAL hnsw.iterative_scan = relaxed_order;`) } @@ -467,12 +479,12 @@ export class PgVectorAdapter implements VectorStoreAdapter { ),` : ''} combined AS ( ${[ - useSemantic ? `SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + useSemantic ? `SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding, embedding_model, chunk_index, total_chunks, metadata, indexed_at, similarity, NULL::double precision AS kw_score, vrank, NULL::bigint AS krank FROM vector_ranked` : '', - useKeyword ? `SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + useKeyword ? `SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding, embedding_model, chunk_index, total_chunks, metadata, indexed_at, NULL::double precision AS similarity, kw_score, NULL::bigint AS vrank, krank @@ -480,19 +492,19 @@ export class PgVectorAdapter implements VectorStoreAdapter { ].filter(Boolean).join('\n UNION ALL\n ')} ), scored AS ( - SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, similarity, kw_score, (COALESCE(1.0::float8 / (60 + vrank), 0) + COALESCE(1.0::float8 / (60 + krank), 0))::double precision AS rrf_score FROM combined ) - SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, MAX(similarity) AS similarity, MAX(kw_score) AS keyword_score, SUM(rrf_score)::double precision AS rrf_score FROM scored - GROUP BY id, bucket_id, tenant_id, document_id, idempotency_key, content, + GROUP BY id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at ORDER BY SUM(rrf_score)::double precision DESC LIMIT $3`, @@ -512,7 +524,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { return runQuery(this.sql, false) } - async countChunks(model: string, filter: ChunkFilter): Promise { + async countChunks(model: string, filter: ChunkFilter | null): Promise { const table = await this.getTable(model) const { where, params } = buildWhere(filter) const filterClause = where ? 
`WHERE ${where}` : '' @@ -523,22 +535,22 @@ export class PgVectorAdapter implements VectorStoreAdapter { return (rows[0]?.count as number) ?? 0 } - // --- Document record methods --- + // --- Source record methods --- - async upsertDocumentRecord(input: UpsertDocumentInput): Promise { - return this.documentStore.upsert(input) + async upsertSourceRecord(input: UpsertSourceInput): Promise { + return this.sourceStore.upsert(input) } - async getDocument(id: string): Promise { - return this.documentStore.get(id) + async getSource(id: string): Promise { + return this.sourceStore.get(id) } - async listDocuments(filter: DocumentFilter, pagination?: import('@typegraph-ai/sdk').PaginationOpts): Promise> { - return this.documentStore.list(filter, pagination) + async listSources(filter?: SourceFilter | null, pagination?: import('@typegraph-ai/sdk').PaginationOpts | null): Promise> { + return this.sourceStore.list(filter, pagination) } - async deleteDocuments(filter: DocumentFilter): Promise { - const { count, ids } = await this.documentStore.delete(filter) + async deleteSources(filter: SourceFilter | null): Promise { + const { count, ids } = await this.sourceStore.delete(filter) if (ids.length === 0) return 0 // Cascade: delete chunks from all registered model tables @@ -547,11 +559,11 @@ export class PgVectorAdapter implements VectorStoreAdapter { // Collect idempotency keys before deleting chunks (for hash cleanup) const ikeyRows = await this.sql( `SELECT DISTINCT idempotency_key, bucket_id, tenant_id FROM ${table} - WHERE document_id = ANY($1::text[])`, + WHERE source_id = ANY($1::text[])`, [ids] ) const chunkRows = await this.sql( - `DELETE FROM ${table} WHERE document_id = ANY($1::text[]) RETURNING id`, + `DELETE FROM ${table} WHERE source_id = ANY($1::text[]) RETURNING id`, [ids] ) totalChunksDeleted += chunkRows.length @@ -568,25 +580,25 @@ export class PgVectorAdapter implements VectorStoreAdapter { return count } - async updateDocument(id: string, input: Partial>): Promise { - const doc = await this.documentStore.update(id, input) - if (!doc) throw new Error(`Document not found: ${id}`) - // Cascade visibility changes onto all chunk rows for this document. Chunks + async updateSource(id: string, input: Partial>): Promise { + const source = await this.sourceStore.update(id, input) + if (!source) throw new Error(`Source not found: ${id}`) + // Cascade visibility changes onto all chunk rows for this source. Chunks // are the security-sensitive target — a stale chunk visibility would let - // a tightened document keep leaking through unscoped queries. + // a tightened source keep leaking through unscoped queries. 
if (input.visibility !== undefined) { for (const table of this.modelTables.values()) { await this.sql( - `UPDATE ${table} SET visibility = $1 WHERE document_id = $2`, + `UPDATE ${table} SET visibility = $1 WHERE source_id = $2`, [input.visibility, id] ) } } - return doc + return source } - async updateDocumentStatus(id: string, status: DocumentStatus, chunkCount?: number): Promise { - return this.documentStore.updateStatus(id, status, chunkCount) + async updateSourceStatus(id: string, status: SourceStatus, chunkCount?: number): Promise { + return this.sourceStore.updateStatus(id, status, chunkCount) } // --- Job record methods --- @@ -599,7 +611,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { return this.jobStore.get(id) } - async listJobs(filter: JobFilter, pagination?: PaginationOpts): Promise> { + async listJobs(filter?: JobFilter | null, pagination?: PaginationOpts | null): Promise> { return this.jobStore.list(filter, pagination) } @@ -611,46 +623,47 @@ export class PgVectorAdapter implements VectorStoreAdapter { return this.jobStore.incrementProgress(id, processedDelta) } - // --- Search with document JOIN --- + // --- Search with source JOIN --- - async searchWithDocuments( + async searchWithSources( model: string, embedding: number[], query: string, - opts: SearchOpts & { documentFilter?: DocumentFilter | undefined } - ): Promise { + opts: (SearchOpts & { sourceFilter?: SourceFilter | undefined }) | null + ): Promise { + const normalizedOpts = requireSearchOpts(opts, 'searchWithSources') as SearchOpts & { sourceFilter?: SourceFilter | undefined } const table = await this.getTable(model) const vectorStr = `[${embedding.join(',')}]` - const count = opts.count - const useSemantic = opts.signals?.semantic !== false - const useKeyword = opts.signals?.keyword ?? true + const count = normalizedOpts.count + const useSemantic = normalizedOpts.signals?.semantic !== false + const useKeyword = normalizedOpts.signals?.keyword ?? true if (!useSemantic && !useKeyword) return [] const relaxedQuery = buildRelaxedKeywordQuery(query) - const { where: chunkFilterWhere, params: chunkFilterParams } = buildWhere(opts.filter) + const { where: chunkFilterWhere, params: chunkFilterParams } = buildWhere(normalizedOpts.filter) // Add temporal filtering - if (opts.temporalAt) { - chunkFilterParams.push(opts.temporalAt.toISOString()) + if (normalizedOpts.temporalAt) { + chunkFilterParams.push(normalizedOpts.temporalAt.toISOString()) } - const temporalCond = opts.temporalAt ? ` AND c.indexed_at <= $${chunkFilterParams.length}` : '' + const temporalCond = normalizedOpts.temporalAt ? ` AND c.indexed_at <= $${chunkFilterParams.length}` : '' const chunkFilterClause = (chunkFilterWhere ? `AND ${chunkFilterWhere}` : '') + temporalCond - const { where: docFilterWhere, params: docFilterParams } = buildDocWhere(opts.documentFilter ?? {}) + const { where: sourceFilterWhere, params: sourceFilterParams } = buildSourceWhere(normalizedOpts.sourceFilter ?? {}) // Base params: $1=vector, $2=strict query, $3=count, $4=relaxed query - // Then chunk filter params, then doc filter params + // Then chunk filter params, then source filter params const baseOffset = 4 const reindexedChunkFilter = chunkFilterClause.replace( /\$(\d+)/g, (_, n) => `$${parseInt(n) + baseOffset}` ) - const docParamOffset = baseOffset + chunkFilterParams.length - const docFilterClause = docFilterWhere - ? 
`AND ${docFilterWhere.replace(/\$(\d+)/g, (_, n) => `$${parseInt(n) + docParamOffset}`)}` + const sourceParamOffset = baseOffset + chunkFilterParams.length + const sourceFilterClause = sourceFilterWhere + ? `AND ${sourceFilterWhere.replace(/\$(\d+)/g, (_, n) => `$${parseInt(n) + sourceParamOffset}`)}` : '' - const allParams = [vectorStr, query, count, relaxedQuery, ...chunkFilterParams, ...docFilterParams] + const allParams = [vectorStr, query, count, relaxedQuery, ...chunkFilterParams, ...sourceFilterParams] - const runQuery = async (sql: SqlExecutor, inTransaction: boolean): Promise => { - if (inTransaction && opts.iterativeScan !== false) { + const runQuery = async (sql: SqlExecutor, inTransaction: boolean): Promise => { + if (inTransaction && normalizedOpts.iterativeScan !== false) { await sql(`SET LOCAL hnsw.iterative_scan = relaxed_order;`) } @@ -672,8 +685,8 @@ export class PgVectorAdapter implements VectorStoreAdapter { ROW_NUMBER() OVER (ORDER BY c.embedding <=> query_embedding) AS vrank FROM ${table} c CROSS JOIN __tg_base_params - JOIN ${this.documentsTable} d ON c.document_id = d.id - WHERE TRUE ${reindexedChunkFilter} ${docFilterClause} + JOIN ${this.sourcesTable} s ON c.source_id = s.id + WHERE TRUE ${reindexedChunkFilter} ${sourceFilterClause} ORDER BY c.embedding <=> query_embedding LIMIT ${count * 3} ),` : ''} @@ -689,19 +702,19 @@ export class PgVectorAdapter implements VectorStoreAdapter { ) DESC) AS krank FROM ${table} c CROSS JOIN tsq - JOIN ${this.documentsTable} d ON c.document_id = d.id - WHERE (c.search_vector @@ tsq.strict_q OR c.search_vector @@ tsq.relaxed_q) ${reindexedChunkFilter} ${docFilterClause} + JOIN ${this.sourcesTable} s ON c.source_id = s.id + WHERE (c.search_vector @@ tsq.strict_q OR c.search_vector @@ tsq.relaxed_q) ${reindexedChunkFilter} ${sourceFilterClause} ORDER BY kw_score DESC LIMIT ${count * 3} ),` : ''} combined AS ( ${[ - useSemantic ? `SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + useSemantic ? `SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, similarity, NULL::double precision AS kw_score, vrank, NULL::bigint AS krank FROM vector_ranked` : '', - useKeyword ? `SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + useKeyword ? 
`SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, NULL::double precision AS similarity, kw_score, NULL::bigint AS vrank, krank @@ -714,28 +727,30 @@ export class PgVectorAdapter implements VectorStoreAdapter { FROM combined ), final_chunks AS ( - SELECT id, bucket_id, tenant_id, document_id, idempotency_key, content, + SELECT id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at, MAX(similarity) AS similarity, MAX(kw_score) AS keyword_score, SUM(rrf_score)::double precision AS rrf_score FROM scored - GROUP BY id, bucket_id, tenant_id, document_id, idempotency_key, content, + GROUP BY id, bucket_id, tenant_id, source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at ORDER BY SUM(rrf_score)::double precision DESC LIMIT $3 ) SELECT fc.*, - d.id AS doc_id, d.title AS doc_title, d.url AS doc_url, - d.content_hash AS doc_content_hash, d.chunk_count AS doc_chunk_count, - d.status AS doc_status, d.visibility AS doc_visibility, - d.group_id AS doc_group_id, d.user_id AS doc_user_id, - d.agent_id AS doc_agent_id, d.conversation_id AS doc_conversation_id, - d.graph_extracted AS doc_graph_extracted, - d.indexed_at AS doc_indexed_at, d.created_at AS doc_created_at, - d.updated_at AS doc_updated_at, d.metadata AS doc_metadata + s.id AS source_id, s.title AS source_title, s.url AS source_url, + s.content_hash AS source_content_hash, s.chunk_count AS source_chunk_count, + s.status AS source_status, s.visibility AS source_visibility, + s.bucket_id AS source_bucket_id, s.tenant_id AS source_tenant_id, + s.group_id AS source_group_id, s.user_id AS source_user_id, + s.agent_id AS source_agent_id, s.conversation_id AS source_conversation_id, + s.graph_extracted AS source_graph_extracted, + s.indexed_at AS source_indexed_at, s.created_at AS source_created_at, + s.updated_at AS source_updated_at, s.metadata AS source_metadata, + s.subject AS source_subject FROM final_chunks fc - JOIN ${this.documentsTable} d ON fc.document_id = d.id + JOIN ${this.sourcesTable} s ON fc.source_id = s.id ORDER BY fc.rrf_score DESC`, allParams ) @@ -746,12 +761,12 @@ export class PgVectorAdapter implements VectorStoreAdapter { keyword: (row.keyword_score as number) ?? 
undefined, rrf: Number(row.rrf_score), }), - document: mapRowToDocument(row), + source: mapRowToSource(row), })) } if (this.transaction) { - return this.transaction((sql) => runQuery(sql, true)) as Promise + return this.transaction((sql) => runQuery(sql, true)) as Promise } return runQuery(this.sql, false) } @@ -760,16 +775,16 @@ export class PgVectorAdapter implements VectorStoreAdapter { async getChunksByRange( model: string, - documentId: string, + sourceId: string, fromIndex: number, toIndex: number ): Promise { const table = await this.getTable(model) const rows = await this.sql( `SELECT * FROM ${table} - WHERE document_id = $1 AND chunk_index >= $2 AND chunk_index <= $3 + WHERE source_id = $1 AND chunk_index >= $2 AND chunk_index <= $3 ORDER BY chunk_index`, - [documentId, fromIndex, toIndex] + [sourceId, fromIndex, toIndex] ) return rows.map(row => mapRowToScoredChunk(row, {})) } @@ -847,8 +862,8 @@ export class PgVectorAdapter implements VectorStoreAdapter { if (id === DEFAULT_BUCKET_ID) { throw new Error('Cannot delete the default bucket.') } - // Cascade: delete all documents (which cascades to chunks + hashes) - await this.deleteDocuments({ bucketId: id }) + // Cascade: delete all sources (which cascades to chunks + hashes) + await this.deleteSources({ bucketId: id }) // Clean up any remaining hash entries for this bucket (all tenants) await this.hashStore.deleteAllByBucket(id) // Delete the bucket record @@ -860,7 +875,7 @@ export class PgVectorAdapter implements VectorStoreAdapter { } } -function buildWhere(filter?: ChunkFilter): { where: string; params: unknown[] } { +function buildWhere(filter?: ChunkFilter | null): { where: string; params: unknown[] } { const conditions: string[] = [] const params: unknown[] = [] @@ -878,6 +893,23 @@ function buildWhere(filter?: ChunkFilter): { where: string; params: unknown[] } params.push(filter.bucketIds) conditions.push(`bucket_id = ANY($${params.length}::text[])`) } + if (filter?.chunkRefs != null) { + if (filter.chunkRefs.length === 0) { + conditions.push('FALSE') + } else { + params.push(filter.chunkRefs.map(ref => ref.bucketId)) + const bucketParam = `$${params.length}` + params.push(filter.chunkRefs.map(ref => ref.sourceId)) + const sourceParam = `$${params.length}` + params.push(filter.chunkRefs.map(ref => ref.chunkIndex)) + const chunkParam = `$${params.length}` + conditions.push( + `(bucket_id, source_id, chunk_index) IN (` + + `SELECT * FROM unnest(${bucketParam}::text[], ${sourceParam}::text[], ${chunkParam}::int[])` + + `)` + ) + } + } if (filter?.tenantId != null) { params.push(filter.tenantId) tenantParam = `$${params.length}` @@ -903,9 +935,9 @@ function buildWhere(filter?: ChunkFilter): { where: string; params: unknown[] } convParam = `$${params.length}` conditions.push(`conversation_id = ${convParam}`) } - if (filter?.documentId != null) { - params.push(filter.documentId) - conditions.push(`document_id = $${params.length}`) + if (filter?.sourceId != null) { + params.push(filter.sourceId) + conditions.push(`source_id = $${params.length}`) } if (filter?.idempotencyKey != null) { params.push(filter.idempotencyKey) @@ -939,13 +971,13 @@ function mapRowToScoredChunk( return { id: row.id as string, idempotencyKey: row.idempotency_key as string, - bucketId: row.bucket_id as string, - tenantId: (row.tenant_id as string) ?? undefined, + bucketId: row.source_bucket_id as string, + tenantId: (row.source_tenant_id as string) ?? undefined, groupId: (row.group_id as string) ?? undefined, userId: (row.user_id as string) ?? 
undefined, agentId: (row.agent_id as string) ?? undefined, conversationId: (row.conversation_id as string) ?? undefined, - documentId: row.document_id as string, + sourceId: row.source_id as string, content: row.content as string, embedding: [], // Don't return the full vector - too large and unnecessary embeddingModel: row.embedding_model as string, @@ -982,25 +1014,26 @@ function mapRowToBucket(row: Record): Bucket { } } -function mapRowToDocument(row: Record): typegraphDocument { +function mapRowToSource(row: Record): typegraphSource { return { - id: row.doc_id as string, + id: row.source_id as string, bucketId: row.bucket_id as string, tenantId: (row.tenant_id as string) ?? undefined, - groupId: (row.doc_group_id as string) ?? undefined, - userId: (row.doc_user_id as string) ?? undefined, - agentId: (row.doc_agent_id as string) ?? undefined, - conversationId: (row.doc_conversation_id as string) ?? undefined, - title: row.doc_title as string, - url: (row.doc_url as string) ?? undefined, - contentHash: row.doc_content_hash as string, - chunkCount: row.doc_chunk_count as number, - status: row.doc_status as typegraphDocument['status'], - visibility: (row.doc_visibility as typegraphDocument['visibility']) ?? undefined, - graphExtracted: (row.doc_graph_extracted as boolean) ?? false, - indexedAt: new Date(row.doc_indexed_at as string), - createdAt: new Date(row.doc_created_at as string), - updatedAt: new Date(row.doc_updated_at as string), - metadata: (typeof row.doc_metadata === 'string' ? JSON.parse(row.doc_metadata) : row.doc_metadata ?? {}) as Record, + groupId: (row.source_group_id as string) ?? undefined, + userId: (row.source_user_id as string) ?? undefined, + agentId: (row.source_agent_id as string) ?? undefined, + conversationId: (row.source_conversation_id as string) ?? undefined, + title: row.source_title as string, + url: (row.source_url as string) ?? undefined, + contentHash: row.source_content_hash as string, + chunkCount: row.source_chunk_count as number, + status: row.source_status as typegraphSource['status'], + visibility: (row.source_visibility as typegraphSource['visibility']) ?? undefined, + graphExtracted: (row.source_graph_extracted as boolean) ?? false, + indexedAt: new Date(row.source_indexed_at as string), + createdAt: new Date(row.source_created_at as string), + updatedAt: new Date(row.source_updated_at as string), + metadata: (typeof row.source_metadata === 'string' ? JSON.parse(row.source_metadata) : row.source_metadata ?? {}) as Record, + subject: (typeof row.source_subject === 'string' ? JSON.parse(row.source_subject) : row.source_subject ?? undefined) as typegraphSource['subject'], } } diff --git a/packages/adapters/pgvector/src/event-sink.ts b/packages/adapters/pgvector/src/event-sink.ts index 8c2485a..8b9e73a 100644 --- a/packages/adapters/pgvector/src/event-sink.ts +++ b/packages/adapters/pgvector/src/event-sink.ts @@ -37,7 +37,7 @@ export class PgEventSink implements typegraphEventSink { this.flush().catch((err) => console.error('[typegraph] Event flush failed:', err instanceof Error ? err.message : err)) }, this.flushIntervalMs) - // Start unref'd — an idle sink should never block process exit. We only + // Start unref'd — an idle sink should never block process exit. We only // ref() the timer while the buffer is non-empty (see setTimerRef below), // so Node will wait for pending writes to drain but not for nothing.
this.unrefTimer() diff --git a/packages/adapters/pgvector/src/index.ts b/packages/adapters/pgvector/src/index.ts index dc1367e..064d244 100644 --- a/packages/adapters/pgvector/src/index.ts +++ b/packages/adapters/pgvector/src/index.ts @@ -1,9 +1,9 @@ export { PgVectorAdapter } from './adapter.js' export type { PgVectorAdapterConfig, SqlExecutor } from './adapter.js' export { PgHashStore } from './hash-store.js' -export { PgDocumentStore } from './document-store.js' +export { PgSourceStore } from './source-store.js' export { PgJobStore } from './job-store.js' -export { REGISTRY_SQL, MODEL_TABLE_SQL, HASH_TABLE_SQL, DOCUMENTS_TABLE_SQL, EVENTS_TABLE_SQL, POLICIES_TABLE_SQL, JOBS_TABLE_SQL, sanitizeModelKey, safeIdx } from './migrations.js' +export { REGISTRY_SQL, MODEL_TABLE_SQL, HASH_TABLE_SQL, SOURCES_TABLE_SQL, EVENTS_TABLE_SQL, POLICIES_TABLE_SQL, JOBS_TABLE_SQL, sanitizeModelKey, safeIdx } from './migrations.js' export { PgEventSink } from './event-sink.js' export type { PgEventSinkConfig } from './event-sink.js' export { PgPolicyStore } from './policy-store.js' diff --git a/packages/adapters/pgvector/src/job-store.ts b/packages/adapters/pgvector/src/job-store.ts index 833dede..6134282 100644 --- a/packages/adapters/pgvector/src/job-store.ts +++ b/packages/adapters/pgvector/src/job-store.ts @@ -62,18 +62,18 @@ export class PgJobStore { return mapJobRow(rows[0]!) } - async list(filter: JobFilter, pagination?: PaginationOpts): Promise> { + async list(filter?: JobFilter | null, pagination?: PaginationOpts | null): Promise> { const conditions: string[] = [] const params: unknown[] = [] - if (filter.bucketId != null) { + if (filter?.bucketId != null) { params.push(filter.bucketId) conditions.push(`bucket_id = $${params.length}`) } - if (filter.status != null) { + if (filter?.status != null) { params.push(filter.status) conditions.push(`status = $${params.length}`) } - if (filter.type != null) { + if (filter?.type != null) { params.push(filter.type) conditions.push(`type = $${params.length}`) } diff --git a/packages/adapters/pgvector/src/memory-store.ts b/packages/adapters/pgvector/src/memory-store.ts index 8344e05..4a00813 100644 --- a/packages/adapters/pgvector/src/memory-store.ts +++ b/packages/adapters/pgvector/src/memory-store.ts @@ -11,15 +11,22 @@ import type { MemoryFilter, MemorySearchOpts, MemoryRecord, - PassageBackfillChunk, - PassageMentionBackfillRow, + ExternalId, + ChunkBackfillRecord, + ChunkMentionBackfillRow, SemanticEntity, SemanticEntityMention, SemanticEdge, + SemanticGraphEdge, + SemanticEntityChunkEdge, + SemanticChunkRecord, SemanticFactRecord, - SemanticPassageEntityEdge, - SemanticPassageNode, + ChunkRef, typegraphIdentity, + MergeGraphEntitiesInput, + MergeGraphEntitiesResult, + DeleteGraphEntityOpts, + DeleteGraphEntityResult, } from '@typegraph-ai/sdk' import { generateId } from '@typegraph-ai/sdk' @@ -45,9 +52,8 @@ export interface PgMemoryAdapterConfig { memoriesTable?: string | undefined entitiesTable?: string | undefined edgesTable?: string | undefined + entityExternalIdsTable?: string | undefined chunkMentionsTable?: string | undefined - passageNodesTable?: string | undefined - passageEntityEdgesTable?: string | undefined factRecordsTable?: string | undefined /** Embedding vector dimensions (e.g. 1536 for text-embedding-3-small). Used for HNSW index creation. 
*/ embeddingDimensions?: number | undefined @@ -150,6 +156,9 @@ const ENTITIES_DDL = (t: string, dims?: number) => { entity_type TEXT NOT NULL, aliases TEXT[] DEFAULT '{}', properties JSONB NOT NULL DEFAULT '{}', + status TEXT NOT NULL DEFAULT 'active' CHECK (status IN ('active', 'merged', 'invalidated')), + merged_into_entity_id TEXT, + deleted_at TIMESTAMPTZ, embedding VECTOR${dims ? `(${dims})` : ''}, description_embedding VECTOR${dims ? `(${dims})` : ''}, scope JSONB NOT NULL DEFAULT '{}', @@ -168,6 +177,8 @@ const ENTITIES_DDL = (t: string, dims?: number) => { CREATE INDEX IF NOT EXISTS ${idx('name_idx')} ON ${t} (name); CREATE INDEX IF NOT EXISTS ${idx('type_idx')} ON ${t} (entity_type); + CREATE INDEX IF NOT EXISTS ${idx('status_idx')} ON ${t} (status); + CREATE INDEX IF NOT EXISTS ${idx('merged_into_idx')} ON ${t} (merged_into_entity_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} ON ${t} (tenant_id, user_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} ON ${t} (tenant_id, group_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} ON ${t} (tenant_id, agent_id); @@ -180,18 +191,72 @@ const ENTITIES_DDL = (t: string, dims?: number) => { ` } +const ENTITY_EXTERNAL_IDS_DDL = (t: string, entitiesTable: string) => { + const i = idxPrefix(t) + const idx = (suffix: string) => safeIdx(i, suffix) + return ` + CREATE TABLE IF NOT EXISTS ${t} ( + id TEXT PRIMARY KEY, + entity_id TEXT NOT NULL REFERENCES ${entitiesTable}(id) ON DELETE CASCADE, + type TEXT NOT NULL, + id_value TEXT NOT NULL, + normalized_value TEXT NOT NULL, + encoding TEXT NOT NULL DEFAULT 'none' CHECK (encoding IN ('none', 'sha256')), + metadata JSONB NOT NULL DEFAULT '{}', + scope JSONB NOT NULL DEFAULT '{}', + tenant_id TEXT, + group_id TEXT, + user_id TEXT, + agent_id TEXT, + conversation_id TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); + + CREATE INDEX IF NOT EXISTS ${idx('entity_idx')} ON ${t} (entity_id); + CREATE INDEX IF NOT EXISTS ${idx('lookup_idx')} ON ${t} (type, normalized_value, encoding); + CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} ON ${t} (tenant_id, user_id); + CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} ON ${t} (tenant_id, group_id); + CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} ON ${t} (tenant_id, agent_id); + CREATE INDEX IF NOT EXISTS ${idx('tenant_conversation_idx')} ON ${t} (tenant_id, conversation_id); + CREATE UNIQUE INDEX IF NOT EXISTS ${idx('scoped_external_id_uniq_idx')} + ON ${t} ( + type, + normalized_value, + encoding, + COALESCE(tenant_id, ''), + COALESCE(group_id, ''), + COALESCE(user_id, ''), + COALESCE(agent_id, ''), + COALESCE(conversation_id, '') + ); +` +} + const EDGES_DDL = (t: string) => { const i = idxPrefix(t) const idx = (suffix: string) => safeIdx(i, suffix) return ` CREATE TABLE IF NOT EXISTS ${t} ( id TEXT PRIMARY KEY, - source_entity_id TEXT NOT NULL, - target_entity_id TEXT NOT NULL, + source_type TEXT NOT NULL CHECK (source_type IN ('entity', 'chunk', 'memory')), + source_id TEXT NOT NULL, + target_type TEXT NOT NULL CHECK (target_type IN ('entity', 'chunk', 'memory')), + target_id TEXT NOT NULL, relation TEXT NOT NULL, weight REAL NOT NULL DEFAULT 1.0, properties JSONB NOT NULL DEFAULT '{}', scope JSONB NOT NULL DEFAULT '{}', + from_bucket_id TEXT, + from_source_id TEXT, + from_chunk_index INTEGER, + from_embedding_model TEXT, + from_chunk_id TEXT, + to_bucket_id TEXT, + to_source_id TEXT, + to_chunk_index INTEGER, + to_embedding_model TEXT, + to_chunk_id 
TEXT, -- Identity columns tenant_id TEXT, group_id TEXT, @@ -204,12 +269,19 @@ const EDGES_DDL = (t: string) => { invalid_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - CONSTRAINT ${safeIdx(i, 'rel_uniq')} UNIQUE (source_entity_id, target_entity_id, relation) + CONSTRAINT ${safeIdx(i, 'rel_uniq')} UNIQUE (source_type, source_id, target_type, target_id, relation) ); - CREATE INDEX IF NOT EXISTS ${idx('source_idx')} ON ${t} (source_entity_id); - CREATE INDEX IF NOT EXISTS ${idx('target_idx')} ON ${t} (target_entity_id); + CREATE INDEX IF NOT EXISTS ${idx('source_idx')} ON ${t} (source_type, source_id); + CREATE INDEX IF NOT EXISTS ${idx('target_idx')} ON ${t} (target_type, target_id); + CREATE INDEX IF NOT EXISTS ${idx('entity_source_idx')} ON ${t} (source_id) WHERE source_type = 'entity'; + CREATE INDEX IF NOT EXISTS ${idx('entity_target_idx')} ON ${t} (target_id) WHERE target_type = 'entity'; + CREATE INDEX IF NOT EXISTS ${idx('memory_source_idx')} ON ${t} (source_id) WHERE source_type = 'memory'; + CREATE INDEX IF NOT EXISTS ${idx('memory_target_idx')} ON ${t} (target_id) WHERE target_type = 'memory'; + CREATE INDEX IF NOT EXISTS ${idx('to_chunk_ref_idx')} ON ${t} (to_bucket_id, to_source_id, to_chunk_index) WHERE target_type = 'chunk'; + CREATE INDEX IF NOT EXISTS ${idx('from_chunk_ref_idx')} ON ${t} (from_bucket_id, from_source_id, from_chunk_index) WHERE source_type = 'chunk'; CREATE INDEX IF NOT EXISTS ${idx('relation_idx')} ON ${t} (relation); + CREATE INDEX IF NOT EXISTS ${idx('invalid_at_idx')} ON ${t} (invalid_at); CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} ON ${t} (tenant_id, user_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} ON ${t} (tenant_id, group_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} ON ${t} (tenant_id, agent_id); @@ -229,11 +301,11 @@ const CHUNK_MENTIONS_DDL = (t: string) => { CREATE TABLE IF NOT EXISTS ${t} ( id TEXT PRIMARY KEY, entity_id TEXT NOT NULL, - document_id TEXT NOT NULL, + source_id TEXT NOT NULL, chunk_index INTEGER NOT NULL, bucket_id TEXT NOT NULL, mention_type TEXT NOT NULL - CHECK (mention_type IN ('subject', 'object', 'co_occurrence', 'entity', 'alias')), + CHECK (mention_type IN ('subject', 'object', 'co_occurrence', 'entity', 'alias', 'source_subject')), surface_text TEXT, normalized_surface_text TEXT NOT NULL DEFAULT '', confidence REAL, @@ -241,80 +313,11 @@ const CHUNK_MENTIONS_DDL = (t: string) => { ); CREATE INDEX IF NOT EXISTS ${idx('entity_idx')} ON ${t} (entity_id); - CREATE INDEX IF NOT EXISTS ${idx('chunk_idx')} ON ${t} (document_id, chunk_index); + CREATE INDEX IF NOT EXISTS ${idx('chunk_idx')} ON ${t} (source_id, chunk_index); CREATE INDEX IF NOT EXISTS ${idx('bucket_entity_idx')} ON ${t} (bucket_id, entity_id); CREATE INDEX IF NOT EXISTS ${idx('surface_idx')} ON ${t} (normalized_surface_text); CREATE UNIQUE INDEX IF NOT EXISTS ${idx('mention_uniq_idx')} - ON ${t} (entity_id, document_id, chunk_index, mention_type, normalized_surface_text); -` -} - -const PASSAGE_NODES_DDL = (t: string) => { - const i = idxPrefix(t) - const idx = (suffix: string) => safeIdx(i, suffix) - return ` - CREATE TABLE IF NOT EXISTS ${t} ( - id TEXT PRIMARY KEY, - bucket_id TEXT NOT NULL, - document_id TEXT NOT NULL, - chunk_index INTEGER NOT NULL, - chunk_id TEXT, - embedding_model TEXT NOT NULL, - content_hash TEXT NOT NULL, - metadata JSONB NOT NULL DEFAULT '{}', - scope JSONB NOT NULL DEFAULT '{}', - tenant_id TEXT, - group_id TEXT, - user_id TEXT, - 
agent_id TEXT, - conversation_id TEXT, - visibility TEXT CHECK (visibility IS NULL OR visibility IN ('tenant', 'group', 'user', 'agent', 'conversation')), - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - CONSTRAINT ${safeIdx(i, 'passage_uniq')} UNIQUE (bucket_id, document_id, chunk_index, embedding_model) - ); - - CREATE INDEX IF NOT EXISTS ${idx('doc_chunk_idx')} ON ${t} (document_id, chunk_index); - CREATE INDEX IF NOT EXISTS ${idx('bucket_idx')} ON ${t} (bucket_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} ON ${t} (tenant_id, user_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} ON ${t} (tenant_id, group_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} ON ${t} (tenant_id, agent_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_conversation_idx')} ON ${t} (tenant_id, conversation_id); - CREATE INDEX IF NOT EXISTS ${idx('visibility_idx')} ON ${t} (visibility); -` -} - -const PASSAGE_ENTITY_EDGES_DDL = (t: string) => { - const i = idxPrefix(t) - const idx = (suffix: string) => safeIdx(i, suffix) - return ` - CREATE TABLE IF NOT EXISTS ${t} ( - passage_id TEXT NOT NULL, - entity_id TEXT NOT NULL, - weight REAL NOT NULL DEFAULT 1.0, - mention_count INTEGER NOT NULL DEFAULT 1, - confidence REAL, - surface_texts TEXT[] NOT NULL DEFAULT '{}', - mention_types TEXT[] NOT NULL DEFAULT '{}', - scope JSONB NOT NULL DEFAULT '{}', - tenant_id TEXT, - group_id TEXT, - user_id TEXT, - agent_id TEXT, - conversation_id TEXT, - visibility TEXT CHECK (visibility IS NULL OR visibility IN ('tenant', 'group', 'user', 'agent', 'conversation')), - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - PRIMARY KEY (passage_id, entity_id) - ); - - CREATE INDEX IF NOT EXISTS ${idx('entity_idx')} ON ${t} (entity_id); - CREATE INDEX IF NOT EXISTS ${idx('passage_idx')} ON ${t} (passage_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} ON ${t} (tenant_id, group_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} ON ${t} (tenant_id, user_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} ON ${t} (tenant_id, agent_id); - CREATE INDEX IF NOT EXISTS ${idx('tenant_conversation_idx')} ON ${t} (tenant_id, conversation_id); - CREATE INDEX IF NOT EXISTS ${idx('visibility_idx')} ON ${t} (visibility); + ON ${t} (entity_id, source_id, chunk_index, mention_type, normalized_surface_text); ` } @@ -332,7 +335,7 @@ const FACT_RECORDS_DDL = (t: string, dims?: number) => { description TEXT, evidence_text TEXT, fact_search_text TEXT NOT NULL, - source_chunk_id TEXT, + from_chunk_id TEXT, weight REAL NOT NULL DEFAULT 1.0, evidence_count INTEGER NOT NULL DEFAULT 1, embedding VECTOR${dims ? 
`(${dims})` : ''}, @@ -343,8 +346,10 @@ const FACT_RECORDS_DDL = (t: string, dims?: number) => { agent_id TEXT, conversation_id TEXT, visibility TEXT CHECK (visibility IS NULL OR visibility IN ('tenant', 'group', 'user', 'agent', 'conversation')), + invalid_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + search_vector TSVECTOR GENERATED ALWAYS AS (to_tsvector('english', fact_search_text)) STORED ); CREATE INDEX IF NOT EXISTS ${idx('source_idx')} ON ${t} (source_entity_id); @@ -356,6 +361,7 @@ const FACT_RECORDS_DDL = (t: string, dims?: number) => { CREATE INDEX IF NOT EXISTS ${idx('tenant_conversation_idx')} ON ${t} (tenant_id, conversation_id); CREATE INDEX IF NOT EXISTS ${idx('visibility_idx')} ON ${t} (visibility); CREATE INDEX IF NOT EXISTS ${idx('embedding_idx')} ON ${t} USING hnsw (embedding vector_cosine_ops); + CREATE INDEX IF NOT EXISTS ${idx('search_vector_idx')} ON ${t} USING gin (search_vector); ` } @@ -368,10 +374,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { private sql: SqlExecutor private memoriesTable: string private entitiesTable: string + private entityExternalIdsTable: string private edgesTable: string private chunkMentionsTable: string - private passageNodesTable: string - private passageEntityEdgesTable: string private factRecordsTable: string private schema: string | undefined private hnswEntityIndexCreated = false @@ -384,10 +389,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const prefix = config.schema ? `"${config.schema}".` : '' this.memoriesTable = config.memoriesTable ?? `${prefix}typegraph_memories` this.entitiesTable = config.entitiesTable ?? `${prefix}typegraph_semantic_entities` - this.edgesTable = config.edgesTable ?? `${prefix}typegraph_semantic_edges` + this.entityExternalIdsTable = config.entityExternalIdsTable ?? `${prefix}typegraph_entity_external_ids` + this.edgesTable = config.edgesTable ?? `${prefix}typegraph_graph_edges` this.chunkMentionsTable = config.chunkMentionsTable ?? `${prefix}typegraph_entity_chunk_mentions` - this.passageNodesTable = config.passageNodesTable ?? `${prefix}typegraph_passage_nodes` - this.passageEntityEdgesTable = config.passageEntityEdgesTable ?? `${prefix}typegraph_passage_entity_edges` this.factRecordsTable = config.factRecordsTable ?? `${prefix}typegraph_fact_records` this.embeddingDimensions = config.embeddingDimensions ?? 1536 } @@ -404,10 +408,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { MEMORIES_DDL(this.memoriesTable), ENTITIES_DDL(this.entitiesTable, this.embeddingDimensions), + ENTITY_EXTERNAL_IDS_DDL(this.entityExternalIdsTable, this.entitiesTable), EDGES_DDL(this.edgesTable), CHUNK_MENTIONS_DDL(this.chunkMentionsTable), - PASSAGE_NODES_DDL(this.passageNodesTable), - PASSAGE_ENTITY_EDGES_DDL(this.passageEntityEdgesTable), FACT_RECORDS_DDL(this.factRecordsTable, this.embeddingDimensions), ] for (const ddl of allDdl) { @@ -420,7 +423,8 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { } } await this.ensureChunkMentionShape() - await this.ensurePassageEntityEdgeShape() + await this.ensureEntityMaintenanceShape() + await this.ensureFactRecordsShape() // Try to create HNSW indexes on entity and memory embeddings. // May fail if tables are empty (no embedding dimensions known yet). 
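(Reviewer aside, not part of the patch: the hunks above reshape the memory store's DDL and add deterministic external-ID storage. Below is a minimal, hypothetical usage sketch that mirrors the calls exercised in memory-store.test.ts earlier in this diff; the `sql` executor wiring and the relative import path are illustrative assumptions only.)

```ts
// Hypothetical sketch only; not part of the patch. Assumes a postgres.js/pg-style
// executor matching the (query, params) => rows shape the adapter's SqlExecutor expects.
import type { ExternalId } from '@typegraph-ai/sdk'
import { PgMemoryStoreAdapter } from '../src/memory-store.js' // path as used in the tests above

declare const sql: (query: string, params?: unknown[]) => Promise<Record<string, unknown>[]>

async function seedDeterministicIdentity(): Promise<void> {
  const store = new PgMemoryStoreAdapter({ sql, embeddingDimensions: 1536 })
  await store.initialize() // creates typegraph_graph_edges, typegraph_entity_external_ids, chunk mentions, fact records

  const email: ExternalId = { id: 'Alice@Example.com', type: 'email', identityType: 'user' }
  // Stored with normalized_value 'alice@example.com'; re-attaching the same scoped
  // external ID to a different entity is rejected by the upsert's conflict guard.
  await store.upsertEntityExternalIds('ent_alice', [email], { tenantId: 'tenant-1' })
}
```

The scoped unique index in ENTITY_EXTERNAL_IDS_DDL is what makes that second upsert for an already-linked external ID fail loudly instead of silently reassigning identity.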
@@ -429,22 +433,6 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { await this.ensureHnswIndex('memory') } - private async ensurePassageEntityEdgeShape(): Promise { - const i = idxPrefix(this.passageEntityEdgesTable) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS scope JSONB NOT NULL DEFAULT '{}'`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS tenant_id TEXT`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS group_id TEXT`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS user_id TEXT`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS agent_id TEXT`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS conversation_id TEXT`) - await this.sql(`ALTER TABLE ${this.passageEntityEdgesTable} ADD COLUMN IF NOT EXISTS visibility TEXT`) - await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'tenant_group_idx')} ON ${this.passageEntityEdgesTable} (tenant_id, group_id)`) - await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'tenant_user_idx')} ON ${this.passageEntityEdgesTable} (tenant_id, user_id)`) - await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'tenant_agent_idx')} ON ${this.passageEntityEdgesTable} (tenant_id, agent_id)`) - await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'tenant_conversation_idx')} ON ${this.passageEntityEdgesTable} (tenant_id, conversation_id)`) - await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'visibility_idx')} ON ${this.passageEntityEdgesTable} (visibility)`) - } - /** * SQL executor with auto-recovery on missing tables. On PG error 42P01 * (undefined_table), calls initialize() to create the missing table and @@ -468,6 +456,18 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { } } + private async withTransaction(fn: () => Promise): Promise { + await this.sql('BEGIN') + try { + const result = await fn() + await this.sql('COMMIT') + return result + } catch (err) { + await this.sql('ROLLBACK') + throw err + } + } + private async ensureHnswIndex(target: 'entity' | 'memory'): Promise { const table = target === 'entity' ? this.entitiesTable : this.memoriesTable const created = target === 'entity' ? 
this.hnswEntityIndexCreated : this.hnswMemoryIndexCreated @@ -514,14 +514,35 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { await this.sql( `ALTER TABLE ${this.chunkMentionsTable} ADD CONSTRAINT ${mentionTypeCheck} - CHECK (mention_type IN ('subject', 'object', 'co_occurrence', 'entity', 'alias')) NOT VALID` + CHECK (mention_type IN ('subject', 'object', 'co_occurrence', 'entity', 'alias', 'source_subject')) NOT VALID` ) await this.sql(`ALTER TABLE ${this.chunkMentionsTable} VALIDATE CONSTRAINT ${mentionTypeCheck}`) await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'surface_idx')} ON ${this.chunkMentionsTable} (normalized_surface_text)`) await this.sql( `CREATE UNIQUE INDEX IF NOT EXISTS ${safeIdx(i, 'mention_uniq_idx')} - ON ${this.chunkMentionsTable} (entity_id, document_id, chunk_index, mention_type, normalized_surface_text)` + ON ${this.chunkMentionsTable} (entity_id, source_id, chunk_index, mention_type, normalized_surface_text)` + ) + } + + private async ensureEntityMaintenanceShape(): Promise { + const i = idxPrefix(this.entitiesTable) + await this.sql(`ALTER TABLE ${this.entitiesTable} ADD COLUMN IF NOT EXISTS status TEXT NOT NULL DEFAULT 'active'`) + await this.sql(`ALTER TABLE ${this.entitiesTable} ADD COLUMN IF NOT EXISTS merged_into_entity_id TEXT`) + await this.sql(`ALTER TABLE ${this.entitiesTable} ADD COLUMN IF NOT EXISTS deleted_at TIMESTAMPTZ`) + await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'status_idx')} ON ${this.entitiesTable} (status)`) + await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'merged_into_idx')} ON ${this.entitiesTable} (merged_into_entity_id)`) + } + + private async ensureFactRecordsShape(): Promise { + const i = idxPrefix(this.factRecordsTable) + await this.sql(`ALTER TABLE ${this.factRecordsTable} ADD COLUMN IF NOT EXISTS invalid_at TIMESTAMPTZ`) + await this.sql( + `ALTER TABLE ${this.factRecordsTable} + ADD COLUMN IF NOT EXISTS search_vector TSVECTOR + GENERATED ALWAYS AS (to_tsvector('english', fact_search_text)) STORED` ) + await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'invalid_at_idx')} ON ${this.factRecordsTable} (invalid_at)`) + await this.sql(`CREATE INDEX IF NOT EXISTS ${safeIdx(i, 'search_vector_idx')} ON ${this.factRecordsTable} USING gin (search_vector)`) } // ── CRUD ── @@ -642,7 +663,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { } async getHistory(id: string): Promise { - // Return the record itself — in a full bi-temporal system, we'd + // Return the record itself — in a full bi-temporal system, we'd // query all versions sharing a lineage ID. For now, return the single record. const row = await this.get(id) return row ? [row] : [] } @@ -770,6 +791,34 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { // ── Entity Storage ── + private async attachExternalIds(entities: SemanticEntity[]): Promise { + if (entities.length === 0) return entities + const rows = await this.sqlWithRetry( + `SELECT entity_id, type, id_value, encoding, metadata + FROM ${this.entityExternalIdsTable} + WHERE entity_id = ANY($1::text[]) + ORDER BY created_at ASC`, + [entities.map(entity => entity.id)] + ) + const byEntity = new Map() + for (const row of rows) { + const entityId = row.entity_id as string + const externalId: ExternalId = { + type: row.type as string, + id: row.id_value as string, + encoding: (row.encoding as ExternalId['encoding']) ?? 'none', + metadata: parseJson(row.metadata), + } + const list = byEntity.get(entityId) ??
[] + list.push(externalId) + byEntity.set(entityId, list) + } + return entities.map(entity => ({ + ...entity, + externalIds: byEntity.get(entity.id) ?? entity.externalIds, + })) + } + async upsertEntity(entity: SemanticEntity): Promise { const embeddingStr = entity.embedding ? `[${entity.embedding.join(',')}]` : null const descEmbeddingStr = entity.descriptionEmbedding ? `[${entity.descriptionEmbedding.join(',')}]` : null @@ -779,13 +828,16 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const tbl = unqualified(this.entitiesTable) const rows = await this.sqlWithRetry( `INSERT INTO ${this.entitiesTable} - (id, name, entity_type, aliases, properties, embedding, description_embedding, scope, + (id, name, entity_type, aliases, properties, status, merged_into_entity_id, deleted_at, embedding, description_embedding, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, valid_at, invalid_at, updated_at) - VALUES ($1,$2,$3,$4,$5,$6::vector,$7::vector,$8,$9,$10,$11,$12,$13,$14,$15,$16,NOW()) + VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9::vector,$10::vector,$11,$12,$13,$14,$15,$16,$17,$18,$19,NOW()) ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, entity_type = EXCLUDED.entity_type, aliases = EXCLUDED.aliases, properties = EXCLUDED.properties, + status = EXCLUDED.status, + merged_into_entity_id = EXCLUDED.merged_into_entity_id, + deleted_at = EXCLUDED.deleted_at, embedding = COALESCE(EXCLUDED.embedding, ${tbl}.embedding), description_embedding = COALESCE(EXCLUDED.description_embedding, ${tbl}.description_embedding), scope = EXCLUDED.scope, @@ -797,6 +849,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { [ entity.id, entity.name, entity.entityType, entity.aliases, JSON.stringify(cleanProps), + entity.status ?? 'active', + entity.mergedIntoEntityId ?? null, + entity.deletedAt?.toISOString() ?? null, embeddingStr, descEmbeddingStr, JSON.stringify(entity.scope), entity.scope.tenantId ?? null, entity.scope.groupId ?? null, @@ -815,14 +870,107 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { await this.ensureHnswIndex('entity') } - return mapRowToEntity(rows[0]!) + if (entity.externalIds && entity.externalIds.length > 0) { + await this.upsertEntityExternalIds(entity.id, entity.externalIds, entity.scope) + } + + const [mapped] = await this.attachExternalIds([mapRowToEntity(rows[0]!)]) + return mapped! + } + + async upsertEntityExternalIds(entityId: string, externalIds: ExternalId[], scope: typegraphIdentity): Promise { + if (externalIds.length === 0) return + + const values: string[] = [] + const params: unknown[] = [] + for (const externalId of externalIds) { + const normalized = normalizeExternalId(externalId) + if (!normalized) continue + const base = params.length + values.push(`($${base + 1},$${base + 2},$${base + 3},$${base + 4},$${base + 5},$${base + 6},$${base + 7},$${base + 8},$${base + 9},$${base + 10},$${base + 11},$${base + 12},$${base + 13})`) + params.push( + generateId('xid'), + entityId, + normalized.type, + normalized.id, + normalized.normalizedValue, + normalized.encoding, + JSON.stringify(normalized.metadata ?? {}), + JSON.stringify(scope), + scope.tenantId ?? null, + scope.groupId ?? null, + scope.userId ?? null, + scope.agentId ?? null, + scope.conversationId ?? 
null, + ) + } + if (values.length === 0) return + + const tbl = unqualified(this.entityExternalIdsTable) + const rows = await this.sqlWithRetry( + `INSERT INTO ${this.entityExternalIdsTable} + (id, entity_id, type, id_value, normalized_value, encoding, metadata, + scope, tenant_id, group_id, user_id, agent_id, conversation_id) + VALUES ${values.join(',')} + ON CONFLICT ( + type, + normalized_value, + encoding, + COALESCE(tenant_id, ''), + COALESCE(group_id, ''), + COALESCE(user_id, ''), + COALESCE(agent_id, ''), + COALESCE(conversation_id, '') + ) DO UPDATE SET + id_value = EXCLUDED.id_value, + metadata = EXCLUDED.metadata, + updated_at = NOW() + WHERE ${tbl}.entity_id = EXCLUDED.entity_id + RETURNING id`, + params + ) + if (rows.length !== values.length) { + throw new Error('One or more external IDs are already linked to a different entity') + } + } + + async findEntityByExternalId(externalId: ExternalId, scope?: typegraphIdentity): Promise { + const normalized = normalizeExternalId(externalId) + if (!normalized) return null + const identity = buildGraphVisibilityWhere(scope, 4, 'e') + const scopeClause = identity.where ? `AND ${identity.where}` : '' + const rows = await this.sqlWithRetry( + `SELECT e.id, e.name, e.entity_type, e.aliases, e.properties, + e.status, e.merged_into_entity_id, e.deleted_at, e.scope, + e.tenant_id, e.group_id, e.user_id, e.agent_id, e.conversation_id, e.visibility, + e.valid_at, e.invalid_at, e.created_at, e.updated_at + FROM ${this.entityExternalIdsTable} xid + JOIN ${this.entitiesTable} e ON e.id = xid.entity_id + WHERE xid.type = $1 + AND xid.normalized_value = $2 + AND xid.encoding = $3 + ${scopeClause} + AND e.invalid_at IS NULL + AND e.status = 'active' + LIMIT 1`, + [ + normalized.type, + normalized.normalizedValue, + normalized.encoding, + ...identity.params, + ] + ) + if (rows.length === 0) return null + const [mapped] = await this.attachExternalIds([mapRowToEntity(rows[0]!)]) + return mapped! } async getEntity(id: string, scope?: typegraphIdentity): Promise { const identity = buildGraphVisibilityWhere(scope, 1) const scopeClause = identity.where ? `AND ${identity.where}` : '' const rows = await this.sqlWithRetry( - `SELECT id, name, entity_type, aliases, properties, scope, + `SELECT id, name, entity_type, aliases, properties, + status, merged_into_entity_id, deleted_at, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, valid_at, invalid_at, created_at, updated_at FROM ${this.entitiesTable} @@ -830,7 +978,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { ${scopeClause}`, [id, ...identity.params] ) - return rows.length > 0 ? mapRowToEntity(rows[0]!) : null + if (rows.length === 0) return null + const [mapped] = await this.attachExternalIds([mapRowToEntity(rows[0]!)]) + return mapped! } async getEntitiesBatch(ids: string[], scope?: typegraphIdentity): Promise { @@ -838,7 +988,8 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const identity = buildGraphVisibilityWhere(scope, 1) const scopeClause = identity.where ? 
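// Batch entity reads apply the same visibility predicate as getEntity() and also run the results
// through attachExternalIds().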
`AND ${identity.where}` : '' const rows = await this.sqlWithRetry( - `SELECT id, name, entity_type, aliases, properties, scope, + `SELECT id, name, entity_type, aliases, properties, + status, merged_into_entity_id, deleted_at, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, valid_at, invalid_at, created_at, updated_at FROM ${this.entitiesTable} @@ -846,7 +997,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { ${scopeClause}`, [ids, ...identity.params] ) - return rows.map(mapRowToEntity) + return this.attachExternalIds(rows.map(mapRowToEntity)) } async findEntities(query: string, scope: typegraphIdentity, limit?: number): Promise { @@ -858,7 +1009,8 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const limitParam = `$${baseIdx + 2}` const scopeClause = where ? ` AND ${where}` : '' const rows = await this.sqlWithRetry( - `SELECT id, name, entity_type, aliases, properties, scope, + `SELECT id, name, entity_type, aliases, properties, + status, merged_into_entity_id, deleted_at, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, valid_at, invalid_at, created_at, updated_at FROM ${this.entitiesTable} @@ -871,10 +1023,11 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { )) ${scopeClause} AND invalid_at IS NULL + AND status = 'active' LIMIT ${limitParam}`, params ) - return rows.map(mapRowToEntity) + return this.attachExternalIds(rows.map(mapRowToEntity)) } async searchEntities(embedding: number[], scope: typegraphIdentity, limit?: number): Promise { @@ -887,13 +1040,14 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { `SELECT *, 1 - (embedding <=> $1::vector) AS similarity FROM ${this.entitiesTable} WHERE embedding IS NOT NULL - ${scopeClause} AND invalid_at IS NULL + AND status = 'active' + ${scopeClause} ORDER BY embedding <=> $1::vector LIMIT ${limitParam}`, [vectorStr, ...params] ) - return rows.map(mapRowToEntity) + return this.attachExternalIds(rows.map(mapRowToEntity)) } async searchEntitiesHybrid(query: string, embedding: number[], scope: typegraphIdentity, limit?: number): Promise { @@ -926,8 +1080,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { WHERE m.entity_id = e.id AND m.surface_text ILIKE ${likeParam} ) THEN 0.84 ELSE 0 END ) AS similarity - FROM ${this.entitiesTable} e + FROM ${this.entitiesTable} e WHERE e.invalid_at IS NULL + AND e.status = 'active' ${scopeClause} AND ( lower(e.name) = ${lowerParam} @@ -952,10 +1107,11 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { 1 - (embedding <=> $1::vector), COALESCE(1 - (description_embedding <=> $1::vector), 0) ) AS similarity - FROM ${this.entitiesTable} + FROM ${this.entitiesTable} WHERE embedding IS NOT NULL ${vectorScopeClause} AND invalid_at IS NULL + AND status = 'active' ORDER BY embedding <=> $1::vector LIMIT ${vectorLimitParam}`, [vectorStr, ...vectorWhere.params, maxRows * 3] @@ -970,64 +1126,15 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { } } - return [...byId.values()] + const merged = [...byId.values()] .sort((a, b) => ((b.properties._similarity as number | undefined) ?? 0) - ((a.properties._similarity as number | undefined) ?? 
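// Lexical and vector candidates were merged by id above; the combined list is ordered by the
// _similarity score stashed on properties, truncated to maxRows, and returned with external IDs attached.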
0)) .slice(0, maxRows) + return this.attachExternalIds(merged) } - // ── Passage + Fact Graph Storage ── + // ── Chunk + Fact Graph Storage ── - async upsertPassageNodes(nodes: SemanticPassageNode[]): Promise { - if (nodes.length === 0) return - - const values: string[] = [] - const params: unknown[] = [] - for (const node of nodes) { - const base = params.length - values.push(`($${base + 1},$${base + 2},$${base + 3},$${base + 4},$${base + 5},$${base + 6},$${base + 7},$${base + 8},$${base + 9},$${base + 10},$${base + 11},$${base + 12},$${base + 13},$${base + 14},$${base + 15},$${base + 16})`) - params.push( - node.id, - node.bucketId, - node.documentId, - node.chunkIndex, - node.chunkId ?? null, - node.embeddingModel, - node.contentHash, - JSON.stringify(node.metadata), - JSON.stringify(node.scope), - node.scope.tenantId ?? null, - node.scope.groupId ?? null, - node.scope.userId ?? null, - node.scope.agentId ?? null, - node.scope.conversationId ?? null, - node.visibility ?? null, - node.updatedAt.toISOString(), - ) - } - - const tbl = unqualified(this.passageNodesTable) - await this.sqlWithRetry( - `INSERT INTO ${this.passageNodesTable} - (id, bucket_id, document_id, chunk_index, chunk_id, embedding_model, content_hash, - metadata, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, updated_at) - VALUES ${values.join(',')} - ON CONFLICT (bucket_id, document_id, chunk_index, embedding_model) DO UPDATE SET - chunk_id = COALESCE(EXCLUDED.chunk_id, ${tbl}.chunk_id), - content_hash = EXCLUDED.content_hash, - metadata = EXCLUDED.metadata, - scope = EXCLUDED.scope, - tenant_id = EXCLUDED.tenant_id, - group_id = EXCLUDED.group_id, - user_id = EXCLUDED.user_id, - agent_id = EXCLUDED.agent_id, - conversation_id = EXCLUDED.conversation_id, - visibility = EXCLUDED.visibility, - updated_at = EXCLUDED.updated_at`, - params - ) - } - - async upsertPassageEntityEdges(edges: SemanticPassageEntityEdge[]): Promise { + async upsertGraphEdges(edges: SemanticGraphEdge[]): Promise { if (edges.length === 0) return const values: string[] = [] @@ -1035,44 +1142,69 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { for (const edge of edges) { const base = params.length const scope = edge.scope ?? {} - values.push(`($${base + 1},$${base + 2},$${base + 3},$${base + 4},$${base + 5},$${base + 6},$${base + 7},$${base + 8},$${base + 9},$${base + 10},$${base + 11},$${base + 12},$${base + 13},$${base + 14})`) + values.push(`($${base + 1},$${base + 2},$${base + 3},$${base + 4},$${base + 5},$${base + 6},$${base + 7},$${base + 8},$${base + 9},$${base + 10},$${base + 11},$${base + 12},$${base + 13},$${base + 14},$${base + 15},$${base + 16},$${base + 17},$${base + 18},$${base + 19},$${base + 20},$${base + 21},$${base + 22},$${base + 23},$${base + 24},$${base + 25},$${base + 26},$${base + 27},$${base + 28})`) params.push( - edge.passageId, - edge.entityId, + edge.id, + edge.sourceType, + edge.sourceId, + edge.targetType, + edge.targetId, + edge.relation, edge.weight, - edge.mentionCount, - edge.confidence ?? null, - edge.surfaceTexts, - edge.mentionTypes, + JSON.stringify(edge.properties ?? {}), JSON.stringify(scope), + edge.sourceChunkRef?.bucketId ?? null, + edge.sourceChunkRef?.sourceId ?? null, + edge.sourceChunkRef?.chunkIndex ?? null, + edge.sourceChunkRef?.embeddingModel ?? null, + edge.sourceChunkRef?.chunkId ?? null, + edge.targetChunkRef?.bucketId ?? null, + edge.targetChunkRef?.sourceId ?? null, + edge.targetChunkRef?.chunkIndex ?? null, + edge.targetChunkRef?.embeddingModel ?? 
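// Graph edges are generic: (source_type, source_id) -> (target_type, target_id) plus relation.
// The optional from_*/to_* columns persist a chunk endpoint's (bucket_id, source_id, chunk_index,
// embedding_model, chunk_id) reference so chunk-linked edges can be resolved without a passage table.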
null, + edge.targetChunkRef?.chunkId ?? null, scope.tenantId ?? null, scope.groupId ?? null, scope.userId ?? null, scope.agentId ?? null, scope.conversationId ?? null, edge.visibility ?? null, + edge.evidence ?? [], + edge.temporal.validAt.toISOString(), + edge.temporal.invalidAt?.toISOString() ?? null, ) } - const tbl = unqualified(this.passageEntityEdgesTable) + const tbl = unqualified(this.edgesTable) await this.sqlWithRetry( - `INSERT INTO ${this.passageEntityEdgesTable} - (passage_id, entity_id, weight, mention_count, confidence, surface_texts, mention_types, - scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility) + `INSERT INTO ${this.edgesTable} + (id, source_type, source_id, target_type, target_id, relation, weight, properties, scope, + from_bucket_id, from_source_id, from_chunk_index, from_embedding_model, from_chunk_id, + to_bucket_id, to_source_id, to_chunk_index, to_embedding_model, to_chunk_id, + tenant_id, group_id, user_id, agent_id, conversation_id, visibility, evidence, valid_at, invalid_at) VALUES ${values.join(',')} - ON CONFLICT (passage_id, entity_id) DO UPDATE SET + ON CONFLICT (source_type, source_id, target_type, target_id, relation) DO UPDATE SET weight = LEAST(5.0, ${tbl}.weight + EXCLUDED.weight), - mention_count = ${tbl}.mention_count + EXCLUDED.mention_count, - confidence = GREATEST(COALESCE(${tbl}.confidence, 0), COALESCE(EXCLUDED.confidence, 0)), - surface_texts = ARRAY(SELECT DISTINCT v FROM unnest(${tbl}.surface_texts || EXCLUDED.surface_texts) AS v WHERE v <> ''), - mention_types = ARRAY(SELECT DISTINCT v FROM unnest(${tbl}.mention_types || EXCLUDED.mention_types) AS v WHERE v <> ''), + properties = ${tbl}.properties || EXCLUDED.properties, scope = EXCLUDED.scope, + from_bucket_id = COALESCE(EXCLUDED.from_bucket_id, ${tbl}.from_bucket_id), + from_source_id = COALESCE(EXCLUDED.from_source_id, ${tbl}.from_source_id), + from_chunk_index = COALESCE(EXCLUDED.from_chunk_index, ${tbl}.from_chunk_index), + from_embedding_model = COALESCE(EXCLUDED.from_embedding_model, ${tbl}.from_embedding_model), + from_chunk_id = COALESCE(EXCLUDED.from_chunk_id, ${tbl}.from_chunk_id), + to_bucket_id = COALESCE(EXCLUDED.to_bucket_id, ${tbl}.to_bucket_id), + to_source_id = COALESCE(EXCLUDED.to_source_id, ${tbl}.to_source_id), + to_chunk_index = COALESCE(EXCLUDED.to_chunk_index, ${tbl}.to_chunk_index), + to_embedding_model = COALESCE(EXCLUDED.to_embedding_model, ${tbl}.to_embedding_model), + to_chunk_id = COALESCE(EXCLUDED.to_chunk_id, ${tbl}.to_chunk_id), tenant_id = EXCLUDED.tenant_id, group_id = EXCLUDED.group_id, user_id = EXCLUDED.user_id, agent_id = EXCLUDED.agent_id, conversation_id = EXCLUDED.conversation_id, visibility = EXCLUDED.visibility, + evidence = ARRAY(SELECT DISTINCT v FROM unnest(${tbl}.evidence || EXCLUDED.evidence) AS v WHERE v <> ''), + invalid_at = EXCLUDED.invalid_at, updated_at = NOW()`, params ) @@ -1101,15 +1233,16 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { fact.scope.agentId ?? null, fact.scope.conversationId ?? null, fact.visibility ?? null, + fact.invalidAt?.toISOString() ?? 
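// Fact records now carry invalid_at for soft invalidation, and fact_search_text also feeds the
// generated search_vector column added in ensureFactRecordsShape(), which backs searchFactsHybrid().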
null, fact.updatedAt.toISOString(), ] const table = unqualified(this.factRecordsTable) const buildSql = (conflictTarget: 'edge_id' | 'id') => `INSERT INTO ${this.factRecordsTable} (id, edge_id, source_entity_id, target_entity_id, relation, fact_text, - description, evidence_text, fact_search_text, source_chunk_id, weight, + description, evidence_text, fact_search_text, from_chunk_id, weight, evidence_count, embedding, scope, tenant_id, group_id, user_id, agent_id, - conversation_id, visibility, updated_at) - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13::vector,$14,$15,$16,$17,$18,$19,$20,$21) + conversation_id, visibility, invalid_at, updated_at) + VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13::vector,$14,$15,$16,$17,$18,$19,$20,$21,$22) ON CONFLICT (${conflictTarget}) DO UPDATE SET ${conflictTarget === 'id' ? `edge_id = EXCLUDED.edge_id, @@ -1121,7 +1254,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { description = EXCLUDED.description, evidence_text = EXCLUDED.evidence_text, fact_search_text = EXCLUDED.fact_search_text, - source_chunk_id = COALESCE(EXCLUDED.source_chunk_id, ${table}.source_chunk_id), + from_chunk_id = COALESCE(EXCLUDED.from_chunk_id, ${table}.from_chunk_id), weight = GREATEST(${table}.weight, EXCLUDED.weight), evidence_count = GREATEST(${table}.evidence_count, EXCLUDED.evidence_count), embedding = COALESCE(EXCLUDED.embedding, ${table}.embedding), @@ -1132,6 +1265,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { agent_id = EXCLUDED.agent_id, conversation_id = EXCLUDED.conversation_id, visibility = EXCLUDED.visibility, + invalid_at = EXCLUDED.invalid_at, updated_at = EXCLUDED.updated_at RETURNING *` let rows: Record[] @@ -1161,150 +1295,162 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { return rows.map(mapRowToFact) } - async getPassageEdgesForEntities( + async searchFactsHybrid(query: string, embedding: number[] | undefined, scope: typegraphIdentity, limit?: number): Promise { + const maxRows = limit ?? 20 + const identity = buildGraphVisibilityWhere(scope, 2) + const scopeClause = identity.where ? ` AND ${identity.where}` : '' + const relaxedQuery = normalizeEntityText(query) + const lexicalRows = await this.sqlWithRetry( + `WITH tsq AS ( + SELECT websearch_to_tsquery('english', $1::text) AS strict_q, + websearch_to_tsquery('english', $2::text) AS relaxed_q + ) + SELECT f.*, + GREATEST(ts_rank(f.search_vector, tsq.strict_q), ts_rank(f.search_vector, tsq.relaxed_q) * 0.75) AS similarity + FROM ${this.factRecordsTable} f, tsq + WHERE f.invalid_at IS NULL + AND (f.search_vector @@ tsq.strict_q OR f.search_vector @@ tsq.relaxed_q) + ${scopeClause} + ORDER BY similarity DESC + LIMIT $${2 + identity.params.length + 1}`, + [query, relaxedQuery, ...identity.params, maxRows * 3] + ) + + const vectorRows = embedding + ? await this.searchFacts(embedding, scope, maxRows * 3) + : [] + + const byId = new Map() + for (const row of [...lexicalRows.map(mapRowToFact), ...vectorRows]) { + const existing = byId.get(row.id) + if (!existing || (row.similarity ?? 0) > (existing.similarity ?? 0)) byId.set(row.id, row) + } + return [...byId.values()] + .sort((a, b) => (b.similarity ?? 0) - (a.similarity ?? 
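// Lexical candidates come from both a strict and a relaxed (normalizeEntityText) websearch_to_tsquery,
// with relaxed matches discounted to 0.75 of their rank; vector hits are then merged in by id and the
// best similarity per fact wins.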
0)) + .slice(0, maxRows) + } + + async getChunkEdgesForEntities( entityIds: string[], opts?: { scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined limit?: number | undefined } - ): Promise { + ): Promise { if (entityIds.length === 0) return [] const params: unknown[] = [entityIds] let bucketClause = '' if (opts?.bucketIds && opts.bucketIds.length > 0) { params.push(opts.bucketIds) - bucketClause = `AND p.bucket_id = ANY($${params.length}::text[])` + bucketClause = `AND e.to_bucket_id = ANY($${params.length}::text[])` } - const edgeIdentity = buildGraphVisibilityWhere(opts?.scope, params.length, 'pe') + const edgeIdentity = buildGraphVisibilityWhere(opts?.scope, params.length, 'e') params.push(...edgeIdentity.params) - const passageIdentity = buildGraphVisibilityWhere(opts?.scope, params.length, 'p') - params.push(...passageIdentity.params) params.push(opts?.limit ?? entityIds.length * 200) const limitParam = `$${params.length}` const edgeScopeClause = edgeIdentity.where ? `AND ${edgeIdentity.where}` : '' - const passageScopeClause = passageIdentity.where ? `AND ${passageIdentity.where}` : '' const rows = await this.sqlWithRetry( - `SELECT pe.* - FROM ${this.passageEntityEdgesTable} pe - JOIN ${this.passageNodesTable} p ON p.id = pe.passage_id - WHERE pe.entity_id = ANY($1::text[]) + `SELECT e.* + FROM ${this.edgesTable} e + WHERE e.source_type = 'entity' + AND e.target_type = 'chunk' + AND e.source_id = ANY($1::text[]) + AND e.invalid_at IS NULL ${bucketClause} ${edgeScopeClause} - ${passageScopeClause} - ORDER BY pe.weight DESC, pe.mention_count DESC + ORDER BY e.weight DESC LIMIT ${limitParam}`, params ) - return rows.map(mapRowToPassageEntityEdge) + return rows.map(mapRowToEntityChunkEdge) } - async getPassagesByIds( - passageIds: string[], + async getChunksByRefs( + chunkRefs: ChunkRef[], opts: { chunksTable: string scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined } - ): Promise - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>> { - if (passageIds.length === 0) return [] - const params: unknown[] = [passageIds] + ): Promise { + if (chunkRefs.length === 0) return [] + const params: unknown[] = [ + chunkRefs.map(ref => ref.bucketId), + chunkRefs.map(ref => ref.sourceId), + chunkRefs.map(ref => ref.chunkIndex), + ] let bucketClause = '' if (opts.bucketIds && opts.bucketIds.length > 0) { params.push(opts.bucketIds) - bucketClause = `AND p.bucket_id = ANY($${params.length}::text[])` + bucketClause = `AND c.bucket_id = ANY($${params.length}::text[])` } - const passageIdentity = buildGraphVisibilityWhere(opts.scope, params.length, 'p') - params.push(...passageIdentity.params) const chunkIdentity = buildGraphVisibilityWhere(opts.scope, params.length, 'c') params.push(...chunkIdentity.params) - const passageScopeClause = passageIdentity.where ? `AND ${passageIdentity.where}` : '' const chunkScopeClause = chunkIdentity.where ? 
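// Chunks are addressed directly by (bucket_id, source_id, chunk_index) tuples via unnest(), so no
// separate passage-node join is needed to fetch chunk content for edge endpoints.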
`AND ${chunkIdentity.where}` : '' const rows = await this.sqlWithRetry( - `SELECT p.id AS passage_id, c.content, c.bucket_id, c.document_id, c.chunk_index, - c.total_chunks, c.metadata, c.tenant_id, c.group_id, c.user_id, - c.agent_id, c.conversation_id - FROM ${this.passageNodesTable} p - JOIN ${opts.chunksTable} c - ON p.document_id = c.document_id - AND p.chunk_index = c.chunk_index - AND p.bucket_id = c.bucket_id - WHERE p.id = ANY($1::text[]) + `SELECT c.id AS chunk_id, c.content, c.bucket_id, c.source_id, c.chunk_index, + c.embedding_model, c.total_chunks, c.metadata, c.tenant_id, c.group_id, + c.user_id, c.agent_id, c.conversation_id + FROM ${opts.chunksTable} c + WHERE (c.bucket_id, c.source_id, c.chunk_index) IN ( + SELECT * FROM unnest($1::text[], $2::text[], $3::int[]) + ) ${bucketClause} - ${passageScopeClause} ${chunkScopeClause}`, params ) - return rows.map(mapRowToPassageContent) + return rows.map(mapRowToChunkContent) } - async searchPassageNodes( + async searchChunks( embedding: number[], scope: typegraphIdentity, opts: { chunksTable: string bucketIds?: string[] | undefined limit?: number | undefined + chunkRefs?: ChunkRef[] | undefined } - ): Promise - similarity: number - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>> { + ): Promise { const vectorStr = `[${embedding.join(',')}]` const params: unknown[] = [vectorStr] let bucketClause = '' if (opts.bucketIds && opts.bucketIds.length > 0) { params.push(opts.bucketIds) - bucketClause = `AND p.bucket_id = ANY($${params.length}::text[])` + bucketClause = `AND c.bucket_id = ANY($${params.length}::text[])` + } + let chunkRefClause = '' + if (opts.chunkRefs) { + if (opts.chunkRefs.length === 0) { + chunkRefClause = 'AND FALSE' + } else { + params.push(opts.chunkRefs.map(ref => ref.bucketId)) + const bucketParam = `$${params.length}` + params.push(opts.chunkRefs.map(ref => ref.sourceId)) + const sourceParam = `$${params.length}` + params.push(opts.chunkRefs.map(ref => ref.chunkIndex)) + const chunkParam = `$${params.length}` + chunkRefClause = `AND (c.bucket_id, c.source_id, c.chunk_index) IN (SELECT * FROM unnest(${bucketParam}::text[], ${sourceParam}::text[], ${chunkParam}::int[]))` + } } - const passageIdentity = buildGraphVisibilityWhere(scope, params.length, 'p') - params.push(...passageIdentity.params) const chunkIdentity = buildGraphVisibilityWhere(scope, params.length, 'c') params.push(...chunkIdentity.params) params.push(opts.limit ?? 200) const limitParam = `$${params.length}` - const passageScopeClause = passageIdentity.where ? `AND ${passageIdentity.where}` : '' const chunkScopeClause = chunkIdentity.where ? 
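// An explicitly empty chunkRefs array means "restrict to nothing" (hence the AND FALSE clause above),
// while an omitted chunkRefs leaves the vector search unrestricted apart from bucket and scope filters.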
`AND ${chunkIdentity.where}` : '' const rows = await this.sqlWithRetry( - `SELECT p.id AS passage_id, c.content, c.bucket_id, c.document_id, c.chunk_index, - c.total_chunks, c.metadata, c.tenant_id, c.group_id, c.user_id, - c.agent_id, c.conversation_id, + `SELECT c.id AS chunk_id, c.content, c.bucket_id, c.source_id, c.chunk_index, + c.embedding_model, c.total_chunks, c.metadata, c.tenant_id, c.group_id, + c.user_id, c.agent_id, c.conversation_id, 1 - (c.embedding <=> $1::vector) AS similarity - FROM ${this.passageNodesTable} p - JOIN ${opts.chunksTable} c - ON p.document_id = c.document_id - AND p.chunk_index = c.chunk_index - AND p.bucket_id = c.bucket_id + FROM ${opts.chunksTable} c WHERE c.embedding IS NOT NULL ${bucketClause} - ${passageScopeClause} + ${chunkRefClause} ${chunkScopeClause} ORDER BY c.embedding <=> $1::vector LIMIT ${limitParam}`, @@ -1312,7 +1458,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { ) return rows.map(row => ({ - ...mapRowToPassageContent(row), + ...mapRowToChunkContent(row), similarity: row.similarity as number, })) } @@ -1320,17 +1466,13 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { // ── Edge Storage ── async upsertEdge(edge: SemanticEdge): Promise { - // Edges are now deduplicated on (source_entity_id, target_entity_id, relation). - // On conflict, weight accumulates (sum of confidences across extractions) and - // valid_at takes the earliest. Everything else stays from the first writer - // so provenance (scope/identity/visibility) doesn't churn across extractions. const rows = await this.sqlWithRetry( `INSERT INTO ${this.edgesTable} - (id, source_entity_id, target_entity_id, relation, weight, properties, + (id, source_type, source_id, target_type, target_id, relation, weight, properties, scope, tenant_id, group_id, user_id, agent_id, conversation_id, visibility, evidence, valid_at, invalid_at, updated_at) - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,NOW()) - ON CONFLICT (source_entity_id, target_entity_id, relation) DO UPDATE SET + VALUES ($1,'entity',$2,'entity',$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,NOW()) + ON CONFLICT (source_type, source_id, target_type, target_id, relation) DO UPDATE SET weight = ${unqualified(this.edgesTable)}.weight + EXCLUDED.weight, valid_at = LEAST(${unqualified(this.edgesTable)}.valid_at, EXCLUDED.valid_at), updated_at = NOW() @@ -1359,11 +1501,14 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const scopeClause = identity.where ? 
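// Entity adjacency queries now pin both endpoints to source_type/target_type = 'entity', so edges
// that point at chunks or memories never leak into entity-to-entity traversals.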
`AND ${identity.where}` : '' const params = [entityId, ...identity.params] if (direction === 'in') { - query = `SELECT * FROM ${this.edgesTable} WHERE target_entity_id = $1 AND invalid_at IS NULL ${scopeClause}` + query = `SELECT * FROM ${this.edgesTable} WHERE target_type = 'entity' AND target_id = $1 AND source_type = 'entity' AND invalid_at IS NULL ${scopeClause}` } else if (direction === 'out') { - query = `SELECT * FROM ${this.edgesTable} WHERE source_entity_id = $1 AND invalid_at IS NULL ${scopeClause}` + query = `SELECT * FROM ${this.edgesTable} WHERE source_type = 'entity' AND source_id = $1 AND target_type = 'entity' AND invalid_at IS NULL ${scopeClause}` } else { - query = `SELECT * FROM ${this.edgesTable} WHERE (source_entity_id = $1 OR target_entity_id = $1) AND invalid_at IS NULL ${scopeClause}` + query = `SELECT * FROM ${this.edgesTable} + WHERE ((source_type = 'entity' AND source_id = $1 AND target_type = 'entity') + OR (target_type = 'entity' AND target_id = $1 AND source_type = 'entity')) + AND invalid_at IS NULL ${scopeClause}` } const rows = await this.sqlWithRetry(query, params) return rows.map(mapRowToEdge) @@ -1375,12 +1520,13 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const scopeClause = identity.where ? `AND ${identity.where}` : '' let query: string if (direction === 'out') { - query = `SELECT * FROM ${this.edgesTable} WHERE source_entity_id = ANY($1::text[]) AND invalid_at IS NULL ${scopeClause}` + query = `SELECT * FROM ${this.edgesTable} WHERE source_type = 'entity' AND source_id = ANY($1::text[]) AND target_type = 'entity' AND invalid_at IS NULL ${scopeClause}` } else if (direction === 'in') { - query = `SELECT * FROM ${this.edgesTable} WHERE target_entity_id = ANY($1::text[]) AND invalid_at IS NULL ${scopeClause}` + query = `SELECT * FROM ${this.edgesTable} WHERE target_type = 'entity' AND target_id = ANY($1::text[]) AND source_type = 'entity' AND invalid_at IS NULL ${scopeClause}` } else { query = `SELECT * FROM ${this.edgesTable} - WHERE (source_entity_id = ANY($1::text[]) OR target_entity_id = ANY($1::text[])) + WHERE ((source_type = 'entity' AND source_id = ANY($1::text[]) AND target_type = 'entity') + OR (target_type = 'entity' AND target_id = ANY($1::text[]) AND source_type = 'entity')) AND invalid_at IS NULL ${scopeClause}` } @@ -1389,7 +1535,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { } async findEdges(sourceId: string, targetId: string, relation?: string): Promise { - const conditions = ['source_entity_id = $1', 'target_entity_id = $2'] + const conditions = [`source_type = 'entity'`, 'source_id = $1', `target_type = 'entity'`, 'target_id = $2'] const params: unknown[] = [sourceId, targetId] if (relation) { params.push(relation) @@ -1409,6 +1555,398 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { ) } + async invalidateGraphEdgesForNode(nodeType: 'entity' | 'chunk' | 'memory', nodeId: string, invalidAt?: Date): Promise { + await this.sqlWithRetry( + `UPDATE ${this.edgesTable} + SET invalid_at = $3, updated_at = NOW() + WHERE (source_type = $1 AND source_id = $2) + OR (target_type = $1 AND target_id = $2)`, + [nodeType, nodeId, (invalidAt ?? new Date()).toISOString()] + ) + } + + async getMemoryIdsForEntities(entityIds: string[], scope?: typegraphIdentity): Promise { + if (entityIds.length === 0) return [] + const identity = buildGraphVisibilityWhere(scope, 1) + const scopeClause = identity.where ? 
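// Memory links are ordinary graph edges; both orientations (memory -> entity and entity -> memory)
// are scanned and the memory-side id is returned for each live edge.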
`AND ${identity.where}` : '' + const rows = await this.sqlWithRetry( + `SELECT DISTINCT + CASE + WHEN source_type = 'memory' THEN source_id + ELSE target_id + END AS memory_id + FROM ${this.edgesTable} + WHERE invalid_at IS NULL + AND ( + (source_type = 'memory' AND target_type = 'entity' AND target_id = ANY($1::text[])) + OR + (target_type = 'memory' AND source_type = 'entity' AND source_id = ANY($1::text[])) + ) + ${scopeClause}`, + [entityIds, ...identity.params] + ) + return rows.map(row => row.memory_id as string) + } + + async mergeEntityReferences(input: MergeGraphEntitiesInput): Promise { + if (input.sourceEntityId === input.targetEntityId) { + throw new Error('mergeEntityReferences requires distinct source and target entity IDs') + } + + return this.withTransaction(async () => { + const source = await this.getEntity(input.sourceEntityId, input) + const target = await this.getEntity(input.targetEntityId, input) + if (!source) throw new Error(`Source entity not found: ${input.sourceEntityId}`) + if (!target) throw new Error(`Target entity not found: ${input.targetEntityId}`) + + const now = new Date() + const mergedAliases = [...new Set([ + ...target.aliases, + source.name, + ...source.aliases, + ].map(value => value.trim()).filter(Boolean))] + .filter(alias => alias.toLowerCase() !== target.name.toLowerCase()) + const mergedEntityIds = [ + ...new Set([ + ...arrayProperty(target.properties.mergedEntityIds), + ...arrayProperty(source.properties.mergedEntityIds), + source.id, + ]), + ] + + await this.upsertEntity({ + ...target, + aliases: mergedAliases, + properties: { + ...source.properties, + ...target.properties, + ...(input.properties ?? {}), + mergedEntityIds, + updatedAt: now.toISOString(), + }, + status: 'active', + }) + + const duplicateExternalRows = await this.sqlWithRetry( + `DELETE FROM ${this.entityExternalIdsTable} sx + USING ${this.entityExternalIdsTable} tx + WHERE sx.entity_id = $1 + AND tx.entity_id = $2 + AND sx.type = tx.type + AND sx.normalized_value = tx.normalized_value + AND sx.encoding = tx.encoding + RETURNING sx.id`, + [source.id, target.id] + ) + const movedExternalRows = await this.sqlWithRetry( + `UPDATE ${this.entityExternalIdsTable} + SET entity_id = $2, updated_at = NOW() + WHERE entity_id = $1 + RETURNING id`, + [source.id, target.id] + ) + + const duplicateMentionRows = await this.sqlWithRetry( + `DELETE FROM ${this.chunkMentionsTable} sm + USING ${this.chunkMentionsTable} tm + WHERE sm.entity_id = $1 + AND tm.entity_id = $2 + AND sm.source_id = tm.source_id + AND sm.chunk_index = tm.chunk_index + AND sm.mention_type = tm.mention_type + AND sm.normalized_surface_text = tm.normalized_surface_text + RETURNING sm.id`, + [source.id, target.id] + ) + const movedMentionRows = await this.sqlWithRetry( + `UPDATE ${this.chunkMentionsTable} + SET entity_id = $2 + WHERE entity_id = $1 + RETURNING id`, + [source.id, target.id] + ) + + const edgeRows = await this.sqlWithRetry( + `SELECT * + FROM ${this.edgesTable} + WHERE invalid_at IS NULL + AND ( + (source_type = 'entity' AND source_id = $1) + OR + (target_type = 'entity' AND target_id = $1) + ) + ORDER BY created_at, id`, + [source.id] + ) + const edgeIdMap = new Map() + let redirectedGraphEdges = 0 + let redirectedEdges = 0 + let removedSelfEdges = 0 + for (const row of edgeRows) { + const edgeId = row.id as string + const newSourceId = row.source_type === 'entity' && row.source_id === source.id + ? 
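// For every live edge touching the source entity: repoint the entity endpoint at the merge target;
// if that collides with an existing edge on the same (types, ids, relation) key, fold weight (capped
// at 5.0), properties, and evidence into the survivor and invalidate the duplicate; edges that would
// become entity self-loops are invalidated outright.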
target.id + : row.source_id as string + const newTargetId = row.target_type === 'entity' && row.target_id === source.id + ? target.id + : row.target_id as string + + if (row.source_type === 'entity' && row.target_type === 'entity' && newSourceId === newTargetId) { + await this.sqlWithRetry( + `UPDATE ${this.edgesTable} + SET invalid_at = $2, updated_at = NOW() + WHERE id = $1`, + [edgeId, now.toISOString()] + ) + edgeIdMap.set(edgeId, edgeId) + removedSelfEdges += 1 + redirectedEdges += 1 + continue + } + + const conflict = await this.sqlWithRetry( + `SELECT id + FROM ${this.edgesTable} + WHERE source_type = $1 + AND source_id = $2 + AND target_type = $3 + AND target_id = $4 + AND relation = $5 + AND invalid_at IS NULL + AND id <> $6 + LIMIT 1`, + [row.source_type, newSourceId, row.target_type, newTargetId, row.relation, edgeId] + ) + if (conflict[0]?.id) { + const conflictId = conflict[0].id as string + await this.sqlWithRetry( + `UPDATE ${this.edgesTable} target + SET weight = LEAST(5.0, target.weight + source.weight), + properties = target.properties || source.properties, + evidence = ARRAY(SELECT DISTINCT v FROM unnest(target.evidence || source.evidence) AS v WHERE v <> ''), + updated_at = NOW() + FROM ${this.edgesTable} source + WHERE target.id = $1 + AND source.id = $2`, + [conflictId, edgeId] + ) + await this.sqlWithRetry( + `UPDATE ${this.edgesTable} + SET invalid_at = $2, updated_at = NOW() + WHERE id = $1`, + [edgeId, now.toISOString()] + ) + edgeIdMap.set(edgeId, conflictId) + } else { + await this.sqlWithRetry( + `UPDATE ${this.edgesTable} + SET source_id = $2, + target_id = $3, + updated_at = NOW() + WHERE id = $1`, + [edgeId, newSourceId, newTargetId] + ) + edgeIdMap.set(edgeId, edgeId) + } + redirectedGraphEdges += 1 + if (row.source_type === 'entity' && row.target_type === 'entity') redirectedEdges += 1 + } + + const factRows = await this.sqlWithRetry( + `SELECT * + FROM ${this.factRecordsTable} + WHERE invalid_at IS NULL + AND (source_entity_id = $1 OR target_entity_id = $1) + ORDER BY created_at, id`, + [source.id] + ) + let redirectedFacts = 0 + for (const row of factRows) { + const factId = row.id as string + const newSourceId = row.source_entity_id === source.id ? target.id : row.source_entity_id as string + const newTargetId = row.target_entity_id === source.id ? target.id : row.target_entity_id as string + const newEdgeId = edgeIdMap.get(row.edge_id as string) ?? 
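// Facts follow the edge remapping above: a fact whose edge was folded into a survivor is either merged
// (keeping the larger weight and evidence_count) or repointed, and the source entity's name is swapped
// for the target's in fact_text via a plain string replace. Facts that would become self-referential
// are invalidated instead.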
row.edge_id as string + if (newSourceId === newTargetId) { + await this.sqlWithRetry( + `UPDATE ${this.factRecordsTable} + SET invalid_at = $2, updated_at = NOW() + WHERE id = $1`, + [factId, now.toISOString()] + ) + redirectedFacts += 1 + continue + } + + const conflict = await this.sqlWithRetry( + `SELECT id + FROM ${this.factRecordsTable} + WHERE edge_id = $1 + AND id <> $2 + AND invalid_at IS NULL + LIMIT 1`, + [newEdgeId, factId] + ) + if (conflict[0]?.id) { + await this.sqlWithRetry( + `UPDATE ${this.factRecordsTable} target + SET weight = GREATEST(target.weight, source.weight), + evidence_count = GREATEST(target.evidence_count, source.evidence_count), + updated_at = NOW() + FROM ${this.factRecordsTable} source + WHERE target.id = $1 + AND source.id = $2`, + [conflict[0].id as string, factId] + ) + await this.sqlWithRetry( + `UPDATE ${this.factRecordsTable} + SET invalid_at = $2, updated_at = NOW() + WHERE id = $1`, + [factId, now.toISOString()] + ) + } else { + await this.sqlWithRetry( + `UPDATE ${this.factRecordsTable} + SET edge_id = $2, + source_entity_id = $3, + target_entity_id = $4, + fact_text = replace(fact_text, $5, $6), + fact_search_text = replace(fact_search_text, $5, $6), + updated_at = NOW() + WHERE id = $1`, + [factId, newEdgeId, newSourceId, newTargetId, source.name, target.name] + ) + } + redirectedFacts += 1 + } + + await this.sqlWithRetry( + `UPDATE ${this.entitiesTable} + SET status = 'merged', + merged_into_entity_id = $2, + invalid_at = $3, + deleted_at = $3, + properties = properties || $4::jsonb, + updated_at = NOW() + WHERE id = $1`, + [ + source.id, + target.id, + now.toISOString(), + JSON.stringify({ mergedIntoEntityId: target.id }), + ] + ) + + const refreshed = await this.getEntity(target.id, input) + return { + target: entityDetailFromSemanticEntity(refreshed ?? target), + sourceEntityId: source.id, + targetEntityId: target.id, + redirectedEdges, + redirectedFacts, + redirectedGraphEdges, + movedMentions: duplicateMentionRows.length + movedMentionRows.length, + movedExternalIds: duplicateExternalRows.length + movedExternalRows.length, + removedSelfEdges, + } + }) + } + + async deleteEntityReferences(entityId: string, opts?: DeleteGraphEntityOpts | null): Promise { + const mode = opts?.mode ?? 
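// Two modes: 'purge' hard-deletes facts, edges, mentions, external IDs, and the entity row;
// the default 'invalidate' soft-invalidates facts and edges, marks the entity 'invalidated',
// and only counts mentions/external IDs without removing them.
// Usage sketch (hypothetical entity id, assuming a configured adapter instance named `store`):
//   await store.deleteEntityReferences('ent_123', { mode: 'purge' })
//   await store.deleteEntityReferences('ent_123') // defaults to 'invalidate'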
'invalidate' + const now = new Date() + + return this.withTransaction(async () => { + if (mode === 'purge') { + const factRows = await this.sqlWithRetry( + `DELETE FROM ${this.factRecordsTable} + WHERE source_entity_id = $1 OR target_entity_id = $1 + RETURNING id`, + [entityId] + ) + const edgeRows = await this.sqlWithRetry( + `DELETE FROM ${this.edgesTable} + WHERE (source_type = 'entity' AND source_id = $1) + OR (target_type = 'entity' AND target_id = $1) + RETURNING id, source_type, target_type`, + [entityId] + ) + const mentionRows = await this.sqlWithRetry( + `DELETE FROM ${this.chunkMentionsTable} + WHERE entity_id = $1 + RETURNING id`, + [entityId] + ) + const externalRows = await this.sqlWithRetry( + `DELETE FROM ${this.entityExternalIdsTable} + WHERE entity_id = $1 + RETURNING id`, + [entityId] + ) + await this.sqlWithRetry( + `DELETE FROM ${this.entitiesTable} + WHERE id = $1`, + [entityId] + ) + return { + entityId, + mode, + deletedEdges: edgeRows.filter(row => row.source_type === 'entity' && row.target_type === 'entity').length, + deletedFacts: factRows.length, + deletedGraphEdges: edgeRows.length, + deletedMentions: mentionRows.length, + deletedExternalIds: externalRows.length, + } + } + + const factRows = await this.sqlWithRetry( + `UPDATE ${this.factRecordsTable} + SET invalid_at = $2, updated_at = NOW() + WHERE invalid_at IS NULL + AND (source_entity_id = $1 OR target_entity_id = $1) + RETURNING id`, + [entityId, now.toISOString()] + ) + const edgeRows = await this.sqlWithRetry( + `UPDATE ${this.edgesTable} + SET invalid_at = $2, updated_at = NOW() + WHERE invalid_at IS NULL + AND ( + (source_type = 'entity' AND source_id = $1) + OR + (target_type = 'entity' AND target_id = $1) + ) + RETURNING id, source_type, target_type`, + [entityId, now.toISOString()] + ) + await this.sqlWithRetry( + `UPDATE ${this.entitiesTable} + SET status = 'invalidated', + invalid_at = $2, + deleted_at = $2, + updated_at = NOW() + WHERE id = $1`, + [entityId, now.toISOString()] + ) + const mentionRows = await this.sqlWithRetry( + `SELECT id FROM ${this.chunkMentionsTable} WHERE entity_id = $1`, + [entityId] + ) + const externalRows = await this.sqlWithRetry( + `SELECT id FROM ${this.entityExternalIdsTable} WHERE entity_id = $1`, + [entityId] + ) + return { + entityId, + mode, + deletedEdges: edgeRows.filter(row => row.source_type === 'entity' && row.target_type === 'entity').length, + deletedFacts: factRows.length, + deletedGraphEdges: edgeRows.length, + deletedMentions: mentionRows.length, + deletedExternalIds: externalRows.length, + } + }) + } + // ── Entity ↔ Chunk Mention Evidence ── async upsertEntityChunkMentions(mentions: SemanticEntityMention[]): Promise { @@ -1428,7 +1966,7 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { params.push( generateId('mention'), m.entityId, - m.documentId, + m.sourceId, m.chunkIndex, m.bucketId, m.mentionType, @@ -1440,22 +1978,22 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { await this.sqlWithRetry( `INSERT INTO ${this.chunkMentionsTable} - (id, entity_id, document_id, chunk_index, bucket_id, mention_type, surface_text, normalized_surface_text, confidence) + (id, entity_id, source_id, chunk_index, bucket_id, mention_type, surface_text, normalized_surface_text, confidence) VALUES ${values.join(',')} - ON CONFLICT (entity_id, document_id, chunk_index, mention_type, normalized_surface_text) DO UPDATE SET + ON CONFLICT (entity_id, source_id, chunk_index, mention_type, normalized_surface_text) DO UPDATE SET surface_text = 
COALESCE(EXCLUDED.surface_text, ${unqualified(this.chunkMentionsTable)}.surface_text), confidence = COALESCE(EXCLUDED.confidence, ${unqualified(this.chunkMentionsTable)}.confidence)`, params ) } - async listPassageBackfillChunks(opts: { + async listChunkBackfillRecords(opts: { chunksTable: string scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined limit?: number | undefined offset?: number | undefined - }): Promise { + }): Promise { const params: unknown[] = [] let bucketClause = '' if (opts.bucketIds && opts.bucketIds.length > 0) { @@ -1471,27 +2009,27 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const offsetParam = `$${params.length}` const rows = await this.sqlWithRetry( - `SELECT c.id AS chunk_id, c.bucket_id, c.document_id, c.chunk_index, + `SELECT c.id AS chunk_id, c.bucket_id, c.source_id, c.chunk_index, c.embedding_model, c.content, c.metadata, c.visibility, c.tenant_id, c.group_id, c.user_id, c.agent_id, c.conversation_id FROM ${opts.chunksTable} c WHERE TRUE ${bucketClause} ${scopeClause} - ORDER BY c.document_id, c.chunk_index + ORDER BY c.source_id, c.chunk_index LIMIT ${limitParam} OFFSET ${offsetParam}`, params ) - return rows.map(mapRowToPassageBackfillChunk) + return rows.map(mapRowToChunkBackfillRecord) } - async listPassageMentionBackfillRows(opts: { + async listChunkMentionBackfillRows(opts: { chunksTable: string scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined limit?: number | undefined offset?: number | undefined - }): Promise { + }): Promise { const params: unknown[] = [] let bucketClause = '' if (opts.bucketIds && opts.bucketIds.length > 0) { @@ -1507,24 +2045,24 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const offsetParam = `$${params.length}` const rows = await this.sqlWithRetry( - `SELECT c.id AS chunk_id, c.bucket_id, c.document_id, c.chunk_index, + `SELECT c.id AS chunk_id, c.bucket_id, c.source_id, c.chunk_index, c.embedding_model, c.content, c.metadata, c.visibility, c.tenant_id, c.group_id, c.user_id, c.agent_id, c.conversation_id, m.entity_id, m.mention_type, m.surface_text, m.normalized_surface_text, m.confidence FROM ${this.chunkMentionsTable} m JOIN ${opts.chunksTable} c - ON m.document_id = c.document_id + ON m.source_id = c.source_id AND m.chunk_index = c.chunk_index AND m.bucket_id = c.bucket_id WHERE TRUE ${bucketClause} ${scopeClause} - ORDER BY c.document_id, c.chunk_index, m.entity_id + ORDER BY c.source_id, c.chunk_index, m.entity_id LIMIT ${limitParam} OFFSET ${offsetParam}`, params ) return rows.map(row => ({ - ...mapRowToPassageBackfillChunk(row), + ...mapRowToChunkBackfillRecord(row), entityId: row.entity_id as string, mentionType: row.mention_type as SemanticEntityMention['mentionType'], surfaceText: (row.surface_text as string | null) ?? undefined, @@ -1546,7 +2084,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const rows = await this.sqlWithRetry( `SELECT * FROM ${this.edgesTable} - WHERE invalid_at IS NULL + WHERE source_type = 'entity' + AND target_type = 'entity' + AND invalid_at IS NULL ${scopeClause} ORDER BY created_at, id LIMIT ${limitParam} OFFSET ${offsetParam}`, @@ -1592,7 +2132,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const scopeClause = identity.where ? 
`AND ${identity.where}` : '' const rows = await this.sqlWithRetry( `SELECT relation, COUNT(*)::integer AS count FROM ${this.edgesTable} - WHERE invalid_at IS NULL + WHERE source_type = 'entity' + AND target_type = 'entity' + AND invalid_at IS NULL ${scopeClause} GROUP BY relation ORDER BY count DESC`, identity.params @@ -1618,9 +2160,9 @@ export class PgMemoryStoreAdapter implements MemoryStoreAdapter { const scopeClause = identity.where ? `AND ${identity.where}` : '' const rows = await this.sqlWithRetry( `SELECT degree, COUNT(*)::integer AS count FROM ( - SELECT source_entity_id AS eid, COUNT(*)::integer AS degree FROM ${this.edgesTable} WHERE invalid_at IS NULL ${scopeClause} GROUP BY source_entity_id + SELECT source_id AS eid, COUNT(*)::integer AS degree FROM ${this.edgesTable} WHERE source_type = 'entity' AND target_type = 'entity' AND invalid_at IS NULL ${scopeClause} GROUP BY source_id UNION ALL - SELECT target_entity_id AS eid, COUNT(*)::integer AS degree FROM ${this.edgesTable} WHERE invalid_at IS NULL ${scopeClause} GROUP BY target_entity_id + SELECT target_id AS eid, COUNT(*)::integer AS degree FROM ${this.edgesTable} WHERE source_type = 'entity' AND target_type = 'entity' AND invalid_at IS NULL ${scopeClause} GROUP BY target_id ) sub GROUP BY degree ORDER BY degree`, identity.params @@ -1707,6 +2249,9 @@ function mapRowToEntity(row: Record): SemanticEntity { entityType: row.entity_type as string, aliases: row.aliases as string[] ?? [], properties: props, + status: (row.status as SemanticEntity['status']) ?? 'active', + mergedIntoEntityId: (row.merged_into_entity_id as string | null) ?? undefined, + deletedAt: row.deleted_at ? new Date(row.deleted_at as string) : undefined, embedding: undefined, descriptionEmbedding: parseVectorString(row.description_embedding), scope: rowToIdentity(row), @@ -1723,8 +2268,12 @@ function mapRowToEntity(row: Record): SemanticEntity { function mapRowToEdge(row: Record): SemanticEdge { return { id: row.id as string, - sourceEntityId: row.source_entity_id as string, - targetEntityId: row.target_entity_id as string, + sourceType: 'entity', + sourceId: row.source_id as string, + targetType: 'entity', + targetId: row.target_id as string, + sourceEntityId: row.source_id as string, + targetEntityId: row.target_id as string, relation: row.relation as string, weight: row.weight as number, properties: parseJson(row.properties), @@ -1751,7 +2300,7 @@ function mapRowToFact(row: Record): SemanticFactRecord { description: (row.description as string | null) ?? undefined, evidenceText: (row.evidence_text as string | null) ?? undefined, factSearchText: (row.fact_search_text as string | null) ?? undefined, - sourceChunkId: (row.source_chunk_id as string | null) ?? undefined, + sourceChunkId: (row.from_chunk_id as string | null) ?? undefined, weight: row.weight as number, evidenceCount: row.evidence_count as number, embedding: undefined, @@ -1759,36 +2308,45 @@ function mapRowToFact(row: Record): SemanticFactRecord { visibility: (row.visibility as SemanticFactRecord['visibility']) ?? undefined, createdAt: new Date(row.created_at as string), updatedAt: new Date(row.updated_at as string), + invalidAt: row.invalid_at ? new Date(row.invalid_at as string) : undefined, similarity: (row.similarity as number | null) ?? 
undefined, } } -function mapRowToPassageEntityEdge(row: Record): SemanticPassageEntityEdge { +function mapRowToEntityChunkEdge(row: Record): SemanticEntityChunkEdge { + const props = parseJson(row.properties) return { - passageId: row.passage_id as string, - entityId: row.entity_id as string, + id: row.id as string, + entityId: row.source_id as string, + chunkRef: { + bucketId: row.to_bucket_id as string, + sourceId: row.to_source_id as string, + chunkIndex: row.to_chunk_index as number, + embeddingModel: (row.to_embedding_model as string | null) ?? undefined, + chunkId: (row.to_chunk_id as string | null) ?? undefined, + }, weight: row.weight as number, - mentionCount: row.mention_count as number, - confidence: (row.confidence as number | null) ?? undefined, - surfaceTexts: (row.surface_texts as string[] | null) ?? [], - mentionTypes: (row.mention_types as SemanticPassageEntityEdge['mentionTypes'] | null) ?? [], + mentionCount: Number(props.mentionCount ?? 1), + confidence: typeof props.confidence === 'number' ? props.confidence : undefined, + surfaceTexts: Array.isArray(props.surfaceTexts) ? props.surfaceTexts as string[] : [], + mentionTypes: Array.isArray(props.mentionTypes) ? props.mentionTypes as SemanticEntityChunkEdge['mentionTypes'] : [], scope: rowToIdentity(row), - visibility: (row.visibility as SemanticPassageEntityEdge['visibility']) ?? undefined, + visibility: (row.visibility as SemanticEntityChunkEdge['visibility']) ?? undefined, createdAt: row.created_at ? new Date(row.created_at as string) : undefined, updatedAt: row.updated_at ? new Date(row.updated_at as string) : undefined, } } -function mapRowToPassageBackfillChunk(row: Record): PassageBackfillChunk { +function mapRowToChunkBackfillRecord(row: Record): ChunkBackfillRecord { return { chunkId: row.chunk_id as string, bucketId: row.bucket_id as string, - documentId: row.document_id as string, + sourceId: row.source_id as string, chunkIndex: row.chunk_index as number, embeddingModel: row.embedding_model as string, content: row.content as string, metadata: parseJson(row.metadata), - visibility: (row.visibility as PassageBackfillChunk['visibility']) ?? undefined, + visibility: (row.visibility as ChunkBackfillRecord['visibility']) ?? undefined, tenantId: (row.tenant_id as string) ?? undefined, groupId: (row.group_id as string) ?? undefined, userId: (row.user_id as string) ?? undefined, @@ -1797,26 +2355,14 @@ function mapRowToPassageBackfillChunk(row: Record): PassageBack } } -function mapRowToPassageContent(row: Record): { - passageId: string - content: string - bucketId: string - documentId: string - chunkIndex: number - totalChunks: number - metadata: Record - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined -} { +function mapRowToChunkContent(row: Record): SemanticChunkRecord { return { - passageId: row.passage_id as string, + chunkId: (row.chunk_id as string | null) ?? undefined, content: row.content as string, bucketId: row.bucket_id as string, - documentId: row.document_id as string, + sourceId: row.source_id as string, chunkIndex: row.chunk_index as number, + embeddingModel: (row.embedding_model as string | null) ?? undefined, totalChunks: row.total_chunks as number, metadata: parseJson(row.metadata), tenantId: (row.tenant_id as string) ?? 
undefined, @@ -1829,6 +2375,29 @@ function mapRowToPassageContent(row: Record): { // ── Helpers ── +function entityDetailFromSemanticEntity(entity: SemanticEntity): MergeGraphEntitiesResult['target'] { + return { + id: entity.id, + name: entity.name, + entityType: entity.entityType, + aliases: entity.aliases, + externalIds: entity.externalIds, + edgeCount: 0, + properties: entity.properties, + description: entity.properties.description as string | undefined, + createdAt: entity.temporal.createdAt, + validAt: entity.temporal.validAt, + invalidAt: entity.temporal.invalidAt, + topEdges: [], + } +} + +function arrayProperty(value: unknown): string[] { + return Array.isArray(value) + ? value.filter((item): item is string => typeof item === 'string' && item.length > 0) + : [] +} + function parseJson(val: unknown): Record { if (typeof val === 'string') return JSON.parse(val) return (val ?? {}) as Record @@ -1857,6 +2426,34 @@ function normalizeEntityText(value: string): string { .replace(/\s+/g, ' ') } +function normalizeExternalIdValue(id: string, type: string, encoding: ExternalId['encoding']): string { + const trimmed = id.trim() + if (encoding === 'sha256') return trimmed.toLowerCase() + if (type === 'email' || type.endsWith('_email') || type === 'github_handle') { + return trimmed.toLowerCase() + } + if (type === 'phone') { + return trimmed.replace(/[^\s+]/g, '') + } + return trimmed +} + +function normalizeExternalId( + externalId: ExternalId, +): (ExternalId & { normalizedValue: string; encoding: NonNullable }) | undefined { + const type = externalId.type.trim().toLowerCase() + const id = externalId.id.trim() + if (!id || !type) return undefined + const encoding = externalId.encoding ?? 'none' + return { + ...externalId, + id, + type, + encoding, + normalizedValue: normalizeExternalIdValue(id, type, encoding), + } +} + function escapeLike(value: string): string { return value.replace(/[\\%_]/g, '\\$&') } @@ -1873,6 +2470,15 @@ function buildMemoryWhere( const params: unknown[] = [] const p = () => `$${paramOffset + params.length}` + if (filter.ids) { + if (filter.ids.length === 0) { + conditions.push('FALSE') + } else { + params.push(filter.ids) + conditions.push(`id = ANY(${p()}::text[])`) + } + } + // Explicit identity column filtering (preferred over JSONB scope) if (filter.tenantId) { params.push(filter.tenantId) diff --git a/packages/adapters/pgvector/src/migrations.ts b/packages/adapters/pgvector/src/migrations.ts index 7e5b54d..369129c 100644 --- a/packages/adapters/pgvector/src/migrations.ts +++ b/packages/adapters/pgvector/src/migrations.ts @@ -61,7 +61,7 @@ export const MODEL_TABLE_SQL = (chunksTable: string, dimensions: number) => { user_id TEXT, agent_id TEXT, conversation_id TEXT, - document_id TEXT NOT NULL, + source_id TEXT NOT NULL, idempotency_key TEXT NOT NULL, content TEXT NOT NULL, embedding VECTOR(${dimensions}), @@ -85,8 +85,11 @@ export const MODEL_TABLE_SQL = (chunksTable: string, dimensions: number) => { CREATE INDEX IF NOT EXISTS ${idx('fts_idx')} ON ${chunksTable} USING gin (search_vector); - CREATE INDEX IF NOT EXISTS ${idx('doc_chunk_idx')} - ON ${chunksTable} (document_id, chunk_index); + CREATE INDEX IF NOT EXISTS ${idx('source_chunk_idx')} + ON ${chunksTable} (source_id, chunk_index); + + CREATE INDEX IF NOT EXISTS ${idx('bucket_source_chunk_idx')} + ON ${chunksTable} (bucket_id, source_id, chunk_index); CREATE UNIQUE INDEX IF NOT EXISTS ${idx('ikey_chunk_idx')} ON ${chunksTable} (idempotency_key, chunk_index, bucket_id); @@ -157,13 +160,13 @@ export const 
HASH_TABLE_SQL = (hashesTable: string) => { } /** - * DDL for the documents table - tracks indexed documents with metadata. + * DDL for the sources table - tracks indexed sources with metadata. * Created once during initialize(). */ -export const DOCUMENTS_TABLE_SQL = (documentsTable: string) => { - const idx = (suffix: string) => safeIdx(documentsTable, suffix) +export const SOURCES_TABLE_SQL = (sourcesTable: string) => { + const idx = (suffix: string) => safeIdx(sourcesTable, suffix) return ` - CREATE TABLE IF NOT EXISTS ${documentsTable} ( + CREATE TABLE IF NOT EXISTS ${sourcesTable} ( id TEXT PRIMARY KEY, bucket_id TEXT NOT NULL, tenant_id TEXT, @@ -182,47 +185,48 @@ export const DOCUMENTS_TABLE_SQL = (documentsTable: string) => { indexed_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - metadata JSONB NOT NULL DEFAULT '{}' + metadata JSONB NOT NULL DEFAULT '{}', + subject JSONB ); CREATE UNIQUE INDEX IF NOT EXISTS ${idx('source_hash_idx')} - ON ${documentsTable} (bucket_id, COALESCE(tenant_id, ''), content_hash); + ON ${sourcesTable} (bucket_id, COALESCE(tenant_id, ''), content_hash); CREATE INDEX IF NOT EXISTS ${idx('bucket_idx')} - ON ${documentsTable} (bucket_id, tenant_id); + ON ${sourcesTable} (bucket_id, tenant_id); CREATE INDEX IF NOT EXISTS ${idx('status_idx')} - ON ${documentsTable} (status); + ON ${sourcesTable} (status); CREATE INDEX IF NOT EXISTS ${idx('visibility_user_idx')} - ON ${documentsTable} (visibility, user_id); + ON ${sourcesTable} (visibility, user_id); CREATE INDEX IF NOT EXISTS ${idx('graph_extracted_idx')} - ON ${documentsTable} (graph_extracted); + ON ${sourcesTable} (graph_extracted); CREATE INDEX IF NOT EXISTS ${idx('tenant_user_idx')} - ON ${documentsTable} (tenant_id, user_id); + ON ${sourcesTable} (tenant_id, user_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_group_idx')} - ON ${documentsTable} (tenant_id, group_id); + ON ${sourcesTable} (tenant_id, group_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_agent_idx')} - ON ${documentsTable} (tenant_id, agent_id); + ON ${sourcesTable} (tenant_id, agent_id); CREATE INDEX IF NOT EXISTS ${idx('tenant_conversation_idx')} - ON ${documentsTable} (tenant_id, conversation_id); + ON ${sourcesTable} (tenant_id, conversation_id); CREATE INDEX IF NOT EXISTS ${idx('user_idx')} - ON ${documentsTable} (user_id); + ON ${sourcesTable} (user_id); CREATE INDEX IF NOT EXISTS ${idx('group_idx')} - ON ${documentsTable} (group_id); + ON ${sourcesTable} (group_id); CREATE INDEX IF NOT EXISTS ${idx('agent_idx')} - ON ${documentsTable} (agent_id); + ON ${sourcesTable} (agent_id); CREATE INDEX IF NOT EXISTS ${idx('conversation_idx')} - ON ${documentsTable} (conversation_id); + ON ${sourcesTable} (conversation_id); ` } diff --git a/packages/adapters/pgvector/src/document-store.ts b/packages/adapters/pgvector/src/source-store.ts similarity index 70% rename from packages/adapters/pgvector/src/document-store.ts rename to packages/adapters/pgvector/src/source-store.ts index 08f92e1..1d8029b 100644 --- a/packages/adapters/pgvector/src/document-store.ts +++ b/packages/adapters/pgvector/src/source-store.ts @@ -1,9 +1,13 @@ -import type { typegraphDocument, DocumentFilter, DocumentStatus, UpsertDocumentInput, PaginationOpts, PaginatedResult } from '@typegraph-ai/sdk' +import { ConfigError } from '@typegraph-ai/sdk' +import type { typegraphSource, SourceFilter, SourceStatus, UpsertSourceInput, PaginationOpts, PaginatedResult } from '@typegraph-ai/sdk' import type { SqlExecutor } from 
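// document-store.ts is renamed to source-store.ts: typegraphDocument/DocumentFilter become
// typegraphSource/SourceFilter, source rows gain an optional `subject` JSONB payload, and an
// empty-filter delete now throws ConfigError instead of a generic Error.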
'./adapter.js' -type UpsertedDocumentRecord = typegraphDocument & { wasCreated?: boolean | undefined } +type UpsertedSourceRecord = typegraphSource & { wasCreated?: boolean | undefined } -function mapDocRow(row: Record): typegraphDocument { +function mapSourceRow(row: Record): typegraphSource { + const subject = typeof row.subject === 'string' + ? JSON.parse(row.subject) + : row.subject ?? undefined return { id: row.id as string, bucketId: row.bucket_id as string, @@ -16,29 +20,30 @@ function mapDocRow(row: Record): typegraphDocument { url: (row.url as string) ?? undefined, contentHash: row.content_hash as string, chunkCount: row.chunk_count as number, - status: row.status as typegraphDocument['status'], - visibility: (row.visibility as typegraphDocument['visibility']) ?? undefined, + status: row.status as typegraphSource['status'], + visibility: (row.visibility as typegraphSource['visibility']) ?? undefined, graphExtracted: (row.graph_extracted as boolean) ?? false, indexedAt: new Date(row.indexed_at as string), createdAt: new Date(row.created_at as string), updatedAt: new Date(row.updated_at as string), metadata: (typeof row.metadata === 'string' ? JSON.parse(row.metadata) : row.metadata ?? {}) as Record, + subject: subject as typegraphSource['subject'], } } -export class PgDocumentStore { +export class PgSourceStore { constructor( private sql: SqlExecutor, private tableName: string ) {} - async upsert(input: UpsertDocumentInput): Promise { + async upsert(input: UpsertSourceInput): Promise { const rows = await this.sql( `INSERT INTO ${this.tableName} (id, bucket_id, tenant_id, group_id, user_id, agent_id, conversation_id, title, url, content_hash, chunk_count, status, - visibility, graph_extracted, metadata, indexed_at, updated_at) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, NOW(), NOW()) + visibility, graph_extracted, metadata, subject, indexed_at, updated_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, NOW(), NOW()) ON CONFLICT (bucket_id, COALESCE(tenant_id, ''), content_hash) DO UPDATE SET title = EXCLUDED.title, @@ -52,6 +57,7 @@ export class PgDocumentStore { conversation_id = EXCLUDED.conversation_id, graph_extracted = EXCLUDED.graph_extracted, metadata = EXCLUDED.metadata, + subject = EXCLUDED.subject, indexed_at = NOW(), updated_at = NOW() RETURNING *, (xmax = 0) AS was_created`, @@ -71,25 +77,26 @@ export class PgDocumentStore { input.visibility ?? null, input.graphExtracted ?? false, JSON.stringify(input.metadata ?? {}), + input.subject ? JSON.stringify(input.subject) : null, ] ) return { - ...mapDocRow(rows[0]!), + ...mapSourceRow(rows[0]!), wasCreated: rows[0]!.was_created as boolean, } } - async get(id: string): Promise { + async get(id: string): Promise { const rows = await this.sql( `SELECT * FROM ${this.tableName} WHERE id = $1`, [id] ) if (rows.length === 0) return null - return mapDocRow(rows[0]!) + return mapSourceRow(rows[0]!) } - async list(filter: DocumentFilter, pagination?: PaginationOpts): Promise> { - const { where, params } = buildDocWhere(filter) + async list(filter?: SourceFilter | null, pagination?: PaginationOpts | null): Promise> { + const { where, params } = buildSourceWhere(filter) const filterClause = where ? 
`WHERE ${where}` : '' if (pagination) { @@ -104,19 +111,19 @@ export class PgDocumentStore { `SELECT * FROM ${this.tableName} ${filterClause} ORDER BY updated_at DESC LIMIT $${params.length + 1} OFFSET $${params.length + 2}`, [...params, limit, offset] ) - return { items: rows.map(mapDocRow), total, limit, offset } + return { items: rows.map(mapSourceRow), total, limit, offset } } const rows = await this.sql( `SELECT * FROM ${this.tableName} ${filterClause} ORDER BY updated_at DESC`, params ) - return rows.map(mapDocRow) + return rows.map(mapSourceRow) } - async delete(filter: DocumentFilter): Promise<{ count: number; ids: string[] }> { - const { where, params } = buildDocWhere(filter) - if (!where) throw new Error('deleteDocuments() requires at least one filter field') + async delete(filter: SourceFilter | null): Promise<{ count: number; ids: string[] }> { + const { where, params } = buildSourceWhere(filter) + if (!where) throw new ConfigError('deleteSources() requires at least one filter field.') const rows = await this.sql( `DELETE FROM ${this.tableName} WHERE ${where} RETURNING id`, params @@ -124,22 +131,23 @@ export class PgDocumentStore { return { count: rows.length, ids: rows.map(r => r.id as string) } } - async update(id: string, input: Partial>): Promise { + async update(id: string, input: Partial>): Promise { const setClauses: string[] = ['updated_at = NOW()'] const params: unknown[] = [] if (input.title !== undefined) { params.push(input.title); setClauses.push(`title = $${params.length}`) } if (input.url !== undefined) { params.push(input.url); setClauses.push(`url = $${params.length}`) } if (input.visibility !== undefined) { params.push(input.visibility); setClauses.push(`visibility = $${params.length}`) } if (input.metadata !== undefined) { params.push(JSON.stringify(input.metadata)); setClauses.push(`metadata = $${params.length}::jsonb`) } + if (input.subject !== undefined) { params.push(input.subject ? JSON.stringify(input.subject) : null); setClauses.push(`subject = $${params.length}::jsonb`) } params.push(id) const rows = await this.sql( `UPDATE ${this.tableName} SET ${setClauses.join(', ')} WHERE id = $${params.length} RETURNING *`, params ) - return rows.length > 0 ? mapDocRow(rows[0]!) : null + return rows.length > 0 ? mapSourceRow(rows[0]!) 
: null } - async updateStatus(id: string, status: DocumentStatus, chunkCount?: number): Promise { + async updateStatus(id: string, status: SourceStatus, chunkCount?: number): Promise { if (chunkCount != null) { await this.sql( `UPDATE ${this.tableName} @@ -158,35 +166,35 @@ export class PgDocumentStore { } } -function buildDocWhere(filter: DocumentFilter): { where: string; params: unknown[] } { +function buildSourceWhere(filter?: SourceFilter | null): { where: string; params: unknown[] } { const conditions: string[] = [] const params: unknown[] = [] - if (filter.bucketId != null) { + if (filter?.bucketId != null) { params.push(filter.bucketId) conditions.push(`bucket_id = $${params.length}`) } - if (filter.tenantId != null) { + if (filter?.tenantId != null) { params.push(filter.tenantId) conditions.push(`tenant_id = $${params.length}`) } - if (filter.groupId != null) { + if (filter?.groupId != null) { params.push(filter.groupId) conditions.push(`group_id = $${params.length}`) } - if (filter.userId != null) { + if (filter?.userId != null) { params.push(filter.userId) conditions.push(`user_id = $${params.length}`) } - if (filter.agentId != null) { + if (filter?.agentId != null) { params.push(filter.agentId) conditions.push(`agent_id = $${params.length}`) } - if (filter.conversationId != null) { + if (filter?.conversationId != null) { params.push(filter.conversationId) conditions.push(`conversation_id = $${params.length}`) } - if (filter.status != null) { + if (filter?.status != null) { if (Array.isArray(filter.status)) { params.push(filter.status) conditions.push(`status = ANY($${params.length}::text[])`) @@ -195,7 +203,7 @@ function buildDocWhere(filter: DocumentFilter): { where: string; params: unknown conditions.push(`status = $${params.length}`) } } - if (filter.visibility != null) { + if (filter?.visibility != null) { if (Array.isArray(filter.visibility)) { params.push(filter.visibility) conditions.push(`visibility = ANY($${params.length}::text[])`) @@ -204,11 +212,11 @@ function buildDocWhere(filter: DocumentFilter): { where: string; params: unknown conditions.push(`visibility = $${params.length}`) } } - if (filter.documentIds != null && filter.documentIds.length > 0) { - params.push(filter.documentIds) + if (filter?.sourceIds != null && filter.sourceIds.length > 0) { + params.push(filter.sourceIds) conditions.push(`id = ANY($${params.length}::text[])`) } - if (filter.graphExtracted != null) { + if (filter?.graphExtracted != null) { params.push(filter.graphExtracted) conditions.push(`graph_extracted = $${params.length}`) } @@ -219,4 +227,4 @@ function buildDocWhere(filter: DocumentFilter): { where: string; params: unknown } } -export { buildDocWhere } +export { buildSourceWhere } diff --git a/packages/adapters/sqlite-vec/README.md b/packages/adapters/sqlite-vec/README.md index d931932..3ba53c1 100644 --- a/packages/adapters/sqlite-vec/README.md +++ b/packages/adapters/sqlite-vec/README.md @@ -17,12 +17,12 @@ This adapter is intentionally scoped for development and testing. 
It implements
 
 **Not supported (use `@typegraph-ai/adapter-pgvector` for production):**
 
 - `hybridSearch` — no BM25 keyword search (SQLite FTS5 is not wired in)
-- Document CRUD — `upsertDocumentRecord`, `getDocument`, `listDocuments`, `deleteDocuments`, `updateDocument`, `searchWithDocuments`, `getChunksByRange`
+- Source CRUD — `upsertSourceRecord`, `getSource`, `listSources`, `deleteSources`, `updateSource`, `searchWithSources`, `getChunksByRange`
 - Graph / memory storage — `QuerySignals.graph` and `QuerySignals.memory` require the pgvector memory adapter
 - Audit events and policy/governance tables
 - Schema isolation (SQLite has no schemas)
 
-If you call `d.query(..., { signals: { keyword: true } })` or any of the document-level APIs against this adapter, the call will either throw or silently return empty results depending on the feature. Use pgvector for anything beyond local dev.
+If you call `d.query(..., { signals: { keyword: true } })` or any of the source-level APIs against this adapter, the call will either throw or silently return empty results depending on the feature. Use pgvector for anything beyond local dev.
 
 ## Install
diff --git a/packages/adapters/sqlite-vec/src/__tests__/isolation.test.ts b/packages/adapters/sqlite-vec/src/__tests__/isolation.test.ts
index 91da0e1..cb82cf8 100644
--- a/packages/adapters/sqlite-vec/src/__tests__/isolation.test.ts
+++ b/packages/adapters/sqlite-vec/src/__tests__/isolation.test.ts
@@ -15,7 +15,7 @@ function makeChunk(overrides: Partial<EmbeddedChunk> = {}): EmbeddedChunk {
     userId: overrides.userId,
     agentId: overrides.agentId,
     conversationId: overrides.conversationId,
-    documentId: overrides.documentId ?? 'doc-1',
+    sourceId: overrides.sourceId ?? 'source-1',
     content: overrides.content ?? 'hello world',
     embedding: overrides.embedding ??
[1, 0, 0, 0], embeddingModel: MODEL, @@ -41,9 +41,9 @@ describe('SqliteVecAdapter — identity isolation', () => { }) it('search filters by userId within the same tenant', async () => { - await adapter.upsertDocument(MODEL, [ - makeChunk({ idempotencyKey: 'A', userId: 'user-a', documentId: 'doc-a', embedding: [1, 0, 0, 0] }), - makeChunk({ idempotencyKey: 'B', userId: 'user-b', documentId: 'doc-b', embedding: [0, 1, 0, 0] }), + await adapter.upsertSourceChunks(MODEL, [ + makeChunk({ idempotencyKey: 'A', userId: 'user-a', sourceId: 'source-a', embedding: [1, 0, 0, 0] }), + makeChunk({ idempotencyKey: 'B', userId: 'user-b', sourceId: 'source-b', embedding: [0, 1, 0, 0] }), ]) const results = await adapter.search(MODEL, [1, 0, 0, 0], { @@ -53,11 +53,11 @@ describe('SqliteVecAdapter — identity isolation', () => { expect(results).toHaveLength(1) expect(results[0]!.userId).toBe('user-a') - expect(results[0]!.documentId).toBe('doc-a') + expect(results[0]!.sourceId).toBe('source-a') }) it('search filters by agentId', async () => { - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'A', agentId: 'agent-a', embedding: [1, 0, 0, 0] }), makeChunk({ idempotencyKey: 'B', agentId: 'agent-b', embedding: [1, 0, 0, 0] }), ]) @@ -72,7 +72,7 @@ describe('SqliteVecAdapter — identity isolation', () => { }) it('search filters by conversationId', async () => { - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'A', conversationId: 'conv-a', embedding: [1, 0, 0, 0] }), makeChunk({ idempotencyKey: 'B', conversationId: 'conv-b', embedding: [1, 0, 0, 0] }), ]) @@ -87,7 +87,7 @@ describe('SqliteVecAdapter — identity isolation', () => { }) it('search filters by groupId', async () => { - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'A', groupId: 'group-a', embedding: [1, 0, 0, 0] }), makeChunk({ idempotencyKey: 'B', groupId: 'group-b', embedding: [1, 0, 0, 0] }), ]) @@ -102,7 +102,7 @@ describe('SqliteVecAdapter — identity isolation', () => { }) it('countChunks respects identity filters', async () => { - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'A', userId: 'user-a' }), makeChunk({ idempotencyKey: 'B', userId: 'user-b' }), makeChunk({ idempotencyKey: 'C', userId: 'user-a' }), @@ -113,12 +113,12 @@ describe('SqliteVecAdapter — identity isolation', () => { expect(await adapter.countChunks(MODEL, { tenantId: 'tenant-1' })).toBe(3) }) - it('updates documentId on idempotency conflict', async () => { - await adapter.upsertDocument(MODEL, [ - makeChunk({ idempotencyKey: 'A', documentId: 'doc-stale' }), + it('updates sourceId on idempotency conflict', async () => { + await adapter.upsertSourceChunks(MODEL, [ + makeChunk({ idempotencyKey: 'A', sourceId: 'source-stale' }), ]) - await adapter.upsertDocument(MODEL, [ - makeChunk({ idempotencyKey: 'A', documentId: 'doc-canonical', content: 'updated content' }), + await adapter.upsertSourceChunks(MODEL, [ + makeChunk({ idempotencyKey: 'A', sourceId: 'source-canonical', content: 'updated content' }), ]) const results = await adapter.search(MODEL, [1, 0, 0, 0], { @@ -127,12 +127,12 @@ describe('SqliteVecAdapter — identity isolation', () => { }) expect(results).toHaveLength(1) - expect(results[0]!.documentId).toBe('doc-canonical') + expect(results[0]!.sourceId).toBe('source-canonical') expect(results[0]!.content).toBe('updated content') 
}) it('mapRowToScoredChunk returns all identity fields', async () => { - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'A', tenantId: 't1', @@ -212,9 +212,9 @@ describe('SqliteVecAdapter — bucket identity + cascade', () => { it('deleteBucket cascades to chunks, vec table, and hashes', async () => { await adapter.upsertBucket!({ id: 'b1', name: 'b1', status: 'active', tenantId: 't1' }) - await adapter.upsertDocument(MODEL, [ + await adapter.upsertSourceChunks(MODEL, [ makeChunk({ idempotencyKey: 'k1', bucketId: 'b1' }), - makeChunk({ idempotencyKey: 'k2', bucketId: 'b1', documentId: 'doc-2' }), + makeChunk({ idempotencyKey: 'k2', bucketId: 'b1', sourceId: 'source-2' }), ]) await adapter.hashStore.set('b1:t1:k1', { idempotencyKey: 'k1', diff --git a/packages/adapters/sqlite-vec/src/adapter.ts b/packages/adapters/sqlite-vec/src/adapter.ts index d16c555..ecbaa2f 100644 --- a/packages/adapters/sqlite-vec/src/adapter.ts +++ b/packages/adapters/sqlite-vec/src/adapter.ts @@ -28,8 +28,8 @@ export interface SqliteVecAdapterConfig { * * **Limitations vs PostgreSQL:** * - No hybrid search (BM25 keyword search unavailable) - * - No document management (list, update, delete documents) - * - No context passages (searchWithDocuments) + * - No source management (list, update, delete sources) + * - No context passages (searchWithSources) * - No policy enforcement * - No graph/memory storage (no sqlite memory adapter — `QuerySignals.graph` and * `QuerySignals.memory` require the pgvector adapter) @@ -183,14 +183,14 @@ export class SqliteVecAdapter implements VectorStoreAdapter { return tables } - async upsertDocument(model: string, chunks: EmbeddedChunk[]): Promise { + async upsertSourceChunks(model: string, chunks: EmbeddedChunk[]): Promise { if (chunks.length === 0) return const { chunksTable, vecTable } = this.getTables(model) const upsertChunk = this.db.prepare( `INSERT INTO ${chunksTable} (id, bucket_id, tenant_id, group_id, user_id, agent_id, conversation_id, - document_id, idempotency_key, content, + source_id, idempotency_key, content, embedding_model, chunk_index, total_chunks, metadata, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT (idempotency_key, chunk_index, bucket_id) DO UPDATE SET @@ -200,7 +200,7 @@ export class SqliteVecAdapter implements VectorStoreAdapter { user_id = excluded.user_id, agent_id = excluded.agent_id, conversation_id = excluded.conversation_id, - document_id = excluded.document_id, + source_id = excluded.source_id, content = excluded.content, embedding_model = excluded.embedding_model, total_chunks = excluded.total_chunks, @@ -230,7 +230,7 @@ export class SqliteVecAdapter implements VectorStoreAdapter { chunk.userId ?? null, chunk.agentId ?? null, chunk.conversationId ?? null, - chunk.documentId, + chunk.sourceId, chunk.idempotencyKey, chunk.content, chunk.embeddingModel, @@ -455,9 +455,9 @@ function buildWhere(filter?: ChunkFilter): { where: string; params: unknown[] } conditions.push(`conversation_id = ?`) params.push(filter.conversationId) } - if (filter.documentId != null) { - conditions.push(`document_id = ?`) - params.push(filter.documentId) + if (filter.sourceId != null) { + conditions.push(`source_id = ?`) + params.push(filter.sourceId) } if (filter.idempotencyKey != null) { conditions.push(`idempotency_key = ?`) @@ -501,7 +501,7 @@ function mapRowToScoredChunk(row: Record): ScoredChunk { userId: (row.user_id as string) ?? undefined, agentId: (row.agent_id as string) ?? 
undefined,
     conversationId: (row.conversation_id as string) ?? undefined,
-    documentId: row.document_id as string,
+    sourceId: row.source_id as string,
     content: row.content as string,
     embedding: [], // Don't return the full vector
     embeddingModel: row.embedding_model as string,
diff --git a/packages/adapters/sqlite-vec/src/migrations.ts b/packages/adapters/sqlite-vec/src/migrations.ts
index 249377c..6c7188c 100644
--- a/packages/adapters/sqlite-vec/src/migrations.ts
+++ b/packages/adapters/sqlite-vec/src/migrations.ts
@@ -41,7 +41,7 @@ export const MODEL_CHUNKS_SQL = (chunksTable: string) => `
     user_id TEXT,
     agent_id TEXT,
     conversation_id TEXT,
-    document_id TEXT NOT NULL,
+    source_id TEXT NOT NULL,
     idempotency_key TEXT NOT NULL,
     content TEXT NOT NULL,
     embedding_model TEXT NOT NULL,
@@ -60,8 +60,8 @@ export const MODEL_CHUNKS_SQL = (chunksTable: string) => `
   CREATE INDEX IF NOT EXISTS ${chunksTable}_bucket_tenant_idx
     ON ${chunksTable} (bucket_id, tenant_id);
 
-  CREATE INDEX IF NOT EXISTS ${chunksTable}_doc_chunk_idx
-    ON ${chunksTable} (document_id, chunk_index);
+  CREATE INDEX IF NOT EXISTS ${chunksTable}_source_chunk_idx
+    ON ${chunksTable} (source_id, chunk_index);
 
   CREATE INDEX IF NOT EXISTS ${chunksTable}_tenant_user_idx
     ON ${chunksTable} (tenant_id, user_id);
diff --git a/packages/mcp-server/src/tools.ts b/packages/mcp-server/src/tools.ts
index 48f4081..14d8a72 100644
--- a/packages/mcp-server/src/tools.ts
+++ b/packages/mcp-server/src/tools.ts
@@ -164,7 +164,7 @@ export async function executeTool(
     case 'typegraph_add_conversation':
       result = await memory.addConversationTurn(
         args['messages'] as { role: 'user' | 'assistant' | 'system' | 'tool'; content: string }[],
-        args['conversationId'] as string | undefined,
+        { conversationId: args['conversationId'] as string | undefined },
       )
       break
 
diff --git a/packages/otel/src/otel-event-sink.ts b/packages/otel/src/otel-event-sink.ts
index e87814a..f76086f 100644
--- a/packages/otel/src/otel-event-sink.ts
+++ b/packages/otel/src/otel-event-sink.ts
@@ -162,7 +162,7 @@ function buildAttributes(
 
     case 'index.start':
     case 'index.complete':
-    case 'index.document': {
+    case 'index.source': {
       if (typeof payload['bucketId'] === 'string') {
         attrs[ATTR.GEN_AI_DATA_SOURCE_ID] = payload['bucketId']
       }
diff --git a/packages/sdk/README.md b/packages/sdk/README.md
index b483df1..2884ba3 100644
--- a/packages/sdk/README.md
+++ b/packages/sdk/README.md
@@ -1,6 +1,6 @@
 # @typegraph-ai/sdk
 
-The TypeGraph SDK is the main TypeScript API for building an AI context layer: ingest documents, query them with composable retrieval signals, build LLM-ready context, and wire graph or memory features when needed.
+The TypeGraph SDK is the main TypeScript API for building an AI context layer: ingest sources, query them with composable retrieval signals, build LLM-ready context, and wire graph or memory features when needed.
 
 Use this README for the essentials. Use [typegraph.ai/docs](https://typegraph.ai/docs) for complete guides, deployment details, and deeper architecture notes.
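The updated README describes the renamed flow only at a high level, so a minimal sketch of the source-based cloud flow follows. It is illustrative, not the SDK's documented API: `createCloudInstance`, the `ingest` opts bag, and `sources.list` appear in this changeset's tests, while the import path, the config values, and the `query` call with retrieval `signals` on the cloud instance are assumptions.

```ts
import { createCloudInstance } from '@typegraph-ai/sdk' // assumed export path

// Cloud-mode instance; apiKey/baseUrl follow the shape exercised in cloud-instance.test.ts.
const typegraph = createCloudInstance({
  apiKey: process.env.TYPEGRAPH_API_KEY ?? 'test-key',
  baseUrl: 'https://example.test/api',
})

// Ingest a source (formerly "document"); the opts bag may be omitted or passed as null.
await typegraph.ingest(
  [{ title: 'Q3 kickoff notes', content: 'Alice met Bob to plan the Q3 launch.' }],
  { bucketId: 'bkt_notes' },
)

// List stored sources; a null filter is normalized to an empty request body.
const sources = await typegraph.sources.list(null)

// Hypothetical query with composable retrieval signals; keyword search requires the pgvector adapter.
const results = await typegraph.query('Q3 launch plan', { signals: { keyword: true } })
```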
diff --git a/packages/sdk/package.json b/packages/sdk/package.json index 2ec0094..218320c 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -1,6 +1,6 @@ { "name": "@typegraph-ai/sdk", - "version": "0.3.12", + "version": "0.3.13", "description": "TypeScript SDK for retrieval, knowledge graph, and cognitive memory for AI agents", "type": "module", "main": "./dist/index.js", diff --git a/packages/sdk/src/__tests__/assemble.test.ts b/packages/sdk/src/__tests__/assemble.test.ts index 9d1b3af..4811963 100644 --- a/packages/sdk/src/__tests__/assemble.test.ts +++ b/packages/sdk/src/__tests__/assemble.test.ts @@ -8,8 +8,8 @@ function makeChunk(overrides: Partial = {}): QueryChunkResult score: 0.9123, scores: { raw: { cosineSimilarity: 0.9123 }, normalized: { semantic: 0.9123 } }, sources: ['semantic'], - document: { - id: 'doc-1', + source: { + id: 'source-1', bucketId: 'books', title: 'Maud', url: 'https://example.com/maud', @@ -70,7 +70,7 @@ describe('buildContext', () => { sourceEntityName: 'Alfred Tennyson', targetEntityId: 'ent-2', targetEntityName: 'Maud', - relation: 'WROTE', + relation: 'AUTHORED', factText: 'Alfred Tennyson wrote Maud.', weight: 1, evidenceCount: 2, @@ -87,7 +87,7 @@ describe('buildContext', () => { expect(built.context).toContain('metadata: {"source":"gutenberg","tags":["poetry","victorian"]}') expect(built.context).toContain('\nMaud is a poem by Alfred Tennyson.\n') expect(built.context).toContain('## Context Facts') - expect(built.context).toContain('relation: WROTE') + expect(built.context).toContain('relation: AUTHORED') expect(built.context).toContain('\nAlfred Tennyson wrote Maud.\n') }) @@ -98,7 +98,7 @@ describe('buildContext', () => { edgeId: 'edge-1', sourceEntityId: 'ent-1', targetEntityId: 'ent-2', - relation: 'WROTE', + relation: 'AUTHORED', factText: 'Alfred Tennyson wrote Maud.', weight: 1, evidenceCount: 2, diff --git a/packages/sdk/src/__tests__/chunker.test.ts b/packages/sdk/src/__tests__/chunker.test.ts index 8fa2a83..94ceef0 100644 --- a/packages/sdk/src/__tests__/chunker.test.ts +++ b/packages/sdk/src/__tests__/chunker.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect } from 'vitest' import { defaultChunker } from '../index-engine/chunker.js' -import { createTestDocument } from './helpers/mock-connector.js' +import { createTestSource } from './helpers/mock-connector.js' describe('defaultChunker', () => { it('returns single chunk for short content', async () => { - const doc = createTestDocument({ content: 'Short text.' }) - const chunks = await defaultChunker(doc, { chunkSize: 100, chunkOverlap: 20 }) + const source = createTestSource({ content: 'Short text.' }) + const chunks = await defaultChunker(source, { chunkSize: 100, chunkOverlap: 20 }) expect(chunks).toHaveLength(1) expect(chunks[0]!.content).toBe('Short text.') expect(chunks[0]!.chunkIndex).toBe(0) @@ -13,15 +13,15 @@ describe('defaultChunker', () => { it('splits long content into multiple chunks', async () => { const content = 'This is a sentence. '.repeat(200) - const doc = createTestDocument({ content }) - const chunks = await defaultChunker(doc, { chunkSize: 512, chunkOverlap: 0 }) + const source = createTestSource({ content }) + const chunks = await defaultChunker(source, { chunkSize: 512, chunkOverlap: 0 }) expect(chunks.length).toBeGreaterThan(1) }) it('preserves chunk indices in order', async () => { const content = 'This is a sentence. 
'.repeat(200) - const doc = createTestDocument({ content }) - const chunks = await defaultChunker(doc, { chunkSize: 512, chunkOverlap: 0 }) + const source = createTestSource({ content }) + const chunks = await defaultChunker(source, { chunkSize: 512, chunkOverlap: 0 }) for (let i = 0; i < chunks.length; i++) { expect(chunks[i]!.chunkIndex).toBe(i) } @@ -29,16 +29,16 @@ describe('defaultChunker', () => { it('skips empty chunks after trimming', async () => { const content = 'Hello' + ' '.repeat(500) - const doc = createTestDocument({ content }) - const chunks = await defaultChunker(doc, { chunkSize: 512, chunkOverlap: 0 }) + const source = createTestSource({ content }) + const chunks = await defaultChunker(source, { chunkSize: 512, chunkOverlap: 0 }) for (const chunk of chunks) { expect(chunk.content.length).toBeGreaterThan(0) } }) it('returns empty array for empty content', async () => { - const doc = createTestDocument({ content: '' }) - const chunks = await defaultChunker(doc, { chunkSize: 512, chunkOverlap: 0 }) + const source = createTestSource({ content: '' }) + const chunks = await defaultChunker(source, { chunkSize: 512, chunkOverlap: 0 }) expect(chunks).toHaveLength(0) }) @@ -51,9 +51,9 @@ describe('defaultChunker', () => { 'Beauty is in the eye of the beholder.', ] const content = sentences.join(' ') - const doc = createTestDocument({ content }) + const source = createTestSource({ content }) // Use a small chunk size to force splitting - const chunks = await defaultChunker(doc, { chunkSize: 64, chunkOverlap: 0 }) + const chunks = await defaultChunker(source, { chunkSize: 64, chunkOverlap: 0 }) for (const chunk of chunks) { const trimmed = chunk.content.trim() diff --git a/packages/sdk/src/__tests__/cloud-instance.test.ts b/packages/sdk/src/__tests__/cloud-instance.test.ts index 74f6405..e8cf959 100644 --- a/packages/sdk/src/__tests__/cloud-instance.test.ts +++ b/packages/sdk/src/__tests__/cloud-instance.test.ts @@ -2,7 +2,7 @@ import { afterEach, describe, expect, it, vi } from 'vitest' import { createCloudInstance } from '../cloud/cloud-instance.js' function mockFetch() { - const fetchMock = vi.fn().mockResolvedValue(new Response(JSON.stringify({ + const fetchMock = vi.fn().mockImplementation(async () => new Response(JSON.stringify({ bucketId: 'bkt_novel', mode: 'upsert', total: 1, @@ -37,7 +37,7 @@ describe('createCloudInstance', () => { const [url, init] = fetchMock.mock.calls[0]! 
expect(url).toBe('https://example.test/api/v1/buckets/bkt_novel/ingest') const body = JSON.parse((init as RequestInit).body as string) - expect(body.docs).toHaveLength(1) + expect(body.sources).toHaveLength(1) expect(body.opts).toEqual(expect.objectContaining({ bucketId: 'bkt_novel', deduplicateBy: ['content', 'metadata.retryRound'], @@ -58,7 +58,7 @@ describe('createCloudInstance', () => { ) const body = JSON.parse((fetchMock.mock.calls[0]![1] as RequestInit).body as string) - expect(body.doc).toEqual(expect.objectContaining({ title: 'Novel chunk' })) + expect(body.source).toEqual(expect.objectContaining({ title: 'Novel chunk' })) expect(body.chunks).toEqual([{ content: 'Cole Conway met Steve Sharp.', chunkIndex: 0 }]) expect(body.opts).toEqual(expect.objectContaining({ bucketId: 'bkt_novel', @@ -66,4 +66,78 @@ describe('createCloudInstance', () => { })) expect(body.deduplicateBy).toBeUndefined() }) + + it('normalizes null optional request bodies to empty objects', async () => { + const fetchMock = mockFetch() + const instance = createCloudInstance({ apiKey: 'test-key', baseUrl: 'https://example.test/api' }) + + await instance.sources.list(null) + await instance.jobs.list(null) + await instance.policies.list(null) + await instance.listSources(null) + + for (const call of fetchMock.mock.calls) { + const body = JSON.parse((call[1] as RequestInit).body as string) + expect(body).toEqual({}) + } + }) + + it('accepts null ingest opts and keeps opts nested', async () => { + const fetchMock = mockFetch() + const instance = createCloudInstance({ apiKey: 'test-key', baseUrl: 'https://example.test/api' }) + + await instance.ingest([ + { title: 'Untargeted', content: 'Default bucket content' }, + ], null) + + const [url, init] = fetchMock.mock.calls[0]! + expect(url).toBe('https://example.test/api/v1/buckets/bkt_default/ingest') + const body = JSON.parse((init as RequestInit).body as string) + expect(body.opts).toEqual({}) + }) + + it('rejects null destructive source filters with a ConfigError', async () => { + const instance = createCloudInstance({ apiKey: 'test-key', baseUrl: 'https://example.test/api' }) + + await expect(instance.sources.delete(null)).rejects.toThrow('sources.delete requires at least one filter field') + await expect(instance.deleteSources(null)).rejects.toThrow('deleteSources requires at least one filter field') + }) + + it('uses unified memory opts bags in cloud mode', async () => { + const fetchMock = mockFetch() + const instance = createCloudInstance({ apiKey: 'test-key', baseUrl: 'https://example.test/api' }) + + await instance.remember('Prefers SMS', { + userId: 'user-1', + category: 'semantic', + importance: 0.8, + metadata: { source: 'test' }, + }) + await instance.recall('SMS', null) + await instance.healthCheck(null) + await instance.addConversationTurn([ + { role: 'user', content: 'hello' }, + ], { userId: 'user-1', conversationId: 'conv-1' }) + + const rememberBody = JSON.parse((fetchMock.mock.calls[0]![1] as RequestInit).body as string) + expect(rememberBody).toEqual({ + content: 'Prefers SMS', + identity: { userId: 'user-1' }, + category: 'semantic', + importance: 0.8, + metadata: { source: 'test' }, + }) + + const recallBody = JSON.parse((fetchMock.mock.calls[1]![1] as RequestInit).body as string) + expect(recallBody).toEqual({ query: 'SMS', identity: {} }) + + const healthBody = JSON.parse((fetchMock.mock.calls[2]![1] as RequestInit).body as string) + expect(healthBody).toEqual({ identity: {} }) + + const turnBody = JSON.parse((fetchMock.mock.calls[3]![1] as 
RequestInit).body as string) + expect(turnBody).toEqual({ + messages: [{ role: 'user', content: 'hello' }], + identity: { userId: 'user-1', conversationId: 'conv-1' }, + }) + }) }) diff --git a/packages/sdk/src/__tests__/graph-runner.test.ts b/packages/sdk/src/__tests__/graph-runner.test.ts index ff8e793..a550bb0 100644 --- a/packages/sdk/src/__tests__/graph-runner.test.ts +++ b/packages/sdk/src/__tests__/graph-runner.test.ts @@ -3,13 +3,13 @@ import { GraphRunner } from '../query/runners/graph-runner.js' import type { KnowledgeGraphBridge } from '../types/graph-bridge.js' describe('GraphRunner', () => { - it('maps passage graph results into normalized graph results', async () => { - const searchGraphPassages = vi.fn().mockResolvedValue({ + it('maps chunk graph results into normalized graph results', async () => { + const searchGraphChunks = vi.fn().mockResolvedValue({ results: [{ - passageId: 'passage-1', + chunkId: 'chunk-1', content: 'Adarsh Tadimari is debugging Plotline SDK initialization.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 2, totalChunks: 5, score: 0.42, @@ -38,7 +38,7 @@ describe('GraphRunner', () => { trace: { entitySeedCount: 1, factSeedCount: 1, - passageSeedCount: 1, + chunkSeedCount: 1, graphNodeCount: 3, graphEdgeCount: 2, pprNonzeroCount: 3, @@ -47,11 +47,11 @@ describe('GraphRunner', () => { topGraphScores: [0.42], selectedFactIds: ['fact-1'], selectedEntityIds: ['ent-1'], - selectedPassageIds: ['passage-1'], + selectedChunkIds: ['chunk-1'], }, }) - const runner = new GraphRunner({ searchGraphPassages } satisfies KnowledgeGraphBridge) + const runner = new GraphRunner({ searchGraphChunks } satisfies KnowledgeGraphBridge) const run = await runner.run( 'Adarsh Plotline SDK', { tenantId: 'tenant-1' }, @@ -60,7 +60,7 @@ describe('GraphRunner', () => { { restartProbability: 0.5 } ) - expect(searchGraphPassages).toHaveBeenCalledWith( + expect(searchGraphChunks).toHaveBeenCalledWith( 'Adarsh Plotline SDK', { tenantId: 'tenant-1' }, { @@ -68,7 +68,7 @@ describe('GraphRunner', () => { factCandidateLimit: 80, factFilterInputLimit: 12, factSeedLimit: 4, - passageSeedLimit: 80, + chunkSeedLimit: 80, maxExpansionEdgesPerEntity: 25, factChainLimit: 2, maxPprIterations: 40, @@ -84,42 +84,42 @@ describe('GraphRunner', () => { expect.objectContaining({ content: 'Adarsh Tadimari is debugging Plotline SDK initialization.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', rawScores: { graph: 0.42 }, mode: 'graph', chunk: { index: 2, total: 5 }, metadata: expect.objectContaining({ source: 'test', - passageId: 'passage-1', + chunkId: 'chunk-1', }), tenantId: 'tenant-1', }), ]) }) - it('throws when searchGraphPassages is missing', async () => { + it('throws when searchGraphChunks is missing', async () => { const runner = new GraphRunner({} satisfies KnowledgeGraphBridge) await expect( runner.run('Adarsh', { tenantId: 'tenant-1' }, 5) - ).rejects.toThrow('Knowledge graph bridge must implement searchGraphPassages for graph queries.') + ).rejects.toThrow('Knowledge graph bridge must implement searchGraphChunks for graph queries.') }) it('lets explicit graph options override the default profile', async () => { - const searchGraphPassages = vi.fn().mockResolvedValue({ + const searchGraphChunks = vi.fn().mockResolvedValue({ results: [], facts: [], entities: [], trace: {}, }) - const runner = new GraphRunner({ searchGraphPassages } satisfies KnowledgeGraphBridge) + const runner = new GraphRunner({ searchGraphChunks } satisfies 
KnowledgeGraphBridge) await runner.run('query', { tenantId: 'tenant-1' }, 5, undefined, { factFilter: false, factCandidateLimit: 25, }) - expect(searchGraphPassages).toHaveBeenCalledWith( + expect(searchGraphChunks).toHaveBeenCalledWith( 'query', { tenantId: 'tenant-1' }, expect.objectContaining({ diff --git a/packages/sdk/src/__tests__/hash.test.ts b/packages/sdk/src/__tests__/hash.test.ts index 111835e..7ec38ac 100644 --- a/packages/sdk/src/__tests__/hash.test.ts +++ b/packages/sdk/src/__tests__/hash.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect } from 'vitest' import { sha256, resolveIdempotencyKey, buildHashStoreKey } from '../index-engine/hash.js' -import { createTestDocument } from './helpers/mock-connector.js' +import { createTestSource } from './helpers/mock-connector.js' describe('sha256', () => { it('returns 64-char hex string', () => { @@ -20,33 +20,33 @@ describe('sha256', () => { describe('resolveIdempotencyKey', () => { it('resolves field-based spec', () => { - const doc = createTestDocument({ url: 'https://example.com/page' }) - const key = resolveIdempotencyKey(doc, ['url']) + const source = createTestSource({ url: 'https://example.com/page' }) + const key = resolveIdempotencyKey(source, ['url']) expect(key).toBe('https://example.com/page') }) it('resolves multi-field spec joined by ::', () => { - const doc = createTestDocument({ id: 'doc-1', url: 'https://example.com/page' }) - const key = resolveIdempotencyKey(doc, ['id', 'url']) - expect(key).toBe('doc-1::https://example.com/page') + const source = createTestSource({ id: 'source-1', url: 'https://example.com/page' }) + const key = resolveIdempotencyKey(source, ['id', 'url']) + expect(key).toBe('source-1::https://example.com/page') }) it('resolves metadata fields', () => { - const doc = createTestDocument({ metadata: { category: 'tech' } }) - const key = resolveIdempotencyKey(doc, ['metadata.category']) + const source = createTestSource({ metadata: { category: 'tech' } }) + const key = resolveIdempotencyKey(source, ['metadata.category']) expect(key).toBe('tech') }) it('returns empty string for missing fields', () => { - const doc = createTestDocument({ metadata: {} }) - const key = resolveIdempotencyKey(doc, ['metadata.nonexistent']) + const source = createTestSource({ metadata: {} }) + const key = resolveIdempotencyKey(source, ['metadata.nonexistent']) expect(key).toBe('') }) it('supports function-based spec', () => { - const doc = createTestDocument({ id: 'doc-1' }) - const key = resolveIdempotencyKey(doc, (d) => `custom-${d.id}`) - expect(key).toBe('custom-doc-1') + const source = createTestSource({ id: 'source-1' }) + const key = resolveIdempotencyKey(source, (d) => `custom-${d.id}`) + expect(key).toBe('custom-source-1') }) }) diff --git a/packages/sdk/src/__tests__/helpers/mock-adapter.ts b/packages/sdk/src/__tests__/helpers/mock-adapter.ts index 639da45..edb60d4 100644 --- a/packages/sdk/src/__tests__/helpers/mock-adapter.ts +++ b/packages/sdk/src/__tests__/helpers/mock-adapter.ts @@ -1,6 +1,6 @@ import type { VectorStoreAdapter, HashStoreAdapter, SearchOpts, HashRecord, UndeployResult } from '../../types/adapter.js' -import type { EmbeddedChunk, ChunkFilter, ScoredChunk } from '../../types/document.js' -import type { typegraphDocument, DocumentStatus, DocumentFilter, UpsertDocumentInput, UpsertedDocumentRecord } from '../../types/typegraph-document.js' +import type { EmbeddedChunk, ChunkFilter, ScoredChunk } from '../../types/chunk.js' +import type { typegraphSource, SourceStatus, SourceFilter, 
UpsertSourceInput, UpsertedSourceRecord } from '../../types/source.js' import { createHash } from 'crypto' function cosineSimilarity(a: number[], b: number[]): number { @@ -19,12 +19,22 @@ function cosineSimilarity(a: number[], b: number[]): number { function matchesFilter(chunk: EmbeddedChunk, filter: ChunkFilter): boolean { if (filter.bucketId && chunk.bucketId !== filter.bucketId) return false if (filter.bucketIds && filter.bucketIds.length > 0 && !filter.bucketIds.includes(chunk.bucketId)) return false + if (filter.chunkRefs) { + if (filter.chunkRefs.length === 0) return false + const matched = filter.chunkRefs.some(ref => + ref.bucketId === chunk.bucketId && + ref.sourceId === chunk.sourceId && + ref.chunkIndex === chunk.chunkIndex && + (ref.embeddingModel == null || ref.embeddingModel === chunk.embeddingModel) + ) + if (!matched) return false + } if (filter.tenantId && chunk.tenantId !== filter.tenantId) return false if (filter.groupId && chunk.groupId !== filter.groupId) return false if (filter.userId && chunk.userId !== filter.userId) return false if (filter.agentId && chunk.agentId !== filter.agentId) return false if (filter.conversationId && chunk.conversationId !== filter.conversationId) return false - if (filter.documentId && chunk.documentId !== filter.documentId) return false + if (filter.sourceId && chunk.sourceId !== filter.sourceId) return false if (filter.idempotencyKey && chunk.idempotencyKey !== filter.idempotencyKey) return false if (filter.metadata) { for (const [k, v] of Object.entries(filter.metadata)) { @@ -104,21 +114,21 @@ export interface MockAdapterCall { export function createMockAdapter(): VectorStoreAdapter & { calls: MockAdapterCall[] _chunks: Map - _documents: Map + _sources: Map } { const chunks = new Map() - const documents = new Map() + const sources = new Map() const calls: MockAdapterCall[] = [] const hashStore = createMockHashStore() const adapter: VectorStoreAdapter & { calls: MockAdapterCall[] _chunks: Map - _documents: Map + _sources: Map } = { calls, _chunks: chunks, - _documents: documents, + _sources: sources, hashStore, async deploy() { @@ -145,8 +155,8 @@ export function createMockAdapter(): VectorStoreAdapter & { } }, - async upsertDocument(model: string, newChunks: EmbeddedChunk[]) { - calls.push({ method: 'upsertDocument', args: [model, newChunks] }) + async upsertSourceChunks(model: string, newChunks: EmbeddedChunk[]) { + calls.push({ method: 'upsertSourceChunks', args: [model, newChunks] }) if (!chunks.has(model)) { chunks.set(model, []) } @@ -235,19 +245,19 @@ export function createMockAdapter(): VectorStoreAdapter & { return store.filter(c => matchesFilter(c, filter)).length }, - async upsertDocumentRecord(input: UpsertDocumentInput): Promise { - calls.push({ method: 'upsertDocumentRecord', args: [input] }) - const existing = [...documents.values()].find(doc => - doc.bucketId === input.bucketId && - doc.tenantId === input.tenantId && - doc.contentHash === input.contentHash + async upsertSourceRecord(input: UpsertSourceInput): Promise { + calls.push({ method: 'upsertSourceRecord', args: [input] }) + const existing = [...sources.values()].find(source => + source.bucketId === input.bucketId && + source.tenantId === input.tenantId && + source.contentHash === input.contentHash ) const id = existing?.id ?? input.id ?? createHash('sha256') .update(`${input.bucketId}::${input.tenantId ?? 
''}::${input.contentHash}`) .digest('hex') .slice(0, 16) const now = new Date() - const doc: typegraphDocument = { + const source: typegraphSource = { id, bucketId: input.bucketId, tenantId: input.tenantId, @@ -261,23 +271,25 @@ export function createMockAdapter(): VectorStoreAdapter & { userId: input.userId, agentId: input.agentId, conversationId: input.conversationId, + graphExtracted: input.graphExtracted ?? false, indexedAt: now, createdAt: existing?.createdAt ?? now, updatedAt: now, metadata: input.metadata ?? {}, + subject: input.subject, } - documents.set(id, doc) - return { ...doc, wasCreated: !existing } + sources.set(id, source) + return { ...source, wasCreated: !existing } }, - async getDocument(id: string): Promise { - calls.push({ method: 'getDocument', args: [id] }) - return documents.get(id) ?? null + async getSource(id: string): Promise { + calls.push({ method: 'getSource', args: [id] }) + return sources.get(id) ?? null }, - async listDocuments(filter: DocumentFilter): Promise { - calls.push({ method: 'listDocuments', args: [filter] }) - return [...documents.values()].filter(d => { + async listSources(filter: SourceFilter): Promise { + calls.push({ method: 'listSources', args: [filter] }) + return [...sources.values()].filter(d => { if (filter.bucketId && d.bucketId !== filter.bucketId) return false if (filter.tenantId && d.tenantId !== filter.tenantId) return false if (filter.status) { @@ -288,41 +300,41 @@ export function createMockAdapter(): VectorStoreAdapter & { }) }, - async deleteDocuments(filter: DocumentFilter): Promise { - calls.push({ method: 'deleteDocuments', args: [filter] }) + async deleteSources(filter: SourceFilter): Promise { + calls.push({ method: 'deleteSources', args: [filter] }) let count = 0 - for (const [id, d] of documents) { + for (const [id, d] of sources) { let match = true if (filter.bucketId && d.bucketId !== filter.bucketId) match = false if (filter.tenantId && d.tenantId !== filter.tenantId) match = false if (match) { - documents.delete(id) + sources.delete(id) count++ } } return count }, - async updateDocumentStatus(id: string, status: DocumentStatus, chunkCount?: number) { - calls.push({ method: 'updateDocumentStatus', args: [id, status, chunkCount] }) - const doc = documents.get(id) - if (doc) { - doc.status = status - if (chunkCount !== undefined) doc.chunkCount = chunkCount - doc.updatedAt = new Date() + async updateSourceStatus(id: string, status: SourceStatus, chunkCount?: number) { + calls.push({ method: 'updateSourceStatus', args: [id, status, chunkCount] }) + const source = sources.get(id) + if (source) { + source.status = status + if (chunkCount !== undefined) source.chunkCount = chunkCount + source.updatedAt = new Date() } }, async getChunksByRange( model: string, - documentId: string, + sourceId: string, fromIndex: number, toIndex: number ): Promise { - calls.push({ method: 'getChunksByRange', args: [model, documentId, fromIndex, toIndex] }) + calls.push({ method: 'getChunksByRange', args: [model, sourceId, fromIndex, toIndex] }) const store = chunks.get(model) ?? 
[] return store - .filter(c => c.documentId === documentId && c.chunkIndex >= fromIndex && c.chunkIndex <= toIndex) + .filter(c => c.sourceId === sourceId && c.chunkIndex >= fromIndex && c.chunkIndex <= toIndex) .map(c => ({ ...c, scores: { semantic: 0 } })) .sort((a, b) => a.chunkIndex - b.chunkIndex) }, diff --git a/packages/sdk/src/__tests__/helpers/mock-connector.ts b/packages/sdk/src/__tests__/helpers/mock-connector.ts index d953bab..321eff2 100644 --- a/packages/sdk/src/__tests__/helpers/mock-connector.ts +++ b/packages/sdk/src/__tests__/helpers/mock-connector.ts @@ -1,24 +1,24 @@ -import type { RawDocument } from '../../types/connector.js' +import type { SourceInput } from '../../types/connector.js' -export function createTestDocument(overrides?: Partial): RawDocument { +export function createTestSource(overrides?: Partial): SourceInput { return { - id: 'doc-1', - content: 'Test document content. This is the body of the test document.', - title: 'Test Document', - url: 'https://example.com/doc-1', + id: 'source-1', + content: 'Test source content. This is the body of the test source.', + title: 'Test Source', + url: 'https://example.com/source-1', updatedAt: new Date('2024-01-01'), metadata: {}, ...overrides, } } -export function createTestDocuments(count: number, contentPrefix?: string): RawDocument[] { - const prefix = contentPrefix ?? 'Document' +export function createTestSources(count: number, contentPrefix?: string): SourceInput[] { + const prefix = contentPrefix ?? 'Source' return Array.from({ length: count }, (_, i) => ({ - id: `doc-${i + 1}`, - content: `${prefix} ${i + 1} content. This is the body of document number ${i + 1}.`, + id: `source-${i + 1}`, + content: `${prefix} ${i + 1} content. This is the body of source number ${i + 1}.`, title: `${prefix} ${i + 1}`, - url: `https://example.com/doc-${i + 1}`, + url: `https://example.com/source-${i + 1}`, updatedAt: new Date('2024-01-01'), metadata: {}, })) diff --git a/packages/sdk/src/__tests__/helpers/mock-source.ts b/packages/sdk/src/__tests__/helpers/mock-source.ts index ec8e468..ea11e52 100644 --- a/packages/sdk/src/__tests__/helpers/mock-source.ts +++ b/packages/sdk/src/__tests__/helpers/mock-source.ts @@ -1,14 +1,14 @@ import type { Bucket } from '../../types/bucket.js' -import type { RawDocument, ChunkOpts } from '../../types/connector.js' +import type { SourceInput, ChunkOpts } from '../../types/connector.js' import type { IngestOptions } from '../../types/index-types.js' export interface MockSourceOpts { id?: string name?: string - documents?: RawDocument[] + sources?: SourceInput[] chunkSize?: number chunkOverlap?: number - deduplicateBy?: string[] | ((doc: RawDocument) => string) + deduplicateBy?: string[] | ((source: SourceInput) => string) stripMarkdownForEmbedding?: boolean preprocessForEmbedding?: (content: string) => string propagateMetadata?: string[] @@ -16,14 +16,14 @@ export interface MockSourceOpts { export interface MockSourceResult { bucket: Bucket - documents: RawDocument[] + sources: SourceInput[] ingestOptions: IngestOptions chunkOpts: ChunkOpts } export function createMockBucket(opts: MockSourceOpts = {}): MockSourceResult { const id = opts.id ?? 'test-source' - const documents = opts.documents ?? [] + const sources = opts.sources ?? 
[] const bucket: Bucket = { id, @@ -45,5 +45,5 @@ export function createMockBucket(opts: MockSourceOpts = {}): MockSourceResult { const chunkOpts: ChunkOpts = { chunkSize, chunkOverlap } - return { bucket, documents, ingestOptions, chunkOpts } + return { bucket, sources, ingestOptions, chunkOpts } } diff --git a/packages/sdk/src/__tests__/index-engine.test.ts b/packages/sdk/src/__tests__/index-engine.test.ts index 3912c36..9540024 100644 --- a/packages/sdk/src/__tests__/index-engine.test.ts +++ b/packages/sdk/src/__tests__/index-engine.test.ts @@ -4,10 +4,9 @@ import { embeddingModelKey } from '../embedding/provider.js' import { createMockAdapter } from './helpers/mock-adapter.js' import { createMockEmbedding } from './helpers/mock-embedding.js' import { createMockBucket } from './helpers/mock-source.js' -import { createTestDocument, createTestDocuments } from './helpers/mock-connector.js' +import { createTestSource, createTestSources } from './helpers/mock-connector.js' import { defaultChunker } from '../index-engine/chunker.js' import { buildHashStoreKey, resolveIdempotencyKey } from '../index-engine/hash.js' -import { chunkIdFor } from '../utils/id.js' import type { typegraphEvent } from '../types/events.js' describe('IndexEngine', () => { @@ -19,52 +18,64 @@ describe('IndexEngine', () => { embedding = createMockEmbedding() }) - /** Helper: chunk docs and ingest via engine.ingestBatch */ + /** Helper: chunk sources and ingest via engine.ingestBatch */ async function ingestDocs( engine: IndexEngine, bucketId: string, - docs: ReturnType, + sources: ReturnType, ingestOptions: ReturnType['ingestOptions'], opts?: Parameters[2], ) { const chunkOpts = { chunkSize: ingestOptions.chunkSize ?? 100, chunkOverlap: ingestOptions.chunkOverlap ?? 20 } - const items = await Promise.all(docs.map(async doc => ({ doc, chunks: await defaultChunker(doc, chunkOpts) }))) - return engine.ingestBatch(bucketId, items, { ...ingestOptions, ...opts }) + const items = await Promise.all(sources.map(async source => ({ source, chunks: await defaultChunker(source, chunkOpts) }))) + return engine.ingestBatch(bucketId, items, { ...ingestOptions, ...(opts ?? 
{}) }) } describe('ingestBatch', () => { - it('indexes all documents', async () => { - const docs = createTestDocuments(3) - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + it('indexes all sources', async () => { + const sources = createTestSources(3) + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - const result = await ingestDocs(engine, bucket.id, docs, ingestOptions) + const result = await ingestDocs(engine, bucket.id, sources, ingestOptions) expect(result.total).toBe(3) expect(result.inserted).toBe(3) expect(result.skipped).toBe(0) }) - it('skips unchanged documents (idempotency)', async () => { - const docs = createTestDocuments(2) - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + it('treats null opts as omitted', async () => { + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) + const chunks = [{ content: 'Chunk 0', chunkIndex: 0 }] + const engine = new IndexEngine(adapter, embedding) + + const result = await engine.ingestBatch(bucket.id, [{ source, chunks }], null) + + expect(result.inserted).toBe(1) + expect(result.total).toBe(1) + }) + + it('skips unchanged sources (idempotency)', async () => { + const sources = createTestSources(2) + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, docs, ingestOptions) - const result2 = await ingestDocs(engine, bucket.id, docs, ingestOptions) + await ingestDocs(engine, bucket.id, sources, ingestOptions) + const result2 = await ingestDocs(engine, bucket.id, sources, ingestOptions) expect(result2.total).toBe(2) expect(result2.skipped).toBe(2) expect(result2.inserted).toBe(0) }) - it('skips unchanged group-visible documents', async () => { - const docs = createTestDocuments(2) - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + it('skips unchanged group-visible sources', async () => { + const sources = createTestSources(2) + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, docs, ingestOptions, { + await ingestDocs(engine, bucket.id, sources, ingestOptions, { groupId: 'Novel-30752', visibility: 'group', }) - const result2 = await ingestDocs(engine, bucket.id, docs, ingestOptions, { + const result2 = await ingestDocs(engine, bucket.id, sources, ingestOptions, { groupId: 'Novel-30752', visibility: 'group', }) @@ -76,209 +87,264 @@ describe('IndexEngine', () => { const countCalls = adapter.calls.filter(c => c.method === 'countChunks') expect(countCalls.at(-1)!.args[1]).toEqual(expect.objectContaining({ groupId: 'Novel-30752', - idempotencyKey: 'doc-2', + idempotencyKey: 'source-2', })) }) it('re-indexes on content change', async () => { - const docs = [createTestDocument({ id: 'doc-1', content: 'Original content' })] - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = [createTestSource({ id: 'source-1', content: 'Original content' })] + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, docs, ingestOptions) + await ingestDocs(engine, bucket.id, sources, ingestOptions) - const updatedDocs = [createTestDocument({ id: 'doc-1', content: 'Updated content' })] + const updatedDocs = 
[createTestSource({ id: 'source-1', content: 'Updated content' })] const result = await ingestDocs(engine, bucket.id, updatedDocs, ingestOptions) expect(result.inserted).toBe(1) }) it('re-indexes on model change', async () => { - const docs = [createTestDocument()] - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = [createTestSource()] + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine1 = new IndexEngine(adapter, createMockEmbedding({ model: 'model-v1' })) - await ingestDocs(engine1, bucket.id, docs, ingestOptions) + await ingestDocs(engine1, bucket.id, sources, ingestOptions) const engine2 = new IndexEngine(adapter, createMockEmbedding({ model: 'model-v2' })) - const result = await ingestDocs(engine2, bucket.id, docs, ingestOptions) + const result = await ingestDocs(engine2, bucket.id, sources, ingestOptions) expect(result.inserted).toBe(0) expect(result.updated).toBe(1) }) it('calls ensureModel', async () => { - const docs = [createTestDocument()] - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = [createTestSource()] + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, docs, ingestOptions) + await ingestDocs(engine, bucket.id, sources, ingestOptions) expect(adapter.calls.some(c => c.method === 'ensureModel')).toBe(true) }) it('supports dryRun', async () => { - const docs = [createTestDocument()] - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = [createTestSource()] + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) const engine = new IndexEngine(adapter, embedding) - const result = await ingestDocs(engine, bucket.id, docs, ingestOptions, { dryRun: true }) + const result = await ingestDocs(engine, bucket.id, sources, ingestOptions, { dryRun: true }) expect(result.inserted).toBe(1) - expect(adapter.calls.filter(c => c.method === 'upsertDocument')).toHaveLength(0) + expect(adapter.calls.filter(c => c.method === 'upsertSourceChunks')).toHaveLength(0) }) it('strips markdown for embedding when configured', async () => { - const doc = createTestDocument({ content: '# Heading\n\n**Bold** text' }) + const source = createTestSource({ content: '# Heading\n\n**Bold** text' }) const { bucket, ingestOptions } = createMockBucket({ - documents: [doc], + sources: [source], stripMarkdownForEmbedding: true, }) const engine = new IndexEngine(adapter, embedding) const embedSpy = vi.spyOn(embedding, 'embedBatch') - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) const embeddedTexts = embedSpy.mock.calls[0]![0] expect(embeddedTexts[0]).not.toContain('#') expect(embeddedTexts[0]).not.toContain('**') }) it('applies custom preprocessForEmbedding', async () => { - const doc = createTestDocument({ content: 'Hello World' }) + const source = createTestSource({ content: 'Hello World' }) const { bucket, ingestOptions } = createMockBucket({ - documents: [doc], + sources: [source], preprocessForEmbedding: (c) => c.toLowerCase(), }) const engine = new IndexEngine(adapter, embedding) const embedSpy = vi.spyOn(embedding, 'embedBatch') - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) const embeddedTexts = embedSpy.mock.calls[0]![0] expect(embeddedTexts[0]).toBe('hello world') }) 
it('propagates default metadata (title, url, updatedAt)', async () => { - const doc = createTestDocument({ - title: 'My Doc', + const source = createTestSource({ + title: 'My Source', url: 'https://example.com', updatedAt: new Date('2024-06-01'), }) - const { bucket, ingestOptions } = createMockBucket({ documents: [doc] }) + const { bucket, ingestOptions } = createMockBucket({ sources: [source] }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) const stored = adapter._chunks.get(embeddingModelKey(embedding))! - expect(stored[0]!.metadata.title).toBe('My Doc') + expect(stored[0]!.metadata.title).toBe('My Source') expect(stored[0]!.metadata.url).toBe('https://example.com') }) + it('materializes source subjects without requiring triple extraction', async () => { + const subject = { + name: 'Acme demo', + entityType: 'meeting', + externalIds: [{ type: 'meeting_id', id: 'mtng_123' }], + } + const source = createTestSource({ + id: 'source-subject', + title: 'Acme demo transcript', + content: 'Transcript body that does not repeat the meeting title.', + subject, + }) + const { bucket } = createMockBucket({ sources: [] }) + const addSourceSubject = vi.fn().mockResolvedValue({ + id: 'ent_meeting', + name: 'Acme demo', + entityType: 'meeting', + aliases: [], + edgeCount: 0, + properties: {}, + createdAt: new Date('2024-01-01'), + topEdges: [], + }) + const engine = new IndexEngine( + adapter, + embedding, + undefined, + undefined, + { addSourceSubject } as any, + ) + + await engine.ingestWithChunks( + bucket.id, + source, + [ + { content: 'Opening discussion.', chunkIndex: 0 }, + { content: 'Next steps.', chunkIndex: 1 }, + ], + ) + + expect(addSourceSubject).toHaveBeenCalledTimes(1) + expect(addSourceSubject).toHaveBeenCalledWith(expect.objectContaining({ + subject, + bucketId: bucket.id, + sourceId: 'source-subject', + embeddingModel: embeddingModelKey(embedding), + chunks: expect.arrayContaining([ + expect.objectContaining({ chunkIndex: 0, id: expect.any(String) }), + expect.objectContaining({ chunkIndex: 1, id: expect.any(String) }), + ]), + })) + const recordCall = adapter.calls.find(c => c.method === 'upsertSourceRecord')! + expect(recordCall.args[0]).toEqual(expect.objectContaining({ subject })) + const stored = adapter._chunks.get(embeddingModelKey(embedding))! + expect(stored).toHaveLength(2) + expect(stored.every(chunk => chunk.metadata.subject === subject)).toBe(true) + }) + it('normalizes url=null to no URL during batch ingest', async () => { - const doc = createTestDocument({ id: 'doc-null-url', url: null }) - const { bucket, ingestOptions } = createMockBucket({ documents: [doc] }) + const source = createTestSource({ id: 'source-null-url', url: null }) + const { bucket, ingestOptions } = createMockBucket({ sources: [source] }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) - const recordCall = adapter.calls.find(c => c.method === 'upsertDocumentRecord')! + const recordCall = adapter.calls.find(c => c.method === 'upsertSourceRecord')! expect(recordCall.args[0].url).toBeUndefined() const stored = adapter._chunks.get(embeddingModelKey(embedding))! 
expect(stored[0]!.metadata.url).toBeUndefined() }) it('normalizes url=null to no URL during pre-chunked ingest', async () => { - const doc = createTestDocument({ id: 'doc-null-url-prechunked', url: null }) - const { bucket } = createMockBucket({ documents: [] }) + const source = createTestSource({ id: 'source-null-url-prechunked', url: null }) + const { bucket } = createMockBucket({ sources: [] }) const engine = new IndexEngine(adapter, embedding) const result = await engine.ingestWithChunks( bucket.id, - doc, + source, [{ content: 'Chunk content', chunkIndex: 0 }], ) expect(result.inserted).toBe(1) - const recordCall = adapter.calls.find(c => c.method === 'upsertDocumentRecord')! + const recordCall = adapter.calls.find(c => c.method === 'upsertSourceRecord')! expect(recordCall.args[0].url).toBeUndefined() const stored = adapter._chunks.get(embeddingModelKey(embedding))! expect(stored[0]!.metadata.url).toBeUndefined() }) it('propagates custom metadata fields', async () => { - const doc = createTestDocument({ + const source = createTestSource({ metadata: { category: 'tech', priority: 'high' }, }) const { bucket, ingestOptions } = createMockBucket({ - documents: [doc], + sources: [source], propagateMetadata: ['metadata.category', 'metadata.priority'], }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) const stored = adapter._chunks.get(embeddingModelKey(embedding))! expect(stored[0]!.metadata.category).toBe('tech') expect(stored[0]!.metadata.priority).toBe('high') }) - it('creates document records', async () => { - const doc = createTestDocument() - const { bucket, ingestOptions } = createMockBucket({ documents: [doc] }) + it('creates source records', async () => { + const source = createTestSource() + const { bucket, ingestOptions } = createMockBucket({ sources: [source] }) const engine = new IndexEngine(adapter, embedding) - await ingestDocs(engine, bucket.id, [doc], ingestOptions) + await ingestDocs(engine, bucket.id, [source], ingestOptions) - expect(adapter.calls.some(c => c.method === 'upsertDocumentRecord')).toBe(true) + expect(adapter.calls.some(c => c.method === 'upsertSourceRecord')).toBe(true) }) - it('uses canonical document id when hash dedup is missing', async () => { - const doc = createTestDocument({ + it('uses canonical source id when hash dedup is missing', async () => { + const source = createTestSource({ id: undefined, - content: 'Canonical document content about Alice and Bob.', - title: 'Canonical Batch Document', + content: 'Canonical source content about Alice and Bob.', + title: 'Canonical Batch Source', url: 'https://example.com/canonical-batch', }) - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Alice met Bob.', chunkIndex: 0 }] const events: typegraphEvent[] = [] - const persistPassageNodes = vi.fn().mockResolvedValue(undefined) const extractFromChunk = vi.fn().mockResolvedValue({ entities: [] }) const engine = new IndexEngine(adapter, embedding, { emit: event => { events.push(event) }, }) - engine.tripleExtractor = { persistPassageNodes, extractFromChunk } as any + engine.tripleExtractor = { extractFromChunk } as any - await engine.ingestBatch(bucket.id, [{ doc, chunks }], { graphExtraction: true }) - const canonicalId = adapter._chunks.get(embeddingModelKey(embedding))![0]!.documentId - const ikey = resolveIdempotencyKey(doc, ['url']) + await 
engine.ingestBatch(bucket.id, [{ source, chunks }], { graphExtraction: true }) + const canonicalId = adapter._chunks.get(embeddingModelKey(embedding))![0]!.sourceId + const ikey = resolveIdempotencyKey(source, ['url']) await adapter.hashStore.delete(buildHashStoreKey(undefined, bucket.id, ikey)) adapter.calls.length = 0 events.length = 0 - persistPassageNodes.mockClear() extractFromChunk.mockClear() - const result = await engine.ingestBatch(bucket.id, [{ doc, chunks }], { graphExtraction: true }) + const result = await engine.ingestBatch(bucket.id, [{ source, chunks }], { graphExtraction: true }) expect(result.inserted).toBe(0) expect(result.updated).toBe(1) - const upsertCall = adapter.calls.find(c => c.method === 'upsertDocument')! - expect((upsertCall.args[1] as Array<{ documentId: string }>)[0]!.documentId).toBe(canonicalId) - expect(persistPassageNodes.mock.calls[0]![0][0].documentId).toBe(canonicalId) + const upsertCall = adapter.calls.find(c => c.method === 'upsertSourceChunks')! + expect((upsertCall.args[1] as Array<{ sourceId: string }>)[0]!.sourceId).toBe(canonicalId) expect(extractFromChunk.mock.calls[0]![3]).toBe(canonicalId) - expect(adapter.calls.filter(c => c.method === 'updateDocumentStatus').at(-1)!.args[0]).toBe(canonicalId) - expect(events.find(e => e.eventType === 'index.document')!.targetId).toBe(canonicalId) + expect(adapter.calls.filter(c => c.method === 'updateSourceStatus').at(-1)!.args[0]).toBe(canonicalId) + expect(events.find(e => e.eventType === 'index.source')!.targetId).toBe(canonicalId) }) it('leaves graph extraction failures retryable', async () => { - const doc = createTestDocument({ + const source = createTestSource({ id: undefined, - content: 'Retryable graph extraction document.', - title: 'Retryable Graph Document', + content: 'Retryable graph extraction source.', + title: 'Retryable Graph Source', url: 'https://example.com/retryable-graph', }) - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Alice met Bob.', chunkIndex: 0 }] const engine = new IndexEngine(adapter, embedding) engine.tripleExtractor = { extractFromChunk: vi.fn().mockRejectedValue(new Error('Graph write failed')), } as any - const failed = await engine.ingestBatch(bucket.id, [{ doc, chunks }], { graphExtraction: true }) + const failed = await engine.ingestBatch(bucket.id, [{ source, chunks }], { graphExtraction: true }) expect(failed.inserted).toBe(0) expect(failed.updated).toBe(0) expect(failed.extraction?.failed).toBe(1) - const failedStatus = adapter.calls.filter(c => c.method === 'updateDocumentStatus').at(-1)! + const failedStatus = adapter.calls.filter(c => c.method === 'updateSourceStatus').at(-1)! 
expect(failedStatus.args[1]).toBe('failed') - const ikey = resolveIdempotencyKey(doc, ['url']) + const ikey = resolveIdempotencyKey(source, ['url']) const storeKey = buildHashStoreKey(undefined, bucket.id, ikey) expect(await adapter.hashStore.get(storeKey)).toBeNull() @@ -286,22 +352,22 @@ describe('IndexEngine', () => { engine.tripleExtractor = { extractFromChunk: vi.fn().mockResolvedValue({ entities: [] }), } as any - const retried = await engine.ingestBatch(bucket.id, [{ doc, chunks }], { graphExtraction: true }) + const retried = await engine.ingestBatch(bucket.id, [{ source, chunks }], { graphExtraction: true }) expect(retried.skipped).toBe(0) expect(retried.inserted).toBe(0) expect(retried.updated).toBe(1) expect(await adapter.hashStore.get(storeKey)).not.toBeNull() - expect(adapter.calls.some(c => c.method === 'upsertDocument')).toBe(true) - expect(adapter.calls.filter(c => c.method === 'updateDocumentStatus').at(-1)!.args[1]).toBe('complete') + expect(adapter.calls.some(c => c.method === 'upsertSourceChunks')).toBe(true) + expect(adapter.calls.filter(c => c.method === 'updateSourceStatus').at(-1)!.args[1]).toBe('complete') }) it('serializes graph extraction even when concurrency is higher', async () => { - const docs = [ - createTestDocument({ id: undefined, title: 'Doc A', url: 'https://example.com/a', content: 'Alice met Bob.' }), - createTestDocument({ id: undefined, title: 'Doc B', url: 'https://example.com/b', content: 'Carol met Dana.' }), + const sources = [ + createTestSource({ id: undefined, title: 'Source A', url: 'https://example.com/a', content: 'Alice met Bob.' }), + createTestSource({ id: undefined, title: 'Source B', url: 'https://example.com/b', content: 'Carol met Dana.' }), ] - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) let active = 0 let maxActive = 0 const extractFromChunk = vi.fn(async () => { @@ -316,7 +382,7 @@ describe('IndexEngine', () => { await engine.ingestBatch( bucket.id, - docs.map(doc => ({ doc, chunks: [{ content: doc.content, chunkIndex: 0 }] })), + sources.map(source => ({ source, chunks: [{ content: source.content, chunkIndex: 0 }] })), { graphExtraction: true, concurrency: 2 }, ) @@ -327,14 +393,14 @@ describe('IndexEngine', () => { describe('ingestWithChunks', () => { it('ingests pre-built chunks', async () => { - const doc = createTestDocument() - const { bucket } = createMockBucket({ documents: [] }) + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) const chunks = [ { content: 'Chunk 0', chunkIndex: 0 }, { content: 'Chunk 1', chunkIndex: 1 }, ] const engine = new IndexEngine(adapter, embedding) - const result = await engine.ingestWithChunks(bucket.id, doc, chunks) + const result = await engine.ingestWithChunks(bucket.id, source, chunks) expect(result.inserted).toBe(1) expect(result.total).toBe(1) @@ -342,43 +408,55 @@ describe('IndexEngine', () => { expect(stored).toHaveLength(2) }) + it('treats null opts as omitted', async () => { + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) + const chunks = [{ content: 'Chunk 0', chunkIndex: 0 }] + const engine = new IndexEngine(adapter, embedding) + + const result = await engine.ingestWithChunks(bucket.id, source, chunks, null) + + expect(result.inserted).toBe(1) + expect(result.total).toBe(1) + }) + it('supports dryRun', async () => { - const doc = createTestDocument() - const { bucket } = createMockBucket({ documents: [] }) + const source = 
createTestSource() + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Chunk 0', chunkIndex: 0 }] const engine = new IndexEngine(adapter, embedding) - const result = await engine.ingestWithChunks(bucket.id, doc, chunks, { dryRun: true }) + const result = await engine.ingestWithChunks(bucket.id, source, chunks, { dryRun: true }) expect(result.inserted).toBe(1) - expect(adapter.calls.filter(c => c.method === 'upsertDocument')).toHaveLength(0) + expect(adapter.calls.filter(c => c.method === 'upsertSourceChunks')).toHaveLength(0) }) it('sets status to failed on error', async () => { - const doc = createTestDocument() - const { bucket } = createMockBucket({ documents: [] }) + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Chunk 0', chunkIndex: 0 }] const failEmbedding = createMockEmbedding() failEmbedding.embedBatch = async () => { throw new Error('Embed failed') } const engine = new IndexEngine(adapter, failEmbedding) - await expect(engine.ingestWithChunks(bucket.id, doc, chunks)).rejects.toThrow('Embed failed') + await expect(engine.ingestWithChunks(bucket.id, source, chunks)).rejects.toThrow('Embed failed') - const statusCalls = adapter.calls.filter(c => c.method === 'updateDocumentStatus') + const statusCalls = adapter.calls.filter(c => c.method === 'updateSourceStatus') if (statusCalls.length > 0) { expect(statusCalls[statusCalls.length - 1]!.args[1]).toBe('failed') } }) it('reports triple extraction exceptions as errors, not timeouts', async () => { - const doc = createTestDocument() - const { bucket } = createMockBucket({ documents: [] }) + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Alice met Bob.', chunkIndex: 0 }] const engine = new IndexEngine(adapter, embedding) engine.tripleExtractor = { extractFromChunk: vi.fn().mockRejectedValue(new Error('No output generated.')), } as any - const result = await engine.ingestWithChunks(bucket.id, doc, chunks, { graphExtraction: true }) + const result = await engine.ingestWithChunks(bucket.id, source, chunks, { graphExtraction: true }) expect(result.extraction?.failed).toBe(1) expect(result.extraction?.failedChunks?.[0]).toEqual(expect.objectContaining({ @@ -387,78 +465,62 @@ describe('IndexEngine', () => { })) }) - it('uses canonical document id for pre-chunked reprocessing', async () => { - const doc = createTestDocument({ + it('uses canonical source id for pre-chunked reprocessing', async () => { + const source = createTestSource({ id: undefined, content: 'Canonical pre-chunked content about Alice and Bob.', - title: 'Canonical Prechunked Document', + title: 'Canonical Prechunked Source', url: 'https://example.com/canonical-prechunked', }) - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) const chunks = [{ content: 'Alice met Bob.', chunkIndex: 0 }] - const persistPassageNodes = vi.fn().mockResolvedValue(undefined) const extractFromChunk = vi.fn().mockResolvedValue({ entities: [] }) const engine = new IndexEngine(adapter, embedding) - engine.tripleExtractor = { persistPassageNodes, extractFromChunk } as any + engine.tripleExtractor = { extractFromChunk } as any - await engine.ingestWithChunks(bucket.id, doc, chunks, { graphExtraction: true }) - const canonicalId = adapter._chunks.get(embeddingModelKey(embedding))![0]!.documentId + await engine.ingestWithChunks(bucket.id, source, chunks, { graphExtraction: true }) 
+ const canonicalId = adapter._chunks.get(embeddingModelKey(embedding))![0]!.sourceId adapter.calls.length = 0 - persistPassageNodes.mockClear() extractFromChunk.mockClear() - const result = await engine.ingestWithChunks(bucket.id, doc, chunks, { graphExtraction: true }) + const result = await engine.ingestWithChunks(bucket.id, source, chunks, { graphExtraction: true }) expect(result.inserted).toBe(0) expect(result.updated).toBe(1) - const upsertCall = adapter.calls.find(c => c.method === 'upsertDocument')! - expect((upsertCall.args[1] as Array<{ documentId: string }>)[0]!.documentId).toBe(canonicalId) - expect(persistPassageNodes.mock.calls[0]![0][0].documentId).toBe(canonicalId) + const upsertCall = adapter.calls.find(c => c.method === 'upsertSourceChunks')! + expect((upsertCall.args[1] as Array<{ sourceId: string }>)[0]!.sourceId).toBe(canonicalId) expect(extractFromChunk.mock.calls[0]![3]).toBe(canonicalId) - expect(adapter.calls.filter(c => c.method === 'updateDocumentStatus').at(-1)!.args[0]).toBe(canonicalId) + expect(adapter.calls.filter(c => c.method === 'updateSourceStatus').at(-1)!.args[0]).toBe(canonicalId) }) - it('persists passage nodes before graph extraction', async () => { - const doc = createTestDocument({ id: 'doc-passages' }) - const { bucket } = createMockBucket({ documents: [] }) + it('extracts graph facts from chunks without graph-owned chunk persistence', async () => { + const source = createTestSource({ id: 'source-chunks' }) + const { bucket } = createMockBucket({ sources: [] }) const chunks = [ { content: 'Alice met Bob.', chunkIndex: 0 }, { content: 'Bob works at Acme.', chunkIndex: 1 }, ] - const persistPassageNodes = vi.fn().mockResolvedValue(undefined) const extractFromChunk = vi.fn().mockResolvedValue({ entities: [] }) const engine = new IndexEngine(adapter, embedding) - engine.tripleExtractor = { persistPassageNodes, extractFromChunk } as any - - await engine.ingestWithChunks(bucket.id, doc, chunks, { graphExtraction: true, tenantId: 'tenant-1' }) - const ikey = resolveIdempotencyKey(doc, ['url']) - const modelId = embeddingModelKey(embedding) - - expect(persistPassageNodes).toHaveBeenCalledTimes(1) - expect(persistPassageNodes.mock.calls[0]![0]).toEqual([ - expect.objectContaining({ - bucketId: bucket.id, - documentId: 'doc-passages', - chunkIndex: 0, - chunkId: chunkIdFor({ embeddingModel: modelId, bucketId: bucket.id, idempotencyKey: ikey, chunkIndex: 0 }), - tenantId: 'tenant-1', - }), - expect.objectContaining({ - bucketId: bucket.id, - documentId: 'doc-passages', - chunkIndex: 1, - chunkId: chunkIdFor({ embeddingModel: modelId, bucketId: bucket.id, idempotencyKey: ikey, chunkIndex: 1 }), - tenantId: 'tenant-1', - }), - ]) - const upsertCallIndex = adapter.calls.findIndex(call => call.method === 'upsertDocument') + engine.tripleExtractor = { extractFromChunk } as any + + await engine.ingestWithChunks(bucket.id, source, chunks, { graphExtraction: true, tenantId: 'tenant-1' }) + + const upsertCallIndex = adapter.calls.findIndex(call => call.method === 'upsertSourceChunks') expect(upsertCallIndex).toBeGreaterThanOrEqual(0) - expect(extractFromChunk).toHaveBeenCalled() + expect(extractFromChunk).toHaveBeenCalledTimes(2) + expect(extractFromChunk.mock.calls[0]).toEqual(expect.arrayContaining([ + 'Alice met Bob.', + bucket.id, + 0, + 'source-chunks', + ])) + expect(extractFromChunk.mock.calls[0]![7]).toEqual(expect.objectContaining({ tenantId: 'tenant-1' })) }) it('passes accumulated entity context to later chunks', async () => { - const doc = 
createTestDocument() - const { bucket } = createMockBucket({ documents: [] }) + const source = createTestSource() + const { bucket } = createMockBucket({ sources: [] }) const chunks = [ { content: 'Cole Conway entered the saloon.', chunkIndex: 0 }, { content: 'Conway met Steve Sharp there.', chunkIndex: 1 }, @@ -469,7 +531,7 @@ describe('IndexEngine', () => { const engine = new IndexEngine(adapter, embedding) engine.tripleExtractor = { extractFromChunk } as any - await engine.ingestWithChunks(bucket.id, doc, chunks, { graphExtraction: true }) + await engine.ingestWithChunks(bucket.id, source, chunks, { graphExtraction: true }) expect(extractFromChunk).toHaveBeenCalledTimes(2) expect(extractFromChunk.mock.calls[1]![5]).toEqual([ diff --git a/packages/sdk/src/__tests__/integration.test.ts b/packages/sdk/src/__tests__/integration.test.ts index 75323f7..f29f040 100644 --- a/packages/sdk/src/__tests__/integration.test.ts +++ b/packages/sdk/src/__tests__/integration.test.ts @@ -3,7 +3,7 @@ import { typegraphInit } from '../typegraph.js' import { createMockAdapter } from './helpers/mock-adapter.js' import { createMockEmbedding } from './helpers/mock-embedding.js' import { createMockBucket } from './helpers/mock-source.js' -import { createTestDocument, createTestDocuments } from './helpers/mock-connector.js' +import { createTestSource, createTestSources } from './helpers/mock-connector.js' import type { typegraphInstance } from '../typegraph.js' import type { Bucket } from '../types/bucket.js' import type { EmbeddingProvider } from '../embedding/provider.js' @@ -21,11 +21,11 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(3) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(3) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) - const response = await instance.query('Document 1', { context: { format: 'xml' } }) + const response = await instance.query('Source 1', { context: { format: 'xml' } }) expect(response.results.chunks.length).toBeGreaterThan(0) expect(response.context).toContain('') expect(response.context).toContain('') @@ -38,12 +38,12 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = [createTestDocument({ id: 'doc-1', content: 'Original content for testing' })] - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = [createTestSource({ id: 'source-1', content: 'Original content for testing' })] + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(docs, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) - const updatedDocs = [createTestDocument({ id: 'doc-1', content: 'Updated content with new information' })] + const updatedDocs = [createTestSource({ id: 'source-1', content: 'Updated content with new information' })] await instance.ingest(updatedDocs, { ...ingestOptions, bucketId: bucket.id }) const response = await instance.query('Updated content') @@ -56,8 +56,8 @@ describe('integration', () 
=> { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket: source1, documents: docs1, ingestOptions: ingestOptions1 } = createMockBucket({ id: 'src-1', documents: createTestDocuments(2, 'Alpha') }) - const { bucket: source2, documents: docs2, ingestOptions: ingestOptions2 } = createMockBucket({ id: 'src-2', documents: createTestDocuments(2, 'Beta') }) + const { bucket: source1, sources: docs1, ingestOptions: ingestOptions1 } = createMockBucket({ id: 'src-1', sources: createTestSources(2, 'Alpha') }) + const { bucket: source2, sources: docs2, ingestOptions: ingestOptions2 } = createMockBucket({ id: 'src-2', sources: createTestSources(2, 'Beta') }) registerTestBucket(instance, source1, embedding) registerTestBucket(instance, source2, embedding) @@ -66,7 +66,7 @@ describe('integration', () => { const response = await instance.query('content') expect(response.results.chunks.length).toBeGreaterThan(0) - const bucketIds = new Set(response.results.chunks.map(r => r.document.bucketId)) + const bucketIds = new Set(response.results.chunks.map(r => r.source.bucketId)) expect(bucketIds.size).toBeGreaterThanOrEqual(1) }) @@ -76,8 +76,8 @@ describe('integration', () => { const embeddingB = createMockEmbedding({ model: 'model-b', dimensions: 4 }) const instance = await typegraphInit({ vectorStore: adapter, embedding: embeddingA }) - const { bucket: source1, documents: docs1, ingestOptions: ingestOptions1 } = createMockBucket({ id: 'src-1', documents: createTestDocuments(2, 'Alpha') }) - const { bucket: source2, documents: docs2, ingestOptions: ingestOptions2 } = createMockBucket({ id: 'src-2', documents: createTestDocuments(2, 'Beta') }) + const { bucket: source1, sources: docs1, ingestOptions: ingestOptions1 } = createMockBucket({ id: 'src-1', sources: createTestSources(2, 'Alpha') }) + const { bucket: source2, sources: docs2, ingestOptions: ingestOptions2 } = createMockBucket({ id: 'src-2', sources: createTestSources(2, 'Beta') }) registerTestBucket(instance, source1, embeddingA) registerTestBucket(instance, source2, embeddingB) @@ -93,11 +93,11 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(2) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(2) }) registerTestBucket(instance, bucket, embedding) - const result1 = await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) - const result2 = await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + const result1 = await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) + const result2 = await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) expect(result1.inserted).toBe(2) expect(result2.skipped).toBe(2) @@ -109,14 +109,14 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(2) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(2) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id, tenantId: 'tenant-a' }) - await instance.ingest(documents, { ...ingestOptions, 
bucketId: bucket.id, tenantId: 'tenant-b' }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId: 'tenant-a' }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId: 'tenant-b' }) - const responseA = await instance.query('Document', { tenantId: 'tenant-a' }) - const responseB = await instance.query('Document', { tenantId: 'tenant-b' }) + const responseA = await instance.query('Source', { tenantId: 'tenant-a' }) + const responseB = await instance.query('Source', { tenantId: 'tenant-b' }) expect(responseA.query.tenantId).toBe('tenant-a') expect(responseB.query.tenantId).toBe('tenant-b') @@ -127,15 +127,15 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) registerTestBucket(instance, bucket, embedding) - const doc = createTestDocument({ content: 'Ingested document content' }) + const source = createTestSource({ content: 'Ingested source content' }) const chunks = [ { content: 'Chunk zero text', chunkIndex: 0 }, { content: 'Chunk one text', chunkIndex: 1 }, ] - await instance.ingestPreChunked(doc, chunks, { bucketId: bucket.id }) + await instance.ingestPreChunked(source, chunks, { bucketId: bucket.id }) const response = await instance.query('Chunk zero text') expect(response.results.chunks.length).toBeGreaterThan(0) @@ -146,13 +146,13 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(2) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(2) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) - const xmlResponse = await instance.query('Document', { context: { format: 'xml' } }) - const mdResponse = await instance.query('Document', { context: { format: 'markdown' } }) - const plainResponse = await instance.query('Document', { context: { format: 'plain' } }) + const xmlResponse = await instance.query('Source', { context: { format: 'xml' } }) + const mdResponse = await instance.query('Source', { context: { format: 'markdown' } }) + const plainResponse = await instance.query('Source', { context: { format: 'plain' } }) expect(xmlResponse.context).toContain('') expect(mdResponse.context).toContain('# Context') @@ -172,11 +172,11 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = createTestDocuments(1, 'PrivateDoc') - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = createTestSources(1, 'PrivateDoc') + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(docs, { + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId, @@ -199,13 +199,13 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = createTestDocuments(1, 'PublicDoc') - const { bucket, ingestOptions } = 
createMockBucket({ documents: docs }) + const sources = createTestSources(1, 'PublicDoc') + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) // No tenantId, no visibility — the basic RAG case where a developer - // just wants to index docs and query them without any identity scoping. - await instance.ingest(docs, { ...ingestOptions, bucketId: bucket.id }) + // just wants to index sources and query them without any identity scoping. + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) const unscoped = await instance.query('PublicDoc') expect(unscoped.results.chunks.length).toBeGreaterThan(0) @@ -216,18 +216,18 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = createTestDocuments(1, 'TenantGated') - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = createTestSources(1, 'TenantGated') + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(docs, { + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId, visibility: 'tenant', }) - // No tenantId on the query → blocked, even though the doc is 'tenant' visibility. + // No tenantId on the query → blocked, even though the source is 'tenant' visibility. const unscoped = await instance.query('TenantGated') expect(unscoped.results.chunks).toHaveLength(0) @@ -241,13 +241,13 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = createTestDocuments(1, 'TenantDoc') - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = createTestSources(1, 'TenantDoc') + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) // Tenant-visible rows intentionally carry no userId so any tenant-level // caller sees them regardless of their own userId. - await instance.ingest(docs, { + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId, @@ -263,11 +263,11 @@ describe('integration', () => { const embedding = createMockEmbedding() const instance = await typegraphInit({ vectorStore: adapter, embedding }) - const docs = createTestDocuments(1, 'MigratingDoc') - const { bucket, ingestOptions } = createMockBucket({ documents: docs }) + const sources = createTestSources(1, 'MigratingDoc') + const { bucket, ingestOptions } = createMockBucket({ sources: sources }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(docs, { + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id, tenantId, @@ -278,7 +278,7 @@ describe('integration', () => { const before = await instance.query('MigratingDoc', { tenantId }) expect(before.results.chunks.length).toBeGreaterThan(0) - // Simulate the chunk-level cascade PgVectorAdapter.updateDocument() applies. + // Simulate the chunk-level cascade PgVectorAdapter.updateSource() applies. 
for (const chunks of adapter._chunks.values()) { for (const c of chunks) c.visibility = 'user' } @@ -304,10 +304,10 @@ describe('integration', () => { hooks: { onIndexStart, onIndexComplete, onQueryResults }, }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(2) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(2) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) expect(onIndexStart).toHaveBeenCalled() expect(onIndexComplete).toHaveBeenCalled() diff --git a/packages/sdk/src/__tests__/merger.test.ts b/packages/sdk/src/__tests__/merger.test.ts index 6a71f83..d406d26 100644 --- a/packages/sdk/src/__tests__/merger.test.ts +++ b/packages/sdk/src/__tests__/merger.test.ts @@ -5,7 +5,7 @@ function makeResult(overrides: Partial = {}): RetrievalCandi return { content: 'Test content', bucketId: 'src-1', - documentId: 'doc-1', + sourceId: 'source-1', rawScores: { semantic: 0.9 }, normalizedScore: 0.9, mode: 'indexed', @@ -15,22 +15,22 @@ function makeResult(overrides: Partial = {}): RetrievalCandi } describe('dedupKey', () => { - it('uses stable chunk identity when bucket, document, and chunk are available', () => { + it('uses stable chunk identity when bucket, source, and chunk are available', () => { const r = makeResult({ url: 'https://example.com/page', chunk: { index: 2, total: 5 } }) const key = dedupKey(r) - expect(key).toBe('src-1:doc-1:2') + expect(key).toBe('src-1:source-1:2') }) it('falls back to content hash when chunk identity is unavailable', () => { - const r = makeResult({ url: 'https://example.com/page', documentId: '', chunk: undefined }) + const r = makeResult({ url: 'https://example.com/page', sourceId: '', chunk: undefined }) const key = dedupKey(r) expect(key).toHaveLength(64) expect(key).toMatch(/^[0-9a-f]{64}$/) }) - it('same content produces same fallback key regardless of documentId', () => { - const a = makeResult({ content: 'hello world', documentId: '', chunk: undefined }) - const b = makeResult({ content: 'hello world', documentId: 'graph-0', chunk: undefined }) + it('same content produces same fallback key regardless of sourceId', () => { + const a = makeResult({ content: 'hello world', sourceId: '', chunk: undefined }) + const b = makeResult({ content: 'hello world', sourceId: 'graph-0', chunk: undefined }) expect(dedupKey(a)).toBe(dedupKey(b)) }) @@ -178,14 +178,14 @@ describe('mergeAndRank', () => { it('deduplicates by content', () => { const group1 = [makeResult({ content: 'same content', normalizedScore: 0.9, mode: 'indexed', rawScores: { semantic: 0.8 } })] - const group2 = [makeResult({ content: 'same content', normalizedScore: 0.8, mode: 'graph', documentId: 'graph-0', rawScores: { graph: 0.7 } })] + const group2 = [makeResult({ content: 'same content', normalizedScore: 0.8, mode: 'graph', sourceId: 'graph-0', rawScores: { graph: 0.7 } })] const merged = mergeAndRank([group1, group2], 10) expect(merged).toHaveLength(1) }) it('aggregates rawScores across runners', () => { const indexed = [makeResult({ content: 'shared', normalizedScore: 0.9, mode: 'indexed', rawScores: { semantic: 0.8, keyword: 0.3 } })] - const graph = [makeResult({ content: 'shared', normalizedScore: 0.7, mode: 'graph', documentId: 'graph-0', rawScores: { graph: 0.6 } })] + const graph = [makeResult({ content: 'shared', normalizedScore: 0.7, mode: 
'graph', sourceId: 'graph-0', rawScores: { graph: 0.6 } })] const merged = mergeAndRank([indexed, graph], 10) expect(merged).toHaveLength(1) const result = merged[0]! @@ -196,7 +196,7 @@ describe('mergeAndRank', () => { it('tracks modes from contributing runners', () => { const indexed = [makeResult({ content: 'shared', mode: 'indexed', rawScores: { semantic: 0.8 } })] - const graph = [makeResult({ content: 'shared', mode: 'graph', documentId: 'graph-0', rawScores: { graph: 0.6 } })] + const graph = [makeResult({ content: 'shared', mode: 'graph', sourceId: 'graph-0', rawScores: { graph: 0.6 } })] const merged = mergeAndRank([indexed, graph], 10) const result = merged[0] as any expect(result.modes).toContain('indexed') @@ -284,7 +284,7 @@ describe('mergeAndRank', () => { it('cross-runner dedup: indexed + graph with same content → 1 result with both scores', () => { const indexed = [makeResult({ content: 'Golden State Warriors are awesome', - documentId: 'doc-123', + sourceId: 'source-123', mode: 'indexed', normalizedScore: 0.8, rawScores: { semantic: 0.75, keyword: 0.4 }, @@ -292,7 +292,7 @@ describe('mergeAndRank', () => { })] const graph = [makeResult({ content: 'Golden State Warriors are awesome', - documentId: 'doc-123', + sourceId: 'source-123', mode: 'graph', normalizedScore: 0.15, rawScores: { graph: 0.15 }, diff --git a/packages/sdk/src/__tests__/mock-adapter.test.ts b/packages/sdk/src/__tests__/mock-adapter.test.ts index 7c1c711..3a08a9b 100644 --- a/packages/sdk/src/__tests__/mock-adapter.test.ts +++ b/packages/sdk/src/__tests__/mock-adapter.test.ts @@ -1,14 +1,14 @@ import { describe, it, expect, beforeEach } from 'vitest' import { createMockAdapter, createMockHashStore } from './helpers/mock-adapter.js' import { createMockEmbedding } from './helpers/mock-embedding.js' -import type { EmbeddedChunk } from '../types/document.js' +import type { EmbeddedChunk } from '../types/chunk.js' function makeChunk(overrides: Partial = {}): EmbeddedChunk { return { id: overrides.id ?? `chunk-${overrides.idempotencyKey ?? 'key-1'}-${overrides.chunkIndex ?? 
0}`, idempotencyKey: 'key-1', bucketId: 'src-1', - documentId: 'doc-1', + sourceId: 'source-1', content: 'Test chunk content', embedding: [0.1, 0.2, 0.3, 0.4], embeddingModel: 'mock-embed-v1', @@ -41,16 +41,16 @@ describe('MockAdapter', () => { expect(adapter._chunks.has('test-model')).toBe(true) }) - it('upsertDocument stores chunks', async () => { + it('upsertSourceChunks stores chunks', async () => { await adapter.ensureModel('model', 4) const chunk = makeChunk() - await adapter.upsertDocument('model', [chunk]) + await adapter.upsertSourceChunks('model', [chunk]) expect(adapter._chunks.get('model')).toHaveLength(1) }) it('search retrieves by cosine similarity', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ + await adapter.upsertSourceChunks('model', [ makeChunk({ embedding: [1, 0, 0, 0], content: 'A' }), makeChunk({ embedding: [0, 1, 0, 0], content: 'B', idempotencyKey: 'key-2' }), ]) @@ -62,7 +62,7 @@ describe('MockAdapter', () => { it('search sorts by cosine similarity', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ + await adapter.upsertSourceChunks('model', [ makeChunk({ embedding: [0, 0, 0, 1], content: 'Far', idempotencyKey: 'k1' }), makeChunk({ embedding: [0.9, 0.1, 0, 0], content: 'Close', idempotencyKey: 'k2' }), makeChunk({ embedding: [1, 0, 0, 0], content: 'Exact', idempotencyKey: 'k3' }), @@ -75,7 +75,7 @@ describe('MockAdapter', () => { it('hybridSearch includes keyword matching', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ + await adapter.upsertSourceChunks('model', [ makeChunk({ embedding: [0.5, 0.5, 0, 0], content: 'JavaScript programming', idempotencyKey: 'k1' }), makeChunk({ embedding: [0.5, 0.5, 0, 0], content: 'Python scripting', idempotencyKey: 'k2' }), ]) @@ -89,16 +89,16 @@ describe('MockAdapter', () => { it('upsert replaces existing chunks', async () => { await adapter.ensureModel('model', 4) const chunk = makeChunk({ content: 'Original' }) - await adapter.upsertDocument('model', [chunk]) + await adapter.upsertSourceChunks('model', [chunk]) const updated = makeChunk({ content: 'Updated' }) - await adapter.upsertDocument('model', [updated]) + await adapter.upsertSourceChunks('model', [updated]) expect(adapter._chunks.get('model')).toHaveLength(1) expect(adapter._chunks.get('model')![0]!.content).toBe('Updated') }) it('delete by filter', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ + await adapter.upsertSourceChunks('model', [ makeChunk({ bucketId: 'src-1', idempotencyKey: 'k1' }), makeChunk({ bucketId: 'src-2', idempotencyKey: 'k2' }), ]) @@ -109,7 +109,7 @@ describe('MockAdapter', () => { it('countChunks by filter', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ + await adapter.upsertSourceChunks('model', [ makeChunk({ bucketId: 'src-1', idempotencyKey: 'k1' }), makeChunk({ bucketId: 'src-1', idempotencyKey: 'k2', chunkIndex: 1 }), makeChunk({ bucketId: 'src-2', idempotencyKey: 'k3' }), @@ -118,46 +118,46 @@ describe('MockAdapter', () => { expect(count).toBe(2) }) - it('document records: upsert and retrieve', async () => { - const doc = await adapter.upsertDocumentRecord!({ + it('source records: upsert and retrieve', async () => { + const source = await adapter.upsertSourceRecord!({ bucketId: 'src-1', title: 'Test', contentHash: 'abc', chunkCount: 5, status: 'complete', }) - expect(doc.id).toBeDefined() - 
expect(doc.title).toBe('Test') - expect(doc.status).toBe('complete') + expect(source.id).toBeDefined() + expect(source.title).toBe('Test') + expect(source.status).toBe('complete') - const retrieved = await adapter.getDocument!(doc.id) + const retrieved = await adapter.getSource!(source.id) expect(retrieved).toBeDefined() - expect(retrieved!.id).toBe(doc.id) + expect(retrieved!.id).toBe(source.id) }) - it('updateDocumentStatus', async () => { - const doc = await adapter.upsertDocumentRecord!({ + it('updateSourceStatus', async () => { + const source = await adapter.upsertSourceRecord!({ bucketId: 'src-1', title: 'Test', contentHash: 'abc', chunkCount: 0, status: 'processing', }) - await adapter.updateDocumentStatus!(doc.id, 'complete', 10) - const updated = await adapter.getDocument!(doc.id) + await adapter.updateSourceStatus!(source.id, 'complete', 10) + const updated = await adapter.getSource!(source.id) expect(updated!.status).toBe('complete') expect(updated!.chunkCount).toBe(10) }) it('getChunksByRange returns chunks in range', async () => { await adapter.ensureModel('model', 4) - await adapter.upsertDocument('model', [ - makeChunk({ documentId: 'doc-1', chunkIndex: 0, content: 'C0', idempotencyKey: 'k0' }), - makeChunk({ documentId: 'doc-1', chunkIndex: 1, content: 'C1', idempotencyKey: 'k1' }), - makeChunk({ documentId: 'doc-1', chunkIndex: 2, content: 'C2', idempotencyKey: 'k2' }), - makeChunk({ documentId: 'doc-1', chunkIndex: 3, content: 'C3', idempotencyKey: 'k3' }), + await adapter.upsertSourceChunks('model', [ + makeChunk({ sourceId: 'source-1', chunkIndex: 0, content: 'C0', idempotencyKey: 'k0' }), + makeChunk({ sourceId: 'source-1', chunkIndex: 1, content: 'C1', idempotencyKey: 'k1' }), + makeChunk({ sourceId: 'source-1', chunkIndex: 2, content: 'C2', idempotencyKey: 'k2' }), + makeChunk({ sourceId: 'source-1', chunkIndex: 3, content: 'C3', idempotencyKey: 'k3' }), ]) - const result = await adapter.getChunksByRange!('model', 'doc-1', 1, 2) + const result = await adapter.getChunksByRange!('model', 'source-1', 1, 2) expect(result).toHaveLength(2) expect(result[0]!.chunkIndex).toBe(1) expect(result[1]!.chunkIndex).toBe(2) diff --git a/packages/sdk/src/__tests__/ontology.test.ts b/packages/sdk/src/__tests__/ontology.test.ts new file mode 100644 index 0000000..10cada1 --- /dev/null +++ b/packages/sdk/src/__tests__/ontology.test.ts @@ -0,0 +1,69 @@ +import { describe, expect, it } from 'vitest' +import { + ALIAS_RELATION_CUES, + ALL_PREDICATES, + ENTITY_TYPES, + GENERIC_DISALLOWED_PREDICATES, + PREDICATE_SPECS, + SYMMETRIC_PREDICATES, + normalizePredicateWithDirection, + validatePredicateTypes, +} from '../index-engine/ontology.js' + +describe('ontology registry', () => { + it('keeps entity types and predicates unique and centralized', () => { + expect(new Set(ENTITY_TYPES).size).toBe(ENTITY_TYPES.length) + expect(new Set(PREDICATE_SPECS.map(spec => spec.name)).size).toBe(PREDICATE_SPECS.length) + for (const spec of PREDICATE_SPECS) { + expect(spec.category).toBeTruthy() + expect(spec.description).toBeTruthy() + expect(spec.domain.length).toBeGreaterThan(0) + expect(spec.range.length).toBeGreaterThan(0) + expect(ALL_PREDICATES.has(spec.name)).toBe(true) + } + }) + + it('normalizes simplified aliases through the registry', () => { + expect(normalizePredicateWithDirection('CO_FOUNDED')).toEqual(expect.objectContaining({ + predicate: 'FOUNDED', + valid: true, + swapSubjectObject: false, + })) + expect(normalizePredicateWithDirection('FOUNDED_BY')).toEqual(expect.objectContaining({ + 
predicate: 'FOUNDED', + valid: true, + swapSubjectObject: true, + })) + expect(normalizePredicateWithDirection('WORKED_AS')).toEqual(expect.objectContaining({ + predicate: 'WORKS_AS', + temporalStatus: 'former', + })) + expect(normalizePredicateWithDirection('SUPPORTED')).toEqual(expect.objectContaining({ + predicate: 'SUPPORTS', + valid: true, + })) + }) + + it('promotes IS_A and rejects alias cues as graph predicates', () => { + expect(normalizePredicateWithDirection('IS_A')).toEqual(expect.objectContaining({ + predicate: 'IS_A', + valid: true, + })) + expect(GENERIC_DISALLOWED_PREDICATES.has('IS_A')).toBe(false) + for (const cue of ALIAS_RELATION_CUES) { + expect(normalizePredicateWithDirection(cue).valid).toBe(false) + } + }) + + it('exposes symmetry and soft domain/range validation metadata', () => { + expect(SYMMETRIC_PREDICATES.has('MARRIED')).toBe(true) + expect(validatePredicateTypes('WORKS_FOR', 'person', 'organization')).toEqual(expect.objectContaining({ + valid: true, + })) + expect(validatePredicateTypes('WORKS_FOR', 'issue', 'document')).toEqual(expect.objectContaining({ + valid: false, + reason: 'domain-range-mismatch', + })) + }) +}) + diff --git a/packages/sdk/src/__tests__/query-planner.test.ts b/packages/sdk/src/__tests__/query-planner.test.ts index 489a64b..cbd336e 100644 --- a/packages/sdk/src/__tests__/query-planner.test.ts +++ b/packages/sdk/src/__tests__/query-planner.test.ts @@ -3,12 +3,13 @@ import { QueryPlanner } from '../query/planner.js' import { createMockAdapter } from './helpers/mock-adapter.js' import { createMockEmbedding } from './helpers/mock-embedding.js' import { createMockBucket } from './helpers/mock-source.js' -import { createTestDocuments } from './helpers/mock-connector.js' +import { createTestSources } from './helpers/mock-connector.js' import { IndexEngine } from '../index-engine/engine.js' import { defaultChunker } from '../index-engine/chunker.js' import type { EmbeddingProvider } from '../embedding/provider.js' -import type { KnowledgeGraphBridge } from '../types/graph-bridge.js' +import type { KnowledgeGraphBridge, MemoryBridge } from '../types/graph-bridge.js' import type { typegraphEvent, typegraphEventSink } from '../types/events.js' +import type { ExternalId, MemoryRecord } from '../memory/types/memory.js' describe('QueryPlanner', () => { let adapter: ReturnType @@ -22,21 +23,21 @@ describe('QueryPlanner', () => { bucketIds = [] bucketEmbeddings = new Map() - const docs = createTestDocuments(3) - const { bucket, ingestOptions, chunkOpts } = createMockBucket({ id: 'src-1', documents: docs }) + const sources = createTestSources(3) + const { bucket, ingestOptions, chunkOpts } = createMockBucket({ id: 'src-1', sources: sources }) bucketIds.push(bucket.id) bucketEmbeddings.set(bucket.id, embedding) await adapter.deploy() await adapter.connect() const engine = new IndexEngine(adapter, embedding) - const items = await Promise.all(docs.map(async doc => ({ doc, chunks: await defaultChunker(doc, chunkOpts) }))) + const items = await Promise.all(sources.map(async source => ({ source, chunks: await defaultChunker(source, chunkOpts) }))) await engine.ingestBatch(bucket.id, items, ingestOptions) }) it('returns results for indexed sources', async () => { const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) - const response = await planner.execute('Document 1') + const response = await planner.execute('Source 1') expect(response.results.chunks.length).toBeGreaterThan(0) 
expect(response.results.chunks[0]!.content).toBeDefined() expect(response.results.facts).toEqual([]) @@ -44,6 +45,13 @@ describe('QueryPlanner', () => { expect(response.results.memories).toEqual([]) }) + it('treats null execute opts as omitted', async () => { + const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) + const response = await planner.execute('Source 1', null) + + expect(response.results.chunks.length).toBeGreaterThan(0) + }) + it('respects count', async () => { const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) const response = await planner.execute('test query', { count: 1 }) @@ -54,7 +62,7 @@ describe('QueryPlanner', () => { const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) adapter.calls.length = 0 - const response = await planner.execute('Document 1', { + const response = await planner.execute('Source 1', { signals: { semantic: false, keyword: true }, count: 2, }) @@ -70,18 +78,18 @@ describe('QueryPlanner', () => { }) it('filters to requested sources', async () => { - const docs2 = createTestDocuments(2, 'Other') - const { bucket: bucket2, ingestOptions: ingestOptions2, chunkOpts: chunkOpts2 } = createMockBucket({ id: 'src-2', documents: docs2 }) + const docs2 = createTestSources(2, 'Other') + const { bucket: bucket2, ingestOptions: ingestOptions2, chunkOpts: chunkOpts2 } = createMockBucket({ id: 'src-2', sources: docs2 }) bucketIds.push(bucket2.id) bucketEmbeddings.set(bucket2.id, embedding) const engine = new IndexEngine(adapter, embedding) - const items = await Promise.all(docs2.map(async doc => ({ doc, chunks: await defaultChunker(doc, chunkOpts2) }))) + const items = await Promise.all(docs2.map(async source => ({ source, chunks: await defaultChunker(source, chunkOpts2) }))) await engine.ingestBatch(bucket2.id, items, ingestOptions2) const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) const response = await planner.execute('test', { buckets: ['src-1'] }) for (const r of response.results.chunks) { - expect(r.document.bucketId).toBe('src-1') + expect(r.source.bucketId).toBe('src-1') } }) @@ -117,12 +125,12 @@ describe('QueryPlanner', () => { } const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, undefined, eventSink) - const response = await planner.execute('Document 1', { count: 2 }) + const response = await planner.execute('Source 1', { count: 2 }) const queryEvents = events.filter(event => event.eventType === 'query.execute') expect(queryEvents).toHaveLength(1) expect(queryEvents[0]!.payload).toMatchObject({ - query: 'Document 1', + query: 'Source 1', requested_count: 2, result_count: response.results.chunks.length + response.results.memories.length, chunk_count: response.results.chunks.length, @@ -137,7 +145,7 @@ describe('QueryPlanner', () => { it('maps results to structured query response shape', async () => { const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) - const response = await planner.execute('Document 1') + const response = await planner.execute('Source 1') expect(response.results).toHaveProperty('chunks') expect(response.results).toHaveProperty('facts') expect(response.results).toHaveProperty('entities') @@ -146,29 +154,182 @@ describe('QueryPlanner', () => { expect(result).toHaveProperty('content') expect(result).toHaveProperty('score') expect(result).toHaveProperty('scores') - expect(result).toHaveProperty('document') + 
expect(result).toHaveProperty('source') expect(result).toHaveProperty('chunk') expect(result).toHaveProperty('metadata') expect(result).not.toHaveProperty('facts') expect(result).not.toHaveProperty('entities') expect(response.results.facts).toEqual([]) expect(response.results.entities).toEqual([]) - expect(result.document).toHaveProperty('id') - expect(result.document).toHaveProperty('bucketId') + expect(result.source).toHaveProperty('id') + expect(result.source).toHaveProperty('bucketId') expect(result.chunk).toHaveProperty('index') expect(result.chunk).toHaveProperty('total') }) it('uses "semantic" source label for indexed results', async () => { const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) - const response = await planner.execute('Document 1') + const response = await planner.execute('Source 1') const result = response.results.chunks[0]! expect(result.sources).toContain('semantic') }) + it('returns direct facts and entities for semantic search without graph traversal', async () => { + const fact = { + id: 'fact-direct', + edgeId: 'edge-direct', + sourceEntityId: 'ent-pat', + sourceEntityName: 'Pat', + targetEntityId: 'ent-sms', + targetEntityName: 'SMS', + relation: 'PREFERS', + factText: 'Pat prefers SMS', + weight: 1, + evidenceCount: 1, + } + const entity = { + id: 'ent-pat', + name: 'Pat', + entityType: 'person', + aliases: [], + edgeCount: 1, + } + const knowledgeGraph: KnowledgeGraphBridge = { + searchKnowledge: vi.fn().mockResolvedValue({ facts: [fact], entities: [entity] }), + searchGraphChunks: vi.fn(), + } + const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, knowledgeGraph) + + const response = await planner.execute('sms', { + signals: { semantic: true, keyword: false, graph: false }, + count: 2, + }) + + expect(knowledgeGraph.searchKnowledge).toHaveBeenCalledWith('sms', expect.anything(), expect.objectContaining({ + count: 2, + signals: expect.objectContaining({ semantic: true, keyword: false, graph: false }), + })) + expect(knowledgeGraph.searchGraphChunks).not.toHaveBeenCalled() + expect(response.results.chunks.length).toBeGreaterThan(0) + expect(response.results.facts).toEqual([expect.objectContaining({ id: 'fact-direct', factText: 'Pat prefers SMS' })]) + expect(response.results.entities).toEqual([expect.objectContaining({ id: 'ent-pat', name: 'Pat' })]) + }) + + it('prefilters indexed chunks with OR entity-scope chunk refs', async () => { + const [firstChunk, secondChunk] = [...adapter._chunks.values()][0]! 
+ const externalId: ExternalId = { id: 'pat@example.com', type: 'email' } + const knowledgeGraph: KnowledgeGraphBridge = { + resolveEntityScope: vi.fn().mockResolvedValue({ + entityIds: ['ent-1', 'ent-2'], + chunkRefs: [ + { + bucketId: firstChunk!.bucketId, + sourceId: firstChunk!.sourceId, + chunkIndex: firstChunk!.chunkIndex, + embeddingModel: firstChunk!.embeddingModel, + }, + { + bucketId: secondChunk!.bucketId, + sourceId: secondChunk!.sourceId, + chunkIndex: secondChunk!.chunkIndex, + embeddingModel: secondChunk!.embeddingModel, + }, + ], + }), + searchKnowledge: vi.fn().mockResolvedValue({ facts: [], entities: [] }), + } + const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, knowledgeGraph) + + const response = await planner.execute('Source', { + entityScope: { entityIds: ['ent-1', 'ent-2'], externalIds: [externalId] }, + count: 10, + }) + const searchCall = adapter.calls.find(call => call.method === 'search') + + expect(knowledgeGraph.resolveEntityScope).toHaveBeenCalledWith( + { entityIds: ['ent-1', 'ent-2'], externalIds: [externalId] }, + expect.anything(), + expect.anything(), + ) + expect(searchCall).toBeDefined() + expect((searchCall!.args[2] as { filter?: unknown }).filter).toEqual(expect.objectContaining({ + chunkRefs: [ + { + bucketId: firstChunk!.bucketId, + sourceId: firstChunk!.sourceId, + chunkIndex: firstChunk!.chunkIndex, + embeddingModel: firstChunk!.embeddingModel, + }, + { + bucketId: secondChunk!.bucketId, + sourceId: secondChunk!.sourceId, + chunkIndex: secondChunk!.chunkIndex, + embeddingModel: secondChunk!.embeddingModel, + }, + ], + })) + expect(response.results.chunks).toHaveLength(2) + expect(response.results.chunks.map(chunk => `${chunk.source.bucketId}:${chunk.source.id}:${chunk.chunk.index}`)).toEqual(expect.arrayContaining([ + `${firstChunk!.bucketId}:${firstChunk!.sourceId}:${firstChunk!.chunkIndex}`, + `${secondChunk!.bucketId}:${secondChunk!.sourceId}:${secondChunk!.chunkIndex}`, + ])) + }) + + it('throws for indexed entity scope without graph scope resolution', async () => { + const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings) + + await expect(planner.execute('Source', { + entityScope: { entityIds: ['ent-1'] }, + count: 1, + })).rejects.toThrow('entityScope requires a knowledge graph bridge with entity scope resolution.') + }) + + it('allows memory-only entity scope without a knowledge graph bridge', async () => { + const email: ExternalId = { id: 'pat@example.com', type: 'email' } + const memory: MemoryRecord = { + id: 'mem-1', + category: 'semantic', + status: 'active', + content: 'Pat prefers SMS for urgent notices', + importance: 0.8, + accessCount: 0, + lastAccessedAt: new Date(), + metadata: { _similarity: 0.9 }, + scope: { tenantId: 'tenant-1' }, + validAt: new Date(), + createdAt: new Date(), + } + const memoryBridge: MemoryBridge = { + remember: vi.fn(), + forget: vi.fn(), + correct: vi.fn(), + addConversationTurn: vi.fn(), + recall: vi.fn().mockResolvedValue([memory]), + hasMemories: vi.fn().mockResolvedValue(true), + } + const planner = new QueryPlanner(adapter, [], new Map(), new Map(), memoryBridge) + + const response = await planner.execute('urgent notices', { + tenantId: 'tenant-1', + signals: { semantic: false, keyword: false, memory: true, graph: false }, + entityScope: { externalIds: [email] }, + count: 3, + }) + + expect(memoryBridge.recall).toHaveBeenCalledWith('urgent notices', expect.objectContaining({ + tenantId: 'tenant-1', + limit: 3, + 
entityScope: { externalIds: [email] }, + })) + expect(response.results.memories).toEqual([expect.objectContaining({ + id: 'mem-1', + content: 'Pat prefers SMS for urgent notices', + })]) + }) + it('autoWeights adjusts scoring without enabling graph search', async () => { const knowledgeGraph: KnowledgeGraphBridge = { - searchGraphPassages: vi.fn().mockResolvedValue({ + searchGraphChunks: vi.fn().mockResolvedValue({ results: [], facts: [], entities: [], @@ -182,20 +343,20 @@ describe('QueryPlanner', () => { count: 1, }) - expect(knowledgeGraph.searchGraphPassages).not.toHaveBeenCalled() + expect(knowledgeGraph.searchGraphChunks).not.toHaveBeenCalled() expect(response.results.facts).toEqual([]) expect(response.results.entities).toEqual([]) }) - it('returns nonzero graph scores for graph-only passage graph results', async () => { + it('returns nonzero graph scores for graph-only chunk graph results', async () => { const firstChunk = [...adapter._chunks.values()][0]![0]! const knowledgeGraph: KnowledgeGraphBridge = { - searchGraphPassages: vi.fn().mockResolvedValue({ + searchGraphChunks: vi.fn().mockResolvedValue({ results: [{ - passageId: 'passage-test', + chunkId: 'chunk-test', content: firstChunk.content, bucketId: firstChunk.bucketId, - documentId: firstChunk.documentId, + sourceId: firstChunk.sourceId, chunkIndex: firstChunk.chunkIndex, totalChunks: firstChunk.totalChunks, score: 0.25, @@ -208,7 +369,7 @@ describe('QueryPlanner', () => { sourceEntityName: 'Tennyson', targetEntityId: 'ent-2', targetEntityName: 'Maud', - relation: 'WROTE', + relation: 'AUTHORED', factText: 'Tennyson wrote Maud', weight: 1, evidenceCount: 1, @@ -223,7 +384,7 @@ describe('QueryPlanner', () => { trace: { entitySeedCount: 1, factSeedCount: 1, - passageSeedCount: 1, + chunkSeedCount: 1, graphNodeCount: 3, graphEdgeCount: 2, pprNonzeroCount: 3, @@ -232,13 +393,13 @@ describe('QueryPlanner', () => { topGraphScores: [0.25], selectedFactIds: ['fact-1'], selectedEntityIds: ['ent-1'], - selectedPassageIds: ['passage-test'], + selectedChunkIds: ['chunk-test'], }, }), } const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, knowledgeGraph) - const response = await planner.execute('Document 1', { + const response = await planner.execute('Source 1', { signals: { semantic: false, keyword: false, graph: true }, count: 1, }) @@ -249,15 +410,15 @@ describe('QueryPlanner', () => { expect(response.results.chunks[0]!.scores.normalized.graph).toBeCloseTo(Math.sqrt(Math.sqrt(0.25))) expect(response.results.facts).toEqual([expect.objectContaining({ id: 'fact-1', factText: 'Tennyson wrote Maud' })]) expect(response.results.entities).toEqual([expect.objectContaining({ id: 'ent-1', name: 'Tennyson' })]) - expect(knowledgeGraph.searchGraphPassages).toHaveBeenCalledWith( - 'Document 1', + expect(knowledgeGraph.searchGraphChunks).toHaveBeenCalledWith( + 'Source 1', expect.anything(), expect.objectContaining({ factFilter: true, factCandidateLimit: 80, factFilterInputLimit: 12, factSeedLimit: 4, - passageSeedLimit: 80, + chunkSeedLimit: 80, maxExpansionEdgesPerEntity: 25, factChainLimit: 2, maxPprIterations: 40, @@ -269,12 +430,12 @@ describe('QueryPlanner', () => { it('merges graph scores into indexed results by chunk identity', async () => { const firstChunk = [...adapter._chunks.values()][0]![0]! 
     const knowledgeGraph: KnowledgeGraphBridge = {
-      searchGraphPassages: vi.fn().mockResolvedValue({
+      searchGraphChunks: vi.fn().mockResolvedValue({
         results: [{
-          passageId: 'passage-test',
+          chunkId: 'chunk-test',
           content: `${firstChunk.content} with graph-only formatting`,
           bucketId: firstChunk.bucketId,
-          documentId: firstChunk.documentId,
+          sourceId: firstChunk.sourceId,
           chunkIndex: firstChunk.chunkIndex,
           totalChunks: firstChunk.totalChunks,
           score: 0.36,
@@ -287,7 +448,7 @@ describe('QueryPlanner', () => {
           sourceEntityName: 'Tennyson',
           targetEntityId: 'ent-2',
           targetEntityName: 'Maud',
-          relation: 'WROTE',
+          relation: 'AUTHORED',
           factText: 'Tennyson wrote Maud',
           weight: 1,
           evidenceCount: 1,
@@ -302,7 +463,7 @@ describe('QueryPlanner', () => {
         trace: {
           entitySeedCount: 1,
           factSeedCount: 1,
-          passageSeedCount: 1,
+          chunkSeedCount: 1,
           graphNodeCount: 3,
           graphEdgeCount: 2,
           pprNonzeroCount: 3,
@@ -311,19 +472,19 @@ describe('QueryPlanner', () => {
           topGraphScores: [0.36],
           selectedFactIds: ['fact-1'],
           selectedEntityIds: ['ent-1'],
-          selectedPassageIds: ['passage-test'],
+          selectedChunkIds: ['chunk-test'],
         },
       }),
     }
     const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, knowledgeGraph)
-    const response = await planner.execute('Document 1', {
+    const response = await planner.execute('Source 1', {
       signals: { semantic: true, keyword: false, graph: true },
       count: 10,
     })

     const merged = response.results.chunks.find(result =>
-      result.document.id === firstChunk.documentId && result.chunk.index === firstChunk.chunkIndex
+      result.source.id === firstChunk.sourceId && result.chunk.index === firstChunk.chunkIndex
     )
     expect(merged).toBeDefined()
     expect(merged!.sources).toContain('graph')
@@ -333,11 +494,11 @@ describe('QueryPlanner', () => {
     expect(response.results.entities).toEqual([expect.objectContaining({ id: 'ent-1', name: 'Tennyson' })])
   })

-  it('surfaces a misconfigured graph bridge when searchGraphPassages is missing', async () => {
+  it('surfaces a misconfigured graph bridge when searchGraphChunks is missing', async () => {
     const knowledgeGraph: KnowledgeGraphBridge = {}
     const planner = new QueryPlanner(adapter, bucketIds, bucketEmbeddings, bucketEmbeddings, undefined, knowledgeGraph)

-    const response = await planner.execute('Document 1', {
+    const response = await planner.execute('Source 1', {
       signals: { semantic: false, keyword: false, graph: true },
       count: 1,
     })
@@ -346,7 +507,7 @@ describe('QueryPlanner', () => {
     expect(response.results.facts).toEqual([])
     expect(response.results.entities).toEqual([])
     expect(response.warnings).toEqual(expect.arrayContaining([
-      'Graph search failed: Knowledge graph bridge must implement searchGraphPassages for graph queries.',
+      'Graph search failed: Knowledge graph bridge must implement searchGraphChunks for graph queries.',
     ]))
   })
 })
diff --git a/packages/sdk/src/__tests__/triple-extractor.test.ts b/packages/sdk/src/__tests__/triple-extractor.test.ts
index 08a029f..794208a 100644
--- a/packages/sdk/src/__tests__/triple-extractor.test.ts
+++ b/packages/sdk/src/__tests__/triple-extractor.test.ts
@@ -50,7 +50,7 @@ describe('TripleExtractor', () => {
       'At twenty years of age Cousin Cæsar was in Paducah, Kentucky, calling himself Cole Conway, in company with one Steve Sharp.',
       'bucket-1',
       0,
-      'doc-1',
+      'source-1',
       undefined,
       undefined,
       undefined,
@@ -118,7 +118,7 @@ describe('TripleExtractor', () => {
       'When Cousin Cæsar reached Iuka. Cousin Cæsar later appeared in Paducah, Kentucky, calling himself Cole Conway. And Cousin Cæsar met Conway there.',
       'bucket-1',
       0,
-      'doc-1',
+      'source-1',
     )

     const mentions = vi.mocked(graph.addEntityMentions).mock.calls[0]![0]
@@ -188,7 +188,7 @@ describe('TripleExtractor', () => {
       'CHAPTER II ELSIE MAUD INGLIS. Elsie Inglis wrote to John Inglis and later mentioned David Inglis while KATHERINE INGLIS remained elsewhere.',
       'bucket-1',
       0,
-      'doc-1',
+      'source-1',
     )

     const mentions = vi.mocked(graph.addEntityMentions).mock.calls[0]![0]
@@ -221,7 +221,7 @@ describe('TripleExtractor', () => {
           },
           {
             name: 'doctor',
-            type: 'concept',
+            type: 'role',
             description: 'A profession practiced by Elsie Inglis.',
             aliases: [],
           },
@@ -238,7 +238,7 @@ describe('TripleExtractor', () => {
       'Elsie Inglis was a doctor.',
       'bucket-1',
       0,
-      'doc-1',
+      'source-1',
     )

     expect(graph.addTriple).toHaveBeenCalledWith(expect.objectContaining({
@@ -246,7 +246,48 @@ describe('TripleExtractor', () => {
       subjectType: 'person',
       predicate: 'WORKS_AS',
       object: 'doctor',
-      objectType: 'concept',
+      objectType: 'role',
+    }))
+  })
+
+  it('accepts B2B entity types from the centralized ontology', async () => {
+    const graph: KnowledgeGraphBridge = {
+      addEntityMentions: vi.fn().mockResolvedValue(undefined),
+      addTriple: vi.fn().mockResolvedValue(undefined),
+    }
+    const extractor = new TripleExtractor({
+      llm: mockLLM({
+        entities: [
+          { name: 'Acme security review deck', type: 'document', description: 'A security review document.', aliases: ['deck'] },
+          { name: 'SOC2 rollout', type: 'project', description: 'A compliance rollout project.', aliases: [] },
+          { name: 'AUTH-123', type: 'issue', description: 'An authentication issue.', aliases: [] },
+          { name: 'Acme demo', type: 'meeting', description: 'A sales demo meeting.', aliases: [] },
+        ],
+        relationships: [
+          { subject: 'Acme security review deck', predicate: 'describes', object: 'SOC2 rollout', confidence: 0.9 },
+        ],
+      }),
+      graph,
+      twoPass: false,
+    })
+
+    await extractor.extractFromChunk(
+      'The Acme security review deck describes the SOC2 rollout after AUTH-123 came up in the Acme demo.',
+      'bucket-1',
+      0,
+      'source-1',
+    )
+
+    expect(graph.addEntityMentions).toHaveBeenCalledWith(expect.arrayContaining([
+      expect.objectContaining({ name: 'Acme security review deck', type: 'document' }),
+      expect.objectContaining({ name: 'SOC2 rollout', type: 'project' }),
+      expect.objectContaining({ name: 'AUTH-123', type: 'issue' }),
+      expect.objectContaining({ name: 'Acme demo', type: 'meeting' }),
+    ]))
+    expect(graph.addTriple).toHaveBeenCalledWith(expect.objectContaining({
+      predicate: 'DESCRIBES',
+      subjectType: 'document',
+      objectType: 'project',
     }))
   })

@@ -282,7 +323,7 @@ describe('TripleExtractor', () => {
       'Hi Adarsh Tadimari, please help with the Plotline SDK integration issue.',
       'bucket-1',
       0,
-      'doc-1',
+      'source-1',
     )

     const mentions = vi.mocked(graph.addEntityMentions).mock.calls[0]![0]
@@ -310,7 +351,7 @@ describe('TripleExtractor', () => {
       twoPass: false,
     })

-    await expect(extractor.extractFromChunk('Alice met Bob.', 'bucket-1', 0, 'doc-1'))
+    await expect(extractor.extractFromChunk('Alice met Bob.', 'bucket-1', 0, 'source-1'))
       .rejects.toThrow('No output generated.')
   })
 })
diff --git a/packages/sdk/src/__tests__/typegraph-instance.test.ts b/packages/sdk/src/__tests__/typegraph-instance.test.ts
index 7c0e914..fe8ed67 100644
--- a/packages/sdk/src/__tests__/typegraph-instance.test.ts
+++ b/packages/sdk/src/__tests__/typegraph-instance.test.ts
@@ -1,13 +1,13 @@
 import { describe, it, expect, beforeEach, vi } from 'vitest'
-import { typegraphInit,
typegraphDeploy } from '../typegraph.js' +import { DEFAULT_BUCKET_ID, typegraphInit, typegraphDeploy } from '../typegraph.js' import { createMockAdapter } from './helpers/mock-adapter.js' import { createMockEmbedding } from './helpers/mock-embedding.js' import { createMockBucket } from './helpers/mock-source.js' -import { createTestDocument, createTestDocuments } from './helpers/mock-connector.js' +import { createTestSource, createTestSources } from './helpers/mock-connector.js' import type { typegraphInstance } from '../typegraph.js' import type { Bucket } from '../types/bucket.js' import type { EmbeddingProvider } from '../embedding/provider.js' -import type { EntityResult, GraphExploreResult, KnowledgeGraphBridge } from '../types/graph-bridge.js' +import type { EntityDetail, EntityResult, GraphExploreResult, KnowledgeGraphBridge } from '../types/graph-bridge.js' /** Register a pre-built Bucket + embedding on an instance (bypasses buckets.create UUID generation). */ function registerTestBucket(instance: typegraphInstance, bucket: Bucket, embedding: EmbeddingProvider) { @@ -43,7 +43,7 @@ describe('typegraphInit', () => { describe('getEmbeddingForBucket', () => { it('returns default embedding', () => { - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) registerTestBucket(instance, bucket, embedding) const emb = instance.getEmbeddingForBucket(bucket.id) expect(emb.model).toBe(embedding.model) @@ -51,7 +51,7 @@ describe('typegraphInit', () => { it('returns per-bucket override', () => { const customEmb = createMockEmbedding({ model: 'custom-v2' }) - const { bucket } = createMockBucket({ documents: [] }) + const { bucket } = createMockBucket({ sources: [] }) registerTestBucket(instance, bucket, customEmb) const emb = instance.getEmbeddingForBucket(bucket.id) expect(emb.model).toBe('custom-v2') @@ -64,8 +64,8 @@ describe('typegraphInit', () => { describe('getDistinctEmbeddings', () => { it('returns unique embeddings by model name', () => { - const { bucket: s1 } = createMockBucket({ id: 'src-1', documents: [] }) - const { bucket: s2 } = createMockBucket({ id: 'src-2', documents: [] }) + const { bucket: s1 } = createMockBucket({ id: 'src-1', sources: [] }) + const { bucket: s2 } = createMockBucket({ id: 'src-2', sources: [] }) registerTestBucket(instance, s1, embedding) registerTestBucket(instance, s2, embedding) const distinct = instance.getDistinctEmbeddings() @@ -75,8 +75,8 @@ describe('typegraphInit', () => { it('filters to sourceIds', () => { const embA = createMockEmbedding({ model: 'model-a' }) const embB = createMockEmbedding({ model: 'model-b' }) - const { bucket: s1 } = createMockBucket({ id: 'src-1', documents: [] }) - const { bucket: s2 } = createMockBucket({ id: 'src-2', documents: [] }) + const { bucket: s1 } = createMockBucket({ id: 'src-1', sources: [] }) + const { bucket: s2 } = createMockBucket({ id: 'src-2', sources: [] }) registerTestBucket(instance, s1, embA) registerTestBucket(instance, s2, embB) const distinct = instance.getDistinctEmbeddings(['src-1']) @@ -87,8 +87,8 @@ describe('typegraphInit', () => { describe('groupBucketsByModel', () => { it('groups sources by model', () => { - const { bucket: s1 } = createMockBucket({ id: 'src-1', documents: [] }) - const { bucket: s2 } = createMockBucket({ id: 'src-2', documents: [] }) + const { bucket: s1 } = createMockBucket({ id: 'src-1', sources: [] }) + const { bucket: s2 } = createMockBucket({ id: 'src-2', sources: [] }) const differentEmb = createMockEmbedding({ 
model: 'different-model' }) registerTestBucket(instance, s1, embedding) registerTestBucket(instance, s2, differentEmb) @@ -97,6 +97,39 @@ describe('typegraphInit', () => { }) }) + describe('graph seeding', () => { + it('forwards entity seeding to the knowledge graph bridge', async () => { + const seeded: EntityDetail = { + id: 'ent_alice', + name: 'Alice', + entityType: 'person', + aliases: [], + externalIds: [{ id: 'alice@example.com', type: 'email', encoding: 'none' }], + edgeCount: 0, + properties: {}, + createdAt: new Date(), + topEdges: [], + } + const knowledgeGraph: KnowledgeGraphBridge = { + upsertEntity: vi.fn().mockResolvedValue(seeded), + } + const inst = await typegraphInit({ vectorStore: adapter, embedding, knowledgeGraph }) + + const result = await inst.graph.upsertEntity({ + name: 'Alice', + entityType: 'person', + externalIds: [{ id: 'alice@example.com', type: 'email' }], + }) + + expect(knowledgeGraph.upsertEntity).toHaveBeenCalledWith({ + name: 'Alice', + entityType: 'person', + externalIds: [{ id: 'alice@example.com', type: 'email' }], + }) + expect(result).toEqual(seeded) + }) + }) + describe('graph.searchEntities', () => { it('preserves bridge-provided aliases and edge counts', async () => { const identity = { userId: 'test-user' } @@ -181,9 +214,9 @@ describe('typegraphInit', () => { name: 'WORKS_FOR', confidence: 0.95, }], - answerSide: 'source', subqueries: ['plotline employees'], mode: 'relationship', + strictness: 'soft', }, anchors: [{ id: 'ent_plotline', @@ -220,47 +253,91 @@ describe('typegraphInit', () => { const result = await inst.graph.explore('plotline employees', { ...identity, depth: 1, - include: { passages: false }, + include: { chunks: false }, }) expect(knowledgeGraph.explore).toHaveBeenCalledWith('plotline employees', { ...identity, depth: 1, - include: { passages: false }, + include: { chunks: false }, }) expect(result).toEqual(exploreResult) }) }) describe('ingest', () => { - it('ingests a single document', async () => { - const { bucket, ingestOptions } = createMockBucket({ documents: [] }) + it('ingests a single source', async () => { + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) + registerTestBucket(instance, bucket, embedding) + const source = createTestSource({ content: 'Some content to ingest' }) + const result = await instance.ingest([source], { ...ingestOptions, bucketId: bucket.id }) + expect(result.inserted).toBe(1) + }) + + it('treats null ingest opts as omitted', async () => { + const { bucket } = createMockBucket({ id: DEFAULT_BUCKET_ID, sources: [] }) registerTestBucket(instance, bucket, embedding) - const doc = createTestDocument({ content: 'Some content to ingest' }) - const result = await instance.ingest([doc], { ...ingestOptions, bucketId: bucket.id }) + const source = createTestSource({ content: 'Some content to ingest with null opts' }) + + const result = await instance.ingest([source], null) + expect(result.inserted).toBe(1) + expect(result.bucketId).toBe(DEFAULT_BUCKET_ID) }) - it('ingests a batch of documents', async () => { - const { bucket, ingestOptions } = createMockBucket({ documents: [] }) + it('treats null pre-chunked ingest opts as omitted', async () => { + const { bucket } = createMockBucket({ id: DEFAULT_BUCKET_ID, sources: [] }) registerTestBucket(instance, bucket, embedding) - const docs = createTestDocuments(3) - const result = await instance.ingest(docs, { ...ingestOptions, bucketId: bucket.id }) + const source = createTestSource({ content: 'Prechunked content with null opts' }) + + 
const result = await instance.ingestPreChunked( + source, + [{ content: source.content, chunkIndex: 0 }], + null, + ) + + expect(result.inserted).toBe(1) + expect(result.bucketId).toBe(DEFAULT_BUCKET_ID) + }) + + it('ignores null source subject external ID entries', async () => { + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) + registerTestBucket(instance, bucket, embedding) + const source = createTestSource({ + subject: { + externalIds: [ + null, + undefined, + { type: 'email', id: 'pat@example.com' }, + ] as any, + }, + }) + + const result = await instance.ingest([source], { ...ingestOptions, bucketId: bucket.id }) + + expect(result.inserted).toBe(1) + }) + + it('ingests a batch of sources', async () => { + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) + registerTestBucket(instance, bucket, embedding) + const sources = createTestSources(3) + const result = await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) expect(result.total).toBe(3) expect(result.inserted).toBe(3) }) it('batches all chunks into a single embedBatch call', async () => { - const { bucket, ingestOptions } = createMockBucket({ documents: [] }) + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) registerTestBucket(instance, bucket, embedding) - const docs = createTestDocuments(3) + const sources = createTestSources(3) const spy = vi.spyOn(embedding, 'embedBatch') - await instance.ingest(docs, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) expect(spy).toHaveBeenCalledOnce() }) it('returns zero-count result for empty array', async () => { - const { bucket, ingestOptions } = createMockBucket({ documents: [] }) + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) registerTestBucket(instance, bucket, embedding) const result = await instance.ingest([], { ...ingestOptions, bucketId: bucket.id }) expect(result.total).toBe(0) @@ -268,7 +345,7 @@ describe('typegraphInit', () => { }) it('throws for unknown bucket', async () => { - const { ingestOptions } = createMockBucket({ documents: [] }) + const { ingestOptions } = createMockBucket({ sources: [] }) await expect(instance.ingest([], { ...ingestOptions, bucketId: 'unknown' })).rejects.toThrow('not found') }) @@ -277,46 +354,71 @@ describe('typegraphInit', () => { }) }) + describe('optional filters', () => { + it('treats null list filters as omitted', async () => { + const { bucket, ingestOptions } = createMockBucket({ sources: [] }) + registerTestBucket(instance, bucket, embedding) + await instance.ingest([createTestSource()], { ...ingestOptions, bucketId: bucket.id }) + + await expect(instance.sources.list(null)).resolves.toHaveLength(1) + await expect(instance.jobs.list(null)).resolves.toEqual([]) + }) + + it('rejects null destructive source filters with a ConfigError', async () => { + await expect(instance.sources.delete(null)).rejects.toThrow('sources.delete requires at least one filter field') + }) + }) + describe('query', () => { it('returns results', async () => { - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(3) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(3) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) - const response = await instance.query('Document 1') + await instance.ingest(sources, { ...ingestOptions, bucketId: 
bucket.id }) + const response = await instance.query('Source 1') + expect(response.results.chunks.length).toBeGreaterThan(0) + }) + + it('treats null query opts as omitted', async () => { + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(1) }) + registerTestBucket(instance, bucket, embedding) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) + + const response = await instance.query('Source 1', null) + expect(response.results.chunks.length).toBeGreaterThan(0) }) it('passes tenantId from config', async () => { const inst = await typegraphInit({ vectorStore: adapter, embedding, tenantId: 'config-tenant' }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(1) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(1) }) registerTestBucket(inst, bucket, embedding) - await inst.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await inst.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) const response = await inst.query('test') expect(response.query.tenantId).toBe('config-tenant') }) it('per-query tenantId overrides config', async () => { const inst = await typegraphInit({ vectorStore: adapter, embedding, tenantId: 'config-tenant' }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(1) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(1) }) registerTestBucket(inst, bucket, embedding) - await inst.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await inst.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) const response = await inst.query('test', { tenantId: 'query-tenant' }) expect(response.query.tenantId).toBe('query-tenant') }) it('supports context option for XML context', async () => { - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(1) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(1) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) const response = await instance.query('test', { context: { format: 'xml' } }) expect(response.context).toContain('') expect(response.contextStats?.format).toBe('xml') }) it('supports context option for plain text context', async () => { - const { bucket, documents, ingestOptions } = createMockBucket({ documents: createTestDocuments(1) }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: createTestSources(1) }) registerTestBucket(instance, bucket, embedding) - await instance.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await instance.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) const response = await instance.query('test', { context: { format: 'plain' } }) expect(response.context).toBeDefined() expect(response.context).not.toContain('') @@ -332,9 +434,9 @@ describe('typegraphInit', () => { embedding, hooks: { onIndexStart, onIndexComplete }, }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: [createTestDocument()] }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: [createTestSource()] }) registerTestBucket(inst, bucket, embedding) - await inst.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await 
inst.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) expect(onIndexStart).toHaveBeenCalledOnce() expect(onIndexComplete).toHaveBeenCalledOnce() }) @@ -346,9 +448,9 @@ describe('typegraphInit', () => { embedding, hooks: { onQueryResults }, }) - const { bucket, documents, ingestOptions } = createMockBucket({ documents: [createTestDocument()] }) + const { bucket, sources, ingestOptions } = createMockBucket({ sources: [createTestSource()] }) registerTestBucket(inst, bucket, embedding) - await inst.ingest(documents, { ...ingestOptions, bucketId: bucket.id }) + await inst.ingest(sources, { ...ingestOptions, bucketId: bucket.id }) await inst.query('test') expect(onQueryResults).toHaveBeenCalledOnce() }) diff --git a/packages/sdk/src/cloud/cloud-instance.ts b/packages/sdk/src/cloud/cloud-instance.ts index 7d200b7..7c76a0f 100644 --- a/packages/sdk/src/cloud/cloud-instance.ts +++ b/packages/sdk/src/cloud/cloud-instance.ts @@ -1,31 +1,32 @@ -import type { typegraphInstance, typegraphConfig, BucketsApi, DocumentsApi, JobsApi, GraphApi } from '../typegraph.js' +import type { typegraphInstance, typegraphConfig, BucketsApi, SourcesApi, JobsApi, GraphApi } from '../typegraph.js' import type { Bucket, CreateBucketInput, BucketListFilter } from '../types/bucket.js' import type { QueryOpts, QueryResponse } from '../types/query.js' import type { IngestOptions, IndexResult } from '../types/index-types.js' import type { EmbeddingProvider } from '../embedding/provider.js' -import type { RawDocument, Chunk } from '../types/connector.js' -import type { typegraphDocument, DocumentFilter } from '../types/typegraph-document.js' +import type { SourceInput, Chunk } from '../types/connector.js' +import type { typegraphSource, SourceFilter } from '../types/source.js' import type { typegraphIdentity } from '../types/identity.js' import type { CreatePolicyInput, UpdatePolicyInput, Policy, PolicyType } from '../types/policy.js' import type { UndeployResult } from '../types/adapter.js' import type { PaginationOpts, PaginatedResult } from '../types/pagination.js' import type { ConversationTurnResult, MemoryHealthReport } from '../types/memory.js' -import type { MemoryRecord } from '../memory/types/memory.js' +import type { ExternalId, MemoryRecord } from '../memory/types/memory.js' import type { Job, JobFilter } from '../types/job.js' -import type { EntityResult, EntityDetail, EdgeResult, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchTrace, PassageResult, SubgraphOpts, SubgraphResult, GraphStats, RecallOpts } from '../types/graph-bridge.js' -import { DEFAULT_BUCKET_ID, normalizeRawDocument } from '../typegraph.js' +import type { EntityResult, EntityDetail, EdgeResult, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchTrace, ChunkResult, SubgraphOpts, SubgraphResult, GraphStats, RecallOpts, GraphEntityRef, UpsertGraphEdgeInput, UpsertGraphEntityInput, UpsertGraphFactInput, MergeGraphEntitiesInput, MergeGraphEntitiesResult, DeleteGraphEntityOpts, DeleteGraphEntityResult, RememberOpts, ForgetOpts, CorrectOpts, HealthCheckOpts, AddConversationTurnOpts } from '../types/graph-bridge.js' +import { DEFAULT_BUCKET_ID, normalizeSourceInput } from '../typegraph.js' import { HttpClient } from './http-client.js' import type { CloudConfig } from './http-client.js' +import { assertHasMeaningfulFilter, compactIdentity, optionalCompactObject, withDefaultTenant 
} from '../utils/input.js' /** * Extended typegraph instance for cloud mode. - * Includes document CRUD methods available via the hosted API. + * Includes source CRUD methods available via the hosted API. */ export interface typegraphCloudInstance extends typegraphInstance { - listDocuments(filter?: DocumentFilter): Promise - getDocument(documentId: string): Promise - updateDocument(documentId: string, update: Partial): Promise - deleteDocuments(filter: DocumentFilter): Promise + listSources(filter?: SourceFilter | null): Promise + getSource(sourceId: string): Promise + updateSource(sourceId: string, update: Partial): Promise + deleteSources(filter: SourceFilter | null): Promise } /** @@ -36,6 +37,41 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance const client = new HttpClient(config) const e = encodeURIComponent + function normalizeOpts(opts: T | null | undefined, method: string): T { + return withDefaultTenant(opts, config.tenantId, method) as T + } + + function splitIdentityOpts( + opts: T | null | undefined, + method: string, + ): { identity: typegraphIdentity; rest: Omit } { + const normalized = normalizeOpts(opts, method) as T & Record + const { + tenantId, + groupId, + userId, + agentId, + conversationId, + agentName, + agentDescription, + agentVersion, + ...rest + } = normalized + return { + identity: compactIdentity({ + tenantId: tenantId as string | undefined, + groupId: groupId as string | undefined, + userId: userId as string | undefined, + agentId: agentId as string | undefined, + conversationId: conversationId as string | undefined, + agentName: agentName as string | undefined, + agentDescription: agentDescription as string | undefined, + agentVersion: agentVersion as string | undefined, + }), + rest: rest as Omit, + } + } + const buckets: BucketsApi = { async create(input: CreateBucketInput): Promise { return client.post('/v1/buckets', input) @@ -43,17 +79,21 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance async get(bucketId: string): Promise { return client.get(`/v1/buckets/${e(bucketId)}`) }, - async list(filter?: BucketListFilter, pagination?: PaginationOpts): Promise> { + async list(filter?: BucketListFilter | null, pagination?: PaginationOpts | null): Promise> { + const normalizedFilter = optionalCompactObject(filter, 'buckets.list', 'filter') as BucketListFilter + const normalizedPagination = pagination == null + ? 
undefined + : optionalCompactObject(pagination, 'buckets.list', 'pagination') as PaginationOpts const searchParams = new URLSearchParams() - if (filter?.tenantId) searchParams.set('tenantId', filter.tenantId) - if (filter?.groupId) searchParams.set('groupId', filter.groupId) - if (filter?.userId) searchParams.set('userId', filter.userId) - if (filter?.agentId) searchParams.set('agentId', filter.agentId) - if (filter?.conversationId) searchParams.set('conversationId', filter.conversationId) - if (pagination?.limit != null) searchParams.set('limit', String(pagination.limit)) - if (pagination?.offset != null) searchParams.set('offset', String(pagination.offset)) + if (normalizedFilter.tenantId) searchParams.set('tenantId', normalizedFilter.tenantId) + if (normalizedFilter.groupId) searchParams.set('groupId', normalizedFilter.groupId) + if (normalizedFilter.userId) searchParams.set('userId', normalizedFilter.userId) + if (normalizedFilter.agentId) searchParams.set('agentId', normalizedFilter.agentId) + if (normalizedFilter.conversationId) searchParams.set('conversationId', normalizedFilter.conversationId) + if (normalizedPagination?.limit != null) searchParams.set('limit', String(normalizedPagination.limit)) + if (normalizedPagination?.offset != null) searchParams.set('offset', String(normalizedPagination.offset)) const qs = searchParams.toString() - if (pagination) { + if (normalizedPagination) { return client.get>(`/v1/buckets${qs ? `?${qs}` : ''}`) } return client.get(`/v1/buckets${qs ? `?${qs}` : ''}`) @@ -66,21 +106,27 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance }, } - const documents: DocumentsApi = { - async get(id: string): Promise { - return client.get(`/v1/documents/${e(id)}`) - }, - async list(filter?: DocumentFilter, pagination?: PaginationOpts): Promise> { - if (pagination) { - return client.post>('/v1/documents/list', { ...filter, ...pagination }) + const sources: SourcesApi = { + async get(id: string): Promise { + return client.get(`/v1/sources/${e(id)}`) + }, + async list(filter?: SourceFilter | null, pagination?: PaginationOpts | null): Promise> { + const normalizedFilter = optionalCompactObject(filter, 'sources.list', 'filter') as SourceFilter + const normalizedPagination = pagination == null + ? 
undefined + : optionalCompactObject(pagination, 'sources.list', 'pagination') as PaginationOpts + if (normalizedPagination) { + return client.post>('/v1/sources/list', { ...normalizedFilter, ...normalizedPagination }) } - return client.post('/v1/documents/list', filter) + return client.post('/v1/sources/list', normalizedFilter) }, - async update(id: string, input): Promise { - return client.patch(`/v1/documents/${e(id)}`, input) + async update(id: string, input): Promise { + return client.patch(`/v1/sources/${e(id)}`, input) }, - async delete(filter: DocumentFilter): Promise { - return client.delete('/v1/documents', filter) + async delete(filter: SourceFilter | null): Promise { + const normalizedFilter = optionalCompactObject(filter, 'sources.delete', 'filter') as SourceFilter + assertHasMeaningfulFilter(normalizedFilter, 'sources.delete') + return client.delete('/v1/sources', normalizedFilter) }, } @@ -88,8 +134,8 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance async get(id: string): Promise { return client.get(`/v1/jobs/${e(id)}`) }, - async list(filter?: JobFilter): Promise { - return client.post('/v1/jobs/list', filter) + async list(filter?: JobFilter | null): Promise { + return client.post('/v1/jobs/list', optionalCompactObject(filter, 'jobs.list', 'filter')) }, async upsert(): Promise { throw new Error('jobs.upsert() is a server-side primitive and is not available in cloud mode.') @@ -103,76 +149,127 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance } const graph: GraphApi = { - async searchEntities(query: string, identity: typegraphIdentity, opts?: { + async upsertEntity(input: UpsertGraphEntityInput): Promise { + return client.post('/v1/graph/entities', input) + }, + async upsertEntities(inputs: UpsertGraphEntityInput[]): Promise { + return client.post('/v1/graph/entities/batch', { entities: inputs }) + }, + async resolveEntity(ref: GraphEntityRef | string, identity?: typegraphIdentity | null): Promise { + return client.post('/v1/graph/entities/resolve', { + ref, + identity: normalizeOpts(identity, 'graph.resolveEntity'), + }) + }, + async linkExternalIds(entityId: string, externalIds: ExternalId[], identity?: typegraphIdentity | null): Promise { + return client.post(`/v1/graph/entities/${e(entityId)}/external-ids`, { + externalIds, + identity: normalizeOpts(identity, 'graph.linkExternalIds'), + }) + }, + async mergeEntities(input: MergeGraphEntitiesInput): Promise { + return client.post('/v1/graph/entities/merge', input) + }, + async deleteEntity(entityId: string, opts?: DeleteGraphEntityOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'graph.deleteEntity') + return client.delete(`/v1/graph/entities/${e(entityId)}`, { ...rest, identity }) + }, + async upsertEdge(input: UpsertGraphEdgeInput): Promise { + return client.post('/v1/graph/edges', input) + }, + async upsertEdges(inputs: UpsertGraphEdgeInput[]): Promise { + return client.post('/v1/graph/edges/batch', { edges: inputs }) + }, + async upsertFact(input: UpsertGraphFactInput): Promise { + return client.post('/v1/graph/facts', input) + }, + async upsertFacts(inputs: UpsertGraphFactInput[]): Promise { + return client.post('/v1/graph/facts/batch', { facts: inputs }) + }, + async searchEntities(query: string, identity: typegraphIdentity | null, opts?: { limit?: number entityType?: string minConnections?: number - }): Promise { - return client.post('/v1/graph/entities/search', { query, identity, ...opts }) + } | null): Promise { + 
const normalizedIdentity = normalizeOpts(identity, 'graph.searchEntities') + const normalizedOpts = optionalCompactObject<{ + limit?: number + entityType?: string + minConnections?: number + }>(opts, 'graph.searchEntities') as { + limit?: number + entityType?: string + minConnections?: number + } + return client.post('/v1/graph/entities/search', { query, identity: normalizedIdentity, ...normalizedOpts }) }, - async getEntity(id: string, opts?: typegraphIdentity): Promise { + async getEntity(id: string, opts?: typegraphIdentity | null): Promise { const params = new URLSearchParams() - for (const [key, value] of Object.entries(opts ?? {})) { + for (const [key, value] of Object.entries(normalizeOpts(opts, 'graph.getEntity'))) { if (typeof value === 'string') params.set(key, value) } const query = params.toString() return client.get(`/v1/graph/entities/${e(id)}${query ? `?${query}` : ''}`) }, - async getEdges(entityId: string, opts?: { + async getEdges(entityId: string, opts?: ({ direction?: 'in' | 'out' | 'both' relation?: string limit?: number - } & typegraphIdentity): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts ?? {} - const identity = { tenantId, groupId, userId, agentId, conversationId } + } & typegraphIdentity) | null): Promise { + const { identity, rest } = splitIdentityOpts<{ + direction?: 'in' | 'out' | 'both' + relation?: string + limit?: number + } & typegraphIdentity>(opts, 'graph.getEdges') return client.post(`/v1/graph/entities/${e(entityId)}/edges`, { ...rest, identity }) }, - async searchFacts(query: string, opts?: FactSearchOpts): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts ?? {} - const identity = { tenantId, groupId, userId, agentId, conversationId } + async searchFacts(query: string, opts?: FactSearchOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'graph.searchFacts') return client.post('/v1/graph/facts/search', { query, identity, ...rest }) }, - async explore(query: string, opts?: GraphExploreOpts): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts ?? {} - const identity = { tenantId, groupId, userId, agentId, conversationId } + async explore(query: string, opts?: GraphExploreOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'graph.explore') return client.post('/v1/graph/explore', { query, identity, ...rest }) }, - async getPassagesForEntity(entityId: string, opts?: { + async getChunksForEntity(entityId: string, opts?: ({ bucketIds?: string[] | undefined limit?: number | undefined - } & typegraphIdentity): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts ?? {} - const identity = { tenantId, groupId, userId, agentId, conversationId } - return client.post(`/v1/graph/entities/${e(entityId)}/passages`, { ...rest, identity }) - }, - async explainQuery(query: string, opts?: GraphExplainOpts): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts ?? 
{} - const identity = { tenantId, groupId, userId, agentId, conversationId } + } & typegraphIdentity) | null): Promise { + const { identity, rest } = splitIdentityOpts<{ + bucketIds?: string[] | undefined + limit?: number | undefined + } & typegraphIdentity>(opts, 'graph.getChunksForEntity') + return client.post(`/v1/graph/entities/${e(entityId)}/chunks`, { ...rest, identity }) + }, + async explainQuery(query: string, opts?: GraphExplainOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'graph.explainQuery') return client.post('/v1/graph/query/explain', { query, identity, ...rest }) }, - async backfill(identity: typegraphIdentity, opts?: GraphBackfillOpts): Promise { - return client.post('/v1/graph/backfill', { identity, ...opts }) + async backfill(identity: typegraphIdentity | null, opts?: GraphBackfillOpts | null): Promise { + return client.post('/v1/graph/backfill', { + identity: normalizeOpts(identity, 'graph.backfill'), + ...optionalCompactObject(opts, 'graph.backfill'), + }) }, async getSubgraph(opts: SubgraphOpts): Promise { - return client.post('/v1/graph/subgraph', opts) + return client.post('/v1/graph/subgraph', optionalCompactObject(opts, 'graph.getSubgraph')) }, - async stats(identity: typegraphIdentity): Promise { - return client.post('/v1/graph/stats', { identity }) + async stats(identity: typegraphIdentity | null): Promise { + return client.post('/v1/graph/stats', { identity: normalizeOpts(identity, 'graph.stats') }) }, - async getRelationTypes(identity: typegraphIdentity): Promise> { - return client.post('/v1/graph/relation-types', { identity }) + async getRelationTypes(identity: typegraphIdentity | null): Promise> { + return client.post('/v1/graph/relation-types', { identity: normalizeOpts(identity, 'graph.getRelationTypes') }) }, - async getEntityTypes(identity: typegraphIdentity): Promise> { - return client.post('/v1/graph/entity-types', { identity }) + async getEntityTypes(identity: typegraphIdentity | null): Promise> { + return client.post('/v1/graph/entity-types', { identity: normalizeOpts(identity, 'graph.getEntityTypes') }) }, } function recall(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - function recall(query: string, opts: RecallOpts): Promise - function recall(query: string, opts: RecallOpts): Promise { - const { tenantId, groupId, userId, agentId, conversationId, ...rest } = opts - const identity = { tenantId, groupId, userId, agentId, conversationId } - if (opts.format) { + function recall(query: string, opts?: RecallOpts | null): Promise + function recall(query: string, opts?: RecallOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'recall') + if (rest.format) { return client.post('/v1/memory/recall', { query, identity, ...rest }) } return client.post('/v1/memory/recall', { query, identity, ...rest }) @@ -192,7 +289,7 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance }, buckets, - documents, + sources, jobs, graph, @@ -203,8 +300,8 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance async get(id: string): Promise { return client.get(`/v1/policies/${e(id)}`) }, - async list(filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean }): Promise { - return client.post('/v1/policies/list', filter) + async list(filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean } | null): Promise { + return client.post('/v1/policies/list', optionalCompactObject<{ tenantId?: string; 
policyType?: PolicyType; enabled?: boolean }>(filter, 'policies.list', 'filter')) }, async update(id: string, input: UpdatePolicyInput): Promise { return client.patch(`/v1/policies/${e(id)}`, input) @@ -230,48 +327,51 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance throw new Error('getQueryEmbeddingForBucket() is not available in cloud mode — embedding is managed server-side.') }, - async query(text: string, opts?: QueryOpts): Promise { - return client.post('/v1/query', { text, ...opts }) + async query(text: string, opts?: QueryOpts | null): Promise { + return client.post('/v1/query', { text, ...normalizeOpts(opts, 'query') }) }, - async ingest(docs: RawDocument[], opts: IngestOptions = {}): Promise { - const bucketId = opts.bucketId || DEFAULT_BUCKET_ID - const normalizedDocs = docs.map(normalizeRawDocument) - return client.post(`/v1/buckets/${e(bucketId)}/ingest`, { docs: normalizedDocs, opts }) + async ingest(sources: SourceInput[], opts?: IngestOptions | null): Promise { + const normalizedOpts = normalizeOpts(opts, 'ingest') + const bucketId = normalizedOpts.bucketId || DEFAULT_BUCKET_ID + const normalizedSources = sources.map(normalizeSourceInput) + return client.post(`/v1/buckets/${e(bucketId)}/ingest`, { sources: normalizedSources, opts: normalizedOpts }) }, - async ingestPreChunked(doc: RawDocument, chunks: Chunk[], opts: IngestOptions = {}): Promise { - const bucketId = opts.bucketId || DEFAULT_BUCKET_ID - return client.post(`/v1/buckets/${e(bucketId)}/ingest`, { doc: normalizeRawDocument(doc), chunks, opts }) + async ingestPreChunked(source: SourceInput, chunks: Chunk[], opts?: IngestOptions | null): Promise { + const normalizedOpts = normalizeOpts(opts, 'ingestPreChunked') + const bucketId = normalizedOpts.bucketId || DEFAULT_BUCKET_ID + return client.post(`/v1/buckets/${e(bucketId)}/ingest`, { source: normalizeSourceInput(source), chunks, opts: normalizedOpts }) }, - async remember(content: string, identity: typegraphIdentity, category?: string, opts?: { - importance?: number - metadata?: Record - }): Promise { - return client.post('/v1/memory/remember', { content, identity, category, ...opts }) + async remember(content: string, opts?: RememberOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'remember') + return client.post('/v1/memory/remember', { content, identity, ...rest }) }, - async forget(id: string, identity: typegraphIdentity): Promise { - await client.post('/v1/memory/forget', { id, identity }) + async forget(id: string, opts?: ForgetOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'forget') + await client.post('/v1/memory/forget', { id, identity, ...rest }) }, - async correct(correction: string, identity: typegraphIdentity): Promise<{ invalidated: number; created: number; summary: string }> { - return client.post('/v1/memory/correct', { correction, identity }) + async correct(correction: string, opts?: CorrectOpts | null): Promise<{ invalidated: number; created: number; summary: string }> { + const { identity, rest } = splitIdentityOpts(opts, 'correct') + return client.post('/v1/memory/correct', { correction, identity, ...rest }) }, recall: recall as typegraphInstance['recall'], - async healthCheck(identity: typegraphIdentity): Promise { - return client.post('/v1/memory/health', { identity }) + async healthCheck(opts?: HealthCheckOpts | null): Promise { + const { identity, rest } = splitIdentityOpts(opts, 'healthCheck') + return client.post('/v1/memory/health', { identity, 
...rest })
     },

     async addConversationTurn(
       messages: Array<{ role: string; content: string; timestamp?: Date }>,
-      identity: typegraphIdentity,
-      conversationId?: string,
+      opts?: AddConversationTurnOpts | null,
     ): Promise {
-      return client.post('/v1/memory/conversation', { messages, identity, conversationId })
+      const { identity, rest } = splitIdentityOpts(opts, 'addConversationTurn')
+      return client.post('/v1/memory/conversation', { messages, identity, ...rest })
     },

     async flush(): Promise {
@@ -282,22 +382,24 @@ export function createCloudInstance(config: CloudConfig): typegraphCloudInstance
       // No-op in cloud mode
     },

-    // ── Document CRUD (cloud-only extensions) ──
+    // ── Source CRUD (cloud-only extensions) ──

-    async listDocuments(filter?: DocumentFilter): Promise {
-      return client.post('/v1/documents/list', filter)
+    async listSources(filter?: SourceFilter | null): Promise {
+      return client.post('/v1/sources/list', optionalCompactObject(filter, 'listSources', 'filter'))
     },
-    async getDocument(documentId: string): Promise {
-      return client.get(`/v1/documents/${e(documentId)}`)
+    async getSource(sourceId: string): Promise {
+      return client.get(`/v1/sources/${e(sourceId)}`)
     },
-    async updateDocument(documentId: string, update: Partial): Promise {
-      return client.patch(`/v1/documents/${e(documentId)}`, update)
+    async updateSource(sourceId: string, update: Partial): Promise {
+      return client.patch(`/v1/sources/${e(sourceId)}`, update)
     },
-    async deleteDocuments(filter: DocumentFilter): Promise {
-      return client.delete('/v1/documents', filter)
+    async deleteSources(filter: SourceFilter | null): Promise {
+      const normalizedFilter = optionalCompactObject(filter, 'deleteSources', 'filter') as SourceFilter
+      assertHasMeaningfulFilter(normalizedFilter, 'deleteSources')
+      return client.delete('/v1/sources', normalizedFilter)
     },
   }
diff --git a/packages/sdk/src/embedding/ai-sdk-adapter.ts b/packages/sdk/src/embedding/ai-sdk-adapter.ts
index 12daf33..37f180e 100644
--- a/packages/sdk/src/embedding/ai-sdk-adapter.ts
+++ b/packages/sdk/src/embedding/ai-sdk-adapter.ts
@@ -20,7 +20,7 @@ import type { EmbeddingProvider } from './provider.js'
  * const embedding: AISDKEmbeddingInput = {
  *   model: gateway.embeddingModel('voyage/voyage-4-large'),
  *   dimensions: 512,
- *   providerOptions: { voyage: { outputDimension: 512, inputType: 'document' } },
+ *   providerOptions: { voyage: { outputDimension: 512, inputType: 'source' } },
  * }
  * ```
  */
diff --git a/packages/sdk/src/graph/__tests__/graph-bridge.test.ts b/packages/sdk/src/graph/__tests__/graph-bridge.test.ts
index b3d4b04..cbe4258 100644
--- a/packages/sdk/src/graph/__tests__/graph-bridge.test.ts
+++ b/packages/sdk/src/graph/__tests__/graph-bridge.test.ts
@@ -1,11 +1,11 @@
 import { describe, it, expect, vi } from 'vitest'
 import { createKnowledgeGraphBridge } from '../graph-bridge.js'
-import type { MemoryStoreAdapter, SemanticEntity, SemanticEdge, SemanticFactRecord } from '../../memory/types/index.js'
+import type { ExternalId, MemoryStoreAdapter, SemanticEntity, SemanticEdge, SemanticEntityChunkEdge, SemanticFactRecord, SemanticGraphEdge } from '../../memory/types/index.js'
 import { buildScope } from '../../memory/index.js'

 const testScope = buildScope({ userId: 'test-user' })

-function makeEntity(id: string, name: string, type: string = 'entity'): SemanticEntity {
+function makeEntity(id: string, name: string, type: string = 'concept'): SemanticEntity {
   return {
     id,
     name,
@@ -40,20 +40,57 @@ function makeEdge(
 interface MockMention {
   entityId: string
-  documentId: string
+  sourceId: string
   chunkIndex: number
   bucketId: string
-  mentionType: 'subject' | 'object' | 'co_occurrence' | 'entity' | 'alias'
+  mentionType: 'subject' | 'object' | 'co_occurrence' | 'entity' | 'alias' | 'source_subject'
   surfaceText?: string | undefined
   normalizedSurfaceText?: string | undefined
   confidence?: number | undefined
 }

+function externalIdKey(externalId: ExternalId): string {
+  return [
+    externalId.type.trim().toLowerCase(),
+    externalId.id.trim(),
+    externalId.encoding ?? 'none',
+  ].join('|')
+}
+
 function mockStore(
   entities: Map = new Map(),
   edges: SemanticEdge[] = [],
   mentions: MockMention[] = [],
 ) {
+  const externalEntityIdByKey = new Map()
+  const externalIdsByEntity = new Map()
+
+  const attachExternalIds = (entity: SemanticEntity): SemanticEntity => ({
+    ...entity,
+    externalIds: externalIdsByEntity.get(entity.id) ?? entity.externalIds,
+  })
+
+  const linkExternalIds = (entityId: string, externalIds: ExternalId[]) => {
+    const existing = externalIdsByEntity.get(entityId) ?? []
+    const byKey = new Map(existing.map(externalId => [externalIdKey(externalId), externalId]))
+    for (const externalId of externalIds) {
+      const normalized: ExternalId = {
+        ...externalId,
+        type: externalId.type.trim().toLowerCase(),
+        id: externalId.id.trim(),
+        encoding: externalId.encoding ?? 'none',
+      }
+      const key = externalIdKey(normalized)
+      const currentEntityId = externalEntityIdByKey.get(key)
+      if (currentEntityId && currentEntityId !== entityId) {
+        throw new Error(`external ID conflict for ${key}`)
+      }
+      externalEntityIdByKey.set(key, entityId)
+      byKey.set(key, normalized)
+    }
+    externalIdsByEntity.set(entityId, [...byKey.values()])
+  }
+
   const store: MemoryStoreAdapter = {
     initialize: vi.fn(),
     upsert: vi.fn().mockImplementation(async (r) => r),
@@ -65,16 +102,143 @@ function mockStore(
     getHistory: vi.fn().mockResolvedValue([]),
     search: vi.fn().mockResolvedValue([]),
     upsertEntity: vi.fn().mockImplementation(async (e: SemanticEntity) => {
+      if (e.externalIds?.length) linkExternalIds(e.id, e.externalIds)
       entities.set(e.id, e)
-      return e
+      return attachExternalIds(e)
+    }),
+    getEntity: vi.fn().mockImplementation(async (id: string) => {
+      const entity = entities.get(id)
+      return entity ? attachExternalIds(entity) : null
     }),
-    getEntity: vi.fn().mockImplementation(async (id: string) => entities.get(id) ?? null),
     findEntities: vi.fn().mockImplementation(async (query: string) => {
       return [...entities.values()].filter(e =>
         e.name.toLowerCase().includes(query.toLowerCase()),
-      )
+      ).map(attachExternalIds)
+    }),
+    upsertEntityExternalIds: vi.fn().mockImplementation(async (entityId: string, externalIds: ExternalId[]) => {
+      linkExternalIds(entityId, externalIds)
+      const entity = entities.get(entityId)
+      if (entity) entities.set(entityId, attachExternalIds(entity))
     }),
-    searchEntities: vi.fn().mockImplementation(async () => [...entities.values()]),
+    findEntityByExternalId: vi.fn().mockImplementation(async (externalId: ExternalId) => {
+      const entityId = externalEntityIdByKey.get(externalIdKey({
+        ...externalId,
+        type: externalId.type.trim().toLowerCase(),
+        id: externalId.id.trim(),
+        encoding: externalId.encoding ?? 'none',
+      }))
+      const entity = entityId ? entities.get(entityId) : undefined
+      return entity ?
attachExternalIds(entity) : null + }), + mergeEntityReferences: vi.fn().mockImplementation(async ({ sourceEntityId, targetEntityId, properties }) => { + const source = entities.get(sourceEntityId) + const target = entities.get(targetEntityId) + if (!source || !target) throw new Error('entity not found') + const sourceExternalIds = externalIdsByEntity.get(sourceEntityId) ?? [] + const targetExternalIds = externalIdsByEntity.get(targetEntityId) ?? [] + const mergedExternalIds = [...targetExternalIds] + for (const externalId of sourceExternalIds) { + const key = externalIdKey(externalId) + const linked = externalEntityIdByKey.get(key) + if (linked && linked !== sourceEntityId && linked !== targetEntityId) throw new Error('external ID conflict') + externalEntityIdByKey.set(key, targetEntityId) + if (!mergedExternalIds.some(existing => externalIdKey(existing) === key)) mergedExternalIds.push(externalId) + } + externalIdsByEntity.delete(sourceEntityId) + externalIdsByEntity.set(targetEntityId, mergedExternalIds) + let redirectedEdges = 0 + let removedSelfEdges = 0 + for (const edge of edges) { + if (edge.sourceEntityId === sourceEntityId) { + edge.sourceEntityId = targetEntityId + edge.sourceId = targetEntityId + redirectedEdges += 1 + } + if (edge.targetEntityId === sourceEntityId) { + edge.targetEntityId = targetEntityId + edge.targetId = targetEntityId + redirectedEdges += 1 + } + if (edge.sourceEntityId === edge.targetEntityId) { + edge.temporal.invalidAt = new Date() + removedSelfEdges += 1 + } + } + let movedMentions = 0 + for (const mention of mentions) { + if (mention.entityId === sourceEntityId) { + mention.entityId = targetEntityId + movedMentions += 1 + } + } + const updatedTarget: SemanticEntity = { + ...target, + aliases: [...new Set([...target.aliases, source.name, ...source.aliases])], + externalIds: mergedExternalIds, + properties: { ...source.properties, ...target.properties, ...(properties ?? {}) }, + } + const mergedSource: SemanticEntity = { + ...source, + status: 'merged', + mergedIntoEntityId: targetEntityId, + temporal: { ...source.temporal, invalidAt: new Date() }, + } + entities.set(targetEntityId, updatedTarget) + entities.set(sourceEntityId, mergedSource) + return { + target: { + id: updatedTarget.id, + name: updatedTarget.name, + entityType: updatedTarget.entityType, + aliases: updatedTarget.aliases, + externalIds: mergedExternalIds, + edgeCount: edges.filter(edge => edge.sourceEntityId === targetEntityId || edge.targetEntityId === targetEntityId).length, + properties: updatedTarget.properties, + createdAt: updatedTarget.temporal.createdAt, + validAt: updatedTarget.temporal.validAt, + topEdges: [], + }, + sourceEntityId, + targetEntityId, + redirectedEdges, + redirectedFacts: 0, + redirectedGraphEdges: redirectedEdges, + movedMentions, + movedExternalIds: sourceExternalIds.length, + removedSelfEdges, + } + }), + deleteEntityReferences: vi.fn().mockImplementation(async (entityId: string, opts = {}) => { + const mode = opts.mode ?? 'invalidate' + const matchingEdges = edges.filter(edge => edge.sourceEntityId === entityId || edge.targetEntityId === entityId) + const matchingMentions = mentions.filter(mention => mention.entityId === entityId) + const matchingExternalIds = externalIdsByEntity.get(entityId) ?? 
[] + if (mode === 'purge') { + entities.delete(entityId) + for (const externalId of matchingExternalIds) externalEntityIdByKey.delete(externalIdKey(externalId)) + externalIdsByEntity.delete(entityId) + } else { + const entity = entities.get(entityId) + if (entity) { + entities.set(entityId, { + ...entity, + status: 'invalidated', + temporal: { ...entity.temporal, invalidAt: new Date() }, + }) + } + } + for (const edge of matchingEdges) edge.temporal.invalidAt = new Date() + return { + entityId, + mode, + deletedEdges: matchingEdges.length, + deletedFacts: 0, + deletedGraphEdges: matchingEdges.length, + deletedMentions: matchingMentions.length, + deletedExternalIds: matchingExternalIds.length, + } + }), + searchEntities: vi.fn().mockImplementation(async () => [...entities.values()].map(attachExternalIds)), searchEntitiesHybrid: vi.fn().mockImplementation(async (query: string) => { const normalized = query .replace(/[Ææ]/g, 'ae') @@ -90,17 +254,20 @@ function mockStore( || e.aliases.some(a => a.toLowerCase() === query.toLowerCase()) || mentions.some(m => m.entityId === e.id && m.normalizedSurfaceText === normalized) ) - .map(e => ({ ...e, properties: { ...e.properties, _similarity: 1 } })) + .map(e => ({ ...attachExternalIds(e), properties: { ...e.properties, _similarity: 1 } })) return exact.length > 0 ? exact - : [...entities.values()].map(e => ({ ...e, properties: { ...e.properties, _similarity: 0.5 } })) + : [...entities.values()].map(e => ({ ...attachExternalIds(e), properties: { ...e.properties, _similarity: 0.5 } })) }), upsertEdge: vi.fn().mockImplementation(async (e: SemanticEdge) => { edges.push(e) return e }), getEntitiesBatch: vi.fn().mockImplementation(async (ids: string[]) => { - return ids.map(id => entities.get(id)).filter(Boolean) as SemanticEntity[] + return ids + .map(id => entities.get(id)) + .filter((entity): entity is SemanticEntity => !!entity) + .map(attachExternalIds) }), getEdges: vi.fn().mockImplementation(async (entityId: string, direction: string = 'both') => { return edges.filter(e => { @@ -150,6 +317,319 @@ function mockEmbedding() { } describe('createKnowledgeGraphBridge', () => { + describe('developer seeding', () => { + it('upserts and resolves entities by deterministic external ID', async () => { + const entities = new Map() + const store = mockStore(entities) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + const externalId: ExternalId = { + id: 'ryan@example.com', + type: 'email', + } + + const seeded = await bridge.upsertEntity!({ + name: 'Ryan Musser', + entityType: 'person', + externalIds: [externalId], + }) + const resolved = await bridge.resolveEntity!({ externalId }, testScope) + + expect(seeded.externalIds).toEqual([expect.objectContaining({ + id: 'ryan@example.com', + type: 'email', + encoding: 'none', + })]) + expect(resolved?.id).toBe(seeded.id) + expect(store.findEntityByExternalId).toHaveBeenCalledWith( + expect.objectContaining({ id: 'ryan@example.com', type: 'email' }), + testScope, + ) + }) + + it('uses external IDs before fuzzy entity creation when seeding facts', async () => { + const entities = new Map() + const edges: SemanticEdge[] = [] + const store = mockStore(entities, edges) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + const slackId: ExternalId = { + id: 'U123', + type: 'slack_user_id', + } + + const jane = await bridge.upsertEntity!({ + name: 'Jane Doe', + entityType: 'person', 
+ externalIds: [slackId], + }) + + const fact = await bridge.upsertFact!({ + source: { name: 'J. Doe', entityType: 'person', externalId: slackId }, + relation: 'works at', + target: { name: 'TypeGraph', entityType: 'organization' }, + evidenceText: 'J. Doe works at TypeGraph.', + }) + + const people = [...entities.values()].filter(entity => entity.entityType === 'person') + expect(people).toHaveLength(1) + expect(people[0]?.id).toBe(jane.id) + expect(people[0]?.aliases).toContain('J. Doe') + expect(fact.sourceEntityId).toBe(jane.id) + expect(fact.relation).toBe('WORKS_FOR') + }) + + it('merges entities through the graph bridge and rewrites references', async () => { + const entities = new Map([ + ['source', makeEntity('source', 'Pat Old', 'person')], + ['target', makeEntity('target', 'Pat Canonical', 'person')], + ['acme', makeEntity('acme', 'Acme', 'organization')], + ]) + const edges = [makeEdge('edge-1', 'source', 'acme', 'WORKS_FOR')] + const mentions: MockMention[] = [{ + entityId: 'source', + sourceId: 'source-1', + chunkIndex: 0, + bucketId: 'bucket-1', + mentionType: 'entity', + surfaceText: 'Pat Old', + normalizedSurfaceText: 'pat old', + }] + const store = mockStore(entities, edges, mentions) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + + const result = await bridge.mergeEntities!({ + sourceEntityId: 'source', + targetEntityId: 'target', + properties: { reviewed: true }, + }) + + expect(result.sourceEntityId).toBe('source') + expect(result.targetEntityId).toBe('target') + expect(result.redirectedEdges).toBe(1) + expect(result.movedMentions).toBe(1) + expect(edges[0]!.sourceEntityId).toBe('target') + expect(mentions[0]!.entityId).toBe('target') + expect(entities.get('source')?.status).toBe('merged') + expect(entities.get('source')?.mergedIntoEntityId).toBe('target') + expect(entities.get('target')?.aliases).toContain('Pat Old') + expect(entities.get('target')?.properties.reviewed).toBe(true) + }) + + it('invalidates and purges entities through the graph bridge', async () => { + const entities = new Map([ + ['pat', makeEntity('pat', 'Pat', 'person')], + ['acme', makeEntity('acme', 'Acme', 'organization')], + ]) + const edges = [makeEdge('edge-1', 'pat', 'acme', 'WORKS_FOR')] + const store = mockStore(entities, edges) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + + const invalidated = await bridge.deleteEntity!('pat', { mode: 'invalidate' }) + expect(invalidated.mode).toBe('invalidate') + expect(entities.get('pat')?.status).toBe('invalidated') + expect(edges[0]!.temporal.invalidAt).toBeInstanceOf(Date) + + const defaultMode = await bridge.deleteEntity!('pat', null) + expect(defaultMode.mode).toBe('invalidate') + + const purged = await bridge.deleteEntity!('acme', { mode: 'purge' }) + expect(purged.mode).toBe('purge') + expect(entities.has('acme')).toBe(false) + }) + + it('treats null graph search opts as omitted', async () => { + const store = mockStore() + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + + await expect(bridge.searchFacts!('query', null)).resolves.toEqual([]) + await expect(bridge.getEntity!('missing', null)).resolves.toBeNull() + await expect(bridge.getEdges!('missing', null)).resolves.toEqual([]) + }) + + it('rejects external ID conflicts instead of reassigning identity', async () => { + const entities = new Map() + const store = 
mockStore(entities) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + const email: ExternalId = { + id: 'alice@example.com', + type: 'email', + } + + await bridge.upsertEntity!({ + id: 'ent_alice', + name: 'Alice', + entityType: 'person', + externalIds: [email], + }) + + await expect(bridge.upsertEntity!({ + id: 'ent_bob', + name: 'Bob', + entityType: 'person', + externalIds: [email], + })).rejects.toThrow(/External IDs resolve to entity ent_alice/) + }) + + it('resolves entity scope from entity IDs and external IDs with OR semantics', async () => { + const email: ExternalId = { id: 'pat@example.com', type: 'email' } + const github: ExternalId = { id: 'pm', type: 'github_handle' } + const entities = new Map([ + ['ent-manual', makeEntity('ent-manual', 'Manual Anchor', 'person')], + ['ent-email', { ...makeEntity('ent-email', 'Pat Email', 'person'), externalIds: [email] }], + ['ent-github', { ...makeEntity('ent-github', 'Pat GitHub', 'person'), externalIds: [github] }], + ]) + const store = mockStore(entities) + for (const entity of entities.values()) { + if (entity.externalIds?.length) { + await store.upsertEntityExternalIds!(entity.id, entity.externalIds, testScope) + } + } + const chunkEdges: SemanticEntityChunkEdge[] = [ + { + id: 'edge-manual', + entityId: 'ent-manual', + chunkRef: { bucketId: 'bucket-1', sourceId: 'source-1', chunkIndex: 0, embeddingModel: 'mock-embed' }, + weight: 1, + mentionCount: 1, + surfaceTexts: ['Manual Anchor'], + mentionTypes: ['entity'], + }, + { + id: 'edge-email', + entityId: 'ent-email', + chunkRef: { bucketId: 'bucket-1', sourceId: 'source-2', chunkIndex: 0, embeddingModel: 'mock-embed' }, + weight: 1, + mentionCount: 1, + surfaceTexts: ['Pat Email'], + mentionTypes: ['entity'], + }, + { + id: 'edge-github', + entityId: 'ent-github', + chunkRef: { bucketId: 'bucket-1', sourceId: 'source-3', chunkIndex: 0, embeddingModel: 'mock-embed' }, + weight: 1, + mentionCount: 1, + surfaceTexts: ['Pat GitHub'], + mentionTypes: ['entity'], + }, + ] + Object.assign(store, { + getChunkEdgesForEntities: vi.fn().mockImplementation(async (entityIds: string[]) => + chunkEdges.filter(edge => entityIds.includes(edge.entityId)) + ), + }) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: mockEmbedding(), + scope: testScope, + }) + + const resolved = await bridge.resolveEntityScope!({ + entityIds: ['ent-manual'], + externalIds: [email, github], + }, testScope) + + expect(resolved.entityIds).toEqual(expect.arrayContaining(['ent-manual', 'ent-email', 'ent-github'])) + expect(resolved.chunkRefs).toEqual(expect.arrayContaining([ + expect.objectContaining({ sourceId: 'source-1' }), + expect.objectContaining({ sourceId: 'source-2' }), + expect.objectContaining({ sourceId: 'source-3' }), + ])) + expect(store.getChunkEdgesForEntities).toHaveBeenCalledWith( + expect.arrayContaining(['ent-manual', 'ent-email', 'ent-github']), + expect.objectContaining({ scope: testScope }), + ) + }) + }) + + describe('addSourceSubject', () => { + it('materializes a source subject as primary-source chunk evidence', async () => { + const entities = new Map() + const mentions: MockMention[] = [] + const graphEdges: SemanticGraphEdge[] = [] + const store = mockStore(entities, [], mentions) + store.upsertGraphEdges = vi.fn().mockImplementation(async (rows: SemanticGraphEdge[]) => { + graphEdges.push(...rows) + }) + const bridge = createKnowledgeGraphBridge({ + memoryStore: store, + embedding: 
mockEmbedding(), + scope: testScope, + }) + + const entity = await bridge.addSourceSubject!({ + subject: { + name: 'Acme demo', + entityType: 'meeting', + externalIds: [{ type: 'calendar_event_id', id: 'evt_123' }], + }, + bucketId: 'bucket-1', + sourceId: 'source-1', + embeddingModel: 'mock-embed', + chunks: [ + { id: 'chunk-1', content: 'Intro.', chunkIndex: 0 }, + { id: 'chunk-2', content: 'Next steps.', chunkIndex: 1 }, + ], + tenantId: 'tenant-1', + visibility: 'tenant', + }) + + expect(entity).toEqual(expect.objectContaining({ + name: 'Acme demo', + entityType: 'meeting', + })) + expect([...entities.values()][0]!.externalIds).toEqual([ + expect.objectContaining({ + type: 'calendar_event_id', + id: 'evt_123', + }), + ]) + expect(mentions).toHaveLength(2) + expect(mentions).toEqual(expect.arrayContaining([ + expect.objectContaining({ mentionType: 'source_subject', sourceId: 'source-1', chunkIndex: 0, confidence: 1.0 }), + expect.objectContaining({ mentionType: 'source_subject', sourceId: 'source-1', chunkIndex: 1, confidence: 1.0 }), + ])) + expect(graphEdges).toHaveLength(2) + expect(graphEdges).toEqual(expect.arrayContaining([ + expect.objectContaining({ + sourceType: 'entity', + targetType: 'chunk', + relation: 'PRIMARY_SOURCE_CHUNK', + targetChunkRef: expect.objectContaining({ sourceId: 'source-1', chunkIndex: 0, chunkId: 'chunk-1' }), + visibility: 'tenant', + scope: expect.objectContaining({ tenantId: 'tenant-1' }), + }), + expect.objectContaining({ + relation: 'PRIMARY_SOURCE_CHUNK', + targetChunkRef: expect.objectContaining({ sourceId: 'source-1', chunkIndex: 1, chunkId: 'chunk-2' }), + }), + ])) + }) + }) + describe('addTriple', () => { it('creates entities, edge, and entity↔chunk mentions from a triple', async () => { const entities = new Map() @@ -171,7 +651,7 @@ describe('createKnowledgeGraphBridge', () => { sourceChunkId: 'chk-vitd-0', content: 'Vitamin D supports bone health in elderly patients.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, }) @@ -184,7 +664,7 @@ describe('createKnowledgeGraphBridge', () => { expect(edges).toHaveLength(1) const edge = edges[0]! 
- expect(edge.relation).toBe('SUPPORTED') + expect(edge.relation).toBe('SUPPORTS') expect(edge.properties.content).toBeUndefined() expect(edge.properties.bucketId).toBeUndefined() expect(edge.properties.chunkIndex).toBeUndefined() @@ -197,7 +677,7 @@ describe('createKnowledgeGraphBridge', () => { // Two mentions written to the junction (subject + object for the same chunk) expect(store.upsertEntityChunkMentions).toHaveBeenCalled() expect(mentions).toHaveLength(2) - expect(mentions.every(m => m.documentId === 'doc-1' && m.chunkIndex === 0 && m.bucketId === 'bucket-1')).toBe(true) + expect(mentions.every(m => m.sourceId === 'source-1' && m.chunkIndex === 0 && m.bucketId === 'bucket-1')).toBe(true) expect(mentions.map(m => m.mentionType).sort()).toEqual(['object', 'subject']) }) @@ -214,7 +694,7 @@ describe('createKnowledgeGraphBridge', () => { const entities = new Map() const edges: SemanticEdge[] = [] const store = mockStore(entities, edges) - store.upsertPassageEntityEdges = vi.fn().mockResolvedValue(undefined) + store.upsertGraphEdges = vi.fn().mockResolvedValue(undefined) store.upsertFactRecord = vi.fn().mockImplementation(async fact => fact) const bridge = createKnowledgeGraphBridge({ memoryStore: store, @@ -230,7 +710,7 @@ describe('createKnowledgeGraphBridge', () => { objectType: 'organization', content: `Subject ${item.visibility} leads Object ${item.visibility}.`, bucketId: 'bucket-1', - documentId: `doc-${item.visibility}`, + sourceId: `source-${item.visibility}`, chunkIndex: 0, ...item.identity, visibility: item.visibility, @@ -248,7 +728,7 @@ describe('createKnowledgeGraphBridge', () => { visibility: item.visibility, scope: expect.objectContaining(item.identity), })) - expect(store.upsertPassageEntityEdges).toHaveBeenCalledWith(expect.arrayContaining([ + expect(store.upsertGraphEdges).toHaveBeenCalledWith(expect.arrayContaining([ expect.objectContaining({ visibility: item.visibility, scope: expect.objectContaining(item.identity), @@ -271,7 +751,7 @@ describe('createKnowledgeGraphBridge', () => { type: 'organization', content: 'TypeGraph appears in group A.', bucketId: 'bucket-1', - documentId: 'doc-a', + sourceId: 'source-a', chunkIndex: 0, groupId: 'group-a', visibility: 'group', @@ -281,7 +761,7 @@ describe('createKnowledgeGraphBridge', () => { type: 'organization', content: 'TypeGraph appears in group B.', bucketId: 'bucket-1', - documentId: 'doc-b', + sourceId: 'source-b', chunkIndex: 0, groupId: 'group-b', visibility: 'group', @@ -312,7 +792,7 @@ describe('createKnowledgeGraphBridge', () => { objectType: 'person', content: 'Cæsar Simon was calling himself Cole Conway in company with Steve Sharp.', bucketId: 'bucket-1', - documentId: 'doc-47558', + sourceId: 'source-47558', chunkIndex: 24, }) @@ -337,7 +817,7 @@ describe('createKnowledgeGraphBridge', () => { expect(foundByAsciiAlias[0]).toEqual(expect.objectContaining({ name: 'Cæsar Simon' })) }) - it('does not persist self-edges after entity resolution', async () => { + it('routes alias predicates into entity aliases instead of graph edges', async () => { const entities = new Map() const edges: SemanticEdge[] = [] const store = mockStore(entities, edges) @@ -357,11 +837,13 @@ describe('createKnowledgeGraphBridge', () => { objectAliases: ['Cæsar Simon'], content: 'Cæsar Simon was known as Conway.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, }) expect(edges).toHaveLength(0) + expect(entities.size).toBe(1) + expect([...entities.values()][0]?.aliases).toContain('Conway') }) it('stores 
entity mentions even when no relationship is available', async () => { @@ -381,7 +863,7 @@ describe('createKnowledgeGraphBridge', () => { description: 'A name used by Cæsar Simon in Paducah.', content: 'At twenty years of age Cousin Cæsar was calling himself Cole Conway.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, }]) @@ -407,7 +889,7 @@ describe('createKnowledgeGraphBridge', () => { predicate: 'works at', object: 'Acme Corp', content: 'Alice works at Acme Corp.', - bucketId: 'doc-2', + bucketId: 'source-2', }) expect(edges[0]!.relation).toBe('WORKS_FOR') @@ -526,7 +1008,7 @@ describe('createKnowledgeGraphBridge', () => { predicate: 'supported', object: 'bone health', content: 'Chunk 1', - bucketId: 'doc-1', + bucketId: 'source-1', }) const firstEntityCount = entities.size @@ -537,7 +1019,7 @@ describe('createKnowledgeGraphBridge', () => { predicate: 'supported', object: 'skeletal health', content: 'Chunk 2', - bucketId: 'doc-1', + bucketId: 'source-1', }) // Should have 3 entities (Vitamin D reused, + bone health + skeletal health) @@ -549,20 +1031,20 @@ describe('createKnowledgeGraphBridge', () => { }) describe('backfill', () => { - it('creates passage nodes, passage-entity edges, fact records, and profiles from existing rows', async () => { + it('creates entity-chunk graph edges, fact records, and profiles from existing rows', async () => { const entities = new Map([ ['alice', makeEntity('alice', 'Alice', 'person')], ['beta', makeEntity('beta', 'Beta Inc', 'organization')], ]) - const edges = [makeEdge('edge-1', 'alice', 'beta', 'WORKS_AT')] + const edges = [makeEdge('edge-1', 'alice', 'beta', 'WORKS_FOR')] const store = mockStore(entities, edges) Object.assign(store, { - listPassageBackfillChunks: vi.fn().mockImplementation(async ({ offset }: { offset?: number }) => { + listChunkBackfillRecords: vi.fn().mockImplementation(async ({ offset }: { offset?: number }) => { if ((offset ?? 0) > 0) return [] return [{ chunkId: 'chk-1', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, embeddingModel: 'mock-embed', content: 'Alice works at Beta Inc.', @@ -570,13 +1052,13 @@ describe('createKnowledgeGraphBridge', () => { userId: 'test-user', }] }), - listPassageMentionBackfillRows: vi.fn().mockImplementation(async ({ offset }: { offset?: number }) => { + listChunkMentionBackfillRows: vi.fn().mockImplementation(async ({ offset }: { offset?: number }) => { if ((offset ?? 0) > 0) return [] return [ { chunkId: 'chk-1', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, embeddingModel: 'mock-embed', content: 'Alice works at Beta Inc.', @@ -590,7 +1072,7 @@ describe('createKnowledgeGraphBridge', () => { { chunkId: 'chk-1', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, embeddingModel: 'mock-embed', content: 'Alice works at Beta Inc.', @@ -607,8 +1089,7 @@ describe('createKnowledgeGraphBridge', () => { if ((offset ?? 
0) > 0) return [] return edges }), - upsertPassageNodes: vi.fn(), - upsertPassageEntityEdges: vi.fn(), + upsertGraphEdges: vi.fn(), upsertFactRecord: vi.fn().mockImplementation(async fact => fact), }) @@ -621,19 +1102,12 @@ describe('createKnowledgeGraphBridge', () => { const result = await bridge.backfill!(testScope, { batchSize: 10 }) - expect(result.passageNodesUpserted).toBe(1) - expect(result.passageEntityEdgesUpserted).toBe(2) + expect(result.entityChunkEdgesUpserted).toBe(2) expect(result.factRecordsUpserted).toBe(1) expect(result.entityProfilesUpdated).toBe(2) - expect(store.upsertPassageNodes).toHaveBeenCalledWith(expect.arrayContaining([ - expect.objectContaining({ - bucketId: 'bucket-1', - documentId: 'doc-1', - chunkIndex: 0, - }), - ])) + expect(store.upsertGraphEdges).toHaveBeenCalled() expect(store.upsertFactRecord).toHaveBeenCalledWith(expect.objectContaining({ - factText: 'Alice works at Beta Inc', + factText: 'Alice works for Beta Inc', })) }) }) @@ -698,8 +1172,8 @@ describe('createKnowledgeGraphBridge', () => { }) }) - describe('searchGraphPassages', () => { - it('returns ranked passages and keeps direct entity seeding from hybrid entity lookup', async () => { + describe('searchGraphChunks', () => { + it('returns ranked chunks and keeps direct entity seeding from hybrid entity lookup', async () => { const entities = new Map([ ['adarsh', { ...makeEntity('adarsh', 'Adarsh Tadimari', 'person'), @@ -709,7 +1183,7 @@ describe('createKnowledgeGraphBridge', () => { ]) const mentions: MockMention[] = [{ entityId: 'adarsh', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, bucketId: 'bucket-1', mentionType: 'entity', @@ -719,22 +1193,30 @@ describe('createKnowledgeGraphBridge', () => { const store = mockStore(entities, [], mentions) Object.assign(store, { searchFacts: vi.fn().mockResolvedValue([]), - searchPassageNodes: vi.fn().mockResolvedValue([]), - getPassageEdgesForEntities: vi.fn().mockResolvedValue([{ - passageId: 'passage_test', + searchChunks: vi.fn().mockResolvedValue([]), + getChunkEdgesForEntities: vi.fn().mockResolvedValue([{ + id: 'edge_chunk_test', entityId: 'adarsh', + chunkRef: { + chunkId: 'chunk_test', + bucketId: 'bucket-1', + sourceId: 'source-1', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 1.5, mentionCount: 1, confidence: 0.9, surfaceTexts: ['Adarsh'], mentionTypes: ['entity'], }]), - getPassagesByIds: vi.fn().mockResolvedValue([{ - passageId: 'passage_test', + getChunksByRefs: vi.fn().mockResolvedValue([{ + chunkId: 'chunk_test', content: 'Adarsh Tadimari is debugging Plotline SDK initialization issues.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, + embeddingModel: 'mock-embed', totalChunks: 1, metadata: { source: 'test' }, userId: 'test-user', @@ -746,27 +1228,16 @@ describe('createKnowledgeGraphBridge', () => { embedding: mockEmbedding(), scope: testScope, resolveChunksTable: () => 'typegraph_chunks_mock', - explorationLlm: { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockResolvedValue({ - sourceEntityQueries: ['Adarsh'], - targetEntityQueries: [], - predicates: [], - answerSide: 'none', - subqueries: ['Adarsh'], - mode: 'summary', - }), - }, }) - const result = await bridge.searchGraphPassages!('Adarsh', testScope, { count: 5 }) + const result = await bridge.searchGraphChunks!('Adarsh', testScope, { count: 5 }) expect(store.searchEntitiesHybrid).toHaveBeenCalledWith(expect.any(String), expect.any(Array), testScope, 5) 
expect(result.results).toHaveLength(1) expect(result.results[0]).toEqual(expect.objectContaining({ - passageId: 'passage_test', + chunkId: 'chunk_test', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, })) expect(result.results[0]!.score).toBeGreaterThan(0) @@ -774,18 +1245,18 @@ describe('createKnowledgeGraphBridge', () => { expect(result.trace.selectedEntityIds).toContain('adarsh') }) - it('attaches selected graph facts and entity names to evidence passages', async () => { + it('attaches selected graph facts and entity names to evidence chunks', async () => { const entities = new Map([ ['tennyson', makeEntity('tennyson', 'Tennyson', 'person')], - ['maud', makeEntity('maud', 'Maud', 'work_of_art')], + ['maud', makeEntity('maud', 'Maud', 'creative_work')], ]) - const edges = [makeEdge('edge-1', 'tennyson', 'maud', 'WROTE')] + const edges = [makeEdge('edge-1', 'tennyson', 'maud', 'AUTHORED')] const fact: SemanticFactRecord = { id: 'fact-1', edgeId: 'edge-1', sourceEntityId: 'tennyson', targetEntityId: 'maud', - relation: 'WROTE', + relation: 'AUTHORED', factText: 'Tennyson wrote Maud', weight: 1, evidenceCount: 1, @@ -797,31 +1268,46 @@ describe('createKnowledgeGraphBridge', () => { const store = mockStore(entities, edges) Object.assign(store, { searchFacts: vi.fn().mockResolvedValue([fact]), - searchPassageNodes: vi.fn().mockResolvedValue([]), - getPassageEdgesForEntities: vi.fn().mockResolvedValue([ + searchChunks: vi.fn().mockResolvedValue([]), + getChunkEdgesForEntities: vi.fn().mockResolvedValue([ { - passageId: 'passage_maud', + id: 'edge_chunk_maud_tennyson', entityId: 'tennyson', + chunkRef: { + chunkId: 'chunk_maud', + bucketId: 'bucket-1', + sourceId: 'source-1', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 1, mentionCount: 1, surfaceTexts: ['Tennyson'], mentionTypes: ['subject'], }, { - passageId: 'passage_maud', + id: 'edge_chunk_maud_maud', entityId: 'maud', + chunkRef: { + chunkId: 'chunk_maud', + bucketId: 'bucket-1', + sourceId: 'source-1', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 1, mentionCount: 1, surfaceTexts: ['Maud'], mentionTypes: ['object'], }, ]), - getPassagesByIds: vi.fn().mockResolvedValue([{ - passageId: 'passage_maud', + getChunksByRefs: vi.fn().mockResolvedValue([{ + chunkId: 'chunk_maud', content: 'A tiny shell was moralised over by Tennyson in Maud.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, + embeddingModel: 'mock-embed', totalChunks: 1, metadata: { source: 'test' }, userId: 'test-user', @@ -833,20 +1319,9 @@ describe('createKnowledgeGraphBridge', () => { embedding: mockEmbedding(), scope: testScope, resolveChunksTable: () => 'typegraph_chunks_mock', - explorationLlm: { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockResolvedValue({ - sourceEntityQueries: [], - targetEntityQueries: ['Maud'], - predicates: [{ name: 'WROTE', confidence: 0.95 }], - answerSide: 'source', - subqueries: ['who wrote Maud'], - mode: 'fact', - }), - }, }) - const result = await bridge.searchGraphPassages!('Who moralised Maud?', testScope, { count: 5 }) + const result = await bridge.searchGraphChunks!('Who wrote Maud?', testScope, { count: 5 }) expect(result.facts).toEqual([expect.objectContaining({ id: 'fact-1', factText: 'Tennyson wrote Maud' })]) expect(result.facts[0]!.properties?.relevanceScore).toEqual(expect.any(Number)) @@ -856,7 +1331,7 @@ describe('createKnowledgeGraphBridge', () => { ])) 
expect(result.results[0]).not.toHaveProperty('facts') expect(result.results[0]).not.toHaveProperty('entities') - expect(result.trace.finalPassageIds).toEqual(['passage_maud']) + expect(result.trace.finalChunkIds).toEqual(['chunk_maud']) expect(result.trace.selectedFactTexts).toEqual([{ id: 'fact-1', content: 'Tennyson wrote Maud' }]) expect(result.trace.selectedEntityNames).toEqual(expect.arrayContaining([ { id: 'tennyson', content: 'Tennyson' }, @@ -867,12 +1342,12 @@ describe('createKnowledgeGraphBridge', () => { it('ranks exact entity-constrained facts ahead of weaker adjacent facts and emits chains', async () => { const entities = new Map([ ['tennyson', makeEntity('tennyson', 'Tennyson', 'person')], - ['maud', makeEntity('maud', 'Maud', 'work_of_art')], + ['maud', makeEntity('maud', 'Maud', 'creative_work')], ['shell', makeEntity('shell', 'Tiny shell', 'object')], ['lizard', makeEntity('lizard', 'Lizard', 'place')], ]) const edges = [ - makeEdge('edge-1', 'tennyson', 'maud', 'WROTE'), + makeEdge('edge-1', 'tennyson', 'maud', 'AUTHORED'), makeEdge('edge-2', 'maud', 'shell', 'MORALISED'), makeEdge('edge-3', 'lizard', 'shell', 'CONTAINS'), ] @@ -896,7 +1371,7 @@ describe('createKnowledgeGraphBridge', () => { edgeId: 'edge-1', sourceEntityId: 'tennyson', targetEntityId: 'maud', - relation: 'WROTE', + relation: 'AUTHORED', factText: 'Tennyson wrote Maud', weight: 1, evidenceCount: 1, @@ -923,50 +1398,73 @@ describe('createKnowledgeGraphBridge', () => { const store = mockStore(entities, edges) Object.assign(store, { searchFacts: vi.fn().mockResolvedValue(facts), - searchPassageNodes: vi.fn().mockResolvedValue([]), - getPassageEdgesForEntities: vi.fn().mockResolvedValue([ + searchChunks: vi.fn().mockResolvedValue([]), + getChunkEdgesForEntities: vi.fn().mockResolvedValue([ { - passageId: 'passage_maud', + id: 'edge_chunk_maud_tennyson', entityId: 'tennyson', + chunkRef: { + chunkId: 'chunk_maud', + bucketId: 'bucket-1', + sourceId: 'source-1', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 1, mentionCount: 1, surfaceTexts: ['Tennyson'], mentionTypes: ['subject'], }, { - passageId: 'passage_maud', + id: 'edge_chunk_maud_maud', entityId: 'maud', + chunkRef: { + chunkId: 'chunk_maud', + bucketId: 'bucket-1', + sourceId: 'source-1', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 1, mentionCount: 1, surfaceTexts: ['Maud'], mentionTypes: ['object'], }, { - passageId: 'passage_shell', + id: 'edge_chunk_shell', entityId: 'shell', + chunkRef: { + chunkId: 'chunk_shell', + bucketId: 'bucket-1', + sourceId: 'source-2', + chunkIndex: 0, + embeddingModel: 'mock-embed', + }, weight: 0.4, mentionCount: 1, surfaceTexts: ['shell'], mentionTypes: ['object'], }, ]), - getPassagesByIds: vi.fn().mockResolvedValue([ + getChunksByRefs: vi.fn().mockResolvedValue([ { - passageId: 'passage_maud', + chunkId: 'chunk_maud', content: 'A tiny shell was moralised over by Tennyson in Maud.', bucketId: 'bucket-1', - documentId: 'doc-1', + sourceId: 'source-1', chunkIndex: 0, + embeddingModel: 'mock-embed', totalChunks: 1, metadata: { source: 'test' }, userId: 'test-user', }, { - passageId: 'passage_shell', + chunkId: 'chunk_shell', content: 'The Lizard coast contains shells.', bucketId: 'bucket-1', - documentId: 'doc-2', + sourceId: 'source-2', chunkIndex: 0, + embeddingModel: 'mock-embed', totalChunks: 1, metadata: { source: 'test' }, userId: 'test-user', @@ -979,20 +1477,9 @@ describe('createKnowledgeGraphBridge', () => { embedding: mockEmbedding(), scope: testScope, resolveChunksTable: () => 
'typegraph_chunks_mock', - explorationLlm: { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockResolvedValue({ - sourceEntityQueries: [], - targetEntityQueries: ['Maud'], - predicates: [{ name: 'WROTE', confidence: 0.95 }], - answerSide: 'source', - subqueries: ['who wrote Maud'], - mode: 'fact', - }), - }, }) - const result = await bridge.searchGraphPassages!('Who wrote Maud?', testScope, { + const result = await bridge.searchGraphChunks!('Who wrote Maud?', testScope, { count: 5, factCandidateLimit: 3, factChainLimit: 2, @@ -1008,7 +1495,7 @@ describe('createKnowledgeGraphBridge', () => { }) describe('explore graph intent V2', () => { - it('uses LLM source/target intent to keep only matching killer facts', async () => { + it('uses deterministic source/target intent to keep only matching killer facts', async () => { const entities = new Map([ ['aac', makeEntity('aac', 'Aac', 'person')], ['chaacmol', makeEntity('chaacmol', 'Chaacmol', 'person')], @@ -1020,29 +1507,18 @@ describe('createKnowledgeGraphBridge', () => { makeEdge('edge-married', 'chaacmol', 'moo', 'MARRIED'), ] const store = mockStore(entities, edges) - const explorationLlm = { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockResolvedValue({ - sourceEntityQueries: [], - targetEntityQueries: ['Chaacmol'], - predicates: [{ name: 'KILLED', confidence: 0.98 }], - answerSide: 'source', - subqueries: ['who killed Chaacmol'], - mode: 'fact', - }), - } const bridge = createKnowledgeGraphBridge({ memoryStore: store, embedding: mockEmbedding(), scope: testScope, - explorationLlm, }) const result = await bridge.explore!('Who killed Chaacmol?', { userId: 'test-user', explain: true }) - expect(result.trace?.parser).toBe('llm') + expect(result.trace?.parser).toBe('deterministic') expect(result.intent.targetEntityQueries).toEqual(['Chaacmol']) expect(result.intent.predicates.map(predicate => predicate.name)).toEqual(['KILLED']) + expect(result.intent.strictness).toBe('strict') expect(result.facts.map(fact => fact.edgeId)).toEqual(['edge-killed']) expect(result.entities.map(entity => entity.name).sort()).toEqual(['Aac', 'Chaacmol']) }) @@ -1059,22 +1535,10 @@ describe('createKnowledgeGraphBridge', () => { makeEdge('edge-married', 'moo', 'chaacmol', 'MARRIED'), ] const store = mockStore(entities, edges) - const explorationLlm = { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockResolvedValue({ - sourceEntityQueries: ['Chaacmol'], - targetEntityQueries: [], - predicates: [{ name: 'WIFE_OF', confidence: 0.98 }], - answerSide: 'target', - subqueries: ['Chaacmol wife spouse married'], - mode: 'fact', - }), - } const bridge = createKnowledgeGraphBridge({ memoryStore: store, embedding: mockEmbedding(), scope: testScope, - explorationLlm, }) const result = await bridge.explore!('Who is Chaacmol wife?', { userId: 'test-user', explain: true }) diff --git a/packages/sdk/src/graph/__tests__/query-intent.test.ts b/packages/sdk/src/graph/__tests__/query-intent.test.ts index aaaec3b..81ef7a4 100644 --- a/packages/sdk/src/graph/__tests__/query-intent.test.ts +++ b/packages/sdk/src/graph/__tests__/query-intent.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest' import { parseGraphQueryIntent } from '../query-intent.js' import type { LLMProvider } from '../../types/llm-provider.js' +import type { GraphQueryIntent } from '../../types/graph-bridge.js' function mockLlm(output: unknown): LLMProvider { return { @@ -9,141 +10,517 @@ function mockLlm(output: unknown): 
LLMProvider { } } +function expectNoAnswerSide(intent: GraphQueryIntent): void { + expect('answerSide' in intent).toBe(false) +} + +function predicateNames(intent: GraphQueryIntent): string[] { + return intent.predicates.map(predicate => predicate.name) +} + describe('parseGraphQueryIntent', () => { - it('returns empty intent when no exploration LLM is configured', async () => { - const result = await parseGraphQueryIntent({ query: 'Who killed Chaacmol?' }) + it.each([ + { + query: 'Who killed Chaacmol?', + source: [], + target: ['Chaacmol'], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who did Aac kill?', + source: ['Aac'], + target: [], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who was Chaacmol killed by?', + source: [], + target: ['Chaacmol'], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Chaacmol was killed by whom?', + source: [], + target: ['Chaacmol'], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who murdered Julius Caesar?', + source: [], + target: ['Julius Caesar'], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who was Julius Caesar assassinated by?', + source: [], + target: ['Julius Caesar'], + predicates: ['KILLED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's wife?", + source: ['Chaacmol'], + target: [], + predicates: ['MARRIED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's husband?", + source: ['Chaacmol'], + target: [], + predicates: ['MARRIED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who is Chaacmol wife?', + source: ['Chaacmol'], + target: [], + predicates: ['MARRIED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who was Marie Curie married to?', + source: ['Marie Curie'], + target: [], + predicates: ['MARRIED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who is the spouse of Barack Obama?', + source: ['Barack Obama'], + target: [], + predicates: ['MARRIED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who are Chaacmol's parents?", + source: [], + target: ['Chaacmol'], + predicates: ['PARENT_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's father?", + source: [], + target: ['Chaacmol'], + predicates: ['PARENT_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's mother?", + source: [], + target: ['Chaacmol'], + predicates: ['PARENT_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who are CAN's children?", + source: [], + target: ['CAN'], + predicates: ['CHILD_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who is the son of CAN?', + source: [], + target: ['CAN'], + predicates: ['CHILD_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who did CAN father?', + source: ['CAN'], + target: [], + predicates: ['PARENT_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's brother?", + source: ['Chaacmol'], + target: [], + predicates: ['SIBLING_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who is Chaacmol's sister?", + source: ['Chaacmol'], + target: [], + predicates: ['SIBLING_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: "Who are Chaacmol's siblings?", + source: ['Chaacmol'], + target: [], + predicates: ['SIBLING_OF'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who 
wrote Frankenstein?', + source: [], + target: ['Frankenstein'], + predicates: ['AUTHORED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What did Mary Shelley write?', + source: ['Mary Shelley'], + target: [], + predicates: ['AUTHORED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who authored Pride and Prejudice?', + source: [], + target: ['Pride and Prejudice'], + predicates: ['AUTHORED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What books did Charles Dickens write?', + source: ['Charles Dickens'], + target: [], + predicates: ['AUTHORED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who founded Stripe?', + source: [], + target: ['Stripe'], + predicates: ['FOUNDED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What did Patrick Collison found?', + source: ['Patrick Collison'], + target: [], + predicates: ['FOUNDED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who established Uxmal?', + source: [], + target: ['Uxmal'], + predicates: ['FOUNDED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What company did Steve Jobs co-found?', + source: ['Steve Jobs'], + target: [], + predicates: ['FOUNDED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Where was Marie Curie born?', + source: ['Marie Curie'], + target: [], + predicates: ['LOCATED_IN'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Where did Albert Einstein die?', + source: ['Albert Einstein'], + target: [], + predicates: ['LOCATED_IN'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Where is OpenAI headquartered?', + source: ['OpenAI'], + target: [], + predicates: ['HEADQUARTERED_IN'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What city is the Eiffel Tower located in?', + source: ['Eiffel Tower'], + target: [], + predicates: ['LOCATED_IN'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who leads Microsoft?', + source: [], + target: ['Microsoft'], + predicates: ['LEADS'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'What organization does Sam Altman lead?', + source: ['Sam Altman'], + target: [], + predicates: ['LEADS'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Who works for OpenAI?', + source: [], + target: ['OpenAI'], + predicates: ['WORKS_FOR'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Where does Alice work?', + source: ['Alice'], + target: [], + predicates: ['WORKS_FOR'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Summarize the relationship between Aac and Chaacmol', + source: ['Aac'], + target: ['Chaacmol'], + predicates: [], + mode: 'summary', + strictness: 'soft', + }, + { + query: 'How are Kubernetes and Docker related?', + source: ['Kubernetes'], + target: ['Docker'], + predicates: [], + mode: 'relationship', + strictness: 'soft', + }, + { + query: 'What connects Tesla and Edison?', + source: ['Tesla'], + target: ['Edison'], + predicates: [], + mode: 'relationship', + strictness: 'soft', + }, + { + query: "Write a diary entry from Elizabeth Bennet's perspective about Darcy", + source: ['Elizabeth Bennet', 'Darcy'], + target: [], + predicates: [], + mode: 'creative', + strictness: 'soft', + }, + { + query: 'Imagine a letter from Aac to Chaacmol', + source: ['Aac', 'Chaacmol'], + target: [], + predicates: [], + mode: 'creative', + strictness: 'soft', + }, + { + query: 'Tell me about Chaacmol', + source: ['Chaacmol'], + target: [], + predicates: [], + mode: 'summary', + strictness: 'soft', + }, 
+ { + query: 'What do we know about Uxmal?', + source: ['Uxmal'], + target: [], + predicates: [], + mode: 'summary', + strictness: 'soft', + }, + { + query: 'Who wrote "The Great Gatsby"?', + source: [], + target: ['The Great Gatsby'], + predicates: ['AUTHORED'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Tell me about CAN', + source: ['CAN'], + target: [], + predicates: [], + mode: 'summary', + strictness: 'soft', + }, + { + query: 'Tell me about the Mayas’ temples', + source: ['Mayas'], + target: [], + predicates: [], + mode: 'summary', + strictness: 'soft', + }, + { + query: 'Where is Worcester, Mass. located?', + source: ['Worcester, Mass'], + target: [], + predicates: ['LOCATED_IN'], + mode: 'fact', + strictness: 'strict', + }, + { + query: 'Where is Chichen-Itza located?', + source: ['Chichen-Itza'], + target: [], + predicates: ['LOCATED_IN'], + mode: 'fact', + strictness: 'strict', + }, + ])('deterministically parses $query', async ({ query, source, target, predicates, mode, strictness }) => { + const result = await parseGraphQueryIntent({ query }) + + expect(result.parser).toBe('deterministic') + expect(result.intent.sourceEntityQueries).toEqual(source) + expect(result.intent.targetEntityQueries).toEqual(target) + expect(predicateNames(result.intent)).toEqual(predicates) + expect(result.intent.mode).toBe(mode) + expect(result.intent.strictness).toBe(strictness) + expect(result.intent.subqueries.length).toBeGreaterThan(0) + expectNoAnswerSide(result.intent) + }) + + it.each([ + 'what is going on here?', + 'summarize this', + 'why does this matter?', + 'compare the two chunks', + ])('returns no parser for non-actionable graph query %s', async (query) => { + const result = await parseGraphQueryIntent({ query }) expect(result.parser).toBe('none') - expect(result.fallbackUsed).toBe(false) + expect(result.intent.strictness).toBe('none') expect(result.intent.sourceEntityQueries).toEqual([]) expect(result.intent.targetEntityQueries).toEqual([]) expect(result.intent.predicates).toEqual([]) - expect(result.intent.answerSide).toBe('none') + expectNoAnswerSide(result.intent) }) - it('parses parent questions into target anchor, PARENT_OF, and source answer side', async () => { - const result = await parseGraphQueryIntent({ - query: 'Who are Chaacmol parents?', - llm: mockLlm({ - sourceEntityQueries: [], - targetEntityQueries: ['Chaacmol'], - predicates: [{ name: 'PARENT_OF', confidence: 0.98 }], - answerSide: 'source', - subqueries: ['Chaacmol parents'], - mode: 'fact', - }), + it('marks symmetric predicate aliases as symmetric', async () => { + const spouse = await parseGraphQueryIntent({ query: "Who is Chaacmol's wife?" }) + const sibling = await parseGraphQueryIntent({ query: "Who is Chaacmol's brother?" 
}) + + expect(spouse.intent.predicates).toEqual([expect.objectContaining({ name: 'MARRIED', symmetric: true })]) + expect(sibling.intent.predicates).toEqual([expect.objectContaining({ name: 'SIBLING_OF', symmetric: true })]) + }) + + it('uses deterministic mode by default and does not call an available LLM', async () => { + const llm = mockLlm({ + sourceEntityQueries: [], + targetEntityQueries: ['Wrong'], + predicates: [{ name: 'KILLED', confidence: 0.9 }], + subqueries: ['wrong'], + mode: 'fact', + strictness: 'strict', }) - expect(result.parser).toBe('llm') - expect(result.fallbackUsed).toBe(false) + const result = await parseGraphQueryIntent({ query: 'Who killed Chaacmol?', llm }) + + expect(result.parser).toBe('deterministic') expect(result.intent.targetEntityQueries).toEqual(['Chaacmol']) - expect(result.intent.predicates).toEqual([{ name: 'PARENT_OF', confidence: 0.98, symmetric: false }]) - expect(result.intent.answerSide).toBe('source') + expect(llm.generateJSON).not.toHaveBeenCalled() }) - it('parses active killer questions into source anchor and target answer side', async () => { + it('calls the LLM only when llm parser mode is requested', async () => { + const llm = mockLlm({ + sourceEntityQueries: ['Aac'], + targetEntityQueries: [], + predicates: [{ name: 'KILLED', confidence: 0.97 }], + subqueries: ['Aac killed'], + mode: 'fact', + strictness: 'strict', + }) + const result = await parseGraphQueryIntent({ query: 'Who did Aac kill?', - llm: mockLlm({ - sourceEntityQueries: ['Aac'], - targetEntityQueries: [], - predicates: [{ name: 'KILLED', confidence: 0.97 }], - answerSide: 'target', - subqueries: ['Aac killed'], - mode: 'fact', - }), + mode: 'llm', + llm, }) + expect(result.parser).toBe('llm') expect(result.intent.sourceEntityQueries).toEqual(['Aac']) - expect(result.intent.targetEntityQueries).toEqual([]) - expect(result.intent.predicates.map((predicate) => predicate.name)).toEqual(['KILLED']) - expect(result.intent.answerSide).toBe('target') + expect(predicateNames(result.intent)).toEqual(['KILLED']) + expect(llm.generateJSON).toHaveBeenCalledTimes(1) + expectNoAnswerSide(result.intent) }) - it('parses passive killer questions into target anchor and source answer side', async () => { - const result = await parseGraphQueryIntent({ - query: 'Who was Chaacmol killed by?', - llm: mockLlm({ - sourceEntityQueries: [], - targetEntityQueries: ['Chaacmol'], - predicates: [{ name: 'KILLED', confidence: 0.97 }], - answerSide: 'source', - subqueries: ['Chaacmol killed by'], - mode: 'fact', - }), - }) - - expect(result.intent.sourceEntityQueries).toEqual([]) - expect(result.intent.targetEntityQueries).toEqual(['Chaacmol']) - expect(result.intent.predicates.map((predicate) => predicate.name)).toEqual(['KILLED']) - expect(result.intent.answerSide).toBe('source') - }) + it('returns no parser when llm mode fails and does not use deterministic fallback', async () => { + const llm: LLMProvider = { + generateText: vi.fn().mockResolvedValue(''), + generateJSON: vi.fn().mockRejectedValue(new Error('bad JSON')), + } - it('normalizes gendered spouse predicates to MARRIED', async () => { - const result = await parseGraphQueryIntent({ - query: 'Who is Chaacmol wife?', - llm: mockLlm({ - sourceEntityQueries: ['Chaacmol'], - targetEntityQueries: [], - predicates: [{ name: 'WIFE_OF', confidence: 0.91 }], - answerSide: 'target', - subqueries: ['Chaacmol wife spouse married'], - mode: 'fact', - }), - }) + const result = await parseGraphQueryIntent({ query: 'Who killed Chaacmol?', mode: 'llm', llm }) - 
expect(result.intent.sourceEntityQueries).toEqual(['Chaacmol']) - expect(result.intent.predicates).toEqual([{ name: 'MARRIED', confidence: 0.91, symmetric: true }]) - expect(result.intent.answerSide).toBe('target') + expect(result.parser).toBe('none') + expect(result.intent.strictness).toBe('none') + expect(result.intent.targetEntityQueries).toEqual([]) }) - it('parses sibling questions into symmetric sibling intent', async () => { - const result = await parseGraphQueryIntent({ - query: 'Who is Chaacmol brother?', - llm: mockLlm({ - sourceEntityQueries: ['Chaacmol'], - targetEntityQueries: [], - predicates: [{ name: 'BROTHER_OF', confidence: 0.93 }], - answerSide: 'target', - subqueries: ['Chaacmol brother sibling'], - mode: 'fact', - }), - }) + it('returns no parser when parser mode is none', async () => { + const llm = mockLlm({}) - expect(result.intent.sourceEntityQueries).toEqual(['Chaacmol']) - expect(result.intent.predicates).toEqual([{ name: 'SIBLING_OF', confidence: 0.93, symmetric: true }]) - expect(result.intent.answerSide).toBe('target') + const result = await parseGraphQueryIntent({ query: 'Who killed Chaacmol?', mode: 'none', llm }) + + expect(result.parser).toBe('none') + expect(result.intent.strictness).toBe('none') + expect(llm.generateJSON).not.toHaveBeenCalled() }) - it('drops predicates that are not in the ontology', async () => { + it('drops invalid LLM predicates and traces them', async () => { const result = await parseGraphQueryIntent({ query: 'What happened in Chaacmol funeral chamber?', + mode: 'llm', llm: mockLlm({ sourceEntityQueries: ['Chaacmol'], targetEntityQueries: ['funeral chamber'], predicates: [{ name: 'FUNERAL_CHAMBER_IN', confidence: 0.9 }], - answerSide: 'either', subqueries: ['Chaacmol funeral chamber'], mode: 'summary', + strictness: 'soft', }), }) + expect(result.parser).toBe('llm') expect(result.intent.predicates).toEqual([]) - expect(result.intent.sourceEntityQueries).toEqual(['Chaacmol']) - expect(result.intent.targetEntityQueries).toEqual(['funeral chamber']) - }) - - it('returns empty graph intent when LLM parsing fails', async () => { - const llm: LLMProvider = { - generateText: vi.fn().mockResolvedValue(''), - generateJSON: vi.fn().mockRejectedValue(new Error('bad JSON')), - } - - const result = await parseGraphQueryIntent({ query: 'Who killed Chaacmol?', llm }) - - expect(result.parser).toBe('none') - expect(result.fallbackUsed).toBe(false) - expect(result.intent.predicates).toEqual([]) - expect(result.intent.answerSide).toBe('none') + expect(result.rejectedPredicates).toEqual(['FUNERAL_CHAMBER_IN']) + expectNoAnswerSide(result.intent) }) }) diff --git a/packages/sdk/src/graph/graph-bridge.ts b/packages/sdk/src/graph/graph-bridge.ts index e9ce611..68d5f3e 100644 --- a/packages/sdk/src/graph/graph-bridge.ts +++ b/packages/sdk/src/graph/graph-bridge.ts @@ -4,9 +4,12 @@ import { embeddingModelKey } from '../embedding/provider.js' import type { typegraphIdentity } from '../types/identity.js' import type { EmbeddingConfig } from '../types/bucket.js' import type { LLMConfig, LLMProvider } from '../types/llm-provider.js' -import type { KnowledgeGraphBridge, EntityDetail, EntityResult, EdgeResult, FactChainResult, FactRelevanceFilter, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphExploreTrace, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchOpts, GraphSearchResult, GraphSearchTrace, PassageResult, SubgraphOpts, SubgraphResult, GraphStats, GraphQueryIntent } from '../types/graph-bridge.js' +import type { 
KnowledgeGraphBridge, EntityDetail, EntityResult, EdgeResult, FactChainResult, FactRelevanceFilter, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphExploreTrace, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchOpts, GraphSearchResult, GraphSearchTrace, ChunkResult, SubgraphOpts, SubgraphResult, GraphStats, GraphQueryIntent, GraphEntityRef, UpsertGraphEdgeInput, UpsertGraphEntityInput, UpsertGraphFactInput, EntityScopeResolution, KnowledgeSearchOpts, KnowledgeSearchResult, MergeGraphEntitiesInput, MergeGraphEntitiesResult, DeleteGraphEntityOpts, DeleteGraphEntityResult } from '../types/graph-bridge.js' import { resolveEmbeddingProvider, resolveLLMProvider } from '../typegraph.js' -import type { MemoryStoreAdapter, SemanticEdge, SemanticEntity, SemanticEntityMention, SemanticFactRecord, SemanticPassageEntityEdge } from '../memory/types/index.js' +import type { ExternalId, MemoryStoreAdapter, SemanticEdge, SemanticEntity, SemanticEntityMention, SemanticEntityChunkEdge, SemanticFactRecord, SemanticGraphEdge } from '../memory/types/index.js' +import type { ChunkRef } from '../types/chunk.js' +import type { SourceSubject } from '../types/connector.js' +import { ConfigError } from '../types/errors.js' import { EntityResolver, PredicateNormalizer, createTemporal } from '../memory/index.js' import { EmbeddedGraph } from './graph/embedded-graph.js' import { parseGraphQueryIntent } from './query-intent.js' @@ -15,7 +18,16 @@ import { buildFactSearchText, formatFactEvidence, } from './retrieval-primitives.js' +import { optionalCompactObject, requiredObject } from '../utils/input.js' import { isSymmetricPredicate } from '../memory/extraction/predicate-normalizer.js' +import { + ALIAS_ASSIGNMENT_CUES, + DEFAULT_ENTITY_TYPE, + GENERIC_DISALLOWED_PREDICATES, + sanitizePredicate, + validatePredicateTypes, + type PredicateTemporalStatus, +} from '../index-engine/ontology.js' // ── Config ── @@ -27,9 +39,8 @@ export interface CreateKnowledgeGraphBridgeConfig { scope?: typegraphIdentity /** * Resolves an embedding model key to the Postgres chunks table that holds - * its embeddings. Required for heterogeneous graph retrieval — the bridge - * JOINs persisted passage nodes back to the per-model chunks table to - * retrieve source text. Typically wired to `vectorAdapter.getTable(model)`. + * its embeddings. Required for heterogeneous graph retrieval over chunks. + * Typically wired to `vectorAdapter.getTable(model)`. */ resolveChunksTable?: (model: string) => string | Promise factRelevanceFilter?: FactRelevanceFilter | undefined @@ -56,10 +67,6 @@ function stableGraphId(prefix: string, parts: Array return `${prefix}_${hash}` } -function contentHashFor(value: string): string { - return createHash('sha256').update(value).digest('hex') -} - function mergeScope(defaultScope: typegraphIdentity, override?: typegraphIdentity): typegraphIdentity { return { tenantId: override?.tenantId ?? defaultScope.tenantId, @@ -70,21 +77,10 @@ function mergeScope(defaultScope: typegraphIdentity, override?: typegraphIdentit } } -function passageIdFor(input: { - scope: typegraphIdentity - bucketId: string - documentId: string - chunkIndex: number - embeddingModel: string -}): string { - return stableGraphId('passage', [ - input.scope.tenantId, - input.scope.groupId, - input.scope.userId, - input.scope.agentId, - input.scope.conversationId, +function chunkNodeIdFor(input: ChunkRef): string { + return input.chunkId ?? 
stableGraphId('chunk', [ input.bucketId, - input.documentId, + input.sourceId, input.chunkIndex, input.embeddingModel, ]) @@ -104,6 +100,10 @@ function cleanOptionalText(value: unknown): string | undefined { return cleaned ? cleaned : undefined } +function isAliasAssignmentRelation(predicate: string): boolean { + return ALIAS_ASSIGNMENT_CUES.has(sanitizePredicate(predicate)) +} + function propertyString(properties: Record | undefined, key: string): string | undefined { return cleanOptionalText(properties?.[key]) } @@ -127,7 +127,7 @@ function normalizeSeedScore(value: number): number { function buildEntityMentions(input: { entityId: string - documentId: string + sourceId: string chunkIndex: number bucketId: string mentionType: SemanticEntityMention['mentionType'] @@ -147,7 +147,7 @@ function buildEntityMentions(input: { seen.add(key) rows.push({ entityId: input.entityId, - documentId: input.documentId, + sourceId: input.sourceId, chunkIndex: input.chunkIndex, bucketId: input.bucketId, mentionType, @@ -162,6 +162,41 @@ function buildEntityMentions(input: { return rows } +function buildEntityChunkGraphEdge(input: { + entityId: string + chunkRef: ChunkRef + relation?: string | undefined + weight: number + mentionCount: number + confidence?: number | undefined + surfaceTexts: string[] + mentionTypes: SemanticEntityMention['mentionType'][] + scope: typegraphIdentity + visibility?: import('../types/source.js').Visibility | undefined +}): SemanticGraphEdge { + const chunkId = chunkNodeIdFor(input.chunkRef) + return { + id: stableGraphId('edge', ['entity', input.entityId, input.relation ?? 'MENTIONED_IN', 'chunk', chunkId]), + sourceType: 'entity', + sourceId: input.entityId, + targetType: 'chunk', + targetId: chunkId, + relation: input.relation ?? 'MENTIONED_IN', + weight: input.weight, + properties: { + mentionCount: input.mentionCount, + confidence: input.confidence, + surfaceTexts: input.surfaceTexts, + mentionTypes: input.mentionTypes, + }, + scope: input.scope, + visibility: input.visibility, + temporal: createTemporal(), + evidence: [], + targetChunkRef: input.chunkRef, + } +} + // ── Knowledge Graph Bridge Factory ── /** @@ -180,12 +215,6 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon const resolver = new EntityResolver({ store: memoryStore, embedding }) const predicateNormalizer = new PredicateNormalizer(embedding) - // Generic predicates that add noise without information — filter these out - const GENERIC_PREDICATES = new Set([ - 'IS', 'IS_A', 'IS_AN', 'HAS', 'HAS_A', 'RELATED_TO', 'INVOLVES', - 'MENTIONED', 'ASSOCIATED_WITH', - ]) - function uniqueIds(ids: string[]): string[] { return [...new Set(ids)] } @@ -377,13 +406,21 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon ]).join('\n') } + function chunkIntentSearchText(intent: GraphQueryIntent, fallbackQuery: string): string { + return uniqueIds([ + fallbackQuery, + ...intent.sourceEntityQueries, + ...intent.targetEntityQueries, + ]).join('\n') + } + function graphIntentIsEmpty(intent: GraphQueryIntent): boolean { return ( intent.sourceEntityQueries.length === 0 && intent.targetEntityQueries.length === 0 && intent.predicates.length === 0 && intent.subqueries.length === 0 && - intent.answerSide === 'none' + intent.strictness === 'none' ) } @@ -512,6 +549,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon name: entity.name, entityType: entity.entityType, aliases: entity.aliases, + externalIds: entity.externalIds, ...(typeof 
(similarityById?.get(entity.id) ?? inlineSimilarity) === 'number' ? { similarity: similarityById?.get(entity.id) ?? inlineSimilarity } : {}), @@ -581,6 +619,404 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon const chunkEntityMap = new Map>() const directEdgePairs = new Set() + function scopeFrom(input?: typegraphIdentity): typegraphIdentity { + return mergeScope(defaultScope, { + tenantId: input?.tenantId, + groupId: input?.groupId, + userId: input?.userId, + agentId: input?.agentId, + conversationId: input?.conversationId, + }) + } + + function mergeSeedScope(parent: typegraphIdentity | undefined, child?: typegraphIdentity): typegraphIdentity { + return mergeScope(parent ? scopeFrom(parent) : defaultScope, { + tenantId: child?.tenantId, + groupId: child?.groupId, + userId: child?.userId, + agentId: child?.agentId, + conversationId: child?.conversationId, + }) + } + + function normalizeExternalId(input: ExternalId | null | undefined): ExternalId | undefined { + if (!input || typeof input !== 'object' || Array.isArray(input)) return undefined + if (typeof input.type !== 'string' || typeof input.id !== 'string') return undefined + const type = input.type.trim().toLowerCase() + const id = normalizeExternalIdValue(input.id, type, input.encoding ?? 'none') + if (!id || !type) return undefined + return { + ...input, + id, + type, + encoding: input.encoding ?? 'none', + } + } + + function normalizeExternalIdValue(id: string, type: string, encoding: ExternalId['encoding']): string { + const trimmed = id.trim() + if (encoding === 'sha256') return trimmed.toLowerCase() + if (type === 'email' || type.endsWith('_email') || type === 'github_handle') return trimmed.toLowerCase() + if (type === 'phone') return trimmed.replace(/[^\d+]/g, '') + return trimmed + } + + function externalIdKey(externalId: ExternalId): string { + return [ + externalId.type.trim().toLowerCase(), + externalId.id.trim(), + externalId.encoding ?? 'none', + ].join('|') + } + + function normalizeExternalIds(externalIds: Array | undefined): ExternalId[] { + const byKey = new Map() + for (const externalId of externalIds ?? []) { + const normalized = normalizeExternalId(externalId) + if (!normalized) continue + byKey.set(externalIdKey(normalized), normalized) + } + return [...byKey.values()] + } + + function normalizeSubjectExternalIds(subject: SourceSubject): ExternalId[] { + return normalizeExternalIds(subject.externalIds) + } + + function mergeExternalIds( + existing: ExternalId[] | undefined, + incoming: ExternalId[] | undefined, + ): ExternalId[] | undefined { + const merged = new Map() + for (const externalId of normalizeExternalIds(existing)) { + merged.set(externalIdKey(externalId), externalId) + } + for (const externalId of normalizeExternalIds(incoming)) { + merged.set(externalIdKey(externalId), externalId) + } + return merged.size > 0 ? [...merged.values()] : undefined + } + + function refExternalIds(ref: GraphEntityRef): ExternalId[] { + return normalizeExternalIds([ + ...(ref.externalId ? [ref.externalId] : []), + ...(ref.externalIds ?? 
[]), + ]) + } + + async function findEntityByExternalIds( + externalIds: ExternalId[], + scope: typegraphIdentity, + ): Promise { + if (!memoryStore.findEntityByExternalId || externalIds.length === 0) return undefined + let found: SemanticEntity | undefined + for (const externalId of externalIds) { + const entity = await memoryStore.findEntityByExternalId(externalId, scope) + if (!entity) continue + if (found && found.id !== entity.id) { + throw new Error(`External IDs resolve to multiple entities: ${found.id} and ${entity.id}`) + } + found = entity + } + return found + } + + async function linkExternalIdsToEntity( + entityId: string, + externalIds: ExternalId[], + scope: typegraphIdentity, + ): Promise { + const normalized = normalizeExternalIds(externalIds) + if (normalized.length === 0) return + if (!memoryStore.upsertEntityExternalIds) { + throw new Error('MemoryStoreAdapter does not support deterministic entity external IDs') + } + if (memoryStore.findEntityByExternalId) { + for (const externalId of normalized) { + const existing = await memoryStore.findEntityByExternalId(externalId, scope) + if (existing && existing.id !== entityId) { + throw new Error( + `External ID ${externalId.type}:${externalId.id} is already linked to entity ${existing.id}`, + ) + } + } + } + await memoryStore.upsertEntityExternalIds(entityId, normalized, scope) + } + + function entityResultFromSemanticEntity(entity: SemanticEntity, edgeCount: number): EntityDetail { + return { + id: entity.id, + name: entity.name, + entityType: entity.entityType, + aliases: entity.aliases, + externalIds: entity.externalIds, + edgeCount, + properties: entity.properties, + description: entity.properties.description as string | undefined, + createdAt: entity.temporal.createdAt, + validAt: entity.temporal.validAt, + invalidAt: entity.temporal.invalidAt, + topEdges: [], + } + } + + async function upsertSeedEntity(input: UpsertGraphEntityInput): Promise { + if (!input.name?.trim()) throw new Error('upsertEntity requires a non-empty name') + const scope = scopeFrom(input) + const externalIds = normalizeExternalIds(input.externalIds) + let entity: SemanticEntity | undefined + + if (input.id) { + entity = await graph.getEntity(input.id, scope) ?? undefined + const externalMatch = await findEntityByExternalIds(externalIds, scope) + if (externalMatch && externalMatch.id !== input.id) { + throw new Error(`External IDs resolve to entity ${externalMatch.id}, not requested entity ${input.id}`) + } + entity = entity ?? externalMatch + } else { + entity = await findEntityByExternalIds(externalIds, scope) + } + + if (entity) { + entity = await resolver.merge(entity, { + name: input.name, + entityType: input.entityType ?? DEFAULT_ENTITY_TYPE, + aliases: input.aliases ?? [], + description: input.description, + externalIds, + }) + } else if (input.id) { + const embeddingVector = await embedding.embed(input.name) + const descriptionEmbedding = input.description + ? await embedding.embed(input.description) + : undefined + entity = { + id: input.id, + name: input.name, + entityType: input.entityType ?? DEFAULT_ENTITY_TYPE, + aliases: input.aliases ?? [], + externalIds, + properties: input.description ? { description: input.description } : {}, + embedding: embeddingVector, + descriptionEmbedding, + scope, + visibility: input.visibility, + temporal: createTemporal(), + } + } else { + const resolved = await resolver.resolve( + input.name, + input.entityType ?? DEFAULT_ENTITY_TYPE, + input.aliases ?? 
[], + scope, + input.description, + input.visibility, + externalIds, + ) + entity = resolved.entity + } + + entity = { + ...entity, + externalIds: mergeExternalIds(entity.externalIds, externalIds), + properties: { + ...entity.properties, + ...(input.properties ?? {}), + }, + scope, + visibility: input.visibility ?? entity.visibility, + } + if (input.description && !entity.properties.description) { + entity.properties.description = input.description + } + + await graph.addEntity(entity) + await linkExternalIdsToEntity(entity.id, externalIds, scope) + return entity + } + + async function resolveEntityForRead( + ref: GraphEntityRef | string, + identity?: typegraphIdentity, + ): Promise { + const scope = scopeFrom(identity) + if (typeof ref === 'string') return graph.getEntity(ref, scope) + const refScope = mergeSeedScope(scope, ref) + if (ref.id) { + const byId = await graph.getEntity(ref.id, refScope) + if (byId) return byId + } + const byExternalId = await findEntityByExternalIds(refExternalIds(ref), refScope) + if (byExternalId) return byExternalId + if (!ref.name?.trim()) return null + if (memoryStore.findEntities) { + const candidates = await memoryStore.findEntities(ref.name, refScope, 10) + return candidates.find(candidate => + candidate.name.toLowerCase() === ref.name!.toLowerCase() + && (!ref.entityType || candidate.entityType === ref.entityType) + ) ?? null + } + return null + } + + async function resolveEntityForWrite( + ref: GraphEntityRef | string, + parentScope: typegraphIdentity, + visibility?: import('../types/source.js').Visibility, + ): Promise { + if (typeof ref === 'string') { + const entity = await graph.getEntity(ref, parentScope) + if (entity) return entity + return upsertSeedEntity({ + name: ref, + entityType: DEFAULT_ENTITY_TYPE, + tenantId: parentScope.tenantId, + groupId: parentScope.groupId, + userId: parentScope.userId, + agentId: parentScope.agentId, + conversationId: parentScope.conversationId, + visibility, + }) + } + const refScope = mergeSeedScope(parentScope, ref) + if (ref.id && !ref.name) { + const existing = await graph.getEntity(ref.id, refScope) + if (!existing) throw new Error(`Entity not found: ${ref.id}`) + const externalIds = refExternalIds(ref) + await linkExternalIdsToEntity(existing.id, externalIds, refScope) + return { ...existing, externalIds: mergeExternalIds(existing.externalIds, externalIds) } + } + if (!ref.name?.trim()) { + const resolved = await resolveEntityForRead(ref, refScope) + if (!resolved) throw new Error('Entity reference requires id, externalId, or name') + return resolved + } + return upsertSeedEntity({ + id: ref.id, + name: ref.name, + entityType: ref.entityType, + aliases: ref.aliases, + description: ref.description, + properties: ref.properties, + externalIds: refExternalIds(ref), + tenantId: refScope.tenantId, + groupId: refScope.groupId, + userId: refScope.userId, + agentId: refScope.agentId, + conversationId: refScope.conversationId, + visibility: ref.visibility ?? 
visibility, + }) + } + + async function upsertRelation(input: { + source: GraphEntityRef | string + target: GraphEntityRef | string + relation: string + scope: typegraphIdentity + visibility?: import('../types/source.js').Visibility | undefined + weight?: number | undefined + properties?: Record | undefined + description?: string | undefined + evidenceText?: string | undefined + sourceChunkId?: string | undefined + factText?: string | undefined + temporalStatus?: PredicateTemporalStatus | undefined + validFrom?: string | undefined + validTo?: string | undefined + }): Promise<{ edge: SemanticEdge; fact?: SemanticFactRecord | undefined; source: SemanticEntity; target: SemanticEntity }> { + const normalizedRelation = predicateNormalizer.normalizeWithDirection(input.relation) + if (!normalizedRelation.valid || GENERIC_DISALLOWED_PREDICATES.has(normalizedRelation.predicate)) { + throw new Error(`Invalid or too-generic graph relation: ${input.relation}`) + } + + let sourceRef = input.source + let targetRef = input.target + if (normalizedRelation.swapSubjectObject) { + ;[sourceRef, targetRef] = [targetRef, sourceRef] + } + let source = await resolveEntityForWrite(sourceRef, input.scope, input.visibility) + let target = await resolveEntityForWrite(targetRef, input.scope, input.visibility) + if (source.id === target.id) throw new Error(`Refusing to create self-edge for entity ${source.id}`) + + const relation = normalizedRelation.predicate + const typeValidation = validatePredicateTypes(relation, source.entityType, target.entityType) + if (normalizedRelation.symmetric) { + const sourceKey = normalizeSurfaceText(source.id || source.name) + const targetKey = normalizeSurfaceText(target.id || target.name) + if (sourceKey > targetKey) { + ;[source, target] = [target, source] + } + } + + const relationshipDescription = cleanOptionalText(input.description) + const evidenceText = cleanOptionalText(input.evidenceText) + const sourceChunkId = cleanOptionalText(input.sourceChunkId) + const temporalStatus = input.temporalStatus ?? normalizedRelation.temporalStatus + const validFrom = cleanOptionalText(input.validFrom) + const validTo = cleanOptionalText(input.validTo) + const weight = (input.weight ?? 1) * (typeValidation.valid ? 1 : 0.85) + const edge: SemanticEdge = { + id: stableGraphId('edge', [source.id, relation, target.id]), + sourceEntityId: source.id, + targetEntityId: target.id, + relation, + weight, + properties: { + ...(relationshipDescription ? { relationshipDescription } : {}), + ...(evidenceText ? { evidenceText } : {}), + ...(sourceChunkId ? { sourceChunkId } : {}), + ...(temporalStatus ? { temporalStatus } : {}), + ...(validFrom ? { validFrom } : {}), + ...(validTo ? { validTo } : {}), + ...(!typeValidation.valid ? { predicateValidation: typeValidation } : {}), + ...(input.properties ?? {}), + }, + scope: input.scope, + visibility: input.visibility, + temporal: createTemporal(), + evidence: [], + } + + const storedEdge = await graph.addEdge(edge) + let storedFact: SemanticFactRecord | undefined + if (memoryStore.upsertFactRecord) { + const factText = cleanOptionalText(input.factText) ?? 
factTextFor(source.name, relation, target.name) + const factSearchText = buildFactSearchText({ + factText, + description: relationshipDescription, + evidenceText, + }) + const factEmbedding = await embedding.embed(factSearchText) + storedFact = await memoryStore.upsertFactRecord({ + id: stableGraphId('fact', [storedEdge.sourceEntityId, storedEdge.relation, storedEdge.targetEntityId]), + edgeId: storedEdge.id, + sourceEntityId: storedEdge.sourceEntityId, + targetEntityId: storedEdge.targetEntityId, + relation: storedEdge.relation, + factText, + description: relationshipDescription, + evidenceText, + factSearchText, + sourceChunkId, + weight: storedEdge.weight, + evidenceCount: Math.max(1, Math.round(storedEdge.weight)), + embedding: factEmbedding, + scope: input.scope, + visibility: storedEdge.visibility, + createdAt: storedEdge.temporal.createdAt, + updatedAt: new Date(), + ...(storedEdge.temporal.invalidAt ? { invalidAt: storedEdge.temporal.invalidAt } : {}), + }) + } + + if (memoryStore.upsertEntity) { + await updateProfilesFromFact(source, target, storedEdge.relation, storedEdge.weight) + } + + return { edge: storedEdge, fact: storedFact, source, target } + } + async function updateProfilesFromFact( source: SemanticEntity, target: SemanticEntity, @@ -659,14 +1095,14 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon aliases?: string[] | undefined description?: string | undefined bucketId: string - documentId?: string | undefined + sourceId?: string | undefined chunkIndex?: number | undefined tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - visibility?: import('../types/typegraph-document.js').Visibility | undefined + visibility?: import('../types/source.js').Visibility | undefined confidence?: number | undefined mentionType: SemanticEntityMention['mentionType'] }): Promise { @@ -679,7 +1115,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon }) const result = await resolver.resolve( input.name, - input.type ?? 'entity', + input.type ?? DEFAULT_ENTITY_TYPE, input.aliases ?? [], scope, input.description, @@ -688,10 +1124,10 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon await graph.addEntity(result.entity) - if (memoryStore.upsertEntityChunkMentions && input.documentId && input.chunkIndex !== undefined) { + if (memoryStore.upsertEntityChunkMentions && input.sourceId && input.chunkIndex !== undefined) { const mentions = buildEntityMentions({ entityId: result.entity.id, - documentId: input.documentId, + sourceId: input.sourceId, chunkIndex: input.chunkIndex, bucketId: input.bucketId, mentionType: input.mentionType, @@ -702,21 +1138,19 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon if (mentions.length > 0) await memoryStore.upsertEntityChunkMentions(mentions) } - if (memoryStore.upsertPassageEntityEdges && input.documentId && input.chunkIndex !== undefined) { - const passageId = passageIdFor({ - scope, - bucketId: input.bucketId, - documentId: input.documentId, - chunkIndex: input.chunkIndex, - embeddingModel: embeddingModelKey(embedding), - }) + if (memoryStore.upsertGraphEdges && input.sourceId && input.chunkIndex !== undefined) { const surfaceTexts = [input.name, result.entity.name, ...(input.aliases ?? 
[])] .map(value => value.trim()) .filter(Boolean) const uniqueSurfaceTexts = [...new Map(surfaceTexts.map(value => [normalizeSurfaceText(value), value])).values()] - await memoryStore.upsertPassageEntityEdges([{ - passageId, + await memoryStore.upsertGraphEdges([buildEntityChunkGraphEdge({ entityId: result.entity.id, + chunkRef: { + bucketId: input.bucketId, + sourceId: input.sourceId, + chunkIndex: input.chunkIndex, + embeddingModel: embeddingModelKey(embedding), + }, weight: Math.min(2, 0.5 + (input.confidence ?? 0.75)), mentionCount: Math.max(1, uniqueSurfaceTexts.length), confidence: input.confidence, @@ -724,7 +1158,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon mentionTypes: [input.mentionType], scope, visibility: input.visibility, - }]) + })]) } return result.entity @@ -738,13 +1172,13 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon content: string bucketId: string chunkIndex?: number | undefined - documentId?: string | undefined + sourceId?: string | undefined tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - visibility?: import('../types/typegraph-document.js').Visibility | undefined + visibility?: import('../types/source.js').Visibility | undefined metadata?: Record | undefined confidence?: number | undefined }>): Promise { @@ -756,7 +1190,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon aliases: mention.aliases, description: mention.description, bucketId: mention.bucketId, - documentId: mention.documentId, + sourceId: mention.sourceId, chunkIndex: mention.chunkIndex, tenantId: mention.tenantId, groupId: mention.groupId, @@ -770,6 +1204,118 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon } } + async function addSourceSubject(input: { + subject: SourceSubject + bucketId: string + sourceId: string + embeddingModel: string + chunks: Array<{ + id?: string | undefined + chunkIndex: number + content: string + metadata?: Record | undefined + }> + tenantId?: string | undefined + groupId?: string | undefined + userId?: string | undefined + agentId?: string | undefined + conversationId?: string | undefined + visibility?: import('../types/source.js').Visibility | undefined + }): Promise { + const scope = mergeScope(defaultScope, { + tenantId: input.tenantId, + groupId: input.groupId, + userId: input.userId, + agentId: input.agentId, + conversationId: input.conversationId, + }) + const externalIds = normalizeSubjectExternalIds(input.subject) + let entity: SemanticEntity | null | undefined + + if (input.subject.entityId) { + entity = await graph.getEntity(input.subject.entityId, scope) + } + entity = entity ?? (await findEntityByExternalIds(externalIds, scope)) + + if (entity && (input.subject.name || externalIds.length > 0 || input.subject.aliases?.length || input.subject.description || input.subject.properties)) { + entity = await upsertSeedEntity({ + id: entity.id, + name: input.subject.name ?? entity.name, + entityType: input.subject.entityType ?? entity.entityType, + aliases: [...new Set([...(entity.aliases ?? []), ...(input.subject.aliases ?? [])])], + description: input.subject.description ?? 
(entity.properties.description as string | undefined), + properties: input.subject.properties, + externalIds, + tenantId: scope.tenantId, + groupId: scope.groupId, + userId: scope.userId, + agentId: scope.agentId, + conversationId: scope.conversationId, + visibility: input.visibility, + }) + } + + if (!entity) { + if (!input.subject.name?.trim()) { + throw new Error('source.subject.name is required when the subject entity cannot be resolved by entityId or externalIds.') + } + entity = await upsertSeedEntity({ + id: input.subject.entityId, + name: input.subject.name, + entityType: input.subject.entityType, + aliases: input.subject.aliases, + description: input.subject.description, + properties: input.subject.properties, + externalIds, + tenantId: scope.tenantId, + groupId: scope.groupId, + userId: scope.userId, + agentId: scope.agentId, + conversationId: scope.conversationId, + visibility: input.visibility, + }) + } else if (externalIds.length > 0) { + await linkExternalIdsToEntity(entity.id, externalIds, scope) + entity = { ...entity, externalIds: mergeExternalIds(entity.externalIds, externalIds) } + } + + const mentions: SemanticEntityMention[] = input.chunks.map(chunk => ({ + entityId: entity!.id, + sourceId: input.sourceId, + chunkIndex: chunk.chunkIndex, + bucketId: input.bucketId, + mentionType: 'source_subject', + normalizedSurfaceText: '', + confidence: 1.0, + })) + if (mentions.length > 0 && memoryStore.upsertEntityChunkMentions) { + await memoryStore.upsertEntityChunkMentions(mentions) + } + + if (input.chunks.length > 0 && memoryStore.upsertGraphEdges) { + await memoryStore.upsertGraphEdges(input.chunks.map(chunk => buildEntityChunkGraphEdge({ + entityId: entity!.id, + chunkRef: { + bucketId: input.bucketId, + sourceId: input.sourceId, + chunkIndex: chunk.chunkIndex, + embeddingModel: input.embeddingModel, + chunkId: chunk.id, + }, + relation: 'PRIMARY_SOURCE_CHUNK', + weight: 1.0, + mentionCount: 1, + confidence: 1.0, + surfaceTexts: [], + mentionTypes: ['source_subject'], + scope, + visibility: input.visibility, + }))) + } + + return await getEntity(entity.id, scope) ?? entityResultFromSemanticEntity(entity, 0) + } + async function addTriple(triple: { subject: string subjectType?: string @@ -782,18 +1328,21 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon objectDescription?: string relationshipDescription?: string | undefined evidenceText?: string | undefined + temporalStatus?: PredicateTemporalStatus | undefined + validFrom?: string | undefined + validTo?: string | undefined sourceChunkId?: string | undefined confidence?: number content: string bucketId: string chunkIndex?: number - documentId?: string + sourceId?: string tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - visibility?: import('../types/typegraph-document.js').Visibility | undefined + visibility?: import('../types/source.js').Visibility | undefined metadata?: Record }): Promise { const scope = mergeScope(defaultScope, { @@ -804,8 +1353,35 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon conversationId: triple.conversationId, }) + if (isAliasAssignmentRelation(triple.predicate)) { + const alias = cleanOptionalText(triple.object) + const aliases = [...new Map([ + ...(triple.subjectAliases ?? []), + ...(alias && normalizeSurfaceText(alias) !== normalizeSurfaceText(triple.subject) ? [alias] : []), + ...(triple.objectAliases ?? 
[]), + ].map(value => [normalizeSurfaceText(value), value])).values()] + await resolveAndStoreEntity({ + name: triple.subject, + type: triple.subjectType, + aliases, + description: triple.subjectDescription, + bucketId: triple.bucketId, + sourceId: triple.sourceId, + chunkIndex: triple.chunkIndex, + tenantId: triple.tenantId, + groupId: triple.groupId, + userId: triple.userId, + agentId: triple.agentId, + conversationId: triple.conversationId, + visibility: triple.visibility, + confidence: triple.confidence, + mentionType: 'entity', + }) + return + } + const normalizedRelation = predicateNormalizer.normalizeWithDirection(triple.predicate) - if (!normalizedRelation.valid || GENERIC_PREDICATES.has(normalizedRelation.predicate)) return + if (!normalizedRelation.valid || GENERIC_DISALLOWED_PREDICATES.has(normalizedRelation.predicate)) return let sourceInput = { name: triple.subject, @@ -829,7 +1405,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon aliases: sourceInput.aliases, description: sourceInput.description, bucketId: triple.bucketId, - documentId: triple.documentId, + sourceId: triple.sourceId, chunkIndex: triple.chunkIndex, tenantId: triple.tenantId, groupId: triple.groupId, @@ -846,7 +1422,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon aliases: targetInput.aliases, description: targetInput.description, bucketId: triple.bucketId, - documentId: triple.documentId, + sourceId: triple.sourceId, chunkIndex: triple.chunkIndex, tenantId: triple.tenantId, groupId: triple.groupId, @@ -863,10 +1439,14 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon if (sourceEntity.id === targetEntity.id) return const relation = normalizedRelation.predicate - const weight = triple.confidence ?? 1.0 + const typeValidation = validatePredicateTypes(relation, sourceEntity.entityType, targetEntity.entityType) + const weight = (triple.confidence ?? 1.0) * (typeValidation.valid ? 1 : 0.85) const relationshipDescription = cleanOptionalText(triple.relationshipDescription) const evidenceText = cleanOptionalText(triple.evidenceText) const sourceChunkId = cleanOptionalText(triple.sourceChunkId) + const temporalStatus = triple.temporalStatus ?? normalizedRelation.temporalStatus + const validFrom = cleanOptionalText(triple.validFrom) + const validTo = cleanOptionalText(triple.validTo) if (textMentionsDirectionalContradiction({ relation, @@ -897,6 +1477,10 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon ...(relationshipDescription ? { relationshipDescription } : {}), ...(evidenceText ? { evidenceText } : {}), ...(sourceChunkId ? { sourceChunkId } : {}), + ...(temporalStatus ? { temporalStatus } : {}), + ...(validFrom ? { validFrom } : {}), + ...(validTo ? { validTo } : {}), + ...(!typeValidation.valid ? { predicateValidation: typeValidation } : {}), ...(triple.metadata ? { metadata: triple.metadata } : {}), }, scope, @@ -933,6 +1517,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon visibility: storedEdge.visibility, createdAt: storedEdge.temporal.createdAt, updatedAt: new Date(), + ...(storedEdge.temporal.invalidAt ? { invalidAt: storedEdge.temporal.invalidAt } : {}), }) } @@ -944,7 +1529,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon directEdgePairs.add(pairKey) // CO_OCCURS edges for disconnected entities - const chunkKey = `${triple.bucketId}:${triple.documentId ?? ''}:${triple.chunkIndex ?? 
0}` + const chunkKey = `${triple.bucketId}:${triple.sourceId ?? ''}:${triple.chunkIndex ?? 0}` let chunkEntities = chunkEntityMap.get(chunkKey) if (!chunkEntities) { chunkEntities = new Set() @@ -974,10 +1559,10 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon evidence: [], }) // Record the co-occurrence mention on the newly-linked entity - if (memoryStore.upsertEntityChunkMentions && triple.documentId && triple.chunkIndex !== undefined) { + if (memoryStore.upsertEntityChunkMentions && triple.sourceId && triple.chunkIndex !== undefined) { await memoryStore.upsertEntityChunkMentions([{ entityId: newId, - documentId: triple.documentId, + sourceId: triple.sourceId, chunkIndex: triple.chunkIndex, bucketId: triple.bucketId, mentionType: 'co_occurrence', @@ -991,6 +1576,123 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon } } + async function upsertEntity(input: UpsertGraphEntityInput): Promise { + const entity = await upsertSeedEntity(input) + return await getEntity(entity.id, scopeFrom(input)) + ?? entityResultFromSemanticEntity(entity, 0) + } + + async function upsertEntities(inputs: UpsertGraphEntityInput[]): Promise { + const results: EntityDetail[] = [] + for (const input of inputs) { + results.push(await upsertEntity(input)) + } + return results + } + + async function resolveEntity( + ref: GraphEntityRef | string, + identity?: typegraphIdentity, + ): Promise { + const entity = await resolveEntityForRead(ref, identity) + if (!entity) return null + return await getEntity(entity.id, scopeFrom(identity)) + ?? entityResultFromSemanticEntity(entity, 0) + } + + async function linkExternalIds( + entityId: string, + externalIds: ExternalId[], + identity?: typegraphIdentity, + ): Promise { + const scope = scopeFrom(identity) + const entity = await graph.getEntity(entityId, scope) + if (!entity) throw new Error(`Entity not found: ${entityId}`) + const normalized = normalizeExternalIds(externalIds) + await linkExternalIdsToEntity(entityId, normalized, scope) + const updated: SemanticEntity = { + ...entity, + externalIds: mergeExternalIds(entity.externalIds, normalized), + } + await graph.addEntity(updated) + return await getEntity(entityId, scope) + ?? 
entityResultFromSemanticEntity(updated, 0) + } + + async function upsertEdge(input: UpsertGraphEdgeInput): Promise { + const scope = scopeFrom(input) + const result = await upsertRelation({ + source: input.source, + target: input.target, + relation: input.relation, + scope, + visibility: input.visibility, + weight: input.weight, + properties: input.properties, + description: input.description, + evidenceText: input.evidenceText, + sourceChunkId: input.sourceChunkId, + temporalStatus: input.temporalStatus, + validFrom: input.validFrom, + validTo: input.validTo, + }) + return edgeResultFromSemanticEdge( + result.edge, + new Map([ + [result.source.id, result.source.name], + [result.target.id, result.target.name], + ]), + ) + } + + async function upsertEdges(inputs: UpsertGraphEdgeInput[]): Promise { + const results: EdgeResult[] = [] + for (const input of inputs) { + results.push(await upsertEdge(input)) + } + return results + } + + async function upsertFact(input: UpsertGraphFactInput): Promise { + const scope = scopeFrom(input) + const result = await upsertRelation({ + source: input.source, + target: input.target, + relation: input.relation, + scope, + visibility: input.visibility, + weight: input.confidence, + properties: input.properties, + description: input.description, + evidenceText: input.evidenceText, + sourceChunkId: input.sourceChunkId, + factText: input.factText, + temporalStatus: input.temporalStatus, + validFrom: input.validFrom, + validTo: input.validTo, + }) + if (result.fact) { + const [fact] = await hydrateFacts([result.fact], scope) + if (fact) return fact + } + return factResultFromEdge( + result.edge, + new Map([ + [result.source.id, result.source.name], + [result.target.id, result.target.name], + ]), + 0, + ) + } + + async function upsertFacts(inputs: UpsertGraphFactInput[]): Promise { + const results: FactResult[] = [] + for (const input of inputs) { + results.push(await upsertFact(input)) + } + return results + } + async function searchEntities( query: string, identity: typegraphIdentity, @@ -1104,95 +1806,52 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon return adjacency } - async function upsertPassageNodes(nodes: Array<{ - bucketId: string - documentId: string - chunkIndex: number - embeddingModel: string - contentHash: string - chunkId?: string | undefined - metadata?: Record | undefined - visibility?: import('../types/typegraph-document.js').Visibility | undefined - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>): Promise { - if (!memoryStore.upsertPassageNodes || nodes.length === 0) return - const now = new Date() - await memoryStore.upsertPassageNodes(nodes.map(node => { - const scope = mergeScope(defaultScope, { - tenantId: node.tenantId, - groupId: node.groupId, - userId: node.userId, - agentId: node.agentId, - conversationId: node.conversationId, - }) - return { - id: passageIdFor({ - scope, - bucketId: node.bucketId, - documentId: node.documentId, - chunkIndex: node.chunkIndex, - embeddingModel: node.embeddingModel, - }), - bucketId: node.bucketId, - documentId: node.documentId, - chunkIndex: node.chunkIndex, - chunkId: node.chunkId, - embeddingModel: node.embeddingModel, - contentHash: node.contentHash, - metadata: node.metadata ?? 
{}, - scope, - visibility: node.visibility, - createdAt: now, - updatedAt: now, - } - })) - } - async function searchFacts( query: string, - opts: FactSearchOpts = {}, + opts?: FactSearchOpts | null, ): Promise { - if (!memoryStore.searchFacts) return [] + const normalizedOpts = optionalCompactObject(opts, 'graph.searchFacts') as FactSearchOpts + if (!memoryStore.searchFacts && !memoryStore.searchFactsHybrid) return [] const queryEmbedding = await embedding.embed(query) const identity = { - tenantId: opts.tenantId, - groupId: opts.groupId, - userId: opts.userId, - agentId: opts.agentId, - conversationId: opts.conversationId, - } - const facts = await memoryStore.searchFacts(queryEmbedding, identity, opts.limit ?? 20) + tenantId: normalizedOpts.tenantId, + groupId: normalizedOpts.groupId, + userId: normalizedOpts.userId, + agentId: normalizedOpts.agentId, + conversationId: normalizedOpts.conversationId, + } + const facts = memoryStore.searchFactsHybrid + ? await memoryStore.searchFactsHybrid(query, queryEmbedding, identity, normalizedOpts.limit ?? 20) + : await memoryStore.searchFacts!(queryEmbedding, identity, normalizedOpts.limit ?? 20) return hydrateFacts(facts, identity) } async function explore( query: string, - opts: GraphExploreOpts = {}, + opts?: GraphExploreOpts | null, ): Promise { + const normalizedOpts = optionalCompactObject(opts, 'graph.explore') as GraphExploreOpts const identity = { - tenantId: opts.tenantId, - groupId: opts.groupId, - userId: opts.userId, - agentId: opts.agentId, - conversationId: opts.conversationId, + tenantId: normalizedOpts.tenantId, + groupId: normalizedOpts.groupId, + userId: normalizedOpts.userId, + agentId: normalizedOpts.agentId, + conversationId: normalizedOpts.conversationId, } const include = { - entities: opts.include?.entities ?? true, - facts: opts.include?.facts ?? true, - passages: opts.include?.passages ?? false, + entities: normalizedOpts.include?.entities ?? true, + facts: normalizedOpts.include?.facts ?? true, + chunks: normalizedOpts.include?.chunks ?? false, } - const anchorLimit = Math.max(1, opts.anchorLimit ?? 3) - const entityLimit = requestedLimit(opts.entityLimit) - const factLimit = requestedLimit(opts.factLimit) - const passageLimit = Math.max(1, opts.passageLimit ?? 10) - const depth: 1 | 2 = opts.depth === 2 ? 2 : 1 + const anchorLimit = Math.max(1, normalizedOpts.anchorLimit ?? 3) + const entityLimit = requestedLimit(normalizedOpts.entityLimit) + const factLimit = requestedLimit(normalizedOpts.factLimit) + const chunkLimit = Math.max(1, normalizedOpts.chunkLimit ?? 10) + const depth: 1 | 2 = normalizedOpts.depth === 2 ? 
2 : 1 const parsed = await parseGraphQueryIntent({ query, + mode: normalizedOpts.intentParser, llm: explorationLlm, }) const predicateConfidenceByName = new Map(parsed.intent.predicates.map(predicate => [predicate.name, predicate.confidence])) @@ -1200,13 +1859,15 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon const trace: GraphExploreTrace = { parser: parsed.parser, - fallbackUsed: parsed.fallbackUsed, mode: parsed.intent.mode, - answerSide: parsed.intent.answerSide, + strictness: parsed.intent.strictness, selectedPredicates: [...selectedPredicates], sourceEntityQueries: parsed.intent.sourceEntityQueries, targetEntityQueries: parsed.intent.targetEntityQueries, subqueries: parsed.intent.subqueries, + intentParseMs: parsed.parseMs, + intentMatchedPatterns: parsed.matchedPatterns, + rejectedPredicates: parsed.rejectedPredicates, anchorCandidates: [], selectedAnchorIds: [], matchedEdgeIds: [], @@ -1216,7 +1877,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon droppedByType: 0, } - const resolvedAnchors = parsed.parser === 'llm' && !graphIntentIsEmpty(parsed.intent) + const resolvedAnchors = parsed.parser !== 'none' && !graphIntentIsEmpty(parsed.intent) ? await resolveIntentAnchors(parsed.intent, identity, anchorLimit) : { sourceAnchors: [], @@ -1234,8 +1895,8 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon anchors, entities: [], facts: [], - ...(include.passages ? { passages: [] } : {}), - ...(opts.explain ? { trace } : {}), + ...(include.chunks ? { chunks: [] } : {}), + ...(normalizedOpts.explain ? { trace } : {}), } if (anchors.length === 0) return emptyResult @@ -1318,37 +1979,38 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon .map(match => factResultFromEdge(match.edge, nameMap, match.score)) : [] - let passages: PassageResult[] | undefined - if (include.passages) { - const passageEntityLimit = Math.max(1, Math.min(entityLimit ?? 10, 10)) + let chunks: ChunkResult[] | undefined + if (include.chunks) { + const chunkEntityLimit = Math.max(1, Math.min(entityLimit ?? 10, 10)) const topEntityIds = [...entityScoreById.entries()] .sort((a, b) => b[1] - a[1]) - .slice(0, passageEntityLimit) + .slice(0, chunkEntityLimit) .map(([entityId]) => entityId) - const passageMap = new Map() + const chunkMap = new Map() for (const entityId of topEntityIds) { - const connectedPassages = await getPassagesForEntity(entityId, { - bucketIds: opts.bucketIds, - limit: passageLimit, + const connectedChunks = await getChunksForEntity(entityId, { + bucketIds: normalizedOpts.bucketIds, + limit: chunkLimit, ...identity, }) const entityScore = entityScoreById.get(entityId) ?? 0 - for (const passage of connectedPassages) { - const score = entityScore * Math.log2(1 + passage.score) - const existing = passageMap.get(passage.passageId) + for (const chunk of connectedChunks) { + const key = chunkRefKey(chunk) + const score = entityScore * Math.log2(1 + chunk.score) + const existing = chunkMap.get(key) if (!existing || score > existing.score) { - passageMap.set(passage.passageId, { - ...passage, + chunkMap.set(key, { + ...chunk, score, }) } } } - passages = [...passageMap.values()] + chunks = [...chunkMap.values()] .sort((a, b) => b.score - a.score) - .slice(0, passageLimit) + .slice(0, chunkLimit) } return { @@ -1356,44 +2018,56 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon anchors, entities: entityResults, facts, - ...(include.passages ? 
{ passages: passages ?? [] } : {}), - ...(opts.explain ? { trace } : {}), + ...(include.chunks ? { chunks: chunks ?? [] } : {}), + ...(normalizedOpts.explain ? { trace } : {}), } } - async function getPassagesForEntity(entityId: string, opts?: { + function chunkRefKey(ref: ChunkRef): string { + return `${ref.bucketId}\u001f${ref.sourceId}\u001f${ref.chunkIndex}\u001f${ref.embeddingModel ?? ''}` + } + + async function getChunksForEntity(entityId: string, opts?: ({ bucketIds?: string[] | undefined limit?: number | undefined - } & typegraphIdentity): Promise { - if (!memoryStore.getPassageEdgesForEntities || !memoryStore.getPassagesByIds || !config.resolveChunksTable) return [] + } & typegraphIdentity) | null): Promise { + const normalizedOpts = optionalCompactObject<{ + bucketIds?: string[] | undefined + limit?: number | undefined + } & typegraphIdentity>(opts, 'graph.getChunksForEntity') as { + bucketIds?: string[] | undefined + limit?: number | undefined + } & typegraphIdentity + if (!memoryStore.getChunkEdgesForEntities || !memoryStore.getChunksByRefs || !config.resolveChunksTable) return [] const identity = { - tenantId: opts?.tenantId, - groupId: opts?.groupId, - userId: opts?.userId, - agentId: opts?.agentId, - conversationId: opts?.conversationId, + tenantId: normalizedOpts.tenantId, + groupId: normalizedOpts.groupId, + userId: normalizedOpts.userId, + agentId: normalizedOpts.agentId, + conversationId: normalizedOpts.conversationId, } - const passageEdges = await memoryStore.getPassageEdgesForEntities([entityId], { + const chunkEdges = await memoryStore.getChunkEdgesForEntities([entityId], { scope: identity, - bucketIds: opts?.bucketIds, - limit: opts?.limit ?? 20, + bucketIds: normalizedOpts.bucketIds, + limit: normalizedOpts.limit ?? 20, }) - if (passageEdges.length === 0) return [] + if (chunkEdges.length === 0) return [] const chunksTable = await config.resolveChunksTable(embeddingModelKey(embedding)) - const passageRows = await memoryStore.getPassagesByIds( - passageEdges.map(edge => edge.passageId), - { chunksTable, bucketIds: opts?.bucketIds, scope: identity }, + const chunkRows = await memoryStore.getChunksByRefs( + chunkEdges.map(edge => edge.chunkRef), + { chunksTable, bucketIds: normalizedOpts.bucketIds, scope: identity }, ) - const scoreByPassage = new Map(passageEdges.map(edge => [edge.passageId, edge.weight])) - return passageRows + const scoreByChunk = new Map(chunkEdges.map(edge => [chunkRefKey(edge.chunkRef), edge.weight])) + return chunkRows .map(row => ({ - passageId: row.passageId, content: row.content, bucketId: row.bucketId, - documentId: row.documentId, + sourceId: row.sourceId, chunkIndex: row.chunkIndex, + embeddingModel: row.embeddingModel, + chunkId: row.chunkId, totalChunks: row.totalChunks, - score: scoreByPassage.get(row.passageId) ?? 0, + score: scoreByChunk.get(chunkRefKey(row)) ?? 0, metadata: row.metadata, tenantId: row.tenantId, groupId: row.groupId, @@ -1402,28 +2076,124 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon conversationId: row.conversationId, })) .sort((a, b) => b.score - a.score) - .slice(0, opts?.limit ?? 20) + .slice(0, normalizedOpts.limit ?? 
20) + } + + async function resolveEntityScope(scope: import('../types/query.js').QueryEntityScope, identity: typegraphIdentity, opts?: { + bucketIds?: string[] | undefined + limit?: number | undefined + } | null): Promise { + const normalizedOpts = optionalCompactObject<{ + bucketIds?: string[] | undefined + limit?: number | undefined + }>(opts, 'graph.resolveEntityScope') as { + bucketIds?: string[] | undefined + limit?: number | undefined + } + const warnings: string[] = [] + const entityIds = new Set((scope.entityIds ?? []).filter(Boolean)) + if ((scope.externalIds?.length ?? 0) > 0 && !memoryStore.findEntityByExternalId) { + throw new ConfigError('entityScope.externalIds requires a knowledge graph store with external ID resolution.') + } + for (const externalId of scope.externalIds ?? []) { + const entity = memoryStore.findEntityByExternalId + ? await memoryStore.findEntityByExternalId(externalId, identity) + : null + if (entity) entityIds.add(entity.id) + } + if ((scope.externalIds?.length ?? 0) > 0 && entityIds.size === (scope.entityIds?.length ?? 0)) { + warnings.push('No entities resolved for the provided external IDs.') + } + const resolvedIds = [...entityIds] + if (resolvedIds.length > 0 && !memoryStore.getChunkEdgesForEntities) { + throw new ConfigError('entityScope requires a knowledge graph store with entity-chunk edge lookup.') + } + const chunkEdges = resolvedIds.length > 0 + ? await memoryStore.getChunkEdgesForEntities!(resolvedIds, { + scope: identity, + bucketIds: normalizedOpts.bucketIds, + limit: normalizedOpts.limit ?? Math.max(200, resolvedIds.length * 200), + }) + : [] + const chunkRefs = [...new Map(chunkEdges.map(edge => [chunkRefKey(edge.chunkRef), edge.chunkRef])).values()] + return { + entityIds: resolvedIds, + chunkRefs, + ...(warnings.length > 0 ? { warnings } : {}), + } } - async function searchGraphPassages( + async function searchKnowledge( query: string, identity: typegraphIdentity, - opts: GraphSearchOpts = {}, + opts?: KnowledgeSearchOpts | null, + ): Promise { + const normalizedOpts = optionalCompactObject(opts, 'graph.searchKnowledge') as KnowledgeSearchOpts + const limit = normalizedOpts.count ?? 10 + const scopeEntityIds = new Set(normalizedOpts.resolvedEntityIds ?? []) + const hasEntityScopeFilter = Boolean(normalizedOpts.entityScope && normalizedOpts.entityScope.mode !== 'boost') + if (hasEntityScopeFilter && scopeEntityIds.size === 0) return { facts: [], entities: [] } + const shouldFilter = hasEntityScopeFilter + const queryEmbedding = normalizedOpts.signals?.semantic !== false || normalizedOpts.signals?.keyword + ? await embedding.embed(query) + : undefined + + const entityRows = queryEmbedding && (memoryStore.searchEntitiesHybrid || memoryStore.searchEntities) + ? (memoryStore.searchEntitiesHybrid + ? await memoryStore.searchEntitiesHybrid(query, queryEmbedding, identity, limit) + : await memoryStore.searchEntities!(queryEmbedding, identity, limit)) + : [] + const entities = (await hydrateEntityResults(entityRows, undefined, identity)) + .filter(entity => !shouldFilter || scopeEntityIds.has(entity.id)) + .slice(0, limit) + + const factRows = memoryStore.searchFactsHybrid + ? await memoryStore.searchFactsHybrid(query, queryEmbedding, identity, limit) + : queryEmbedding && memoryStore.searchFacts + ? 
await memoryStore.searchFacts(queryEmbedding, identity, limit) + : [] + const facts = (await hydrateFacts( + factRows.filter(fact => + !shouldFilter || + scopeEntityIds.has(fact.sourceEntityId) || + scopeEntityIds.has(fact.targetEntityId) + ), + identity, + )).slice(0, limit) + + return { facts, entities } + } + + async function searchGraphChunks( + query: string, + identity: typegraphIdentity, + opts?: GraphSearchOpts | null, ): Promise { - const count = opts.count ?? 10 - const restartProbability = opts.restartProbability ?? 0.5 - const passageSeedWeight = opts.passageSeedWeight ?? 0.05 - const entitySeedWeight = opts.entitySeedWeight ?? 1.0 - const factCandidateLimit = opts.factCandidateLimit ?? 200 - const factFilterInputLimit = opts.factFilterInputLimit ?? 8 - const passageSeedLimit = opts.passageSeedLimit ?? 200 - const maxIterations = opts.maxPprIterations ?? 50 - const minPprScore = opts.minPprScore ?? 1e-10 - const maxExpansionEdgesPerEntity = opts.maxExpansionEdgesPerEntity ?? 100 - const factChainLimit = opts.factChainLimit ?? 3 + const normalizedOpts = optionalCompactObject(opts, 'graph.searchGraphChunks') as GraphSearchOpts + const count = normalizedOpts.count ?? 10 + const restartProbability = normalizedOpts.restartProbability ?? 0.5 + const chunkSeedWeight = normalizedOpts.chunkSeedWeight ?? 0.05 + const entitySeedWeight = normalizedOpts.entitySeedWeight ?? 1.0 + const factCandidateLimit = normalizedOpts.factCandidateLimit ?? 200 + const factFilterInputLimit = normalizedOpts.factFilterInputLimit ?? 8 + const chunkSeedLimit = normalizedOpts.chunkSeedLimit ?? 200 + const maxIterations = normalizedOpts.maxPprIterations ?? 50 + const minPprScore = normalizedOpts.minPprScore ?? 1e-10 + const maxExpansionEdgesPerEntity = normalizedOpts.maxExpansionEdgesPerEntity ?? 100 + const factChainLimit = normalizedOpts.factChainLimit ?? 3 + const entityScopeMode = normalizedOpts.entityScope?.mode ?? 'filter' + const scopedEntityIds = normalizedOpts.entityScope + ? (normalizedOpts.resolvedEntityIds ?? 
(await resolveEntityScope(normalizedOpts.entityScope, identity, { + bucketIds: normalizedOpts.bucketIds, + limit: Math.max(count * 50, 200), + })).entityIds) + : [] + const scopedEntityIdSet = new Set(scopedEntityIds) + const isEntityScopeFilter = Boolean(normalizedOpts.entityScope && entityScopeMode === 'filter') const parsed = await parseGraphQueryIntent({ query, + mode: normalizedOpts.intentParser, llm: explorationLlm, }) @@ -1432,7 +2202,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon parser: parsed.parser, entitySeedCount: 0, factSeedCount: 0, - passageSeedCount: 0, + chunkSeedCount: 0, graphNodeCount: 0, graphEdgeCount: 0, pprNonzeroCount: 0, @@ -1441,41 +2211,55 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon topGraphScores: [], selectedFactIds: [], selectedEntityIds: [], - selectedPassageIds: [], - finalPassageIds: [], + selectedChunkIds: [], + finalChunkIds: [], selectedFactTexts: [], selectedEntityNames: [], selectedFactChains: [], + intentParseMs: parsed.parseMs, + intentMatchedPatterns: parsed.matchedPatterns, + rejectedPredicates: parsed.rejectedPredicates, }) if (!config.resolveChunksTable) { return { results: [], facts: [], entities: [], factChains: [], trace: emptyTrace() } } - if (parsed.parser !== 'llm' || graphIntentIsEmpty(parsed.intent)) { + if (parsed.parser === 'none' || graphIntentIsEmpty(parsed.intent)) { return { results: [], facts: [], entities: [], factChains: [], trace: emptyTrace() } } - const querySearchText = intentSearchText(parsed.intent, query) - const queryEmbedding = await embedding.embed(querySearchText) + const needsFactSearch = parsed.intent.strictness === 'strict' + const factSearchText = intentSearchText(parsed.intent, query) + const chunkSearchText = chunkIntentSearchText(parsed.intent, query) + const [factEmbedding, chunkEmbedding] = needsFactSearch + ? (factSearchText === chunkSearchText + ? await embedding.embed(factSearchText).then(value => [value, value] as const) + : await Promise.all([ + embedding.embed(factSearchText), + embedding.embed(chunkSearchText), + ])) + : [undefined, await embedding.embed(chunkSearchText)] as const const chunksTable = await config.resolveChunksTable(embeddingModelKey(embedding)) - const factCandidates = memoryStore.searchFacts - ? await memoryStore.searchFacts(queryEmbedding, identity, factCandidateLimit) + const factCandidates = needsFactSearch && factEmbedding && memoryStore.searchFacts + ? await memoryStore.searchFacts(factEmbedding, identity, factCandidateLimit) : [] const candidateEntityIds = uniqueIds(factCandidates.flatMap(fact => [fact.sourceEntityId, fact.targetEntityId])) const candidateEntities = candidateEntityIds.length > 0 ? await graph.getEntitiesBatch(candidateEntityIds, identity) : [] const entityNameById = new Map(candidateEntities.map(entity => [entity.id, entity.name])) - const rankedFactCandidates = rerankFactRecords(factCandidates, querySearchText, entityNameById) + const rankedFactCandidates = rerankFactRecords(factCandidates, factSearchText, entityNameById) const resolvedAnchors = await resolveIntentAnchors(parsed.intent, identity, 5) - let selectedFacts = rankedFactCandidates.filter(fact => factMatchesIntent(fact, { - intent: parsed.intent, - sourceAnchorIds: resolvedAnchors.sourceAnchorIds, - targetAnchorIds: resolvedAnchors.targetAnchorIds, - }).match) - if (opts.factFilter && config.factRelevanceFilter && selectedFacts.length > 0) { + let selectedFacts = needsFactSearch + ? 
rankedFactCandidates.filter(fact => factMatchesIntent(fact, { + intent: parsed.intent, + sourceAnchorIds: resolvedAnchors.sourceAnchorIds, + targetAnchorIds: resolvedAnchors.targetAnchorIds, + }).match) + : [] + if (normalizedOpts.factFilter && config.factRelevanceFilter && selectedFacts.length > 0) { const filterInput = await hydrateFacts(selectedFacts.slice(0, Math.max(factFilterInputLimit, selectedFacts.length)), identity) try { const selectedIds = new Set(await config.factRelevanceFilter(query, filterInput)) @@ -1484,8 +2268,17 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon // Keep the strict graph-intent matches when optional LLM fact filtering fails. } } + if (isEntityScopeFilter) { + selectedFacts = selectedFacts.filter(fact => + scopedEntityIdSet.has(fact.sourceEntityId) || + scopedEntityIdSet.has(fact.targetEntityId) + ) + } const entitySeeds = new Map() + for (const entityId of scopedEntityIdSet) { + entitySeeds.set(entityId, Math.max(entitySeeds.get(entityId) ?? 0, entitySeedWeight)) + } for (const anchor of resolvedAnchors.anchors) { const score = normalizeSeedScore(anchor.similarity ?? 1) * entitySeedWeight entitySeeds.set(anchor.id, Math.max(entitySeeds.get(anchor.id) ?? 0, score)) @@ -1496,16 +2289,19 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon entitySeeds.set(fact.targetEntityId, Math.max(entitySeeds.get(fact.targetEntityId) ?? 0, score)) } - const passageSeeds = new Map() - const passageSeedRows = memoryStore.searchPassageNodes - ? await memoryStore.searchPassageNodes(queryEmbedding, identity, { + const chunkSeeds = new Map() + const chunkSeedRows = memoryStore.searchChunks + ? await memoryStore.searchChunks(chunkEmbedding, identity, { chunksTable, - bucketIds: opts.bucketIds, - limit: passageSeedLimit, + bucketIds: normalizedOpts.bucketIds, + limit: chunkSeedLimit, }) : [] - for (const passage of passageSeedRows) { - passageSeeds.set(passage.passageId, normalizeSeedScore(passage.similarity) * passageSeedWeight) + const chunkRefById = new Map() + for (const chunk of chunkSeedRows) { + const chunkNodeId = chunkNodeIdFor(chunk) + chunkRefById.set(chunkNodeId, chunk) + chunkSeeds.set(chunkNodeId, normalizeSeedScore(chunk.similarity ?? 0) * chunkSeedWeight) } const adjacency = new Map>() @@ -1536,82 +2332,96 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon activeEntityIds.add(node) for (const edge of edges) activeEntityIds.add(edge.target) } + for (const entityId of scopedEntityIdSet) activeEntityIds.add(entityId) - const passageEntityEdges = memoryStore.getPassageEdgesForEntities - ? await memoryStore.getPassageEdgesForEntities([...activeEntityIds], { + const scopedChunkIds = new Set() + const chunkEntityEdges = memoryStore.getChunkEdgesForEntities + ? 
await memoryStore.getChunkEdgesForEntities([...activeEntityIds], { scope: identity, - bucketIds: opts.bucketIds, + bucketIds: normalizedOpts.bucketIds, limit: Math.max(100, activeEntityIds.size * maxExpansionEdgesPerEntity), }) : [] - for (const edge of passageEntityEdges) { + for (const edge of chunkEntityEdges) { + const chunkNodeId = chunkNodeIdFor(edge.chunkRef) + chunkRefById.set(chunkNodeId, edge.chunkRef) + if (scopedEntityIdSet.has(edge.entityId)) scopedChunkIds.add(chunkNodeId) const weight = Math.log2(1 + edge.weight) - addWeightedEdge(edge.entityId, edge.passageId, weight) - addWeightedEdge(edge.passageId, edge.entityId, weight) + addWeightedEdge(edge.entityId, chunkNodeId, weight) + addWeightedEdge(chunkNodeId, edge.entityId, weight) const entitySeedScore = entitySeeds.get(edge.entityId) if (entitySeedScore != null) { const mentionSeedScore = entitySeedScore * weight * 0.6 - passageSeeds.set(edge.passageId, Math.max(passageSeeds.get(edge.passageId) ?? 0, mentionSeedScore)) + chunkSeeds.set(chunkNodeId, Math.max(chunkSeeds.get(chunkNodeId) ?? 0, mentionSeedScore)) } } - for (const passageId of passageSeeds.keys()) { - if (!adjacency.has(passageId)) adjacency.set(passageId, []) + for (const chunkId of chunkSeeds.keys()) { + if (!adjacency.has(chunkId)) adjacency.set(chunkId, []) } const seedWeights = new Map() for (const [id, score] of entitySeeds) seedWeights.set(id, Math.max(seedWeights.get(id) ?? 0, score)) - for (const [id, score] of passageSeeds) seedWeights.set(id, Math.max(seedWeights.get(id) ?? 0, score)) + for (const [id, score] of chunkSeeds) seedWeights.set(id, Math.max(seedWeights.get(id) ?? 0, score)) if (seedWeights.size === 0) { return { results: [], facts: [], entities: [], factChains: [], trace: emptyTrace() } } const pprScores = runWeightedPPR(adjacency, seedWeights, restartProbability, maxIterations, minPprScore) - const scoredPassageIds = [...pprScores.entries()] - .filter(([id]) => id.startsWith('passage_')) + const scoredChunkIds = [...pprScores.entries()] + .filter(([id]) => id.startsWith('chunk_') || chunkRefById.has(id)) .sort((a, b) => b[1] - a[1]) .slice(0, Math.max(count * 3, count)) .map(([id]) => id) - const fallbackPassageIds = passageSeedRows - .map(row => row.passageId) - .filter(id => !scoredPassageIds.includes(id)) - .slice(0, Math.max(0, count - scoredPassageIds.length)) - const passageIds = [...scoredPassageIds, ...fallbackPassageIds] - const passageRows = memoryStore.getPassagesByIds && passageIds.length > 0 - ? await memoryStore.getPassagesByIds(passageIds, { chunksTable, bucketIds: opts.bucketIds, scope: identity }) + const fallbackChunkIds = chunkSeedRows + .map(row => chunkNodeIdFor(row)) + .filter(id => !scoredChunkIds.includes(id)) + .slice(0, Math.max(0, count - scoredChunkIds.length)) + const chunkIds = [...scoredChunkIds, ...fallbackChunkIds] + const chunkRefs = chunkIds.map(id => chunkRefById.get(id)).filter((ref): ref is ChunkRef => !!ref) + const chunkRows = memoryStore.getChunksByRefs && chunkRefs.length > 0 + ? await memoryStore.getChunksByRefs(chunkRefs, { chunksTable, bucketIds: normalizedOpts.bucketIds, scope: identity }) : [] - const denseScoreByPassage = new Map(passageSeedRows.map(row => [row.passageId, row.similarity])) + const denseScoreByChunk = new Map(chunkSeedRows.map(row => [chunkNodeIdFor(row), row.similarity ?? 
0])) const selectedFactResults = await hydrateFacts(selectedFacts, identity) const factChains = buildFactChains(selectedFactResults, factChainLimit) - const evidenceEntityIds = uniqueIds([ - ...resolvedAnchors.anchors.map(anchor => anchor.id), - ...entitySeeds.keys(), - ...selectedFactResults.flatMap(fact => [fact.sourceEntityId, fact.targetEntityId]), - ...factChains.flatMap(chain => chain.entityIds), - ]) + const evidenceEntityIds = uniqueIds(isEntityScopeFilter + ? [ + ...scopedEntityIdSet, + ...selectedFactResults.flatMap(fact => [fact.sourceEntityId, fact.targetEntityId]), + ...factChains.flatMap(chain => chain.entityIds), + ] + : [ + ...resolvedAnchors.anchors.map(anchor => anchor.id), + ...entitySeeds.keys(), + ...selectedFactResults.flatMap(fact => [fact.sourceEntityId, fact.targetEntityId]), + ...factChains.flatMap(chain => chain.entityIds), + ]) const entityOrder = new Map(evidenceEntityIds.map((id, index) => [id, index])) const selectedEntityRows = await graph.getEntitiesBatch(evidenceEntityIds, identity) const selectedEntityResults = (await hydrateEntityResults(selectedEntityRows, undefined, identity)) .sort((a, b) => (entityOrder.get(a.id) ?? Number.MAX_SAFE_INTEGER) - (entityOrder.get(b.id) ?? Number.MAX_SAFE_INTEGER)) - const passageQueryTokens = queryTokens([ + const chunkQueryTokens = queryTokens([ query, - ...parsed.intent.subqueries, ...parsed.intent.sourceEntityQueries, ...parsed.intent.targetEntityQueries, ].join(' ')) - const results = passageRows + const results = chunkRows + .filter(row => !isEntityScopeFilter || scopedChunkIds.has(chunkNodeIdFor(row))) .map(row => { - const pprScore = pprScores.get(row.passageId) ?? ((denseScoreByPassage.get(row.passageId) ?? 0) * passageSeedWeight) - const denseScore = denseScoreByPassage.get(row.passageId) ?? 0 - const lexicalScore = tokenOverlapScore(passageQueryTokens, row.content) + const nodeId = chunkNodeIdFor(row) + const pprScore = pprScores.get(nodeId) ?? ((denseScoreByChunk.get(nodeId) ?? 0) * chunkSeedWeight) + const denseScore = denseScoreByChunk.get(nodeId) ?? 
0 + const lexicalScore = tokenOverlapScore(chunkQueryTokens, row.content) return { - passageId: row.passageId, content: row.content, bucketId: row.bucketId, - documentId: row.documentId, + sourceId: row.sourceId, chunkIndex: row.chunkIndex, + embeddingModel: row.embeddingModel, + chunkId: row.chunkId, totalChunks: row.totalChunks, score: pprScore + denseScore * 0.15 + lexicalScore * 0.12, metadata: row.metadata, @@ -1631,17 +2441,17 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon parser: parsed.parser, entitySeedCount: entitySeeds.size, factSeedCount: selectedFacts.length, - passageSeedCount: passageSeeds.size, + chunkSeedCount: chunkSeeds.size, graphNodeCount: countGraphNodes(adjacency, seedWeights), graphEdgeCount: countGraphEdges(adjacency), pprNonzeroCount: pprScores.size, - candidatesBeforeMerge: passageRows.length, + candidatesBeforeMerge: chunkRows.length, candidatesAfterMerge: results.length, topGraphScores: results.slice(0, 5).map(result => result.score), selectedFactIds: selectedFacts.map(fact => fact.id), selectedEntityIds: evidenceEntityIds, - selectedPassageIds: [...passageSeeds.keys()].slice(0, 20), - finalPassageIds: results.map(result => result.passageId), + selectedChunkIds: [...chunkSeeds.keys()].slice(0, 20), + finalChunkIds: results.map(result => chunkNodeIdFor(result)), selectedFactTexts: selectedFactResults.map(fact => ({ id: fact.id, content: formatFactEvidence(fact) })), selectedEntityNames: selectedEntityResults.map(entity => ({ id: entity.id, content: entity.name })), selectedFactChains: factChains.map(chain => ({ @@ -1649,20 +2459,24 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon score: chain.score, factIds: chain.facts.map(fact => fact.id), })), + intentParseMs: parsed.parseMs, + intentMatchedPatterns: parsed.matchedPatterns, + rejectedPredicates: parsed.rejectedPredicates, } return { results, facts: selectedFactResults, entities: selectedEntityResults, factChains, trace } } - async function explainQuery(query: string, opts: GraphExplainOpts = {}): Promise { + async function explainQuery(query: string, opts?: GraphExplainOpts | null): Promise { + const normalizedOpts = optionalCompactObject(opts, 'graph.explainQuery') as GraphExplainOpts const identity = { - tenantId: opts.tenantId, - groupId: opts.groupId, - userId: opts.userId, - agentId: opts.agentId, - conversationId: opts.conversationId, + tenantId: normalizedOpts.tenantId, + groupId: normalizedOpts.groupId, + userId: normalizedOpts.userId, + agentId: normalizedOpts.agentId, + conversationId: normalizedOpts.conversationId, } - const result = await searchGraphPassages(query, identity, opts) + const result = await searchGraphChunks(query, identity, normalizedOpts) return result.trace } @@ -1708,12 +2522,12 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon async function backfill( identity: typegraphIdentity, - opts: GraphBackfillOpts = {}, + opts?: GraphBackfillOpts | null, ): Promise { - const batchSize = Math.max(1, opts.batchSize ?? 500) + const normalizedOpts = optionalCompactObject(opts, 'graph.backfill') as GraphBackfillOpts + const batchSize = Math.max(1, normalizedOpts.batchSize ?? 
500) const result: GraphBackfillResult = { - passageNodesUpserted: 0, - passageEntityEdgesUpserted: 0, + entityChunkEdgesUpserted: 0, factRecordsUpserted: 0, entityProfilesUpdated: 0, batches: 0, @@ -1725,43 +2539,28 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon const pageOpts = (offset: number) => ({ chunksTable, scope: identity, - bucketIds: opts.bucketIds, + bucketIds: normalizedOpts.bucketIds, limit: batchSize, offset, }) - if ((opts.passages ?? true) && memoryStore.listPassageBackfillChunks && memoryStore.upsertPassageNodes) { - for (let offset = 0; ; offset += batchSize) { - const rows = await memoryStore.listPassageBackfillChunks(pageOpts(offset)) - if (rows.length === 0) break - result.batches++ - await upsertPassageNodes(rows.map(row => ({ - bucketId: row.bucketId, - documentId: row.documentId, - chunkIndex: row.chunkIndex, - chunkId: row.chunkId, - embeddingModel: row.embeddingModel, - contentHash: contentHashFor(row.content), - metadata: row.metadata, - visibility: row.visibility, - tenantId: row.tenantId, - groupId: row.groupId, - userId: row.userId, - agentId: row.agentId, - conversationId: row.conversationId, - }))) - result.passageNodesUpserted += rows.length - if (rows.length < batchSize) break - } - } - - if ((opts.passageEntityEdges ?? true) && memoryStore.listPassageMentionBackfillRows && memoryStore.upsertPassageEntityEdges) { + if ((normalizedOpts.entityChunkEdges ?? true) && memoryStore.listChunkMentionBackfillRows && memoryStore.upsertGraphEdges) { for (let offset = 0; ; offset += batchSize) { - const rows = await memoryStore.listPassageMentionBackfillRows(pageOpts(offset)) + const rows = await memoryStore.listChunkMentionBackfillRows(pageOpts(offset)) if (rows.length === 0) break result.batches++ - const edgeMap = new Map() + const edgeMap = new Map() for (const row of rows) { const scope = mergeScope(defaultScope, { tenantId: row.tenantId, @@ -1770,22 +2569,24 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon agentId: row.agentId, conversationId: row.conversationId, }) - const passageId = passageIdFor({ - scope, + const chunkRef: ChunkRef = { bucketId: row.bucketId, - documentId: row.documentId, + sourceId: row.sourceId, chunkIndex: row.chunkIndex, embeddingModel: row.embeddingModel, - }) - const key = `${passageId}:${row.entityId}` + chunkId: row.chunkId, + } + const key = `${chunkRefKey(chunkRef)}:${row.entityId}` const current = edgeMap.get(key) ?? { - passageId, entityId: row.entityId, + chunkRef, weight: 0, mentionCount: 0, confidence: undefined, surfaceTexts: [], mentionTypes: [], + scope, + visibility: row.visibility, } current.mentionCount += 1 current.confidence = Math.max(current.confidence ?? 0, row.confidence ?? 0) @@ -1801,21 +2602,21 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon edgeMap.set(key, current) } - const edges = [...edgeMap.values()] - await memoryStore.upsertPassageEntityEdges(edges) - result.passageEntityEdgesUpserted += edges.length + const edges = [...edgeMap.values()].map(edge => buildEntityChunkGraphEdge(edge)) + await memoryStore.upsertGraphEdges(edges) + result.entityChunkEdgesUpserted += edges.length if (rows.length < batchSize) break } } - const shouldBackfillFacts = opts.facts ?? true - const shouldBackfillProfiles = opts.entityProfiles ?? true + const shouldBackfillFacts = normalizedOpts.facts ?? true + const shouldBackfillProfiles = normalizedOpts.entityProfiles ?? 
true if ((shouldBackfillFacts || shouldBackfillProfiles) && memoryStore.listSemanticEdgesForBackfill) { const updatedProfileEntityIds = new Set() for (let offset = 0; ; offset += batchSize) { const edges = await memoryStore.listSemanticEdgesForBackfill({ scope: identity, - bucketIds: opts.bucketIds, + bucketIds: normalizedOpts.bucketIds, limit: batchSize, offset, }) @@ -1875,11 +2676,12 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon // ── Graph Exploration ── - async function getEntity(id: string, opts: typegraphIdentity = {}): Promise { - const entity = await graph.getEntity(id, opts) + async function getEntity(id: string, opts?: typegraphIdentity | null): Promise { + const normalizedOpts = optionalCompactObject(opts, 'graph.getEntity') as typegraphIdentity + const entity = await graph.getEntity(id, normalizedOpts) if (!entity) return null - const edges = await graph.getEdges(id, 'both', opts) + const edges = await graph.getEdges(id, 'both', normalizedOpts) const neighborIds = new Set() for (const e of edges) { neighborIds.add(e.sourceEntityId) @@ -1887,7 +2689,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon } neighborIds.delete(id) const nameMap = new Map([[id, entity.name]]) - const neighbors = await graph.getEntitiesBatch([...neighborIds], opts) + const neighbors = await graph.getEntitiesBatch([...neighborIds], normalizedOpts) for (const n of neighbors) nameMap.set(n.id, n.name) const topEdges: EdgeResult[] = edges @@ -1909,6 +2711,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon name: entity.name, entityType: entity.entityType, aliases: entity.aliases, + externalIds: entity.externalIds, edgeCount: edges.length, properties: entity.properties, description: entity.properties.description as string | undefined, @@ -1919,21 +2722,30 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon } } - async function getEdges(entityId: string, opts?: { + async function getEdges(entityId: string, opts?: ({ direction?: 'in' | 'out' | 'both' relation?: string limit?: number - } & typegraphIdentity): Promise { + } & typegraphIdentity) | null): Promise { + const normalizedOpts = optionalCompactObject<{ + direction?: 'in' | 'out' | 'both' + relation?: string + limit?: number + } & typegraphIdentity>(opts, 'graph.getEdges') as { + direction?: 'in' | 'out' | 'both' + relation?: string + limit?: number + } & typegraphIdentity const identity = { - tenantId: opts?.tenantId, - groupId: opts?.groupId, - userId: opts?.userId, - agentId: opts?.agentId, - conversationId: opts?.conversationId, + tenantId: normalizedOpts.tenantId, + groupId: normalizedOpts.groupId, + userId: normalizedOpts.userId, + agentId: normalizedOpts.agentId, + conversationId: normalizedOpts.conversationId, } - let edges = await graph.getEdges(entityId, opts?.direction ?? 'both', identity) - if (opts?.relation) { - edges = edges.filter(e => e.relation === opts.relation) + let edges = await graph.getEdges(entityId, normalizedOpts.direction ?? 'both', identity) + if (normalizedOpts.relation) { + edges = edges.filter(e => e.relation === normalizedOpts.relation) } const entityIds = new Set() @@ -1945,7 +2757,7 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon const ents = await graph.getEntitiesBatch([...entityIds], identity) for (const ent of ents) nameMap.set(ent.id, ent.name) - const limit = opts?.limit ?? 50 + const limit = normalizedOpts.limit ?? 
50 return edges.slice(0, limit).map(e => ({ id: e.id, sourceEntityId: e.sourceEntityId, @@ -1959,36 +2771,37 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon } async function getSubgraph(opts: SubgraphOpts): Promise { - let seedIds = opts.entityIds ?? [] - if (opts.query && (memoryStore.searchEntities || memoryStore.searchEntitiesHybrid)) { - const queryEmb = await embedding.embed(opts.query) + const normalizedOpts = requiredObject(opts, 'graph.getSubgraph', 'opts') + let seedIds = normalizedOpts.entityIds ?? [] + if (normalizedOpts.query && (memoryStore.searchEntities || memoryStore.searchEntitiesHybrid)) { + const queryEmb = await embedding.embed(normalizedOpts.query) const found = memoryStore.searchEntitiesHybrid - ? await memoryStore.searchEntitiesHybrid(opts.query, queryEmb, opts.identity, opts.limit ?? 10) - : await memoryStore.searchEntities!(queryEmb, opts.identity, opts.limit ?? 10) + ? await memoryStore.searchEntitiesHybrid(normalizedOpts.query, queryEmb, normalizedOpts.identity, normalizedOpts.limit ?? 10) + : await memoryStore.searchEntities!(queryEmb, normalizedOpts.identity, normalizedOpts.limit ?? 10) seedIds = [...seedIds, ...found.map(e => e.id)] } if (seedIds.length === 0) { return { entities: [], edges: [], stats: { entityCount: 0, edgeCount: 0, avgDegree: 0, components: 0 } } } - const depth = Math.min(opts.depth ?? 1, 3) - const sub = await graph.getSubgraph(seedIds, depth, opts.identity) + const depth = Math.min(normalizedOpts.depth ?? 1, 3) + const sub = await graph.getSubgraph(seedIds, depth, normalizedOpts.identity) let entities = sub.entities let edges = sub.edges - if (opts.entityTypes?.length) { - const types = new Set(opts.entityTypes) + if (normalizedOpts.entityTypes?.length) { + const types = new Set(normalizedOpts.entityTypes) entities = entities.filter(e => types.has(e.entityType)) } - if (opts.relations?.length) { - const rels = new Set(opts.relations) + if (normalizedOpts.relations?.length) { + const rels = new Set(normalizedOpts.relations) edges = edges.filter(e => rels.has(e.relation)) } - if (opts.minWeight) { - edges = edges.filter(e => e.weight >= opts.minWeight!) + if (normalizedOpts.minWeight) { + edges = edges.filter(e => e.weight >= normalizedOpts.minWeight!) } - const entityLimit = opts.limit ?? 100 + const entityLimit = normalizedOpts.limit ?? 100 entities = entities.slice(0, entityLimit) const entitySet = new Set(entities.map(e => e.id)) edges = edges.filter(e => entitySet.has(e.sourceEntityId) && entitySet.has(e.targetEntityId)) @@ -2070,6 +2883,28 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon return memoryStore.getEntityTypes ? memoryStore.getEntityTypes(identity) : [] } + async function mergeEntities(input: MergeGraphEntitiesInput): Promise { + if (!memoryStore.mergeEntityReferences) { + throw new ConfigError('MemoryStoreAdapter does not support transactional entity merge operations.') + } + const result = await memoryStore.mergeEntityReferences(input) + return { + ...result, + target: await getEntity(input.targetEntityId, input) + ?? 
result.target, + } + } + + async function deleteEntity(entityId: string, opts?: DeleteGraphEntityOpts | null): Promise { + if (!memoryStore.deleteEntityReferences) { + throw new ConfigError('MemoryStoreAdapter does not support transactional entity delete operations.') + } + return memoryStore.deleteEntityReferences( + entityId, + optionalCompactObject(opts, 'graph.deleteEntity') as DeleteGraphEntityOpts, + ) + } + async function deploy(): Promise { await memoryStore.initialize() } @@ -2077,13 +2912,25 @@ export function createKnowledgeGraphBridge(config: CreateKnowledgeGraphBridgeCon return { deploy, addTriple, + addSourceSubject, + upsertEntity, + upsertEntities, + resolveEntity, + linkExternalIds, + mergeEntities, + deleteEntity, + upsertEdge, + upsertEdges, + upsertFact, + upsertFacts, addEntityMentions, - upsertPassageNodes, searchEntities, searchFacts, explore, - getPassagesForEntity, - searchGraphPassages, + resolveEntityScope, + searchKnowledge, + getChunksForEntity, + searchGraphChunks, explainQuery, backfill, getEntity, diff --git a/packages/sdk/src/graph/query-intent.ts b/packages/sdk/src/graph/query-intent.ts index 5bff900..868d592 100644 --- a/packages/sdk/src/graph/query-intent.ts +++ b/packages/sdk/src/graph/query-intent.ts @@ -1,6 +1,6 @@ import { z } from 'zod/v4-mini' import type { LLMProvider } from '../types/llm-provider.js' -import type { GraphQueryIntent, GraphQueryIntentPredicate, ParsedGraphQueryIntent } from '../types/graph-bridge.js' +import type { GraphIntentParserMode, GraphQueryIntent, GraphQueryIntentPredicate, ParsedGraphQueryIntent } from '../types/graph-bridge.js' import { ALL_PREDICATES, getPredicatesForPrompt } from '../index-engine/ontology.js' import { PredicateNormalizer } from '../memory/extraction/predicate-normalizer.js' import type { EmbeddingProvider } from '../embedding/provider.js' @@ -23,13 +23,73 @@ const intentSchema = z.object({ sourceEntityQueries: z._default(z.array(z.string()).check(z.maxLength(12)), []), targetEntityQueries: z._default(z.array(z.string()).check(z.maxLength(12)), []), predicates: z._default(z.array(predicateSchema).check(z.maxLength(16)), []), - answerSide: z._default(z.enum(['source', 'target', 'either', 'none']), 'none'), subqueries: z._default(z.array(z.string()).check(z.maxLength(8)), []), mode: z._default(z.enum(['fact', 'relationship', 'summary', 'creative']), 'fact'), + strictness: z._default(z.enum(['strict', 'soft', 'none']), 'strict'), }) +type IntentMode = GraphQueryIntent['mode'] +type IntentStrictness = GraphQueryIntent['strictness'] + +interface PredicateNormalization { + predicates: GraphQueryIntentPredicate[] + rejectedPredicates: string[] +} + +interface IntentDraft { + sourceEntityQueries?: string[] | undefined + targetEntityQueries?: string[] | undefined + predicates?: Array<{ name: string; confidence?: number | undefined }> | undefined + subqueries?: string[] | undefined + mode?: IntentMode | undefined + strictness?: IntentStrictness | undefined + matchedPatterns?: string[] | undefined +} + +const QUERY_WORDS = new Set([ + 'who', + 'what', + 'where', + 'when', + 'why', + 'how', + 'which', + 'write', + 'imagine', + 'tell', + 'summarize', + 'compare', +]) + +const ENTITY_STOP_WORDS = new Set([ + 'book', + 'books', + 'company', + 'city', + 'country', + 'diary', + 'entry', + 'father', + 'husband', + 'letter', + 'mother', + 'organization', + 'parent', + 'parents', + 'chunk', + 'chunks', + 'relationship', + 'sibling', + 'siblings', + 'sister', + 'son', + 'spouse', + 'temples', + 'wife', +]) + function 
cleanText(value: string): string { - return value.replace(/\s+/g, ' ').trim() + return value.replace(/[“”]/g, '"').replace(/[‘’]/g, "'").replace(/\s+/g, ' ').trim() } function unique(items: T[]): T[] { @@ -40,14 +100,22 @@ function cleanQueries(values: string[]): string[] { return unique(values.map(cleanText).filter(value => value.length > 0 && value.length <= 140)) } -function normalizePredicates(values: Array<{ name: string; confidence?: number | undefined }>): GraphQueryIntentPredicate[] { +function relationToPhrase(relation: string): string { + return relation.toLowerCase().replace(/_/g, ' ') +} + +function normalizePredicates(values: Array<{ name: string; confidence?: number | undefined }>): PredicateNormalization { const byName = new Map() + const rejectedPredicates: string[] = [] for (const value of values) { const normalized = predicateNormalizer.normalizeWithDirection(value.name) - if (!normalized.valid || !(ALL_PREDICATES as ReadonlySet).has(normalized.predicate)) continue + if (!normalized.valid || !(ALL_PREDICATES as ReadonlySet).has(normalized.predicate)) { + rejectedPredicates.push(value.name) + continue + } const confidence = typeof value.confidence === 'number' ? Math.max(0, Math.min(1, value.confidence)) - : 0.8 + : 0.95 const existing = byName.get(normalized.predicate) if (!existing || confidence > existing.confidence) { byName.set(normalized.predicate, { @@ -57,7 +125,7 @@ function normalizePredicates(values: Array<{ name: string; confidence?: number | }) } } - return [...byName.values()] + return { predicates: [...byName.values()], rejectedPredicates: unique(rejectedPredicates) } } function emptyIntent(query: string): GraphQueryIntent { @@ -66,65 +134,412 @@ function emptyIntent(query: string): GraphQueryIntent { sourceEntityQueries: [], targetEntityQueries: [], predicates: [], - answerSide: 'none', subqueries: [], mode: 'fact', + strictness: 'none', } } -function parsedNone(query: string): ParsedGraphQueryIntent { +function parsedNone(query: string, parseMs?: number): ParsedGraphQueryIntent { return { parser: 'none', - fallbackUsed: false, intent: emptyIntent(query), + ...(typeof parseMs === 'number' ? { parseMs } : {}), } } -function buildIntent(query: string, raw: z.infer): GraphQueryIntent { +function isIntentEmpty(intent: GraphQueryIntent): boolean { + return ( + intent.sourceEntityQueries.length === 0 && + intent.targetEntityQueries.length === 0 && + intent.predicates.length === 0 && + intent.subqueries.length === 0 && + intent.strictness === 'none' + ) +} + +function buildIntent( + query: string, + raw: z.infer | IntentDraft, +): { intent: GraphQueryIntent; rejectedPredicates: string[] } { + const normalized = normalizePredicates(raw.predicates ?? []) + const sourceEntityQueries = cleanQueries(raw.sourceEntityQueries ?? []) + const targetEntityQueries = cleanQueries(raw.targetEntityQueries ?? []) + const predicates = normalized.predicates + const strictness = raw.strictness ?? ( + predicates.length > 0 && (sourceEntityQueries.length > 0 || targetEntityQueries.length > 0) + ? 'strict' + : (sourceEntityQueries.length > 0 || targetEntityQueries.length > 0 || cleanQueries(raw.subqueries ?? []).length > 0 ? 
'soft' : 'none') + ) return { - rawQuery: query, - sourceEntityQueries: cleanQueries(raw.sourceEntityQueries), - targetEntityQueries: cleanQueries(raw.targetEntityQueries), - predicates: normalizePredicates(raw.predicates), - answerSide: raw.answerSide, - subqueries: cleanQueries(raw.subqueries), - mode: raw.mode, + intent: { + rawQuery: query, + sourceEntityQueries, + targetEntityQueries, + predicates, + subqueries: cleanQueries(raw.subqueries ?? []), + mode: raw.mode ?? 'fact', + strictness, + }, + rejectedPredicates: normalized.rejectedPredicates, + } +} + +function stripTerminalPunctuation(value: string): string { + return value.replace(/[?!.,:;]+$/g, '').trim() +} + +function cleanEntity(value: string): string { + const original = cleanText(stripTerminalPunctuation(value)) + const quoted = /^["'`].*["'`]$/.test(original) + let entity = original.replace(/^["'`]+|["'`]+$/g, '').trim() + entity = entity.replace(/\s+(?:is|are|was|were)$/i, '').trim() + entity = entity.replace(/^(.+?)['’]s\s+.+$/i, '$1').replace(/^(.+?)['’]\s+.+$/i, '$1').trim() + entity = entity.replace(/['’]s$/i, '').replace(/['’]$/i, '').trim() + entity = stripTerminalPunctuation(entity) + if (!quoted) entity = entity.replace(/^(?:the|a|an)\s+/i, '').trim() + return entity +} + +function validEntity(value: string): boolean { + const normalized = value.toLowerCase() + if (!normalized || normalized.length < 2) return false + if (QUERY_WORDS.has(normalized) || ENTITY_STOP_WORDS.has(normalized)) return false + return true +} + +function entity(value: string | undefined): string[] { + if (!value) return [] + const cleaned = cleanEntity(value) + return validEntity(cleaned) ? [cleaned] : [] +} + +function predicate(name: string, confidence = 1): Array<{ name: string; confidence: number }> { + return [{ name, confidence }] +} + +function subquery(entities: string[], predicateName?: string): string[] { + const parts = cleanQueries([...entities]) + if (parts.length === 0) return [] + return [predicateName ? `${parts.join(' ')} ${relationToPhrase(predicateName)}` : parts.join(' ')] +} + +function draft(input: IntentDraft): IntentDraft { + const sourceEntityQueries = cleanQueries(input.sourceEntityQueries ?? []) + const targetEntityQueries = cleanQueries(input.targetEntityQueries ?? []) + const firstPredicate = input.predicates?.[0]?.name + return { + ...input, + sourceEntityQueries, + targetEntityQueries, + subqueries: input.subqueries ?? 
subquery([...sourceEntityQueries, ...targetEntityQueries], firstPredicate), + } +} + +function matchOne(query: string, patterns: RegExp[]): RegExpMatchArray | null { + for (const pattern of patterns) { + const match = query.match(pattern) + if (match) return match + } + return null +} + +function extractNamedEntities(query: string): string[] { + const names: string[] = [] + const quoted = query.matchAll(/["']([^"']{2,120})["']/g) + for (const match of quoted) names.push(...entity(match[1])) + + const titleCase = query.matchAll(/\b(?:[A-Z][A-Za-z0-9]*(?:[-'][A-Za-z0-9]+)*|[A-Z]{2,})(?:(?:\s+|,\s*)(?:[A-Z][A-Za-z0-9]*(?:[-'][A-Za-z0-9]+)*|[A-Z]{2,}|Mass\.|D\.C\.))*\.?/g) + for (const match of titleCase) { + const candidate = cleanEntity(match[0]) + if (!validEntity(candidate)) continue + const first = candidate.split(/\s+/)[0]?.toLowerCase() + if (first && QUERY_WORDS.has(first)) continue + names.push(candidate) + } + + return cleanQueries(names) +} + +function parseDirectFact(query: string): IntentDraft | null { + let match = matchOne(query, [ + /^who\s+(?:killed|murdered|assassinated|slew|slayed|stabbed)\s+(.+?)\??$/i, + /^who\s+was\s+(.+?)\s+(?:killed|murdered|assassinated|slain|stabbed)\s+by\??$/i, + /^(.+?)\s+was\s+(?:killed|murdered|assassinated|slain|stabbed)\s+by\s+whom\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('KILLED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['killed-target'] }) + } + + match = query.match(/^who\s+did\s+(.+?)\s+(?:kill|murder|assassinate|slay|stab)\??$/i) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('KILLED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['killed-source'] }) + } + + match = matchOne(query, [ + /^who\s+(?:is|was|are|were)\s+(.+?)['’]s\s+(?:wife|husband|spouse)\??$/i, + /^who\s+(?:is|was|are|were)\s+(.+?)\s+(?:wife|husband|spouse)\??$/i, + /^who\s+(?:is|was)\s+(?:the\s+)?(?:wife|husband|spouse)\s+of\s+(.+?)\??$/i, + /^who\s+(?:was|is)\s+(.+?)\s+married\s+to\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('MARRIED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['spouse'] }) + } + + match = matchOne(query, [ + /^who\s+(?:are|is|was|were)\s+(.+?)['’]s\s+(?:parents?|father|mother|ancestors?)\??$/i, + /^who\s+(?:is|are|was|were)\s+(?:the\s+)?(?:parents?|father|mother|ancestors?)\s+of\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('PARENT_OF'), mode: 'fact', strictness: 'strict', matchedPatterns: ['parent-target'] }) + } + + match = query.match(/^who\s+did\s+(.+?)\s+father\??$/i) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('PARENT_OF'), mode: 'fact', strictness: 'strict', matchedPatterns: ['parent-source'] }) + } + + match = matchOne(query, [ + /^who\s+(?:are|is|was|were)\s+(.+?)['’]s\s+(?:children|child|sons?|daughters?|offspring)\??$/i, + /^who\s+(?:is|are|was|were)\s+(?:the\s+)?(?:children|child|sons?|daughters?|offspring)\s+of\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('CHILD_OF'), mode: 'fact', strictness: 'strict', matchedPatterns: ['child-target'] }) + } + + match = matchOne(query, [ + 
/^who\s+(?:is|are|was|were)\s+(.+?)['’]s\s+(?:brother|sister|siblings?|sibling)\??$/i, + /^who\s+(?:is|are|was|were)\s+(?:the\s+)?(?:brother|sister|siblings?|sibling)\s+of\s+(.+?)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('SIBLING_OF'), mode: 'fact', strictness: 'strict', matchedPatterns: ['sibling'] }) + } + + match = matchOne(query, [ + /^who\s+(?:wrote|authored|composed)\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + const verb = query.match(/\b(authored|composed)\b/i)?.[1]?.toUpperCase() ?? 'AUTHORED' + return draft({ targetEntityQueries, predicates: predicate(verb), mode: 'fact', strictness: 'strict', matchedPatterns: ['work-target'] }) + } + + match = matchOne(query, [ + /^what\s+(?:books?|works?|novels?|plays?|poems?)?\s*did\s+(.+?)\s+(?:write|author|compose)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('AUTHORED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['work-source'] }) + } + + match = matchOne(query, [ + /^who\s+(?:founded|established)\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('FOUNDED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['founded-target'] }) + } + + match = matchOne(query, [ + /^what\s+(?:company|organization|org|institution|project)?\s*did\s+(.+?)\s+(co[-\s]?found|found|establish)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('FOUNDED'), mode: 'fact', strictness: 'strict', matchedPatterns: ['founded-source'] }) + } + + match = matchOne(query, [ + /^where\s+was\s+(.+?)\s+born\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('LOCATED_IN'), mode: 'fact', strictness: 'strict', matchedPatterns: ['born-in'] }) + } + + match = matchOne(query, [ + /^where\s+did\s+(.+?)\s+die\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('DIED_IN'), mode: 'fact', strictness: 'strict', matchedPatterns: ['died-in'] }) + } + + match = matchOne(query, [ + /^where\s+(?:is|was)\s+(.+?)\s+headquartered\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('HEADQUARTERED_IN'), mode: 'fact', strictness: 'strict', matchedPatterns: ['headquartered-in'] }) + } + + match = matchOne(query, [ + /^(?:what\s+(?:city|country|place|location)\s+)?(?:is|was)\s+(.+?)\s+located\s+in\??$/i, + /^where\s+(?:is|was)\s+(.+?)\s+located\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('LOCATED_IN'), mode: 'fact', strictness: 'strict', matchedPatterns: ['located-in'] }) + } + + match = matchOne(query, [ + /^who\s+leads\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('LEADS'), mode: 'fact', strictness: 'strict', matchedPatterns: ['leads-target'] }) + } + + match = matchOne(query, [ + /^what\s+(?:organization|company|org|institution)\s+does\s+(.+?)\s+lead\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: 
predicate('LEADS'), mode: 'fact', strictness: 'strict', matchedPatterns: ['leads-source'] }) + } + + match = matchOne(query, [ + /^who\s+works\s+for\s+(.+?)\??$/i, + ]) + if (match) { + const targetEntityQueries = entity(match[1]) + return draft({ targetEntityQueries, predicates: predicate('WORKS_FOR'), mode: 'fact', strictness: 'strict', matchedPatterns: ['works-for-target'] }) + } + + match = matchOne(query, [ + /^where\s+does\s+(.+?)\s+work\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + return draft({ sourceEntityQueries, predicates: predicate('WORKS_FOR'), mode: 'fact', strictness: 'strict', matchedPatterns: ['works-for-source'] }) + } + + return null +} + +function parseSoftIntent(query: string): IntentDraft | null { + let match = matchOne(query, [ + /^summarize\s+(?:the\s+)?relationship\s+between\s+(.+?)\s+and\s+(.+?)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + const targetEntityQueries = entity(match[2]) + return draft({ sourceEntityQueries, targetEntityQueries, mode: 'summary', strictness: 'soft', subqueries: [`${sourceEntityQueries[0] ?? ''} ${targetEntityQueries[0] ?? ''} relationship`], matchedPatterns: ['relationship-between-summary'] }) + } + + match = matchOne(query, [ + /^how\s+(?:are|is)\s+(.+?)\s+and\s+(.+?)\s+(?:related|connected|linked)\??$/i, + /^what\s+connects\s+(.+?)\s+and\s+(.+?)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + const targetEntityQueries = entity(match[2]) + return draft({ sourceEntityQueries, targetEntityQueries, mode: 'relationship', strictness: 'soft', subqueries: [`${sourceEntityQueries[0] ?? ''} ${targetEntityQueries[0] ?? ''} relationship`], matchedPatterns: ['relationship-between'] }) + } + + match = matchOne(query, [ + /^write\b.*?\bfrom\s+(.+?)['’]s\s+perspective\s+about\s+(.+?)\??$/i, + /^imagine\b.*?\bfrom\s+(.+?)\s+to\s+(.+?)\??$/i, + ]) + if (match) { + const sourceEntityQueries = cleanQueries([...entity(match[1]), ...entity(match[2])]) + return draft({ sourceEntityQueries, mode: 'creative', strictness: 'soft', subqueries: sourceEntityQueries, matchedPatterns: ['creative-anchors'] }) + } + + if (/\b(write|imagine|compose|draft)\b/i.test(query)) { + const sourceEntityQueries = extractNamedEntities(query) + if (sourceEntityQueries.length > 0) { + return draft({ sourceEntityQueries, mode: 'creative', strictness: 'soft', subqueries: sourceEntityQueries, matchedPatterns: ['creative-named-entities'] }) + } + } + + match = matchOne(query, [ + /^tell\s+me\s+about\s+(.+?)\??$/i, + /^what\s+do\s+we\s+know\s+about\s+(.+?)\??$/i, + /^summarize\s+(.+?)\??$/i, + ]) + if (match) { + const sourceEntityQueries = entity(match[1]) + if (sourceEntityQueries.length > 0 && !/^(?:this|it|that|these|those)$/i.test(sourceEntityQueries[0]!)) { + return draft({ sourceEntityQueries, mode: 'summary', strictness: 'soft', subqueries: sourceEntityQueries, matchedPatterns: ['anchor-summary'] }) + } + } + + match = query.match(/\babout\s+(.+?)['’]\s+/i) + if (match) { + const sourceEntityQueries = entity(match[1]) + if (sourceEntityQueries.length > 0) { + return draft({ sourceEntityQueries, mode: 'summary', strictness: 'soft', subqueries: sourceEntityQueries, matchedPatterns: ['possessive-anchor'] }) + } + } + + const namedEntities = extractNamedEntities(query) + if (namedEntities.length > 0 && !/^(?:who|what|where|when|why|how|which)\b/i.test(query)) { + return draft({ sourceEntityQueries: namedEntities, mode: 'summary', strictness: 'soft', subqueries: namedEntities, matchedPatterns: 
['named-entity-anchor'] }) + } + + return null +} + +function parseDeterministic(query: string): ParsedGraphQueryIntent { + const startedAt = Date.now() + const cleaned = cleanText(query) + const raw = parseDirectFact(cleaned) ?? parseSoftIntent(cleaned) + if (!raw) return parsedNone(query, Date.now() - startedAt) + + const parsed = buildIntent(query, raw) + if (isIntentEmpty(parsed.intent)) return parsedNone(query, Date.now() - startedAt) + + return { + parser: 'deterministic', + intent: parsed.intent, + matchedPatterns: raw.matchedPatterns ?? [], + rejectedPredicates: parsed.rejectedPredicates, + parseMs: Date.now() - startedAt, } } async function parseWithLlm(query: string, llm: LLMProvider): Promise { + const startedAt = Date.now() const prompt = [ - 'Parse this user query into graph-native retrieval intent.', + 'Parse this user query into graph retrieval intent.', '', `Query: ${query}`, '', 'Return JSON only with this exact shape:', - '{ "sourceEntityQueries": string[], "targetEntityQueries": string[], "predicates": [{ "name": string, "confidence": number }], "answerSide": "source" | "target" | "either" | "none", "subqueries": string[], "mode": "fact" | "relationship" | "summary" | "creative" }', + '{ "sourceEntityQueries": string[], "targetEntityQueries": string[], "predicates": [{ "name": string, "confidence": number }], "subqueries": string[], "mode": "fact" | "relationship" | "summary" | "creative", "strictness": "strict" | "soft" | "none" }', '', - 'Definitions:', - '- sourceEntityQueries: entity names expected on the stored edge source side.', - '- targetEntityQueries: entity names expected on the stored edge target side.', - '- predicates: ontology predicates the graph edge must match. Use canonical predicates only.', - '- answerSide: which edge endpoint is being asked for. Use "source" when asking who/what points to a known target; "target" when asking who/what a known source points to; "either" for symmetric relationships; "none" when no endpoint answer is requested.', - '- subqueries: short natural-language searches that can help retrieve supporting passages. Keep them grounded in the original question.', - '- mode: fact for direct fact lookup, relationship for relationship listings, summary for summarization, creative for creative/genre tasks.', + 'Field contract:', + '- sourceEntityQueries: known entities expected on stored edge source side.', + '- targetEntityQueries: known entities expected on stored edge target side.', + '- predicates: canonical ontology predicates required or useful for the query.', + '- subqueries: at most two short searches grounded in the original question.', + '- mode: fact for direct lookup, relationship for relationship questions, summary for summarization, creative for creative/genre tasks.', + '- strictness: strict only for explicit fact lookups with clear entity direction and predicate; soft for summaries, creative tasks, relationship exploration, or ambiguous direction; none when no graph intent exists.', '', - 'Critical rules:', - '- Do not infer extra predicates because nearby facts might exist. Only include predicates needed by the question.', - '- Preserve edge direction. "Who killed Chaacmol?" means targetEntityQueries=["Chaacmol"], predicates=["KILLED"], answerSide="source".', - '- Preserve active direction. "Who did Aac kill?" 
means sourceEntityQueries=["Aac"], predicates=["KILLED"], answerSide="target".', - '- Spouse, husband, wife, and married questions use MARRIED, never HUSBAND_OF or WIFE_OF.', - '- Parent/father/mother/ancestor questions use PARENT_OF when asking for parents of a known child.', - '- Child/son/daughter questions use CHILD_OF when asking for children of a known parent.', - '- Brother/sister/sibling questions use SIBLING_OF.', - '- If the query does not specify a graph relationship, leave predicates empty and use source/target queries only if explicit entities are present.', + 'Rules:', + '- Direction is represented only by sourceEntityQueries and targetEntityQueries.', + '- Do not emit answerSide.', + '- Do not infer extra predicates because nearby facts might exist.', + '- Do not force predicates for summary or creative queries.', + '- Spouse, husband, wife, and married questions use MARRIED.', + '- Passive voice must preserve logical edge direction.', '', 'Examples:', - '- "Who are Chaacmol parents?" -> {"sourceEntityQueries":[],"targetEntityQueries":["Chaacmol"],"predicates":[{"name":"PARENT_OF","confidence":0.98}],"answerSide":"source","subqueries":["Chaacmol parents","Chaacmol father mother ancestor"],"mode":"fact"}', - '- "Who killed Chaacmol?" -> {"sourceEntityQueries":[],"targetEntityQueries":["Chaacmol"],"predicates":[{"name":"KILLED","confidence":0.98}],"answerSide":"source","subqueries":["who killed Chaacmol"],"mode":"fact"}', - '- "Who did Aac kill?" -> {"sourceEntityQueries":["Aac"],"targetEntityQueries":[],"predicates":[{"name":"KILLED","confidence":0.98}],"answerSide":"target","subqueries":["Aac killed"],"mode":"fact"}', - '- "Who is Chaacmol wife?" -> {"sourceEntityQueries":["Chaacmol"],"targetEntityQueries":[],"predicates":[{"name":"MARRIED","confidence":0.98}],"answerSide":"target","subqueries":["Chaacmol wife spouse married"],"mode":"fact"}', - '- "Summarize the relationship between Aac and Chaacmol" -> {"sourceEntityQueries":["Aac"],"targetEntityQueries":["Chaacmol"],"predicates":[],"answerSide":"either","subqueries":["Aac Chaacmol relationship"],"mode":"summary"}', + '- "Who founded Stripe?" -> {"sourceEntityQueries":[],"targetEntityQueries":["Stripe"],"predicates":[{"name":"FOUNDED","confidence":0.98}],"subqueries":["Stripe founded"],"mode":"fact","strictness":"strict"}', + '- "What did Ada Lovelace write?" -> {"sourceEntityQueries":["Ada Lovelace"],"targetEntityQueries":[],"predicates":[{"name":"AUTHORED","confidence":0.98}],"subqueries":["Ada Lovelace authored"],"mode":"fact","strictness":"strict"}', + '- "Who wrote Frankenstein?" -> {"sourceEntityQueries":[],"targetEntityQueries":["Frankenstein"],"predicates":[{"name":"AUTHORED","confidence":0.98}],"subqueries":["Frankenstein authored"],"mode":"fact","strictness":"strict"}', + '- "Where was Marie Curie born?" -> {"sourceEntityQueries":["Marie Curie"],"targetEntityQueries":[],"predicates":[{"name":"LOCATED_IN","confidence":0.98}],"subqueries":["Marie Curie born in"],"mode":"fact","strictness":"strict"}', + '- "Who was Hamlet killed by?" -> {"sourceEntityQueries":[],"targetEntityQueries":["Hamlet"],"predicates":[{"name":"KILLED","confidence":0.98}],"subqueries":["Hamlet killed"],"mode":"fact","strictness":"strict"}', + '- "Summarize the relationship between Tesla and Edison" -> {"sourceEntityQueries":["Tesla"],"targetEntityQueries":["Edison"],"predicates":[],"subqueries":["Tesla Edison relationship"],"mode":"summary","strictness":"soft"}', + '- "How are Kubernetes and Docker related?" 
-> {"sourceEntityQueries":["Kubernetes"],"targetEntityQueries":["Docker"],"predicates":[],"subqueries":["Kubernetes Docker relationship"],"mode":"relationship","strictness":"soft"}', + '- "Write a diary entry from Elizabeth Bennet\'s perspective about Darcy" -> {"sourceEntityQueries":["Elizabeth Bennet","Darcy"],"targetEntityQueries":[],"predicates":[],"subqueries":["Elizabeth Bennet","Darcy"],"mode":"creative","strictness":"soft"}', '', 'Valid predicate vocabulary:', getPredicatesForPrompt(), @@ -132,31 +547,29 @@ async function parseWithLlm(query: string, llm: LLMProvider): Promise>(prompt, undefined, { schema: intentSchema, - maxOutputTokens: 1024, + maxOutputTokens: 768, }) const parsed = intentSchema.parse(raw) const intent = buildIntent(query, parsed) - if ( - intent.sourceEntityQueries.length === 0 && - intent.targetEntityQueries.length === 0 && - intent.predicates.length === 0 && - intent.subqueries.length === 0 - ) { - return parsedNone(query) - } + if (isIntentEmpty(intent.intent)) return parsedNone(query, Date.now() - startedAt) return { parser: 'llm', - fallbackUsed: false, - intent, + intent: intent.intent, + rejectedPredicates: intent.rejectedPredicates, + parseMs: Date.now() - startedAt, } } export async function parseGraphQueryIntent(input: { query: string + mode?: GraphIntentParserMode | undefined llm?: LLMProvider | undefined }): Promise { - if (!input.llm) return parsedNone(input.query) + const mode = input.mode ?? 'deterministic' + if (mode === 'none') return parsedNone(input.query, 0) + if (mode === 'deterministic') return parseDeterministic(input.query) + if (!input.llm) return parsedNone(input.query, 0) try { return await parseWithLlm(input.query, input.llm) } catch { diff --git a/packages/sdk/src/index-engine/chunker.ts b/packages/sdk/src/index-engine/chunker.ts index 5833201..ce6bc4a 100644 --- a/packages/sdk/src/index-engine/chunker.ts +++ b/packages/sdk/src/index-engine/chunker.ts @@ -1,5 +1,5 @@ import type { Chunk, ChunkOpts } from '../types/connector.js' -import type { RawDocument } from '../types/connector.js' +import type { SourceInput } from '../types/connector.js' /** Approximate characters per BPE token (calibrated for GPT/Voyage tokenizers). */ const CHARS_PER_TOKEN = 4.2 @@ -165,10 +165,10 @@ function takeTrailingAtWordBoundary(text: string, maxChars: number): string { // ── Public API (token-based wrapper) ── -export async function defaultChunker(doc: RawDocument, opts: ChunkOpts): Promise { - if (!doc.content || doc.content.trim().length === 0) return [] +export async function defaultChunker(source: SourceInput, opts: ChunkOpts): Promise { + if (!source.content || source.content.trim().length === 0) return [] - const results = chunkText(doc.content, { + const results = chunkText(source.content, { maxChars: Math.round(opts.chunkSize * CHARS_PER_TOKEN), overlapChars: opts.chunkOverlap ? 
Math.round(opts.chunkOverlap * CHARS_PER_TOKEN) : 0, }) diff --git a/packages/sdk/src/index-engine/engine.ts b/packages/sdk/src/index-engine/engine.ts index 34a0897..cef68b8 100644 --- a/packages/sdk/src/index-engine/engine.ts +++ b/packages/sdk/src/index-engine/engine.ts @@ -2,13 +2,15 @@ import type { VectorStoreAdapter, HashRecord } from '../types/adapter.js' import type { EmbeddingProvider } from '../embedding/provider.js' import { embeddingModelKey } from '../embedding/provider.js' import type { IngestOptions, IndexResult, ExtractionFailure } from '../types/index-types.js' -import type { RawDocument, Chunk } from '../types/connector.js' +import type { SourceInput, Chunk } from '../types/connector.js' import { chunkIdFor, generateId } from '../utils/id.js' import { sha256, resolveIdempotencyKey, buildHashStoreKey } from './hash.js' import { stripMarkdown } from './strip-markdown.js' import type { TripleExtractor, EntityContext } from './triple-extractor.js' import type { typegraphEventSink } from '../types/events.js' import type { typegraphLogger } from '../types/logger.js' +import type { KnowledgeGraphBridge } from '../types/graph-bridge.js' +import { optionalCompactObject } from '../utils/input.js' /** Race a promise against a timeout. Resolves to undefined on timeout (never rejects). */ function withTimeout(promise: Promise, ms: number): Promise { @@ -48,12 +50,12 @@ function sanitizeInvalidSurrogates(value: string): string { return out } -function sanitizeDocument(doc: RawDocument): RawDocument { +function sanitizeSource(source: SourceInput): SourceInput { return { - ...doc, - url: doc.url ?? undefined, - title: sanitizeText(doc.title), - content: sanitizeText(doc.content), + ...source, + url: source.url ?? undefined, + title: sanitizeText(source.title), + content: sanitizeText(source.content), } } @@ -74,22 +76,24 @@ export class IndexEngine { private embedding: EmbeddingProvider, eventSink?: typegraphEventSink, logger?: typegraphLogger, + private knowledgeGraph?: KnowledgeGraphBridge, ) { this.eventSink = eventSink this.logger = logger } /** - * Ingest a document with pre-built chunks. + * Ingest a source with pre-built chunks. * Skips the default chunker - uses the provided chunks directly. */ async ingestWithChunks( bucketId: string, - doc: RawDocument, + source: SourceInput, chunks: Chunk[], - opts: IngestOptions = {}, + rawOpts?: IngestOptions | null, ): Promise { - const cleanDoc = sanitizeDocument(doc) + const opts = optionalCompactObject(rawOpts, 'IndexEngine.ingestWithChunks') as IngestOptions + const cleanSource = sanitizeSource(source) const cleanChunks = chunks.map(sanitizeChunk) const { tenantId, groupId, userId, agentId, conversationId, visibility, dryRun = false } = opts const shouldExtract = !!this.tripleExtractor && !dryRun && !!opts.graphExtraction @@ -101,39 +105,40 @@ export class IndexEngine { await this.adapter.ensureModel(modelId, this.embedding.dimensions) } - const contentHash = sha256(cleanDoc.content) + const contentHash = sha256(cleanSource.content) const deduplicateBy = opts.deduplicateBy ?? ['url'] - const ikey = resolveIdempotencyKey(cleanDoc, deduplicateBy) + const ikey = resolveIdempotencyKey(cleanSource, deduplicateBy) - let documentId = cleanDoc.id ?? generateId('doc') - let documentWasCreated = true - if (this.adapter.upsertDocumentRecord && !dryRun) { - const documentRecord = await this.adapter.upsertDocumentRecord({ - id: documentId, + let sourceId = cleanSource.id ?? 
generateId('src') + let sourceWasCreated = true + if (this.adapter.upsertSourceRecord && !dryRun) { + const sourceRecord = await this.adapter.upsertSourceRecord({ + id: sourceId, bucketId, tenantId, groupId, userId, agentId, conversationId, - title: cleanDoc.title, - url: cleanDoc.url ?? undefined, + title: cleanSource.title, + url: cleanSource.url ?? undefined, contentHash, chunkCount: cleanChunks.length, status: 'processing', visibility, graphExtracted: shouldExtract, - metadata: cleanDoc.metadata ?? {}, + metadata: cleanSource.metadata ?? {}, + subject: cleanSource.subject, }) - documentId = documentRecord.id - documentWasCreated = documentRecord.wasCreated !== false + sourceId = sourceRecord.id + sourceWasCreated = sourceRecord.wasCreated !== false } try { const textsForEmbedding = cleanChunks.map(c => this.preprocessForEmbedding(c.content, opts)) const embeddings = await this.embedding.embedBatch(textsForEmbedding) - const propagated = this.propagateMetadata(cleanDoc, opts.propagateMetadata) + const propagated = this.propagateMetadata(cleanSource, opts.propagateMetadata) const embeddedChunks = cleanChunks.map((chunk, i) => ({ id: chunkIdFor({ @@ -149,7 +154,7 @@ export class IndexEngine { userId, agentId, conversationId, - documentId, + sourceId, content: chunk.content, embedding: embeddings[i]!, embeddingModel: modelId, @@ -161,44 +166,40 @@ export class IndexEngine { })) if (!dryRun) { - await this.adapter.upsertDocument(modelId, embeddedChunks) - if (shouldExtract) { - await this.tripleExtractor?.persistPassageNodes?.(embeddedChunks.map(chunk => ({ - bucketId: chunk.bucketId, - documentId: chunk.documentId, - chunkIndex: chunk.chunkIndex, - chunkId: chunk.id, - embeddingModel: chunk.embeddingModel, - contentHash: sha256(chunk.content), - metadata: chunk.metadata, - visibility: chunk.visibility, - tenantId: chunk.tenantId, - groupId: chunk.groupId, - userId: chunk.userId, - agentId: chunk.agentId, - conversationId: chunk.conversationId, - }))) - } + await this.adapter.upsertSourceChunks(modelId, embeddedChunks) } + const initialEntityContext = !dryRun + ? await this.materializeSourceSubject( + cleanSource, + bucketId, + sourceId, + modelId, + embeddedChunks, + { tenantId, groupId, userId, agentId, conversationId }, + visibility, + ) + : [] + let extraction: { succeeded: number; failed: number; failedChunks?: ExtractionFailure[] } | undefined if (shouldExtract) { - const documentTitle = (propagated.title as string | undefined) ?? undefined + const sourceTitle = (propagated.title as string | undefined) ?? 
undefined extraction = await this.extractTriplesForChunks( bucketId, - documentId, + sourceId, embeddedChunks, propagated, - documentTitle, + sourceTitle, { tenantId, groupId, userId, agentId, conversationId }, visibility, + initialEntityContext, ) } if (!dryRun) { if (extraction && extraction.failed > 0) { - if (this.adapter.updateDocumentStatus) { - await this.adapter.updateDocumentStatus(documentId, 'failed') + if (this.adapter.updateSourceStatus) { + await this.adapter.updateSourceStatus(sourceId, 'failed') } return { @@ -215,8 +216,8 @@ export class IndexEngine { } } - if (this.adapter.updateDocumentStatus) { - await this.adapter.updateDocumentStatus(documentId, 'complete', cleanChunks.length) + if (this.adapter.updateSourceStatus) { + await this.adapter.updateSourceStatus(sourceId, 'complete', cleanChunks.length) } const storeKey = buildHashStoreKey(tenantId, bucketId, ikey) @@ -237,31 +238,32 @@ export class IndexEngine { mode: 'upsert', total: 1, skipped: 0, - updated: documentWasCreated ? 0 : 1, - inserted: documentWasCreated ? 1 : 0, + updated: sourceWasCreated ? 0 : 1, + inserted: sourceWasCreated ? 1 : 0, pruned: 0, durationMs: Date.now() - startMs, extraction, } } catch (error) { - if (this.adapter.updateDocumentStatus && !dryRun) { - await this.adapter.updateDocumentStatus(documentId, 'failed') + if (this.adapter.updateSourceStatus && !dryRun) { + await this.adapter.updateSourceStatus(sourceId, 'failed') } throw error } } /** - * Ingest a batch of documents with pre-built chunks. - * All chunks across all documents are embedded in a single embedBatch call. + * Ingest a batch of sources with pre-built chunks. + * All chunks across all sources are embedded in a single embedBatch call. */ async ingestBatch( bucketId: string, - items: Array<{ doc: RawDocument; chunks: Chunk[] }>, - opts: IngestOptions = {}, + items: Array<{ source: SourceInput; chunks: Chunk[] }>, + rawOpts?: IngestOptions | null, ): Promise { - const cleanItems = items.map(({ doc, chunks }) => ({ - doc: sanitizeDocument(doc), + const opts = optionalCompactObject(rawOpts, 'IndexEngine.ingestBatch') as IngestOptions + const cleanItems = items.map(({ source, chunks }) => ({ + source: sanitizeSource(source), chunks: chunks.map(sanitizeChunk), })) const { tenantId, groupId, userId, agentId, conversationId, visibility, dryRun = false, traceId, spanId } = opts @@ -273,7 +275,7 @@ export class IndexEngine { id: crypto.randomUUID(), eventType: 'index.start', identity: { tenantId, groupId, userId, agentId, conversationId }, - payload: { bucketId, documentCount: cleanItems.length }, + payload: { bucketId, sourceCount: cleanItems.length }, traceId, spanId, timestamp: new Date(), @@ -296,33 +298,33 @@ export class IndexEngine { pruned: 0, durationMs: 0, } - // Tracks documents whose whole processItem rejected in the concurrent path - // (upsertDocument throw, hashStore failure, etc.). Surfaced in index.complete. + // Tracks sources whose whole processItem rejected in the concurrent path + // (upsertSourceChunks throw, hashStore failure, etc.). Surfaced in index.complete. 
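The batch path above is the renamed source-based flow: each item carries a `SourceInput` plus pre-built chunks, and an optional `source.subject` is materialized into a graph entity before triple extraction. Below is a minimal sketch of the intended call shape, based only on the signatures visible in this diff; the file placement, the prepared `engine` instance, the bucket id, and the example values are assumptions for illustration, not confirmed API.

```ts
// Sketch under assumptions: `engine` is an IndexEngine already wired with a
// vector store adapter, an embedding provider, and a knowledge-graph bridge,
// as in the constructor change earlier in this diff. The import path assumes
// the snippet lives under packages/sdk/src.
import { IndexEngine } from './index-engine/engine.js'

export async function ingestOnboardingGuide(engine: IndexEngine) {
  const source = {
    title: 'Onboarding Guide',
    url: 'https://example.internal/onboarding',
    content: 'Pat Example joined Acme in 2021 and now leads the onboarding program.',
    metadata: { team: 'people-ops' },
    // Optional subject: materializeSourceSubject() turns this into a graph
    // entity via addSourceSubject() before extraction, so extraction starts
    // with the entity already in context.
    subject: { name: 'Pat Example', entityType: 'person' },
  }

  // Pre-built chunks; the shape follows the Chunk usage in this diff (content + chunkIndex).
  const chunks = [{ content: source.content, chunkIndex: 0 }]

  // Phase 1 dedupes against the hash store, Phase 2 embeds every chunk across
  // all items in one embedBatch call, Phase 3 upserts per source.
  const result = await engine.ingestBatch('bucket_docs', [{ source, chunks }], {
    tenantId: 'acme',
    graphExtraction: true,
    deduplicateBy: ['url'],
  })

  return { inserted: result.inserted, updated: result.updated, skipped: result.skipped }
}
```

When `graphExtraction` is enabled the effective concurrency drops to 1, because graph writes are serialized until the graph storage layer is race-safe.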
let processingFailed = 0 - // Phase 1: Prepare all docs and collect all texts for a single embedBatch call + // Phase 1: Prepare all sources and collect all texts for a single embedBatch call const prepared: Array<{ - doc: RawDocument + source: SourceInput chunks: Chunk[] ikey: string contentHash: string - documentId: string - documentWasCreated: boolean + sourceId: string + sourceWasCreated: boolean textOffset: number }> = [] const allTexts: string[] = [] // Batch hash store lookup: check all idempotency keys in a single query - const docMeta = cleanItems.map(({ doc }) => ({ - doc, - contentHash: sha256(doc.content), - ikey: resolveIdempotencyKey(doc, deduplicateBy), - storeKey: buildHashStoreKey(tenantId, bucketId, resolveIdempotencyKey(doc, deduplicateBy)), + const sourceMeta = cleanItems.map(({ source }) => ({ + source, + contentHash: sha256(source.content), + ikey: resolveIdempotencyKey(source, deduplicateBy), + storeKey: buildHashStoreKey(tenantId, bucketId, resolveIdempotencyKey(source, deduplicateBy)), })) let hashMap: Map | undefined if (!dryRun) { - const allStoreKeys = docMeta.map(m => m.storeKey) + const allStoreKeys = sourceMeta.map(m => m.storeKey) hashMap = this.adapter.hashStore.getMany ? await this.adapter.hashStore.getMany(allStoreKeys) : undefined @@ -330,9 +332,9 @@ export class IndexEngine { for (let i = 0; i < cleanItems.length; i++) { const { chunks } = cleanItems[i]! - const { doc, contentHash, ikey, storeKey } = docMeta[i]! + const { source, contentHash, ikey, storeKey } = sourceMeta[i]! - // Hash store dedup: skip docs whose content + model haven't changed + // Hash store dedup: skip sources whose content + model haven't changed if (!dryRun) { const stored = hashMap ? hashMap.get(storeKey) ?? null @@ -354,52 +356,53 @@ export class IndexEngine { } } - let documentId = doc.id ?? generateId('doc') - let documentWasCreated = true + let sourceId = source.id ?? generateId('src') + let sourceWasCreated = true - if (this.adapter.upsertDocumentRecord && !dryRun) { - const documentRecord = await this.adapter.upsertDocumentRecord({ - id: documentId, + if (this.adapter.upsertSourceRecord && !dryRun) { + const sourceRecord = await this.adapter.upsertSourceRecord({ + id: sourceId, bucketId, tenantId, groupId, userId, agentId, conversationId, - title: doc.title, - url: doc.url ?? undefined, + title: source.title, + url: source.url ?? undefined, contentHash, chunkCount: chunks.length, status: 'processing', visibility, - graphExtracted: shouldExtract, - metadata: doc.metadata ?? {}, - }) - documentId = documentRecord.id - documentWasCreated = documentRecord.wasCreated !== false + graphExtracted: shouldExtract, + metadata: source.metadata ?? {}, + subject: source.subject, + }) + sourceId = sourceRecord.id + sourceWasCreated = sourceRecord.wasCreated !== false } const textOffset = allTexts.length const texts = chunks.map(c => this.preprocessForEmbedding(c.content, opts)) allTexts.push(...texts) - prepared.push({ doc, chunks, ikey, contentHash, documentId, documentWasCreated, textOffset }) + prepared.push({ source, chunks, ikey, contentHash, sourceId, sourceWasCreated, textOffset }) } - // Phase 2: Single embedBatch call for all chunks across all documents + // Phase 2: Single embedBatch call for all chunks across all sources const allEmbeddings = allTexts.length > 0 ? await this.embedding.embedBatch(allTexts) : [] - // Phase 3: Per-document upsert + hash store. Graph writes are serialized + // Phase 3: Per-source upsert + hash store. 
Graph writes are serialized // until the graph storage layer is race-safe. const { concurrency = 1 } = opts const effectiveConcurrency = shouldExtract ? 1 : concurrency const processItem = async (item: typeof prepared[number]) => { - const { doc, chunks, ikey, contentHash, documentId, documentWasCreated, textOffset } = item + const { source, chunks, ikey, contentHash, sourceId, sourceWasCreated, textOffset } = item const embeddings = allEmbeddings.slice(textOffset, textOffset + chunks.length) - const propagated = this.propagateMetadata(doc, opts.propagateMetadata) + const propagated = this.propagateMetadata(source, opts.propagateMetadata) const embeddedChunks = chunks.map((chunk, i) => ({ id: chunkIdFor({ @@ -415,7 +418,7 @@ export class IndexEngine { userId, agentId, conversationId, - documentId, + sourceId, content: chunk.content, embedding: embeddings[i]!, embeddingModel: modelId, @@ -427,37 +430,33 @@ export class IndexEngine { })) if (!dryRun) { - await this.adapter.upsertDocument(modelId, embeddedChunks) - if (shouldExtract) { - await this.tripleExtractor?.persistPassageNodes?.(embeddedChunks.map(chunk => ({ - bucketId: chunk.bucketId, - documentId: chunk.documentId, - chunkIndex: chunk.chunkIndex, - chunkId: chunk.id, - embeddingModel: chunk.embeddingModel, - contentHash: sha256(chunk.content), - metadata: chunk.metadata, - visibility: chunk.visibility, - tenantId: chunk.tenantId, - groupId: chunk.groupId, - userId: chunk.userId, - agentId: chunk.agentId, - conversationId: chunk.conversationId, - }))) - } + await this.adapter.upsertSourceChunks(modelId, embeddedChunks) } + const initialEntityContext = !dryRun + ? await this.materializeSourceSubject( + source, + bucketId, + sourceId, + modelId, + embeddedChunks, + { tenantId, groupId, userId, agentId, conversationId }, + visibility, + ) + : [] + let extraction: { succeeded: number; failed: number; failedChunks?: ExtractionFailure[] } | undefined if (shouldExtract) { - const documentTitle = (propagated.title as string | undefined) ?? undefined + const sourceTitle = (propagated.title as string | undefined) ?? 
undefined extraction = await this.extractTriplesForChunks( bucketId, - documentId, + sourceId, embeddedChunks, propagated, - documentTitle, + sourceTitle, { tenantId, groupId, userId, agentId, conversationId }, visibility, + initialEntityContext, ) if (!result.extraction) result.extraction = { succeeded: 0, failed: 0 } @@ -475,16 +474,16 @@ export class IndexEngine { if (!dryRun) { if (extraction && extraction.failed > 0) { processingFailed++ - if (this.adapter.updateDocumentStatus) { - await this.adapter.updateDocumentStatus(documentId, 'failed') + if (this.adapter.updateSourceStatus) { + await this.adapter.updateSourceStatus(sourceId, 'failed') } this.eventSink?.emit({ id: crypto.randomUUID(), - eventType: 'index.document', + eventType: 'index.source', identity: { tenantId, groupId, userId, agentId, conversationId }, - targetId: documentId, - targetType: 'document', + targetId: sourceId, + targetType: 'source', payload: { bucketId, chunkCount: chunks.length, status: 'failed', extraction }, traceId, spanId, @@ -493,8 +492,8 @@ export class IndexEngine { return } - if (this.adapter.updateDocumentStatus) { - await this.adapter.updateDocumentStatus(documentId, 'complete', chunks.length) + if (this.adapter.updateSourceStatus) { + await this.adapter.updateSourceStatus(sourceId, 'complete', chunks.length) } const storeKey = buildHashStoreKey(tenantId, bucketId, ikey) @@ -509,16 +508,16 @@ export class IndexEngine { }) } - if (documentWasCreated) result.inserted++ + if (sourceWasCreated) result.inserted++ else result.updated++ this.eventSink?.emit({ id: crypto.randomUUID(), - eventType: 'index.document', + eventType: 'index.source', identity: { tenantId, groupId, userId, agentId, conversationId }, - targetId: documentId, - targetType: 'document', - payload: { bucketId, chunkCount: chunks.length, status: documentWasCreated ? 'new' : 'updated' }, + targetId: sourceId, + targetType: 'source', + payload: { bucketId, chunkCount: chunks.length, status: sourceWasCreated ? 'new' : 'updated' }, traceId, spanId, timestamp: new Date(), @@ -537,13 +536,13 @@ export class IndexEngine { const safeProcessItem = (item: typeof prepared[number]) => processItem(item).catch((err) => { processingFailed++ - this.logger?.error?.('[typegraph] Document processing failed:', { documentId: item.documentId, idempotencyKey: item.ikey, error: err instanceof Error ? err.message : String(err) }) + this.logger?.error?.('[typegraph] Source processing failed:', { sourceId: item.sourceId, idempotencyKey: item.ikey, error: err instanceof Error ? err.message : String(err) }) this.eventSink?.emit({ id: crypto.randomUUID(), - eventType: 'index.document', + eventType: 'index.source', identity: { tenantId, groupId, userId, agentId, conversationId }, - targetId: item.documentId, - targetType: 'document', + targetId: item.sourceId, + targetType: 'source', payload: { bucketId, status: 'failed', error: err instanceof Error ? err.message : String(err) }, traceId, spanId, @@ -569,9 +568,9 @@ export class IndexEngine { identity: { tenantId, groupId, userId, agentId, conversationId }, payload: { bucketId, - documentsProcessed: result.inserted + result.updated, - documentsSkipped: result.skipped, - documentsFailed: processingFailed, + sourcesProcessed: result.inserted + result.updated, + sourcesSkipped: result.skipped, + sourcesFailed: processingFailed, ...(result.extraction ? 
{ extraction: result.extraction } : {}),
},
durationMs: result.durationMs,
@@ -597,10 +596,10 @@ export class IndexEngine {
 private async extractTriplesForChunks(
 bucketId: string,
- documentId: string,
+ sourceId: string,
 chunks: Array & { id?: string | undefined }>,
 propagated: Record<string, unknown>,
- documentTitle?: string,
+ sourceTitle?: string,
 identity?: {
 tenantId?: string | undefined
 groupId?: string | undefined
@@ -609,8 +608,9 @@ export class IndexEngine {
 conversationId?: string | undefined
 },
 visibility?: IngestOptions['visibility'],
+ initialEntityContext: EntityContext[] = [],
 ): Promise<{ succeeded: number; failed: number; failedChunks?: ExtractionFailure[] }> {
- let entityContext: EntityContext[] = []
+ let entityContext: EntityContext[] = [...initialEntityContext]
 let succeeded = 0
 let failed = 0
 const failedChunks: ExtractionFailure[] = []
@@ -626,10 +626,10 @@ export class IndexEngine {
 chunk.content,
 bucketId,
 chunk.chunkIndex,
- documentId,
+ sourceId,
 { ...propagated, ...chunk.metadata },
 contextForChunk,
- documentTitle,
+ sourceTitle,
 identity,
 visibility,
 chunk.id,
@@ -639,8 +639,8 @@ export class IndexEngine {
 if (extractionResult === undefined) {
 failed++
- failedChunks.push({ documentId, chunkIndex: chunk.chunkIndex, reason: 'timeout' })
- this.logger?.warn?.('[typegraph] Triple extraction timed out', { documentId, chunkIndex: chunk.chunkIndex, bucketId })
+ failedChunks.push({ sourceId, chunkIndex: chunk.chunkIndex, reason: 'timeout' })
+ this.logger?.warn?.('[typegraph] Triple extraction timed out', { sourceId, chunkIndex: chunk.chunkIndex, bucketId })
 continue
 }
@@ -654,8 +654,8 @@ export class IndexEngine {
 } catch (err) {
 failed++
 const msg = err instanceof Error ? err.message : String(err)
- failedChunks.push({ documentId, chunkIndex: chunk.chunkIndex, reason: 'error', message: msg })
- this.logger?.error?.('[typegraph] Triple extraction failed', { documentId, chunkIndex: chunk.chunkIndex, bucketId, error: msg })
+ failedChunks.push({ sourceId, chunkIndex: chunk.chunkIndex, reason: 'error', message: msg })
+ this.logger?.error?.('[typegraph] Triple extraction failed', { sourceId, chunkIndex: chunk.chunkIndex, bucketId, error: msg })
 }
 }
@@ -675,14 +675,15 @@ export class IndexEngine {
 }
 private propagateMetadata(
- doc: RawDocument,
+ source: SourceInput,
 fields?: string[]
 ): Record<string, unknown> {
 if (!fields) {
 return {
- title: doc.title,
- url: doc.url,
- updatedAt: doc.updatedAt,
+ title: source.title,
+ url: source.url,
+ updatedAt: source.updatedAt,
+ ...(source.subject ? { subject: source.subject } : {}),
 }
 }
@@ -690,11 +691,48 @@ export class IndexEngine {
 for (const field of fields) {
 if (field.startsWith('metadata.')) {
 const key = field.slice('metadata.'.length)
- out[key] = doc.metadata?.[key]
+ out[key] = source.metadata?.[key]
 } else {
- out[field] = (doc as unknown as Record<string, unknown>)[field]
+ out[field] = (source as unknown as Record<string, unknown>)[field]
 }
 }
 return out
 }
+
+ private async materializeSourceSubject(
+ source: SourceInput,
+ bucketId: string,
+ sourceId: string,
+ embeddingModel: string,
+ chunks: Array & { id?: string | undefined }>,
+ identity: {
+ tenantId?: string | undefined
+ groupId?: string | undefined
+ userId?: string | undefined
+ agentId?: string | undefined
+ conversationId?: string | undefined
+ },
+ visibility?: IngestOptions['visibility'],
+ ): Promise<EntityContext[]> {
+ if (!source.subject) return []
+ if (!this.knowledgeGraph) return []
+ if (!this.knowledgeGraph.addSourceSubject) {
+ throw new Error('KnowledgeGraphBridge must implement addSourceSubject() to ingest source.subject.')
+ }
+
+ const entity = await this.knowledgeGraph.addSourceSubject({
+ subject: source.subject,
+ bucketId,
+ sourceId,
+ embeddingModel,
+ chunks,
+ ...identity,
+ visibility,
+ })
+ const name = entity?.name ?? source.subject.name
+ const type = entity?.entityType ?? source.subject.entityType
+ return name
+ ? [{ name, type: type ?? 'concept' }]
+ : []
+ }
 }
diff --git a/packages/sdk/src/index-engine/hash.ts b/packages/sdk/src/index-engine/hash.ts
index 810ecde..96acc13 100644
--- a/packages/sdk/src/index-engine/hash.ts
+++ b/packages/sdk/src/index-engine/hash.ts
@@ -1,5 +1,5 @@
 import { createHash } from 'crypto'
-import type { RawDocument } from '../types/connector.js'
+import type { SourceInput } from '../types/connector.js'
 export function sha256(content: string): string {
 return createHash('sha256').update(content, 'utf8').digest('hex')
@@ -8,17 +8,17 @@ export function sha256(content: string): string {
 const AUTO_HASH_THRESHOLD = 128
 export function resolveIdempotencyKey(
- doc: RawDocument,
- spec: string[] | ((doc: RawDocument) => string)
+ source: SourceInput,
+ spec: string[] | ((source: SourceInput) => string)
 ): string {
 const raw = typeof spec === 'function'
- ? spec(doc)
+ ? spec(source)
 : spec.map(field => {
 if (field.startsWith('metadata.')) {
 const key = field.slice('metadata.'.length)
- return String(doc.metadata?.[key] ?? '')
+ return String(source.metadata?.[key] ?? '')
 }
- return String((doc as unknown as Record<string, unknown>)[field] ?? '')
+ return String((source as unknown as Record<string, unknown>)[field] ?? '')
 }).join('::')
 // Auto-hash long keys (e.g. when deduplicating by content)
diff --git a/packages/sdk/src/index-engine/index.ts b/packages/sdk/src/index-engine/index.ts
index 98df7ab..6eee6f5 100644
--- a/packages/sdk/src/index-engine/index.ts
+++ b/packages/sdk/src/index-engine/index.ts
@@ -2,3 +2,31 @@ export { IndexEngine } from './engine.js'
 export { defaultChunker } from './chunker.js'
 export { sha256, resolveIdempotencyKey, buildHashStoreKey } from './hash.js'
 export { stripMarkdown } from './strip-markdown.js'
+export {
+ ENTITY_TYPES,
+ DEFAULT_ENTITY_TYPE,
+ VALID_ENTITY_TYPES,
+ ENTITY_TYPES_LIST,
+ ENTITY_TYPE_SPECS,
+ PREDICATE_SPECS,
+ ALL_PREDICATES,
+ PREDICATE_BY_NAME,
+ SYMMETRIC_PREDICATES,
+ GENERIC_DISALLOWED_PREDICATES,
+ ALIAS_RELATION_CUES,
+ ALIAS_ASSIGNMENT_CUES,
+ sanitizePredicate,
+ isSymmetricPredicate,
+ getPredicatesForPrompt,
+ normalizePredicateWithDirection,
+ validatePredicateTypes,
+} from './ontology.js'
+export type {
+ EntityType,
+ EntityTypeSpec,
+ PredicateAliasSpec,
+ PredicateSpec,
+ PredicateTemporalStatus,
+ PredicateNormalization,
+ PredicateTypeValidation,
+} from './ontology.js'
diff --git a/packages/sdk/src/index-engine/ontology.ts b/packages/sdk/src/index-engine/ontology.ts
index 9baef62..4ea4a2b 100644
--- a/packages/sdk/src/index-engine/ontology.ts
+++ b/packages/sdk/src/index-engine/ontology.ts
@@ -1,149 +1,423 @@
 /**
- * Predicate ontology for triple extraction.
- * ~150 predicates organized by entity-type pair.
+ * Central ontology registry for graph extraction, predicate normalization,
+ * query-intent parsing, and graph write validation.
 *
- * Design principles:
- * - Organized by entity-type pair so the model self-selects relevant predicates
- * - Excludes generic/vague predicates (IS, HAS, RELATED_TO, MENTIONED, ASSOCIATED_WITH)
- *   which are caught by GENERIC_PREDICATES filter in graph-bridge.ts
- * - Tense-significant predicates have separate present/past forms
- * - Each predicate should carry specific relational semantics, not just co-occurrence
+ * Keep entity types, canonical predicates, aliases, inverse direction, symmetry,
+ * prompt grouping, and soft domain/range metadata here. Other modules should
+ * import the derived helpers instead of maintaining their own predicate lists.
*/ -// ── Person → Person ── -const PERSON_PERSON = [ - 'MARRIED', 'DIVORCED', 'CHILD_OF', 'PARENT_OF', 'SIBLING_OF', - 'MENTORED', 'SUCCEEDED', 'PRECEDED', - 'INFLUENCED', 'INSPIRED', 'RIVALED', 'OPPOSED', 'ALLIED_WITH', - 'COLLABORATED_WITH', 'CORRESPONDS_WITH', 'BEFRIENDED', - 'EMPLOYED', 'REPORTED_TO', 'SUPERVISED', - 'KILLED', 'BETRAYED', 'RESCUED', 'SERVED', +export const ENTITY_TYPES = [ + 'person', + 'organization', + 'location', + 'product', + 'technology', + 'concept', + 'event', + 'meeting', + 'document', + 'project', + 'issue', + 'role', + 'law_regulation', + 'time_period', + 'creative_work', ] as const -// ── Person → Organization ── -const PERSON_ORGANIZATION = [ - 'WORKS_FOR', 'WORKED_FOR', 'FOUNDED', 'CO_FOUNDED', - 'LEADS', 'LED', 'ADVISES', 'ADVISED', - 'MEMBER_OF', 'JOINED', 'LEFT', 'EXPELLED_FROM', - 'INVESTED_IN', 'DONATED_TO', 'SUED', - 'REPRESENTS', 'REPRESENTED', -] as const +export type EntityType = typeof ENTITY_TYPES[number] -// ── Person → Location ── -const PERSON_LOCATION = [ - 'BORN_IN', 'DIED_IN', 'LIVES_IN', 'LIVED_IN', - 'TRAVELED_TO', 'VISITED', 'MOVED_TO', 'EXILED_TO', - 'GOVERNED', 'RULED', 'CONQUERED', 'DEFENDED', - 'IMPRISONED_IN', 'ESCAPED_FROM', -] as const +export const DEFAULT_ENTITY_TYPE: EntityType = 'concept' -// ── Person → Work of Art / Product ── -const PERSON_WORK = [ - 'WROTE', 'AUTHORED', 'COMPOSED', 'DIRECTED', - 'ILLUSTRATED', 'DESIGNED', 'INVENTED', - 'PERFORMED_IN', 'STARRED_IN', 'NARRATED', - 'EDITED', 'TRANSLATED', 'REVIEWED', 'CRITIQUED', - 'COMMISSIONED', 'DEDICATED_TO', -] as const +export interface EntityTypeSpec { + name: EntityType + description: string + examples: string[] +} -// ── Person → Concept / Event ── -const PERSON_CONCEPT = [ - 'WORKS_AS', 'WORKED_AS', 'HELD_ROLE', 'PRACTICED_AS', - 'STUDIED', 'TAUGHT', 'DISCOVERED', 'DEVELOPED', - 'PROPOSED', 'ADVOCATED_FOR', 'CHAMPIONED', - 'PARTICIPATED_IN', 'WITNESSED', 'SURVIVED', - 'SPOKE_AT', 'ATTENDED', 'ORGANIZED', - 'AWARDED', 'NOMINATED', 'DIAGNOSED', 'TREATED', -] as const +export type PredicateTemporalStatus = 'current' | 'former' | 'historical' | 'unknown' -// ── Organization → Organization ── -const ORG_ORG = [ - 'ACQUIRED', 'MERGED_WITH', 'SPUN_OFF', - 'PARTNERED_WITH', 'COMPETES_WITH', - 'SUED', 'REGULATED_BY', 'SANCTIONED', - 'FUNDED', 'SUBSIDIZED', 'SUPPLIED', - 'SUCCEEDED', 'PRECEDED', 'ALLIED_WITH', 'OPPOSED', -] as const +export interface PredicateAliasSpec { + name: string + swap?: boolean | undefined + temporalStatus?: PredicateTemporalStatus | undefined +} -// ── Organization → Location ── -const ORG_LOCATION = [ - 'HEADQUARTERED_IN', 'LOCATED_IN', 'OPERATES_IN', - 'INCORPORATED_IN', 'EXPANDED_TO', 'WITHDREW_FROM', -] as const +export interface PredicateSpec { + name: string + description: string + category: string + domain: readonly EntityType[] | readonly ['*'] + range: readonly EntityType[] | readonly ['*'] + aliases?: readonly PredicateAliasSpec[] | undefined + symmetric?: boolean | undefined + inverse?: string | undefined +} -// ── Organization → Product / Work ── -const ORG_PRODUCT = [ - 'PRODUCED', 'MANUFACTURES', 'PUBLISHED', - 'DISTRIBUTES', 'LICENSES', 'DEVELOPED', - 'LAUNCHED', 'DISCONTINUED', -] as const +export interface PredicateNormalization { + original: string + predicate: string + valid: boolean + swapSubjectObject: boolean + symmetric: boolean + temporalStatus?: PredicateTemporalStatus | undefined +} -// ── Location → Location ── -const LOCATION_LOCATION = [ - 'BORDERS', 'CONTAINS', 'PART_OF', - 'CAPITAL_OF', 'NEAR', 'CONNECTED_TO', -] as 
const +export interface PredicateTypeValidation { + valid: boolean + domainValid: boolean + rangeValid: boolean + reason?: string | undefined +} -// ── Concept → Concept ── -const CONCEPT_CONCEPT = [ - 'DERIVES_FROM', 'EXTENDS', 'CONTRADICTS', - 'SUBSET_OF', 'SUPERSEDES', 'EQUIVALENT_TO', - 'INFLUENCES', 'PRECEDED', 'FOLLOWED', - 'APPLIED_TO', 'ENABLES', -] as const +const ALL_TYPES = ['*'] as const -// ── Event → Entity ── -const EVENT_RELATIONS = [ - 'OCCURRED_IN', 'OCCURRED_AT', - 'CAUSED', 'LED_TO', 'RESULTED_IN', 'TRIGGERED', - 'PRECEDED', 'FOLLOWED', -] as const +export const ENTITY_TYPE_SPECS: readonly EntityTypeSpec[] = [ + { name: 'person', description: 'A specific individual or named human persona.', examples: ['Ada Lovelace', 'Pat Smith'] }, + { name: 'organization', description: 'A company, institution, agency, team, department, or formal group.', examples: ['OpenAI', 'Platform team'] }, + { name: 'location', description: 'A place, region, address, market, or jurisdiction.', examples: ['San Francisco', 'European Union'] }, + { name: 'product', description: 'A commercial product, service, package, SKU, or productized capability.', examples: ['Stripe Billing', 'iPhone 16'] }, + { name: 'technology', description: 'A technical system, framework, language, protocol, platform, or standard.', examples: ['PostgreSQL', 'React Native'] }, + { name: 'concept', description: 'A named idea, method, topic, category, metric, goal, or abstract domain object.', examples: ['Data retention', 'Zero trust'] }, + { name: 'event', description: 'A named occurrence with a time anchor.', examples: ['CES 2025', 'Q4 launch'] }, + { name: 'meeting', description: 'A call, demo, sync, review, interview, or transcript-backed event.', examples: ['weekly pipeline review', 'Acme demo'] }, + { name: 'document', description: 'An authored business material distinct from TypeGraph storage sources.', examples: ['RFP', 'contract', 'architecture spec'] }, + { name: 'project', description: 'A bounded initiative, deal, opportunity, migration, program, or body of work.', examples: ['SOC2 rollout', 'Acme renewal'] }, + { name: 'issue', description: 'A ticket, bug, request, story, incident, task, or blocker.', examples: ['AUTH-123', 'billing bug'] }, + { name: 'role', description: 'A title, job, office, function, responsibility, or persona.', examples: ['CTO', 'account owner'] }, + { name: 'law_regulation', description: 'A statute, policy, regulation, contract clause, or formal rule.', examples: ['GDPR', 'SOC2 policy'] }, + { name: 'time_period', description: 'A named period, fiscal window, era, version interval, or date range.', examples: ['Q1 2026', 'Series B stage'] }, + { name: 'creative_work', description: 'A genuinely creative work such as a novel, poem, song, film, or artwork.', examples: ['Fear and Loathing in Las Vegas', 'Moby Dick'] }, +] -// ── Technology / Law ── -const TECHNICAL_RELATIONS = [ - 'IMPLEMENTS', 'BASED_ON', 'REQUIRES', - 'COMPATIBLE_WITH', 'REPLACES', 'DEPRECATED_BY', - 'GOVERNS', 'REGULATES', 'PROHIBITS', 'PERMITS', - 'ENFORCED_BY', 'AMENDED_BY', 'REPEALED', -] as const +const person = ['person'] as const +const personOrg = ['organization'] as const +const org = ['organization'] as const +const loc = ['location'] as const +const role = ['role'] as const +const authoredWork = ['document', 'creative_work'] as const +const workObject = ['document', 'creative_work', 'product', 'technology', 'concept'] as const +const productTech = ['product', 'technology'] as const +const issueProject = ['issue', 
'project'] as const +const eventMeeting = ['event', 'meeting'] as const +const legal = ['law_regulation'] as const -// ── General (any entity-type pair) ── -const GENERAL_RELATIONS = [ - 'CREATED', 'DESTROYED', 'SUPPORTED', 'OPPOSED', - 'NAMED_AFTER', 'KNOWN_AS', 'SYMBOLIZES', - 'REFERS_TO', 'DESCRIBED', 'COMPARED_WITH', - 'FOUGHT_IN', 'SIGNED', 'OWNS', -] as const +export const PREDICATE_SPECS: readonly PredicateSpec[] = [ + // Core / taxonomy + { + name: 'IS_A', + category: 'Core / taxonomy', + description: 'Classifies an entity as an instance of a role, type, class, or category.', + domain: ALL_TYPES, + range: ['concept', 'role'], + aliases: [ + { name: 'IS_AN' }, + { name: 'TYPE_OF' }, + { name: 'INSTANCE_OF' }, + { name: 'CLASSIFIED_AS' }, + { name: 'WAS_A', temporalStatus: 'former' }, + { name: 'WAS_AN', temporalStatus: 'former' }, + ], + }, + { + name: 'PART_OF', + category: 'Core / taxonomy', + description: 'Indicates membership in a larger structure or whole.', + domain: ALL_TYPES, + range: ALL_TYPES, + aliases: [{ name: 'WITHIN' }, { name: 'SUBSET_OF' }, { name: 'BELONGS_TO' }], + }, + { + name: 'CONTAINS', + category: 'Core / taxonomy', + description: 'Indicates that one entity contains, includes, or encompasses another.', + domain: ALL_TYPES, + range: ALL_TYPES, + aliases: [{ name: 'INCLUDES' }, { name: 'ENCOMPASSES' }, { name: 'HAS_COMPONENT' }], + }, + { + name: 'EQUIVALENT_TO', + category: 'Core / taxonomy', + description: 'Indicates semantic equivalence between distinct entities.', + domain: ALL_TYPES, + range: ALL_TYPES, + symmetric: true, + aliases: [{ name: 'SAME_AS' }, { name: 'IDENTICAL_TO' }], + }, + { + name: 'RELATED_TO', + category: 'Core / taxonomy', + description: 'A weak fallback relation for explicit but non-specific relationships.', + domain: ALL_TYPES, + range: ALL_TYPES, + symmetric: true, + aliases: [{ name: 'ASSOCIATED_WITH' }, { name: 'INVOLVES' }], + }, + + // People, roles, and organizations + { + name: 'WORKS_FOR', + category: 'People / roles / orgs', + description: 'A person or organization is employed by, contracted with, or attached to an organization.', + domain: ['person', 'organization'], + range: personOrg, + aliases: [ + { name: 'WORKS_AT' }, + { name: 'EMPLOYED_AT' }, + { name: 'EMPLOYED_BY' }, + { name: 'WORKED_FOR', temporalStatus: 'former' }, + { name: 'WORKED_AT', temporalStatus: 'former' }, + { name: 'WAS_EMPLOYED_BY', temporalStatus: 'former' }, + ], + }, + { + name: 'WORKS_AS', + category: 'People / roles / orgs', + description: 'An entity serves in a title, job, function, or responsibility.', + domain: ['person', 'organization'], + range: role, + aliases: [ + { name: 'JOB_IS' }, + { name: 'OCCUPATION_IS' }, + { name: 'EMPLOYED_AS' }, + { name: 'ROLE_IS' }, + { name: 'TITLE_IS' }, + { name: 'WORKS_IN_ROLE' }, + { name: 'WORKS_AS_A' }, + { name: 'HELD_ROLE', temporalStatus: 'former' }, + { name: 'HELD_POSITION', temporalStatus: 'former' }, + { name: 'SERVED_AS' }, + { name: 'SERVES_AS' }, + { name: 'PRACTICED_AS' }, + { name: 'WORKED_AS', temporalStatus: 'former' }, + { name: 'WORKED_AS_A', temporalStatus: 'former' }, + ], + }, + { name: 'REPORTS_TO', category: 'People / roles / orgs', description: 'A person or role reports to another person or role.', domain: ['person', 'role'], range: ['person', 'role'], aliases: [{ name: 'REPORTED_TO' }, { name: 'SUBORDINATE_OF' }, { name: 'UNDER' }] }, + { name: 'MANAGES', category: 'People / roles / orgs', description: 'A person, role, or organization manages another entity.', domain: ['person', 
'organization', 'role'], range: ALL_TYPES, aliases: [{ name: 'SUPERVISES' }, { name: 'SUPERVISED' }, { name: 'MANAGED' }, { name: 'OVERSEES' }, { name: 'ADMINISTERS' }] }, + { name: 'FOUNDED', category: 'People / roles / orgs', description: 'An entity founded or co-founded an organization, product, project, or initiative.', domain: ['person', 'organization'], range: ['organization', 'product', 'project'], aliases: [{ name: 'ESTABLISHED' }, { name: 'CO_FOUNDED' }, { name: 'COFOUNDED' }, { name: 'FOUNDED_BY', swap: true }, { name: 'CO_FOUNDED_BY', swap: true }, { name: 'COFOUNDED_BY', swap: true }] }, + { name: 'LEADS', category: 'People / roles / orgs', description: 'A person, role, or organization leads another entity.', domain: ['person', 'organization', 'role'], range: ALL_TYPES, aliases: [{ name: 'HEADS' }, { name: 'DIRECTS' }, { name: 'CHAIRS' }, { name: 'LED', temporalStatus: 'former' }, { name: 'HEADED', temporalStatus: 'former' }, { name: 'CHAIRED', temporalStatus: 'former' }] }, + { name: 'ADVISES', category: 'People / roles / orgs', description: 'A person or organization advises another person, organization, or project.', domain: ['person', 'organization'], range: ALL_TYPES, aliases: [{ name: 'CONSULTS_FOR' }, { name: 'ADVISED', temporalStatus: 'former' }, { name: 'CONSULTED_FOR', temporalStatus: 'former' }] }, + { name: 'MEMBER_OF', category: 'People / roles / orgs', description: 'A person or organization is a member of a group or organization.', domain: ['person', 'organization'], range: ['organization'], aliases: [{ name: 'AFFILIATED_WITH' }, { name: 'JOINED' }] }, + { name: 'REPRESENTS', category: 'People / roles / orgs', description: 'A person or organization represents another entity.', domain: ['person', 'organization'], range: ALL_TYPES, aliases: [{ name: 'REPRESENTATIVE_OF' }, { name: 'SPEAKS_FOR' }, { name: 'REPRESENTED_BY', swap: true }] }, + { name: 'INVESTED_IN', category: 'People / roles / orgs', description: 'A person or organization invested in another entity.', domain: ['person', 'organization'], range: ['organization', 'product', 'project'], aliases: [{ name: 'INVESTOR_IN' }, { name: 'BACKED' }] }, + { name: 'MARRIED', category: 'People / personal', description: 'Two people are or were spouses.', domain: person, range: person, symmetric: true, aliases: [{ name: 'MARRIED_TO' }, { name: 'WED' }, { name: 'SPOUSE_OF' }, { name: 'HUSBAND_OF' }, { name: 'WIFE_OF' }] }, + { name: 'DIVORCED', category: 'People / personal', description: 'Two people divorced or separated.', domain: person, range: person, symmetric: true, aliases: [{ name: 'DIVORCED_FROM' }, { name: 'SEPARATED_FROM' }] }, + { name: 'PARENT_OF', category: 'People / personal', description: 'A person is a parent of another person.', domain: person, range: person, aliases: [{ name: 'FATHER_OF' }, { name: 'MOTHER_OF' }] }, + { name: 'CHILD_OF', category: 'People / personal', description: 'A person is a child of another person.', domain: person, range: person, aliases: [{ name: 'SON_OF' }, { name: 'DAUGHTER_OF' }, { name: 'OFFSPRING_OF' }, { name: 'BORN_TO' }] }, + { name: 'SIBLING_OF', category: 'People / personal', description: 'Two people are siblings.', domain: person, range: person, symmetric: true, aliases: [{ name: 'BROTHER_OF' }, { name: 'SISTER_OF' }] }, + { name: 'MENTORED', category: 'People / personal', description: 'A person mentored, trained, or coached another person.', domain: person, range: person, aliases: [{ name: 'TRAINED' }, { name: 'COACHED' }, { name: 'MENTORED_BY', swap: true }, { name: 
'TRAINED_BY', swap: true }, { name: 'COACHED_BY', swap: true }] }, + + // Business / organization + { name: 'ACQUIRED', category: 'Business / organization', description: 'An organization acquired another organization or asset.', domain: org, range: ALL_TYPES, aliases: [{ name: 'BOUGHT' }, { name: 'PURCHASED' }, { name: 'ACQUIRED_BY', swap: true }] }, + { name: 'MERGED_WITH', category: 'Business / organization', description: 'Two organizations or projects merged.', domain: ['organization', 'project'], range: ['organization', 'project'], symmetric: true, aliases: [{ name: 'MERGED_INTO' }] }, + { name: 'PARTNERED_WITH', category: 'Business / organization', description: 'Two entities partnered or collaborated.', domain: ALL_TYPES, range: ALL_TYPES, symmetric: true, aliases: [{ name: 'PARTNER_OF' }, { name: 'IN_PARTNERSHIP_WITH' }, { name: 'COLLABORATED_WITH' }, { name: 'WORKED_WITH' }] }, + { name: 'COMPETES_WITH', category: 'Business / organization', description: 'Two entities compete or rival each other.', domain: ALL_TYPES, range: ALL_TYPES, symmetric: true, aliases: [{ name: 'COMPETITOR_OF' }, { name: 'RIVALS' }, { name: 'RIVALED' }] }, + { name: 'FUNDED', category: 'Business / organization', description: 'An entity funded or financed another entity.', domain: ['person', 'organization'], range: ALL_TYPES, aliases: [{ name: 'FINANCED' }, { name: 'SUBSIDIZED' }, { name: 'FUNDED_BY', swap: true }, { name: 'FINANCED_BY', swap: true }] }, + { name: 'SUPPLIED', category: 'Business / organization', description: 'An entity supplied another entity or acted as a vendor.', domain: ['organization', 'person'], range: ALL_TYPES, aliases: [{ name: 'SUPPLIER_TO' }, { name: 'VENDOR_OF' }, { name: 'SUPPLIED_BY', swap: true }] }, + { name: 'SUED', category: 'Business / organization', description: 'An entity sued or litigated against another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'LITIGATED_AGAINST' }, { name: 'SUED_BY', swap: true }] }, + { name: 'REGULATED_BY', category: 'Business / organization', description: 'An entity is regulated or overseen by another entity.', domain: ALL_TYPES, range: ['organization', 'law_regulation'], aliases: [{ name: 'OVERSEEN_BY' }, { name: 'REGULATES', swap: true }] }, + { name: 'OWNS', category: 'Business / organization', description: 'An entity owns another entity or asset.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'OWNER_OF' }, { name: 'POSSESSES' }, { name: 'OWNED_BY', swap: true }, { name: 'PROPERTY_OF', swap: true }] }, + + // Product / technical + { name: 'USES', category: 'Product / technical', description: 'An entity uses a product, technology, vendor, or process.', domain: ALL_TYPES, range: ['product', 'technology', 'organization', 'concept'], aliases: [{ name: 'USES_VENDOR' }, { name: 'USES_TOOL' }, { name: 'USED_IN', swap: true }, { name: 'UTILIZED_IN', swap: true }] }, + { name: 'IMPLEMENTS', category: 'Product / technical', description: 'A technology, product, or project implements another technology or concept.', domain: ['product', 'technology', 'project'], range: ['technology', 'concept', 'law_regulation'], aliases: [{ name: 'REALIZES' }, { name: 'IMPLEMENTED_BY', swap: true }] }, + { name: 'INTEGRATES_WITH', category: 'Product / technical', description: 'Two products or technologies integrate or interoperate.', domain: productTech, range: productTech, symmetric: true, aliases: [{ name: 'INTEGRATED_WITH' }, { name: 'INTEROPERATES_WITH' }] }, + { name: 'REQUIRES', category: 'Product / technical', description: 'An entity 
requires or depends on another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'DEPENDS_ON' }, { name: 'NEEDS' }, { name: 'REQUIRED_BY', swap: true }] }, + { name: 'COMPATIBLE_WITH', category: 'Product / technical', description: 'Two products or technologies are compatible.', domain: productTech, range: productTech, symmetric: true, aliases: [{ name: 'WORKS_WITH' }, { name: 'INTEROPERABLE_WITH' }] }, + { name: 'MIGRATED_FROM', category: 'Product / technical', description: 'An entity migrated from another product, technology, or system.', domain: ALL_TYPES, range: productTech, aliases: [{ name: 'MOVED_FROM' }] }, + { name: 'DEPLOYED_AT', category: 'Product / technical', description: 'A product, technology, or project is deployed at an organization or location.', domain: ['product', 'technology', 'project'], range: ['organization', 'location'], aliases: [{ name: 'RUNS_AT' }, { name: 'HOSTED_AT' }] }, + { name: 'REPLACES', category: 'Product / technical', description: 'An entity replaces or supersedes another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'SUPERSEDES' }, { name: 'REPLACED_BY', swap: true }, { name: 'DEPRECATED_BY', swap: true }, { name: 'OBSOLETED_BY', swap: true }] }, + { name: 'BASED_ON', category: 'Product / technical', description: 'An entity is based on or derives from another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'DERIVES_FROM' }, { name: 'DERIVED_FROM' }, { name: 'ORIGINATES_FROM' }] }, + + // Work / project / issue / document + { name: 'ASSIGNED_TO', category: 'Work / issue / document', description: 'A project, issue, task, account, or document is assigned to an owner.', domain: issueProject, range: ['person', 'organization', 'role'], aliases: [{ name: 'OWNER_ASSIGNED' }, { name: 'CLOSED_BY' }, { name: 'REPORTED_BY', swap: true }] }, + { name: 'BLOCKS', category: 'Work / issue / document', description: 'An issue, project, or dependency blocks another work item.', domain: issueProject, range: issueProject, aliases: [{ name: 'BLOCKED_BY', swap: true }] }, + { name: 'DUPLICATES', category: 'Work / issue / document', description: 'An issue duplicates another issue.', domain: ['issue'], range: ['issue'], aliases: [{ name: 'DUPLICATE_OF' }] }, + { name: 'RESOLVES', category: 'Work / issue / document', description: 'An entity resolves, fixes, or closes an issue or project.', domain: ALL_TYPES, range: issueProject, aliases: [{ name: 'FIXES' }, { name: 'FIXED_IN' }, { name: 'CLOSES' }, { name: 'CLOSED' }, { name: 'RESOLVED_BY', swap: true }] }, + { name: 'CREATED', category: 'Work / issue / document', description: 'An entity created, launched, built, announced, or produced another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'BUILT' }, { name: 'DEVELOPED' }, { name: 'LAUNCHED' }, { name: 'ANNOUNCED' }, { name: 'PRODUCED' }, { name: 'MANUFACTURED' }, { name: 'INVENTED' }, { name: 'CREATED_BY', swap: true }] }, + { name: 'AUTHORED', category: 'Work / issue / document', description: 'An entity authored, wrote, composed, or published a document or creative work.', domain: ['person', 'organization'], range: authoredWork, aliases: [{ name: 'WROTE' }, { name: 'COMPOSED' }, { name: 'PENNED' }, { name: 'PUBLISHED' }, { name: 'RELEASED' }, { name: 'WRITTEN_BY', swap: true }, { name: 'AUTHORED_BY', swap: true }, { name: 'COMPOSED_BY', swap: true }, { name: 'PUBLISHED_BY', swap: true }] }, + { name: 'SIGNED', category: 'Work / issue / document', description: 'An entity signed a document, agreement, contract, 
or policy.', domain: ALL_TYPES, range: ['document', 'law_regulation'], aliases: [{ name: 'SIGNED_BY', swap: true }] }, + { name: 'APPROVED', category: 'Work / issue / document', description: 'An entity approved a document, project, issue, or decision.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'APPROVED_BY', swap: true }] }, + { name: 'REFERENCES', category: 'Work / issue / document', description: 'An entity references another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'REFERS_TO' }, { name: 'CITES' }, { name: 'MENTIONS' }] }, + { name: 'DESCRIBES', category: 'Work / issue / document', description: 'A document, report, or entity describes another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'DESCRIBED' }, { name: 'DEPICTS' }, { name: 'PORTRAYS' }, { name: 'CHARACTERIZES' }, { name: 'REPORTED' }, { name: 'DOCUMENTED' }, { name: 'RECORDED' }] }, + { name: 'SUPPORTS', category: 'Work / issue / document', description: 'An entity supports, endorses, or enables another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'SUPPORTED' }, { name: 'ENDORSED' }, { name: 'ENABLES' }, { name: 'FACILITATES' }] }, + { name: 'OPPOSES', category: 'Work / issue / document', description: 'An entity opposes, criticizes, challenges, or contradicts another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'OPPOSED' }, { name: 'RESISTED' }, { name: 'CRITICIZED' }, { name: 'CHALLENGED' }, { name: 'CONTRADICTS' }, { name: 'CONFLICTS_WITH' }] }, + + // Events, meetings, location, legal + { name: 'ATTENDED', category: 'Event / meeting / location / legal', description: 'An entity attended an event or meeting.', domain: ALL_TYPES, range: eventMeeting, aliases: [{ name: 'PRESENT_AT' }] }, + { name: 'ORGANIZED', category: 'Event / meeting / location / legal', description: 'An entity organized an event, meeting, project, or activity.', domain: ALL_TYPES, range: ['event', 'meeting', 'project'], aliases: [{ name: 'ARRANGED' }, { name: 'COORDINATED' }] }, + { name: 'SPOKE_AT', category: 'Event / meeting / location / legal', description: 'A person or organization spoke or presented at an event or meeting.', domain: ['person', 'organization'], range: eventMeeting, aliases: [{ name: 'PRESENTED_AT' }, { name: 'ADDRESSED' }] }, + { name: 'OCCURRED_AT', category: 'Event / meeting / location / legal', description: 'An event or meeting occurred at a precise place, venue, or time point.', domain: eventMeeting, range: ['location', 'time_period'], aliases: [{ name: 'TOOK_PLACE_AT' }, { name: 'HAPPENED_AT' }] }, + { name: 'OCCURRED_IN', category: 'Event / meeting / location / legal', description: 'An event or meeting occurred in a broader place, time period, or context.', domain: eventMeeting, range: ['location', 'time_period'], aliases: [{ name: 'TOOK_PLACE_IN' }, { name: 'HAPPENED_IN' }] }, + { name: 'LOCATED_IN', category: 'Event / meeting / location / legal', description: 'An entity is located in a place.', domain: ALL_TYPES, range: loc, aliases: [{ name: 'SITUATED_IN' }, { name: 'LIVES_IN' }, { name: 'RESIDES_IN' }, { name: 'LIVED_IN', temporalStatus: 'former' }, { name: 'RESIDED_IN', temporalStatus: 'former' }, { name: 'BORN_IN' }, { name: 'DIED_IN' }] }, + { name: 'OPERATES_IN', category: 'Event / meeting / location / legal', description: 'An organization, product, or project operates in a market or location.', domain: ['organization', 'product', 'project'], range: loc, aliases: [{ name: 'ACTIVE_IN' }, { name: 'PRESENT_IN' }, { name: 'EXPANDED_TO' 
}, { name: 'WITHDREW_FROM', temporalStatus: 'former' }] }, + { name: 'HEADQUARTERED_IN', category: 'Event / meeting / location / legal', description: 'An organization is headquartered or based in a location.', domain: org, range: loc, aliases: [{ name: 'BASED_IN' }, { name: 'HQ_IN' }] }, + { name: 'GOVERNS', category: 'Event / meeting / location / legal', description: 'A law, regulation, policy, or organization governs an entity.', domain: ['law_regulation', 'organization'], range: ALL_TYPES, aliases: [{ name: 'CONTROLS' }] }, + { name: 'PROHIBITS', category: 'Event / meeting / location / legal', description: 'A law, regulation, policy, or rule prohibits something.', domain: legal, range: ALL_TYPES, aliases: [{ name: 'BANS' }, { name: 'FORBIDS' }] }, + { name: 'PERMITS', category: 'Event / meeting / location / legal', description: 'A law, regulation, policy, or rule permits something.', domain: legal, range: ALL_TYPES, aliases: [{ name: 'ALLOWS' }, { name: 'AUTHORIZES' }] }, + { name: 'AMENDS', category: 'Event / meeting / location / legal', description: 'A law, regulation, policy, or document amends another law, regulation, policy, or document.', domain: ['law_regulation', 'document'], range: ['law_regulation', 'document'], aliases: [{ name: 'AMENDED' }, { name: 'AMENDED_BY', swap: true }, { name: 'MODIFIED_BY', swap: true }, { name: 'REVISED_BY', swap: true }] }, + { name: 'REPEALS', category: 'Event / meeting / location / legal', description: 'A law, regulation, policy, or rule repeals another law, regulation, policy, or rule.', domain: legal, range: legal, aliases: [{ name: 'REPEALED' }, { name: 'REVOKED' }, { name: 'ANNULLED' }, { name: 'RESCINDED' }] }, + { name: 'CAUSED', category: 'Event / meeting / location / legal', description: 'An entity caused or triggered another entity or outcome.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'TRIGGERED' }, { name: 'RESULTED_IN' }, { name: 'LED_TO' }] }, + { name: 'PRECEDED', category: 'Event / meeting / location / legal', description: 'An entity came before another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'CAME_BEFORE' }, { name: 'PRIOR_TO' }, { name: 'SUCCEEDED_BY', swap: true }] }, + { name: 'FOLLOWED', category: 'Event / meeting / location / legal', description: 'An entity came after another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'CAME_AFTER' }, { name: 'SUCCEEDED' }] }, + + // Historical / narrative + { name: 'KILLED', category: 'Historical / narrative', description: 'A person or entity killed another person or entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'MURDERED' }, { name: 'ASSASSINATED' }, { name: 'SLAIN_BY', swap: true }, { name: 'KILLED_BY', swap: true }, { name: 'MURDERED_BY', swap: true }, { name: 'ASSASSINATED_BY', swap: true }] }, + { name: 'BETRAYED', category: 'Historical / narrative', description: 'An entity betrayed another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'DECEIVED' }, { name: 'BETRAYED_BY', swap: true }] }, + { name: 'RESCUED', category: 'Historical / narrative', description: 'An entity rescued, saved, or liberated another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'SAVED' }, { name: 'LIBERATED' }, { name: 'RESCUED_BY', swap: true }] }, + { name: 'EXILED_TO', category: 'Historical / narrative', description: 'A person or group was exiled, banished, or deported to a location.', domain: ALL_TYPES, range: loc, aliases: [{ name: 'BANISHED_TO' }, { name: 'DEPORTED_TO' }] }, + { name: 'RULED', 
category: 'Historical / narrative', description: 'A person or organization ruled or governed a location, organization, or group.', domain: ['person', 'organization'], range: ALL_TYPES, aliases: [{ name: 'GOVERNED' }, { name: 'REIGNED_OVER' }, { name: 'CONTROLLED' }] },
+ { name: 'CONQUERED', category: 'Historical / narrative', description: 'An entity conquered, captured, or seized another entity.', domain: ALL_TYPES, range: ALL_TYPES, aliases: [{ name: 'CAPTURED' }, { name: 'SEIZED' }] },
+ { name: 'IMPRISONED_IN', category: 'Historical / narrative', description: 'A person or group was imprisoned, jailed, or detained in a location.', domain: ALL_TYPES, range: loc, aliases: [{ name: 'JAILED_IN' }, { name: 'DETAINED_IN' }, { name: 'HELD_IN' }] },
+ { name: 'FOUGHT_IN', category: 'Historical / narrative', description: 'An entity fought, served, or battled in an event or conflict.', domain: ALL_TYPES, range: eventMeeting, aliases: [{ name: 'SERVED_IN' }, { name: 'BATTLED_IN' }] },
+]
+
+export const VALID_ENTITY_TYPES = new Set(ENTITY_TYPES)
+export const ENTITY_TYPES_LIST = ENTITY_TYPES.join(', ')
+export const ALL_PREDICATES = new Set(PREDICATE_SPECS.map(spec => spec.name))
+export const PREDICATE_BY_NAME = new Map(PREDICATE_SPECS.map(spec => [spec.name, spec]))
+export const SYMMETRIC_PREDICATES = new Set(PREDICATE_SPECS.filter(spec => spec.symmetric).map(spec => spec.name))
+
+export const GENERIC_DISALLOWED_PREDICATES = new Set([
+ 'IS',
+ 'HAS',
+ 'HAS_A',
+ 'MENTIONED',
+])
+
+export const ALIAS_RELATION_CUES = new Set([
+ 'KNOWN_AS',
+ 'ALSO_CALLED',
+ 'ALIAS',
+ 'ALIAS_OF',
+ 'AKA',
+ 'CALLED',
+ 'NAMED_AFTER',
+ 'NAMED_FOR',
+])
+
+export const ALIAS_ASSIGNMENT_CUES = new Set([
+ 'KNOWN_AS',
+ 'ALSO_CALLED',
+ 'ALIAS',
+ 'ALIAS_OF',
+ 'AKA',
+ 'CALLED',
+])
+
+const PREDICATE_ALIAS_BY_NAME = buildPredicateAliasMap()
+
+function buildPredicateAliasMap(): Map<string, { canonical: string; alias: PredicateAliasSpec }> {
+ const map = new Map()
+ for (const spec of PREDICATE_SPECS) {
+ map.set(sanitizePredicate(spec.name), { canonical: spec.name, alias: { name: spec.name } })
+ for (const alias of spec.aliases ?? []) {
+ const key = sanitizePredicate(alias.name)
+ if (key === spec.name && alias.swap) {
+ throw new Error(`Ontology alias ${alias.name} cannot self-map with swap`)
+ }
+ map.set(key, { canonical: spec.name, alias })
+ }
+ }
+ return map
+}
+
+export function sanitizePredicate(predicate: string): string {
+ return predicate
+ .trim()
+ .toUpperCase()
+ .replace(/[\s-]+/g, '_')
+ .replace(/[^A-Z0-9_]/g, '')
+}
+
+export function isSymmetricPredicate(predicate: string): boolean {
+ return SYMMETRIC_PREDICATES.has(sanitizePredicate(predicate))
+}
+
+export function normalizePredicateWithDirection(predicate: string): PredicateNormalization {
+ const original = sanitizePredicate(predicate)
+ if (ALIAS_RELATION_CUES.has(original)) {
+ return {
+ original,
+ predicate: original,
+ valid: false,
+ swapSubjectObject: false,
+ symmetric: false,
+ }
+ }
+
+ const resolved = PREDICATE_ALIAS_BY_NAME.get(original)
+ const normalized = resolved?.canonical ?? original
+ const valid = ALL_PREDICATES.has(normalized) && !GENERIC_DISALLOWED_PREDICATES.has(normalized)
+ return {
+ original,
+ predicate: normalized,
+ valid,
+ swapSubjectObject: !!resolved?.alias.swap,
+ symmetric: isSymmetricPredicate(normalized),
+ ...(resolved?.alias.temporalStatus ? { temporalStatus: resolved.alias.temporalStatus } : {}),
+ }
+}
+
+export function validatePredicateTypes(
+ predicate: string,
+ subjectType?: string | undefined,
+ objectType?: string | undefined,
+): PredicateTypeValidation {
+ const normalized = normalizePredicateWithDirection(predicate)
+ if (!normalized.valid) {
+ return {
+ valid: false,
+ domainValid: false,
+ rangeValid: false,
+ reason: 'invalid-predicate',
+ }
+ }
+ const spec = PREDICATE_BY_NAME.get(normalized.predicate)
+ if (!spec) {
+ return {
+ valid: false,
+ domainValid: false,
+ rangeValid: false,
+ reason: 'missing-predicate-spec',
+ }
+ }
+ const domainValid = typeAllowed(spec.domain, subjectType)
+ const rangeValid = typeAllowed(spec.range, objectType)
+ return {
+ valid: domainValid && rangeValid,
+ domainValid,
+ rangeValid,
+ ...(!domainValid || !rangeValid ? { reason: 'domain-range-mismatch' } : {}),
+ }
+}
+
+function typeAllowed(allowed: readonly EntityType[] | readonly ['*'], type?: string | undefined): boolean {
+ if ((allowed as readonly string[]).includes('*')) return true
+ if (!type) return true
+ return (allowed as readonly string[]).includes(type)
+}
 /**
- * Get the full predicate vocabulary formatted for the extraction prompt.
- * Organized by entity-type pair so the model can self-select relevant predicates.
+ * Get canonical predicates formatted for extraction and intent prompts.
+ * Synonyms are intentionally omitted so the model emits a compact vocabulary.
 */
 export function getPredicatesForPrompt(): string {
- return `Predicate vocabulary (choose from this list when applicable):
-
-Person → Person: ${PERSON_PERSON.join(', ')}
-Person → Organization: ${PERSON_ORGANIZATION.join(', ')}
-Person → Location: ${PERSON_LOCATION.join(', ')}
-Person → Work/Product: ${PERSON_WORK.join(', ')}
-Person → Concept/Event: ${PERSON_CONCEPT.join(', ')}
-Organization → Organization: ${ORG_ORG.join(', ')}
-Organization → Location: ${ORG_LOCATION.join(', ')}
-Organization → Product: ${ORG_PRODUCT.join(', ')}
-Location → Location: ${LOCATION_LOCATION.join(', ')}
-Concept → Concept: ${CONCEPT_CONCEPT.join(', ')}
-Event relations: ${EVENT_RELATIONS.join(', ')}
-Technology/Law: ${TECHNICAL_RELATIONS.join(', ')}
-General (any pair): ${GENERAL_RELATIONS.join(', ')}
-
-Use ONLY predicates from this vocabulary. Do not invent new predicate names.`
-}
+ const byCategory = new Map()
+ for (const spec of PREDICATE_SPECS) {
+ const list = byCategory.get(spec.category) ?? []
+ list.push(spec)
+ byCategory.set(spec.category, list)
+ }
-/** All canonical predicates from the ontology (flattened). */
-export const ALL_PREDICATES = new Set([
- ...PERSON_PERSON, ...PERSON_ORGANIZATION, ...PERSON_LOCATION,
- ...PERSON_WORK, ...PERSON_CONCEPT,
- ...ORG_ORG, ...ORG_LOCATION, ...ORG_PRODUCT,
- ...LOCATION_LOCATION, ...CONCEPT_CONCEPT,
- ...EVENT_RELATIONS, ...TECHNICAL_RELATIONS, ...GENERAL_RELATIONS,
-])
+ const lines = [...byCategory.entries()].map(([category, specs]) =>
+ `${category}: ${specs.map(spec => spec.name).join(', ')}`
+ )
+
+ return `Predicate vocabulary (choose from this canonical list when applicable):
+
+${lines.join('\n')}
+
+Use ONLY predicates from this vocabulary. Do not invent new predicate names.
Use aliases only to understand source phrasing, not as output predicate names.` +} diff --git a/packages/sdk/src/index-engine/triple-extractor.ts b/packages/sdk/src/index-engine/triple-extractor.ts index 0e6e004..a5d8216 100644 --- a/packages/sdk/src/index-engine/triple-extractor.ts +++ b/packages/sdk/src/index-engine/triple-extractor.ts @@ -1,8 +1,8 @@ import { z } from 'zod/v4-mini' import type { LLMProvider } from '../types/llm-provider.js' import type { KnowledgeGraphBridge } from '../types/graph-bridge.js' -import type { Visibility } from '../types/typegraph-document.js' -import { getPredicatesForPrompt } from './ontology.js' +import type { Visibility } from '../types/source.js' +import { ENTITY_TYPES, ENTITY_TYPES_LIST, VALID_ENTITY_TYPES, getPredicatesForPrompt } from './ontology.js' export interface TripleExtractorConfig { /** LLM for entity extraction (Pass 1 in two-pass mode) or the single combined call. */ @@ -30,6 +30,9 @@ interface ExtractedRelationship { confidence: number description?: string | undefined evidenceText?: string | undefined + temporalStatus?: 'current' | 'former' | 'historical' | 'unknown' | undefined + validFrom?: string | undefined + validTo?: string | undefined } interface ExtractionResult { @@ -43,17 +46,6 @@ export interface EntityContext { type: string } -// ── Entity types ── - -const ENTITY_TYPES = [ - 'person', 'organization', 'location', 'product', 'concept', 'event', - 'work_of_art', 'technology', 'law_regulation', 'time_period', -] as const - -const VALID_ENTITY_TYPES = new Set(ENTITY_TYPES) - -const ENTITY_TYPES_LIST = ENTITY_TYPES.join(', ') - // ── Zod schemas for structured output ── const entitySchema = z.array(z.object({ @@ -70,6 +62,9 @@ const relationshipSchema = z.array(z.object({ confidence: z.number(), description: z.optional(z.string()), evidenceText: z.optional(z.string()), + temporalStatus: z.optional(z.enum(['current', 'former', 'historical', 'unknown'])), + validFrom: z.optional(z.string()), + validTo: z.optional(z.string()), })) const singlePassSchema = z.object({ @@ -515,6 +510,9 @@ function postProcessExtraction( confidence: typeof rel.confidence === 'number' ? rel.confidence : 1, description: sanitizeField(rel.description ?? ''), evidenceText: sanitizeField(rel.evidenceText ?? ''), + temporalStatus: rel.temporalStatus, + validFrom: rel.validFrom ? sanitizeField(rel.validFrom) : undefined, + validTo: rel.validTo ? sanitizeField(rel.validTo) : undefined, }) } @@ -523,12 +521,12 @@ function postProcessExtraction( // ── Single-pass prompt (default) ── -function buildSinglePassPrompt(content: string, entityContext?: EntityContext[], documentTitle?: string): string { +function buildSinglePassPrompt(content: string, entityContext?: EntityContext[], sourceTitle?: string): string { const contextSection = entityContext?.length - ? `\nPreviously identified entities in this document:\n${entityContext.map(e => `- ${e.name} (${e.type})`).join('\n')}\n\nUse these names as canonical entities when the text refers to them by pronoun, abbreviation, surname, title, epithet, or pseudonym. Preserve any newly observed surface form as an alias instead of creating a duplicate entity.\n` + ? `\nPreviously identified entities in this source:\n${entityContext.map(e => `- ${e.name} (${e.type})`).join('\n')}\n\nUse these names as canonical entities when the text refers to them by pronoun, abbreviation, surname, title, epithet, or pseudonym. 
Preserve any newly observed surface form as an alias instead of creating a duplicate entity.\n` : '' - const titleSection = documentTitle - ? `\nThe text string is from a document titled: "${documentTitle}". Entities referenced in the title should be extracted as primary entities using their full formal names.\n` + const titleSection = sourceTitle + ? `\nThe text string is from a source titled: "${sourceTitle}". Entities referenced in the title should be extracted as primary entities using their full formal names.\n` : '' return `Your task is to extract all named entities, and relationships between them, from a text string. @@ -546,6 +544,7 @@ For each entity, provide: Events: "2024 United States presidential election" not "the election"; "1984 Summer Olympics" not "1984 games"; "CES 2025" not "CES"; "World War II" not "the war" Legal/Science: "General Data Protection Regulation" not "GDPR"; "Clean Air Act of 1970" not "Clean Air Act"; "Hubble Space Telescope" not "Hubble"; "CRISPR-Cas9" not "CRISPR" Products: "iPhone 16 Pro Max" not "iPhone"; "Tesla Model 3" not "Model 3"; "GPT-4" not "GPT" + Documents: "Acme master services agreement" not "MSA"; "Q4 architecture review deck" not "deck"; "SOC2 readiness report" not "report" Culture: "Naismith Memorial Basketball Hall of Fame" not "Hall of Fame"; "Academy Award for Best Picture" not "Best Picture"; "The Great Gatsby" not "Gatsby" - "type": One of: ${ENTITY_TYPES_LIST} - "description": A one-sentence description of what this entity IS — its defining attributes, NOT its relationships to other entities @@ -554,7 +553,7 @@ For each entity, provide: NEVER include as aliases: - Pronouns or pronoun phrases (he, she, it, they, them, we, his, her, its) - Generic references (the team, the roster, the company, the city, the league, the organization, the event, the protocol, the framework, the ingredient) - - Surnames or first names alone as canonical entity names (Curry, Obama, Kevin, Marie). A bare surname may be an alias only when the same passage or prior context clearly ties it to a full person entity, e.g. "Conway" after "Cole Conway" + - Surnames or first names alone as canonical entity names (Curry, Obama, Kevin, Marie). A bare surname may be an alias only when the same chunk or prior context clearly ties it to a full person entity, e.g. "Conway" after "Cole Conway" - Names of DIFFERENT entities — "FIBA Hall of Fame" and "Naismith Hall of Fame" are SEPARATE entities; "React" and "React Native" are SEPARATE; "Python 2" and "Python 3" are SEPARATE - Descriptive phrases (the American team, the defending champions, the former president, the lead researcher, the main ingredient) - Country/city names for their teams — "France" is NOT an alias of "France men's national basketball team"; "Brazil" is NOT an alias of "Brazil national football team" @@ -568,7 +567,7 @@ Entity rules: 4. Actors over settings — prefer entities that DO things over entities that are merely locations or backdrops - Omit entities that appear only in lists, parenthetical asides, or as minor supporting context with no described relationships. - Only extract specific named entities — NOT dates, dollar amounts, percentages, or generic descriptions -- Exception: when the text directly states a named person's or organization's profession, office, or role, extract that role label as a "concept" entity so it can participate in a structured relationship. 
Examples: "doctor", "pilot", "house surgeon" +- Exception: when the text directly states a named person's or organization's profession, office, or role, extract that role label as a "role" entity so it can participate in a structured relationship. Examples: "doctor", "pilot", "house surgeon", "CTO" - If an entity is referred to by multiple names (e.g., "OpenAI" and "the company"), list the proper name variants as aliases — NOT the generic reference - Include important entities even if they only appear once - Preserve complete person surface forms exactly when present. If the text says a person is "calling himself Cole Conway" or "known as Cole Conway", include "Cole Conway" as the entity name or alias — not only "Conway". @@ -581,7 +580,7 @@ Entity rules: - For events, awards, seasons, software versions, product generations, or any time/version-specific entities, ALWAYS include the year, version, or edition in the name. Each distinct occurrence is a SEPARATE entity — e.g., "2023 NBA Finals" and "2024 NBA Finals" are different, "Python 2" and "Python 3" are different, "iPhone 15" and "iPhone 16" are different, "HTTP/1.1" and "HTTP/2" are different, "Michelin Guide 2024" and "Michelin Guide 2025" are different. - Different awards are ALWAYS separate entities even when they share words — "NBA Finals MVP" and "NBA MVP" are SEPARATE; "Academy Award for Best Picture" and "Academy Award for Best Director" are SEPARATE; "Nobel Peace Prize" and "Nobel Prize in Physics" are SEPARATE - Entities with opposing directional or categorical qualifiers are ALWAYS separate — "Western Conference" and "Eastern Conference" are SEPARATE; "North Atlantic Treaty Organization" and "South Asian Association" are SEPARATE; "Upper Egypt" and "Lower Egypt" are SEPARATE -- Profession and role statements should become structured edges when supported by the text. Examples: "Steve Sharp, a pilot by profession" → Steve Sharp WORKS_AS pilot; "Elsie Inglis was a doctor" → Elsie Inglis WORKS_AS doctor; "She served as a house surgeon" → person HELD_ROLE house surgeon +- Profession and role statements should become structured edges when supported by the text. Examples: "Steve Sharp, a pilot by profession" -> Steve Sharp WORKS_AS pilot; "Elsie Inglis was a doctor" -> Elsie Inglis WORKS_AS doctor; "She served as a house surgeon" -> person WORKS_AS house surgeon CRITICAL — Aliases vs. Relationships: - An ALIAS is a different name for THE SAME entity (e.g., "NYC" is an alias for "New York City") @@ -599,12 +598,17 @@ For each relationship between the entities you identified, provide: - "confidence": How confident you are (0.0 to 1.0) - "description": One standalone sentence describing the relationship as a complete fact. It must be understandable without the source text. - "evidenceText": A concise source-backed excerpt or paraphrase that justifies the relationship. Keep it short; do not include full paragraphs. +- "temporalStatus": Optional. Use "former" for past-tense relationships, "current" for current relationships, "historical" for historical/narrative facts, or "unknown" when unclear. +- "validFrom" / "validTo": Optional ISO-like date strings only when the text states explicit dates or bounded periods. ${getPredicatesForPrompt()} Relationship rules: - Subject and object MUST be entities from Step 1 — do not introduce new entities - Use ONLY predicates from the vocabulary above. Do not invent relation names; omit the relationship if no predicate fits. +- Emit canonical predicates only. 
Do not emit aliases such as WORKED_FOR, LED, CO_FOUNDED, KNOWN_AS, AKA, or ALIAS. +- Use the same canonical predicate for current and former facts; put tense in temporalStatus instead of the predicate name. +- Use IS_A for taxonomy/classification and WORKS_AS for employment, title, job, function, or role relationships. - Preserve logical direction. Passive voice must be converted to the active graph direction: "X was killed by Y" becomes Y KILLED X; "X was founded by Y" becomes Y FOUNDED X. - Use MARRIED for spouse, husband, wife, wed, or married relationships. Do not emit HUSBAND_OF, WIFE_OF, SPOUSE_OF, or MARRIED_TO. - Use PARENT_OF for father/mother/parent relationships, CHILD_OF for son/daughter/child relationships, and SIBLING_OF for brother/sister/sibling relationships. @@ -625,7 +629,7 @@ Output: {"name": "Nancy Wade", "type": "person", "description": "Mother of Cousin Cæsar", "aliases": []}, {"name": "Big-sis", "type": "person", "description": "Caretaker of Cousin Cæsar during childhood", "aliases": []}, {"name": "Steve Sharp", "type": "person", "description": "Pilot and partner of Cousin Cæsar in the card game", "aliases": ["Sharp"]}, - {"name": "pilot", "type": "concept", "description": "A profession practiced by Steve Sharp", "aliases": []}, + {"name": "pilot", "type": "role", "description": "A profession practiced by Steve Sharp", "aliases": []}, {"name": "Paducah, Kentucky", "type": "location", "description": "City in Kentucky where Cousin Cæsar uses the name Cole Conway", "aliases": ["Paducah"]}, {"name": "West Tennessee", "type": "location", "description": "Region where Cousin Cæsar was born", "aliases": []}, {"name": "Rob Roy", "type": "person", "description": "Wood cutter who worked for Old Smith", "aliases": ["Roy"]}, @@ -633,9 +637,9 @@ Output: {"name": "Tennessee River", "type": "location", "description": "River near Old Smith's farm", "aliases": []} ], "relationships": [ {"subject": "Cousin Cæsar", "predicate": "CHILD_OF", "object": "Nancy Wade", "confidence": 0.95, "description": "Cousin Cæsar was born to Nancy Wade.", "evidenceText": "Cousin Cæsar was born to Nancy Wade"}, - {"subject": "Cousin Cæsar", "predicate": "BORN_IN", "object": "West Tennessee", "confidence": 0.95, "description": "Cousin Cæsar was born in West Tennessee.", "evidenceText": "born to Nancy Wade in West Tennessee"}, - {"subject": "Cousin Cæsar", "predicate": "TRAVELED_TO", "object": "Paducah, Kentucky", "confidence": 0.85, "description": "Cousin Cæsar later went to Paducah, Kentucky.", "evidenceText": "we find Cousin Cæsar in Paducah, Kentucky"}, - {"subject": "Cousin Cæsar", "predicate": "COLLABORATED_WITH", "object": "Steve Sharp", "confidence": 0.95, "description": "Cousin Cæsar and Steve Sharp were partners in a card game.", "evidenceText": "in company with one Steve Sharp; they were partners"}, + {"subject": "Cousin Cæsar", "predicate": "LOCATED_IN", "object": "West Tennessee", "confidence": 0.95, "description": "Cousin Cæsar was born in West Tennessee.", "evidenceText": "born to Nancy Wade in West Tennessee"}, + {"subject": "Cousin Cæsar", "predicate": "LOCATED_IN", "object": "Paducah, Kentucky", "confidence": 0.85, "description": "Cousin Cæsar later went to Paducah, Kentucky.", "evidenceText": "we find Cousin Cæsar in Paducah, Kentucky"}, + {"subject": "Cousin Cæsar", "predicate": "PARTNERED_WITH", "object": "Steve Sharp", "confidence": 0.95, "description": "Cousin Cæsar and Steve Sharp were partners in a card game.", "evidenceText": "in company with one Steve Sharp; they were partners"}, 
{"subject": "Steve Sharp", "predicate": "WORKS_AS", "object": "pilot", "confidence": 0.9, "description": "Steve Sharp worked as a pilot.", "evidenceText": "Sharp, a pilot by profession"}, {"subject": "Old Smith", "predicate": "EMPLOYED", "object": "Rob Roy", "confidence": 0.9, "description": "Old Smith employed Rob Roy to cut wood.", "evidenceText": "Rob Roy cut wood for Old Smith"} ]} @@ -652,12 +656,12 @@ ${content}` // ── Two-pass prompts ── -function buildEntityExtractionPrompt(content: string, entityContext?: EntityContext[], documentTitle?: string): string { +function buildEntityExtractionPrompt(content: string, entityContext?: EntityContext[], sourceTitle?: string): string { const contextSection = entityContext?.length ? `\nPreviously identified entities in the text string:\n${entityContext.map(e => `- ${e.name} (${e.type})`).join('\n')}\n\nUse these names as canonical entities when the text refers to them by pronoun, abbreviation, surname, title, epithet, or pseudonym. Preserve any newly observed surface form as an alias instead of creating a duplicate entity.\n` : '' - const titleSection = documentTitle - ? `\nThe text string is from a document titled: "${documentTitle}". Entities referenced in the title should be extracted as primary entities using their full formal and canonical names.\n` + const titleSection = sourceTitle + ? `\nThe text string is from a source titled: "${sourceTitle}". Entities referenced in the title should be extracted as primary entities using their full formal and canonical names.\n` : '' return `Your task is to extract all named entities from a text string. @@ -682,7 +686,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont -- NEVER include as aliases: --- Pronouns or pronoun phrases (he, she, it, they, them, we, his, her, its) --- Generic references (the team, the roster, the company, the city, the league, the organization, the event, the protocol, the framework, the ingredient) - --- Surnames or first names alone as canonical entity names (Curry, Obama, Kevin, Marie). A bare surname may be an alias only when the same passage or prior context clearly ties it to a full person entity, e.g. "Conway" after "Cole Conway" + --- Surnames or first names alone as canonical entity names (Curry, Obama, Kevin, Marie). A bare surname may be an alias only when the same chunk or prior context clearly ties it to a full person entity, e.g. "Conway" after "Cole Conway" --- Names of DIFFERENT entities — "FIBA Hall of Fame" and "Naismith Hall of Fame" are SEPARATE entities; "React" and "React Native" are SEPARATE; "Python 2" and "Python 3" are SEPARATE --- Descriptive phrases (the American team, the defending champions, the former president, the lead researcher, the main ingredient) --- Country/city names for their teams — "France" is NOT an alias of "France men's national basketball team"; "Brazil" is NOT an alias of "Brazil national football team" @@ -699,7 +703,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont -- 4. ACTORS OVER SETTINGS — prefer entities that DO things over entities that are merely locations or backdrops -- Omit entities that appear only in lists, parenthetical asides, or as minor supporting context with no described relationships. - Only extract specific named entities. 
NOT dates, dollar amounts, percentages, or generic descriptions - - Exception: when the text directly states a named person's or organization's profession, office, or role, extract that role label as a "concept" entity so it can participate in a structured relationship. Examples: "doctor", "pilot", "house surgeon" + - Exception: when the text directly states a named person's or organization's profession, office, or role, extract that role label as a "role" entity so it can participate in a structured relationship. Examples: "doctor", "pilot", "house surgeon", "CTO" - If an entity is referred to by multiple names (e.g., "OpenAI" and "the company"), list the proper name variants as aliases — NOT the generic reference - Include important entities even if they only appear once - Preserve complete person surface forms exactly when present. If the text says a person is "calling himself Cole Conway" or "known as Cole Conway", include "Cole Conway" as the entity name or alias — not only "Conway". @@ -713,7 +717,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont - For events, awards, seasons, software versions, product generations, or any time/version-specific entities, ALWAYS include the year, version, or edition in the name. Each distinct occurrence is a SEPARATE entity — e.g., "2023 NBA Finals" and "2024 NBA Finals" are different, "Python 2" and "Python 3" are different, "iPhone 15" and "iPhone 16" are different, "HTTP/1.1" and "HTTP/2" are different, "Michelin Guide 2024" and "Michelin Guide 2025" are different. - Different awards are ALWAYS separate entities even when they share words — "NBA Finals MVP" and "NBA MVP" are SEPARATE; "Academy Award for Best Picture" and "Academy Award for Best Director" are SEPARATE; "Nobel Peace Prize" and "Nobel Prize in Physics" are SEPARATE - Entities with opposing directional or categorical qualifiers are ALWAYS separate — "Western Conference" and "Eastern Conference" are SEPARATE; "North Atlantic Treaty Organization" and "South Asian Association" are SEPARATE; "Upper Egypt" and "Lower Egypt" are SEPARATE - - Profession and role statements should become structured edges when supported by the text. Examples: "Steve Sharp, a pilot by profession" → Steve Sharp WORKS_AS pilot; "Elsie Inglis was a doctor" → Elsie Inglis WORKS_AS doctor; "She served as a house surgeon" → person HELD_ROLE house surgeon + - Profession and role statements should become structured edges when supported by the text. Examples: "Steve Sharp, a pilot by profession" -> Steve Sharp WORKS_AS pilot; "Elsie Inglis was a doctor" -> Elsie Inglis WORKS_AS doctor; "She served as a house surgeon" -> person WORKS_AS house surgeon @@ -727,7 +731,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont - Test: Could you replace one name with the other in any sentence and preserve meaning? If yes → alias. If no → separate entities with a relationship. ACRONYM / INITIALISM CANONICALIZATION RULES: - - Never use an acronym, abbreviation, or initialism as the canonical "name" when a fuller proper name is available in the text, document title, prior entity context, or common domain context. + - Never use an acronym, abbreviation, or initialism as the canonical "name" when a fuller proper name is available in the text, source title, prior entity context, or common domain context. - Use the expanded full name as "name" and put the acronym/initialism in "aliases". - Examples: - Use "Time Variance Authority" as name, aliases ["TVA"]. 
@@ -756,7 +760,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont {"name": "Nancy Wade", "type": "person", "description": "Mother of Cousin Cæsar", "aliases": []}, {"name": "Big-sis", "type": "person", "description": "Caretaker of Cousin Cæsar during childhood", "aliases": []}, {"name": "Steve Sharp", "type": "person", "description": "Pilot and partner of Cousin Cæsar in the card game", "aliases": []}, - {"name": "pilot", "type": "concept", "description": "A profession practiced by Steve Sharp", "aliases": []}, + {"name": "pilot", "type": "role", "description": "A profession practiced by Steve Sharp", "aliases": []}, {"name": "Paducah, Kentucky", "type": "location", "description": "City in Kentucky where Cousin Cæsar uses the name Cole Conway", "aliases": []}, {"name": "West Tennessee", "type": "location", "description": "Region where Cousin Cæsar was born", "aliases": []}, {"name": "Rob Roy", "type": "person", "description": "Wood cutter who worked for Old Smith", "aliases": []}, @@ -781,7 +785,7 @@ function buildEntityExtractionPrompt(content: string, entityContext?: EntityCont {"name": "Nancy Wade", "type": "person", "description": "Mother of Cousin Cæsar", "aliases": []}, {"name": "Big-sis", "type": "person", "description": "Caretaker of Cousin Cæsar during childhood", "aliases": []}, {"name": "Steve Sharp", "type": "person", "description": "Pilot and partner of Cousin Cæsar in the card game", "aliases": []}, - {"name": "pilot", "type": "concept", "description": "A profession practiced by Steve Sharp", "aliases": []}, + {"name": "pilot", "type": "role", "description": "A profession practiced by Steve Sharp", "aliases": []}, {"name": "Paducah, Kentucky", "type": "location", "description": "City in Kentucky where Cousin Cæsar uses the name Cole Conway", "aliases": []}, {"name": "West Tennessee", "type": "location", "description": "Region where Cousin Cæsar was born", "aliases": []}, {"name": "Rob Roy", "type": "person", "description": "Wood cutter who worked for Old Smith", "aliases": []}, @@ -840,6 +844,8 @@ For each relationship, provide: - "confidence": How confident you are this relationship is stated or strongly implied (0.0 to 1.0) - "description": One standalone sentence describing the relationship as a complete fact. - "evidenceText": A concise source-backed excerpt or paraphrase that justifies the relationship. +- "temporalStatus": Optional. Use "former" for past-tense relationships, "current" for current relationships, "historical" for historical/narrative facts, or "unknown" when unclear. +- "validFrom" / "validTo": Optional ISO-like date strings only when the text states explicit dates or bounded periods. @@ -847,6 +853,9 @@ For each relationship, provide: - Subject and object MUST be from the entity list listed below — do not introduce new entities - Use ONLY predicates from the vocabulary listed below. Do not invent relation names; omit the relationship if no predicate fits. +- Emit canonical predicates only. Do not emit aliases such as WORKED_FOR, LED, CO_FOUNDED, KNOWN_AS, AKA, or ALIAS. +- Use the same canonical predicate for current and former facts; put tense in temporalStatus instead of the predicate name. +- Use IS_A for taxonomy/classification and WORKS_AS for employment, title, job, function, or role relationships. - Preserve logical direction. Passive voice must be converted to the active graph direction: "X was killed by Y" becomes Y KILLED X; "X was founded by Y" becomes Y FOUNDED X. 
- Use MARRIED for spouse, husband, wife, wed, or married relationships. Do not emit HUSBAND_OF, WIFE_OF, SPOUSE_OF, or MARRIED_TO. - Use PARENT_OF for father/mother/parent relationships, CHILD_OF for son/daughter/child relationships, and SIBLING_OF for brother/sister/sibling relationships. @@ -856,7 +865,7 @@ For each relationship, provide: - Extract relationships that are explicitly stated or strongly implied in the text - Do not emit self-relationships or alias relationships. If two names refer to the same entity, they belong in aliases from the entity step, not in the relationships array. - Do not connect an entity to a generic description or role unless that role was extracted as a specific named entity. -- When the text directly states a profession, office, or role for a named entity, emit a structured relationship to that role concept. Examples: person WORKS_AS doctor, person HELD_ROLE house surgeon, person PRACTICED_AS physician +- When the text directly states a profession, office, or role for a named entity, emit a structured relationship to that role entity. Examples: person WORKS_AS doctor, person WORKS_AS house surgeon, person WORKS_AS physician - Preserve important names, dates, places, objects, and negation in relationship descriptions and evidence text. - Return an empty array if no clear relationships exist between the entities listed below @@ -874,7 +883,7 @@ For each relationship, provide: {"name": "Nancy Wade", "type": "person", "description": "Mother of Cousin Cæsar", "aliases": []}, {"name": "Big-sis", "type": "person", "description": "Caretaker of Cousin Cæsar during childhood", "aliases": []}, {"name": "Steve Sharp", "type": "person", "description": "Pilot and partner of Cousin Cæsar in the card game", "aliases": []}, - {"name": "pilot", "type": "concept", "description": "A profession practiced by Steve Sharp", "aliases": []}, + {"name": "pilot", "type": "role", "description": "A profession practiced by Steve Sharp", "aliases": []}, {"name": "Paducah, Kentucky", "type": "location", "description": "City in Kentucky where Cousin Cæsar uses the name Cole Conway", "aliases": []}, {"name": "West Tennessee", "type": "location", "description": "Region where Cousin Cæsar was born", "aliases": []}, {"name": "Rob Roy", "type": "person", "description": "Wood cutter who worked for Old Smith", "aliases": []}, @@ -892,9 +901,9 @@ For each relationship, provide: [{"subject": "Cousin Cæsar", "predicate": "CHILD_OF", "object": "Nancy Wade", "confidence": 0.95, "description": "Cousin Cæsar was born to Nancy Wade.", "evidenceText": "Cousin Cæsar was born to Nancy Wade"}, - {"subject": "Cousin Cæsar", "predicate": "BORN_IN", "object": "West Tennessee", "confidence": 0.95, "description": "Cousin Cæsar was born in West Tennessee.", "evidenceText": "born to Nancy Wade in West Tennessee"}, - {"subject": "Cousin Cæsar", "predicate": "TRAVELED_TO", "object": "Paducah, Kentucky", "confidence": 0.85, "description": "Cousin Cæsar later went to Paducah, Kentucky.", "evidenceText": "we find Cousin Cæsar in Paducah, Kentucky"}, - {"subject": "Cousin Cæsar", "predicate": "COLLABORATED_WITH", "object": "Steve Sharp", "confidence": 0.95, "description": "Cousin Cæsar and Steve Sharp were partners in a card game.", "evidenceText": "in company with one Steve Sharp; they were partners"}, + {"subject": "Cousin Cæsar", "predicate": "LOCATED_IN", "object": "West Tennessee", "confidence": 0.95, "description": "Cousin Cæsar was born in West Tennessee.", "evidenceText": "born to Nancy Wade in West Tennessee"}, 
+ {"subject": "Cousin Cæsar", "predicate": "LOCATED_IN", "object": "Paducah, Kentucky", "confidence": 0.85, "description": "Cousin Cæsar later went to Paducah, Kentucky.", "evidenceText": "we find Cousin Cæsar in Paducah, Kentucky"}, + {"subject": "Cousin Cæsar", "predicate": "PARTNERED_WITH", "object": "Steve Sharp", "confidence": 0.95, "description": "Cousin Cæsar and Steve Sharp were partners in a card game.", "evidenceText": "in company with one Steve Sharp; they were partners"}, {"subject": "Steve Sharp", "predicate": "WORKS_AS", "object": "pilot", "confidence": 0.9, "description": "Steve Sharp worked as a pilot.", "evidenceText": "Sharp, a pilot by profession"}, {"subject": "Old Smith", "predicate": "EMPLOYED", "object": "Rob Roy", "confidence": 0.9, "description": "Old Smith employed Rob Roy to cut wood.", "evidenceText": "Rob Roy cut wood for Old Smith"}] @@ -957,10 +966,10 @@ export class TripleExtractor { content: string, bucketId: string, chunkIndex?: number, - documentId?: string, + sourceId?: string, metadata?: Record, entityContext?: EntityContext[], - documentTitle?: string, + sourceTitle?: string, identity?: { tenantId?: string | undefined groupId?: string | undefined @@ -974,7 +983,7 @@ export class TripleExtractor { if (!this.graph.addTriple && !this.graph.addEntityMentions) return { entities: [] } const cleanContent = sanitizeText(content) - const cleanTitle = documentTitle ? sanitizeField(documentTitle) : undefined + const cleanTitle = sourceTitle ? sanitizeField(sourceTitle) : undefined const raw = this.twoPass ? await this.extractTwoPass(cleanContent, entityContext, cleanTitle) : await this.extractSinglePass(cleanContent, entityContext, cleanTitle) @@ -989,7 +998,7 @@ export class TripleExtractor { content: cleanContent, bucketId, ...(chunkIndex !== undefined ? { chunkIndex } : {}), - ...(documentId ? { documentId } : {}), + ...(sourceId ? { sourceId } : {}), ...(identity?.tenantId ? { tenantId: identity.tenantId } : {}), ...(identity?.groupId ? { groupId: identity.groupId } : {}), ...(identity?.userId ? { userId: identity.userId } : {}), @@ -1024,12 +1033,15 @@ export class TripleExtractor { objectDescription: objectEntity.description, relationshipDescription: rel.description, evidenceText: rel.evidenceText, + temporalStatus: rel.temporalStatus, + validFrom: rel.validFrom, + validTo: rel.validTo, sourceChunkId, confidence: typeof rel.confidence === 'number' ? Math.max(0, Math.min(1, rel.confidence)) : 1.0, content: cleanContent, bucketId, ...(chunkIndex !== undefined ? { chunkIndex } : {}), - ...(documentId ? { documentId } : {}), + ...(sourceId ? { sourceId } : {}), ...(identity?.tenantId ? { tenantId: identity.tenantId } : {}), ...(identity?.groupId ? { groupId: identity.groupId } : {}), ...(identity?.userId ? { userId: identity.userId } : {}), @@ -1044,17 +1056,13 @@ export class TripleExtractor { return { entities: entities.map(e => ({ name: e.name, type: e.type })) } } - async persistPassageNodes(nodes: Parameters>[0]): Promise { - await this.graph.upsertPassageNodes?.(nodes) - } - /** Single combined LLM call for entities + relationships. Used only when twoPass is disabled. 
*/ private async extractSinglePass( content: string, entityContext?: EntityContext[], - documentTitle?: string, + sourceTitle?: string, ): Promise { - const prompt = buildSinglePassPrompt(content, entityContext, documentTitle) + const prompt = buildSinglePassPrompt(content, entityContext, sourceTitle) const result = await this.llm.generateJSON( prompt, 'You are a precise knowledge graph extractor. Preserve complete named surface forms, model pseudonyms as aliases, reject generic one-token entities, and return only valid JSON.', @@ -1077,11 +1085,11 @@ export class TripleExtractor { private async extractTwoPass( content: string, entityContext?: EntityContext[], - documentTitle?: string, + sourceTitle?: string, ): Promise { // Pass 1: Extract entities const rawEntities = await this.llm.generateJSON( - buildEntityExtractionPrompt(content, entityContext, documentTitle), + buildEntityExtractionPrompt(content, entityContext, sourceTitle), 'You are a precise named entity extractor. Preserve complete named surface forms, model pseudonyms as aliases, reject generic one-token entities, and return only valid JSON arrays.', { schema: entitySchema }, ) diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index 7306abf..bcbf2ea 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -1,12 +1,13 @@ // Main public API export { typegraphInit, typegraphDeploy, resolveEmbeddingProvider, resolveLLMProvider, DEFAULT_BUCKET_ID } from './typegraph.js' -export type { typegraphConfig, typegraphInstance, BucketsApi, DocumentsApi, JobsApi, GraphApi } from './typegraph.js' +export type { typegraphConfig, typegraphInstance, BucketsApi, SourcesApi, JobsApi, GraphApi } from './typegraph.js' /** @deprecated Use LLMConfig instead. */ export type { LLMInput } from './typegraph.js' // Types export type { - RawDocument, + SourceInput, + SourceSubject, ChunkOpts, Chunk, Bucket, @@ -22,7 +23,7 @@ export type { HashStoreAdapter, VectorStoreAdapter, UndeployResult, - ScoredChunkWithDocument, + ScoredChunkWithSource, QuerySignals, QueryChunkResult, QueryMemoryRecord, @@ -34,24 +35,26 @@ export type { QueryContextStats, RawScores, NormalizedScores, + QueryEntityScope, QueryOpts, QueryResponse, IngestOptions, IndexProgressEvent, IndexResult, ExtractionFailure, - typegraphDocument, - DocumentStatus, + typegraphSource, + SourceStatus, Visibility, - DocumentFilter, - UpsertDocumentInput, - UpsertedDocumentRecord, + SourceFilter, + UpsertSourceInput, + UpsertedSourceRecord, typegraphHooks, LLMProvider, LLMGenerateOptions, LLMConfig, typegraphIdentity, MemoryBridge, + MemorySubject, RememberOpts, ForgetOpts, CorrectOpts, @@ -59,6 +62,17 @@ export type { AddConversationTurnOpts, HealthCheckOpts, KnowledgeGraphBridge, + GraphEntityRef, + UpsertGraphEntityInput, + UpsertGraphEdgeInput, + UpsertGraphFactInput, + MergeGraphEntitiesInput, + MergeGraphEntitiesResult, + DeleteGraphEntityOpts, + DeleteGraphEntityResult, + EntityScopeResolution, + KnowledgeSearchOpts, + KnowledgeSearchResult, EntityResult, EntityDetail, EdgeResult, @@ -73,7 +87,8 @@ export type { GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, - PassageResult, + ChunkResult, + GraphIntentParserMode, GraphSearchProfile, GraphSearchOpts, GraphSearchTrace, @@ -96,7 +111,11 @@ export type { PolicyDecision, PolicyViolation, PolicyStoreAdapter, + ExternalId, + ExternalIdIdentityType, + ExternalIdEncoding, MemoryRecord, + ChunkRef, ConversationTurnResult, MemoryHealthReport, typegraphLogger, @@ -127,7 +146,38 @@ export type { AISDKLLMInput } 
from './llm/index.js' export { PolicyEngine, PolicyViolationError } from './governance/index.js' // Index engine -export { IndexEngine, defaultChunker, sha256, stripMarkdown } from './index-engine/index.js' +export { + IndexEngine, + defaultChunker, + sha256, + stripMarkdown, + ENTITY_TYPES, + DEFAULT_ENTITY_TYPE, + VALID_ENTITY_TYPES, + ENTITY_TYPES_LIST, + ENTITY_TYPE_SPECS, + PREDICATE_SPECS, + ALL_PREDICATES, + PREDICATE_BY_NAME, + SYMMETRIC_PREDICATES, + GENERIC_DISALLOWED_PREDICATES, + ALIAS_RELATION_CUES, + ALIAS_ASSIGNMENT_CUES, + sanitizePredicate, + isSymmetricPredicate, + getPredicatesForPrompt, + normalizePredicateWithDirection, + validatePredicateTypes, +} from './index-engine/index.js' +export type { + EntityType, + EntityTypeSpec, + PredicateAliasSpec, + PredicateSpec, + PredicateTemporalStatus, + PredicateNormalization, + PredicateTypeValidation, +} from './index-engine/index.js' // Query engine export { mergeAndRank, minMaxNormalize, calibrateSemantic, calibrateKeyword, normalizeGraphPPR } from './query/index.js' @@ -150,15 +200,17 @@ export type { SemanticEntity, EntityMentionType, SemanticEntityMention, - SemanticPassageNode, - SemanticPassageEntityEdge, + SemanticGraphNodeType, + SemanticGraphEdge, + SemanticEntityChunkEdge, + SemanticChunkRecord, SemanticEdge, SemanticFactRecord, SemanticFact, ProceduralMemory, GraphBackfillPageOpts, - PassageBackfillChunk, - PassageMentionBackfillRow, + ChunkBackfillRecord, + ChunkMentionBackfillRow, MemoryFilter, MemorySearchOpts, MemoryStoreAdapter, diff --git a/packages/sdk/src/memory/__tests__/consolidation-engine.test.ts b/packages/sdk/src/memory/__tests__/consolidation-engine.test.ts new file mode 100644 index 0000000..502ae66 --- /dev/null +++ b/packages/sdk/src/memory/__tests__/consolidation-engine.test.ts @@ -0,0 +1,59 @@ +import { describe, expect, it, vi } from 'vitest' +import { ConsolidationEngine } from '../consolidation/engine.js' +import type { EmbeddingProvider } from '../../embedding/provider.js' +import type { LLMProvider } from '../../types/llm-provider.js' +import type { MemoryStoreAdapter } from '../types/adapter.js' + +function createStore(): MemoryStoreAdapter { + return { + initialize: vi.fn(), + upsert: vi.fn(async record => record), + get: vi.fn(async () => null), + list: vi.fn(async () => []), + delete: vi.fn(), + invalidate: vi.fn(), + expire: vi.fn(), + getHistory: vi.fn(async () => []), + search: vi.fn(async () => []), + } +} + +describe('ConsolidationEngine', () => { + const scope = { tenantId: 'tenant-1' } + const llm: LLMProvider = { + generateText: vi.fn(), + generateJSON: vi.fn(), + } + const embedding: EmbeddingProvider = { + model: 'mock', + dimensions: 3, + embed: vi.fn(async () => [0, 0, 0]), + embedBatch: vi.fn(async texts => texts.map(() => [0, 0, 0])), + } + + it('treats null consolidation opts as omitted', async () => { + const store = createStore() + const engine = new ConsolidationEngine({ memoryStore: store, llm, embedding }) + + await expect(engine.consolidate(scope, null)).resolves.toEqual({ + factsExtracted: 0, + factsUpdated: 0, + proceduresCreated: 0, + communitiesDetected: 0, + episodesConsolidated: 0, + }) + }) + + it('treats null promotion opts as omitted', async () => { + const store = createStore() + const engine = new ConsolidationEngine({ memoryStore: store, llm, embedding }) + + await expect(engine.promoteEpisodicToSemantic(scope, null)).resolves.toEqual({ + factsExtracted: 0, + episodesConsolidated: 0, + }) + await expect(engine.promoteToProcedural(scope, 
null)).resolves.toEqual({ + proceduresCreated: 0, + }) + }) +}) diff --git a/packages/sdk/src/memory/__tests__/memory-bridge.test.ts b/packages/sdk/src/memory/__tests__/memory-bridge.test.ts index 1e61d48..733c293 100644 --- a/packages/sdk/src/memory/__tests__/memory-bridge.test.ts +++ b/packages/sdk/src/memory/__tests__/memory-bridge.test.ts @@ -1,7 +1,7 @@ import { describe, it, expect, vi } from 'vitest' import { createMemoryBridge } from '../memory-bridge.js' import type { MemoryStoreAdapter } from '../types/adapter.js' -import type { SemanticEntity, SemanticEdge } from '../types/memory.js' +import type { ExternalId, MemoryRecord, SemanticEntity, SemanticEdge, SemanticGraphEdge } from '../types/memory.js' import { buildScope } from '../types/scope.js' const testScope = buildScope({ userId: 'test-user' }) @@ -27,6 +27,7 @@ function mockStore() { getEdgesBatch: vi.fn().mockResolvedValue([]), findEdges: vi.fn().mockResolvedValue([]), invalidateEdge: vi.fn(), + invalidateGraphEdgesForNode: vi.fn(), } return store } @@ -84,6 +85,7 @@ describe('createMemoryBridge', () => { await bridge.forget('some-id', { ...testScope }) expect(store.invalidate).toHaveBeenCalledWith('some-id') + expect(store.invalidateGraphEdgesForNode).toHaveBeenCalledWith('memory', 'some-id') }) it('recall delegates to TypegraphMemory', async () => { @@ -98,4 +100,124 @@ describe('createMemoryBridge', () => { expect(Array.isArray(results)).toBe(true) expect(store.search).toHaveBeenCalled() }) + + it('treats null memory opts as omitted', async () => { + const store = mockStore() + const bridge = createMemoryBridge({ + memoryStore: store, + embedding: mockEmbedding(), + llm: mockLLM(), + scope: testScope, + }) + + await bridge.remember('test memory', null) + const results = await bridge.recall('query', null) + await bridge.forget('some-id', null) + + expect(Array.isArray(results)).toBe(true) + expect(store.upsert).toHaveBeenCalled() + expect(store.search).toHaveBeenCalled() + expect(store.invalidate).toHaveBeenCalledWith('some-id') + }) + + it('links memories to deterministic external-ID subjects and recalls by entity scope', async () => { + const store = mockStore() + const records: MemoryRecord[] = [] + const entities = new Map() + const edges: SemanticGraphEdge[] = [] + const email: ExternalId = { id: 'pat@example.com', type: 'email' } + + Object.assign(store, { + upsert: vi.fn().mockImplementation(async (record: MemoryRecord) => { + records.push(record) + return record + }), + findEntityByExternalId: vi.fn().mockImplementation(async (externalId: ExternalId) => { + return [...entities.values()].find(entity => + entity.externalIds?.some(id => + id.type === externalId.type && + id.id === externalId.id + ) + ) ?? null + }), + upsertEntity: vi.fn().mockImplementation(async (entity: SemanticEntity) => { + entities.set(entity.id, entity) + return entity + }), + upsertGraphEdges: vi.fn().mockImplementation(async (nextEdges: SemanticGraphEdge[]) => { + edges.push(...nextEdges) + }), + getMemoryIdsForEntities: vi.fn().mockImplementation(async (entityIds: string[]) => { + const ids = new Set(entityIds) + return edges + .filter(edge => edge.sourceType === 'memory' && edge.targetType === 'entity' && ids.has(edge.targetId)) + .map(edge => edge.sourceId) + }), + search: vi.fn().mockImplementation(async (_embedding, opts) => { + const ids = new Set(opts.filter?.ids ?? []) + return ids.size > 0 ? 
records.filter(record => ids.has(record.id)) : records + }), + }) + + const bridge = createMemoryBridge({ + memoryStore: store, + embedding: mockEmbedding(), + llm: mockLLM(), + }) + + const memory = await bridge.remember('Prefers SMS for urgent notices', { + tenantId: 'acme', + subject: { + externalIds: [email], + entityType: 'person', + }, + visibility: 'tenant', + }) + const recalled = await bridge.recall('urgent notices', { + tenantId: 'acme', + entityScope: { externalIds: [email] }, + }) + + expect(store.upsertEntity).toHaveBeenCalledWith(expect.objectContaining({ + name: 'pat@example.com', + entityType: 'person', + externalIds: [email], + visibility: 'tenant', + })) + expect(store.upsertGraphEdges).toHaveBeenCalledWith([expect.objectContaining({ + sourceType: 'memory', + sourceId: memory.id, + targetType: 'entity', + relation: 'ABOUT', + visibility: 'tenant', + })]) + expect(store.getMemoryIdsForEntities).toHaveBeenCalledWith([expect.any(String)], { tenantId: 'acme' }) + expect(recalled).toEqual([memory]) + }) + + it('returns empty scoped recall when external IDs resolve to no entity', async () => { + const store = mockStore() + const email: ExternalId = { id: 'missing@example.com', type: 'email' } + Object.assign(store, { + findEntityByExternalId: vi.fn().mockResolvedValue(null), + getMemoryIdsForEntities: vi.fn().mockResolvedValue(['should-not-be-used']), + search: vi.fn().mockResolvedValue([]), + }) + const bridge = createMemoryBridge({ + memoryStore: store, + embedding: mockEmbedding(), + llm: mockLLM(), + }) + + const recalled = await bridge.recall('urgent notices', { + tenantId: 'acme', + entityScope: { externalIds: [email] }, + }) + + expect(store.getMemoryIdsForEntities).not.toHaveBeenCalled() + expect(store.search).toHaveBeenCalledWith(expect.any(Array), expect.objectContaining({ + filter: expect.objectContaining({ ids: [] }), + })) + expect(recalled).toEqual([]) + }) }) diff --git a/packages/sdk/src/memory/__tests__/predicate-normalizer.test.ts b/packages/sdk/src/memory/__tests__/predicate-normalizer.test.ts index 0942027..9eaf236 100644 --- a/packages/sdk/src/memory/__tests__/predicate-normalizer.test.ts +++ b/packages/sdk/src/memory/__tests__/predicate-normalizer.test.ts @@ -35,7 +35,7 @@ describe('PredicateNormalizer', () => { symmetric: false, }) expect(normalizer.normalizeWithDirection('FOUNDED_BY').swapSubjectObject).toBe(true) - expect(normalizer.normalizeWithDirection('WRITTEN_BY').predicate).toBe('WROTE') + expect(normalizer.normalizeWithDirection('WRITTEN_BY').predicate).toBe('AUTHORED') expect(normalizer.normalizeWithDirection('OWNED_BY')).toEqual(expect.objectContaining({ predicate: 'OWNS', swapSubjectObject: true, @@ -60,13 +60,40 @@ describe('PredicateNormalizer', () => { expect(isSymmetricPredicate('MARRIED')).toBe(true) }) - it('keeps tense-significant predicates separate', async () => { + it('normalizes tense aliases to canonical predicates with temporal metadata', async () => { const normalizer = new PredicateNormalizer(mockEmbedding()) expect(await normalizer.normalize('WORKS_FOR')).toBe('WORKS_FOR') - expect(await normalizer.normalize('WORKED_FOR')).toBe('WORKED_FOR') - expect(await normalizer.normalize('WAS_EMPLOYED_BY')).toBe('WORKED_FOR') - expect(normalizer.normalizeWithDirection('WAS_EMPLOYED_BY').swapSubjectObject).toBe(true) + expect(await normalizer.normalize('WORKED_FOR')).toBe('WORKS_FOR') + expect(normalizer.normalizeWithDirection('WAS_EMPLOYED_BY')).toEqual(expect.objectContaining({ + predicate: 'WORKS_FOR', + swapSubjectObject: false, + 
temporalStatus: 'former', + })) + expect(normalizer.normalizeWithDirection('LED')).toEqual(expect.objectContaining({ + predicate: 'LEADS', + temporalStatus: 'former', + })) + }) + + it('rejects alias cues and prevents self-inverse swap bugs', () => { + const normalizer = new PredicateNormalizer(mockEmbedding()) + + expect(normalizer.normalizeWithDirection('KNOWN_AS')).toEqual(expect.objectContaining({ + predicate: 'KNOWN_AS', + valid: false, + swapSubjectObject: false, + })) + expect(normalizer.normalizeWithDirection('AMENDED_BY')).toEqual(expect.objectContaining({ + predicate: 'AMENDS', + valid: true, + swapSubjectObject: true, + })) + expect(normalizer.normalizeWithDirection('EMPLOYED_BY')).toEqual(expect.objectContaining({ + predicate: 'WORKS_FOR', + valid: true, + swapSubjectObject: false, + })) }) it('rejects invented predicates that are not in the ontology', async () => { diff --git a/packages/sdk/src/memory/consolidation/engine.ts b/packages/sdk/src/memory/consolidation/engine.ts index fa8dbb7..fd00bc0 100644 --- a/packages/sdk/src/memory/consolidation/engine.ts +++ b/packages/sdk/src/memory/consolidation/engine.ts @@ -5,6 +5,7 @@ import type { LLMProvider } from '../../types/llm-provider.js' import type { MemoryStoreAdapter } from '../types/adapter.js' import type { typegraphIdentity } from '../../types/identity.js' import type { EpisodicMemory, SemanticFact, ProceduralMemory } from '../types/index.js' +import { optionalCompactObject } from '../../utils/input.js' // ── Zod schemas for structured output ── @@ -71,8 +72,9 @@ export class ConsolidationEngine { */ async consolidate( scope: typegraphIdentity, - opts: ConsolidationOpts = {}, + rawOpts?: ConsolidationOpts | null, ): Promise { + const opts = optionalCompactObject(rawOpts, 'ConsolidationEngine.consolidate') as ConsolidationOpts const strategies = opts.strategies ?? ['episodic_to_semantic'] const result: ConsolidationResult = { factsExtracted: 0, @@ -111,8 +113,9 @@ export class ConsolidationEngine { */ async promoteEpisodicToSemantic( scope: typegraphIdentity, - opts: ConsolidationOpts = {}, + rawOpts?: ConsolidationOpts | null, ): Promise<{ factsExtracted: number; episodesConsolidated: number }> { + const opts = optionalCompactObject(rawOpts, 'ConsolidationEngine.promoteEpisodicToSemantic') as ConsolidationOpts const minAge = opts.minEpisodicAgeMs ?? 
60 * 60 * 1000 // 1 hour const now = new Date() const cutoff = new Date(now.getTime() - minAge) @@ -202,8 +205,9 @@ Respond with only valid JSON: [{"content": "...", "subject": "...", "predicate": */ async promoteToProcedural( scope: typegraphIdentity, - opts: ConsolidationOpts = {}, + rawOpts?: ConsolidationOpts | null, ): Promise<{ proceduresCreated: number }> { + const opts = optionalCompactObject(rawOpts, 'ConsolidationEngine.promoteToProcedural') as ConsolidationOpts // Find tool-trace or action episodes const allEpisodes = await this.store.list({ scope, category: 'episodic' }, 200) const actionEpisodes = allEpisodes.filter((m): m is EpisodicMemory => { diff --git a/packages/sdk/src/memory/extraction/entity-resolver.ts b/packages/sdk/src/memory/extraction/entity-resolver.ts index 25e02ff..8a78624 100644 --- a/packages/sdk/src/memory/extraction/entity-resolver.ts +++ b/packages/sdk/src/memory/extraction/entity-resolver.ts @@ -1,7 +1,7 @@ import type { EmbeddingProvider } from '../../embedding/provider.js' import type { typegraphIdentity } from '../../types/identity.js' -import type { Visibility } from '../../types/typegraph-document.js' -import type { SemanticEntity } from '../types/memory.js' +import type { Visibility } from '../../types/source.js' +import type { ExternalId, SemanticEntity } from '../types/memory.js' import type { MemoryStoreAdapter } from '../types/adapter.js' import { createTemporal } from '../temporal.js' import { generateId } from '../../utils/id.js' @@ -317,12 +317,38 @@ export class EntityResolver { scope: typegraphIdentity, description?: string, visibility?: Visibility, + externalIds: ExternalId[] = [], ): Promise<{ entity: SemanticEntity; isNew: boolean }> { - // Phase 0: In-memory cache (instant — catches all prior entities in this session) const normalizedName = normalizeForComparison(name) + + // Phase -1: Deterministic external IDs. These take precedence over all + // fuzzy/probabilistic matching so application identity graphs can guide + // resolution without relying on LLM extraction behavior.
+ if (this.store.findEntityByExternalId && externalIds.length > 0) { + let externalMatch: SemanticEntity | undefined + for (const externalId of externalIds) { + const candidate = await this.store.findEntityByExternalId(externalId, scope) + if (!candidate) continue + if (!candidateMatchesWriteScope(candidate, scope, visibility)) continue + if (!typesCompatible(entityType, candidate.entityType)) continue + if (externalMatch && externalMatch.id !== candidate.id) { + throw new Error( + `Conflicting external IDs resolve to multiple entities: ${externalMatch.id} and ${candidate.id}`, + ) + } + externalMatch = candidate + } + if (externalMatch) { + const merged = await this.merge(externalMatch, { name, entityType, aliases, description, externalIds }) + this.cacheEntity(merged) + return { entity: merged, isNew: false } + } + } + + // Phase 0: In-memory cache (instant — catches all prior entities in this session) const cached = this.nameCache.get(this.cacheKey(name, scope, visibility)) if (cached && typesCompatible(entityType, cached.entityType)) { - const merged = await this.merge(cached, { name, entityType, aliases, description }) + const merged = await this.merge(cached, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } @@ -331,7 +357,7 @@ export class EntityResolver { if (!isStrongAliasForMerge(alias, entityType, name, aliases)) continue const cachedByAlias = this.nameCache.get(this.cacheKey(alias, scope, visibility)) if (cachedByAlias && typesCompatible(entityType, cachedByAlias.entityType)) { - const merged = await this.merge(cachedByAlias, { name, entityType, aliases, description }) + const merged = await this.merge(cachedByAlias, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } } @@ -343,7 +369,7 @@ export class EntityResolver { .filter(candidate => candidateMatchesWriteScope(candidate, scope, visibility)) const aliasMatch = this.findByAlias(name, aliases, entityType, candidates) if (aliasMatch) { - const merged = await this.merge(aliasMatch, { name, entityType, aliases, description }) + const merged = await this.merge(aliasMatch, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } @@ -352,14 +378,14 @@ export class EntityResolver { for (const candidate of candidates) { if (!typesCompatible(entityType, candidate.entityType)) continue if (normalizeForComparison(candidate.name) === normalizedName) { - const merged = await this.merge(candidate, { name, entityType, aliases, description }) + const merged = await this.merge(candidate, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } for (const alias of candidate.aliases) { if (!isStrongAliasForMerge(alias, candidate.entityType, candidate.name, candidate.aliases)) continue if (normalizeForComparison(alias) === normalizedName) { - const merged = await this.merge(candidate, { name, entityType, aliases, description }) + const merged = await this.merge(candidate, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } } @@ -370,7 +396,7 @@ export class EntityResolver { // e.g., "NY Times" vs "New York Times", "J.K. 
Rowling" vs "JK Rowling" const fuzzyMatch = this.findByFuzzy(name, aliases, entityType, candidates) if (fuzzyMatch) { - const merged = await this.merge(fuzzyMatch, { name, entityType, aliases, description }) + const merged = await this.merge(fuzzyMatch, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } @@ -396,7 +422,7 @@ export class EntityResolver { const similarity = (candidate.properties._similarity as number | undefined) ?? this.cosineSimilarity(nameEmbedding, candidate.embedding ?? []) if (similarity >= this.threshold) { - const merged = await this.merge(candidate, { name, entityType, aliases, description }) + const merged = await this.merge(candidate, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } @@ -410,7 +436,7 @@ export class EntityResolver { name, entityType, description, similar, nameEmbedding, ) if (descMatch) { - const merged = await this.merge(descMatch, { name, entityType, aliases, description }) + const merged = await this.merge(descMatch, { name, entityType, aliases, description, externalIds }) this.cacheEntity(merged) return { entity: merged, isNew: false } } @@ -429,6 +455,7 @@ export class EntityResolver { name, entityType, aliases, + externalIds, properties: description ? { description } : {}, embedding: nameEmbedding, descriptionEmbedding, @@ -478,7 +505,13 @@ export class EntityResolver { */ async merge( existing: SemanticEntity, - incoming: { name: string; entityType: string; aliases: string[]; description?: string | undefined }, + incoming: { + name: string + entityType: string + aliases: string[] + description?: string | undefined + externalIds?: ExternalId[] | undefined + }, ): Promise { const existingAliases = new Set() const newAliases: string[] = [] @@ -534,10 +567,11 @@ export class EntityResolver { return { ...existing, aliases: newAliases, + externalIds: mergeExternalIds(existing.externalIds, incoming.externalIds), properties, descriptionEmbedding, - // Keep existing type unless it's generic and incoming is more specific - entityType: (existing.entityType === 'entity' || existing.entityType === 'other') + // Keep existing type unless it is a generic/fallback type and incoming is more specific. + entityType: (existing.entityType === 'entity' || existing.entityType === 'other' || existing.entityType === 'concept') ? incoming.entityType : existing.entityType, } @@ -665,6 +699,55 @@ function normalizeForComparison(s: string): string { .replace(/[^a-z0-9]/g, '') } +function externalIdKey(externalId: ExternalId): string { + const type = externalId.type.trim().toLowerCase() + const encoding = externalId.encoding ?? 'none' + return [ + externalId.identityType, + type, + normalizeExternalIdValue(externalId.id, type, encoding), + encoding, + ].join('|') +} + +function normalizeExternalIdValue(id: string, type: string, encoding: ExternalId['encoding']): string { + const trimmed = id.trim() + if (encoding === 'sha256') return trimmed.toLowerCase() + if (type === 'email' || type.endsWith('_email') || type === 'github_handle') return trimmed.toLowerCase() + if (type === 'phone') return trimmed.replace(/[^\d+]/g, '') + return trimmed +} + +function mergeExternalIds( + existing: ExternalId[] | undefined, + incoming: ExternalId[] | undefined, +): ExternalId[] | undefined { + const merged = new Map() + for (const externalId of existing ?? 
[]) { + if (!externalId.id.trim() || !externalId.type.trim()) continue + const type = externalId.type.trim().toLowerCase() + const encoding = externalId.encoding ?? 'none' + merged.set(externalIdKey(externalId), { + ...externalId, + id: normalizeExternalIdValue(externalId.id, type, encoding), + type, + encoding, + }) + } + for (const externalId of incoming ?? []) { + if (!externalId.id.trim() || !externalId.type.trim()) continue + const type = externalId.type.trim().toLowerCase() + const encoding = externalId.encoding ?? 'none' + merged.set(externalIdKey(externalId), { + ...externalId, + id: normalizeExternalIdValue(externalId.id, type, encoding), + type, + encoding, + }) + } + return merged.size > 0 ? [...merged.values()] : undefined +} + function candidateMatchesWriteScope( candidate: SemanticEntity, scope: typegraphIdentity, @@ -859,8 +942,8 @@ function isLowValueEntityDescription(text: string): boolean { 'through supported', 'creator of the task', 'creator of the record', - 'creator of the document', - 'creator in the document', + 'creator of the source', + 'creator in the source', 'identified as the creator', 'designated as the creator', 'person identified as the creator', @@ -945,10 +1028,10 @@ function trigramJaccard(a: string, b: string): number { /** * Check if two entity types are compatible for merging. * Prevents merging a person with a location, etc. - * Generic types ("entity", "other", "") are compatible with anything. + * Generic/fallback types are compatible with anything. */ function typesCompatible(a: string, b: string): boolean { - const GENERIC_TYPES = new Set(['entity', 'other', '']) + const GENERIC_TYPES = new Set(['entity', 'other', 'concept', '']) return a === b || GENERIC_TYPES.has(a) || GENERIC_TYPES.has(b) } diff --git a/packages/sdk/src/memory/extraction/invalidation.ts b/packages/sdk/src/memory/extraction/invalidation.ts index 7eb2ae3..d9b2cea 100644 --- a/packages/sdk/src/memory/extraction/invalidation.ts +++ b/packages/sdk/src/memory/extraction/invalidation.ts @@ -50,9 +50,10 @@ export class InvalidationEngine { async checkContradictions( newFact: SemanticFact, scope: typegraphIdentity, + opts?: { memoryIds?: string[] | undefined } | undefined, ): Promise { // Search for semantically similar existing facts - const existingFacts = await this.findRelatedFacts(newFact, scope) + const existingFacts = await this.findRelatedFacts(newFact, scope, opts) if (existingFacts.length === 0) return [] const contradictions: Contradiction[] = [] @@ -114,13 +115,17 @@ export class InvalidationEngine { private async findRelatedFacts( newFact: SemanticFact, scope: typegraphIdentity, + opts?: { memoryIds?: string[] | undefined } | undefined, ): Promise { + if (opts?.memoryIds && opts.memoryIds.length === 0) return [] + // Search by embedding similarity if available if (newFact.embedding) { const results = await this.store.search(newFact.embedding, { count: 10, filter: { scope, + ...(opts?.memoryIds ? { ids: opts.memoryIds } : {}), category: 'semantic', }, }) @@ -131,7 +136,11 @@ export class InvalidationEngine { // Fall back to listing facts in scope const results = await this.store.list( - { scope, category: 'semantic' }, + { + scope, + ...(opts?.memoryIds ? 
{ ids: opts.memoryIds } : {}), + category: 'semantic', + }, 20, ) return results.filter( diff --git a/packages/sdk/src/memory/extraction/predicate-normalizer.ts b/packages/sdk/src/memory/extraction/predicate-normalizer.ts index 9c78a23..bcac40a 100644 --- a/packages/sdk/src/memory/extraction/predicate-normalizer.ts +++ b/packages/sdk/src/memory/extraction/predicate-normalizer.ts @@ -1,5 +1,11 @@ import type { EmbeddingProvider } from '../../embedding/provider.js' -import { ALL_PREDICATES } from '../../index-engine/ontology.js' +import { + ALL_PREDICATES, + isSymmetricPredicate, + normalizePredicateWithDirection, + sanitizePredicate, + type PredicateNormalization, +} from '../../index-engine/ontology.js' export interface PredicateNormalizationResult { original: string @@ -7,336 +13,54 @@ export interface PredicateNormalizationResult { valid: boolean swapSubjectObject: boolean symmetric: boolean + temporalStatus?: PredicateNormalization['temporalStatus'] | undefined } -const SYMMETRIC_PREDICATES = new Set([ - 'ALLIED_WITH', - 'BORDERS', - 'COLLABORATED_WITH', - 'COMPARED_WITH', - 'COMPATIBLE_WITH', - 'COMPETES_WITH', - 'CONNECTED_TO', - 'CORRESPONDS_WITH', - 'EQUIVALENT_TO', - 'MARRIED', - 'MERGED_WITH', - 'NEAR', - 'PARTNERED_WITH', - 'RIVALED', - 'SIBLING_OF', -]) +export { isSymmetricPredicate } /** - * Synonym groups: first element is canonical form, rest map to it. - * Tense variants are SEPARATE groups — they carry temporal meaning - * (e.g. PLAYS_FOR = current, PLAYED_FOR = past). + * Thin compatibility wrapper over the central ontology registry. * - * Groups are organized to match the ontology in packages/core/src/index-engine/ontology.ts. - * Every canonical predicate in the ontology should have a synonym group here. - */ -const SYNONYM_GROUPS: readonly string[][] = [ - // ── Person → Person ── - ['MARRIED', 'MARRIED_TO', 'WED', 'SPOUSE_OF', 'HUSBAND_OF', 'WIFE_OF'], - ['DIVORCED', 'DIVORCED_FROM', 'SEPARATED_FROM'], - ['CHILD_OF', 'SON_OF', 'DAUGHTER_OF', 'OFFSPRING_OF'], - ['PARENT_OF', 'FATHER_OF', 'MOTHER_OF'], - ['SIBLING_OF', 'BROTHER_OF', 'SISTER_OF'], - ['MENTORED', 'MENTORED_BY', 'TRAINED', 'COACHED'], - ['SUCCEEDED', 'SUCCEEDED_BY', 'REPLACED'], - ['PRECEDED', 'CAME_BEFORE', 'PRIOR_TO'], - ['INFLUENCED', 'INSPIRED'], - ['RIVALED', 'RIVAL_OF', 'COMPETED_AGAINST'], - ['OPPOSED', 'FOUGHT_AGAINST', 'RESISTED', 'CRITICIZED', 'CHALLENGED'], - ['ALLIED_WITH', 'ALLIED_TO', 'ALIGNED_WITH'], - ['COLLABORATED_WITH', 'COOPERATED_WITH', 'WORKED_WITH'], - ['CORRESPONDS_WITH', 'WROTE_LETTER_TO', 'COMMUNICATED_WITH'], - ['BEFRIENDED', 'FRIEND_OF', 'FRIENDS_WITH'], - ['EMPLOYED', 'HIRED', 'HIRED_BY'], - ['REPORTED_TO', 'SUBORDINATE_OF', 'UNDER'], - ['SUPERVISED', 'MANAGED'], - ['KILLED', 'MURDERED', 'ASSASSINATED'], - ['BETRAYED', 'BETRAYED_BY', 'DECEIVED'], - ['RESCUED', 'SAVED', 'LIBERATED'], - ['SERVED', 'SERVED_UNDER', 'IN_SERVICE_OF'], - - // ── Person → Organization ── - ['WORKS_FOR', 'EMPLOYED_AT', 'WORKS_AT'], - ['WORKED_FOR', 'WORKED_AT'], - ['FOUNDED', 'CO_FOUNDED', 'ESTABLISHED'], - ['LEADS', 'LEADS_AT', 'HEADS', 'DIRECTS', 'CHAIRS'], - ['LED', 'LED_AT', 'HEADED', 'CHAIRED'], - ['ADVISES', 'ADVISES_AT', 'CONSULTS_FOR'], - ['ADVISED', 'ADVISED_AT', 'CONSULTED_FOR'], - ['MEMBER_OF', 'BELONGS_TO', 'JOINED', 'AFFILIATED_WITH'], - ['LEFT', 'DEPARTED', 'RESIGNED_FROM', 'QUIT'], - ['EXPELLED_FROM', 'DISMISSED_FROM', 'FIRED_FROM', 'REMOVED_FROM'], - ['INVESTED_IN', 'INVESTOR_IN', 'BACKED'], - ['DONATED_TO', 'CONTRIBUTED_TO', 'GAVE_TO'], - ['REPRESENTS', 'REPRESENTATIVE_OF', 
'SPEAKS_FOR'], - ['REPRESENTED', 'REPRESENTED_BY'], - - // ── Person → Location ── - ['BORN_IN', 'BORN_AT', 'NATIVE_OF', 'BIRTHPLACE'], - ['DIED_IN', 'DIED_AT', 'BURIED_IN'], - ['LIVES_IN', 'RESIDES_IN', 'DWELLING_IN'], - ['LIVED_IN', 'RESIDED_IN', 'SETTLED_IN', 'DWELT_IN'], - ['TRAVELED_TO', 'WENT_TO', 'JOURNEYED_TO'], - ['VISITED', 'BEEN_TO', 'STOPPED_AT'], - ['MOVED_TO', 'RELOCATED_TO', 'MIGRATED_TO'], - ['EXILED_TO', 'BANISHED_TO', 'DEPORTED_TO'], - ['GOVERNED', 'ADMINISTERED', 'OVERSAW'], - ['RULED', 'REIGNED_OVER', 'CONTROLLED'], - ['CONQUERED', 'CAPTURED', 'SEIZED'], - ['DEFENDED', 'PROTECTED', 'GUARDED'], - ['IMPRISONED_IN', 'JAILED_IN', 'DETAINED_IN', 'HELD_IN'], - ['ESCAPED_FROM', 'FLED', 'FLED_FROM'], - - // ── Person → Work of Art / Product ── - ['WROTE', 'AUTHORED', 'COMPOSED', 'PENNED'], - ['DIRECTED', 'HELMED'], - ['ILLUSTRATED', 'DREW'], - ['DESIGNED'], - ['INVENTED'], - ['PERFORMED_IN', 'APPEARED_IN', 'STARRED_IN', 'ACTED_IN'], - ['NARRATED', 'VOICED'], - ['EDITED', 'REVISED'], - ['TRANSLATED'], - ['REVIEWED', 'CRITIQUED'], - ['COMMISSIONED', 'ORDERED'], - ['DEDICATED_TO', 'IN_HONOR_OF'], - - // ── Person → Concept / Event ── - ['WORKS_AS', 'IS_A', 'IS_AN', 'JOB_IS', 'OCCUPATION_IS', 'ROLE_IS', 'TITLE_IS', 'WORKS_IN_ROLE', 'WORKS_AS_A'], - ['WORKED_AS', 'WAS_A', 'WAS_AN', 'WORKED_IN_ROLE', 'WORKED_AS_A'], - ['HELD_ROLE', 'HELD_POSITION', 'SERVED_AS', 'SERVES_AS', 'HELD_TITLE', 'BY_PROFESSION'], - ['PRACTICED_AS', 'PRACTISED_AS'], - ['STUDIED', 'STUDIED_AT', 'EDUCATED_AT', 'ENROLLED_IN'], - ['TAUGHT', 'TAUGHT_AT', 'INSTRUCTED', 'LECTURED'], - ['DISCOVERED', 'FOUND', 'UNCOVERED', 'IDENTIFIED'], - ['DEVELOPED', 'BUILT', 'ENGINEERED'], - ['PROPOSED', 'SUGGESTED', 'PUT_FORWARD'], - ['ADVOCATED_FOR', 'CHAMPIONED', 'PROMOTED'], - ['PARTICIPATED_IN', 'TOOK_PART_IN', 'ENGAGED_IN', 'INVOLVED_IN'], - ['WITNESSED', 'SAW', 'OBSERVED'], - ['SURVIVED', 'LIVED_THROUGH', 'ENDURED'], - ['SPOKE_AT', 'PRESENTED_AT', 'ADDRESSED'], - ['ATTENDED', 'PRESENT_AT'], - ['ORGANIZED', 'ARRANGED', 'COORDINATED'], - ['AWARDED', 'RECEIVED', 'HONORED_WITH', 'GRANTED'], - ['NOMINATED', 'NOMINATED_FOR', 'SHORTLISTED'], - ['DIAGNOSED', 'DIAGNOSED_WITH', 'AFFLICTED_BY', 'SUFFERED_FROM'], - ['TREATED'], - - // ── Organization → Organization ── - ['ACQUIRED', 'BOUGHT', 'PURCHASED'], - ['MERGED_WITH', 'MERGED_INTO'], - ['SPUN_OFF', 'SPUN_OFF_FROM', 'DIVESTED'], - ['PARTNERED_WITH', 'PARTNER_OF', 'IN_PARTNERSHIP_WITH'], - ['COMPETES_WITH', 'COMPETITOR_OF', 'RIVALS'], - ['SUED', 'LITIGATED_AGAINST'], - ['REGULATED_BY', 'OVERSEEN_BY'], - ['SANCTIONED', 'PENALIZED'], - ['FUNDED', 'FINANCED'], - ['SUBSIDIZED'], - ['SUPPLIED', 'VENDOR_OF', 'SUPPLIER_TO'], - - // ── Organization → Location ── - ['HEADQUARTERED_IN', 'BASED_IN', 'HQ_IN'], - ['LOCATED_IN', 'SITUATED_IN'], - ['OPERATES_IN', 'ACTIVE_IN', 'PRESENT_IN'], - ['INCORPORATED_IN', 'REGISTERED_IN', 'CHARTERED_IN'], - ['EXPANDED_TO', 'ENTERED'], - ['WITHDREW_FROM', 'EXITED', 'PULLED_OUT_OF'], - - // ── Organization → Product ── - ['PRODUCED', 'MADE', 'MANUFACTURED'], - ['PUBLISHED', 'PUBLISHED_IN', 'RELEASED', 'ISSUED'], - ['DISTRIBUTES', 'DISTRIBUTES_BY', 'SELLS'], - ['LICENSES', 'LICENSED_BY', 'LICENSED_TO'], - ['LAUNCHED', 'INTRODUCED', 'UNVEILED', 'DEBUTED'], - ['DISCONTINUED', 'ENDED', 'RETIRED'], - - // ── Location → Location ── - ['BORDERS', 'BORDERS_ON', 'ADJACENT_TO'], - ['CONTAINS', 'INCLUDES', 'ENCOMPASSES'], - ['PART_OF', 'WITHIN'], - ['CAPITAL_OF', 'CAPITAL_CITY_OF'], - ['NEAR', 'CLOSE_TO', 'NEARBY'], - - // ── Concept → Concept ── - ['DERIVES_FROM', 
'DERIVED_FROM', 'BASED_ON', 'ORIGINATES_FROM'], - ['EXTENDS', 'BUILDS_ON', 'EXPANDS'], - ['CONTRADICTS', 'CONFLICTS_WITH', 'OPPOSES'], - ['SUPERSEDES', 'SUPPLANTS'], - ['EQUIVALENT_TO', 'SAME_AS', 'IDENTICAL_TO'], - ['INFLUENCES', 'AFFECTS', 'IMPACTS'], - ['APPLIED_TO', 'USED_IN', 'UTILIZED_IN'], - ['ENABLES', 'FACILITATES'], - - // ── Event relations ── - ['OCCURRED_IN', 'TOOK_PLACE_IN', 'HAPPENED_IN'], - ['OCCURRED_AT', 'TOOK_PLACE_AT', 'HAPPENED_AT'], - ['CAUSED', 'LED_TO', 'RESULTED_IN', 'TRIGGERED'], - ['FOLLOWED', 'CAME_AFTER'], - - // ── Technology / Law ── - ['IMPLEMENTS', 'REALIZES'], - ['REQUIRES', 'DEPENDS_ON', 'NEEDS'], - ['COMPATIBLE_WITH', 'WORKS_WITH', 'INTEROPERABLE_WITH'], - ['REPLACES'], - ['DEPRECATED_BY', 'OBSOLETED_BY'], - ['GOVERNS', 'CONTROLS', 'OVERSEES'], - ['REGULATES'], - ['PROHIBITS', 'BANS', 'FORBIDS'], - ['PERMITS', 'ALLOWS', 'AUTHORIZES'], - ['ENFORCED_BY', 'ENFORCED', 'POLICED_BY'], - ['AMENDED_BY', 'MODIFIED_BY', 'REVISED_BY'], - ['REPEALED', 'REVOKED', 'ANNULLED', 'RESCINDED'], - - // ── General ── - ['CREATED', 'CONSTRUCTED', 'FABRICATED'], - ['DESTROYED', 'DEMOLISHED', 'RAZED', 'OBLITERATED'], - ['SUPPORTED', 'ENDORSED'], - ['NAMED_AFTER', 'NAMED_FOR', 'EPONYMOUS_WITH'], - ['KNOWN_AS', 'ALSO_CALLED', 'ALIAS', 'AKA'], - ['SYMBOLIZES', 'STANDS_FOR', 'EMBODIES'], - ['DESCRIBED', 'DESCRIBES', 'DEPICTED', 'PORTRAYED', 'CHARACTERIZED'], - ['COMPARED_WITH', 'COMPARED_TO', 'LIKENED_TO', 'CONTRASTED_WITH'], - ['FOUGHT_IN', 'SERVED_IN', 'BATTLED_IN'], - ['SIGNED', 'SIGNED_WITH'], - ['OWNS', 'OWNER_OF', 'POSSESSED'], - - // ── Announcement / Reporting (kept from original) ── - ['ANNOUNCED', 'DECLARED', 'PROCLAIMED', 'STATED'], - ['REPORTED', 'DOCUMENTED', 'RECORDED', 'CHRONICLED'], -] - -const INVERSE_SYNONYMS = new Map([ - ['KILLED_BY', 'KILLED'], - ['SLAIN_BY', 'KILLED'], - ['MURDERED_BY', 'KILLED'], - ['ASSASSINATED_BY', 'KILLED'], - ['BETRAYED_BY', 'BETRAYED'], - ['SUCCEEDED_BY', 'SUCCEEDED'], - ['INFLUENCED_BY', 'INFLUENCED'], - ['INSPIRED_BY', 'INSPIRED'], - ['MENTORED_BY', 'MENTORED'], - ['TRAINED_BY', 'MENTORED'], - ['COACHED_BY', 'MENTORED'], - ['HIRED_BY', 'EMPLOYED'], - ['SUPERVISED_BY', 'SUPERVISED'], - ['MANAGED_BY', 'SUPERVISED'], - ['EMPLOYED_BY', 'WORKS_FOR'], - ['WAS_EMPLOYED_BY', 'WORKED_FOR'], - ['FOUNDED_BY', 'FOUNDED'], - ['CO_FOUNDED_BY', 'CO_FOUNDED'], - ['WRITTEN_BY', 'WROTE'], - ['AUTHORED_BY', 'AUTHORED'], - ['COMPOSED_BY', 'COMPOSED'], - ['DIRECTED_BY', 'DIRECTED'], - ['ILLUSTRATED_BY', 'ILLUSTRATED'], - ['DESIGNED_BY', 'DESIGNED'], - ['INVENTED_BY', 'INVENTED'], - ['NARRATED_BY', 'NARRATED'], - ['EDITED_BY', 'EDITED'], - ['TRANSLATED_BY', 'TRANSLATED'], - ['REVIEWED_BY', 'REVIEWED'], - ['COMMISSIONED_BY', 'COMMISSIONED'], - ['TREATED_BY', 'TREATED'], - ['CURED_BY', 'TREATED'], - ['SUED_BY', 'SUED'], - ['SANCTIONED_BY', 'SANCTIONED'], - ['FUNDED_BY', 'FUNDED'], - ['FINANCED_BY', 'FUNDED'], - ['SUBSIDIZED_BY', 'SUBSIDIZED'], - ['SUPPLIED_BY', 'SUPPLIED'], - ['PUBLISHED_BY', 'PUBLISHED'], - ['DISTRIBUTES_BY', 'DISTRIBUTES'], - ['DISTRIBUTED_BY', 'DISTRIBUTES'], - ['LICENSED_BY', 'LICENSES'], - ['LICENSED_TO', 'LICENSES'], - ['IMPLEMENTS_BY', 'IMPLEMENTS'], - ['REPLACED_BY', 'REPLACES'], - ['REGULATES_BY', 'REGULATES'], - ['ENFORCED', 'ENFORCED_BY'], - ['AMENDED_BY', 'AMENDED_BY'], - ['MODIFIED_BY', 'AMENDED_BY'], - ['REVISED_BY', 'AMENDED_BY'], - ['OWNED_BY', 'OWNS'], - ['ACQUIRED_BY', 'ACQUIRED'], - ['PROPERTY_OF', 'OWNS'], - ['REPRESENTED_BY', 'REPRESENTS'], - ['SIGNED_BY', 'SIGNED'], -]) - -function sanitizePredicate(predicate: string): 
string { - return predicate - .trim() - .toUpperCase() - .replace(/[\s-]+/g, '_') - .replace(/[^A-Z0-9_]/g, '') -} - -export function isSymmetricPredicate(predicate: string): boolean { - return SYMMETRIC_PREDICATES.has(predicate) -} - -/** - * Clusters semantically equivalent predicates into canonical forms. - * - * Without normalization, predicates like PLAYS_FOR, IS_A_PLAYER_FOR, PLAYED_FOR - * are treated as distinct relation types, fragmenting graph traversal paths. - * - * Resolution order: - * 1. Exact canonical match (O(1)) - * 2. Static synonym table (O(1)) - * 3. Inverse synonym table with subject/object swap metadata - * 4. Ontology validation; unknown predicates are rejected + * The registry owns canonical predicates, aliases, inverse direction, symmetry, + * and temporal alias metadata. This class preserves the public API used by the + * graph bridge and tests while avoiding a second predicate table. */ export class PredicateNormalizer { private readonly canonicalPredicates = new Set() - private readonly synonymMap = new Map() - private readonly inverseSynonymMap = new Map() + private readonly extraSynonymMap = new Map() constructor(_embedding: EmbeddingProvider, _threshold = 0.85, extraSynonyms?: readonly string[][]) { - for (const group of [...SYNONYM_GROUPS, ...(extraSynonyms ?? [])]) { + for (const group of extraSynonyms ?? []) { const canonical = sanitizePredicate(group[0]!) for (const synonym of group) { - this.synonymMap.set(sanitizePredicate(synonym), canonical) + this.extraSynonymMap.set(sanitizePredicate(synonym), canonical) } } - for (const [synonym, canonical] of INVERSE_SYNONYMS) { - this.inverseSynonymMap.set(sanitizePredicate(synonym), sanitizePredicate(canonical)) - } } - /** - * Normalize a predicate to its canonical form. - */ async normalize(predicate: string): Promise { return this.normalizeWithDirection(predicate).predicate } normalizeWithDirection(predicate: string): PredicateNormalizationResult { const original = sanitizePredicate(predicate) - const direct = this.synonymMap.get(original) ?? original - const inverse = this.inverseSynonymMap.get(original) - const normalized = inverse ?? direct - const valid = (ALL_PREDICATES as ReadonlySet).has(normalized) - if (valid) this.canonicalPredicates.add(normalized) - - return { - original, - predicate: normalized, - valid, - swapSubjectObject: !!inverse, - symmetric: isSymmetricPredicate(normalized), - } + const extra = this.extraSynonymMap.get(original) + const normalized = extra + ? { + original, + predicate: extra, + valid: (ALL_PREDICATES as ReadonlySet).has(extra), + swapSubjectObject: false, + symmetric: isSymmetricPredicate(extra), + } + : normalizePredicateWithDirection(original) + + if (normalized.valid) this.canonicalPredicates.add(normalized.predicate) + return normalized } - /** Number of canonical predicates registered. 
*/ get size(): number { return this.canonicalPredicates.size } } + diff --git a/packages/sdk/src/memory/index.ts b/packages/sdk/src/memory/index.ts index 2936c5d..7c9f87f 100644 --- a/packages/sdk/src/memory/index.ts +++ b/packages/sdk/src/memory/index.ts @@ -4,13 +4,18 @@ export type { MemoryCategory, MemoryStatus, TemporalRecord, + ExternalId, + ExternalIdIdentityType, + ExternalIdEncoding, MemoryRecord, EpisodicMemory, SemanticEntity, EntityMentionType, SemanticEntityMention, - SemanticPassageNode, - SemanticPassageEntityEdge, + SemanticGraphNodeType, + SemanticGraphEdge, + SemanticEntityChunkEdge, + SemanticChunkRecord, SemanticEdge, SemanticFactRecord, SemanticFact, diff --git a/packages/sdk/src/memory/memory-bridge.ts b/packages/sdk/src/memory/memory-bridge.ts index 62ab695..e10d563 100644 --- a/packages/sdk/src/memory/memory-bridge.ts +++ b/packages/sdk/src/memory/memory-bridge.ts @@ -19,18 +19,20 @@ import type { MemoryStoreAdapter } from './types/adapter.js' import type { ConversationMessage } from './extraction/extractor.js' import { TypegraphMemory } from './typegraph-memory.js' import { scopeKey } from './types/scope.js' +import { optionalCompactObject, withDefaultTenant } from '../utils/input.js' /** Extract typegraphIdentity fields from an opts bag. */ -function identityFrom(opts: typegraphIdentity): typegraphIdentity { +function identityFrom(opts: typegraphIdentity | null | undefined, defaults?: typegraphIdentity): typegraphIdentity { + const merged = { ...(defaults ?? {}), ...optionalCompactObject(opts, 'memory.identity', 'opts') } return { - tenantId: opts.tenantId, - groupId: opts.groupId, - userId: opts.userId, - agentId: opts.agentId, - conversationId: opts.conversationId, - agentName: opts.agentName, - agentDescription: opts.agentDescription, - agentVersion: opts.agentVersion, + tenantId: merged.tenantId, + groupId: merged.groupId, + userId: merged.userId, + agentId: merged.agentId, + conversationId: merged.conversationId, + agentName: merged.agentName, + agentDescription: merged.agentDescription, + agentVersion: merged.agentVersion, } } @@ -72,78 +74,97 @@ export function createMemoryBridge(config: CreateMemoryBridgeConfig): MemoryBrid return mem } - async function remember(content: string, opts: RememberOpts): Promise { - const mem = getMemory(identityFrom(opts)) + async function remember(content: string, opts?: RememberOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'remember') as RememberOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) return mem.remember(content, { - category: (opts.category as 'episodic' | 'semantic' | 'procedural' | undefined) ?? 'semantic', - importance: opts.importance, - metadata: opts.metadata, - traceId: opts.traceId, - spanId: opts.spanId, + category: (normalizedOpts.category as 'episodic' | 'semantic' | 'procedural' | undefined) ?? 'semantic', + importance: normalizedOpts.importance, + metadata: normalizedOpts.metadata, + subject: normalizedOpts.subject, + relatedEntities: normalizedOpts.relatedEntities, + visibility: normalizedOpts.visibility, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, }) as unknown as Promise } - async function forget(id: string, opts: ForgetOpts): Promise { - await memoryStore.invalidate(id) - // Note: MemoryBridge.forget goes direct to store so no TypegraphMemory.emit fires here. - // The telemetry arg is accepted for future symmetry / external event sinks. 
- void opts + async function forget(id: string, opts?: ForgetOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'forget') as ForgetOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) + await mem.forget(id, { traceId: normalizedOpts.traceId, spanId: normalizedOpts.spanId }) } - async function correct(correction: string, opts: CorrectOpts) { - const mem = getMemory(identityFrom(opts)) - return mem.correct(correction, { traceId: opts.traceId, spanId: opts.spanId }) + async function correct(correction: string, opts?: CorrectOpts | null) { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'correct') as CorrectOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) + return mem.correct(correction, { + subject: normalizedOpts.subject, + relatedEntities: normalizedOpts.relatedEntities, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, + }) } async function addConversationTurn( messages: Array<{ role: string; content: string; timestamp?: Date }>, - opts: AddConversationTurnOpts, + opts?: AddConversationTurnOpts | null, ): Promise { - const mem = getMemory(identityFrom(opts)) - return mem.addConversationTurn(messages as ConversationMessage[], opts.conversationId, { - traceId: opts.traceId, - spanId: opts.spanId, + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'addConversationTurn') as AddConversationTurnOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) + return mem.addConversationTurn(messages as ConversationMessage[], { + conversationId: normalizedOpts.conversationId, + subject: normalizedOpts.subject, + relatedEntities: normalizedOpts.relatedEntities, + visibility: normalizedOpts.visibility, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, }) as unknown as Promise } function recall(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - function recall(query: string, opts: RecallOpts): Promise - function recall(query: string, opts: RecallOpts): Promise { - const mem = getMemory(identityFrom(opts)) + function recall(query: string, opts?: RecallOpts | null): Promise + function recall(query: string, opts?: RecallOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'recall') as RecallOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) const internalOpts = { - limit: opts.limit, - types: opts.types as ('episodic' | 'semantic' | 'procedural')[] | undefined, - asOf: opts.temporalAt, - includeInvalidated: opts.includeInvalidated, - format: opts.format, - traceId: opts.traceId, - spanId: opts.spanId, + limit: normalizedOpts.limit, + types: normalizedOpts.types as ('episodic' | 'semantic' | 'procedural')[] | undefined, + asOf: normalizedOpts.temporalAt, + includeInvalidated: normalizedOpts.includeInvalidated, + entityScope: normalizedOpts.entityScope, + format: normalizedOpts.format, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, } - return opts.format + return normalizedOpts.format ? 
mem.recall(query, internalOpts as typeof internalOpts & { format: 'xml' | 'markdown' | 'plain' }) : mem.recall(query, internalOpts) as unknown as Promise } function recallHybrid(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - function recallHybrid(query: string, opts: RecallOpts): Promise - function recallHybrid(query: string, opts: RecallOpts): Promise { - const mem = getMemory(identityFrom(opts)) + function recallHybrid(query: string, opts?: RecallOpts | null): Promise + function recallHybrid(query: string, opts?: RecallOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'recallHybrid') as RecallOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) const internalOpts = { - limit: opts.limit, - types: opts.types as ('episodic' | 'semantic' | 'procedural')[] | undefined, - asOf: opts.temporalAt, - includeInvalidated: opts.includeInvalidated, - format: opts.format, - traceId: opts.traceId, - spanId: opts.spanId, + limit: normalizedOpts.limit, + types: normalizedOpts.types as ('episodic' | 'semantic' | 'procedural')[] | undefined, + asOf: normalizedOpts.temporalAt, + includeInvalidated: normalizedOpts.includeInvalidated, + entityScope: normalizedOpts.entityScope, + format: normalizedOpts.format, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, } - return opts.format + return normalizedOpts.format ? mem.recallHybrid(query, internalOpts as typeof internalOpts & { format: 'xml' | 'markdown' | 'plain' }) : mem.recallHybrid(query, internalOpts) as unknown as Promise } - async function healthCheck(opts?: HealthCheckOpts): Promise { - const mem = getMemory(opts ? identityFrom(opts) : {}) + async function healthCheck(opts?: HealthCheckOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, config.scope?.tenantId, 'healthCheck') as HealthCheckOpts + const mem = getMemory(identityFrom(normalizedOpts, config.scope)) return mem.healthCheck() as unknown as Promise } diff --git a/packages/sdk/src/memory/typegraph-memory.ts b/packages/sdk/src/memory/typegraph-memory.ts index aa05cc5..1577a1a 100644 --- a/packages/sdk/src/memory/typegraph-memory.ts +++ b/packages/sdk/src/memory/typegraph-memory.ts @@ -5,17 +5,26 @@ import type { typegraphIdentity } from '../types/identity.js' import type { MemoryRecord, MemoryCategory, + SemanticEntity, SemanticFact, EpisodicMemory, ProceduralMemory, + SemanticGraphEdge, } from './types/memory.js' +import type { MemorySubject } from '../types/graph-bridge.js' +import type { QueryEntityScope } from '../types/query.js' +import type { Visibility } from '../types/source.js' import type { LLMProvider } from './extraction/llm-provider.js' import type { ExtractionResult, ConversationMessage } from './extraction/extractor.js' +import { ConfigError } from '../types/errors.js' import { MemoryExtractor } from './extraction/extractor.js' import { InvalidationEngine } from './extraction/invalidation.js' import { decayScore, DEFAULT_DECAY_CONFIG } from './consolidation/decay.js' import { createTemporal } from './temporal.js' import { generateId } from '../utils/id.js' +import { optionalCompactObject } from '../utils/input.js' +import { DEFAULT_ENTITY_TYPE } from '../index-engine/ontology.js' +import { createHash } from 'crypto' // ── Recall option shapes ── @@ -27,12 +36,29 @@ interface RecallOptsInternal extends TelemetryOpts { asOf?: Date | undefined /** Include invalidated/expired memories. Default: false. 
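Memory subjects without a caller-supplied `entityId` get a deterministic one derived from the scope plus a canonical subject key, so repeated writes about the same person resolve to the same entity. A standalone re-derivation for illustration, using Node's `crypto` the same way as the hash above:

```ts
import { createHash } from 'crypto'

// Scope fields and the subject key are joined with the unit separator (\u001f)
// so that empty fields keep their position in the hashed string.
function stableEntityId(scopeFields: Array<string | undefined>, subjectKey: string): string {
  const scopeKey = scopeFields.map(value => value ?? '').join('\u001f')
  return `ent_${createHash('sha256').update(`${scopeKey}\u001f${subjectKey}`).digest('hex').slice(0, 32)}`
}

// The same subject in the same scope always hashes to the same ID, so repeated
// remember() calls about one person do not mint duplicate entities.
const subjectKey = 'email:none:pat@example.com' // `${type}:${encoding}:${id}`, as built above
console.log(stableEntityId(['acme', undefined, 'u_1', undefined, undefined], subjectKey))
```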
*/ includeInvalidated?: boolean | undefined + entityScope?: QueryEntityScope | undefined /** Return a formatted string instead of `MemoryRecord[]`. */ format?: RecallFormat | undefined } type RecallOptsWithFormat = RecallOptsInternal & { format: RecallFormat } +interface MemoryContextOpts extends TelemetryOpts { + subject?: MemorySubject | undefined + relatedEntities?: MemorySubject[] | undefined + visibility?: Visibility | undefined +} + +type RememberMemoryOpts = MemoryContextOpts & { + category?: MemoryCategory | undefined + importance?: number | undefined + metadata?: Record | undefined +} + +interface ConversationTurnOpts extends MemoryContextOpts { + conversationId?: string | undefined +} + // ── Memory Health Report ── export interface MemoryHealthReport { @@ -102,7 +128,7 @@ export class TypegraphMemory { targetId: string | undefined, payload: Record, durationMs?: number, - telemetry?: TelemetryOpts, + telemetry?: TelemetryOpts | null, ): void { if (!this.eventSink) return this.eventSink.emit({ @@ -118,17 +144,128 @@ export class TypegraphMemory { }) } + private stableMemoryEntityId(subject: MemorySubject): string { + const key = subject.entityId + ?? subject.externalIds?.map(id => `${id.type}:${id.encoding ?? 'none'}:${id.id}`).sort().join('|') + ?? subject.name + ?? 'memory-subject' + const scopeKey = [ + this.scope.tenantId, + this.scope.groupId, + this.scope.userId, + this.scope.agentId, + this.scope.conversationId, + ].map(value => value ?? '').join('\u001f') + return `ent_${createHash('sha256').update(`${scopeKey}\u001f${key}`).digest('hex').slice(0, 32)}` + } + + private memorySubjectEntityType(subject: MemorySubject): string { + if (subject.entityType?.trim()) return subject.entityType.trim() + return DEFAULT_ENTITY_TYPE + } + + private async resolveMemorySubject(subject: MemorySubject | undefined, visibility?: Visibility): Promise { + if (!subject) return null + if (subject.entityId && this.store.getEntity) { + const existing = await this.store.getEntity(subject.entityId, this.scope) + if (existing) return existing + } + for (const externalId of subject.externalIds ?? []) { + const existing = this.store.findEntityByExternalId + ? await this.store.findEntityByExternalId(externalId, this.scope) + : null + if (existing) return existing + } + if (!this.store.upsertEntity) return null + const name = subject.name?.trim() + || subject.externalIds?.[0]?.id + || subject.entityId + || 'Unknown entity' + const embedding = await this.embedding.embed(name) + const now = new Date() + return this.store.upsertEntity({ + id: subject.entityId ?? this.stableMemoryEntityId(subject), + name, + entityType: this.memorySubjectEntityType(subject), + aliases: subject.aliases ?? [], + externalIds: subject.externalIds, + properties: subject.properties ?? {}, + embedding, + scope: this.scope, + visibility, + temporal: { validAt: now, createdAt: now }, + }) + } + + private async resolveEntityScope(scope: QueryEntityScope | undefined): Promise { + if (!scope) return undefined + const entityIds = new Set((scope.entityIds ?? []).filter(Boolean)) + if ((scope.externalIds?.length ?? 0) > 0 && !this.store.findEntityByExternalId) { + throw new ConfigError('entityScope.externalIds requires a memory store with external ID resolution.') + } + for (const externalId of scope.externalIds ?? []) { + const entity = this.store.findEntityByExternalId + ? 
await this.store.findEntityByExternalId(externalId, this.scope) + : null + if (entity) entityIds.add(entity.id) + } + return [...entityIds] + } + + private async linkMemoryToEntities(memoryId: string, entities: SemanticEntity[], visibility?: Visibility): Promise { + if (!this.store.upsertGraphEdges || entities.length === 0) return + const now = new Date() + const edges: SemanticGraphEdge[] = entities.map(entity => ({ + id: `edge_${createHash('sha256').update(`memory:${memoryId}:ABOUT:${entity.id}`).digest('hex').slice(0, 32)}`, + sourceType: 'memory', + sourceId: memoryId, + targetType: 'entity', + targetId: entity.id, + relation: 'ABOUT', + weight: 1, + properties: {}, + scope: this.scope, + visibility, + temporal: { validAt: now, createdAt: now }, + evidence: [memoryId], + })) + await this.store.upsertGraphEdges(edges) + } + + private async memoryIdsForEntityScope(scope: QueryEntityScope | undefined): Promise { + const entityIds = await this.resolveEntityScope(scope) + if (!entityIds) return undefined + if (entityIds.length === 0) return [] + if (!this.store.getMemoryIdsForEntities) { + throw new ConfigError('entityScope requires a memory store with entity-memory association lookup.') + } + return this.store.getMemoryIdsForEntities(entityIds, this.scope) + } + + private async resolveMemoryContext(opts?: MemoryContextOpts | null): Promise<{ + entities: SemanticEntity[] + entityScope?: QueryEntityScope | undefined + memoryIds?: string[] | undefined + }> { + const subjects = [opts?.subject, ...(opts?.relatedEntities ?? [])].filter((subject): subject is MemorySubject => !!subject) + if (subjects.length === 0) return { entities: [] } + const entities = (await Promise.all(subjects.map(subject => this.resolveMemorySubject(subject, opts?.visibility)))) + .filter((entity): entity is SemanticEntity => !!entity) + const entityIds = [...new Set(entities.map(entity => entity.id))] + if (entityIds.length === 0) return { entities: [] } + const entityScope: QueryEntityScope = { entityIds } + const memoryIds = await this.memoryIdsForEntityScope(entityScope) + return { entities, entityScope, memoryIds } + } + // ── Store ── /** * Store a memory. Creates a record in the given category (default: `semantic`). * For LLM extraction of structured facts from a conversation, use `addConversationTurn()`. */ - async remember(content: string, opts?: { - category?: MemoryCategory | undefined - importance?: number | undefined - metadata?: Record | undefined - } & TelemetryOpts): Promise { + async remember(content: string, rawOpts?: RememberMemoryOpts | null): Promise { + const opts = optionalCompactObject(rawOpts, 'TypegraphMemory.remember') as RememberMemoryOpts const category = opts?.category ?? 'semantic' const embedding = await this.embedding.embed(content) const temporal = createTemporal() @@ -144,10 +281,13 @@ export class TypegraphMemory { lastAccessedAt: new Date(), metadata: opts?.metadata ?? {}, scope: this.scope, + visibility: opts?.visibility, ...temporal, } const result = await this.store.upsert(record) + const { entities } = await this.resolveMemoryContext(opts) + await this.linkMemoryToEntities(result.id, entities, opts?.visibility) this.emit('memory.write', result.id, { category, contentLength: content.length }, undefined, opts) return result } @@ -155,9 +295,11 @@ export class TypegraphMemory { /** * Forget (invalidate) a memory by ID. Preserves the record with invalidAt set. 
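Each stored memory is linked to its resolved subject entities with an `ABOUT` edge whose ID is a hash of the endpoints, which makes the link idempotent across retries. A standalone illustration of that derivation, mirroring the hash in `linkMemoryToEntities` above:

```ts
import { createHash } from 'crypto'

// The ABOUT edge ID is a pure function of (memory, relation, entity), so relinking
// the same memory to the same entity upserts the existing edge instead of adding one.
function aboutEdgeId(memoryId: string, entityId: string): string {
  return `edge_${createHash('sha256').update(`memory:${memoryId}:ABOUT:${entityId}`).digest('hex').slice(0, 32)}`
}

console.log(aboutEdgeId('mem_123', 'ent_abc') === aboutEdgeId('mem_123', 'ent_abc')) // true
```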
*/ - async forget(id: string, telemetry?: TelemetryOpts): Promise { + async forget(id: string, telemetry?: TelemetryOpts | null): Promise { + const normalizedTelemetry = optionalCompactObject(telemetry, 'TypegraphMemory.forget', 'telemetry') as TelemetryOpts await this.store.invalidate(id) - this.emit('memory.invalidate', id, {}, undefined, telemetry) + await this.store.invalidateGraphEdgesForNode?.('memory', id) + this.emit('memory.invalidate', id, {}, undefined, normalizedTelemetry) } /** @@ -168,11 +310,12 @@ export class TypegraphMemory { * machinery as `addConversationTurn`, so prior facts get invalidated * by the LLM contradiction judge rather than a brittle substring match. */ - async correct(naturalLanguageCorrection: string, telemetry?: TelemetryOpts): Promise<{ + async correct(naturalLanguageCorrection: string, rawOpts?: MemoryContextOpts | null): Promise<{ invalidated: number created: number summary: string }> { + const opts = optionalCompactObject(rawOpts, 'TypegraphMemory.correct') as MemoryContextOpts const messages: ConversationMessage[] = [ { role: 'user', content: naturalLanguageCorrection }, ] @@ -183,31 +326,35 @@ export class TypegraphMemory { correction: naturalLanguageCorrection.slice(0, 100), invalidated: 0, created: 0, - }, undefined, telemetry) + }, undefined, opts) return { invalidated: 0, created: 0, summary: 'Could not parse correction' } } let invalidated = 0 let created = 0 const syntheticEpisodeId = generateId('mem') + const context = await this.resolveMemoryContext(opts) for (const candidate of candidates) { const fact = this.extractor.candidateToFact(candidate, syntheticEpisodeId) fact.metadata = { ...fact.metadata, correctionText: naturalLanguageCorrection } fact.embedding = await this.embedding.embed(fact.content) - const contradictions = await this.invalidation.checkContradictions(fact, this.scope) + const contradictions = await this.invalidation.checkContradictions(fact, this.scope, { + memoryIds: context.memoryIds, + }) if (contradictions.length > 0) { invalidated += contradictions.length this.emit('extraction.contradiction', undefined, { factContent: fact.content.slice(0, 100), contradictionCount: contradictions.length, source: 'correct', - }, undefined, telemetry) + }, undefined, opts) await this.invalidation.resolveContradictions(contradictions) } - await this.store.upsert(fact) + const stored = await this.store.upsert(fact) + await this.linkMemoryToEntities(stored.id, context.entities, opts?.visibility) created++ } @@ -216,7 +363,7 @@ export class TypegraphMemory { correction: naturalLanguageCorrection.slice(0, 100), invalidated, created, - }, undefined, telemetry) + }, undefined, opts) return { invalidated, created, summary } } @@ -228,13 +375,16 @@ export class TypegraphMemory { * suitable for dropping into an LLM prompt. */ async recall(query: string, opts: RecallOptsWithFormat): Promise - async recall(query: string, opts?: RecallOptsInternal): Promise - async recall(query: string, opts?: RecallOptsInternal): Promise { + async recall(query: string, opts?: RecallOptsInternal | null): Promise + async recall(query: string, rawOpts?: RecallOptsInternal | null): Promise { + const opts = optionalCompactObject(rawOpts, 'TypegraphMemory.recall') as RecallOptsInternal const embedding = await this.embedding.embed(query) + const scopedMemoryIds = await this.memoryIdsForEntityScope(opts?.entityScope) const results = await this.store.search(embedding, { count: opts?.limit ?? 10, filter: { scope: this.scope, + ...(scopedMemoryIds ? 
{ ids: scopedMemoryIds } : {}), category: opts?.types, ...(opts?.includeInvalidated ? {} : { status: 'active' as const }), }, @@ -260,13 +410,16 @@ export class TypegraphMemory { } async recallHybrid(query: string, opts: RecallOptsWithFormat): Promise - async recallHybrid(query: string, opts?: RecallOptsInternal): Promise - async recallHybrid(query: string, opts?: RecallOptsInternal): Promise { + async recallHybrid(query: string, opts?: RecallOptsInternal | null): Promise + async recallHybrid(query: string, rawOpts?: RecallOptsInternal | null): Promise { + const opts = optionalCompactObject(rawOpts, 'TypegraphMemory.recallHybrid') as RecallOptsInternal const embedding = await this.embedding.embed(query) + const scopedMemoryIds = await this.memoryIdsForEntityScope(opts?.entityScope) const searchOpts = { count: opts?.limit ?? 10, filter: { scope: this.scope, + ...(scopedMemoryIds ? { ids: scopedMemoryIds } : {}), category: opts?.types, ...(opts?.includeInvalidated ? {} : { status: 'active' as const }), } as import('./types/adapter.js').MemoryFilter, @@ -300,8 +453,8 @@ export class TypegraphMemory { /** * Recall only semantic facts. */ - async recallFacts(query: string, limit: number = 10, telemetry?: TelemetryOpts): Promise { - const results = await this.recall(query, { types: ['semantic'], limit, ...telemetry }) + async recallFacts(query: string, limit: number = 10, telemetry?: TelemetryOpts | null): Promise { + const results = await this.recall(query, { types: ['semantic'], limit, ...(telemetry ?? {}) }) const facts = results.filter((r): r is SemanticFact => r.category === 'semantic') this.emit('memory.read', undefined, { query: query.slice(0, 100), resultCount: facts.length, source: 'facts' }, undefined, telemetry) return facts @@ -310,16 +463,16 @@ export class TypegraphMemory { /** * Recall only episodic memories. */ - async recallEpisodes(query: string, limit: number = 10, telemetry?: TelemetryOpts): Promise { - const results = await this.recall(query, { types: ['episodic'], limit, ...telemetry }) + async recallEpisodes(query: string, limit: number = 10, telemetry?: TelemetryOpts | null): Promise { + const results = await this.recall(query, { types: ['episodic'], limit, ...(telemetry ?? {}) }) return results.filter((r): r is EpisodicMemory => r.category === 'episodic') } /** * Recall procedural memories matching a trigger. */ - async recallProcedures(trigger: string, limit: number = 5, telemetry?: TelemetryOpts): Promise { - const results = await this.recall(trigger, { types: ['procedural'], limit, ...telemetry }) + async recallProcedures(trigger: string, limit: number = 5, telemetry?: TelemetryOpts | null): Promise { + const results = await this.recall(trigger, { types: ['procedural'], limit, ...(telemetry ?? {}) }) return results.filter((r): r is ProceduralMemory => r.category === 'procedural') } @@ -330,15 +483,24 @@ export class TypegraphMemory { */ async addConversationTurn( messages: ConversationMessage[], - conversationId?: string, - telemetry?: TelemetryOpts, + rawOpts?: ConversationTurnOpts | null, ): Promise { + const opts = optionalCompactObject(rawOpts, 'TypegraphMemory.addConversationTurn') as ConversationTurnOpts + const { conversationId } = opts + const context = await this.resolveMemoryContext(opts) // Get existing facts for conflict resolution - const existingFacts = await this.recallFacts( - messages.map(m => m.content).join(' '), - 20, - telemetry, - ) + const existingFacts = context.entityScope + ? 
(await this.recall(messages.map(m => m.content).join(' '), { + types: ['semantic'], + limit: 20, + entityScope: context.entityScope, + ...opts, + })).filter((record): record is SemanticFact => record.category === 'semantic') + : await this.recallFacts( + messages.map(m => m.content).join(' '), + 20, + opts, + ) const result = await this.extractor.processConversation( messages, @@ -350,7 +512,8 @@ export class TypegraphMemory { for (const episode of result.episodic) { episode.embedding = await this.embedding.embed(episode.content) const stored = await this.store.upsert(episode) - this.emit('memory.write', stored.id, { category: 'episodic', source: 'conversation' }, undefined, telemetry) + await this.linkMemoryToEntities(stored.id, context.entities, opts?.visibility) + this.emit('memory.write', stored.id, { category: 'episodic', source: 'conversation' }, undefined, opts) } // Store new facts and check for contradictions @@ -360,7 +523,9 @@ export class TypegraphMemory { fact.embedding = await this.embedding.embed(fact.content) // Check contradictions before storing - const contradictions = await this.invalidation.checkContradictions(fact, this.scope) + const contradictions = await this.invalidation.checkContradictions(fact, this.scope, { + memoryIds: context.memoryIds, + }) if (contradictions.length > 0) { contradictionCount += contradictions.length for (const c of contradictions) { @@ -374,12 +539,13 @@ export class TypegraphMemory { this.emit('extraction.contradiction', undefined, { factContent: fact.content.slice(0, 100), contradictionCount: contradictions.length, - }, undefined, telemetry) + }, undefined, opts) await this.invalidation.resolveContradictions(contradictions) } const stored = await this.store.upsert(fact) - this.emit('memory.write', stored.id, { category: 'semantic', source: 'conversation' }, undefined, telemetry) + await this.linkMemoryToEntities(stored.id, context.entities, opts?.visibility) + this.emit('memory.write', stored.id, { category: 'semantic', source: 'conversation' }, undefined, opts) } this.emit('extraction.facts', undefined, { @@ -387,7 +553,7 @@ export class TypegraphMemory { factCount: result.facts.length, contradictionCount, conversationId, - }, undefined, telemetry) + }, undefined, opts) // Expose contradictions on the result so callers (typegraph.ts) can fire the onContradictionDetected hook ;(result as ExtractionResult & { _contradictions?: typeof allContradictions })._contradictions = allContradictions @@ -401,7 +567,7 @@ export class TypegraphMemory { * Return a snapshot of memory system health and statistics. * Uses count methods on the adapter when available; falls back to list() sampling. 
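With a subject attached, the conversation-turn path resolves the entity once, restricts the contradiction check to memories already linked to that entity, and links the new episodes and facts back to it. A usage sketch; `memory` is assumed to be an already constructed `TypegraphMemory` instance, and the message text is illustrative:

```ts
// `memory` is an initialized TypegraphMemory wired to a store that supports
// external-ID lookup and graph edges (construction elided).
await memory.addConversationTurn(
  [
    { role: 'user', content: 'Actually, email me instead of texting for urgent notices.' },
    { role: 'assistant', content: 'Understood, switching you to email.' },
  ],
  {
    conversationId: 'conv_42',
    subject: {
      externalIds: [{ id: 'pat@example.com', type: 'email', identityType: 'user' }],
      entityType: 'person',
    },
  },
)
// Any contradicted prior fact about this subject is invalidated rather than duplicated.
```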
*/ - async healthCheck(): Promise { + async healthCheck(_opts?: TelemetryOpts | null): Promise { let totalMemories: number let activeMemories: number let invalidatedMemories: number diff --git a/packages/sdk/src/memory/types/adapter.ts b/packages/sdk/src/memory/types/adapter.ts index fcd74ec..84e2b80 100644 --- a/packages/sdk/src/memory/types/adapter.ts +++ b/packages/sdk/src/memory/types/adapter.ts @@ -1,16 +1,25 @@ import type { typegraphIdentity } from '../../types/identity.js' -import type { Visibility } from '../../types/typegraph-document.js' +import type { Visibility } from '../../types/source.js' import type { MemoryRecord, MemoryCategory, MemoryStatus, + ExternalId, SemanticEntity, SemanticEntityMention, SemanticEdge, + SemanticGraphEdge, + SemanticEntityChunkEdge, + SemanticChunkRecord, SemanticFactRecord, - SemanticPassageEntityEdge, - SemanticPassageNode, } from './memory.js' +import type { ChunkRef } from '../../types/chunk.js' +import type { + DeleteGraphEntityOpts, + DeleteGraphEntityResult, + MergeGraphEntitiesInput, + MergeGraphEntitiesResult, +} from '../../types/graph-bridge.js' // ── Memory Filtering ── @@ -24,6 +33,7 @@ export interface MemoryFilter { agentId?: string | undefined conversationId?: string | undefined visibility?: Visibility | Visibility[] | undefined + ids?: string[] | undefined category?: MemoryCategory | MemoryCategory[] | undefined /** Filter by lifecycle status */ status?: MemoryStatus | MemoryStatus[] | undefined @@ -53,10 +63,10 @@ export interface GraphBackfillPageOpts { offset?: number | undefined } -export interface PassageBackfillChunk { +export interface ChunkBackfillRecord { chunkId: string bucketId: string - documentId: string + sourceId: string chunkIndex: number embeddingModel: string content: string @@ -69,7 +79,7 @@ export interface PassageBackfillChunk { conversationId?: string | undefined } -export interface PassageMentionBackfillRow extends PassageBackfillChunk { +export interface ChunkMentionBackfillRow extends ChunkBackfillRecord { entityId: string mentionType: SemanticEntityMention['mentionType'] surfaceText?: string | undefined @@ -122,81 +132,58 @@ export interface MemoryStoreAdapter { getEntity?(id: string, scope?: typegraphIdentity): Promise getEntitiesBatch?(ids: string[], scope?: typegraphIdentity): Promise findEntities?(query: string, scope: typegraphIdentity, limit?: number): Promise + upsertEntityExternalIds?(entityId: string, externalIds: ExternalId[], scope: typegraphIdentity): Promise + findEntityByExternalId?(externalId: ExternalId, scope?: typegraphIdentity): Promise + mergeEntityReferences?(input: MergeGraphEntitiesInput): Promise + deleteEntityReferences?(entityId: string, opts?: DeleteGraphEntityOpts | null): Promise searchEntities?(embedding: number[], scope: typegraphIdentity, limit?: number): Promise searchEntitiesHybrid?(query: string, embedding: number[], scope: typegraphIdentity, limit?: number): Promise - // ── Passage + Fact Graph Storage (optional - needed for heterogeneous graph retrieval) ── - - upsertPassageNodes?(nodes: SemanticPassageNode[]): Promise + // ── Chunk + Fact Graph Storage (optional - needed for heterogeneous graph retrieval) ── - upsertPassageEntityEdges?(edges: SemanticPassageEntityEdge[]): Promise + upsertGraphEdges?(edges: SemanticGraphEdge[]): Promise upsertFactRecord?(fact: SemanticFactRecord): Promise searchFacts?(embedding: number[], scope: typegraphIdentity, limit?: number): Promise + searchFactsHybrid?(query: string, embedding: number[] | undefined, scope: typegraphIdentity, 
limit?: number): Promise - getPassageEdgesForEntities?( + getChunkEdgesForEntities?( entityIds: string[], opts?: { scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined limit?: number | undefined } - ): Promise + ): Promise - getPassagesByIds?( - passageIds: string[], + getChunksByRefs?( + chunkRefs: ChunkRef[], opts: { chunksTable: string scope?: typegraphIdentity | undefined bucketIds?: string[] | undefined } - ): Promise - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>> - - searchPassageNodes?( + ): Promise + + searchChunks?( embedding: number[], scope: typegraphIdentity, opts: { chunksTable: string bucketIds?: string[] | undefined limit?: number | undefined + chunkRefs?: ChunkRef[] | undefined } - ): Promise - similarity: number - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>> - - listPassageBackfillChunks?( + ): Promise + + listChunkBackfillRecords?( opts: GraphBackfillPageOpts & { chunksTable: string } - ): Promise + ): Promise - listPassageMentionBackfillRows?( + listChunkMentionBackfillRows?( opts: GraphBackfillPageOpts & { chunksTable: string } - ): Promise + ): Promise listSemanticEdgesForBackfill?( opts?: GraphBackfillPageOpts @@ -209,13 +196,15 @@ export interface MemoryStoreAdapter { getEdgesBatch?(entityIds: string[], direction?: 'in' | 'out' | 'both', scope?: typegraphIdentity): Promise findEdges?(sourceId: string, targetId: string, relation?: string): Promise invalidateEdge?(id: string, invalidAt?: Date): Promise + invalidateGraphEdgesForNode?(nodeType: 'entity' | 'chunk' | 'memory', nodeId: string, invalidAt?: Date): Promise + getMemoryIdsForEntities?(entityIds: string[], scope?: typegraphIdentity): Promise // ── Entity ↔ Chunk Mention Evidence ── // Records which chunks mentioned which entities during extraction. Used for - // lexical entity lookup, provenance/debugging, and passage-edge backfill. + // lexical entity lookup, provenance/debugging, and edge backfill. /** Record one or more (entity, chunk, bucket) mentions. Idempotent on - * (entityId, documentId, chunkIndex, mentionType, normalizedSurfaceText). */ + * (entityId, sourceId, chunkIndex, mentionType, normalizedSurfaceText). 
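Every method added to the adapter surface here stays optional: callers probe for the capability and either degrade quietly or fail fast with a configuration error when the feature cannot work without it, as the `typegraph-memory` changes above do. A small illustration of that pattern; the store literal and helper names are hypothetical, and a plain `Error` stands in for the SDK's `ConfigError` to keep the snippet standalone:

```ts
interface MaybeCapableStore {
  findEntityByExternalId?: (externalId: { id: string; type: string }) => Promise<{ id: string } | null>
  getMemoryIdsForEntities?: (entityIds: string[]) => Promise<string[]>
}

// Hard requirement: entity-scoped recall cannot be emulated, so fail loudly.
async function scopedMemoryIds(store: MaybeCapableStore, entityIds: string[]): Promise<string[]> {
  if (!store.getMemoryIdsForEntities) {
    throw new Error('entityScope requires a memory store with entity-memory association lookup.')
  }
  return store.getMemoryIdsForEntities(entityIds)
}

// Soft requirement: external-ID lookup simply reports "no match" when the store
// does not implement it, and resolution falls through to the next phase.
async function tryResolveByExternalId(store: MaybeCapableStore, externalId: { id: string; type: string }) {
  return store.findEntityByExternalId ? store.findEntityByExternalId(externalId) : null
}
```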
*/ upsertEntityChunkMentions?( mentions: SemanticEntityMention[] ): Promise diff --git a/packages/sdk/src/memory/types/index.ts b/packages/sdk/src/memory/types/index.ts index 0622d1d..dcecd8f 100644 --- a/packages/sdk/src/memory/types/index.ts +++ b/packages/sdk/src/memory/types/index.ts @@ -2,13 +2,18 @@ export type { MemoryCategory, MemoryStatus, TemporalRecord, + ExternalId, + ExternalIdIdentityType, + ExternalIdEncoding, MemoryRecord, EpisodicMemory, SemanticEntity, EntityMentionType, SemanticEntityMention, - SemanticPassageNode, - SemanticPassageEntityEdge, + SemanticGraphNodeType, + SemanticGraphEdge, + SemanticEntityChunkEdge, + SemanticChunkRecord, SemanticEdge, SemanticFactRecord, SemanticFact, @@ -24,8 +29,8 @@ export { export type { GraphBackfillPageOpts, - PassageBackfillChunk, - PassageMentionBackfillRow, + ChunkBackfillRecord, + ChunkMentionBackfillRow, MemoryFilter, MemorySearchOpts, MemoryStoreAdapter, diff --git a/packages/sdk/src/memory/types/memory.ts b/packages/sdk/src/memory/types/memory.ts index 7c0663c..0df79e7 100644 --- a/packages/sdk/src/memory/types/memory.ts +++ b/packages/sdk/src/memory/types/memory.ts @@ -1,5 +1,6 @@ import type { typegraphIdentity } from '../../types/identity.js' -import type { Visibility } from '../../types/typegraph-document.js' +import type { Visibility } from '../../types/source.js' +import type { ChunkRef } from '../../types/chunk.js' // ── Memory Categories ── @@ -33,6 +34,31 @@ export interface TemporalRecord { expiredAt?: Date | undefined } +// ── Deterministic External Identity ── + +export type ExternalIdIdentityType = + | 'tenant' + | 'group' + | 'user' + | 'agent' + | 'conversation' + | 'entity' + +export type ExternalIdEncoding = 'none' | 'sha256' + +export interface ExternalId { + /** External system identifier value, e.g. email, Slack user ID, GitHub handle. */ + id: string + /** Identifier namespace/type, e.g. email, slack_user_id, github_handle. */ + type: string + /** Identity level this identifier applies to. */ + identityType: ExternalIdIdentityType + /** Encoding of `id`. Defaults to `none`. */ + encoding?: ExternalIdEncoding | undefined + /** Optional system/source metadata for debugging and future conflict policy. */ + metadata?: Record | undefined +} + // ── Base Memory Record ── export interface MemoryRecord extends TemporalRecord { @@ -90,8 +116,16 @@ export interface SemanticEntity { entityType: string /** Alternative names / spellings */ aliases: string[] + /** Deterministic external identifiers used before fuzzy/probabilistic matching. */ + externalIds?: ExternalId[] | undefined /** Arbitrary typed properties */ properties: Record + /** Entity lifecycle status. Missing/undefined is treated as active for older rows. */ + status?: 'active' | 'merged' | 'invalidated' | undefined + /** Set when this entity was merged into another canonical entity. */ + mergedIntoEntityId?: string | undefined + /** Set when the entity was invalidated or purged by an entity maintenance operation. 
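Entity rows written before this change carry no `status` value, so readers treat a missing status as active and only trust `mergedIntoEntityId` when the status says the row was merged. A minimal illustration of that reading; the helper is not part of the SDK surface:

```ts
type EntityStatus = 'active' | 'merged' | 'invalidated'

// Older rows predate the status column, so a missing status reads as active.
// Only a 'merged' row should be followed through mergedIntoEntityId.
function effectiveStatus(status?: EntityStatus): EntityStatus {
  return status ?? 'active'
}

console.log(effectiveStatus())         // 'active'  (legacy row)
console.log(effectiveStatus('merged')) // 'merged'  (resolve via mergedIntoEntityId)
```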
*/ + deletedAt?: Date | undefined /** Embedding of the entity name for similarity matching */ embedding?: number[] | undefined /** Embedding of the entity description for Phase 3.5 near-miss matching */ @@ -105,11 +139,11 @@ export interface SemanticEntity { temporal: TemporalRecord } -export type EntityMentionType = 'subject' | 'object' | 'co_occurrence' | 'entity' | 'alias' +export type EntityMentionType = 'subject' | 'object' | 'co_occurrence' | 'entity' | 'alias' | 'source_subject' export interface SemanticEntityMention { entityId: string - documentId: string + sourceId: string chunkIndex: number bucketId: string mentionType: EntityMentionType @@ -120,24 +154,29 @@ export interface SemanticEntityMention { confidence?: number | undefined } -export interface SemanticPassageNode { +export type SemanticGraphNodeType = 'entity' | 'chunk' | 'memory' + +export interface SemanticGraphEdge { id: string - bucketId: string - documentId: string - chunkIndex: number - embeddingModel: string - contentHash: string - chunkId?: string | undefined - metadata: Record + sourceType: SemanticGraphNodeType + sourceId: string + targetType: SemanticGraphNodeType + targetId: string + relation: string + weight: number + properties: Record scope: typegraphIdentity visibility?: Visibility | undefined - createdAt: Date - updatedAt: Date + temporal: TemporalRecord + evidence: string[] + sourceChunkRef?: ChunkRef | undefined + targetChunkRef?: ChunkRef | undefined } -export interface SemanticPassageEntityEdge { - passageId: string +export interface SemanticEntityChunkEdge { + id: string entityId: string + chunkRef: ChunkRef weight: number mentionCount: number confidence?: number | undefined @@ -149,6 +188,18 @@ export interface SemanticPassageEntityEdge { updatedAt?: Date | undefined } +export interface SemanticChunkRecord extends ChunkRef { + content: string + totalChunks: number + metadata: Record + similarity?: number | undefined + tenantId?: string | undefined + groupId?: string | undefined + userId?: string | undefined + agentId?: string | undefined + conversationId?: string | undefined +} + export interface SemanticFactRecord { id: string edgeId: string @@ -167,6 +218,7 @@ export interface SemanticFactRecord { visibility?: Visibility | undefined createdAt: Date updatedAt: Date + invalidAt?: Date | undefined similarity?: number | undefined } @@ -175,6 +227,10 @@ export interface SemanticFactRecord { export interface SemanticEdge { id: string + sourceType?: 'entity' | undefined + sourceId?: string | undefined + targetType?: 'entity' | undefined + targetId?: string | undefined sourceEntityId: string targetEntityId: string /** Relationship type in SCREAMING_SNAKE_CASE: 'WORKS_AT', 'PREFERS', 'KNOWS' */ diff --git a/packages/sdk/src/query/assemble.ts b/packages/sdk/src/query/assemble.ts index 8b20c91..8091a84 100644 --- a/packages/sdk/src/query/assemble.ts +++ b/packages/sdk/src/query/assemble.ts @@ -92,10 +92,10 @@ function entriesBySection(results: QueryResults): Record { - const s = opts.signals ?? {} +export function resolveSignals(opts?: QueryOpts | null): Required { + const normalizedOpts = optionalCompactObject(opts, 'resolveSignals') as QueryOpts + const s = normalizedOpts.signals ?? {} return { semantic: s.semantic ?? true, keyword: s.keyword ?? false, @@ -34,10 +38,10 @@ export function signalLabel(signals: QuerySignals): string { } /** Compute composite score with eligible/ineligible distinction. - * - `undefined` value = ineligible (result can't have this score, e.g. bucket doc has no memory score). 
+ * - `undefined` value = ineligible (result can't have this score, e.g. bucket source has no memory score). * Weight is redistributed proportionally to eligible categories. * - `0` value = eligible but scored poorly. Full penalty proportional to category weight. - * This ensures bucket documents aren't penalized for lacking a memory score, + * This ensures bucket sources aren't penalized for lacking a memory score, * while memories that score 0 in keyword search are properly penalized. */ function compositeScore( components: Array<{ weight: number; value: number | undefined }> @@ -199,19 +203,20 @@ function toChunkResult(r: RetrievalCandidate, scored: ScoredCandidate): QueryChu score: scored.score, scores: scored.scores, sources: scored.sources, - document: { - id: r.documentId, + source: { + id: r.sourceId, bucketId: r.bucketId, title: r.title ?? '', url: r.url, updatedAt: r.updatedAt ?? new Date(), - status: r.documentStatus, - visibility: r.documentVisibility, + status: r.sourceStatus, + visibility: r.sourceVisibility, tenantId: r.tenantId, userId: r.userId, groupId: r.groupId, agentId: r.agentId, conversationId: r.conversationId, + subject: r.sourceSubject, }, chunk: r.chunk ?? { index: 0, total: 1 }, metadata: r.metadata, @@ -222,7 +227,7 @@ function toChunkResult(r: RetrievalCandidate, scored: ScoredCandidate): QueryChu function fallbackMemoryRecord(r: RetrievalCandidate): Omit { const now = new Date() return { - id: r.documentId, + id: r.sourceId, category: 'semantic', status: 'active', content: r.content, @@ -279,8 +284,8 @@ function partitionResults( return { chunks, - facts: signals.graph ? uniqueById(graphFacts) : [], - entities: signals.graph ? uniqueById(graphEntities) : [], + facts: graphFacts.length > 0 ? uniqueById(graphFacts) : [], + entities: graphEntities.length > 0 ? uniqueById(graphEntities) : [], memories: signals.memory ? memories : [], ...(signals.graph && graphTrace ? { graphTrace } : {}), } @@ -304,6 +309,16 @@ function resultCounts(results: QueryResults): { } } +function boostScopedCandidates(candidates: RetrievalCandidate[], chunkRefs: ChunkRef[]): void { + if (chunkRefs.length === 0) return + const scoped = new Set(chunkRefs.map(ref => `${ref.bucketId}:${ref.sourceId}:${ref.chunkIndex}`)) + for (const candidate of candidates) { + if (!scoped.has(resultIdentityKey(candidate))) continue + candidate.normalizedScore = Math.min(1, candidate.normalizedScore * 1.15 + 0.05) + candidate.rawScores.semantic = Math.min(1, (candidate.rawScores.semantic ?? candidate.normalizedScore) * 1.15 + 0.05) + } +} + export class QueryPlanner { constructor( private adapter: VectorStoreAdapter, @@ -316,17 +331,18 @@ export class QueryPlanner { private logger?: typegraphLogger, ) {} - async execute(text: string, opts: QueryOpts = {}): Promise { + async execute(text: string, opts?: QueryOpts | null): Promise { + const normalizedOpts = optionalCompactObject(opts, 'QueryPlanner.execute') as QueryOpts const startMs = Date.now() - const count = opts.count ?? 10 - const tenantId = opts.tenantId - const signals = resolveSignals(opts) - const onBucketError = opts.onBucketError ?? 'throw' + const count = normalizedOpts.count ?? 10 + const tenantId = normalizedOpts.tenantId + const signals = resolveSignals(normalizedOpts) + const onBucketError = normalizedOpts.onBucketError ?? 'throw' // Auto-weights: classify query type and use optimized weight profile. // User-provided scoreWeights always override. 
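The eligible/ineligible contract documented above for `compositeScore` is easier to see with numbers. A plausible reading of that contract, for illustration only; the actual implementation in `assemble.ts` is not shown in full here and may differ in detail:

```ts
// Ineligible components (value === undefined) drop out and their weight is
// redistributed proportionally; eligible-but-zero components keep their full penalty.
function compositeScore(components: Array<{ weight: number; value: number | undefined }>): number {
  const eligible = components.filter(c => c.value !== undefined)
  if (eligible.length === 0) return 0
  const eligibleWeight = eligible.reduce((sum, c) => sum + c.weight, 0)
  return eligible.reduce((sum, c) => sum + (c.weight / eligibleWeight) * (c.value as number), 0)
}

// A bucket source with no memory score is not punished for the missing signal:
console.log(compositeScore([
  { weight: 0.5, value: 0.8 },       // semantic
  { weight: 0.3, value: 0.6 },       // keyword
  { weight: 0.2, value: undefined }, // memory: ineligible, weight redistributed
])) // 0.725

// A memory that is eligible for keyword scoring but scores 0 takes the full hit:
console.log(compositeScore([
  { weight: 0.5, value: 0.8 },
  { weight: 0.3, value: 0 },
  { weight: 0.2, value: 0.9 },
])) // 0.58
```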
- let effectiveScoreWeights = opts.scoreWeights - if (opts.autoWeights && !effectiveScoreWeights) { + let effectiveScoreWeights = normalizedOpts.scoreWeights + if (normalizedOpts.autoWeights && !effectiveScoreWeights) { const classification = classifyQuery(text) effectiveScoreWeights = classification.weights as Partial> this.logger?.debug('Auto-weights', { queryType: classification.type, confidence: classification.confidence, weights: classification.weights }) @@ -335,8 +351,8 @@ export class QueryPlanner { this.logger?.debug('Query start', { text: text.slice(0, 100), signals, count }) // Filter to requested sources or use all - const activeBucketIds = opts.buckets - ? opts.buckets.filter(id => this.bucketIds.includes(id)) + const activeBucketIds = normalizedOpts.buckets + ? normalizedOpts.buckets.filter(id => this.bucketIds.includes(id)) : this.bucketIds // Group sources by ingest embedding model (determines table routing). @@ -364,13 +380,29 @@ export class QueryPlanner { const needsIndexedSearch = signals.semantic || signals.keyword const needsGraph = Boolean(signals.graph && this.knowledgeGraph) const needsMemory = Boolean(signals.memory && this.memory) - const identity = { tenantId: opts.tenantId, groupId: opts.groupId, userId: opts.userId, agentId: opts.agentId, conversationId: opts.conversationId } + const identity = { tenantId: normalizedOpts.tenantId, groupId: normalizedOpts.groupId, userId: normalizedOpts.userId, agentId: normalizedOpts.agentId, conversationId: normalizedOpts.conversationId } + const entityScopeMode = normalizedOpts.entityScope?.mode ?? 'filter' + let scopedEntityIds: string[] = [] + let scopedChunkRefs: ChunkRef[] = [] + const graphScopedQuery = Boolean(normalizedOpts.entityScope && (needsIndexedSearch || signals.graph)) + if (normalizedOpts.entityScope && graphScopedQuery) { + if (!this.knowledgeGraph?.resolveEntityScope) { + throw new ConfigError('entityScope requires a knowledge graph bridge with entity scope resolution.') + } + const resolved = await this.knowledgeGraph.resolveEntityScope(normalizedOpts.entityScope, identity, { + bucketIds: activeBucketIds, + limit: Math.max(count * 50, 200), + }) + scopedEntityIds = resolved.entityIds + scopedChunkRefs = resolved.chunkRefs + if (resolved.warnings) warnings.push(...resolved.warnings) + } // Timeouts (user-configurable or defaults) const timeouts = { - indexed: opts.timeouts?.indexed ?? 30_000, - graph: opts.timeouts?.graph ?? 30_000, - memory: opts.timeouts?.memory ?? 10_000, + indexed: normalizedOpts.timeouts?.indexed ?? 30_000, + graph: normalizedOpts.timeouts?.graph ?? 30_000, + memory: normalizedOpts.timeouts?.memory ?? 10_000, } // Memory-only or graph-only (no indexed search) @@ -385,7 +417,7 @@ export class QueryPlanner { try { const memoryRunner = new MemoryRunner(this.memory!) const memResults = await withTimeout( - memoryRunner.run(text, identity, count, { useKeyword: signals.keyword }), + memoryRunner.run(text, identity, count, { useKeyword: signals.keyword, entityScope: normalizedOpts.entityScope }), timeouts.memory, [] as RetrievalCandidate[] ) @@ -402,7 +434,10 @@ export class QueryPlanner { try { const graphRunner = new GraphRunner(this.knowledgeGraph!) const graphRun = await withTimeout( - graphRunner.run(text, identity, count, activeBucketIds, opts.graph), + graphRunner.run(text, identity, count, activeBucketIds, { + ...normalizedOpts.graph, + ...(normalizedOpts.entityScope ? 
{ entityScope: normalizedOpts.entityScope, resolvedEntityIds: scopedEntityIds } : {}), + }), timeouts.graph, { results: [], facts: [], entities: [] } as GraphRunResult ) @@ -448,8 +483,8 @@ export class QueryPlanner { bucket_count: activeBucketIds.length, }, durationMs, - traceId: opts.traceId, - spanId: opts.spanId, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, timestamp: new Date(), } void this.eventSink.emit(event) @@ -475,7 +510,18 @@ export class QueryPlanner { try { const results = await withTimeout( - runner.run(text, modelGroups, count, identity, opts.documentFilter, signals, opts.traceId, opts.spanId, opts.temporalAt), + runner.run( + text, + modelGroups, + count, + identity, + normalizedOpts.sourceFilter, + signals, + normalizedOpts.traceId, + normalizedOpts.spanId, + normalizedOpts.temporalAt, + normalizedOpts.entityScope && entityScopeMode === 'filter' ? scopedChunkRefs : undefined, + ), timeouts.indexed, [] as RetrievalCandidate[] ) @@ -520,6 +566,25 @@ export class QueryPlanner { let graphFacts: FactResult[] = [] let graphEntities: EntityResult[] = [] let graphTrace: GraphSearchTrace | undefined + if (normalizedOpts.entityScope && entityScopeMode === 'boost') { + boostScopedCandidates(allResults, scopedChunkRefs) + } + if (needsIndexedSearch && this.knowledgeGraph?.searchKnowledge) { + try { + const direct = await this.knowledgeGraph.searchKnowledge(text, identity, { + count, + signals, + entityScope: normalizedOpts.entityScope, + resolvedEntityIds: scopedEntityIds, + }) + graphFacts = direct.facts + graphEntities = direct.entities + } catch (err) { + const msg = `Knowledge search failed: ${err instanceof Error ? err.message : String(err)}` + warnings.push(msg) + this.logger?.warn(msg) + } + } if (needsGraph || needsMemory) { // Skip memory runner if store has no memories (avoids empty table query per query) const skipMemory = !needsMemory || (this.memory?.hasMemories ? !(await this.memory.hasMemories()) : false) @@ -527,8 +592,9 @@ export class QueryPlanner { ? Promise.resolve([] as RetrievalCandidate[]) : withTimeout( new MemoryRunner(this.memory!).run(text, identity, count, { - ...(opts.temporalAt ? { temporalAt: opts.temporalAt } : {}), - ...(opts.includeInvalidated != null ? { includeInvalidated: opts.includeInvalidated } : {}), + ...(normalizedOpts.temporalAt ? { temporalAt: normalizedOpts.temporalAt } : {}), + ...(normalizedOpts.includeInvalidated != null ? { includeInvalidated: normalizedOpts.includeInvalidated } : {}), + ...(normalizedOpts.entityScope ? { entityScope: normalizedOpts.entityScope } : {}), useKeyword: signals.keyword, }).catch((err) => { this.logger?.warn(`MemoryRunner failed: ${err instanceof Error ? err.message : err}`); warnings.push(`Memory search failed: ${err instanceof Error ? err.message : String(err)}`); return [] as RetrievalCandidate[] }), timeouts.memory, @@ -538,7 +604,10 @@ export class QueryPlanner { const graphPromise = !needsGraph ? Promise.resolve({ results: [], facts: [], entities: [] } as GraphRunResult) : withTimeout( - new GraphRunner(this.knowledgeGraph!).run(text, identity, count, activeBucketIds, opts.graph) + new GraphRunner(this.knowledgeGraph!).run(text, identity, count, activeBucketIds, { + ...normalizedOpts.graph, + ...(normalizedOpts.entityScope ? { entityScope: normalizedOpts.entityScope, resolvedEntityIds: scopedEntityIds } : {}), + }) .catch((err) => { this.logger?.warn(`GraphRunner failed: ${err instanceof Error ? err.message : err}`); warnings.push(`Graph search failed: ${err instanceof Error ? 
err.message : String(err)}`); return { results: [], facts: [], entities: [] } as GraphRunResult }), timeouts.graph, { results: [], facts: [], entities: [] } as GraphRunResult @@ -549,8 +618,8 @@ export class QueryPlanner { graphPromise, ]) const graphResults = graphRun.results - graphFacts = graphRun.facts - graphEntities = graphRun.entities + graphFacts = [...graphFacts, ...graphRun.facts] + graphEntities = [...graphEntities, ...graphRun.entities] graphTrace = graphRun.trace if (memResults.length > 0) { @@ -561,7 +630,7 @@ export class QueryPlanner { bucketTimings['__memory__'] = { mode: 'memory', resultCount: memResults.length, durationMs: Date.now() - startMs, status: 'ok' } } if (graphResults.length > 0) { - const reinforcement = opts.graphReinforcement ?? 'off' + const reinforcement = normalizedOpts.graphReinforcement ?? 'off' if (reinforcement === 'off') { // Include all graph results as-is @@ -622,8 +691,8 @@ export class QueryPlanner { bucket_count: activeBucketIds.length, }, durationMs, - traceId: opts.traceId, - spanId: opts.spanId, + traceId: normalizedOpts.traceId, + spanId: normalizedOpts.spanId, timestamp: new Date(), } void this.eventSink.emit(event) @@ -669,8 +738,8 @@ function sourcesForResult(modes: string[], rawScores: RawScores, signals: Requir } function resultIdentityKey(result: RetrievalCandidate): string { - if (result.documentId && result.chunk?.index !== undefined && result.bucketId) { - return `${result.bucketId}:${result.documentId}:${result.chunk.index}` + if (result.sourceId && result.chunk?.index !== undefined && result.bucketId) { + return `${result.bucketId}:${result.sourceId}:${result.chunk.index}` } return result.content } diff --git a/packages/sdk/src/query/runners/graph-runner.ts b/packages/sdk/src/query/runners/graph-runner.ts index 6106f1b..5a8ba81 100644 --- a/packages/sdk/src/query/runners/graph-runner.ts +++ b/packages/sdk/src/query/runners/graph-runner.ts @@ -8,7 +8,7 @@ const FACT_FILTERED_NARROW_GRAPH_OPTIONS: Requiredpassage graph - * 3. Read out ranked passage nodes directly - * 4. Return passage-backed results for merging with other runners + * 1. Build fact, entity, and chunk seeds + * 2. Traverse a heterogeneous entity<->chunk graph + * 3. Read out ranked chunks directly + * 4. Return chunk-backed results for merging with other runners */ async run( text: string, @@ -56,11 +56,11 @@ export class GraphRunner { bucketIds?: string[], options?: QueryGraphOptions, ): Promise { - if (!this.graph.searchGraphPassages) { - throw new Error('Knowledge graph bridge must implement searchGraphPassages for graph queries.') + if (!this.graph.searchGraphChunks) { + throw new Error('Knowledge graph bridge must implement searchGraphChunks for graph queries.') } - const graphResult = await this.graph.searchGraphPassages(text, identity, { + const graphResult = await this.graph.searchGraphChunks(text, identity, { ...resolveGraphSearchOptions(options), count, bucketIds, @@ -72,15 +72,18 @@ export class GraphRunner { results: graphResult.results.map(result => ({ content: result.content, bucketId: result.bucketId, - documentId: result.documentId, + sourceId: result.sourceId, rawScores: { graph: result.score }, normalizedScore: result.score, mode: 'graph' as const, metadata: { ...(result.metadata ?? {}), - passageId: result.passageId, + chunkId: result.chunkId, }, chunk: { index: result.chunkIndex, total: result.totalChunks ?? 
1 }, + title: result.metadata?.title as string | undefined, + url: result.metadata?.url as string | undefined, + sourceSubject: result.metadata?.subject as import('../../types/connector.js').SourceSubject | undefined, tenantId: result.tenantId ?? identity.tenantId, groupId: result.groupId, userId: result.userId, diff --git a/packages/sdk/src/query/runners/indexed.ts b/packages/sdk/src/query/runners/indexed.ts index 09cedcd..7b629aa 100644 --- a/packages/sdk/src/query/runners/indexed.ts +++ b/packages/sdk/src/query/runners/indexed.ts @@ -1,8 +1,9 @@ import type { VectorStoreAdapter } from '../../types/adapter.js' import type { EmbeddingProvider } from '../../embedding/provider.js' -import type { DocumentFilter } from '../../types/typegraph-document.js' +import type { SourceFilter } from '../../types/source.js' import type { typegraphIdentity } from '../../types/identity.js' import type { QuerySignals } from '../../types/query.js' +import type { ChunkRef } from '../../types/chunk.js' import type { RetrievalCandidate } from '../merger.js' import type { typegraphEvent, typegraphEventSink } from '../../types/events.js' @@ -21,11 +22,12 @@ export class IndexedRunner { sourcesByModel: Map, count: number, identity?: typegraphIdentity, - documentFilter?: DocumentFilter, + sourceFilter?: SourceFilter, signals?: Required, traceId?: string, spanId?: string, temporalAt?: Date, + chunkRefs?: ChunkRef[], ): Promise { const allResults: RetrievalCandidate[] = [] const fetchCount = count * 3 @@ -44,14 +46,16 @@ export class IndexedRunner { agentId: identity?.agentId, conversationId: identity?.conversationId, bucketIds: group.bucketIds, + chunkRefs: chunkRefs + ?.filter(ref => ref.embeddingModel == null || ref.embeddingModel === modelId), } - // Prefer searchWithDocuments if available and documentFilter is set - if (this.adapter.searchWithDocuments && documentFilter) { - const chunks = await this.adapter.searchWithDocuments(modelId, queryEmbedding, text, { + // Prefer searchWithSources if available and sourceFilter is set + if (this.adapter.searchWithSources && sourceFilter) { + const chunks = await this.adapter.searchWithSources(modelId, queryEmbedding, text, { count: fetchCount, filter, - documentFilter, + sourceFilter, temporalAt, signals: { semantic: useSemantic, keyword: useKeyword }, }) @@ -60,7 +64,7 @@ export class IndexedRunner { allResults.push({ content: chunk.content, bucketId: chunk.bucketId, - documentId: chunk.documentId, + sourceId: chunk.sourceId, rawScores: { semantic: chunk.scores.semantic, keyword: chunk.scores.keyword, @@ -73,17 +77,18 @@ export class IndexedRunner { index: chunk.chunkIndex, total: chunk.totalChunks, }, - url: chunk.document?.url ?? chunk.metadata.url as string | undefined, - title: chunk.document?.title ?? chunk.metadata.title as string | undefined, + url: chunk.source?.url ?? chunk.metadata.url as string | undefined, + title: chunk.source?.title ?? 
chunk.metadata.title as string | undefined, updatedAt: chunk.indexedAt, tenantId: chunk.tenantId, - // Carry document-level fields if available - documentStatus: chunk.document?.status, - documentVisibility: chunk.document?.visibility, - userId: chunk.document?.userId, - groupId: chunk.document?.groupId, - agentId: chunk.document?.agentId, - conversationId: chunk.document?.conversationId, + // Carry source-level fields if available + sourceStatus: chunk.source?.status, + sourceVisibility: chunk.source?.visibility, + sourceSubject: chunk.source?.subject, + userId: chunk.source?.userId, + groupId: chunk.source?.groupId, + agentId: chunk.source?.agentId, + conversationId: chunk.source?.conversationId, }) } } else { @@ -103,7 +108,7 @@ export class IndexedRunner { allResults.push({ content: chunk.content, bucketId: chunk.bucketId, - documentId: chunk.documentId, + sourceId: chunk.sourceId, rawScores: { semantic: chunk.scores.semantic, keyword: chunk.scores.keyword, @@ -144,16 +149,16 @@ export class IndexedRunner { } } - // Document-level dedup: keep highest-scoring chunk per document - const docBest = new Map() + // Source-level dedup: keep highest-scoring chunk per source + const sourceBest = new Map() for (const r of allResults) { - const existing = docBest.get(r.documentId) + const existing = sourceBest.get(r.sourceId) if (!existing || r.normalizedScore > existing.normalizedScore) { - docBest.set(r.documentId, r) + sourceBest.set(r.sourceId, r) } } - return [...docBest.values()] + return [...sourceBest.values()] .sort((a, b) => b.normalizedScore - a.normalizedScore) .slice(0, count) } diff --git a/packages/sdk/src/query/runners/memory-runner.ts b/packages/sdk/src/query/runners/memory-runner.ts index 7b0d382..8f49f06 100644 --- a/packages/sdk/src/query/runners/memory-runner.ts +++ b/packages/sdk/src/query/runners/memory-runner.ts @@ -1,5 +1,6 @@ import type { MemoryBridge } from '../../types/graph-bridge.js' import type { typegraphIdentity } from '../../types/identity.js' +import type { QueryEntityScope } from '../../types/query.js' import type { RetrievalCandidate } from '../merger.js' /** Memory composite score weights */ @@ -24,7 +25,7 @@ export class MemoryRunner { text: string, identity: typegraphIdentity, count: number, - opts?: { temporalAt?: Date | undefined; includeInvalidated?: boolean | undefined; useKeyword?: boolean | undefined }, + opts?: { temporalAt?: Date | undefined; includeInvalidated?: boolean | undefined; useKeyword?: boolean | undefined; entityScope?: QueryEntityScope | undefined }, ): Promise { // Use hybrid search when keyword signal is active and bridge supports it const useHybrid = opts?.useKeyword && this.memory.recallHybrid @@ -33,6 +34,7 @@ export class MemoryRunner { limit: count, ...(opts?.temporalAt ? { temporalAt: opts.temporalAt } : {}), ...(opts?.includeInvalidated != null ? { includeInvalidated: opts.includeInvalidated } : {}), + ...(opts?.entityScope ? { entityScope: opts.entityScope } : {}), } const memories = useHybrid @@ -60,7 +62,7 @@ export class MemoryRunner { return { content: m.content ?? '', bucketId: '__memory__', - documentId: m.id ?? `memory-${i}`, + sourceId: m.id ?? 
`memory-${i}`, rawScores: { memory: compositeMemoryScore, semantic: similarity, // Cosine similarity — same algorithm as indexed search diff --git a/packages/sdk/src/typegraph.ts b/packages/sdk/src/typegraph.ts index 89e6516..efd4d2c 100644 --- a/packages/sdk/src/typegraph.ts +++ b/packages/sdk/src/typegraph.ts @@ -4,14 +4,15 @@ import type { QueryOpts, QueryResponse } from './types/query.js' import type { IngestOptions, IndexResult } from './types/index-types.js' import type { EmbeddingProvider } from './embedding/provider.js' import { embeddingModelKey } from './embedding/provider.js' -import type { RawDocument, Chunk } from './types/connector.js' -import type { typegraphDocument, DocumentFilter, UpsertDocumentInput } from './types/typegraph-document.js' +import type { SourceInput, Chunk, SourceSubject } from './types/connector.js' +import type { typegraphSource, SourceFilter, UpsertSourceInput } from './types/source.js' import type { typegraphHooks } from './types/hooks.js' import type { LLMProvider, LLMConfig } from './types/llm-provider.js' import type { MemoryBridge, KnowledgeGraphBridge, - EntityResult, EntityDetail, EdgeResult, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchOpts, GraphSearchTrace, PassageResult, - SubgraphOpts, SubgraphResult, GraphStats, + EntityResult, EntityDetail, EdgeResult, FactResult, FactSearchOpts, GraphExploreOpts, GraphExploreResult, GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, GraphSearchOpts, GraphSearchTrace, ChunkResult, + SubgraphOpts, SubgraphResult, GraphStats, GraphEntityRef, UpsertGraphEdgeInput, UpsertGraphEntityInput, UpsertGraphFactInput, + MergeGraphEntitiesInput, MergeGraphEntitiesResult, DeleteGraphEntityOpts, DeleteGraphEntityResult, RememberOpts, ForgetOpts, CorrectOpts, AddConversationTurnOpts, RecallOpts, HealthCheckOpts, } from './types/graph-bridge.js' @@ -20,7 +21,7 @@ import type { typegraphIdentity } from './types/identity.js' import type { typegraphEventSink, typegraphEventType, TelemetryOpts } from './types/events.js' import type { PolicyStoreAdapter, CreatePolicyInput, UpdatePolicyInput, Policy, PolicyType, PolicyAction } from './types/policy.js' import type { ConversationTurnResult, MemoryHealthReport } from './types/memory.js' -import type { MemoryRecord } from './memory/types/memory.js' +import type { ExternalId, MemoryRecord } from './memory/types/memory.js' import type { typegraphLogger } from './types/logger.js' import type { Job, JobFilter, UpsertJobInput, JobStatusPatch } from './types/job.js' import type { PaginationOpts, PaginatedResult } from './types/pagination.js' @@ -36,20 +37,57 @@ import { buildContext } from './query/assemble.js' import { createCloudInstance } from './cloud/cloud-instance.js' import { NotFoundError, NotInitializedError, ConfigError } from './types/errors.js' import { generateId } from './utils/id.js' +import { assertHasMeaningfulFilter, hasMeaningfulFilter, optionalCompactObject, withDefaultTenant } from './utils/input.js' // ── Default Bucket ── export const DEFAULT_BUCKET_ID = 'bkt_default' export const DEFAULT_BUCKET_NAME = 'Default' -export const DEFAULT_BUCKET_DESCRIPTION = 'System default bucket. All ingested documents without an explicit bucket assignment are stored here. Cannot be deleted.' +export const DEFAULT_BUCKET_DESCRIPTION = 'System default bucket. All ingested sources without an explicit bucket assignment are stored here. Cannot be deleted.' 
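The memory runner surfaces its hits under a sentinel bucket ID, which keeps them distinguishable from chunks that came out of real ingested buckets when timings and candidates are merged. A trivial illustration; the `isMemoryBacked` helper is hypothetical:

```ts
const MEMORY_BUCKET_ID = '__memory__'

// Memory-backed retrieval candidates carry the sentinel bucket, real sources
// carry the bucket they were ingested into (e.g. the default bucket).
function isMemoryBacked(candidate: { bucketId: string }): boolean {
  return candidate.bucketId === MEMORY_BUCKET_ID
}

console.log(isMemoryBacked({ bucketId: '__memory__' }))  // true
console.log(isMemoryBacked({ bucketId: 'bkt_default' })) // false
```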
// Fills in defaults for optional fields the engine relies on. -export function normalizeRawDocument>(doc: RawDocument): RawDocument { +export function normalizeSourceInput>(source: SourceInput): SourceInput { + const subject = normalizeSourceSubject(source.subject) + validateSourceSubject(subject) return { - ...doc, - url: doc.url ?? undefined, - updatedAt: doc.updatedAt ?? new Date(), - metadata: doc.metadata ?? ({} as TMeta), + ...source, + url: source.url ?? undefined, + updatedAt: source.updatedAt ?? new Date(), + metadata: source.metadata ?? ({} as TMeta), + subject, + } +} + +function isExternalIdLike(value: unknown): value is ExternalId { + if (!value || typeof value !== 'object' || Array.isArray(value)) return false + const externalId = value as Partial + return typeof externalId.id === 'string' && + externalId.id.trim().length > 0 && + typeof externalId.type === 'string' && + externalId.type.trim().length > 0 +} + +function normalizeSourceSubject(subject?: SourceSubject | null): SourceSubject | undefined { + if (subject == null) return undefined + if (typeof subject !== 'object' || Array.isArray(subject)) { + throw new ConfigError('Source subject must be an object when provided.') + } + const externalIds = Array.isArray(subject.externalIds) + ? subject.externalIds.filter(isExternalIdLike) + : [] + return { + ...subject, + externalIds: externalIds.length > 0 ? externalIds : undefined, + } +} + +function validateSourceSubject(subject?: SourceSubject): void { + if (!subject) return + const hasEntityId = !!subject.entityId?.trim() + const hasExternalId = (subject.externalIds ?? []).some(isExternalIdLike) + const hasName = !!subject.name?.trim() + if (!hasEntityId && !hasExternalId && !hasName) { + throw new ConfigError('Source subject requires entityId, at least one externalIds entry, or name.') } } @@ -144,23 +182,23 @@ function validateConfig(config: typegraphConfig): void { // ── Sub-API Interfaces ── export interface BucketsApi { - create(input: CreateBucketInput, opts?: TelemetryOpts): Promise + create(input: CreateBucketInput, opts?: TelemetryOpts | null): Promise get(bucketId: string): Promise - list(filter?: BucketListFilter, pagination?: PaginationOpts): Promise> - update(bucketId: string, input: Partial>, opts?: TelemetryOpts): Promise - delete(bucketId: string, opts?: TelemetryOpts): Promise + list(filter?: BucketListFilter | null, pagination?: PaginationOpts | null): Promise> + update(bucketId: string, input: Partial>, opts?: TelemetryOpts | null): Promise + delete(bucketId: string, opts?: TelemetryOpts | null): Promise } -export interface DocumentsApi { - get(id: string): Promise - list(filter?: DocumentFilter, pagination?: PaginationOpts): Promise> - update(id: string, input: Partial>, opts?: TelemetryOpts): Promise - delete(filter: DocumentFilter, opts?: TelemetryOpts): Promise +export interface SourcesApi { + get(id: string): Promise + list(filter?: SourceFilter | null, pagination?: PaginationOpts | null): Promise> + update(id: string, input: Partial>, opts?: TelemetryOpts | null): Promise + delete(filter: SourceFilter | null, opts?: TelemetryOpts | null): Promise } export interface JobsApi { get(id: string): Promise - list(filter?: JobFilter): Promise + list(filter?: JobFilter | null): Promise /** Create or replace a job row (caller-provided id). Writers use this from background workers. */ upsert(input: UpsertJobInput): Promise /** Apply a partial status/result/error/progress patch. 
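Ingest-time subjects are validated before anything reaches the index: a subject object must carry an `entityId`, at least one well-formed external ID, or a `name`, and malformed external IDs are dropped first. A standalone restatement of that acceptance rule, mirroring `validateSourceSubject` above; the SDK itself throws `ConfigError` rather than returning a boolean:

```ts
interface SourceSubjectLike {
  entityId?: string
  name?: string
  externalIds?: Array<{ id: string; type: string }>
}

function isAcceptableSubject(subject?: SourceSubjectLike): boolean {
  if (!subject) return true // omitting the subject entirely is always fine
  const hasEntityId = (subject.entityId ?? '').trim().length > 0
  const hasExternalId = (subject.externalIds ?? [])
    .some(e => e.id.trim().length > 0 && e.type.trim().length > 0)
  const hasName = (subject.name ?? '').trim().length > 0
  return hasEntityId || hasExternalId || hasName
}

console.log(isAcceptableSubject({ externalIds: [{ id: 'pat@example.com', type: 'email' }] })) // true
console.log(isAcceptableSubject({ externalIds: [{ id: '', type: 'email' }] }))                // false
```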
*/ @@ -170,29 +208,39 @@ export interface JobsApi { } export interface GraphApi { - searchEntities(query: string, identity: typegraphIdentity, opts?: { + upsertEntity(input: UpsertGraphEntityInput, opts?: TelemetryOpts | null): Promise + upsertEntities(inputs: UpsertGraphEntityInput[], opts?: TelemetryOpts | null): Promise + resolveEntity(ref: GraphEntityRef | string, identity?: typegraphIdentity | null, opts?: TelemetryOpts | null): Promise + linkExternalIds(entityId: string, externalIds: ExternalId[], identity?: (typegraphIdentity & TelemetryOpts) | null): Promise + mergeEntities(input: MergeGraphEntitiesInput, opts?: TelemetryOpts | null): Promise + deleteEntity(entityId: string, opts?: (DeleteGraphEntityOpts & TelemetryOpts) | null): Promise + upsertEdge(input: UpsertGraphEdgeInput, opts?: TelemetryOpts | null): Promise + upsertEdges(inputs: UpsertGraphEdgeInput[], opts?: TelemetryOpts | null): Promise + upsertFact(input: UpsertGraphFactInput, opts?: TelemetryOpts | null): Promise + upsertFacts(inputs: UpsertGraphFactInput[], opts?: TelemetryOpts | null): Promise + searchEntities(query: string, identity: typegraphIdentity | null, opts?: ({ limit?: number entityType?: string minConnections?: number - } & TelemetryOpts): Promise - getEntity(id: string, opts?: typegraphIdentity): Promise - getEdges(entityId: string, opts?: { + } & TelemetryOpts) | null): Promise + getEntity(id: string, opts?: typegraphIdentity | null): Promise + getEdges(entityId: string, opts?: ({ direction?: 'in' | 'out' | 'both' relation?: string limit?: number - } & typegraphIdentity): Promise - searchFacts(query: string, opts?: FactSearchOpts & TelemetryOpts): Promise - explore(query: string, opts?: GraphExploreOpts): Promise - getPassagesForEntity(entityId: string, opts?: { + } & typegraphIdentity) | null): Promise + searchFacts(query: string, opts?: (FactSearchOpts & TelemetryOpts) | null): Promise + explore(query: string, opts?: GraphExploreOpts | null): Promise + getChunksForEntity(entityId: string, opts?: ({ bucketIds?: string[] | undefined limit?: number | undefined - } & typegraphIdentity): Promise - explainQuery(query: string, opts?: GraphExplainOpts & TelemetryOpts): Promise - backfill(identity: typegraphIdentity, opts?: GraphBackfillOpts & TelemetryOpts): Promise + } & typegraphIdentity) | null): Promise + explainQuery(query: string, opts?: (GraphExplainOpts & TelemetryOpts) | null): Promise + backfill(identity: typegraphIdentity | null, opts?: (GraphBackfillOpts & TelemetryOpts) | null): Promise getSubgraph(opts: SubgraphOpts): Promise - stats(identity: typegraphIdentity, opts?: TelemetryOpts): Promise - getRelationTypes(identity: typegraphIdentity, opts?: TelemetryOpts): Promise> - getEntityTypes(identity: typegraphIdentity, opts?: TelemetryOpts): Promise> + stats(identity: typegraphIdentity | null, opts?: TelemetryOpts | null): Promise + getRelationTypes(identity: typegraphIdentity | null, opts?: TelemetryOpts | null): Promise> + getEntityTypes(identity: typegraphIdentity | null, opts?: TelemetryOpts | null): Promise> } /** The typegraph instance interface — all public methods. */ @@ -207,7 +255,7 @@ export interface typegraphInstance { undeploy(): Promise buckets: BucketsApi - documents: DocumentsApi + sources: SourcesApi jobs: JobsApi /** Graph exploration API. Requires graph bridge. */ @@ -218,42 +266,42 @@ export interface typegraphInstance { getDistinctEmbeddings(bucketIds?: string[]): Map groupBucketsByModel(bucketIds?: string[]): Map - /** Ingest documents. 
Target bucket set via opts.bucketId (defaults to default bucket). */ - ingest(docs: RawDocument[], opts?: IngestOptions): Promise + /** Ingest sources. Target bucket set via opts.bucketId (defaults to default bucket). */ + ingest(sources: SourceInput[], opts?: IngestOptions | null): Promise - /** Ingest a document with pre-chunked content. Target bucket set via opts.bucketId. */ - ingestPreChunked(doc: RawDocument, chunks: Chunk[], opts?: IngestOptions): Promise + /** Ingest a source with pre-chunked content. Target bucket set via opts.bucketId. */ + ingestPreChunked(source: SourceInput, chunks: Chunk[], opts?: IngestOptions | null): Promise /** Search across buckets. Optionally build an LLM-ready context via opts.context. */ - query(text: string, opts?: QueryOpts): Promise + query(text: string, opts?: QueryOpts | null): Promise // ── Memory operations (require graph bridge) ── /** Store a memory. LLM extracts triples → entity graph + memory record. */ - remember(content: string, opts: RememberOpts): Promise + remember(content: string, opts?: RememberOpts | null): Promise /** Invalidate a memory and its associated graph edges. Identity must match the memory owner. */ - forget(id: string, opts: ForgetOpts): Promise + forget(id: string, opts?: ForgetOpts | null): Promise /** Apply a natural language correction. */ - correct(correction: string, opts: CorrectOpts): Promise<{ invalidated: number; created: number; summary: string }> + correct(correction: string, opts?: CorrectOpts | null): Promise<{ invalidated: number; created: number; summary: string }> /** Search memories by semantic similarity. When `opts.format` is set, returns a formatted string ready for an LLM prompt. */ recall(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - recall(query: string, opts: RecallOpts): Promise + recall(query: string, opts?: RecallOpts | null): Promise /** Check memory system health — returns stats about stored memories, entities, and edges. */ - healthCheck(opts?: HealthCheckOpts): Promise + healthCheck(opts?: HealthCheckOpts | null): Promise /** Ingest a conversation turn with extraction. 
*/ addConversationTurn( messages: Array<{ role: string; content: string; timestamp?: Date }>, - opts: AddConversationTurnOpts, + opts?: AddConversationTurnOpts | null, ): Promise // ── Policy operations (require policyStore) ── policies: { - create(input: CreatePolicyInput, opts?: TelemetryOpts): Promise + create(input: CreatePolicyInput, opts?: TelemetryOpts | null): Promise get(id: string): Promise - list(filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean }): Promise - update(id: string, input: UpdatePolicyInput, opts?: TelemetryOpts): Promise - delete(id: string, opts?: TelemetryOpts): Promise + list(filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean } | null): Promise + update(id: string, input: UpdatePolicyInput, opts?: TelemetryOpts | null): Promise + delete(id: string, opts?: TelemetryOpts | null): Promise } /** @@ -286,7 +334,7 @@ class TypegraphImpl implements typegraphInstance { eventType: typegraphEventType, targetId?: string, payload: Record = {}, - telemetry?: TelemetryOpts, + telemetry?: TelemetryOpts | null, ): void { if (!this.config?.eventSink) return this.config.eventSink.emit({ @@ -304,7 +352,7 @@ class TypegraphImpl implements typegraphInstance { // ── Buckets ── buckets: BucketsApi = { - create: async (input: CreateBucketInput, opts?: TelemetryOpts): Promise => { + create: async (input: CreateBucketInput, opts?: TelemetryOpts | null): Promise => { this.assertConfigured() const embeddingModel = input.embeddingModel ?? embeddingModelKey(this.defaultEmbedding) const queryEmbeddingModel = input.queryEmbeddingModel ?? (this.defaultQueryEmbedding ? embeddingModelKey(this.defaultQueryEmbedding) : undefined) @@ -358,9 +406,13 @@ class TypegraphImpl implements typegraphInstance { return this._buckets.get(bucketId) }, - list: async (filter?: BucketListFilter, pagination?: PaginationOpts): Promise> => { + list: async (filter?: BucketListFilter | null, pagination?: PaginationOpts | null): Promise> => { + const normalizedFilter = optionalCompactObject(filter, 'buckets.list', 'filter') as BucketListFilter + const normalizedPagination = pagination == null + ? undefined + : optionalCompactObject(pagination, 'buckets.list', 'pagination') as PaginationOpts if (this.adapter.listBuckets) { - const result = await this.adapter.listBuckets(filter, pagination) + const result = await this.adapter.listBuckets(normalizedFilter, normalizedPagination) const buckets = Array.isArray(result) ? 
result : result.items for (const b of buckets) { this._buckets.set(b.id, b) @@ -371,22 +423,22 @@ class TypegraphImpl implements typegraphInstance { return result } let all = [...this._buckets.values()] - if (filter) { - if (filter.tenantId) all = all.filter(s => s.tenantId === filter.tenantId) - if (filter.groupId) all = all.filter(s => s.groupId === filter.groupId) - if (filter.userId) all = all.filter(s => s.userId === filter.userId) - if (filter.agentId) all = all.filter(s => s.agentId === filter.agentId) - if (filter.conversationId) all = all.filter(s => s.conversationId === filter.conversationId) + if (hasMeaningfulFilter(normalizedFilter)) { + if (normalizedFilter.tenantId) all = all.filter(s => s.tenantId === normalizedFilter.tenantId) + if (normalizedFilter.groupId) all = all.filter(s => s.groupId === normalizedFilter.groupId) + if (normalizedFilter.userId) all = all.filter(s => s.userId === normalizedFilter.userId) + if (normalizedFilter.agentId) all = all.filter(s => s.agentId === normalizedFilter.agentId) + if (normalizedFilter.conversationId) all = all.filter(s => s.conversationId === normalizedFilter.conversationId) } - if (pagination) { - const limit = pagination.limit ?? 100 - const offset = pagination.offset ?? 0 + if (normalizedPagination) { + const limit = normalizedPagination.limit ?? 100 + const offset = normalizedPagination.offset ?? 0 return { items: all.slice(offset, offset + limit), total: all.length, limit, offset } } return all }, - update: async (bucketId: string, input: Partial>, opts?: TelemetryOpts): Promise => { + update: async (bucketId: string, input: Partial>, opts?: TelemetryOpts | null): Promise => { const bucket = await this.buckets.get(bucketId) if (!bucket) throw new NotFoundError('Bucket', bucketId) if (input.name !== undefined) bucket.name = input.name @@ -404,7 +456,7 @@ class TypegraphImpl implements typegraphInstance { return result }, - delete: async (bucketId: string, opts?: TelemetryOpts): Promise => { + delete: async (bucketId: string, opts?: TelemetryOpts | null): Promise => { if (bucketId === DEFAULT_BUCKET_ID) { throw new ConfigError('Cannot delete the default bucket.') } @@ -420,44 +472,50 @@ class TypegraphImpl implements typegraphInstance { }, } - // ── Documents ── + // ── Sources ── - documents: DocumentsApi = { - get: async (id: string): Promise => { + sources: SourcesApi = { + get: async (id: string): Promise => { this.assertConfigured() - if (!this.adapter.getDocument) { - throw new ConfigError('Adapter does not support document operations.') + if (!this.adapter.getSource) { + throw new ConfigError('Adapter does not support source operations.') } - return this.adapter.getDocument(id) + return this.adapter.getSource(id) }, - list: async (filter?: DocumentFilter, pagination?: PaginationOpts): Promise> => { + list: async (filter?: SourceFilter | null, pagination?: PaginationOpts | null): Promise> => { this.assertConfigured() - if (!this.adapter.listDocuments) { - throw new ConfigError('Adapter does not support document operations.') + if (!this.adapter.listSources) { + throw new ConfigError('Adapter does not support source operations.') } - return this.adapter.listDocuments(filter ?? {}, pagination) + const normalizedFilter = optionalCompactObject(filter, 'sources.list', 'filter') as SourceFilter + const normalizedPagination = pagination == null + ? 
undefined + : optionalCompactObject(pagination, 'sources.list', 'pagination') as PaginationOpts + return this.adapter.listSources(normalizedFilter, normalizedPagination) }, - update: async (id: string, input: Partial>, opts?: TelemetryOpts): Promise => { + update: async (id: string, input: Partial>, opts?: TelemetryOpts | null): Promise => { this.assertConfigured() - if (!this.adapter.updateDocument) { - throw new ConfigError('Adapter does not support document update operations.') + if (!this.adapter.updateSource) { + throw new ConfigError('Adapter does not support source update operations.') } - const updated = await this.adapter.updateDocument(id, input) - this.emitEvent('document.update', id, { fields: Object.keys(input) }, opts) + const updated = await this.adapter.updateSource(id, input) + this.emitEvent('source.update', id, { fields: Object.keys(input) }, opts) return updated }, - delete: async (filter: DocumentFilter, opts?: TelemetryOpts): Promise => { + delete: async (filter: SourceFilter | null, opts?: TelemetryOpts | null): Promise => { this.assertConfigured() - if (!this.adapter.deleteDocuments) { - throw new ConfigError('Adapter does not support document operations.') + if (!this.adapter.deleteSources) { + throw new ConfigError('Adapter does not support source operations.') } - await this.enforcePolicy('document.delete', { tenantId: filter.tenantId ?? this.config.tenantId }) - const count = await this.adapter.deleteDocuments(filter) + const normalizedFilter = optionalCompactObject(filter, 'sources.delete', 'filter') as SourceFilter + assertHasMeaningfulFilter(normalizedFilter, 'sources.delete') + await this.enforcePolicy('source.delete', { tenantId: normalizedFilter.tenantId ?? this.config.tenantId }) + const count = await this.adapter.deleteSources(normalizedFilter) if (count > 0) { - this.emitEvent('document.delete', undefined, { count, filter }, opts) + this.emitEvent('source.delete', undefined, { count, filter: normalizedFilter }, opts) } return count }, @@ -471,10 +529,11 @@ class TypegraphImpl implements typegraphInstance { if (!this.adapter.getJob) return null return this.adapter.getJob(id) }, - list: async (filter?: JobFilter): Promise => { + list: async (filter?: JobFilter | null): Promise => { this.assertConfigured() if (!this.adapter.listJobs) return [] - const res = await this.adapter.listJobs(filter ?? {}) + const normalizedFilter = optionalCompactObject(filter, 'jobs.list', 'filter') as JobFilter + const res = await this.adapter.listJobs(normalizedFilter) return Array.isArray(res) ? 
res : res.items }, upsert: async (input: UpsertJobInput): Promise => { @@ -503,71 +562,194 @@ class TypegraphImpl implements typegraphInstance { // ── Graph Exploration ── graph: GraphApi = { - searchEntities: async (query: string, identity: typegraphIdentity, opts?: { + upsertEntity: async (input: UpsertGraphEntityInput, opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertEntity) throw new ConfigError('Knowledge graph bridge does not support entity seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertEntity') as TelemetryOpts + const result = await kg.upsertEntity(input) + this.emitEvent('graph.entity.upsert' as typegraphEventType, result.id, { name: result.name }, telemetry) + return result + }, + + upsertEntities: async (inputs: UpsertGraphEntityInput[], opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertEntities) throw new ConfigError('Knowledge graph bridge does not support entity seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertEntities') as TelemetryOpts + const results = await kg.upsertEntities(inputs) + this.emitEvent('graph.entity.upsert' as typegraphEventType, undefined, { count: results.length }, telemetry) + return results + }, + + resolveEntity: async ( + ref: GraphEntityRef | string, + identity?: typegraphIdentity | null, + _opts?: TelemetryOpts | null, + ): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.resolveEntity) throw new ConfigError('Knowledge graph bridge does not support entity resolution.') + return kg.resolveEntity(ref, withDefaultTenant(identity, this.config.tenantId, 'graph.resolveEntity')) + }, + + linkExternalIds: async ( + entityId: string, + externalIds: ExternalId[], + identity?: (typegraphIdentity & TelemetryOpts) | null, + ): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.linkExternalIds) throw new ConfigError('Knowledge graph bridge does not support deterministic entity external IDs.') + const normalizedIdentity = withDefaultTenant(identity, this.config.tenantId, 'graph.linkExternalIds') as typegraphIdentity & TelemetryOpts + const result = await kg.linkExternalIds(entityId, externalIds, normalizedIdentity) + this.emitEvent('graph.entity.external_ids.link' as typegraphEventType, entityId, { count: externalIds.length }, normalizedIdentity) + return result + }, + + mergeEntities: async (input: MergeGraphEntitiesInput, opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.mergeEntities) throw new ConfigError('Knowledge graph bridge does not support entity merge operations.') + const telemetry = optionalCompactObject(opts, 'graph.mergeEntities') as TelemetryOpts + const result = await kg.mergeEntities(input) + this.emitEvent('graph.entity.merge' as typegraphEventType, input.targetEntityId, { + sourceEntityId: input.sourceEntityId, + redirectedEdges: result.redirectedEdges, + redirectedFacts: result.redirectedFacts, + }, telemetry) + return result + }, + + deleteEntity: async (entityId: string, opts?: (DeleteGraphEntityOpts & TelemetryOpts) | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.deleteEntity) throw new ConfigError('Knowledge graph bridge does not support entity delete operations.') + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'graph.deleteEntity') as DeleteGraphEntityOpts & TelemetryOpts + const result = await kg.deleteEntity(entityId, normalizedOpts) + 
this.emitEvent('graph.entity.delete' as typegraphEventType, entityId, { + mode: result.mode, + deletedEdges: result.deletedEdges, + deletedFacts: result.deletedFacts, + }, normalizedOpts) + return result + }, + + upsertEdge: async (input: UpsertGraphEdgeInput, opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertEdge) throw new ConfigError('Knowledge graph bridge does not support edge seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertEdge') as TelemetryOpts + const result = await kg.upsertEdge(input) + this.emitEvent('graph.edge.upsert' as typegraphEventType, result.id, { relation: result.relation }, telemetry) + return result + }, + + upsertEdges: async (inputs: UpsertGraphEdgeInput[], opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertEdges) throw new ConfigError('Knowledge graph bridge does not support edge seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertEdges') as TelemetryOpts + const results = await kg.upsertEdges(inputs) + this.emitEvent('graph.edge.upsert' as typegraphEventType, undefined, { count: results.length }, telemetry) + return results + }, + + upsertFact: async (input: UpsertGraphFactInput, opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertFact) throw new ConfigError('Knowledge graph bridge does not support fact seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertFact') as TelemetryOpts + const result = await kg.upsertFact(input) + this.emitEvent('graph.fact.upsert' as typegraphEventType, result.id, { relation: result.relation }, telemetry) + return result + }, + + upsertFacts: async (inputs: UpsertGraphFactInput[], opts?: TelemetryOpts | null): Promise => { + const kg = this.requireKnowledgeGraph() + if (!kg.upsertFacts) throw new ConfigError('Knowledge graph bridge does not support fact seeding.') + const telemetry = optionalCompactObject(opts, 'graph.upsertFacts') as TelemetryOpts + const results = await kg.upsertFacts(inputs) + this.emitEvent('graph.fact.upsert' as typegraphEventType, undefined, { count: results.length }, telemetry) + return results + }, + + searchEntities: async (query: string, identity: typegraphIdentity | null, opts?: ({ limit?: number entityType?: string minConnections?: number - } & TelemetryOpts): Promise => { + } & TelemetryOpts) | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.searchEntities) throw new ConfigError('Knowledge graph bridge does not support entity search.') - let results = await kg.searchEntities(query, identity, opts?.limit) - if (opts?.entityType) { - results = results.filter(r => r.entityType === opts.entityType) + const normalizedIdentity = withDefaultTenant(identity, this.config.tenantId, 'graph.searchEntities') + const normalizedOpts = optionalCompactObject<{ + limit?: number + entityType?: string + minConnections?: number + } & TelemetryOpts>(opts, 'graph.searchEntities') as { + limit?: number + entityType?: string + minConnections?: number + } & TelemetryOpts + let results = await kg.searchEntities(query, normalizedIdentity, normalizedOpts.limit) + if (normalizedOpts.entityType) { + results = results.filter(r => r.entityType === normalizedOpts.entityType) } - if (opts?.minConnections !== undefined) { - const minConnections = opts.minConnections + if (normalizedOpts.minConnections !== undefined) { + const minConnections = normalizedOpts.minConnections results = results.filter(r => r.edgeCount 
>= minConnections) } return results }, - getEntity: async (id: string, opts?: typegraphIdentity): Promise => { + getEntity: async (id: string, opts?: typegraphIdentity | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.getEntity) throw new ConfigError('Knowledge graph bridge does not support entity lookup.') - return kg.getEntity(id, opts) + return kg.getEntity(id, withDefaultTenant(opts, this.config.tenantId, 'graph.getEntity')) }, - getEdges: async (entityId: string, opts?: { + getEdges: async (entityId: string, opts?: ({ direction?: 'in' | 'out' | 'both' relation?: string limit?: number - } & typegraphIdentity): Promise => { + } & typegraphIdentity) | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.getEdges) throw new ConfigError('Knowledge graph bridge does not support edge queries.') - return kg.getEdges(entityId, opts) + return kg.getEdges(entityId, withDefaultTenant(opts, this.config.tenantId, 'graph.getEdges') as { + direction?: 'in' | 'out' | 'both' + relation?: string + limit?: number + } & typegraphIdentity) }, - searchFacts: async (query: string, opts?: FactSearchOpts & TelemetryOpts): Promise => { + searchFacts: async (query: string, opts?: (FactSearchOpts & TelemetryOpts) | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.searchFacts) throw new ConfigError('Knowledge graph bridge does not support fact search.') - return kg.searchFacts(query, opts) + return kg.searchFacts(query, withDefaultTenant(opts, this.config.tenantId, 'graph.searchFacts') as FactSearchOpts & TelemetryOpts) }, - explore: async (query: string, opts?: GraphExploreOpts): Promise => { + explore: async (query: string, opts?: GraphExploreOpts | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.explore) throw new ConfigError('Knowledge graph bridge does not support graph exploration.') - return kg.explore(query, opts) + return kg.explore(query, withDefaultTenant(opts, this.config.tenantId, 'graph.explore') as GraphExploreOpts) }, - getPassagesForEntity: async (entityId: string, opts?: { + getChunksForEntity: async (entityId: string, opts?: ({ bucketIds?: string[] | undefined limit?: number | undefined - } & typegraphIdentity): Promise => { + } & typegraphIdentity) | null): Promise => { const kg = this.requireKnowledgeGraph() - if (!kg.getPassagesForEntity) throw new ConfigError('Knowledge graph bridge does not support passage lookup.') - return kg.getPassagesForEntity(entityId, opts) + if (!kg.getChunksForEntity) throw new ConfigError('Knowledge graph bridge does not support chunk lookup.') + return kg.getChunksForEntity(entityId, withDefaultTenant(opts, this.config.tenantId, 'graph.getChunksForEntity') as { + bucketIds?: string[] | undefined + limit?: number | undefined + } & typegraphIdentity) }, - explainQuery: async (query: string, opts?: GraphExplainOpts & TelemetryOpts): Promise => { + explainQuery: async (query: string, opts?: (GraphExplainOpts & TelemetryOpts) | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.explainQuery) throw new ConfigError('Knowledge graph bridge does not support graph query explanations.') - return kg.explainQuery(query, opts) + return kg.explainQuery(query, withDefaultTenant(opts, this.config.tenantId, 'graph.explainQuery') as GraphExplainOpts & TelemetryOpts) }, - backfill: async (identity: typegraphIdentity, opts?: GraphBackfillOpts & TelemetryOpts): Promise => { + backfill: async (identity: typegraphIdentity | null, opts?: (GraphBackfillOpts & TelemetryOpts) | null): Promise => { 
const kg = this.requireKnowledgeGraph() if (!kg.backfill) throw new ConfigError('Knowledge graph bridge does not support graph backfill.') - return kg.backfill(identity, opts) + return kg.backfill( + withDefaultTenant(identity, this.config.tenantId, 'graph.backfill'), + optionalCompactObject(opts, 'graph.backfill') as GraphBackfillOpts & TelemetryOpts, + ) }, getSubgraph: async (opts: SubgraphOpts): Promise => { @@ -576,22 +758,22 @@ class TypegraphImpl implements typegraphInstance { return kg.getSubgraph(opts) }, - stats: async (identity: typegraphIdentity, _opts?: TelemetryOpts): Promise => { + stats: async (identity: typegraphIdentity | null, _opts?: TelemetryOpts | null): Promise => { const kg = this.requireKnowledgeGraph() if (!kg.getGraphStats) throw new ConfigError('Knowledge graph bridge does not support stats.') - return kg.getGraphStats(identity) + return kg.getGraphStats(withDefaultTenant(identity, this.config.tenantId, 'graph.stats')) }, - getRelationTypes: async (identity: typegraphIdentity, _opts?: TelemetryOpts): Promise> => { + getRelationTypes: async (identity: typegraphIdentity | null, _opts?: TelemetryOpts | null): Promise> => { const kg = this.requireKnowledgeGraph() if (!kg.getRelationTypes) throw new ConfigError('Knowledge graph bridge does not support relation type queries.') - return kg.getRelationTypes(identity) + return kg.getRelationTypes(withDefaultTenant(identity, this.config.tenantId, 'graph.getRelationTypes')) }, - getEntityTypes: async (identity: typegraphIdentity, _opts?: TelemetryOpts): Promise> => { + getEntityTypes: async (identity: typegraphIdentity | null, _opts?: TelemetryOpts | null): Promise> => { const kg = this.requireKnowledgeGraph() if (!kg.getEntityTypes) throw new ConfigError('Knowledge graph bridge does not support entity type queries.') - return kg.getEntityTypes(identity) + return kg.getEntityTypes(withDefaultTenant(identity, this.config.tenantId, 'graph.getEntityTypes')) }, } @@ -829,21 +1011,22 @@ class TypegraphImpl implements typegraphInstance { return groups } - async ingest(docs: RawDocument[], opts: IngestOptions = {}): Promise { + async ingest(sources: SourceInput[], opts?: IngestOptions | null): Promise { await this.ensureInitialized() await this.ensureBucketsLoaded() - const resolvedBucketId = opts.bucketId || DEFAULT_BUCKET_ID + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'ingest') as IngestOptions + const resolvedBucketId = normalizedOpts.bucketId || DEFAULT_BUCKET_ID await this.enforcePolicy('index', { tenantId: this.config.tenantId }, resolvedBucketId) const bucket = await this.buckets.get(resolvedBucketId) if (!bucket) throw new NotFoundError('Bucket', resolvedBucketId) - const resolvedOpts = this.resolveIngestOptions(opts, bucket) + const resolvedOpts = this.resolveIngestOptions(normalizedOpts, bucket) const chunkSize = resolvedOpts.chunkSize ?? 512 const chunkOverlap = resolvedOpts.chunkOverlap ?? 
64 - const normalizedDocs = docs.map(doc => normalizeRawDocument(doc)) - const items = await Promise.all(normalizedDocs.map(async doc => ({ doc, chunks: await defaultChunker(doc, { chunkSize, chunkOverlap }) }))) + const normalizedSources = sources.map(source => normalizeSourceInput(source)) + const items = await Promise.all(normalizedSources.map(async source => ({ source, chunks: await defaultChunker(source, { chunkSize, chunkOverlap }) }))) const embedding = await this.resolveEmbeddingForBucket(resolvedBucketId) const engine = this.createIndexEngine(embedding) - this.logger?.info('Ingesting documents', { bucketId: resolvedBucketId, count: docs.length }) + this.logger?.info('Ingesting sources', { bucketId: resolvedBucketId, count: sources.length }) await this.config.hooks?.onIndexStart?.(resolvedBucketId, resolvedOpts) const result = await engine.ingestBatch(resolvedBucketId, items, resolvedOpts) result.status = 'complete' @@ -858,33 +1041,35 @@ class TypegraphImpl implements typegraphInstance { return result } - async ingestPreChunked(doc: RawDocument, chunks: Chunk[], opts: IngestOptions = {}): Promise { + async ingestPreChunked(source: SourceInput, chunks: Chunk[], opts?: IngestOptions | null): Promise { await this.ensureInitialized() await this.ensureBucketsLoaded() - const resolvedBucketId = opts.bucketId || DEFAULT_BUCKET_ID + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'ingestPreChunked') as IngestOptions + const resolvedBucketId = normalizedOpts.bucketId || DEFAULT_BUCKET_ID await this.enforcePolicy('index', { tenantId: this.config.tenantId }, resolvedBucketId) const bucket = await this.buckets.get(resolvedBucketId) if (!bucket) throw new NotFoundError('Bucket', resolvedBucketId) - const resolvedOpts = this.resolveIngestOptions(opts, bucket) + const resolvedOpts = this.resolveIngestOptions(normalizedOpts, bucket) const embedding = await this.resolveEmbeddingForBucket(resolvedBucketId) const engine = this.createIndexEngine(embedding) await this.config.hooks?.onIndexStart?.(resolvedBucketId, resolvedOpts) - const result = await engine.ingestWithChunks(resolvedBucketId, normalizeRawDocument(doc), chunks, resolvedOpts) + const result = await engine.ingestWithChunks(resolvedBucketId, normalizeSourceInput(source), chunks, resolvedOpts) result.status = 'complete' await this.config.hooks?.onIndexComplete?.(resolvedBucketId, result) return result } - async query(text: string, opts?: QueryOpts): Promise { + async query(text: string, opts?: QueryOpts | null): Promise { await this.ensureInitialized() await this.ensureBucketsLoaded() - await this.enforcePolicy('query', { tenantId: opts?.tenantId ?? this.config.tenantId }) + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'query') as QueryOpts + await this.enforcePolicy('query', { tenantId: normalizedOpts.tenantId ?? this.config.tenantId }) // Batched lazy-load: if the caller names buckets we haven't seen, fetch them in one round-trip. // Avoids per-id gets in the hot path without forcing eager load at init. 
- if (opts?.buckets?.length && this.adapter.getBuckets) { - const missing = opts.buckets.filter(id => !this._buckets.has(id)) + if (normalizedOpts.buckets?.length && this.adapter.getBuckets) { + const missing = normalizedOpts.buckets.filter(id => !this._buckets.has(id)) if (missing.length > 0) { const fetched = await this.adapter.getBuckets(missing) for (const b of fetched) { @@ -905,13 +1090,13 @@ class TypegraphImpl implements typegraphInstance { this.logger, ) const response = await planner.execute(text, { - ...opts, - tenantId: opts?.tenantId ?? this.config.tenantId, + ...normalizedOpts, + tenantId: normalizedOpts.tenantId ?? this.config.tenantId, }) // Build LLM-ready context if requested. - if (opts?.context) { - const built = buildContext(response.results, opts.context, this.config.tokenizer) + if (normalizedOpts.context) { + const built = buildContext(response.results, normalizedOpts.context, this.config.tokenizer) response.context = built.context response.contextStats = built.stats } @@ -946,41 +1131,47 @@ class TypegraphImpl implements typegraphInstance { return bridge } - async remember(content: string, opts: RememberOpts): Promise { - await this.enforcePolicy('memory.write', opts) - return this.requireMemory().remember(content, opts) + async remember(content: string, opts?: RememberOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'remember') as RememberOpts + await this.enforcePolicy('memory.write', normalizedOpts) + return this.requireMemory().remember(content, normalizedOpts) } - async forget(id: string, opts: ForgetOpts): Promise { - await this.enforcePolicy('memory.delete', opts, id) - return this.requireMemory().forget(id, opts) + async forget(id: string, opts?: ForgetOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'forget') as ForgetOpts + await this.enforcePolicy('memory.delete', normalizedOpts, id) + return this.requireMemory().forget(id, normalizedOpts) } - async correct(correction: string, opts: CorrectOpts): Promise<{ invalidated: number; created: number; summary: string }> { - return this.requireMemory().correct(correction, opts) + async correct(correction: string, opts?: CorrectOpts | null): Promise<{ invalidated: number; created: number; summary: string }> { + return this.requireMemory().correct(correction, withDefaultTenant(opts, this.config.tenantId, 'correct') as CorrectOpts) } async recall(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - async recall(query: string, opts: RecallOpts): Promise - async recall(query: string, opts: RecallOpts): Promise { - await this.enforcePolicy('memory.read', opts) - if (opts.format) { - return this.requireMemory().recall(query, opts as RecallOpts & { format: 'xml' | 'markdown' | 'plain' }) + async recall(query: string, opts?: RecallOpts | null): Promise + async recall(query: string, opts?: RecallOpts | null): Promise { + const normalizedOpts = withDefaultTenant(opts, this.config.tenantId, 'recall') as RecallOpts + await this.enforcePolicy('memory.read', normalizedOpts) + if (normalizedOpts.format) { + return this.requireMemory().recall(query, normalizedOpts as RecallOpts & { format: 'xml' | 'markdown' | 'plain' }) } - return this.requireMemory().recall(query, opts) + return this.requireMemory().recall(query, normalizedOpts) } - async healthCheck(opts?: HealthCheckOpts): Promise { + async healthCheck(opts?: HealthCheckOpts | null): Promise { const mem = this.requireMemory() if (!mem.healthCheck) 
throw new ConfigError('healthCheck not supported by this memory bridge.') - return mem.healthCheck(opts) + return mem.healthCheck(withDefaultTenant(opts, this.config.tenantId, 'healthCheck') as HealthCheckOpts) } async addConversationTurn( messages: Array<{ role: string; content: string; timestamp?: Date }>, - opts: AddConversationTurnOpts, + opts?: AddConversationTurnOpts | null, ): Promise { - const result = await this.requireMemory().addConversationTurn(messages, opts) + const result = await this.requireMemory().addConversationTurn( + messages, + withDefaultTenant(opts, this.config.tenantId, 'addConversationTurn') as AddConversationTurnOpts, + ) // The bridge returns the underlying ExtractionResult cast to ConversationTurnResult; // read the real shape here for hook dispatch (Fix 10). @@ -1024,10 +1215,10 @@ class TypegraphImpl implements typegraphInstance { } policies = { - create: async (input: CreatePolicyInput, opts?: TelemetryOpts): Promise => { + create: async (input: CreatePolicyInput, opts?: TelemetryOpts | null): Promise => { const store = this.requirePolicyStore() const policy = await store.createPolicy(input) - this.emitEvent('policy.create', policy.id, { name: policy.name, policyType: policy.policyType }, opts) + this.emitEvent('policy.create', policy.id, { name: policy.name, policyType: policy.policyType }, optionalCompactObject(opts, 'policies.create') as TelemetryOpts) return policy }, @@ -1036,22 +1227,22 @@ class TypegraphImpl implements typegraphInstance { return store.getPolicy(id) }, - list: async (filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean }): Promise => { + list: async (filter?: { tenantId?: string; policyType?: PolicyType; enabled?: boolean } | null): Promise => { const store = this.requirePolicyStore() - return store.listPolicies(filter) + return store.listPolicies(optionalCompactObject<{ tenantId?: string; policyType?: PolicyType; enabled?: boolean }>(filter, 'policies.list', 'filter')) }, - update: async (id: string, input: UpdatePolicyInput, opts?: TelemetryOpts): Promise => { + update: async (id: string, input: UpdatePolicyInput, opts?: TelemetryOpts | null): Promise => { const store = this.requirePolicyStore() const policy = await store.updatePolicy(id, input) - this.emitEvent('policy.update', policy.id, { name: policy.name }, opts) + this.emitEvent('policy.update', policy.id, { name: policy.name }, optionalCompactObject(opts, 'policies.update') as TelemetryOpts) return policy }, - delete: async (id: string, opts?: TelemetryOpts): Promise => { + delete: async (id: string, opts?: TelemetryOpts | null): Promise => { const store = this.requirePolicyStore() await store.deletePolicy(id) - this.emitEvent('policy.delete', id, {}, opts) + this.emitEvent('policy.delete', id, {}, optionalCompactObject(opts, 'policies.delete') as TelemetryOpts) }, } @@ -1075,8 +1266,8 @@ class TypegraphImpl implements typegraphInstance { } private createIndexEngine(embedding: EmbeddingProvider): IndexEngine { - const engine = new IndexEngine(this.adapter, embedding, this.config.eventSink, this.logger) const kg = this.graphBridge + const engine = new IndexEngine(this.adapter, embedding, this.config.eventSink, this.logger, kg) if (this.config.llm && kg) { const mainLlm = resolveLLMProvider(this.config.llm) const ext = this.config.extraction diff --git a/packages/sdk/src/types/adapter.ts b/packages/sdk/src/types/adapter.ts index 0a40e9b..f8dd417 100644 --- a/packages/sdk/src/types/adapter.ts +++ b/packages/sdk/src/types/adapter.ts @@ -1,5 +1,5 @@ -import type 
{ EmbeddedChunk, ChunkFilter, ScoredChunk } from './document.js' -import type { typegraphDocument, DocumentFilter, DocumentStatus, UpsertDocumentInput, UpsertedDocumentRecord } from './typegraph-document.js' +import type { EmbeddedChunk, ChunkFilter, ScoredChunk } from './chunk.js' +import type { typegraphSource, SourceFilter, SourceStatus, UpsertSourceInput, UpsertedSourceRecord } from './source.js' import type { Bucket, BucketListFilter } from './bucket.js' import type { PaginationOpts, PaginatedResult } from './pagination.js' import type { Job, JobFilter, UpsertJobInput, JobStatusPatch } from './job.js' @@ -38,8 +38,8 @@ export interface HashStoreAdapter { deleteByBucket(bucketId: string, tenantId?: string | undefined): Promise } -export interface ScoredChunkWithDocument extends ScoredChunk { - document?: typegraphDocument | undefined +export interface ScoredChunkWithSource extends ScoredChunk { + source?: typegraphSource | undefined } export interface UndeployResult { @@ -62,30 +62,30 @@ export interface VectorStoreAdapter { /** Ensure a model's storage (e.g., table) exists. Called lazily before first write. */ ensureModel(model: string, dimensions: number): Promise - /** Upsert chunks for a document into the vector store. */ - upsertDocument(model: string, chunks: EmbeddedChunk[]): Promise - delete(model: string, filter: ChunkFilter): Promise + /** Upsert chunks for a source into the vector store. */ + upsertSourceChunks(model: string, chunks: EmbeddedChunk[]): Promise + delete(model: string, filter: ChunkFilter | null): Promise - search(model: string, embedding: number[], opts: SearchOpts): Promise - hybridSearch?(model: string, embedding: number[], query: string, opts: SearchOpts): Promise - countChunks(model: string, filter: ChunkFilter): Promise + search(model: string, embedding: number[], opts: SearchOpts | null): Promise + hybridSearch?(model: string, embedding: number[], query: string, opts: SearchOpts | null): Promise + countChunks(model: string, filter: ChunkFilter | null): Promise hashStore: HashStoreAdapter - // --- Document record methods (optional - adapters that support documents implement these) --- - - /** Create or update a document record. Returns the canonical document row. */ - upsertDocumentRecord?(input: UpsertDocumentInput): Promise - /** Get a document by UUID. */ - getDocument?(id: string): Promise - /** List documents matching a filter. Supports optional pagination. */ - listDocuments?(filter: DocumentFilter, pagination?: PaginationOpts): Promise> - /** Delete documents matching a filter. Returns count deleted. */ - deleteDocuments?(filter: DocumentFilter): Promise - /** Update a document's status and optionally its chunk count. */ - updateDocumentStatus?(id: string, status: DocumentStatus, chunkCount?: number): Promise - /** Update document metadata fields (title, url, visibility, etc.). Returns updated document. */ - updateDocument?(id: string, input: Partial>): Promise + // --- Source record methods (optional - adapters that support sources implement these) --- + + /** Create or update a source record. Returns the canonical source row. */ + upsertSourceRecord?(input: UpsertSourceInput): Promise + /** Get a source by UUID. */ + getSource?(id: string): Promise + /** List sources matching a filter. Supports optional pagination. */ + listSources?(filter?: SourceFilter | null, pagination?: PaginationOpts | null): Promise> + /** Delete sources matching a filter. Returns count deleted. 
*/ + deleteSources?(filter: SourceFilter | null): Promise + /** Update a source's status and optionally its chunk count. */ + updateSourceStatus?(id: string, status: SourceStatus, chunkCount?: number): Promise + /** Update source metadata fields (title, url, visibility, subject, etc.). Returns updated source. */ + updateSource?(id: string, input: Partial>): Promise // --- Job record methods (optional - adapters that persist job state implement these) --- @@ -94,24 +94,24 @@ export interface VectorStoreAdapter { /** Fetch a job by id. */ getJob?(id: string): Promise /** List jobs matching a filter, ordered by created_at DESC. */ - listJobs?(filter: JobFilter, pagination?: PaginationOpts): Promise> + listJobs?(filter?: JobFilter | null, pagination?: PaginationOpts | null): Promise> /** Apply a partial status/result/error/progress patch to a job. */ updateJobStatus?(id: string, patch: JobStatusPatch): Promise /** Atomically add to a job's progress_processed counter. Safe under concurrent workers. */ incrementJobProgress?(id: string, processedDelta: number): Promise - /** Hybrid search with document-level filtering via JOIN to typegraph_documents. */ - searchWithDocuments?( + /** Hybrid search with source-level filtering via JOIN to typegraph_sources. */ + searchWithSources?( model: string, embedding: number[], query: string, - opts: SearchOpts & { documentFilter?: DocumentFilter | undefined } - ): Promise + opts: (SearchOpts & { sourceFilter?: SourceFilter | undefined }) | null + ): Promise - /** Fetch chunks by document and index range (for neighbor expansion). No vector search. */ + /** Fetch chunks by source and index range (for neighbor expansion). No vector search. */ getChunksByRange?( model: string, - documentId: string, + sourceId: string, fromIndex: number, toIndex: number ): Promise diff --git a/packages/sdk/src/types/bucket.ts b/packages/sdk/src/types/bucket.ts index 1b4275e..88122e3 100644 --- a/packages/sdk/src/types/bucket.ts +++ b/packages/sdk/src/types/bucket.ts @@ -1,11 +1,11 @@ -import type { RawDocument } from './connector.js' +import type { SourceInput } from './connector.js' import type { EmbeddingProvider } from '../embedding/provider.js' import type { AISDKEmbeddingInput } from '../embedding/ai-sdk-adapter.js' /** - * A bucket is a named container for documents. - * Buckets have no type - they are user-defined namespaces for organizing documents. - * A bucket named "Marketing Docs" could receive documents from a URL scrape, + * A bucket is a named container for sources. + * Buckets have no type - they are user-defined namespaces for organizing sources. + * A bucket named "Marketing Content" could receive sources from a URL scrape, * a domain crawl, file uploads, and a Slack sync - all at the same time. * * Each bucket supports exactly one embedding model, set at creation time. 
@@ -38,8 +38,8 @@ export interface Bucket { export interface IndexDefaults { chunkSize?: number | undefined chunkOverlap?: number | undefined - deduplicateBy?: string[] | ((doc: RawDocument) => string) | undefined - visibility?: import('./typegraph-document.js').Visibility | undefined + deduplicateBy?: string[] | ((source: SourceInput) => string) | undefined + visibility?: import('./source.js').Visibility | undefined stripMarkdownForEmbedding?: boolean | undefined preprocessForEmbedding?: ((content: string) => string) | undefined propagateMetadata?: string[] | undefined diff --git a/packages/sdk/src/types/document.ts b/packages/sdk/src/types/chunk.ts similarity index 68% rename from packages/sdk/src/types/document.ts rename to packages/sdk/src/types/chunk.ts index add815e..803f61e 100644 --- a/packages/sdk/src/types/document.ts +++ b/packages/sdk/src/types/chunk.ts @@ -7,8 +7,8 @@ export interface EmbeddedChunk { userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - /** UUID referencing typegraph_documents.id. */ - documentId: string + /** ID referencing typegraph_sources.id. */ + sourceId: string content: string embedding: number[] @@ -17,26 +17,36 @@ export interface EmbeddedChunk { totalChunks: number /** - * Denormalized from the parent document. Chunks are the query target, so the + * Denormalized from the parent source. Chunks are the query target, so the * visibility gate has to live here or unscoped queries leak narrowly-visible * rows. Defaults to 'tenant' when omitted. */ - visibility?: import('./typegraph-document.js').Visibility | undefined + visibility?: import('./source.js').Visibility | undefined metadata: Record indexedAt: Date } +export interface ChunkRef { + bucketId: string + sourceId: string + chunkIndex: number + embeddingModel?: string | undefined + chunkId?: string | undefined +} + export interface ChunkFilter { bucketId?: string | undefined /** Filter to any of several buckets. Preferred over `bucketId` when searching multiple. */ bucketIds?: string[] | undefined + /** Restrict search to exact chunk identities. Empty array intentionally matches nothing. */ + chunkRefs?: ChunkRef[] | undefined tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - documentId?: string | undefined + sourceId?: string | undefined idempotencyKey?: string | undefined metadata?: Record | undefined } diff --git a/packages/sdk/src/types/connector.ts b/packages/sdk/src/types/connector.ts index 3f183e8..0de9d25 100644 --- a/packages/sdk/src/types/connector.ts +++ b/packages/sdk/src/types/connector.ts @@ -1,4 +1,17 @@ -export interface RawDocument = Record> { +import type { ExternalId } from '../memory/types/memory.js' +import type { EntityType } from '../index-engine/ontology.js' + +export interface SourceSubject { + entityId?: string | undefined + externalIds?: ExternalId[] | undefined + name?: string | undefined + entityType?: EntityType | string | undefined + aliases?: string[] | undefined + description?: string | undefined + properties?: Record | undefined +} + +export interface SourceInput = Record> { id?: string | undefined content: string title: string @@ -10,6 +23,7 @@ export interface RawDocument = Record /** Duration of the operation in milliseconds. 
*/ diff --git a/packages/sdk/src/types/graph-bridge.ts b/packages/sdk/src/types/graph-bridge.ts index 3f70887..6bbd58f 100644 --- a/packages/sdk/src/types/graph-bridge.ts +++ b/packages/sdk/src/types/graph-bridge.ts @@ -1,9 +1,13 @@ import type { typegraphIdentity } from './identity.js' import type { ConversationTurnResult, MemoryHealthReport } from './memory.js' -import type { MemoryRecord } from '../memory/types/memory.js' +import type { ExternalId, MemoryRecord } from '../memory/types/memory.js' +import type { ChunkRef } from './chunk.js' +import type { SourceSubject } from './connector.js' +import type { QueryEntityScope, QuerySignals } from './query.js' import type { PaginationOpts } from './pagination.js' import type { TelemetryOpts } from './events.js' -import type { Visibility } from './typegraph-document.js' +import type { Visibility } from './source.js' +import type { PredicateTemporalStatus } from '../index-engine/ontology.js' // ── Memory method opts ── // All memory ops take a unified (payload, opts) shape. `opts` extends @@ -13,14 +17,23 @@ export interface RememberOpts extends typegraphIdentity, TelemetryOpts { category?: string | undefined importance?: number | undefined metadata?: Record | undefined + subject?: MemorySubject | undefined + relatedEntities?: MemorySubject[] | undefined + visibility?: Visibility | undefined } export type ForgetOpts = typegraphIdentity & TelemetryOpts -export type CorrectOpts = typegraphIdentity & TelemetryOpts +export type CorrectOpts = typegraphIdentity & TelemetryOpts & { + subject?: MemorySubject | undefined + relatedEntities?: MemorySubject[] | undefined +} export interface AddConversationTurnOpts extends typegraphIdentity, TelemetryOpts { conversationId?: string | undefined + subject?: MemorySubject | undefined + relatedEntities?: MemorySubject[] | undefined + visibility?: Visibility | undefined } export interface RecallOpts extends typegraphIdentity, TelemetryOpts { @@ -30,12 +43,116 @@ export interface RecallOpts extends typegraphIdentity, TelemetryOpts { temporalAt?: Date | undefined /** Include invalidated/expired memories. Default: false. */ includeInvalidated?: boolean | undefined + entityScope?: QueryEntityScope | undefined /** Format results as a string instead of an array. When set, `recall` returns `Promise`. */ format?: 'xml' | 'markdown' | 'plain' | undefined } export type HealthCheckOpts = typegraphIdentity & TelemetryOpts +export interface MemorySubject { + entityId?: string | undefined + externalIds?: ExternalId[] | undefined + name?: string | undefined + entityType?: string | undefined + aliases?: string[] | undefined + properties?: Record | undefined +} + +export interface GraphEntityRef extends typegraphIdentity { + /** Existing TypeGraph entity ID. */ + id?: string | undefined + /** Deterministic identifier lookup. Takes priority over name/fuzzy matching. */ + externalId?: ExternalId | undefined + /** Deterministic identifiers to attach or use for lookup. */ + externalIds?: ExternalId[] | undefined + /** Entity name. Required when the reference must create a new entity. 
*/ + name?: string | undefined + entityType?: string | undefined + aliases?: string[] | undefined + description?: string | undefined + properties?: Record | undefined + visibility?: Visibility | undefined +} + +export interface UpsertGraphEntityInput extends typegraphIdentity { + id?: string | undefined + name: string + entityType?: string | undefined + aliases?: string[] | undefined + description?: string | undefined + properties?: Record | undefined + externalIds?: ExternalId[] | undefined + visibility?: Visibility | undefined +} + +export interface UpsertGraphEdgeInput extends typegraphIdentity { + /** Entity ref. A bare string reuses an existing entity ID when found, otherwise seeds by name. */ + source: GraphEntityRef | string + /** Entity ref. A bare string reuses an existing entity ID when found, otherwise seeds by name. */ + target: GraphEntityRef | string + relation: string + weight?: number | undefined + properties?: Record | undefined + description?: string | undefined + evidenceText?: string | undefined + temporalStatus?: PredicateTemporalStatus | undefined + validFrom?: string | undefined + validTo?: string | undefined + sourceChunkId?: string | undefined + visibility?: Visibility | undefined +} + +export interface UpsertGraphFactInput extends typegraphIdentity { + /** Entity ref. A bare string reuses an existing entity ID when found, otherwise seeds by name. */ + source: GraphEntityRef | string + /** Entity ref. A bare string reuses an existing entity ID when found, otherwise seeds by name. */ + target: GraphEntityRef | string + relation: string + factText?: string | undefined + description?: string | undefined + evidenceText?: string | undefined + temporalStatus?: PredicateTemporalStatus | undefined + validFrom?: string | undefined + validTo?: string | undefined + sourceChunkId?: string | undefined + confidence?: number | undefined + properties?: Record | undefined + visibility?: Visibility | undefined +} + +export interface MergeGraphEntitiesInput extends typegraphIdentity { + sourceEntityId: string + targetEntityId: string + properties?: Record | undefined +} + +export interface MergeGraphEntitiesResult { + target: EntityDetail + sourceEntityId: string + targetEntityId: string + redirectedEdges: number + redirectedFacts: number + redirectedGraphEdges: number + movedMentions: number + movedExternalIds: number + removedSelfEdges: number +} + +export interface DeleteGraphEntityOpts extends typegraphIdentity { + mode?: 'invalidate' | 'purge' | undefined +} + +export interface DeleteGraphEntityResult { + entityId: string + mode: 'invalidate' | 'purge' + deletedEdges: number + deletedFacts: number + deletedGraphEdges: number + deletedMentions: number + deletedExternalIds: number +} + /** * Memory bridge — conversational memory operations (remember, recall, forget, correct). * Independent of the knowledge graph. Use this when you only need memory without entity graphs. @@ -45,39 +162,39 @@ export interface MemoryBridge { deploy?(): Promise /** Store a memory. LLM extracts triples → memory record. */ - remember(content: string, opts: RememberOpts): Promise + remember(content: string, opts?: RememberOpts | null): Promise /** Invalidate a memory. Caller must prove ownership via identity. */ - forget(id: string, opts: ForgetOpts): Promise + forget(id: string, opts?: ForgetOpts | null): Promise /** Apply a natural language correction (e.g., "Actually, Alice works at Beta Inc now"). 
*/ - correct(correction: string, opts: CorrectOpts): Promise<{ invalidated: number; created: number; summary: string }> + correct(correction: string, opts?: CorrectOpts | null): Promise<{ invalidated: number; created: number; summary: string }> /** Ingest a conversation turn with extraction. */ addConversationTurn( messages: Array<{ role: string; content: string; timestamp?: Date }>, - opts: AddConversationTurnOpts, + opts?: AddConversationTurnOpts | null, ): Promise /** Recall memories by semantic similarity. Returns a formatted string when `format` is set. */ recall(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - recall(query: string, opts: RecallOpts): Promise + recall(query: string, opts?: RecallOpts | null): Promise /** Recall memories using hybrid search (vector + BM25 keyword). * When the memory store supports it, uses RRF to fuse vector and keyword results. * Falls back to vector-only recall if not implemented. */ recallHybrid?(query: string, opts: RecallOpts & { format: 'xml' | 'markdown' | 'plain' }): Promise - recallHybrid?(query: string, opts: RecallOpts): Promise + recallHybrid?(query: string, opts?: RecallOpts | null): Promise /** Get memory system health statistics. */ - healthCheck?(opts?: HealthCheckOpts): Promise + healthCheck?(opts?: HealthCheckOpts | null): Promise /** Check if the memory store has any active memories. Used to skip memory runner when empty. */ hasMemories?(): Promise } /** - * Knowledge graph bridge — entity-relationship graph for document retrieval. + * Knowledge graph bridge — entity-relationship graph for source retrieval. * Stores entities and edges extracted during indexing, provides PPR-based retrieval. * Independent of conversational memory. */ @@ -85,7 +202,27 @@ export interface KnowledgeGraphBridge { /** Deploy graph tables (entities, edges). Called by typegraph.deploy() when graph is configured. */ deploy?(): Promise - /** Store an extracted triple in the entity graph. Used during document indexing. */ + /** Materialize the declared source subject and attach deterministic primary-source evidence to every chunk. */ + addSourceSubject?(input: { + subject: SourceSubject + bucketId: string + sourceId: string + embeddingModel: string + chunks: Array<{ + id?: string | undefined + chunkIndex: number + content: string + metadata?: Record | undefined + }> + tenantId?: string | undefined + groupId?: string | undefined + userId?: string | undefined + agentId?: string | undefined + conversationId?: string | undefined + visibility?: Visibility | undefined + }): Promise + + /** Store an extracted triple in the entity graph. Used during source indexing. */ addTriple?(triple: { subject: string subjectType?: string @@ -98,12 +235,15 @@ export interface KnowledgeGraphBridge { objectDescription?: string relationshipDescription?: string | undefined evidenceText?: string | undefined + temporalStatus?: PredicateTemporalStatus | undefined + validFrom?: string | undefined + validTo?: string | undefined sourceChunkId?: string | undefined confidence?: number content: string bucketId: string chunkIndex?: number - documentId?: string + sourceId?: string tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined @@ -113,6 +253,36 @@ export interface KnowledgeGraphBridge { metadata?: Record }): Promise + /** Create or update a deterministic developer-seeded entity. */ + upsertEntity?(input: UpsertGraphEntityInput): Promise + + /** Create or update many deterministic developer-seeded entities. 
*/ + upsertEntities?(inputs: UpsertGraphEntityInput[]): Promise + + /** Resolve an entity by TypeGraph ID, external ID, or scoped name lookup. */ + resolveEntity?(ref: GraphEntityRef | string, identity?: typegraphIdentity): Promise + + /** Attach deterministic external IDs to an existing entity. */ + linkExternalIds?(entityId: string, externalIds: ExternalId[], identity?: typegraphIdentity): Promise + + /** Merge a duplicate source entity into a surviving target entity and rewrite graph references. */ + mergeEntities?(input: MergeGraphEntitiesInput): Promise + + /** Invalidate or purge an entity and its graph references without deleting chunks/sources/memories. */ + deleteEntity?(entityId: string, opts?: DeleteGraphEntityOpts | null): Promise + + /** Create or update a deterministic developer-seeded edge. */ + upsertEdge?(input: UpsertGraphEdgeInput): Promise + + /** Create or update many deterministic developer-seeded edges. */ + upsertEdges?(inputs: UpsertGraphEdgeInput[]): Promise + + /** Create or update a developer-seeded fact and its backing edge/fact record. */ + upsertFact?(input: UpsertGraphFactInput): Promise + + /** Create or update many developer-seeded facts. */ + upsertFacts?(inputs: UpsertGraphFactInput[]): Promise + /** Store extracted entities and their source mentions even when no relationship was found. */ addEntityMentions?(mentions: Array<{ name: string @@ -122,7 +292,7 @@ export interface KnowledgeGraphBridge { content: string bucketId: string chunkIndex?: number | undefined - documentId?: string | undefined + sourceId?: string | undefined tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined @@ -133,58 +303,50 @@ export interface KnowledgeGraphBridge { confidence?: number | undefined }>): Promise - /** Persist graph passage nodes for indexed chunks. */ - upsertPassageNodes?(nodes: Array<{ - bucketId: string - documentId: string - chunkIndex: number - embeddingModel: string - contentHash: string - chunkId?: string | undefined - metadata?: Record | undefined - visibility?: import('./typegraph-document.js').Visibility | undefined - tenantId?: string | undefined - groupId?: string | undefined - userId?: string | undefined - agentId?: string | undefined - conversationId?: string | undefined - }>): Promise - /** Search entities for query seeding and graph exploration. */ searchEntities?(query: string, identity: typegraphIdentity, limit?: number): Promise /** Search persisted facts by semantic similarity. */ - searchFacts?(query: string, opts?: FactSearchOpts): Promise + searchFacts?(query: string, opts?: FactSearchOpts | null): Promise /** Explore a semantic subgraph using anchor resolution and predicate-first intent parsing. */ - explore?(query: string, opts?: GraphExploreOpts): Promise + explore?(query: string, opts?: GraphExploreOpts | null): Promise - /** Retrieve passages directly connected to an entity. */ - getPassagesForEntity?(entityId: string, opts?: { + /** Resolve entity/external-ID scope to concrete graph and chunk anchors. */ + resolveEntityScope?(scope: QueryEntityScope, identity: typegraphIdentity, opts?: { bucketIds?: string[] | undefined limit?: number | undefined - } & typegraphIdentity): Promise + } | null): Promise - /** Run heterogeneous graph traversal and return ranked passages. */ - searchGraphPassages?(query: string, identity: typegraphIdentity, opts?: GraphSearchOpts): Promise + /** Search direct facts/entities without graph traversal. 
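Developer seeding of facts follows the same ref contract described in the comments above. A minimal sketch, assuming a configured `typegraph.graph` bridge that implements these optional methods, that the resolved entity exposes an `id`, that the ref accepts an `externalIds` field, and that `'current'` is a valid `PredicateTemporalStatus` value (the enum is not shown in this hunk); the relation name and dates are illustrative.

```ts
declare const typegraph: any // configured TypeGraph client; construction is not shown in this diff

// Seed a fact between two entity refs. A bare string ref reuses an existing
// entity ID when found, otherwise it seeds a new entity by name.
await typegraph.graph.upsertFact({
  tenantId: 'acme',
  source: 'Pat Example',
  target: { name: 'Acme Corp', entityType: 'organization' }, // ref fields assumed from GraphEntityRef
  relation: 'works_at',                                      // illustrative relation
  factText: 'Pat Example works at Acme Corp',
  temporalStatus: 'current',                                 // assumed PredicateTemporalStatus value
  validFrom: '2024-01-01',
  confidence: 0.95,
})

// Resolve by deterministic external ID instead of name similarity.
const entity = await typegraph.graph.resolveEntity(
  { externalIds: [{ id: 'pat@example.com', type: 'email', identityType: 'user' }] },
  { tenantId: 'acme' },
)

// Attach an additional deterministic identifier to the resolved entity.
if (entity) {
  await typegraph.graph.linkExternalIds(
    entity.id,
    [{ id: 'U123', type: 'slack_user_id', identityType: 'user' }],
    { tenantId: 'acme' },
  )
}
```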
*/ + searchKnowledge?(query: string, identity: typegraphIdentity, opts?: KnowledgeSearchOpts | null): Promise + + /** Retrieve chunks directly connected to an entity. */ + getChunksForEntity?(entityId: string, opts?: ({ + bucketIds?: string[] | undefined + limit?: number | undefined + } & typegraphIdentity) | null): Promise + + /** Run heterogeneous graph traversal and return ranked chunks. */ + searchGraphChunks?(query: string, identity: typegraphIdentity, opts?: GraphSearchOpts | null): Promise /** Explain a heterogeneous graph query without changing retrieval behavior. */ - explainQuery?(query: string, opts?: GraphExplainOpts): Promise + explainQuery?(query: string, opts?: GraphExplainOpts | null): Promise - /** Backfill persisted passage nodes, passage-entity edges, and fact records from existing indexed graph data. */ - backfill?(identity: typegraphIdentity, opts?: GraphBackfillOpts): Promise + /** Backfill entity-chunk graph edges and fact records from existing indexed graph data. */ + backfill?(identity: typegraphIdentity, opts?: GraphBackfillOpts | null): Promise // ── Graph exploration methods ── /** Get a single entity by ID. */ - getEntity?(id: string, opts?: typegraphIdentity): Promise + getEntity?(id: string, opts?: typegraphIdentity | null): Promise /** Get edges for an entity. */ - getEdges?(entityId: string, opts?: { + getEdges?(entityId: string, opts?: ({ direction?: 'in' | 'out' | 'both' relation?: string limit?: number - } & typegraphIdentity): Promise + } & typegraphIdentity) | null): Promise /** Extract a subgraph around seed entities or a query. */ getSubgraph?(opts: SubgraphOpts): Promise @@ -206,6 +368,7 @@ export interface EntityResult { name: string entityType: string aliases: string[] + externalIds?: ExternalId[] | undefined /** Present when searched by query. */ similarity?: number | undefined /** Number of edges (degree centrality). 
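The lookup methods above differ mainly in whether they traverse the graph. A rough sketch, assuming a configured `typegraph.graph` bridge implements these optional methods; the query strings, bucket, and entity IDs are placeholders, and the payload of `getChunksForEntity` is not fully shown in this hunk.

```ts
declare const typegraph: any // configured TypeGraph client; construction is not shown in this diff

// Direct fact/entity lookup — no traversal, no chunk ranking.
const knowledge = await typegraph.graph.searchKnowledge(
  'who works at Acme Corp',
  { tenantId: 'acme' },
  { count: 10 },
)
console.log(knowledge.facts.length, knowledge.entities.length)

// Chunks directly connected to a single entity, limited to one bucket.
const chunks = await typegraph.graph.getChunksForEntity('ent_pat', {
  tenantId: 'acme',
  bucketIds: ['bkt_handbook'], // placeholder bucket
  limit: 5,
})
console.log(chunks)

// Heterogeneous traversal: ranked chunks plus the facts and entities that support them.
const traversal = await typegraph.graph.searchGraphChunks(
  'what did Pat ship last quarter',
  { tenantId: 'acme' },
  { count: 8, bucketIds: ['bkt_handbook'] },
)
console.log(traversal.results.length, traversal.facts.length)
```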
*/ @@ -258,23 +421,44 @@ export interface FactSearchOpts extends typegraphIdentity { limit?: number | undefined } +export interface EntityScopeResolution { + entityIds: string[] + chunkRefs: ChunkRef[] + warnings?: string[] | undefined +} + +export interface KnowledgeSearchOpts { + count?: number | undefined + signals?: Pick | undefined + entityScope?: QueryEntityScope | undefined + resolvedEntityIds?: string[] | undefined +} + +export interface KnowledgeSearchResult { + facts: FactResult[] + entities: EntityResult[] +} + export interface GraphExploreOptions { + intentParser?: GraphIntentParserMode | undefined include?: { entities?: boolean | undefined facts?: boolean | undefined - passages?: boolean | undefined + chunks?: boolean | undefined } | undefined bucketIds?: string[] | undefined anchorLimit?: number | undefined entityLimit?: number | undefined factLimit?: number | undefined - passageLimit?: number | undefined + chunkLimit?: number | undefined depth?: 1 | 2 | undefined explain?: boolean | undefined } export type GraphExploreOpts = GraphExploreOptions & typegraphIdentity & TelemetryOpts +export type GraphIntentParserMode = 'deterministic' | 'llm' | 'none' + export interface GraphQueryIntentPredicate { name: string confidence: number @@ -286,29 +470,33 @@ export interface GraphQueryIntent { sourceEntityQueries: string[] targetEntityQueries: string[] predicates: GraphQueryIntentPredicate[] - answerSide: 'source' | 'target' | 'either' | 'none' subqueries: string[] mode: 'fact' | 'relationship' | 'summary' | 'creative' + strictness: 'strict' | 'soft' | 'none' } export type GraphExploreIntentPredicate = GraphQueryIntentPredicate export type GraphExploreIntent = GraphQueryIntent export interface ParsedGraphQueryIntent { - parser: 'llm' | 'none' - fallbackUsed: false + parser: 'deterministic' | 'llm' | 'none' intent: GraphQueryIntent + matchedPatterns?: string[] | undefined + rejectedPredicates?: string[] | undefined + parseMs?: number | undefined } export interface GraphExploreTrace { - parser: 'llm' | 'none' - fallbackUsed: false + parser: 'deterministic' | 'llm' | 'none' mode: GraphQueryIntent['mode'] - answerSide: GraphQueryIntent['answerSide'] + strictness: GraphQueryIntent['strictness'] selectedPredicates: string[] sourceEntityQueries: string[] targetEntityQueries: string[] subqueries: string[] + intentParseMs?: number | undefined + intentMatchedPatterns?: string[] | undefined + rejectedPredicates?: string[] | undefined anchorCandidates: EntityResult[] selectedAnchorIds: string[] matchedEdgeIds: string[] @@ -323,16 +511,12 @@ export interface GraphExploreResult { anchors: EntityResult[] entities: EntityResult[] facts: FactResult[] - passages?: PassageResult[] | undefined + chunks?: ChunkResult[] | undefined trace?: GraphExploreTrace | undefined } -export interface PassageResult { - passageId: string +export interface ChunkResult extends ChunkRef { content: string - bucketId: string - documentId: string - chunkIndex: number totalChunks?: number | undefined score: number metadata?: Record | undefined @@ -346,16 +530,19 @@ export interface PassageResult { export type GraphSearchProfile = 'fact-filtered-narrow' export interface GraphSearchOpts { + intentParser?: GraphIntentParserMode | undefined profile?: GraphSearchProfile | undefined count?: number | undefined bucketIds?: string[] | undefined + entityScope?: QueryEntityScope | undefined + resolvedEntityIds?: string[] | undefined restartProbability?: number | undefined - passageSeedWeight?: number | undefined + chunkSeedWeight?: number 
| undefined entitySeedWeight?: number | undefined factCandidateLimit?: number | undefined factFilterInputLimit?: number | undefined factSeedLimit?: number | undefined - passageSeedLimit?: number | undefined + chunkSeedLimit?: number | undefined maxExpansionEdgesPerEntity?: number | undefined maxPprIterations?: number | undefined minPprScore?: number | undefined @@ -374,10 +561,13 @@ export type GraphExplainOpts = GraphSearchOpts & typegraphIdentity export interface GraphSearchTrace { intent?: GraphQueryIntent | undefined - parser?: 'llm' | 'none' | undefined + parser?: 'deterministic' | 'llm' | 'none' | undefined + intentParseMs?: number | undefined + intentMatchedPatterns?: string[] | undefined + rejectedPredicates?: string[] | undefined entitySeedCount: number factSeedCount: number - passageSeedCount: number + chunkSeedCount: number graphNodeCount: number graphEdgeCount: number pprNonzeroCount: number @@ -386,15 +576,15 @@ export interface GraphSearchTrace { topGraphScores: number[] selectedFactIds: string[] selectedEntityIds: string[] - selectedPassageIds: string[] - finalPassageIds?: string[] | undefined + selectedChunkIds: string[] + finalChunkIds?: string[] | undefined selectedFactTexts?: Array<{ id: string; content: string }> | undefined selectedEntityNames?: Array<{ id: string; content: string }> | undefined selectedFactChains?: Array<{ content: string; score: number; factIds: string[] }> | undefined } export interface GraphSearchResult { - results: PassageResult[] + results: ChunkResult[] facts: FactResult[] entities: EntityResult[] factChains?: FactChainResult[] | undefined @@ -404,15 +594,13 @@ export interface GraphSearchResult { export interface GraphBackfillOpts { bucketIds?: string[] | undefined batchSize?: number | undefined - passages?: boolean | undefined - passageEntityEdges?: boolean | undefined + entityChunkEdges?: boolean | undefined facts?: boolean | undefined entityProfiles?: boolean | undefined } export interface GraphBackfillResult { - passageNodesUpserted: number - passageEntityEdgesUpserted: number + entityChunkEdgesUpserted: number factRecordsUpserted: number entityProfilesUpdated: number batches: number diff --git a/packages/sdk/src/types/index-types.ts b/packages/sdk/src/types/index-types.ts index 3e154a1..4fef343 100644 --- a/packages/sdk/src/types/index-types.ts +++ b/packages/sdk/src/types/index-types.ts @@ -1,5 +1,5 @@ -import type { RawDocument } from './connector.js' -import type { Visibility } from './typegraph-document.js' +import type { SourceInput } from './connector.js' +import type { Visibility } from './source.js' /** * Options for an ingest() call. @@ -32,12 +32,12 @@ export interface IngestOptions { chunkSize?: number | undefined chunkOverlap?: number | undefined - // Document properties (bucket-mergeable) - /** Access visibility for documents from this ingest call. */ + // Source properties (bucket-mergeable) + /** Access visibility for sources from this ingest call. */ visibility?: Visibility | undefined // Processing (bucket-mergeable) - deduplicateBy?: string[] | ((doc: RawDocument) => string) | undefined + deduplicateBy?: string[] | ((source: SourceInput) => string) | undefined propagateMetadata?: string[] | undefined /** If true, strip markdown syntax from chunk content before embedding. Original content is stored as-is. 
*/ stripMarkdownForEmbedding?: boolean | undefined @@ -56,15 +56,15 @@ export interface IngestOptions { removeDeleted?: boolean | undefined dryRun?: boolean | undefined /** - * Controls inter-document parallelism inside a single `ingest()` call. - * A semaphore bounds how many documents run their storage phase concurrently. + * Controls inter-source parallelism inside a single `ingest()` call. + * A semaphore bounds how many sources run their storage phase concurrently. * Graph extraction is currently serialized even when this value is higher. * * Does NOT affect: * - Embedding batching. All chunks in the batch are sent to `embedBatch` * in a single call regardless of this value. - * - Intra-document chunk processing. Chunks are always sequential within - * a single document so cross-chunk entity context can accumulate. + * - Intra-source chunk processing. Chunks are always sequential within + * a single source so cross-chunk entity context can accumulate. * * Default: 1 (sequential). Raise it to speed up vector-only indexing at the * cost of provider/database pressure. @@ -127,7 +127,7 @@ export interface IndexResult { } export interface ExtractionFailure { - documentId: string + sourceId: string chunkIndex: number reason: 'timeout' | 'error' message?: string diff --git a/packages/sdk/src/types/index.ts b/packages/sdk/src/types/index.ts index c61e464..a2efb66 100644 --- a/packages/sdk/src/types/index.ts +++ b/packages/sdk/src/types/index.ts @@ -1,5 +1,6 @@ export type { - RawDocument, + SourceInput, + SourceSubject, ChunkOpts, Chunk, } from './connector.js' @@ -15,9 +16,10 @@ export type { export type { EmbeddedChunk, + ChunkRef, ChunkFilter, ScoredChunk, -} from './document.js' +} from './chunk.js' export type { SearchOpts, @@ -25,7 +27,7 @@ export type { HashStoreAdapter, VectorStoreAdapter, UndeployResult, - ScoredChunkWithDocument, + ScoredChunkWithSource, } from './adapter.js' export type { @@ -40,6 +42,7 @@ export type { QueryContextStats, RawScores, NormalizedScores, + QueryEntityScope, QueryOpts, QueryResponse, } from './query.js' @@ -54,13 +57,13 @@ export type { export { IndexError } from './index-types.js' export type { - typegraphDocument, - DocumentStatus, + typegraphSource, + SourceStatus, Visibility, - DocumentFilter, - UpsertDocumentInput, - UpsertedDocumentRecord, -} from './typegraph-document.js' + SourceFilter, + UpsertSourceInput, + UpsertedSourceRecord, +} from './source.js' export type { typegraphHooks } from './hooks.js' @@ -71,6 +74,18 @@ export type { typegraphIdentity } from './identity.js' export type { MemoryBridge, KnowledgeGraphBridge, + MemorySubject, + GraphEntityRef, + UpsertGraphEntityInput, + UpsertGraphEdgeInput, + UpsertGraphFactInput, + MergeGraphEntitiesInput, + MergeGraphEntitiesResult, + DeleteGraphEntityOpts, + DeleteGraphEntityResult, + EntityScopeResolution, + KnowledgeSearchOpts, + KnowledgeSearchResult, EntityResult, EntityDetail, EdgeResult, @@ -85,7 +100,8 @@ export type { GraphBackfillOpts, GraphBackfillResult, GraphExplainOpts, - PassageResult, + ChunkResult, + GraphIntentParserMode, GraphSearchProfile, GraphSearchOpts, GraphSearchTrace, @@ -129,7 +145,12 @@ export type { ConversationTurnResult, MemoryHealthReport, } from './memory.js' -export type { MemoryRecord } from '../memory/types/memory.js' +export type { + ExternalId, + ExternalIdIdentityType, + ExternalIdEncoding, + MemoryRecord, +} from '../memory/types/memory.js' export type { typegraphLogger } from './logger.js' diff --git a/packages/sdk/src/types/policy.ts 
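A compact sketch of the renamed, source-centric ingest options and the per-source failure shape. The package import path is an assumption, the dedup field and metadata key are illustrative, and only options visible in this hunk are used.

```ts
// Import path assumed — the SDK's published entry point is not shown in this diff.
import type { IngestOptions, ExtractionFailure } from '@typegraph/sdk'

// Source-centric ingest options: visibility applies to every source in the call,
// and deduplicateBy accepts either a field list or a callback over each SourceInput.
const opts: IngestOptions = {
  visibility: 'tenant',
  deduplicateBy: ['url'],          // illustrative dedup field
  propagateMetadata: ['author'],   // illustrative metadata key
  stripMarkdownForEmbedding: true, // embed cleaned text; stored content stays as-is
}

// Extraction failures are reported per source/chunk rather than thrown.
function logExtractionFailures(failures: ExtractionFailure[]): void {
  for (const f of failures) {
    console.warn(`extraction ${f.reason} on ${f.sourceId} chunk ${f.chunkIndex}: ${f.message ?? ''}`)
  }
}
```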
b/packages/sdk/src/types/policy.ts index b98df44..2d106fa 100644 --- a/packages/sdk/src/types/policy.ts +++ b/packages/sdk/src/types/policy.ts @@ -10,7 +10,7 @@ export type PolicyAction = | 'memory.write' | 'memory.read' | 'memory.delete' - | 'document.delete' + | 'source.delete' | 'bucket.delete' export interface PolicyRule { diff --git a/packages/sdk/src/types/query.ts b/packages/sdk/src/types/query.ts index 761920c..cfbb14f 100644 --- a/packages/sdk/src/types/query.ts +++ b/packages/sdk/src/types/query.ts @@ -1,5 +1,7 @@ import type { EntityResult, FactResult, GraphSearchOpts, GraphSearchTrace } from './graph-bridge.js' import type { MemoryRecord } from '../memory/types/memory.js' +import type { ExternalId } from '../memory/types/memory.js' +import type { SourceSubject } from './connector.js' export type QueryGraphOptions = GraphSearchOpts @@ -76,7 +78,7 @@ export interface QueryChunkResult { /** Which retrieval systems contributed to this result (e.g. ["semantic"], ["semantic", "graph"]) */ sources: string[] - document: { + source: { id: string bucketId: string title: string @@ -89,6 +91,7 @@ export interface QueryChunkResult { userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined + subject?: SourceSubject | undefined } chunk: { @@ -118,6 +121,12 @@ export interface QueryResults { graphTrace?: GraphSearchTrace | undefined } +export interface QueryEntityScope { + entityIds?: string[] | undefined + externalIds?: ExternalId[] | undefined + mode?: 'filter' | 'boost' | undefined +} + export interface QueryOpts { /** Which retrieval signals to activate. Default: { semantic: true } (semantic-only search). */ signals?: QuerySignals | undefined @@ -130,8 +139,10 @@ export interface QueryOpts { userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - /** Filter results by document-level fields (status, scope, type, etc.). */ - documentFilter?: import('./typegraph-document.js').DocumentFilter | undefined + /** Filter results by source-level fields (status, scope, type, etc.). */ + sourceFilter?: import('./source.js').SourceFilter | undefined + /** Relevance scope by TypeGraph entity IDs or deterministic external IDs. */ + entityScope?: QueryEntityScope | undefined /** Override composite score weights. Keys are signal names; values are 0-1 weights. * When omitted, defaults are derived from active signals. */ diff --git a/packages/sdk/src/types/typegraph-document.ts b/packages/sdk/src/types/source.ts similarity index 64% rename from packages/sdk/src/types/typegraph-document.ts rename to packages/sdk/src/types/source.ts index a1c8770..404b3ac 100644 --- a/packages/sdk/src/types/typegraph-document.ts +++ b/packages/sdk/src/types/source.ts @@ -1,18 +1,20 @@ -export type DocumentStatus = 'pending' | 'processing' | 'complete' | 'failed' +import type { SourceSubject } from './connector.js' + +export type SourceStatus = 'pending' | 'processing' | 'complete' | 'failed' /** Who can access this record. Defines the narrowest identity level that grants access. */ export type Visibility = 'tenant' | 'group' | 'user' | 'agent' | 'conversation' -export interface typegraphDocument { +export interface typegraphSource { /** UUID primary key. */ id: string - /** The typegraph source that produced this document. */ + /** The bucket that owns this source. */ bucketId: string /** Multi-tenant isolation. Maps to organization_id in many apps. */ tenantId?: string | undefined /** Team, channel, or project. 
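The new `entityScope` and `sourceFilter` query options are easiest to see in a call. A sketch under these assumptions: the SDK exposes a top-level `typegraph.query(...)` that accepts `QueryOpts`, `graph` is a valid signal name alongside the documented `semantic`, and `mode: 'boost'` re-ranks rather than restricts. The helper at the end only touches fields declared in `QueryChunkResult` above.

```ts
import type { QueryChunkResult } from '@typegraph/sdk' // import path assumed

declare const typegraph: any // configured TypeGraph client; construction is not shown in this diff

// Scope retrieval to a person identified by a deterministic external ID,
// and filter to fully indexed sources.
const response = await typegraph.query('outstanding invoices for Pat', {
  tenantId: 'acme',
  signals: { semantic: true, graph: true }, // 'graph' signal name assumed
  entityScope: {
    externalIds: [{ id: 'pat@example.com', type: 'email', identityType: 'user' }],
    mode: 'boost',                          // presumably re-ranks; 'filter' restricts
  },
  sourceFilter: { status: 'complete', graphExtracted: true },
})
console.log(response)

// Each chunk result now exposes its owning source (formerly `document`),
// including the optional declared subject.
function describeHit(hit: QueryChunkResult): string {
  const about = hit.source.subject ? ` about ${JSON.stringify(hit.source.subject)}` : ''
  return `${hit.source.title}${about} via ${hit.sources.join('+')}`
}
```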
*/ groupId?: string | undefined - /** Owner/creator of the document. */ + /** Owner/creator of the source. */ userId?: string | undefined /** Specific agent instance. */ agentId?: string | undefined @@ -23,16 +25,16 @@ export interface typegraphDocument { /** SHA256 of raw content at index time. Used for change detection. */ contentHash: string chunkCount: number - status: DocumentStatus + status: SourceStatus /** - * Access visibility. Controls which queries can see this document. + * Access visibility. Controls which queries can see this source. * `undefined`/NULL means public — visible to any query, including unscoped ones. * A value of `'tenant' | 'group' | 'user' | 'agent' | 'conversation'` restricts * access to queries that supply a matching identity at that level. */ visibility?: Visibility | undefined /** - * Whether triple extraction was run against this document during ingestion. + * Whether triple extraction was run against this source during ingestion. * Reflects "we ran extraction", not "extraction found entities" — partial failures * still count as true. See IndexResult.extraction for success/failure breakdown. */ @@ -41,29 +43,31 @@ export interface typegraphDocument { createdAt: Date updatedAt: Date metadata: Record + /** Optional semantic entity this source is primary evidence for. */ + subject?: SourceSubject | undefined } -export interface UpsertedDocumentRecord extends typegraphDocument { - /** True when the document row was inserted, false when an existing canonical row was updated. */ +export interface UpsertedSourceRecord extends typegraphSource { + /** True when the source row was inserted, false when an existing canonical row was updated. */ wasCreated?: boolean | undefined } -export interface DocumentFilter { +export interface SourceFilter { bucketId?: string | undefined tenantId?: string | undefined groupId?: string | undefined userId?: string | undefined agentId?: string | undefined conversationId?: string | undefined - status?: DocumentStatus | DocumentStatus[] | undefined + status?: SourceStatus | SourceStatus[] | undefined visibility?: Visibility | Visibility[] | undefined - documentIds?: string[] | undefined - /** Filter documents by whether triple extraction ran during ingestion. */ + sourceIds?: string[] | undefined + /** Filter sources by whether triple extraction ran during ingestion. */ graphExtracted?: boolean | undefined } -export interface UpsertDocumentInput { - /** Prefixed document ID (e.g. doc_550e8400...). Must be provided by caller. */ +export interface UpsertSourceInput { + /** Prefixed source ID (e.g. src_550e8400...). Must be provided by caller. */ id: string bucketId: string tenantId?: string | undefined @@ -75,9 +79,10 @@ export interface UpsertDocumentInput { url?: string | undefined contentHash: string chunkCount: number - status: DocumentStatus + status: SourceStatus visibility?: Visibility | undefined - /** Whether triple extraction ran against this document. Defaults to false. */ + /** Whether triple extraction ran against this source. Defaults to false. 
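A small sketch of the renamed source-level filter, using only fields declared above; the bucket ID is a placeholder and the import path is an assumption.

```ts
import type { SourceFilter } from '@typegraph/sdk' // import path assumed

// Only sources that finished (or are still) processing, restricted to tenant/group
// visibility, and only where triple extraction actually ran during ingestion.
const filter: SourceFilter = {
  bucketId: 'bkt_handbook', // placeholder bucket
  tenantId: 'acme',
  status: ['complete', 'processing'],
  visibility: ['tenant', 'group'],
  graphExtracted: true,
}
```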
*/ graphExtracted?: boolean | undefined metadata?: Record | undefined + subject?: SourceSubject | undefined } diff --git a/packages/sdk/src/utils/id.ts b/packages/sdk/src/utils/id.ts index b32a53f..c3a56d2 100644 --- a/packages/sdk/src/utils/id.ts +++ b/packages/sdk/src/utils/id.ts @@ -5,7 +5,7 @@ import { createHash, randomUUID } from 'crypto' * * Prefixes: * - `bkt_` — Bucket - * - `doc_` — Document + * - `src_` — Source * - `chk_` — Chunk * - `mem_` — Memory record (episodic) * - `fact_` — Semantic fact @@ -27,7 +27,7 @@ export interface ChunkIdInput { } /** - * Generate the stable chunk id used by vector rows and graph passage nodes. + * Generate the stable chunk id used by vector rows and graph chunk endpoints. * * This keeps chunk identity in the SDK instead of letting each adapter invent * storage-local ids that graph code cannot know about. diff --git a/packages/sdk/src/utils/input.ts b/packages/sdk/src/utils/input.ts new file mode 100644 index 0000000..b30120d --- /dev/null +++ b/packages/sdk/src/utils/input.ts @@ -0,0 +1,87 @@ +import type { typegraphIdentity } from '../types/identity.js' +import { ConfigError } from '../types/errors.js' + +export type Nullable = T | null | undefined + +export function optionalObject( + value: Nullable, + method: string, + param: string = 'opts', +): T { + if (value == null) return {} as T + if (typeof value !== 'object' || Array.isArray(value)) { + throw new ConfigError(`${method} ${param} must be an object when provided.`) + } + return value +} + +export function requiredObject( + value: Nullable, + method: string, + param: string, +): T { + if (value == null) { + throw new ConfigError(`${method} ${param} is required.`) + } + if (typeof value !== 'object' || Array.isArray(value)) { + throw new ConfigError(`${method} ${param} must be an object.`) + } + return value +} + +export function compactObject(value: T): Partial { + const output: Record = {} + for (const [key, entry] of Object.entries(value as Record)) { + if (entry !== undefined && entry !== null) output[key] = entry + } + return output as Partial +} + +export function optionalCompactObject( + value: Nullable, + method: string, + param: string = 'opts', +): Partial { + return compactObject(optionalObject(value, method, param)) +} + +export function compactIdentity(value: Nullable): typegraphIdentity { + const identity = optionalObject(value, 'identity', 'identity') + return compactObject({ + tenantId: identity.tenantId, + groupId: identity.groupId, + userId: identity.userId, + agentId: identity.agentId, + conversationId: identity.conversationId, + agentName: identity.agentName, + agentDescription: identity.agentDescription, + agentVersion: identity.agentVersion, + }) +} + +export function withDefaultTenant( + opts: Nullable, + tenantId: string | undefined, + method: string, +): T { + const normalized = optionalCompactObject(opts, method) as T + if (normalized.tenantId === undefined && tenantId !== undefined) { + return { ...normalized, tenantId } + } + return normalized +} + +export function hasMeaningfulFilter(value: object): boolean { + for (const entry of Object.values(value as Record)) { + if (entry == null) continue + if (Array.isArray(entry) && entry.length === 0) continue + return true + } + return false +} + +export function assertHasMeaningfulFilter(value: object, method: string): void { + if (!hasMeaningfulFilter(value)) { + throw new ConfigError(`${method} requires at least one filter field.`) + } +} diff --git a/packages/vercel-ai-provider/src/middleware.ts 
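The new input helpers above are small enough that a usage sketch is mostly about showing the expected behavior. This one imports them by the path introduced in this diff; the calling method names passed as the `method` argument are illustrative.

```ts
import {
  withDefaultTenant,
  optionalCompactObject,
  assertHasMeaningfulFilter,
} from './input.js' // packages/sdk/src/utils/input.ts, as added above

// null/undefined opts become {}, null-ish fields are dropped, and the configured
// default tenant is applied only when the caller did not pass one.
const normalized = withDefaultTenant({ userId: 'u_1', limit: undefined }, 'acme', 'typegraph.recall')
// → { userId: 'u_1', tenantId: 'acme' }

// Filters must carry at least one meaningful (non-empty, non-null) field,
// otherwise assertHasMeaningfulFilter throws a ConfigError.
const filter = optionalCompactObject(
  { bucketId: 'bkt_handbook', status: undefined }, // placeholder filter
  'typegraph.listSources',
  'filter',
)
assertHasMeaningfulFilter(filter, 'typegraph.listSources')
```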
b/packages/vercel-ai-provider/src/middleware.ts
index 119012b..ce79a82 100644
--- a/packages/vercel-ai-provider/src/middleware.ts
+++ b/packages/vercel-ai-provider/src/middleware.ts
@@ -64,7 +64,7 @@ export function typegraphMemoryMiddleware(memory: TypegraphMemory, opts: MemoryM
       messages: { role: 'user' | 'assistant'; content: string }[],
       conversationId?: string,
     ): Promise<void> {
-      await memory.addConversationTurn(messages, conversationId)
+      await memory.addConversationTurn(messages, { conversationId })
     },
   }
 }
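To close the loop on the middleware change: `addConversationTurn` now takes an options object instead of a bare conversation ID. A minimal sketch of calling it directly, assuming the memory bridge surfaces this method on the configured client; only `conversationId` is passed because the other `AddConversationTurnOpts` fields are not shown in this diff.

```ts
declare const typegraph: any // configured TypeGraph client; construction is not shown in this diff

// The second argument is now an options object rather than a bare conversation ID string.
await typegraph.addConversationTurn(
  [
    { role: 'user', content: 'Can you move our weekly sync to Thursday?' },
    { role: 'assistant', content: 'Done, the weekly sync now runs on Thursdays.' },
  ],
  { conversationId: 'conv_123' }, // placeholder conversation ID
)
```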