diff --git a/crates/sparrowdb-execution/src/engine/mutation.rs b/crates/sparrowdb-execution/src/engine/mutation.rs index f872e0d..039dcb7 100644 --- a/crates/sparrowdb-execution/src/engine/mutation.rs +++ b/crates/sparrowdb-execution/src/engine/mutation.rs @@ -27,13 +27,7 @@ impl Engine { return Ok(vec![]); } let node_pat = &pat.nodes[0]; - let label = node_pat.labels.first().cloned().unwrap_or_default(); - - let label_id = match self.snapshot.catalog.get_label(&label)? { - Some(id) => id as u32, - // SPA-266: unknown label → no nodes can match; return empty result. - None => return Ok(vec![]), - }; + let label_opt = node_pat.labels.first().cloned(); // Col_ids referenced by the WHERE clause (needed for WHERE evaluation // even after the index narrows candidates by inline prop filter). @@ -48,25 +42,46 @@ impl Engine { let var_name = node_pat.var.as_str(); - // Use the property index for O(1) equality lookups on inline prop - // filters, falling back to full scan for overflow strings / params. - let candidates = self.scan_nodes_for_label_with_index(label_id, &node_pat.props)?; + // Build the list of label_ids to scan. For anonymous patterns + // (`MATCH (n) SET n.prop = $val`) there is no label constraint, so we + // scan every label known to the catalog. For labelled patterns we + // resolve the single label (SPA-266: unknown label → empty result). + let label_ids: Vec = match label_opt { + Some(ref label) => match self.snapshot.catalog.get_label(label)? { + Some(id) => vec![id as u32], + // SPA-266: unknown label → no nodes can match; return empty result. + None => return Ok(vec![]), + }, + None => self + .snapshot + .catalog + .list_labels()? + .into_iter() + .map(|(id, _)| id as u32) + .collect(), + }; let mut matching_ids = Vec::new(); - for node_id in candidates { - // Re-read props needed for WHERE clause evaluation. - if mm.where_clause.is_some() { - let props = read_node_props(&self.snapshot.store, node_id, &where_col_ids)?; - if let Some(ref where_expr) = mm.where_clause { - let mut row_vals = - build_row_vals(&props, var_name, &where_col_ids, &self.snapshot.store); - row_vals.extend(self.dollar_params()); - if !self.eval_where_graph(where_expr, &row_vals) { - continue; + for label_id in label_ids { + // Use the property index for O(1) equality lookups on inline prop + // filters, falling back to full scan for overflow strings / params. + let candidates = self.scan_nodes_for_label_with_index(label_id, &node_pat.props)?; + + for node_id in candidates { + // Re-read props needed for WHERE clause evaluation. + if mm.where_clause.is_some() { + let props = read_node_props(&self.snapshot.store, node_id, &where_col_ids)?; + if let Some(ref where_expr) = mm.where_clause { + let mut row_vals = + build_row_vals(&props, var_name, &where_col_ids, &self.snapshot.store); + row_vals.extend(self.dollar_params()); + if !self.eval_where_graph(where_expr, &row_vals) { + continue; + } } } + matching_ids.push(node_id); } - matching_ids.push(node_id); } Ok(matching_ids) diff --git a/crates/sparrowdb-node/src/lib.rs b/crates/sparrowdb-node/src/lib.rs index 4fd8229..04fd431 100644 --- a/crates/sparrowdb-node/src/lib.rs +++ b/crates/sparrowdb-node/src/lib.rs @@ -39,6 +39,32 @@ fn to_napi_typed(e: sparrowdb::Error) -> napi::Error { } } +// ── Cypher identifier validation ───────────────────────────────────────────── + +/// Validate that `s` is a safe Cypher identifier: `[A-Za-z_][A-Za-z0-9_]*`. +/// +/// Labels and relationship types that are user-supplied must be validated +/// before being interpolated into Cypher query strings to prevent injection. +/// Returns `Ok(())` when valid, or a `TypeError` napi::Error otherwise. +fn validate_cypher_identifier(s: &str) -> napi::Result<()> { + let valid = !s.is_empty() + && s.chars() + .next() + .is_some_and(|c| c.is_ascii_alphabetic() || c == '_') + && s.chars().all(|c| c.is_ascii_alphanumeric() || c == '_'); + if valid { + Ok(()) + } else { + Err(napi::Error::new( + napi::Status::InvalidArg, + format!( + "TypeError: invalid Cypher identifier '{s}'; \ + must match [A-Za-z_][A-Za-z0-9_]*" + ), + )) + } +} + // ── JSON → engine Value conversion ──────────────────────────────────────────── /// Convert a `serde_json::Value` (received from JS via napi-rs's `serde-json` @@ -357,6 +383,144 @@ impl SparrowDB { .map_err(to_napi_typed) } + /// Directly insert a vector into the HNSW index for an existing node. + /// + /// This is the explicit backfill API requested by KMSmcp (ch #202). It is + /// useful when nodes were created without an inline embedding and you want to + /// populate the HNSW index without DETACH DELETE + recreate. + /// + /// - `label` — node label (e.g. `"Memory"`) + /// - `property` — the vector property name (e.g. `"embedding"`) + /// - `node_id` — the **user-facing** `id` property value (string), not the + /// internal u64 slot number. + /// - `vector` — Float32Array of `dimensions` elements. + /// + /// Throws `RangeError` if no vector index exists for `(label, property)`. + /// Throws `TypeError` if `vector.length` does not match the index dimensions. + /// Throws `RangeError` if no node with `id = node_id` is found under `label`. + /// + /// ```typescript + /// db.addToVectorIndex('Memory', 'embedding', 'node-uuid-here', myFloat32Array) + /// ``` + #[napi] + pub fn add_to_vector_index( + &self, + label: String, + property: String, + node_id: String, + vector: Float32Array, + ) -> napi::Result<()> { + // 0. Validate label is a safe Cypher identifier before interpolation + // (prevents Cypher injection — label is user-controlled input). + // Accepted: [A-Za-z_][A-Za-z0-9_]* + validate_cypher_identifier(&label)?; + + // 1. Resolve (label, property) → HNSW index arc. + let arc = self + .inner + .get_vector_index(&label, &property) + .ok_or_else(|| { + napi::Error::new( + napi::Status::GenericFailure, + format!( + "RangeError: no vector index on ({label}, {property}); \ + call createVectorIndex first" + ), + ) + })?; + + // 2. Validate dimensionality before any write. + let vec_slice = vector.as_ref(); + { + let idx = arc + .read() + .map_err(|e| to_napi(format!("lock poisoned: {e}")))?; + if vec_slice.len() != idx.dimensions { + return Err(napi::Error::new( + napi::Status::InvalidArg, + format!( + "TypeError: vector has {} dimensions but the index expects {}", + vec_slice.len(), + idx.dimensions + ), + )); + } + } + + // 3+4. Hold the database write lock across the node lookup and HNSW + // insert+save so that a concurrent delete_node cannot create an + // orphaned HNSW entry, and a concurrent drop_vector_index cannot + // be silently undone by our save(). + // + // We acquire the lock via begin_write() (same discipline as other + // durable write paths). We do NOT commit the WriteTx — dropping + // it releases the lock with no disk writes (HNSW save is handled + // separately below, outside the WAL). + let _write_guard = self + .inner + .begin_write() + .map_err(|e| to_napi(format!("WriterBusy: cannot acquire write lock: {e}")))?; + + // 3. Resolve the user-facing string id to the internal u64 node slot. + // We do this with a simple MATCH+RETURN id(n) query so we don't + // need to replicate the id-property lookup logic inline. + let cypher = format!("MATCH (n:{label} {{id: $id}}) RETURN id(n) AS nid"); + let mut lookup_params = std::collections::HashMap::new(); + lookup_params.insert( + "id".to_string(), + sparrowdb_execution::Value::String(node_id.clone()), + ); + let result = self + .inner + .execute_with_params(&cypher, lookup_params) + .map_err(|e| to_napi(format!("{e}")))?; + + if result.rows.is_empty() { + return Err(napi::Error::new( + napi::Status::GenericFailure, + format!("RangeError: no node with id='{node_id}' found under label '{label}'"), + )); + } + + if result.rows.len() != 1 { + return Err(napi::Error::new( + napi::Status::GenericFailure, + format!( + "RangeError: expected exactly one node with id='{node_id}' under label \ + '{label}', found {} — use a unique id property", + result.rows.len() + ), + )); + } + + let internal_id: u64 = match &result.rows[0][0] { + sparrowdb_execution::Value::Int64(n) => *n as u64, + other => { + return Err(to_napi(format!( + "unexpected id(n) type from query: {other:?}" + ))); + } + }; + + // 4. Insert into HNSW and persist. + let db_path = self.inner.path(); + let vidx_dir = db_path.join("vector_indexes"); + { + let mut idx = arc + .write() + .map_err(|e| to_napi(format!("lock poisoned: {e}")))?; + idx.insert(internal_id, vec_slice); + idx.save(&vidx_dir, &label, &property) + .map_err(|e| to_napi(format!("HNSW save failed: {e}")))?; + } + + // Release write lock by dropping _write_guard (no commit — HNSW file + // was already persisted above; WAL does not track vector index files). + drop(_write_guard); + + Ok(()) + } + /// Search the HNSW vector index for `k` nearest neighbours. /// /// Returns results sorted by descending similarity score (highest first). @@ -1169,4 +1333,77 @@ mod tests { } } } + + // ── Regression: validate_cypher_identifier (Issue 1 — PR #410) ─────────── + + /// `validate_cypher_identifier` must reject strings that contain characters + /// outside [A-Za-z_][A-Za-z0-9_]* to prevent Cypher injection via the + /// `label` parameter of `addToVectorIndex`. + #[test] + fn validate_cypher_identifier_rejects_injection_strings() { + use super::validate_cypher_identifier; + + // Valid identifiers must pass. + assert!(validate_cypher_identifier("Memory").is_ok()); + assert!(validate_cypher_identifier("my_label").is_ok()); + assert!(validate_cypher_identifier("_Private").is_ok()); + assert!(validate_cypher_identifier("Label123").is_ok()); + + // Injection attempts must be rejected with TypeError. + let cases = [ + "Memory; DROP", + "Memory RETURN 1 //", + "Mem`ory", + "label-name", + "label name", + "123label", + "", + "label'", + ]; + for bad in &cases { + let err = validate_cypher_identifier(bad).expect_err(&format!("must reject '{bad}'")); + assert!( + err.to_string().contains("TypeError") + || err.to_string().contains("invalid Cypher identifier"), + "error for '{bad}' must mention TypeError, got: {}", + err + ); + } + } + + // ── Regression: non-unique id must error (Issue 3 — PR #410) ───────────── + + /// When two nodes share the same `id` property under the same label, + /// `add_to_vector_index` must return an error (not silently pick one). + /// + /// We test this through the engine layer (GraphDb) because the NAPI layer + /// is hard to call without a Node.js runtime. The duplicate-id guard lives + /// in the Rust code path that the NAPI binding calls. + #[test] + fn add_to_vector_index_duplicate_id_detected_via_engine() { + // We can't call the NAPI `add_to_vector_index` directly without a + // Node.js env. Instead, verify the Cypher query that the binding uses + // returns >1 row when two nodes share an id, proving the guard works. + let dir = tempfile::tempdir().expect("tempdir"); + let db = open_db(dir.path()); + + // Insert two Memory nodes with the same `id` property (intentionally + // duplicated to trigger the guard). + db.execute("CREATE (n:Memory {id: 'dup'})").expect("first"); + db.execute("CREATE (n:Memory {id: 'dup'})").expect("second"); + + // The MATCH query used by add_to_vector_index. + let mut params = HashMap::new(); + params.insert("id".into(), ExecValue::String("dup".into())); + let result = db + .execute_with_params("MATCH (n:Memory {id: $id}) RETURN id(n) AS nid", params) + .expect("query must succeed"); + + // Two rows → the binding's `rows.len() != 1` guard fires. + assert_eq!( + result.rows.len(), + 2, + "duplicate id must produce 2 rows, triggering the non-unique guard in the binding" + ); + } } diff --git a/crates/sparrowdb/src/db.rs b/crates/sparrowdb/src/db.rs index fef33da..3eb47d3 100644 --- a/crates/sparrowdb/src/db.rs +++ b/crates/sparrowdb/src/db.rs @@ -1477,12 +1477,16 @@ impl GraphDb { ) -> Result { let mut tx = self.begin_write()?; let csrs = self.cached_csr_map(); + // Pass runtime params to the engine so that $param expressions in + // WHERE clauses and inline prop filters are resolved during the scan + // (e.g. `MATCH (n) WHERE n.id = $id SET n.embedding = $emb`). let engine = Engine::new( NodeStore::open(&self.inner.path)?, self.catalog_snapshot(), csrs, &self.inner.path, - ); + ) + .with_params(params.clone()); // SPA-219: `MATCH (a)-[r:REL]->(b) DELETE r` — edge delete path. if is_edge_delete_mutation(mm) { @@ -1522,11 +1526,80 @@ impl GraphDb { } } } + // Vector index write-path: for each SET mutation whose value is a + // vector param, write to the HNSW index for every matched node. + // + // This block runs BEFORE tx.commit() so the property write and HNSW + // insert are atomic under the same WriteGuard held by `tx`. If + // idx.save() fails the transaction is not committed, keeping disk state + // consistent. + // + // This mirrors the MERGE write-path at db.rs:1396-1420. Without this + // block, `MATCH (n:L {id: $id}) SET n.embedding = $emb` would silently + // store the property but leave the HNSW file untouched — silent data + // loss surfaced by KMSmcp channel message #202. + // + // Label is derived per matched NodeId (upper 32 bits of node_id.0 + // encode the label_id) rather than from the AST. This correctly + // handles: + // • Anonymous matches (`MATCH (n) SET n.embedding = $emb`) where + // the AST carries no label — the label is resolved from the node. + // • Multi-pattern queries: each matched node's label is used, not + // the first MATCH pattern's label (multi-pattern is currently + // rejected by the engine guard, but this code is future-proof). + { + use sparrowdb_cypher::ast::{Expr, Literal}; + let vidx_dir = self.inner.path.join("vector_indexes"); + // Build a label_id → name map from the catalog once. + let cat = self.catalog_snapshot(); + let label_id_to_name: std::collections::HashMap = + cat.list_labels().unwrap_or_default().into_iter().collect(); + let vidx_guard = self.inner.vector_indexes.read().expect("vector_indexes"); + for mutation in &mm.mutations { + if let sparrowdb_cypher::ast::Mutation::Set { + prop, + value: Expr::Literal(Literal::Param(p)), + .. + } = mutation + { + if let Some(exec_val) = params.get(p.as_str()) { + if let Some(vec) = exec_val.as_vector() { + // Group matched nodes by their primary label so we + // can do one save() per (label, prop) pair. + let mut label_to_node_ids: std::collections::HashMap<&str, Vec> = + std::collections::HashMap::new(); + for node_id in &matching_ids { + let lid = (node_id.0 >> 32) as u16; + if let Some(label_name) = label_id_to_name.get(&lid) { + label_to_node_ids + .entry(label_name.as_str()) + .or_default() + .push(node_id.0); + } + } + for (label, slot_ids) in &label_to_node_ids { + let key = (label.to_string(), prop.clone()); + if let Some(arc_idx) = vidx_guard.get(&key) { + let mut idx = arc_idx.write().expect("vector_index write"); + for &raw_id in slot_ids { + idx.insert(raw_id, &vec); + } + // Persist once per (label, prop) pair. + idx.save(&vidx_dir, label, prop).map_err(Error::Io)?; + } + } + } + } + } + } + } + tx.commit()?; self.invalidate_catalog(); if has_detach_delete { self.invalidate_csr_map(); } + Ok(QueryResult::empty(vec![])) } diff --git a/crates/sparrowdb/tests/vector_index.rs b/crates/sparrowdb/tests/vector_index.rs index 99a9b8e..05e465c 100644 --- a/crates/sparrowdb/tests/vector_index.rs +++ b/crates/sparrowdb/tests/vector_index.rs @@ -250,6 +250,160 @@ fn hnsw_bulk_insert_and_top_k() { ); } +// ── Regression: MATCH…SET $vector_param must write to HNSW (KMSmcp ch#202) ───── +// +// Before the fix, `MATCH (n:L {id: $id}) SET n.embedding = $emb` stored the +// property but left the HNSW file unchanged — silent data loss. This test +// confirms that the inserted node is returned by vector_search after SET. + +#[test] +fn set_vector_param_populates_hnsw() { + let (_dir, db) = make_db(); + + // Create the vector index and a bare node (no embedding yet). + db.create_vector_index("Memory", "embedding", 4, "cosine") + .expect("create index"); + db.execute("CREATE (n:Memory {id: 'k1'})") + .expect("CREATE node"); + + // Use execute_with_params to SET the embedding via a $param. + let emb: Vec = vec![0.1, 0.2, 0.3, 0.4]; + let mut params = std::collections::HashMap::new(); + params.insert("id".to_string(), Value::String("k1".to_string())); + params.insert("emb".to_string(), Value::Vector(emb.clone())); + db.execute_with_params("MATCH (n:Memory {id: $id}) SET n.embedding = $emb", params) + .expect("SET with vector param must not error"); + + // Verify the HNSW index now contains the node. + let arc = db + .get_vector_index("Memory", "embedding") + .expect("index must exist"); + let idx = arc.read().expect("read lock"); + let results = idx.search(&emb, 5, 20); + assert!( + !results.is_empty(), + "vectorSearch after SET must return the inserted node (HNSW was empty — silent data loss bug)" + ); + // The only node in the index should be the one we just SET. + assert_eq!(results.len(), 1, "exactly one node should be in the index"); +} + +#[test] +fn set_vector_param_hnsw_roundtrip_survives_reopen() { + // Same as above, but also verifies persistence: close + re-open the DB + // and confirm vectorSearch still returns the node. + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().to_path_buf(); + + { + let db = GraphDb::open(&path).expect("open"); + db.create_vector_index("Chunk", "emb", 3, "cosine") + .expect("create index"); + db.execute("CREATE (n:Chunk {id: 'c1'})").expect("CREATE"); + + let mut params = std::collections::HashMap::new(); + params.insert("id".to_string(), Value::String("c1".to_string())); + params.insert("emb".to_string(), Value::Vector(vec![1.0, 0.0, 0.0])); + db.execute_with_params("MATCH (n:Chunk {id: $id}) SET n.emb = $emb", params) + .expect("SET emb"); + } + + // Re-open and query. + let db = GraphDb::open(&path).expect("reopen"); + let arc = db + .get_vector_index("Chunk", "emb") + .expect("index must survive restart"); + let idx = arc.read().expect("read"); + let results = idx.search(&[1.0_f32, 0.0, 0.0], 5, 20); + assert!( + !results.is_empty(), + "node must be in HNSW after restart (persistence verification)" + ); +} + +// ── Regression: anonymous MATCH must populate HNSW (Issues #2/#4 in PR #410) ── +// +// `MATCH (n) WHERE n.id = $id SET n.embedding = $emb` has no label in the AST. +// Before the fix, scan_match_mutate bailed early (unknown label ""), leaving +// both the property and the HNSW index untouched — silent data loss. + +#[test] +fn anonymous_match_set_vector_populates_hnsw() { + let (_dir, db) = make_db(); + + db.create_vector_index("Memory", "embedding", 3, "cosine") + .expect("create index"); + db.execute("CREATE (n:Memory {id: 'anon-1'})") + .expect("CREATE node"); + + let emb = vec![1.0_f32, 0.0, 0.0]; + let mut params = std::collections::HashMap::new(); + params.insert("id".to_string(), Value::String("anon-1".to_string())); + params.insert("emb".to_string(), Value::Vector(emb.clone())); + + // Anonymous match (no label in MATCH clause). + db.execute_with_params("MATCH (n) WHERE n.id = $id SET n.embedding = $emb", params) + .expect("anonymous MATCH SET must succeed"); + + let arc = db + .get_vector_index("Memory", "embedding") + .expect("index must exist"); + let results = arc.read().expect("read").search(&emb, 5, 20); + assert!( + !results.is_empty(), + "anonymous MATCH SET must populate HNSW (was silently skipped before fix)" + ); +} + +// ── Regression: label derived from NodeId, not from first MATCH pattern ──────── +// +// The HNSW write-path must derive the label from the matched NodeId's upper +// 32 bits rather than from the first AST node pattern. Single-node queries +// are already correct; this test guards the NodeId-derivation code path +// explicitly, confirming the (label, prop) key is resolved correctly. + +#[test] +fn set_vector_hnsw_label_derived_from_node_id() { + let (_dir, db) = make_db(); + + // Two separate labels, each with its own vector index. + db.create_vector_index("PersonVec", "emb", 2, "cosine") + .expect("create PersonVec index"); + db.create_vector_index("DocVec", "emb", 2, "cosine") + .expect("create DocVec index"); + + db.execute("CREATE (n:PersonVec {id: 'p1'})") + .expect("CREATE PersonVec node"); + db.execute("CREATE (n:DocVec {id: 'd1'})") + .expect("CREATE DocVec node"); + + // SET embedding on the DocVec node by label. + let doc_emb = vec![0.0_f32, 1.0]; + let mut params = std::collections::HashMap::new(); + params.insert("id".to_string(), Value::String("d1".to_string())); + params.insert("emb".to_string(), Value::Vector(doc_emb.clone())); + db.execute_with_params("MATCH (n:DocVec {id: $id}) SET n.emb = $emb", params) + .expect("SET DocVec embedding"); + + // DocVec index must contain the node. + let doc_arc = db.get_vector_index("DocVec", "emb").expect("DocVec index"); + let doc_results = doc_arc.read().expect("read").search(&doc_emb, 5, 20); + assert!( + !doc_results.is_empty(), + "DocVec HNSW must contain the SET node" + ); + + // PersonVec index must remain empty (no cross-label pollution). + let person_arc = db + .get_vector_index("PersonVec", "emb") + .expect("PersonVec index"); + let person_results = person_arc.read().expect("read").search(&doc_emb, 5, 20); + assert!( + person_results.is_empty(), + "PersonVec HNSW must NOT contain the DocVec node (wrong-label write)" + ); +} + // ── Metrics ──────────────────────────────────────────────────────────────────── #[test] diff --git a/npm/sparrowdb/index.d.ts b/npm/sparrowdb/index.d.ts index ac2849b..9e700e5 100644 --- a/npm/sparrowdb/index.d.ts +++ b/npm/sparrowdb/index.d.ts @@ -93,6 +93,11 @@ export declare class SparrowDB { * (`GraphDb::execute_with_params`) has supported this since SPA-218; this * binding simply exposes it to JS callers. * + * **HNSW note (as of 0.1.23):** When the target property has a vector index, + * `SET n.prop = $vec` now ALSO writes the vector into the HNSW index. + * Previously the property was stored but the HNSW file was not updated — + * a silent data-loss bug surfaced by KMSmcp channel message #202. + * * ```typescript * const emb = Array.from(new Float32Array(768)) // your model output * db.executeWithParams( @@ -136,6 +141,28 @@ export declare class SparrowDB { * ``` */ createVectorIndex(label: string, property: string, dimensions: number, similarity?: string | undefined | null): void + /** + * Directly insert a vector into the HNSW index for an existing node. + * + * This is the explicit backfill API requested by KMSmcp (ch #202). It is + * useful when nodes were created without an inline embedding and you want to + * populate the HNSW index without DETACH DELETE + recreate. + * + * - `label` — node label (e.g. `"Memory"`) + * - `property` — the vector property name (e.g. `"embedding"`) + * - `node_id` — the **user-facing** `id` property value (string), not the + * internal u64 slot number. + * - `vector` — Float32Array of `dimensions` elements. + * + * Throws `RangeError` if no vector index exists for `(label, property)`. + * Throws `TypeError` if `vector.length` does not match the index dimensions. + * Throws `RangeError` if no node with `id = node_id` is found under `label`. + * + * ```typescript + * db.addToVectorIndex('Memory', 'embedding', 'node-uuid-here', myFloat32Array) + * ``` + */ + addToVectorIndex(label: string, property: string, nodeId: string, vector: Float32Array): void /** * Search the HNSW vector index for `k` nearest neighbours. * diff --git a/npm/sparrowdb/test/integration.test.js b/npm/sparrowdb/test/integration.test.js index 8284346..6342631 100644 --- a/npm/sparrowdb/test/integration.test.js +++ b/npm/sparrowdb/test/integration.test.js @@ -342,9 +342,11 @@ describe('executeWithParams — parameterized queries (KMSmcp #67)', () => { removeDir(dir) }) - it('SET n.embedding = $emb writes a vector via param (the dedup-gate path)', () => { + it('SET n.embedding = $emb writes a vector via param AND populates HNSW', () => { // The flagship KMSmcp #67 use case: 768d embeddings can't be inlined as // Cypher list literals — they must be passed as $emb. + // Regression test for KMSmcp ch#202: previously SET stored the property + // but silently skipped the HNSW write — vectorSearch returned nothing. const emb = [0.1, 0.2, 0.3, 0.4] const r = db.executeWithParams( 'MATCH (n:Memory {id: $id}) SET n.embedding = $emb', @@ -357,9 +359,21 @@ describe('executeWithParams — parameterized queries (KMSmcp #67)', () => { const got = db.execute("MATCH (n:Memory {id: 'k1'}) RETURN n.embedding") assert.equal(got.rows.length, 1, 'must find the node back') assert.ok(got.rows[0]['n.embedding'] != null, 'embedding must be set') + + // CRITICAL: vectorSearch must return the node — the HNSW index must have + // been updated by the SET. Before the fix this returned an empty array. + // Resolve k1's raw u64 node ID so we can assert the hit is specifically + // this node and not a leftover from a previous test. + const k1Row = db.execute("MATCH (n:Memory {id: 'k1'}) RETURN id(n)") + // id(n) → Int64 → JS number (safe range for typical node IDs); vectorSearch + // returns NodeResult.id as a string, so stringify before comparing. + const k1InternalId = String(k1Row.rows[0]['id(n)']) + const hits = db.vectorSearch('Memory', 'embedding', new Float32Array(emb), 5) + const hitIds = hits.map(h => h.id) + assert.ok(hitIds.includes(k1InternalId), `vectorSearch must include k1 (internal id ${k1InternalId}) — HNSW was not updated (KMSmcp ch#202 regression); got: ${hitIds}`) }) - it('Float32Array embedding via Array.from() roundtrips', () => { + it('Float32Array embedding via Array.from() roundtrips AND is vectorSearchable', () => { // Real KMSmcp callers will derive the array from a Float32Array model output. const f32 = new Float32Array([0.5, -0.3, 0.7, 0.1]) const emb = Array.from(f32) @@ -368,6 +382,50 @@ describe('executeWithParams — parameterized queries (KMSmcp #67)', () => { { id: 'k2', emb } ) assert.equal(r.rows.length, 0) + + // Also verify the HNSW index was populated with k2 specifically. + const k2Row = db.execute("MATCH (n:Memory {id: 'k2'}) RETURN id(n)") + const k2InternalId = String(k2Row.rows[0]['id(n)']) + const hits = db.vectorSearch('Memory', 'embedding', f32, 5) + const hitIds = hits.map(h => h.id) + assert.ok(hitIds.includes(k2InternalId), `Float32Array SET must populate HNSW for k2 (internal id ${k2InternalId}); got: ${hitIds}`) + }) + + it('addToVectorIndex inserts a node into HNSW directly', () => { + // The explicit backfill API added as part of KMSmcp ch#202 fix. + // Create a fresh node without any embedding. + db.execute("CREATE (n:Memory {id: 'k3'})") + const emb = new Float32Array([0.9, 0.1, 0.0, 0.0]) + + // Insert directly into the HNSW index without going through Cypher SET. + assert.doesNotThrow(() => { + db.addToVectorIndex('Memory', 'embedding', 'k3', emb) + }, 'addToVectorIndex must not throw') + + // vectorSearch must return k3 specifically. + const k3Row = db.execute("MATCH (n:Memory {id: 'k3'}) RETURN id(n)") + const k3InternalId = String(k3Row.rows[0]['id(n)']) + const hits = db.vectorSearch('Memory', 'embedding', emb, 5) + const hitIds = hits.map(h => h.id) + assert.ok(hitIds.includes(k3InternalId), `addToVectorIndex must make k3 discoverable via vectorSearch (internal id ${k3InternalId}); got: ${hitIds}`) + }) + + it('addToVectorIndex throws RangeError for missing node', () => { + const emb = new Float32Array([0.1, 0.2, 0.3, 0.4]) + assert.throws( + () => db.addToVectorIndex('Memory', 'embedding', 'no-such-node', emb), + /RangeError/, + 'must throw RangeError when node does not exist' + ) + }) + + it('addToVectorIndex throws RangeError for missing index', () => { + const emb = new Float32Array([0.1, 0.2, 0.3, 0.4]) + assert.throws( + () => db.addToVectorIndex('Memory', 'nonexistent_prop', 'k1', emb), + /RangeError/, + 'must throw RangeError when vector index does not exist' + ) }) it('MATCH … WHERE n.id = $id with a string param', () => {