Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 36 additions & 21 deletions crates/sparrowdb-execution/src/engine/mutation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,7 @@ impl Engine {
return Ok(vec![]);
}
let node_pat = &pat.nodes[0];
let label = node_pat.labels.first().cloned().unwrap_or_default();

let label_id = match self.snapshot.catalog.get_label(&label)? {
Some(id) => id as u32,
// SPA-266: unknown label → no nodes can match; return empty result.
None => return Ok(vec![]),
};
let label_opt = node_pat.labels.first().cloned();

// Col_ids referenced by the WHERE clause (needed for WHERE evaluation
// even after the index narrows candidates by inline prop filter).
Expand All @@ -48,25 +42,46 @@ impl Engine {

let var_name = node_pat.var.as_str();

// Use the property index for O(1) equality lookups on inline prop
// filters, falling back to full scan for overflow strings / params.
let candidates = self.scan_nodes_for_label_with_index(label_id, &node_pat.props)?;
// Build the list of label_ids to scan. For anonymous patterns
// (`MATCH (n) SET n.prop = $val`) there is no label constraint, so we
// scan every label known to the catalog. For labelled patterns we
// resolve the single label (SPA-266: unknown label → empty result).
let label_ids: Vec<u32> = match label_opt {
Some(ref label) => match self.snapshot.catalog.get_label(label)? {
Some(id) => vec![id as u32],
// SPA-266: unknown label → no nodes can match; return empty result.
None => return Ok(vec![]),
},
None => self
.snapshot
.catalog
.list_labels()?
.into_iter()
.map(|(id, _)| id as u32)
.collect(),
};

let mut matching_ids = Vec::new();
for node_id in candidates {
// Re-read props needed for WHERE clause evaluation.
if mm.where_clause.is_some() {
let props = read_node_props(&self.snapshot.store, node_id, &where_col_ids)?;
if let Some(ref where_expr) = mm.where_clause {
let mut row_vals =
build_row_vals(&props, var_name, &where_col_ids, &self.snapshot.store);
row_vals.extend(self.dollar_params());
if !self.eval_where_graph(where_expr, &row_vals) {
continue;
for label_id in label_ids {
// Use the property index for O(1) equality lookups on inline prop
// filters, falling back to full scan for overflow strings / params.
let candidates = self.scan_nodes_for_label_with_index(label_id, &node_pat.props)?;

for node_id in candidates {
// Re-read props needed for WHERE clause evaluation.
if mm.where_clause.is_some() {
let props = read_node_props(&self.snapshot.store, node_id, &where_col_ids)?;
if let Some(ref where_expr) = mm.where_clause {
let mut row_vals =
build_row_vals(&props, var_name, &where_col_ids, &self.snapshot.store);
row_vals.extend(self.dollar_params());
if !self.eval_where_graph(where_expr, &row_vals) {
continue;
}
}
}
matching_ids.push(node_id);
}
matching_ids.push(node_id);
}

Ok(matching_ids)
Expand Down
237 changes: 237 additions & 0 deletions crates/sparrowdb-node/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,32 @@ fn to_napi_typed(e: sparrowdb::Error) -> napi::Error {
}
}

// ── Cypher identifier validation ─────────────────────────────────────────────

/// Validate that `s` is a safe Cypher identifier: `[A-Za-z_][A-Za-z0-9_]*`.
///
/// Labels and relationship types that are user-supplied must be validated
/// before being interpolated into Cypher query strings to prevent injection.
/// Returns `Ok(())` when valid, or a `TypeError` napi::Error otherwise.
fn validate_cypher_identifier(s: &str) -> napi::Result<()> {
let valid = !s.is_empty()
&& s.chars()
.next()
.is_some_and(|c| c.is_ascii_alphabetic() || c == '_')
&& s.chars().all(|c| c.is_ascii_alphanumeric() || c == '_');
if valid {
Ok(())
} else {
Err(napi::Error::new(
napi::Status::InvalidArg,
format!(
"TypeError: invalid Cypher identifier '{s}'; \
must match [A-Za-z_][A-Za-z0-9_]*"
),
))
}
}

// ── JSON → engine Value conversion ────────────────────────────────────────────

/// Convert a `serde_json::Value` (received from JS via napi-rs's `serde-json`
Expand Down Expand Up @@ -357,6 +383,144 @@ impl SparrowDB {
.map_err(to_napi_typed)
}

/// Directly insert a vector into the HNSW index for an existing node.
///
/// This is the explicit backfill API requested by KMSmcp (ch #202). It is
/// useful when nodes were created without an inline embedding and you want to
/// populate the HNSW index without DETACH DELETE + recreate.
///
/// - `label` — node label (e.g. `"Memory"`)
/// - `property` — the vector property name (e.g. `"embedding"`)
/// - `node_id` — the **user-facing** `id` property value (string), not the
/// internal u64 slot number.
/// - `vector` — Float32Array of `dimensions` elements.
///
/// Throws `RangeError` if no vector index exists for `(label, property)`.
/// Throws `TypeError` if `vector.length` does not match the index dimensions.
/// Throws `RangeError` if no node with `id = node_id` is found under `label`.
///
/// ```typescript
/// db.addToVectorIndex('Memory', 'embedding', 'node-uuid-here', myFloat32Array)
/// ```
#[napi]
pub fn add_to_vector_index(
&self,
label: String,
property: String,
node_id: String,
vector: Float32Array,
) -> napi::Result<()> {
// 0. Validate label is a safe Cypher identifier before interpolation
// (prevents Cypher injection — label is user-controlled input).
// Accepted: [A-Za-z_][A-Za-z0-9_]*
validate_cypher_identifier(&label)?;

// 1. Resolve (label, property) → HNSW index arc.
let arc = self
.inner
.get_vector_index(&label, &property)
.ok_or_else(|| {
napi::Error::new(
napi::Status::GenericFailure,
format!(
"RangeError: no vector index on ({label}, {property}); \
call createVectorIndex first"
),
)
})?;

// 2. Validate dimensionality before any write.
let vec_slice = vector.as_ref();
{
let idx = arc
.read()
.map_err(|e| to_napi(format!("lock poisoned: {e}")))?;
if vec_slice.len() != idx.dimensions {
return Err(napi::Error::new(
napi::Status::InvalidArg,
format!(
"TypeError: vector has {} dimensions but the index expects {}",
vec_slice.len(),
idx.dimensions
),
));
}
}

// 3+4. Hold the database write lock across the node lookup and HNSW
// insert+save so that a concurrent delete_node cannot create an
// orphaned HNSW entry, and a concurrent drop_vector_index cannot
// be silently undone by our save().
//
// We acquire the lock via begin_write() (same discipline as other
// durable write paths). We do NOT commit the WriteTx — dropping
// it releases the lock with no disk writes (HNSW save is handled
// separately below, outside the WAL).
let _write_guard = self
.inner
.begin_write()
.map_err(|e| to_napi(format!("WriterBusy: cannot acquire write lock: {e}")))?;

// 3. Resolve the user-facing string id to the internal u64 node slot.
// We do this with a simple MATCH+RETURN id(n) query so we don't
// need to replicate the id-property lookup logic inline.
let cypher = format!("MATCH (n:{label} {{id: $id}}) RETURN id(n) AS nid");
Comment on lines +464 to +467
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-high high

Directly interpolating the label string into a Cypher query string poses a security risk (Cypher injection), as label originates from user input in the Node.js binding. While this is a local database, it is best practice to validate the label name to ensure it only contains alphanumeric characters and underscores before interpolation.

        // 3. Resolve the user-facing string id to the internal u64 node slot.
        //    We do this with a simple MATCH+RETURN id(n) query so we don't
        //    need to replicate the id-property lookup logic inline.
        if !label.chars().all(|c| c.is_alphanumeric() || c == '_') {
            return Err(napi::Error::new(
                napi::Status::InvalidArg,
                format!("TypeError: invalid label name '{label}'"),
            ));
        }
        let cypher = format!("MATCH (n:{label} {{id: $id}}) RETURN id(n) AS nid");

let mut lookup_params = std::collections::HashMap::new();
lookup_params.insert(
"id".to_string(),
sparrowdb_execution::Value::String(node_id.clone()),
);
let result = self
.inner
.execute_with_params(&cypher, lookup_params)
.map_err(|e| to_napi(format!("{e}")))?;

if result.rows.is_empty() {
return Err(napi::Error::new(
napi::Status::GenericFailure,
format!("RangeError: no node with id='{node_id}' found under label '{label}'"),
));
}

if result.rows.len() != 1 {
return Err(napi::Error::new(
napi::Status::GenericFailure,
format!(
"RangeError: expected exactly one node with id='{node_id}' under label \
'{label}', found {} — use a unique id property",
result.rows.len()
),
));
}

let internal_id: u64 = match &result.rows[0][0] {
sparrowdb_execution::Value::Int64(n) => *n as u64,
other => {
return Err(to_napi(format!(
"unexpected id(n) type from query: {other:?}"
)));
}
};
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// 4. Insert into HNSW and persist.
let db_path = self.inner.path();
let vidx_dir = db_path.join("vector_indexes");
{
let mut idx = arc
.write()
.map_err(|e| to_napi(format!("lock poisoned: {e}")))?;
idx.insert(internal_id, vec_slice);
idx.save(&vidx_dir, &label, &property)
.map_err(|e| to_napi(format!("HNSW save failed: {e}")))?;
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// Release write lock by dropping _write_guard (no commit — HNSW file
// was already persisted above; WAL does not track vector index files).
drop(_write_guard);

Ok(())
}

/// Search the HNSW vector index for `k` nearest neighbours.
///
/// Returns results sorted by descending similarity score (highest first).
Expand Down Expand Up @@ -1169,4 +1333,77 @@ mod tests {
}
}
}

// ── Regression: validate_cypher_identifier (Issue 1 — PR #410) ───────────

/// `validate_cypher_identifier` must reject strings that contain characters
/// outside [A-Za-z_][A-Za-z0-9_]* to prevent Cypher injection via the
/// `label` parameter of `addToVectorIndex`.
#[test]
fn validate_cypher_identifier_rejects_injection_strings() {
use super::validate_cypher_identifier;

// Valid identifiers must pass.
assert!(validate_cypher_identifier("Memory").is_ok());
assert!(validate_cypher_identifier("my_label").is_ok());
assert!(validate_cypher_identifier("_Private").is_ok());
assert!(validate_cypher_identifier("Label123").is_ok());

// Injection attempts must be rejected with TypeError.
let cases = [
"Memory; DROP",
"Memory RETURN 1 //",
"Mem`ory",
"label-name",
"label name",
"123label",
"",
"label'",
];
for bad in &cases {
let err = validate_cypher_identifier(bad).expect_err(&format!("must reject '{bad}'"));
assert!(
err.to_string().contains("TypeError")
|| err.to_string().contains("invalid Cypher identifier"),
"error for '{bad}' must mention TypeError, got: {}",
err
);
}
}

// ── Regression: non-unique id must error (Issue 3 — PR #410) ─────────────

/// When two nodes share the same `id` property under the same label,
/// `add_to_vector_index` must return an error (not silently pick one).
///
/// We test this through the engine layer (GraphDb) because the NAPI layer
/// is hard to call without a Node.js runtime. The duplicate-id guard lives
/// in the Rust code path that the NAPI binding calls.
#[test]
fn add_to_vector_index_duplicate_id_detected_via_engine() {
// We can't call the NAPI `add_to_vector_index` directly without a
// Node.js env. Instead, verify the Cypher query that the binding uses
// returns >1 row when two nodes share an id, proving the guard works.
let dir = tempfile::tempdir().expect("tempdir");
let db = open_db(dir.path());

// Insert two Memory nodes with the same `id` property (intentionally
// duplicated to trigger the guard).
db.execute("CREATE (n:Memory {id: 'dup'})").expect("first");
db.execute("CREATE (n:Memory {id: 'dup'})").expect("second");

// The MATCH query used by add_to_vector_index.
let mut params = HashMap::new();
params.insert("id".into(), ExecValue::String("dup".into()));
let result = db
.execute_with_params("MATCH (n:Memory {id: $id}) RETURN id(n) AS nid", params)
.expect("query must succeed");

// Two rows → the binding's `rows.len() != 1` guard fires.
assert_eq!(
result.rows.len(),
2,
"duplicate id must produce 2 rows, triggering the non-unique guard in the binding"
);
}
}
Loading
Loading