From a0e6cf1420d6c61d64fd0f87005fa9973be15db2 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Mon, 22 Jun 2026 17:48:25 +0200 Subject: [PATCH 1/2] Add test showing problem --- .../scenarii/es_compatibility/0031-regex.yaml | 449 ++++++++++++++---- 1 file changed, 351 insertions(+), 98 deletions(-) diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml index 3a1d0e565e8..e194e973cd8 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml @@ -1,107 +1,360 @@ -# Basic regex match -params: - size: 0 -json: - track_total_hits: true - query: - regexp: - type: - value: ".*event" -expected: - hits: - total: - value: 100 - relation: "eq" ---- -# Regex always match from start to end (`(re)` equivalent to `^(re)$`) -params: - size: 3 -json: - track_total_hits: true - query: - regexp: - type: - value: "event" -expected: - hits: - total: - value: 0 - relation: "eq" +# # Basic regex match +# params: +# size: 0 +# json: +# track_total_hits: true +# query: +# regexp: +# type: +# value: ".*event" +# expected: +# hits: +# total: +# value: 100 +# relation: "eq" +# --- +# # Regex always match from start to end (`(re)` equivalent to `^(re)$`) +# params: +# size: 3 +# json: +# track_total_hits: true +# query: +# regexp: +# type: +# value: "event" +# expected: +# hits: +# total: +# value: 0 +# relation: "eq" +# --- +# # Regex with case_insensitive flag +# params: +# size: 3 +# json: +# track_total_hits: true +# query: +# regexp: +# repo.name: +# # lowercased by the tokenizer +# value: "RUST.*" +# case_insensitive: true +# expected: +# hits: +# total: +# value: 1 +# relation: "eq" +# --- +# params: +# size: 3 +# json: +# track_total_hits: true +# query: +# regexp: +# type: +# # lowercased by the tokenizer +# value: "RUST.*" +# case_insensitive: false +# expected: +# hits: +# total: +# value: 0 +# relation: "eq" +# --- +# # ^ and $ are escaped when they are used as anchors, so +# # ^pushevent$ only matches if the original term is "^pushevent$". +# endpoint: "simple_es_compat/_search" +# params: +# size: 3 +# json: +# track_total_hits: true +# query: +# regexp: +# keyword_text: +# value: "red$" +# expected: +# hits: +# total: +# value: 0 +# relation: "eq" +# --- +# endpoint: "simple_es_compat/_search" +# params: +# size: 3 +# json: +# track_total_hits: true +# query: +# regexp: +# keyword_text: +# value: "gold$" +# expected: +# hits: +# total: +# value: 1 +# relation: "eq" +# --- +# # regex in query_string +# params: +# size: 10 +# json: +# query: +# query_string: +# query: "type:/pushevent/" +# expected: +# hits: +# total: +# value: 60 +# relation: "eq" --- -# Regex with case_insensitive flag +# Many wildcards across a few fields (6 distinct fields, well within the 20-field limit). +# All targeted fields are absent from the index so 0 hits are expected, +# but the query must be accepted without error. params: - size: 3 -json: - track_total_hits: true - query: - regexp: - repo.name: - # lowercased by the tokenizer - value: "RUST.*" - case_insensitive: true -expected: - hits: - total: - value: 1 - relation: "eq" ---- -params: - size: 3 -json: - track_total_hits: true - query: - regexp: - type: - # lowercased by the tokenizer - value: "RUST.*" - case_insensitive: false -expected: - hits: - total: - value: 0 - relation: "eq" ---- -# ^ and $ are escaped when they are used as anchors, so -# ^pushevent$ only matches if the original term is "^pushevent$". -endpoint: "simple_es_compat/_search" -params: - size: 3 + size: 0 json: - track_total_hits: true query: - regexp: - keyword_text: - value: "red$" + bool: + must: + - bool: + must: + - bool: + must_not: + - bool: + should: + - wildcard: + registry.value: + value: "*atlassian*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Agent*" + case_insensitive: true + - wildcard: + registry.value: + value: "*BlueJeans*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Cisco*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Discord*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Dropbox*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Docker*" + case_insensitive: true + - wildcard: + registry.value: + value: "*EIConnectorProxy*" + case_insensitive: true + - wildcard: + registry.value: + value: "*GoToMeeting*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Lifesize*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Mattermost*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Microsoft Edge Update*" + case_insensitive: true + - wildcard: + registry.value: + value: "*OneDrive*" + case_insensitive: true + - wildcard: + registry.value: + value: "*PTOneClick*" + case_insensitive: true + - wildcard: + registry.value: + value: "*slack*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Teams*" + case_insensitive: true + - wildcard: + registry.value: + value: "*TEHTRIS*" + case_insensitive: true + - wildcard: + registry.value: + value: "*VMware*" + case_insensitive: true + - wildcard: + registry.value: + value: "*W3DClient*" + case_insensitive: true + - wildcard: + registry.value: + value: "*Microsoft.lists*" + case_insensitive: true + - wildcard: + registry.value: + value: "*MicrosoftSaRA*" + case_insensitive: true + - wildcard: + process.executable: + value: "*atlassian*" + case_insensitive: true + - wildcard: + process.executable: + value: "*blujeanslauncher*" + case_insensitive: true + - wildcard: + process.executable: + value: "*dropbox*" + case_insensitive: true + - wildcard: + process.executable: + value: "*msiexec.exe*" + case_insensitive: true + - wildcard: + process.executable: + value: "*onedrivesetup.exe*" + case_insensitive: true + - wildcard: + process.executable: + value: "*sara.exe*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*Letsignit*" + case_insensitive: true + minimum_should_match: 1 + - bool: + should: + - bool: + must: + - bool: + should: + - wildcard: + registry.key: + value: "*Microsoft\\Windows\\CurrentVersion\\Run" + case_insensitive: true + - wildcard: + registry.key: + value: "*Microsoft\\Windows\\CurrentVersion\\Policies\\Explorer\\Run" + case_insensitive: true + - wildcard: + registry.key: + value: "*Microsoft\\Windows\\CurrentVersion\\RunOnce" + case_insensitive: true + minimum_should_match: 1 + - term: + action.properties.MessEventType: + value: "SetValue" + - bool: + should: + - wildcard: + action.properties.Details: + value: "*C:\\Windows\\Temp\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*C:\\ProgramData\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*Recycle.bin\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*AppData*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*C:\\Temp\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*C:\\Users\\Public\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*C:\\Users\\Default\\*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*wscript*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*script*" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*.exe" + case_insensitive: true + - wildcard: + action.properties.Details: + value: "*.exe\"" + case_insensitive: true + minimum_should_match: 1 + - bool: + must: + - wildcard: + process.command_line: + value: "*reg*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*add*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*microsoft\\windows\\currentversion*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*/f*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*/v*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*reg_sz*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*c:\\users\\*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*.exe*" + case_insensitive: true + - bool: + should: + - wildcard: + process.command_line: + value: "*\\run*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*\\policies\\explorer\\run*" + case_insensitive: true + - wildcard: + process.command_line: + value: "*\\runonce*" + case_insensitive: true + minimum_should_match: 1 + minimum_should_match: 1 expected: hits: total: value: 0 relation: "eq" ---- -endpoint: "simple_es_compat/_search" -params: - size: 3 -json: - track_total_hits: true - query: - regexp: - keyword_text: - value: "gold$" -expected: - hits: - total: - value: 1 - relation: "eq" ---- -# regex in query_string -params: - size: 10 -json: - query: - query_string: - query: "type:/pushevent/" -expected: - hits: - total: - value: 60 - relation: "eq" From ae0935e97af2b1c791d56564a496f4faa2cca97b Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Tue, 23 Jun 2026 15:23:57 +0200 Subject: [PATCH 2/2] Use new fst version that falls back to regex list --- quickwit/Cargo.lock | 2 +- quickwit/Cargo.toml | 2 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 3 +- .../quickwit-doc-mapper/src/query_builder.rs | 4 +- quickwit/quickwit-query/src/lib.rs | 1 + quickwit/quickwit-query/src/query_ast/mod.rs | 89 ++++++- .../src/query_ast/regex_query.rs | 32 +-- .../src/query_ast/wildcard_query.rs | 22 +- quickwit/quickwit-search/src/leaf.rs | 112 ++++++--- .../scenarii/es_compatibility/0031-regex.yaml | 222 +++++++++--------- 10 files changed, 303 insertions(+), 186 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 8ad8eac98dc..5c699c1f7b4 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -9241,7 +9241,7 @@ dependencies = [ [[package]] name = "tantivy-fst" version = "0.5.0" -source = "git+https://github.com/SekoiaLab/fst/?rev=c37128c307b0ba5d7c0040352f0d2606d6383b68#c37128c307b0ba5d7c0040352f0d2606d6383b68" +source = "git+https://github.com/SekoiaLab/fst/?rev=39ccb1c9283034815e22eb663d673a9e37023e7c#39ccb1c9283034815e22eb663d673a9e37023e7c" dependencies = [ "byteorder", "regex-syntax", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 334e54470ca..3abd2ce86a9 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -365,7 +365,7 @@ encoding_rs = "=0.8.35" [patch.crates-io] sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "085a4c7" } -tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "c37128c307b0ba5d7c0040352f0d2606d6383b68" } +tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "39ccb1c9283034815e22eb663d673a9e37023e7c" } ## this patched version of tracing helps better understand what happens inside futures (when are ## they polled, how long does poll take...) diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 39dd6da124d..9a7d63474bf 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -39,6 +39,7 @@ pub(crate) use field_mapping_entry::{ #[cfg(test)] pub(crate) use field_mapping_entry::{QuickwitNumericOptions, QuickwitTextOptions}; pub use field_mapping_type::FieldMappingType; +use quickwit_query::JsonPath; use serde_json::Value as JsonValue; use tantivy::Term; use tantivy::schema::{Field, FieldType}; @@ -82,7 +83,7 @@ pub enum Automaton { /// targeting the same field and json path. They are warmed up as a single combined automaton /// matching the union of the patterns. The optional path is the json path prefix when the /// field is a json field. - Regex(Option>, Vec), + Regex(Option, Vec), /// An exact-match automaton for a TermSet query. TermSet(ExactSetAutomaton), } diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index cfe27d841ea..2c434aa6108 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -23,7 +23,7 @@ use quickwit_query::query_ast::{ QueryAstTransformer, QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery, }; use quickwit_query::tokenizers::TokenizerManager; -use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic}; +use quickwit_query::{InvalidQuery, JsonPath, find_field_or_hit_dynamic}; use tantivy::Term; use tantivy::query::Query; use tantivy::schema::{Field, Schema}; @@ -323,7 +323,7 @@ fn coalesce_multi_term_fields_into_automatons( /// and cannot be shared). `Automaton::TermSet` entries are left untouched. fn coalesce_regexes_by_field(automatons_grouped_by_field: &mut HashMap>) { for automatons in automatons_grouped_by_field.values_mut() { - let mut regexes_by_path: HashMap>, Vec> = HashMap::new(); + let mut regexes_by_path: HashMap, Vec> = HashMap::new(); let mut others: Vec = Vec::new(); for automaton in automatons.drain() { match automaton { diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index 8f70e155933..c8d390deac2 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -35,6 +35,7 @@ pub use elastic_query_dsl::{ElasticQueryDsl, OneFieldMap}; pub use error::InvalidQuery; pub use json_literal::{InterpretUserInput, JsonLiteral}; pub(crate) use not_nan_f32::NotNaNf32; +pub use query_ast::JsonPath; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index d51e36eb93a..d2c66da3e6a 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -13,8 +13,9 @@ // limitations under the License. use serde::{Deserialize, Serialize}; +use tantivy::Term; use tantivy::query::BoostQuery as TantivyBoostQuery; -use tantivy::schema::Schema as TantivySchema; +use tantivy::schema::{Field, Schema as TantivySchema}; use crate::tokenizers::TokenizerManager; @@ -325,8 +326,46 @@ pub fn query_ast_from_user_text(user_text: &str, default_fields: Option` +/// marker. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)] +pub struct JsonPath(Vec); + +impl std::ops::Deref for JsonPath { + type Target = [u8]; + fn deref(&self) -> &[u8] { + &self.0 + } +} + +impl JsonPath { + /// Builds the serialized JSON-path prefix using the Tantivy encoding. + pub fn from_json_path(json_path: &str, expand_dots_enabled: bool) -> Self { + let mut term = + Term::from_field_json_path(Field::from_field_id(0), json_path, expand_dots_enabled); + term.append_type_and_str(""); + // Skip the first byte: it is a JSON-field marker that is not stored in the dictionary. + JsonPath(term.value().as_serialized()[1..].to_owned()) + } +} + +impl std::fmt::Display for JsonPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write; + let path = &self.0; + let path_bytes = &path[..path.iter().position(|&b| b == 0).unwrap_or(path.len())]; + for ch in String::from_utf8_lossy(path_bytes).chars() { + f.write_char(if ch == '\x01' { '.' } else { ch })?; + } + Ok(()) + } +} + #[cfg(test)] mod tests { + use super::JsonPath; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; use crate::query_ast::{ BoolQuery, BuildTantivyAst, BuildTantivyAstContext, QueryAst, UserInputQuery, @@ -440,4 +479,52 @@ mod tests { }; assert_eq!(input_query.default_operator, BooleanOperand::And); } + + #[test] + fn test_json_path_display() { + #[rustfmt::skip] + let cases: &[(&str, bool, &str)] = &[ + // (json_path_input, expand_dots_enabled, expected_display) + + // empty path (field with no sub-key) + ("", false, ""), + ("", true, ""), + + // single-segment paths + ("foo", false, "foo"), + ("foo", true, "foo"), + ("_my_field", false, "_my_field"), + ("field123", false, "field123"), + + // two-level paths + ("foo.bar", false, "foo.bar"), + ("foo.bar", true, "foo.bar"), + ("process.executable", false, "process.executable"), + + // three-level and deeper + ("a.b.c", false, "a.b.c"), + ("a.b.c.d.e", false, "a.b.c.d.e"), + + // escaped dot: `split_json_path` treats `\.` as a literal dot in + // the segment name, so it becomes a single segment "k8s.node". + // With expand_dots=false the writer stores it as literal bytes "k8s.node"; + // with expand_dots=true the writer re-expands the dot into \x01. + // In both cases Display renders it as "k8s.node". + ("k8s\\.node", false, "k8s.node"), + ("k8s\\.node", true, "k8s.node"), + + // escaped dot in the middle of a longer path + ("ns.k8s\\.node.id", false, "ns.k8s.node.id"), + ("ns.k8s\\.node.id", true, "ns.k8s.node.id"), + ]; + + for &(input, expand_dots, expected) in cases { + let path = JsonPath::from_json_path(input, expand_dots); + assert_eq!( + path.to_string(), + expected, + "from_json_path({input:?}, expand_dots={expand_dots})" + ); + } + } } diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index 851151e839a..e459b48ad57 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -17,12 +17,11 @@ use std::sync::Arc; use anyhow::Context; pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; -use tantivy::Term; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; use crate::query_ast::TantivyQueryAst; -use crate::{InvalidQuery, find_field_or_hit_dynamic}; +use crate::{InvalidQuery, JsonPath, find_field_or_hit_dynamic}; /// A Regex query #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] @@ -51,7 +50,7 @@ impl RegexQuery { pub fn to_field_and_regex( &self, schema: &TantivySchema, - ) -> Result<(Field, Option>, String), InvalidQuery> { + ) -> Result<(Field, Option, String), InvalidQuery> { let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema) else { return Err(InvalidQuery::FieldDoesNotExist { @@ -79,17 +78,8 @@ impl RegexQuery { )) })?; - let mut term_for_path = Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ); - term_for_path.append_type_and_str(""); - - let value = term_for_path.value(); - // We skip the 1st byte which is a marker to tell this is json. This isn't present - // in the dictionary - let byte_path_prefix = value.as_serialized()[1..].to_owned(); + let byte_path_prefix = + JsonPath::from_json_path(json_path, json_options.is_expand_dots_enabled()); Ok((field, Some(byte_path_prefix), self.regex.clone())) } _ => Err(InvalidQuery::SchemaError( @@ -125,8 +115,10 @@ mod prefix { use tantivy::schema::Field; use tantivy_fst::Automaton; + use crate::JsonPath; + pub struct JsonPathPrefix { - pub prefix: Vec, + pub prefix: JsonPath, pub automaton: Arc, } @@ -246,6 +238,7 @@ mod prefix { #[cfg(test)] mod tests { + use std::ops::Deref; use std::sync::Arc; use tantivy::schema::{Schema as TantivySchema, TEXT}; @@ -253,6 +246,7 @@ mod tests { use super::prefix::JsonPathPrefixState; use super::{JsonPathPrefix, RegexQuery}; + use crate::JsonPath; #[test] fn test_regex_query_text_field() { @@ -282,7 +276,7 @@ mod tests { }; let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); assert_eq!(field, schema.get_field("field").unwrap()); - assert_eq!(path.unwrap(), b"sub\x01field\0s"); + assert_eq!(path.unwrap().deref(), b"sub\x01field\0s"); assert_eq!(regex, query.regex); // i believe this is how concatenated field behave @@ -292,7 +286,7 @@ mod tests { }; let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap(); assert_eq!(field, schema.get_field("field").unwrap()); - assert_eq!(path.unwrap(), b"\0s"); + assert_eq!(path.unwrap().deref(), b"\0s"); assert_eq!(regex, query_empty_path.regex); } @@ -300,7 +294,7 @@ mod tests { fn test_json_prefix_automaton_empty_path() { let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); let empty_path_automaton = JsonPathPrefix { - prefix: Vec::new(), + prefix: JsonPath::default(), automaton: regex.clone(), }; @@ -312,7 +306,7 @@ mod tests { fn test_json_prefix_automaton() { let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); let automaton = JsonPathPrefix { - prefix: b"ab".to_vec(), + prefix: JsonPath::from_json_path("ab", false), automaton: regex.clone(), }; diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 8fc9c7eaf2f..2b9e2ce8b22 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -17,13 +17,12 @@ use std::sync::Arc; use anyhow::{Context, bail}; use serde::{Deserialize, Serialize}; -use tantivy::Term; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use super::{BuildTantivyAst, QueryAst}; use crate::query_ast::{AutomatonQuery, BuildTantivyAstContext, JsonPathPrefix, TantivyQueryAst}; use crate::tokenizers::TokenizerManager; -use crate::{InvalidQuery, find_field_or_hit_dynamic}; +use crate::{InvalidQuery, JsonPath, find_field_or_hit_dynamic}; /// A Wildcard query allows to match 'bond' with a query like 'b*d'. #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] @@ -112,7 +111,7 @@ impl WildcardQuery { &self, schema: &TantivySchema, tokenizer_manager: &TokenizerManager, - ) -> Result<(Field, Option>, String), InvalidQuery> { + ) -> Result<(Field, Option, String), InvalidQuery> { let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema) else { return Err(InvalidQuery::FieldDoesNotExist { @@ -161,17 +160,8 @@ impl WildcardQuery { regex }; - let mut term_for_path = Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ); - term_for_path.append_type_and_str(""); - - let value = term_for_path.value(); - // We skip the 1st byte which is a marker to tell this is json. This isn't present - // in the dictionary - let byte_path_prefix = value.as_serialized()[1..].to_owned(); + let byte_path_prefix = + JsonPath::from_json_path(json_path, json_options.is_expand_dots_enabled()); Ok((field, Some(byte_path_prefix), regex)) } @@ -328,7 +318,7 @@ mod tests { let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); - assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); + assert_eq!(path.unwrap().0, "Inner\u{1}Fie*ld\0s".as_bytes()); } for tokenizer in [ @@ -347,7 +337,7 @@ mod tests { let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); - assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); + assert_eq!(path.unwrap().0, "Inner\u{1}Fie*ld\0s".as_bytes()); } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 8f007a604fe..f090a58159a 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -29,6 +29,7 @@ use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, }; +use quickwit_query::JsonPath; use quickwit_query::query_ast::{ BoolQuery, CacheNode, QueryAst, QueryAstTransformer, RangeQuery, TermQuery, }; @@ -42,8 +43,9 @@ use tantivy::aggregation::{AggContextParams, AggregationLimitsGuard}; use tantivy::collector::Collector; use tantivy::directory::FileSlice; use tantivy::fastfield::FastFieldReaders; -use tantivy::schema::Field; +use tantivy::schema::{self, Field}; use tantivy::{DateTime, Index, ReloadPolicy, Searcher, TantivyError, Term}; +use tantivy_fst::DisjunctionRegex; use tokio::task::{JoinError, JoinSet}; use tracing::*; @@ -332,46 +334,55 @@ async fn warm_up_automatons( .map_err(|_| std::io::Error::other("task panicked"))? }; for (field, automatons) in terms_grouped_by_field { - let field_name = searcher.schema().get_field_name(*field).to_string(); for segment_reader in searcher.segment_readers() { let inv_idx = segment_reader.inverted_index(*field)?; for automaton in automatons { let inv_idx_clone = inv_idx.clone(); - let field_name = field_name.clone(); warm_up_futures.push(async move { match automaton { Automaton::Regex(path, patterns) => { - let path_str = path - .as_deref() - .map(|path| String::from_utf8_lossy(path).into_owned()) - .unwrap_or_default(); - // Combine all patterns so the term dictionary is - // traversed once instead of once per regex. let regex = tantivy_fst::Regex::from_patterns(patterns).with_context(|| { format!( - "failed to build combined regex automaton during warmup \ - for field `{field_name}` (path: `{path_str}`, {} \ - patterns)", - patterns.len(), + "failed to build regex during warmup for field `{}`", + full_path(*field, path, searcher.schema()), ) })?; - inv_idx_clone - .warm_postings_automaton( - quickwit_query::query_ast::JsonPathPrefix { - automaton: regex.into(), - prefix: path.clone().unwrap_or_default(), - }, - cpu_intensive_executor, - ) - .await - .with_context(|| { - format!( - "failed to load automaton for field `{field_name}` (path: \ - `{path_str}`, {} patterns)", - patterns.len(), + + match regex { + DisjunctionRegex::Single(regex) => inv_idx_clone + .warm_postings_automaton( + quickwit_query::query_ast::JsonPathPrefix { + automaton: Arc::new(regex), + prefix: path.clone().unwrap_or_default(), + }, + cpu_intensive_executor, + ) + .await + .with_context(|| { + format!( + "failed to warm postings from automaton for field \ + `{}` (type=single)", + full_path(*field, path, searcher.schema()), + ) + }), + DisjunctionRegex::Multi(regexes) => inv_idx_clone + .warm_postings_automaton( + quickwit_query::query_ast::JsonPathPrefix { + automaton: Arc::new(regexes), + prefix: path.clone().unwrap_or_default(), + }, + cpu_intensive_executor, ) - }) + .await + .with_context(|| { + format!( + "failed to warm postings from automaton for field \ + `{}` (type=multi)", + full_path(*field, path, searcher.schema()), + ) + }), + } } Automaton::TermSet(automaton) => inv_idx_clone .warm_postings_automaton(automaton.clone(), cpu_intensive_executor) @@ -1616,6 +1627,15 @@ async fn leaf_search_single_split_wrapper( } } +/// Small helper to display field path in errors +fn full_path(field: Field, path_opt: &Option, schema: &schema::Schema) -> String { + let field_name = schema.get_field_name(field); + let Some(path) = path_opt else { + return field_name.to_string(); + }; + format!("{}.{}", field_name, path) +} + #[cfg(test)] mod tests { use std::ops::Bound; @@ -2186,7 +2206,8 @@ mod tests { let indexing_options = TextOptions::default().set_indexing_options(TextFieldIndexing::default()); let mut schema_builder = Schema::builder(); - let text_field = schema_builder.add_text_field("text", indexing_options); + let text_field = schema_builder.add_text_field("text", indexing_options.clone()); + let json_field = schema_builder.add_json_field("body", indexing_options); let schema = schema_builder.build(); let ram_directory = RamDirectory::create(); @@ -2195,6 +2216,7 @@ mod tests { let mut doc = TantivyDocument::default(); doc.add_field_value(text_field, "hello"); index_writer.add_document(doc).unwrap(); + // the json value is not set index_writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); @@ -2210,6 +2232,19 @@ mod tests { .collect(); assert!(warm_up_automatons(&searcher, &valid).await.is_ok()); + // Valid regexes on a JSON sub-field also warm up successfully. + let json_path = + quickwit_query::query_ast::JsonPath::from_json_path("process.executable", false); + let valid_json: HashMap> = std::iter::once(( + json_field, + HashSet::from([Automaton::Regex( + Some(json_path.clone()), + vec!["s.*".to_string(), "b.*".to_string()], + )]), + )) + .collect(); + assert!(warm_up_automatons(&searcher, &valid_json).await.is_ok()); + // An unbuildable regex (here, invalid syntax) must cause warmup to fail // rather than fall back to warming the patterns individually. let invalid: HashMap> = std::iter::once(( @@ -2222,7 +2257,24 @@ mod tests { .unwrap_err() .to_string(); assert!( - error.contains("failed to build combined regex automaton during warmup"), + error.contains("failed to build regex during warmup for field `text`"), + "unexpected error: {error}" + ); + + // An unbuildable regex on a JSON sub-field also produces a useful error message. + let invalid_json: HashMap> = std::iter::once(( + json_field, + HashSet::from([Automaton::Regex(Some(json_path), vec!["(".to_string()])]), + )) + .collect(); + let error = warm_up_automatons(&searcher, &invalid_json) + .await + .unwrap_err() + .to_string(); + assert!( + error.contains( + "failed to build regex during warmup for field `body.process.executable`" + ), "unexpected error: {error}" ); } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml index e194e973cd8..7f18e841155 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml @@ -1,110 +1,110 @@ -# # Basic regex match -# params: -# size: 0 -# json: -# track_total_hits: true -# query: -# regexp: -# type: -# value: ".*event" -# expected: -# hits: -# total: -# value: 100 -# relation: "eq" -# --- -# # Regex always match from start to end (`(re)` equivalent to `^(re)$`) -# params: -# size: 3 -# json: -# track_total_hits: true -# query: -# regexp: -# type: -# value: "event" -# expected: -# hits: -# total: -# value: 0 -# relation: "eq" -# --- -# # Regex with case_insensitive flag -# params: -# size: 3 -# json: -# track_total_hits: true -# query: -# regexp: -# repo.name: -# # lowercased by the tokenizer -# value: "RUST.*" -# case_insensitive: true -# expected: -# hits: -# total: -# value: 1 -# relation: "eq" -# --- -# params: -# size: 3 -# json: -# track_total_hits: true -# query: -# regexp: -# type: -# # lowercased by the tokenizer -# value: "RUST.*" -# case_insensitive: false -# expected: -# hits: -# total: -# value: 0 -# relation: "eq" -# --- -# # ^ and $ are escaped when they are used as anchors, so -# # ^pushevent$ only matches if the original term is "^pushevent$". -# endpoint: "simple_es_compat/_search" -# params: -# size: 3 -# json: -# track_total_hits: true -# query: -# regexp: -# keyword_text: -# value: "red$" -# expected: -# hits: -# total: -# value: 0 -# relation: "eq" -# --- -# endpoint: "simple_es_compat/_search" -# params: -# size: 3 -# json: -# track_total_hits: true -# query: -# regexp: -# keyword_text: -# value: "gold$" -# expected: -# hits: -# total: -# value: 1 -# relation: "eq" -# --- -# # regex in query_string -# params: -# size: 10 -# json: -# query: -# query_string: -# query: "type:/pushevent/" -# expected: -# hits: -# total: -# value: 60 -# relation: "eq" +# Basic regex match +params: + size: 0 +json: + track_total_hits: true + query: + regexp: + type: + value: ".*event" +expected: + hits: + total: + value: 100 + relation: "eq" +--- +# Regex always match from start to end (`(re)` equivalent to `^(re)$`) +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + type: + value: "event" +expected: + hits: + total: + value: 0 + relation: "eq" +--- +# Regex with case_insensitive flag +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + repo.name: + # lowercased by the tokenizer + value: "RUST.*" + case_insensitive: true +expected: + hits: + total: + value: 1 + relation: "eq" +--- +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + type: + # lowercased by the tokenizer + value: "RUST.*" + case_insensitive: false +expected: + hits: + total: + value: 0 + relation: "eq" +--- +# ^ and $ are escaped when they are used as anchors, so +# ^pushevent$ only matches if the original term is "^pushevent$". +endpoint: "simple_es_compat/_search" +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + keyword_text: + value: "red$" +expected: + hits: + total: + value: 0 + relation: "eq" +--- +endpoint: "simple_es_compat/_search" +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + keyword_text: + value: "gold$" +expected: + hits: + total: + value: 1 + relation: "eq" +--- +# regex in query_string +params: + size: 10 +json: + query: + query_string: + query: "type:/pushevent/" +expected: + hits: + total: + value: 60 + relation: "eq" --- # Many wildcards across a few fields (6 distinct fields, well within the 20-field limit). # All targeted fields are absent from the index so 0 hits are expected, @@ -185,10 +185,6 @@ json: registry.value: value: "*Teams*" case_insensitive: true - - wildcard: - registry.value: - value: "*TEHTRIS*" - case_insensitive: true - wildcard: registry.value: value: "*VMware*" @@ -229,10 +225,6 @@ json: process.executable: value: "*sara.exe*" case_insensitive: true - - wildcard: - process.command_line: - value: "*Letsignit*" - case_insensitive: true minimum_should_match: 1 - bool: should: