From ae0ae2f55a324fe533fd46ecf11c9a5977124734 Mon Sep 17 00:00:00 2001 From: Darkheir Date: Mon, 22 Jun 2026 10:55:52 +0200 Subject: [PATCH 1/5] feat: Combine regexes targeting same fields Signed-off-by: Darkheir --- quickwit/Cargo.lock | 2 +- quickwit/Cargo.toml | 2 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 12 +- .../quickwit-doc-mapper/src/query_builder.rs | 232 +++++++++++++++++- quickwit/quickwit-search/src/leaf.rs | 55 ++++- 5 files changed, 289 insertions(+), 14 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 6161a1f45df..8ad8eac98dc 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -9241,7 +9241,7 @@ dependencies = [ [[package]] name = "tantivy-fst" version = "0.5.0" -source = "git+https://github.com/SekoiaLab/fst/?rev=1997e450c52712357a2ffdbf0446263357ee0c02#1997e450c52712357a2ffdbf0446263357ee0c02" +source = "git+https://github.com/SekoiaLab/fst/?rev=c37128c307b0ba5d7c0040352f0d2606d6383b68#c37128c307b0ba5d7c0040352f0d2606d6383b68" dependencies = [ "byteorder", "regex-syntax", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index c5710648179..334e54470ca 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -365,7 +365,7 @@ encoding_rs = "=0.8.35" [patch.crates-io] sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "085a4c7" } -tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "1997e450c52712357a2ffdbf0446263357ee0c02" } +tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "c37128c307b0ba5d7c0040352f0d2606d6383b68" } ## this patched version of tracing helps better understand what happens inside futures (when are ## they polled, how long does poll take...) 
diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 370674c9536..39dd6da124d 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -78,9 +78,11 @@ pub struct TermRange { #[derive(Debug, Clone, PartialEq, Eq, Hash)] /// Supported automaton types to warmup pub enum Automaton { - /// A regex in its str representation as tantivy_fst::Regex isn't PartialEq, and the path if - /// inside a json field - Regex(Option>, String), + /// One or more regexes (in their str representation, as tantivy_fst::Regex isn't PartialEq) + /// targeting the same field and json path. They are warmed up as a single combined automaton + /// matching the union of the patterns. The optional path is the json path prefix when the + /// field is a json field. + Regex(Option>, Vec), /// An exact-match automaton for a TermSet query. TermSet(ExactSetAutomaton), } @@ -661,7 +663,7 @@ mod tests { fn automaton_hashset(elements: &[&str]) -> HashSet { elements .iter() - .map(|elem| Automaton::Regex(None, elem.to_string())) + .map(|elem| Automaton::Regex(None, vec![elem.to_string()])) .collect() } @@ -783,7 +785,7 @@ mod tests { let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")]; for (field, regex) in expected_automatons { let field = Field::from_field_id(field); - let automaton = Automaton::Regex(None, regex.to_string()); + let automaton = Automaton::Regex(None, vec![regex.to_string()]); assert!( wi_base .automatons_grouped_by_field diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 38d4bab60b1..c40078fb080 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -224,6 +224,8 @@ pub(crate) fn build_query( 2, )?; + coalesce_regexes_by_field(&mut automatons_grouped_by_field); + let warmup_info = 
WarmupInfo { terms_grouped_by_field, term_ranges_grouped_by_field, @@ -285,6 +287,38 @@ fn coalesce_multi_term_fields_into_automatons( Ok(()) } +/// Merges all `Automaton::Regex` entries that target the same field and json path into a single +/// multi-pattern `Automaton::Regex`. +/// +/// During warmup, each `Automaton::Regex` triggers one term-dictionary traversal. By collapsing all +/// regexes sharing a `(field, path)` into a single entry, warmup can build one combined automaton +/// (matching the union of the patterns) and traverse the term dictionary once per field+path +/// instead of once per regex. The union is a safe over-approximation: the actual per-regex queries +/// still perform exact matching at execution time, so results are unchanged. +/// +/// Entries with different json paths are kept separate (the path is applied via `JsonPathPrefix` +/// and cannot be shared). `Automaton::TermSet` entries are left untouched. +fn coalesce_regexes_by_field(automatons_grouped_by_field: &mut HashMap>) { + for automatons in automatons_grouped_by_field.values_mut() { + let mut regexes_by_path: HashMap>, Vec> = HashMap::new(); + let mut others: Vec = Vec::new(); + for automaton in automatons.drain() { + match automaton { + Automaton::Regex(path, patterns) => { + regexes_by_path.entry(path).or_default().extend(patterns); + } + other => others.push(other), + } + } + for (path, mut patterns) in regexes_by_path { + patterns.sort(); + patterns.dedup(); + automatons.insert(Automaton::Regex(path, patterns)); + } + automatons.extend(others); + } +} + /// Converts a `prefix` term into the equivalent term range. 
/// /// The resulting range is `[prefix, next_prefix)`, that is: @@ -403,7 +437,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { Err(e) => return Err(e), }; - self.add_automaton(field, Automaton::Regex(path, regex)); + self.add_automaton(field, Automaton::Regex(path, vec![regex])); Ok(()) } @@ -414,7 +448,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()), Err(e) => return Err(e), }; - self.add_automaton(field, Automaton::Regex(path, regex)); + self.add_automaton(field, Automaton::Regex(path, vec![regex])); Ok(()) } } @@ -441,8 +475,9 @@ mod test { use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ - BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, - UserInputQuery, query_ast_from_user_text, + BoolQuery, BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, + QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, WildcardQuery, + query_ast_from_user_text, }; use quickwit_query::{ BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager, @@ -1038,4 +1073,193 @@ mod test { expected.insert(field, expected_inner); assert_eq!(extractor1.term_ranges_to_warm_up, expected); } + + /// Builds a bool query made of one regex clause per `(field, regex)` pair. + fn regex_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .map(|(field, regex)| { + QueryAst::Regex(RegexQuery { + field: field.clone(), + regex: regex.to_string(), + }) + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_regex_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by regexes: rejected. 
+ let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_build_query_accepts_exactly_twenty_regex_fields() { + let schema = make_schema(true); + + // Exactly 20 distinct fields: accepted. + let clauses: Vec<(String, &str)> = + (0..20).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + #[test] + fn test_build_query_accepts_many_regexes_on_same_field() { + let schema = make_schema(false); + + // 50 regexes all targeting the same field: accepted, since only one + // distinct field is involved. + let clauses: Vec<(String, &str)> = + (0..50).map(|_| ("title".to_string(), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + /// Builds a bool query made of wildcards. + fn wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .map(|(field, value)| { + QueryAst::Wildcard(WildcardQuery { + field: field.clone(), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }) + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by wilcards: rejected. 
+ let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + /// Builds a bool query made of both regexes and wildcards. + fn regex_wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .flat_map(|(field, value)| { + let wildcard = QueryAst::Wildcard(WildcardQuery { + field: format!("{}_wild", field), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }); + let regex = QueryAst::Regex(RegexQuery { + field: format!("{}_re", field), + regex: value.to_string(), + }); + vec![wildcard, regex] + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_or_regex_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by wilcards: rejected. + let clauses: Vec<(String, &str)> = + (0..11).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = regex_wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_build_query_coalesces_regexes_on_same_field() { + let schema = make_schema(false); + let context = BuildTantivyAstContext::for_test(&schema); + + // Several regexes (including a duplicate) targeting the same field collapse into a single + // automaton holding the deduplicated, sorted set of patterns. 
+ let clauses: Vec<(String, &str)> = vec![ + ("title".to_string(), "abc.*"), + ("title".to_string(), "xyz"), + ("title".to_string(), "foo.*"), + ("title".to_string(), "abc.*"), + ]; + let query_ast = regex_bool_query(&clauses); + let (_, warmup_info) = build_query(query_ast, &context, None).unwrap(); + + assert_eq!(warmup_info.automatons_grouped_by_field.len(), 1); + let automatons = warmup_info + .automatons_grouped_by_field + .values() + .next() + .unwrap(); + assert_eq!(automatons.len(), 1); + let Automaton::Regex(path, patterns) = automatons.iter().next().unwrap() else { + panic!("expected a regex automaton"); + }; + assert!(path.is_none()); + assert_eq!( + patterns, + &vec!["abc.*".to_string(), "foo.*".to_string(), "xyz".to_string()] + ); + } + + #[test] + fn test_build_query_keeps_regexes_on_different_fields_separate() { + let schema = make_schema(false); + let context = BuildTantivyAstContext::for_test(&schema); + + let clauses: Vec<(String, &str)> = vec![ + ("title".to_string(), "abc.*"), + ("desc".to_string(), "xyz.*"), + ]; + let query_ast = regex_bool_query(&clauses); + let (_, warmup_info) = build_query(query_ast, &context, None).unwrap(); + + assert_eq!(warmup_info.automatons_grouped_by_field.len(), 2); + for automatons in warmup_info.automatons_grouped_by_field.values() { + assert_eq!(automatons.len(), 1); + assert!(matches!( + automatons.iter().next().unwrap(), + Automaton::Regex(None, patterns) if patterns.len() == 1 + )); + } + } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index feb2aae3613..1ae34bfe322 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -338,9 +338,12 @@ async fn warm_up_automatons( let inv_idx_clone = inv_idx.clone(); warm_up_futures.push(async move { match automaton { - Automaton::Regex(path, regex_str) => { - let regex = tantivy_fst::Regex::new(regex_str) - .context("failed to parse regex during warmup")?; + Automaton::Regex(path, 
patterns) => { + // Combine all patterns so the term dictionary is + // traversed once instead of once per regex. + let regex = tantivy_fst::Regex::from_patterns(patterns).context( + "failed to build combined regex automaton during warmup", + )?; inv_idx_clone .warm_postings_automaton( quickwit_query::query_ast::JsonPathPrefix { @@ -2159,4 +2162,50 @@ mod tests { assert!(directory_size_larger > directory_size_smaller + 100); assert!(larger_size > smaller_size + 100); } + + #[tokio::test] + async fn test_warm_up_automatons_errors_when_combined_regex_unbuildable() { + let indexing_options = + TextOptions::default().set_indexing_options(TextFieldIndexing::default()); + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", indexing_options); + let schema = schema_builder.build(); + + let ram_directory = RamDirectory::create(); + let index = Index::open_or_create(ram_directory, schema).unwrap(); + let mut index_writer = index.writer(15_000_000).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_field_value(text_field, "hello"); + index_writer.add_document(doc).unwrap(); + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + + // Several valid regexes targeting the same field combine into a single + // automaton and warm up successfully. + let valid: HashMap> = std::iter::once(( + text_field, + HashSet::from([Automaton::Regex( + None, + vec!["h.*".to_string(), "x.*".to_string()], + )]), + )) + .collect(); + assert!(warm_up_automatons(&searcher, &valid).await.is_ok()); + + // An unbuildable regex (here, invalid syntax) must cause warmup to fail + // rather than fall back to warming the patterns individually. 
+ let invalid: HashMap> = std::iter::once(( + text_field, + HashSet::from([Automaton::Regex(None, vec!["(".to_string()])]), + )) + .collect(); + let error = warm_up_automatons(&searcher, &invalid) + .await + .unwrap_err() + .to_string(); + assert!( + error.contains("failed to build combined regex automaton during warmup"), + "unexpected error: {error}" + ); + } } From 37bb03c7882a072ac8be8118308914a63a7367f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Cohen?= Date: Mon, 22 Jun 2026 11:04:46 +0200 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../quickwit-doc-mapper/src/query_builder.rs | 120 +----------------- 1 file changed, 1 insertion(+), 119 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index c40078fb080..adabf2d3862 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -476,8 +476,7 @@ mod test { use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ BoolQuery, BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, - QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, WildcardQuery, - query_ast_from_user_text, + QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, query_ast_from_user_text, }; use quickwit_query::{ BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager, @@ -1091,123 +1090,6 @@ mod test { }) } - #[test] - fn test_build_query_rejects_too_many_regex_fields() { - let schema = make_schema(true); - - // 21 distinct fields targeted by regexes: rejected. 
- let clauses: Vec<(String, &str)> = - (0..21).map(|i| (format!("field_{i}"), "abc.*")).collect(); - let query_ast = regex_bool_query(&clauses); - let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) - .unwrap_err() - .to_string(); - assert!( - err.contains("distinct fields with regexes"), - "unexpected error: {err}" - ); - } - - #[test] - fn test_build_query_accepts_exactly_twenty_regex_fields() { - let schema = make_schema(true); - - // Exactly 20 distinct fields: accepted. - let clauses: Vec<(String, &str)> = - (0..20).map(|i| (format!("field_{i}"), "abc.*")).collect(); - let query_ast = regex_bool_query(&clauses); - assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); - } - - #[test] - fn test_build_query_accepts_many_regexes_on_same_field() { - let schema = make_schema(false); - - // 50 regexes all targeting the same field: accepted, since only one - // distinct field is involved. - let clauses: Vec<(String, &str)> = - (0..50).map(|_| ("title".to_string(), "abc.*")).collect(); - let query_ast = regex_bool_query(&clauses); - assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); - } - - /// Builds a bool query made of wildcards. - fn wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { - let must = clauses - .iter() - .map(|(field, value)| { - QueryAst::Wildcard(WildcardQuery { - field: field.clone(), - value: value.to_string(), - lenient: false, - case_insensitive: false, - }) - }) - .collect(); - QueryAst::Bool(BoolQuery { - must, - ..Default::default() - }) - } - - #[test] - fn test_build_query_rejects_too_many_wilcard_fields() { - let schema = make_schema(true); - - // 21 distinct fields targeted by wilcards: rejected. 
- let clauses: Vec<(String, &str)> = - (0..21).map(|i| (format!("field_{i}"), "abc*")).collect(); - let query_ast = wildcard_bool_query(&clauses); - let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) - .unwrap_err() - .to_string(); - assert!( - err.contains("distinct fields with regexes"), - "unexpected error: {err}" - ); - } - - /// Builds a bool query made of both regexes and wildcards. - fn regex_wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { - let must = clauses - .iter() - .flat_map(|(field, value)| { - let wildcard = QueryAst::Wildcard(WildcardQuery { - field: format!("{}_wild", field), - value: value.to_string(), - lenient: false, - case_insensitive: false, - }); - let regex = QueryAst::Regex(RegexQuery { - field: format!("{}_re", field), - regex: value.to_string(), - }); - vec![wildcard, regex] - }) - .collect(); - QueryAst::Bool(BoolQuery { - must, - ..Default::default() - }) - } - - #[test] - fn test_build_query_rejects_too_many_wilcard_or_regex_fields() { - let schema = make_schema(true); - - // 21 distinct fields targeted by wilcards: rejected. 
- let clauses: Vec<(String, &str)> = - (0..11).map(|i| (format!("field_{i}"), "abc*")).collect(); - let query_ast = regex_wildcard_bool_query(&clauses); - let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) - .unwrap_err() - .to_string(); - assert!( - err.contains("distinct fields with regexes"), - "unexpected error: {err}" - ); - } - #[test] fn test_build_query_coalesces_regexes_on_same_field() { let schema = make_schema(false); From 98631368b95871d0ae7ac0a9bf8757e4b46b1f47 Mon Sep 17 00:00:00 2001 From: Darkheir Date: Mon, 22 Jun 2026 11:53:55 +0200 Subject: [PATCH 3/5] feat: Limit number of fields targeted by a regex Signed-off-by: Darkheir --- .../quickwit-doc-mapper/src/query_builder.rs | 140 +++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index adabf2d3862..aa37895d82e 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -27,11 +27,15 @@ use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic}; use tantivy::Term; use tantivy::query::Query; use tantivy::schema::{Field, Schema}; -use tracing::error; +use tracing::{error, warn}; use crate::doc_mapper::FastFieldWarmupInfo; use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo}; +/// Maximum number of distinct fields that can be targeted by regex queries in a +/// single query. Multiple regexes targeting the same field count as one field. 
+const MAX_REGEX_QUERY_FIELDS: usize = 20; + #[derive(Default)] struct RangeQueryFields { range_query_field_names: HashSet, @@ -226,6 +230,20 @@ pub(crate) fn build_query( coalesce_regexes_by_field(&mut automatons_grouped_by_field); + let regex_field_count = automatons_grouped_by_field + .values() + .flatten() + .filter(|automaton| matches!(automaton, Automaton::Regex(_, _))) + .count(); + if regex_field_count > MAX_REGEX_QUERY_FIELDS { + let error_msg = format!( + "query targets {} distinct fields with regexes, but at most {} are allowed", + regex_field_count, MAX_REGEX_QUERY_FIELDS, + ); + warn!("{}", error_msg); + return Err(InvalidQuery::Other(anyhow::anyhow!(error_msg)).into()); + } + let warmup_info = WarmupInfo { terms_grouped_by_field, term_ranges_grouped_by_field, @@ -476,7 +494,8 @@ mod test { use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ BoolQuery, BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, - QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, query_ast_from_user_text, + QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, WildcardQuery, + query_ast_from_user_text, }; use quickwit_query::{ BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager, @@ -1144,4 +1163,121 @@ mod test { )); } } + + #[test] + fn test_build_query_rejects_too_many_regex_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by regexes: rejected. + let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_build_query_accepts_exactly_twenty_regex_fields() { + let schema = make_schema(true); + + // Exactly 20 distinct fields: accepted. 
+ let clauses: Vec<(String, &str)> = + (0..20).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + #[test] + fn test_build_query_accepts_many_regexes_on_same_field() { + let schema = make_schema(false); + + // 50 regexes all targeting the same field: accepted, since only one + // distinct field is involved. + let clauses: Vec<(String, &str)> = + (0..50).map(|_| ("title".to_string(), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + /// Builds a bool query made of wildcards. + fn wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .map(|(field, value)| { + QueryAst::Wildcard(WildcardQuery { + field: field.clone(), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }) + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by wildcards: rejected. + let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + /// Builds a bool query made of both regexes and wildcards. 
+ fn regex_wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .flat_map(|(field, value)| { + let wildcard = QueryAst::Wildcard(WildcardQuery { + field: format!("{}_wild", field), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }); + let regex = QueryAst::Regex(RegexQuery { + field: format!("{}_re", field), + regex: value.to_string(), + }); + vec![wildcard, regex] + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_or_regex_fields() { + let schema = make_schema(true); + + // 22 distinct fields (11 targeted by wildcards, 11 by regexes): rejected. + let clauses: Vec<(String, &str)> = + (0..11).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = regex_wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } } From 7710ff0f28c4ec5fa5e090a3e0ff6a97aa3f605c Mon Sep 17 00:00:00 2001 From: Darkheir Date: Mon, 22 Jun 2026 11:59:49 +0200 Subject: [PATCH 4/5] feat: Read max fields from env Signed-off-by: Darkheir --- quickwit/quickwit-doc-mapper/src/query_builder.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index aa37895d82e..2082e5b9867 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -15,7 +15,7 @@ use std::collections::{HashMap, HashSet}; use std::convert::Infallible; use std::ops::Bound; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use quickwit_proto::types::SplitId; use quickwit_query::query_ast::{ @@ -34,7 +34,8 @@ use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, 
WarmupInf /// Maximum number of 
distinct fields that can be targeted by regex queries in a /// single query. Multiple regexes targeting the same field count as one field. -const MAX_REGEX_QUERY_FIELDS: usize = 20; +static MAX_REGEX_QUERY_FIELDS: LazyLock = + LazyLock::new(|| quickwit_common::get_from_env("QW_MAX_REGEX_QUERY_FIELDS", 20, false)); #[derive(Default)] struct RangeQueryFields { @@ -235,10 +236,10 @@ pub(crate) fn build_query( .flatten() .filter(|automaton| matches!(automaton, Automaton::Regex(_, _))) .count(); - if regex_field_count > MAX_REGEX_QUERY_FIELDS { + if regex_field_count > *MAX_REGEX_QUERY_FIELDS { let error_msg = format!( "query targets {} distinct fields with regexes, but at most {} are allowed", - regex_field_count, MAX_REGEX_QUERY_FIELDS, + regex_field_count, *MAX_REGEX_QUERY_FIELDS, ); warn!("{}", error_msg); return Err(InvalidQuery::Other(anyhow::anyhow!(error_msg)).into()); From 8884e1dc34e1ae5fe26097fb5866de1508587a2a Mon Sep 17 00:00:00 2001 From: Darkheir Date: Mon, 22 Jun 2026 12:04:01 +0200 Subject: [PATCH 5/5] feat: Apply review suggestion Signed-off-by: Darkheir --- .../quickwit-doc-mapper/src/query_builder.rs | 6 ++++- quickwit/quickwit-search/src/leaf.rs | 26 ++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 2082e5b9867..cfe27d841ea 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -241,7 +241,11 @@ pub(crate) fn build_query( "query targets {} distinct fields with regexes, but at most {} are allowed", regex_field_count, *MAX_REGEX_QUERY_FIELDS, ); - warn!("{}", error_msg); + warn!( + count = regex_field_count, + max = *MAX_REGEX_QUERY_FIELDS, + "too many regexes on distinct paths" + ); return Err(InvalidQuery::Other(anyhow::anyhow!(error_msg)).into()); } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 
1ae34bfe322..8f007a604fe 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -332,18 +332,30 @@ async fn warm_up_automatons( .map_err(|_| std::io::Error::other("task panicked"))? }; for (field, automatons) in terms_grouped_by_field { + let field_name = searcher.schema().get_field_name(*field).to_string(); for segment_reader in searcher.segment_readers() { let inv_idx = segment_reader.inverted_index(*field)?; for automaton in automatons { let inv_idx_clone = inv_idx.clone(); + let field_name = field_name.clone(); warm_up_futures.push(async move { match automaton { Automaton::Regex(path, patterns) => { + let path_str = path + .as_deref() + .map(|path| String::from_utf8_lossy(path).into_owned()) + .unwrap_or_default(); // Combine all patterns so the term dictionary is // traversed once instead of once per regex. - let regex = tantivy_fst::Regex::from_patterns(patterns).context( - "failed to build combined regex automaton during warmup", - )?; + let regex = + tantivy_fst::Regex::from_patterns(patterns).with_context(|| { + format!( + "failed to build combined regex automaton during warmup \ + for field `{field_name}` (path: `{path_str}`, {} \ + patterns)", + patterns.len(), + ) + })?; inv_idx_clone .warm_postings_automaton( quickwit_query::query_ast::JsonPathPrefix { @@ -353,7 +365,13 @@ async fn warm_up_automatons( cpu_intensive_executor, ) .await - .context("failed to load automaton") + .with_context(|| { + format!( + "failed to load automaton for field `{field_name}` (path: \ + `{path_str}`, {} patterns)", + patterns.len(), + ) + }) } Automaton::TermSet(automaton) => inv_idx_clone .warm_postings_automaton(automaton.clone(), cpu_intensive_executor)