From 94527ad38d6d7d4a238fed7ce7d92049615036c9 Mon Sep 17 00:00:00 2001 From: Darkheir Date: Thu, 18 Jun 2026 10:50:28 +0200 Subject: [PATCH] feat: Limit number of fields targeted by a regex Signed-off-by: Darkheir --- .../quickwit-doc-mapper/src/query_builder.rs | 181 +++++++++++++++++- 1 file changed, 178 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 38d4bab60b1..f7a3805055d 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -27,11 +27,15 @@ use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic}; use tantivy::Term; use tantivy::query::Query; use tantivy::schema::{Field, Schema}; -use tracing::error; +use tracing::{error, warn}; use crate::doc_mapper::FastFieldWarmupInfo; use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo}; +/// Maximum number of distinct fields that can be targeted by regex queries in a +/// single query. Multiple regexes targeting the same field count as one field. +const MAX_REGEX_QUERY_FIELDS: usize = 20; + #[derive(Default)] struct RangeQueryFields { range_query_field_names: HashSet, @@ -47,6 +51,29 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields { } } +/// Collects the set of distinct fields targeted by regex queries. +#[derive(Default)] +struct RegexQueryFields { + regex_query_field_names: HashSet, +} + +impl<'a> QueryAstVisitor<'a> for RegexQueryFields { + type Err = Infallible; + + fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Infallible> { + self.regex_query_field_names + .insert(regex_query.field.to_string()); + Ok(()) + } + + fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Infallible> { + // Wildcard queries are converted to regexes, so we also count them as regex queries. 
+ self.regex_query_field_names + .insert(wildcard_query.field.clone()); + Ok(()) + } +} + /// Term Queries on fields which are fast but not indexed. struct TermSearchOnColumnar<'f> { fields: &'f mut HashSet, @@ -171,6 +198,19 @@ pub(crate) fn build_query( query_ast }; + let mut regex_query_fields = RegexQueryFields::default(); + // This cannot fail. The error type is Infallible. + let Ok(_) = regex_query_fields.visit(&query_ast); + if regex_query_fields.regex_query_field_names.len() > MAX_REGEX_QUERY_FIELDS { + let error_msg = format!( + "query targets {} distinct fields with regexes, but at most {} are allowed", + regex_query_fields.regex_query_field_names.len(), + MAX_REGEX_QUERY_FIELDS, + ); + warn!("{}", error_msg); + return Err(InvalidQuery::Other(anyhow::anyhow!(error_msg)).into()); + } + let mut range_query_fields = RangeQueryFields::default(); // This cannot fail. The error type is Infallible. let Ok(_) = range_query_fields.visit(&query_ast); @@ -441,8 +481,9 @@ mod test { use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ - BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, - UserInputQuery, query_ast_from_user_text, + BoolQuery, BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, + QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, WildcardQuery, + query_ast_from_user_text, }; use quickwit_query::{ BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager, @@ -1038,4 +1079,138 @@ mod test { expected.insert(field, expected_inner); assert_eq!(extractor1.term_ranges_to_warm_up, expected); } + + /// Builds a bool query made of one regex clause per `(field, regex)` pair. 
+ fn regex_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .map(|(field, regex)| { + QueryAst::Regex(RegexQuery { + field: field.clone(), + regex: regex.to_string(), + }) + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_regex_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by regexes: rejected. + let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_build_query_accepts_exactly_twenty_regex_fields() { + let schema = make_schema(true); + + // Exactly 20 distinct fields: accepted. + let clauses: Vec<(String, &str)> = + (0..20).map(|i| (format!("field_{i}"), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + #[test] + fn test_build_query_accepts_many_regexes_on_same_field() { + let schema = make_schema(false); + + // 50 regexes all targeting the same field: accepted, since only one + // distinct field is involved. + let clauses: Vec<(String, &str)> = + (0..50).map(|_| ("title".to_string(), "abc.*")).collect(); + let query_ast = regex_bool_query(&clauses); + assert!(build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None).is_ok()); + } + + /// Builds a bool query made of wildcards. 
+ fn wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .map(|(field, value)| { + QueryAst::Wildcard(WildcardQuery { + field: field.clone(), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }) + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_fields() { + let schema = make_schema(true); + + // 21 distinct fields targeted by wildcards: rejected. + let clauses: Vec<(String, &str)> = + (0..21).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } + + /// Builds a bool query made of both regexes and wildcards. + fn regex_wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst { + let must = clauses + .iter() + .flat_map(|(field, value)| { + let wildcard = QueryAst::Wildcard(WildcardQuery { + field: format!("{}_wild", field), + value: value.to_string(), + lenient: false, + case_insensitive: false, + }); + let regex = QueryAst::Regex(RegexQuery { + field: format!("{}_re", field), + regex: value.to_string(), + }); + vec![wildcard, regex] + }) + .collect(); + QueryAst::Bool(BoolQuery { + must, + ..Default::default() + }) + } + + #[test] + fn test_build_query_rejects_too_many_wilcard_or_regex_fields() { + let schema = make_schema(true); + + // 22 distinct fields (11 targeted by wildcards and 11 by regexes): rejected. + let clauses: Vec<(String, &str)> = + (0..11).map(|i| (format!("field_{i}"), "abc*")).collect(); + let query_ast = regex_wildcard_bool_query(&clauses); + let err = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None) + .unwrap_err() + .to_string(); + assert!( + err.contains("distinct fields with regexes"), + "unexpected error: {err}" + ); + } }