Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 178 additions & 3 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@ use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic};
use tantivy::Term;
use tantivy::query::Query;
use tantivy::schema::{Field, Schema};
use tracing::error;
use tracing::{error, warn};
Comment thread
Darkheir marked this conversation as resolved.

use crate::doc_mapper::FastFieldWarmupInfo;
use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo};

/// Maximum number of distinct fields that can be targeted by regex queries in a
/// single query. Multiple regexes targeting the same field count as one field.
///
/// Wildcard queries are counted as regex queries for the purpose of this limit.
/// `build_query` rejects a query exceeding it with an `InvalidQuery::Other` error.
const MAX_REGEX_QUERY_FIELDS: usize = 20;

#[derive(Default)]
struct RangeQueryFields {
range_query_field_names: HashSet<String>,
Expand All @@ -47,6 +51,29 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields {
}
}

/// Collects the set of distinct fields targeted by regex queries.
///
/// Used as a `QueryAstVisitor`: after visiting a query AST, the set holds one
/// entry per distinct field name, however many clauses target that field.
#[derive(Default)]
struct RegexQueryFields {
/// Names of the fields seen so far; duplicates collapse into a single entry.
regex_query_field_names: HashSet<String>,
}

/// Records the target field of every regex and wildcard clause that is visited.
impl<'a> QueryAstVisitor<'a> for RegexQueryFields {
    type Err = Infallible;

    /// Registers the field targeted by a regex query.
    fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Infallible> {
        let field_name = regex_query.field.to_string();
        self.regex_query_field_names.insert(field_name);
        Ok(())
    }

    /// Registers the field targeted by a wildcard query.
    fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Infallible> {
        // Wildcard queries are converted to regexes, so they count as regex queries too.
        let field_name = wildcard_query.field.clone();
        self.regex_query_field_names.insert(field_name);
        Ok(())
    }
}
Comment thread
Darkheir marked this conversation as resolved.

/// Term Queries on fields which are fast but not indexed.
struct TermSearchOnColumnar<'f> {
fields: &'f mut HashSet<FastFieldWarmupInfo>,
Expand Down Expand Up @@ -171,6 +198,19 @@ pub(crate) fn build_query(
query_ast
};

let mut regex_query_fields = RegexQueryFields::default();
// This cannot fail. The error type is Infallible.
let Ok(_) = regex_query_fields.visit(&query_ast);
if regex_query_fields.regex_query_field_names.len() > MAX_REGEX_QUERY_FIELDS {
let error_msg = format!(
"query targets {} distinct fields with regexes, but at most {} are allowed",
regex_query_fields.regex_query_field_names.len(),
MAX_REGEX_QUERY_FIELDS,
);
warn!("{}", error_msg);
return Err(InvalidQuery::Other(anyhow::anyhow!(error_msg)).into());
}

let mut range_query_fields = RangeQueryFields::default();
// This cannot fail. The error type is Infallible.
let Ok(_) = range_query_fields.visit(&query_ast);
Expand Down Expand Up @@ -441,8 +481,9 @@ mod test {

use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME;
use quickwit_query::query_ast::{
BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor,
UserInputQuery, query_ast_from_user_text,
BoolQuery, BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery,
QueryAst, QueryAstVisitor, RegexQuery, UserInputQuery, WildcardQuery,
query_ast_from_user_text,
};
use quickwit_query::{
BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager,
Expand Down Expand Up @@ -1038,4 +1079,138 @@ mod test {
expected.insert(field, expected_inner);
assert_eq!(extractor1.term_ranges_to_warm_up, expected);
}

/// Builds a bool query whose `must` clauses are regex queries, one per
/// `(field, regex)` pair, in the order given.
fn regex_bool_query(clauses: &[(String, &str)]) -> QueryAst {
    let mut must = Vec::with_capacity(clauses.len());
    for (field, regex) in clauses {
        must.push(QueryAst::Regex(RegexQuery {
            field: field.clone(),
            regex: regex.to_string(),
        }));
    }
    QueryAst::Bool(BoolQuery {
        must,
        ..Default::default()
    })
}

/// Regexes on 21 distinct fields exceed `MAX_REGEX_QUERY_FIELDS` and must be rejected.
#[test]
fn test_build_query_rejects_too_many_regex_fields() {
    let schema = make_schema(true);

    // One regex clause per field, `field_0` through `field_20`.
    let mut clauses: Vec<(String, &str)> = Vec::with_capacity(21);
    for i in 0..21 {
        clauses.push((format!("field_{i}"), "abc.*"));
    }

    let context = BuildTantivyAstContext::for_test(&schema);
    let build_result = build_query(regex_bool_query(&clauses), &context, None);
    let err = build_result.unwrap_err().to_string();
    assert!(
        err.contains("distinct fields with regexes"),
        "unexpected error: {err}"
    );
}

/// Regexes on exactly 20 distinct fields sit at the limit and must be accepted.
#[test]
fn test_build_query_accepts_exactly_twenty_regex_fields() {
    let schema = make_schema(true);

    // One regex clause per field, `field_0` through `field_19`.
    let mut clauses: Vec<(String, &str)> = Vec::with_capacity(20);
    for i in 0..20 {
        clauses.push((format!("field_{i}"), "abc.*"));
    }

    let context = BuildTantivyAstContext::for_test(&schema);
    let build_result = build_query(regex_bool_query(&clauses), &context, None);
    assert!(build_result.is_ok());
}

/// The limit applies to distinct fields, not to the number of regex clauses.
#[test]
fn test_build_query_accepts_many_regexes_on_same_field() {
    let schema = make_schema(false);

    // 50 identical clauses on `title`: a single distinct field, so the query is accepted.
    let clauses: Vec<(String, &str)> = vec![("title".to_string(), "abc.*"); 50];

    let context = BuildTantivyAstContext::for_test(&schema);
    let build_result = build_query(regex_bool_query(&clauses), &context, None);
    assert!(build_result.is_ok());
}

/// Builds a bool query whose `must` clauses are wildcard queries, one per
/// `(field, value)` pair, in the order given. Every clause is strict
/// (`lenient: false`) and case sensitive (`case_insensitive: false`).
fn wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst {
    let mut must = Vec::with_capacity(clauses.len());
    for (field, value) in clauses {
        must.push(QueryAst::Wildcard(WildcardQuery {
            field: field.clone(),
            value: value.to_string(),
            lenient: false,
            case_insensitive: false,
        }));
    }
    QueryAst::Bool(BoolQuery {
        must,
        ..Default::default()
    })
}

/// Wildcards count toward the regex field limit: 21 distinct wildcard fields are rejected.
#[test]
fn test_build_query_rejects_too_many_wilcard_fields() {
    let schema = make_schema(true);

    // One wildcard clause per field, `field_0` through `field_20`.
    let mut clauses: Vec<(String, &str)> = Vec::with_capacity(21);
    for i in 0..21 {
        clauses.push((format!("field_{i}"), "abc*"));
    }

    let context = BuildTantivyAstContext::for_test(&schema);
    let build_result = build_query(wildcard_bool_query(&clauses), &context, None);
    let err = build_result.unwrap_err().to_string();
    assert!(
        err.contains("distinct fields with regexes"),
        "unexpected error: {err}"
    );
}

/// Builds a bool query mixing wildcards and regexes.
///
/// Each `(field, value)` pair contributes two `must` clauses, in this order: a
/// wildcard on `<field>_wild` and a regex on `<field>_re`, both using `value`
/// as their pattern. Every pair therefore adds two distinct field names.
fn regex_wildcard_bool_query(clauses: &[(String, &str)]) -> QueryAst {
    let mut must = Vec::with_capacity(clauses.len() * 2);
    for (field, value) in clauses {
        must.push(QueryAst::Wildcard(WildcardQuery {
            field: format!("{}_wild", field),
            value: value.to_string(),
            lenient: false,
            case_insensitive: false,
        }));
        must.push(QueryAst::Regex(RegexQuery {
            field: format!("{}_re", field),
            regex: value.to_string(),
        }));
    }
    QueryAst::Bool(BoolQuery {
        must,
        ..Default::default()
    })
}

/// Wildcard and regex fields are counted together against the same limit.
#[test]
fn test_build_query_rejects_too_many_wilcard_or_regex_fields() {
    let schema = make_schema(true);

    // 11 pairs, each expanded into one wildcard field and one regex field:
    // 22 distinct fields in total, which is over the limit.
    let mut clauses: Vec<(String, &str)> = Vec::with_capacity(11);
    for i in 0..11 {
        clauses.push((format!("field_{i}"), "abc*"));
    }

    let context = BuildTantivyAstContext::for_test(&schema);
    let build_result = build_query(regex_wildcard_bool_query(&clauses), &context, None);
    let err = build_result.unwrap_err().to_string();
    assert!(
        err.contains("distinct fields with regexes"),
        "unexpected error: {err}"
    );
}
}
Loading