Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ encoding_rs = "=0.8.35"

[patch.crates-io]
sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "085a4c7" }
tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "c37128c307b0ba5d7c0040352f0d2606d6383b68" }
tantivy-fst = { git = "https://github.com/SekoiaLab/fst/", rev = "39ccb1c9283034815e22eb663d673a9e37023e7c" }

## This patched version of tracing helps to better understand what happens inside futures (when
## they are polled, how long each poll takes...)
Expand Down
3 changes: 2 additions & 1 deletion quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub(crate) use field_mapping_entry::{
#[cfg(test)]
pub(crate) use field_mapping_entry::{QuickwitNumericOptions, QuickwitTextOptions};
pub use field_mapping_type::FieldMappingType;
use quickwit_query::JsonPath;
use serde_json::Value as JsonValue;
use tantivy::Term;
use tantivy::schema::{Field, FieldType};
Expand Down Expand Up @@ -82,7 +83,7 @@ pub enum Automaton {
/// targeting the same field and json path. They are warmed up as a single combined automaton
/// matching the union of the patterns. The optional path is the json path prefix when the
/// field is a json field.
Regex(Option<Vec<u8>>, Vec<String>),
Regex(Option<JsonPath>, Vec<String>),
/// An exact-match automaton for a TermSet query.
TermSet(ExactSetAutomaton),
}
Expand Down
4 changes: 2 additions & 2 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use quickwit_query::query_ast::{
QueryAstTransformer, QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery,
};
use quickwit_query::tokenizers::TokenizerManager;
use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic};
use quickwit_query::{InvalidQuery, JsonPath, find_field_or_hit_dynamic};
use tantivy::Term;
use tantivy::query::Query;
use tantivy::schema::{Field, Schema};
Expand Down Expand Up @@ -323,7 +323,7 @@ fn coalesce_multi_term_fields_into_automatons(
/// and cannot be shared). `Automaton::TermSet` entries are left untouched.
fn coalesce_regexes_by_field(automatons_grouped_by_field: &mut HashMap<Field, HashSet<Automaton>>) {
for automatons in automatons_grouped_by_field.values_mut() {
let mut regexes_by_path: HashMap<Option<Vec<u8>>, Vec<String>> = HashMap::new();
let mut regexes_by_path: HashMap<Option<JsonPath>, Vec<String>> = HashMap::new();
let mut others: Vec<Automaton> = Vec::new();
for automaton in automatons.drain() {
match automaton {
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-query/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub use elastic_query_dsl::{ElasticQueryDsl, OneFieldMap};
pub use error::InvalidQuery;
pub use json_literal::{InterpretUserInput, JsonLiteral};
pub(crate) use not_nan_f32::NotNaNf32;
pub use query_ast::JsonPath;
pub use query_ast::utils::find_field_or_hit_dynamic;
use serde::{Deserialize, Serialize};
pub use tantivy::query::Query as TantivyQuery;
Expand Down
89 changes: 88 additions & 1 deletion quickwit/quickwit-query/src/query_ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
// limitations under the License.

use serde::{Deserialize, Serialize};
use tantivy::Term;
use tantivy::query::BoostQuery as TantivyBoostQuery;
use tantivy::schema::Schema as TantivySchema;
use tantivy::schema::{Field, Schema as TantivySchema};

use crate::tokenizers::TokenizerManager;

Expand Down Expand Up @@ -325,8 +326,46 @@ pub fn query_ast_from_user_text(user_text: &str, default_fields: Option<Vec<Stri
.into()
}

/// Binary, serialized form of a JSON path, used as a prefix to restrict automaton
/// queries (regex, wildcard) to one sub-field of a JSON field.
///
/// Path segments are separated by `\x01`, and a path produced by
/// `JsonPath::from_json_path` ends with a `\x00<type>` marker. The `Default`
/// value is the empty byte string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct JsonPath(Vec<u8>);

/// Gives read-only access to the raw serialized bytes of the path.
impl std::ops::Deref for JsonPath {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.0.as_slice()
    }
}

impl JsonPath {
/// Builds the serialized JSON-path prefix using the Tantivy encoding.
pub fn from_json_path(json_path: &str, expand_dots_enabled: bool) -> Self {
let mut term =
Term::from_field_json_path(Field::from_field_id(0), json_path, expand_dots_enabled);
term.append_type_and_str("");
// Skip the first byte: it is a JSON-field marker that is not stored in the dictionary.
JsonPath(term.value().as_serialized()[1..].to_owned())
}
}

/// Renders the path in a human-readable, dot-separated form.
///
/// Only the bytes before the first `\x00` are shown (all of them when there is
/// none), decoded as lossy UTF-8, with every `\x01` segment separator replaced
/// by `.`. Literal dots inside a segment and segment separators therefore look
/// the same in the output.
impl std::fmt::Display for JsonPath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let bytes = self.0.as_slice();
        // Stop before the `\x00<type>` marker, if any.
        let end = bytes
            .iter()
            .position(|&byte| byte == 0)
            .unwrap_or(bytes.len());
        let rendered = String::from_utf8_lossy(&bytes[..end]).replace('\x01', ".");
        f.write_str(&rendered)
    }
}

#[cfg(test)]
mod tests {
use super::JsonPath;
use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
use crate::query_ast::{
BoolQuery, BuildTantivyAst, BuildTantivyAstContext, QueryAst, UserInputQuery,
Expand Down Expand Up @@ -440,4 +479,52 @@ mod tests {
};
assert_eq!(input_query.default_operator, BooleanOperand::And);
}

#[test]
fn test_json_path_display() {
    // Table-driven check that `Display` renders a path built by
    // `JsonPath::from_json_path` in dotted form.
    // Each row is (json_path_input, expand_dots_enabled, expected_display).
    #[rustfmt::skip]
    let cases: &[(&str, bool, &str)] = &[
        // Empty path (field with no sub-key).
        ("",                 false, ""),
        ("",                 true,  ""),

        // Single-segment paths.
        ("foo",              false, "foo"),
        ("foo",              true,  "foo"),
        ("_my_field",        false, "_my_field"),
        ("field123",         false, "field123"),

        // Two-level paths.
        ("foo.bar",          false, "foo.bar"),
        ("foo.bar",          true,  "foo.bar"),
        ("process.executable", false, "process.executable"),

        // Three levels and deeper.
        ("a.b.c",            false, "a.b.c"),
        ("a.b.c.d.e",        false, "a.b.c.d.e"),

        // Escaped dot: `split_json_path` treats `\.` as a literal dot in the
        // segment name, so it becomes the single segment "k8s.node".
        // With expand_dots=false the writer stores it as the literal bytes
        // "k8s.node"; with expand_dots=true the writer re-expands the dot into
        // \x01. Display renders "k8s.node" in both cases.
        ("k8s\\.node",       false, "k8s.node"),
        ("k8s\\.node",       true,  "k8s.node"),

        // Escaped dot in the middle of a longer path.
        ("ns.k8s\\.node.id", false, "ns.k8s.node.id"),
        ("ns.k8s\\.node.id", true,  "ns.k8s.node.id"),
    ];

    cases.iter().for_each(|&(input, expand_dots, expected)| {
        let rendered = JsonPath::from_json_path(input, expand_dots).to_string();
        assert_eq!(
            rendered,
            expected,
            "from_json_path({input:?}, expand_dots={expand_dots})"
        );
    });
}
}
32 changes: 13 additions & 19 deletions quickwit/quickwit-query/src/query_ast/regex_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@ use std::sync::Arc;
use anyhow::Context;
pub use prefix::{AutomatonQuery, JsonPathPrefix};
use serde::{Deserialize, Serialize};
use tantivy::Term;
use tantivy::schema::{Field, FieldType, Schema as TantivySchema};

use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst};
use crate::query_ast::TantivyQueryAst;
use crate::{InvalidQuery, find_field_or_hit_dynamic};
use crate::{InvalidQuery, JsonPath, find_field_or_hit_dynamic};

/// A Regex query
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
Expand Down Expand Up @@ -51,7 +50,7 @@ impl RegexQuery {
pub fn to_field_and_regex(
&self,
schema: &TantivySchema,
) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
) -> Result<(Field, Option<JsonPath>, String), InvalidQuery> {
let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
else {
return Err(InvalidQuery::FieldDoesNotExist {
Expand Down Expand Up @@ -79,17 +78,8 @@ impl RegexQuery {
))
})?;

let mut term_for_path = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
term_for_path.append_type_and_str("");

let value = term_for_path.value();
// We skip the 1st byte which is a marker to tell this is json. This isn't present
// in the dictionary
let byte_path_prefix = value.as_serialized()[1..].to_owned();
let byte_path_prefix =
JsonPath::from_json_path(json_path, json_options.is_expand_dots_enabled());
Ok((field, Some(byte_path_prefix), self.regex.clone()))
}
_ => Err(InvalidQuery::SchemaError(
Expand Down Expand Up @@ -125,8 +115,10 @@ mod prefix {
use tantivy::schema::Field;
use tantivy_fst::Automaton;

use crate::JsonPath;

pub struct JsonPathPrefix<A> {
pub prefix: Vec<u8>,
pub prefix: JsonPath,
pub automaton: Arc<A>,
}

Expand Down Expand Up @@ -246,13 +238,15 @@ mod prefix {

#[cfg(test)]
mod tests {
use std::ops::Deref;
use std::sync::Arc;

use tantivy::schema::{Schema as TantivySchema, TEXT};
use tantivy_fst::{Automaton, Regex};

use super::prefix::JsonPathPrefixState;
use super::{JsonPathPrefix, RegexQuery};
use crate::JsonPath;

#[test]
fn test_regex_query_text_field() {
Expand Down Expand Up @@ -282,7 +276,7 @@ mod tests {
};
let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
assert_eq!(field, schema.get_field("field").unwrap());
assert_eq!(path.unwrap(), b"sub\x01field\0s");
assert_eq!(path.unwrap().deref(), b"sub\x01field\0s");
assert_eq!(regex, query.regex);

// i believe this is how concatenated field behave
Expand All @@ -292,15 +286,15 @@ mod tests {
};
let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap();
assert_eq!(field, schema.get_field("field").unwrap());
assert_eq!(path.unwrap(), b"\0s");
assert_eq!(path.unwrap().deref(), b"\0s");
assert_eq!(regex, query_empty_path.regex);
}

#[test]
fn test_json_prefix_automaton_empty_path() {
let regex = Arc::new(Regex::new("e(f|g.*)").unwrap());
let empty_path_automaton = JsonPathPrefix {
prefix: Vec::new(),
prefix: JsonPath::default(),
automaton: regex.clone(),
};

Expand All @@ -312,7 +306,7 @@ mod tests {
fn test_json_prefix_automaton() {
let regex = Arc::new(Regex::new("e(f|g.*)").unwrap());
let automaton = JsonPathPrefix {
prefix: b"ab".to_vec(),
prefix: JsonPath::from_json_path("ab", false),
automaton: regex.clone(),
};

Expand Down
22 changes: 6 additions & 16 deletions quickwit/quickwit-query/src/query_ast/wildcard_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ use std::sync::Arc;

use anyhow::{Context, bail};
use serde::{Deserialize, Serialize};
use tantivy::Term;
use tantivy::schema::{Field, FieldType, Schema as TantivySchema};

use super::{BuildTantivyAst, QueryAst};
use crate::query_ast::{AutomatonQuery, BuildTantivyAstContext, JsonPathPrefix, TantivyQueryAst};
use crate::tokenizers::TokenizerManager;
use crate::{InvalidQuery, find_field_or_hit_dynamic};
use crate::{InvalidQuery, JsonPath, find_field_or_hit_dynamic};

/// A Wildcard query allows to match 'bond' with a query like 'b*d'.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
Expand Down Expand Up @@ -112,7 +111,7 @@ impl WildcardQuery {
&self,
schema: &TantivySchema,
tokenizer_manager: &TokenizerManager,
) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
) -> Result<(Field, Option<JsonPath>, String), InvalidQuery> {
let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
else {
return Err(InvalidQuery::FieldDoesNotExist {
Expand Down Expand Up @@ -161,17 +160,8 @@ impl WildcardQuery {
regex
};

let mut term_for_path = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
term_for_path.append_type_and_str("");

let value = term_for_path.value();
// We skip the 1st byte which is a marker to tell this is json. This isn't present
// in the dictionary
let byte_path_prefix = value.as_serialized()[1..].to_owned();
let byte_path_prefix =
JsonPath::from_json_path(json_path, json_options.is_expand_dots_enabled());

Ok((field, Some(byte_path_prefix), regex))
}
Expand Down Expand Up @@ -328,7 +318,7 @@ mod tests {

let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes());
assert_eq!(path.unwrap().0, "Inner\u{1}Fie*ld\0s".as_bytes());
}

for tokenizer in [
Expand All @@ -347,7 +337,7 @@ mod tests {

let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut");
assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes());
assert_eq!(path.unwrap().0, "Inner\u{1}Fie*ld\0s".as_bytes());
}
}

Expand Down
Loading
Loading