From 965a0847a49c3767079806290ea39d43c2bc7510 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 30 Apr 2026 08:45:01 +0200 Subject: [PATCH] Use automaton for multi term queries --- quickwit/Cargo.lock | 1 + quickwit/quickwit-doc-mapper/Cargo.toml | 1 + .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 104 ++++++---- quickwit/quickwit-doc-mapper/src/lib.rs | 6 +- .../quickwit-doc-mapper/src/query_builder.rs | 184 +++++++++++++----- quickwit/quickwit-search/src/leaf.rs | 42 +--- quickwit/quickwit-search/src/root.rs | 11 +- quickwit/quickwit-search/src/tests.rs | 84 +++++++- 8 files changed, 293 insertions(+), 140 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 41c8baf8139..e35c7d87de5 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8532,6 +8532,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", + "tantivy-fst", "thiserror 2.0.18", "time", "tracing", diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index 3919e3d1405..2cb2be73b9f 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -24,6 +24,7 @@ serde_json = { workspace = true } serde_json_borrow = { workspace = true } siphasher = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } utoipa = { workspace = true } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 749dde228a7..370674c9536 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -26,6 +26,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::ops::Bound; +use anyhow::bail; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; pub use field_mapping_entry::{ @@ -41,6 +42,7 @@ pub use field_mapping_type::FieldMappingType; use serde_json::Value as JsonValue; use tantivy::Term; use tantivy::schema::{Field, FieldType}; +use tantivy_fst::Automaton as TantivyFstAutomaton; pub(crate) use tokenizer_entry::{ NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, }; @@ -76,10 +78,70 @@ pub struct TermRange { #[derive(Debug, Clone, PartialEq, Eq, Hash)] /// Supported automaton types to warmup pub enum Automaton { - /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if + /// A regex in its str representation as tantivy_fst::Regex isn't PartialEq, and the path if /// inside a json field Regex(Option>, String), - // we could add termset query here, instead of downloading the whole dictionary + /// An exact-match automaton for a TermSet query. + TermSet(ExactSetAutomaton), +} + +/// A byte-level DFA that accepts exactly the strings in a sorted, deduplicated byte-sequence +/// set. State = `(depth, lo, hi)` meaning all terms in `self.terms[lo..hi]` share the first +/// `depth` bytes consumed so far. Transitions are computed via binary search, avoiding any +/// upfront DFA materialisation. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ExactSetAutomaton { + /// Holds sorted, deduplicated `term.serialized_value_bytes()` for all terms in the set. + /// Using `warm_postings_automaton` coalesces both the SSTable lookup and the postings + /// downloads into a small number of merged range requests. + terms: Vec>, +} + +impl ExactSetAutomaton { + /// Create an `ExactSetAutomaton` from an iterator of terms. + pub fn try_from_terms<'a>(terms: impl IntoIterator) -> anyhow::Result { + let mut sorted_bytes: Vec> = terms + .into_iter() + .map(|term| term.serialized_value_bytes().to_vec()) + .collect(); + if sorted_bytes.is_empty() { + bail!("Cannot create an ExactSetAutomaton from an empty set of terms"); + } + sorted_bytes.sort(); + sorted_bytes.dedup(); + Ok(ExactSetAutomaton { + terms: sorted_bytes, + }) + } +} + +impl TantivyFstAutomaton for ExactSetAutomaton { + /// (depth, lo, hi) + type State = (usize, usize, usize); + + fn start(&self) -> Self::State { + (0, 0, self.terms.len()) + } + + fn is_match(&self, &(depth, lo, hi): &Self::State) -> bool { + lo < hi && self.terms[lo].len() == depth + } + + fn can_match(&self, &(_, lo, hi): &Self::State) -> bool { + lo < hi + } + + fn accept(&self, &(depth, lo, hi): &Self::State, byte: u8) -> Self::State { + // Within [lo, hi), terms are sorted by their bytes. Terms of length == depth (exact + // matches) sort before any extension, so there is at most one such term at index lo. + // Skip it — it has no byte at position `depth`. + let lo = lo + usize::from(lo < hi && self.terms[lo].len() == depth); + // Binary-search for the sub-range where terms[i][depth] == byte. + // All remaining terms in [lo, hi) have length > depth, so indexing [depth] is safe. + let new_lo = lo + self.terms[lo..hi].partition_point(|t| t[depth] < byte); + let new_hi = new_lo + self.terms[new_lo..hi].partition_point(|t| t[depth] <= byte); + (depth + 1, new_lo, new_hi) + } } /// Description of how a fast field should be warmed up @@ -95,9 +157,6 @@ pub struct FastFieldWarmupInfo { /// running the query. #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct WarmupInfo { - /// Name of fields from the term dictionary and posting list which needs to - /// be entirely loaded - pub term_dict_fields: HashSet, /// Fast fields which needs to be loaded pub fast_fields: HashSet, /// Whether to warmup field norms. Used mostly for scoring. @@ -113,7 +172,6 @@ pub struct WarmupInfo { impl WarmupInfo { /// Merge other WarmupInfo into self. pub fn merge(&mut self, other: WarmupInfo) { - self.term_dict_fields.extend(other.term_dict_fields); self.field_norms |= other.field_norms; for fast_field_warmup_info in other.fast_fields.into_iter() { @@ -151,21 +209,6 @@ impl WarmupInfo { /// Simplify a WarmupInfo, removing some redundant tasks pub fn simplify(&mut self) { - self.terms_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - // we are already about to full-load this dictionary. We only care about terms - // which needs additional position - terms.retain(|_term, include_position| *include_position); - } - // if no term is left, remove the entry from the hashmap - !terms.is_empty() - }); - self.term_ranges_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - terms.retain(|_term, include_position| *include_position); - } - !terms.is_empty() - }); // TODO we could remove from terms_grouped_by_field for ranges with no `limit` in // term_ranges_grouped_by_field } @@ -622,13 +665,6 @@ mod tests { .collect() } - fn hashset_field(elements: &[u32]) -> HashSet { - elements - .iter() - .map(|elem| Field::from_field_id(*elem)) - .collect() - } - fn hashmap(elements: &[(u32, &str, bool)]) -> HashMap> { let mut result: HashMap> = HashMap::new(); for (field, term, pos) in elements { @@ -663,7 +699,6 @@ mod tests { #[test] fn test_warmup_info_merge() { let wi_base = WarmupInfo { - term_dict_fields: hashset_field(&[1, 2]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[(1, "term1", false), (1, "term2", false)]), @@ -686,7 +721,6 @@ mod tests { let mut wi_base = wi_base; let wi_2 = WarmupInfo { - term_dict_fields: hashset_field(&[2, 3]), fast_fields: hashset_fast(&["fast2", "fast3"]), field_norms: true, terms_grouped_by_field: hashmap(&[(2, "term1", false), (1, "term2", true)]), @@ -703,7 +737,6 @@ mod tests { }; wi_base.merge(wi_2.clone()); - assert_eq!(wi_base.term_dict_fields, hashset_field(&[1, 2, 3])); assert_eq!( wi_base.fast_fields, hashset_fast(&["fast1", "fast2", "fast3"]) @@ -769,7 +802,6 @@ mod tests { #[test] fn test_warmup_info_simplify() { let mut warmup_info = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[ @@ -791,11 +823,15 @@ mod tests { .collect(), }; let expected = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, - terms_grouped_by_field: hashmap(&[(1, "term2", true), (2, "term3", false)]), + terms_grouped_by_field: hashmap(&[ + (1, "term1", false), + (1, "term2", true), + (2, "term3", false), + ]), term_ranges_grouped_by_field: hashmap_ranges(&[ + (1, "term1", false), (1, "term2", true), (2, "term3", false), ]), diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index 8dee8d700ed..1b5a67908e3 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -30,9 +30,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo, FieldMappingEntry, - FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, - TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, + Automaton, BinaryFormat, DocMapper, DocMapperBuilder, ExactSetAutomaton, FastFieldWarmupInfo, + FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, + QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, }; use doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 78f41b654bf..04b360987f3 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -30,7 +30,7 @@ use tantivy::schema::{Field, Schema}; use tracing::error; use crate::doc_mapper::FastFieldWarmupInfo; -use crate::{Automaton, QueryParserError, TermRange, WarmupInfo}; +use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] struct RangeQueryFields { @@ -198,8 +198,7 @@ pub(crate) fn build_query( let query = query_ast.build_tantivy_query(context)?; - let term_set_query_fields = extract_term_set_query_fields(&query_ast, context.schema)?; - let (term_ranges_grouped_by_field, automatons_grouped_by_field) = + let (term_ranges_grouped_by_field, mut automatons_grouped_by_field) = extract_prefix_term_ranges_and_automaton( &query_ast, context.schema, @@ -219,8 +218,13 @@ pub(crate) fn build_query( .or_default() |= need_position; }); + coalesce_multi_term_fields_into_automatons( + &mut terms_grouped_by_field, + &mut automatons_grouped_by_field, + 2, + )?; + let warmup_info = WarmupInfo { - term_dict_fields: term_set_query_fields, terms_grouped_by_field, term_ranges_grouped_by_field, fast_fields, @@ -231,44 +235,54 @@ pub(crate) fn build_query( Ok((query, warmup_info)) } -struct ExtractTermSetFields<'a> { - term_dict_fields_to_warm_up: HashSet, - schema: &'a Schema, -} - -impl<'a> ExtractTermSetFields<'a> { - fn new(schema: &'a Schema) -> Self { - ExtractTermSetFields { - term_dict_fields_to_warm_up: HashSet::new(), - schema, +/// For any field with more than `term_threshold` non-positional terms, moves +/// those terms into an `Automaton::TermSet` and removes them from +/// `terms_grouped_by_field`. +/// +/// This enables `warm_postings_automaton` to coalesce both the SSTable block +/// fetches and the postings downloads into a small number of merged range +/// requests, instead of N individual per-term requests. +/// +/// A minimum of `term_threshold` terms is required because +/// `warm_postings_automaton` has higher per-call overhead than a direct point +/// lookup: spawning a CPU task and traversing the sstable twice. That overhead +/// is only worth paying when there are enough terms to coalesce. +/// +/// Terms that require positions are left in `terms_grouped_by_field` unchanged, +/// as they must be fetched individually. +/// +/// TODO: should positional terms also support some form of grouping? +fn coalesce_multi_term_fields_into_automatons( + terms_grouped_by_field: &mut HashMap>, + automatons_grouped_by_field: &mut HashMap>, + term_threshold: usize, +) -> anyhow::Result<()> { + let fields: Vec = terms_grouped_by_field.keys().copied().collect(); + for field in fields { + let no_pos_terms: Vec<&Term> = terms_grouped_by_field + .get(&field) + .unwrap() + .iter() + .filter(|(_, need_pos)| !**need_pos) + .map(|(term, _)| term) + .collect(); + if no_pos_terms.len() <= term_threshold { + continue; } - } -} - -impl<'a> QueryAstVisitor<'a> for ExtractTermSetFields<'_> { - type Err = anyhow::Error; - - fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> anyhow::Result<()> { - for field in term_set_query.terms_per_field.keys() { - if let Some((field, _field_entry, _path)) = - find_field_or_hit_dynamic(field, self.schema) - { - self.term_dict_fields_to_warm_up.insert(field); - } else { - anyhow::bail!("field does not exist: {}", field); - } + let automaton = ExactSetAutomaton::try_from_terms(no_pos_terms)?; + automatons_grouped_by_field + .entry(field) + .or_default() + .insert(Automaton::TermSet(automaton)); + // Remove the no-position terms: the automaton covers their SSTable lookup + postings. + // Terms still needing positions are kept for warm_up_terms. + let field_terms = terms_grouped_by_field.get_mut(&field).unwrap(); + field_terms.retain(|_, need_pos| *need_pos); + if field_terms.is_empty() { + terms_grouped_by_field.remove(&field); } - Ok(()) } -} - -fn extract_term_set_query_fields( - query_ast: &QueryAst, - schema: &Schema, -) -> anyhow::Result> { - let mut visitor = ExtractTermSetFields::new(schema); - visitor.visit(query_ast)?; - Ok(visitor.term_dict_fields_to_warm_up) + Ok(()) } /// Converts a `prefix` term into the equivalent term range. @@ -440,7 +454,7 @@ mod test { use tantivy::schema::{DateOptions, DateTimePrecision, FAST, INDEXED, STORED, Schema, TEXT}; use super::{ExtractPrefixTermRanges, build_query}; - use crate::{DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; + use crate::{Automaton, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; enum TestExpectation<'a> { Err(&'a str), @@ -884,26 +898,96 @@ mod test { #[test] fn test_build_query_warmup_info() { - let query_with_set = query_ast_from_user_text("desc: IN [hello]", None) + let query_with_set = query_ast_from_user_text("desc: IN [alpha beta gamma delta]", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_small_set = query_ast_from_user_text("desc: IN [beta]", None) .parse_user_query(&[]) .unwrap(); - let query_without_set = query_ast_from_user_text("desc:hello", None) + let query_with_many_terms = + query_ast_from_user_text("desc:(hello OR world OR extra OR big)", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_single_term = query_ast_from_user_text("desc:hello", None) .parse_user_query(&[]) .unwrap(); let schema = make_schema(true); let context = BuildTantivyAstContext::for_test(&schema); - let (_, warmup_info) = build_query(query_with_set, &context, None).unwrap(); - assert_eq!(warmup_info.term_dict_fields.len(), 1); + for query in [query_with_many_terms, query_with_set] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + assert!(warmup_info.terms_grouped_by_field.is_empty()); + assert_eq!(warmup_info.automatons_grouped_by_field.len(), 1); + let automatons = warmup_info + .automatons_grouped_by_field + .values() + .next() + .unwrap(); + assert_eq!(automatons.len(), 1); + assert!(matches!( + automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + } + + for query in [query_with_small_set, query_with_single_term] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + assert!(warmup_info.automatons_grouped_by_field.is_empty()); + } + } + + #[test] + fn test_build_query_warmup_info_term_set_with_other_queries() { + // Verify that: + // - fields with >= 3 non-positional terms are coalesced into an automaton + // - positional terms on the same field remain in terms_grouped_by_field + // - fields with fewer than 3 non-positional terms are unaffected + let query_ast = query_ast_from_user_text( + r#"desc: IN [alpha beta gamma] AND desc:"world extra" AND title:baz"#, + None, + ) + .parse_user_query(&[]) + .unwrap(); + + let schema = make_schema(false); + let context = BuildTantivyAstContext::for_test(&schema); + let (_, warmup_info) = build_query(query_ast, &context, None).unwrap(); + + let desc_field = schema.get_field("desc").unwrap(); + let title_field = schema.get_field("title").unwrap(); + + // desc: 3 non-positional terms (alpha, beta, gamma) are coalesced into an automaton + let desc_automatons = warmup_info + .automatons_grouped_by_field + .get(&desc_field) + .expect("desc should have an automaton"); + assert_eq!(desc_automatons.len(), 1); + assert!(matches!( + desc_automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + + // desc: phrase terms "world" and "extra" stay as positional terms + let desc_terms = warmup_info + .terms_grouped_by_field + .get(&desc_field) + .expect("desc positional terms should still be present"); + assert_eq!(desc_terms.len(), 2); + assert!(desc_terms.values().all(|&need_pos| need_pos)); + + // title: only 1 non-positional term (below threshold), stays in terms_grouped_by_field assert!( - warmup_info - .term_dict_fields - .contains(&tantivy::schema::Field::from_field_id(2)) + !warmup_info + .automatons_grouped_by_field + .contains_key(&title_field) ); - - let (_, warmup_info) = build_query(query_without_set, &context, None).unwrap(); - assert!(warmup_info.term_dict_fields.is_empty()); + let title_terms = warmup_info + .terms_grouped_by_field + .get(&title_field) + .expect("title terms should be present"); + assert_eq!(title_terms.len(), 1); + assert!(title_terms.values().all(|&need_pos| !need_pos)); } #[test] diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index ba206889841..3f9bb480bfe 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -272,16 +272,10 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any let warm_up_term_ranges_future = warm_up_term_ranges(searcher, &warmup_info.term_ranges_grouped_by_field) .instrument(debug_span!("warm_up_term_ranges")); - let warm_up_term_dict_future = - warm_up_term_dict_fields(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_term_dicts")); let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_fields) .instrument(debug_span!("warm_up_fastfields")); let warm_up_fieldnorms_future = warm_up_fieldnorms(searcher, warmup_info.field_norms) .instrument(debug_span!("warm_up_fieldnorms")); - // TODO merge warm_up_postings into warm_up_term_dict_fields - let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_postings")); let warm_up_automatons_future = warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field) .instrument(debug_span!("warm_up_automatons")); @@ -290,45 +284,13 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any warm_up_terms_future, warm_up_term_ranges_future, warm_up_fastfields_future, - warm_up_term_dict_future, warm_up_fieldnorms_future, - warm_up_postings_future, warm_up_automatons_future, )?; Ok(()) } -async fn warm_up_term_dict_fields( - searcher: &Searcher, - term_dict_fields: &HashSet, -) -> anyhow::Result<()> { - let mut warm_up_futures = Vec::new(); - for field in term_dict_fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { - let dict = inverted_index.terms(); - dict.warm_up_dictionary().await - }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - -async fn warm_up_postings(searcher: &Searcher, fields: &HashSet) -> anyhow::Result<()> { - let mut warm_up_futures = Vec::new(); - for field in fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { inverted_index.warm_postings_full(false).await }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - async fn warm_up_fastfield( fast_field_reader: &FastFieldReaders, fast_field: &FastFieldWarmupInfo, @@ -443,6 +405,10 @@ async fn warm_up_automatons( .await .context("failed to load automaton") } + Automaton::TermSet(automaton) => inv_idx_clone + .warm_postings_automaton(automaton.clone(), cpu_intensive_executor) + .await + .context("failed to warm term set"), } }); } diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 9bbb5f4052c..1473bd017a1 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -1336,12 +1336,11 @@ pub async fn search_plan( } else { 0 }; - let sstable_query_count = warmup_info.term_dict_fields.len() - + warmup_info - .terms_grouped_by_field - .values() - .map(|terms: &HashMap| terms.len()) - .sum::() + let sstable_query_count = warmup_info + .terms_grouped_by_field + .values() + .map(|terms: &HashMap| terms.len()) + .sum::() + warmup_info .term_ranges_grouped_by_field .values() diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index c8d851d06cb..46587c25a55 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -178,7 +178,8 @@ async fn test_single_search_with_snippet() -> anyhow::Result<()> { Ok(()) } -async fn slop_search_and_check( +/// Search with "body" as default field and assert expected number of matches. +async fn search_and_check( test_sandbox: &TestSandbox, index_id: &str, query: &str, @@ -233,33 +234,98 @@ async fn test_slop_queries() { ]; test_sandbox.add_documents(docs.clone()).await.unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) + search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) + search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) + search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) + search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) .await .unwrap(); test_sandbox.assert_quit().await; } +#[tokio::test] +async fn test_multi_term_queries() { + let index_id = "multi-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + record: position + "#; + + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]) + .await + .unwrap(); + let docs = vec![ + json!({"title": "one", "body": "a red bike"}), + json!({"title": "two", "body": "a small blue bike"}), + json!({"title": "three", "body": "a small, rusty, and yellow bike"}), + json!({"title": "four", "body": "fred's small bike"}), + json!({"title": "five", "body": "a tiny shelter"}), + ]; + test_sandbox.add_documents(docs.clone()).await.unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "IN [red blue green yellow pink black]", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [aaaa]", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [red]", 1) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [zzzz]", 0) + .await + .unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "red OR blue OR green OR yellow OR pink OR black", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "red AND \"small bike\"", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "bike AND \"small bike\"", 1) + .await + .unwrap(); + + test_sandbox.assert_quit().await; +} + #[tokio::test] async fn test_single_node_several_splits() -> anyhow::Result<()> { let index_id = "single-node-several-splits";