From 965a0847a49c3767079806290ea39d43c2bc7510 Mon Sep 17 00:00:00 2001
From: Remi Dettai <remi.dettai@sekoia.io>
Date: Thu, 30 Apr 2026 08:45:01 +0200
Subject: [PATCH] Use automaton for multi term queries

---
 quickwit/Cargo.lock                           |   1 +
 quickwit/quickwit-doc-mapper/Cargo.toml       |   1 +
 .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 104 ++++++----
 quickwit/quickwit-doc-mapper/src/lib.rs       |   6 +-
 .../quickwit-doc-mapper/src/query_builder.rs  | 184 +++++++++++++-----
 quickwit/quickwit-search/src/leaf.rs          |  42 +---
 quickwit/quickwit-search/src/root.rs          |  11 +-
 quickwit/quickwit-search/src/tests.rs         |  84 +++++++-
 8 files changed, 293 insertions(+), 140 deletions(-)
diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 41c8baf8139..e35c7d87de5 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -8532,6 +8532,7 @@ dependencies = [
  "serde_yaml",
  "siphasher",
  "tantivy",
+ "tantivy-fst",
  "thiserror 2.0.18",
  "time",
  "tracing",
diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml
index 3919e3d1405..2cb2be73b9f 100644
--- a/quickwit/quickwit-doc-mapper/Cargo.toml
+++ b/quickwit/quickwit-doc-mapper/Cargo.toml
@@ -24,6 +24,7 @@ serde_json = { workspace = true }
 serde_json_borrow = { workspace = true }
 siphasher = { workspace = true }
 tantivy = { workspace = true }
+tantivy-fst = { workspace = true }
 thiserror = { workspace = true }
 tracing = { workspace = true }
 utoipa = { workspace = true }
diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
index 749dde228a7..370674c9536 100644
--- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
+++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
@@ -26,6 +26,7 @@ use std::collections::{HashMap, HashSet};
 use std::fmt::Debug;
 use std::ops::Bound;
 
+use anyhow::bail;
 pub use doc_mapper_builder::DocMapperBuilder;
 pub use doc_mapper_impl::DocMapper;
 pub use field_mapping_entry::{
@@ -41,6 +42,7 @@ pub use field_mapping_type::FieldMappingType;
 use serde_json::Value as JsonValue;
 use tantivy::Term;
 use tantivy::schema::{Field, FieldType};
+use tantivy_fst::Automaton as TantivyFstAutomaton;
 pub(crate) use tokenizer_entry::{
     NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType,
 };
@@ -76,10 +78,70 @@ pub struct TermRange {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 /// Supported automaton types to warmup
 pub enum Automaton {
-    /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if
+    /// A regex in its str representation as tantivy_fst::Regex isn't PartialEq, and the path if
     /// inside a json field
     Regex(Option<Vec<u8>>, String),
-    // we could add termset query here, instead of downloading the whole dictionary
+    /// An automaton matching exactly a set of terms (term set or many single-term queries).
+    TermSet(ExactSetAutomaton),
+}
+
+/// A byte-level automaton that accepts exactly the byte strings of a sorted, deduplicated
+/// set. A state is `(depth, lo, hi)`: `self.terms[lo..hi]` are the terms whose first `depth`
+/// bytes equal the input consumed so far. Transitions are computed by binary search over
+/// that range, avoiding any upfront DFA materialisation.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ExactSetAutomaton {
+    /// Sorted, deduplicated `term.serialized_value_bytes()` of every term in the set.
+    /// Passing this automaton to `warm_postings_automaton` coalesces both the SSTable lookup
+    /// and the postings downloads into a small number of merged range requests.
+    terms: Vec<Vec<u8>>,
+}
+
+impl ExactSetAutomaton {
+    /// Builds the automaton from the serialized value bytes of `terms`, which are sorted and
+    /// deduplicated. Returns an error if `terms` yields no element.
+    pub fn try_from_terms<'a>(terms: impl IntoIterator<Item = &'a Term>) -> anyhow::Result<Self> {
+        let mut terms: Vec<Vec<u8>> = terms
+            .into_iter()
+            .map(|term| term.serialized_value_bytes().to_vec())
+            .collect();
+        if terms.is_empty() {
+            bail!("Cannot create an ExactSetAutomaton from an empty set of terms");
+        }
+        // Equal byte strings are interchangeable, so stability buys nothing: sort unstably.
+        terms.sort_unstable();
+        terms.dedup();
+        Ok(ExactSetAutomaton { terms })
+    }
+}
+
+impl TantivyFstAutomaton for ExactSetAutomaton {
+    /// `(depth, lo, hi)`: bytes consumed so far and the candidate range `terms[lo..hi]`.
+    type State = (usize, usize, usize);
+    /// Start state: nothing consumed yet, so every term is a candidate.
+    fn start(&self) -> Self::State {
+        (0, 0, self.terms.len())
+    }
+    /// True when the first candidate is exactly `depth` bytes long, i.e. equals the input.
+    fn is_match(&self, &(depth, lo, hi): &Self::State) -> bool {
+        lo < hi && self.terms[lo].len() == depth
+    }
+    /// A match is still reachable as long as the candidate range is non-empty.
+    fn can_match(&self, &(_, lo, hi): &Self::State) -> bool {
+        lo < hi
+    }
+    /// Consumes `byte`: keeps the candidates whose byte at position `depth` equals `byte`.
+    fn accept(&self, &(depth, lo, hi): &Self::State, byte: u8) -> Self::State {
+        // Within [lo, hi), terms are sorted and unique. A term of length == depth (an exact
+        // match) sorts before any of its extensions, so at most one exists and it sits at lo.
+        // Skip it — it has no byte at position `depth`.
+        let lo = lo + usize::from(lo < hi && self.terms[lo].len() == depth);
+        // Binary-search for the sub-range where terms[i][depth] == byte.
+        // All remaining terms in [lo, hi) have length > depth, so indexing [depth] is safe.
+        let new_lo = lo + self.terms[lo..hi].partition_point(|t| t[depth] < byte);
+        let new_hi = new_lo + self.terms[new_lo..hi].partition_point(|t| t[depth] <= byte);
+        (depth + 1, new_lo, new_hi)
+    }
+}
 
 /// Description of how a fast field should be warmed up
@@ -95,9 +157,6 @@ pub struct FastFieldWarmupInfo {
 /// running the query.
 #[derive(Debug, Default, Clone, PartialEq, Eq)]
 pub struct WarmupInfo {
-    /// Name of fields from the term dictionary and posting list which needs to
-    /// be entirely loaded
-    pub term_dict_fields: HashSet<Field>,
     /// Fast fields which needs to be loaded
     pub fast_fields: HashSet<FastFieldWarmupInfo>,
     /// Whether to warmup field norms. Used mostly for scoring.
@@ -113,7 +172,6 @@ pub struct WarmupInfo {
 impl WarmupInfo {
     /// Merge other WarmupInfo into self.
     pub fn merge(&mut self, other: WarmupInfo) {
-        self.term_dict_fields.extend(other.term_dict_fields);
         self.field_norms |= other.field_norms;
 
         for fast_field_warmup_info in other.fast_fields.into_iter() {
@@ -151,21 +209,6 @@ impl WarmupInfo {
 
     /// Simplify a WarmupInfo, removing some redundant tasks
     pub fn simplify(&mut self) {
-        self.terms_grouped_by_field.retain(|field, terms| {
-            if self.term_dict_fields.contains(field) {
-                // we are already about to full-load this dictionary. We only care about terms
-                // which needs additional position
-                terms.retain(|_term, include_position| *include_position);
-            }
-            // if no term is left, remove the entry from the hashmap
-            !terms.is_empty()
-        });
-        self.term_ranges_grouped_by_field.retain(|field, terms| {
-            if self.term_dict_fields.contains(field) {
-                terms.retain(|_term, include_position| *include_position);
-            }
-            !terms.is_empty()
-        });
         // TODO we could remove from terms_grouped_by_field for ranges with no `limit` in
         // term_ranges_grouped_by_field
     }
@@ -622,13 +665,6 @@ mod tests {
             .collect()
     }
 
-    fn hashset_field(elements: &[u32]) -> HashSet<Field> {
-        elements
-            .iter()
-            .map(|elem| Field::from_field_id(*elem))
-            .collect()
-    }
-
     fn hashmap(elements: &[(u32, &str, bool)]) -> HashMap<Field, HashMap<Term, bool>> {
         let mut result: HashMap<Field, HashMap<Term, bool>> = HashMap::new();
         for (field, term, pos) in elements {
@@ -663,7 +699,6 @@ mod tests {
     #[test]
     fn test_warmup_info_merge() {
         let wi_base = WarmupInfo {
-            term_dict_fields: hashset_field(&[1, 2]),
             fast_fields: hashset_fast(&["fast1", "fast2"]),
             field_norms: false,
             terms_grouped_by_field: hashmap(&[(1, "term1", false), (1, "term2", false)]),
@@ -686,7 +721,6 @@ mod tests {
 
         let mut wi_base = wi_base;
         let wi_2 = WarmupInfo {
-            term_dict_fields: hashset_field(&[2, 3]),
             fast_fields: hashset_fast(&["fast2", "fast3"]),
             field_norms: true,
             terms_grouped_by_field: hashmap(&[(2, "term1", false), (1, "term2", true)]),
@@ -703,7 +737,6 @@ mod tests {
         };
         wi_base.merge(wi_2.clone());
 
-        assert_eq!(wi_base.term_dict_fields, hashset_field(&[1, 2, 3]));
         assert_eq!(
             wi_base.fast_fields,
             hashset_fast(&["fast1", "fast2", "fast3"])
@@ -769,7 +802,6 @@ mod tests {
     #[test]
     fn test_warmup_info_simplify() {
         let mut warmup_info = WarmupInfo {
-            term_dict_fields: hashset_field(&[1]),
             fast_fields: hashset_fast(&["fast1", "fast2"]),
             field_norms: false,
             terms_grouped_by_field: hashmap(&[
@@ -791,11 +823,15 @@ mod tests {
             .collect(),
         };
         let expected = WarmupInfo {
-            term_dict_fields: hashset_field(&[1]),
             fast_fields: hashset_fast(&["fast1", "fast2"]),
             field_norms: false,
-            terms_grouped_by_field: hashmap(&[(1, "term2", true), (2, "term3", false)]),
+            terms_grouped_by_field: hashmap(&[
+                (1, "term1", false),
+                (1, "term2", true),
+                (2, "term3", false),
+            ]),
             term_ranges_grouped_by_field: hashmap_ranges(&[
+                (1, "term1", false),
                 (1, "term2", true),
                 (2, "term3", false),
             ]),
diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs
index 8dee8d700ed..1b5a67908e3 100644
--- a/quickwit/quickwit-doc-mapper/src/lib.rs
+++ b/quickwit/quickwit-doc-mapper/src/lib.rs
@@ -30,9 +30,9 @@ mod routing_expression;
 pub mod tag_pruning;
 
 pub use doc_mapper::{
-    Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo, FieldMappingEntry,
-    FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange,
-    TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text,
+    Automaton, BinaryFormat, DocMapper, DocMapperBuilder, ExactSetAutomaton, FastFieldWarmupInfo,
+    FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions,
+    QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text,
 };
 use doc_mapper::{
     FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema,
diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
index 78f41b654bf..04b360987f3 100644
--- a/quickwit/quickwit-doc-mapper/src/query_builder.rs
+++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -30,7 +30,7 @@ use tantivy::schema::{Field, Schema};
 use tracing::error;
 
 use crate::doc_mapper::FastFieldWarmupInfo;
-use crate::{Automaton, QueryParserError, TermRange, WarmupInfo};
+use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo};
 
 #[derive(Default)]
 struct RangeQueryFields {
@@ -198,8 +198,7 @@ pub(crate) fn build_query(
 
     let query = query_ast.build_tantivy_query(context)?;
 
-    let term_set_query_fields = extract_term_set_query_fields(&query_ast, context.schema)?;
-    let (term_ranges_grouped_by_field, automatons_grouped_by_field) =
+    let (term_ranges_grouped_by_field, mut automatons_grouped_by_field) =
         extract_prefix_term_ranges_and_automaton(
             &query_ast,
             context.schema,
@@ -219,8 +218,13 @@ pub(crate) fn build_query(
             .or_default() |= need_position;
     });
 
+    coalesce_multi_term_fields_into_automatons(
+        &mut terms_grouped_by_field,
+        &mut automatons_grouped_by_field,
+        2,
+    )?;
+
     let warmup_info = WarmupInfo {
-        term_dict_fields: term_set_query_fields,
         terms_grouped_by_field,
         term_ranges_grouped_by_field,
         fast_fields,
@@ -231,44 +235,54 @@ pub(crate) fn build_query(
     Ok((query, warmup_info))
 }
 
-struct ExtractTermSetFields<'a> {
-    term_dict_fields_to_warm_up: HashSet<Field>,
-    schema: &'a Schema,
-}
-
-impl<'a> ExtractTermSetFields<'a> {
-    fn new(schema: &'a Schema) -> Self {
-        ExtractTermSetFields {
-            term_dict_fields_to_warm_up: HashSet::new(),
-            schema,
+/// For any field with more than `term_threshold` non-positional terms, moves
+/// those terms into an `Automaton::TermSet` and removes them from
+/// `terms_grouped_by_field`.
+///
+/// This enables `warm_postings_automaton` to coalesce both the SSTable block
+/// fetches and the postings downloads into a small number of merged range
+/// requests, instead of N individual per-term requests.
+///
+/// Strictly more than `term_threshold` terms are required because
+/// `warm_postings_automaton` has higher per-call overhead than a direct point
+/// lookup: spawning a CPU task and traversing the sstable twice. That overhead
+/// is only worth paying when there are enough terms to coalesce.
+///
+/// Terms that require positions are left in `terms_grouped_by_field` unchanged,
+/// as they must be fetched individually.
+///
+/// TODO: should positional terms also support some form of grouping?
+fn coalesce_multi_term_fields_into_automatons(
+    terms_grouped_by_field: &mut HashMap<Field, HashMap<Term, bool>>,
+    automatons_grouped_by_field: &mut HashMap<Field, HashSet<Automaton>>,
+    term_threshold: usize,
+) -> anyhow::Result<()> {
+    let fields: Vec<Field> = terms_grouped_by_field.keys().copied().collect();
+    for field in fields {
+        let no_pos_terms: Vec<&Term> = terms_grouped_by_field
+            .get(&field)
+            .unwrap()
+            .iter()
+            .filter(|(_, need_pos)| !**need_pos)
+            .map(|(term, _)| term)
+            .collect();
+        if no_pos_terms.len() <= term_threshold {
+            continue;
         }
-    }
-}
-
-impl<'a> QueryAstVisitor<'a> for ExtractTermSetFields<'_> {
-    type Err = anyhow::Error;
-
-    fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> anyhow::Result<()> {
-        for field in term_set_query.terms_per_field.keys() {
-            if let Some((field, _field_entry, _path)) =
-                find_field_or_hit_dynamic(field, self.schema)
-            {
-                self.term_dict_fields_to_warm_up.insert(field);
-            } else {
-                anyhow::bail!("field does not exist: {}", field);
-            }
+        let automaton = ExactSetAutomaton::try_from_terms(no_pos_terms)?;
+        automatons_grouped_by_field
+            .entry(field)
+            .or_default()
+            .insert(Automaton::TermSet(automaton));
+        // Remove the no-position terms: the automaton covers their SSTable lookup + postings.
+        // Terms still needing positions are kept for warm_up_terms.
+        let field_terms = terms_grouped_by_field.get_mut(&field).unwrap();
+        field_terms.retain(|_, need_pos| *need_pos);
+        if field_terms.is_empty() {
+            terms_grouped_by_field.remove(&field);
         }
-        Ok(())
     }
-}
-
-fn extract_term_set_query_fields(
-    query_ast: &QueryAst,
-    schema: &Schema,
-) -> anyhow::Result<HashSet<Field>> {
-    let mut visitor = ExtractTermSetFields::new(schema);
-    visitor.visit(query_ast)?;
-    Ok(visitor.term_dict_fields_to_warm_up)
+    Ok(())
 }
 
 /// Converts a `prefix` term into the equivalent term range.
@@ -440,7 +454,7 @@ mod test {
     use tantivy::schema::{DateOptions, DateTimePrecision, FAST, INDEXED, STORED, Schema, TEXT};
 
     use super::{ExtractPrefixTermRanges, build_query};
-    use crate::{DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange};
+    use crate::{Automaton, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange};
 
     enum TestExpectation<'a> {
         Err(&'a str),
@@ -884,26 +898,96 @@ mod test {
 
     #[test]
     fn test_build_query_warmup_info() {
-        let query_with_set = query_ast_from_user_text("desc: IN [hello]", None)
+        let query_with_set = query_ast_from_user_text("desc: IN [alpha beta gamma delta]", None)
+            .parse_user_query(&[])
+            .unwrap();
+        let query_with_small_set = query_ast_from_user_text("desc: IN [beta]", None)
             .parse_user_query(&[])
             .unwrap();
-        let query_without_set = query_ast_from_user_text("desc:hello", None)
+        let query_with_many_terms =
+            query_ast_from_user_text("desc:(hello OR world OR extra OR big)", None)
+                .parse_user_query(&[])
+                .unwrap();
+        let query_with_single_term = query_ast_from_user_text("desc:hello", None)
             .parse_user_query(&[])
             .unwrap();
 
         let schema = make_schema(true);
         let context = BuildTantivyAstContext::for_test(&schema);
 
-        let (_, warmup_info) = build_query(query_with_set, &context, None).unwrap();
-        assert_eq!(warmup_info.term_dict_fields.len(), 1);
+        for query in [query_with_many_terms, query_with_set] {
+            let (_, warmup_info) = build_query(query, &context, None).unwrap();
+            assert!(warmup_info.terms_grouped_by_field.is_empty());
+            assert_eq!(warmup_info.automatons_grouped_by_field.len(), 1);
+            let automatons = warmup_info
+                .automatons_grouped_by_field
+                .values()
+                .next()
+                .unwrap();
+            assert_eq!(automatons.len(), 1);
+            assert!(matches!(
+                automatons.iter().next().unwrap(),
+                Automaton::TermSet(_)
+            ));
+        }
+
+        for query in [query_with_small_set, query_with_single_term] {
+            let (_, warmup_info) = build_query(query, &context, None).unwrap();
+            assert!(warmup_info.automatons_grouped_by_field.is_empty());
+        }
+    }
+
+    #[test]
+    fn test_build_query_warmup_info_term_set_with_other_queries() {
+        // Verify that:
+        // - fields with >= 3 non-positional terms are coalesced into an automaton
+        // - positional terms on the same field remain in terms_grouped_by_field
+        // - fields with fewer than 3 non-positional terms are unaffected
+        let query_ast = query_ast_from_user_text(
+            r#"desc: IN [alpha beta gamma] AND desc:"world extra" AND title:baz"#,
+            None,
+        )
+        .parse_user_query(&[])
+        .unwrap();
+
+        let schema = make_schema(false);
+        let context = BuildTantivyAstContext::for_test(&schema);
+        let (_, warmup_info) = build_query(query_ast, &context, None).unwrap();
+
+        let desc_field = schema.get_field("desc").unwrap();
+        let title_field = schema.get_field("title").unwrap();
+
+        // desc: 3 non-positional terms (alpha, beta, gamma) are coalesced into an automaton
+        let desc_automatons = warmup_info
+            .automatons_grouped_by_field
+            .get(&desc_field)
+            .expect("desc should have an automaton");
+        assert_eq!(desc_automatons.len(), 1);
+        assert!(matches!(
+            desc_automatons.iter().next().unwrap(),
+            Automaton::TermSet(_)
+        ));
+
+        // desc: phrase terms "world" and "extra" stay as positional terms
+        let desc_terms = warmup_info
+            .terms_grouped_by_field
+            .get(&desc_field)
+            .expect("desc positional terms should still be present");
+        assert_eq!(desc_terms.len(), 2);
+        assert!(desc_terms.values().all(|&need_pos| need_pos));
+
+        // title: only 1 non-positional term (below threshold), stays in terms_grouped_by_field
         assert!(
-            warmup_info
-                .term_dict_fields
-                .contains(&tantivy::schema::Field::from_field_id(2))
+            !warmup_info
+                .automatons_grouped_by_field
+                .contains_key(&title_field)
         );
-
-        let (_, warmup_info) = build_query(query_without_set, &context, None).unwrap();
-        assert!(warmup_info.term_dict_fields.is_empty());
+        let title_terms = warmup_info
+            .terms_grouped_by_field
+            .get(&title_field)
+            .expect("title terms should be present");
+        assert_eq!(title_terms.len(), 1);
+        assert!(title_terms.values().all(|&need_pos| !need_pos));
     }
 
     #[test]
diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs
index ba206889841..3f9bb480bfe 100644
--- a/quickwit/quickwit-search/src/leaf.rs
+++ b/quickwit/quickwit-search/src/leaf.rs
@@ -272,16 +272,10 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any
     let warm_up_term_ranges_future =
         warm_up_term_ranges(searcher, &warmup_info.term_ranges_grouped_by_field)
             .instrument(debug_span!("warm_up_term_ranges"));
-    let warm_up_term_dict_future =
-        warm_up_term_dict_fields(searcher, &warmup_info.term_dict_fields)
-            .instrument(debug_span!("warm_up_term_dicts"));
     let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_fields)
         .instrument(debug_span!("warm_up_fastfields"));
     let warm_up_fieldnorms_future = warm_up_fieldnorms(searcher, warmup_info.field_norms)
         .instrument(debug_span!("warm_up_fieldnorms"));
-    // TODO merge warm_up_postings into warm_up_term_dict_fields
-    let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields)
-        .instrument(debug_span!("warm_up_postings"));
     let warm_up_automatons_future =
         warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field)
             .instrument(debug_span!("warm_up_automatons"));
@@ -290,45 +284,13 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any
         warm_up_terms_future,
         warm_up_term_ranges_future,
         warm_up_fastfields_future,
-        warm_up_term_dict_future,
         warm_up_fieldnorms_future,
-        warm_up_postings_future,
         warm_up_automatons_future,
     )?;
 
     Ok(())
 }
 
-async fn warm_up_term_dict_fields(
-    searcher: &Searcher,
-    term_dict_fields: &HashSet<Field>,
-) -> anyhow::Result<()> {
-    let mut warm_up_futures = Vec::new();
-    for field in term_dict_fields {
-        for segment_reader in searcher.segment_readers() {
-            let inverted_index = segment_reader.inverted_index(*field)?.clone();
-            warm_up_futures.push(async move {
-                let dict = inverted_index.terms();
-                dict.warm_up_dictionary().await
-            });
-        }
-    }
-    try_join_all(warm_up_futures).await?;
-    Ok(())
-}
-
-async fn warm_up_postings(searcher: &Searcher, fields: &HashSet<Field>) -> anyhow::Result<()> {
-    let mut warm_up_futures = Vec::new();
-    for field in fields {
-        for segment_reader in searcher.segment_readers() {
-            let inverted_index = segment_reader.inverted_index(*field)?.clone();
-            warm_up_futures.push(async move { inverted_index.warm_postings_full(false).await });
-        }
-    }
-    try_join_all(warm_up_futures).await?;
-    Ok(())
-}
-
 async fn warm_up_fastfield(
     fast_field_reader: &FastFieldReaders,
     fast_field: &FastFieldWarmupInfo,
@@ -443,6 +405,10 @@ async fn warm_up_automatons(
                                 .await
                                 .context("failed to load automaton")
                         }
+                        Automaton::TermSet(automaton) => inv_idx_clone
+                            .warm_postings_automaton(automaton.clone(), cpu_intensive_executor)
+                            .await
+                            .context("failed to warm term set"),
                     }
                 });
             }
diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs
index 9bbb5f4052c..1473bd017a1 100644
--- a/quickwit/quickwit-search/src/root.rs
+++ b/quickwit/quickwit-search/src/root.rs
@@ -1336,12 +1336,11 @@ pub async fn search_plan(
     } else {
         0
     };
-    let sstable_query_count = warmup_info.term_dict_fields.len()
-        + warmup_info
-            .terms_grouped_by_field
-            .values()
-            .map(|terms: &HashMap<tantivy::Term, bool>| terms.len())
-            .sum::<usize>()
+    let sstable_query_count = warmup_info
+        .terms_grouped_by_field
+        .values()
+        .map(|terms: &HashMap<tantivy::Term, bool>| terms.len())
+        .sum::<usize>()
         + warmup_info
             .term_ranges_grouped_by_field
             .values()
diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs
index c8d851d06cb..46587c25a55 100644
--- a/quickwit/quickwit-search/src/tests.rs
+++ b/quickwit/quickwit-search/src/tests.rs
@@ -178,7 +178,8 @@ async fn test_single_search_with_snippet() -> anyhow::Result<()> {
     Ok(())
 }
 
-async fn slop_search_and_check(
+/// Search with "body" as default field and assert expected number of matches.
+async fn search_and_check(
     test_sandbox: &TestSandbox,
     index_id: &str,
     query: &str,
@@ -233,33 +234,98 @@ async fn test_slop_queries() {
     ];
     test_sandbox.add_documents(docs.clone()).await.unwrap();
 
-    slop_search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0)
+    search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1)
+    search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1)
+    search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"small bike\"", 1)
+    search_and_check(&test_sandbox, index_id, "\"small bike\"", 1)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2)
+    search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2)
+    search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3)
+    search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3)
         .await
         .unwrap();
-    slop_search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1)
+    search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1)
         .await
         .unwrap();
     test_sandbox.assert_quit().await;
 }
 
+#[tokio::test]
+async fn test_multi_term_queries() {
+    let index_id = "multi-term-query";
+    let doc_mapping_yaml = r#"
+            field_mappings:
+              - name: title
+                type: text
+              - name: body
+                type: text
+                record: position
+        "#;
+    // Checks hit counts of term sets, disjunctions and term/phrase conjunctions on "body".
+    let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"])
+        .await
+        .unwrap();
+    let docs = vec![
+        json!({"title": "one", "body": "a red bike"}),
+        json!({"title": "two", "body": "a small blue bike"}),
+        json!({"title": "three", "body": "a small, rusty, and yellow bike"}),
+        json!({"title": "four", "body": "fred's small bike"}),
+        json!({"title": "five", "body": "a tiny shelter"}),
+    ];
+    test_sandbox.add_documents(docs.clone()).await.unwrap();
+    // Six-term set: red, blue and yellow each occur in one document; the others in none.
+    search_and_check(
+        &test_sandbox,
+        index_id,
+        "IN [red blue green yellow pink black]",
+        3,
+    )
+    .await
+    .unwrap();
+    // Single-term set whose term is absent from every document.
+    search_and_check(&test_sandbox, index_id, "IN [aaaa]", 0)
+        .await
+        .unwrap();
+    // Single-term set matching only "a red bike".
+    search_and_check(&test_sandbox, index_id, "IN [red]", 1)
+        .await
+        .unwrap();
+    // Another single-term set whose term is absent from every document.
+    search_and_check(&test_sandbox, index_id, "IN [zzzz]", 0)
+        .await
+        .unwrap();
+    // Same six terms expressed as a disjunction of plain term queries.
+    search_and_check(
+        &test_sandbox,
+        index_id,
+        "red OR blue OR green OR yellow OR pink OR black",
+        3,
+    )
+    .await
+    .unwrap();
+    // No document contains both "red" and the exact phrase "small bike".
+    search_and_check(&test_sandbox, index_id, "red AND \"small bike\"", 0)
+        .await
+        .unwrap();
+    // Only "fred's small bike" contains the exact phrase "small bike".
+    search_and_check(&test_sandbox, index_id, "bike AND \"small bike\"", 1)
+        .await
+        .unwrap();
+
+    test_sandbox.assert_quit().await;
+}
+
 #[tokio::test]
 async fn test_single_node_several_splits() -> anyhow::Result<()> {
     let index_id = "single-node-several-splits";