From cb13f93636b96b5bbbdfbf33fbc20e96fda30a39 Mon Sep 17 00:00:00 2001
From: Zhang Xiaofeng <zhangxffff@gmail.com>
Date: Fri, 20 Feb 2026 04:18:54 +0000
Subject: [PATCH 1/2] bench: Add dynamic IN list benchmarks for non-constant
 list expressions

---
 datafusion/physical-expr/benches/in_list.rs | 145 ++++++++++++++++++++
 1 file changed, 145 insertions(+)
diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs
index 954715d0e5a9c..405088f88fbb2 100644
--- a/datafusion/physical-expr/benches/in_list.rs
+++ b/datafusion/physical-expr/benches/in_list.rs
@@ -23,6 +23,7 @@ use arrow::datatypes::{Field, Schema};
 use arrow::record_batch::RecordBatch;
 use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_expr::expressions::{col, in_list, lit};
 use rand::distr::Alphanumeric;
 use rand::prelude::*;
@@ -50,7 +51,9 @@ fn random_string(rng: &mut StdRng, len: usize) -> String {
 }
 
 const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100];
+const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28];
 const NULL_PERCENTS: [f64; 2] = [0., 0.2];
+const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0];
 const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
 const ARRAY_LENGTH: usize = 8192;
 
@@ -219,6 +222,144 @@ fn bench_realistic_mixed_strings<A>(
     }
 }
 
+/// Benchmarks the dynamic evaluation path (no static filter) by including
+/// a column reference in the IN list, which prevents static filter creation.
+fn do_bench_dynamic(
+    c: &mut Criterion,
+    name: &str,
+    values: ArrayRef,
+    list_cols: &[ArrayRef],
+) {
+    let mut fields = vec![Field::new("a", values.data_type().clone(), true)];
+    let mut columns: Vec<ArrayRef> = vec![values];
+
+    // Build list expressions: mix of column refs (forces dynamic path)
+    let schema_fields: Vec<Field> = list_cols
+        .iter()
+        .enumerate()
+        .map(|(i, col_arr)| {
+            let name = format!("b{i}");
+            fields.push(Field::new(&name, col_arr.data_type().clone(), true));
+            columns.push(Arc::clone(col_arr));
+            Field::new(&name, col_arr.data_type().clone(), true)
+        })
+        .collect();
+
+    let schema = Schema::new(fields);
+    let list_exprs: Vec<Arc<dyn PhysicalExpr>> = schema_fields
+        .iter()
+        .map(|f| col(f.name(), &schema).unwrap())
+        .collect();
+
+    let expr = in_list(col("a", &schema).unwrap(), list_exprs, &false, &schema).unwrap();
+    let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap();
+
+    c.bench_function(name, |b| {
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+}
+
+/// Benchmarks the dynamic IN list path for Int32 arrays with column references.
+fn bench_dynamic_int32(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(42);
+
+    for list_size in DYNAMIC_LIST_LENGTHS {
+        for match_percent in MATCH_PERCENTS {
+            for null_percent in NULL_PERCENTS {
+                // Generate the "needle" column
+                let values: Int32Array = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        rng.random_bool(1.0 - null_percent)
+                            .then(|| rng.random_range(0..1000))
+                    })
+                    .collect();
+
+                // Generate list columns with controlled match rate
+                let list_cols: Vec<ArrayRef> = (0..list_size)
+                    .map(|_| {
+                        let col: Int32Array = (0..ARRAY_LENGTH)
+                            .map(|row| {
+                                if rng.random_bool(1.0 - null_percent) {
+                                    if rng.random_bool(match_percent) {
+                                        // Copy from values to create a match
+                                        if values.is_null(row) {
+                                            Some(rng.random_range(0..1000))
+                                        } else {
+                                            Some(values.value(row))
+                                        }
+                                    } else {
+                                        // Random value (unlikely to match)
+                                        Some(rng.random_range(1000..2000))
+                                    }
+                                } else {
+                                    None
+                                }
+                            })
+                            .collect();
+                        Arc::new(col) as ArrayRef
+                    })
+                    .collect();
+
+                do_bench_dynamic(
+                    c,
+                    &format!(
+                        "in_list_dynamic/Int32/list={}/match={}%/nulls={}%",
+                        list_size,
+                        (match_percent * 100.0) as u32,
+                        (null_percent * 100.0) as u32
+                    ),
+                    Arc::new(values),
+                    &list_cols,
+                );
+            }
+        }
+    }
+}
+
+/// Benchmarks the dynamic IN list path for Utf8 arrays with column references.
+fn bench_dynamic_utf8(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(99);
+
+    for list_size in DYNAMIC_LIST_LENGTHS {
+        for match_percent in MATCH_PERCENTS {
+            // Generate the "needle" column
+            let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH)
+                .map(|_| rng.random_bool(0.8).then(|| random_string(&mut rng, 12)))
+                .collect();
+            let values: StringArray =
+                value_strings.iter().map(|s| s.as_deref()).collect();
+
+            // Generate list columns with controlled match rate
+            let list_cols: Vec<ArrayRef> = (0..list_size)
+                .map(|_| {
+                    let col: StringArray = (0..ARRAY_LENGTH)
+                        .map(|row| {
+                            if rng.random_bool(match_percent) {
+                                // Copy from values to create a match
+                                value_strings[row].as_deref()
+                            } else {
+                                Some("no_match_value_xyz")
+                            }
+                        })
+                        .collect();
+                    Arc::new(col) as ArrayRef
+                })
+                .collect();
+
+            do_bench_dynamic(
+                c,
+                &format!(
+                    "in_list_dynamic/Utf8/list={}/match={}%",
+                    list_size,
+                    (match_percent * 100.0) as u32,
+                ),
+                Arc::new(values),
+                &list_cols,
+            );
+        }
+    }
+}
+
 /// Entry point: registers in_list benchmarks for string and numeric array types.
 fn criterion_benchmark(c: &mut Criterion) {
     let mut rng = StdRng::seed_from_u64(120320);
@@ -266,6 +407,10 @@ fn criterion_benchmark(c: &mut Criterion) {
         |rng| rng.random(),
         |v| ScalarValue::TimestampNanosecond(Some(v), None),
     );
+
+    // Dynamic path benchmarks (non-constant list expressions)
+    bench_dynamic_int32(c);
+    bench_dynamic_utf8(c);
 }
 
 criterion_group! {

From 99faf913b3e351aceed7be480a1664fc13cccfe5 Mon Sep 17 00:00:00 2001
From: Zhang Xiaofeng <zhangxffff@gmail.com>
Date: Fri, 20 Feb 2026 14:35:45 +0000
Subject: [PATCH 2/2] rename dynamic to list_with_columns and add equivalent
 sql

---
 datafusion/physical-expr/benches/in_list.rs | 55 ++++++++++++++-------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs
index 405088f88fbb2..021d8259cdfdf 100644
--- a/datafusion/physical-expr/benches/in_list.rs
+++ b/datafusion/physical-expr/benches/in_list.rs
@@ -51,7 +51,7 @@ fn random_string(rng: &mut StdRng, len: usize) -> String {
 }
 
 const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100];
-const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28];
+const LIST_WITH_COLUMNS_LENGTHS: [usize; 3] = [3, 8, 28];
 const NULL_PERCENTS: [f64; 2] = [0., 0.2];
 const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0];
 const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
@@ -222,9 +222,18 @@ fn bench_realistic_mixed_strings<A>(
     }
 }
 
-/// Benchmarks the dynamic evaluation path (no static filter) by including
+/// Benchmarks the column-reference evaluation path (no static filter) by including
 /// a column reference in the IN list, which prevents static filter creation.
-fn do_bench_dynamic(
+///
+/// This simulates SQL like:
+/// ```sql
+/// CREATE TABLE t (a INT, b0 INT, b1 INT, b2 INT);
+/// SELECT * FROM t WHERE a IN (b0, b1, b2);
+/// ```
+///
+/// - `values`: the "needle" column (`a`)
+/// - `list_cols`: the "haystack" columns (`b0`, `b1`, …)
+fn do_bench_with_columns(
     c: &mut Criterion,
     name: &str,
     values: ArrayRef,
@@ -233,7 +242,7 @@ fn do_bench_dynamic(
     let mut fields = vec![Field::new("a", values.data_type().clone(), true)];
     let mut columns: Vec<ArrayRef> = vec![values];
 
-    // Build list expressions: mix of column refs (forces dynamic path)
+    // Build list expressions: column refs (forces non-constant evaluation path)
     let schema_fields: Vec<Field> = list_cols
         .iter()
         .enumerate()
@@ -259,11 +268,17 @@ fn do_bench_dynamic(
     });
 }
 
-/// Benchmarks the dynamic IN list path for Int32 arrays with column references.
-fn bench_dynamic_int32(c: &mut Criterion) {
+/// Benchmarks the IN list path with column references for Int32 arrays.
+///
+/// Equivalent SQL:
+/// ```sql
+/// CREATE TABLE t (a INT, b0 INT, b1 INT, ...);
+/// SELECT * FROM t WHERE a IN (b0, b1, ...);
+/// ```
+fn bench_with_columns_int32(c: &mut Criterion) {
     let mut rng = StdRng::seed_from_u64(42);
 
-    for list_size in DYNAMIC_LIST_LENGTHS {
+    for list_size in LIST_WITH_COLUMNS_LENGTHS {
         for match_percent in MATCH_PERCENTS {
             for null_percent in NULL_PERCENTS {
                 // Generate the "needle" column
@@ -300,10 +315,10 @@ fn bench_dynamic_int32(c: &mut Criterion) {
                     })
                     .collect();
 
-                do_bench_dynamic(
+                do_bench_with_columns(
                     c,
                     &format!(
-                        "in_list_dynamic/Int32/list={}/match={}%/nulls={}%",
+                        "in_list_cols/Int32/list={}/match={}%/nulls={}%",
                         list_size,
                         (match_percent * 100.0) as u32,
                         (null_percent * 100.0) as u32
@@ -316,11 +331,17 @@ fn bench_dynamic_int32(c: &mut Criterion) {
     }
 }
 
-/// Benchmarks the dynamic IN list path for Utf8 arrays with column references.
-fn bench_dynamic_utf8(c: &mut Criterion) {
+/// Benchmarks the IN list path with column references for Utf8 arrays.
+///
+/// Equivalent SQL:
+/// ```sql
+/// CREATE TABLE t (a VARCHAR, b0 VARCHAR, b1 VARCHAR, ...);
+/// SELECT * FROM t WHERE a IN (b0, b1, ...);
+/// ```
+fn bench_with_columns_utf8(c: &mut Criterion) {
     let mut rng = StdRng::seed_from_u64(99);
 
-    for list_size in DYNAMIC_LIST_LENGTHS {
+    for list_size in LIST_WITH_COLUMNS_LENGTHS {
         for match_percent in MATCH_PERCENTS {
             // Generate the "needle" column
             let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH)
@@ -346,10 +367,10 @@ fn bench_dynamic_utf8(c: &mut Criterion) {
                 })
                 .collect();
 
-            do_bench_dynamic(
+            do_bench_with_columns(
                 c,
                 &format!(
-                    "in_list_dynamic/Utf8/list={}/match={}%",
+                    "in_list_cols/Utf8/list={}/match={}%",
                     list_size,
                     (match_percent * 100.0) as u32,
                 ),
@@ -408,9 +429,9 @@ fn criterion_benchmark(c: &mut Criterion) {
         |v| ScalarValue::TimestampNanosecond(Some(v), None),
     );
 
-    // Dynamic path benchmarks (non-constant list expressions)
-    bench_dynamic_int32(c);
-    bench_dynamic_utf8(c);
+    // Column-reference path benchmarks (non-constant list expressions)
+    bench_with_columns_int32(c);
+    bench_with_columns_utf8(c);
 }
 
 criterion_group! {