From cb13f93636b96b5bbbdfbf33fbc20e96fda30a39 Mon Sep 17 00:00:00 2001 From: Zhang Xiaofeng Date: Fri, 20 Feb 2026 04:18:54 +0000 Subject: [PATCH 1/2] bench: Add dynamic IN list benchmarks for non-constant list expressions --- datafusion/physical-expr/benches/in_list.rs | 145 ++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs index 954715d0e5a9c..405088f88fbb2 100644 --- a/datafusion/physical-expr/benches/in_list.rs +++ b/datafusion/physical-expr/benches/in_list.rs @@ -23,6 +23,7 @@ use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; use criterion::{Criterion, criterion_group, criterion_main}; use datafusion_common::ScalarValue; +use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{col, in_list, lit}; use rand::distr::Alphanumeric; use rand::prelude::*; @@ -50,7 +51,9 @@ fn random_string(rng: &mut StdRng, len: usize) -> String { } const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100]; +const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28]; const NULL_PERCENTS: [f64; 2] = [0., 0.2]; +const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0]; const STRING_LENGTHS: [usize; 3] = [3, 12, 100]; const ARRAY_LENGTH: usize = 8192; @@ -219,6 +222,144 @@ fn bench_realistic_mixed_strings( } } +/// Benchmarks the dynamic evaluation path (no static filter) by including +/// a column reference in the IN list, which prevents static filter creation. 
+fn do_bench_dynamic( + c: &mut Criterion, + name: &str, + values: ArrayRef, + list_cols: &[ArrayRef], +) { + let mut fields = vec![Field::new("a", values.data_type().clone(), true)]; + let mut columns: Vec<ArrayRef> = vec![values]; + + // Build list expressions: mix of column refs (forces dynamic path) + let schema_fields: Vec<Field> = list_cols + .iter() + .enumerate() + .map(|(i, col_arr)| { + let name = format!("b{i}"); + fields.push(Field::new(&name, col_arr.data_type().clone(), true)); + columns.push(Arc::clone(col_arr)); + Field::new(&name, col_arr.data_type().clone(), true) + }) + .collect(); + + let schema = Schema::new(fields); + let list_exprs: Vec<Arc<dyn PhysicalExpr>> = schema_fields + .iter() + .map(|f| col(f.name(), &schema).unwrap()) + .collect(); + + let expr = in_list(col("a", &schema).unwrap(), list_exprs, &false, &schema).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap(); + + c.bench_function(name, |b| { + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); +} + +/// Benchmarks the dynamic IN list path for Int32 arrays with column references. 
+fn bench_dynamic_int32(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(42); + + for list_size in DYNAMIC_LIST_LENGTHS { + for match_percent in MATCH_PERCENTS { + for null_percent in NULL_PERCENTS { + // Generate the "needle" column + let values: Int32Array = (0..ARRAY_LENGTH) + .map(|_| { + rng.random_bool(1.0 - null_percent) + .then(|| rng.random_range(0..1000)) + }) + .collect(); + + // Generate list columns with controlled match rate + let list_cols: Vec<ArrayRef> = (0..list_size) + .map(|_| { + let col: Int32Array = (0..ARRAY_LENGTH) + .map(|row| { + if rng.random_bool(1.0 - null_percent) { + if rng.random_bool(match_percent) { + // Copy from values to create a match + if values.is_null(row) { + Some(rng.random_range(0..1000)) + } else { + Some(values.value(row)) + } + } else { + // Random value (unlikely to match) + Some(rng.random_range(1000..2000)) + } + } else { + None + } + }) + .collect(); + Arc::new(col) as ArrayRef + }) + .collect(); + + do_bench_dynamic( + c, + &format!( + "in_list_dynamic/Int32/list={}/match={}%/nulls={}%", + list_size, + (match_percent * 100.0) as u32, + (null_percent * 100.0) as u32 + ), + Arc::new(values), + &list_cols, + ); + } + } + } +} + +/// Benchmarks the dynamic IN list path for Utf8 arrays with column references. 
+fn bench_dynamic_utf8(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(99); + + for list_size in DYNAMIC_LIST_LENGTHS { + for match_percent in MATCH_PERCENTS { + // Generate the "needle" column + let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH) + .map(|_| rng.random_bool(0.8).then(|| random_string(&mut rng, 12))) + .collect(); + let values: StringArray = + value_strings.iter().map(|s| s.as_deref()).collect(); + + // Generate list columns with controlled match rate + let list_cols: Vec<ArrayRef> = (0..list_size) + .map(|_| { + let col: StringArray = (0..ARRAY_LENGTH) + .map(|row| { + if rng.random_bool(match_percent) { + // Copy from values to create a match + value_strings[row].as_deref() + } else { + Some("no_match_value_xyz") + } + }) + .collect(); + Arc::new(col) as ArrayRef + }) + .collect(); + + do_bench_dynamic( + c, + &format!( + "in_list_dynamic/Utf8/list={}/match={}%", + list_size, + (match_percent * 100.0) as u32, + ), + Arc::new(values), + &list_cols, + ); + } + } +} + /// Entry point: registers in_list benchmarks for string and numeric array types. fn criterion_benchmark(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(120320); @@ -266,6 +407,10 @@ fn criterion_benchmark(c: &mut Criterion) { |rng| rng.random(), |v| ScalarValue::TimestampNanosecond(Some(v), None), ); + + // Dynamic path benchmarks (non-constant list expressions) + bench_dynamic_int32(c); + bench_dynamic_utf8(c); } criterion_group! 
{ From 99faf913b3e351aceed7be480a1664fc13cccfe5 Mon Sep 17 00:00:00 2001 From: Zhang Xiaofeng Date: Fri, 20 Feb 2026 14:35:45 +0000 Subject: [PATCH 2/2] rename dynamic to list_with_columns and add equivalent sql --- datafusion/physical-expr/benches/in_list.rs | 55 ++++++++++++++------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs index 405088f88fbb2..021d8259cdfdf 100644 --- a/datafusion/physical-expr/benches/in_list.rs +++ b/datafusion/physical-expr/benches/in_list.rs @@ -51,7 +51,7 @@ fn random_string(rng: &mut StdRng, len: usize) -> String { } const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100]; -const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28]; +const LIST_WITH_COLUMNS_LENGTHS: [usize; 3] = [3, 8, 28]; const NULL_PERCENTS: [f64; 2] = [0., 0.2]; const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0]; const STRING_LENGTHS: [usize; 3] = [3, 12, 100]; @@ -222,9 +222,18 @@ fn bench_realistic_mixed_strings( } } -/// Benchmarks the dynamic evaluation path (no static filter) by including +/// Benchmarks the column-reference evaluation path (no static filter) by including /// a column reference in the IN list, which prevents static filter creation. 
-fn do_bench_dynamic( +/// +/// This simulates SQL like: +/// ```sql +/// CREATE TABLE t (a INT, b0 INT, b1 INT, b2 INT); +/// SELECT * FROM t WHERE a IN (b0, b1, b2); +/// ``` +/// +/// - `values`: the "needle" column (`a`) +/// - `list_cols`: the "haystack" columns (`b0`, `b1`, …) +fn do_bench_with_columns( c: &mut Criterion, name: &str, values: ArrayRef, @@ -233,7 +242,7 @@ fn do_bench_dynamic( let mut fields = vec![Field::new("a", values.data_type().clone(), true)]; let mut columns: Vec<ArrayRef> = vec![values]; - // Build list expressions: mix of column refs (forces dynamic path) + // Build list expressions: column refs (forces non-constant evaluation path) let schema_fields: Vec<Field> = list_cols .iter() .enumerate() @@ -259,11 +268,17 @@ fn do_bench_dynamic( }); } -/// Benchmarks the dynamic IN list path for Int32 arrays with column references. -fn bench_dynamic_int32(c: &mut Criterion) { +/// Benchmarks the IN list path with column references for Int32 arrays. +/// +/// Equivalent SQL: +/// ```sql +/// CREATE TABLE t (a INT, b0 INT, b1 INT, ...); +/// SELECT * FROM t WHERE a IN (b0, b1, ...); +/// ``` +fn bench_with_columns_int32(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(42); - for list_size in DYNAMIC_LIST_LENGTHS { + for list_size in LIST_WITH_COLUMNS_LENGTHS { for match_percent in MATCH_PERCENTS { for null_percent in NULL_PERCENTS { // Generate the "needle" column @@ -300,10 +315,10 @@ fn bench_dynamic_int32(c: &mut Criterion) { }) .collect(); - do_bench_dynamic( + do_bench_with_columns( c, &format!( - "in_list_dynamic/Int32/list={}/match={}%/nulls={}%", + "in_list_cols/Int32/list={}/match={}%/nulls={}%", list_size, (match_percent * 100.0) as u32, (null_percent * 100.0) as u32 @@ -316,11 +331,17 @@ fn bench_dynamic_int32(c: &mut Criterion) { } } -/// Benchmarks the dynamic IN list path for Utf8 arrays with column references. -fn bench_dynamic_utf8(c: &mut Criterion) { +/// Benchmarks the IN list path with column references for Utf8 arrays. 
+/// +/// Equivalent SQL: +/// ```sql +/// CREATE TABLE t (a VARCHAR, b0 VARCHAR, b1 VARCHAR, ...); +/// SELECT * FROM t WHERE a IN (b0, b1, ...); +/// ``` +fn bench_with_columns_utf8(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(99); - for list_size in DYNAMIC_LIST_LENGTHS { + for list_size in LIST_WITH_COLUMNS_LENGTHS { for match_percent in MATCH_PERCENTS { // Generate the "needle" column let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH) @@ -346,10 +367,10 @@ fn bench_dynamic_utf8(c: &mut Criterion) { }) .collect(); - do_bench_dynamic( + do_bench_with_columns( c, &format!( - "in_list_dynamic/Utf8/list={}/match={}%", + "in_list_cols/Utf8/list={}/match={}%", list_size, (match_percent * 100.0) as u32, ), @@ -408,9 +429,9 @@ fn criterion_benchmark(c: &mut Criterion) { |v| ScalarValue::TimestampNanosecond(Some(v), None), ); - // Dynamic path benchmarks (non-constant list expressions) - bench_dynamic_int32(c); - bench_dynamic_utf8(c); + // Column-reference path benchmarks (non-constant list expressions) + bench_with_columns_int32(c); + bench_with_columns_utf8(c); } criterion_group! {