From 26e64aba30e73b9ce52f47d69eea755df8e113ac Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 15 Feb 2026 13:19:26 -0500 Subject: [PATCH 1/5] Revise benchmark for array_has() The previous implementation tested the cost of building an array_has() `Expr` (!), not actually evaluating the array_has() operation itself. Refactor things along the way. --- .../functions-nested/benches/array_has.rs | 676 +++++++++++------- 1 file changed, 426 insertions(+), 250 deletions(-) diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs index d96f26d410dd..302ef9168650 100644 --- a/datafusion/functions-nested/benches/array_has.rs +++ b/datafusion/functions-nested/benches/array_has.rs @@ -15,19 +15,31 @@ // specific language governing permissions and limitations // under the License. +use arrow::array::{ArrayRef, Int64Array, ListArray, StringArray}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::{DataType, Field}; use criterion::{ criterion_group, criterion_main, {BenchmarkId, Criterion}, }; -use datafusion_expr::lit; -use datafusion_functions_nested::expr_fn::{ - array_has, array_has_all, array_has_any, make_array, -}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::array_has::{ArrayHas, ArrayHasAll, ArrayHasAny}; +use rand::Rng; +use rand::SeedableRng; +use rand::rngs::StdRng; use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 10000; +const SEED: u64 = 42; +const NULL_DENSITY: f64 = 0.1; +const NEEDLE_SIZE: usize = 3; // If not explicitly stated, `array` and `array_size` refer to the haystack array. fn criterion_benchmark(c: &mut Criterion) { // Test different array sizes - let array_sizes = vec![1, 10, 100, 1000, 10000]; + let array_sizes = vec![10, 100, 500]; for &size in &array_sizes { bench_array_has(c, size); @@ -39,50 +51,65 @@ fn criterion_benchmark(c: &mut Criterion) { bench_array_has_strings(c); bench_array_has_all_strings(c); bench_array_has_any_strings(c); - - // Edge cases - bench_array_has_edge_cases(c); } fn bench_array_has(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_i64"); - - // Benchmark: element found at beginning - group.bench_with_input( - BenchmarkId::new("found_at_start", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit(0_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }, - ); - - // Benchmark: element found at end + let list_array = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("arr", list_array.data_type().clone(), false).into(), + Field::new("el", DataType::Int64, false).into(), + ]; + + // Benchmark: element found + let args_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ]; group.bench_with_input( - BenchmarkId::new("found_at_end", array_size), + BenchmarkId::new("found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit((size - 1) as i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); // Benchmark: element not found + let args_not_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Int64(Some(-999))), + ]; group.bench_with_input( BenchmarkId::new("not_found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle = lit(-1_i64); // Not in array - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_not_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -91,45 +118,65 @@ fn bench_array_has(c: &mut Criterion, array_size: usize) { fn bench_array_has_all(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_all"); + let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; // Benchmark: all elements found (small needle) + let needle_found = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_found = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_found), + ]; group.bench_with_input( BenchmarkId::new("all_found_small_needle", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(0_i64), lit(1_i64), lit(2_i64)]); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); - // Benchmark: all elements found (medium needle - 10% of haystack) + // Benchmark: not all found (needle contains elements outside haystack range) + let needle_missing = + create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); + let args_missing = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_missing), + ]; group.bench_with_input( - BenchmarkId::new("all_found_medium_needle", array_size), + BenchmarkId::new("not_all_found", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_size = (size / 10).max(1); - let needle = (0..needle_size).map(|i| lit(i as i64)).collect::>(); - let needle_array = make_array(needle); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) - }, - ); - - // Benchmark: not all found (early exit) - group.bench_with_input( - BenchmarkId::new("early_exit", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(0_i64), lit(-1_i64)]); // -1 not in array - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_missing.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -138,43 +185,65 @@ fn bench_array_has_all(c: &mut Criterion, array_size: usize) { fn bench_array_has_any(c: &mut Criterion, array_size: usize) { let mut group = c.benchmark_group("array_has_any"); - - // Benchmark: first element matches (best case) + let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + // Benchmark: some elements match + let needle_match = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_match), + ]; group.bench_with_input( - BenchmarkId::new("first_match", array_size), + BenchmarkId::new("some_match", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(0_i64), lit(-1_i64), lit(-2_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) - }, - ); - - // Benchmark: last element matches (worst case) - group.bench_with_input( - BenchmarkId::new("last_match", array_size), - &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(0_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); // Benchmark: no match + let needle_no_match = + create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64); + let args_no_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_no_match), + ]; group.bench_with_input( BenchmarkId::new("no_match", array_size), &array_size, - |b, &size| { - let array = (0..size).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(-3_i64)]); - - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) + |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }, ); @@ -183,29 +252,56 @@ fn bench_array_has_any(c: &mut Criterion, array_size: usize) { fn bench_array_has_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - // Benchmark with string arrays (common use case for tickers, tags, etc.) - let sizes = vec![10, 100, 1000]; + let sizes = vec![10, 100, 500]; for &size in &sizes { - group.bench_with_input(BenchmarkId::new("found", size), &size, |b, &size| { - let array = (0..size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(array); - let needle = lit("TICKER0005"); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + let list_array = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let arg_fields: Vec> = vec![ + Field::new("arr", list_array.data_type().clone(), false).into(), + Field::new("el", DataType::Utf8, false).into(), + ]; + + let args_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("value_1".to_string()))), + ]; + group.bench_with_input(BenchmarkId::new("found", size), &size, |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }); - group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, &size| { - let array = (0..size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(array); - let needle = lit("NOTFOUND"); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) + let args_not_found = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("NOTFOUND".to_string()))), + ]; + group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, _| { + let udf = ArrayHas::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_not_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) }); } @@ -214,52 +310,61 @@ fn bench_array_has_strings(c: &mut Criterion) { fn bench_array_has_all_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_all_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - // Realistic scenario: checking if a portfolio contains certain tickers - let portfolio_size = 100; - let check_sizes = vec![1, 3, 5, 10]; - - for &check_size in &check_sizes { - group.bench_with_input( - BenchmarkId::new("all_found", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let checking = (0..check_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_all(list_array.clone(), needle_array.clone())) - }) - }, - ); - - group.bench_with_input( - BenchmarkId::new("some_missing", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let mut checking = (0..check_size - 1) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - checking.push(lit("NOTFOUND".to_string())); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_all(list_array.clone(), needle_array.clone())) - }) - }, - ); + let sizes = vec![10, 100, 500]; + + for &size in &sizes { + let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + let needle_found = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_found = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_found), + ]; + group.bench_with_input(BenchmarkId::new("all_found", size), &size, |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); + + let needle_missing = + create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); + let args_missing = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_missing), + ]; + group.bench_with_input(BenchmarkId::new("not_all_found", size), &size, |b, _| { + let udf = ArrayHasAll::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_missing.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); } group.finish(); @@ -267,109 +372,180 @@ fn bench_array_has_all_strings(c: &mut Criterion) { fn bench_array_has_any_strings(c: &mut Criterion) { let mut group = c.benchmark_group("array_has_any_strings"); + let config_options = Arc::new(ConfigOptions::default()); + let return_field: Arc = Field::new("result", DataType::Boolean, true).into(); - let portfolio_size = 100; - let check_sizes = vec![1, 3, 5, 10]; - - for &check_size in &check_sizes { - group.bench_with_input( - BenchmarkId::new("first_matches", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let mut checking = vec![lit("TICKER0000".to_string())]; - checking.extend((1..check_size).map(|_| lit("NOTFOUND".to_string()))); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_any(list_array.clone(), needle_array.clone())) - }) - }, - ); - - group.bench_with_input( - BenchmarkId::new("none_match", check_size), - &check_size, - |b, &check_size| { - let portfolio = (0..portfolio_size) - .map(|i| lit(format!("TICKER{i:04}"))) - .collect::>(); - let list_array = make_array(portfolio); - - let checking = (0..check_size) - .map(|i| lit(format!("NOTFOUND{i}"))) - .collect::>(); - let needle_array = make_array(checking); - - b.iter(|| { - black_box(array_has_any(list_array.clone(), needle_array.clone())) - }) - }, - ); + let sizes = vec![10, 100, 500]; + + for &size in &sizes { + let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY); + let list_type = haystack.data_type().clone(); + let arg_fields: Vec> = vec![ + Field::new("haystack", list_type.clone(), false).into(), + Field::new("needle", list_type.clone(), false).into(), + ]; + + let needle_match = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0); + let args_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_match), + ]; + group.bench_with_input(BenchmarkId::new("some_match", size), &size, |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); + + let needle_no_match = + create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_"); + let args_no_match = vec![ + ColumnarValue::Array(haystack.clone()), + ColumnarValue::Array(needle_no_match), + ]; + group.bench_with_input(BenchmarkId::new("no_match", size), &size, |b, _| { + let udf = ArrayHasAny::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_no_match.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }); } group.finish(); } -fn bench_array_has_edge_cases(c: &mut Criterion) { - let mut group = c.benchmark_group("array_has_edge_cases"); - - // Empty array - group.bench_function("empty_array", |b| { - let list_array = make_array(vec![]); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Single element array - found - group.bench_function("single_element_found", |b| { - let list_array = make_array(vec![lit(1_i64)]); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Single element array - not found - group.bench_function("single_element_not_found", |b| { - let list_array = make_array(vec![lit(1_i64)]); - let needle = lit(2_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // Array with duplicates - group.bench_function("array_with_duplicates", |b| { - let array = vec![lit(1_i64); 1000]; - let list_array = make_array(array); - let needle = lit(1_i64); - - b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) - }); - - // array_has_all: empty needle - group.bench_function("array_has_all_empty_needle", |b| { - let array = (0..1000).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![]); - - b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) - }); +fn create_int64_list_array( + num_rows: usize, + array_size: usize, + null_density: f64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..num_rows * array_size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + Some(rng.random_range(0..array_size as i64)) + } + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} - // array_has_any: empty needle - group.bench_function("array_has_any_empty_needle", |b| { - let array = (0..1000).map(|i| lit(i as i64)).collect::>(); - let list_array = make_array(array); - let needle_array = make_array(vec![]); +/// Like `create_int64_list_array` but values are offset so they won't +/// appear in a standard list array (useful for "not found" benchmarks). +fn create_int64_list_array_with_offset( + num_rows: usize, + array_size: usize, + offset: i64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED + 1); + let values = (0..num_rows * array_size) + .map(|_| Some(rng.random_range(0..array_size as i64) + offset)) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} - b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) - }); +fn create_string_list_array( + num_rows: usize, + array_size: usize, + null_density: f64, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..num_rows * array_size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let idx = rng.random_range(0..array_size); + Some(format!("value_{idx}")) + } + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} - group.finish(); +/// Like `create_string_list_array` but values use a different prefix so +/// they won't appear in a standard string list array. +fn create_string_list_array_with_prefix( + num_rows: usize, + array_size: usize, + prefix: &str, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED + 1); + let values = (0..num_rows * array_size) + .map(|_| { + let idx = rng.random_range(0..array_size); + Some(format!("{prefix}{idx}")) + }) + .collect::(); + let offsets = (0..=num_rows) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) } criterion_group!(benches, criterion_benchmark); From 378cbef7dfbcbdb1d35c69069ccb2ef49f32c785 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 15 Feb 2026 13:21:46 -0500 Subject: [PATCH 2/5] Optimize array_has() --- datafusion/functions-nested/src/array_has.rs | 39 +++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index abc0e7406b2c..826ef4677621 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -353,7 +353,18 @@ fn array_has_dispatch_for_scalar( ))); } let eq_array = compare_with_eq(values, needle, is_nested)?; - let mut final_contained = vec![None; haystack.len()]; + + // When a haystack element is null, `eq()` returns null (not false). + // In Arrow, a null BooleanArray entry has validity=0 but an + // undefined value bit that may happen to be 1. Since set_indices() + // operates on the raw value buffer and ignores validity, we AND the + // values with the validity bitmap to clear any undefined bits at + // null positions. This ensures set_indices() only yields positions + // where the comparison genuinely returned true. + let eq_bits = match eq_array.nulls() { + Some(nulls) => eq_array.values() & nulls.inner(), + None => eq_array.values().clone(), + }; // Check validity buffer to distinguish between null and empty arrays let validity = match &haystack { @@ -362,24 +373,34 @@ fn array_has_dispatch_for_scalar( ArrayWrapper::LargeList(arr) => arr.nulls(), }; - for (i, (start, end)) in haystack.offsets().tuple_windows().enumerate() { - let length = end - start; + let offsets: Vec = haystack.offsets().collect(); + let mut matches = eq_bits.set_indices().peekable(); + let mut final_contained = vec![Some(false); haystack.len()]; + + for (i, window) in offsets.windows(2).enumerate() { + let end = window[1]; // Check if the array at this position is null if let Some(validity_buffer) = validity && !validity_buffer.is_valid(i) { final_contained[i] = None; // null array -> null result + // Advance past any match positions in this null row's range. + while matches.peek().is_some_and(|&p| p < end) { + matches.next(); + } continue; } - // For non-null arrays: length is 0 for empty arrays - if length == 0 { - final_contained[i] = Some(false); // empty array -> false - } else { - let sliced_array = eq_array.slice(start, length); - final_contained[i] = Some(sliced_array.true_count() > 0); + // Check if any match falls within this row's range. + if matches.peek().is_some_and(|&p| p < end) { + final_contained[i] = Some(true); + // Advance past remaining matches in this row. + while matches.peek().is_some_and(|&p| p < end) { + matches.next(); + } } + // else: no match in range, stays Some(false) } Ok(Arc::new(BooleanArray::from(final_contained))) From 64ce7570a5e63d60542d420ada74a13dde50c252 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 19 Feb 2026 07:43:29 -0500 Subject: [PATCH 3/5] Avoid alloc when iterating over offsets Co-authored-by: Jeffrey Vo --- datafusion/functions-nested/src/array_has.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 826ef4677621..784740317f37 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -377,7 +377,7 @@ fn array_has_dispatch_for_scalar( let mut matches = eq_bits.set_indices().peekable(); let mut final_contained = vec![Some(false); haystack.len()]; - for (i, window) in offsets.windows(2).enumerate() { + for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() { let end = window[1]; // Check if the array at this position is null From a72ca08ecf2d2185158444ed236af2468045cb43 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 19 Feb 2026 08:01:57 -0500 Subject: [PATCH 4/5] Switch to BooleanBufferBuilder, refactor loop --- datafusion/functions-nested/src/array_has.rs | 42 ++++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 784740317f37..2d07d2313a38 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -17,7 +17,7 @@ //! [`ScalarUDFImpl`] definitions for array_has, array_has_all and array_has_any functions. -use arrow::array::{Array, ArrayRef, BooleanArray, Datum, Scalar}; +use arrow::array::{Array, ArrayRef, BooleanArray, BooleanBufferBuilder, Datum, Scalar}; use arrow::buffer::BooleanBuffer; use arrow::datatypes::DataType; use arrow::row::{RowConverter, Rows, SortField}; @@ -366,44 +366,34 @@ fn array_has_dispatch_for_scalar( None => eq_array.values().clone(), }; - // Check validity buffer to distinguish between null and empty arrays let validity = match &haystack { ArrayWrapper::FixedSizeList(arr) => arr.nulls(), ArrayWrapper::List(arr) => arr.nulls(), ArrayWrapper::LargeList(arr) => arr.nulls(), }; - - let offsets: Vec = haystack.offsets().collect(); let mut matches = eq_bits.set_indices().peekable(); - let mut final_contained = vec![Some(false); haystack.len()]; + let mut values = BooleanBufferBuilder::new(haystack.len()); + values.append_n(haystack.len(), false); for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() { - let end = window[1]; - - // Check if the array at this position is null - if let Some(validity_buffer) = validity - && !validity_buffer.is_valid(i) - { - final_contained[i] = None; // null array -> null result - // Advance past any match positions in this null row's range. - while matches.peek().is_some_and(|&p| p < end) { - matches.next(); - } - continue; + let has_match = matches.peek().is_some_and(|&p| p < end); + + // Advance past all match positions in this row's range. + while matches.peek().is_some_and(|&p| p < end) { + matches.next(); } - // Check if any match falls within this row's range. - if matches.peek().is_some_and(|&p| p < end) { - final_contained[i] = Some(true); - // Advance past remaining matches in this row. - while matches.peek().is_some_and(|&p| p < end) { - matches.next(); - } + if has_match && !validity.is_some_and(|v| !v.is_valid(i)) { + values.set_bit(i, true); } - // else: no match in range, stays Some(false) } - Ok(Arc::new(BooleanArray::from(final_contained))) + // A null haystack row always produces a null output, so we can + // reuse the haystack's null buffer directly. + Ok(Arc::new(BooleanArray::new( + values.finish(), + validity.cloned(), + ))) } fn array_has_all_inner(args: &[ArrayRef]) -> Result { From e42cd555177a4939901de14fe6f3ad2ace9809fd Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 19 Feb 2026 08:05:32 -0500 Subject: [PATCH 5/5] Apply clippy fix --- datafusion/functions-nested/src/array_has.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 2d07d2313a38..e34239ed4919 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -383,7 +383,7 @@ fn array_has_dispatch_for_scalar( matches.next(); } - if has_match && !validity.is_some_and(|v| !v.is_valid(i)) { + if has_match && validity.is_none_or(|v| v.is_valid(i)) { values.set_bit(i, true); } }