From d09a1550e41a8c4c28cda1b2a783d9a4dfb26a6c Mon Sep 17 00:00:00 2001
From: linfeng <33561138+lyne7-sc@users.noreply.github.com>
Date: Wed, 27 May 2026 15:52:19 +0800
Subject: [PATCH 1/2] fix like pruning

---
 datafusion/pruning/src/pruning_predicate.rs | 22 ++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs
index 28d4fe9028760..bacdd7032ead2 100644
--- a/datafusion/pruning/src/pruning_predicate.rs
+++ b/datafusion/pruning/src/pruning_predicate.rs
@@ -41,6 +41,7 @@ use datafusion_common::{
     ScalarValue, internal_datafusion_err, plan_datafusion_err, plan_err,
     tree_node::{Transformed, TreeNode},
 };
+use datafusion_expr_common::casts::try_cast_literal_to_type;
 use datafusion_expr_common::operator::Operator;
 use datafusion_physical_expr::utils::{Guarantee, LiteralGuarantee};
 use datafusion_physical_expr::{PhysicalExprRef, expressions as phys_expr};
@@ -1816,6 +1817,13 @@ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
     None
 }
 
+/// Wrap a string in a `Literal` whose `ScalarValue` matches `target_type`
+fn string_literal_as(value: String, target_type: &DataType) -> Arc<dyn PhysicalExpr> {
+    let utf8 = ScalarValue::Utf8(Some(value));
+    let scalar = try_cast_literal_to_type(&utf8, target_type).unwrap_or(utf8);
+    Arc::new(phys_expr::Literal::new(scalar))
+}
+
 /// Convert `column LIKE literal` where P is a constant prefix of the literal
 /// to a range check on the column: `P <= column && column < P'`, where P' is the
 /// lowest string after all P* strings.
@@ -1835,6 +1843,8 @@ fn build_like_match(
     let min_column_expr = expr_builder.min_column_expr().ok()?;
     let max_column_expr = expr_builder.max_column_expr().ok()?;
     let scalar_expr = expr_builder.scalar_expr();
+    // Synthesized bounds must match the column type (e.g. `Utf8View`).
+    let target_type = expr_builder.field.data_type();
     // check that the scalar is a string literal
     let s = extract_string_literal(scalar_expr)?;
     // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
@@ -1846,18 +1856,12 @@ fn build_like_match(
     }
     let (lower_bound, upper_bound) = if has_wildcard {
         let incremented_prefix = increment_utf8(&decoded_prefix)?;
-        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            decoded_prefix,
-        ))));
-        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            incremented_prefix,
-        ))));
+        let lower_bound_lit = string_literal_as(decoded_prefix, target_type);
+        let upper_bound_lit = string_literal_as(incremented_prefix, target_type);
         (lower_bound_lit, upper_bound_lit)
     } else {
         // the like expression is a literal and can be converted into a comparison
-        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            decoded_prefix,
-        ))));
+        let bound = string_literal_as(decoded_prefix, target_type);
         (Arc::clone(&bound), bound)
     };
     let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(

From 45375e3f0be546170165dc929defad8c9684e462 Mon Sep 17 00:00:00 2001
From: linfeng <33561138+lyne7-sc@users.noreply.github.com>
Date: Wed, 27 May 2026 15:57:20 +0800
Subject: [PATCH 2/2] add test for row group pruning with LIKE prefix

---
 .../core/tests/parquet/row_group_pruning.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index 441d1af3e96fd..0721715921909 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -2078,3 +2078,26 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
         .await;
     Ok(())
 }
+
+#[tokio::test]
+async fn prune_like_prefix() {
+    // UTF8 scenario: 2 row groups (5 rows each)
+    // RG1: ["a","b","c","d",NULL] => min="a", max="d"
+    // RG2: ["e","f","g","h","i"] => min="e", max="i"
+    //
+    // LIKE 'a%' => build_like_match checks, roughly: "a" <= max AND min < "b"
+    // RG1: "a" <= "d" ✓, "a" < "b" ✓ => matched
+    // RG2: "a" <= "i" ✓, "e" < "b" ✗ => pruned
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::UTF8)
+        .with_query("SELECT * FROM t WHERE utf8 LIKE 'a%'")
+        .with_expected_errors(Some(0))
+        .with_matched_by_stats(Some(1))
+        .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
+        .with_pruned_by_bloom_filter(Some(0))
+        .with_expected_rows(1) // only "a" matches LIKE 'a%'
+        .test_row_group_prune()
+        .await;
+}