Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions datafusion/core/tests/parquet/row_group_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2078,3 +2078,26 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
.await;
Ok(())
}

#[tokio::test]
async fn prune_like_prefix() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified that this test fails as follows without the code change:

thread 'parquet::row_group_pruning::prune_like_prefix' (83571793) panicked at datafusion/core/tests/parquet/row_group_pruning.rs:138:9:
assertion `left == right` failed: mismatched predicate_evaluation error
  left: Some(5)
 right: Some(0)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

// UTF8 scenario: 2 row groups (5 rows each)
// RG1: ["a","b","c","d",NULL] => min="a", max="d"
// RG2: ["e","f","g","h","i"] => min="e", max="i"
//
// LIKE 'a%' => build_like_match produces: "a" <= max AND min < "b" ("b" is the prefix "a" incremented)
// RG1: "a" <= "d" ✓, "a" < "b" ✓ => matched
// RG2: "a" <= "i" ✓, "e" < "b" ✗ => pruned
RowGroupPruningTest::new()
.with_scenario(Scenario::UTF8)
.with_query("SELECT * FROM t WHERE utf8 LIKE 'a%'")
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(1))
.with_pruned_files(Some(0))
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1) // only "a" matches LIKE 'a%'
.test_row_group_prune()
.await;
}
22 changes: 13 additions & 9 deletions datafusion/pruning/src/pruning_predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ use datafusion_common::{
ScalarValue, internal_datafusion_err, plan_datafusion_err, plan_err,
tree_node::{Transformed, TreeNode},
};
use datafusion_expr_common::casts::try_cast_literal_to_type;
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::utils::{Guarantee, LiteralGuarantee};
use datafusion_physical_expr::{PhysicalExprRef, expressions as phys_expr};
Expand Down Expand Up @@ -1816,6 +1817,13 @@ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
None
}

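/// Wrap a string in a `Literal` whose `ScalarValue` matches `target_type`.
///
/// The value is first built as `ScalarValue::Utf8` and then cast with
/// `try_cast_literal_to_type`; if that cast does not succeed, the `Utf8`
/// scalar is used unchanged.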
fn string_literal_as(value: String, target_type: &DataType) -> Arc<dyn PhysicalExpr> {
let utf8 = ScalarValue::Utf8(Some(value));
let scalar = try_cast_literal_to_type(&utf8, target_type).unwrap_or(utf8);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

It is sad that this potentially results in a new allocation -- maybe in a follow-on PR we can avoid the allocation in `try_cast_literal_to_type`.

Arc::new(phys_expr::Literal::new(scalar))
}

/// Convert `column LIKE literal` where P is a constant prefix of the literal
/// to a range check on the column: `P <= column && column < P'`, where P' is the
/// lowest string after all P* strings.
Expand All @@ -1835,6 +1843,8 @@ fn build_like_match(
let min_column_expr = expr_builder.min_column_expr().ok()?;
let max_column_expr = expr_builder.max_column_expr().ok()?;
let scalar_expr = expr_builder.scalar_expr();
// Synthesized bounds must match the column type (e.g. `Utf8View`).
let target_type = expr_builder.field.data_type();
// check that the scalar is a string literal
let s = extract_string_literal(scalar_expr)?;
// ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
Expand All @@ -1846,18 +1856,12 @@ fn build_like_match(
}
let (lower_bound, upper_bound) = if has_wildcard {
let incremented_prefix = increment_utf8(&decoded_prefix)?;
let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
decoded_prefix,
))));
let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
incremented_prefix,
))));
let lower_bound_lit = string_literal_as(decoded_prefix, target_type);
let upper_bound_lit = string_literal_as(incremented_prefix, target_type);
(lower_bound_lit, upper_bound_lit)
} else {
// the like expression is a literal and can be converted into a comparison
let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
decoded_prefix,
))));
let bound = string_literal_as(decoded_prefix, target_type);
(Arc::clone(&bound), bound)
};
let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
Expand Down
Loading