Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.

use datafusion_common::tree_node::Transformed;
use datafusion_common::{DataFusionError, Result};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};

Expand All @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
/// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND`
/// - `EQ .*` to NotNull
/// - `NE .*` means IS EMPTY
/// - `NE .*` to col IS NOT NULL AND Boolean(NULL) (false for any string, or NULL if col is NULL)
///
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
pub fn simplify_regex_expr(
Expand Down Expand Up @@ -68,12 +68,11 @@ pub fn simplify_regex_expr(
// Handle the special case for ".*" pattern
if pattern == ANY_CHAR_REGEX_PATTERN {
let new_expr = if mode.not {
// not empty
let empty_lit = Box::new(string_scalar.to_expr(""));
let null_bool = lit(ScalarValue::Boolean(None));
Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Eq,
right: empty_lit,
left: Box::new(left.is_not_null()),
op: Operator::And,
right: Box::new(null_bool),
})
} else {
// not null
Expand Down
51 changes: 43 additions & 8 deletions datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ mod tests {
use arrow::datatypes::{DataType, Field, Schema};
use chrono::{DateTime, Utc};

use datafusion_common::ScalarValue;
use datafusion_expr::logical_plan::builder::table_scan_with_filters;
use datafusion_expr::logical_plan::table_scan;
use datafusion_expr::*;
Expand Down Expand Up @@ -883,17 +884,17 @@ mod tests {
"
)?;

// Test `!= ".*"` transforms to checking if the column is empty
// Test `!~ ".*"` transforms to CASE WHEN col IS NOT NULL THEN FALSE ELSE NULL END
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r#"
Filter: test.a = Utf8("")
@ r"
Filter: test.a IS NOT NULL AND Boolean(NULL)
TableScan: test
"#
"
)?;

// Test case-insensitive versions
Expand All @@ -911,17 +912,51 @@ mod tests {
"
)?;

// Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty
// Test NULL `!~ ".*"` transforms to Boolean(NULL)
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(
lit(ScalarValue::Utf8(None)),
Operator::RegexNotMatch,
lit(".*"),
))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r"
Filter: Boolean(NULL)
TableScan: test
"
)?;

// Test `!~* ".*"` (case-insensitive) transforms to false
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r#"
Filter: test.a = Utf8("")
@ r"
Filter: test.a IS NOT NULL AND Boolean(NULL)
TableScan: test
"#
"
)?;

// Test NULL `!~* ".*"` transforms to Boolean(NULL)
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(
lit(ScalarValue::Utf8(None)),
Operator::RegexNotIMatch,
lit(".*"),
))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r"
Filter: Boolean(NULL)
TableScan: test
"
)
}

Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/simplify_expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ query TT
explain select b from t where b !~ '.*'
----
logical_plan
01)Filter: t.b = Utf8View("")
01)Filter: t.b IS NOT NULL AND Boolean(NULL)
02)--TableScan: t projection=[b]
physical_plan
01)FilterExec: b@0 =
01)FilterExec: b@0 IS NOT NULL AND NULL
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query T
Expand Down
Loading