diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6c2492d05404d..c7e56611a8d35 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,7 +16,7 @@ // under the License. use datafusion_common::tree_node::Transformed; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` /// - `EQ .*` to NotNull -/// - `NE .*` means IS EMPTY +/// - `NE .*` to col IS NOT NULL AND Boolean(NULL) (never true: NULL for any non-NULL string, or false if col is NULL) /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. 
pub fn simplify_regex_expr( @@ -68,12 +68,11 @@ pub fn simplify_regex_expr( // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - // not empty - let empty_lit = Box::new(string_scalar.to_expr("")); + let null_bool = lit(ScalarValue::Boolean(None)); Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right: empty_lit, + left: Box::new(left.is_not_null()), + op: Operator::And, + right: Box::new(null_bool), }) } else { // not null diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index f7f100015004a..4e0317ab6062e 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -155,6 +155,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, Utc}; + use datafusion_common::ScalarValue; use datafusion_expr::logical_plan::builder::table_scan_with_filters; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::*; @@ -883,17 +884,17 @@ mod tests { " )?; - // Test `!= ".*"` transforms to checking if the column is empty + // Test `!~ ".*"` transforms to `col IS NOT NULL AND Boolean(NULL)` let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? .build()?; assert_optimized_plan_equal!( plan, @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: test.a IS NOT NULL AND Boolean(NULL) TableScan: test - "# + " )?; // Test case-insensitive versions @@ -911,17 +912,51 @@ mod tests { " )?; - // Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty + // Test NULL `!~ ".*"` transforms to Boolean(NULL) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr( + lit(ScalarValue::Utf8(None)), + Operator::RegexNotMatch, + lit(".*"), + ))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @ r" + Filter: Boolean(NULL) + TableScan: test + " + )?; + + // Test `!~* ".*"` (case-insensitive) transforms to `col IS NOT NULL AND Boolean(NULL)` let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? .build()?; assert_optimized_plan_equal!( plan, - @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: test.a IS NOT NULL AND Boolean(NULL) TableScan: test - "# + " + )?; + + // Test NULL `!~* ".*"` transforms to Boolean(NULL) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr( + lit(ScalarValue::Utf8(None)), + Operator::RegexNotIMatch, + lit(".*"), + ))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @ r" + Filter: Boolean(NULL) + TableScan: test + " ) } diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index f8c219e052f80..5c48c05ae3d0a 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -44,10 +44,10 @@ query TT explain select b from t where b !~ '.*' ---- logical_plan -01)Filter: t.b = Utf8View("") +01)Filter: t.b IS NOT NULL AND Boolean(NULL) 02)--TableScan: t projection=[b] physical_plan -01)FilterExec: b@0 = +01)FilterExec: b@0 IS NOT NULL AND NULL 02)--DataSourceExec: partitions=1, partition_sizes=[1] query T