From 1ed89c9789f6beac4a86695c1eb691bfd7f6f247 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 15:51:02 -0800 Subject: [PATCH 1/6] Optimize ~! '.*' case to False instead of Eq empty str condition (which was a bug) --- .../optimizer/src/simplify_expressions/regex.rs | 11 +++-------- .../src/simplify_expressions/simplify_exprs.rs | 16 ++++++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6c2492d05404d..f9ee2e1567162 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` /// - `EQ .*` to NotNull -/// - `NE .*` means IS EMPTY +/// - `NE .*` to false (.* matches any string, and NULL !~ results in NULL so NOT match can never be true) /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. pub fn simplify_regex_expr( @@ -68,13 +68,8 @@ pub fn simplify_regex_expr( // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - // not empty - let empty_lit = Box::new(string_scalar.to_expr("")); - Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right: empty_lit, - }) + // Always false. 
+ lit(false) } else { // not null left.is_not_null() diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index f7f100015004a..71a43f0ad12a0 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -883,17 +883,17 @@ mod tests { " )?; - // Test `!= ".*"` transforms to checking if the column is empty + // Test `!~ ".*"` transforms to false let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? .build()?; assert_optimized_plan_equal!( plan, - @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: Boolean(false) TableScan: test - "# + " )?; // Test case-insensitive versions @@ -911,17 +911,17 @@ mod tests { " )?; - // Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty + // Test `!~* ".*"` (case-insensitive) transforms to false let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? .build()?; assert_optimized_plan_equal!( plan, - @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: Boolean(false) TableScan: test - "# + " ) } From e0341edb13259205f95650c66c0a6b8305a9d9e9 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 4 Mar 2026 09:12:17 -0800 Subject: [PATCH 2/6] Make commented proof clearer --- datafusion/optimizer/src/simplify_expressions/regex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index f9ee2e1567162..c0f59b1c247a1 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// - partial anchored regex patterns (e.g. 
`^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` /// - `EQ .*` to NotNull -/// - `NE .*` to false (.* matches any string, and NULL !~ results in NULL so NOT match can never be true) +/// - `NE .*` to false (.* matches non-empty and empty strings, and NULL !~ '.*' results in NULL so this can never be true) /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. pub fn simplify_regex_expr( From 65d16c1162f72bc51f9b3eac3b4f9bcab5c9f5dd Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 4 Mar 2026 09:29:23 -0800 Subject: [PATCH 3/6] Update expected outputs for simplify_expr.slt --- datafusion/sqllogictest/test_files/simplify_expr.slt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index f8c219e052f80..b7ff19ef13b40 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -43,12 +43,8 @@ physical_plan query TT explain select b from t where b !~ '.*' ---- -logical_plan -01)Filter: t.b = Utf8View("") -02)--TableScan: t projection=[b] -physical_plan -01)FilterExec: b@0 = -02)--DataSourceExec: partitions=1, partition_sizes=[1] +logical_plan EmptyRelation: rows=0 +physical_plan EmptyExec query T select b from t where b ~ '.*' From f62da0bfd708ae0c2bb269ca6511cfafd75ab203 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 4 Mar 2026 18:53:04 -0800 Subject: [PATCH 4/6] Update logic to col IS NOT NULL AND Boolean(NULL) --- .../optimizer/src/simplify_expressions/regex.rs | 14 +++++++++----- .../src/simplify_expressions/simplify_exprs.rs | 6 +++--- .../sqllogictest/test_files/simplify_expr.slt | 8 ++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs 
b/datafusion/optimizer/src/simplify_expressions/regex.rs index c0f59b1c247a1..8e6b13886a4b4 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,8 +16,8 @@ // under the License. use datafusion_common::tree_node::Transformed; -use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; +use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::{BinaryExpr, Case, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; use crate::simplify_expressions::expr_simplifier::StringScalar; @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` /// - `EQ .*` to NotNull -/// - `NE .*` to false (.* matches non-empty and empty strings, and NULL !~ '.*' results in NULL so this can never be true) +/// - `NE .*` to col IS NOT NULL AND Boolean(NULL) (never true: NULL for any non-NULL string, false if col is NULL) /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. pub fn simplify_regex_expr( @@ -68,8 +68,12 @@ pub fn simplify_regex_expr( // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - // Always false. 
- lit(false) + let null_bool = lit(ScalarValue::Boolean(None)); + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left.is_not_null()), + op: Operator::And, + right: Box::new(null_bool), + }) } else { // not null left.is_not_null() diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 71a43f0ad12a0..e1fc039bf7b51 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -883,7 +883,7 @@ mod tests { " )?; - // Test `!~ ".*"` transforms to false + // Test `!~ ".*"` transforms to `col IS NOT NULL AND Boolean(NULL)` let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? .build()?; @@ -891,7 +891,7 @@ mod tests { assert_optimized_plan_equal!( plan, @ r" - Filter: Boolean(false) + Filter: test.a IS NOT NULL AND Boolean(NULL) TableScan: test " )?; @@ -919,7 +919,7 @@ mod tests { assert_optimized_plan_equal!( plan, @ r" - Filter: Boolean(false) + Filter: test.a IS NOT NULL AND Boolean(NULL) TableScan: test " ) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index b7ff19ef13b40..5c48c05ae3d0a 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -43,8 +43,12 @@ physical_plan query TT explain select b from t where b !~ '.*' ---- -logical_plan EmptyRelation: rows=0 -physical_plan EmptyExec +logical_plan +01)Filter: t.b IS NOT NULL AND Boolean(NULL) +02)--TableScan: t projection=[b] +physical_plan +01)FilterExec: b@0 IS NOT NULL AND NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] query T select b from t where b ~ '.*' From 58e080ce38b7be27748b545b16775cbae96d389e Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 4 Mar 2026 18:59:45 -0800 Subject: 
[PATCH 5/6] Add explicit NULL tests --- .../simplify_expressions/simplify_exprs.rs | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index e1fc039bf7b51..f11910a3cafea 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result}; +use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue}; use datafusion_expr::Expr; use datafusion_expr::logical_plan::LogicalPlan; use datafusion_expr::simplify::SimplifyContext; @@ -911,6 +911,23 @@ mod tests { " )?; + // Test NULL `!~ ".*"` transforms to Boolean(NULL) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr( + lit(ScalarValue::Utf8(None)), + Operator::RegexNotMatch, + lit(".*"), + ))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @ r" + Filter: Boolean(NULL) + TableScan: test + " + )?; + // Test `!~* ".*"` (case-insensitive) transforms to false let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? @@ -922,6 +939,23 @@ mod tests { Filter: test.a IS NOT NULL AND Boolean(NULL) TableScan: test " + )?; + + // Test NULL `!~* ".*"` transforms to Boolean(NULL) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr( + lit(ScalarValue::Utf8(None)), + Operator::RegexNotIMatch, + lit(".*"), + ))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @ r" + Filter: Boolean(NULL) + TableScan: test + " ) } From 35c6947c30a58d228f53c99133b4a127ed496c24 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 4 Mar 2026 19:10:51 -0800 Subject: [PATCH 6/6] Fix clippy --- datafusion/optimizer/src/simplify_expressions/regex.rs | 2 +- .../optimizer/src/simplify_expressions/simplify_exprs.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 8e6b13886a4b4..c7e56611a8d35 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -17,7 +17,7 @@ use datafusion_common::tree_node::Transformed; use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::{BinaryExpr, Case, Expr, Like, Operator, lit}; +use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; use crate::simplify_expressions::expr_simplifier::StringScalar; diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index f11910a3cafea..4e0317ab6062e 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue}; +use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result}; use datafusion_expr::Expr; use datafusion_expr::logical_plan::LogicalPlan; use datafusion_expr::simplify::SimplifyContext; @@ -155,6 +155,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, Utc}; + use datafusion_common::ScalarValue; use 
datafusion_expr::logical_plan::builder::table_scan_with_filters; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::*;