From cf300e702ad506fff61511e9e83b4cc22fafb2d1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:20:24 +0000 Subject: [PATCH 1/6] fix: prevent duplicate alias collision with user-provided __datafusion_extracted names (#20430) When a user query contains an explicit alias using the reserved `__datafusion_extracted` prefix, the optimizer's AliasGenerator could generate the same alias name, causing a "Schema contains duplicate unqualified field name" error. Fix by scanning each plan node's expressions for pre-existing `__datafusion_extracted_N` aliases during the TopDown traversal in ExtractLeafExpressions, advancing the generator counter past them before any extraction occurs. Closes #20430 Co-Authored-By: Claude Opus 4.6 --- datafusion/common/src/alias.rs | 10 +++ .../optimizer/src/extract_leaf_expressions.rs | 72 +++++++++++++++++++ .../test_files/projection_pushdown.slt | 13 ++++ 3 files changed, 95 insertions(+) diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs index 2ee2cb4dc7add..947a9627490bf 100644 --- a/datafusion/common/src/alias.rs +++ b/datafusion/common/src/alias.rs @@ -37,6 +37,16 @@ impl AliasGenerator { Self::default() } + /// Advance the counter to at least `min_id`, ensuring future aliases + /// won't collide with already-existing ones. + /// + /// For example, if the query already contains an alias `alias_42`, then calling + /// `update_min_id(42)` will ensure that future aliases generated by this + /// [`AliasGenerator`] will start from `alias_43`. + pub fn update_min_id(&self, min_id: usize) { + self.next_id.fetch_max(min_id + 1, Ordering::Relaxed); + } + /// Return a unique alias with the provided prefix pub fn next(&self, prefix: &str) -> String { let id = self.next_id.fetch_add(1, Ordering::Relaxed); diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index f5f4982e38c65..e9624d8b4238d 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -127,10 +127,41 @@ impl OptimizerRule for ExtractLeafExpressions { return Ok(Transformed::no(plan)); } let alias_generator = config.alias_generator(); + + // Advance the alias generator past any user-provided __datafusion_extracted_N + // aliases to prevent collisions when generating new extraction aliases. + advance_generator_past_existing(&plan, alias_generator); + extract_from_plan(plan, alias_generator) } } +/// Scans the current plan node's expressions for pre-existing +/// `__datafusion_extracted_N` aliases and advances the generator +/// counter past them to avoid collisions with user-provided aliases. +fn advance_generator_past_existing( + plan: &LogicalPlan, + alias_generator: &AliasGenerator, +) { + for expr in plan.expressions() { + expr.apply(|e| { + if let Expr::Alias(alias) = e { + if let Some(id_str) = alias + .name + .strip_prefix(EXTRACTED_EXPR_PREFIX) + .and_then(|s| s.strip_prefix('_')) + { + if let Ok(id) = id_str.parse::() { + alias_generator.update_min_id(id); + } + } + } + Ok(TreeNodeRecursion::Continue) + }) + .ok(); + } +} + /// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. /// /// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes @@ -3019,4 +3050,45 @@ mod tests { Ok(()) } + + /// Regression test: user-provided `__datafusion_extracted_2` alias in a + /// Projection must not collide with optimizer-generated aliases. + /// See + #[test] + fn test_user_provided_extracted_alias_no_collision() -> Result<()> { + let table_scan = test_table_scan_with_struct()?; + + // Projection with a user-provided __datafusion_extracted_2 alias + let proj = LogicalPlanBuilder::from(table_scan) + .project(vec![ + leaf_udf(col("user"), "status") + .alias(format!("{EXTRACTED_EXPR_PREFIX}_2")), + col("id"), + ])? + .build()?; + + // Filter below will trigger extraction; the generator must skip past _2 + let plan = LogicalPlanBuilder::from(proj) + .filter( + leaf_udf(col("id"), "check").eq(lit(1)), + )? + .build()?; + + let ctx = OptimizerContext::new().with_max_passes(1); + let optimizer = Optimizer::with_rules(vec![ + Arc::new(OptimizeProjections::new()), + Arc::new(ExtractLeafExpressions::new()), + ]); + // Should not error with "duplicate unqualified field name" + let result = optimizer.optimize(plan, &ctx, |_, _| {})?; + + // Verify the generated alias is _3 or higher (not _2) + let plan_str = format!("{result}"); + assert!( + !plan_str.contains("__datafusion_extracted_2, __datafusion_extracted_2"), + "Found duplicate alias in plan:\n{plan_str}" + ); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index c25b80a0d7f20..0ed78a3d439c2 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1949,3 +1949,16 @@ ORDER BY simple_struct.id; 3 3 4 4 5 5 + +# ========================================================================= +# Regression: user-provided __datafusion_extracted aliases must not +# collide with optimizer-generated ones +# (https://github.com/apache/datafusion/issues/20430) +# ========================================================================= + +query I +SELECT s['value'] AS __datafusion_extracted_2 +FROM simple_struct +WHERE COALESCE(s['value'], s['value']) = 100; +---- +100 From 2064491325f01d0004abad200d68f22c3f0319f4 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:30:21 +0000 Subject: [PATCH 2/6] fix slt --- .../test_files/projection_pushdown.slt | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 0ed78a3d439c2..c2e1f84fb8262 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1956,9 +1956,20 @@ ORDER BY simple_struct.id; # (https://github.com/apache/datafusion/issues/20430) # ========================================================================= +statement ok +COPY ( select {f1: 1, f2: 2} as s +) TO 'test_files/scratch/projection_pushdown/test.parquet' +STORED AS PARQUET; + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/test.parquet'; + query I -SELECT s['value'] AS __datafusion_extracted_2 -FROM simple_struct -WHERE COALESCE(s['value'], s['value']) = 100; +SELECT + get_field(s, 'f1') AS __datafusion_extracted_2 +FROM t +WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1; ---- -100 +1 From 7cb711389ad068e9838870bdfa9860e952f743ef Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:38:48 +0000 Subject: [PATCH 3/6] remove unit test that doesn't reproduce the bug without get_field Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index e9624d8b4238d..3d204b4b72ca4 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -3051,44 +3051,4 @@ mod tests { Ok(()) } - /// Regression test: user-provided `__datafusion_extracted_2` alias in a - /// Projection must not collide with optimizer-generated aliases. - /// See - #[test] - fn test_user_provided_extracted_alias_no_collision() -> Result<()> { - let table_scan = test_table_scan_with_struct()?; - - // Projection with a user-provided __datafusion_extracted_2 alias - let proj = LogicalPlanBuilder::from(table_scan) - .project(vec![ - leaf_udf(col("user"), "status") - .alias(format!("{EXTRACTED_EXPR_PREFIX}_2")), - col("id"), - ])? - .build()?; - - // Filter below will trigger extraction; the generator must skip past _2 - let plan = LogicalPlanBuilder::from(proj) - .filter( - leaf_udf(col("id"), "check").eq(lit(1)), - )? - .build()?; - - let ctx = OptimizerContext::new().with_max_passes(1); - let optimizer = Optimizer::with_rules(vec![ - Arc::new(OptimizeProjections::new()), - Arc::new(ExtractLeafExpressions::new()), - ]); - // Should not error with "duplicate unqualified field name" - let result = optimizer.optimize(plan, &ctx, |_, _| {})?; - - // Verify the generated alias is _3 or higher (not _2) - let plan_str = format!("{result}"); - assert!( - !plan_str.contains("__datafusion_extracted_2, __datafusion_extracted_2"), - "Found duplicate alias in plan:\n{plan_str}" - ); - - Ok(()) - } } From 3de4dfd439a1495e1dc52f75000e664d9a8949bc Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:19:37 +0000 Subject: [PATCH 4/6] fix: traverse subqueries in extract_leaf_expressions and fix clippy Traverse into subqueries when extracting leaf expressions and when advancing the alias generator past existing extracted aliases. Also collapse nested if-let to satisfy clippy::collapsible_if. Co-Authored-By: Claude Opus 4.6 --- datafusion/common/src/alias.rs | 2 +- .../optimizer/src/extract_leaf_expressions.rs | 45 +++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs index 947a9627490bf..99f6447a6acd8 100644 --- a/datafusion/common/src/alias.rs +++ b/datafusion/common/src/alias.rs @@ -39,7 +39,7 @@ impl AliasGenerator { /// Advance the counter to at least `min_id`, ensuring future aliases /// won't collide with already-existing ones. - /// + /// /// For example, if the query already contains an alias `alias_42`, then calling /// `update_min_id(42)` will ensure that future aliases generated by this /// [`AliasGenerator`] will start from `alias_43`. diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 3d204b4b72ca4..5c2b709928bd1 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -114,10 +114,6 @@ impl OptimizerRule for ExtractLeafExpressions { "extract_leaf_expressions" } - fn apply_order(&self) -> Option { - Some(ApplyOrder::TopDown) - } - fn rewrite( &self, plan: LogicalPlan, @@ -130,9 +126,11 @@ impl OptimizerRule for ExtractLeafExpressions { // Advance the alias generator past any user-provided __datafusion_extracted_N // aliases to prevent collisions when generating new extraction aliases. - advance_generator_past_existing(&plan, alias_generator); + advance_generator_past_existing(&plan, alias_generator)?; - extract_from_plan(plan, alias_generator) + plan.transform_down_with_subqueries(|plan| { + extract_from_plan(plan, alias_generator) + }) } } @@ -142,24 +140,26 @@ impl OptimizerRule for ExtractLeafExpressions { fn advance_generator_past_existing( plan: &LogicalPlan, alias_generator: &AliasGenerator, -) { - for expr in plan.expressions() { - expr.apply(|e| { - if let Expr::Alias(alias) = e { - if let Some(id_str) = alias - .name - .strip_prefix(EXTRACTED_EXPR_PREFIX) - .and_then(|s| s.strip_prefix('_')) +) -> Result<()> { + plan.apply(|plan| { + plan.expressions().iter().try_for_each(|expr| { + expr.apply(|e| { + if let Expr::Alias(alias) = e + && let Some(id_str) = alias + .name + .strip_prefix(EXTRACTED_EXPR_PREFIX) + .and_then(|s| s.strip_prefix('_')) + && let Ok(id) = id_str.parse::() { - if let Ok(id) = id_str.parse::() { - alias_generator.update_min_id(id); - } + alias_generator.update_min_id(id); } - } - Ok(TreeNodeRecursion::Continue) - }) - .ok(); - } + Ok(TreeNodeRecursion::Continue) + })?; + Ok::<(), datafusion_common::error::DataFusionError>(()) + })?; + Ok(TreeNodeRecursion::Continue) + }) + .map(|_| ()) } /// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. @@ -3050,5 +3050,4 @@ mod tests { Ok(()) } - } From c6774b67db8520b6678e458215ea99a15cfb1949 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:51:43 +0000 Subject: [PATCH 5/6] refactor: chain parse into and_then for cleaner option handling Co-Authored-By: Claude Opus 4.6 --- datafusion/optimizer/src/extract_leaf_expressions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 5c2b709928bd1..922ea7933781e 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -145,11 +145,11 @@ fn advance_generator_past_existing( plan.expressions().iter().try_for_each(|expr| { expr.apply(|e| { if let Expr::Alias(alias) = e - && let Some(id_str) = alias + && let Some(id) = alias .name .strip_prefix(EXTRACTED_EXPR_PREFIX) .and_then(|s| s.strip_prefix('_')) - && let Ok(id) = id_str.parse::() + .and_then(|s| s.parse().ok()) { alias_generator.update_min_id(id); } From 63789582aa2c119806424211d14d122aea5143f3 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:37:45 +0000 Subject: [PATCH 6/6] test: add EXPLAIN assertion for __datafusion_extracted alias allocation Adds an EXPLAIN query to verify the user-provided __datafusion_extracted_2 alias is preserved while optimizer-generated aliases skip to _3 and _4, guarding against silent regressions in alias allocation. Co-Authored-By: Claude Opus 4.6 --- .../test_files/projection_pushdown.slt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index c2e1f84fb8262..dbb77b33c21b7 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1966,6 +1966,23 @@ CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'test_files/scratch/projection_pushdown/test.parquet'; +# Verify that the user-provided __datafusion_extracted_2 alias is preserved +# and the optimizer skips to _3 and _4 for its generated aliases. +query TT +EXPLAIN SELECT + get_field(s, 'f1') AS __datafusion_extracted_2 +FROM t +WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1; +---- +logical_plan +01)Projection: __datafusion_extracted_2 +02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN __datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1) +03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, Utf8("f1")) AS __datafusion_extracted_2 +04)------TableScan: t projection=[s], partial_filters=[CASE WHEN get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE get_field(t.s, Utf8("f2")) END = Int64(1)] +physical_plan +01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN __datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, projection=[__datafusion_extracted_2@2] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]}, projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], file_type=parquet + query I SELECT get_field(s, 'f1') AS __datafusion_extracted_2