From 61aca57a2733d85e57a07330829c6e70ac9527a6 Mon Sep 17 00:00:00 2001 From: shrirangmhalgi Date: Fri, 15 May 2026 00:45:32 -0700 Subject: [PATCH] [SPARK-48091][SQL] Preserve aliases inside lambda when ExtractGenerator restructures plan ExtractGenerator called trimNonTopLevelAliases on all project list items before extracting the generator. This stripped aliases inside lambda functions (e.g., struct(x.as("data"))) before they could be resolved into struct field names by CreateStruct. ExtractGenerator now applies trimNonTopLevelAliases only while pattern matching to detect generators, and keeps the original untrimmed expression for non-generator project items. --- .../sql/catalyst/analysis/Analyzer.scala | 10 +++--- .../spark/sql/GeneratorFunctionSuite.scala | 31 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f31354179674e..c7fe055ec2f6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -3260,9 +3260,11 @@ class Analyzer( // The star will be expanded differently if we insert `Generate` under `Project` too early. case p @ Project(projectList, child) if !projectList.exists(_.exists(_.isInstanceOf[Star])) => val (resolvedGenerator, newProjectList) = projectList - .map(trimNonTopLevelAliases) .foldLeft((None: Option[Generate], Nil: Seq[NamedExpression])) { (res, e) => - e match { + // SPARK-48091: Only trim aliases on the generator expression itself. Trimming + // non-generator expressions strips aliases inside lambda functions (e.g., + // struct(x.as("data"))) before they can be resolved into struct field names. 
+ trimNonTopLevelAliases(e) match { // If there are more than one generator, we only rewrite the first one and wait for // the next analyzer iteration to rewrite the next one. case AliasedGenerator(generator, names, outer) if res._1.isEmpty && @@ -3275,8 +3277,8 @@ class Analyzer( generatorOutput = GeneratorResolution.makeGeneratorOutput(generator, names), child) (Some(g), res._2 ++ g.nullableOutput) - case other => - (res._1, res._2 :+ other) + case _ => + (res._1, res._2 :+ e) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala index 015ea9defae94..8469397247d98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala @@ -765,6 +765,37 @@ class GeneratorFunctionSuite extends SharedSparkSession { Seq(Row(0, 10, 0, 10), Row(1, 20, 1, 20)) ) } + + test("SPARK-48091: explode with transform should preserve struct field aliases") { + val df = spark.createDataFrame(Seq((1, Array(1, 2, 3), Array(4, 5, 6)))) + .toDF("id", "my_array", "my_array2") + + // Without explode - aliases should work (baseline) + val good = df.select( + transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct") + ) + assert(good.schema("my_struct").dataType.asInstanceOf[types.ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data")) + + // With explode in same select - aliases should still be preserved + val result = df.select( + explode(col("my_array")).as("exploded"), + transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct") + ) + assert(result.schema("my_struct").dataType.asInstanceOf[types.ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data")) + + // Multiple aliases inside struct + val result2 = df.select( + explode(col("my_array")).as("exploded"), + transform(col("my_array2"), + x => 
struct(x.as("value"), col("id").as("key")) + ).as("my_struct") + ) + val fields2 = result2.schema("my_struct").dataType.asInstanceOf[types.ArrayType] + .elementType.asInstanceOf[StructType].fieldNames.toSeq + assert(fields2 === Seq("value", "key")) + } } case class EmptyGenerator() extends Generator with LeafLike[Expression] {