From 6ddda9848b6bc1a21384fae433321ed394dc14f3 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 14 May 2026 13:29:02 -0700 Subject: [PATCH 1/5] feat: parse_url initial implementation --- .../spark_expressions_support.md | 2 +- .../latest/compatibility/expressions/index.md | 1 + .../latest/compatibility/expressions/url.md | 23 +++ native/core/src/execution/jni_api.rs | 4 + .../scala/org/apache/comet/GenerateDocs.scala | 10 ++ .../apache/comet/serde/QueryPlanSerde.scala | 5 +- .../scala/org/apache/comet/serde/url.scala | 45 ++++++ .../apache/comet/shims/CometExprShim.scala | 3 + .../apache/comet/shims/CometExprShim.scala | 3 + .../sql-tests/expressions/url/parse_url.sql | 37 +++++ .../expressions/url/parse_url_ansi.sql | 43 ++++++ .../expressions/url/parse_url_enabled.sql | 140 ++++++++++++++++++ 12 files changed, 314 insertions(+), 2 deletions(-) create mode 100644 docs/source/user-guide/latest/compatibility/expressions/url.md create mode 100644 spark/src/main/scala/org/apache/comet/serde/url.scala create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 0efc581d55..dc75554f7b 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -593,7 +593,7 @@ ### url_funcs -- [ ] parse_url +- [x] parse_url (Incompatible: native diverges from Spark on edge cases) - [x] try_url_decode - 4.0.1, 2026-05-05 - [x] url_decode diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index b86e46cc0e..fcaa3bfbbc 100644 --- 
a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -36,5 +36,6 @@ math misc string struct +url cast ``` diff --git a/docs/source/user-guide/latest/compatibility/expressions/url.md b/docs/source/user-guide/latest/compatibility/expressions/url.md new file mode 100644 index 0000000000..765e12941f --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/url.md @@ -0,0 +1,23 @@ + + +# URL Expressions + + + diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 6eeee28358..7093768eee 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -65,6 +65,8 @@ use datafusion_spark::function::string::char::CharFunc; use datafusion_spark::function::string::concat::SparkConcat; use datafusion_spark::function::string::luhn_check::SparkLuhnCheck; use datafusion_spark::function::string::space::SparkSpace; +use datafusion_spark::function::url::parse_url::ParseUrl as SparkParseUrl; +use datafusion_spark::function::url::try_parse_url::TryParseUrl as SparkTryParseUrl; use datafusion_spark::function::url::try_url_decode::TryUrlDecode as SparkTryUrlDecode; use datafusion_spark::function::url::url_decode::UrlDecode as SparkUrlDecode; use datafusion_spark::function::url::url_encode::UrlEncode as SparkUrlEncode; @@ -597,6 +599,8 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryUrlDecode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCsc::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkParseUrl::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryParseUrl::default())); } /// Prepares arrow arrays for output. 
diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 870fb5e47d..43f0016d79 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -129,6 +129,16 @@ object GenerateDocs { serde.getCompatibleNotes(), serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + })), + "url" -> (( + "compatibility/expressions/url.md", + () => + QueryPlanSerde.urlExpressions.toSeq.map { case (cls, serde) => + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }))) def main(args: Array[String]): Unit = { diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index d85a2c30cb..fa43109210 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -246,6 +246,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[WeekOfYear] -> CometWeekOfYear, classOf[Quarter] -> CometQuarter) + private[comet] val urlExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[ParseUrl] -> CometParseUrl) + private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) @@ -273,7 +276,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { mathExpressions ++ hashExpressions ++ stringExpressions ++ conditionalExpressions ++ mapExpressions ++ predicateExpressions ++ structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++ - temporalExpressions ++ conversionExpressions + temporalExpressions ++ conversionExpressions ++ urlExpressions /** * Mapping of Spark aggregate expression class to Comet expression handler. 
diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala new file mode 100644 index 0000000000..abc636bce5 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde + +import org.apache.spark.sql.catalyst.expressions.{Attribute, ParseUrl} + +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} + +// On Spark 4.x ParseUrl is RuntimeReplaceable and handled via CometExprShim (ParseUrlEvaluator). 
+object CometParseUrl extends CometExpressionSerde[ParseUrl] { + + override def getIncompatibleReasons(): Seq[String] = Seq( + "Native parse_url diverges from Spark on several edge cases " + + "(https://github.com/apache/datafusion/issues/21943)") + + override def getSupportLevel(expr: ParseUrl): SupportLevel = + Incompatible(Some(getIncompatibleReasons().head)) + + override def convert( + expr: ParseUrl, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val funcName = if (expr.failOnError) "parse_url" else "try_parse_url" + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto(funcName, childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 3d5b34bfd2..d57f3b2966 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} @@ -143,6 +144,8 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + exprToProtoInternal(ParseUrl(args, evaluator.failOnError), inputs, binding) case _ => None } diff 
--git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 5e906a0d83..a053cca512 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} @@ -142,6 +143,8 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + exprToProtoInternal(ParseUrl(args, evaluator.failOnError), inputs, binding) case _ => None } diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql new file mode 100644 index 0000000000..b6882ec19b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +statement +CREATE TABLE test_parse_url(url string) USING parquet + +statement +INSERT INTO test_parse_url VALUES + ('http://spark.apache.org/path?query=1'), + ('https://user:pass@host:8080/path?k=v#ref'), + (NULL) + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'HOST') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'PATH') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'QUERY') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'QUERY', 'k') FROM test_parse_url diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql new file mode 100644 index 0000000000..b3ea436c63 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql @@ -0,0 +1,43 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Test parse_url() in ANSI mode (failOnError=true -> native "parse_url" path) +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +-- valid URLs should work identically in ANSI mode +query +SELECT parse_url('http://example.com/path?foo=bar', 'HOST') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'PATH') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY', 'foo') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'AUTHORITY') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'USERINFO') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'REF') + +-- NULL inputs still return NULL in ANSI mode +query +SELECT parse_url(NULL, 'HOST') diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql new file mode 100644 index 0000000000..ae43a34429 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql @@ -0,0 +1,140 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Test parse_url() with allowIncompatible enabled (native execution) +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +statement +CREATE TABLE test_parse_url_enabled(url string) USING parquet + +statement +INSERT INTO test_parse_url_enabled VALUES + ('http://spark.apache.org/path?query=1'), + ('https://user:pass@host:8080/path?k=v#ref'), + ('http://example.com/path?a=1&b=2&a=3'), + ('ftp://ftp.example.com/dir/file.txt'), + (NULL) + +-- HOST +query +SELECT parse_url(url, 'HOST') FROM test_parse_url_enabled + +-- PATH +query +SELECT parse_url(url, 'PATH') FROM test_parse_url_enabled + +-- QUERY (no key) +query +SELECT parse_url(url, 'QUERY') FROM test_parse_url_enabled + +-- QUERY with key +query +SELECT parse_url(url, 'QUERY', 'k') FROM test_parse_url_enabled + +-- PROTOCOL +query +SELECT parse_url(url, 'PROTOCOL') FROM test_parse_url_enabled + +-- REF (fragment) +query +SELECT parse_url(url, 'REF') FROM test_parse_url_enabled + +-- AUTHORITY +query +SELECT parse_url(url, 'AUTHORITY') FROM test_parse_url_enabled + +-- USERINFO +query +SELECT parse_url(url, 'USERINFO') FROM test_parse_url_enabled + +-- FILE +query +SELECT parse_url(url, 'FILE') FROM test_parse_url_enabled + +-- literal arguments +query +SELECT parse_url('http://example.com/path?foo=bar', 'HOST') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'PATH') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY', 'foo') + +query +SELECT 
parse_url('http://example.com/path?foo=bar', 'PROTOCOL') + +-- NULL handling +query +SELECT parse_url(NULL, 'HOST') + +query +SELECT parse_url('http://example.com', NULL) + +-- invalid part key +query +SELECT parse_url('http://example.com', 'INVALID') + +-- malformed URL returns NULL in non-ANSI mode (#7) +query +SELECT parse_url('not a url at all', 'HOST') + +query +SELECT parse_url('://missing-scheme', 'HOST') + +query +SELECT parse_url('', 'HOST') + +-- column-valued part key (#5) +statement +CREATE TABLE test_parse_url_parts(url string, part string, key string) USING parquet + +statement +INSERT INTO test_parse_url_parts VALUES + ('http://example.com/path?foo=bar', 'HOST', NULL), + ('http://example.com/path?foo=bar', 'PATH', NULL), + ('http://example.com/path?foo=bar', 'QUERY', 'foo'), + ('https://user:pw@host:9090/p?a=1#frag', 'REF', NULL), + ('https://user:pw@host:9090/p?a=1#frag', 'USERINFO', NULL) + +query +SELECT parse_url(url, part) FROM test_parse_url_parts + +query +SELECT parse_url(url, 'QUERY', key) FROM test_parse_url_parts WHERE key IS NOT NULL + +-- edge cases for known divergences (#6) +query +SELECT parse_url('http://example.com//double//slashes', 'PATH') + +query +SELECT parse_url('http://example.com/path?key=value%20encoded', 'QUERY', 'key') + +query +SELECT parse_url('http://example.com/path?', 'QUERY') + +query +SELECT parse_url('http://example.com#frag', 'FILE') + +query +SELECT parse_url('http://[::1]:8080/path', 'HOST') + +query +SELECT parse_url('http://example.com/path?a=1&a=2', 'QUERY', 'a') From 1c401a4086e219e490ec3f8d399b8ae5dcab3054 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 14 May 2026 17:08:13 -0700 Subject: [PATCH 2/5] fix: propagate parse_url fallback reasons to the Invoke node --- .../org/apache/comet/shims/CometExprShim.scala | 11 +++++++++-- .../org/apache/comet/shims/CometExprShim.scala | 11 +++++++++-- .../sql-tests/expressions/url/parse_url_enabled.sql | 4 ++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git
a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index d57f3b2966..dab95b315a 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -145,7 +145,14 @@ trait CometExprShim extends CommonStringExprs { inputs, binding) case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => - exprToProtoInternal(ParseUrl(args, evaluator.failOnError), inputs, binding) + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case _ => None } diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index a053cca512..1c60298194 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} -import 
org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -144,7 +144,14 @@ trait CometExprShim extends CommonStringExprs { inputs, binding) case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => - exprToProtoInternal(ParseUrl(args, evaluator.failOnError), inputs, binding) + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case _ => None } diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql index ae43a34429..1bde241b4d 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql @@ -124,13 +124,13 @@ SELECT parse_url(url, 'QUERY', key) FROM test_parse_url_parts WHERE key IS NOT N query SELECT parse_url('http://example.com//double//slashes', 'PATH') -query +query ignore(known divergence: native decodes percent-encoding in QUERY values) SELECT parse_url('http://example.com/path?key=value%20encoded', 'QUERY', 'key') query SELECT parse_url('http://example.com/path?', 'QUERY') -query +query ignore(known divergence: native returns "/" for FILE when URL has no path) SELECT parse_url('http://example.com#frag', 'FILE') query From b76152f16f1902d18b1ac6348f2f7177d92c893e Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Fri, 15 May 2026 13:44:30 -0700 Subject: [PATCH 3/5] fix: share parse_url incompatible reason, generalize Invoke comment --- spark/src/main/scala/org/apache/comet/serde/url.scala | 8 +++++---
.../spark-4.0/org/apache/comet/shims/CometExprShim.scala | 6 +++--- .../spark-4.1/org/apache/comet/shims/CometExprShim.scala | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala index abc636bce5..fa5742860c 100644 --- a/spark/src/main/scala/org/apache/comet/serde/url.scala +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -26,12 +26,14 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn // On Spark 4.x ParseUrl is RuntimeReplaceable and handled via CometExprShim (ParseUrlEvaluator). object CometParseUrl extends CometExpressionSerde[ParseUrl] { - override def getIncompatibleReasons(): Seq[String] = Seq( + private val incompatibleReason = "Native parse_url diverges from Spark on several edge cases " + - "(https://github.com/apache/datafusion/issues/21943)") + "(https://github.com/apache/datafusion/issues/21943)" + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) override def getSupportLevel(expr: ParseUrl): SupportLevel = - Incompatible(Some(getIncompatibleReasons().head)) + Incompatible(Some(incompatibleReason)) override def convert( expr: ParseUrl, diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index dab95b315a..c159bde0de 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -134,9 +134,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). 
Reconstruct the - // original StructsToJson and recurse so support-level checks apply. + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 1c60298194..c48b420670 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -133,9 +133,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). Reconstruct the - // original StructsToJson and recurse so support-level checks apply. + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. 
case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => From 7b47e29f4700438a69608f78bfb99f4e1c357c94 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 18 May 2026 16:15:35 -0700 Subject: [PATCH 4/5] feat: handle parse_url Invoke in the Spark 4.2 shim --- .../org/apache/comet/shims/CometExprShim.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 5e906a0d83..c48b420670 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -23,11 +23,12 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -132,9 +133,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...).
Reconstruct the - // original StructsToJson and recurse so support-level checks apply. + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => @@ -142,6 +143,15 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case _ => None } From bf7af4550197f47ba34121730225fe9c486bba67 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 19 May 2026 17:04:25 -0700 Subject: [PATCH 5/5] test: add ANSI-mode malformed-URL cases for parse_url --- .../sql-tests/expressions/url/parse_url_ansi.sql | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql index b3ea436c63..1e71031f41 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql @@ -41,3 +41,13 @@ SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'REF') -- NULL inputs still return NULL in ANSI mode query SELECT parse_url(NULL, 'HOST') + +-- invalid URL throws in ANSI mode (native returns NULL instead of throwing) +query ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('not a url at all', 'HOST') + +query
ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('://missing-scheme', 'HOST') + +query ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('', 'HOST')