diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 24422b7a64..702fb21a01 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -596,7 +596,7 @@ ### url_funcs -- [ ] parse_url +- [x] parse_url (Incompatible: native diverges from Spark on edge cases) - [x] try_url_decode - 4.0.1, 2026-05-05 - [x] url_decode diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index b86e46cc0e..fcaa3bfbbc 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -36,5 +36,6 @@ math misc string struct +url cast ``` diff --git a/docs/source/user-guide/latest/compatibility/expressions/url.md b/docs/source/user-guide/latest/compatibility/expressions/url.md new file mode 100644 index 0000000000..765e12941f --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/url.md @@ -0,0 +1,23 @@ + + +# URL Expressions + + + diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index b1f2ccf02a..e12548f035 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -66,6 +66,8 @@ use datafusion_spark::function::string::char::CharFunc; use datafusion_spark::function::string::concat::SparkConcat; use datafusion_spark::function::string::luhn_check::SparkLuhnCheck; use datafusion_spark::function::string::space::SparkSpace; +use datafusion_spark::function::url::parse_url::ParseUrl as SparkParseUrl; +use datafusion_spark::function::url::try_parse_url::TryParseUrl as SparkTryParseUrl; use datafusion_spark::function::url::try_url_decode::TryUrlDecode as SparkTryUrlDecode; use datafusion_spark::function::url::url_decode::UrlDecode as SparkUrlDecode; use datafusion_spark::function::url::url_encode::UrlEncode as SparkUrlEncode; @@ -598,6 +600,8 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryUrlDecode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCsc::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkParseUrl::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryParseUrl::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFactorial::default())); } diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 870fb5e47d..43f0016d79 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -129,6 +129,16 @@ object GenerateDocs { serde.getCompatibleNotes(), serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + })), + "url" -> (( + "compatibility/expressions/url.md", + () => + QueryPlanSerde.urlExpressions.toSeq.map { case (cls, serde) => + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }))) def main(args: Array[String]): Unit = { diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 5ecf14db3e..bb61753e6f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -248,6 +248,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[WeekOfYear] -> CometWeekOfYear, classOf[Quarter] -> CometQuarter) + private[comet] val urlExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[ParseUrl] -> CometParseUrl) + private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) @@ -275,7 +278,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { mathExpressions ++ hashExpressions ++ stringExpressions ++ conditionalExpressions ++ mapExpressions ++ predicateExpressions ++ structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++ - temporalExpressions ++ conversionExpressions + temporalExpressions ++ conversionExpressions ++ urlExpressions /** * Mapping of Spark aggregate expression class to Comet expression handler. diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala new file mode 100644 index 0000000000..fa5742860c --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde + +import org.apache.spark.sql.catalyst.expressions.{Attribute, ParseUrl} + +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} + +// On Spark 4.x ParseUrl is RuntimeReplaceable and handled via CometExprShim (ParseUrlEvaluator). +object CometParseUrl extends CometExpressionSerde[ParseUrl] { + + private val incompatibleReason = + "Native parse_url diverges from Spark on several edge cases " + + "(https://github.com/apache/datafusion/issues/21943)" + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) + + override def getSupportLevel(expr: ParseUrl): SupportLevel = + Incompatible(Some(incompatibleReason)) + + override def convert( + expr: ParseUrl, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val funcName = if (expr.failOnError) "parse_url" else "try_parse_url" + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto(funcName, childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 3d5b34bfd2..c159bde0de 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -23,11 +23,12 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType} -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -133,9 +134,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). Reconstruct the - // original StructsToJson and recurse so support-level checks apply. + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => @@ -143,6 +144,15 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case _ => None } diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 88e6f27f9b..9d6092aa5c 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -23,12 +23,13 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType, TimeType} -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -143,10 +144,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). Reconstruct the - // original StructsToJson and recurse so support-level checks apply. - // ToTime (Spark 4.1) resolves to Invoke(Literal(ToTimeParser), "parse", TimeType(), ...). + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => @@ -154,6 +154,15 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case (Literal(parser: ToTimeParser, _), "parse", args) if i.dataType.isInstanceOf[TimeType] && parser.fmt.isEmpty && args.size == 1 => val childExprs = args.map(exprToProtoInternal(_, inputs, binding)) diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 88e6f27f9b..9d6092aa5c 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -23,12 +23,13 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Sum import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType, TimeType} -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExplainInfo} import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} @@ -143,10 +144,9 @@ trait CometExprShim extends CommonStringExprs { val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*) optExprWithInfo(optExpr, wb, wb.children: _*) - // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is - // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). Reconstruct the - // original StructsToJson and recurse so support-level checks apply. - // ToTime (Spark 4.1) resolves to Invoke(Literal(ToTimeParser), "parse", TimeType(), ...). + // In Spark 4.x, RuntimeReplaceable expressions (StructsToJson, ParseUrl) become + // Invoke(Literal(Evaluator), "evaluate", ...). Reconstruct the original expression + // and recurse so support-level checks apply. case i: Invoke => (i.targetObject, i.functionName, i.arguments) match { case (Literal(evaluator: StructsToJsonEvaluator, _), "evaluate", Seq(child)) => @@ -154,6 +154,15 @@ trait CometExprShim extends CommonStringExprs { StructsToJson(evaluator.options, child, evaluator.timeZoneId), inputs, binding) + case (Literal(evaluator: ParseUrlEvaluator, _), "evaluate", args) => + val parseUrl = ParseUrl(args, evaluator.failOnError) + val result = exprToProtoInternal(parseUrl, inputs, binding) + if (result.isEmpty) { + parseUrl + .getTagValue(CometExplainInfo.EXTENSION_INFO) + .foreach(reasons => i.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons)) + } + result case (Literal(parser: ToTimeParser, _), "parse", args) if i.dataType.isInstanceOf[TimeType] && parser.fmt.isEmpty && args.size == 1 => val childExprs = args.map(exprToProtoInternal(_, inputs, binding)) diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql new file mode 100644 index 0000000000..b6882ec19b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +statement +CREATE TABLE test_parse_url(url string) USING parquet + +statement +INSERT INTO test_parse_url VALUES + ('http://spark.apache.org/path?query=1'), + ('https://user:pass@host:8080/path?k=v#ref'), + (NULL) + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'HOST') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'PATH') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'QUERY') FROM test_parse_url + +query expect_fallback(not fully compatible with Spark) +SELECT parse_url(url, 'QUERY', 'k') FROM test_parse_url diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql new file mode 100644 index 0000000000..1e71031f41 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_ansi.sql @@ -0,0 +1,53 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Test parse_url() in ANSI mode (failOnError=true -> native "parse_url" path) +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +-- valid URLs should work identically in ANSI mode +query +SELECT parse_url('http://example.com/path?foo=bar', 'HOST') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'PATH') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY', 'foo') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'AUTHORITY') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'USERINFO') + +query +SELECT parse_url('https://user:pass@host:8080/p?k=v#ref', 'REF') + +-- NULL inputs still return NULL in ANSI mode +query +SELECT parse_url(NULL, 'HOST') + +-- invalid URL throws in ANSI mode (native returns NULL instead of throwing) +query ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('not a url at all', 'HOST') + +query ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('://missing-scheme', 'HOST') + +query ignore(known divergence: native parse_url does not throw INVALID_URL for malformed URLs) +SELECT parse_url('', 'HOST') diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql new file mode 100644 index 0000000000..1bde241b4d --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_enabled.sql @@ -0,0 +1,140 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Test parse_url() with allowIncompatible enabled (native execution) +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +statement +CREATE TABLE test_parse_url_enabled(url string) USING parquet + +statement +INSERT INTO test_parse_url_enabled VALUES + ('http://spark.apache.org/path?query=1'), + ('https://user:pass@host:8080/path?k=v#ref'), + ('http://example.com/path?a=1&b=2&a=3'), + ('ftp://ftp.example.com/dir/file.txt'), + (NULL) + +-- HOST +query +SELECT parse_url(url, 'HOST') FROM test_parse_url_enabled + +-- PATH +query +SELECT parse_url(url, 'PATH') FROM test_parse_url_enabled + +-- QUERY (no key) +query +SELECT parse_url(url, 'QUERY') FROM test_parse_url_enabled + +-- QUERY with key +query +SELECT parse_url(url, 'QUERY', 'k') FROM test_parse_url_enabled + +-- PROTOCOL +query +SELECT parse_url(url, 'PROTOCOL') FROM test_parse_url_enabled + +-- REF (fragment) +query +SELECT parse_url(url, 'REF') FROM test_parse_url_enabled + +-- AUTHORITY +query +SELECT parse_url(url, 'AUTHORITY') FROM test_parse_url_enabled + +-- USERINFO +query +SELECT parse_url(url, 'USERINFO') FROM test_parse_url_enabled + +-- FILE +query +SELECT parse_url(url, 'FILE') FROM test_parse_url_enabled + +-- literal arguments +query +SELECT parse_url('http://example.com/path?foo=bar', 'HOST') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'PATH') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'QUERY', 'foo') + +query +SELECT parse_url('http://example.com/path?foo=bar', 'PROTOCOL') + +-- NULL handling +query +SELECT parse_url(NULL, 'HOST') + +query +SELECT parse_url('http://example.com', NULL) + +-- invalid part key +query +SELECT parse_url('http://example.com', 'INVALID') + +-- malformed URL returns NULL in non-ANSI mode (#7) +query +SELECT parse_url('not a url at all', 'HOST') + +query +SELECT parse_url('://missing-scheme', 'HOST') + +query +SELECT parse_url('', 'HOST') + +-- column-valued part key (#5) +statement +CREATE TABLE test_parse_url_parts(url string, part string, key string) USING parquet + +statement +INSERT INTO test_parse_url_parts VALUES + ('http://example.com/path?foo=bar', 'HOST', NULL), + ('http://example.com/path?foo=bar', 'PATH', NULL), + ('http://example.com/path?foo=bar', 'QUERY', 'foo'), + ('https://user:pw@host:9090/p?a=1#frag', 'REF', NULL), + ('https://user:pw@host:9090/p?a=1#frag', 'USERINFO', NULL) + +query +SELECT parse_url(url, part) FROM test_parse_url_parts + +query +SELECT parse_url(url, 'QUERY', key) FROM test_parse_url_parts WHERE key IS NOT NULL + +-- edge cases for known divergences (#6) +query +SELECT parse_url('http://example.com//double//slashes', 'PATH') + +query ignore(known divergence: native decodes percent-encoding in QUERY values) +SELECT parse_url('http://example.com/path?key=value%20encoded', 'QUERY', 'key') + +query +SELECT parse_url('http://example.com/path?', 'QUERY') + +query ignore(known divergence: native returns "/" for FILE when URL has no path) +SELECT parse_url('http://example.com#frag', 'FILE') + +query +SELECT parse_url('http://[::1]:8080/path', 'HOST') + +query +SELECT parse_url('http://example.com/path?a=1&a=2', 'QUERY', 'a')