From 0aa77dcf0d51f427d353d4f733a013a0a7b2bebe Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 15 May 2026 15:26:24 -0700 Subject: [PATCH 1/2] feat: wire `factorial` and update wire skill --- .../skills/wire-datafusion-function/SKILL.md | 38 +++++++++-- .../spark_expressions_support.md | 5 +- docs/source/user-guide/latest/expressions.md | 1 + native/core/src/execution/jni_api.rs | 2 + .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../sql-tests/expressions/math/factorial.sql | 68 +++++++++++++++++++ 6 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/math/factorial.sql diff --git a/.claude/skills/wire-datafusion-function/SKILL.md b/.claude/skills/wire-datafusion-function/SKILL.md index b0d9cabdc6..ca53ea6df9 100644 --- a/.claude/skills/wire-datafusion-function/SKILL.md +++ b/.claude/skills/wire-datafusion-function/SKILL.md @@ -72,24 +72,50 @@ Note `inputTypes`, `dataType`, `eval` / `nullSafeEval`, ANSI mode branches, `req ### 2. Find the upstream function -Read the pinned `datafusion-spark` version from `native/Cargo.toml` rather than picking the first cached copy — `head -1` can land on an older release that lacks the function: +**Prefer a local DataFusion clone if one is available** — check `CLAUDE.md`, project memory, and the user's `~/dev` tree for an existing checkout (e.g. `~/dev/.../rust/datafusion`). The local clone is typically the latest source and avoids cargo-registry version mismatches. If no local clone is found, fall back to the cached registry crate. + +Read the pinned `datafusion-spark` version from `native/Cargo.toml` rather than picking the first cached copy — `head -1` can land on an older release that lacks the function. Use a portable regex (BSD `awk` on macOS does not support `\b`): ```bash REPO_ROOT=$(git rev-parse --show-toplevel) -DF_SPARK_VER=$(awk -F'"' '/^datafusion-spark\b/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") +DF_SPARK_VER=$(awk -F'"' '/^datafusion-spark[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") DF_SPARK=$(ls -d ~/.cargo/registry/src/*/datafusion-spark-${DF_SPARK_VER}/ 2>/dev/null | head -1) echo "Using datafusion-spark $DF_SPARK_VER at $DF_SPARK" -DF_FUNCS_VER=$(awk -F'"' '/^datafusion = / {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") +DF_FUNCS_VER=$(awk -F'"' '/^datafusion[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") DF_FUNCS=$(ls -d ~/.cargo/registry/src/*/datafusion-functions-${DF_FUNCS_VER}/ 2>/dev/null | head -1) -grep -rin "fn name" "$DF_SPARK/src/function/" 2>/dev/null | grep -i "$ARGUMENTS" -grep -rin "fn name" "$DF_FUNCS/src/" 2>/dev/null | grep -i "$ARGUMENTS" +# Sanity check — empty paths mean the awk match failed or the crate is not cached. +[ -z "$DF_SPARK" ] && echo "WARNING: datafusion-spark path empty — check Cargo.toml regex and run 'cargo fetch' from native/" +[ -z "$DF_FUNCS" ] && echo "WARNING: datafusion-functions path empty — check Cargo.toml regex and run 'cargo fetch' from native/" + +# Search for the candidate. Replace EXPR with the SQL function name. +EXPR='$ARGUMENTS' +grep -rin "fn name" "$DF_SPARK/src/function/" 2>/dev/null | grep -i "$EXPR" +grep -rin "fn name" "$DF_FUNCS/src/" 2>/dev/null | grep -i "$EXPR" ``` +If using a local DataFusion clone instead, point the grep at `/datafusion/spark/src/function/` and `/datafusion/functions/src/` respectively. + If the cached crate directory does not exist (fresh checkout), run `cargo fetch` from `native/` first, or fall back to the latest cached version and verify the function exists in the pinned version's git tag before relying on it. -Verify the candidate function's `Signature`, return type, and behavior matches Spark across all relevant edge cases (NULL, overflow, non-finite floats, decimal scale, locale, ANSI mode). If semantics diverge in a way the Scala serde can't bridge with preprocessing or restrictions → **stop and run `implement-comet-expression`** instead. +### 2a. Decision gate — confirm the source crate with the user + +Before wiring, classify the candidate: + +- **Found in `datafusion-spark`** → proceed without prompting. This crate is explicitly Spark-compatible. +- **Found only in `datafusion-functions` (pure DataFusion)** → **STOP and ask the user before proceeding**. Pure DataFusion functions follow standard SQL semantics and frequently diverge from Spark on edge cases (NULL vs error, negative inputs, overflow, return type, NaN handling). Even when the divergences look bridgeable with Pattern C preprocessing, the user should explicitly approve relying on a non-Spark-tuned implementation rather than requesting an upstream `datafusion-spark` port. + + Use `AskUserQuestion`. Surface what you found in both crates and the specific divergences you've already identified, then offer: + 1. Wire from `datafusion-functions` with bridging preprocessing (Pattern C). List the divergences and the masking/casting that would close them. + 2. Stop and switch to `implement-comet-expression` so the function can be added to `datafusion-spark` upstream and then wired with Pattern A or B. + 3. Skip — leave the expression unsupported for now. + + Do not proceed past this gate without an explicit answer. + +- **Found in neither** → stop and run `implement-comet-expression`. + +Verify the chosen function's `Signature`, return type, and behavior match Spark across all relevant edge cases (NULL, overflow, non-finite floats, decimal scale, locale, ANSI mode). If semantics diverge in a way the Scala serde can't bridge with preprocessing or restrictions → **stop and run `implement-comet-expression`** instead. For datafusion-spark candidates, also check whether the UDF is already pre-registered: see `register_datafusion_spark_function` in `native/core/src/execution/jni_api.rs`. If listed, you only need Pattern A. If missing, you need Pattern B. diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 22ab486cc1..9e28be1713 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -377,7 +377,10 @@ - [ ] e - [x] exp - [x] expm1 -- [ ] factorial +- [x] factorial + - 3.4.3 (audited 2026-05-15): identical to v3.5.8. + - 3.5.8 (audited 2026-05-15): canonical reference; `extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant`. Returns NULL for NULL input or values outside `[0, 20]`. + - 4.0.1 (audited 2026-05-15): `NullIntolerant` trait replaced by `nullIntolerant: Boolean` method override; behavior unchanged. - [x] floor - [x] greatest - [x] hex diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 818f379a41..245d839fb1 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -154,6 +154,7 @@ of expressions that be disabled. | Divide | `/` | | Exp | `exp` | | Expm1 | `expm1` | +| Factorial | `factorial` | | Floor | `floor` | | Hex | `hex` | | IntegralDivide | `div` | diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index e40c8a6355..4c0265793d 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -56,6 +56,7 @@ use datafusion_spark::function::hash::sha2::SparkSha2; use datafusion_spark::function::map::map_from_entries::MapFromEntries; use datafusion_spark::function::map::str_to_map::SparkStrToMap; use datafusion_spark::function::math::expm1::SparkExpm1; +use datafusion_spark::function::math::factorial::SparkFactorial; use datafusion_spark::function::math::hex::SparkHex; use datafusion_spark::function::math::trigonometry::SparkCsc; use datafusion_spark::function::math::width_bucket::SparkWidthBucket; @@ -593,6 +594,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryUrlDecode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCsc::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFactorial::default())); } /// Prepares arrow arrays for output. diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index f9acad1d89..3ef8b05413 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -108,6 +108,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Divide] -> CometDivide, classOf[Exp] -> CometScalarFunction("exp"), classOf[Expm1] -> CometScalarFunction("expm1"), + classOf[Factorial] -> CometScalarFunction("factorial"), classOf[Floor] -> CometFloor, classOf[Greatest] -> CometScalarFunction("greatest"), classOf[Hex] -> CometHex, diff --git a/spark/src/test/resources/sql-tests/expressions/math/factorial.sql b/spark/src/test/resources/sql-tests/expressions/math/factorial.sql new file mode 100644 index 0000000000..da0d5911d4 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/math/factorial.sql @@ -0,0 +1,68 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +statement +CREATE TABLE test_factorial( + b tinyint, + s smallint, + i int, + l bigint, + f float, + d double, + dec38 decimal(38, 0), + dec10_2 decimal(10, 2) +) USING parquet + +statement +INSERT INTO test_factorial VALUES + (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast(0.0 as float), 0.0, cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))), + (cast(1 as tinyint), cast(1 as smallint), 1, cast(1 as bigint), cast(1.0 as float), 1.0, cast(1 as decimal(38, 0)), cast(1.49 as decimal(10, 2))), + (cast(5 as tinyint), cast(5 as smallint), 5, cast(5 as bigint), cast(5.7 as float), 5.7, cast(5 as decimal(38, 0)), cast(5.50 as decimal(10, 2))), + (cast(20 as tinyint), cast(20 as smallint), 20, cast(20 as bigint), cast(20.0 as float), 20.0, cast(20 as decimal(38, 0)), cast(20.99 as decimal(10, 2))), + (cast(21 as tinyint), cast(21 as smallint), 21, cast(21 as bigint), cast(21.0 as float), 21.0, cast(21 as decimal(38, 0)), cast(21.00 as decimal(10, 2))), + (cast(-1 as tinyint), cast(-1 as smallint), -1, cast(-1 as bigint), cast(-1.0 as float), -1.0, cast(-1 as decimal(38, 0)), cast(-1.50 as decimal(10, 2))), + (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), + (cast(127 as tinyint), cast(32767 as smallint), 2147483647, 9223372036854775807, cast('Infinity' as float), cast('Infinity' as double), cast(99999999999999999999999999999999999999 as decimal(38, 0)), cast(99999999.99 as decimal(10, 2))), + (cast(-128 as tinyint),cast(-32768 as smallint),-2147483648,-9223372036854775808, cast('-Infinity' as float), cast('-Infinity' as double), cast(-99999999999999999999999999999999999999 as decimal(38, 0)),cast(-99999999.99 as decimal(10, 2))), + (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast('NaN' as float), cast('NaN' as double), cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))), + (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast(-0.0 as float), cast(-0.0 as double), cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))) + +query +SELECT b, factorial(b) FROM test_factorial + +query +SELECT s, factorial(s) FROM test_factorial + +query +SELECT i, factorial(i) FROM test_factorial + +query +SELECT l, factorial(l) FROM test_factorial + +query +SELECT f, factorial(f) FROM test_factorial + +query +SELECT d, factorial(d) FROM test_factorial + +query +SELECT dec38, factorial(dec38) FROM test_factorial + +query +SELECT dec10_2, factorial(dec10_2) FROM test_factorial From 8daaeff40a2e64969a76a24dc81e979d49c833bc Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 19 May 2026 16:27:37 -0700 Subject: [PATCH 2/2] feat: wire `factorial` and update wire skill --- .../skills/wire-datafusion-function/SKILL.md | 215 ++++++------------ .../sql-tests/expressions/math/factorial.sql | 55 ++--- 2 files changed, 84 insertions(+), 186 deletions(-) diff --git a/.claude/skills/wire-datafusion-function/SKILL.md b/.claude/skills/wire-datafusion-function/SKILL.md index ca53ea6df9..c03f0c4d80 100644 --- a/.claude/skills/wire-datafusion-function/SKILL.md +++ b/.claude/skills/wire-datafusion-function/SKILL.md @@ -4,201 +4,128 @@ description: Use when wiring an existing DataFusion or datafusion-spark function argument-hint: --- -Wire Comet support for the `$ARGUMENTS` Spark expression by reusing an existing DataFusion or `datafusion-spark` function. **No native Rust implementation is written by this skill** — if upstream coverage is missing, stop and run `implement-comet-expression` instead. - -## Background reading - -- `docs/source/contributor-guide/adding_a_new_expression.md` — Scala serde, protobuf, support levels, SQL tests. -- `docs/source/contributor-guide/sql-file-tests.md` — Comet SQL Tests format. -- `spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala` — trait every serde implements. -- `spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala` — the one-line passthrough wrapper. -- `spark/src/main/scala/org/apache/comet/serde/SupportLevel.scala` — `Compatible` / `Incompatible` / `Unsupported`. +Wire Comet support for the `$ARGUMENTS` Spark expression by reusing an existing DataFusion or `datafusion-spark` function. **No native Rust is written here** — if upstream coverage is missing, stop and run `implement-comet-expression`. ## Wiring patterns -Pick the lightest pattern that satisfies the Spark contract. Steps below tell you which one to use. +Pick the lightest one that satisfies the Spark contract. -### Pattern A — One-line passthrough (cleanest) +| Pattern | When | What to change | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **A** — passthrough | DF built-in (e.g. `acos`), or `datafusion-spark` UDF already registered in `register_datafusion_spark_function` | one line in the right map of `QueryPlanSerde.scala`: `classOf[Foo] -> CometScalarFunction("foo")` | +| **B** — register + passthrough | `datafusion-spark` UDF, _not_ yet registered, semantics already match Spark | Pattern A line **plus** `session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFoo::default()));` in `native/core/src/execution/jni_api.rs::register_datafusion_spark_function` | +| **C** — custom serde | Inputs need preprocessing (cast, `nullIfNegative`, `+0.0` flip), or you need to set return type / `failOnError`, restrict input types via `getSupportLevel`, enforce foldable-only args, or attach `getCompatibleNotes`/`getIncompatibleReasons`/`getUnsupportedReasons` | new `CometXxx` object in the topic file (`math.scala`, `strings.scala`, …); see `CometCeil`, `CometAtan2`, `CometLog`, `CometSha2`, `CometAbs` | -Spark expression maps directly to a function name resolvable at runtime. The function lives in either `datafusion-functions` (registered by default in `SessionContext`) or in `datafusion-spark` and is **already** registered in `register_datafusion_spark_function`. Single line in the right map of `QueryPlanSerde.scala`: - -```scala -classOf[Acos] -> CometScalarFunction("acos"), // datafusion-functions built-in -classOf[Crc32] -> CometScalarFunction("crc32"), // datafusion-spark SparkCrc32 (already registered) -classOf[DateAdd] -> CometScalarFunction[DateAdd]("date_add"), // see datetime.scala -``` +Function names must match: the string passed to `CometScalarFunction("xyz")` equals `SparkXyz::name()` upstream. -### Pattern B — Pattern A plus a one-line UDF registration - -Same one-liner in `QueryPlanSerde.scala`, **plus** registering the new `datafusion-spark` UDF in `native/core/src/execution/jni_api.rs::register_datafusion_spark_function`: - -```rust -session_ctx.register_udf(ScalarUDF::new_from_impl(SparkXyz::default())); -``` - -Function names must match — the Scala name passed to `CometScalarFunction("xyz")` must equal `SparkXyz::name()` in the upstream crate. - -### Pattern C — Custom `CometExpressionSerde` - -Use when the Spark contract requires any of: +## Workflow -- preprocessing inputs (e.g. wrap with `nullIfNegative`, add `+0.0` to flip `-0.0`, cast), -- setting an explicit return type or `fail_on_error` flag → use `scalarFunctionExprToProtoWithReturnType`, -- restricting supported input types → implement `getSupportLevel` returning `Unsupported(notes)` / `Incompatible(notes)`, -- foldable / literal-only argument checks (e.g. `Sha2.numBits` must be a literal), -- documenting always-on differences via `getCompatibleNotes` / `getIncompatibleReasons` / `getUnsupportedReasons`. +### 1. Study the Spark contract -Examples to study before writing one: `CometCeil` / `CometFloor` (decimal scale fallback), `CometAtan2` (input massaging), `CometLog` (null-if-negative), `CometSha2` (literal-only argument), `CometAbs` (`getSupportLevel`). +Find the `case class $ARGUMENTS` (PascalCase). Prefer the user's local Spark clone (`CLAUDE.md` / project memory); fall back to `git clone --depth 1 https://github.com/apache/spark.git /tmp/spark-master`. If the clone is sandbox-blocked and no local clone is found, **stop and ask** — Spark is the ground truth. -## Workflow +Note `inputTypes`, `dataType`, `eval` / `nullSafeEval`, `require` guards, ANSI mode branches, and foldable-only arguments. These define what Comet must reproduce. -### 1. Study the Spark master implementation +### 2. Find the upstream function (at the pinned version) -Same as step 1 of `implement-comet-expression`. Shallow-clone if missing, then grep for the expression class. The class name is PascalCase (e.g. `Csc`, not `csc`); capitalize `$ARGUMENTS` before grepping, or use `-i`: +The pinned versions in `native/Cargo.toml` are the ground truth — a function on upstream `main` may not exist in the version Comet depends on. Resolve versions first, then search sources matching those exact versions. **Pick one source path; do not mix.** ```bash -if [ ! -d /tmp/spark-master ]; then - git clone --depth 1 https://github.com/apache/spark.git /tmp/spark-master -fi - -CLASS_NAME="$(printf '%s' "$ARGUMENTS" | awk '{print toupper(substr($0,1,1)) substr($0,2)}')" -find /tmp/spark-master/sql -name "*.scala" | \ - xargs grep -l "case class ${CLASS_NAME}\b\|object ${CLASS_NAME}\b" 2>/dev/null +REPO_ROOT=$(git rev-parse --show-toplevel) +# Portable regex — BSD awk on macOS does not support \b. +DF_SPARK_VER=$(awk -F'"' '/^datafusion-spark[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") +DF_FUNCS_VER=$(awk -F'"' '/^datafusion[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") +[ -z "$DF_SPARK_VER" ] || [ -z "$DF_FUNCS_VER" ] && { echo "ERROR: version extraction failed"; exit 1; } +EXPR='$ARGUMENTS' ``` -If the `git clone` is blocked (e.g. sandboxed environment), **stop and ask the user for an alternative source** — typically a local Spark clone path. Then point grep at that path instead. Do not silently skip Spark study — the Spark contract is the ground truth. - -Note `inputTypes`, `dataType`, `eval` / `nullSafeEval`, ANSI mode branches, `require` guards, and any foldable-only arguments. These define the Spark contract Comet must match. - -### 2. Find the upstream function - -**Prefer a local DataFusion clone if one is available** — check `CLAUDE.md`, project memory, and the user's `~/dev` tree for an existing checkout (e.g. `~/dev/.../rust/datafusion`). The local clone is typically the latest source and avoids cargo-registry version mismatches. If no local clone is found, fall back to the cached registry crate. - -Read the pinned `datafusion-spark` version from `native/Cargo.toml` rather than picking the first cached copy — `head -1` can land on an older release that lacks the function. Use a portable regex (BSD `awk` on macOS does not support `\b`): +**Option A — cargo registry (preferred):** ```bash -REPO_ROOT=$(git rev-parse --show-toplevel) -DF_SPARK_VER=$(awk -F'"' '/^datafusion-spark[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") DF_SPARK=$(ls -d ~/.cargo/registry/src/*/datafusion-spark-${DF_SPARK_VER}/ 2>/dev/null | head -1) -echo "Using datafusion-spark $DF_SPARK_VER at $DF_SPARK" - -DF_FUNCS_VER=$(awk -F'"' '/^datafusion[ =]/ {print $2; exit}' "$REPO_ROOT/native/Cargo.toml") DF_FUNCS=$(ls -d ~/.cargo/registry/src/*/datafusion-functions-${DF_FUNCS_VER}/ 2>/dev/null | head -1) - -# Sanity check — empty paths mean the awk match failed or the crate is not cached. -[ -z "$DF_SPARK" ] && echo "WARNING: datafusion-spark path empty — check Cargo.toml regex and run 'cargo fetch' from native/" -[ -z "$DF_FUNCS" ] && echo "WARNING: datafusion-functions path empty — check Cargo.toml regex and run 'cargo fetch' from native/" - -# Search for the candidate. Replace EXPR with the SQL function name. -EXPR='$ARGUMENTS' +# If empty, run `cargo fetch` from native/. grep -rin "fn name" "$DF_SPARK/src/function/" 2>/dev/null | grep -i "$EXPR" -grep -rin "fn name" "$DF_FUNCS/src/" 2>/dev/null | grep -i "$EXPR" +grep -rin "fn name" "$DF_FUNCS/src/" 2>/dev/null | grep -i "$EXPR" ``` -If using a local DataFusion clone instead, point the grep at `/datafusion/spark/src/function/` and `/datafusion/functions/src/` respectively. +**Option B — local DataFusion clone** (only if `CLAUDE.md` / memory points at one). Grep at the pinned tag — DataFusion uses lightweight tags (``, no `v` prefix), so use `tag -l` not `rev-parse --verify ^{tag}`: -If the cached crate directory does not exist (fresh checkout), run `cargo fetch` from `native/` first, or fall back to the latest cached version and verify the function exists in the pinned version's git tag before relying on it. - -### 2a. Decision gate — confirm the source crate with the user - -Before wiring, classify the candidate: - -- **Found in `datafusion-spark`** → proceed without prompting. This crate is explicitly Spark-compatible. -- **Found only in `datafusion-functions` (pure DataFusion)** → **STOP and ask the user before proceeding**. Pure DataFusion functions follow standard SQL semantics and frequently diverge from Spark on edge cases (NULL vs error, negative inputs, overflow, return type, NaN handling). Even when the divergences look bridgeable with Pattern C preprocessing, the user should explicitly approve relying on a non-Spark-tuned implementation rather than requesting an upstream `datafusion-spark` port. +```bash +DF_CLONE= +[ -z "$(git -C "$DF_CLONE" tag -l "$DF_SPARK_VER")" ] && echo "WARNING: tag missing — git fetch --tags or use Option A" +git -C "$DF_CLONE" grep -in "fn name" "$DF_SPARK_VER" -- 'datafusion/spark/src/function/' | grep -i "$EXPR" +git -C "$DF_CLONE" grep -in "fn name" "$DF_FUNCS_VER" -- 'datafusion/functions/src/' | grep -i "$EXPR" +``` - Use `AskUserQuestion`. Surface what you found in both crates and the specific divergences you've already identified, then offer: - 1. Wire from `datafusion-functions` with bridging preprocessing (Pattern C). List the divergences and the masking/casting that would close them. - 2. Stop and switch to `implement-comet-expression` so the function can be added to `datafusion-spark` upstream and then wired with Pattern A or B. - 3. Skip — leave the expression unsupported for now. +Never grep the clone's working tree directly — it may be on `main`. - Do not proceed past this gate without an explicit answer. +### 2a. Decision gate — confirm the source crate +- **Found in `datafusion-spark`** → proceed (Spark-tuned by design). If listed in `register_datafusion_spark_function` → Pattern A; otherwise Pattern B. +- **Found only in `datafusion-functions`** → **STOP and `AskUserQuestion`**. Pure DataFusion follows standard SQL semantics and frequently diverges from Spark (NULL vs error, negatives, overflow, return type, NaN). Surface what you found in each crate and the divergences you've already identified, then offer: + 1. Wire from `datafusion-functions` with Pattern C bridging — list the divergences and the masking/casting that closes them. + 2. Stop and switch to `implement-comet-expression` to port to `datafusion-spark` upstream first. - **Found in neither** → stop and run `implement-comet-expression`. -Verify the chosen function's `Signature`, return type, and behavior match Spark across all relevant edge cases (NULL, overflow, non-finite floats, decimal scale, locale, ANSI mode). If semantics diverge in a way the Scala serde can't bridge with preprocessing or restrictions → **stop and run `implement-comet-expression`** instead. +If semantics diverge in a way Pattern C can't bridge, escalate. -For datafusion-spark candidates, also check whether the UDF is already pre-registered: see `register_datafusion_spark_function` in `native/core/src/execution/jni_api.rs`. If listed, you only need Pattern A. If missing, you need Pattern B. +### 3. Apply the Scala wiring -### 3. Pick the wiring pattern +Add the entry to the matching map in `spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala`: `mathExpressions`, `stringExpressions`, `arrayExpressions`, `mapExpressions`, `structExpressions`, `predicateExpressions`, `conditionalExpressions`, `bitwiseExpressions`, `temporalExpressions`, `hashExpressions`, `conversionExpressions`, `miscExpressions`. -| Situation | Pattern | -| ----------------------------------------------------------------------------------------------------- | ------- | -| DF built-in (e.g. `acos`, `md5`, `replace`) matches Spark; or datafusion-spark UDF already registered | **A** | -| New `datafusion-spark` UDF, semantics already match Spark | **B** | -| Inputs need preprocessing, restrictions, or explicit return type / failOnError | **C** | +Spark expression classes are in scope via `import org.apache.spark.sql.catalyst.expressions._`. For Pattern C, the topic file uses **explicit** imports — extend the existing `import …expressions.{…}` line and place the new object alongside its peers. -When in doubt, start with the lightest pattern and escalate only if the Spark contract demands it. +Helpers from `QueryPlanSerde`: -### 4. Apply the Scala wiring +- `scalarFunctionExprToProto(name, args*)` — return type inferred, `failOnError = false`. +- `scalarFunctionExprToProtoWithReturnType(name, returnType, failOnError, args*)` — explicit type / fail-on-error. +- `optExprWithInfo(optExpr, expr, children*)` — wrap final result; propagates "why we couldn't convert" tags. +- `withInfo(expr, "reason")` — tag a fallback when returning `None`. -Add the entry to the matching map in `spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala`: +`getSupportLevel` returning `Incompatible(Some("…"))` gates behind `spark.comet.expr..allowIncompatible=true`. `Unsupported(…)` always falls back. -- `mathExpressions`, `stringExpressions`, `arrayExpressions`, `mapExpressions`, `structExpressions`, `predicateExpressions`, `conditionalExpressions`, `bitwiseExpressions`, `temporalExpressions`, `hashExpressions`, `conversionExpressions`, `miscExpressions`. +### 4. Register the UDF (Pattern B only) -Confirm the Spark expression class is in scope. `QueryPlanSerde.scala` already imports `org.apache.spark.sql.catalyst.expressions._` (wildcard), so most expressions are picked up for free. For Pattern C, the topic file (`math.scala`, `strings.scala`, `datetime.scala`, `hash.scala`, `arrays.scala`, etc.) uses **explicit** imports — add the new class name to the existing `import org.apache.spark.sql.catalyst.expressions.{...}` line, and place the serde object alongside its peers. +In `native/core/src/execution/jni_api.rs::register_datafusion_spark_function`, add the `use` and the `register_udf` line, grouped with neighbors. Patterns A and C never touch native code. -Helpers from `QueryPlanSerde`: +### 5. Add SQL file tests -- `scalarFunctionExprToProto(funcName, args*)` — basic call, `failOnError = false`, return type inferred from args. -- `scalarFunctionExprToProtoWithReturnType(funcName, returnType, failOnError, args*)` — set explicit return type and/or `fail_on_error`. -- `optExprWithInfo(optExpr, expr, children*)` — attaches the "why we couldn't convert" tag if any child failed; always wrap your final result. -- `withInfo(expr, "reason")` — tag a fallback reason when returning `None`. +Create `spark/src/test/resources/sql-tests/expressions//$ARGUMENTS.sql`. Mandatory rules: -For `getSupportLevel`, returning `Incompatible(Some("..."))` gates the function behind `spark.comet.expr..allowIncompatible=true`. `Unsupported(notes)` always falls back to Spark. +1. **Test only the directly supported input types** — read `inputTypes` from the Spark source. Do **not** add columns for implicit-cast widening (`ImplicitCastInputTypes`); the cast path has its own tests. E.g. `inputTypes = Seq(IntegerType)` (`Factorial`) → one `int` column, not byte/short/long/float/double/decimal. For `TypeCollection(A, B)`, add one column per member. +2. **All `SELECT`s read from a parquet table** — no literal-only queries (the constant folder skips the read path and column-vector kernel). +3. **Mix `NULL` into every column** — exercises the validity bitmap. +4. **Per type, cover the edge-case set:** + - Float / Double: `NaN`, `+0.0`, `-0.0`, `±Infinity` via `cast('NaN' as double)` etc. + - Integer / decimal: each type's max and min (`127`/`-128`, `2147483647`/`-2147483648`, `9223372036854775807`/`-9223372036854775808`, `Decimal(38, 0)` boundary) for overflow / wrap. + - String / binary: empty, multi-byte UTF-8 (`'é'`, `'日本'`), embedded NULs if relevant. +5. **Cover the Spark-specific edges** found in step 1: in-range boundaries, just-out-of-range values, ANSI error paths, foldable-only args. -### 5. Apply the native registration (Pattern B only) +Use `query expect_fallback(...)` for inputs where the serde returns `None`. Format: `docs/source/contributor-guide/sql-file-tests.md`. -In `native/core/src/execution/jni_api.rs::register_datafusion_spark_function`, add one line with the upstream import already in scope (or extend the imports). Keep the list grouped logically with neighboring entries. +### 6. Update docs -Pattern A and Pattern C **never** touch native code. +Hand-curated (`make` does not regenerate these): -### 6. Build and smoke-test +- `docs/source/user-guide/latest/expressions.md` — add `| | \`\` |` to the matching category table, alphabetical. +- `docs/source/contributor-guide/spark_expressions_support.md` — flip `- [ ] ` to `- [x] `. -Build the project (the user generally runs tests themselves — do not run them proactively unless asked): +Per-expression compatibility pages under `compatibility/expressions/*.md` are auto-generated from `getCompatibleNotes` / `getIncompatibleReasons` / `getUnsupportedReasons` — do not hand-edit. -```bash -make -``` - -If the user asks for a smoke test: +### 7. Build, audit, finalize ```bash -./mvnw test -Dsuites="org.apache.comet.CometSqlFileTestSuite $ARGUMENTS" -Dtest=none +make +cd native && cargo clippy --all-targets --workspace -- -D warnings ``` -### 7. Add SQL file tests - -Create `spark/src/test/resources/sql-tests/expressions//$ARGUMENTS.sql`. Coverage requirements (mandatory — do not skip): - -1. **Inspect the Spark source for accepted input datatypes.** Look at `inputTypes`, the parent class (`UnaryMathExpression`, `BinaryExpression`, `ImplicitCastInputTypes`, `ExpectsInputTypes`), and any `dataType` overrides. Add one column per supported type — e.g. for a numeric function: `ByteType`, `ShortType`, `IntegerType`, `LongType`, `FloatType`, `DoubleType`, `DecimalType` (varying precision/scale). For a string function: also cover `BinaryType` if accepted. -2. **All `SELECT`s must read from a parquet table.** Do not write literal-only queries like `SELECT f(1.0), f(NULL)` — encode the values you want to test as rows in a parquet-backed table and query that. This exercises the real read path, not just the constant-folder. -3. **Mix `NULL` into every column** so the kernel exercises the validity bitmap, not just the all-valid fast path. -4. **Floating-point (`FloatType` / `DoubleType`)** columns must include: `NaN`, `+0.0`, `-0.0`, `-Infinity`, `+Infinity`. Use `cast('NaN' as double)`, `cast('Infinity' as double)`, `cast('-Infinity' as double)`. -5. **Integer / decimal** columns must include each type's max and min (e.g. `127` / `-128` for byte, `2147483647` / `-2147483648` for int, `9223372036854775807` / `-9223372036854775808` for long, `Decimal(38, 0)` boundary for decimal) so overflow / wrap behavior is exercised. -6. Cover the **Spark-specific edge cases identified in step 1** (negative scale, empty string, ANSI-mode error paths, etc.) — again, as rows in the parquet table. - -Use `query expect_fallback(...)` for inputs where the serde intentionally returns `None`. Format described in `docs/source/contributor-guide/sql-file-tests.md`. - -### 8. Update the documentation - -Two files are hand-curated (not regenerated by `make`) and must be updated for every newly wired expression: - -- `docs/source/user-guide/latest/expressions.md` — add a row to the matching category table (`## Math Expressions`, `## String Expressions`, etc.) in alphabetical order: `| | \`\` |`. -- `docs/source/contributor-guide/spark_expressions_support.md` — flip the entry from `- [ ] ` to `- [x] ` in the matching category section. +Then run `audit-comet-expression` on `$ARGUMENTS` to compare against Spark 3.4.3 / 3.5.8 / 4.0.1 and surface coverage gaps; iterate on tests. -Per-category compatibility pages under `docs/source/user-guide/latest/compatibility/expressions/*.md` are auto-generated by `GenerateDocs.scala` from `getCompatibleNotes` / `getIncompatibleReasons` / `getUnsupportedReasons`. Do not hand-edit those — `make` regenerates them. - -### 9. Run the audit skill - -Run `audit-comet-expression` on `$ARGUMENTS`. It compares against Spark 3.4.3, 3.5.8, and 4.0.1 and produces a prioritized gap list — usually missing test coverage. Iterate on tests until the user is satisfied. - -### 10. Final checks +The user generally runs tests themselves. If asked for a smoke test: ```bash -make format -cd native && cargo clippy --all-targets --workspace -- -D warnings +./mvnw test -Dsuites="org.apache.comet.CometSqlFileTestSuite $ARGUMENTS" -Dtest=none ``` -### 11. Open the PR - -Fill in every section of `.github/pull_request_template.md`. In "What changes are included in this PR?", note that the `wire-datafusion-function` skill was used and call out which upstream function (DataFusion built-in or `datafusion-spark::function::...`) is being wired. +PR: fill in `.github/pull_request_template.md`; under "What changes are included", note the skill was used and which upstream function (`datafusion::…` built-in or `datafusion-spark::function::…`) is being wired. diff --git a/spark/src/test/resources/sql-tests/expressions/math/factorial.sql b/spark/src/test/resources/sql-tests/expressions/math/factorial.sql index da0d5911d4..612a3af4ec 100644 --- a/spark/src/test/resources/sql-tests/expressions/math/factorial.sql +++ b/spark/src/test/resources/sql-tests/expressions/math/factorial.sql @@ -18,51 +18,22 @@ -- ConfigMatrix: parquet.enable.dictionary=false,true statement -CREATE TABLE test_factorial( - b tinyint, - s smallint, - i int, - l bigint, - f float, - d double, - dec38 decimal(38, 0), - dec10_2 decimal(10, 2) -) USING parquet +CREATE TABLE test_factorial(i int) USING parquet statement INSERT INTO test_factorial VALUES - (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast(0.0 as float), 0.0, cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))), - (cast(1 as tinyint), cast(1 as smallint), 1, cast(1 as bigint), cast(1.0 as float), 1.0, cast(1 as decimal(38, 0)), cast(1.49 as decimal(10, 2))), - (cast(5 as tinyint), cast(5 as smallint), 5, cast(5 as bigint), cast(5.7 as float), 5.7, cast(5 as decimal(38, 0)), cast(5.50 as decimal(10, 2))), - (cast(20 as tinyint), cast(20 as smallint), 20, cast(20 as bigint), cast(20.0 as float), 20.0, cast(20 as decimal(38, 0)), cast(20.99 as decimal(10, 2))), - (cast(21 as tinyint), cast(21 as smallint), 21, cast(21 as bigint), cast(21.0 as float), 21.0, cast(21 as decimal(38, 0)), cast(21.00 as decimal(10, 2))), - (cast(-1 as tinyint), cast(-1 as smallint), -1, cast(-1 as bigint), cast(-1.0 as float), -1.0, cast(-1 as decimal(38, 0)), cast(-1.50 as decimal(10, 2))), - (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), - (cast(127 as tinyint), cast(32767 as smallint), 2147483647, 9223372036854775807, cast('Infinity' as float), cast('Infinity' as double), cast(99999999999999999999999999999999999999 as decimal(38, 0)), cast(99999999.99 as decimal(10, 2))), - (cast(-128 as tinyint),cast(-32768 as smallint),-2147483648,-9223372036854775808, cast('-Infinity' as float), cast('-Infinity' as double), cast(-99999999999999999999999999999999999999 as decimal(38, 0)),cast(-99999999.99 as decimal(10, 2))), - (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast('NaN' as float), cast('NaN' as double), cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))), - (cast(0 as tinyint), cast(0 as smallint), 0, cast(0 as bigint), cast(-0.0 as float), cast(-0.0 as double), cast(0 as decimal(38, 0)), cast(0.00 as decimal(10, 2))) - -query -SELECT b, factorial(b) FROM test_factorial - -query -SELECT s, factorial(s) FROM test_factorial + (0), + (1), + (5), + (12), + (20), + (21), + (-1), + (-5), + (100), + (NULL), + (2147483647), + (-2147483648) query SELECT i, factorial(i) FROM test_factorial - -query -SELECT l, factorial(l) FROM test_factorial - -query -SELECT f, factorial(f) FROM test_factorial - -query -SELECT d, factorial(d) FROM test_factorial - -query -SELECT dec38, factorial(dec38) FROM test_factorial - -query -SELECT dec10_2, factorial(dec10_2) FROM test_factorial