From 7200af5b6cb183773f1f635d28580ddf9e946f3c Mon Sep 17 00:00:00 2001 From: Abhishek Bansal Date: Tue, 17 Feb 2026 13:03:28 +0530 Subject: [PATCH 1/2] GH-45193: [C++][Compute] Treat NaNs and nulls as distinct values in rank tie-breaking --- cpp/src/arrow/compute/kernels/vector_rank.cc | 30 ++++++++++++++----- .../arrow/compute/kernels/vector_sort_test.cc | 22 ++++++++++++-- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index ef7419ea7c5..98f305d3071 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -38,8 +38,9 @@ namespace { // is the same as the value at the previous sort index. constexpr uint64_t kDuplicateMask = 1ULL << 63; -template -void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_selector) { +template +void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_selector, + IsNullSelector&& is_null_selector) { using T = decltype(value_selector(int64_t{})); // Process non-nulls @@ -57,10 +58,20 @@ void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_sel // Process nulls if (sorted.nulls_end != sorted.nulls_begin) { - // TODO this should be able to distinguish between NaNs and real nulls (GH-45193) auto it = sorted.nulls_begin; - while (++it < sorted.nulls_end) { - *it |= kDuplicateMask; + if constexpr (has_null_like_values()) { + bool prev_is_null = is_null_selector(*it); + while (++it < sorted.nulls_end) { + bool curr_is_null = is_null_selector(*it); + if (curr_is_null == prev_is_null) { + *it |= kDuplicateMask; + } + prev_is_null = curr_is_null; + } + } else { + while (++it < sorted.nulls_end) { + *it |= kDuplicateMask; + } } } } @@ -84,7 +95,8 @@ Result DoSortAndMarkDuplicate( auto value_selector = [&array](int64_t index) { return GetView::LogicalValue(array.GetView(index)); }; - MarkDuplicates(sorted, value_selector); + auto is_null_selector = [&array](int64_t index) { return array.IsNull(index); }; + MarkDuplicates(sorted, value_selector, is_null_selector); } return sorted; } @@ -106,7 +118,11 @@ Result DoSortAndMarkDuplicate( auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { return resolver.Resolve(index).Value(); }; - MarkDuplicates(sorted, value_selector); + auto is_null_selector = [resolver = + ChunkedArrayResolver(span(arrays))](int64_t index) { + return resolver.Resolve(index).IsNull(); + }; + MarkDuplicates(sorted, value_selector, is_null_selector); } return sorted; } diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 90f8eb7a56b..72fd2d53510 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -76,8 +76,8 @@ std::ostream& operator<<(std::ostream& os, NullPlacement null_placement) { // Tests for NthToIndices template -auto GetLogicalValue(const ArrayType& array, - uint64_t index) -> decltype(array.GetView(index)) { +auto GetLogicalValue(const ArrayType& array, uint64_t index) + -> decltype(array.GetView(index)) { return array.GetView(index); } @@ -2341,6 +2341,24 @@ TEST_F(TestRank, Real) { } } +TEST_F(TestRank, NaNsAndNulls) { + auto type = float64(); + auto array = ArrayFromJSON(type, "[1.0, null, NaN, 2.0, NaN, null]"); + SetInput(array); + + // Sorted order (at_end): [1.0, 2.0, NaN, NaN, null, null] + // Ranks (min): [1, 5, 3, 2, 3, 5] + auto expected_at_end = ArrayFromJSON(uint64(), "[1, 5, 3, 2, 3, 5]"); + AssertRank(SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Min, + expected_at_end); + + // Sorted order (at_start): [null, null, NaN, NaN, 1.0, 2.0] + // Ranks (min): [5, 1, 3, 6, 3, 1] + auto expected_at_start = ArrayFromJSON(uint64(), "[5, 1, 3, 6, 3, 1]"); + AssertRank(SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Min, + expected_at_start); +} + TEST_F(TestRank, Integral) { for (auto integer_type : ::arrow::IntTypes()) { SetInput(ArrayFromJSON(integer_type, "[2, 3, 1, 0, 5]")); From ebfa315fd27b487af9c34def3dc719c7c50e07c4 Mon Sep 17 00:00:00 2001 From: Abhishek Bansal Date: Tue, 17 Feb 2026 13:18:30 +0530 Subject: [PATCH 2/2] GH-45193: [C++] Apply clang-format --- cpp/src/arrow/compute/kernels/vector_sort_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 72fd2d53510..5ac4cdfb2f6 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -76,8 +76,8 @@ std::ostream& operator<<(std::ostream& os, NullPlacement null_placement) { // Tests for NthToIndices template -auto GetLogicalValue(const ArrayType& array, uint64_t index) - -> decltype(array.GetView(index)) { +auto GetLogicalValue(const ArrayType& array, + uint64_t index) -> decltype(array.GetView(index)) { return array.GetView(index); }