diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 12d24429cb6c..442dcd588345 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -2157,6 +2157,44 @@ TEST_P(GroupBy, AnyAndAll) { } } +TEST_P(GroupBy, AnyAllSlicedNullableBoolean) { + auto table = TableFromJSON(schema({field("any_arg", boolean()), + field("all_arg", boolean()), field("key", int64())}), + {R"([ + [true, false, 99], + [false, true, 10], + [null, null, 10] + ])"}); + auto sliced = table->Slice(1); + + // GH-50043: hash_any/hash_all should respect the slice offset; row 0 is sliced away. + // After Slice(1), any_arg=[false, null] and all_arg=[true, null], both under key 10. + auto expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_any", boolean()), + field("hash_all", boolean()), + }), + R"([ + [10, false, true] + ])"); + + for (bool use_threads : {true, false}) { + SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + + ASSERT_OK_AND_ASSIGN(auto actual, GroupByTest({sliced->GetColumnByName("any_arg"), + sliced->GetColumnByName("all_arg")}, + {sliced->GetColumnByName("key")}, + { + {"hash_any", nullptr}, + {"hash_all", nullptr}, + }, + use_threads)); + ValidateOutput(actual); + + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } +} + TEST_P(GroupBy, AnyAllScalar) { BatchesWithSchema input; input.batches = { diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 8359945319cd..d07096236d8c 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1261,7 +1261,8 @@ struct GroupedBooleanAggregator : public GroupedAggregator { input.buffers[0].data, input.offset, input.length, [&](int64_t position) { counts[*g]++; - Impl::UpdateGroupWith(reduced, *g, bit_util::GetBit(bitmap, position)); + Impl::UpdateGroupWith(reduced, *g, + bit_util::GetBit(bitmap, input.offset + position)); g++; 
}, [&] { bit_util::SetBitTo(no_nulls, *g++, false); }); diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c8..c6dbbc5145d1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2976,6 +2976,38 @@ def sorted_by_keys(d): } +@pytest.mark.acero +def test_group_by_sliced_any_all(): + # GH-50043: hash_any/hash_all produced incorrect results on sliced boolean arrays. + # Row 0 is discarded by the slice below and must not affect the aggregation. + table = pa.table( + { + "g": [99, 10, 10], + "any_arg": [True, False, None], + "all_arg": [False, True, None], + } + ) + sliced = table.slice(1) + + expected = pa.table( + { + "g": [10], + "any_arg_any": [False], + "all_arg_all": [True], + } + ) + + # Expected for group 10: any(False, None) = False, all(True, None) = True + for use_threads in [False, True]: + result = sliced.group_by("g", use_threads=use_threads).aggregate( + [ + ("any_arg", "any"), + ("all_arg", "all"), + ] + ) + assert result.equals(expected) + + @pytest.mark.acero def test_table_group_by_first(): # "first" is an ordered aggregation -> requires to specify use_threads=False