Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions cpp/src/arrow/acero/hash_aggregate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2157,6 +2157,44 @@ TEST_P(GroupBy, AnyAndAll) {
}
}

// Regression test for GH-50043: hash_any / hash_all must honor the offset of a
// sliced, nullable boolean input instead of reading values from the start of
// the underlying buffer.
TEST_P(GroupBy, AnyAllSlicedNullableBoolean) {
  const auto input_schema = schema({field("any_arg", boolean()),
                                    field("all_arg", boolean()), field("key", int64())});
  auto full_table = TableFromJSON(input_schema, {R"([
[true, false, 99],
[false, true, 10],
[null, null, 10]
])"});

  // Drop the first row (key 99). What remains is a single group (key 10) with
  // any_arg=[false, null] and all_arg=[true, null].
  auto sliced = full_table->Slice(1);
  auto any_column = sliced->GetColumnByName("any_arg");
  auto all_column = sliced->GetColumnByName("all_arg");
  auto key_column = sliced->GetColumnByName("key");

  // The discarded row must not leak into the result: any(false, null) is false
  // and all(true, null) is true.
  const auto expected_type = struct_({
      field("key_0", int64()),
      field("hash_any", boolean()),
      field("hash_all", boolean()),
  });
  auto expected = ArrayFromJSON(expected_type,
                                R"([
[10, false, true]
])");

  // Exercise both execution modes, labelling failures with the active one.
  for (bool threaded : {true, false}) {
    SCOPED_TRACE(threaded ? "parallel/merged" : "serial");

    ASSERT_OK_AND_ASSIGN(auto actual, GroupByTest({any_column, all_column}, {key_column},
                                                  {
                                                      {"hash_any", nullptr},
                                                      {"hash_all", nullptr},
                                                  },
                                                  threaded));
    ValidateOutput(actual);

    AssertDatumsEqual(expected, actual, /*verbose=*/true);
  }
}

TEST_P(GroupBy, AnyAllScalar) {
BatchesWithSchema input;
input.batches = {
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,8 @@ struct GroupedBooleanAggregator : public GroupedAggregator {
input.buffers[0].data, input.offset, input.length,
[&](int64_t position) {
counts[*g]++;
Impl::UpdateGroupWith(reduced, *g, bit_util::GetBit(bitmap, position));
Impl::UpdateGroupWith(reduced, *g,
bit_util::GetBit(bitmap, input.offset + position));
g++;
},
[&] { bit_util::SetBitTo(no_nulls, *g++, false); });
Expand Down
32 changes: 32 additions & 0 deletions python/pyarrow/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2976,6 +2976,38 @@ def sorted_by_keys(d):
}


@pytest.mark.acero
def test_group_by_sliced_any_all():
    """Regression test for GH-50043.

    The "any" and "all" group-by aggregations must give correct results on a
    sliced table with nullable boolean columns: rows removed by the slice
    must not influence the aggregated values.
    """
    # The first row (g == 99) is cut off by the slice below and carries the
    # opposite boolean values, so it would change the result if it leaked in.
    unsliced = pa.table(
        {
            "g": [99, 10, 10],
            "any_arg": [True, False, None],
            "all_arg": [False, True, None],
        }
    )
    sliced = unsliced.slice(1)

    aggregations = [
        ("any_arg", "any"),
        ("all_arg", "all"),
    ]

    # Remaining group g == 10: any(False, None) = False, all(True, None) = True
    expected = pa.table(
        {
            "g": [10],
            "any_arg_any": [False],
            "all_arg_all": [True],
        }
    )

    for use_threads in (False, True):
        grouped = sliced.group_by("g", use_threads=use_threads)
        result = grouped.aggregate(aggregations)
        assert result.equals(expected)


@pytest.mark.acero
def test_table_group_by_first():
# "first" is an ordered aggregation -> requires to specify use_threads=False
Expand Down
Loading