From 21debc3291debf64d557b283e1677a18f35f057e Mon Sep 17 00:00:00 2001 From: Ruiyang Wang Date: Sat, 3 Jan 2026 13:29:52 -0800 Subject: [PATCH 1/3] GH-47995: [C++][Parquet] Fix empty string min/max statistics being lost during merge Prior to this change, CleanStatistic() for ByteArray rejected statistics where ptr == nullptr. However, empty strings can have ptr == nullptr with len == 0, causing valid statistics to be discarded when the minimum value is an empty string. The fix introduces a sentinel pointer (kNoValueSentinel) distinct from nullptr to mark "no value" in ByteArray statistics. This allows CleanStatistic to distinguish between "no min/max computed" (sentinel) and "min/max is empty string" (nullptr with len=0). FLBA is unchanged since it has fixed length and no "empty" concept. --- cpp/src/parquet/statistics.cc | 31 +++- cpp/src/parquet/statistics_test.cc | 231 +++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 2e5f6fe37c4..5736c1d032a 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -55,6 +55,12 @@ namespace { constexpr int value_length(int value_length, const ByteArray& value) { return value.len; } constexpr int value_length(int type_length, const FLBA& value) { return type_length; } +// Sentinel pointer to mark "no value" for ByteArray statistics. +// Distinct from nullptr, which is valid for empty strings. +// See: https://github.com/apache/arrow/issues/47995 +inline constexpr uint8_t kNoValueSentinelBytes[1] = {0}; +inline constexpr const uint8_t* kNoValueSentinel = kNoValueSentinelBytes; + // Static "constants" for normalizing float16 min/max values. These need to be expressed // as pointers because `Float16LogicalType` represents an FLBA. struct Float16Constants { @@ -290,7 +296,26 @@ struct BinaryLikeCompareHelperBase { template struct CompareHelper - : public BinaryLikeCompareHelperBase {}; + : public BinaryLikeCompareHelperBase { + using Base = BinaryLikeCompareHelperBase; + using T = ByteArray; + + // Use kNoValueSentinel instead of nullptr to distinguish "no value" from empty string. + static T DefaultMin() { return T{0, kNoValueSentinel}; } + static T DefaultMax() { return T{0, kNoValueSentinel}; } + + static T Min(int type_length, const T& a, const T& b) { + if (a.ptr == kNoValueSentinel) return b; + if (b.ptr == kNoValueSentinel) return a; + return Base::Compare(type_length, a, b) ? a : b; + } + + static T Max(int type_length, const T& a, const T& b) { + if (a.ptr == kNoValueSentinel) return b; + if (b.ptr == kNoValueSentinel) return a; + return Base::Compare(type_length, a, b) ? b : a; + } +}; template struct CompareHelper @@ -412,7 +437,9 @@ optional> CleanStatistic(std::pair min_max, optional> CleanStatistic( std::pair min_max, LogicalType::Type::type) { - if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { + // Check for kNoValueSentinel (not nullptr) because nullptr is valid for empty strings. + // See: https://github.com/apache/arrow/issues/47995 + if (min_max.first.ptr == kNoValueSentinel || min_max.second.ptr == kNoValueSentinel) { return ::std::nullopt; } return min_max; diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..9ff6a2ae593 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1804,5 +1804,236 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EXPECT_FALSE(statistics->HasMinMax()); } +// GH-47995: Empty string minimum should be preserved after merge +TEST(TestByteArrayStatistics, MergeWithEmptyStringMin) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // Create statistics with empty string as min, "zzz" as max + auto stats1 = MakeStatistics(&descr); + std::vector values1 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")}; + stats1->Update(values1.data(), values1.size(), 0); + ASSERT_TRUE(stats1->HasMinMax()); + EXPECT_EQ(stats1->min(), ByteArray("")); + EXPECT_EQ(stats1->max(), ByteArray("zzz")); + + // Create statistics with "aaa" as min, "bbb" as max + auto stats2 = MakeStatistics(&descr); + std::vector values2 = {ByteArray("aaa"), ByteArray("bbb")}; + stats2->Update(values2.data(), values2.size(), 0); + ASSERT_TRUE(stats2->HasMinMax()); + EXPECT_EQ(stats2->min(), ByteArray("aaa")); + EXPECT_EQ(stats2->max(), ByteArray("bbb")); + + // Merge stats2 into stats1: min should still be "", max should be "zzz" + stats1->Merge(*stats2); + ASSERT_TRUE(stats1->HasMinMax()); + EXPECT_EQ(stats1->min(), ByteArray("")); + EXPECT_EQ(stats1->max(), ByteArray("zzz")); + + // Create fresh statistics and merge in opposite order + auto stats3 = MakeStatistics(&descr); + std::vector values3 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")}; + stats3->Update(values3.data(), values3.size(), 0); + + auto stats4 = MakeStatistics(&descr); + std::vector values4 = {ByteArray("aaa"), ByteArray("bbb")}; + stats4->Update(values4.data(), values4.size(), 0); + + // Merge stats3 into stats4: min should become "", max should become "zzz" + stats4->Merge(*stats3); + ASSERT_TRUE(stats4->HasMinMax()); + EXPECT_EQ(stats4->min(), ByteArray("")); + EXPECT_EQ(stats4->max(), ByteArray("zzz")); +} + +// GH-47995: Comprehensive tests for ByteArray statistics merge with empty strings. +// Tests all combinations of (no min/max) vs (has min/max) with min="" or min="aaa". +TEST(TestByteArrayStatistics, MergeEmptyStringComprehensive) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + auto make_empty_stats = [&]() { return MakeStatistics(&descr); }; + + auto make_stats_with_empty_min = [&]() { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray(""), ByteArray("zzz")}; + stats->Update(values.data(), values.size(), 0); + return stats; + }; + + auto make_stats_with_nonempty_min = [&]() { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray("aaa"), ByteArray("yyy")}; + stats->Update(values.data(), values.size(), 0); + return stats; + }; + + // 1. no min/max, merge (no min/max) -> no min/max + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_empty_stats()); + ASSERT_FALSE(stats->HasMinMax()) << "empty + empty = empty"; + } + + // 2. no min/max, merge (min="", max="zzz") -> min="", max="zzz" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_stats_with_empty_min()); + ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 3. no min/max, merge (min="aaa", max="yyy") -> min="aaa", max="yyy" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_stats_with_nonempty_min()); + ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } + + // 3b. no min/max, merge (min="", max="") -> min="", max="" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + auto other = MakeStatistics(&descr); + std::vector values = {ByteArray("")}; + other->Update(values.data(), values.size(), 0); + ASSERT_TRUE(other->HasMinMax()); + EXPECT_EQ(other->min(), ByteArray("")); + EXPECT_EQ(other->max(), ByteArray("")); + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()) << "empty + (min='', max='') = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + } + + // 3c. (min="", max=""), merge (no min/max) -> min="", max="" + { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray("")}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()) << "(min='', max='') + empty = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + } + + // 4. (min="", max="zzz"), merge (no min/max) -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + ASSERT_TRUE(stats->HasMinMax()); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()) << "has_minmax + empty = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 5. (min="", max="zzz"), merge (min="", max="yyy") -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + auto other = make_stats_with_empty_min(); + // Modify other to have different max for clarity + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 6. (min="", max="zzz"), merge (min="aaa", max="yyy") -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + stats->Merge(*make_stats_with_nonempty_min()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'"; + EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'"; + } + + // 7. (min="aaa", max="yyy"), merge (no min/max) -> min="aaa", max="yyy" + { + auto stats = make_stats_with_nonempty_min(); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } + + // 8. (min="aaa", max="yyy"), merge (min="", max="zzz") -> min="", max="zzz" + { + auto stats = make_stats_with_nonempty_min(); + stats->Merge(*make_stats_with_empty_min()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'"; + EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'"; + } + + // 9. (min="aaa", max="yyy"), merge (min="bbb", max="xxx") -> min="aaa", max="yyy" + { + auto stats = make_stats_with_nonempty_min(); + auto other = MakeStatistics(&descr); + std::vector values = {ByteArray("bbb"), ByteArray("xxx")}; + other->Update(values.data(), values.size(), 0); + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } +} + +// GH-47995: Statistics from encoded empty string min should be preserved after merge. +// This test reproduces the bug where empty string min (encoded as "") results in +// ByteArray with ptr=nullptr after internal Copy(), which was incorrectly rejected +// by CleanStatistic(). +TEST(TestByteArrayStatistics, MergeEncodedEmptyStringMin) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // Create statistics from encoded values (simulates reading from Parquet metadata) + // Empty string "" as min, "zzz" as max + std::string encoded_min = ""; // empty string + std::string encoded_max = "zzz"; + auto stats1 = Statistics::Make(&descr, encoded_min, encoded_max, + /*num_values=*/100, /*null_count=*/0, + /*distinct_count=*/0, /*has_min_max=*/true, + /*has_null_count=*/true, /*has_distinct_count=*/false); + auto typed_stats1 = std::dynamic_pointer_cast>(stats1); + ASSERT_TRUE(typed_stats1->HasMinMax()); + EXPECT_EQ(typed_stats1->min(), ByteArray("")); + EXPECT_EQ(typed_stats1->max(), ByteArray("zzz")); + + // Create second statistics with "aaa" as min, "bbb" as max + std::string encoded_min2 = "aaa"; + std::string encoded_max2 = "bbb"; + auto stats2 = Statistics::Make(&descr, encoded_min2, encoded_max2, + /*num_values=*/100, /*null_count=*/0, + /*distinct_count=*/0, /*has_min_max=*/true, + /*has_null_count=*/true, /*has_distinct_count=*/false); + auto typed_stats2 = std::dynamic_pointer_cast>(stats2); + ASSERT_TRUE(typed_stats2->HasMinMax()); + + // Create a fresh stats object and merge both + auto merged = MakeStatistics(&descr); + merged->Merge(*typed_stats1); + ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after first merge"; + EXPECT_EQ(merged->min(), ByteArray("")); + EXPECT_EQ(merged->max(), ByteArray("zzz")); + + merged->Merge(*typed_stats2); + ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after second merge"; + EXPECT_EQ(merged->min(), ByteArray("")); // empty string should still be min + EXPECT_EQ(merged->max(), ByteArray("zzz")); +} + } // namespace test } // namespace parquet From f415163e01bd0ca887ac2b67ddf14bd958e71378 Mon Sep 17 00:00:00 2001 From: Ruiyang Wang Date: Mon, 12 Jan 2026 18:20:02 -0800 Subject: [PATCH 2/3] Add tests for ByteArray{0, nullptr} empty string statistics Tests that empty strings represented as ByteArray{0, nullptr} are correctly handled in statistics when using the public WriteBatch API. - EmptyStringWithNullptrStats: verifies min is empty string, not skipped - AllEmptyStringsWithNullptrStats: verifies HasMinMax is true - EmptyStringWithNullptrInRawByteArray: unit test for Update/Merge --- cpp/src/parquet/column_writer_test.cc | 49 ++++++++++++++++++++++++ cpp/src/parquet/statistics_test.cc | 55 +++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index dedf25abcab..d194538a821 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -980,6 +980,55 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) { ASSERT_TRUE(this->metadata_is_stats_set()); } +// GH-47995: Test that empty strings represented as ByteArray{0, nullptr} are +// correctly handled in statistics when using WriteBatch API. +TEST_F(TestByteArrayValuesWriter, EmptyStringWithNullptrStats) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(); + + // Create values with empty string as nullptr (this can happen in practice + // when external code constructs ByteArray without using string literals) + ByteArray empty_with_nullptr{0, nullptr}; + ByteArray non_empty_aaa("aaa"); + ByteArray non_empty_zzz("zzz"); + + std::vector values = {empty_with_nullptr, non_empty_aaa, non_empty_zzz}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, values.data()); + writer->Close(); + + // Statistics should be set and capture the empty string as minimum + ASSERT_TRUE(this->metadata_is_stats_set()); + auto stats = this->metadata_stats(); + ASSERT_TRUE(stats->HasMinMax()) << "Statistics should have min/max"; + + auto typed_stats = std::dynamic_pointer_cast>(stats); + ASSERT_NE(typed_stats, nullptr); + EXPECT_EQ(typed_stats->min().len, 0) << "Min should be empty string (len=0)"; + EXPECT_EQ(typed_stats->max(), ByteArray("zzz")) << "Max should be 'zzz'"; +} + +// GH-47995: Test that statistics work when all values are empty strings with nullptr. +TEST_F(TestByteArrayValuesWriter, AllEmptyStringsWithNullptrStats) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(); + + // All values are empty strings with nullptr + ByteArray empty_with_nullptr{0, nullptr}; + std::vector values = {empty_with_nullptr, empty_with_nullptr, empty_with_nullptr}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, values.data()); + writer->Close(); + + // Statistics should be set even when all values are empty strings + ASSERT_TRUE(this->metadata_is_stats_set()); + auto stats = this->metadata_stats(); + ASSERT_TRUE(stats->HasMinMax()) << "Statistics should have min/max even with all empty strings"; + + auto typed_stats = std::dynamic_pointer_cast>(stats); + ASSERT_NE(typed_stats, nullptr); + EXPECT_EQ(typed_stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(typed_stats->max().len, 0) << "Max should be empty string"; +} + // Test for https://github.com/apache/arrow/issues/47027. // When writing a repeated column with page indexes enabled // and batches that are aligned with list boundaries, diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 9ff6a2ae593..16cb54bd2b9 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -2035,5 +2035,60 @@ TEST(TestByteArrayStatistics, MergeEncodedEmptyStringMin) { EXPECT_EQ(merged->max(), ByteArray("zzz")); } +// GH-47995: Empty strings represented as ByteArray{0, nullptr} should be handled correctly. +// This can happen when: +// - Arrow arrays have null values buffer (all empty strings) +// - External code passes ByteArray with nullptr for empty strings +// The fix uses a sentinel pointer (kNoValueSentinel) to distinguish "no value computed" +// from "empty string value with nullptr". +TEST(TestByteArrayStatistics, EmptyStringWithNullptrInRawByteArray) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // ByteArray with nullptr represents empty string from some code paths + ByteArray empty_with_nullptr{0, nullptr}; + ByteArray non_empty{"aaa"}; + + // Test 1: Update with mix of nullptr empty string and non-empty string + // The empty string should be recognized as the minimum + { + auto stats = MakeStatistics(&descr); + std::vector values = {empty_with_nullptr, non_empty}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()) << "Stats should have min/max"; + EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(stats->max(), ByteArray("aaa")) << "Max should be 'aaa'"; + } + + // Test 2: Update with only nullptr empty strings + // Should still produce valid statistics + { + auto stats = MakeStatistics(&descr); + std::vector values = {empty_with_nullptr, empty_with_nullptr}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()) << "Stats should have min/max even with all empty strings"; + EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(stats->max().len, 0) << "Max should be empty string"; + } + + // Test 3: Merge stats with nullptr empty string min into stats with non-empty min + // The empty string should become the new minimum + { + auto stats1 = MakeStatistics(&descr); + std::vector values1 = {empty_with_nullptr}; + stats1->Update(values1.data(), values1.size(), 0); + + auto stats2 = MakeStatistics(&descr); + std::vector values2 = {ByteArray("zzz")}; + stats2->Update(values2.data(), values2.size(), 0); + + stats2->Merge(*stats1); + ASSERT_TRUE(stats2->HasMinMax()) << "Stats should have min/max after merge"; + EXPECT_EQ(stats2->min().len, 0) << "Min should be empty string after merge"; + EXPECT_EQ(stats2->max(), ByteArray("zzz")) << "Max should be 'zzz'"; + } +} + } // namespace test } // namespace parquet From 3b3bae08537924f1c3cb57e4da012f4bb1a21c67 Mon Sep 17 00:00:00 2001 From: Ruiyang Wang Date: Mon, 12 Jan 2026 22:42:20 -0800 Subject: [PATCH 3/3] Fix tests: use valid empty strings to avoid encoder UB The tests now use ByteArray("") which has a valid non-null pointer, avoiding undefined behavior in PlainEncoder when encoding nullptr. Also fixes clang-format issues with long assertion lines. --- cpp/src/parquet/column_writer_test.cc | 31 ++++++++++++++------------- cpp/src/parquet/statistics_test.cc | 28 +++++++++++------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index d194538a821..92c91f17552 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -980,20 +980,19 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) { ASSERT_TRUE(this->metadata_is_stats_set()); } -// GH-47995: Test that empty strings represented as ByteArray{0, nullptr} are -// correctly handled in statistics when using WriteBatch API. -TEST_F(TestByteArrayValuesWriter, EmptyStringWithNullptrStats) { +// GH-47995: Test that empty strings are correctly handled in statistics. +TEST_F(TestByteArrayValuesWriter, EmptyStringStats) { this->SetUpSchema(Repetition::REQUIRED); auto writer = this->BuildWriter(); - // Create values with empty string as nullptr (this can happen in practice - // when external code constructs ByteArray without using string literals) - ByteArray empty_with_nullptr{0, nullptr}; + // Empty string with valid pointer (from string literal) + ByteArray empty_string(""); ByteArray non_empty_aaa("aaa"); ByteArray non_empty_zzz("zzz"); - std::vector values = {empty_with_nullptr, non_empty_aaa, non_empty_zzz}; - writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, values.data()); + std::vector values = {empty_string, non_empty_aaa, non_empty_zzz}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, + values.data()); writer->Close(); // Statistics should be set and capture the empty string as minimum @@ -1007,21 +1006,23 @@ TEST_F(TestByteArrayValuesWriter, EmptyStringWithNullptrStats) { EXPECT_EQ(typed_stats->max(), ByteArray("zzz")) << "Max should be 'zzz'"; } -// GH-47995: Test that statistics work when all values are empty strings with nullptr. -TEST_F(TestByteArrayValuesWriter, AllEmptyStringsWithNullptrStats) { +// GH-47995: Test that statistics work when all values are empty strings. +TEST_F(TestByteArrayValuesWriter, AllEmptyStringsStats) { this->SetUpSchema(Repetition::REQUIRED); auto writer = this->BuildWriter(); - // All values are empty strings with nullptr - ByteArray empty_with_nullptr{0, nullptr}; - std::vector values = {empty_with_nullptr, empty_with_nullptr, empty_with_nullptr}; - writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, values.data()); + // All values are empty strings + ByteArray empty_string(""); + std::vector values = {empty_string, empty_string, empty_string}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, + values.data()); writer->Close(); // Statistics should be set even when all values are empty strings ASSERT_TRUE(this->metadata_is_stats_set()); auto stats = this->metadata_stats(); - ASSERT_TRUE(stats->HasMinMax()) << "Statistics should have min/max even with all empty strings"; + ASSERT_TRUE(stats->HasMinMax()) + << "Statistics should have min/max even with all empty strings"; auto typed_stats = std::dynamic_pointer_cast>(stats); ASSERT_NE(typed_stats, nullptr); diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 16cb54bd2b9..3f985d09ad7 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -2035,48 +2035,46 @@ TEST(TestByteArrayStatistics, MergeEncodedEmptyStringMin) { EXPECT_EQ(merged->max(), ByteArray("zzz")); } -// GH-47995: Empty strings represented as ByteArray{0, nullptr} should be handled correctly. -// This can happen when: -// - Arrow arrays have null values buffer (all empty strings) -// - External code passes ByteArray with nullptr for empty strings +// GH-47995: Empty strings should be handled correctly in statistics. // The fix uses a sentinel pointer (kNoValueSentinel) to distinguish "no value computed" -// from "empty string value with nullptr". -TEST(TestByteArrayStatistics, EmptyStringWithNullptrInRawByteArray) { +// from "empty string value". +TEST(TestByteArrayStatistics, EmptyStringInRawByteArray) { NodePtr node = PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); ColumnDescriptor descr(node, 1, 1); - // ByteArray with nullptr represents empty string from some code paths - ByteArray empty_with_nullptr{0, nullptr}; + // Empty string with valid pointer + ByteArray empty_string(""); ByteArray non_empty{"aaa"}; - // Test 1: Update with mix of nullptr empty string and non-empty string + // Test 1: Update with mix of empty string and non-empty string // The empty string should be recognized as the minimum { auto stats = MakeStatistics(&descr); - std::vector values = {empty_with_nullptr, non_empty}; + std::vector values = {empty_string, non_empty}; stats->Update(values.data(), values.size(), 0); ASSERT_TRUE(stats->HasMinMax()) << "Stats should have min/max"; EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; EXPECT_EQ(stats->max(), ByteArray("aaa")) << "Max should be 'aaa'"; } - // Test 2: Update with only nullptr empty strings + // Test 2: Update with only empty strings // Should still produce valid statistics { auto stats = MakeStatistics(&descr); - std::vector values = {empty_with_nullptr, empty_with_nullptr}; + std::vector values = {empty_string, empty_string}; stats->Update(values.data(), values.size(), 0); - ASSERT_TRUE(stats->HasMinMax()) << "Stats should have min/max even with all empty strings"; + ASSERT_TRUE(stats->HasMinMax()) + << "Stats should have min/max even with all empty strings"; EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; EXPECT_EQ(stats->max().len, 0) << "Max should be empty string"; } - // Test 3: Merge stats with nullptr empty string min into stats with non-empty min + // Test 3: Merge stats with empty string min into stats with non-empty min // The empty string should become the new minimum { auto stats1 = MakeStatistics(&descr); - std::vector values1 = {empty_with_nullptr}; + std::vector values1 = {empty_string}; stats1->Update(values1.data(), values1.size(), 0); auto stats2 = MakeStatistics(&descr);