diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index dedf25abcab..92c91f17552 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -980,6 +980,56 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) { ASSERT_TRUE(this->metadata_is_stats_set()); } +// GH-47995: Test that empty strings are correctly handled in statistics. +TEST_F(TestByteArrayValuesWriter, EmptyStringStats) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(); + + // Empty string with valid pointer (from string literal) + ByteArray empty_string(""); + ByteArray non_empty_aaa("aaa"); + ByteArray non_empty_zzz("zzz"); + + std::vector values = {empty_string, non_empty_aaa, non_empty_zzz}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, + values.data()); + writer->Close(); + + // Statistics should be set and capture the empty string as minimum + ASSERT_TRUE(this->metadata_is_stats_set()); + auto stats = this->metadata_stats(); + ASSERT_TRUE(stats->HasMinMax()) << "Statistics should have min/max"; + + auto typed_stats = std::dynamic_pointer_cast>(stats); + ASSERT_NE(typed_stats, nullptr); + EXPECT_EQ(typed_stats->min().len, 0) << "Min should be empty string (len=0)"; + EXPECT_EQ(typed_stats->max(), ByteArray("zzz")) << "Max should be 'zzz'"; +} + +// GH-47995: Test that statistics work when all values are empty strings. +TEST_F(TestByteArrayValuesWriter, AllEmptyStringsStats) { + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(); + + // All values are empty strings + ByteArray empty_string(""); + std::vector values = {empty_string, empty_string, empty_string}; + writer->WriteBatch(static_cast(values.size()), nullptr, nullptr, + values.data()); + writer->Close(); + + // Statistics should be set even when all values are empty strings + ASSERT_TRUE(this->metadata_is_stats_set()); + auto stats = this->metadata_stats(); + ASSERT_TRUE(stats->HasMinMax()) + << "Statistics should have min/max even with all empty strings"; + + auto typed_stats = std::dynamic_pointer_cast>(stats); + ASSERT_NE(typed_stats, nullptr); + EXPECT_EQ(typed_stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(typed_stats->max().len, 0) << "Max should be empty string"; +} + // Test for https://github.com/apache/arrow/issues/47027. // When writing a repeated column with page indexes enabled // and batches that are aligned with list boundaries, diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 2e5f6fe37c4..5736c1d032a 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -55,6 +55,12 @@ namespace { constexpr int value_length(int value_length, const ByteArray& value) { return value.len; } constexpr int value_length(int type_length, const FLBA& value) { return type_length; } +// Sentinel pointer to mark "no value" for ByteArray statistics. +// Distinct from nullptr, which is valid for empty strings. +// See: https://github.com/apache/arrow/issues/47995 +inline constexpr uint8_t kNoValueSentinelBytes[1] = {0}; +inline constexpr const uint8_t* kNoValueSentinel = kNoValueSentinelBytes; + // Static "constants" for normalizing float16 min/max values. These need to be expressed // as pointers because `Float16LogicalType` represents an FLBA. struct Float16Constants { @@ -290,7 +296,26 @@ struct BinaryLikeCompareHelperBase { template struct CompareHelper - : public BinaryLikeCompareHelperBase {}; + : public BinaryLikeCompareHelperBase { + using Base = BinaryLikeCompareHelperBase; + using T = ByteArray; + + // Use kNoValueSentinel instead of nullptr to distinguish "no value" from empty string. + static T DefaultMin() { return T{0, kNoValueSentinel}; } + static T DefaultMax() { return T{0, kNoValueSentinel}; } + + static T Min(int type_length, const T& a, const T& b) { + if (a.ptr == kNoValueSentinel) return b; + if (b.ptr == kNoValueSentinel) return a; + return Base::Compare(type_length, a, b) ? a : b; + } + + static T Max(int type_length, const T& a, const T& b) { + if (a.ptr == kNoValueSentinel) return b; + if (b.ptr == kNoValueSentinel) return a; + return Base::Compare(type_length, a, b) ? b : a; + } +}; template struct CompareHelper @@ -412,7 +437,9 @@ optional> CleanStatistic(std::pair min_max, optional> CleanStatistic( std::pair min_max, LogicalType::Type::type) { - if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { + // Check for kNoValueSentinel (not nullptr) because nullptr is valid for empty strings. + // See: https://github.com/apache/arrow/issues/47995 + if (min_max.first.ptr == kNoValueSentinel || min_max.second.ptr == kNoValueSentinel) { return ::std::nullopt; } return min_max; diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..3f985d09ad7 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1804,5 +1804,289 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EXPECT_FALSE(statistics->HasMinMax()); } +// GH-47995: Empty string minimum should be preserved after merge +TEST(TestByteArrayStatistics, MergeWithEmptyStringMin) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // Create statistics with empty string as min, "zzz" as max + auto stats1 = MakeStatistics(&descr); + std::vector values1 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")}; + stats1->Update(values1.data(), values1.size(), 0); + ASSERT_TRUE(stats1->HasMinMax()); + EXPECT_EQ(stats1->min(), ByteArray("")); + EXPECT_EQ(stats1->max(), ByteArray("zzz")); + + // Create statistics with "aaa" as min, "bbb" as max + auto stats2 = MakeStatistics(&descr); + std::vector values2 = {ByteArray("aaa"), ByteArray("bbb")}; + stats2->Update(values2.data(), values2.size(), 0); + ASSERT_TRUE(stats2->HasMinMax()); + EXPECT_EQ(stats2->min(), ByteArray("aaa")); + EXPECT_EQ(stats2->max(), ByteArray("bbb")); + + // Merge stats2 into stats1: min should still be "", max should be "zzz" + stats1->Merge(*stats2); + ASSERT_TRUE(stats1->HasMinMax()); + EXPECT_EQ(stats1->min(), ByteArray("")); + EXPECT_EQ(stats1->max(), ByteArray("zzz")); + + // Create fresh statistics and merge in opposite order + auto stats3 = MakeStatistics(&descr); + std::vector values3 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")}; + stats3->Update(values3.data(), values3.size(), 0); + + auto stats4 = MakeStatistics(&descr); + std::vector values4 = {ByteArray("aaa"), ByteArray("bbb")}; + stats4->Update(values4.data(), values4.size(), 0); + + // Merge stats3 into stats4: min should become "", max should become "zzz" + stats4->Merge(*stats3); + ASSERT_TRUE(stats4->HasMinMax()); + EXPECT_EQ(stats4->min(), ByteArray("")); + EXPECT_EQ(stats4->max(), ByteArray("zzz")); +} + +// GH-47995: Comprehensive tests for ByteArray statistics merge with empty strings. +// Tests all combinations of (no min/max) vs (has min/max) with min="" or min="aaa". +TEST(TestByteArrayStatistics, MergeEmptyStringComprehensive) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + auto make_empty_stats = [&]() { return MakeStatistics(&descr); }; + + auto make_stats_with_empty_min = [&]() { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray(""), ByteArray("zzz")}; + stats->Update(values.data(), values.size(), 0); + return stats; + }; + + auto make_stats_with_nonempty_min = [&]() { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray("aaa"), ByteArray("yyy")}; + stats->Update(values.data(), values.size(), 0); + return stats; + }; + + // 1. no min/max, merge (no min/max) -> no min/max + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_empty_stats()); + ASSERT_FALSE(stats->HasMinMax()) << "empty + empty = empty"; + } + + // 2. no min/max, merge (min="", max="zzz") -> min="", max="zzz" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_stats_with_empty_min()); + ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 3. no min/max, merge (min="aaa", max="yyy") -> min="aaa", max="yyy" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + stats->Merge(*make_stats_with_nonempty_min()); + ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } + + // 3b. no min/max, merge (min="", max="") -> min="", max="" + { + auto stats = make_empty_stats(); + ASSERT_FALSE(stats->HasMinMax()); + auto other = MakeStatistics(&descr); + std::vector values = {ByteArray("")}; + other->Update(values.data(), values.size(), 0); + ASSERT_TRUE(other->HasMinMax()); + EXPECT_EQ(other->min(), ByteArray("")); + EXPECT_EQ(other->max(), ByteArray("")); + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()) << "empty + (min='', max='') = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + } + + // 3c. (min="", max=""), merge (no min/max) -> min="", max="" + { + auto stats = MakeStatistics(&descr); + std::vector values = {ByteArray("")}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()) << "(min='', max='') + empty = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("")); + } + + // 4. (min="", max="zzz"), merge (no min/max) -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + ASSERT_TRUE(stats->HasMinMax()); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()) << "has_minmax + empty = has_minmax"; + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 5. (min="", max="zzz"), merge (min="", max="yyy") -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + auto other = make_stats_with_empty_min(); + // Modify other to have different max for clarity + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")); + EXPECT_EQ(stats->max(), ByteArray("zzz")); + } + + // 6. (min="", max="zzz"), merge (min="aaa", max="yyy") -> min="", max="zzz" + { + auto stats = make_stats_with_empty_min(); + stats->Merge(*make_stats_with_nonempty_min()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'"; + EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'"; + } + + // 7. (min="aaa", max="yyy"), merge (no min/max) -> min="aaa", max="yyy" + { + auto stats = make_stats_with_nonempty_min(); + stats->Merge(*make_empty_stats()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } + + // 8. (min="aaa", max="yyy"), merge (min="", max="zzz") -> min="", max="zzz" + { + auto stats = make_stats_with_nonempty_min(); + stats->Merge(*make_stats_with_empty_min()); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'"; + EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'"; + } + + // 9. (min="aaa", max="yyy"), merge (min="bbb", max="xxx") -> min="aaa", max="yyy" + { + auto stats = make_stats_with_nonempty_min(); + auto other = MakeStatistics(&descr); + std::vector values = {ByteArray("bbb"), ByteArray("xxx")}; + other->Update(values.data(), values.size(), 0); + stats->Merge(*other); + ASSERT_TRUE(stats->HasMinMax()); + EXPECT_EQ(stats->min(), ByteArray("aaa")); + EXPECT_EQ(stats->max(), ByteArray("yyy")); + } +} + +// GH-47995: Statistics from encoded empty string min should be preserved after merge. +// This test reproduces the bug where empty string min (encoded as "") results in +// ByteArray with ptr=nullptr after internal Copy(), which was incorrectly rejected +// by CleanStatistic(). +TEST(TestByteArrayStatistics, MergeEncodedEmptyStringMin) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // Create statistics from encoded values (simulates reading from Parquet metadata) + // Empty string "" as min, "zzz" as max + std::string encoded_min = ""; // empty string + std::string encoded_max = "zzz"; + auto stats1 = Statistics::Make(&descr, encoded_min, encoded_max, + /*num_values=*/100, /*null_count=*/0, + /*distinct_count=*/0, /*has_min_max=*/true, + /*has_null_count=*/true, /*has_distinct_count=*/false); + auto typed_stats1 = std::dynamic_pointer_cast>(stats1); + ASSERT_TRUE(typed_stats1->HasMinMax()); + EXPECT_EQ(typed_stats1->min(), ByteArray("")); + EXPECT_EQ(typed_stats1->max(), ByteArray("zzz")); + + // Create second statistics with "aaa" as min, "bbb" as max + std::string encoded_min2 = "aaa"; + std::string encoded_max2 = "bbb"; + auto stats2 = Statistics::Make(&descr, encoded_min2, encoded_max2, + /*num_values=*/100, /*null_count=*/0, + /*distinct_count=*/0, /*has_min_max=*/true, + /*has_null_count=*/true, /*has_distinct_count=*/false); + auto typed_stats2 = std::dynamic_pointer_cast>(stats2); + ASSERT_TRUE(typed_stats2->HasMinMax()); + + // Create a fresh stats object and merge both + auto merged = MakeStatistics(&descr); + merged->Merge(*typed_stats1); + ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after first merge"; + EXPECT_EQ(merged->min(), ByteArray("")); + EXPECT_EQ(merged->max(), ByteArray("zzz")); + + merged->Merge(*typed_stats2); + ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after second merge"; + EXPECT_EQ(merged->min(), ByteArray("")); // empty string should still be min + EXPECT_EQ(merged->max(), ByteArray("zzz")); +} + +// GH-47995: Empty strings should be handled correctly in statistics. +// The fix uses a sentinel pointer (kNoValueSentinel) to distinguish "no value computed" +// from "empty string value". +TEST(TestByteArrayStatistics, EmptyStringInRawByteArray) { + NodePtr node = + PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY); + ColumnDescriptor descr(node, 1, 1); + + // Empty string with valid pointer + ByteArray empty_string(""); + ByteArray non_empty{"aaa"}; + + // Test 1: Update with mix of empty string and non-empty string + // The empty string should be recognized as the minimum + { + auto stats = MakeStatistics(&descr); + std::vector values = {empty_string, non_empty}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()) << "Stats should have min/max"; + EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(stats->max(), ByteArray("aaa")) << "Max should be 'aaa'"; + } + + // Test 2: Update with only empty strings + // Should still produce valid statistics + { + auto stats = MakeStatistics(&descr); + std::vector values = {empty_string, empty_string}; + stats->Update(values.data(), values.size(), 0); + ASSERT_TRUE(stats->HasMinMax()) + << "Stats should have min/max even with all empty strings"; + EXPECT_EQ(stats->min().len, 0) << "Min should be empty string"; + EXPECT_EQ(stats->max().len, 0) << "Max should be empty string"; + } + + // Test 3: Merge stats with empty string min into stats with non-empty min + // The empty string should become the new minimum + { + auto stats1 = MakeStatistics(&descr); + std::vector values1 = {empty_string}; + stats1->Update(values1.data(), values1.size(), 0); + + auto stats2 = MakeStatistics(&descr); + std::vector values2 = {ByteArray("zzz")}; + stats2->Update(values2.data(), values2.size(), 0); + + stats2->Merge(*stats1); + ASSERT_TRUE(stats2->HasMinMax()) << "Stats should have min/max after merge"; + EXPECT_EQ(stats2->min().len, 0) << "Min should be empty string after merge"; + EXPECT_EQ(stats2->max(), ByteArray("zzz")) << "Max should be 'zzz'"; + } +} + } // namespace test } // namespace parquet