-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-49957 [C++][Parquet] Support reading dictionary encoded boolean pages #49914
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1038,7 +1038,16 @@ void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) { | |
|
|
||
| template <> | ||
| void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) { | ||
| ParquetException::NYI("Dictionary encoding is not implemented for boolean values"); | ||
| dictionary_length_ = static_cast<int32_t>(dictionary->values_left()); | ||
| PARQUET_THROW_NOT_OK(dictionary_->Resize( | ||
| static_cast<int64_t>(dictionary_length_) * sizeof(bool), false)); | ||
| if (dictionary->Decode(dictionary_->mutable_data_as<bool>(), dictionary_length_) != | ||
| dictionary_length_) { | ||
|
Comment on lines
+1044
to
+1045
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this test? I see no corresponding check in |
||
| throw ParquetException( | ||
| "Boolean dictionary decode produced fewer values than the dictionary header " | ||
| "declared (expected ", | ||
| dictionary_length_, ")"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we add got values here? |
||
| } | ||
| } | ||
|
|
||
| template <> | ||
|
|
@@ -1257,6 +1266,31 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui | |
| PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); | ||
| } | ||
|
|
||
| // Dictionary decoder for boolean column data. Decodes PLAIN_DICTIONARY | ||
| // and RLE_DICTIONARY pages into the callers byte buffer. | ||
| class DictBooleanDecoderImpl : public DictDecoderImpl<BooleanType>, | ||
| virtual public BooleanDecoder { | ||
| public: | ||
| using BASE = DictDecoderImpl<BooleanType>; | ||
| using BASE::BASE; | ||
| using BASE::Decode; | ||
|
|
||
| int Decode(uint8_t* buffer, int max_values) override { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about DecodeArrow? |
||
| max_values = std::min(max_values, this->num_values_); | ||
| const auto* dict = dictionary_->data_as<bool>(); | ||
| for (int i = 0; i < max_values; ++i) { | ||
| int32_t index; | ||
| if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't call GetBatch and batching sets? |
||
| ParquetException::EofException(); | ||
| } | ||
| PARQUET_THROW_NOT_OK(IndexInBounds(index)); | ||
| ::arrow::bit_util::SetBitTo(buffer, i, dict[index]); | ||
| } | ||
| this->num_values_ -= max_values; | ||
| return max_values; | ||
| } | ||
| }; | ||
|
|
||
| class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType> { | ||
| public: | ||
| using BASE = DictDecoderImpl<ByteArrayType>; | ||
|
|
@@ -2459,7 +2493,7 @@ std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num, | |
| MemoryPool* pool) { | ||
| switch (type_num) { | ||
| case Type::BOOLEAN: | ||
| ParquetException::NYI("Dictionary encoding not implemented for boolean type"); | ||
| return std::make_unique<DictBooleanDecoderImpl>(descr, pool); | ||
| case Type::INT32: | ||
| return std::make_unique<DictDecoderImpl<Int32Type>>(descr, pool); | ||
| case Type::INT64: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -462,8 +462,31 @@ TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) { | |
| ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2)); | ||
| } | ||
|
|
||
| TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { | ||
| ASSERT_THROW(MakeDictDecoder<BooleanType>(nullptr), ParquetException); | ||
| // Decode a dictionary encoded boolean page and ensure decoding works. | ||
| TEST(TestDictionaryEncoding, DictDecodesBoolean) { | ||
| const uint8_t dict_bytes[] = {0x02}; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you comment on what those bytes mean? |
||
| auto dict_plain_decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN); | ||
| dict_plain_decoder->SetData(2, dict_bytes, 1); | ||
|
|
||
| auto decoder = MakeDictDecoder<BooleanType>(); | ||
| decoder->SetDict(dict_plain_decoder.get()); | ||
|
|
||
| const uint8_t indices[] = {0x01, 0x03, 0x35}; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here: what do those bytes mean? |
||
| decoder->SetData(8, indices, sizeof(indices)); | ||
|
|
||
| bool out_bool[8] = {}; | ||
| ASSERT_EQ(8, decoder->Decode(out_bool, 8)); | ||
| const bool expected[8] = {true, false, true, false, true, true, false, false}; | ||
|
Comment on lines
+475
to
+479
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of hard-coding 8 everywhere, can we make it a constant? e.g. |
||
| for (int i = 0; i < 8; ++i) { | ||
| EXPECT_EQ(expected[i], out_bool[i]) << " at index " << i; | ||
| } | ||
|
|
||
| decoder->SetData(8, indices, sizeof(indices)); | ||
| uint8_t out_packed = 0; | ||
| auto* bool_dec = dynamic_cast<BooleanDecoder*>(decoder.get()); | ||
| ASSERT_NE(bool_dec, nullptr); | ||
| ASSERT_EQ(8, bool_dec->Decode(&out_packed, 8)); | ||
| EXPECT_EQ(0x35, out_packed); | ||
| } | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why can't we simply reuse
DecodeDicthere?