Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1038,7 +1038,16 @@ void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {

template <>
void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we simply reuse DecodeDict here?

PARQUET_THROW_NOT_OK(dictionary_->Resize(
static_cast<int64_t>(dictionary_length_) * sizeof(bool), false));
if (dictionary->Decode(dictionary_->mutable_data_as<bool>(), dictionary_length_) !=
dictionary_length_) {
Comment on lines +1044 to +1045
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this test? I see no corresponding check in DecodeDict.

throw ParquetException(
"Boolean dictionary decode produced fewer values than the dictionary header "
"declared (expected ",
dictionary_length_, ")");
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we add got values here?

}
}

template <>
Expand Down Expand Up @@ -1257,6 +1266,31 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
}

// Dictionary decoder for boolean column data. Decodes PLAIN_DICTIONARY
// and RLE_DICTIONARY pages into the callers byte buffer.
class DictBooleanDecoderImpl : public DictDecoderImpl<BooleanType>,
virtual public BooleanDecoder {
public:
using BASE = DictDecoderImpl<BooleanType>;
using BASE::BASE;
using BASE::Decode;

int Decode(uint8_t* buffer, int max_values) override {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about DecodeArrow?

max_values = std::min(max_values, this->num_values_);
const auto* dict = dictionary_->data_as<bool>();
for (int i = 0; i < max_values; ++i) {
int32_t index;
if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't call GetBatch and batching sets?

ParquetException::EofException();
}
PARQUET_THROW_NOT_OK(IndexInBounds(index));
::arrow::bit_util::SetBitTo(buffer, i, dict[index]);
}
this->num_values_ -= max_values;
return max_values;
}
};

class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType> {
public:
using BASE = DictDecoderImpl<ByteArrayType>;
Expand Down Expand Up @@ -2459,7 +2493,7 @@ std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
MemoryPool* pool) {
switch (type_num) {
case Type::BOOLEAN:
ParquetException::NYI("Dictionary encoding not implemented for boolean type");
return std::make_unique<DictBooleanDecoderImpl>(descr, pool);
case Type::INT32:
return std::make_unique<DictDecoderImpl<Int32Type>>(descr, pool);
case Type::INT64:
Expand Down
27 changes: 25 additions & 2 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -462,8 +462,31 @@ TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) {
ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2));
}

TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) {
ASSERT_THROW(MakeDictDecoder<BooleanType>(nullptr), ParquetException);
// Decode a dictionary encoded boolean page and ensure decoding works.
TEST(TestDictionaryEncoding, DictDecodesBoolean) {
const uint8_t dict_bytes[] = {0x02};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you comment on what those bytes mean?

auto dict_plain_decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
dict_plain_decoder->SetData(2, dict_bytes, 1);

auto decoder = MakeDictDecoder<BooleanType>();
decoder->SetDict(dict_plain_decoder.get());

const uint8_t indices[] = {0x01, 0x03, 0x35};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: what do those bytes mean?

decoder->SetData(8, indices, sizeof(indices));

bool out_bool[8] = {};
ASSERT_EQ(8, decoder->Decode(out_bool, 8));
const bool expected[8] = {true, false, true, false, true, true, false, false};
Comment on lines +475 to +479
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of hard-coding 8 everywhere, can we make it a constant? e.g. constexpr int kNumValues = 8

for (int i = 0; i < 8; ++i) {
EXPECT_EQ(expected[i], out_bool[i]) << " at index " << i;
}

decoder->SetData(8, indices, sizeof(indices));
uint8_t out_packed = 0;
auto* bool_dec = dynamic_cast<BooleanDecoder*>(decoder.get());
ASSERT_NE(bool_dec, nullptr);
ASSERT_EQ(8, bool_dec->Decode(&out_packed, 8));
EXPECT_EQ(0x35, out_packed);
}

// ----------------------------------------------------------------------
Expand Down
Loading