Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions src/DataTypes/NestedUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
#include <Common/typeid_cast.h>

#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNested.h>

#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnConst.h>

Expand Down Expand Up @@ -121,6 +124,76 @@ std::string extractTableName(const std::string & nested_name)
}


/// Convert a `Nullable(Tuple(...))` column into a plain `Tuple(...)` column by pushing the
/// struct-level null map down into the tuple elements.
///
///  - A column whose type is not `Nullable(Tuple(...))` is returned unchanged.
///  - If the null map has no set bytes (or the tuple has no elements), the `Nullable`
///    wrapper is simply stripped and the nested column is returned as is.
///  - Otherwise each element is processed independently:
///      * an element that is already Nullable gets its null map OR-ed with the struct null map;
///      * an element whose type can be inside Nullable is wrapped into Nullable that shares
///        the struct null map;
///      * any other element (e.g. Array, Map) is rebuilt with default values at the rows
///        where the struct is null.
///
/// The returned column keeps the name of the input column. Element names are preserved
/// when the source tuple has explicit names.
ColumnWithTypeAndName unwrapNullableTuple(const ColumnWithTypeAndName & column)
{
    const auto * type_nullable = typeid_cast<const DataTypeNullable *>(column.type.get());
    if (!type_nullable)
        return column;

    const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type_nullable->getNestedType().get());
    if (!tuple_type)
        return column;

    const auto & col_nullable = assert_cast<const ColumnNullable &>(*column.column);
    const auto & struct_null_map = col_nullable.getNullMapData();
    const auto & element_types = tuple_type->getElements();

    const bool has_nulls = !memoryIsZero(struct_null_map.data(), 0, struct_null_map.size());

    /// Nothing to propagate when there are no nulls. An empty tuple has no elements that could
    /// carry the nulls either, so the wrapper is stripped as well instead of trying to assemble
    /// a ColumnTuple from an empty list of element columns.
    /// NOTE(review): presumably `ColumnTuple::create(Columns)` rejects an empty list — confirm.
    if (!has_nulls || element_types.empty())
        return {col_nullable.getNestedColumnPtr(), type_nullable->getNestedType(), column.name};

    /// Propagate the struct null map to each Tuple element.
    const auto & inner_tuple = assert_cast<const ColumnTuple &>(col_nullable.getNestedColumn());
    const auto & null_map_ptr = col_nullable.getNullMapColumnPtr();

    Columns new_elements;
    DataTypes new_types;
    new_elements.reserve(element_types.size());
    new_types.reserve(element_types.size());

    for (size_t i = 0; i < element_types.size(); ++i)
    {
        const auto & elem_col = inner_tuple.getColumnPtr(i);
        const auto & elem_type = element_types[i];

        if (elem_type->isNullable())
        {
            /// Element already Nullable — merge null maps (struct null OR element null).
            const auto & existing = assert_cast<const ColumnNullable &>(*elem_col);
            const auto & element_null_map = existing.getNullMapData();

            auto merged = ColumnUInt8::create(struct_null_map.size());
            auto & merged_data = merged->getData();
            for (size_t row = 0; row < struct_null_map.size(); ++row)
                merged_data[row] = struct_null_map[row] | element_null_map[row];

            new_elements.push_back(ColumnNullable::create(existing.getNestedColumnPtr(), std::move(merged)));
            new_types.push_back(elem_type);
        }
        else if (elem_type->canBeInsideNullable())
        {
            /// The element can simply share the struct null map.
            new_elements.push_back(ColumnNullable::create(elem_col, null_map_ptr));
            new_types.push_back(std::make_shared<DataTypeNullable>(elem_type));
        }
        else
        {
            /// Array, Map, etc. — replace values at null positions with type defaults.
            const size_t rows = elem_col->size();
            auto with_defaults = elem_col->cloneEmpty();
            with_defaults->reserve(rows);
            for (size_t row = 0; row < rows; ++row)
            {
                if (struct_null_map[row])
                    with_defaults->insertDefault();
                else
                    with_defaults->insertFrom(*elem_col, row);
            }
            new_elements.push_back(std::move(with_defaults));
            new_types.push_back(elem_type);
        }
    }

    auto result_type = tuple_type->hasExplicitNames()
        ? std::make_shared<DataTypeTuple>(std::move(new_types), tuple_type->getElementNames())
        : std::make_shared<DataTypeTuple>(std::move(new_types));
    return {ColumnTuple::create(std::move(new_elements)), result_type, column.name};
}

static Block flattenImpl(const Block & block, bool flatten_named_tuple)
{
Block res;
Expand Down
7 changes: 7 additions & 0 deletions src/DataTypes/NestedUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ namespace Nested
/// Convert old-style nested (single arrays with same prefix, `n.a`, `n.b`...) to subcolumns of data type Nested.
NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types);

/// Unwrap Nullable(Tuple(...)) into Tuple(...) by propagating the struct-level null map
/// to each element. Scalar elements become Nullable(T), already-Nullable elements get merged
/// null maps, and non-nullable-compatible elements (Array, Map) get defaults at null positions.
/// When there are no actual nulls, simply strips the Nullable wrapper.
/// A column whose type is not Nullable(Tuple(...)) is returned unchanged.
/// The returned column keeps the name of the input column.
/// Used by format readers (Arrow, ORC) to convert Nullable struct elements for Nested flattening.
ColumnWithTypeAndName unwrapNullableTuple(const ColumnWithTypeAndName & column);

/// Check that sizes of arrays - elements of nested data structures - are equal.
void validateArraySizes(const Block & block);

Expand Down
26 changes: 26 additions & 0 deletions src/Formats/insertNullAsDefaultIfNeeded.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,27 @@ bool insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const Col
return true;
}

/// When both input and header are Nullable, unwrap and recurse into the nested types.
/// This can handle cases such as e.g. Nullable(Tuple(Nullable(Int32), String)) vs Nullable(Tuple(UInt32, String))
if (input_column.type->isNullable() && header_column.type->isNullable())
{
ColumnWithTypeAndName nested_input;
nested_input.column = assert_cast<const ColumnNullable *>(input_column.column.get())->getNestedColumnPtr();
nested_input.type = removeNullable(input_column.type);

ColumnWithTypeAndName nested_header;
nested_header.column = assert_cast<const ColumnNullable *>(header_column.column.get())->getNestedColumnPtr();
nested_header.type = removeNullable(header_column.type);

if (!insertNullAsDefaultIfNeeded(nested_input, nested_header, 0, nullptr))
return false;

input_column.column = ColumnNullable::create(
nested_input.column, assert_cast<const ColumnNullable *>(input_column.column.get())->getNullMapColumnPtr());
input_column.type = std::make_shared<DataTypeNullable>(std::move(nested_input.type));
return true;
}

if (!isNullableOrLowCardinalityNullable(input_column.type) || isNullableOrLowCardinalityNullable(header_column.type))
return false;

Expand All @@ -118,6 +139,11 @@ bool insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const Col
input_column.type = std::make_shared<DataTypeLowCardinality>(removeNullable(lc_type->getDictionaryType()));
}

/// After stripping the outer Nullable, the inner type may also need processing.
/// For example, Nullable(Tuple(Nullable(Int), String)) -> Tuple(Nullable(Int), String)
/// still needs the Tuple elements compared against the header to strip inner Nullable.
insertNullAsDefaultIfNeeded(input_column, header_column, column_i, block_missing_values);

return true;
}

Expand Down
28 changes: 22 additions & 6 deletions src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,18 @@ static ColumnWithTypeAndName readColumnWithNumericData(const std::shared_ptr<arr
std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
const auto * raw_data = reinterpret_cast<const NumericType *>(buffer->data()) + chunk->offset();
column_data.insert_assume_reserved(raw_data, raw_data + chunk->length());

/// Values at null positions are not guaranteed to be initialized in the source buffer.
/// Zero them out because downstream code (type conversions, serialization) may read all values.
if (chunk->null_count() > 0)
{
size_t start = column_data.size() - chunk->length();
for (int64_t i = 0; i < chunk->length(); ++i)
{
if (chunk->IsNull(i))
column_data[start + i] = {};
}
}
}
return {std::move(internal_column), std::move(internal_type), column_name};
}
Expand Down Expand Up @@ -1160,22 +1172,27 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
return readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column);
}
}();
auto array_column = ColumnArray::create(nested_column.column, offsets_column);

DataTypePtr array_type;
/// If type hint is Nested, we should return Nested type,
/// because we differentiate Nested and simple Array(Tuple)
ColumnPtr array_data_column = nested_column.column;
/// If type hint is Nested and the element is a named Tuple, return the Nested type
/// so that `Nested::flatten` can decompose it into separate arrays.
/// When the element is Nullable(Tuple(...)) (e.g. from Arrow's default nullable schema),
/// unwrap it and propagate the struct null map to each element via `unwrapNullableTuple`.
const auto * tuple_type = type_hint && isNested(type_hint)
? typeid_cast<const DataTypeTuple *>(removeNullable(nested_column.type).get())
: nullptr;
if (tuple_type)
{
array_type = createNested(tuple_type->getElements(), tuple_type->getElementNames());
auto unwrapped = Nested::unwrapNullableTuple({array_data_column, nested_column.type, column_name});
array_data_column = unwrapped.column;
const auto & result_tuple = assert_cast<const DataTypeTuple &>(*unwrapped.type);
array_type = createNested(result_tuple.getElements(), result_tuple.getElementNames());
}
else
{
array_type = std::make_shared<DataTypeArray>(nested_column.type);
}
auto array_column = ColumnArray::create(array_data_column, offsets_column);
return {std::move(array_column), array_type, column_name};
}
case arrow::Type::STRUCT:
Expand Down Expand Up @@ -1408,7 +1425,6 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
arrow_column->type()->id() != arrow::Type::LARGE_LIST &&
arrow_column->type()->id() != arrow::Type::FIXED_SIZE_LIST &&
arrow_column->type()->id() != arrow::Type::MAP &&
arrow_column->type()->id() != arrow::Type::STRUCT &&
arrow_column->type()->id() != arrow::Type::DICTIONARY)
{
DataTypePtr nested_type_hint;
Expand Down
7 changes: 5 additions & 2 deletions src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,12 @@ namespace DB
for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
{
ColumnPtr nested_column = column_tuple->getColumnPtr(i);
/// Do not propagate the struct-level null_bytemap to child fields.
/// In Arrow, struct-level nulls and child-level nulls are independent;
/// child values at null struct positions are undefined.
fillArrowArray(
column_name + "." + nested_names[i],
nested_column, nested_types[i], null_bytemap,
nested_column, nested_types[i], nullptr,
builder.field_builder(static_cast<int>(i)),
format_name,
start, end,
Expand All @@ -312,7 +315,7 @@ namespace DB

for (size_t i = start; i != end; ++i)
{
auto status = builder.Append();
auto status = (null_bytemap && (*null_bytemap)[i]) ? builder.AppendNull() : builder.Append();
checkStatus(status, column->getName(), format_name);
}
}
Expand Down
20 changes: 18 additions & 2 deletions src/Processors/Formats/Impl/CSVRowInputFormat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>


namespace DB
Expand Down Expand Up @@ -377,8 +378,23 @@ bool CSVFormatReader::readField(
/// commas, which might be also used as delimiters. However,
/// they do not contain empty unquoted fields, so this check
/// works for tuples as well.
column.insertDefault();
return false;
///
/// Exception: `Nullable(Tuple())` with zero elements serializes to
/// an empty field in CSV, so an empty value is its only valid
/// representation. Let it fall through to normal deserialization
/// instead of inserting NULL as the default.
bool is_nullable_empty_tuple = false;
if (type->isNullable())
{
if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(removeNullable(type).get()))
is_nullable_empty_tuple = tuple_type->getElements().empty();
}

if (!is_nullable_empty_tuple)
{
column.insertDefault();
return false;
}
}

if (format_settings.csv.use_default_on_bad_values)
Expand Down
18 changes: 15 additions & 3 deletions src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,26 @@ bool MsgPackVisitor::start_array(size_t size) // NOLINT
if (size > 0)
info_stack.push(Info{nested_column, nested_type, false, size, nullptr});
}
else if (isTuple(info_stack.top().type))
else if (isTuple(removeNullable(info_stack.top().type)))
{
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*info_stack.top().type);
/// If the type is Nullable, reaching start_array means the value
/// is non-null (for nulls, the parser calls visit_nil instead).
/// So we can safely unwrap the Nullable to work with the inner
/// ColumnTuple directly.
IColumn * column_ptr = &info_stack.top().column;
if (info_stack.top().type->isNullable())
{
auto & nullable_column = assert_cast<ColumnNullable &>(*column_ptr);
nullable_column.getNullMapColumn().insertValue(0);
column_ptr = &nullable_column.getNestedColumn();
}

const auto & tuple_type = assert_cast<const DataTypeTuple &>(*removeNullable(info_stack.top().type));
const auto & nested_types = tuple_type.getElements();
if (size != nested_types.size())
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack array with size {} into Tuple column with {} elements", size, nested_types.size());

ColumnTuple & column_tuple = assert_cast<ColumnTuple &>(info_stack.top().column);
ColumnTuple & column_tuple = assert_cast<ColumnTuple &>(*column_ptr);
/// Push nested columns into stack in reverse order.
for (ssize_t i = static_cast<ssize_t>(nested_types.size()) - 1; i >= 0; --i)
info_stack.push(Info{column_tuple.getColumn(i), nested_types[i], true, std::nullopt, nullptr});
Expand Down
24 changes: 16 additions & 8 deletions src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1710,8 +1710,8 @@ ColumnWithTypeAndName ORCColumnToCHColumn::readColumnFromORCColumn(
{
bool skipped = false;

if (!inside_nullable && (orc_column->hasNulls || (type_hint && type_hint->isNullable())) && !orc_column->isEncoded
&& (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP && orc_type->getKind() != orc::STRUCT))
if (!inside_nullable && (orc_column->hasNulls || (type_hint && isNullableOrLowCardinalityNullable(type_hint))) && !orc_column->isEncoded
&& (orc_type->getKind() != orc::LIST && orc_type->getKind() != orc::MAP))
{
DataTypePtr nested_type_hint;
if (type_hint)
Expand Down Expand Up @@ -1883,19 +1883,27 @@ ColumnWithTypeAndName ORCColumnToCHColumn::readColumnFromORCColumn(
auto nested_column = readColumnFromORCColumn(orc_nested_column, orc_nested_type, column_name, false, nested_type_hint);

auto offsets_column = readOffsetsFromORCListColumn(orc_list_column);
auto array_column = ColumnArray::create(nested_column.column, offsets_column);
DataTypePtr array_type;
/// If type hint is Nested, we should return Nested type,
/// because we differentiate Nested and simple Array(Tuple)
if (type_hint && isNested(type_hint))
ColumnPtr array_data_column = nested_column.column;
/// If type hint is Nested and the element is a named Tuple, return the Nested type
/// so that `Nested::flatten` can decompose it into separate arrays.
/// When the element is Nullable(Tuple(...)), unwrap it and propagate the struct null
/// map to each element via `unwrapNullableTuple`.
const auto * tuple_type = type_hint && isNested(type_hint)
? typeid_cast<const DataTypeTuple *>(removeNullable(nested_column.type).get())
: nullptr;
if (tuple_type)
{
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*nested_column.type);
array_type = createNested(tuple_type.getElements(), tuple_type.getElementNames());
auto unwrapped = Nested::unwrapNullableTuple({array_data_column, nested_column.type, column_name});
array_data_column = unwrapped.column;
const auto & result_tuple = assert_cast<const DataTypeTuple &>(*unwrapped.type);
array_type = createNested(result_tuple.getElements(), result_tuple.getElementNames());
}
else
{
array_type = std::make_shared<DataTypeArray>(nested_column.type);
}
auto array_column = ColumnArray::create(array_data_column, offsets_column);
return {array_column, array_type, column_name};
}
case orc::STRUCT:
Expand Down
22 changes: 21 additions & 1 deletion src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,27 @@ void ORCBlockOutputFormat::writeColumn(
const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
auto nested_types = assert_cast<const DataTypeTuple *>(type.get())->getElements();
for (size_t i = 0; i != tuple_column.tupleSize(); ++i)
writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], nullptr);
{
if (null_bytemap && nested_types[i]->isNullable())
{
/// When both the struct and the element are nullable, we need to merge the two null bitmaps:
/// a child value is null if either the struct row is null OR the element itself is null.
const auto & nullable_col = assert_cast<const ColumnNullable &>(tuple_column.getColumn(i));
const auto & element_null_map = nullable_col.getNullMapData();
PaddedPODArray<UInt8> merged_null_map(element_null_map.size());
for (size_t j = 0; j < element_null_map.size(); ++j)
merged_null_map[j] = element_null_map[j] | (*null_bytemap)[j];

auto nested_type = removeNullable(nested_types[i]);
writeColumn(*struct_orc_column.fields[i], nullable_col.getNestedColumn(), nested_type, &merged_null_map);
}
else
{
/// Propagate the struct-level null_bytemap to children so the ORC library correctly handles
/// null struct rows (child values at null positions must also be marked null).
writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], null_bytemap);
}
}
break;
}
case TypeIndex::Map:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# no-fasttest: Arrow format is not available in fasttest builds

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Write a LowCardinality(Nullable(String)) value as Arrow with
# output_format_arrow_low_cardinality_as_dictionary=1, pipe it into a second
# clickhouse-local that reads it as table `test`, and print first the inferred
# schema and then the data. Repeated for both values of
# output_format_arrow_string_as_string.
for string_as_string in 0 1
do
    for read_query in "desc test" "select * from test"
    do
        $CLICKHOUSE_LOCAL -q "select toLowCardinality(toNullable('abc')) as lc format Arrow settings output_format_arrow_low_cardinality_as_dictionary=1, output_format_arrow_string_as_string=${string_as_string}" | $CLICKHOUSE_LOCAL --input-format=Arrow --table=test -q "$read_query"
    done
done
Loading
Loading