diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 24012e5bd4374b..9c763235e797bb 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -353,6 +353,7 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { block->set_columns(std::move(mutate_columns)); } else { auto columns = block->mutate_columns(); + _nullable_str_col_cache.clear(); while (rows < batch_size && !_line_reader_eof) { const uint8_t* ptr = nullptr; size_t size = 0; @@ -640,6 +641,33 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, std::vector& columns, size_t* rows) { bool is_success = false; + // Initialize cached column pointers on first call per batch to avoid per-row assert_cast. + // Also pre-reserve capacity for offsets/chars/null_map to eliminate realloc during row loop. + if (UNLIKELY(_nullable_str_col_cache.empty())) { + _nullable_str_col_cache.resize(_file_slot_descs.size()); + _has_escape_char = (_options.escape_char != 0); + const size_t batch_size = std::max(_state->batch_size(), _MIN_BATCH_SIZE); + for (int i = 0; i < _file_slot_descs.size(); ++i) { + if (_use_nullable_string_opt[i]) { + IColumn* col_ptr = + _is_load + ? columns[i].get() + : const_cast(block->get_by_position(_file_slot_idx_map[i]) + .column.get()); + auto& null_col = assert_cast(*col_ptr); + auto* str_col = assert_cast*>(&null_col.get_nested_column()); + auto* null_map = &null_col.get_null_map_data(); + _nullable_str_col_cache[i].nested_str_col = str_col; + _nullable_str_col_cache[i].null_map = null_map; + // Pre-reserve to avoid repeated realloc inside insert_data/push_back. + // Estimate 64 bytes per string as a reasonable average for typical CSV data. + str_col->get_offsets().reserve(str_col->get_offsets().size() + batch_size); + str_col->get_chars().reserve(str_col->get_chars().size() + batch_size * 64); + null_map->reserve(null_map->size() + batch_size); + } + } + } + RETURN_IF_ERROR(_line_split_to_values(line, &is_success)); if (UNLIKELY(!is_success)) { // If not success, which means we met an invalid row, filter this row and return. @@ -653,20 +681,36 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, ? _split_values[col_idx] : Slice(_options.null_format, _options.null_len); - IColumn* col_ptr = columns[i].get(); - if (!_is_load) { - // block is a Block*, and get_by_position returns a ColumnPtr, - // which is a const pointer. Therefore, using const_cast is permissible. - col_ptr = const_cast( - block->get_by_position(_file_slot_idx_map[i]).column.get()); - } - if (_use_nullable_string_opt[i]) { - // For load task, we always read "string" from file. - // So serdes[i] here must be DataTypeNullableSerDe, and DataTypeNullableSerDe -> nested_serde must be DataTypeStringSerDe. - // So we use deserialize_nullable_string and stringSerDe to reduce virtual function calls. - RETURN_IF_ERROR(_deserialize_nullable_string(*col_ptr, value)); + // Inline fast path: bypass StringSerDe and per-row assert_cast entirely. + DCHECK_LT(i, _nullable_str_col_cache.size()); + auto& cache = _nullable_str_col_cache[i]; + if (_empty_field_as_null && value.size == 0) { + cache.nested_str_col->insert_default(); + cache.null_map->push_back(1); + continue; + } + if (_options.null_len > 0 && + !(_options.converted_from_string && value.trim_double_quotes())) { + if (value.compare(Slice(_options.null_format, _options.null_len)) == 0) { + cache.nested_str_col->insert_default(); + cache.null_map->push_back(1); + continue; + } + } + if (UNLIKELY(_has_escape_char)) { + escape_string_for_csv(value.data, &value.size, _options.escape_char, + _options.quote_char); + } + cache.nested_str_col->insert_data(value.data, value.size); + cache.null_map->push_back(0); } else { + // Non-optimized path: col_ptr only needed here since fast path uses cached pointers + IColumn* col_ptr = columns[i].get(); + if (!_is_load) { + col_ptr = const_cast( + block->get_by_position(_file_slot_idx_map[i]).column.get()); + } RETURN_IF_ERROR(_deserialize_one_cell(_serdes[i], *col_ptr, value)); } } diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 4e24be28d15b95..da0e1636b0313e 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -30,6 +30,8 @@ #include #include "common/status.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" #include "core/data_type/data_type.h" #include "format/file_reader/new_plain_text_line_reader.h" #include "format/generic_reader.h" @@ -284,6 +286,14 @@ class CsvReader : public GenericReader { // save source text which have been splitted. std::vector _split_values; std::vector _use_nullable_string_opt; + + // Cached column pointers for nullable string fast path, avoiding per-row assert_cast. + struct NullableStringColumnCache { + ColumnStr* nested_str_col = nullptr; + NullMap* null_map = nullptr; + }; + std::vector _nullable_str_col_cache; + bool _has_escape_char = false; }; #include "common/compile_check_end.h" } // namespace doris