From a4a0ae0094b0008c4048d321e978faf639e9b562 Mon Sep 17 00:00:00 2001
From: Ruiyang Wang
Date: Sat, 3 Jan 2026 14:45:34 -0800
Subject: [PATCH 1/2] GH-36889: [C++][Python] Fix duplicate CSV header when first batch is empty

When writing CSV, if the first record batch was empty, the header would be
written twice. This happened because:

1. The header is written to data_buffer_ and flushed during initialization.
2. TranslateMinimalBatch returns early for empty batches without modifying
   data_buffer_.
3. The write loop then flushes data_buffer_, which still contains the header.

The fix clears the buffer (resizes it to 0) when encountering an empty batch,
so the subsequent write outputs nothing.

Added C++ and Python tests for empty batches at the start and in the middle
of a table.

Claude-Generated-By: Claude Code (cli/claude-opus-4-5=1%)
Claude-Steers: 2
Claude-Permission-Prompts: 2
Claude-Escapes: 1
---
 cpp/src/arrow/csv/writer.cc      |  2 ++
 cpp/src/arrow/csv/writer_test.cc | 44 ++++++++++++++++++++++++++++++++
 python/pyarrow/tests/test_csv.py | 33 ++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc
index 5d14fe4b9b1..cb3d42e9348 100644
--- a/cpp/src/arrow/csv/writer.cc
+++ b/cpp/src/arrow/csv/writer.cc
@@ -659,6 +659,8 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
 
   Status TranslateMinimalBatch(const RecordBatch& batch) {
     if (batch.num_rows() == 0) {
+      // GH-36889: Clear buffer to avoid writing stale content (e.g., header)
+      RETURN_NOT_OK(data_buffer_->Resize(0, /*shrink_to_fit=*/false));
       return Status::OK();
     }
     offsets_.resize(batch.num_rows());
diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc
index 783d7631ab3..82c8cd7b068 100644
--- a/cpp/src/arrow/csv/writer_test.cc
+++ b/cpp/src/arrow/csv/writer_test.cc
@@ -28,6 +28,7 @@
 #include "arrow/ipc/writer.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
+#include "arrow/table.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/matchers.h"
 #include "arrow/type.h"
@@ -405,5 +406,48 @@ INSTANTIATE_TEST_SUITE_P(
         "\n2016-02-29 10:42:23-0700,2016-02-29 17:42:23Z\n")));
 #endif
 
+// GH-36889: Empty batches at the start should not cause duplicate headers
+TEST(TestWriteCSV, EmptyBatchAtStart) {
+  auto schema = arrow::schema({field("col1", utf8())});
+  auto empty_batch = RecordBatchFromJSON(schema, "[]");
+  auto data_batch = RecordBatchFromJSON(schema, R"([{"col1": "a"}, {"col1": "b"}])");
+
+  // Concatenate empty table with data table
+  ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
+  ASSERT_OK_AND_ASSIGN(auto data_table, Table::FromRecordBatches(schema, {data_batch}));
+  ASSERT_OK_AND_ASSIGN(auto combined_table,
+                       ConcatenateTables({empty_table, data_table}));
+
+  ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
+  ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
+  ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());
+
+  std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
+  // Should have exactly one header, not two
+  EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
+}
+
+// GH-36889: Empty batches in the middle should not cause issues
+TEST(TestWriteCSV, EmptyBatchInMiddle) {
+  auto schema = arrow::schema({field("col1", utf8())});
+  auto batch1 = RecordBatchFromJSON(schema, R"([{"col1": "a"}])");
+  auto empty_batch = RecordBatchFromJSON(schema, "[]");
+  auto batch2 = RecordBatchFromJSON(schema, R"([{"col1": "b"}])");
+
+  ASSERT_OK_AND_ASSIGN(auto table1, Table::FromRecordBatches(schema, {batch1}));
+  ASSERT_OK_AND_ASSIGN(auto empty_table,
+                       Table::FromRecordBatches(schema, {empty_batch}));
+  ASSERT_OK_AND_ASSIGN(auto table2, Table::FromRecordBatches(schema, {batch2}));
+  ASSERT_OK_AND_ASSIGN(auto combined_table,
+                       ConcatenateTables({table1, empty_table, table2}));
+
+  ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
+  ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
+  ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());
+
+  std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
+  EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
+}
+
 }  // namespace csv
 }  // namespace arrow
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index f510c6dbe23..cf681c1fdcc 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -2065,3 +2065,36 @@ def readinto(self, *args):
     for i in range(20):
         with pytest.raises(pa.ArrowInvalid):
             read_csv(MyBytesIO(data))
+
+
+def test_write_csv_empty_batch_no_duplicate_header():
+    # GH-36889: Empty batches at the start should not cause duplicate headers
+    table = pa.table({"col1": ["a", "b", "c"]})
+
+    # Concatenate empty table with data table
+    empty_table = table.schema.empty_table()
+    combined = pa.concat_tables([empty_table, table])
+
+    buf = io.BytesIO()
+    write_csv(combined, buf)
+    buf.seek(0)
+    result = buf.read()
+
+    # Should have exactly one header, not two
+    assert result == b'"col1"\n"a"\n"b"\n"c"\n'
+
+
+def test_write_csv_empty_batch_in_middle():
+    # GH-36889: Empty batches in the middle should not cause issues
+    table1 = pa.table({"col1": ["a"]})
+    table2 = pa.table({"col1": ["b"]})
+    empty_table = table1.schema.empty_table()
+
+    combined = pa.concat_tables([table1, empty_table, table2])
+
+    buf = io.BytesIO()
+    write_csv(combined, buf)
+    buf.seek(0)
+    result = buf.read()
+
+    assert result == b'"col1"\n"a"\n"b"\n'

From 664e11d976389b40e091eb2355522251ce4624c6 Mon Sep 17 00:00:00 2001
From: Ruiyang Wang
Date: Sun, 4 Jan 2026 09:48:14 -0800
Subject: [PATCH 2/2] Fix clang-format styling in writer_test.cc

Signed-off-by: Ruiyang Wang
---
 cpp/src/arrow/csv/writer_test.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc
index 82c8cd7b068..6f68f019d3f 100644
--- a/cpp/src/arrow/csv/writer_test.cc
+++ b/cpp/src/arrow/csv/writer_test.cc
@@ -415,8 +415,7 @@ TEST(TestWriteCSV, EmptyBatchAtStart) {
   // Concatenate empty table with data table
   ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
   ASSERT_OK_AND_ASSIGN(auto data_table, Table::FromRecordBatches(schema, {data_batch}));
-  ASSERT_OK_AND_ASSIGN(auto combined_table,
-                       ConcatenateTables({empty_table, data_table}));
+  ASSERT_OK_AND_ASSIGN(auto combined_table, ConcatenateTables({empty_table, data_table}));
 
   ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
   ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
   ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());
@@ -435,8 +434,7 @@ TEST(TestWriteCSV, EmptyBatchInMiddle) {
   auto batch1 = RecordBatchFromJSON(schema, R"([{"col1": "a"}])");
   auto empty_batch = RecordBatchFromJSON(schema, "[]");
   auto batch2 = RecordBatchFromJSON(schema, R"([{"col1": "b"}])");
 
   ASSERT_OK_AND_ASSIGN(auto table1, Table::FromRecordBatches(schema, {batch1}));
-  ASSERT_OK_AND_ASSIGN(auto empty_table,
-                       Table::FromRecordBatches(schema, {empty_batch}));
+  ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
   ASSERT_OK_AND_ASSIGN(auto table2, Table::FromRecordBatches(schema, {batch2}));
   ASSERT_OK_AND_ASSIGN(auto combined_table,
                        ConcatenateTables({table1, empty_table, table2}));
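
Note for reviewers: a minimal sketch of the reported behavior, assuming a
pyarrow build without PATCH 1 applied. The imports mirror those used in
test_csv.py, and the duplicate-header output shown below is inferred from
the commit message rather than captured from a run:

    import io

    import pyarrow as pa
    from pyarrow.csv import write_csv

    # Concatenation can yield a table whose first record batch is empty.
    table = pa.table({"col1": ["a", "b"]})
    combined = pa.concat_tables([table.schema.empty_table(), table])

    buf = io.BytesIO()
    write_csv(combined, buf)

    # Without the fix: b'"col1"\n"col1"\n"a"\n"b"\n'  (header written twice)
    # With the fix:    b'"col1"\n"a"\n"b"\n'
    print(buf.getvalue())

The fix passes /*shrink_to_fit=*/false to Resize, presumably so the data
buffer keeps its allocated capacity for subsequent non-empty batches instead
of reallocating.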