diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc
index 5d14fe4b9b1..cb3d42e9348 100644
--- a/cpp/src/arrow/csv/writer.cc
+++ b/cpp/src/arrow/csv/writer.cc
@@ -659,6 +659,8 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
 
   Status TranslateMinimalBatch(const RecordBatch& batch) {
     if (batch.num_rows() == 0) {
+      // GH-36889: Clear buffer to avoid writing stale content (e.g., header)
+      RETURN_NOT_OK(data_buffer_->Resize(0, /*shrink_to_fit=*/false));
       return Status::OK();
     }
     offsets_.resize(batch.num_rows());
diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc
index 783d7631ab3..6f68f019d3f 100644
--- a/cpp/src/arrow/csv/writer_test.cc
+++ b/cpp/src/arrow/csv/writer_test.cc
@@ -28,6 +28,7 @@
 #include "arrow/ipc/writer.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
+#include "arrow/table.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/matchers.h"
 #include "arrow/type.h"
@@ -405,5 +406,46 @@ INSTANTIATE_TEST_SUITE_P(
         "\n2016-02-29 10:42:23-0700,2016-02-29 17:42:23Z\n")));
 #endif
 
+// GH-36889: Empty batches at the start should not cause duplicate headers
+TEST(TestWriteCSV, EmptyBatchAtStart) {
+  auto schema = arrow::schema({field("col1", utf8())});
+  auto empty_batch = RecordBatchFromJSON(schema, "[]");
+  auto data_batch = RecordBatchFromJSON(schema, R"([{"col1": "a"}, {"col1": "b"}])");
+
+  // Concatenate empty table with data table
+  ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
+  ASSERT_OK_AND_ASSIGN(auto data_table, Table::FromRecordBatches(schema, {data_batch}));
+  ASSERT_OK_AND_ASSIGN(auto combined_table, ConcatenateTables({empty_table, data_table}));
+
+  ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
+  ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
+  ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());
+
+  std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
+  // Should have exactly one header, not two
+  EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
+}
+
+// GH-36889: Empty batches in the middle should not cause issues
+TEST(TestWriteCSV, EmptyBatchInMiddle) {
+  auto schema = arrow::schema({field("col1", utf8())});
+  auto batch1 = RecordBatchFromJSON(schema, R"([{"col1": "a"}])");
+  auto empty_batch = RecordBatchFromJSON(schema, "[]");
+  auto batch2 = RecordBatchFromJSON(schema, R"([{"col1": "b"}])");
+
+  ASSERT_OK_AND_ASSIGN(auto table1, Table::FromRecordBatches(schema, {batch1}));
+  ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
+  ASSERT_OK_AND_ASSIGN(auto table2, Table::FromRecordBatches(schema, {batch2}));
+  ASSERT_OK_AND_ASSIGN(auto combined_table,
+                       ConcatenateTables({table1, empty_table, table2}));
+
+  ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
+  ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
+  ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());
+
+  std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
+  EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
+}
+
 }  // namespace csv
 }  // namespace arrow
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index f510c6dbe23..cf681c1fdcc 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -2065,3 +2065,36 @@ def readinto(self, *args):
     for i in range(20):
         with pytest.raises(pa.ArrowInvalid):
             read_csv(MyBytesIO(data))
+
+
+def test_write_csv_empty_batch_no_duplicate_header():
+    # GH-36889: Empty batches at the start should not cause duplicate headers
+    table = pa.table({"col1": ["a", "b", "c"]})
+
+    # Concatenate empty table with data table
+    empty_table = table.schema.empty_table()
+    combined = pa.concat_tables([empty_table, table])
+
+    buf = io.BytesIO()
+    write_csv(combined, buf)
+    buf.seek(0)
+    result = buf.read()
+
+    # Should have exactly one header, not two
+    assert result == b'"col1"\n"a"\n"b"\n"c"\n'
+
+
+def test_write_csv_empty_batch_in_middle():
+    # GH-36889: Empty batches in the middle should not cause issues
+    table1 = pa.table({"col1": ["a"]})
+    table2 = pa.table({"col1": ["b"]})
+    empty_table = table1.schema.empty_table()
+
+    combined = pa.concat_tables([table1, empty_table, table2])
+
+    buf = io.BytesIO()
+    write_csv(combined, buf)
+    buf.seek(0)
+    result = buf.read()
+
+    assert result == b'"col1"\n"a"\n"b"\n'
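For reference, a minimal standalone reproduction of GH-36889, sketched under the assumption described by the fix's comment: before the `TranslateMinimalBatch` change above, an empty batch returned early without clearing `data_buffer_`, so the writer re-flushed stale buffer content (typically the just-written header), duplicating it in the output. With the change applied, an empty batch contributes nothing:

import io

import pyarrow as pa
from pyarrow.csv import write_csv

# Table whose first chunk is a zero-row batch, mirroring the tests above.
table = pa.table({"col1": ["a", "b"]})
combined = pa.concat_tables([table.schema.empty_table(), table])

buf = io.BytesIO()
write_csv(combined, buf)

# With the fix applied, the header appears exactly once.
assert buf.getvalue() == b'"col1"\n"a"\n"b"\n'

Passing /*shrink_to_fit=*/false to Resize(0) clears the buffer's logical size while keeping its capacity, so handling an empty batch should not force a reallocation when the next non-empty batch is translated.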