2 changes: 2 additions & 0 deletions cpp/src/arrow/csv/writer.cc
@@ -659,6 +659,8 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {

Status TranslateMinimalBatch(const RecordBatch& batch) {
if (batch.num_rows() == 0) {
// GH-36889: Clear buffer to avoid writing stale content (e.g., header)
RETURN_NOT_OK(data_buffer_->Resize(0, /*shrink_to_fit=*/false));
return Status::OK();
}
offsets_.resize(batch.num_rows());
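Note on the fix above: before this change, TranslateMinimalBatch returned early on a zero-row batch without clearing data_buffer_, so whatever the buffer last held (typically the header) could be flushed again. A minimal Python reproduction sketch, mirroring the tests added below; the duplicated-header output is what GH-36889 reports for pre-fix builds, not something asserted here:

    import io
    import pyarrow as pa
    from pyarrow import csv

    # A table whose first chunk is a zero-row batch, as in the tests below.
    empty = pa.table({"col1": pa.array([], type=pa.string())})
    data = pa.table({"col1": ["a", "b"]})
    combined = pa.concat_tables([empty, data])

    buf = io.BytesIO()
    csv.write_csv(combined, buf)
    print(buf.getvalue())
    # With the fix: b'"col1"\n"a"\n"b"\n'
    # Pre-fix, the stale buffer could be flushed again, duplicating the header.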
42 changes: 42 additions & 0 deletions cpp/src/arrow/csv/writer_test.cc
@@ -28,6 +28,7 @@
#include "arrow/ipc/writer.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/type.h"
@@ -405,5 +406,46 @@ INSTANTIATE_TEST_SUITE_P(
"\n2016-02-29 10:42:23-0700,2016-02-29 17:42:23Z\n")));
#endif

// GH-36889: Empty batches at the start should not cause duplicate headers
TEST(TestWriteCSV, EmptyBatchAtStart) {
auto schema = arrow::schema({field("col1", utf8())});
auto empty_batch = RecordBatchFromJSON(schema, "[]");
auto data_batch = RecordBatchFromJSON(schema, R"([{"col1": "a"}, {"col1": "b"}])");

// Concatenate empty table with data table
ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
ASSERT_OK_AND_ASSIGN(auto data_table, Table::FromRecordBatches(schema, {data_batch}));
ASSERT_OK_AND_ASSIGN(auto combined_table, ConcatenateTables({empty_table, data_table}));

ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());

std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
// Should have exactly one header, not two
EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
}

// GH-36889: Empty batches in the middle should not corrupt the output
TEST(TestWriteCSV, EmptyBatchInMiddle) {
auto schema = arrow::schema({field("col1", utf8())});
auto batch1 = RecordBatchFromJSON(schema, R"([{"col1": "a"}])");
auto empty_batch = RecordBatchFromJSON(schema, "[]");
auto batch2 = RecordBatchFromJSON(schema, R"([{"col1": "b"}])");

ASSERT_OK_AND_ASSIGN(auto table1, Table::FromRecordBatches(schema, {batch1}));
ASSERT_OK_AND_ASSIGN(auto empty_table, Table::FromRecordBatches(schema, {empty_batch}));
ASSERT_OK_AND_ASSIGN(auto table2, Table::FromRecordBatches(schema, {batch2}));
ASSERT_OK_AND_ASSIGN(auto combined_table,
ConcatenateTables({table1, empty_table, table2}));

ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create());
ASSERT_OK(WriteCSV(*combined_table, WriteOptions::Defaults(), out.get()));
ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish());

std::string result(reinterpret_cast<const char*>(buffer->data()), buffer->size());
EXPECT_EQ(result, "\"col1\"\n\"a\"\n\"b\"\n");
}

} // namespace csv
} // namespace arrow
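The table-level WriteCSV tests above exercise CSVWriterImpl, which also backs the streaming writer, so the same guard should apply to incremental writes. A hedged sketch from the Python side, assuming pyarrow.csv.CSVWriter wraps that implementation; the expected bytes match the table-level tests:

    import io
    import pyarrow as pa
    from pyarrow import csv

    schema = pa.schema([("col1", pa.string())])
    empty_batch = pa.RecordBatch.from_arrays(
        [pa.array([], type=pa.string())], schema=schema)
    data_batch = pa.RecordBatch.from_arrays(
        [pa.array(["a", "b"])], schema=schema)

    buf = io.BytesIO()
    with csv.CSVWriter(buf, schema) as writer:
        writer.write_batch(empty_batch)  # must not re-emit the header
        writer.write_batch(data_batch)

    assert buf.getvalue() == b'"col1"\n"a"\n"b"\n'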
33 changes: 33 additions & 0 deletions python/pyarrow/tests/test_csv.py
@@ -2065,3 +2065,36 @@ def readinto(self, *args):
for i in range(20):
with pytest.raises(pa.ArrowInvalid):
read_csv(MyBytesIO(data))


def test_write_csv_empty_batch_no_duplicate_header():
# GH-36889: Empty batches at the start should not cause duplicate headers
table = pa.table({"col1": ["a", "b", "c"]})

# Concatenate empty table with data table
empty_table = table.schema.empty_table()
combined = pa.concat_tables([empty_table, table])

buf = io.BytesIO()
write_csv(combined, buf)
buf.seek(0)
result = buf.read()

# Should have exactly one header, not two
assert result == b'"col1"\n"a"\n"b"\n"c"\n'


def test_write_csv_empty_batch_in_middle():
    # GH-36889: Empty batches in the middle should not corrupt the output
table1 = pa.table({"col1": ["a"]})
table2 = pa.table({"col1": ["b"]})
empty_table = table1.schema.empty_table()

combined = pa.concat_tables([table1, empty_table, table2])

buf = io.BytesIO()
write_csv(combined, buf)
buf.seek(0)
result = buf.read()

assert result == b'"col1"\n"a"\n"b"\n'
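A related edge case not asserted by this PR: a table with no rows at all. A sketch under the assumption that the writer still emits the header on its own when every batch is empty; this expected output is an assumption, not something the tests above verify:

    import io
    import pyarrow as pa
    from pyarrow import csv

    empty_only = pa.table({"col1": pa.array([], type=pa.string())})
    buf = io.BytesIO()
    csv.write_csv(empty_only, buf)
    print(buf.getvalue())  # presumably b'"col1"\n' -- header only, no stale content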