From a41c532ace08c0d56432f28e6e3df4480fa5fd48 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 19 Feb 2026 09:21:40 +0000 Subject: [PATCH] GH-35806: [R] Improve error message for null type inference with sparse CSV data When a CSV column contains only missing values in the first block of data, Arrow infers the type as null. If a non-null value appears later, the conversion fails with an unhelpful error suggesting `skip = 1`. This change adds a specific check for "conversion error to null" and provides a more helpful message explaining the cause (type inference from sparse data) and the solution (specify column types explicitly). Co-Authored-By: Claude Opus 4.5 --- r/R/util.R | 15 +++++++++++++++ r/tests/testthat/test-dataset-csv.R | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/r/R/util.R b/r/R/util.R index c63e1ee5459..acbd39e2037 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) { } handle_csv_read_error <- function(msg, call, schema) { + # Type inferred as null from an all-missing first block; give a targeted hint + if (grepl("conversion error to null", msg)) { + msg <- c( + msg, + i = paste( + "Column type was inferred as null because the first block of data", + "(default 1MB, set via `block_size` in read options) contained only", + "missing values. Try specifying the column types explicitly using the", + "`col_types` or `schema` argument." 
+ ) + ) + abort(msg, call = call) + } + + # Other conversion errors with a user-supplied Schema (header row issue) if (grepl("conversion error", msg) && inherits(schema, "Schema")) { msg <- c( msg, diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 749d1672ac5..145a376da97 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", { tibble(x = 1.2, y = "c") ) }) + + +test_that("more informative error when column inferred as null due to sparse data (GH-35806)", { + tf <- tempfile() + on.exit(unlink(tf)) + + # Column y is empty for rows 1-100 and only gets a value (foo) in row 101, + # so type inference on the first block sees nothing but missing values + writeLines(c("x,y", paste0(1:100, ",")), tf) + write("101,foo", tf, append = TRUE) + + # Tiny block_size so that type inference only sees the first rows + expect_error( + open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |> + collect(), + "inferred as null" + ) +})