diff --git a/r/R/util.R b/r/R/util.R
index c63e1ee5459..acbd39e2037 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) {
 }
 
 handle_csv_read_error <- function(msg, call, schema) {
+  # Handle null type inference issue with sparse data
+  if (grepl("conversion error to null", msg)) {
+    msg <- c(
+      msg,
+      i = paste(
+        "Column type was inferred as null because the first block of data",
+        "(default 1MB, set via `block_size` in read options) contained only",
+        "missing values. Try specifying the column types explicitly using the",
+        "`col_types` or `schema` argument."
+      )
+    )
+    abort(msg, call = call)
+  }
+
+  # Handle schema + header row issue
   if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
     msg <- c(
       msg,
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 749d1672ac5..145a376da97 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+
+test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  # Create a CSV where the second column has NAs in the first rows
+  # but a value later - this causes Arrow to infer null type
+  writeLines(c("x,y", paste0(1:100, ",")), tf)
+  write("101,foo", tf, append = TRUE)
+
+  # Use small block_size to force type inference from only the first rows
+  expect_error(
+    open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |>
+      collect(),
+    "inferred as null"
+  )
+})