15 changes: 15 additions & 0 deletions r/R/util.R
@@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) {
}

handle_csv_read_error <- function(msg, call, schema) {
  # Handle null type inference issue with sparse data
  if (grepl("conversion error to null", msg)) {
    msg <- c(
      msg,
      i = paste(
        "Column type was inferred as null because the first block of data",
        "(default 1MB, set via `block_size` in read options) contained only",
        "missing values. Try specifying the column types explicitly using the",
        "`col_types` or `schema` argument."
      )
    )
    abort(msg, call = call)
  }

  # Handle schema + header row issue
  if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
    msg <- c(
      msg,
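As a usage note: the fix the new message points users toward is typing the columns up front, so nothing is inferred from the sparse first block. A minimal sketch, assuming a hypothetical file `sparse.csv` whose `y` column is empty for the whole first block, with `int64`/`utf8` as the intended types. When a full schema is supplied to `open_dataset()`, it replaces the header row, so the header must be skipped (the situation the existing `conversion error` branch above already warns about):

```r
library(arrow)

# Single file: explicit types via `col_types` (a Schema works here),
# so no type inference runs at all
df <- read_csv_arrow("sparse.csv", col_types = schema(x = int64(), y = utf8()))

# Dataset: a full schema replaces the header row, so skip that row
ds <- open_dataset(
  "sparse.csv",
  format = "csv",
  schema = schema(x = int64(), y = utf8()),
  read_options = csv_read_options(skip_rows = 1)
)
```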
18 changes: 18 additions & 0 deletions r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
    tibble(x = 1.2, y = "c")
  )
})


test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
tf <- tempfile()
on.exit(unlink(tf))

# Create a CSV where the second column has NAs in the first rows
# but a value later - this causes Arrow to infer null type
writeLines(c("x,y", paste0(1:100, ",")), tf)
write("101,foo", tf, append = TRUE)

# Use small block_size to force type inference from only the first rows
expect_error(
open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |>
collect(),
"inferred as null"
)
})
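Worth noting the other knob the new message mentions: instead of typing the columns, the inference block can be enlarged via `block_size` so that it reaches past the run of missing values. A sketch under the same assumptions (hypothetical `sparse.csv`; 8 MB chosen arbitrarily, large enough to cover the all-missing prefix):

```r
library(arrow)

# Infer types from a larger first block (8 MB here) so the inference
# window includes some non-missing values in every column
ds <- open_dataset(
  "sparse.csv",
  format = "csv",
  read_options = csv_read_options(block_size = 8L * 1024L * 1024L)
)
```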