15 changes: 15 additions & 0 deletions r/R/util.R
@@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) {
}

handle_csv_read_error <- function(msg, call, schema) {
  # Handle null type inference issue with sparse data
  if (grepl("conversion error to null", msg)) {
    msg <- c(
      msg,
      i = paste(
        "Column type was inferred as null because the first block of data",
        "(default 1MB, set via `block_size` in read options) contained only",
        "missing values. Try specifying the column types explicitly using the",
        "`col_types` or `schema` argument."
      )
    )
    abort(msg, call = call)
  }

  # Handle schema + header row issue
  if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
    msg <- c(
      msg,
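As a usage note: the fix the new message points users toward is typing the columns up front, so nothing is inferred from the sparse first block. A minimal sketch, assuming a hypothetical file `sparse.csv` whose `y` column is empty for the whole first block, with `int64`/`utf8` as the intended types. When a full schema is supplied to `open_dataset()`, it replaces the header row, so the header must be skipped (the situation the existing `conversion error` branch above already warns about):

```r
library(arrow)

# Single file: explicit types via `col_types` (a Schema works here),
# so no type inference runs at all
df <- read_csv_arrow("sparse.csv", col_types = schema(x = int64(), y = utf8()))

# Dataset: a full schema replaces the header row, so skip that row
ds <- open_dataset(
  "sparse.csv",
  format = "csv",
  schema = schema(x = int64(), y = utf8()),
  read_options = csv_read_options(skip_rows = 1)
)
```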
18 changes: 18 additions & 0 deletions r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
    tibble(x = 1.2, y = "c")
  )
})


test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
tf <- tempfile()
on.exit(unlink(tf))

# Create a CSV where the second column has NAs in the first rows
# but a value later - this causes Arrow to infer null type
writeLines(c("x,y", paste0(1:100, ",")), tf)
write("101,foo", tf, append = TRUE)

# Use small block_size to force type inference from only the first rows
expect_error(
open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |>
collect(),
"inferred as null"
)
})
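Worth noting the other knob the new message mentions: instead of typing the columns, the inference block can be enlarged via `block_size` so that it reaches past the run of missing values. A sketch under the same assumptions (hypothetical `sparse.csv`; 8 MB chosen arbitrarily, large enough to cover the all-missing prefix):

```r
library(arrow)

# Infer types from a larger first block (8 MB here) so the inference
# window includes some non-missing values in every column
ds <- open_dataset(
  "sparse.csv",
  format = "csv",
  read_options = csv_read_options(block_size = 8L * 1024L * 1024L)
)
```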