From 5ae9c42459d104fdac23728faf1fcb09ebbc5169 Mon Sep 17 00:00:00 2001 From: Jay Hesselberth Date: Wed, 17 Jun 2026 06:51:35 -0600 Subject: [PATCH] Add as = "Rle" return type to read_bigwig() (#18) read_bigwig(..., as = "Rle") returns a per-base run-length-encoded vector spanning the queried range: an Rle for a single chromosome, or a named RleList for several. Uncovered bases are set to a `fill` value (default 0, the coverage convention; use NA to mark them missing). The conversion lives in R (as_rle/runs_to_rle): the C++ layer already emits run-length data via read_bigwig_cpp, and the Rle payload is just (values, lengths), so only compact runs cross the boundary. Building the S4 object stays in R next to the S4Vectors/IRanges classes that define it. Adds S4Vectors to Imports (Rle), imports RleList from IRanges, and local (non-network) tests covering windowed length, gap filling, NA fill, whole-chrom extent, and the multi-chrom RleList. Co-Authored-By: Claude Opus 4.8 (1M context) --- DESCRIPTION | 5 +- NAMESPACE | 2 + NEWS.md | 5 ++ R/read.r | 93 +++++++++++++++++++++++++++++++++++--- man/read_bigwig.Rd | 32 +++++++++++-- tests/testthat/test-read.R | 30 ++++++++++++ 6 files changed, 154 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 729fc85..f2058e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,9 +15,10 @@ License: MIT + file LICENSE URL: https://rnabioco.github.io/cpp11bigwig/, https://github.com/rnabioco/cpp11bigwig BugReports: https://github.com/rnabioco/cpp11bigwig/issues -Imports: +Imports: GenomicRanges, IRanges, + S4Vectors, tibble Suggests: testthat (>= 3.0.0) @@ -29,5 +30,5 @@ Config/Needs/website: rnabioco/rbitemplate Config/testthat/edition: 3 Encoding: UTF-8 LazyData: TRUE -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 95f31ef..8b76c58 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,5 +4,7 @@ export(read_bigbed) export(read_bigwig) importFrom(GenomicRanges,GRanges) importFrom(IRanges,IRanges) +importFrom(IRanges,RleList) +importFrom(S4Vectors,Rle) importFrom(tibble,as_tibble) useDynLib(cpp11bigwig, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 559cd8a..fc72fab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # cpp11bigwig (development version) +* `read_bigwig()` gains `as = "Rle"`, returning a per-base run-length-encoded + vector spanning the queried range (an `Rle` for a single chromosome, or a + named `RleList` for several). Uncovered bases are set to the `fill` value + (default `0`; use `NA` to mark them missing) (#18). + * Fix remote access to large bigWig/bigBed files. The HTTP `Range` header was not being set, so servers returned the entire file, crashing R or failing to open files larger than the read buffer (#18). diff --git a/R/read.r b/R/read.r index efedb0a..7610d88 100644 --- a/R/read.r +++ b/R/read.r @@ -6,14 +6,31 @@ #' @param chrom read data for specific chromosome #' @param start start position for data #' @param end end position for data -#' @param as return data as a specific type. -#' The default is a tibble (`tbl`) or GRanges (`gr`) -#' -#' @return \code{tibble} +#' @param as return data as a specific type. One of `"tbl"` (the default +#' tibble), `"GRanges"`, or `"Rle"`. `"Rle"` returns a per-base +#' run-length-encoded vector spanning the requested range (see Details). +#' @param fill value used for bases with no data when `as = "Rle"`. +#' Defaults to `0` (the convention for coverage); use `NA` to mark +#' uncovered bases as missing. Ignored for other `as` values. +#' +#' @details +#' When `as = "Rle"`, the result is an [S4Vectors::Rle] whose expanded +#' length equals the queried range, i.e. `end - start` when both are +#' supplied, otherwise the extent of the returned data for each +#' chromosome. Bases with no data in the file are set to `fill`. bigWig +#' coordinates are 0-based and half-open, so element `i` corresponds to +#' genomic position `start + i - 1`. A single-chromosome query returns a +#' bare `Rle`; a multi-chromosome query returns a named +#' [IRanges::RleList]. +#' +#' @return A `tibble`, `GRanges`, or `Rle`/`RleList` depending on `as`. #' #' @seealso \url{https://github.com/dpryan79/libBigWig} #' @seealso \url{https://github.com/brentp/bw-python} #' +#' @importFrom S4Vectors Rle +#' @importFrom IRanges RleList +#' #' @examples #' bw <- system.file("extdata", "test.bw", package = "cpp11bigwig") #' @@ -25,13 +42,16 @@ #' #' read_bigwig(bw, as = "GRanges") #' +#' read_bigwig(bw, chrom = "1", start = 100, end = 130, as = "Rle") +#' #' @export read_bigwig <- function( bwfile, chrom = NULL, start = NULL, end = NULL, - as = NULL + as = NULL, + fill = 0 ) { check_bigwig_file(bwfile) @@ -39,14 +59,16 @@ read_bigwig <- function( stop("`start` and `end` must both be >= 0") } - if (!is.null(as) && !as %in% c("GRanges", "tbl")) { - stop("`as` must be one of 'GRanges' or 'tbl' (the default)") + if (!is.null(as) && !as %in% c("GRanges", "tbl", "Rle")) { + stop("`as` must be one of 'GRanges', 'Rle', or 'tbl' (the default)") } res <- read_bigwig_cpp(bwfile, chrom, start, end) if (!is.null(as) && as == "GRanges") { return(as_granges(res)) + } else if (!is.null(as) && as == "Rle") { + return(as_rle(res, start, end, fill)) } else { return(as_tibble(res)) } @@ -84,6 +106,63 @@ as_granges <- function(x) { ) } +#' build a per-base Rle covering [lo, hi) from sorted, clipped runs +#' @noRd +runs_to_rle <- function(s, e, v, lo, hi, fill) { + # clip runs to the window and drop those entirely outside it + keep <- e > lo & s < hi + s <- pmax(s[keep], lo) + e <- pmin(e[keep], hi) + v <- v[keep] + + o <- order(s) + s <- s[o] + e <- e[o] + v <- v[o] + + n <- length(s) + if (n == 0L) { + return(Rle(fill, max(hi - lo, 0L))) + } + + # interleave a `fill` gap before each run with the run itself, then a + # trailing `fill` gap out to `hi`; gaps of length 0 are dropped below + prev_end <- c(lo, e[-n]) + values <- c(as.vector(rbind(rep(fill, n), v)), fill) + lengths <- c(as.vector(rbind(s - prev_end, e - s)), hi - e[n]) + + pos <- lengths > 0L + Rle(values[pos], lengths[pos]) +} + +#' convert to an Rle (single chrom) or RleList (multiple chroms) +#' @noRd +as_rle <- function(x, start, end, fill) { + chroms <- unique(x$chrom) + + # no data: with an explicit window, return a fill Rle of that length + if (length(chroms) == 0L) { + if (!is.null(start) && !is.null(end)) { + return(Rle(fill, max(end - start, 0L))) + } + return(Rle(numeric(0))) + } + + rles <- lapply(chroms, function(ch) { + rows <- x$chrom == ch + lo <- if (!is.null(start)) start else min(x$start[rows]) + hi <- if (!is.null(end)) end else max(x$end[rows]) + runs_to_rle(x$start[rows], x$end[rows], x$value[rows], lo, hi, fill) + }) + + if (length(rles) == 1L) { + return(rles[[1]]) + } + + names(rles) <- chroms + RleList(rles, compress = FALSE) +} + #' Read data from bigBed files. #' #' Columns are automatically typed based on the autoSql schema embedded diff --git a/man/read_bigwig.Rd b/man/read_bigwig.Rd index 4f1362a..b91d545 100644 --- a/man/read_bigwig.Rd +++ b/man/read_bigwig.Rd @@ -4,7 +4,14 @@ \alias{read_bigwig} \title{Read data from bigWig files.} \usage{ -read_bigwig(bwfile, chrom = NULL, start = NULL, end = NULL, as = NULL) +read_bigwig( + bwfile, + chrom = NULL, + start = NULL, + end = NULL, + as = NULL, + fill = 0 +) } \arguments{ \item{bwfile}{path or URL for a bigWig file. Remote files @@ -17,15 +24,30 @@ installed with libcurl available.} \item{end}{end position for data} -\item{as}{return data as a specific type. -The default is a tibble (\code{tbl}) or GRanges (\code{gr})} +\item{as}{return data as a specific type. One of \code{"tbl"} (the default +tibble), \code{"GRanges"}, or \code{"Rle"}. \code{"Rle"} returns a per-base +run-length-encoded vector spanning the requested range (see Details).} + +\item{fill}{value used for bases with no data when \code{as = "Rle"}. +Defaults to \code{0} (the convention for coverage); use \code{NA} to mark +uncovered bases as missing. Ignored for other \code{as} values.} } \value{ -\code{tibble} +A \code{tibble}, \code{GRanges}, or \code{Rle}/\code{RleList} depending on \code{as}. } \description{ Read data from bigWig files. } +\details{ +When \code{as = "Rle"}, the result is an \link[S4Vectors:Rle-class]{S4Vectors::Rle} whose expanded +length equals the queried range, i.e. \code{end - start} when both are +supplied, otherwise the extent of the returned data for each +chromosome. Bases with no data in the file are set to \code{fill}. bigWig +coordinates are 0-based and half-open, so element \code{i} corresponds to +genomic position \code{start + i - 1}. A single-chromosome query returns a +bare \code{Rle}; a multi-chromosome query returns a named +\link[IRanges:AtomicList-class]{IRanges::RleList}. +} \examples{ bw <- system.file("extdata", "test.bw", package = "cpp11bigwig") @@ -37,6 +59,8 @@ read_bigwig(bw, chrom = "1", start = 100, end = 130) read_bigwig(bw, as = "GRanges") +read_bigwig(bw, chrom = "1", start = 100, end = 130, as = "Rle") + } \seealso{ \url{https://github.com/dpryan79/libBigWig} diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 56c63e5..01755d2 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -28,6 +28,36 @@ test_that("results have expected shape", { expect_equal(nrow(res), 3) }) +test_that("as = 'Rle' returns a per-base run-length vector", { + bw <- test_path("data/test.bw") + + # values are stored as 32-bit floats, so compare with tolerance + tol <- 1e-6 + + # windowed query: length equals end - start, gaps filled with 0 + r <- read_bigwig(bw, chrom = "1", start = 0, end = 5, as = "Rle") + expect_s4_class(r, "Rle") + expect_equal(length(r), 5L) + expect_equal(as.numeric(r), c(0.1, 0.2, 0.3, 0, 0), tolerance = tol) + + # uncovered bases inside the data range are also filled + r <- read_bigwig(bw, chrom = "1", start = 98, end = 103, as = "Rle") + expect_equal(as.numeric(r), c(0, 0, 1.4, 1.4, 1.4), tolerance = tol) + + # fill = NA marks uncovered bases as missing + r <- read_bigwig(bw, chrom = "1", start = 0, end = 5, as = "Rle", fill = NA) + expect_equal(as.numeric(r), c(0.1, 0.2, 0.3, NA, NA), tolerance = tol) + + # without an explicit window the Rle spans the data extent of the chrom + r <- read_bigwig(bw, chrom = "1", as = "Rle") + expect_equal(length(r), 151L) + + # multiple chromosomes return a named RleList + rl <- read_bigwig(bw, as = "Rle") + expect_s4_class(rl, "RleList") + expect_equal(names(rl), c("1", "10")) +}) + test_that("missing file causes error", { expect_snapshot_error(read_bigwig("missing.bw")) })