Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ License: MIT + file LICENSE
URL: https://rnabioco.github.io/cpp11bigwig/,
https://github.com/rnabioco/cpp11bigwig
BugReports: https://github.com/rnabioco/cpp11bigwig/issues
Imports:
Imports:
GenomicRanges,
IRanges,
S4Vectors,
tibble
Suggests:
testthat (>= 3.0.0)
Expand All @@ -29,5 +30,5 @@ Config/Needs/website: rnabioco/rbitemplate
Config/testthat/edition: 3
Encoding: UTF-8
LazyData: TRUE
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Roxygen: list(markdown = TRUE)
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ export(read_bigbed)
export(read_bigwig)
importFrom(GenomicRanges,GRanges)
importFrom(IRanges,IRanges)
importFrom(IRanges,RleList)
importFrom(S4Vectors,Rle)
importFrom(tibble,as_tibble)
useDynLib(cpp11bigwig, .registration = TRUE)
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# cpp11bigwig (development version)

* `read_bigwig()` gains `as = "Rle"`, returning a per-base run-length-encoded
vector spanning the queried range (an `Rle` for a single chromosome, or a
named `RleList` for several). Uncovered bases are set to the `fill` value
(default `0`; use `NA` to mark them missing) (#18).

* Fix remote access to large bigWig/bigBed files. The HTTP `Range` header was
not being set, so servers returned the entire file, crashing R or failing to
open files larger than the read buffer (#18).
Expand Down
93 changes: 86 additions & 7 deletions R/read.r
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,31 @@
#' @param chrom read data for specific chromosome
#' @param start start position for data
#' @param end end position for data
#' @param as return data as a specific type.
#' The default is a tibble (`tbl`) or GRanges (`gr`)
#'
#' @return \code{tibble}
#' @param as return data as a specific type. One of `"tbl"` (the default
#' tibble), `"GRanges"`, or `"Rle"`. `"Rle"` returns a per-base
#' run-length-encoded vector spanning the requested range (see Details).
#' @param fill value used for bases with no data when `as = "Rle"`.
#' Defaults to `0` (the convention for coverage); use `NA` to mark
#' uncovered bases as missing. Ignored for other `as` values.
#'
#' @details
#' When `as = "Rle"`, the result is an [S4Vectors::Rle] whose expanded
#' length equals the queried range, i.e. `end - start` when both are
#' supplied, otherwise the extent of the returned data for each
#' chromosome. Bases with no data in the file are set to `fill`. bigWig
#' coordinates are 0-based and half-open, so element `i` corresponds to
#' genomic position `start + i - 1`. A single-chromosome query returns a
#' bare `Rle`; a multi-chromosome query returns a named
#' [IRanges::RleList].
#'
#' @return A `tibble`, `GRanges`, or `Rle`/`RleList` depending on `as`.
#'
#' @seealso \url{https://github.com/dpryan79/libBigWig}
#' @seealso \url{https://github.com/brentp/bw-python}
#'
#' @importFrom S4Vectors Rle
#' @importFrom IRanges RleList
#'
#' @examples
#' bw <- system.file("extdata", "test.bw", package = "cpp11bigwig")
#'
Expand All @@ -25,28 +42,33 @@
#'
#' read_bigwig(bw, as = "GRanges")
#'
#' read_bigwig(bw, chrom = "1", start = 100, end = 130, as = "Rle")
#'
#' @export
read_bigwig <- function(
bwfile,
chrom = NULL,
start = NULL,
end = NULL,
as = NULL
as = NULL,
fill = 0
) {
check_bigwig_file(bwfile)

if ((!is.null(start) && start < 0) || (!is.null(end) && end < 0)) {
stop("`start` and `end` must both be >= 0")
}

if (!is.null(as) && !as %in% c("GRanges", "tbl")) {
stop("`as` must be one of 'GRanges' or 'tbl' (the default)")
if (!is.null(as) && !as %in% c("GRanges", "tbl", "Rle")) {
stop("`as` must be one of 'GRanges', 'Rle', or 'tbl' (the default)")
}

res <- read_bigwig_cpp(bwfile, chrom, start, end)

if (!is.null(as) && as == "GRanges") {
return(as_granges(res))
} else if (!is.null(as) && as == "Rle") {
return(as_rle(res, start, end, fill))
} else {
return(as_tibble(res))
}
Expand Down Expand Up @@ -84,6 +106,63 @@ as_granges <- function(x) {
)
}

#' build a per-base Rle covering [lo, hi) from sorted, clipped runs
#' @noRd
runs_to_rle <- function(s, e, v, lo, hi, fill) {
# clip runs to the window and drop those entirely outside it
keep <- e > lo & s < hi
s <- pmax(s[keep], lo)
e <- pmin(e[keep], hi)
v <- v[keep]

o <- order(s)
s <- s[o]
e <- e[o]
v <- v[o]

n <- length(s)
if (n == 0L) {
return(Rle(fill, max(hi - lo, 0L)))
}

# interleave a `fill` gap before each run with the run itself, then a
# trailing `fill` gap out to `hi`; gaps of length 0 are dropped below
prev_end <- c(lo, e[-n])
values <- c(as.vector(rbind(rep(fill, n), v)), fill)
lengths <- c(as.vector(rbind(s - prev_end, e - s)), hi - e[n])

pos <- lengths > 0L
Rle(values[pos], lengths[pos])
}

#' convert to an Rle (single chrom) or RleList (multiple chroms)
#' @noRd
as_rle <- function(x, start, end, fill) {
chroms <- unique(x$chrom)

# no data: with an explicit window, return a fill Rle of that length
if (length(chroms) == 0L) {
if (!is.null(start) && !is.null(end)) {
return(Rle(fill, max(end - start, 0L)))
}
return(Rle(numeric(0)))
}

rles <- lapply(chroms, function(ch) {
rows <- x$chrom == ch
lo <- if (!is.null(start)) start else min(x$start[rows])
hi <- if (!is.null(end)) end else max(x$end[rows])
runs_to_rle(x$start[rows], x$end[rows], x$value[rows], lo, hi, fill)
})

if (length(rles) == 1L) {
return(rles[[1]])
}

names(rles) <- chroms
RleList(rles, compress = FALSE)
}

#' Read data from bigBed files.
#'
#' Columns are automatically typed based on the autoSql schema embedded
Expand Down
32 changes: 28 additions & 4 deletions man/read_bigwig.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions tests/testthat/test-read.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,36 @@ test_that("results have expected shape", {
expect_equal(nrow(res), 3)
})

test_that("as = 'Rle' returns a per-base run-length vector", {
bw <- test_path("data/test.bw")

# values are stored as 32-bit floats, so compare with tolerance
tol <- 1e-6

# windowed query: length equals end - start, gaps filled with 0
r <- read_bigwig(bw, chrom = "1", start = 0, end = 5, as = "Rle")
expect_s4_class(r, "Rle")
expect_equal(length(r), 5L)
expect_equal(as.numeric(r), c(0.1, 0.2, 0.3, 0, 0), tolerance = tol)

# uncovered bases inside the data range are also filled
r <- read_bigwig(bw, chrom = "1", start = 98, end = 103, as = "Rle")
expect_equal(as.numeric(r), c(0, 0, 1.4, 1.4, 1.4), tolerance = tol)

# fill = NA marks uncovered bases as missing
r <- read_bigwig(bw, chrom = "1", start = 0, end = 5, as = "Rle", fill = NA)
expect_equal(as.numeric(r), c(0.1, 0.2, 0.3, NA, NA), tolerance = tol)

# without an explicit window the Rle spans the data extent of the chrom
r <- read_bigwig(bw, chrom = "1", as = "Rle")
expect_equal(length(r), 151L)

# multiple chromosomes return a named RleList
rl <- read_bigwig(bw, as = "Rle")
expect_s4_class(rl, "RleList")
expect_equal(names(rl), c("1", "10"))
})

test_that("missing file causes error", {
expect_snapshot_error(read_bigwig("missing.bw"))
})
Expand Down
Loading