diff --git a/NAMESPACE b/NAMESPACE index 3405d737..05ffb3d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -154,6 +154,7 @@ export(psislw) export(relative_eff) export(scrps) export(sis) +export(srs_diff_est) export(stacking_weights) export(tis) export(waic) diff --git a/NEWS.md b/NEWS.md index 4d9a3af0..ab19f715 100644 --- a/NEWS.md +++ b/NEWS.md @@ -29,6 +29,7 @@ * Added contribution section. by @VisruthSK in #286 * Update LOO uncertainty paper to use BA doi by @avehtari in #311 * Update documentation for `E_loo()` function by @avehtari in #312 +* Export `srs_diff_est()` function by @vinniott and @avehtari in #340 # loo 2.8.0 diff --git a/R/loo_subsample.R b/R/loo_subsample.R index bcac4b17..e8eab322 100644 --- a/R/loo_subsample.R +++ b/R/loo_subsample.R @@ -24,7 +24,7 @@ #' same length containing the posterior density and the approximation density #' for the individual draws. #' -#' @seealso [loo()], [psis()], [loo_compare()] +#' @seealso [loo()], [psis()], [loo_compare()], [srs_diff_est()] #' @template loo-large-data-references #' #' @export loo_subsample loo_subsample.function @@ -1166,12 +1166,88 @@ loo_subsample_estimation_diff_srs <- function(x) { update_psis_loo_ss_estimates(x) } -#' Difference estimation using SRS-WOR sampling (Magnusson et al., 2020) -#' @noRd -#' @param y_approx Approximated values of all observations. -#' @param y The values observed. -#' @param y_idx The index of `y` in `y_approx`. -#' @return A list with estimates. +#' Difference estimator with simple random sampling without replacement. +#' +#' The difference estimator `srs_diff_est()` estimates +#' the expectation \eqn{n E[y]} when we have \eqn{n} approximate values \eqn{\tilde{y}_i}, +#' \eqn{i = 1, \ldots, n} and \eqn{m < n} accurate values \eqn{y_j}, \eqn{j \in \mathcal{S}}, +#' where \eqn{m} is the subsample size and \eqn{\mathcal{S}} is +#' a simple random subsample without replacement. The original +#' approach is by Cochran (1977) and we follow the equations 7--9 by +#' Magnusson et al. (2020). +#' +#' @details In Magnusson et al. (2020) Eq (9) first row, the second `+` should +#' be a `-`; Supplementary Material Eq (6) has this correct. +#' As `srs_diff_est()` in the `loo` package is used for \eqn{n E[y]}, there is +#' a proportional difference of \eqn{1/n} compared to the paper. +#' +#' @param y_approx (numeric) `n` approximated values. +#' @param y (numeric) `m unique() |> scale() |> as.data.frame() +#' +#' fitos <- brm(ordered(quality) ~ ., +#' family = cumulative("logit"), +#' prior = prior(R2D2(mean_R2 = 1/3, prec_R2 = 3)), +#' data = wine_scaled, +#' seed = 1, +#' silent = 2, +#' refresh = 0) +#' +#' log_lik_matrix <- log_lik(fitos) +#' +#' N <- nrow(wine_scaled) +#' Nsub <- 100 +#' +#' # posterior log-score +#' lpd <- elpd(log_lik_matrix) +#' sum(lpd$pointwise[,"elpd"]) +#' # Use PSIS-LOO for subsample of Nsub randomly selected observations +#' set.seed(1) +#' idx <- sample(1:N, Nsub) +#' elpd_loo_sub <- loo(log_lik_matrix[,idx]) +#' sum(elpd_loo_sub$pointwise[,"elpd_loo"]) / Nsub * N +#' +#' # Use difference estimator to combine fast result and subsampled accurate result +#' loo::srs_diff_est(lpd$pointwise[,"elpd"], elpd_loo_sub$pointwise[,"elpd_loo"], idx) +#' +#' # Comparison to using PSIS-LOO for all observations +#' loo(log_lik_matrix) +#' } +#' @export srs_diff_est <- function(y_approx, y, y_idx) { checkmate::assert_numeric(y_approx) checkmate::assert_numeric(y, max.len = length(y_approx)) diff --git a/man/loo_subsample.Rd b/man/loo_subsample.Rd index 6f381db6..5889be1e 100644 --- a/man/loo_subsample.Rd +++ b/man/loo_subsample.Rd @@ -196,5 +196,5 @@ In \emph{Proceedings of the 23rd International Conference on Artificial Intelligence and Statistics (AISTATS)}, PMLR 108:341-351. } \seealso{ -\code{\link[=loo]{loo()}}, \code{\link[=psis]{psis()}}, \code{\link[=loo_compare]{loo_compare()}} +\code{\link[=loo]{loo()}}, \code{\link[=psis]{psis()}}, \code{\link[=loo_compare]{loo_compare()}}, \code{\link[=srs_diff_est]{srs_diff_est()}} } diff --git a/man/srs_diff_est.Rd b/man/srs_diff_est.Rd new file mode 100644 index 00000000..97034fee --- /dev/null +++ b/man/srs_diff_est.Rd @@ -0,0 +1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loo_subsample.R +\name{srs_diff_est} +\alias{srs_diff_est} +\title{Difference estimator with simple random sampling without replacement.} +\usage{ +srs_diff_est(y_approx, y, y_idx) +} +\arguments{ +\item{y_approx}{(numeric) \code{n} approximated values.} + +\item{y}{(numeric) \code{m unique() |> scale() |> as.data.frame() + +fitos <- brm(ordered(quality) ~ ., + family = cumulative("logit"), + prior = prior(R2D2(mean_R2 = 1/3, prec_R2 = 3)), + data = wine_scaled, + seed = 1, + silent = 2, + refresh = 0) + +log_lik_matrix <- log_lik(fitos) + +N <- nrow(wine_scaled) +Nsub <- 100 + +# posterior log-score +lpd <- elpd(log_lik_matrix) +sum(lpd$pointwise[,"elpd"]) +# Use PSIS-LOO for subsample of Nsub randomly selected observations +set.seed(1) +idx <- sample(1:N, Nsub) +elpd_loo_sub <- loo(log_lik_matrix[,idx]) +sum(elpd_loo_sub$pointwise[,"elpd_loo"]) / Nsub * N + +# Use difference estimator to combine fast result and subsampled accurate result +loo::srs_diff_est(lpd$pointwise[,"elpd"], elpd_loo_sub$pointwise[,"elpd_loo"], idx) + +# Comparison to using PSIS-LOO for all observations +loo(log_lik_matrix) +} +} +\references{ +Magnusson, M., Riis Andersen, M., Jonasson, J. and Vehtari, A. (2020). +Leave-One-Out Cross-Validation for Model Comparison in Large Data. +In \emph{Proceedings of the 23rd International Conference on Artificial +Intelligence and Statistics (AISTATS)}, PMLR 108:341-351. + +Cochran, W. G. (1977). \emph{Sampling Techniques, 3rd Edition}. John Wiley. + +Cortez, P., Cerdeira, A.L., Almeida, F., Matos, T., & Reis, J. (2009). +Modeling wine preferences by data mining from physicochemical properties. +\emph{Decis. Support Syst.}, \emph{47}, 547-553. +} +\seealso{ +\code{\link[=loo_subsample]{loo_subsample()}} +}