diff --git a/DESCRIPTION b/DESCRIPTION index 6ff663b..bbf8b2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,13 @@ Package: clonecensorweighting Title: Infrastructure for Clone-Censor-Weighting Analyses Version: 0.0.0.9000 -Authors@R: - person("Sang Ho", "Park", email = "shstat1729@gmail.com", role = c("aut", "cre")) +Authors@R: c( + person("Sang Ho", "Park", email = "shstat1729@gmail.com", role = c("aut", "cre")), + person("Youngrok", "Lee", role = "aut") + ) +Author: Sang Ho Park [aut, cre], + Youngrok Lee [aut] +Maintainer: Sang Ho Park Description: Provides a starter package for clone-censor-weighting workflows used in target trial emulation. The package currently includes data ingestion helpers, a minimal data cloning routine, and utilities for diff --git a/NAMESPACE b/NAMESPACE index 687db6b..eb23a42 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,9 +6,12 @@ export(clone_censor_weighting) export(create_censoring_logics_A) export(create_final_data) export(create_policy_A) -export(create_timestamp_table) +export(emul_estimate) +export(emul_estimate_bootstrap) +export(estimate_censoring) export(make_surv_response) export(read_trial_data) -export(split_at_timestamp) +export(weight_cases) importFrom(rlang,":=") importFrom(rlang,.data) +importFrom(rlang,.env) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..3b1ab37 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,49 @@ +# clonecensorweighting 0.0.0.9000 + +## Refactoring + +* Split the final censoring-probability training data workflow into focused + functions. +* Kept `create_final_data()` as the public interface for constructing long-form + clone data. +* Moved `create_timestamp_table()` and `split_at_timestamp()` to internal helper + status and removed them from the package exports. +* Removed duplicated reshape code and script-style execution leftovers from the + package R files. +* Added clone-list and clone-column validation helpers for clearer errors when + required variables are missing. +* Updated `create_final_data()` internals to use explicit character-vector + column selection and joins. +* Added `estimate_censoring()` as the public interface for Cox, pooled-logit, + and stabilized pooled-logit censoring probability estimation. +* Added `weight_cases()` as the public interface for unstabilized and stabilized + inverse probability of censoring weights. +* Added `emul_estimate()` as the public interface for Cox, logistic, and + Kaplan-Meier analyses of the emulated trial. +* Added `emul_estimate_bootstrap()` as the public interface for bootstrap + confidence intervals for two-arm Cox and logistic analyses. +* Kept formula construction, cumulative uncensoring, baseline predictor + extraction, predictor/weight normalization, arm binding, and coefficient + extraction as internal helpers. + +## Documentation + +* Removed generated help pages for internal timestamp and splitting helpers. +* Regenerated the `create_final_data()` help source reference after moving the + function to its own file. +* Added generated help pages for censoring estimation, case weighting, + emulated-trial estimation, and bootstrap estimation. +* Added runnable examples for public workflow functions using the bundled + `lungcancer` data where appropriate. +* Updated the README with a full `lungcancer` clone-censor-weighting workflow. + +## Testing + +* Added focused tests for timestamp table creation, interval splitting, final + data construction, internal helper export status, and missing-column errors. +* Added focused tests for censoring probability estimation, IPC weighting, + emulated-trial model fitting, bootstrap output, and internal helper export + status. +* Added an integration test that runs the bundled `lungcancer` data through + cloning, censoring, interval expansion, censoring probability estimation, + weighting, emulated-trial estimation, and bootstrap estimation. diff --git a/R/apply_logics.R b/R/apply_logics.R index aa5a11c..35c1cea 100644 --- a/R/apply_logics.R +++ b/R/apply_logics.R @@ -13,6 +13,20 @@ #' #' @export #' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, +#' treatment = "surgery", +#' time_to_treatment = "timetosurgery", +#' grace_period = 182.62, +#' outcome = "death", +#' followup = "fup_obs", +#' clone_outcome = "outcome", +#' clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) apply_logics <- function( clones, logics @@ -91,6 +105,17 @@ apply_logics <- function( #' #' @export #' @examples +#' arms <- c("Control", "Surgery") +#' policies <- create_policy_A( +#' arms, +#' treatment = "surgery", +#' time_to_treatment = "timetosurgery", +#' grace_period = 182.62, +#' outcome = "death", +#' followup = "fup_obs", +#' clone_outcome = "outcome", +#' clone_followup = "fup" +#' ) create_policy_A <- function( arms, treatment, @@ -203,6 +228,16 @@ create_policy_A <- function( #' #' @export #' @examples +#' arms <- c("Control", "Surgery") +#' censoring_logics <- create_censoring_logics_A( +#' arms, +#' treatment = "surgery", +#' time_to_treatment = "timetosurgery", +#' grace_period = 182.62, +#' followup = "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) create_censoring_logics_A <- function( arms, treatment, diff --git a/R/censoring_estimation.R b/R/censoring_estimation.R new file mode 100644 index 0000000..3a8a522 --- /dev/null +++ b/R/censoring_estimation.R @@ -0,0 +1,287 @@ +#' Estimate censoring probabilities +#' +#' @param clones A named list of long-form clone data frames. +#' @param predictors Optional character vector of denominator model predictors. +#' @param method Censoring model. `"Cox"` fits a Cox censoring model, +#' `"pooled_logit"` fits a pooled logistic denominator model, and +#' `"stabilized_logit"` additionally fits a numerator model. +#' @param numerator_predictors Optional character vector of numerator model +#' predictors for stabilized pooled-logit weights. When `NULL`, `predictors` +#' are used. +#' @param censoring Column name for the censoring indicator. +#' @param id Column name for the subject identifier. +#' @param time_start Column name for interval start time. +#' @param time_stop Column name for interval stop time. +#' @param eps Small probability floor to avoid division by zero. +#' +#' @returns A named list of clone data frames with censoring probability +#' columns added. All methods add `P_uncens`; pooled-logit methods also add +#' `p_cens_den`, and stabilized pooled logit adds `p_cens_num` and +#' `P_uncens_num`. +#' @export +#' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", +#' clone_outcome = "outcome", clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) +#' censoring_logics <- create_censoring_logics_A( +#' arms, "surgery", "timetosurgery", 182.62, "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) +#' clones_censored <- apply_logics(clones_policy, censoring_logics) +#' clones_final <- create_final_data( +#' clones_censored, "fup", "outcome", "censoring", "id" +#' ) +#' clones_estimated <- estimate_censoring( +#' clones_final, +#' predictors = c("age", "sex"), +#' method = "pooled_logit" +#' ) +estimate_censoring <- function( + clones, + predictors = NULL, + method = c("Cox", "pooled_logit", "stabilized_logit"), + numerator_predictors = NULL, + censoring = "censoring", + id = "id", + time_start = "Tstart", + time_stop = "Tstop", + eps = 1e-6 +) { + method <- match.arg(method) + predictors <- normalize_predictors(predictors) + + if (method == "stabilized_logit" && is.null(numerator_predictors)) { + numerator_predictors <- predictors + } + numerator_predictors <- normalize_predictors(numerator_predictors) + + required_columns <- c(id, time_start, time_stop, censoring, predictors) + if (method == "stabilized_logit") { + required_columns <- c(required_columns, numerator_predictors) + } + .assert_clone_columns(clones, required_columns) + .assert_probability_floor(eps) + + arms <- names(clones) + res <- vector("list", length = length(arms)) + names(res) <- arms + + cox_response <- paste0( + "survival::Surv(", + backtick_name(time_start), + ", ", + backtick_name(time_stop), + ", ", + backtick_name(censoring), + ")" + ) + cox_formula <- make_censoring_formula( + cox_response, + predictors + ) + pooled_formula <- make_censoring_formula( + backtick_name(censoring), + predictors, + time_var = time_start + ) + + for (arm in arms) { + dat <- clones[[arm]] + + if (method == "Cox") { + ms_cens <- survival::coxph( + cox_formula, + ties = "efron", + data = dat + ) + + lin_pred <- stats::predict( + ms_cens, + newdata = dat, + type = "lp", + reference = "zero" + ) + + base_hazard <- dplyr::as_tibble( + survival::basehaz(ms_cens, centered = FALSE) + ) + names(base_hazard) <- c("hazard", "t") + + res[[arm]] <- + dat |> + dplyr::mutate(lin_pred = .env[["lin_pred"]]) |> + dplyr::left_join( + base_hazard, + by = stats::setNames("t", time_start) + ) |> + dplyr::mutate( + hazard = dplyr::coalesce(.data$hazard, 0), + P_uncens = exp(-.data$hazard * exp(.data$lin_pred)) + ) + } else { + fit_den <- stats::glm( + pooled_formula, + data = dat, + family = stats::binomial(link = "logit") + ) + p_cens_den <- .clamp_probability( + stats::predict(fit_den, type = "response"), + eps = eps + ) + p_uncens_den <- cumulative_uncensoring( + dat, + p_cens_den, + id = id, + time_start = time_start, + time_stop = time_stop, + eps = eps + ) + + res[[arm]] <- + dat |> + dplyr::mutate( + p_cens_den = .env[["p_cens_den"]], + P_uncens = .env[["p_uncens_den"]] + ) + + if (method == "stabilized_logit") { + numerator_data <- add_baseline_predictors( + dat, + predictors = numerator_predictors, + id = id, + time_start = time_start, + time_stop = time_stop + ) + numerator_formula <- make_censoring_formula( + backtick_name(censoring), + numerator_data$predictors, + time_var = time_start + ) + fit_num <- stats::glm( + numerator_formula, + data = numerator_data$data, + family = stats::binomial(link = "logit") + ) + p_cens_num <- .clamp_probability( + stats::predict(fit_num, type = "response"), + eps = eps + ) + p_uncens_num <- cumulative_uncensoring( + dat, + p_cens_num, + id = id, + time_start = time_start, + time_stop = time_stop, + eps = eps + ) + + res[[arm]] <- + res[[arm]] |> + dplyr::mutate( + p_cens_num = .env[["p_cens_num"]], + P_uncens_num = .env[["p_uncens_num"]] + ) + } + } + } + + res +} + +#' Create a censoring model formula +#' +#' @noRd +make_censoring_formula <- function( + response, + predictors = NULL, + time_var = NULL +) { + terms <- unique(c( + normalize_predictors(time_var), + normalize_predictors(predictors) + )) + terms <- backtick_name(terms) + + formula <- stats::reformulate(terms, response = response) + environment(formula) <- parent.frame() + formula +} + +#' Estimate cumulative probability of remaining uncensored +#' +#' @noRd +cumulative_uncensoring <- function( + data, + p_censoring, + id = "id", + time_start = "Tstart", + time_stop = "Tstop", + eps = 1e-6 +) { + .assert_data_frame(data) + .assert_required_columns(data, c(id, time_start, time_stop)) + .assert_probability_floor(eps) + + if (!is.numeric(p_censoring) || length(p_censoring) != nrow(data)) { + stop("`p_censoring` must have one numeric value per row of `data`.", call. = FALSE) + } + + p_uncensored_interval <- 1 - .clamp_probability(p_censoring, eps = eps) + p_uncensored <- numeric(nrow(data)) + ordered_rows <- order(data[[id]], data[[time_start]], data[[time_stop]]) + rows_by_id <- split(ordered_rows, data[[id]][ordered_rows]) + + for (rows in rows_by_id) { + interval_prob <- p_uncensored_interval[rows] + p_uncensored[rows] <- c(1, cumprod(interval_prob[-length(interval_prob)])) + } + + pmax(p_uncensored, eps) +} + +#' Add baseline values of predictors +#' +#' @noRd +add_baseline_predictors <- function( + data, + predictors = NULL, + id = "id", + time_start = "Tstart", + time_stop = "Tstop", + prefix = ".baseline_" +) { + predictors <- normalize_predictors(predictors) + if (length(predictors) == 0L) { + return(list(data = data, predictors = character())) + } + + .assert_data_frame(data) + .assert_required_columns(data, c(id, time_start, time_stop, predictors)) + + baseline_predictors <- paste0(prefix, predictors) + conflicting_columns <- intersect(baseline_predictors, names(data)) + if (length(conflicting_columns) > 0L) { + stop( + "Baseline predictor columns already exist: ", + paste(conflicting_columns, collapse = ", "), + call. = FALSE + ) + } + + ordered_rows <- order(data[[id]], data[[time_start]], data[[time_stop]]) + baseline_rows <- ordered_rows[!duplicated(data[[id]][ordered_rows])] + baseline_values <- data[baseline_rows, c(id, predictors), drop = FALSE] + names(baseline_values)[names(baseline_values) %in% predictors] <- + baseline_predictors + + list( + data = dplyr::left_join(data, baseline_values, by = id), + predictors = baseline_predictors + ) +} diff --git a/R/clone_censor_weighting.R b/R/clone_censor_weighting.R index 8ac1168..96c00b2 100644 --- a/R/clone_censor_weighting.R +++ b/R/clone_censor_weighting.R @@ -16,6 +16,16 @@ #' @return A tibble with one row per participant-strategy combination and the #' additional columns `.clone_id`, `.regime`, `.censored`, and `.weight`. #' @export +#' @examples +#' data(lungcancer) +#' cloned <- clone_censor_weighting( +#' lungcancer, +#' id = "id", +#' follow_up = "fup_obs", +#' event = "death", +#' treatment = "surgery", +#' regimes = c(0, 1) +#' ) clone_censor_weighting <- function( data, id, @@ -86,6 +96,13 @@ clone_censor_weighting <- function( #' #' @return An object of class `"Surv"`. #' @export +#' @examples +#' data(lungcancer) +#' surv_response <- make_surv_response( +#' lungcancer, +#' follow_up = "fup_obs", +#' event = "death" +#' ) make_surv_response <- function(data, follow_up, event) { .assert_data_frame(data) .assert_required_columns(data, c(follow_up, event)) diff --git a/R/clonecensorweighting-package.R b/R/clonecensorweighting-package.R index e3fc135..7404ca0 100644 --- a/R/clonecensorweighting-package.R +++ b/R/clonecensorweighting-package.R @@ -4,5 +4,6 @@ ## usethis namespace: start #' @importFrom rlang := #' @importFrom rlang .data +#' @importFrom rlang .env NULL -## usethis namespace: end \ No newline at end of file +## usethis namespace: end diff --git a/R/reshape.R b/R/create_final_data.R similarity index 50% rename from R/reshape.R rename to R/create_final_data.R index a774ebe..a4874f3 100644 --- a/R/reshape.R +++ b/R/create_final_data.R @@ -1,81 +1,3 @@ -#' Create timestamp table -#' -#' @param clones A list of data frame. Each element of the list represents -#' each treatment arm. This version of clones must contain a column that -#' represents a emulated follow up time. -#' @param clone_followup A column name in emulcated clone (i.e. `clones`) -#' that represents the emulated follow up time in cloned data frame. -#' -#' @returns A data frame with two columns: `tevent` and `ID_t`. -#' `tevent` represents a timestamp that outcome event can occur based on -#' observed data. `ID_t` represents an enumerated identifier of each -#' timestamp, from 1 to n where n represents the number of unique `tevent` -#' value. -#' -#' @export -#' @examples -create_timestamp_table <- function(clones, clone_followup) { - timestamps <- sapply(clones, `[[`, i = clone_followup, simplify = FALSE) - t_events <- sort(unique(unlist(timestamps))) - res <- dplyr::tibble(tevent = t_events, ID_t = seq_along(t_events)) - - res -} - - -#' Split each observation into multiple subrecords at each time cut -#' -#' @param clones A list of data frame. Each element of the list represents -#' each treatment arm. This version of clones must contain a column that -#' represents a emulated follow up time (corresponding to `clone_followup` -#' argument) and an event of interest (corresponding to `event` argument). -#' @param clone_followup A column name in emulcated clone (i.e. `clones`) -#' that represents the emulated follow up time in cloned data frame. -#' @param t_events A vector of timestamp that outcome event can occur based on -#' observed data. -#' @param event A variable name of an event of interest. The variable should -#' exists in each data frame that is an element of `clones` argument, and -#' the variable value should be a binary (0 or 1). -#' @param timestamp_start A new variable name to denote start time. -#' @param id A new variable name for a unique observation identifier, to -#' represents that multiple rows in output data frame is associated with the -#' same observation. -#' -#' @returns A list of long-form data frames. Each data frame represents each -#' clone arm. Each row of the long-form data frame represents a subrecord of -#' each observation associated with each specific time interval. The first -#' subrecord starts with time 0, and the rows are expanded up to -#' `clone_followup`, where cut times are determined by `t_events` argument. -#' -#' @export -#' @examples -split_at_timestamp <- function( - clones, - clone_followup, - t_events, - event, - timestamp_start = "Tstart", - id = "ID" -) { - arms <- names(clones) - - res <- vector("list", length = length(arms)) - names(res) <- arms - for (arm in arms) { - res[[arm]] <- - clones[[arm]] |> - survival::survSplit( - cut = t_events, - end = clone_followup, - start = timestamp_start, - event = event, - id = id - ) - } - - res -} - #' Create training data for censoring probability estimation #' @@ -114,6 +36,27 @@ split_at_timestamp <- function( #' #' @export #' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", +#' clone_outcome = "outcome", clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) +#' censoring_logics <- create_censoring_logics_A( +#' arms, "surgery", "timetosurgery", 182.62, "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) +#' clones_censored <- apply_logics(clones_policy, censoring_logics) +#' clones_final <- create_final_data( +#' clones_censored, +#' clone_followup = "fup", +#' clone_outcome = "outcome", +#' clone_censoring = "censoring", +#' col_ids = "id" +#' ) create_final_data <- function( clones, clone_followup, @@ -124,6 +67,11 @@ create_final_data <- function( id = "ID", timestamp_stop = "Tstop" ) { + .assert_clone_columns( + clones, + c(clone_followup, clone_outcome, clone_censoring, col_ids) + ) + df_timestamp <- create_timestamp_table(clones, clone_followup) clones_splitted_by_outcome <- split_at_timestamp( @@ -146,8 +94,16 @@ create_final_data <- function( # merge two tables and create column to represent end of timestamp df_timestamp_with_time_zero <- - df_timestamp |> - dplyr::bind_rows(dplyr::tibble(tevent = 0, ID_t = 0)) + dplyr::bind_rows( + dplyr::tibble(tevent = 0, ID_t = 0), + df_timestamp + ) + df_timestamp_with_time_zero <- + df_timestamp_with_time_zero[ + !duplicated(df_timestamp_with_time_zero$tevent), + , + drop = FALSE + ] n_clones <- length(clones) arms <- names(clones) @@ -157,30 +113,28 @@ create_final_data <- function( for (i in seq_len(n_clones)) { x <- clones_splitted_by_outcome[[i]] |> - dplyr::select(!{{ clone_censoring }}) + dplyr::select(-dplyr::all_of(clone_censoring)) y <- clones_splitted_by_censoring[[i]] |> dplyr::select( - dplyr::all_of(col_ids), - {{ clone_followup }}, - {{ clone_censoring }} + dplyr::all_of(c(col_ids, clone_followup, clone_censoring)) ) res[[i]] <- dplyr::inner_join( x, y, - by = dplyr::join_by({{ col_ids }}, {{ clone_followup }}) + by = c(col_ids, clone_followup) ) |> - dplyr::mutate({{ timestamp_stop }} := .data[[clone_followup]]) + dplyr::mutate(!!timestamp_stop := .data[[clone_followup]]) # Merge with timestamp table res[[i]] <- res[[i]] |> dplyr::left_join( df_timestamp_with_time_zero, - by = dplyr::join_by({{ timestamp_start }} == tevent) + by = stats::setNames("tevent", timestamp_start) ) } diff --git a/R/create_timestamp_table.R b/R/create_timestamp_table.R new file mode 100644 index 0000000..fbd2015 --- /dev/null +++ b/R/create_timestamp_table.R @@ -0,0 +1,14 @@ +#' Create a timestamp lookup table +#' +#' @noRd +create_timestamp_table <- function(clones, clone_followup) { + .assert_clone_columns(clones, clone_followup) + + timestamps <- unlist( + lapply(clones, `[[`, clone_followup), + use.names = FALSE + ) + t_events <- sort(unique(timestamps)) + + dplyr::tibble(tevent = t_events, ID_t = seq_along(t_events)) +} diff --git a/R/emul_estimate.R b/R/emul_estimate.R new file mode 100644 index 0000000..97328a9 --- /dev/null +++ b/R/emul_estimate.R @@ -0,0 +1,149 @@ +#' Estimate the emulated trial effect +#' +#' @param clones_weighted A named list of weighted clone data frames, or a +#' single data frame containing an arm column. +#' @param method Analysis method: `"Cox"`, `"logistic"`, or `"KM"`. +#' @param cluster Column name used for robust clustering in Cox models. +#' @param weights Optional weight column name. Unquoted column names are also +#' accepted when they exist in the data. +#' @param predictors Optional adjustment predictors. For KM, predictors define +#' additional strata rather than covariate adjustment. +#' @param outcome Column name for the outcome indicator. +#' @param time_start Column name for interval start time. +#' @param time_stop Column name for interval stop time. +#' @param arm Column name for treatment arm after binding clone lists. +#' +#' @returns A fitted model object: `"coxph"` for Cox, `"glm"` for logistic, or +#' `"survfit"` for KM. +#' @export +#' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", +#' clone_outcome = "outcome", clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) +#' censoring_logics <- create_censoring_logics_A( +#' arms, "surgery", "timetosurgery", 182.62, "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) +#' clones_censored <- apply_logics(clones_policy, censoring_logics) +#' clones_final <- create_final_data( +#' clones_censored, "fup", "outcome", "censoring", "id" +#' ) +#' clones_estimated <- estimate_censoring( +#' clones_final, +#' predictors = c("age", "sex"), +#' method = "pooled_logit" +#' ) +#' clones_weighted <- weight_cases(clones_estimated) +#' fit <- emul_estimate( +#' clones_weighted, +#' method = "Cox", +#' weights = "weight_Cox", +#' predictors = c("age", "sex") +#' ) +emul_estimate <- function( + clones_weighted, + method = c("Cox", "logistic", "KM"), + cluster = "id", + weights = NULL, + predictors = NULL, + outcome = "outcome", + time_start = "Tstart", + time_stop = "Tstop", + arm = "arms" +) { + weights_expr <- substitute(weights) + method <- match.arg(method) + dat <- bind_clone_arms(clones_weighted, arm = arm) + + if (length(stats::na.omit(levels(dat[[arm]]))) < 2L) { + stop("`", arm, "` must contain at least two levels.", call. = FALSE) + } + + predictors <- normalize_predictors(predictors, exclude = arm) + required_columns <- c(arm, outcome, predictors) + if (method %in% c("Cox", "KM")) { + required_columns <- c(required_columns, time_start, time_stop) + } + if (method == "Cox" && !is.null(cluster)) { + required_columns <- c(required_columns, cluster) + } + .assert_required_columns(dat, required_columns) + + weight_col <- normalize_weights(weights, weights_expr, dat) + weighted_data <- add_analysis_weights(dat, weight_col) + dat <- weighted_data$data + analysis_weight <- weighted_data$weight + + surv_response <- paste0( + "survival::Surv(", + backtick_name(time_start), + ", ", + backtick_name(time_stop), + ", ", + backtick_name(outcome), + ")" + ) + cox_formula <- emul_formula( + surv_response, + predictors = predictors, + cluster = cluster, + arm = arm + ) + km_formula <- emul_formula( + surv_response, + predictors = predictors, + arm = arm + ) + logistic_formula <- emul_formula( + backtick_name(outcome), + predictors = predictors, + arm = arm + ) + + if (method == "KM" && length(predictors) > 0L) { + message( + "`predictors` in KM create separate strata; they do not produce ", + "covariate-adjusted survival curves." + ) + } + + if (method == "Cox") { + cox_args <- list( + formula = cox_formula, + data = quote(dat), + robust = TRUE, + ties = "efron" + ) + if (!is.null(analysis_weight)) { + cox_args$weights <- as.name(analysis_weight) + } + return(base::do.call(survival::coxph, cox_args)) + } + + if (method == "logistic") { + logistic_args <- list( + formula = logistic_formula, + data = quote(dat), + family = stats::binomial(link = "logit") + ) + if (!is.null(analysis_weight)) { + logistic_args$weights <- as.name(analysis_weight) + } + return(base::do.call(stats::glm, logistic_args)) + } + + survfit_args <- list( + formula = km_formula, + data = quote(dat) + ) + if (!is.null(analysis_weight)) { + survfit_args$weights <- as.name(analysis_weight) + } + base::do.call(survival::survfit, survfit_args) +} diff --git a/R/emul_estimate_bootstrap.R b/R/emul_estimate_bootstrap.R new file mode 100644 index 0000000..ad76868 --- /dev/null +++ b/R/emul_estimate_bootstrap.R @@ -0,0 +1,139 @@ +#' Bootstrap the emulated trial effect +#' +#' @param clones_weighted A named list of weighted clone data frames, or a +#' single data frame containing an arm column. +#' @param method Analysis method: `"Cox"` or `"logistic"`. +#' @param cluster Column name identifying resampling clusters. +#' @param predictors Optional adjustment predictors. +#' @param weights Optional weight column name. Unquoted column names are also +#' accepted when they exist in the data. +#' @param n_bootstrap Number of bootstrap resamples. +#' @param outcome Column name for the outcome indicator. +#' @param time_start Column name for interval start time. +#' @param time_stop Column name for interval stop time. +#' @param arm Column name for treatment arm after binding clone lists. +#' @param conf_level Confidence level for the percentile interval. +#' @param seed Optional random seed. +#' +#' @returns A list with `ci_lower`, `ci_upper`, and bootstrap `estimates`. +#' @export +#' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", +#' clone_outcome = "outcome", clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) +#' censoring_logics <- create_censoring_logics_A( +#' arms, "surgery", "timetosurgery", 182.62, "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) +#' clones_censored <- apply_logics(clones_policy, censoring_logics) +#' clones_final <- create_final_data( +#' clones_censored, "fup", "outcome", "censoring", "id" +#' ) +#' clones_estimated <- estimate_censoring( +#' clones_final, +#' predictors = c("age", "sex"), +#' method = "pooled_logit" +#' ) +#' clones_weighted <- weight_cases(clones_estimated) +#' boot <- emul_estimate_bootstrap( +#' clones_weighted, +#' method = "Cox", +#' weights = "weight_Cox", +#' predictors = c("age", "sex"), +#' n_bootstrap = 3, +#' seed = 1 +#' ) +emul_estimate_bootstrap <- function( + clones_weighted, + method = c("Cox", "logistic"), + cluster = "id", + predictors = NULL, + weights = NULL, + n_bootstrap = 200, + outcome = "outcome", + time_start = "Tstart", + time_stop = "Tstop", + arm = "arms", + conf_level = 0.95, + seed = NULL +) { + weights_expr <- substitute(weights) + method <- match.arg(method) + .assert_positive_integer(n_bootstrap, "n_bootstrap") + .assert_conf_level(conf_level) + + dat <- bind_clone_arms(clones_weighted, arm = arm) + arm_levels <- levels(dat[[arm]]) + if (length(arm_levels) != 2L) { + stop("Bootstrap effect estimation currently requires exactly two arms.", call. = FALSE) + } + + predictors <- normalize_predictors(predictors, exclude = arm) + required_columns <- c(cluster, arm, outcome, predictors) + if (method == "Cox") { + required_columns <- c(required_columns, time_start, time_stop) + } + .assert_required_columns(dat, required_columns) + + weight_col <- normalize_weights(weights, weights_expr, dat) + + if (!is.null(seed)) { + set.seed(seed) + } + + boot_estimates <- vector("numeric", length = n_bootstrap) + unique_clusters <- unique(dat[[cluster]]) + + for (i in seq_len(n_bootstrap)) { + bootstrap_clusters <- unique_clusters[ + sample.int( + length(unique_clusters), + size = length(unique_clusters), + replace = TRUE + ) + ] + bootstrap_sample <- dplyr::bind_rows( + lapply(seq_along(bootstrap_clusters), function(j) { + cluster_rows <- dat[ + dat[[cluster]] == bootstrap_clusters[[j]], + , + drop = FALSE + ] + cluster_rows$.bootstrap_id <- j + cluster_rows + }) + ) + bootstrap_sample[[arm]] <- factor(bootstrap_sample[[arm]], levels = arm_levels) + + fit <- emul_estimate( + bootstrap_sample, + method = method, + cluster = ".bootstrap_id", + weights = weight_col, + predictors = predictors, + outcome = outcome, + time_start = time_start, + time_stop = time_stop, + arm = arm + ) + coef_values <- stats::coef(fit) + arm_coef <- find_arm_coefficient(coef_values, arm, arm_levels[[2]]) + boot_estimates[[i]] <- unname(exp(coef_values[[arm_coef]])) + } + + alpha <- 1 - conf_level + lower_ci <- stats::quantile(boot_estimates, probs = alpha / 2) + upper_ci <- stats::quantile(boot_estimates, probs = 1 - alpha / 2) + + list( + ci_lower = lower_ci, + ci_upper = upper_ci, + estimates = boot_estimates + ) +} diff --git a/R/estimate_helpers.R b/R/estimate_helpers.R new file mode 100644 index 0000000..e9832e8 --- /dev/null +++ b/R/estimate_helpers.R @@ -0,0 +1,181 @@ +#' Backtick a column name when needed for formulas +#' +#' @noRd +backtick_name <- function(x) { + if (length(x) == 0L) { + return(character()) + } + + vapply( + x, + function(name) { + if (make.names(name) == name) { + name + } else { + paste0("`", gsub("`", "\\\\`", name), "`") + } + }, + character(1) + ) +} + +#' Normalize predictor column names +#' +#' @noRd +normalize_predictors <- function(predictors = NULL, exclude = NULL) { + if (is.null(predictors)) { + return(character()) + } + if (!is.character(predictors)) { + stop( + "`predictors` must be NULL or a character vector of column names.", + call. = FALSE + ) + } + if (any(is.na(predictors)) || any(!nzchar(predictors))) { + stop( + "`predictors` must contain non-missing, non-empty column names.", + call. = FALSE + ) + } + + setdiff(unique(predictors), exclude) +} + +#' Normalize a weights argument to a data column +#' +#' @noRd +normalize_weights <- function(weights, weights_expr, dat) { + if (identical(weights_expr, quote(NULL))) { + return(NULL) + } + + if (is.symbol(weights_expr)) { + weights_name <- as.character(weights_expr) + if (weights_name %in% names(dat)) { + weight_col <- weights_name + } else { + weights <- force(weights) + if (is.null(weights)) { + return(NULL) + } + if (!is.character(weights) || length(weights) != 1L) { + stop( + "`weights` must be NULL or a single column name.", + call. = FALSE + ) + } + weight_col <- weights + } + } else { + weights <- force(weights) + if (is.null(weights)) { + return(NULL) + } + if (!is.character(weights) || length(weights) != 1L) { + stop( + "`weights` must be NULL or a single column name.", + call. = FALSE + ) + } + weight_col <- weights + } + + if (is.na(weight_col) || !nzchar(weight_col)) { + stop("`weights` must be a non-empty column name.", call. = FALSE) + } + if (!weight_col %in% names(dat)) { + stop("`weights` column not found in data: ", weight_col, call. = FALSE) + } + if (!is.numeric(dat[[weight_col]])) { + stop("`weights` column must be numeric: ", weight_col, call. = FALSE) + } + + weight_col +} + +#' Create an emulated-trial model formula +#' +#' @noRd +emul_formula <- function( + response, + predictors = NULL, + cluster = NULL, + arm = "arms" +) { + predictors <- normalize_predictors(predictors, exclude = arm) + terms <- backtick_name(c(arm, predictors)) + if (!is.null(cluster)) { + terms <- c(terms, paste0("cluster(", backtick_name(cluster), ")")) + } + + stats::as.formula( + paste( + response, + paste(terms, collapse = " + "), + sep = " ~ " + ) + ) +} + +#' Bind clone arms into one data frame +#' +#' @noRd +bind_clone_arms <- function(clones, arm = "arms") { + if (is.data.frame(clones)) { + dat <- tibble::as_tibble(clones) + if (!arm %in% names(dat)) { + stop("Data frame input must include an `", arm, "` column.", call. = FALSE) + } + } else { + .assert_named_data_frame_list(clones) + arm_conflicts <- vapply(clones, function(x) arm %in% names(x), logical(1)) + if (any(arm_conflicts)) { + stop( + "Clone data frames must not already include the `", arm, "` column.", + call. = FALSE + ) + } + dat <- dplyr::bind_rows(clones, .id = arm) + } + + dat[[arm]] <- factor(dat[[arm]], levels = unique(dat[[arm]])) + dat +} + +#' Add a temporary analysis weight column +#' +#' @noRd +add_analysis_weights <- function(dat, weight_col) { + if (is.null(weight_col)) { + return(list(data = dat, weight = NULL)) + } + + weight_name <- ".analysis_weight" + while (weight_name %in% names(dat)) { + weight_name <- paste0(".", weight_name) + } + dat[[weight_name]] <- dat[[weight_col]] + + list(data = dat, weight = weight_name) +} + +#' Find the arm coefficient in a two-arm model +#' +#' @noRd +find_arm_coefficient <- function(coef_values, arm, arm_level) { + candidates <- c( + paste0(arm, arm_level), + paste0(backtick_name(arm), arm_level) + ) + arm_coef <- intersect(candidates, names(coef_values)) + + if (length(arm_coef) != 1L) { + stop( + "Expected exactly one arm coefficient in the fitted model.", + call. = FALSE + ) + } + + arm_coef +} diff --git a/R/read_trial_data.R b/R/read_trial_data.R index fe577c4..d3b1650 100644 --- a/R/read_trial_data.R +++ b/R/read_trial_data.R @@ -5,6 +5,17 @@ #' #' @return A tibble. #' @export +#' @examples +#' csv_file <- tempfile(fileext = ".csv") +#' writeLines( +#' c( +#' "id,follow_up,event,treatment", +#' "1,10,1,A", +#' "2,12,0,B" +#' ), +#' con = csv_file +#' ) +#' trial_data <- read_trial_data(csv_file) read_trial_data <- function(file, show_col_types = FALSE) { tibble::as_tibble( readr::read_csv(file, show_col_types = show_col_types) diff --git a/R/split_at_timestamp.R b/R/split_at_timestamp.R new file mode 100644 index 0000000..d54589c --- /dev/null +++ b/R/split_at_timestamp.R @@ -0,0 +1,53 @@ +#' Split each observation into multiple subrecords at each time cut +#' +#' @param clones A list of data frame. Each element of the list represents +#' each treatment arm. This version of clones must contain a column that +#' represents a emulated follow up time (corresponding to `clone_followup` +#' argument) and an event of interest (corresponding to `event` argument). +#' @param clone_followup A column name in emulcated clone (i.e. `clones`) +#' that represents the emulated follow up time in cloned data frame. +#' @param t_events A vector of timestamp that outcome event can occur based on +#' observed data. +#' @param event A variable name of an event of interest. The variable should +#' exists in each data frame that is an element of `clones` argument, and +#' the variable value should be a binary (0 or 1). +#' @param timestamp_start A new variable name to denote start time. +#' @param id A new variable name for a unique observation identifier, to +#' represents that multiple rows in output data frame is associated with the +#' same observation. +#' +#' @returns A list of long-form data frames. Each data frame represents each +#' clone arm. Each row of the long-form data frame represents a subrecord of +#' each observation associated with each specific time interval. The first +#' subrecord starts with time 0, and the rows are expanded up to +#' `clone_followup`, where cut times are determined by `t_events` argument. +#' +#' @noRd +split_at_timestamp <- function( + clones, + clone_followup, + t_events, + event, + timestamp_start = "Tstart", + id = "ID" +) { + .assert_clone_columns(clones, c(clone_followup, event)) + + arms <- names(clones) + + res <- vector("list", length = length(arms)) + names(res) <- arms + for (arm in arms) { + res[[arm]] <- + survival::survSplit( + data = clones[[arm]], + cut = t_events, + end = clone_followup, + start = timestamp_start, + event = event, + id = id + ) + } + + res +} diff --git a/R/utils.R b/R/utils.R index bc63cb8..c2480f1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -15,3 +15,71 @@ ) } } + +.assert_named_data_frame_list <- function(data, arg = "clones") { + if (!is.list(data) || is.null(names(data)) || any(names(data) == "")) { + stop("`", arg, "` must be a named list of data frames.", call. = FALSE) + } + + for (name in names(data)) { + if (!is.data.frame(data[[name]])) { + stop("Every element of `", arg, "` must be a data frame.", call. = FALSE) + } + } +} + +.assert_clone_columns <- function(clones, columns) { + .assert_named_data_frame_list(clones) + + for (arm in names(clones)) { + tryCatch( + .assert_required_columns(clones[[arm]], columns), + error = function(cnd) { + stop( + "In clone arm `", arm, "`: ", + conditionMessage(cnd), + call. = FALSE + ) + } + ) + } +} + +.assert_probability_floor <- function(eps) { + if (!is.numeric(eps) || length(eps) != 1L || is.na(eps) || eps <= 0 || eps >= 1) { + stop("`eps` must be a single number between 0 and 1.", call. = FALSE) + } +} + +.clamp_probability <- function(x, eps = 1e-6) { + .assert_probability_floor(eps) + if (!is.numeric(x)) { + stop("Probabilities must be numeric.", call. = FALSE) + } + + pmin(pmax(x, eps), 1 - eps) +} + +.assert_positive_integer <- function(x, arg) { + if ( + !is.numeric(x) || + length(x) != 1L || + is.na(x) || + x < 1 || + x != as.integer(x) + ) { + stop("`", arg, "` must be a positive integer.", call. = FALSE) + } +} + +.assert_conf_level <- function(conf_level) { + if ( + !is.numeric(conf_level) || + length(conf_level) != 1L || + is.na(conf_level) || + conf_level <= 0 || + conf_level >= 1 + ) { + stop("`conf_level` must be a single number between 0 and 1.", call. = FALSE) + } +} diff --git a/R/weight_cases.R b/R/weight_cases.R new file mode 100644 index 0000000..7201319 --- /dev/null +++ b/R/weight_cases.R @@ -0,0 +1,76 @@ +#' Add inverse probability of censoring weights +#' +#' @param clones A named list of clone data frames returned by +#' `estimate_censoring()`. +#' @param uncensored_prob Column name for denominator uncensoring probability. +#' @param numerator_uncensored_prob Column name for numerator uncensoring +#' probability. When this column exists, stabilized weights are created. +#' @param weight Column name for the output weight. +#' @param eps Small probability floor to avoid division by zero. +#' +#' @returns A named list of clone data frames with the weight column added. +#' @export +#' @examples +#' data(lungcancer) +#' arms <- c("Control", "Surgery") +#' clones <- clone_arms(lungcancer, arms) +#' policies <- create_policy_A( +#' arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", +#' clone_outcome = "outcome", clone_followup = "fup" +#' ) +#' clones_policy <- apply_logics(clones, policies) +#' censoring_logics <- create_censoring_logics_A( +#' arms, "surgery", "timetosurgery", 182.62, "fup_obs", +#' clone_censoring = "censoring", +#' clone_uncensored_followup = "fup_uncensored" +#' ) +#' clones_censored <- apply_logics(clones_policy, censoring_logics) +#' clones_final <- create_final_data( +#' clones_censored, "fup", "outcome", "censoring", "id" +#' ) +#' clones_estimated <- estimate_censoring( +#' clones_final, +#' predictors = c("age", "sex"), +#' method = "pooled_logit" +#' ) +#' clones_weighted <- weight_cases(clones_estimated) +weight_cases <- function( + clones, + uncensored_prob = "P_uncens", + numerator_uncensored_prob = "P_uncens_num", + weight = "weight_Cox", + eps = 1e-6 +) { + .assert_clone_columns(clones, uncensored_prob) + .assert_probability_floor(eps) + + arms <- names(clones) + res <- vector("list", length = length(arms)) + names(res) <- arms + + for (arm in arms) { + dat <- clones[[arm]] + if (!is.numeric(dat[[uncensored_prob]])) { + stop("`", uncensored_prob, "` must be numeric in clone arm `", arm, "`.", call. = FALSE) + } + + denominator <- pmax(dat[[uncensored_prob]], eps) + if (numerator_uncensored_prob %in% names(dat)) { + if (!is.numeric(dat[[numerator_uncensored_prob]])) { + stop( + "`", numerator_uncensored_prob, "` must be numeric in clone arm `", arm, "`.", + call. = FALSE + ) + } + weights <- dat[[numerator_uncensored_prob]] / denominator + } else { + weights <- 1 / denominator + } + + res[[arm]] <- + dat |> + dplyr::mutate(!!weight := .env[["weights"]]) + } + + res +} diff --git a/README.md b/README.md index 483e59e..d0510ed 100644 --- a/README.md +++ b/README.md @@ -64,31 +64,74 @@ library(clonecensorweighting) ```r library(clonecensorweighting) -trial_data <- tibble::tibble( - id = c(1, 2), - follow_up = c(10, 12), - event = c(1, 0), - treatment = c("A", "B") +data(lungcancer) + +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) + +policies <- create_policy_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + outcome = "death", + followup = "fup_obs", + clone_outcome = "outcome", + clone_followup = "fup" ) - -clones <- clone_censor_weighting( - data = trial_data, - id = "id", - follow_up = "follow_up", - event = "event", - treatment = "treatment", - regimes = c("A", "B") +clones_policy <- apply_logics(clones, policies) + +censoring_logics <- create_censoring_logics_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + followup = "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) + +clones_final <- create_final_data( + clones_censored, + clone_followup = "fup", + clone_outcome = "outcome", + clone_censoring = "censoring", + col_ids = "id" ) -clones +clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" +) +clones_weighted <- weight_cases(clones_estimated) -make_surv_response( - data = trial_data, - follow_up = "follow_up", - event = "event" +fit <- emul_estimate( + clones_weighted, + method = "Cox", + weights = "weight_Cox", + predictors = c("age", "sex") ) + +exp(stats::coef(fit)) ``` +The full process is: + +1. Clone each patient into the target trial arms with `clone_arms()`. +2. Define and apply treatment-policy logic with `create_policy_A()` and + `apply_logics()`. +3. Define and apply artificial censoring logic with + `create_censoring_logics_A()` and `apply_logics()`. +4. Expand cloned observations into long-form interval data with + `create_final_data()`. +5. Estimate censoring probabilities with `estimate_censoring()`. +6. Add inverse probability of censoring weights with `weight_cases()`. +7. Estimate the emulated trial effect with `emul_estimate()`. +8. Use `emul_estimate_bootstrap()` when bootstrap confidence intervals are + needed. + ## What the package currently provides The package is still an early, lightweight foundation for future work. Right @@ -97,6 +140,15 @@ now it includes: - `read_trial_data()` to read trial-style CSV data into a tibble - `clone_censor_weighting()` to create a starter cloned dataset across regimes - `make_surv_response()` to build a `survival::Surv()` response object +- `clone_arms()` to duplicate observations across treatment strategies +- `create_policy_A()` and `create_censoring_logics_A()` to generate example + treatment-policy and artificial-censoring logic for the lung cancer scenario +- `apply_logics()` to apply policy or censoring logic to cloned data +- `create_final_data()` to create long-form interval data for censoring models +- `estimate_censoring()` and `weight_cases()` to estimate censoring + probabilities and add IPC weights +- `emul_estimate()` and `emul_estimate_bootstrap()` to estimate treatment + effects and bootstrap confidence intervals ## Working together on this repository diff --git a/man/apply_logics.Rd b/man/apply_logics.Rd index 17c3a56..11188b0 100644 --- a/man/apply_logics.Rd +++ b/man/apply_logics.Rd @@ -23,3 +23,19 @@ created by the provided logics. \description{ Apply case_when logics (e.g. policy, censoring) to clones } +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + outcome = "death", + followup = "fup_obs", + clone_outcome = "outcome", + clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +} diff --git a/man/clone_censor_weighting.Rd b/man/clone_censor_weighting.Rd index eb3f857..5ad0f2e 100644 --- a/man/clone_censor_weighting.Rd +++ b/man/clone_censor_weighting.Rd @@ -30,3 +30,14 @@ immediate deviations from the observed baseline treatment as censored clones. This is a light-weight scaffold for package development rather than a full causal inference implementation. } +\examples{ +data(lungcancer) +cloned <- clone_censor_weighting( + lungcancer, + id = "id", + follow_up = "fup_obs", + event = "death", + treatment = "surgery", + regimes = c(0, 1) +) +} diff --git a/man/create_censoring_logics_A.Rd b/man/create_censoring_logics_A.Rd index d7cee53..314531b 100644 --- a/man/create_censoring_logics_A.Rd +++ b/man/create_censoring_logics_A.Rd @@ -69,3 +69,15 @@ time. \description{ Generate censoring logic for scenario A } +\examples{ +arms <- c("Control", "Surgery") +censoring_logics <- create_censoring_logics_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + followup = "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +} diff --git a/man/create_final_data.Rd b/man/create_final_data.Rd index 91296e4..3f64300 100644 --- a/man/create_final_data.Rd +++ b/man/create_final_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reshape.R +% Please edit documentation in R/create_final_data.R \name{create_final_data} \alias{create_final_data} \title{Create training data for censoring probability estimation} @@ -60,3 +60,26 @@ subrecord starts with time 0, and the rows are expanded up to \description{ Create training data for censoring probability estimation } +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", + clone_outcome = "outcome", clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +censoring_logics <- create_censoring_logics_A( + arms, "surgery", "timetosurgery", 182.62, "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) +clones_final <- create_final_data( + clones_censored, + clone_followup = "fup", + clone_outcome = "outcome", + clone_censoring = "censoring", + col_ids = "id" +) +} diff --git a/man/create_policy_A.Rd b/man/create_policy_A.Rd index ba27671..233b387 100644 --- a/man/create_policy_A.Rd +++ b/man/create_policy_A.Rd @@ -69,3 +69,16 @@ creating new variables for emulated outcome and follow up time. \description{ Generate clone policy for scenario A } +\examples{ +arms <- c("Control", "Surgery") +policies <- create_policy_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + outcome = "death", + followup = "fup_obs", + clone_outcome = "outcome", + clone_followup = "fup" +) +} diff --git a/man/create_timestamp_table.Rd b/man/create_timestamp_table.Rd deleted file mode 100644 index bf2b34a..0000000 --- a/man/create_timestamp_table.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reshape.R -\name{create_timestamp_table} -\alias{create_timestamp_table} -\title{Create timestamp table} -\usage{ -create_timestamp_table(clones, clone_followup) -} -\arguments{ -\item{clones}{A list of data frame. Each element of the list represents -each treatment arm. This version of clones must contain a column that -represents a emulated follow up time.} - -\item{clone_followup}{A column name in emulcated clone (i.e. \code{clones}) -that represents the emulated follow up time in cloned data frame.} -} -\value{ -A data frame with two columns: \code{tevent} and \code{ID_t}. -\code{tevent} represents a timestamp that outcome event can occur based on -observed data. \code{ID_t} represents an enumerated identifier of each -timestamp, from 1 to n where n represents the number of unique \code{tevent} -value. -} -\description{ -Create timestamp table -} diff --git a/man/emul_estimate.Rd b/man/emul_estimate.Rd new file mode 100644 index 0000000..62db16d --- /dev/null +++ b/man/emul_estimate.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/emul_estimate.R +\name{emul_estimate} +\alias{emul_estimate} +\title{Estimate the emulated trial effect} +\usage{ +emul_estimate( + clones_weighted, + method = c("Cox", "logistic", "KM"), + cluster = "id", + weights = NULL, + predictors = NULL, + outcome = "outcome", + time_start = "Tstart", + time_stop = "Tstop", + arm = "arms" +) +} +\arguments{ +\item{clones_weighted}{A named list of weighted clone data frames, or a +single data frame containing an arm column.} + +\item{method}{Analysis method: \code{"Cox"}, \code{"logistic"}, or \code{"KM"}.} + +\item{cluster}{Column name used for robust clustering in Cox models.} + +\item{weights}{Optional weight column name. Unquoted column names are also +accepted when they exist in the data.} + +\item{predictors}{Optional adjustment predictors. For KM, predictors define +additional strata rather than covariate adjustment.} + +\item{outcome}{Column name for the outcome indicator.} + +\item{time_start}{Column name for interval start time.} + +\item{time_stop}{Column name for interval stop time.} + +\item{arm}{Column name for treatment arm after binding clone lists.} +} +\value{ +A fitted model object: \code{"coxph"} for Cox, \code{"glm"} for logistic, or +\code{"survfit"} for KM. +} +\description{ +Estimate the emulated trial effect +} +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", + clone_outcome = "outcome", clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +censoring_logics <- create_censoring_logics_A( + arms, "surgery", "timetosurgery", 182.62, "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) +clones_final <- create_final_data( + clones_censored, "fup", "outcome", "censoring", "id" +) +clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" +) +clones_weighted <- weight_cases(clones_estimated) +fit <- emul_estimate( + clones_weighted, + method = "Cox", + weights = "weight_Cox", + predictors = c("age", "sex") +) +} diff --git a/man/emul_estimate_bootstrap.Rd b/man/emul_estimate_bootstrap.Rd new file mode 100644 index 0000000..21a0a0a --- /dev/null +++ b/man/emul_estimate_bootstrap.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/emul_estimate_bootstrap.R +\name{emul_estimate_bootstrap} +\alias{emul_estimate_bootstrap} +\title{Bootstrap the emulated trial effect} +\usage{ +emul_estimate_bootstrap( + clones_weighted, + method = c("Cox", "logistic"), + cluster = "id", + predictors = NULL, + weights = NULL, + n_bootstrap = 200, + outcome = "outcome", + time_start = "Tstart", + time_stop = "Tstop", + arm = "arms", + conf_level = 0.95, + seed = NULL +) +} +\arguments{ +\item{clones_weighted}{A named list of weighted clone data frames, or a +single data frame containing an arm column.} + +\item{method}{Analysis method: \code{"Cox"} or \code{"logistic"}.} + +\item{cluster}{Column name identifying resampling clusters.} + +\item{predictors}{Optional adjustment predictors.} + +\item{weights}{Optional weight column name. Unquoted column names are also +accepted when they exist in the data.} + +\item{n_bootstrap}{Number of bootstrap resamples.} + +\item{outcome}{Column name for the outcome indicator.} + +\item{time_start}{Column name for interval start time.} + +\item{time_stop}{Column name for interval stop time.} + +\item{arm}{Column name for treatment arm after binding clone lists.} + +\item{conf_level}{Confidence level for the percentile interval.} + +\item{seed}{Optional random seed.} +} +\value{ +A list with \code{ci_lower}, \code{ci_upper}, and bootstrap \code{estimates}. +} +\description{ +Bootstrap the emulated trial effect +} +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", + clone_outcome = "outcome", clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +censoring_logics <- create_censoring_logics_A( + arms, "surgery", "timetosurgery", 182.62, "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) +clones_final <- create_final_data( + clones_censored, "fup", "outcome", "censoring", "id" +) +clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" +) +clones_weighted <- weight_cases(clones_estimated) +boot <- emul_estimate_bootstrap( + clones_weighted, + method = "Cox", + weights = "weight_Cox", + predictors = c("age", "sex"), + n_bootstrap = 3, + seed = 1 +) +} diff --git a/man/estimate_censoring.Rd b/man/estimate_censoring.Rd new file mode 100644 index 0000000..2073b87 --- /dev/null +++ b/man/estimate_censoring.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/censoring_estimation.R +\name{estimate_censoring} +\alias{estimate_censoring} +\title{Estimate censoring probabilities} +\usage{ +estimate_censoring( + clones, + predictors = NULL, + method = c("Cox", "pooled_logit", "stabilized_logit"), + numerator_predictors = NULL, + censoring = "censoring", + id = "id", + time_start = "Tstart", + time_stop = "Tstop", + eps = 1e-06 +) +} +\arguments{ +\item{clones}{A named list of long-form clone data frames.} + +\item{predictors}{Optional character vector of denominator model predictors.} + +\item{method}{Censoring model. \code{"Cox"} fits a Cox censoring model, +\code{"pooled_logit"} fits a pooled logistic denominator model, and +\code{"stabilized_logit"} additionally fits a numerator model.} + +\item{numerator_predictors}{Optional character vector of numerator model +predictors for stabilized pooled-logit weights. When \code{NULL}, \code{predictors} +are used.} + +\item{censoring}{Column name for the censoring indicator.} + +\item{id}{Column name for the subject identifier.} + +\item{time_start}{Column name for interval start time.} + +\item{time_stop}{Column name for interval stop time.} + +\item{eps}{Small probability floor to avoid division by zero.} +} +\value{ +A named list of clone data frames with censoring probability +columns added. All methods add \code{P_uncens}; pooled-logit methods also add +\code{p_cens_den}, and stabilized pooled logit adds \code{p_cens_num} and +\code{P_uncens_num}. +} +\description{ +Estimate censoring probabilities +} +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", + clone_outcome = "outcome", clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +censoring_logics <- create_censoring_logics_A( + arms, "surgery", "timetosurgery", 182.62, "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) +clones_final <- create_final_data( + clones_censored, "fup", "outcome", "censoring", "id" +) +clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" +) +} diff --git a/man/make_surv_response.Rd b/man/make_surv_response.Rd index 669bcc2..25b6cdf 100644 --- a/man/make_surv_response.Rd +++ b/man/make_surv_response.Rd @@ -19,3 +19,11 @@ An object of class \code{"Surv"}. \description{ Construct a survival response } +\examples{ +data(lungcancer) +surv_response <- make_surv_response( + lungcancer, + follow_up = "fup_obs", + event = "death" +) +} diff --git a/man/read_trial_data.Rd b/man/read_trial_data.Rd index 85c723e..4d30fb7 100644 --- a/man/read_trial_data.Rd +++ b/man/read_trial_data.Rd @@ -17,3 +17,15 @@ A tibble. \description{ Read trial-style data from CSV } +\examples{ +csv_file <- tempfile(fileext = ".csv") +writeLines( + c( + "id,follow_up,event,treatment", + "1,10,1,A", + "2,12,0,B" + ), + con = csv_file +) +trial_data <- read_trial_data(csv_file) +} diff --git a/man/split_at_timestamp.Rd b/man/split_at_timestamp.Rd deleted file mode 100644 index 1b50662..0000000 --- a/man/split_at_timestamp.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reshape.R -\name{split_at_timestamp} -\alias{split_at_timestamp} -\title{Split each observation into multiple subrecords at each time cut} -\usage{ -split_at_timestamp( - clones, - clone_followup, - t_events, - event, - timestamp_start = "Tstart", - id = "ID" -) -} -\arguments{ -\item{clones}{A list of data frame. Each element of the list represents -each treatment arm. This version of clones must contain a column that -represents a emulated follow up time (corresponding to \code{clone_followup} -argument) and an event of interest (corresponding to \code{event} argument).} - -\item{clone_followup}{A column name in emulcated clone (i.e. \code{clones}) -that represents the emulated follow up time in cloned data frame.} - -\item{t_events}{A vector of timestamp that outcome event can occur based on -observed data.} - -\item{event}{A variable name of an event of interest. The variable should -exists in each data frame that is an element of \code{clones} argument, and -the variable value should be a binary (0 or 1).} - -\item{timestamp_start}{A new variable name to denote start time.} - -\item{id}{A new variable name for a unique observation identifier, to -represents that multiple rows in output data frame is associated with the -same observation.} -} -\value{ -A list of long-form data frames. Each data frame represents each -clone arm. Each row of the long-form data frame represents a subrecord of -each observation associated with each specific time interval. The first -subrecord starts with time 0, and the rows are expanded up to -\code{clone_followup}, where cut times are determined by \code{t_events} argument. -} -\description{ -Split each observation into multiple subrecords at each time cut -} diff --git a/man/weight_cases.Rd b/man/weight_cases.Rd new file mode 100644 index 0000000..1cd5201 --- /dev/null +++ b/man/weight_cases.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/weight_cases.R +\name{weight_cases} +\alias{weight_cases} +\title{Add inverse probability of censoring weights} +\usage{ +weight_cases( + clones, + uncensored_prob = "P_uncens", + numerator_uncensored_prob = "P_uncens_num", + weight = "weight_Cox", + eps = 1e-06 +) +} +\arguments{ +\item{clones}{A named list of clone data frames returned by +\code{estimate_censoring()}.} + +\item{uncensored_prob}{Column name for denominator uncensoring probability.} + +\item{numerator_uncensored_prob}{Column name for numerator uncensoring +probability. When this column exists, stabilized weights are created.} + +\item{weight}{Column name for the output weight.} + +\item{eps}{Small probability floor to avoid division by zero.} +} +\value{ +A named list of clone data frames with the weight column added. +} +\description{ +Add inverse probability of censoring weights +} +\examples{ +data(lungcancer) +arms <- c("Control", "Surgery") +clones <- clone_arms(lungcancer, arms) +policies <- create_policy_A( + arms, "surgery", "timetosurgery", 182.62, "death", "fup_obs", + clone_outcome = "outcome", clone_followup = "fup" +) +clones_policy <- apply_logics(clones, policies) +censoring_logics <- create_censoring_logics_A( + arms, "surgery", "timetosurgery", 182.62, "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" +) +clones_censored <- apply_logics(clones_policy, censoring_logics) +clones_final <- create_final_data( + clones_censored, "fup", "outcome", "censoring", "id" +) +clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" +) +clones_weighted <- weight_cases(clones_estimated) +} diff --git a/tests/testthat/test-censoring_estimation.R b/tests/testthat/test-censoring_estimation.R new file mode 100644 index 0000000..cd1ed4a --- /dev/null +++ b/tests/testthat/test-censoring_estimation.R @@ -0,0 +1,87 @@ +make_censoring_clones <- function() { + list( + Control = tibble::tibble( + id = rep(1:6, each = 2), + Tstart = rep(c(0, 1), 6), + Tstop = rep(c(1, 2), 6), + outcome = c(0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), + censoring = c(0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0), + age = rep(c(50, 60, 55, 65, 58, 62), each = 2) + ), + Surgery = tibble::tibble( + id = rep(1:6, each = 2), + Tstart = rep(c(0, 1), 6), + Tstop = rep(c(1, 2), 6), + outcome = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0), + censoring = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1), + age = rep(c(50, 60, 55, 65, 58, 62), each = 2) + ) + ) +} + +test_that("estimate_censoring adds pooled-logit denominator probabilities", { + result <- suppressWarnings( + estimate_censoring( + make_censoring_clones(), + predictors = "age", + method = "pooled_logit" + ) + ) + + expect_named(result, c("Control", "Surgery")) + expect_true(all(c("p_cens_den", "P_uncens") %in% names(result$Control))) + expect_true(all(result$Control$p_cens_den > 0)) + expect_true(all(result$Control$p_cens_den < 1)) + expect_true(all(result$Control$P_uncens > 0)) + expect_true(all(result$Control$P_uncens <= 1)) +}) + +test_that("estimate_censoring supports stabilized pooled-logit weights", { + result <- suppressWarnings( + estimate_censoring( + make_censoring_clones(), + predictors = "age", + method = "stabilized_logit" + ) + ) + + expect_true(all(c("p_cens_num", "P_uncens_num") %in% names(result$Surgery))) + expect_true(all(result$Surgery$P_uncens_num > 0)) + expect_true(all(result$Surgery$P_uncens_num <= 1)) +}) + +test_that("estimate_censoring supports Cox censoring models", { + result <- suppressWarnings( + estimate_censoring( + make_censoring_clones(), + predictors = "age", + method = "Cox" + ) + ) + + expect_true(all(c("lin_pred", "hazard", "P_uncens") %in% names(result$Control))) + expect_true(all(result$Control$P_uncens > 0)) + expect_true(all(result$Control$P_uncens <= 1)) +}) + +test_that("weight_cases creates unstabilized and stabilized IPC weights", { + unstabilized <- list( + Control = tibble::tibble(P_uncens = c(1, 0.5)), + Surgery = tibble::tibble(P_uncens = c(0.25, 0.8)) + ) + stabilized <- list( + Control = tibble::tibble(P_uncens = c(1, 0.5), P_uncens_num = c(1, 0.75)), + Surgery = tibble::tibble(P_uncens = c(0.25, 0.8), P_uncens_num = c(0.5, 0.4)) + ) + + expect_equal(weight_cases(unstabilized)$Control$weight_Cox, c(1, 2)) + expect_equal(weight_cases(stabilized)$Surgery$weight_Cox, c(2, 0.5)) +}) + +test_that("censoring helper functions remain internal", { + exports <- getNamespaceExports("clonecensorweighting") + + expect_false("make_censoring_formula" %in% exports) + expect_false("cumulative_uncensoring" %in% exports) + expect_false("add_baseline_predictors" %in% exports) +}) diff --git a/tests/testthat/test-create_final_data.R b/tests/testthat/test-create_final_data.R new file mode 100644 index 0000000..117e913 --- /dev/null +++ b/tests/testthat/test-create_final_data.R @@ -0,0 +1,84 @@ +make_test_clones <- function() { + list( + Control = tibble::tibble( + id = c(1, 2), + fup = c(5, 10), + outcome = c(1, 0), + censoring = c(0, 1) + ), + Surgery = tibble::tibble( + id = c(1, 2), + fup = c(6, 8), + outcome = c(0, 1), + censoring = c(1, 0) + ) + ) +} + +test_that("timestamp and split helpers are internal", { + exports <- getNamespaceExports("clonecensorweighting") + + expect_false("create_timestamp_table" %in% exports) + expect_false("split_at_timestamp" %in% exports) +}) + +test_that("create_timestamp_table creates ordered unique event times", { + result <- create_timestamp_table(make_test_clones(), "fup") + + expect_equal( + result, + tibble::tibble(tevent = c(5, 6, 8, 10), ID_t = 1:4) + ) +}) + +test_that("split_at_timestamp splits each clone arm at common timestamps", { + result <- split_at_timestamp( + make_test_clones(), + clone_followup = "fup", + t_events = c(5, 6, 8, 10), + event = "outcome" + ) + + expect_named(result, c("Control", "Surgery")) + expect_equal(result$Control$Tstart, c(0, 0, 5, 6, 8)) + expect_equal(result$Control$fup, c(5, 5, 6, 8, 10)) + expect_equal(result$Control$outcome, c(1, 0, 0, 0, 0)) + expect_equal(result$Surgery$Tstart, c(0, 5, 0, 5, 6)) + expect_equal(result$Surgery$fup, c(5, 6, 5, 6, 8)) + expect_equal(result$Surgery$outcome, c(0, 0, 0, 0, 1)) +}) + +test_that("create_final_data combines outcome and censoring intervals", { + result <- create_final_data( + make_test_clones(), + clone_followup = "fup", + clone_outcome = "outcome", + clone_censoring = "censoring", + col_ids = "id" + ) + + expect_named(result, c("Control", "Surgery")) + expect_equal(result$Control$Tstop, result$Control$fup) + expect_equal(result$Surgery$Tstop, result$Surgery$fup) + expect_equal(result$Control$ID_t, c(0, 0, 1, 2, 3)) + expect_equal(result$Surgery$ID_t, c(0, 1, 0, 1, 2)) + expect_equal(result$Control$censoring, c(0, 0, 0, 0, 1)) + expect_equal(result$Surgery$censoring, c(0, 1, 0, 0, 0)) + expect_equal(result$Surgery$outcome, c(0, 0, 0, 0, 1)) +}) + +test_that("create_final_data reports missing clone columns by arm", { + clones <- make_test_clones() + clones$Control$censoring <- NULL + + expect_error( + create_final_data( + clones, + clone_followup = "fup", + clone_outcome = "outcome", + clone_censoring = "censoring", + col_ids = "id" + ), + "In clone arm `Control`: Missing required columns: censoring" + ) +}) diff --git a/tests/testthat/test-emul_estimate.R b/tests/testthat/test-emul_estimate.R new file mode 100644 index 0000000..11d8f0b --- /dev/null +++ b/tests/testthat/test-emul_estimate.R @@ -0,0 +1,67 @@ +make_weighted_clones <- function() { + list( + Control = tibble::tibble( + id = 1:8, + Tstart = 0, + Tstop = 1, + outcome = c(1, 0, 1, 0, 1, 0, 0, 1), + weight_Cox = 1, + age = c(50, 55, 60, 65, 52, 57, 62, 67) + ), + Surgery = tibble::tibble( + id = 1:8, + Tstart = 0, + Tstop = 1, + outcome = c(0, 1, 0, 0, 0, 1, 0, 0), + weight_Cox = 1, + age = c(50, 55, 60, 65, 52, 57, 62, 67) + ) + ) +} + +test_that("emul_estimate fits Cox, logistic, and KM analyses", { + weighted <- make_weighted_clones() + + cox_fit <- emul_estimate(weighted, method = "Cox", weights = "weight_Cox") + logistic_fit <- suppressWarnings( + emul_estimate(weighted, method = "logistic", weights = weight_Cox) + ) + km_fit <- emul_estimate(weighted, method = "KM", weights = "weight_Cox") + + expect_s3_class(cox_fit, "coxph") + expect_s3_class(logistic_fit, "glm") + expect_s3_class(km_fit, "survfit") +}) + +test_that("emul_estimate accepts data frame input with an arm column", { + dat <- dplyr::bind_rows(make_weighted_clones(), .id = "arms") + + fit <- emul_estimate(dat, method = "Cox", weights = "weight_Cox") + + expect_s3_class(fit, "coxph") +}) + +test_that("emul_estimate_bootstrap returns percentile intervals", { + result <- emul_estimate_bootstrap( + make_weighted_clones(), + method = "Cox", + weights = "weight_Cox", + n_bootstrap = 3, + seed = 1 + ) + + expect_named(result, c("ci_lower", "ci_upper", "estimates")) + expect_length(result$estimates, 3) + expect_true(all(is.finite(result$estimates))) + expect_true(is.finite(result$ci_lower)) + expect_true(is.finite(result$ci_upper)) +}) + +test_that("estimation helper functions remain internal", { + exports <- getNamespaceExports("clonecensorweighting") + + expect_false("backtick_name" %in% exports) + expect_false("normalize_predictors" %in% exports) + expect_false("normalize_weights" %in% exports) + expect_false("emul_formula" %in% exports) +}) diff --git a/tests/testthat/test-lungcancer-workflow.R b/tests/testthat/test-lungcancer-workflow.R new file mode 100644 index 0000000..f59bce8 --- /dev/null +++ b/tests/testthat/test-lungcancer-workflow.R @@ -0,0 +1,97 @@ +make_lungcancer_workflow <- function() { + data(lungcancer, package = "clonecensorweighting") + + arms <- c("Control", "Surgery") + clones <- clone_arms(lungcancer, arms) + policies <- create_policy_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + outcome = "death", + followup = "fup_obs", + clone_outcome = "outcome", + clone_followup = "fup" + ) + clones_policy <- apply_logics(clones, policies) + censoring_logics <- create_censoring_logics_A( + arms, + treatment = "surgery", + time_to_treatment = "timetosurgery", + grace_period = 182.62, + followup = "fup_obs", + clone_censoring = "censoring", + clone_uncensored_followup = "fup_uncensored" + ) + clones_censored <- apply_logics(clones_policy, censoring_logics) + clones_final <- create_final_data( + clones_censored, + clone_followup = "fup", + clone_outcome = "outcome", + clone_censoring = "censoring", + col_ids = "id" + ) + clones_estimated <- estimate_censoring( + clones_final, + predictors = c("age", "sex"), + method = "pooled_logit" + ) + clones_weighted <- weight_cases(clones_estimated) + + list( + clones = clones, + clones_final = clones_final, + clones_estimated = clones_estimated, + clones_weighted = clones_weighted + ) +} + +test_that("lungcancer data runs through clone-censor-weighting workflow", { + workflow <- make_lungcancer_workflow() + + expect_named(workflow$clones, c("Control", "Surgery")) + expect_equal(vapply(workflow$clones, nrow, integer(1)), c(Control = 200L, Surgery = 200L)) + expect_equal( + vapply(workflow$clones_final, nrow, integer(1)), + c(Control = 8792L, Surgery = 13980L) + ) + + required_final_columns <- c("id", "Tstart", "Tstop", "outcome", "censoring", "ID_t") + expect_true(all(required_final_columns %in% names(workflow$clones_final$Control))) + expect_true(all(c("p_cens_den", "P_uncens") %in% names(workflow$clones_estimated$Control))) + expect_true(all(c("weight_Cox") %in% names(workflow$clones_weighted$Control))) + expect_true(all(is.finite(workflow$clones_weighted$Control$weight_Cox))) + expect_true(all(workflow$clones_weighted$Control$weight_Cox > 0)) +}) + +test_that("lungcancer weighted data supports emulated trial estimation", { + workflow <- make_lungcancer_workflow() + + cox_fit <- emul_estimate( + workflow$clones_weighted, + method = "Cox", + weights = "weight_Cox", + predictors = c("age", "sex") + ) + km_fit <- emul_estimate( + workflow$clones_weighted, + method = "KM", + weights = "weight_Cox" + ) + boot <- emul_estimate_bootstrap( + workflow$clones_weighted, + method = "Cox", + weights = "weight_Cox", + predictors = c("age", "sex"), + n_bootstrap = 3, + seed = 1 + ) + + expect_s3_class(cox_fit, "coxph") + expect_true(all(is.finite(stats::coef(cox_fit)))) + expect_s3_class(km_fit, "survfit") + expect_length(boot$estimates, 3) + expect_true(all(is.finite(boot$estimates))) + expect_true(is.finite(boot$ci_lower)) + expect_true(is.finite(boot$ci_upper)) +})