Nothing
#' VPC function for time-to-event (survival) data
#'
#' This function can be used for either single time-to-event (TTE) or repeated time-to-event (RTTE) data.
#'
#' Creates a VPC plot from observed and simulation survival data
#' @param sim a data.frame with observed data, containing the independent and dependent variable, a column indicating the individual, and possibly covariates. E.g. load in from NONMEM using \link{read_table_nm}
#' @param obs a data.frame with observed data, containing the independent and dependent variable, a column indicating the individual, and possibly covariates. E.g. load in from NONMEM using \link{read_table_nm}
#' @param psn_folder instead of specifying "sim" and "obs", specify a PsN-generated VPC-folder
#' @param bins either "density", "time", or "data", or a numeric vector specifying the bin separators.
#' @param n_bins number of bins
#' @param obs_cols observation dataset column names (list elements: "dv", "idv", "id", "pred")
#' @param sim_cols simulation dataset column names (list elements: "dv", "idv", "id", "pred", "sim")
#' @param software name of software platform using (e.g. nonmem, phoenix)
#' @param show what to show in VPC (obs_ci, obs_median, sim_median, sim_median_ci)
#' @param rtte repeated time-to-event data? Default is FALSE (treat as single-event TTE)
#' @param rtte_calc_diff recalculate time (T/F)? When simulating in NONMEM, you will probably need to set this to TRUE to recalculate the TIME to relative times between events (unless you output the time difference between events and specify that as independent variable to the vpc_tte() function.
#' @param rtte_conditional `TRUE` (default) or `FALSE`. Compute the probability for each event newly (`TRUE`), or calculate the absolute probability (`FALSE`, i.e. the "probability of a 1st, 2nd, 3rd event etc" rather than the "probability of an event happening").
#' @param kmmc either NULL (for regular TTE vpc, default), or a variable name for a KMMC plot (e.g. "WT")
#' @param events numeric vector describing which events to show a VPC for when repeated TTE data, e.g. c(1:4). Default is NULL, which shows all events.
#' @param reverse_prob reverse the probability scale (i.e. plot 1-probability)
#' @param stratify character vector of stratification variables. Only 1 or 2 stratification variables can be supplied.
#' @param stratify_color character vector of stratification variables. Only 1 stratification variable can be supplied, cannot be used in conjunction with `stratify`.
#' @param ci confidence interval to plot. Default is (0.05, 0.95)
#' @param plot Boolean indicating whether to plot the ggplot2 object after creation. Default is FALSE.
#' @param as_percentage Show y-scale from 0-100 percent? TRUE by default, if FALSE then scale from 0-1.
#' @param xlab label for x-axis
#' @param ylab label for y-axis
#' @param title title
#' @param smooth "smooth" the VPC (connect bin midpoints) or show bins as rectangular boxes. Default is TRUE.
#' @param vpc_theme theme to be used in VPC. Expects list of class vpc_theme created with function vpc_theme()
#' @param facet either "wrap", "columns", or "rows"
#' @param labeller ggplot2 labeller function to be passed to underlying ggplot object
#' @param verbose TRUE or FALSE (default)
#' @param vpcdb Boolean whether to return the underlying vpcdb rather than the plot
#' @return a list containing calculated VPC information, and a ggplot2 object
#' @export
#' @seealso \link{sim_data}, \link{vpc}, \link{vpc_tte}, \link{vpc_cens}
#' @examples
#' ## See vpc-docs.ronkeizer.com for more documentation and examples.
#'
#' ## Example for repeated) time-to-event data
#' ## with NONMEM-like data (e.g. simulated using a dense grid)
#'
#' data(rtte_obs_nm)
#' data(rtte_sim_nm)
#'
#' # treat RTTE as TTE, no stratification
#' vpc_tte(sim = rtte_sim_nm[rtte_sim_nm$sim <= 20,],
#' obs = rtte_obs_nm,
#' rtte = FALSE,
#' sim_cols=list(dv = "dv", idv = "t"), obs_cols=list(idv = "t"))
#'
vpc_tte <- function(sim = NULL,
obs = NULL,
psn_folder = NULL,
rtte = FALSE,
rtte_calc_diff = TRUE,
rtte_conditional = TRUE,
events = NULL,
bins = FALSE,
n_bins = 10,
software = "auto",
obs_cols = NULL,
sim_cols = NULL,
kmmc = NULL,
reverse_prob = FALSE,
stratify = NULL,
stratify_color = NULL,
ci = c(0.05, 0.95),
plot = FALSE,
xlab = "Time",
ylab = "Survival (%)",
show = NULL,
as_percentage = TRUE,
title = NULL,
smooth = FALSE,
vpc_theme = NULL,
facet = "wrap",
labeller = NULL,
verbose = FALSE,
vpcdb = FALSE) {
if(is.null(obs) & is.null(sim)) {
stop("At least a simulation or an observation dataset are required to create a plot!")
}
if(!is.null(bins) && bins != FALSE) {
message("Binning is not recommended for `vpc_tte()`, plot might not show correctly!")
}
if(!is.null(kmmc)) {
if(!kmmc %in% names(obs)) {
stop(paste0("Specified covariate ", kmmc, " not found among column names in observed data."))
}
}
if(!is.null(kmmc)) {
if(!kmmc %in% names(sim)) {
stop(paste0("Specified covariate ", kmmc, " not found among column names in simulated data."))
}
}
message("Initializing.")
if(!is.null(psn_folder)) {
if(!is.null(obs)) {
obs <- vpc::read_table_nm(paste0(psn_folder, "/m1/", dir(paste0(psn_folder, "/m1"), pattern="original.npctab")[1]))
}
if(!is.null(sim)) {
sim <- vpc::read_table_nm(paste0(psn_folder, "/m1/", dir(paste0(psn_folder, "/m1"), pattern="simulation.1.npctab")[1]))
}
software = "nonmem"
}
if (!is.null(obs)) {
software_type <- guess_software(software, obs)
} else {
software_type <- guess_software(software, sim)
}
if(is.null(sim)) {
show_default$obs_ci <- TRUE
}
## define what to show in plot
show <- vpc::replace_list_elements(show_default_tte, show)
## checking whether stratification columns are available
stratify_pars <- NULL
if(!is.null(stratify)) stratify_pars <- stratify
if(!is.null(stratify_color)) {
if(!is.null(stratify)) stop("Sorry, stratification using both facets and color is currently not supported, use either `stratify` or `stratify_color`.")
if(length(stratify_color) != 1) {
stop("Sorry, please specify only a single stratification variable for `stratify_color`.")
}
stratify_pars <- stratify_color
}
if(!is.null(stratify_pars)) {
if(!is.null(obs)) {
check_stratification_columns_available(obs, stratify_pars, "observation")
}
if(!is.null(sim)) {
check_stratification_columns_available(sim, stratify_pars, "simulation")
}
}
## redefine strat column in case of "strat"
if(!is.null(stratify_pars) && !is.null(obs)) {
if(stratify_pars[1] == "strat") {
if(!is.null(obs)) {
obs$strat_orig = obs$strat
} else if (!is.null(sim)){
sim$strat_orig = sim$strat
}
stratify <- "strat_orig"
}
}
## define column names
cols <- define_data_columns(sim, obs, sim_cols, obs_cols, software_type)
if(!is.null(obs)) {
old_class <- class(obs)
class(obs) <- c(software_type, old_class)
}
if(!is.null(sim)) {
old_class <- class(sim)
class(sim) <- c(software_type, old_class)
}
## remove EVID != 0 / MDV != 0
if(!is.null(obs)) {
obs <- filter_dv(obs, verbose)
}
if(!is.null(sim)) {
sim <- filter_dv(sim, verbose)
}
## stratification
stratify_original <- stratify_pars
if(!is.null(stratify_pars)) {
if(rtte) {
if (length(stratify_pars) > 1) {
stop ("Sorry, with repeated time-to-event data, stratification on more than 1 variables is currently not supported!")
invisible()
}
} else {
if (length(stratify_pars) > 2) {
stop ("Sorry, stratification on more than 2 variables is currently not supported!")
invisible()
}
}
}
## format obs data
if(!is.null(obs)) {
obs$id <- obs[[cols$obs$id]]
if (length(obs[[cols$obs$id]]) == 0) {
msg("Warning: No ID column found, assuming 1 row per ID.", verbose)
obs$id <- 1:length(obs[,1])
}
obs$time <- as.num(obs[[cols$obs$idv]])
obs$dv <- as.num(obs[[cols$obs$dv]])
if(max(obs$dv) > 1) { # guessing DV definition if not just 0/1
if(max(obs$dv) == 2) { # common approach in NONMEM, 2 = censored
obs[obs$dv != 1,]$dv <- 0
msg("Warning: vpc_tte() expected the observed dependent variable to contain only 0 (censored, or no event observed) or 1 (event observed). Setting all observations != 1 to 0.", verbose)
} else {
obs[obs$dv != 1,]$dv <- 1 # some people use DV to indicate the event time.
msg("Warning: vpc_tte() expected the dependent variable to contain only 0 (censored, or no event observed) or 1 (event observed). Setting all observations != 1 to 1.", verbose)
}
}
if("cens" %in% tolower(colnames(obs))) { # some people use a 'cens' column to indicate censoring
msg(paste0("Detected column '",colnames(obs)[match("cens", tolower(colnames(obs)))],"' with censoring information in observation data, assuming 1=censored event, 0=observed event. Please transpose data if assumption not correct."), TRUE)
colnames(obs)[match("cens", tolower(colnames(obs)))] <- "cens"
obs[obs$cens == 1,]$dv <- 0
}
if(rtte) {
if(rtte_calc_diff) {
obs <- relative_times(obs)
}
obs <- obs %>%
dplyr::group_by_("id") %>%
dplyr::arrange_("id", "t") %>%
dplyr::mutate(rtte = 1:length(dv))
# obs %>% dplyr::group_by(id) %>% dplyr::mutate(rtte = cumsum(dv != 0))
# obs[obs$dv == 0,]$rtte <- obs[obs$dv == 0,]$rtte + 1 # these censored points actually "belong" to the next rtte strata
stratify_pars <- c(stratify_pars, "rtte")
} else {
obs <- obs %>%
dplyr::group_by_("id") %>%
dplyr::mutate(last_obs = 1*(1:length(time) == length(time)), repeat_obs = 1*(cumsum(dv) > 1)) %>%
dplyr::filter(dv == 1 | last_obs == 1) %>%
dplyr::filter(!duplicated(id))
obs$rtte <- 1
}
# add stratification column and comput KM curve for observations
obs <- add_stratification(obs, stratify_pars)
if(!is.null(kmmc) && kmmc %in% names(obs)) {
obs_km <- compute_kmmc(obs, strat = "strat", reverse_prob = reverse_prob, kmmc=kmmc)
} else {
if(show$obs_ci) {
if(length(ci) == 2 && (round(ci[1],3) != round((1-ci[2]),3))) {
stop("Sorry, only symmetric confidence intervals can be computed. Please adjust the ci argument.")
}
obs_km <- compute_kaplan(obs, strat = "strat", reverse_prob = reverse_prob, rtte_conditional = rtte_conditional, ci = ci)
} else {
obs_km <- compute_kaplan(obs, strat = "strat", reverse_prob = reverse_prob, rtte_conditional = rtte_conditional)
}
}
} else { # get bins from sim
obs_km <- NULL
}
if(!is.null(kmmc) & (class(bins) == "logical" && bins == FALSE)) {
msg("Tip: with KMMC-type plots, binning of simulated data is recommended. See documentation for the 'bins' argument for more information.", verbose)
}
all_dat <- c()
if(!is.null(sim)) {
# format sim data and compute KM curve CI for simulations
if (all(c(cols$sim$idv, cols$sim$id, cols$sim$dv) %in% names(sim))) {
sim$id <- sim[[cols$sim$id]]
sim$dv <- sim[[cols$sim$dv]]
sim$time <- sim[[cols$sim$idv]]
} else {
stop("Not all required variables were found, please check column definitions for id, dv and time.")
}
if(max(sim$dv) > 2) { # guessing DV definition if not just 0/1
if(max(sim$dv) == 2) { # common approach in NONMEM, 2 = censored
sim[sim$dv != 1,]$dv <- 1
msg("Warning: Expected simulated dependent variable to contain only 0 (censored, or no event simerved) or 1 (event simerved). Setting all simulated observations != 1 to 0.", verbose)
} else {
sim[sim$dv != 1,]$dv <- 1 # some people use DV to indicate the event time.
msg("Warning: Expected simulated dependent variable to contain only 0 (censored, or no event simerved) or 1 (event simerved). Setting all simulated observations != 1 to 1.", verbose)
}
}
if("nonmem" %in% class(sim)) { # necessary due to a bug in NONMEM simulation
sim <- sim[!(sim$time == 0 & sim$dv == 1),]
}
if(max(sim$dv) == 1) {
if (sum(sim$dv > 0 & sim$dv < 1) > 0) {
sim[sim$dv > 0 & sim$dv < 1,]$dv <- 0
}
}
if("cens" %in% tolower(names(sim$cens))) { # some people use a 'cens' column to indicate censoring
cat("Detected extra column with censoring information in simulation data.")
colnames(sim)[match("cens", tolower(colnames(sim)))] <- "cens"
sim[sim$cens == 1,]$dv <- 0
}
# add sim index number
sim$sim <- add_sim_index_number(sim, id = cols$sim$id, sim_label = cols$sim$sim)
# set last_observation and repeat_obs per sim&id
sim <- sim %>%
dplyr::group_by_("sim", "id") %>%
dplyr::mutate(last_obs = 1*(1:length(time) == length(time)), repeat_obs = 1*(cumsum(dv) > 1))
# filter out stuff and recalculate rtte times
sim <- sim[sim$dv == 1 | (sim$last_obs == 1 & sim$dv == 0),]
if(rtte) {
sim <- sim %>%
dplyr::group_by_("sim", "id") %>%
dplyr::arrange_("sim", "id", "time") %>%
dplyr::mutate(rtte = 1:length(dv)) %>%
dplyr::arrange_("sim", "id")
if(rtte_calc_diff) {
sim <- relative_times(sim, simulation=TRUE)
}
} else {
sim$sim_id <- paste0(sim$sim, "_", sim$id) # remove duplicate observations rows per id to filter out repeated obs
sim <- sim[!duplicated(sim$sim_id),]
}
tmp_bins <- unique(c(0, sort(unique(sim$time)), max(sim$time)))
n_sim <- length(unique(sim$sim))
if(n_sim <= 1) {
stop(paste0("Something seems wrong with your simulation dataset, only ", n_sim, " iterations of the simulation were identified."))
}
all_dat <- c()
if(!(class(bins) == "logical" && bins == FALSE)) {
if(class(bins) == "logical" && bins == TRUE) {
bins <- "time"
}
if(class(bins) == "character") {
if (bins == "obs") {
tmp_bins <- unique(c(0, sort(unique(obs$time)), max(obs$time)))
} else {
if (!(bins %in% c("time","data"))) {
msg(paste0("Note: bining method ", bins," might be slow. Consider using method 'time', or specify 'bins' as numeric vector"), verbose)
}
tmp_bins <- unique(c(0, auto_bin(sim %>% dplyr::mutate(idv=time), type=bins, n_bins = n_bins-1), max(sim$time)))
}
}
if(class(bins) == "numeric") {
tmp_bins <- unique(c(0, bins, max(obs$time)))
}
}
message("Calculating simulation stats.")
pb <- utils::txtProgressBar(min = 1, max = n_sim)
for (i in 1:n_sim) {
utils::setTxtProgressBar(pb, i)
tmp <- sim %>% dplyr::filter(sim == i)
tmp2 <- add_stratification(tmp %>%
dplyr::arrange_("id", "time"), stratify_pars)
if(!is.null(kmmc) && kmmc %in% names(obs)) {
tmp3 <- compute_kmmc(tmp2, strat = "strat", reverse_prob = reverse_prob, kmmc = kmmc)
} else {
tmp3 <- compute_kaplan(tmp2, strat = "strat", reverse_prob = reverse_prob, rtte_conditional = rtte_conditional)
}
tmp3$time_strat <- paste0(tmp3$time, "_", tmp3$strat)
tmp4 <- expand.grid(time = c(0, unique(sim$time)), surv=NA, lower=NA, upper=NA,
strat = unique(tmp3$strat))
tmp4$time_strat <- paste0(tmp4$time, "_", tmp4$strat)
tmp4[match(tmp3$time_strat, tmp4$time_strat),]$surv <- tmp3$surv
# tmp4[match(tmp3$time_strat, tmp4$time_strat),]$lower <- tmp3$lower
# tmp4[match(tmp3$time_strat, tmp4$time_strat),]$upper <- tmp3$upper
tmp4 <- tmp4 %>%
dplyr::arrange(strat, time)
tmp4$surv <- locf(tmp4$surv)
tmp4[,c("bin", "bin_min", "bin_max", "bin_mid")] <- 0
tmp4$bin <- cut(tmp4$time, breaks = tmp_bins, labels = FALSE, right = TRUE)
tmp4$bin_min <- tmp_bins[tmp4$bin]
tmp4$bin_max <- tmp_bins[tmp4$bin+1]
tmp4$bin_mid <- (tmp4$bin_min + tmp4$bin_max) / 2
all_dat <- dplyr::bind_rows(all_dat, cbind(i, tmp4)) ## RK: this can be done more efficient!
}
sim_km <- all_dat %>%
dplyr::group_by_("strat", "bin") %>%
dplyr::summarise (bin_mid = head(bin_mid,1),
bin_min = head(bin_min,1),
bin_max = head(bin_max,1),
qmin = quantile(surv, 0.05),
qmax = quantile(surv, 0.95),
qmed = median(surv),
# lower_med = median(lower, 0.05),
# upper_med = median(upper, 0.05),
step = 0)
} else {
sim_km <- NULL
tmp_bins <- unique(c(0, sort(unique(obs$time)), max(obs$time)))
}
if (rtte) {
if(!is.null(sim)) {
sim_km$rtte <- as.num(gsub(".*, (\\d)", "\\1", sim_km$strat, perl = TRUE))
if (!is.null(events)) {
sim_km <- sim_km %>%
dplyr::filter(rtte %in% events)
# redefine strat factors, since otherwise empty panels will be shown
sim_km$strat <- factor(sim_km$strat, levels = unique(sim_km$strat))
}
}
if(!is.null(obs)) {
obs_km$rtte <- as.num(gsub(".*, (\\d)", "\\1", obs_km$strat, perl = TRUE))
if (!is.null(events)) {
obs_km <- obs_km %>%
dplyr::filter(rtte %in% events)
obs_km$strat <- factor(obs_km$strat, levels = unique(obs_km$strat))
}
}
}
cens_dat <- NULL
if(show$obs_cens && !is.null(obs)) {
cens_dat <- obs
if(rtte) {
if(!rtte_conditional || !rtte_calc_diff) {
cens_dat <- cens_dat %>%
dplyr::mutate(time = t)
}
}
cens_dat <- cens_dat %>%
dplyr::filter(dv == 0, time > 0)
}
if (!is.null(stratify_original)) {
if (length(stratify_pars) == 2) {
if(!is.null(sim_km)) {
sim_km$strat1 <- unlist(strsplit(as.character(sim_km$strat), ", "))[(1:length(sim_km$strat)*2)-1]
sim_km$strat2 <- unlist(strsplit(as.character(sim_km$strat), ", "))[(1:length(sim_km$strat)*2)]
}
if(!is.null(obs_km)) {
obs_km$strat1 <- unlist(strsplit(as.character(obs_km$strat), ", "))[(1:length(obs_km$strat)*2)-1]
obs_km$strat2 <- unlist(strsplit(as.character(obs_km$strat), ", "))[(1:length(obs_km$strat)*2)]
}
if(!is.null(cens_dat)) {
cens_dat$strat1 <- unlist(strsplit(as.character(cens_dat$strat), ", "))[(1:length(cens_dat$strat)*2)-1]
cens_dat$strat2 <- unlist(strsplit(as.character(cens_dat$strat), ", "))[(1:length(cens_dat$strat)*2)]
}
}
}
if (!is.null(obs)) {
if (show$obs_cens) {
if(nrow(cens_dat)>0) {
cens_dat$y <- 1
cens_dat$strat1 <- NA
cens_dat$strat2 <- NA
for (j in 1:nrow(cens_dat[,1])) {
tmp <- obs_km[as.character(obs_km$strat) == as.character(cens_dat$strat[j]),]
cens_dat$y[j] <- rev(tmp$surv[(cens_dat$time[j] - tmp$time) > 0])[1]
if ("strat1" %in% names(tmp)) {
cens_dat$strat1[j] <- rev(tmp$strat1[(cens_dat$time[j] - tmp$time) > 0])[1]
}
if ("strat2" %in% names(tmp)) {
cens_dat$strat2[j] <- rev(tmp$strat2[(cens_dat$time[j] - tmp$time) > 0])[1]
}
}
cens_dat <- cens_dat[!is.na(cens_dat$y),]
}
}
}
if(!is.null(obs)) {
show$obs_dv <- TRUE
} else {
show$obs_dv <- FALSE
}
show$pi <- TRUE
if(!is.null(kmmc)) {
ylab <- paste0("Mean (", kmmc, ")")
}
# plotting starts here
vpc_db <- list(sim = sim,
sim_km = sim_km,
obs = obs,
obs_km = obs_km,
all_dat = all_dat,
stratify_pars = stratify_pars,
stratify = stratify,
stratify_color = stratify_color,
stratify_original = stratify_original,
bins = bins,
facet = facet,
labeller = labeller,
kmmc = kmmc,
cens_dat = cens_dat,
rtte = rtte,
type = "time-to-event",
as_percentage = as_percentage,
tmp_bins = tmp_bins,
xlab = xlab,
ylab = ylab)
if(is.null(xlab)) {
xlab <- "Time (days)"
}
if(vpcdb) {
return(vpc_db)
} else {
message("\nPlotting.")
pl <- plot_vpc(vpc_db,
show = show,
vpc_theme = vpc_theme,
smooth = smooth,
log_y = FALSE,
title = title)
return(pl)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.