# satisfy lintr
# lintr falsely flags possibly_ecdf as unused variable
if (getRversion() >= "2.15.1") {
utils::globalVariables(c("possibly_ecdf"))
}
#' @title Integrity check for df_visit.
#' @description Internal function used by all functions that accept df_visit as a parameter.
#' Checks for NA columns, numeric visits and AEs, implicitly missing and
#' duplicated visits.
#' @inheritParams site_aggr
#' @return corrected df_visit
#' @examples
#'
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.6
#' )
#'
#' df_visit$study_id <- "A"
#'
#' df_visit_filt <- df_visit %>%
#' dplyr::filter(visit != 3)
#'
#' df_visit_corr <- check_df_visit(df_visit_filt)
#' 3 %in% df_visit_corr$visit
#' nrow(df_visit_corr) == nrow(df_visit)
#'
#' df_visit_corr <- check_df_visit(dplyr::bind_rows(df_visit, df_visit))
#' nrow(df_visit_corr) == nrow(df_visit)
#'
#' @rdname check_df_visit
#' @export
check_df_visit <- function(df_visit) {
df_visit <- ungroup(df_visit)
stopifnot(
all(c("study_id", "site_number", "patnum", "n_ae", "visit") %in% colnames(df_visit))
)
no_na_cols <- c(
"study_id",
"site_number",
"patnum",
"n_ae",
"visit"
)
cols_na <- purrr::map(no_na_cols, ~ get_any_na(df_visit, .)) %>%
unlist()
if (any(cols_na)) {
stop(paste("NA detected in columns:", paste(no_na_cols[cols_na], collapse = ",")))
}
df_visit %>%
group_by(.data$study_id, .data$patnum) %>%
summarise(n_sites = n_distinct(.data$site_number), .groups = "drop") %>%
mutate(check = .data$n_sites == 1) %>%
pull(.data$check) %>%
unlist() %>%
all() %>%
stopifnot("patient ids must be site exclusive" = .)
df_visit %>%
mutate(check = .data$visit > 0) %>%
pull(.data$check) %>%
unlist() %>%
all() %>%
stopifnot("visit numbering should start at 1" = .)
if (inherits(df_visit, "data.frame")) {
df_visit %>%
summarise_at(
vars(c(
"n_ae",
"visit"
)),
~ is.numeric(.)
) %>%
unlist() %>%
all() %>%
stopifnot("n_ae and visit columns must be numeric" = .)
df_visit <- exp_implicit_missing_visits(df_visit)
df_visit <- aggr_duplicated_visits(df_visit)
}
return(df_visit)
}
#' @title Aggregate duplicated visits.
#' @description Internal function called by [check_df_visit()][check_df_visit].
#' @inheritParams site_aggr
#' @return df_visit corrected
#' @rdname aggr_duplicated_visits
#' @export
aggr_duplicated_visits <- function(df_visit) {
df_visit_out <- df_visit %>%
group_by(
.data$study_id,
.data$site_number,
.data$patnum,
.data$visit
) %>%
summarise(n_ae = max(.data$n_ae), .groups = "drop")
if (nrow(df_visit_out) < nrow(df_visit)) {
warning("Duplicated visit entries for some patients detected and corrected.")
}
return(df_visit_out)
}
#' @title Expose implicitly missing visits.
#' @description Internal function called by [check_df_visit()][check_df_visit].
#' @inheritParams site_aggr
#' @return df_visit corrected
#' @rdname exp_implicit_missing_visits
#' @export
exp_implicit_missing_visits <- function(df_visit) {
df_complete <- df_visit %>%
group_by(.data$study_id) %>%
mutate(min_study_visit = min(.data$visit),
max_study_visit = max(.data$visit)) %>%
select(c(
"study_id",
"site_number",
"patnum",
"min_study_visit",
"max_study_visit"
)) %>%
distinct() %>%
mutate(
visit = map2(
.data$min_study_visit,
.data$max_study_visit,
function(x, y) seq(x, y, 1)
)
) %>%
unnest("visit") %>%
select(c(
"study_id",
"site_number",
"patnum",
"min_study_visit",
"visit"
))
df_visit_out <- df_visit %>%
group_by(.data$study_id, .data$site_number, .data$patnum) %>%
mutate(min_visit_pat = min(.data$visit),
max_visit_pat = max(.data$visit)) %>%
ungroup() %>%
right_join(
df_complete,
by = c("study_id", "site_number", "patnum", "visit")
) %>%
group_by(.data$study_id, .data$site_number, .data$patnum) %>%
arrange(.data$visit) %>%
fill("n_ae", .direction = "down") %>%
mutate(
min_visit_pat = min(.data$min_visit_pat, na.rm = TRUE),
max_visit_pat = max(.data$max_visit_pat, na.rm = TRUE)
) %>%
group_by(.data$study_id) %>%
mutate(min_study_visit = min(.data$min_study_visit, na.rm = TRUE)) %>%
ungroup() %>%
filter(
.data$visit >= .data$min_study_visit &
.data$visit <= .data$max_visit_pat
) %>%
mutate(n_ae = ifelse(is.na(.data$n_ae), 0, .data$n_ae)) %>%
select(c(
"study_id",
"site_number",
"patnum",
"n_ae",
"visit"
)) %>%
arrange(.data$study_id, .data$site_number, .data$patnum, .data$visit)
if (nrow(df_visit_out) > nrow(df_visit)) {
warning("implicitly missing visit numbers detected and corrected")
}
stopifnot(nrow(df_visit_out) >= nrow(df_visit))
return(df_visit_out)
}
#' @title Aggregate visit to patient level.
#' @description Internal function used by [site_aggr()][site_aggr()] and
#' [plot_visit_med75()][plot_visit_med75()], adds the maximum visit for each patient.
#' @param df_visit dataframe
#' @return dataframe
#' @rdname pat_aggr
#' @export
pat_aggr <- function(df_visit) {
df_pat <- df_visit %>%
group_by(.data$study_id, .data$site_number, .data$patnum) %>%
summarise(max_visit_per_pat = max(.data$visit),
.groups = "drop")
return(df_pat)
}
#' @title Get site mean ae development.
#' @description Internal function used by [site_aggr()][site_aggr()], [plot_visit_med75()][plot_visit_med75()],
#' returns mean AE development from visit 0 to visit_med75.
#' @param df_visit dataframe
#' @param df_pat dataframe as returned by pat_aggr()
#' @param df_site dataframe as returned by site_aggr()
#' @return dataframe
#' @rdname get_site_mean_ae_dev
get_site_mean_ae_dev <- function(df_visit, df_pat, df_site) {
df_visit %>%
left_join(df_pat, by = c("study_id", "site_number", "patnum")) %>%
left_join(df_site, by = c("study_id", "site_number")) %>%
filter(
.data$visit <= .data$visit_med75,
.data$max_visit_per_pat >= .data$visit_med75
) %>%
group_by(.data$study_id, .data$site_number, .data$visit_med75, .data$visit) %>%
summarise(mean_ae_site = mean(.data$n_ae),
.groups = "drop")
}
#' @title Get visit_med75.
#' @description Internal function used by [site_aggr()][site_aggr()].
#' @param df_pat dataframe as returned by [pat_aggr()][pat_aggr()]
#' @inheritParams site_aggr
#' @return dataframe
#' @rdname get_visit_med75
get_visit_med75 <- function(df_pat,
method = "med75_adj",
min_pat_pool = 0.2) {
if (inherits(df_pat, "data.frame")) {
fun_arrange <- arrange
} else {
fun_arrange <- window_order
}
# we have to separate the aggregation steps into two steps
# we need to reference visit_med75 for counting patients with
# visit_med75 not all db backends may support this
df_visit_med75 <- df_pat %>%
summarise(
visit_med75 = ceiling(median(.data$max_visit_per_pat) * 0.75),
.by = c("study_id", "site_number")
)
df_site <- df_pat %>%
left_join(
df_visit_med75,
by = c("study_id", "site_number")
) %>%
summarise(
n_pat = n_distinct(.data$patnum),
n_pat_with_med75 = sum(ifelse(
.data$max_visit_per_pat >= .data$visit_med75,
1, 0)
),
.by = c("study_id", "site_number", "visit_med75")
)
if (method == "med75_adj") {
df_site <- df_site %>%
right_join(
df_pat,
by = c("study_id", "site_number")
) %>%
mutate(
pat_has_visit_med75 = ifelse(.data$max_visit_per_pat >= .data$visit_med75, 1, 0)
)
if (inherits(df_pat, "data.frame")) {
# this cannot be exactly replicated, we maintain this method so that results
# do not change compared to previous versions
df_qup8 <- df_site %>%
summarise(
study_qup8_max_visit = quantile(.data$max_visit_per_pat, probs = c(1 - min_pat_pool)),
study_qup8_max_visit = round(.data$study_qup8_max_visit, 0),
.by = "study_id"
) %>%
select(c("study_id", "study_qup8_max_visit"))
} else {
# calculate max_visit that covers at least min_map_pool
# wer are not using quantile for db backend compatibility
df_qup8 <- df_site %>%
mutate(
rnk = percent_rank(.data$max_visit_per_pat),
.by = "study_id"
) %>%
filter(.data$rnk <= 1 - .env$min_pat_pool) %>%
fun_arrange(.data$study_id, desc(.data$max_visit_per_pat)) %>%
filter(row_number() == 1, .by = "study_id") %>%
select(c("study_id", study_qup8_max_visit = "max_visit_per_pat"))
}
df_site <- df_site %>%
left_join(
df_qup8,
by = c("study_id")
) %>%
summarise(
# from all patients that have a visit_med75 we take the smallest maximum
visit_med75 = min(
ifelse(.data$pat_has_visit_med75 == 1, .data$max_visit_per_pat, NA),
na.rm = TRUE
),
.by = c("study_id", "site_number", "n_pat", "n_pat_with_med75", "study_qup8_max_visit")
) %>%
mutate(
# we correct visit_med75 so that it cannot exceed a visit with less than
# the minimum ratio of patients
visit_med75 = ifelse(
.data$visit_med75 > .data$study_qup8_max_visit,
.data$study_qup8_max_visit,
.data$visit_med75
)
) %>%
select(- "study_qup8_max_visit")
}
df_site <- df_site %>%
select(c("study_id",
"site_number",
"n_pat",
"n_pat_with_med75",
"visit_med75"))
return(df_site)
}
#' @title Evaluate sites.
#' @description Correct under-reporting probabilities using \code{\link[stats]{p.adjust}}.
#' @param df_sim_sites dataframe generated by \code{\link[simaerep]{sim_sites}}
#' @param method character, passed to stats::p.adjust(), if NULL
#' eval_sites_deprecated() is used instead, Default = "BH"
#' @inheritParams sim_sites
#' @param ... use to pass r_sim_sites parameter to eval_sites_deprecated()
#' @return dataframe with the following columns:
#' \describe{
#' \item{**study_id**}{study identification}
#' \item{**site_number**}{site identification}
#' \item{**visit_med75**}{median(max(visit)) * 0.75}
#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
#' \item{**pval**}{p-value as returned by \code{\link[stats]{poisson.test}}}
#' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower}
#' \item{**pval_adj**}{adjusted p-values}
#' \item{**prob_low_adj**}{adjusted bootstrapped probability for having mean_ae_site_med75 or lower}
#' \item{**pval_prob_ur**}{probability under-reporting as 1 - pval_adj, poisson.test (use as benchmark)}
#' \item{**prob_low_prob_ur**}{probability under-reporting as 1 - prob_low_adj, bootstrapped (use)}
#'
#' }
#' @examples
#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
#' frac_site_with_ur = 0.4, ur_rate = 0.6)
#'
#' df_visit$study_id <- "A"
#' df_site <- site_aggr(df_visit)
#'
#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
#'
#' df_eval <- eval_sites(df_sim_sites)
#' df_eval
#'
#' @rdname eval_sites
#' @seealso \code{\link[simaerep]{site_aggr}},
#' \code{\link[simaerep]{sim_sites}},
#' \code{\link[stats]{p.adjust}}
#' @export
eval_sites <- function(df_sim_sites,
method = "BH",
under_only = TRUE,
...) {
df_out <- df_sim_sites
if ("pval" %in% colnames(df_out)) {
warning_na(df_out, "pval")
df_out <- df_out %>%
p_adjust("pval", "_prob_ur", method = method)
}
if ("prob_low" %in% colnames(df_out)) {
warning_na(df_out, "prob_low")
df_out <- df_out %>%
p_adjust("prob_low", "_prob_ur", method = method)
if (! under_only) {
if (any(str_detect(colnames(df_out), "med75"))) {
df_out <- df_out %>%
mutate(study_site_ae_equal = .data$mean_ae_site_med75 == .data$mean_ae_study_med75)
} else {
df_out <- df_out %>%
mutate(study_site_ae_equal = .data$events_per_visit_site == .data$events_per_visit_study)
}
df_out <- df_out %>%
mutate(
prob_high = 1 - .data$prob_low,
# when study and site values are equl e.g. both zero, no over-reporting possible
prob_high = ifelse(
.data$study_site_ae_equal,
1,
.data$prob_high
)
) %>%
select(- "study_site_ae_equal") %>%
p_adjust("prob_high", "_prob_or", method = method)
}
}
if (inherits(df_out, "data.frame")) {
fun_arrange <- arrange
} else {
fun_arrange <- window_order
}
df_out <- df_out %>%
fun_arrange(.data$study_id, .data$site_number)
return(df_out)
}
#'@keywords internal
p_adjust <- function(df, col, suffix, method = "BH") {
col_adj <- paste0(col, "_adj")
col_suffix <- paste0(col, suffix)
if (is.na(method) || is.null(method) || method %in% c("None", "none")) {
df_out <- df %>%
mutate(
!! as.name(col_suffix) := 1 - .data[[col]]
)
return(df_out)
}
if (inherits(df, "data.frame")) {
df_out <- df %>%
mutate(
!! as.name(col_adj) := p.adjust(.data[[col]], method = method),
!! as.name(col_suffix) := 1 - .data[[col_adj]],
.by = "study_id"
)
} else {
df_out <- p_adjust_bh_inframe(df, col, suffix)
}
return(df_out)
}
#'@keywords internal
warning_na <- function(df, col) {
any_na <- get_any_na(df, col)
if (any_na) {
warning_messages <- df %>%
filter(is.na(.data[[col]])) %>%
mutate(
warning = paste0("\nstudy_id: ", .data$study_id, ", site_number: ", .data$site_number),
warning = paste0(.data$warning, ", ", col, "== NA")
) %>%
distinct() %>%
pull(.data$warning) %>%
paste(collapse = "\n")
warning(warning_messages)
}
}
get_any_na <- function(df, col) {
# dbplyr throws a warning for not using na.rm = TRUE
suppressWarnings({
df %>%
mutate(
any_na = ifelse(is.na(.data[[col]]), 1, 0)
) %>%
summarise(
any_na = sum(.data$any_na, na.rm = TRUE)
) %>%
mutate(
any_na = .data$any_na > 0
) %>%
pull(.data$any_na)
})
}
#' @title Get empirical cumulative distribution values of pval or prob_lower
#' @description Test function, test applicability of poisson test, by calculating
#' - the bootstrapped probability of obtaining a specific p-value or lower, use
#' in combination with [sim_studies()][sim_studies()].
#' @param df_sim_studies dataframe, generated by [sim_studies()][sim_studies()]
#' @param df_sim_sites dataframe, generated by [sim_sites()][sim_sites()]
#' @param val_str c("prob_low","pval")
#' @return dataframe with the following columns:
#' \describe{
#' \item{**study_id**}{study identification}
#' \item{**site_number**}{site identification}
#' \item{**visit_med75**}{median(max(visit)) * 0.75}
#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
#' \item{**{pval/prob_low}**}{p-value as returned by `poisson.test`}
#' \item{**{pval/prob_low}_ecd**}{p-value as returned by `poisson.test`}
#' }
#' @details trains a ecdf function for each studies based on the results
#' of [sim_studies()][sim_studies()]
#' @examples
#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
#' frac_site_with_ur = 0.4, ur_rate = 0.3)
#'
#' df_visit$study_id <- "A"
#' df_site <- site_aggr(df_visit)
#'
#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
#'
#' df_sim_studies <- sim_studies(
#' df_site = df_site,
#' df_visit = df_visit,
#' r = 3,
#' parallel = FALSE,
#' poisson_test = TRUE,
#' prob_lower = TRUE
#' )
#'
#' get_ecd_values(df_sim_studies, df_sim_sites, "prob_low")
#' get_ecd_values(df_sim_studies, df_sim_sites, "pval")
#' @rdname get_ecd_values
#' @export
get_ecd_values <- function(df_sim_studies, df_sim_sites, val_str) {
possibly_ecdf <- possibly(ecdf, otherwise = NA) #nolint
apply_ecdf <- function(.f, x) {
if (suppressWarnings(is.na(.f))) {
warning("NA Values in Stats")
return(NA)
} else {
return(.f(x))
}
}
df_ecd <- df_sim_studies %>%
rename(val = !!as.symbol(val_str)) %>%
select("study_id", "val") %>%
nest(data = "val") %>%
mutate(.ecdf = map(.data$data, ~ possibly_ecdf(.$val))) %>%
select(- "data")
df_out <- df_sim_sites %>%
rename(val = !!as.symbol(val_str)) %>%
left_join(df_ecd, "study_id") %>%
mutate(ecd_val = map2_dbl(.data$`.ecdf`, .data$val, apply_ecdf)) %>%
rename(
!!as.symbol(val_str) := "val",
!!as.symbol(paste0(val_str, "_ecd")) := "ecd_val" #nolint
) %>%
select(- ".ecdf")
return(ungroup(df_out))
}
#' @title Create a study specific patient pool for sampling
#' @description Internal function for \code{\link[simaerep]{sim_sites}},
#' filter all visits greater than max_visit_med75_study
#' returns dataframe with one column for studies and one column with nested
#' patient data.
#' @param df_visit dataframe, created by \code{\link[simaerep]{sim_sites}}
#' @param df_site dataframe created by \code{\link[simaerep]{site_aggr}}
#' @return dataframe with nested pat_pool column
#' @examples
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.6
#')
#'
#' df_visit$study_id <- "A"
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_pat_pool <- pat_pool(df_visit, df_site)
#'
#' df_pat_pool
#' @rdname pat_pool
#' @export
pat_pool <- function(df_visit, df_site) {
df_site <- df_site %>%
group_by(.data$study_id) %>%
mutate(max_visit_med75_study = max(.data$visit_med75))
df_visit %>%
left_join(df_site, by = c("study_id", "site_number")) %>%
filter(.data$visit <= .data$max_visit_med75_study) %>%
select(c("study_id",
"patnum",
"visit",
"n_ae")) %>%
nest(pat_pool = c("patnum", "visit", "n_ae"))
}
#' @title Calculate bootstrapped probability for obtaining a lower site mean AE number.
#' @description Internal function used by [sim_sites()][sim_sites()]
#' @param site_ae vector with AE numbers
#' @param study_ae vector with AE numbers
#' @param r integer, denotes number of simulations, default = 1000
#' @param parallel logical, toggles parallel processing on and of, default = F
#' @param under_only compute under-reporting probabilities only, default = TRUE
#' @return pval
#' @details sets pvalue=1 if mean AE site is greater than mean AE study
#' @examples
#' prob_lower_site_ae_vs_study_ae(
#' site_ae = c(5, 3, 3, 2, 1, 6),
#' study_ae = c(9, 8, 7, 9, 6, 7, 8),
#' parallel = FALSE
#' )
#' @seealso \code{\link[purrr]{safely}}
#' @rdname prob_lower_site_ae_vs_study_ae
#' @export
prob_lower_site_ae_vs_study_ae <- function(site_ae, study_ae, r = 1000, parallel = FALSE, under_only = TRUE) {
# if there is only one site
if (is.null(study_ae)) {
prob_lower <- 1
return(prob_lower)
}
mean_ae_site <- mean(site_ae, na.rm = TRUE)
mean_ae_study <- mean(study_ae, na.rm = TRUE)
# this can happen when no patients with site visit_med75 were found in study
if (is.na(mean_ae_study)) {
return(NA)
}
if (under_only) {
# we are not interested in cases where site AE is greater study AE
if (mean_ae_site > mean_ae_study) {
prob_lower <- 1
return(prob_lower)
}
}
# set-up multiprocessing
# multiprocessing currently not used by sim_sites()
if (parallel) {
.f_map_int <- function(...) {
furrr::future_map_int(..., .options = furrr_options(seed = TRUE))
}
} else {
.f_map_int <- purrr::map_int
}
pool <- c(site_ae, study_ae)
n_pat <- length(site_ae)
sim <- function(seed) {
set.seed(seed)
me <- mean(sample(pool, n_pat, replace = TRUE))
# '<=' includes all cases where mean_ae_site == 0 and me also == 0
return(as.integer(ifelse(me <= mean_ae_site, 1, 0)))
}
df_sim <- tibble(seed = seq.int(1, r, 1)) %>%
mutate(prob_lower = .f_map_int(.data$seed, .f = sim)) %>%
summarise(prob_lower = sum(.data$prob_lower) / r)
return(df_sim$prob_lower)
}
#' @title Calculate prob_lower and poisson.test pvalue for study sites.
#' @description Collects the number of AEs of all eligible patients that
#' meet visit_med75 criteria of site. Then calculates poisson.test pvalue and
#' bootstrapped probability of having a lower mean value.
#' @param df_visit dataframe, created by \code{\link[simaerep]{sim_sites}}
#' @param df_site dataframe created by \code{\link[simaerep]{site_aggr}}
#' @param r integer, denotes number of simulations, default = 1000
#' @param poisson_test logical, calculates poisson.test pvalue
#' @param prob_lower logical, calculates probability for getting a lower value
#' @param progress logical, display progress bar, Default = TRUE
#' @param check, logical, perform data check and attempt repair with
#' @param under_only compute under-reporting probabilities only, default = TRUE
#' [check_df_visit()][check_df_visit], computationally expensive on large data
#' sets. Default: TRUE
#' @return dataframe with the following columns:
#' \describe{
#' \item{**study_id**}{study identification}
#' \item{**site_number**}{site identification}
#' \item{**n_pat**}{number of patients at site}
#' \item{**visit_med75**}{median(max(visit)) * 0.75}
#' \item{**n_pat_with_med75**}{number of patients at site with med75}
#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level}
#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level}
#' \item{**n_pat_with_med75_study**}{number of patients at study with med75 excl. site}
#' \item{**pval**}{p-value as returned by \code{\link[stats]{poisson.test}}}
#' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower}
#' }
#' @examples
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.2
#')
#'
#' df_visit$study_id <- "A"
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_sim_sites <- sim_sites(df_site, df_visit, r = 100)
#'
#' df_sim_sites %>%
#' knitr::kable(digits = 2)
#' @rdname sim_sites
#' @seealso \code{\link[simaerep]{sim_sites}},
#' \code{\link[simaerep]{site_aggr}},
#' \code{\link[simaerep]{pat_pool}},
#' \code{\link[simaerep]{prob_lower_site_ae_vs_study_ae}},
#' \code{\link[simaerep]{poiss_test_site_ae_vs_study_ae}},
#' \code{\link[simaerep]{sim_sites}},
#' \code{\link[simaerep]{prep_for_sim}}
#' @export
sim_sites <- function(df_site,
df_visit,
r = 1000,
poisson_test = TRUE,
prob_lower = TRUE,
progress = TRUE,
check = TRUE,
under_only = TRUE) {
if (check) {
df_visit <- check_df_visit(df_visit)
}
df_sim_prep <- prep_for_sim(df_site, df_visit)
df_sim <- sim_after_prep(df_sim_prep,
r = r,
poisson_test = poisson_test,
prob_lower = prob_lower,
progress = progress,
under_only = under_only)
return(df_sim)
}
#'@title Prepare data for simulation.
#'@description Internal function called by \code{\link[simaerep]{sim_sites}}.
#' Collect AEs per patient at visit_med75 for site and study as a vector of
#' integers.
#'@param df_visit dataframe, created by \code{\link[simaerep]{sim_sites}}
#'@param df_site dataframe created by \code{\link[simaerep]{site_aggr}}
#'@return dataframe
#' @examples
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.2
#')
#'
#' df_visit$study_id <- "A"
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_prep <- prep_for_sim(df_site, df_visit)
#' df_prep
#'@rdname prep_for_sim
#' @seealso \code{\link[simaerep]{sim_sites}}, \code{\link[simaerep]{sim_after_prep}}
#'@export
prep_for_sim <- function(df_site, df_visit) {
df_pat_pool <- pat_pool(df_visit, df_site)
df_sim_prep <- df_visit %>%
left_join(df_site, by = c("study_id", "site_number")) %>%
filter(.data$visit == .data$visit_med75) %>%
group_by(.data$study_id,
.data$site_number,
.data$n_pat,
.data$n_pat_with_med75,
.data$visit_med75) %>%
summarise(patients = list(unique(.data$patnum)), .groups = "drop")
df_sim_prep <- df_sim_prep %>%
left_join(df_pat_pool, "study_id") %>%
mutate(
pat_pool = map2(
.data$pat_pool, .data$visit_med75,
function(x, y) filter(x, .data$visit == y)
),
n_ae_site = map2(
.data$pat_pool, .data$patients,
function(x, y) filter(x, .data$patnum %in% y)
),
n_ae_study = map2(
.data$pat_pool, .data$patients,
function(x, y) filter(x, ! .data$patnum %in% y)
),
n_ae_site = map(.data$n_ae_site, "n_ae"),
n_ae_study = map(.data$n_ae_study, "n_ae")
) %>%
select(- c("patients", "pat_pool"))
return(df_sim_prep)
}
#'@title Start simulation after preparation.
#'@description Internal function called by \code{\link[simaerep]{sim_sites}}
#' after \code{\link[simaerep]{prep_for_sim}}
#'@param df_sim_prep dataframe as returned by
#' \code{\link[simaerep]{prep_for_sim}}
#'@inheritParams sim_sites
#'@return dataframe
#' @examples
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.2
#')
#'
#' df_visit$study_id <- "A"
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_prep <- prep_for_sim(df_site, df_visit)
#'
#' df_sim <- sim_after_prep(df_prep)
#'
#' df_sim
#'@rdname sim_after_prep
#'@seealso \code{\link[simaerep]{sim_sites}},
#' \code{\link[simaerep]{prep_for_sim}}
#'@export
sim_after_prep <- function(df_sim_prep,
r = 1000,
poisson_test = FALSE,
prob_lower = TRUE,
progress = FALSE,
under_only = TRUE) {
df_sim <- df_sim_prep
if (poisson_test) {
df_sim <- df_sim %>%
mutate(pval = pmap_dbl(list(.data$n_ae_site, .data$n_ae_study, .data$visit_med75),
poiss_test_site_ae_vs_study_ae))
}
if (prob_lower) {
with_progress_cnd(
df_sim <- df_sim %>%
mutate(
prob_low = purrr_bar(
.data$n_ae_site, .data$n_ae_study,
.purrr = map2_dbl,
.f = prob_lower_site_ae_vs_study_ae,
.f_args = list(r = r, under_only = under_only),
.steps = nrow(df_sim),
.progress = progress
)
),
progress = progress
)
}
# clean
df_sim <- df_sim %>%
mutate(mean_ae_site_med75 = map_dbl(.data$n_ae_site, mean),
n_pat_with_med75_study = map_int(.data$n_ae_study, length),
# replace empty vector with NA to silence warning
n_ae_study = ifelse(.data$n_pat_with_med75_study == 0, NA, .data$n_ae_study),
mean_ae_study_med75 = map_dbl(.data$n_ae_study, mean)) %>%
mutate(
across(
any_of(c("prob_low", "pval")),
~ ifelse(.data$n_pat_with_med75_study == 0, NA, .)
)
) %>%
select(- c("n_ae_site", "n_ae_study")) %>%
select(c("study_id",
"site_number",
"n_pat",
"n_pat_with_med75",
"visit_med75",
"mean_ae_site_med75",
"mean_ae_study_med75",
"n_pat_with_med75_study"),
dplyr::everything()) %>%
ungroup()
df_fail <- df_sim %>%
filter(is.na(.data$mean_ae_study_med75), .data$n_pat_with_med75_study == 0)
if (nrow(df_fail) > 0) {
warning_messages <- df_fail %>%
mutate(
warning = paste0(
"\nstudy_id: ",
.data$study_id,
", site_number: ",
.data$site_number,
". No adequate patients found in study pool at visit_med75: ",
.data$visit_med75
)
) %>%
pull(.data$warning)
warning(warning_messages)
}
return(df_sim)
}
#' @title Configure study patient pool by site parameters.
#' @description Internal Function used by [sim_sites()][sim_sites()]
#' @param df_visit dataframe
#' @param df_site dataframe as created by site_aggr()
#' @param min_n_pat_with_med75 minimum number of patients with visit_med_75 for
#' simulation, Default: 1
#' @return dataframe
#' @details For simulating a study we need to configure the study patient
#' pool to match the configuration of the sites
#' @examples
#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 5,
#' frac_site_with_ur = 0.4, ur_rate = 0.6)
#'
#' df_visit1$study_id <- "A"
#'
#' df_visit2 <- sim_test_data_study(n_pat = 1000, n_sites = 3,
#' frac_site_with_ur = 0.2, ur_rate = 0.1)
#'
#' df_visit2$study_id <- "B"
#'
#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2)
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_config <- get_pat_pool_config(df_visit, df_site)
#'
#' df_config
#' @rdname get_pat_pool_config
#' @export
get_pat_pool_config <- function(df_visit, df_site, min_n_pat_with_med75 = 1) {
# site_config are the number of sites with their individual visit_med75 and n_pat_with_med75
df_site_config <- select(df_site, c(
"study_id",
"site_number",
"visit_med75",
"n_pat_with_med75")) %>%
filter(.data$n_pat_with_med75 >= min_n_pat_with_med75)
# pat_pool gives the patient pool for study from which we sample
df_site_config <- df_site_config %>%
left_join(pat_pool(df_visit, df_site), by = "study_id")
# adjusting pat_pool to configuration can be done outside of simulation loop
df_site_config <- df_site_config %>%
mutate(
pat_pool = map2(.data$pat_pool, .data$visit_med75, function(x, y) filter(x, visit == y)),
n_pat_study = map2_dbl(.data$pat_pool, .data$n_pat_with_med75, function(x, y) nrow(x) - y)
) %>%
select(c("study_id",
"site_number",
"visit_med75",
"n_pat_with_med75",
"n_pat_study",
"pat_pool"))
return(ungroup(df_site_config))
}
#' @title Simulate studies.
#' @description Test function, test applicability of poisson test, by
#' calculating a the bootstrapped probability of obtaining a specific p-value
#' or lower, use in combination with [get_ecd_values()][get_ecd_values()].
#' @param df_site dataframe
#' @param df_visit dataframe
#' @param min_n_pat_with_med75 integer, min number of patients with med75 at
#' site to simulate, Default: 1
#' @param keep_ae logical, keep ae numbers in output dataframe memory increase
#' roughly 30 percent, Default: F
#' @param parallel logical, see examples for registering parallel processing framework
#' , Default: FALSE
#' @param r integer, denotes number of simulations, Default: 1000
#' @param r_prob_lower integer, denotes number of simulations for prob_lower
#' value calculation,, Default: 1000
#' @param under_only compute under-reporting probabilities only, default = TRUE
#' @param under_only compute under-reporting probabilities only, default = TRUE
#' @param poisson_test logical, calculates poisson.test pvalue, Default: TRUE
#' @param prob_lower logical, calculates probability for getting a lower value,
#' Default: FALSE
#' @param studies vector with study names, Default: NULL
#' @param .progress logical, show progress bar
#' @details Here we simulate study replicates maintaining the same number of
#' sites, patients and visit_med75 by bootstrap resampling, then probabilities
#' for obtaining lower or same mean_ae count and p-values using poisson.test
#' are calculated.
#' @return dataframe
#' @examples
#' \donttest{
#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 5,
#' frac_site_with_ur = 0.4, ur_rate = 0.6)
#'
#' df_visit1$study_id <- "A"
#'
#' df_visit2 <- sim_test_data_study(n_pat = 1000, n_sites = 3,
#' frac_site_with_ur = 0.2, ur_rate = 0.1)
#'
#' df_visit2$study_id <- "B"
#'
#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2)
#'
#' df_site <- site_aggr(df_visit)
#'
#' sim_studies(df_visit, df_site, r = 3, keep_ae = TRUE)
#'}
#' \dontrun{
#' # parallel processing -------------------------
#' library(future)
#' future::plan(multiprocess)
#' sim_studies(df_visit, df_site, r = 3, keep_ae = TRUE, parallel = TRUE)
#' future::plan(sequential)
#'}
#' @details adds column with simulated probabilities for equal or lower
#' mean_ae at visit_med75
#' @rdname sim_studies
#' @export
sim_studies <- function(df_visit,
df_site,
r = 100,
poisson_test = TRUE,
prob_lower = TRUE,
r_prob_lower = 1000,
under_only = TRUE,
parallel = FALSE,
keep_ae = FALSE,
min_n_pat_with_med75 = 1,
studies = NULL,
.progress = TRUE
) {
df_visit <- check_df_visit(df_visit)
df_config <- get_pat_pool_config(df_visit = df_visit,
df_site = df_site,
min_n_pat_with_med75 = min_n_pat_with_med75)
# filter studies --------------------------------------------
if (!is.null(studies)) {
if (!all(studies %in% unique(df_visit$study_id))) {
stop("not all passed studies can be found in input data")
}
df_config <- df_config %>%
filter(.data$study_id %in% studies)
}
# set-up multiprocessing -------------------------------------
if (parallel) {
.f_map <- function(...) {
furrr::future_map(
...,
.progress = .progress,
.options = furrr_options(seed = TRUE)
)
}
} else {
.f_map <- purrr::map
}
# sim function will be called r times -----------------------
sim <- function(r) {
set.seed(r)
df_config <- df_config %>%
mutate(
n_ae_site = map2(.data$pat_pool, .data$n_pat_with_med75,
function(x, y) sample_n(x, y, replace = TRUE)),
n_ae_site = map(.data$n_ae_site, "n_ae"),
n_ae_study = map2(.data$pat_pool, .data$n_pat_study,
function(x, y) sample_n(x, y, replace = TRUE)),
n_ae_study = map(.data$n_ae_study, "n_ae")
) %>%
select(- "pat_pool")
if (poisson_test) {
df_config <- df_config %>%
mutate(pval = pmap_dbl(list(.data$n_ae_site,
.data$n_ae_study,
.data$visit_med75),
poiss_test_site_ae_vs_study_ae))
}
if (prob_lower) {
df_config <- df_config %>%
mutate(prob_low = map2_dbl(.data$n_ae_site, .data$n_ae_study,
prob_lower_site_ae_vs_study_ae,
r = r_prob_lower,
under_only = under_only))
}
if (!keep_ae) {
df_config <- df_config %>%
select(- c("n_ae_site", "n_ae_study"))
} else {
df_config <- df_config %>%
mutate_at(vars(c("n_ae_site", "n_ae_study")), ~ map_chr(., paste, collapse = ","))
}
return(ungroup(df_config))
}
df_sim <- tibble(r = seq.int(1, r, 1)) %>%
mutate(n_ae_set = .f_map(.data$r, sim)) %>%
unnest("n_ae_set")
return(ungroup(df_sim))
}
#'@title Aggregate from visit to site level.
#'@description Calculates visit_med75, n_pat_with_med75 and mean_ae_site_med75
#'@param df_visit dataframe with columns: study_id, site_number, patnum, visit,
#' n_ae
#'@param method character, one of c("med75", "med75_adj") defining method for
#' defining evaluation point visit_med75 (see details), Default: "med75_adj"
#'@param min_pat_pool, double, minimum ratio of available patients available for
#' sampling. Determines maximum visit_med75 value see Details. Default: 0.2
#'@param check, logical, perform data check and attempt repair with
#' [check_df_visit()][check_df_visit], computationally expensive on large data
#' sets. Default: TRUE
#'@details For determining the visit number at which we are going to evaluate AE
#' reporting we take the maximum visit of each patient at the site and take the
#' median. Then we multiply with 0.75 which will give us a cut-off point
#' determining which patient will be evaluated. Of those patients we will
#' evaluate we take the minimum of all maximum visits hence ensuring that we
#' take the highest visit number possible without excluding more patients from
#' the analysis. In order to ensure that the sampling pool for that visit is
#' large enough we limit the visit number by the 80% quantile of maximum visits
#' of all patients in the study.
#'@return dataframe with the following columns: \describe{
#' \item{**study_id**}{study identification} \item{**site_number**}{site
#' identification} \item{**n_pat**}{number of patients, site level}
#' \item{**visit_med75**}{adjusted median(max(visit)) * 0.75 see Details}
#' \item{**n_pat_with_med75**}{number of patients that meet visit_med75
#' criterion, site level} \item{**mean_ae_site_med75**}{mean AE at visit_med75,
#' site level} }
#' @examples
#' df_visit <- sim_test_data_study(
#' n_pat = 100,
#' n_sites = 5,
#' frac_site_with_ur = 0.4,
#' ur_rate = 0.6
#' )
#'
#' df_visit$study_id <- "A"
#'
#' df_site <- site_aggr(df_visit)
#'
#' df_site %>%
#' knitr::kable(digits = 2)
#'@rdname site_aggr
#'@export
site_aggr <- function(df_visit,
method = "med75_adj",
min_pat_pool = 0.2,
check = TRUE) {
# Checks ----------------------------------------------------------
stopifnot(
method %in% c("med75", "med75_adj", "max")
)
if (method == "max") {
df_site <- df_visit %>%
mutate(
max_visit = max(.data$visit, na.rm = TRUE),
pat_has_max_visit = ifelse(visit == .data$max_visit, patnum, NA),
.by = c("study_id", "site_number")
) %>%
summarise(
n_pat = n_distinct(.data$patnum),
n_pat_with_max_visit = n_distinct(.data$pat_has_max_visit),
.by = c("study_id", "site_number", "max_visit")
)
return(df_site)
}
if (check) {
df_visit <- check_df_visit(df_visit)
}
# Aggregate on patient level---------------------------------------
df_pat <- pat_aggr(df_visit)
# Aggregate on site level ------------------------------------------
df_site <- get_visit_med75(df_pat, method = method, min_pat_pool = min_pat_pool)
# Calculate mean cumulative AE at med75 per Site ------------------
df_mean_ae_dev <- get_site_mean_ae_dev(df_visit, df_pat, df_site)
df_mean_ae_med75 <- df_mean_ae_dev %>%
filter(.data$visit == .data$visit_med75) %>%
rename(mean_ae_site_med75 = "mean_ae_site") %>%
select(c("study_id",
"site_number",
"mean_ae_site_med75"))
# Add mean cumulative AE to site aggregate ----------------------
df_site <- df_site %>%
left_join(df_mean_ae_med75, by = c("study_id", "site_number")) %>%
ungroup()
return(df_site)
}
#' @title Poisson test for vector with site AEs vs vector with study AEs.
#' @description Internal function used by [sim_sites()][sim_sites()].
#' @param site_ae vector with AE numbers
#' @param study_ae vector with AE numbers
#' @param visit_med75 integer
#' @return pval
#' @details sets pvalue=1 if mean AE site is greater than mean AE study or ttest gives error
#' @examples
#' poiss_test_site_ae_vs_study_ae(
#' site_ae = c(5, 3, 3, 2, 1, 6),
#' study_ae = c(9, 8, 7, 9, 6, 7, 8),
#' visit_med75 = 10
#')
#'
#' poiss_test_site_ae_vs_study_ae(
#' site_ae = c(11, 9, 8, 6, 3),
#' study_ae = c(9, 8, 7, 9, 6, 7, 8),
#' visit_med75 = 10
#')
#' @seealso [sim_sites()][sim_sites()]
#' @rdname poiss_test_site_ae_vs_study_ae
#' @export
poiss_test_site_ae_vs_study_ae <- function(site_ae,
study_ae,
visit_med75) {
# if there is only one site
if (is.null(study_ae)) {
pval <- 1
return(pval)
}
mean_ae_site <- mean(site_ae, na.rm = TRUE)
mean_ae_study <- mean(study_ae, na.rm = TRUE)
# this can happen when no patients with site visit_med75 were found in study
if (is.na(mean_ae_study)) {
return(NA)
}
# we are not interested in cases where site AE is greater study AE
if (mean_ae_site > mean_ae_study) {
pval <- 1
return(pval)
}
n_pat_site <- length(site_ae)
n_pat_study <- length(study_ae)
time_base_site <- n_pat_site * visit_med75
time_base_study <- n_pat_study * visit_med75
n_events_site <- sum(site_ae)
n_events_study <- sum(study_ae)
safely_poisson_test <- purrr::safely(poisson.test)
poisson_test <- safely_poisson_test(c(n_events_site, n_events_study),
c(time_base_site, time_base_study))
pval <- poisson_test$result$p.value
# this controls for cases when poisson.test fails for some reason
if (is.null(pval)) {
pval <- 1
}
return(pval)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.