#' <Private function> Prepare the skeleton cohort for subsequent steps.
#' @param cohort Cohort data with at least the following information on each
#' subject: start time (if not 0 for all subjects) and end time of follow-up,
#' censoring status and matching variables (if any). A \code{data.frame} or a
#' matrix with column names.
#' @param t_start_name Name of the variable in \code{cohort} for the start time
#' of follow-up. A \code{string}.
#' @param t_name Name of the variable in \code{cohort} for the event or
#' censoring time. A \code{string}.
#' @param y_name Name of the column of censoring status in each matched set in
#' \code{cohort}, with 1 for event and 0 for censoring. A \code{string}.
#' @param match_var_names Name(s) of the match variable(s) in \code{cohort} used
#' when drawing the NCC. A \code{string} vector.
#' @param print_message Whether to print the message regarding start and exit
#' time. Default is \code{TRUE}.
#' @import dplyr
prepare_cohort <- function(cohort, t_start_name, t_name, y_name,
match_var_names, print_message = TRUE) {
cohort <- unique(as.data.frame(cohort, stringsAsFactors = FALSE))
if (!(y_name %in% names(cohort))) {
stop(simpleError(paste(y_name, "not found in cohort.")))
} else {
cohort$.y <- cohort[, y_name[1]]
}
if (!is.null(t_start_name)) {
if (!(t_start_name %in% names(cohort))) {
stop(simpleError(paste(t_start_name, "not found in cohort.")))
}
if (print_message) {
message(simpleMessage(sprintf(
"Start time is given by variable %s. Event/censoring time is given by variable %s.\n",
t_start_name, t_name
)))
}
cohort$.t_start <- cohort[, t_start_name]
} else {
if (print_message) {
message(simpleMessage(sprintf(
"Start time is 0 for all subjects. Event/censoring time is given by variable %s.\n",
t_name
)))
}
cohort$.t_start <- -Inf # Not used in Surv()
}
if (!(t_name %in% names(cohort))) {
stop(simpleError(paste(t_name, "not found in cohort.")))
} else {
cohort$.t <- cohort[, t_name[1]]
}
if (is.null(match_var_names)) {
cohort %>% mutate(.strata0 = 1, .strata = "match_var=1")
} else {
if (!all(match_var_names %in% names(cohort))) {
stop(simpleError("Make sure all match variables are in cohort."))
}
mat <- unique(as.data.frame(cohort[, match_var_names]))
names(mat) <- match_var_names
match_var <- apply(mat, 1, function(row) paste(row, collapse = "-"))
# Make sure the levels are integers starting from 1
mat$.strata0 <- factor(as.numeric(factor(match_var)))
mat$.strata <- paste0("match_var=", mat$.strata0)
cohort %>% left_join(mat)
}
}
#' <Private function> Prepare the NCC cases for subsequent steps.
#' @param ncc_cases Cases in NCC data. A \code{data.frame} or a matrix with
#' column names. This data should not include the ID of each matched set.
#' @param t_name Name of the variable in \code{ncc_cases} for event time. A
#' \code{string}.
#' @param match_var_names Name(s) of the match variable(s) in
#' \code{ncc_cases} used when drawing the NCC. A \code{string} vector.
#' @import dplyr
prepare_ncc_cases <- function(ncc_cases, t_name, match_var_names) {
ncc <- unique(as.data.frame(ncc_cases, stringsAsFactors = FALSE))
if (!(t_name %in% names(ncc))) {
stop(simpleError(paste(t_name, "not found in ncc.")))
} else {
ncc$.t_event <- ncc[, t_name[1]]
}
if (is.null(match_var_names)) {
ncc %>% mutate(.strata0 = 1, .strata = "match_var=1")
} else {
if (!all(match_var_names %in% names(ncc))) {
stop(simpleError("Make sure all match variables are in ncc."))
}
mat <- unique(as.data.frame(ncc[, match_var_names]))
names(mat) <- match_var_names
match_var <- apply(mat, 1, function(row) paste(row, collapse = "-"))
# Make sure the levels are integers starting from 1
mat$.strata0 <- factor(as.numeric(factor(match_var)))
mat$.strata <- paste0("match_var=", mat$.strata0)
ncc %>% left_join(mat)
}
}
#' <Private function> Compute number at risk at each event time from cohort
#' @param cohort Cohort data prepared by \code{\link{prepare_cohort}}.
#' @param match_var_names Name(s) of the match variable(s).
#' @param staggered Whether time is staggered (i.e., t_start is specified in
#' input).
#' @return Returns a data.frame with the following columns:
#' \describe{
#' \item{t_event}{Unique event times in \code{cohort_skeleton}.}
#' \item{n_event}{Number of events at each event time.}
#' \item{n_at_risk}{Number of subjects at risk at each event time.}
#' \item{strata}{Strata defined by matching variables. \code{match_var=1} if the
#' NCC is only time-matched.}
#' \item{<each matching variable>}{If the NCC is matched on additional
#' confounders, each matching variable will be included as a column to the right
#' of \code{strata}.}
#' }
#' @importFrom survival survfit Surv
#' @importFrom dplyr arrange
#' @importFrom rlang .data
#' @importFrom stats as.formula
compute_risk_tb <- function(cohort, match_var_names, staggered) {
match_var <- cohort$.strata0
if (staggered) {
km <- survfit(as.formula("Surv(.t_start, .t, .y) ~ match_var"), data = cohort)
} else {
km <- survfit(as.formula("Surv(.t, .y) ~ match_var"), data = cohort)
}
km_summ <- summary(km)
if (is.null(km_summ$strata)) {
risk_table <- data.frame(t_event = km_summ$time, n_event = km_summ$n.event,
n_at_risk = km_summ$n.risk,
strata = "match_var=1",
stringsAsFactors = FALSE) %>%
arrange(.data$strata, .data$t_event)
} else {
mat <- unique(cohort[, c(".strata", match_var_names)])
names(mat) <- c("strata", match_var_names)
risk_table <- data.frame(t_event = km_summ$time, n_event = km_summ$n.event,
n_at_risk = km_summ$n.risk,
strata = as.character(km_summ$strata),
stringsAsFactors = FALSE) %>%
arrange(.data$strata, .data$t_event) %>%
left_join(unique(mat))
}
n0 <- sum(risk_table$n_at_risk == 0)
if (n0 > 0) {
warning(simpleWarning(sprintf(
"No subject at risk for %d of the %d event times.", n0, nrow(risk_table)
)))
}
risk_table
}
#' <Private function> Compute the probability that an eligible non-case is not
#' sampled at each event time
#' @param n_event A vector of number of event at each event time (1 if no tie in
#' event time, or >1 in the presence of ties). Non-integers are rounded down.
#' @param n_at_risk A vector of the number of subject at risk at each event
#' time. Non-integers are rounded down.
#' @param n_per_case Number of controls to select per case. A positive integer.
#' Non-integers are rounded down.
#' @return Returns a vector of probabilities.
#' @importFrom dplyr mutate
#' @importFrom rlang .data
p_not_sampled <- function(n_event, n_at_risk, n_per_case) {
n_event <- as.integer(as.vector(n_event))
n_at_risk <- as.integer(as.vector(n_at_risk))
if (anyNA(n_event)) stop(simpleError("Some entries in n_event are not numeric."))
if (anyNA(n_at_risk)) stop(simpleError("Some entries in n_at_risk are not numeric."))
if (length(n_event) != length(n_at_risk)) {
stop(simpleError("n_event, n_at_risk and strata should be vectors of the same length."))
}
if (any(n_at_risk < n_event)) {
stop(simpleError("Some entries in n_at_risk are smaller than n_event. Number at risk must be larger than or equal to number of events."))
}
n_per_case <- as.integer(n_per_case)
if (is.na(n_per_case) || n_per_case < 1) {
stop(simpleError("n_per_case must be a positive integer."))
}
df <- data.frame(n_event = n_event, Rj = n_at_risk - n_event) %>%
dplyr::mutate(n_target = .data$n_event * n_per_case,
n_sampled = ifelse(.data$Rj > .data$n_target,
.data$n_target, .data$Rj),
p0 = 1 - .data$n_sampled / .data$Rj, # not sampled
p = .data$p0)
as.numeric(df$p)
}
#' <Private function> Compute KM-type weight from risk table, provided t=0 for all
#' @param risk_table Output from \code{\link{compute_risk_tb}}, with an additional
#' column \code{p}.
#' @return Returns a \code{data.frame} with \code{t_event}, \code{km_weight} and
#' matching variables (if any).
#' @importFrom dplyr group_by mutate arrange
#' @importFrom rlang .data
#' @importFrom magrittr `%>%`
compute_kmw0 <- function(risk_table) {
km_table <- risk_table %>%
# filter(!is.na(p)) %>%
dplyr::group_by(.data$strata) %>%
dplyr::mutate(cumulative_product = cumprod(.data$p),
sampling_prob = 1 - .data$cumulative_product,
km_weight = 1 / .data$sampling_prob)
if (any(is.na(km_table$km_weight))) {
warning(simpleWarning(sprintf("%d entries in KM-type weight are NA.",
sum(is.na(km_table$km_weight)))))
}
if (any(is.infinite(km_table$km_weight))) {
warning(simpleWarning(sprintf("%d entries in KM-type weight are infinite.",
sum(is.infinite(km_table$km_weight)))))
}
match_var_names <- setdiff(names(risk_table),
c("t_event", "n_event", "n_at_risk", "strata", "p"))
km_table[, c("t_event", "n_event", "strata", match_var_names, "km_weight")] %>%
dplyr::arrange(.data$strata, .data$t_event) %>%
as.data.frame(stringsAsFactors = FALSE)
}
#' <Private function> Assign appropriate KM-type weight to unique subjects in NCC
#' @inheritParams compute_kmw0
#' @param ncc_nodup NCC data prepared using \code{\link{prepare_cohort}}
#' function. This data should not include the ID of each matched set or the
#' time of event in each set.
#' @return Returns \code{ncc_nodup} with two additional columns: \code{.km_prob}
#' for KM-type probability, and \code{.km_weight} for KM-type weight.
#' @importFrom rlang .data
#' @importFrom dplyr filter select mutate
assign_kmw0 <- function(ncc_nodup, risk_table) {
ncc_nodup <- do.call("rbind", lapply(1:nrow(ncc_nodup), function(j) {
ncc_nodup_j <- ncc_nodup[j, ]
if (ncc_nodup_j$.y == 1) {
km_prob <- 1
} else {
# A subject is in the risk set at time t if this subject is still under
# observation at t-, i.e., right before t. Hence a subject censored
# exactly at time t is still in the risk set for an event at t.
risk_table_i <- risk_table %>%
dplyr::filter(.data$strata == ncc_nodup_j$.strata,
.data$t_event > ncc_nodup_j$.t_start,
.data$t_event <= ncc_nodup_j$.t)
if (nrow(risk_table_i) == 0) {
km_prob <- 0
} else {
km_prob <- 1 - prod(risk_table_i$p)
}
}
ncc_nodup_j %>%
dplyr::mutate(.km_prob = km_prob, .km_weight = 1 / km_prob) %>%
dplyr::select(-.data$.strata0, -.data$.strata, -.data$.y,
-.data$.t_start, -.data$.t) %>%
as.data.frame(stringsAsFactors = FALSE)
}))
if (any(is.na(ncc_nodup$.km_weight))) {
warning(simpleWarning(sprintf("%d entries in KM-type weight are NA.",
sum(is.na(ncc_nodup$.km_weight)))))
}
if (any(is.infinite(ncc_nodup$.km_weight))) {
warning(simpleWarning(sprintf("%d entries in KM-type weight are infinite.",
sum(is.infinite(ncc_nodup$.km_weight)))))
}
ncc_nodup
}
#' <Private function> Compute KM-type weights for NCC sample given full cohort
#' @inheritParams prepare_cohort
#' @param t_start_name Name of the variable in \code{cohort_skeleton} for the
#' start time of follow-up. A \code{string}. Default is \code{NULL}, where all
#' subjects started the follow-up at time 0.
#' @param sample_stat A numeric vector containing sampling and status
#' information for each subject in \code{cohort}: use 0 for non-sampled
#' controls, 1 for sampled (and kept) controls, and integers >=2 for events.
#' The length of this vector must be the same as the number of rows in
#' \code{cohort}.
#' @param match_var_names Name(s) of the match variable(s) in \code{cohort} used
#' when drawing the NCC. A \code{string} vector. Default is \code{NULL}, i.e.,
#' the NCC was only time-matched.
#' @param n_per_case Number of controls matched to each case.
#' @param return_risk_table Whether the risk table should be returned. Default
#' is \code{FALSE}.
#' @param km_names Column names for the KM-type probability (the first element)
#' and weight (the second element) computed, if these two columns are to be
#' attached to each subject in the input data. Default is
#' \code{c(".km_prob", ".km_weight")}.
#' @return If \code{return_risk_table = FALSE} (the default), returns the
#' subcohort of sampled subjects with the appropriate KM-type probability and
#' weight attached to each subject. If \code{return_risk_table = TRUE},
#' returns a list containing this subcohort (\code{dat}) and the risk table
#' (\code{risk_table}), which is a \code{data.frame} containing the distinct
#' event time (\code{t_event}), matching variables (if any), and the number of
#' subject at risk at each event time in each strata defined by matching
#' variables (\code{n_at_risk}).
compute_kmw_cohort <- function(cohort, t_start_name = NULL, t_name, sample_stat,
match_var_names = NULL, n_per_case,
return_risk_table = FALSE,
km_names = c(".km_prob", ".km_weight")) {
cohort <- unique(as.data.frame(cohort, stringsAsFactors = FALSE))
if (length(sample_stat) != nrow(cohort)) {
stop(simpleError(
"The length of sample_stat must be the same as the number of rows in cohort."
))
}
cohort$.y <- as.numeric(sample_stat >= 2)
cohort <- prepare_cohort(cohort = cohort, t_start_name = t_start_name,
t_name = t_name, y_name = ".y",
match_var_names = match_var_names)
risk_table <- compute_risk_tb(cohort = cohort, match_var_names = match_var_names,
staggered = !is.null(t_start_name))
risk_table$p <- p_not_sampled(n_event = risk_table$n_event,
n_at_risk = risk_table$n_at_risk,
n_per_case = n_per_case)
# Assign km_weight to each subject
dat <- assign_kmw0(ncc_nodup = cohort[sample_stat > 0, ],
risk_table = risk_table)
dat <- change_km_names(dat = dat, km_names = km_names)
if (return_risk_table) {
# Return time of event and number of subject at risk at each event time
list(
dat = dat,
risk_table = risk_table[, c("t_event", "n_event", match_var_names, "n_at_risk")]
)
} else {
dat
}
}
#' <Private function> Match number at risk at coarsened time to NCC cases
#' @inheritParams prepare_ncc_cases
#' @param risk_table_manual A \code{data.frame} with columns \code{t_event}
#' (unique event times, possibly coarsened), \code{n_at_risk} (number of
#' subjects at risk at each \code{t_event} in the underlying cohort), and
#' additional columns for matching variables, if any. Make sure the matching
#' variables have the same column names as in \code{ncc_cases} and
#' \code{match_var_names}.
#' @param t_coarse_name Name of the column of event time in each matched set in
#' \code{ncc}, possibly coarsened to the same level as \code{t_event} in
#' \code{risk_table_manual}. A \code{string}.
#' @param t_name Name of the column of the exact event time. A \code{string}.
#' @return Returns a data.frame with the following columns:
#' \describe{
#' \item{t_event}{Unique event times (exact, not coarsened).}
#' \item{n_event}{Number of events at each event time.}
#' \item{n_at_risk}{Number of subjects at risk at each event time.}
#' \item{strata}{Strata defined by matching variables. \code{match_var=1} if the
#' NCC is only time-matched.}
#' \item{<each matching variable>}{If the NCC is matched on additional
#' confounders, each matching variable will be included as a column to the right
#' of \code{strata}.}
#' }
#' @importFrom dplyr rename group_by across summarise ungroup left_join arrange n
#' @importFrom rlang .data
match_risk_table <- function(ncc_cases, risk_table_manual, t_coarse_name, t_name,
match_var_names = NULL) {
if (!is.null(match_var_names)) {
if (!all(match_var_names %in% names(risk_table_manual))) {
stop(simpleError("Make sure all match variables are in risk_table_manual."))
}
}
risk_table_manual <- risk_table_manual %>%
dplyr::rename(.t_event = .data$t_event)
vars <- c(".t_event", match_var_names)
risk_table <- prepare_ncc_cases(ncc_cases = ncc_cases,
t_name = t_coarse_name,
match_var_names = match_var_names) %>%
dplyr::group_by(dplyr::across({{vars}})) %>%
dplyr::summarise(n_event = dplyr::n(), strata = unique(.data$.strata)) %>%
dplyr::ungroup() %>%
dplyr::left_join(risk_table_manual) %>%
dplyr::rename(t_coarse = .data$.t_event)
risk_table[, c("t_coarse", "n_event", "n_at_risk", "strata", match_var_names)] %>%
dplyr::left_join(unique(data.frame(t_coarse = ncc_cases[, t_coarse_name],
t_event = ncc_cases[, t_name])),
by = "t_coarse") %>%
dplyr::arrange(.data$strata, .data$t_event) %>%
as.data.frame(stringsAsFactors = FALSE)
}
#' <Private function> Compute KM-type weight for NCC sample given information on
#' underlying cohort
#' @inheritParams match_risk_table
#' @param ncc NCC data. A \code{data.frame} or a matrix with column names. This
#' data should not include the ID of each matched set, but should include the
#' actual event/censoring time of each subject.
#' @param id_name Name of the column of the subject ID. A \code{string}.
#' @param t_start_name Name of the column for the start time of follow-up. A
#' string. Default is NULL, i.e., every subject started the follow-up at time
#' 0.
#' @param t_name Name of the column of the exact event/censoring time of each
#' subject. Note that for controls, this should not be the time of the event
#' in the same matched set. A \code{string}.
#' @param t_match_name Name of the column of event time in each matched set in
#' \code{ncc}, possibly coarsened to the same level as \code{t_event} in
#' \code{risk_table_manual}. A \code{string}. Default is \code{t_name}, i.e.,
#' not coarsened.
#' @param y_name Name of the column of censoring status in each matched set in
#' \code{ncc}, with 1 for event and 0 for censoring. A \code{string}.
#' @param n_per_case Number of controls matched to each case.
#' @param return_risk_table Whether the risk table should be returned. Default
#' is \code{FALSE}.
#' @param km_names Column names for the KM-type probability (the first element)
#' and weight (the second element) computed, if these two columns are to be
#' attached to each subject in the input data. Default is
#' \code{c(".km_prob", ".km_weight")}.
#' @return If \code{return_risk_table = FALSE} (the default), returns a
#' \code{data.frame} containing all the unique subjects selected in the NCC
#' sample, with a column for the KM-type weight associated with each subject.
#' If \code{return_risk_table = TRUE}, returns a list containing this
#' subcohort (\code{dat}) and the risk table (\code{risk_table}), which is a
#' \code{data.frame} containing the distinct (and exact) event time
#' (\code{t_event}), matching variables (if any), and the number of subject at
#' risk at each event time in each strata defined by matching variables
#' (\code{n_at_risk}).
#' @importFrom dplyr arrange desc mutate
#' @importFrom rlang .data
compute_kmw_ncc <- function(ncc, id_name, risk_table_manual,
t_start_name = NULL, t_name, t_match_name = t_name,
y_name, match_var_names = NULL, n_per_case,
return_risk_table = FALSE,
km_names = c(".km_prob", ".km_weight")) {
message(simpleMessage(
"Make sure input ncc does not include ID of matched sets. Failing to do so results in Errors.\n"
))
ncc <- unique(as.data.frame(ncc, stringsAsFactors = FALSE))
risk_table_manual <- as.data.frame(risk_table_manual, stringsAsFactors = FALSE)
if (!(y_name %in% names(ncc))) {
stop(simpleError(paste(y_name, "not found in ncc.")))
} else {
y_name <- y_name[1]
}
if (is.null(id_name)) {
stop(simpleError("Cannot attach weight to each subject if subject id is unknown."))
} else if (!(id_name %in% names(ncc))) {
stop(simpleError(paste(id_name, "not found in ncc.")))
}
if (is.null(t_name)) {
stop(simpleError("Cannot attach weight to each subject if the event/censoring time is unknown."))
} else if (!(t_name %in% names(ncc))) {
stop(simpleError(paste(t_name, "not found in ncc.")))
}
if (is.null(t_match_name)) {
stop(simpleError("Please specify the variable for event time in each matched set."))
} else if (!(t_match_name %in% names(ncc))) {
stop(simpleError(paste(t_match_name, "not found in ncc.")))
}
if (!is.null(t_start_name)) {
if (!(t_start_name %in% names(ncc))) {
stop(simpleError(paste(t_start_name, "not found in ncc.")))
}
}
risk_tb_vars <- c("t_event", "n_event", "n_at_risk", match_var_names)
if (!all(risk_tb_vars %in% names(risk_table_manual))) {
stop(simpleError(
paste("risk_table_manual must include the following columns:",
toString(risk_tb_vars))
))
}
if (!all(ncc[, t_match_name] %in% risk_table_manual$t_event)) {
stop(simpleError(sprintf(
"There is mismatch in event time in ncc ('%s') and risk_table_manual$t_event.",
t_match_name
)))
}
# Make sure each row in ncc_cases uniquely corresponds to a case, and if a
# case is also selected as a control to other cases, we make sure we keep only
# the row where a case is selected as a case (s.t. if time is not coarsened,
# then t = t_match):
t_match_name_symbol <- as.symbol(t_match_name)
ncc_cases <- unique(ncc[ncc[, y_name] == 1, ]) %>%
dplyr::arrange(dplyr::desc({{t_match_name_symbol}}))
row_id_cases <- which(!duplicated(ncc_cases[, id_name]))
ncc_cases <- ncc_cases[row_id_cases, ]
risk_table <- match_risk_table(ncc_cases = ncc_cases,
risk_table_manual = risk_table_manual,
t_coarse_name = t_match_name, t_name = t_name,
match_var_names = match_var_names)
if (anyNA(risk_table)) {
stop(simpleError(sprintf(
"There is mismatch in event time in ncc ('%s') and risk_table_manual$t_event.",
t_match_name
)))
}
risk_table$p <- p_not_sampled(n_event = risk_table$n_event,
n_at_risk = risk_table$n_at_risk,
n_per_case = n_per_case)
ncc_cases <- ncc_cases[, -which(names(ncc_cases) == t_match_name)]
if (sum(ncc[, y_name] != 1) == 0) {
dat <- ncc_cases %>%
dplyr::mutate(.km_prob = 1, .km_weight = 1 / .data$.km_prob) %>%
change_km_names(km_names = km_names) %>%
as.data.frame(stringsAsFactors = FALSE)
} else {
id_cases <- unique(ncc_cases[, id_name])
id_controls <- setdiff(ncc[ncc[, y_name] != 1, id_name], id_cases)
# For each control, only take the row where it first appeared in the NCC
row_id_controls <- which(ncc[, id_name] %in% id_controls)
ids <- ncc[row_id_controls, id_name]
row_id_controls <- row_id_controls[which(!duplicated(ids))]
ncc_controls <- unique(ncc[row_id_controls, -which(names(ncc) == t_match_name)])
dat <- rbind(ncc_cases, ncc_controls)
dat1 <- prepare_cohort(cohort = dat[sort.list(dat[, id_name]), ],
t_start_name = t_start_name, t_name = t_name,
y_name = y_name, match_var_names = match_var_names,
print_message = FALSE)
dat2 <- assign_kmw0(ncc_nodup = dat1, risk_table = risk_table)
dat <- change_km_names(dat = dat2, km_names = km_names)
}
message(simpleMessage(sprintf(
"There are %d unique subjects (identified by %s) in the input ncc with %d rows, therefore the return data only has %d rows.\n",
length(unique(ncc[, id_name])), id_name, nrow(ncc), nrow(dat)
)))
if (return_risk_table) {
list(
dat = dat,
risk_table = risk_table[, c("t_event", "n_event", match_var_names, "n_at_risk")]
)
} else {
dat
}
}
#' Prepare a template risk table given a nested case-control sample
#' @description Given a nested case-control (NCC) data, create a template for
#' users to fill in the number of subjects at risk in the cohort at each event
#' time within each sampling stratum (if applicable).
#' @param ncc Nested case-control data. A \code{data.frame} or a matrix with
#' column names.
#' @param t_match_name Name of the column of event time in each matched set in
#' \code{ncc}. A \code{string}. Time should be recorded in a reasonable
#' resolution of measurement (e.g., in months or years) such that the number
#' of subjects at risk in the cohort is available.
#' @param y_name Name of the case/control indicator in \code{ncc}, with 1 for
#' case and 0 for control A \code{string}.
#' @param match_var_names Name(s) of the match variable(s) used when drawing the
#' NCC. A \code{string} vector. Default is NULL, i.e., the NCC was only
#' time-matched.
#' @param csv_file Name of CSV file to write the template to. If left
#' unspecified, the template will be returned as a \code{data.frame} instead.
#' @return If \code{csv_file} is not supplied, this function returns a
#' \code{data.frame} that contains the unique event times, additional matching
#' variables (if \code{match_var_names} is supplied), and an empty column
#' named \code{n.risk}. Column names in this \code{data.frame} should not be
#' modified. If \code{csv_file} is supplied, this function write the
#' \code{data.frame} to the designated file instead. Users should fill this
#' column with the number of subjects at risk at each time point in the
#' stratum, and later use this \code{data.frame} or file to compute the
#' KM-type weights for the subjects in the NCC data.
#' @importFrom dplyr filter group_by summarise rename n
#' @importFrom utils write.csv
#' @export
#' @example man/examples/prepare_risk_table.R
prepare_risk_table <- function(ncc, t_match_name, y_name, match_var_names = NULL,
csv_file = NULL) {
y_name <- y_name[1]
if (!(y_name %in% names(ncc))) {
stop(simpleError(paste(y_name, "not found in ncc.")))
}
t_match_name <- t_match_name[1]
if (!(t_match_name %in% names(ncc))) {
stop(simpleError(paste(t_match_name, "not found in ncc.")))
}
if (!is.null(match_var_names)) {
if (!all(match_var_names %in% names(ncc))) {
stop(simpleError("Make sure all match variables are in ncc."))
}
}
vars <- c(t_match_name, match_var_names)
y_name_symbol <- as.symbol(y_name)
t_name_symbol <- as.symbol(t_match_name)
risk_table <- ncc %>%
dplyr::filter({{y_name_symbol}} == 1) %>%
dplyr::group_by(across({{vars}})) %>%
dplyr::summarise(n_at_risk = dplyr::n()) %>%
dplyr::rename(t_event = {{t_name_symbol}})
risk_table$n_at_risk <- NA
if (is.null(csv_file)) {
as.data.frame(risk_table)
} else {
write.csv(risk_table, file = csv_file, na = "", row.names = FALSE)
message(paste0("Cohort summary template is saved as '", csv_file, "'."))
NULL
}
}
#' Compute number at risk at each event time from a "skeleton cohort"
#' @inheritParams prepare_cohort
#' @param t_start_name Name of the variable in \code{cohort} for the start time
#' of follow-up. A \code{string}. Default is \code{NULL}, i.e., every subject
#' started the follow-up at time 0.
#' @param match_var_names Name(s) of the match variable(s) in \code{cohort} used
#' when drawing the NCC. A \code{string} vector. Default is \code{NULL}, i.e.,
#' the NCC was only time-matched.
#' @return Returns a data.frame with the following columns:
#' \describe{
#' \item{t_event}{Unique event times in \code{cohort_skeleton}.}
#' \item{n_event}{Number of events at each event time.}
#' \item{n_at_risk}{Number of subjects at risk at each event time.}
#' \item{<each matching variable>}{If the NCC is matched on additional
#' confounders, each matching variable will be included as a column to the right
#' of \code{strata}.}
#' }
#' @example man/examples/compute_risk_table.R
#' @importFrom rlang .data
#' @importFrom dplyr select
#' @importFrom magrittr `%>%`
#' @export
#' @seealso \code{\link{compute_km_weights}}
compute_risk_table <- function(cohort, t_start_name = NULL, t_name, y_name,
match_var_names = NULL) {
cohort <- prepare_cohort(cohort = cohort, t_start_name = t_start_name,
t_name = t_name, y_name = y_name,
match_var_names = match_var_names)
compute_risk_tb(cohort = cohort, match_var_names = match_var_names,
staggered = !is.null(t_start_name)) %>%
dplyr::select(-.data$strata) %>%
as.data.frame(stringsAsFactors = FALSE)
}
#' <private function> Change variable names corresponding to KM-type probability
#' and weight
#' @param dat Cohort or NCC data generated from \code{\link{compute_km_weights}}
#' or \code{\link{compute_km_weights_controls}}.
#' @param km_names Names for KM-type probability and weight, corresponding to the
#' last two columns in \code{dat}.
#' @return Returns \code{dat} with names changed for the last two columns.
change_km_names <- function(dat, km_names) {
nc <- ncol(dat)
names(dat)[c(nc - 1, nc)] <- km_names
dat
}
#' Compute Kaplan-Meier type weights for (matched) nested case-control (NCC)
#' sample
#' @inheritParams compute_kmw_cohort
#' @param ncc (Matched) NCC data, if \code{cohort} is not available.
#' \strong{This data should not include the ID of each matched set, but should
#' include the actual event/censoring time of each subject.} A
#' \code{data.frame} or a matrix with column names.
#' @param risk_table_manual Number of subjects at risk at time of each cases in
#' the NCC, if \code{cohort} is not available. A \code{data.frame} or a matrix
#' with column names. See Details.
#' @param t_start_name Name of the variable in \code{cohort} or \code{ncc} for
#' the start time of follow-up. A \code{string}. Default is \code{NULL}, i.e.,
#' every subject started the follow-up at time 0.
#' @param t_name Name of the variable in \code{cohort} or \code{ncc} for the
#' time of event or censoring. A \code{string}. Note that if \code{ncc} is
#' supplied, in order to correctly compute the weight for each sampled control
#' this should be the actual time of censoring, not the time of event of the
#' case in the same matched set.
#' @param t_match_name Name of the column of event time in each matched set in
#' \code{ncc}, possibly coarsened to the same level as \code{t_event} in
#' \code{risk_table_manual}. A \code{string}. Default is \code{t_name}, i.e.,
#' not coarsened.
#' @param y_name Name of the column of censoring status in \code{cohort} or
#' \code{ncc}, with 1 for event and 0 for censoring. A \code{string}.
#' @param match_var_names Name(s) of the match variable(s) in \code{cohort} or
#' \code{ncc} used when drawing the NCC. A \code{string} vector. Default is
#' \code{NULL}, i.e., the NCC was only time-matched. The corresponding column
#' in \code{risk_table_manual} must have the same name.
#' @param id_name Name of the column indicating subject ID in \code{ncc}, if
#' \code{cohort} is not available.
#' @param km_names Column names for the KM-type probability (the first element)
#' and weight (the second element) computed, if these two columns are to be
#' attached to each subject in the input data. Default is
#' \code{c("km_prob", "km_weight")}.
#' @details When the full cohort is not available, in order to compute the
#' correct weights for each sampled control in the NCC sample, it is important
#' to keep the actual time of event or censoring of each subject in the NCC
#' sample, which should be specified as \code{t_name} in the input. Since the
#' number of subjects in each risk set will be supplied separately (i.e., as
#' \code{n_at_risk}) in such scenario, \code{t_match_name} is required to
#' map each control to the appropriate risk set. \code{t_match_name} may be
#' the same as \code{t_name} if the exact risk set is available in
#' \code{n_at_risk}, but when the full cohort is not available the risk set is
#' usually approximated by using a coarsened version of \code{t_name}. For
#' example, when controls were drawn from a population registry by matching on
#' the exact date of death of cases, birth cohort and gender, the number at
#' risk may be approximated by using the population size in the year of event
#' in the same birth cohort of the same gender. In this scenario
#' \code{t_match_name} would be the year of \code{t_name}.
#' @export
#' @seealso \code{\link{compute_risk_table}}
#' @example man/examples/compute_km_weights.R
compute_km_weights <- function(cohort = NULL, ncc = NULL, id_name = NULL,
risk_table_manual = NULL,
t_start_name = NULL, t_name = NULL,
sample_stat = NULL,
t_match_name = t_name, y_name = NULL,
match_var_names = NULL, n_per_case,
return_risk_table = FALSE,
km_names = c("km_prob", "km_weight")) {
if (!is.null(cohort)) {
if (is.null(sample_stat)) {
stop(simpleError("sample_stat is required if cohort is available."))
}
sample_stat <- as.vector(sample_stat)
# Full cohort is available
compute_kmw_cohort(cohort = cohort,
t_start_name = t_start_name, t_name = t_name,
sample_stat = sample_stat,
match_var_names = match_var_names,
n_per_case = n_per_case,
return_risk_table = return_risk_table,
km_names = km_names)
} else {
# Full cohort is not available
if (is.null(risk_table_manual)) {
stop(simpleError("risk_table_manual is required if cohort is not available."))
}
compute_kmw_ncc(ncc = ncc, id_name = id_name,
risk_table_manual = risk_table_manual,
t_start_name = t_start_name, t_name = t_name,
t_match_name = t_match_name, y_name = y_name,
match_var_names = match_var_names,
n_per_case = n_per_case,
return_risk_table = return_risk_table,
km_names = km_names)
}
}
#' Compute Kaplan-Meier type weights for newly collected NCC controls
#' @param ncc_controls Newly collected NCC controls, where each row corresponds
#' to a unique subject. Make sure this dataset does not include any subject
#' that later became cases. This data should include the actual
#' event/censoring time of each subject. A \code{data.frame} or a matrix with
#' column names.
#' @param risk_table_manual Number of subjects at risk at time of each cases in
#' the NCC, prepared using function \code{\link{match_risk_table}}.
#' @param t_start_name Name of the variable in \code{ncc_controls} for the start
#' time of follow-up. A \code{string}. Default is \code{NULL}, i.e., every
#' subject started the follow-up at time 0.
#' @param t_name Name of the variable in \code{ncc_controls} for the time of
#' censoring (coarsening not yet supported). A \code{string}.
#' @param match_var_names Name(s) of the match variable(s) in
#' \code{ncc_controls} used when drawing the NCC. A \code{string} vector.
#' Default is \code{NULL}, i.e., the NCC was only time-matched. The
#' corresponding column in \code{risk_table_manual} must have the same name.
#' @param km_names Column names for the KM-type probability (the first element)
#' and weight (the second element) computed. Default is
#' \code{c("km_prob", "km_weight")}.
#' @param n_per_case Number of controls matched to each case.
#' @export
#' @example man/examples/compute_km_weights_controls.R
#' @seealso \code{\link{match_risk_table}}, \code{\link{compute_km_weights}}
compute_km_weights_controls <- function(ncc_controls, risk_table_manual,
t_start_name = NULL, t_name,
match_var_names = NULL, n_per_case,
km_names = c("km_prob", "km_weight")) {
ncc_controls <- as.data.frame(ncc_controls, stringsAsFactors = FALSE)
ncc_controls$.y <- 0
risk_table <- risk_table_manual
risk_table$p <- p_not_sampled(n_event = risk_table$n_event,
n_at_risk = risk_table$n_at_risk,
n_per_case = n_per_case)
if (is.null(match_var_names)) {
risk_table$strata <- "match_var=1"
} else {
# When there are matching variables, only matching profiles in risk_table
# can be used
risk_table$.strata_str <- unlist(lapply(1:nrow(risk_table), function(i) {
paste(risk_table[i, match_var_names], collapse = ";")
}))
ncc_controls$.strata_str <- unlist(lapply(1:nrow(ncc_controls), function(i) {
paste(ncc_controls[i, match_var_names], collapse = ";")
}))
ncc_controls <- ncc_controls[which(ncc_controls$.strata_str %in%
risk_table$.strata_str), ]
}
ncc_controls1 <- prepare_cohort(
cohort = ncc_controls, t_start_name = t_start_name, t_name = t_name,
y_name = ".y", match_var_names = match_var_names
)
if (!is.null(match_var_names)) {
# Assign strata to risk_table consistent with ncc_controls
risk_table$strata <- ncc_controls1$.strata[match(risk_table$.strata_str,
ncc_controls1$.strata_str)]
risk_table$strata <- ifelse(is.na(risk_table$strata),
risk_table$.strata_str, risk_table$strata)
}
ncc_controls2 <- assign_kmw0(ncc_nodup = ncc_controls1, risk_table = risk_table)
ncc_controls2 <- ncc_controls2[, setdiff(names(ncc_controls2), ".strata_str")]
change_km_names(dat = ncc_controls2, km_names = km_names)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.