#' Perform Decision Curve Analysis
#'
#' Diagnostic and prognostic models are typically evaluated with measures of
#' accuracy that do not address clinical consequences.
#' Decision-analytic techniques allow assessment of clinical outcomes but often
#' require collection of additional information may be cumbersome to apply to
#' models that yield a continuous result. Decision curve analysis is a method
#' for evaluating and comparing prediction models that incorporates clinical
#' consequences, requires only the data set on which the models are tested,
#' and can be applied to models that have either continuous or dichotomous
#' results.
#' The dca function performs decision curve analysis for binary outcomes.
#' Review the
#' [DCA Vignette](https://www.danieldsjoberg.com/dcurves/articles/dca.html)
#' for a detailed walk-through of various applications.
#' Also, see [www.decisioncurveanalysis.org](https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics/decision-curve-analysis) for more information.
#'
#' @author Daniel D Sjoberg
#'
#' @param formula a formula with the outcome on the LHS and a sum of
#' markers/covariates to test on the RHS
#' @param data a data frame containing the variables in `formula=`.
#' @param thresholds vector of threshold probabilities between 0 and 1.
#' Default is `seq(0, 0.99, by = 0.01)`. Thresholds at zero are replaced
#' with 10e-10.
#' @param label named list of variable labels, e.g. `list(age = "Age, years")`
#' @param harm named list of harms associated with a test. Default is `NULL`
#' @param as_probability character vector including names of variables
#' that will be converted to a probability. Details below.
#' @param time if outcome is survival, `time=` specifies the time the
#' assessment is made
#' @param prevalence When `NULL`, the prevalence is estimated from `data=`.
#' If the data passed is a case-control set, the population prevalence
#' may be set with this argument.
#'
#' @section as_probability argument:
#' While the `as_probability=` argument can be used to convert a marker to the
#' probability scale, use the argument only when the consequences are fully
#' understood. For example, when the outcome is binary, logistic regression
#' is used to convert the marker to a probability. The logistic regression
#' model assumes linearity on the log-odds scale and can induce
#' miscalibration when this assumption is not true. Miscalibration in a
#' model will adversely affect performance on decision curve
#' analysis. Similarly, when the outcome is time-to-event, Cox Proportional
#' Hazards regression is used to convert the marker to a probability.
#' The Cox model also has a linearity assumption and additionally assumes
#' proportional hazards over the follow-up period. When these assumptions
#' are violated, important miscalibration may occur.
#'
#' Instead of using the `as_probability=` argument, it is suggested to perform
#' the regression modeling outside of the `dca()` function utilizing methods,
#' such as non-linear modeling, as appropriate.
#'
#' @return List including net benefit of each variable
#' @seealso [`net_intervention_avoided()`], [`standardized_net_benefit()`], [`plot.dca()`],
#' [`as_tibble.dca()`]
#' @export
#'
#' @examples
#' # calculate DCA with binary endpoint
#' dca(cancer ~ cancerpredmarker + marker,
#' data = df_binary,
#' as_probability = "marker",
#' label = list(cancerpredmarker = "Prediction Model", marker = "Biomarker")) %>%
#' # plot DCA curves with ggplot
#' plot(smooth = TRUE) +
#' # add ggplot formatting
#' ggplot2::labs(x = "Treatment Threshold Probability")
#'
#' # calculate DCA with time to event endpoint
#' dca(Surv(ttcancer, cancer) ~ cancerpredmarker, data = df_surv, time = 1)
dca <- function(formula, data, thresholds = seq(0, 0.99, by = 0.01),
label = NULL, harm = NULL, as_probability = character(),
time = NULL, prevalence = NULL) {
# checking inputs ------------------------------------------------------------
if (!is.data.frame(data))
stop("`data=` must be a data frame", call. = FALSE)
if (!inherits(formula, "formula"))
stop("`formula=` must be a formula", call. = FALSE)
if (!is.null(label) && (!rlang::is_list(label) || !rlang::is_named(label)))
stop("`label=` must be a named list or NULL.", call. = FALSE)
if (!is.null(harm) && (!rlang::is_list(harm) || !rlang::is_named(harm)))
stop("`harm=` must be a named list or NULL.", call. = FALSE)
if (!is.character(as_probability))
stop("`as_probability=` must be character.", call. = FALSE)
# prepping data --------------------------------------------------------------
thresholds <- thresholds[thresholds >= 0 & thresholds <= 1]
label <-
list(all = "Treat All", none = "Treat None") %>%
purrr::list_modify(!!!label)
model_frame <- stats::model.frame(formula, data)
outcome_name <- names(model_frame)[1]
if (any(c("all", "none") %in% names(model_frame))) {
stop("Variables cannot be named 'all' or 'none': they are reserved.",
call. = FALSE)
}
outcome_type <- .outcome_type(model_frame, outcome_name, time)
# convert to probability if requested ----------------------------------------
model_frame <- .as_probability(model_frame, outcome_name, outcome_type, as_probability, time)
# add treat all and treat none -----------------------------------------------
model_frame <-
model_frame %>%
dplyr::mutate(
all = 1,
none = 0,
dplyr::across(.cols = -dplyr::all_of(outcome_name),
.fns = ~dplyr::case_when(. == 0 ~ 0 - .Machine$double.eps,
. == 1 ~ 1 + .Machine$double.eps,
TRUE ~ .)),
.after = all_of(outcome_name)
) %>%
copy_labels_from(model_frame)
# calculate net benefit ------------------------------------------------------
dca_result <-
test_consequences_data_frame(
model_frame = model_frame,
outcome_name = outcome_name,
outcome_type = outcome_type,
statistics = c("pos_rate", "tp_rate", "fp_rate", "harm", "net_benefit"),
thresholds = thresholds,
label = label,
time = time,
prevalence = prevalence,
harm = harm
)
# return results -------------------------------------------------------------
lst_result <-
list(
call = match.call(),
y = outcome_name,
n = dca_result$n[1],
prevalence = dca_result$pos_rate[1],
time = time,
dca = dca_result
) %>%
purrr::compact()
class(lst_result) <- "dca"
lst_result
}
#' Calculate risks
#'
#' When users request, we can convert a marker to a probability with logistic, or Cox regression
#'
#' @param outcome outcome object
#' @param variable variable
#' @param outcome_type type of outcome
#' @param time time to calculate risk if Surv() outcome
#' @param prevalence specified prevalence (if cannot be estimated from data)
#'
#' @noRd
#' @keywords internal
.convert_to_risk <- function(outcome, variable, outcome_type,
time = NULL, prevalence = NULL) {
if (outcome_type == "binary" && !is.null(prevalence)) {
stop("Cannot convert to risks in case-control setting.")
}
if (outcome_type == "binary") {
risk <-
stats::glm(outcome ~ variable, family = stats::binomial) %>%
stats::predict(type = "response")
} else if (outcome_type == "survival") {
# construct data frame
df <- data.frame(outcome = outcome, variable = variable)
new_df <- data.frame(outcome = outcome, variable = variable)
new_df$outcome[, 1] <- time
# build model, and get predictions for time point of interest
risk <-
survival::coxph(outcome ~ variable, data = df) %>%
stats::predict(newdata = new_df, type = "expected") %>%
{
exp(-.)
}
}
risk
}
#' Convert a `Surv()` object to a n-time risk estimate
#'
#' @param outcome `Surv()` object
#' @param time numeric time
#' @param quiet logical, default is TRUE
#'
#' @noRd
#' @keywords internal
.surv_to_risk <- function(outcome, time, quiet = TRUE) {
df_tidy <-
survival::survfit(outcome) %>%
broom::tidy()
# if multistate (i.e. competing risks) delete states not of interest
if ("state" %in% names(df_tidy)) {
state <- unique(df_tidy$state) %>%
setdiff("(s0)") %>%
purrr::pluck(1)
df_tidy <- df_tidy %>% dplyr::filter(.data$state %in% .env$state)
if (!isTRUE(quiet)) {
glue::glue(
"Multi-state model detected. Showing probabilities into state '{state}'") %>%
message()
}
}
# if regular survfit() model, convert survival to risk
else {
df_tidy <- dplyr::mutate(df_tidy, estimate = 1 - .data$estimate)
}
# if no observed times after specified time, return NA
if (max(df_tidy$time) < time) {
return(NA_real_)
}
df_tidy <-
df_tidy %>%
dplyr::filter(.data$time <= .env$time)
# if no observed times after time point, return NA
if (nrow(df_tidy) == 0L) {
return(NA_real_)
}
df_tidy %>%
dplyr::slice_tail() %>%
dplyr::pull("estimate")
}
# convert marker to a probability
.as_probability <- function(model_frame, outcome_name, outcome_type, as_probability, time) {
as_probability <-
model_frame %>%
dplyr::select(-dplyr::all_of(outcome_name)) %>%
dplyr::select(dplyr::all_of(as_probability))
for (v in names(as_probability)) {
model_frame[[v]] <- .convert_to_risk(model_frame[[outcome_name]],
model_frame[[v]],
outcome_type = outcome_type,
time = time
)
}
.check_probability_range(model_frame, outcome_name)
model_frame
}
# a preceding step may result in the loss of label attributes
# this function helps restore them
copy_labels_from <- function(x, y) {
for (v in names(x)) {
attr(x[[v]], "label") <- attr(y[[v]], "label") %||% attr(x[[v]], "label")
}
x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.