# This is where all functions related to absolute risk estimation should appear
#' Compute absolute risks using the fitted hazard function.
#'
#' Using the output of the function \code{fitSmoothHazard}, we can compute
#' absolute risks by integrating the fitted hazard function over a time period
#' and then converting this to an estimated survival for each individual.
#'
#' If \code{newdata = "typical"}, we create a typical covariate profile for the
#' absolute risk computation. This means that we take the median for numerical
#' and date variables, and we take the most common level for factor variables.
#'
#' In general, the output will include a column corresponding to the provided
#' time points. Some modifications of the \code{time} vector are done:
#' \code{time=0} is added, the time points are ordered, and duplicates are
#' removed. All these modifications simplify the computations and give an output
#' that can easily be used to plot risk curves.
#'
#' If there is no competing risk, the output is a matrix where each column
#' corresponds to the several covariate profiles, and where each row corresponds
#' to a time point. If there are competing risks, the output will be a
#' 3-dimensional array, with the third dimension corresponding to the different
#' events.
#'
#' The numerical method should be good enough in most situation, but Monte Carlo
#' integration can give more accurate results when the estimated hazard function
#' is not smooth (e.g. when modeling with time-varying covariates).
#'
#' @param object Output of function \code{\link{fitSmoothHazard}}.
#' @param time A vector of time points at which we should compute the absolute
#' risks.
#' @param newdata Optionally, a data frame in which to look for variables with
#' which to predict. If omitted, the mean absolute risk is returned.
#' Alternatively, if \code{newdata = "typical"}, the absolute risk will be
#' computed at a "typical" covariate profile (see Details).
#' @param method Method used for integration. Defaults to \code{"numerical"},
#' which uses the trapezoidal rule to integrate over all time points together.
#' The only other option is \code{"montecarlo"}, which implements Monte-Carlo
#' integration.
#' @param nsamp Maximal number of subdivisions (if \code{method = "numerical"})
#' or number of sampled points (if \code{method = "montecarlo"}).
#' @param s Value of the penalty parameter lambda at which predictions are
#' required (for class \code{cv.glmnet}).
#' @param onlyMain Logical. For competing risks, should we return absolute risks
#' only for the main event of interest? Defaults to \code{TRUE}.
#' @param type Type of output. Can be \code{"CI"} (output is on the cumulative
#' incidence scale) or \code{"survival"} (output is on the survival scale,
#' i.e. 1- CI)
#' @param addZero Logical. Should we add time = 0 at the beginning of the
#' output? Defaults to \code{TRUE}.
#' @param ntimes Number of time points (only used if \code{time} is missing).
#' @param ... Extra parameters. Currently these are simply ignored.
#' @return If \code{time} was provided, returns the estimated absolute risk for
#' the user-supplied covariate profiles. This will be stored in a matrix or a
#' higher dimensional array, depending on the input (see details). If both
#' \code{time} and \code{newdata} are missing, returns the original data
#' with a new column containing the risk estimate at failure times.
#' @export
#' @importFrom stats formula delete.response setNames
#' @examples
#' # Simulate censored survival data for two outcome types
#' library(data.table)
#' set.seed(12345)
#' nobs <- 1000
#' tlim <- 20
#'
#' # simulation parameters
#' b1 <- 200
#' b2 <- 50
#'
#' # event type 0-censored, 1-event of interest, 2-competing event
#' # t observed time/endpoint
#' # z is a binary covariate
#' DT <- data.table(z = rbinom(nobs, 1, 0.5))
#' DT[,`:=` ("t_event" = rweibull(nobs, 1, b1),
#' "t_comp" = rweibull(nobs, 1, b2))]
#' DT[,`:=`("event" = 1 * (t_event < t_comp) + 2 * (t_event >= t_comp),
#' "time" = pmin(t_event, t_comp))]
#' DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]
#'
#' out_linear <- fitSmoothHazard(event ~ time + z, DT, ratio = 10)
#'
#' linear_risk <- absoluteRisk(out_linear, time = 10,
#' newdata = data.table("z" = c(0,1)))
absoluteRisk <- function(object, time, newdata,
method = c("numerical", "montecarlo"),
nsamp = 100, s = c("lambda.1se", "lambda.min"),
onlyMain = TRUE,
type = c("CI", "survival"),
addZero = TRUE, ntimes = 100, ...) {
if (!inherits(object, c("glm", "cv.glmnet", "CompRisk"))) {
stop(paste("object is of class", class(object)[1],
"\nabsoluteRisk should be used with an object of class glm,",
"cv.glmnet, gbm, or CompRisk"),
call. = TRUE)
}
if (inherits(object, "CompRisk")) {
return(absoluteRisk.CompRisk(object, time, newdata, method,
nsamp = nsamp, onlyMain = onlyMain,
type = type, addZero = addZero))
}
# Parse arguments
method <- match.arg(method)
type <- match.arg(type)
if (is.numeric(s)) {
s <- s[1]
if (length(s) > 1) {
warning(paste("More than one value for s has been",
"supplied. Only first entry will be used"))
}
} else if (is.character(s)) {
s <- match.arg(s)
}
if (inherits(object, "gbm")) {
# if (missing(n.trees)) stop("n.trees is missing")
stop("gbm is not supported", call. = FALSE)
} else n.trees <- NULL
if (!missing(newdata) && is.character(newdata) && newdata == "typical") {
check_original_data(object$originalData)
newdata <- if (is.null(object$matrix.fit)) {
get_typical(object$originalData)
} else apply(object$originalData$x, 2, median, na.rm = TRUE)
}
# Call the correct function with correct parameters
if (missing(newdata)) {
if (missing(time)) {
# If newdata and time are missing, compute risk for each subject
# at their failure/censoring times
return(estimate_risk(object, method, nsamp, s, n.trees, type = type,
...))
} else {
return(estimate_risk_newtime(object, time,
method = method, nsamp = nsamp,
s = s, n.trees = n.trees, type = type,
addZero = addZero, ...))
}
} else {
if (missing(time)) {
check_original_data(object$originalData)
# If only time is missing, compute risk for each row of newdata
# at equidistant points
max_time <- if (is.null(object$matrix.fit)) {
max(object$originalData[object$timeVar][[1]])
} else max(object$originalData$y[, object$timeVar])
time <- seq(0, max_time, length.out = ntimes)
}
return(estimate_risk_newtime(object, time, newdata,
method, nsamp, s, n.trees, type = type,
addZero = addZero, ...))
}
}
# Helper functions----
# estimate_risk computes one survival probability per covariate profile
# Currently, this is only called when we want the survival probabilities
# at failure times for the original data
estimate_risk <- function(object, method, nsamp, s, n.trees, type, ...) {
check_original_data(object$originalData)
newdata <- object$originalData
if (inherits(newdata, "data.fit")) newdata <- newdata$x
# Create risk variable and make sure it doesn't already exist
riskVar <- ifelse(type == "CI", "risk", "survival")
while (riskVar %in% names(newdata)) riskVar <- paste0(".", riskVar)
# If both newdata and time are missing
# compute risk at failure times
time_vector <- if (is.null(object$matrix.fit)) {
newdata[object$timeVar][[1]]
} else object$originalData$y[, object$timeVar]
# Create a vectorised hazard function
if (inherits(object, "cv.glmnet") && !is.null(object$matrix.fit)) {
hazard <- function(x, fit, newdata, s, n.trees, ...) {
# Note: the offset should be set to zero when estimating the hazard.
newdata_matrix <- newdata[, colnames(newdata) != fit$timeVar,
drop = FALSE]
# newdata_matrix matches output from fitSmoothHazard.fit
temp_matrix <- model.matrix(update(fit$formula_time, ~ . - 1),
setNames(data.frame(x), fit$timeVar))
newdata_matrix <- as.matrix(cbind(temp_matrix,
as.data.frame(newdata_matrix)))
pred <- estimate_hazard(fit, newdata_matrix, plot = FALSE,
ci = FALSE, s = s, n.trees = n.trees, ...)
return(as.numeric(exp(pred)))
}
} else {
hazard <- function(x, fit, newdata, s, n.trees, ...) {
# Note: the offset should be set to zero when estimating the hazard.
newdata2 <- data.frame(newdata)
newdata2 <- newdata2[rep(1, length(x)), ]
newdata2[fit$timeVar] <- x
pred <- estimate_hazard(fit, newdata2, plot = FALSE, ci = FALSE,
s = s, n.trees = n.trees, ...)
return(as.numeric(exp(pred)))
}
}
# Compute cumulative hazard at each failure time
risk_res <- sapply(seq_len(nrow(newdata)), function(j) {
integrate(hazard, lower = 0, upper = time_vector[j],
fit = object, subdivisions = nsamp,
newdata = newdata[j, , drop = FALSE],
s = s, n.trees = n.trees)$value
})
# Format the output depending on type
if (is.data.frame(newdata)) {
newdata[, riskVar] <- ifelse(type == "CI",
1 - exp(-risk_res),
exp(-risk_res))
} else {
newdata <- if (type == "CI") {
cbind(1 - exp(-risk_res), newdata)
} else cbind(exp(-risk_res), newdata)
colnames(newdata)[1] <- riskVar
}
attr(newdata, "type") <- type
return(newdata)
}
# estimate_risk_newtime computes the whole survival curve for each covariate
# profile
#' @importFrom data.table data.table :=
estimate_risk_newtime <- function(object, time, newdata, method, nsamp,
s, n.trees, type, addZero, ...) {
if (missing(newdata)) {
# Should we use the whole case-base dataset or the original one?
check_original_data(object$originalData)
newdata <- object$originalData
unselectTime <- (names(newdata) != object$timeVar)
newdata <- subset(newdata, select = unselectTime)
}
time_ordered <- unique(c(0, sort(time)))
output <- matrix(NA, ncol = nrow(newdata) + 1, nrow = length(time_ordered))
output[, 1] <- time_ordered
colnames(output) <- c("time", rep("", nrow(newdata)))
rownames(output) <- rep("", length(time_ordered))
output[1, -1] <- 0
if (length(time_ordered) > 1) {
if (method == "numerical") {
# Compute points at which we evaluate integral
# Note: there's probably a more efficient way of choosing the knots
knots <- seq(0, max(time_ordered),
length.out = (length(time_ordered) - 1) * nsamp)
knots <- unique(sort(c(knots, time_ordered)))
for (j in seq_len(nrow(newdata))) {
# Extract current obs
current_obs <- newdata[j, , drop = FALSE]
# Use trapezoidal rule for integration----
if (inherits(newdata, "matrix")) {
newdata2 <- current_obs[,
colnames(newdata) != object$timeVar,
drop = FALSE]
# newdata2 matches the output from fitSmoothHazard.fit
temp_matrix <- model.matrix(update(object$formula_time,
~ . - 1),
setNames(data.frame(knots),
object$timeVar))
newdata2 <- as.matrix(cbind(temp_matrix,
as.data.frame(newdata2)))
} else {
# Create data.table for prediction
newdata2 <- data.table::data.table(current_obs)
newdata2 <- newdata2[rep(1, length(knots))]
newdata2[, object$timeVar := knots]
}
pred <- estimate_hazard(object = object, newdata = newdata2,
plot = FALSE, ci = FALSE,
s = s, n.trees = n.trees, ...)
# Compute integral using trapezoidal rule
# First remove infinite values (e.g. with log(t))
pred_exp <- exp(pred)
pred_exp[which(pred_exp %in% c(Inf, -Inf))] <- 0
which_knots <- knots %in% c(0, time_ordered)
output[, j + 1] <- trap_int(knots, pred_exp)[which_knots]
}
}
if (method == "montecarlo") {
# Sample points at which we evaluate function
knots <- runif(n = length(time_ordered) * nsamp,
min = 0, max = max(time_ordered))
for (j in seq_len(nrow(newdata))) {
# Extract current obs
current_obs <- newdata[j, , drop = FALSE]
# Create data.table for prediction
newdata2 <- data.table(current_obs)
newdata2 <- newdata2[rep(1, length(knots))]
newdata2[, object$timeVar := knots]
pred <- estimate_hazard(object = object, newdata = newdata2,
plot = FALSE, ci = FALSE,
s = s, n.trees = n.trees, ...)
# Compute integral using MC integration
pred_exp <- exp(pred)
pred_exp[which(pred_exp %in% c(Inf, -Inf))] <- NA
mean_values <- sapply(split(pred_exp,
cut(knots, breaks = time_ordered)),
mean, na.rm = TRUE)
integral_estimates <- mean_values * diff(time_ordered)
output[, j + 1] <- cumsum(c(0, integral_estimates))
}
}
output[, -1] <- exp(-output[, -1])
if (type == "CI") output[, -1] <- 1 - output[, -1]
}
# Sometimes montecarlo integration gives nonsensical probability estimates
if (method == "montecarlo" && (any(output[, -1] < 0) |
any(output[, -1] > 1))) {
warning(paste("Some probabilities are out of range. Consider",
"increasing nsamp or using numerical integration"),
call. = FALSE)
}
# Add time = 0?
if (!addZero) output <- output[-1, , drop = FALSE]
# Add class
class(output) <- c("absRiskCB", class(output))
attr(output, "type") <- type
attr(output, "newdata") <- newdata
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.