Nothing
#' Cross-Validation for CoxKL Model with elastic net & lasso penalty
#'
#' This function performs cross-validation on the high-dimensional Cox model with
#' Kullback–Leibler (KL) penalty.
#' It tunes the parameter \code{eta} (external information weight) using user-specified
#' cross-validation criteria, while also evaluating a \code{lambda} path
#' (either provided or generated) and selecting the best \code{lambda} per \code{eta}.
#'
#' @param z Numeric matrix of covariates with rows representing individuals and
#' columns representing predictors.
#' @param delta Numeric vector of event indicators (1 = event, 0 = censored).
#' @param time Numeric vector of observed times (event or censoring).
#' @param stratum Optional factor or numeric vector indicating strata.
#' @param RS Optional numeric vector or matrix of external risk scores. If not provided,
#' \code{beta} must be supplied.
#' @param beta Optional numeric vector of external coefficients (length equal to
#' \code{ncol(z)}). If not provided, \code{RS} must be supplied.
#' @param etas Numeric vector of candidate \code{eta} values to be evaluated.
#' @param alpha Elastic-net mixing parameter in \eqn{(0,1]}. Default = \code{1}
#' (lasso penalty).
#' @param lambda Optional numeric scalar or vector of penalty parameters. If \code{NULL},
#' a decreasing path is generated using \code{nlambda} and \code{lambda.min.ratio}.
#' @param nlambda Integer number of lambda values to generate when \code{lambda} is
#' \code{NULL}. Default \code{100}.
#' @param lambda.min.ratio Ratio of the smallest to the largest lambda when
#' generating a sequence (when \code{lambda} is \code{NULL}). Default \code{0.05} when \code{n < p},
#' otherwise \code{1e-3}.
#' @param nfolds Integer; number of cross-validation folds. Default = \code{5}.
#' @param cv.criteria Character string specifying the cross-validation criterion. Choices are:
#' \itemize{
#' \item \code{"V&VH"} (default): \code{"V&VH"} loss.
#' \item \code{"LinPred"}: loss based on cross-validated linear predictors.
#' \item \code{"CIndex_pooled"}: pool all held-out predictions and compute one overall C-index.
#' \item \code{"CIndex_foldaverage"}: average C-index across folds.
#' }
#' @param c_index_stratum Optional stratum vector. Used only when
#' \code{cv.criteria} is \code{"CIndex_pooled"} or \code{"CIndex_foldaverage"} to compute a
#' stratified C-index while the fitted model is non-stratified; if supplied, it must be
#' identical to \code{stratum}. Default \code{NULL}.
#' @param message Logical; whether to print progress messages. Default = \code{FALSE}.
#' @param seed Optional integer random seed for fold assignment.
#' @param ... Additional arguments passed to \code{\link{coxkl_enet}}.
#'
#' @details
#' Data are sorted by \code{stratum} and \code{time}. External info must be from \code{RS} or
#' \code{beta} (if \code{beta} given with length \code{ncol(z)}, \code{RS = z \%*\% beta}); \code{alpha} \eqn{\in (0,1]}.
#'
#' For each candidate \code{eta}, a decreasing \code{lambda} path is used (generated from \code{nlambda}/\code{lambda.min.ratio}
#' if \code{lambda = NULL}); CV folds are created by \code{get_fold}. Each fold fits \code{\link{coxkl_enet}}
#' on the training split (full \code{lambda} path) and evaluates the chosen criterion on the test split.
#'
#' Aggregation follows the code paths for \code{"V&VH"}, \code{"LinPred"}, \code{"CIndex_pooled"}, or \code{"CIndex_foldaverage"}:
#' \itemize{
#' \item \code{"V&VH"}: sums \code{pl(full) - pl(train)} across folds (reported as loss via \code{Loss = -2 * score}).
#' \item \code{"LinPred"}: aggregates test-fold linear predictors and evaluates partial log-likelihood on full data (reported as \code{Loss = -2 * score}).
#' \item \code{"CIndex_pooled"}: pools comparable-pair numerators/denominators across folds to compute one C-index.
#' \item \code{"CIndex_foldaverage"}: averages the per-fold stratified C-index.
#' }
#' The best \code{lambda} is selected per \code{eta} (min loss / max C-index), and the function returns full results,
#' the per-\code{eta} optimum, corresponding coefficients, and an external baseline from \code{RS}.
#'
#' @return An object of class \code{"cv.coxkl_enet"}:
#' \describe{
#' \item{\code{integrated_stat.full_results}}{Data frame with columns
#' \code{eta}, \code{lambda}, and the aggregated CV score for each \code{lambda}
#' under the chosen \code{cv.criteria}. For loss criteria, an additional
#' column with the transformed loss (\code{Loss = -2 * score}); for C-index criteria,
#' a column named \code{CIndex_pooled} or \code{CIndex_foldaverage}.}
#' \item{\code{integrated_stat.best_per_eta}}{Data frame with the best
#' \code{lambda} (per \code{eta}) according to the chosen \code{cv.criteria}
#' (minimizing loss or maximizing C-index).}
#' \item{\code{integrated_stat.betahat_best}}{Matrix of coefficient vectors
#' (columns) corresponding to the best \code{lambda} for each \code{eta}.}
#' \item{\code{external_stat}}{Scalar baseline statistic computed from the external
#' risk score \code{RS} under the same \code{cv.criteria}.}
#' \item{\code{criteria}}{The evaluation criterion used (as provided in \code{cv.criteria}).}
#' \item{\code{alpha}}{The elastic-net mixing parameter used.}
#' \item{\code{nfolds}}{Number of folds.}
#' }
#'
#' @examples
#' data(ExampleData_highdim)
#'
#' train_dat_highdim <- ExampleData_highdim$train
#' beta_external_highdim <- ExampleData_highdim$beta_external
#'
#' etas <- generate_eta(method = "exponential", n = 10, max_eta = 100)
#'
#' cv_res <- cv.coxkl_enet(z = train_dat_highdim$z,
#' delta = train_dat_highdim$status,
#' time = train_dat_highdim$time,
#' stratum = NULL,
#' RS = NULL,
#' beta = beta_external_highdim,
#' etas = etas,
#' alpha = 1.0)
#'
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
cv.coxkl_enet <- function(z, delta, time, stratum = NULL, RS = NULL, beta = NULL,
etas, alpha = 1.0,
lambda = NULL, nlambda = 100, lambda.min.ratio = ifelse(n < p, 0.05, 1e-03),
nfolds = 5,
cv.criteria = c("V&VH", "LinPred", "CIndex_pooled", "CIndex_foldaverage"),
c_index_stratum = NULL,
message = FALSE, seed = NULL, ...) {
## Input check & data preparation
if (is.null(etas)) stop("etas must be provided.", call. = FALSE)
etas <- sort(etas)
cv.criteria <- match.arg(cv.criteria, choices = c("V&VH", "LinPred", "CIndex_pooled", "CIndex_foldaverage"))
if (alpha > 1 | alpha <= 0) stop("alpha must be in (0,1]", call. = FALSE)
if (is.null(RS) && is.null(beta)) {
stop("No external information is provided. Either RS or beta must be provided.")
} else if (is.null(RS) && !is.null(beta)) {
if (length(beta) != ncol(z)) stop("beta dimension mismatch with z.", call. = FALSE)
RS <- as.matrix(z) %*% as.matrix(beta)
} else {
RS <- as.matrix(RS)
}
if (is.null(stratum)) {
warning("Stratum not provided. Treating all data as one stratum.", call. = FALSE)
stratum <- rep(1, nrow(z))
} else {
if (!is.null(c_index_stratum) & !identical(stratum, c_index_stratum)) {
stop("Provided 'c_index_stratum' not identical to 'stratum'!")
}
stratum <- match(stratum, unique(stratum))
}
## Sort data
time_order <- order(stratum, time)
time <- as.numeric(time[time_order])
stratum <- as.numeric(stratum[time_order])
z <- as.matrix(z)[time_order, , drop = FALSE]
delta <- as.numeric(delta[time_order])
RS <- RS[time_order, , drop = FALSE]
n <- nrow(z)
p <- ncol(z)
n.each_stratum_full <- as.numeric(table(stratum))
## CV folds
if (!is.null(seed)) set.seed(seed)
folds <- get_fold(nfolds = nfolds, delta = delta, stratum = stratum)
results_list <- list()
n_eta <- length(etas)
## External baseline accumulators
if (cv.criteria == "V&VH") {
pl_full_RS <- pl_cal_theta(as.vector(RS), delta, n.each_stratum_full)
ext_vvh_per_fold <- numeric(nfolds)
} else if (cv.criteria == "CIndex_pooled") {
ext_numer <- numeric(nfolds); ext_denom <- numeric(nfolds)
} else if (cv.criteria == "CIndex_foldaverage") {
ext_c_per_fold <- numeric(nfolds)
}
if (message) {
cat("Cross-validation over etas sequence:\n")
pb_eta <- txtProgressBar(min = 0, max = n_eta, style = 3, width = 30)
}
fit_beta_list <- vector("list", length(etas))
lambda_list <- vector("list", length(etas))
for (ei in seq_along(etas)) {
eta <- etas[ei]
## Determine lambda path (on full data) & estimation on full data
if (is.null(lambda)) {
fit0 <- coxkl_enet(z = z, delta = delta, time = time, stratum = stratum,
RS = RS, eta = eta, alpha = alpha,
lambda = NULL, nlambda = nlambda, lambda.min.ratio = lambda.min.ratio,
data_sorted = TRUE, message = FALSE, ...)
lambda_seq <- as.vector(fit0$lambda)
} else {
lambda_seq <- sort(lambda, decreasing = TRUE)
fit0 <- coxkl_enet(z = z, delta = delta, time = time, stratum = stratum,
RS = RS, eta = eta, alpha = alpha,
lambda = lambda_seq, data_sorted = TRUE, message = FALSE, ...)
}
fit_beta_list[[ei]] <- fit0$beta
lambda_list[[ei]] <- lambda_seq
L <- length(lambda_seq)
## Accumulators
if (cv.criteria == "V&VH") {
vvh_sum <- rep(0, L)
} else if (cv.criteria == "LinPred") {
Y <- matrix(NA_real_, nrow = n, ncol = L)
colnames(Y) <- round(lambda_seq, 6)
} else if (cv.criteria == "CIndex_pooled") {
numer <- rep(0, L); denom <- rep(0, L)
} else if (cv.criteria == "CIndex_foldaverage") {
csum <- rep(0, L); cnt <- rep(0, L)
}
## Loop over folds
for (f in seq_len(nfolds)) {
train_idx <- which(folds != f)
test_idx <- which(folds == f)
## External baseline per fold (computed once)
if (ei == 1) {
if (cv.criteria == "V&VH") {
n.each_stratum_train <- as.numeric(table(stratum[train_idx]))
pl_train_RS <- pl_cal_theta(as.vector(RS[train_idx]), delta[train_idx], n.each_stratum_train)
ext_vvh_per_fold[f] <- pl_full_RS - pl_train_RS
} else if (cv.criteria == "CIndex_pooled" || cv.criteria == "CIndex_foldaverage") {
if (is.null(c_index_stratum)) stratum_test <- stratum[test_idx] else stratum_test <- c_index_stratum[test_idx]
cstat_ext <- c_stat_stratcox(time[test_idx], as.vector(RS[test_idx]), stratum_test, delta[test_idx])
if (cv.criteria == "CIndex_pooled") {
ext_numer[f] <- cstat_ext$numer
ext_denom[f] <- cstat_ext$denom
} else {
ext_c_per_fold[f] <- cstat_ext$c_statistic
}
}
}
## Fit full lambda path on training fold
fit_f <- coxkl_enet(z = z[train_idx, , drop = FALSE],
delta = delta[train_idx],
time = time[train_idx],
stratum = stratum[train_idx],
RS = RS[train_idx, , drop = FALSE],
eta = eta, alpha = alpha, lambda = lambda_seq,
data_sorted = TRUE, message = FALSE, ...)
beta_mat <- fit_f$beta
LP_train <- z[train_idx, ] %*% beta_mat
LP_all <- z %*% beta_mat
LP_test <- z[test_idx, ] %*% beta_mat
## Compute fold-wise CV metric
if (cv.criteria == "V&VH") {
n.each_stratum_train <- as.numeric(table(stratum[train_idx]))
n.each_stratum_all <- as.numeric(table(stratum))
pl_all <- apply(LP_all, 2, function(col) pl_cal_theta(col, delta, n.each_stratum_all))
pl_tr <- apply(LP_train,2, function(col) pl_cal_theta(col, delta[train_idx], n.each_stratum_train))
vvh_sum <- vvh_sum + (pl_all - pl_tr)
} else if (cv.criteria == "LinPred") {
Y[test_idx, ] <- LP_test
} else {
if (is.null(c_index_stratum)) stratum_test <- stratum[test_idx] else stratum_test <- c_index_stratum[test_idx]
numer_vec <- denom_vec <- cstat_vec <- rep(NA_real_, ncol(LP_test))
for (j in seq_len(ncol(LP_test))) {
cstat_j <- c_stat_stratcox(time[test_idx], LP_test[, j], stratum_test, delta[test_idx])
numer_vec[j] <- cstat_j$numer
denom_vec[j] <- cstat_j$denom
cstat_vec[j] <- cstat_j$c_statistic
}
if (cv.criteria == "CIndex_pooled") {
numer <- numer + numer_vec
denom <- denom + denom_vec
} else {
csum <- csum + cstat_vec
cnt <- cnt + 1
}
}
}
## Aggregate folds for this eta
if (cv.criteria == "V&VH") {
cve_eta <- vvh_sum
} else if (cv.criteria == "LinPred") {
Lmat <- loss.coxkl_highdim(delta, Y, stratum, total = FALSE)
cve_eta <- colSums(Lmat)
} else if (cv.criteria == "CIndex_pooled") {
cve_eta <- numer / denom
} else {
cve_eta <- csum / pmax(cnt, 1)
}
results_list[[ei]] <- data.frame(
eta = eta,
lambda = lambda_seq,
score = cve_eta,
stringsAsFactors = FALSE
)
if (message) setTxtProgressBar(pb_eta, ei)
}
if (message) close(pb_eta)
results_df <- do.call(rbind, results_list)
## Best per eta
if (cv.criteria %in% c("V&VH", "LinPred")) {
results_df$Loss <- -2 * results_df$score / n
results_df$score <- NULL
best_per_eta <- do.call(rbind, lapply(split(results_df, results_df$eta), function(df) df[which.min(df$Loss), , drop = FALSE]))
} else if (cv.criteria == "CIndex_pooled") {
results_df$CIndex_pooled <- results_df$score; results_df$score <- NULL
best_per_eta <- do.call(rbind, lapply(split(results_df, results_df$eta), function(df) df[which.max(df$CIndex_pooled), , drop = FALSE]))
} else {
results_df$CIndex_foldaverage <- results_df$score; results_df$score <- NULL
best_per_eta <- do.call(rbind, lapply(split(results_df, results_df$eta), function(df) df[which.max(df$CIndex_foldaverage), , drop = FALSE]))
}
rownames(best_per_eta) <- NULL
beta_best_mat <- sapply(seq_along(etas), function(i) {
beta_mat <- fit_beta_list[[i]]
lambda_seq <- lambda_list[[i]]
idx <- which(abs(lambda_seq - best_per_eta$lambda[i]) < 1e-12)
if (length(idx) != 1) {
idx <- which.min(abs(lambda_seq - best_per_eta$lambda[i]))
}
beta_mat[, idx]
})
colnames(beta_best_mat) <- etas
## External baseline
external_stat <- switch(cv.criteria,
"V&VH" = -2 * sum(ext_vvh_per_fold),
"LinPred" = -2 * pl_cal_theta(as.vector(RS), delta, n.each_stratum_full),
"CIndex_pooled" = sum(ext_numer) / sum(ext_denom),
"CIndex_foldaverage" = mean(ext_c_per_fold)
)
structure(
list(
integrated_stat.full_results = results_df,
integrated_stat.best_per_eta = best_per_eta,
integrated_stat.betahat_best = beta_best_mat,
external_stat = external_stat,
criteria = cv.criteria,
alpha = alpha,
nfolds = nfolds
),
class = "cv.coxkl_enet"
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.