R/cv.coxkl.R

Defines functions cv.coxkl

Documented in cv.coxkl

#' Cross-Validated Selection of Integration Parameter (`eta`) for the Cox–KL Model
#'
#' Performs K-fold cross-validation to select the integration parameter `eta`
#' for the Cox–KL model. Each fold fits the model on a training split and
#' evaluates on the held-out split using the specified performance criterion.
#' 
#'
#' @param z Numeric matrix of covariates (rows = observations, columns = variables).
#' @param delta Numeric vector of event indicators (1 = event, 0 = censored).
#' @param time Numeric vector of observed event or censoring times.
#' @param stratum Optional numeric or factor vector defining strata. If `NULL`,
#'   all observations are treated as a single stratum.
#' @param RS Optional numeric vector or matrix of external risk scores. If omitted,
#'   `beta` must be supplied.
#' @param beta Optional numeric vector of external coefficients. If omitted, `RS`
#'   must be supplied.
#' @param etas Numeric vector of candidate tuning values to be cross-validated. (required). 
#'   Values are internally sorted in ascending order.
#' @param tol Convergence tolerance for the optimizer used inside \code{\link{coxkl}}. Default `1e-4`.
#' @param Mstop Maximum number of Newton iterations used inside \code{\link{coxkl}}. Default `100`.
#' @param backtrack Logical; if `TRUE`, backtracking line search is applied during
#'   optimization. Default is `FALSE`.
#' @param nfolds Number of cross-validation folds. Default `5`.
#' @param criteria Character string specifying the performance criterion.
#'   Choices are `"V&VH"` (default), `"LinPred"`, `"CIndex_pooled"`, or `"CIndex_foldaverage"`.
#' @param c_index_stratum Optional stratum vector. Only required when
#'   \code{criteria} is set to `"CIndex_pooled"` or `"CIndex_foldaverage"`,
#'   and a stratified C-index is desired while the fitted model is non-stratified.
#'   Default `NULL`.
#' @param message Logical; if `TRUE`, prints progress messages and per-fold progress bars. Default `FALSE`.
#' @param seed Optional integer seed for reproducible fold assignment. Default `NULL`.
#' @param ... Additional arguments passed to \code{\link{coxkl}}.
#'
#'
#' @details
#' External information is required: supply either \code{RS} or \code{beta} (if \code{beta} is given,
#' \code{RS} is computed as \code{z \%*\% beta}). Folds are created with stratification by
#' \code{stratum} and censoring status. Within each fold and each candidate \code{eta},
#' the function fits \code{coxkl} on the training split with warm-starts initialized to zero
#' and evaluates on the test split:
#' \itemize{
#'   \item \code{"V&VH"}: uses the difference of partial log-likelihoods between full and
#'         training fits; reported as \eqn{-2} times the aggregated quantity.
#'   \item \code{"LinPred"}: aggregates the test-split linear predictors across folds and
#'         evaluates \eqn{-2} times the partial log-likelihood on the full data.
#'   \item \code{"CIndex_pooled"}: pools pairwise comparable counts across folds (numerator/denominator).
#'   \item \code{"CIndex_foldaverage"}: averages the per-fold stratified C-index.
#' }
#' The function also computes an external baseline statistic from \code{RS} using the
#' same criterion for comparison.
#'
#' @return An object of class \code{"cv.coxkl"} with components:
#' \describe{
#'   \item{\code{internal_stat}}{A data.frame with one row per \code{eta} containing \code{eta} and the
#'         cross-validated measure named according to \code{criteria} (one of
#'         \code{VVH_Loss}, \code{LinPred_Loss}, \code{CIndex_pooled}, \code{CIndex_foldaverage}).}
#'   \item{\code{external_stat}}{Scalar baseline statistic computed from \code{RS} under the same \code{criteria}.}
#'   \item{\code{criteria}}{The evaluation criterion used.}
#'   \item{\code{nfolds}}{Number of folds.}
#' }
#'
#' @examples
#' data(ExampleData_lowdim)
#' 
#' train_dat_lowdim <- ExampleData_lowdim$train
#' beta_external_good_lowdim <- ExampleData_lowdim$beta_external_good
#' 
#' etas <- generate_eta(method = "exponential", n = 10, max_eta = 5)
#' 
#' cv_res <- cv.coxkl(z = train_dat_lowdim$z,
#'                    delta = train_dat_lowdim$status,
#'                    time = train_dat_lowdim$time,
#'                    beta = beta_external_good_lowdim,
#'                    etas = etas)
#'                     
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
cv.coxkl <- function(z, delta, time, stratum = NULL,
                     RS = NULL, beta = NULL,
                     etas = NULL,
                     tol = 1.0e-4, Mstop = 100,
                     backtrack = FALSE,
                     nfolds = 5,
                     criteria = c("V&VH", "LinPred", "CIndex_pooled", "CIndex_foldaverage"),
                     c_index_stratum = NULL,
                     message = FALSE,
                     seed = NULL, ...) {
  
  criteria <- match.arg(criteria, choices = c("V&VH", "LinPred", "CIndex_pooled", "CIndex_foldaverage"))
  
  ## Check and prepare external risk score
  if (is.null(etas)) stop("etas must be provided.", call. = FALSE)
  etas <- sort(etas)
  
  if (is.null(RS) && is.null(beta)) {
    stop("No external information is provided. Either RS or beta must be provided.")
  } else if (is.null(RS) && !is.null(beta)) {
    if (length(beta) == ncol(z)) {
      if (message) message("External beta information is used.")
      RS <- as.matrix(z) %*% as.matrix(beta)
    } else {
      stop("The dimension of beta does not match the number of columns in z.")
    }
  } else {
    RS <- as.matrix(RS)
    if (message) message("External Risk Score information is used.")
  }
  
  ## Process stratum
  if (is.null(stratum)) {
    warning("Stratum not provided. Treating all data as one stratum.", call. = FALSE)
    stratum <- rep(1, nrow(z))
  } else {
    if (!is.null(c_index_stratum) & !identical(stratum, c_index_stratum)) {
      stop("The provided 'c_index_stratum' is not identical to 'stratum'!")
    }
    stratum <- match(stratum, unique(stratum))
  }
  
  ## Sort data by stratum and time
  time_order <- order(stratum, time)
  time <- as.numeric(time[time_order])
  stratum <- as.numeric(stratum[time_order])
  z <- as.matrix(z)[time_order, , drop = FALSE]
  delta <- as.numeric(delta[time_order])
  RS <- RS[time_order, , drop = FALSE]
  
  n <- nrow(z)
  n_eta <- length(etas)
  
  ## Precompute counts per stratum for full data (used by VVH / LinPred external)
  n.each_stratum_full <- as.numeric(table(stratum))
  
  ## Fix seed for reproducibility and create folds
  if (!is.null(seed)) set.seed(seed)
  folds <- get_fold(nfolds = nfolds, delta = delta, stratum = stratum)
  
  ## Storage for internal CV results
  result_mat <- matrix(NA_real_, nrow = nfolds, ncol = n_eta)
  if (criteria == "LinPred") {
    cv_all_linpred <- matrix(NA, nrow = n, ncol = n_eta)
  } else if (criteria == "CIndex_pooled") {
    cv_pooled_cindex_array <- array(0, dim = c(nfolds, n_eta, 2))  # [fold, eta, numer/denom]
  }
  
  ## Storage for external baseline, matched to each criteria
  if (criteria == "V&VH") {
    # For VVH we need fold-wise: pl_full(RS) - pl_train(RS)
    pl_full_RS <- pl_cal_theta(as.vector(RS), delta, n.each_stratum_full)
    ext_vvh_per_fold <- numeric(nfolds)
  } else if (criteria == "LinPred") {
    # For LinPred the assembled CV linear predictor equals RS itself
    # external_stat will be computed once at the end (no need for folds)
    # placeholder not needed
    NULL
  } else if (criteria == "CIndex_pooled") {
    ext_numer <- numeric(nfolds)
    ext_denom <- numeric(nfolds)
  } else if (criteria == "CIndex_foldaverage") {
    ext_c_per_fold <- numeric(nfolds)
  }
  
  ## Outer loop over folds
  for (f in seq_len(nfolds)) {
    if (message) message(sprintf("CV fold %d/%d starts...", f, nfolds))
    
    train_idx <- which(folds != f)
    test_idx  <- which(folds == f)
    
    z_train <- z[train_idx, , drop = FALSE]
    delta_train <- delta[train_idx]
    time_train <- time[train_idx]
    stratum_train <- stratum[train_idx]
    RS_train <- RS[train_idx, , drop = FALSE]
    
    beta_initial <- rep(0, ncol(z))  # warm start for each fold
    
    ## ----- External baseline per fold, matched to criteria -----
    if (criteria == "V&VH") {
      # VVH external for this fold: pl_full(RS) - pl_train(RS)
      n.each_stratum_train <- as.numeric(table(stratum_train))
      pl_train_RS <- pl_cal_theta(as.vector(RS_train), delta_train, n.each_stratum_train)
      ext_vvh_per_fold[f] <- pl_full_RS - pl_train_RS
    } else if (criteria == "CIndex_pooled") {
      # pooled: sum numer/denom over test folds
      if (is.null(c_index_stratum)) {
        stratum_test <- stratum[test_idx]
      } else {
        stratum_test <- c_index_stratum[test_idx]
      }
      cstat_ext <- c_stat_stratcox(time[test_idx],
                                   as.vector(RS[test_idx]),
                                   stratum_test,
                                   delta[test_idx])
      ext_numer[f] <- cstat_ext$numer
      ext_denom[f] <- cstat_ext$denom
    } else if (criteria == "CIndex_foldaverage") {
      if (is.null(c_index_stratum)) {
        stratum_test <- stratum[test_idx]
      } else {
        stratum_test <- c_index_stratum[test_idx]
      }
      cstat_ext <- c_stat_stratcox(time[test_idx],
                                   as.vector(RS[test_idx]),
                                   stratum_test,
                                   delta[test_idx])
      ext_c_per_fold[f] <- cstat_ext$c_statistic
    }
    
    ## ----- Cross-validation over eta sequence (internal model) -----
    if (message) {
      cat("Cross-validation over eta sequence:\n")
      pb <- txtProgressBar(min = 0, max = n_eta, style = 3, width = 30)
    }
    
    for (i in seq_along(etas)) {
      eta <- etas[i]
      
      cox_estimate <- coxkl(z = z_train,
                            delta = delta_train,
                            time = time_train,
                            stratum = stratum_train,
                            RS = RS_train,
                            etas = eta,
                            tol = tol,
                            Mstop = Mstop,
                            backtrack = backtrack,
                            message = FALSE,
                            data_sorted = TRUE,
                            beta_initial = beta_initial)
      
      beta_train <- cox_estimate$beta
      beta_initial <- beta_train  # warm start
      
      z_test <- z[test_idx, , drop = FALSE]
      delta_test <- delta[test_idx]
      time_test <- time[test_idx]
      LP_test <- as.matrix(z_test) %*% as.matrix(beta_train)
      
      if (criteria == "V&VH") {
        LP_train <- as.matrix(z_train) %*% as.matrix(beta_train)
        LP_internal <- as.matrix(z) %*% as.matrix(beta_train)
        n.each_stratum_train <- as.numeric(table(stratum_train))
        n.each_stratum_full  <- n.each_stratum_full  # already computed
        
        result_mat[f, i] <-
          pl_cal_theta(LP_internal, delta, n.each_stratum_full) -
          pl_cal_theta(LP_train,    delta_train, n.each_stratum_train)
        
      } else if (criteria == "LinPred") {
        cv_all_linpred[test_idx, i] <- LP_test
        
      } else {
        if (is.null(c_index_stratum)) {
          stratum_test <- stratum[test_idx]
        } else {
          stratum_test <- c_index_stratum[test_idx]
        }
        if (criteria == "CIndex_pooled") {
          cstat <- c_stat_stratcox(time_test, LP_test, stratum_test, delta_test)
          cv_pooled_cindex_array[f, i, ] <- c(cstat$numer, cstat$denom)
        } else if (criteria == "CIndex_foldaverage") {
          cstat <- c_stat_stratcox(time_test, LP_test, stratum_test, delta_test)$c_statistic
          result_mat[f, i] <- cstat
        }
      }
      if (message) setTxtProgressBar(pb, i)
    }
    if (message) close(pb)
  }
  
  ## Combine CV results across folds (internal model)
  if (criteria == "V&VH") {
    result_vec <- colSums(result_mat, na.rm = TRUE)
  } else if (criteria == "LinPred") {
    result_vec <- apply(cv_all_linpred, 2,
                        function(lp) pl_cal_theta(lp, delta, as.numeric(table(stratum))))
  } else if (criteria == "CIndex_foldaverage") {
    result_vec <- colMeans(result_mat, na.rm = TRUE)
  } else if (criteria == "CIndex_pooled") {
    numer <- apply(cv_pooled_cindex_array[,,1], 2, sum, na.rm = TRUE)
    denom <- apply(cv_pooled_cindex_array[,,2], 2, sum, na.rm = TRUE)
    result_vec <- numer / denom
  }
  
  ## Assemble internal results by eta
  results <- data.frame(eta = etas)
  if (criteria == "V&VH") {
    results$VVH_Loss <- -2 * result_vec  / n
  } else if (criteria == "LinPred") {
    results$LinPred_Loss <- -2 * result_vec  / n
  } else if (criteria == "CIndex_pooled") {
    results$CIndex_pooled <- result_vec
  } else if (criteria == "CIndex_foldaverage") {
    results$CIndex_foldaverage <- result_vec
  }
  
  ## External baseline matched to criteria
  external_stat <- switch(criteria,
                          "V&VH" = -2 * sum(ext_vvh_per_fold),
                          "LinPred" = -2 * pl_cal_theta(as.vector(RS), delta, n.each_stratum_full),
                          "CIndex_pooled" = sum(ext_numer) / sum(ext_denom),
                          "CIndex_foldaverage" = mean(ext_c_per_fold)
  )

  structure(
    list(
      internal_stat = results,
      external_stat = external_stat,
      criteria = criteria,
      nfolds = nfolds
    ),
    class = "cv.coxkl"
  )
}

Try the survkl package in your browser

Any scripts or data that you put into this service are public.

survkl documentation built on April 22, 2026, 1:08 a.m.