sieveSurv: Nonparametric Survival Estimator with B-spline Sieves

Documented in cv_loglik_p

#' Cross-validated observed-data log-likelihood for B-spline coefficients
#'
#' @param nfolds Specifies the number of cross-validation folds. The default value is \code{5}. Although \code{nfolds} can be as large as the sample size (leave-one-out cross-validation), it is not recommended for large datasets. The smallest value allowable is \code{3}.
#' @param interp Indicator of whether the B-spline coefficients in the testing data should be linearly interpolated from the trained model. Defaults to \code{FALSE}.
#' @param seed (Optional) For reproducibility in assigning the folds, an integer to specify the random number generator. Default is \code{NULL}.
#' @param time Column name for follow-up time
#' @param event Column name for event indicators
#' @param covar (Optional) column name(s) for additional fully-observed covariates. Default is \code{NULL}, which estimates unconditional survival.
#' @param bspline Column names for B-spline basis.
#' @param data Dataframe or matrix containing (at least) named columns \code{time}, \code{event}, \code{covar}, and \code{bspline}.
#' @param tol Tolerance to define convergence. Default is \code{1E-4}.
#' @param max_iter Maximum number of iterations allowed for the EM algorithm. Default is \code{1000}.
#' @param assume_last Assume last observed \code{time} is an event (for use when the integration of the survival function is desired). Default is \code{FALSE}.
#' @return
#' \item{avg_pred_loglik}{Stores the average predicted log likelihood.}
#' \item{pred_loglik}{Stores the predicted log likelihoood in each fold.}
#' \item{converged}{Stores the convergence status of the EM algorithm in each run.}
#' @export
#'
cv_loglik_p <- function(nfold = 5, interp = FALSE, seed = NULL, time, event, covar = NULL, bspline, data, tol = 1E-4, max_iter = 1000, assume_last = FALSE) {
  if (nfold < 3) {
    nfold <- 3
    warning("nfold too small. Reverting to 3-fold cross validation.")
  }

  if (!is.null(seed)) {
    set.seed(seed)
  }
  assign_folds <- sample(x = 1:nfold, size = nrow(data), replace = TRUE)
  ll <- status <- message <- rep(NA, nfold)

  for (f in 1:nfold) {
    train <- data[assign_folds != f, ]
    train_fit <- cv_sieveSurv(time = time, event = event, covar = covar, bspline = bspline, data = train,
                              tol = tol, max_iter = max_iter, assume_last = assume_last)
    status[f] <- train_fit$conv
    message[f] <- train_fit$conv_msg

    if (train_fit$conv) {
      # Save coefficients from trained SMLE fit ------------------------------
      train_p <- cbind(k = 1:nrow(train_fit$bspline_coeff), train_fit$bspline_coeff)
      train_x <- train_p[, "x"]

      # Separate test data ---------------------------------------------------
      test <- data[assign_folds == f, ]

      if (interp) {
        # Save unique x from test data ---------------------------------------
        test_x <- unique(data.frame(test[test[, event] == 1, time]))
        test_x <- data.frame(x = test_x[order(test_x), ])
        test_x <- cbind(k_ = 1:nrow(test_x), test_x)
        test_p <- matrix(data = NA, nrow = nrow(test_x), ncol = ncol(B))

        # Linearly interpolate B-spline coefficients for the test x ----------
        for (j in 1:nrow(test_x)) {
          x_ <- test_x[j, "x"]
          bf <- suppressWarnings(expr = max(which(train_x <= x_)))
          af <- suppressWarnings(expr = min(which(train_x >= x_)))
          if (bf == -Inf) { bf <- af }
          if (af == Inf) { af <- bf }

          # x values immediately before/after
          x0 <- train_p[bf, "x"]
          x1 <- train_p[af, "x"]

          # B-spline coefficients immediately before/after
          p0 <- train_p[bf, -c(1:2)]
          p1 <- train_p[af, -c(1:2)]

          if (x1 == x0) {
            test_p[j, ] <- unlist(p0)
          } else {
            test_p[j, ] <- unlist((p0 * (x1 - x_) + p1 * (x_ - x0)) / (x1 - x0))
          }
        }

        # Recale columns of test_p to sum to 1 -------------------------------
        denom <- colSums(test_p)
        denom[denom == 0] <- 1 # Avoid NaN error due to dividing by 0
        re_test_p <- t(t(test_p) / denom)

        # Calculate log-likelihood -------------------------------------------
        ll[f] <- loglik_p(bspline_coeff = re_test_p[, bspline], time = time, event = event, bspline = bspline, data = test)
      } else {
        # Calculate log-likelihood -------------------------------------------
        ll[f] <- loglik_p(bspline_coeff = train_p[, bspline], time = time, event = event, bspline = bspline, data = test, try_event_times = train_x)
      }
    }
  }
  return(list(avg_pred_loglik = mean(ll), pred_loglik = ll, converged = status, converged_msg = message))
}