R/cvf.R

Defines functions cvf

Documented in cvf

#' Cross-validation internal function for cv_plmm
#'
#' Internal function for cv_plmm which calls plmm on a fold subset of the original data.
#'
#' @param i Fold number to be excluded from fit.
#' @param fold n-length vector of fold-assignments.
#' @param type A character argument indicating what should be returned from predict.plmm. If \code{type == 'lp'} predictions are based on the linear predictor, \code{$X beta$}. If \code{type == 'individual'} predictions are based on the linear predictor plus the estimated random effect (BLUP).
#' @param cv_args List of additional arguments to be passed to plmm.
#' @param ... Optional arguments to `predict_within_cv`
#'
#' @keywords internal
#'
#' @returns a list with three elements:
#' * a numeric vector with the loss at each value of lambda
#' * a numeric value indicating the number of lambda values used
#' * a numeric value with the predicted outcome (y hat) values at each lambda

cvf <- function(i, fold, type, cv_args, ...) {
  # save the 'prep' object from the plmm_prep() in cv_plmm
  full_cv_prep <- cv_args$prep

  # save outcome information -- will need to subset this into test and train sets
  y <- cv_args$y

  # make list to hold the data for this particular fold:
  fold_args <- list(std_X_details = list(),
                    fbm_flag = cv_args$fbm_flag,
                    plink_flag = cv_args$plink_flag,
                    penalty = cv_args$penalty,
                    penalty_factor = cv_args$penalty_factor,
                    gamma = cv_args$gamma,
                    alpha = cv_args$alpha,
                    nlambda = cv_args$nlambda,
                    lambda_min = cv_args$lambda_min,
                    max_iter = cv_args$max_iter,
                    eps = cv_args$eps,
                    warn = cv_args$warn,
                    lambda = cv_args$lambda)

  # subset std_X, U, and y to match fold indices ------------------------
  #   (and in so doing, leave out the ith fold)
  if (cv_args$fbm_flag) {

    # designate the training set
    train_X <- bigmemory::deepcopy(
      full_cv_prep$std_X,
      rows = which(fold != i),
      type = "double",
      backingfile = paste0("train_fold", i, ".bk"),
      descriptorfile = paste0("train_fold", i, ".desc"),
      backingpath = bigmemory::dir.name(full_cv_prep$std_X)
    )

    # create the copy of the training data to be standardized
    fold_args$std_X <- bigmemory::deepcopy(
      full_cv_prep$std_X,
      rows = which(fold != i),
      type = "double",
      backingfile = paste0("std_train_fold", i, ".bk"),
      descriptorfile = paste0("std_train_fold", i, ".desc"),
      backingpath = bigmemory::dir.name(full_cv_prep$std_X)
    )

    # re-scale data & check for singularity
    std_trainX_info <- .Call(
      "big_std",
      fold_args$std_X@address,
      as.integer(count_cores()),
      to_center = TRUE,
      NULL,
      NULL,
      PACKAGE = "plmmr"
    )

    fold_args$std_X@address <- std_trainX_info$std_X
    fold_args$std_X_details$center <- std_trainX_info$std_X_center
    fold_args$std_X_details$scale <- std_trainX_info$std_X_scale
    fold_args$std_X_details$ns <- which(std_trainX_info$std_X_scale > 1e-3)
    singular <- std_trainX_info$std_X_scale < 1e-3

    # do not fit a model on singular features!
    if (sum(singular) >= 1) fold_args$penalty_factor[singular] <- Inf

  } else {

    # subset training data
    train_X <- full_cv_prep$std_X[fold != i, , drop = FALSE]

    # Note: subsetting the data into test/train sets may cause low variance features
    #   to become constant features in the training data. The following lines address this issue

    # re-standardize training data & check for singularity
    std_info <- standardize_in_memory(train_X)
    fold_args$std_X <- std_info$std_X
    fold_args$std_X_details <- std_info$std_X_details

    # do not fit a model on these (near) singular features!
    singular <- fold_args$std_X_details$scale < 1e-3
    if (sum(singular) >= 1) fold_args$penalty_factor[singular] <- Inf
  }

  # subset outcome vector to include outcomes for training data only
  fold_args$y <- cv_args$y[fold != i]

  # center the training outcome
  fold_args$centered_y <- fold_args$y |>
    scale(scale = FALSE) |>
    drop()
  # extract test set --------------------------------------
  # this comes from cv prep on full data
  if (cv_args$fbm_flag) {
    test_X <- bigmemory::deepcopy(full_cv_prep$std_X,
                                  rows = which(fold == i),
                                  type = "double",
                                  backingfile = paste0("test_fold", i, ".bk"),
                                  descriptorfile = paste0("test_fold", i, ".desc"),
                                  backingpath = bigmemory::dir.name(full_cv_prep$std_X))

  } else {
    test_X <- full_cv_prep$std_X[fold == i, , drop = FALSE]
  }

  # subset outcome for test set
  test_y <- y[fold == i]

  # decomposition for current fold ------------------------------
  if (cv_args$prep$trace) {
    cat("Beginning eigendecomposition in fold ", i, ":\n")
  }
  fold_prep <- plmm_prep(std_X = fold_args$std_X,
                         std_X_n = nrow(fold_args$std_X),
                         std_X_p = ncol(fold_args$std_X),
                         n = nrow(full_cv_prep$std_X),
                         p = ncol(full_cv_prep$std_X),
                         centered_y = fold_args$centered_y,
                         fbm_flag = fold_args$fbm_flag,
                         eta_star = cv_args$eta_star,
                         trace = cv_args$prep$trace)
  fold_args$prep <- fold_prep
  # fit a plmm within each fold at each value of lambda
  # lambda stays the same for each fold; comes from the overall fit in cv_plmm()
  if (cv_args$prep$trace) {
    cat("** Fitting model in fold ", i, "\n", sep = "")
  }

  fit.i <- plmm_fit(prep = fold_prep,
                    y = fold_args$y,
                    std_X_details = fold_args$std_X_details,
                    penalty_factor = fold_args$penalty_factor,
                    fbm_flag = fold_args$fbm_flag,
                    penalty = fold_args$penalty,
                    gamma = fold_args$gamma,
                    alpha = fold_args$alpha,
                    lambda_min = fold_args$lambda_min,
                    nlambda = fold_args$nlambda,
                    lambda = fold_args$lambda,
                    eps = fold_args$eps,
                    max_iter = fold_args$max_iter,
                    warn = fold_args$warn)

  format.i <- plmm_format(fit = fit.i,
                          p = ncol(full_cv_prep$std_X),
                          std_X_details = fold_args$std_X_details,
                          fbm_flag = fold_args$fbm_flag,
                          plink_flag = fold_args$plink_flag)

  # prediction ---------------------------------------------------
  # note: predictions are on the scale of the standardized training data
  if (type == "lp") {
    yhat <- predict_within_cv(fit = format.i,
                              testX = test_X,
                              type = "lp",
                              fbm = cv_args$fbm_flag)
  }

  if (type == "blup") {
    Sigma_11 <- construct_variance(K = fold_prep$K, eta = fit.i$eta)
    if (cv_args$fbm_flag) {
      # we will need a copy of the testing data that is standardized
      std_test_X <- bigmemory::deepcopy(full_cv_prep$std_X,
                                        rows = which(fold == i),
                                        type = "double",
                                        backingfile = paste0("std_test_fold", i, ".bk"),
                                        descriptorfile = paste0("std_test_fold", i, ".desc"),
                                        backingpath = bigmemory::dir.name(full_cv_prep$std_X))

      # use center/scale values from train_X to standardize test_X
      if (length(singular) >= 1) fold_args$std_X_details$scale[singular] <- 1
      std_test_info <- .Call("big_std",
                             std_test_X@address,
                             as.integer(count_cores()),
                             tocenter = TRUE,
                             fold_args$std_X_details$center,
                             fold_args$std_X_details$scale,
                             PACKAGE = "plmmr")
      std_test_X@address <- std_test_info$std_X

      const <- (fit.i$eta / ncol(train_X))
      XXt <- bigalgebra::dgemm(TRANSA = "N",
                               TRANSB = "T",
                               A = std_test_X,
                               B = fold_args$std_X)
      Sigma_21 <- const * XXt
      Sigma_21 <- Sigma_21[,] # convert to in-memory matrix
    } else {
      # don't rescale columns that were singular features in std_train_X;
      #   these features will have an estimated beta of 0 anyway
      if (length(singular) >= 1) fold_args$std_X_details$scale[singular] <- 1
      # use center/scale values from train_X to standardize test_X
      std_test_X <- scale(test_X,
                          center = fold_args$std_X_details$center,
                          scale = fold_args$std_X_details$scale)
      Sigma_21 <- fit.i$eta * (1 / ncol(train_X)) * tcrossprod(std_test_X,
                                                               fold_args$std_X)
    }

    yhat <- predict_within_cv(fit = format.i,
                              testX = test_X,
                              type = "blup",
                              fbm = cv_args$fbm_flag,
                              Sigma_11 = Sigma_11,
                              Sigma_21 = Sigma_21)

  }

  # cleanup -----------------------------------------------------------------
  # delete files created in cross-validation, if data is filebacked
  if (cv_args$fbm_flag) {
    list.files(path = bigmemory::dir.name(full_cv_prep$std_X),
               pattern = paste0("fold", i),
               full.names = TRUE) |> file.remove()
    gc() # release the pointer
  }

  # return -----------------------------------------------------------------
  loss <- sapply(seq_len(ncol(yhat)), function(ll) plmm_loss(test_y, yhat[, ll]))
  list(loss = loss, nl = length(fit.i$lambda), yhat = yhat)
}

Try the plmmr package in your browser

Any scripts or data that you put into this service are public.

plmmr documentation built on Jan. 22, 2026, 1:07 a.m.