multivarious: Extensible Data Structures for Multivariate Analysis

Documented in bootstrap_pca print.bootstrap_pca_result

#' Fast, Exact Bootstrap for PCA Results from `pca` function
#'
#' Performs bootstrap resampling for Principal Component Analysis (PCA) based on
#' the method described by Fisher et al. (2016), optimized for high-dimensional
#' data (p >> n). This version is specifically adapted to work with the output
#' object generated by the provided `pca` function (which returns a `bi_projector`
#' object of class 'pca').
#'
#' @param x An object of class 'pca' as returned by the provided `pca` function.
#'   It's expected to contain loadings (`v`), scores (`s`), singular values (`sdev`),
#'   left singular vectors (`u`), and pre-processing info (`preproc`).
#' @param nboot The number of bootstrap resamples to perform. Must be a positive
#'   integer (default: 100).
#' @param k The number of principal components to bootstrap (default: all
#'   components available in the fitted PCA model `x`). Must be less than or
#'   equal to the number of components in `x`.
#' @param parallel Logical flag indicating whether to use parallel processing
#'   via the `future` framework (default: FALSE). Requires the `future.apply` package
#'   and a configured `future` backend (e.g., `future::plan(future::multisession)`).
#' @param cores The number of cores to use for parallel processing if `parallel = TRUE`
#'   (default: `future::availableCores()`). This is used if no `future` plan is set.
#' @param seed An integer value for the random number generator seed for
#'   reproducibility (default: NULL, no seed is set).
#' @param epsilon A small positive value added to standard deviations before
#'   division to prevent division by zero or instability (default: 1e-15).
#' @param ... Additional arguments (currently ignored).
#'
#' @return A `list` object of class `bootstrap_pca_result` containing:
#'   \item{E_Vb}{Matrix (p x k) of the estimated bootstrap means of the principal components (loadings V^b = coefficients).}
#'   \item{sd_Vb}{Matrix (p x k) of the estimated bootstrap standard deviations of the principal components (loadings V^b).}
#'   \item{z_loadings}{Matrix (p x k) of the bootstrap Z-scores for the loadings, calculated as `E_Vb / sd_Vb`.}
#'   \item{E_Scores}{Matrix (n x k) of the estimated bootstrap means of the principal component scores (S^b).}
#'   \item{sd_Scores}{Matrix (n x k) of the estimated bootstrap standard deviations of the principal component scores (S^b).}
#'   \item{z_scores}{Matrix (n x k) of the bootstrap Z-scores for the scores, calculated as `E_Scores / sd_Scores`.}
#'   \item{E_Ab}{Matrix (k x k) of the estimated bootstrap means of the internal rotation matrices A^b.}
#'   \item{Ab_array}{Array (k x k x nboot) containing all the bootstrap rotation matrices A^b.}
#'   \item{Scores_array}{Array (n x k x nboot) containing all the bootstrap score matrices (S^b, with NAs for non-sampled subjects).}
#'   \item{nboot}{The number of bootstrap samples used (successful ones).}
#'   \item{k}{The number of components bootstrapped.}
#'   \item{call}{The matched call to the function.}
#'
#' @details
#' This function implements the fast bootstrap PCA algorithm proposed by
#' Fisher et al. (2016), adapted for the output structure of the provided `pca` function.
#' The `pca` function returns an object containing:
#' \itemize{
#'   \item `v`: Loadings (coefficients, p x k) - equivalent to V in SVD Y = U D V'. Note the transpose difference from `prcomp`.
#'   \item `s`: Scores (n x k) - calculated as U %*% D.
#'   \item `sdev`: Singular values (vector of length k) - equivalent to d.
#'   \item `u`: Left singular vectors (n x k).
#' }
#'
#' The bootstrap algorithm works by resampling the *subjects* (rows) and recomputing
#' the SVD on a low-dimensional representation. Specifically, it computes the SVD
#' of the resampled matrix `D U' P^b`, where `Y = U D V'` is the SVD of the original
#' (pre-processed) data, and `P^b` is a resampling matrix operating on the subjects (columns of U').
#'
#' The SVD of the resampled low-dimensional matrix is `svd(D U' P^b) = A^b S^b (R^b)'`.
#' The bootstrap principal components (loadings) are then calculated as `V^b = V A^b`,
#' and the bootstrap scores are `Scores^b = R^b S^b`.
#'
#' Z-scores are provided as `mean / sd`.
#'
#' **Important Note:** The algorithm assumes the data `Y` used for the *original* SVD (`Y = U D V'`)
#' was appropriately centered (or pre-processed according to `x$preproc`). The bootstrap
#' samples are generated based on the components derived from this pre-processed data.
#'
#' @references
#' Fisher, Aaron, Brian Caffo, Brian Schwartz, and Vadim Zipunnikov. 2016.
#' "Fast, Exact Bootstrap Principal Component Analysis for P > 1 Million."
#' \emph{Journal of the American Statistical Association} 111 (514): 846–60.
#' \doi{10.1080/01621459.2015.1062383}.
#'
#' @export
#' @family pca bootstrap
#' @examples
#' # --- Assuming the pca and svd_wrapper functions provided are loaded ---
#' # Also assuming helper functions like center(), pass(), prep(), init_transform()
#' # are defined and available. Create simplified mocks if needed:
#' center <- function() structure(list(scale=FALSE, center=TRUE), class=c("center", "prepper"))
#' pass <- function() structure(list(scale=FALSE, center=FALSE), class=c("pass", "prepper"))
#' prep <- function(preproc) {
#'    xfun <- function(X) scale(X, center = preproc$center, scale = preproc$scale)
#'    # Add proper reverse logic if needed for projection
#'    return(structure(list(transform = xfun, reverse = function(X) X), class="prep"))
#' }
#' init_transform <- function(proc, X) proc$transform(X)
#' bi_projector <- function(v, s, sdev, u, preproc, classes, method) {
#'     structure(list(v=v, s=s, sdev=sdev, u=u, preproc=preproc, method=method),
#'               class=c(classes, "bi_projector", "projector"))
#' }
#' coefficients.projector <- function(x,...) x$v
#' scores.projector <- function(x,...) x$s
#' sdev.projector <- function(x,...) x$sdev
#' # --- End Mock Helpers ---
#'
#' # Simulate data (p=50, n=20)
#' set.seed(123)
#' p_dim <- 50
#' n_obs <- 20
#' Y_mat <- matrix(rnorm(p_dim * n_obs), nrow = p_dim, ncol = n_obs)
#' # Transpose for pca function input (n x p)
#' X_mat <- t(Y_mat)
#'
#' # Perform PCA using the provided pca function
#' # Use center() pre-processing
#' pca_res <- pca(X_mat, ncomp = 5, preproc = center(), method = "fast")
#'
#' # Run bootstrap on the pca result
#' boot_res <- bootstrap_pca(pca_res, nboot = 50, k = 5, seed = 456)
#'
#' # Explore results
#' print(dim(boot_res$z_loadings)) # p x k Z-scores for loadings (coefficients)
#' print(dim(boot_res$z_scores))   # n x k Z-scores for scores
#'
#' # Plot std dev for first loading vector (V)
#' plot(boot_res$sd_Vb[, 1], ylab = "Bootstrap SD", main = "Loading Vector 1 Variability")
#'
#' # Plot std dev for first score vector (S)
#' plot(boot_res$sd_Scores[, 1], ylab = "Bootstrap SD", main = "Score Vector 1 Variability")
#'
#' \dontrun{
#' # Example with parallel processing (requires future & future.apply)
#' # library(future)
#' # library(future.apply)
#' # plan(multisession) # Set up parallel backend
#' boot_res_parallel <- bootstrap_pca(pca_res, nboot = 100, k = 5, seed = 456,
#'                                  parallel = TRUE)
#' }
#' @importFrom stats sd
#' @importFrom stats cov
bootstrap_pca <- function(x, nboot = 100, k = NULL,
                          parallel = FALSE, cores = NULL,
                          seed = NULL, epsilon = 1e-15, ...) {

  # --- Input Validation and Setup ---
  if (!inherits(x, "pca") || !inherits(x, "bi_projector")) {
      warning("Input 'x' is not of class 'pca' and 'bi_projector' as returned by the 'pca' function. Ensure it has components 'v', 's', 'sdev', and 'u'.")
  }
  required_comps <- c("v", "s", "sdev", "u")
  if (!all(required_comps %in% names(x))) {
      stop("Input object 'x' is missing required components: ",
           paste(required_comps[!required_comps %in% names(x)], collapse=", "))
  }

  # Determine dimensions from the pca object structure
  p <- nrow(x$v)       # Number of features/dimensions (from loadings V)
  n <- nrow(x$s)       # Number of subjects/observations (from scores S = UD)
  k_max <- ncol(x$v)   # Max components available in x

  if (is.null(k)) {
      k <- k_max
      message("Parameter 'k' not specified. Using k = ", k, " components from the PCA object.")
  }

  if (k > k_max) {
    stop("Requested k (", k, ") exceeds the number of components available in the PCA object (", k_max, ").")
  }
   if (k > n) {
      warning("Requested k (", k, ") exceeds the number of observations (", n, "). Results might be unstable.")
  }
  if (k <= 0 || !is.numeric(k) || k != round(k)) stop("k must be a positive integer.")
  if (!is.numeric(nboot) || length(nboot) != 1 || nboot <= 0 || nboot != round(nboot))
    stop("nboot must be a positive integer.")

  # Seed handling with future_lapply requires specific argument
  # if (!is.null(seed)) withr::local_seed(seed) # Apply seed locally before loop if not parallel

  # Get original PCA results (up to k components) from the 'pca' object
  V <- x$v[, 1:k, drop = FALSE]        # Loadings (p x k)
  Scores_UD <- x$s[, 1:k, drop = FALSE] # Scores (n x k) - Note: x$s is U*D
  d <- x$sdev[1:k]                     # Standard deviations (singular values)
  U_svd <- x$u[, 1:k, drop = FALSE]    # Left singular vectors (n x k) - directly available

  # --- Sanity check: Scores_UD should approximate U_svd %*% diag(d) ---
  reconstruct_scores <- U_svd %*% diag(d, nrow = k, ncol = k)
  if (max(abs(Scores_UD - reconstruct_scores)) > sqrt(.Machine$double.eps)) {
      warning(
        "Internal consistency check failed: x$s does not seem to be x$u %*% diag(x$sdev). Ensure 'pca' object structure is correct."
      )
  }
  # --- End Sanity Check ---

  # Calculate the matrix D U' needed for resampling (k x n)
  DUt <- diag(d, nrow = k, ncol = k) %*% t(U_svd) # Correct: D %*% U'

  # --- Bootstrap Core Function ---
  # (Identical to the previous version, operates on derived DUt, n, k)
  svd_one_bootstrap_sample <- function(iter) {
    idx <- sample.int(n, size = n, replace = TRUE)
    DUtPb <- DUt[, idx, drop = FALSE]

    sv <- tryCatch(
      svd(DUtPb, nu = k, nv = k),
      error = function(e) {
        warning("SVD failed for bootstrap sample ", iter, ". Returning NULL. Error: ", e$message, call. = FALSE)
        return(NULL)
      }
    )
    if (is.null(sv)) return(NULL)

    actual_k_svd <- min(k, length(sv$d)) # Number of non-zero singular values returned

    # Handle cases where SVD returns fewer than k components robustly
    Ab <- matrix(0.0, nrow = k, ncol = k)
    Rb <- matrix(0.0, nrow = n, ncol = k) # R^b refers to the right vectors of svd(DUtPb)
    Sb_vals <- numeric(k)

    if (actual_k_svd > 0) {
        Ab[1:k, 1:actual_k_svd] <- sv$u[, 1:actual_k_svd, drop = FALSE]
        Rb[, 1:actual_k_svd] <- sv$v[, 1:actual_k_svd, drop = FALSE]
        Sb_vals[1:actual_k_svd] <- sv$d[1:actual_k_svd]
    } else {
        warning("SVD returned zero components for bootstrap sample ", iter, ". Results for this sample will be zero/NA.", call. = FALSE)
    }

    # --- Sign Correction ---
    diag_Ab <- diag(Ab)
    signs <- sign(diag_Ab)
    signs[signs == 0] <- 1
    Ab <- sweep(Ab, 2, signs, "*")
    Rb <- sweep(Rb, 2, signs, "*")

    # --- Calculate Bootstrap Scores (S^b = R^b S^b_vals) ---
    Scores_b_unordered <- sweep(Rb, 2, Sb_vals, "*")

    # Reconstruct full n x k score matrix with NAs for non-sampled subjects
    Scores_b <- matrix(NA_real_, nrow = n, ncol = k)
    sampled_indices_map <- split(seq_along(idx), idx) # Map original index to positions in bootstrap sample
    for (orig_idx_str in names(sampled_indices_map)) {
       orig_idx <- as.integer(orig_idx_str)
       rows_in_boot <- sampled_indices_map[[orig_idx_str]]
       # Average scores if an original subject was selected multiple times
       Scores_b[orig_idx, ] <- colMeans(Scores_b_unordered[rows_in_boot, , drop = FALSE])
    }

    return(list(Ab = Ab, Scores = Scores_b))
  }

  # --- Run Bootstrap Loop ---
  if (parallel) {
    if (!requireNamespace("future.apply", quietly = TRUE)) {
      stop("Package 'future.apply' needed for parallel processing. Please install it.", call. = FALSE)
    }
    # Setup future plan if not already set, using specified cores or default
    if (is.null(future::plan("list")[[1]]$workers)) { # Check if a plan with workers is set
        num_cores <- if (!is.null(cores)) cores else future::availableCores()
        message("Setting future plan to multisession with ", num_cores, " workers.")
        # Save current plan before changing it so we can restore on exit
        existing_plan <- future::plan()
        future::plan(future::multisession, workers = num_cores)
        # Ensure plan is reset on exit if we set it here
        on.exit(future::plan(existing_plan), add = TRUE)
    }
    # Use future_lapply - seed needs to be handled via future.seed argument
    res_list <- future.apply::future_lapply(1:nboot, svd_one_bootstrap_sample, future.seed = seed)
  } else {
    # Set seed locally if not running in parallel
    if (!is.null(seed)) withr::local_seed(seed)
    res_list <- lapply(1:nboot, svd_one_bootstrap_sample)
  }

  # Filter out NULL results
  failed_samples <- sum(sapply(res_list, is.null))
  if (failed_samples > 0) {
      warning("SVD failed for ", failed_samples, " out of ", nboot, " bootstrap samples. These were excluded.", call.=FALSE)
      res_list <- res_list[!sapply(res_list, is.null)]
      nboot_actual <- length(res_list)
      if (nboot_actual == 0) stop("SVD failed for all bootstrap samples.")
  } else {
      nboot_actual <- nboot
  }


  # --- Aggregate Results ---
  Ab_array <- simplify2array(lapply(res_list, `[[`, "Ab"), higher = TRUE)       # k x k x nboot_actual
  Scores_array <- simplify2array(lapply(res_list, `[[`, "Scores"), higher = TRUE) # n x k x nboot_actual

  # --- Calculate Bootstrap Moments and Z-scores ---

  # 1. Scores
  E_Scores <- apply(Scores_array, 1:2, mean, na.rm = TRUE)
  sd_Scores <- apply(Scores_array, 1:2, sd, na.rm = TRUE)
  sd_Scores[is.na(sd_Scores) | sd_Scores < epsilon] <- epsilon # Handle NAs and apply epsilon
  z_scores <- E_Scores / sd_Scores

  # 2. Loadings (Vb = V Ab)
  E_Ab <- apply(Ab_array, 1:2, mean)
  E_Vb <- V %*% E_Ab

  # Calculate Variance/SD of Loadings Vb using Cov(Ab_k)
  Var_Vb <- matrix(NA_real_, nrow = p, ncol = k)
  if (p > 0 && !is.null(rownames(V))) rownames(Var_Vb) <- rownames(V)
  if (k > 0 && !is.null(colnames(V))) colnames(Var_Vb) <- colnames(V)

  if (nboot_actual <= 1) {
      warning("Cannot compute variance with <= 1 successful bootstrap sample. Returning NA for loading SD and Z-scores.", call.=FALSE)
      sd_Vb <- Var_Vb # Matrix of NAs
      z_loadings <- Var_Vb # Matrix of NAs
  } else {
      for (ki in 1:k) {
        # Transpose needed: Ab_array is k x k x nboot, we need nboot x k for cov
        Ab_k_samples <- t(Ab_array[, ki, ])
        Cov_Ab_k <- cov(Ab_k_samples)

        # Efficient calculation using rowSums
        V_Cov_k <- V %*% Cov_Ab_k
        Term_k <- V_Cov_k * V
        Var_Vb[, ki] <- rowSums(Term_k)
      }

      Var_Vb[Var_Vb < 0 & !is.na(Var_Vb)] <- 0 # Set small negative variances to 0
      sd_Vb <- sqrt(Var_Vb)
      sd_Vb[is.na(sd_Vb) | sd_Vb < epsilon] <- epsilon # Handle NAs and apply epsilon
      z_loadings <- E_Vb / sd_Vb
  }

  # --- Prepare Return Object ---
  ret <- list(
    E_Vb = E_Vb,
    sd_Vb = sd_Vb,
    z_loadings = z_loadings,
    E_Scores = E_Scores,
    sd_Scores = sd_Scores,
    z_scores = z_scores,
    E_Ab = E_Ab,
    Ab_array = Ab_array,
    Scores_array = Scores_array,
    nboot = nboot_actual,
    k = k,
    call = match.call()
  )

  class(ret) <- c("bootstrap_pca_result", "list")
  return(ret)
}

# Keep the print method as it is generic enough
#' Print method for bootstrap_pca_result
#'
#' @param x An object of class `bootstrap_pca_result`.
#' @param ... Additional arguments passed to `print`.
#' @export
#' @keywords internal
print.bootstrap_pca_result <- function(x, ...) {
  cat("Fast Bootstrap PCA Result\n")
  cat("--------------------------\n")
  cat("Call:\n")
  print(x$call)
  cat("\n")
  cat("Number of bootstrap samples (successful):", x$nboot, "\n")
  cat("Number of components bootstrapped (k):", x$k, "\n")
  cat("\n")
  cat("Output contains bootstrap estimates for:\n")
  cat(" - Loadings (Vb = coefficients): Mean (E_Vb), SD (sd_Vb), Z-scores (z_loadings)\n")
  cat(" - Scores (Sb): Mean (E_Scores), SD (sd_Scores), Z-scores (z_scores)\n")
  cat(" - Internal rotation matrices (Ab): Mean (E_Ab), Full array (Ab_array)\n")
  cat(" - Full bootstrap scores array (Scores_array)\n")
  invisible(x)
}