#' Fast, Exact Bootstrap for PCA Results from `pca` function
#'
#' Performs bootstrap resampling for Principal Component Analysis (PCA) based on
#' the method described by Fisher et al. (2016), optimized for high-dimensional
#' data (p >> n). This version is specifically adapted to work with the output
#' object generated by the provided `pca` function (which returns a `bi_projector`
#' object of class 'pca').
#'
#' @param x An object of class 'pca' as returned by the provided `pca` function.
#' It's expected to contain loadings (`v`), scores (`s`), singular values (`sdev`),
#' left singular vectors (`u`), and pre-processing info (`preproc`).
#' @param nboot The number of bootstrap resamples to perform. Must be a positive
#' integer (default: 100).
#' @param k The number of principal components to bootstrap (default: all
#' components available in the fitted PCA model `x`). Must be less than or
#' equal to the number of components in `x`.
#' @param parallel Logical flag indicating whether to use parallel processing
#' via the `future` framework (default: FALSE). Requires the `future.apply` package
#' and a configured `future` backend (e.g., `future::plan(future::multisession)`).
#' @param cores The number of cores to use for parallel processing if `parallel = TRUE`
#' (default: `future::availableCores()`). This is used if no `future` plan is set.
#' @param seed An integer value for the random number generator seed for
#' reproducibility (default: NULL, no seed is set).
#' @param epsilon A small positive value added to standard deviations before
#' division to prevent division by zero or instability (default: 1e-15).
#' @param ... Additional arguments (currently ignored).
#'
#' @return A `list` object of class `bootstrap_pca_result` containing:
#' \item{E_Vb}{Matrix (p x k) of the estimated bootstrap means of the principal components (loadings V^b = coefficients).}
#' \item{sd_Vb}{Matrix (p x k) of the estimated bootstrap standard deviations of the principal components (loadings V^b).}
#' \item{z_loadings}{Matrix (p x k) of the bootstrap Z-scores for the loadings, calculated as `E_Vb / sd_Vb`.}
#' \item{E_Scores}{Matrix (n x k) of the estimated bootstrap means of the principal component scores (S^b).}
#' \item{sd_Scores}{Matrix (n x k) of the estimated bootstrap standard deviations of the principal component scores (S^b).}
#' \item{z_scores}{Matrix (n x k) of the bootstrap Z-scores for the scores, calculated as `E_Scores / sd_Scores`.}
#' \item{E_Ab}{Matrix (k x k) of the estimated bootstrap means of the internal rotation matrices A^b.}
#' \item{Ab_array}{Array (k x k x nboot) containing all the bootstrap rotation matrices A^b.}
#' \item{Scores_array}{Array (n x k x nboot) containing all the bootstrap score matrices (S^b, with NAs for non-sampled subjects).}
#' \item{nboot}{The number of bootstrap samples used (successful ones).}
#' \item{k}{The number of components bootstrapped.}
#' \item{call}{The matched call to the function.}
#'
#' @details
#' This function implements the fast bootstrap PCA algorithm proposed by
#' Fisher et al. (2016), adapted for the output structure of the provided `pca` function.
#' The `pca` function returns an object containing:
#' \itemize{
#' \item `v`: Loadings (coefficients, p x k) - equivalent to V in SVD Y = U D V'. Note the transpose difference from `prcomp`.
#' \item `s`: Scores (n x k) - calculated as U %*% D.
#' \item `sdev`: Singular values (vector of length k) - equivalent to d.
#' \item `u`: Left singular vectors (n x k).
#' }
#'
#' The bootstrap algorithm works by resampling the *subjects* (rows) and recomputing
#' the SVD on a low-dimensional representation. Specifically, it computes the SVD
#' of the resampled matrix `D U' P^b`, where `Y = U D V'` is the SVD of the original
#' (pre-processed) data, and `P^b` is a resampling matrix operating on the subjects (columns of U').
#'
#' The SVD of the resampled low-dimensional matrix is `svd(D U' P^b) = A^b S^b (R^b)'`.
#' The bootstrap principal components (loadings) are then calculated as `V^b = V A^b`,
#' and the bootstrap scores are `Scores^b = R^b S^b`.
#'
#' Z-scores are provided as `mean / sd`.
#'
#' **Important Note:** The algorithm assumes the data `Y` used for the *original* SVD (`Y = U D V'`)
#' was appropriately centered (or pre-processed according to `x$preproc`). The bootstrap
#' samples are generated based on the components derived from this pre-processed data.
#'
#' @references
#' Fisher, Aaron, Brian Caffo, Brian Schwartz, and Vadim Zipunnikov. 2016.
#' "Fast, Exact Bootstrap Principal Component Analysis for P > 1 Million."
#' \emph{Journal of the American Statistical Association} 111 (514): 846–60.
#' \doi{10.1080/01621459.2015.1062383}.
#'
#' @export
#' @family pca bootstrap
#' @examples
#' # --- Assuming the pca and svd_wrapper functions provided are loaded ---
#' # Also assuming helper functions like center(), pass(), prep(), init_transform()
#' # are defined and available. Create simplified mocks if needed:
#' center <- function() structure(list(scale=FALSE, center=TRUE), class=c("center", "prepper"))
#' pass <- function() structure(list(scale=FALSE, center=FALSE), class=c("pass", "prepper"))
#' prep <- function(preproc) {
#' xfun <- function(X) scale(X, center = preproc$center, scale = preproc$scale)
#' # Add proper reverse logic if needed for projection
#' return(structure(list(transform = xfun, reverse = function(X) X), class="prep"))
#' }
#' init_transform <- function(proc, X) proc$transform(X)
#' bi_projector <- function(v, s, sdev, u, preproc, classes, method) {
#' structure(list(v=v, s=s, sdev=sdev, u=u, preproc=preproc, method=method),
#' class=c(classes, "bi_projector", "projector"))
#' }
#' coefficients.projector <- function(x,...) x$v
#' scores.projector <- function(x,...) x$s
#' sdev.projector <- function(x,...) x$sdev
#' # --- End Mock Helpers ---
#'
#' # Simulate data (p=50, n=20)
#' set.seed(123)
#' p_dim <- 50
#' n_obs <- 20
#' Y_mat <- matrix(rnorm(p_dim * n_obs), nrow = p_dim, ncol = n_obs)
#' # Transpose for pca function input (n x p)
#' X_mat <- t(Y_mat)
#'
#' # Perform PCA using the provided pca function
#' # Use center() pre-processing
#' pca_res <- pca(X_mat, ncomp = 5, preproc = center(), method = "fast")
#'
#' # Run bootstrap on the pca result
#' boot_res <- bootstrap_pca(pca_res, nboot = 50, k = 5, seed = 456)
#'
#' # Explore results
#' print(dim(boot_res$z_loadings)) # p x k Z-scores for loadings (coefficients)
#' print(dim(boot_res$z_scores)) # n x k Z-scores for scores
#'
#' # Plot std dev for first loading vector (V)
#' plot(boot_res$sd_Vb[, 1], ylab = "Bootstrap SD", main = "Loading Vector 1 Variability")
#'
#' # Plot std dev for first score vector (S)
#' plot(boot_res$sd_Scores[, 1], ylab = "Bootstrap SD", main = "Score Vector 1 Variability")
#'
#' \dontrun{
#' # Example with parallel processing (requires future & future.apply)
#' # library(future)
#' # library(future.apply)
#' # plan(multisession) # Set up parallel backend
#' boot_res_parallel <- bootstrap_pca(pca_res, nboot = 100, k = 5, seed = 456,
#' parallel = TRUE)
#' }
#' @importFrom stats sd
#' @importFrom stats cov
bootstrap_pca <- function(x, nboot = 100, k = NULL,
parallel = FALSE, cores = NULL,
seed = NULL, epsilon = 1e-15, ...) {
# --- Input Validation and Setup ---
if (!inherits(x, "pca") || !inherits(x, "bi_projector")) {
warning("Input 'x' is not of class 'pca' and 'bi_projector' as returned by the 'pca' function. Ensure it has components 'v', 's', 'sdev', and 'u'.")
}
required_comps <- c("v", "s", "sdev", "u")
if (!all(required_comps %in% names(x))) {
stop("Input object 'x' is missing required components: ",
paste(required_comps[!required_comps %in% names(x)], collapse=", "))
}
# Determine dimensions from the pca object structure
p <- nrow(x$v) # Number of features/dimensions (from loadings V)
n <- nrow(x$s) # Number of subjects/observations (from scores S = UD)
k_max <- ncol(x$v) # Max components available in x
if (is.null(k)) {
k <- k_max
message("Parameter 'k' not specified. Using k = ", k, " components from the PCA object.")
}
if (k > k_max) {
stop("Requested k (", k, ") exceeds the number of components available in the PCA object (", k_max, ").")
}
if (k > n) {
warning("Requested k (", k, ") exceeds the number of observations (", n, "). Results might be unstable.")
}
if (k <= 0 || !is.numeric(k) || k != round(k)) stop("k must be a positive integer.")
if (!is.numeric(nboot) || length(nboot) != 1 || nboot <= 0 || nboot != round(nboot))
stop("nboot must be a positive integer.")
# Seed handling with future_lapply requires specific argument
# if (!is.null(seed)) withr::local_seed(seed) # Apply seed locally before loop if not parallel
# Get original PCA results (up to k components) from the 'pca' object
V <- x$v[, 1:k, drop = FALSE] # Loadings (p x k)
Scores_UD <- x$s[, 1:k, drop = FALSE] # Scores (n x k) - Note: x$s is U*D
d <- x$sdev[1:k] # Standard deviations (singular values)
U_svd <- x$u[, 1:k, drop = FALSE] # Left singular vectors (n x k) - directly available
# --- Sanity check: Scores_UD should approximate U_svd %*% diag(d) ---
reconstruct_scores <- U_svd %*% diag(d, nrow = k, ncol = k)
if (max(abs(Scores_UD - reconstruct_scores)) > sqrt(.Machine$double.eps)) {
warning(
"Internal consistency check failed: x$s does not seem to be x$u %*% diag(x$sdev). Ensure 'pca' object structure is correct."
)
}
# --- End Sanity Check ---
# Calculate the matrix D U' needed for resampling (k x n)
DUt <- diag(d, nrow = k, ncol = k) %*% t(U_svd) # Correct: D %*% U'
# --- Bootstrap Core Function ---
# (Identical to the previous version, operates on derived DUt, n, k)
svd_one_bootstrap_sample <- function(iter) {
idx <- sample.int(n, size = n, replace = TRUE)
DUtPb <- DUt[, idx, drop = FALSE]
sv <- tryCatch(
svd(DUtPb, nu = k, nv = k),
error = function(e) {
warning("SVD failed for bootstrap sample ", iter, ". Returning NULL. Error: ", e$message, call. = FALSE)
return(NULL)
}
)
if (is.null(sv)) return(NULL)
actual_k_svd <- min(k, length(sv$d)) # Number of non-zero singular values returned
# Handle cases where SVD returns fewer than k components robustly
Ab <- matrix(0.0, nrow = k, ncol = k)
Rb <- matrix(0.0, nrow = n, ncol = k) # R^b refers to the right vectors of svd(DUtPb)
Sb_vals <- numeric(k)
if (actual_k_svd > 0) {
Ab[1:k, 1:actual_k_svd] <- sv$u[, 1:actual_k_svd, drop = FALSE]
Rb[, 1:actual_k_svd] <- sv$v[, 1:actual_k_svd, drop = FALSE]
Sb_vals[1:actual_k_svd] <- sv$d[1:actual_k_svd]
} else {
warning("SVD returned zero components for bootstrap sample ", iter, ". Results for this sample will be zero/NA.", call. = FALSE)
}
# --- Sign Correction ---
diag_Ab <- diag(Ab)
signs <- sign(diag_Ab)
signs[signs == 0] <- 1
Ab <- sweep(Ab, 2, signs, "*")
Rb <- sweep(Rb, 2, signs, "*")
# --- Calculate Bootstrap Scores (S^b = R^b S^b_vals) ---
Scores_b_unordered <- sweep(Rb, 2, Sb_vals, "*")
# Reconstruct full n x k score matrix with NAs for non-sampled subjects
Scores_b <- matrix(NA_real_, nrow = n, ncol = k)
sampled_indices_map <- split(seq_along(idx), idx) # Map original index to positions in bootstrap sample
for (orig_idx_str in names(sampled_indices_map)) {
orig_idx <- as.integer(orig_idx_str)
rows_in_boot <- sampled_indices_map[[orig_idx_str]]
# Average scores if an original subject was selected multiple times
Scores_b[orig_idx, ] <- colMeans(Scores_b_unordered[rows_in_boot, , drop = FALSE])
}
return(list(Ab = Ab, Scores = Scores_b))
}
# --- Run Bootstrap Loop ---
if (parallel) {
if (!requireNamespace("future.apply", quietly = TRUE)) {
stop("Package 'future.apply' needed for parallel processing. Please install it.", call. = FALSE)
}
# Setup future plan if not already set, using specified cores or default
if (is.null(future::plan("list")[[1]]$workers)) { # Check if a plan with workers is set
num_cores <- if (!is.null(cores)) cores else future::availableCores()
message("Setting future plan to multisession with ", num_cores, " workers.")
# Save current plan before changing it so we can restore on exit
existing_plan <- future::plan()
future::plan(future::multisession, workers = num_cores)
# Ensure plan is reset on exit if we set it here
on.exit(future::plan(existing_plan), add = TRUE)
}
# Use future_lapply - seed needs to be handled via future.seed argument
res_list <- future.apply::future_lapply(1:nboot, svd_one_bootstrap_sample, future.seed = seed)
} else {
# Set seed locally if not running in parallel
if (!is.null(seed)) withr::local_seed(seed)
res_list <- lapply(1:nboot, svd_one_bootstrap_sample)
}
# Filter out NULL results
failed_samples <- sum(sapply(res_list, is.null))
if (failed_samples > 0) {
warning("SVD failed for ", failed_samples, " out of ", nboot, " bootstrap samples. These were excluded.", call.=FALSE)
res_list <- res_list[!sapply(res_list, is.null)]
nboot_actual <- length(res_list)
if (nboot_actual == 0) stop("SVD failed for all bootstrap samples.")
} else {
nboot_actual <- nboot
}
# --- Aggregate Results ---
Ab_array <- simplify2array(lapply(res_list, `[[`, "Ab"), higher = TRUE) # k x k x nboot_actual
Scores_array <- simplify2array(lapply(res_list, `[[`, "Scores"), higher = TRUE) # n x k x nboot_actual
# --- Calculate Bootstrap Moments and Z-scores ---
# 1. Scores
E_Scores <- apply(Scores_array, 1:2, mean, na.rm = TRUE)
sd_Scores <- apply(Scores_array, 1:2, sd, na.rm = TRUE)
sd_Scores[is.na(sd_Scores) | sd_Scores < epsilon] <- epsilon # Handle NAs and apply epsilon
z_scores <- E_Scores / sd_Scores
# 2. Loadings (Vb = V Ab)
E_Ab <- apply(Ab_array, 1:2, mean)
E_Vb <- V %*% E_Ab
# Calculate Variance/SD of Loadings Vb using Cov(Ab_k)
Var_Vb <- matrix(NA_real_, nrow = p, ncol = k)
if (p > 0 && !is.null(rownames(V))) rownames(Var_Vb) <- rownames(V)
if (k > 0 && !is.null(colnames(V))) colnames(Var_Vb) <- colnames(V)
if (nboot_actual <= 1) {
warning("Cannot compute variance with <= 1 successful bootstrap sample. Returning NA for loading SD and Z-scores.", call.=FALSE)
sd_Vb <- Var_Vb # Matrix of NAs
z_loadings <- Var_Vb # Matrix of NAs
} else {
for (ki in 1:k) {
# Transpose needed: Ab_array is k x k x nboot, we need nboot x k for cov
Ab_k_samples <- t(Ab_array[, ki, ])
Cov_Ab_k <- cov(Ab_k_samples)
# Efficient calculation using rowSums
V_Cov_k <- V %*% Cov_Ab_k
Term_k <- V_Cov_k * V
Var_Vb[, ki] <- rowSums(Term_k)
}
Var_Vb[Var_Vb < 0 & !is.na(Var_Vb)] <- 0 # Set small negative variances to 0
sd_Vb <- sqrt(Var_Vb)
sd_Vb[is.na(sd_Vb) | sd_Vb < epsilon] <- epsilon # Handle NAs and apply epsilon
z_loadings <- E_Vb / sd_Vb
}
# --- Prepare Return Object ---
ret <- list(
E_Vb = E_Vb,
sd_Vb = sd_Vb,
z_loadings = z_loadings,
E_Scores = E_Scores,
sd_Scores = sd_Scores,
z_scores = z_scores,
E_Ab = E_Ab,
Ab_array = Ab_array,
Scores_array = Scores_array,
nboot = nboot_actual,
k = k,
call = match.call()
)
class(ret) <- c("bootstrap_pca_result", "list")
return(ret)
}
# Keep the print method as it is generic enough
#' Print method for bootstrap_pca_result
#'
#' @param x An object of class `bootstrap_pca_result`.
#' @param ... Additional arguments passed to `print`.
#' @export
#' @keywords internal
print.bootstrap_pca_result <- function(x, ...) {
cat("Fast Bootstrap PCA Result\n")
cat("--------------------------\n")
cat("Call:\n")
print(x$call)
cat("\n")
cat("Number of bootstrap samples (successful):", x$nboot, "\n")
cat("Number of components bootstrapped (k):", x$k, "\n")
cat("\n")
cat("Output contains bootstrap estimates for:\n")
cat(" - Loadings (Vb = coefficients): Mean (E_Vb), SD (sd_Vb), Z-scores (z_loadings)\n")
cat(" - Scores (Sb): Mean (E_Scores), SD (sd_Scores), Z-scores (z_scores)\n")
cat(" - Internal rotation matrices (Ab): Mean (E_Ab), Full array (Ab_array)\n")
cat(" - Full bootstrap scores array (Scores_array)\n")
invisible(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.