#' Bootstrap PCA Analysis
#'
#' This function performs bootstrap resampling on a fitted PCA model to estimate the variability of loadings and scores.
#' It resamples observations (with replacement) from the original dataset's scores, then re-estimates the PCA for each bootstrap
#' sample. The resulting distributions of loadings and scores are used to compute z-scores, indicating how stable each component
#' and score is under resampling.
#'
#' @param x A fitted PCA model object. Must provide `scores(x)` and `coefficients(x)` methods.
#' @param nboot Integer. Number of bootstrap resamples (default: 100).
#' @param k Integer. Number of components to analyze (default: all components in `x`).
#' @param method Character. SVD method passed to `svd_wrapper`. Options: "base", "fast", "irlba", "propack", "rsvd", "svds".
#' Default: "base".
#' @param sign_method Character. Method for sign alignment of component loadings/scores across bootstraps.
#' Options:
#' - "max_abs": Flip signs so the element with the largest absolute value in each vector is positive.
#' - "first_element": Flip signs so the first element of each vector is positive.
#' - "none": Do not adjust signs.
#' Default: "max_abs".
#' @param seed Optional integer to set the random seed for reproducibility.
#' @param ... Additional arguments passed to `svd_wrapper`.
#'
#' @return A list of class `bootstrap_result`:
#' \describe{
#' \item{zboot_loadings}{A matrix (D x k) of z-scores for loadings. D = number of features.}
#' \item{zboot_scores}{A matrix (N x k) of z-scores for scores. N = number of samples.}
#' \item{nboot}{Number of bootstrap samples performed.}
#' \item{k}{Number of components analyzed.}
#' }
#'
#' @details
#' The procedure is:
#' 1. Extract scores (N x ncomp) and loadings (D x ncomp) from the fitted PCA, restricting both to the first `k` components.
#' 2. For each bootstrap iteration:
#' - Resample N observations (with replacement).
#' - Perform SVD (via `svd_wrapper`) on the resampled score matrix to get a new PCA estimate.
#' - Align signs of components if `sign_method` is used.
#' 3. Aggregate all bootstrap results to compute mean and SD of loadings and scores.
#' 4. Compute z-scores = mean/SD for both loadings and scores.
#'
#' By default, sign alignment ("max_abs") ensures consistency in the direction of components.
#'
#' @examples
#' # Assuming 'x' is a PCA model fit from a function like pca(), with ncomp=5:
#' # bootstrap_results <- bootstrap(x, nboot=200, k=3, method="irlba", sign_method="max_abs", seed=123)
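#'
#' # A not-run sketch, assuming this package provides a pca() constructor
#' # (data and argument values here are purely illustrative):
#' \dontrun{
#' fit <- pca(as.matrix(iris[, 1:4]), ncomp = 4)
#' boot_res <- bootstrap(fit, nboot = 200, k = 2, seed = 123)
#' # Large |z| indicates loadings/scores that are stable under resampling
#' head(boot_res$zboot_loadings)
#' head(boot_res$zboot_scores)
#' }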
#'
#' @export
bootstrap.pca <- function(x, nboot=100, k=ncomp(x),
                          method="base",
                          sign_method=c("max_abs","first_element","none"),
                          seed=NULL,
                          ...) {
  sign_method <- match.arg(sign_method)
  if (!is.null(seed)) set.seed(seed)
  if (nboot <= 0) stop("nboot must be a positive integer.")
  # Extract scores and loadings from the fitted PCA model
  scores_mat <- scores(x)        # N x ncomp
  loadings   <- coefficients(x)  # D x ncomp
  # Check dimensions before restricting to k components
  if (k > ncol(scores_mat)) {
    warning("Requested k is greater than the number of available components. Using all components.")
  }
  k <- min(k, ncol(scores_mat))
  # Restrict scores and loadings to the first k components so that all
  # downstream dimensions (U, A, varA) agree
  scores_mat <- scores_mat[, seq_len(k), drop=FALSE]
  loadings   <- loadings[, seq_len(k), drop=FALSE]
  # Transpose scores for convenience: DUt = t(scores) = (k x N)
  DUt <- t(scores_mat)
  N <- ncol(DUt)
  D <- nrow(loadings)
  # Generator function for bootstrap samples
  gen_sample <- function() {
    sidx <- sample(N, replace=TRUE)
    list(DUt_boot = DUt[, sidx, drop=FALSE], idx = sidx)
  }
  # Preallocate list to store bootstrap results.
  # Each element holds: svdfit = list(d, U, V) from the partial SVD, idx = resample indices.
  res_list <- vector("list", nboot)
  # Compute bootstrap samples
  for (i in seq_len(nboot)) {
    sam <- gen_sample()
    svdfit <- compute_svd_on_boot(sam$DUt_boot, k, method=method, sign_method=sign_method, ...)
    res_list[[i]] <- list(svdfit=svdfit, idx=sam$idx)
  }
  # Summarize bootstrap results and compute z-scores (mean / SD over bootstraps)
  summary_res <- summarize_bootstrap_results(res_list, k, loadings)
  zboot_loadings <- do.call(cbind, lapply(seq_len(k), function(ci) {
    summary_res$EVs[[ci]] / summary_res$sdVs[[ci]]
  }))
  zboot_scores <- do.call(cbind, lapply(seq_len(k), function(ci) {
    summary_res$EScores[[ci]] / summary_res$sdScores[[ci]]
  }))
  ret <- list(zboot_loadings=zboot_loadings, zboot_scores=zboot_scores, nboot=nboot, k=k)
  class(ret) <- c("bootstrap_result", "list")
  ret
}
#' Compute SVD for a Bootstrap Sample
#'
#' This internal function takes a bootstrap sample (component matrix DUt_boot),
#' applies `svd_wrapper` to get a partial SVD (u, d, v), and then
#' extracts the top k components. It also applies sign flipping if requested.
#'
#' @param DUt_boot (k x N) matrix: transposed scores after resampling.
#' @param k Number of components to extract.
#' @param method SVD method for `svd_wrapper`.
#' @param sign_method How to align signs.
#' @param ... Passed to `svd_wrapper`.
#'
#' @return A list with elements:
#' \describe{
#' \item{d}{Vector of singular values of length k}
#' \item{U}{(k x k) left singular vectors}
#' \item{V}{(N x k) right singular vectors}
#' }
#' @keywords internal
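#'
#' @examples
#' \dontrun{
#' # Illustrative sketch for this internal helper (not exported):
#' # a 3-component, 10-sample transposed score matrix
#' DUt_boot <- matrix(rnorm(30), nrow = 3)
#' fit <- compute_svd_on_boot(DUt_boot, k = 3, method = "base", sign_method = "max_abs")
#' str(fit)  # d (length 3), U (3 x 3), V (10 x 3)
#' }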
compute_svd_on_boot <- function(DUt_boot, k, method, sign_method, ...) {
  # DUt_boot is (k x N): rows are the retained components, columns are the
  # resampled observations. svd_wrapper() treats rows as samples and columns
  # as features, so for X = U d V^T it returns u as (k x k) and v as (N x k).
  # preproc=pass() is used because DUt_boot is already centered/processed.
  fit <- svd_wrapper(DUt_boot, ncomp=k, preproc=pass(), method=method, ...)
  d <- fit$sdev      # singular values (length k)
  U_mat <- fit$u     # left singular vectors (k x k)
  V_mat <- fit$v     # right singular vectors (N x k)
  # Align signs so components remain comparable across bootstrap resamples
  U_mat <- apply_sign_method(U_mat, method=sign_method)
  V_mat <- apply_sign_method(V_mat, method=sign_method)
  list(d=d, U=U_mat, V=V_mat)
}
#' Apply Sign Method to SVD Components
#'
#' Ensures consistent sign orientation of singular vectors.
#'
#' @param mat A matrix of singular vectors (either U or V).
#' @param method One of "max_abs", "first_element", "none".
#'
#' @return The matrix with columns potentially flipped in sign for consistency.
#' @keywords internal
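#'
#' @examples
#' \dontrun{
#' # Illustrative sketch: with "max_abs", the first column (largest-magnitude
#' # entry -3) is negated; the second column (largest-magnitude entry 2) is kept.
#' m <- cbind(c(-3, 1), c(2, -1))
#' apply_sign_method(m, method = "max_abs")
#' }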
apply_sign_method <- function(mat, method="max_abs") {
  if (method == "none") return(mat)
  for (j in seq_len(ncol(mat))) {
    col_j <- mat[, j]
    if (method == "max_abs") {
      # Flip sign so the element with the largest absolute value is positive
      idx_max <- which.max(abs(col_j))
      if (col_j[idx_max] < 0) mat[, j] <- -col_j
    } else if (method == "first_element") {
      # Flip sign if the first element is negative
      if (col_j[1] < 0) mat[, j] <- -col_j
    }
  }
  mat
}
#' Summarize Bootstrap Results
#'
#' Aggregates results from all bootstrap samples to compute means and standard deviations of loadings and scores,
#' and then prepares for z-score computation.
#'
#' @param res_list A list of length nboot, each element with `svdfit` (d,U,V) and `idx` (resampled indices).
#' @param k Number of components.
#' @param loadings Original loadings matrix restricted to the analyzed components (D x k).
#'
#' @return A list with elements EAs, EVs, varAs, sdVs, EScores, sdScores.
#' @keywords internal
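#'
#' @examples
#' \dontrun{
#' # Illustrative sketch with made-up inputs: two bootstrap fits, k = 2
#' # components, 4 samples, and a 5 x 2 loadings matrix.
#' mk_fit <- function() list(svdfit = list(d = c(2, 1),
#'                                         U = diag(2),
#'                                         V = matrix(rnorm(8), 4, 2)),
#'                           idx = sample(4, replace = TRUE))
#' res_list <- list(mk_fit(), mk_fit())
#' loadings <- matrix(rnorm(10), 5, 2)
#' str(summarize_bootstrap_results(res_list, k = 2, loadings))
#' }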
summarize_bootstrap_results <- function(res_list, k, loadings) {
  nboot <- length(res_list)
  N <- length(res_list[[1]]$idx)   # number of samples in the original data
  # For each component ci, collect across bootstraps:
  #  - A: the ci-th left singular vector (length k), i.e. the bootstrapped
  #    component expressed in the space of the original components.
  #  - Scores: the ci-th right singular vector scaled by its singular value,
  #    mapped back to the original sample order via the resample indices.
  A_by_comp <- vector("list", k)
  Scores_by_comp <- vector("list", k)
  for (ci in seq_len(k)) {
    A_mat <- matrix(NA_real_, nrow=nboot, ncol=k)
    ScoresMat <- matrix(NA_real_, nrow=nboot, ncol=N)
    for (b in seq_len(nboot)) {
      svdfit <- res_list[[b]]$svdfit
      idx <- res_list[[b]]$idx
      # ci-th column of U (length k): coordinates of the bootstrapped
      # component in the original component space
      A_mat[b, ] <- svdfit$U[, ci]
      # ci-th bootstrap scores: V[, ci] * d[ci], placed back at the sampled
      # positions; samples not drawn in this bootstrap remain NA
      scores_b <- svdfit$V[, ci] * svdfit$d[ci]
      full_scores <- rep(NA_real_, N)
      full_scores[idx] <- scores_b
      ScoresMat[b, ] <- full_scores
    }
    A_by_comp[[ci]] <- A_mat
    Scores_by_comp[[ci]] <- ScoresMat
  }
  # Per-sample mean and SD of bootstrapped scores
  EScores <- lapply(Scores_by_comp, function(SM) apply(SM, 2, mean, na.rm=TRUE))
  sdScores <- lapply(Scores_by_comp, function(SM) apply(SM, 2, sd, na.rm=TRUE))
  # Mean (length k) and covariance (k x k) of A over bootstraps, per component
  EAs <- lapply(A_by_comp, colMeans)
  varAs <- lapply(A_by_comp, var)
  # Map back to feature space (loadings is D x k):
  #   EVs  = loadings %*% EA                         (mean bootstrapped loading, D x 1)
  #   varV = diag(loadings %*% varA %*% t(loadings)),
  #          computed efficiently as rowSums((loadings %*% varA) * loadings)
  EVs <- lapply(EAs, function(EA) loadings %*% matrix(EA, ncol=1))
  varVs <- lapply(varAs, function(varA) rowSums((loadings %*% varA) * loadings))
  sdVs <- lapply(varVs, sqrt)
  list(EAs=EAs, EVs=EVs, varAs=varAs, sdVs=sdVs, EScores=EScores, sdScores=sdScores)
}