knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "figures/fast-big--", fig.width = 7, fig.height = 4.5, dpi = 150, message = FALSE, warning = FALSE ) LOCAL <- identical(Sys.getenv("LOCAL"), "TRUE")
Large survival datasets often outgrow the capabilities of purely in-memory algorithms. bigPLScox therefore offers three flavours of Partial Least Squares (PLS) components for Cox models:
big_pls_cox() – the original R/C++ hybrid that expects dense matrices;big_pls_cox_fast() – the new Armadillo backend with variance-one components
and support for both dense matrices and bigmemory::big.matrix objects; andbig_pls_cox_gd() – a stochastic gradient descent (SGD) solver for streaming
or disk-backed data.
allowing datasets too large to fit in RAM.This vignette demonstrates how to build file-backed matrices, run the fast
backends, and reconcile their outputs. It complements the introductory vignette
vignette("getting-started", package = "bigPLScox") and focuses on workflow
patterns specific to large datasets.
We generate a synthetic dataset with correlated predictors and a known linear predictor. The same object is used to illustrate the dense and big-memory interfaces.
library(bigPLScox) set.seed(2024) n_obs <- 5000 n_pred <- 100 k_true <- 3 Sigma <- diag(n_pred) for (b in 0:2) { idx <- (b * 10 + 1):(b * 10 + 10) Sigma[idx, idx] <- 0.7 diag(Sigma[idx, idx]) <- 1 } L <- chol(Sigma) Z <- matrix(rnorm(n_obs * n_pred), nrow = n_obs) X_dense <- Z %*% L w1 <- numeric(n_pred); w1[1:4] <- c( 1.0, 0.8, 0.6, -0.5) w2 <- numeric(n_pred); w2[5:8] <- c(-0.7, 0.9, -0.4, 0.5) w3 <- numeric(n_pred); w3[9:12] <- c( 0.6, -0.5, 0.7, 0.8) w1 <- w1 / sqrt(sum(w1^2)) w2 <- w2 / sqrt(sum(w2^2)) w3 <- w3 / sqrt(sum(w3^2)) t1 <- as.numeric(scale(drop(X_dense %*% w1), center = TRUE, scale = TRUE)) t2 <- as.numeric(scale(drop(X_dense %*% w2), center = TRUE, scale = TRUE)) t3 <- as.numeric(scale(drop(X_dense %*% w3), center = TRUE, scale = TRUE)) beta_true <- c(1.0, 0.6, 0.3) eta <- beta_true[1]*t1 + beta_true[2]*t2 + beta_true[3]*t3 lambda0 <- 0.05 u <- runif(n_obs) time_event <- -log(u) / (lambda0 * exp(eta)) target_event <- 0.65 f <- function(lc) { mean(time_event <= rexp(n_obs, rate = lc)) - target_event } lambda_c <- uniroot(f, c(1e-4, 1), tol = 1e-4)$root time_cens <- rexp(n_obs, rate = lambda_c) time <- pmin(time_event, time_cens) status <- as.integer(time_event <= time_cens)
if (requireNamespace("bigmemory", quietly = TRUE)) { library(bigmemory) big_dir <- tempfile("bigPLScox-") dir.create(big_dir) if(file.exists(paste(big_dir,"X.bin",sep="/"))){unlink(paste(big_dir,"X.bin",sep="/"))} if(file.exists(paste(big_dir,"X.desc",sep="/"))){unlink(paste(big_dir,"X.desc",sep="/"))} X_big <- bigmemory::as.big.matrix( X_dense, backingpath = big_dir, backingfile = "X.bin", descriptorfile = "X.desc" ) X_big[1:6, 1:6] }
The legacy solver big_pls_cox() performs the component extraction in R before
fitting the Cox model. The new big_pls_cox_fast() backend exposes the same
arguments but runs entirely in C++ for a substantial speed-up.
fit_legacy <- big_pls_cox( X = X_dense, time = time, status = status, ncomp = k_true ) fit_fast_dense <- big_pls_cox_fast( X = X_dense, time = time, status = status, ncomp = k_true ) list( legacy = head(fit_legacy$scores), fast = head(fit_fast_dense$scores) ) list( legacy = apply(fit_legacy$scores, 2, var), fast = apply(fit_fast_dense$scores, 2, var) )
The summary() method reports key diagnostics, including the final Cox
coefficients and the number of predictors retained per component.
summary(fit_fast_dense)
When working with a big.matrix, the same function operates on the shared
memory pointer without copying data back to R. This is ideal for datasets that do
not fit entirely in RAM.
if (exists("X_big")) { fit_fast_big <- big_pls_cox_fast( X = X_big, time = time, status = status, ncomp = k_true ) summary(fit_fast_big) }
The resulting scores, loadings, and centring parameters mirror the dense fit, which simplifies debugging and incremental model building.
The SGD solver trades a small amount of accuracy for drastically reduced memory
usage. Provide the same big.matrix object along with the desired number of
components.
if (exists("X_big")) { fit_gd <- big_pls_cox_gd( X = X_big, time = time, status = status, ncomp = k_true, max_iter = 2000, learning_rate = 0.05, tol = 1e-8 ) str(fit_gd) }
Component bases are unique only up to orthogonal rotations. Comparing the linear predictors generated by each solver verifies that they span the same subspace.
if (exists("fit_fast_dense") && exists("fit_gd")) { eta_fast_dense <- drop(fit_fast_dense$scores %*% fit_fast_dense$cox_fit$coefficients) eta_fast_big <- drop(fit_fast_big$scores %*% fit_fast_big$cox_fit$coefficients) eta_gd <- drop(fit_gd$scores %*% fit_gd$cox_fit$coefficients) list( correlation_fast_dense_big = cor(eta_fast_dense, eta_fast_big), correlation_fast_dense_gd = cor(eta_fast_dense, eta_gd), concordance = c( fast_dense = survival::concordance(survival::Surv(time, status) ~ eta_fast_dense)$concordance, fast_big = survival::concordance(survival::Surv(time, status) ~ eta_fast_big)$concordance, gd = survival::concordance(survival::Surv(time, status) ~ eta_gd)$concordance ) ) }
Use predict() with type = "components" to obtain latent scores for new
observations. Supplying explicit centring and scaling parameters ensures
consistent projections across solvers.
X_new <- matrix(rnorm(10 * n_pred), nrow = 10) scores_new <- predict( fit_fast_dense, newdata = X_new, type = "components" ) risk_new <- predict( fit_fast_dense, newdata = X_new, type = "risk" ) list(scores = scores_new, risk = risk_new)
The bench package provides lightweight instrumentation for comparing solvers.
The chunk below contrasts the classical implementation, the fast backend, and
the SGD routine on the simulated dataset.
if (requireNamespace("bench", quietly = TRUE) && exists("X_big")) { bench_res <- bench::mark( big_pls = big_pls_cox(X_dense, time, status, ncomp = k_true), fast_dense = big_pls_cox_fast(X_dense, time, status, ncomp = k_true), fast_big = big_pls_cox_fast(X_big, time, status, ncomp = k_true), gd = big_pls_cox_gd(X_big, time, status, ncomp = k_true, max_iter = 1500), iterations = 30, check = FALSE ) bench_res$expression <- names(bench_res$expression) bench_res[, c("expression", "median", "itr/sec", "mem_alloc")] }
if (exists("bench_res")) { plot(bench_res, type = "jitter") }
File-backed matrices can be deleted once the analysis is complete. In production
workflows you typically keep the descriptor (.desc) file alongside the binary
matrix for later reuse.
Temporary backing files can be removed after the analysis. In production pipelines you will typically keep the descriptor file alongside the binary data.
if (exists("X_big")) { rm(X_big) file.remove(file.path(big_dir, "X.bin")) file.remove(file.path(big_dir, "X.desc")) unlink(big_dir, recursive = TRUE) }
vignette("bigPLScox-benchmarking", package = "bigPLScox") provides a
reproducible benchmark that includes survival::coxph() and coxgpls().?big_pls_cox_fast, ?big_pls_cox_gd) describe all tuning
parameters in detail, including keepX for component-wise sparsity.Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.