# Sept 13, 2021
#' Class fastcca
#'
#' Class \code{fastcca}
#'
#' @details Object storing:
#' \describe{
#' \item{n.comp: }{number of canonical components}
#' \item{cor: }{canonical correlations}
#' \item{rho.mod: }{modified canonical correlations}
#' \item{cramer.V: }{Cramer's V statistic summarizing the canonical correlations}
#' \item{x.coefs: }{canonical coefficients for X}
#' \item{x.vars: }{canonical variates for X}
#' \item{x.ri: }{redundancy index for X}
#' \item{y.coefs: }{canonical coefficients for Y}
#' \item{y.vars: }{canonical variates for Y}
#' \item{y.ri: }{redundancy index for Y}
#' \item{lambdas: }{shrinkage parameters from \code{eclairs}}
#' \item{dims: }{dimensions of the input data}
#' }
#' @name fastcca-class
#' @rdname fastcca-class
#' @exportClass fastcca
setClass("fastcca", contains = "list")
setMethod("print", "fastcca", function(x) {
show(x)
})
setMethod("show", "fastcca", function(object) {
x <- object
cat(" Fast regularized canonical correlation analysis\n\n")
k <- min(3, x$n.comp)
cat(" Original data rows:", x$dims["n1"], "\n")
cat(" Original data cols: ", x$dims["p1"], ", ", x$dims["p2"], "\n", sep = "")
cat(" Num components: ", x$n.comp, "\n")
cat(" Cor: ", round(x$cor[seq_len(k)], digits = 3), "...\n")
cat(" rho.mod: ", round(x$rho.mod[seq_len(k)], digits = 3), "...\n")
cat(" Cramer's V: ", round(x$cramer.V, digits = 3), "\n")
cat(" lambda.x: ", format(x$lambdas["x"], digits = 3), "\n")
cat(" lambda.y: ", format(x$lambdas["y"], digits = 3), "\n")
})
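# A minimal sketch (not run) of inspecting the components stored in a
# fastcca object, following the field names documented in the class
# definition above. It assumes the decorrelate package is installed;
# fastcca() is internal, so it is accessed with `:::`.
#
# pop <- LifeCycleSavings[, 2:3]
# oec <- LifeCycleSavings[, -(2:3)]
# res <- decorrelate:::fastcca(pop, oec)
# res$n.comp   # number of canonical components
# res$cor      # canonical correlations
# res$x.coefs  # canonical coefficients for X
# res$x.vars   # canonical variates for X
# res$lambdas  # shrinkage parameters estimated by eclairs()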
#' Fast canonical correlation analysis
#'
#' Fast canonical correlation analysis (CCA) that scales to high-dimensional data. Uses covariance shrinkage and algorithmic speedups to achieve run time linear in p when p > n.
#'
#' @param X first matrix (n x p1)
#' @param Y second matrix (n x p2)
#' @param k number of canonical components to return
#' @param lambda.x optional shrinkage parameter for estimating the covariance of X. If \code{NULL}, it is estimated from the data.
#' @param lambda.y optional shrinkage parameter for estimating the covariance of Y. If \code{NULL}, it is estimated from the data.
#'
#' @details
#' Computes regularized CCA along with summary statistics (Cramer's V statistic and redundancy indices).
#'
#' Results from standard CCA are based on the SVD of \eqn{\Sigma_{xx}^{-\frac{1}{2}} \Sigma_{xy} \Sigma_{yy}^{-\frac{1}{2}}}.
#'
#' Uses \code{eclairs()} for empirical Bayes covariance regularization, and applies the speedup of RCCA (Tuzhilina et al., 2023) to perform CCA on n principal components instead of p features. Memory usage is \eqn{\mathcal{O}(np)} instead of \eqn{\mathcal{O}(p^2)}. Computation is \eqn{\mathcal{O}(n^2p)} instead of \eqn{\mathcal{O}(p^3)} or \eqn{\mathcal{O}(np^2)}.
#'
#' @references
#' Tuzhilina, E., Tozzi, L., & Hastie, T. (2023). Canonical correlation analysis in high dimensions with structured regularization. Statistical modelling, 23(3), 203-227.
#'
#' @return \code{fastcca} object
#'
# @examples
# pop <- LifeCycleSavings[, 2:3]
# oec <- LifeCycleSavings[, -(2:3)]
#
# decorrelate:::fastcca(pop, oec)
#
#' @importFrom Rfast standardise eachrow
#' @importFrom irlba irlba
#' @keywords internal
# @export
fastcca <- function(X, Y, k = min(dim(X), dim(Y)), lambda.x = NULL, lambda.y = NULL) {
if (!is.matrix(X)) {
X <- as.matrix(X)
}
if (!is.matrix(Y)) {
Y <- as.matrix(Y)
}
if (anyNA(X) || anyNA(Y)) {
stop("No NA values are allowed in data")
}
# mean-center columns
# X <- standardise(X, scale = FALSE)
# Y <- standardise(Y, scale = FALSE)
X <- scale(X, scale = FALSE)
Y <- scale(Y, scale = FALSE)
n1 <- nrow(X)
n2 <- nrow(Y)
p1 <- ncol(X)
p2 <- ncol(Y)
if (n1 != n2) {
stop("X and Y must have same number of rows")
}
k <- min(dim(X), dim(Y), k)
if (k <= 0) {
stop("k must be >= 1")
}
# if ((p1 < n1) & (p2 < n2)) {
#   # Low dimensional
#   res <- cca(X, Y, lambda.x, lambda.y)
#   return(res)
# }
# transform to lower dimensions
X.tr <- transform(X, lambda.x)
Y.tr <- transform(Y, lambda.y)
n.comp <- min(ncol(X), ncol(Y), nrow(X), k)
# eigen decomposition of Cxx^-.5 Cxy Cyy^-.5
# geigen can be made faster if using low rank eclairs
Cxx <- X.tr$cor
Cyy <- Y.tr$cor
# Cxy <- cov(X.tr$mat, Y.tr$mat, use = "pairwise")
# get covariance as product of centered matrices
Cxy <- crossprod(X.tr$mat, Y.tr$mat) / (n1 - 1)
# scale by shrinkage parameters
Cxy <- Cxy * sqrt(1 - X.tr$lambda) * sqrt(1 - Y.tr$lambda)
sol <- geigen2(Cxy, Cxx, Cyy, k) # compute SVD of matrix products
names(sol) <- c("rho", "alpha", "beta")
# modified canonical correlation
rho.mod <- sol$rho[seq_len(n.comp)]
names(rho.mod) <- paste("can.comp", seq_len(n.comp), sep = "")
# inverse transform
X.inv.tr <- inverseTransform(X.tr$mat, sol$alpha, X.tr$tr)
x.coefs <- X.inv.tr$coefs[, seq_len(n.comp), drop = FALSE]
rownames(x.coefs) <- colnames(X)
x.vars <- X.inv.tr$vars[, seq_len(n.comp), drop = FALSE]
rownames(x.vars) <- rownames(X)
Y.inv.tr <- inverseTransform(Y.tr$mat, sol$beta, Y.tr$tr)
y.coefs <- Y.inv.tr$coefs[, seq_len(n.comp), drop = FALSE]
rownames(y.coefs) <- colnames(Y)
y.vars <- Y.inv.tr$vars[, seq_len(n.comp), drop = FALSE]
rownames(y.vars) <- rownames(Y)
rho <- diag(cor(X.inv.tr$vars, Y.inv.tr$vars))[seq(n.comp)]
names(rho) <- paste("can.comp", seq(n.comp), sep = "")
if (k < min(dim(X), dim(Y))) {
idx <- seq(1, k)
} else {
idx <- seq(1, k - 1)
}
cramer.V <- sqrt(mean(rho.mod[idx]^2)) # Cramer's V-statistic for CCA, based on CRAN::yacca
# compute loadings
load.x <- cor(X, x.vars)
load.y <- cor(Y, y.vars)
# compute redundancy index
ri.x <- rho.mod^2 * apply(load.x, 2, function(x) mean(x^2))
ri.y <- rho.mod^2 * apply(load.y, 2, function(x) mean(x^2))
names(ri.x) <- paste("can.comp", seq_len(n.comp), sep = "")
names(ri.y) <- paste("can.comp", seq_len(n.comp), sep = "")
res <- list(
n.comp = n.comp, rho.mod = rho.mod, cor = rho, cramer.V = cramer.V,
x.coefs = x.coefs, x.vars = x.vars, x.ri = ri.x, y.coefs = y.coefs, y.vars = y.vars,
y.ri = ri.y, lambdas = c(x = X.tr$lambda, y = Y.tr$lambda), dims = c(
n1 = n1,
n2 = n2, p1 = p1, p2 = p2
)
)
new("fastcca", res)
}
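# A minimal sketch (not run) of the high-dimensional case (p > n) that the
# RCCA-style speedup targets; the dimensions below are illustrative
# assumptions, not from the package documentation.
#
# set.seed(1)
# n <- 100
# X <- matrix(rnorm(n * 2000), n, 2000)
# Y <- matrix(rnorm(n * 1500), n, 1500)
# res <- fastcca(X, Y, k = 5)
# res        # show() prints dimensions, canonical correlations, and lambdas
# res$dims   # c(n1, n2, p1, p2)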
# Adapted from RCCA:::RCCA.tr
transform <- function(X, lambda = NULL) {
# get shrinkage estimate to 1) compute SVD and 2) estimate lambda
ecl <- eclairs(X, lambda = lambda, compute = "corr")
# if more features than samples
if (ncol(X) > nrow(X)) {
# scale to be consistent with PCA
mat <- with(ecl, V %*% diag(sqrt(dSq))) * sqrt(nrow(X) - 1)
V <- ecl$U
} else {
V <- NULL
mat <- X
}
# get shrinkage estimate of covariance matrix
# C <- with(ecl, (1 - lambda) * var(mat, use = "pairwise") + diag(lambda * nu, ncol(mat)))
C <- with(ecl, (1 - lambda) * crossprod(mat) / (nrow(mat) - 1) + diag(
lambda * nu,
ncol(mat)
))
list(mat = mat, tr = V, cor = C, lambda = ecl$lambda, Sigma.eclairs = ecl)
}
# Adapted from RCCA:::RCCA.inv.tr
inverseTransform <- function(X, alpha, V) {
n.comp <- ncol(alpha)
# find canonical variates
u <- X %*% alpha
colnames(u) <- paste("can.comp", seq_len(n.comp), sep = "")
# inverse transform canonical coefficients
if (!is.null(V)) {
alpha <- V %*% alpha
}
colnames(alpha) <- paste("can.comp", seq_len(n.comp), sep = "")
list(coefs = alpha, vars = u)
}
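# Generalized SVD: find Lmat and Mmat maximizing the diagonal of
# t(Lmat) %*% Amat %*% Mmat subject to t(Lmat) %*% Bmat %*% Lmat = I and
# t(Mmat) %*% Cmat %*% Mmat = I, by whitening Amat with the Cholesky factors
# of Bmat and Cmat and taking the SVD of the whitened matrix.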
geigen <- function(Amat, Bmat, Cmat, k) {
Bdim <- dim(Bmat)
Cdim <- dim(Cmat)
if (Bdim[1] != Bdim[2]) {
stop("BMAT is not square")
}
if (Cdim[1] != Cdim[2]) {
stop("CMAT is not square")
}
p <- Bdim[1]
q <- Cdim[1]
s <- min(c(p, q))
if (max(abs(Bmat - t(Bmat))) / max(abs(Bmat)) > 1e-10) {
stop("BMAT not symmetric.")
}
if (max(abs(Cmat - t(Cmat))) / max(abs(Cmat)) > 1e-10) {
stop("CMAT not symmetric.")
}
Bmat <- (Bmat + t(Bmat)) / 2
Cmat <- (Cmat + t(Cmat)) / 2
Bfac <- chol(Bmat)
Cfac <- chol(Cmat)
Bfacinv <- solve(Bfac)
Cfacinv <- solve(Cfac)
Dmat <- crossprod(Bfacinv, Amat) %*% Cfacinv
if (p >= q) {
if (k < min(p, q) / 3) {
result <- irlba(Dmat, k) # should be faster than PRIMME::svds
} else {
result <- svd(Dmat)
}
values <- result$d
Lmat <- Bfacinv %*% result$u
Mmat <- Cfacinv %*% result$v
} else {
result <- svd(t(Dmat))
values <- result$d
Lmat <- Bfacinv %*% result$v
Mmat <- Cfacinv %*% result$u
}
geigenlist <- list(values, Lmat, Mmat)
names(geigenlist) <- c("values", "Lmat", "Mmat")
return(geigenlist)
}
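# Same generalized SVD as geigen(), but whitening with symmetric inverse
# square roots of Bmat and Cmat (computed by eigendecomposition) and using a
# truncated SVD via irlba() when only a few components are requested.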
geigen2 <- function(Amat, Bmat, Cmat, k) {
minvsqrt <- function(S) {
dcmp <- eigen(S)
with(dcmp, vectors %*% diag(1 / sqrt(values)) %*% t(vectors))
}
D <- minvsqrt(Bmat) %*% Amat %*% minvsqrt(Cmat)
p <- nrow(Bmat)
q <- nrow(Cmat)
if (q > p) {
D <- t(D)
}
if (k > min(dim(D)) / 3) {
dcmp <- svd(D)
if (k < min(dim(D))) {
dcmp$d <- dcmp$d[seq_len(k)]
dcmp$u <- dcmp$u[, seq_len(k), drop = FALSE]
dcmp$v <- dcmp$v[, seq_len(k), drop = FALSE]
}
} else {
dcmp <- irlba(D, nv = k)
}
# set diagonals to be positive
dcmp$v <- eachrow(dcmp$v, sign(diag(dcmp$v)), "*")
dcmp$u <- eachrow(dcmp$u, sign(diag(dcmp$u)), "*")
values <- dcmp$d
if (p >= q) {
Lmat <- minvsqrt(Bmat) %*% dcmp$u
Mmat <- minvsqrt(Cmat) %*% dcmp$v
} else {
Lmat <- minvsqrt(Bmat) %*% dcmp$v
Mmat <- minvsqrt(Cmat) %*% dcmp$u
}
geigenlist <- list(values, Lmat, Mmat)
names(geigenlist) <- c("values", "Lmat", "Mmat")
return(geigenlist)
}
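# A minimal sketch (not run) illustrating the SVD formulation used above: the
# singular values returned by geigen2() on unregularized sample covariance
# matrices should match the canonical correlations from stats::cancor().
# Requires Rfast for eachrow().
#
# library(Rfast)
# X <- scale(as.matrix(LifeCycleSavings[, 2:3]), scale = FALSE)
# Y <- scale(as.matrix(LifeCycleSavings[, -(2:3)]), scale = FALSE)
# Cxx <- crossprod(X) / (nrow(X) - 1)
# Cyy <- crossprod(Y) / (nrow(Y) - 1)
# Cxy <- crossprod(X, Y) / (nrow(X) - 1)
# sol <- geigen2(Cxy, Cxx, Cyy, k = 2)
# all.equal(sol$values, cancor(X, Y)$cor)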