#' @include generics.R
#'
NULL
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' Bidirectional Canonical Correlation Analysis (BiCCA)
#'
#' Assuming there are two matrices X (M features by K samples) and Y (N features by L samples), we want to
#' find the correlation in feature and sample levels between X and Y.
#' Standard CCA could not handle this case because it requires that there should be at least one
#' dimension shared by two datasets. The BiCCA function introudces one transition matrix Z (M features by L
#' samples) to bridge X with Y. The transition matrix Z is solved by maximalizing correlation between (X, Z)
#' in sample level and correlation between (Z, Y) in feature level simultaneously. Then sample/feature level
#' correlation can be obtained by applying standard CCA on (X, Z) and (Y, Z), respectively.
#'
#' @param X First matrix (M features by K samples)
#' @param Y Second matrix (N features by L samples)
#' @param Z0 Transition matrix (M features by L samples)
#' @param num.X Number of canonical vectors to calculate for pair (X, Z) [default 5]
#' @param num.Y Number of canonical vectors to calculate for pair (Z, Y) [default 5]
#' @param num.iteration Maximal number of iteration [default 100]
#' @param tolerance Relative change ratio for frobenius norm of matrix Z during iteration [default 0.05]
#' @param ncore Number of thread used [default 1]
#' @param bigMemory Use bigMemory mode, this will reduce memory usage when have > 30,000 sampless/features [default TRUE]
#' @param block.size Sample/feature size for each block, only works when bigMemory is set to TRUE
#' @param save Save the temporay files [default FALSE]
#' @param temp.path Folder that is used to store temporary files. Only works when option save is set to be TRUE [default NULL]
#'
#' @return Returns the object with list:
#'
#' @return * `Z` - contains the estimated transition matrix (M features by L samples)
#' @return * `u` - contains the canonical correlation vectors for X (K samples by num.X factor)
#' @return * `r` - contains the canonical correlation vectors for Z (sample level)(L samples by num.X factor)
#' @return * `s` - contains the canonical correlation vectors for Z (feature level)(M features by num.Y factor)
#' @return * `v` - contains the canonical correlation vectors for Y (N features by num.Y factor)
#' @return * `delta` - relative change ratio for frobenius norm of matrix Z during iteration
#'
#'
#' @export
#'
#' @author Jinzhuang Dou, \email{Dou1@mdanderson.org} \email{jinzhuangdou198706@gmail.com}
#'
#' @examples
#' X <- simData$X
#' Y <- simData$Y
#' Z0 <- simData$Z0
#'
#' out <- BiCCA(X=X, Z0=Z0, Y=Y,
#' num.X = 5, num.Y = 5,
#' num.iteration = 100,
#' temp.path = "./tp",
#' tolerance = 0.05,
#' save = FALSE,
#' bigMemory = TRUE,
#' block.size = 1000,
#' ncore = 1)
#'
BiCCA <- function(X=NULL, Z0=NULL, Y=NULL, num.X = 5, num.Y = 5, temp.path=NULL,
num.iteration = 100, tolerance = 0.05, save = FALSE,
bigMemory = FALSE, block.size = 1000, ncore = 1){
start_time <- Sys.time()
message(paste(start_time, " Started!"))
# turn off blas parrallel function
options(bigstatsr.check.parallel.blas = FALSE)
NCORES <- min(c(bigstatsr::nb_cores(), ncore))
preCheck(X=X, Z0=Z0, Y=Y, num.X = num.X, num.Y = num.Y, num.iteration = num.iteration,
temp.path = temp.path, tolerance = tolerance, save = save, bigMemory = bigMemory,
block.size = block.size, ncore = NCORES)
current_time <- Sys.time()
INF <- 10^(10)
M <- 200
X_Ncell <- dim(X)[2]
X_Ngene <- dim(X)[1]
Y_Ncell <- dim(Y)[2]
Y_Nloci <- dim(Y)[1]
Z_Ncell <- dim(Z0)[2]
Z_Ngene <- dim(Z0)[1]
message(paste(current_time, " Dimension Check: X[",X_Ngene,"x",X_Ncell,"]",
" Y[",Y_Nloci,"x",Y_Ncell,"]",
" Z0[",Z_Ngene,"x",Z_Ncell,"]", sep=""))
# Pre calculate generalized inverse
# message(paste(current_time, " Centering and scaling data matrix"))
#X <- ScaleData(object = X, features = rownames(X), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
#X <- colScale(X)
#Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
#Z0 <- colScale(Z0)
#Y <- ScaleData(object = t(Y), features = rownames(t(Y)), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
#Y <- colScale(Y)
#Y <- t(Y)
# Use big_randomSVD
if(bigMemory){
gc()
XX_SVD <- irlba_bigMemory(X, X, M, block.size, NCORES)
gc()
YY_SVD <- irlba_bigMemory(t(Y), t(Y), M, block.size, NCORES)
gc()
}
else{
gc()
XX_SVD <- irlba(crossprod(x = X, y = X), M, fastpath = FALSE)
gc()
YY_SVD <- irlba(crossprod(x = t(Y), y = t(Y)), M, fastpath = FALSE)
gc()
}
pre <- Z0
cnt <- 0
delta <- INF
rd_delta <- c()
pb <- progress_bar$new(
format = " Iterating [:bar] :percent eta: :eta",
total = num.iteration, clear = FALSE, width= 60)
for(cnt in seq(1,num.iteration,1)){
if (any(!is.finite(Z0))) stop("infinite or missing values in 'Z0'")
# rownames(Z0) <- rownames(X)
#Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
#Z0 <- colScale(Z0)
out1 <- estZ(A=X, B=Z0, AA_SVD = XX_SVD, k = num.X, NCORES = NCORES)
gc()
#tp <- t(as.matrix(out1$Z))
# rownames(tp) <- colnames(Y)
#Z0 <- ScaleData(object = tp , do.scale=TRUE, do.center=TRUE, verbose = FALSE)
#Z0 <- colScale(Z0)
#Z0 <- t(Z0)
Z0 <- as.matrix(out1$Z)
if (any(!is.finite(Z0))) stop("infinite or missing values in 'Z0'")
out2 <- estZ(A=t(Y), B=t(Z0), AA_SVD = YY_SVD, k = num.Y, NCORES = NCORES)
gc()
Z0 <- t(out2$Z)
delta <- norm(Z0-pre, type="F")/norm(as.matrix(pre))
pre <- Z0
pb$tick()
rd_delta <- c(rd_delta, delta)
if(save==TRUE && !is.na(temp.path)){
out<-list(
"u" = out1$coembeding$u,
"r" = out1$coembeding$v,
"s" = out2$coembeding$u,
"v" = out2$coembeding$v,
"Z" = Z0,
"delta" = rd_delta)
saveRDS(out, file=paste(temp.path,"/iteration",cnt,".rds",sep=""))
}
if(delta < tolerance){break}
}
message("\n")
current_time <- Sys.time()
if(delta >= tolerance){
message(paste(current_time, " The decomposition is not converged. The output may not be optimal.
You can increase num.iteration again."))
}
else{
message(paste(current_time, " Done! The decomposition is converged."))
}
rownames(out1$coembeding$u) <- colnames(X)
rownames(out1$coembeding$v) <- colnames(Y)
rownames(out2$coembeding$v) <- rownames(X)
rownames(out2$coembeding$u) <- rownames(Y)
rownames(Z0) <- rownames(X)
colnames(Z0) <- colnames(Y)
#XX_SVD$u <- L2Norm(XX_SVD$u)
#XX_SVD$v <- L2Norm(XX_SVD$v)
#YY_SVD$u <- L2Norm(YY_SVD$u)
#YY_SVD$v <- L2Norm(YY_SVD$v)
out<-list("u" = L2Norm(out1$coembeding$u),
"r" = L2Norm(out1$coembeding$v),
"s" = L2Norm(out2$coembeding$u),
"v" = L2Norm(out2$coembeding$v),
"Z" = Z0,
"delta" = rd_delta)
return(out)
}
crossprod_e <- function(x= X, y=Y){
tp <- crossprod(x = x, y=y)
diag(tp) <- diag(tp) + 0
return(tp)
}
irlba_bigMemory <- function(X, Y, k, block.size, NCORES){
gc()
#tp <- crossprod_e(x = X, y = Y)
#tp <- as_FBM(as.matrix(tp))
X <- t(X)
M <- dim(X)[1]
N <- dim(Y)[2]
X <- as.matrix(X)
Y <- as.matrix(Y)
if (file.exists("test.bk")) {file.remove("test.bk")}
tp <- bigstatsr::FBM(M, N, backingfile = "test")$save()
out <- bigstatsr::big_apply(tp, a.FUN = function(Z, ind) {
Z[, ind] <- X %*%Y[, ind]},
ind=bigstatsr::cols_along(Y), block.size = block.size)
rm(out)
SVD <- bigstatsr::big_randomSVD(tp, fun.scaling = bigstatsr::big_scale(center=FALSE, scale=FALSE),
k = k, ncores = NCORES)
rm(tp)
if (file.exists("test.bk")) {file.remove("test.bk")}
gc()
return(SVD)
}
estZ <- function(A = NULL, B = NULL, AA_SVD = NULL, k = k, bigMemory=TRUE, NCORES = 1){
INF <- 10^(10)
bigMemory <- FALSE
if(bigMemory){
gc()
AB_SVD <- irlba_bigMemory(A, B, k, NCORES)
gc()
BB_SVD <- irlba_bigMemory(B, B, k, NCORES)
gc()
ss_SVD <- irlba_bigMemory(t(AB_SVD$v), t(AB_SVD$v), k, NCORES)
gc()
}
else{
AB_SVD <- irlba(crossprod_e(x = A, y = B), k)
BB_SVD <- irlba(crossprod_e(x = B, y = B), k)
ss_SVD <- irlba(crossprod_e(x = t(AB_SVD$v), y = t(AB_SVD$v)), k)
}
#AB_SVD$u <- L2Norm(AB_SVD$u)
#AB_SVD$v <- L2Norm(AB_SVD$v)
#BB_SVD$u <- L2Norm(BB_SVD$u)
#ss_SVD$u <- L2Norm(ss_SVD$u)
#BB_SVD$v <- L2Norm(BB_SVD$v)
#ss_SVD$v <- L2Norm(ss_SVD$v)
lst<-list()
lst[[1]]<-A
lst[[2]]<-AA_SVD$u
tp <- AA_SVD$d
tp[tp<=0] <- INF
tp <- 1/sqrt(tp)
lst[[3]]<-diag(tp) # update
lst[[4]]<-t(AA_SVD$v)
lst[[5]]<-AB_SVD$u
lst[[6]]<-t(AB_SVD$v)
lst[[7]]<-ss_SVD$u
tp <- ss_SVD$d
tp[tp!=0] <- 1/tp[tp!=0]
lst[[8]]<-diag(tp) # update
lst[[9]]<-t(ss_SVD$u)
lst[[10]]<-BB_SVD$u
tp<-BB_SVD$d
tp[tp<=0] <- INF
tp <- sqrt(tp)
lst[[11]]<-diag(tp) # update
lst[[12]]<-t(BB_SVD$u)
Z0 <- mulitCrossprod(lst)
out <- list()
out$Z <- Z0
out$coembeding <- AB_SVD
gc()
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.