R/BiCCA.R

Defines functions estZ irlba_bigMemory crossprod_e BiCCA

Documented in BiCCA

#' @include generics.R
#'
NULL

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' Bidirectional Canonical Correlation Analysis (BiCCA)
#'
#' Assuming there are two matrices X (M features by K samples) and Y (N features by L samples), we want to 
#' find the correlation in feature and sample levels between X and Y. 
#' Standard CCA could not handle this case because it requires that there should be at least one 
#' dimension shared by two datasets. The BiCCA function introudces one transition matrix Z (M features by L
#' samples) to bridge X with Y. The transition matrix Z is solved by maximalizing correlation between (X, Z) 
#' in sample level and correlation between (Z, Y) in feature level simultaneously. Then sample/feature level 
#' correlation can be obtained by applying standard CCA on (X, Z) and (Y, Z), respectively.



#'
#' @param X First matrix (M features by K samples)
#' @param Y Second matrix (N features by L samples)
#' @param Z0 Transition matrix (M features by L samples) 
#' @param num.X Number of canonical vectors to calculate for pair (X, Z) [default 5]
#' @param num.Y Number of canonical vectors to calculate for pair (Z, Y) [default 5]
#' @param num.iteration Maximal number of iteration [default 100]
#' @param tolerance  Relative change ratio for frobenius norm of matrix Z during iteration [default 0.05]
#' @param ncore Number of thread used [default 1]
#' @param bigMemory Use bigMemory mode, this will reduce memory usage when have > 30,000 sampless/features [default TRUE]
#' @param block.size Sample/feature size for each block, only works when bigMemory is set to TRUE
#' @param save  Save the temporay files [default FALSE]
#' @param temp.path Folder that is used to store temporary files. Only works when option save is set to be TRUE [default NULL]
#'
#' @return Returns the object with list:
#'
#' @return * `Z` - contains the estimated transition matrix (M features by L samples)
#' @return * `u` - contains the canonical correlation vectors for X (K samples by num.X factor) 
#' @return * `r` - contains the canonical correlation vectors for Z (sample level)(L samples by num.X factor) 
#' @return * `s` - contains the canonical correlation vectors for Z (feature level)(M features by num.Y factor) 
#' @return * `v` - contains the canonical correlation vectors for Y (N features by num.Y factor)
#' @return * `delta` - relative change ratio for frobenius norm of matrix Z during iteration
#'
#'
#' @export
#'
#' @author Jinzhuang Dou, \email{Dou1@mdanderson.org} \email{jinzhuangdou198706@gmail.com}

#'
#' @examples

#' X <- simData$X
#' Y <- simData$Y
#' Z0 <- simData$Z0
#'
#' out <- BiCCA(X=X, Z0=Z0, Y=Y, 
#'       num.X = 5, num.Y = 5, 
#'       num.iteration = 100, 
#'       temp.path = "./tp",
#'       tolerance = 0.05, 
#'       save = FALSE, 
#'       bigMemory = TRUE,
#'       block.size = 1000, 
#'       ncore = 1)
#'


BiCCA <- function(X=NULL, Z0=NULL, Y=NULL, num.X = 5, num.Y = 5,  temp.path=NULL,
    num.iteration = 100, tolerance = 0.05, save = FALSE, 
    bigMemory = FALSE, block.size = 1000, ncore = 1){

    start_time <- Sys.time()
    message(paste(start_time, " Started!"))

    # turn off blas parrallel function 
    options(bigstatsr.check.parallel.blas = FALSE)

    NCORES <- min(c(bigstatsr::nb_cores(), ncore))

    preCheck(X=X, Z0=Z0, Y=Y, num.X = num.X, num.Y = num.Y, num.iteration = num.iteration,
        temp.path = temp.path, tolerance = tolerance, save = save, bigMemory = bigMemory,
        block.size = block.size, ncore = NCORES)

    current_time <- Sys.time()
    


    INF <- 10^(10)
    M <- 200
    X_Ncell <- dim(X)[2]
    X_Ngene <- dim(X)[1]
    Y_Ncell <- dim(Y)[2]
    Y_Nloci <- dim(Y)[1]
    Z_Ncell <- dim(Z0)[2]
    Z_Ngene <- dim(Z0)[1]

    
    

    message(paste(current_time, "  Dimension Check: X[",X_Ngene,"x",X_Ncell,"]", 
                                " Y[",Y_Nloci,"x",Y_Ncell,"]",
                                " Z0[",Z_Ngene,"x",Z_Ncell,"]", sep=""))

    # Pre calculate generalized inverse 
    # message(paste(current_time, " Centering and scaling data matrix"))

    #X <- ScaleData(object = X, features = rownames(X), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #X <- colScale(X)

    #Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #Z0 <- colScale(Z0)

    #Y <- ScaleData(object = t(Y), features = rownames(t(Y)), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #Y <- colScale(Y) 
    #Y <- t(Y)


    # Use big_randomSVD 
    if(bigMemory){
        gc()
        XX_SVD <- irlba_bigMemory(X, X, M, block.size, NCORES)
        gc()
        YY_SVD <- irlba_bigMemory(t(Y), t(Y),  M, block.size, NCORES)
        gc()
    }
    else{
        gc()
        XX_SVD <- irlba(crossprod(x = X, y = X), M, fastpath = FALSE)
        gc()
        YY_SVD <- irlba(crossprod(x = t(Y), y = t(Y)), M, fastpath = FALSE)
        gc()
    }

    pre <- Z0
    cnt <- 0 
    delta <- INF
    rd_delta <- c()

    pb <- progress_bar$new(
          format = "  Iterating [:bar] :percent eta: :eta",
          total = num.iteration, clear = FALSE, width= 60)

    for(cnt in seq(1,num.iteration,1)){

        if (any(!is.finite(Z0)))  stop("infinite or missing values in 'Z0'")
        # rownames(Z0) <- rownames(X)
        #Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
        #Z0 <- colScale(Z0)
        out1 <- estZ(A=X, B=Z0, AA_SVD = XX_SVD, k = num.X, NCORES = NCORES)
        gc()

        #tp <- t(as.matrix(out1$Z))
        # rownames(tp) <- colnames(Y)
        #Z0 <- ScaleData(object = tp , do.scale=TRUE, do.center=TRUE, verbose = FALSE)
        #Z0 <- colScale(Z0)
        #Z0 <- t(Z0)
        Z0 <- as.matrix(out1$Z)

        if (any(!is.finite(Z0)))  stop("infinite or missing values in 'Z0'")
        out2 <- estZ(A=t(Y), B=t(Z0), AA_SVD = YY_SVD, k = num.Y, NCORES = NCORES)
        gc()
        Z0 <- t(out2$Z)

        delta <- norm(Z0-pre, type="F")/norm(as.matrix(pre))
        pre <- Z0 
        pb$tick()
        rd_delta <- c(rd_delta, delta)

        if(save==TRUE && !is.na(temp.path)){
            out<-list(
                  "u" = out1$coembeding$u, 
                  "r" = out1$coembeding$v,
                  "s" = out2$coembeding$u,
                  "v" = out2$coembeding$v,
                  "Z" = Z0,
                  "delta" = rd_delta)
            saveRDS(out, file=paste(temp.path,"/iteration",cnt,".rds",sep=""))
        }

        if(delta < tolerance){break}

    }
    message("\n")
    current_time <- Sys.time()
    if(delta >= tolerance){
        message(paste(current_time, " The decomposition is not converged. The output may not be optimal.
                 You can increase num.iteration again."))
    }
    else{
        message(paste(current_time, " Done! The decomposition is converged."))
    }


    rownames(out1$coembeding$u) <- colnames(X)
    rownames(out1$coembeding$v) <- colnames(Y)
    rownames(out2$coembeding$v) <- rownames(X)
    rownames(out2$coembeding$u) <- rownames(Y)


    rownames(Z0) <- rownames(X)
    colnames(Z0) <- colnames(Y)

    #XX_SVD$u <- L2Norm(XX_SVD$u)
    #XX_SVD$v <- L2Norm(XX_SVD$v)
    #YY_SVD$u <- L2Norm(YY_SVD$u)
    #YY_SVD$v <- L2Norm(YY_SVD$v)

    
    out<-list("u" = L2Norm(out1$coembeding$u), 
              "r" = L2Norm(out1$coembeding$v),
              "s" = L2Norm(out2$coembeding$u),
              "v" = L2Norm(out2$coembeding$v),
              "Z" = Z0,
              "delta" = rd_delta)
    return(out)
}


crossprod_e <- function(x= X, y=Y){

    tp <- crossprod(x = x, y=y)
    diag(tp) <- diag(tp) + 0
    return(tp)
}


irlba_bigMemory <- function(X, Y, k, block.size, NCORES){

    gc()
    #tp <- crossprod_e(x = X, y = Y)
    #tp <- as_FBM(as.matrix(tp))
    X <- t(X)
    M <- dim(X)[1]
    N <- dim(Y)[2]
    X <- as.matrix(X)
    Y <- as.matrix(Y)
    if (file.exists("test.bk")) {file.remove("test.bk")}
    tp <- bigstatsr::FBM(M, N, backingfile = "test")$save()
    out <- bigstatsr::big_apply(tp, a.FUN = function(Z, ind) {
      Z[, ind] <- X %*%Y[, ind]}, 
      ind=bigstatsr::cols_along(Y), block.size = block.size)

    rm(out)
    SVD <- bigstatsr::big_randomSVD(tp, fun.scaling = bigstatsr::big_scale(center=FALSE, scale=FALSE),
     k = k, ncores = NCORES) 
    rm(tp)
    if (file.exists("test.bk")) {file.remove("test.bk")}
    gc()
    return(SVD)
}

estZ <- function(A = NULL, B = NULL, AA_SVD = NULL, k = k, bigMemory=TRUE, NCORES = 1){

    INF <- 10^(10)
    bigMemory <- FALSE
    if(bigMemory){
        gc()
        AB_SVD <- irlba_bigMemory(A, B, k, NCORES)
        gc()
        BB_SVD <- irlba_bigMemory(B, B, k, NCORES)
        gc()
        ss_SVD <- irlba_bigMemory(t(AB_SVD$v), t(AB_SVD$v), k, NCORES)
        gc()
    }
    else{
        AB_SVD <- irlba(crossprod_e(x = A, y = B), k)
        BB_SVD <- irlba(crossprod_e(x = B, y = B), k)
        ss_SVD <- irlba(crossprod_e(x = t(AB_SVD$v), y = t(AB_SVD$v)), k)
    }

    #AB_SVD$u <- L2Norm(AB_SVD$u)
    #AB_SVD$v <- L2Norm(AB_SVD$v)
    #BB_SVD$u <- L2Norm(BB_SVD$u)
    #ss_SVD$u <- L2Norm(ss_SVD$u)
    #BB_SVD$v <- L2Norm(BB_SVD$v)
    #ss_SVD$v <- L2Norm(ss_SVD$v)


    lst<-list()

    lst[[1]]<-A

    lst[[2]]<-AA_SVD$u
    tp <- AA_SVD$d
    tp[tp<=0] <- INF
    tp <- 1/sqrt(tp)
    lst[[3]]<-diag(tp)    # update 
    lst[[4]]<-t(AA_SVD$v)

    lst[[5]]<-AB_SVD$u

    lst[[6]]<-t(AB_SVD$v)

    lst[[7]]<-ss_SVD$u
    tp <- ss_SVD$d
    tp[tp!=0] <- 1/tp[tp!=0]
    lst[[8]]<-diag(tp)    # update 
    lst[[9]]<-t(ss_SVD$u)

    lst[[10]]<-BB_SVD$u
    tp<-BB_SVD$d
    tp[tp<=0] <- INF
    tp <- sqrt(tp)
    lst[[11]]<-diag(tp)   # update
    lst[[12]]<-t(BB_SVD$u)

    Z0 <- mulitCrossprod(lst)
    out <- list()
    out$Z <- Z0
    out$coembeding <- AB_SVD
    gc()
    return(out)

}
jinzhuangdou/SCCAT documentation built on July 8, 2020, 2:36 p.m.