DCCA: Tools for single cell multi-omics analysis \

Documented in runDCCA

#' @include generics.R
#'
NULL

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' Perform canonical correlation analysis (CCA) on two dimension
#'
#' Assuming there are two matrix X (M genes by K cells) and Y (N loci by L cells), we want to 
#' find the cell correspondences and correlated gene/loci module between them. 
#' Traditional CCA could not handle this case because it requires that there should be at least one 
#' dimension shared by two datasets. 
#' The DCCA function aims to finding one transition matrix Z (M genes by L cells) to bridge X and Y. 
#' More details can be seen in xx

#'
#' @param X First matrix (M genes by K cells)
#' @param Y Second matrix (N loci by L cells)
#' @param Z0 Transition matrix (M genes by L cells) 
#' @param num.D Number of canonical vectors to calculate for pair (X, Z) [default 10]
#' @param num.E Number of canonical vectors to calculate for pair (Z, Y) [default 10]
#' @param num.iteration Maximal number of iteration [default 100]
#' @param tolerance  Relative change ratio for ||Z||F during iteration [default 0.01]
#' @param save  Save the temporay files [default FALSE]
#'
#' @return Returns the object with list:
#'
#' * `Z` - contains the estimated transition matrix (M genes by L cells)
#' * `u` - contains the canonical correlation vectors for X (K cells by num.D factor) 
#' * `r` - contains the canonical correlation vectors for Z (L cells by num.D factor) 
#' * `s` - contains the canonical correlation vectors for Z (M genes by num.E factor) 
#' * `v` - contains the canonical correlation vectors for Z (N loci by num.E factor)
#'
#' @importFrom methods slot
#' @importFrom stats ave aggregate
#'
#' @export
#'
#' @author Jinzhuang Dou, \email{Dou1@mdanderson.org}
#' @seealso \code{\link{xx}} \code{\link{xx}}
#'
#' @examples
#' runDCCA(X=X, Y=Y, Z0 = Z0, num.D = 6, num.E = 6)
#'


runDCCA <- function(X=NULL, Z0=NULL, Y=NULL, num.D = 10, output=NULL,
    num.E = 10, num.iteration = 100, tolerance = 0.01, save = FALSE){

    # Matrix dimension check 
    start_time <- Sys.time()
    message(paste(start_time, " Started!"))
    preCheck(X=X, Z0=Z0, Y=Y, num.D = num.D, num.E = num.E, num.iteration = num.iteration)
    current_time <- Sys.time()
    

    INF <- 10^(10)
    M <- 200
    X_Ncell <- dim(X)[2]
    X_Ngene <- dim(X)[1]
    Y_Ncell <- dim(Y)[2]
    Y_Nloci <- dim(Y)[1]
    Z_Ncell <- dim(Z0)[2]
    Z_Ngene <- dim(Z0)[1]

    message(paste(current_time, "  Dimension Check: X[",X_Ngene,"x",X_Ncell,"]", 
                                " Y[",Y_Nloci,"x",Y_Ncell,"]",
                                " Z0[",Z_Ngene,"x",Z_Ncell,"]", sep=""))

    # Pre calculate generalized inverse 
    message(paste(current_time, " Centering and scaling data matrix"))

    #X <- ScaleData(object = X, features = rownames(X), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #X <- colScale(X)

    #Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #Z0 <- colScale(Z0)

    #Y <- ScaleData(object = t(Y), features = rownames(t(Y)), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
    #Y <- colScale(Y) 
    #Y <- t(Y)


    XX_SVD <- irlba(crossprod(x = X, y = X), M, fastpath = FALSE)
    YY_SVD <- irlba(crossprod(x = t(Y), y = t(Y)), M, fastpath = FALSE)
    #XX_SVD$u <- L2Norm(XX_SVD$u)
    #XX_SVD$v <- L2Norm(XX_SVD$v)
    #YY_SVD$u <- L2Norm(YY_SVD$u)
    #YY_SVD$v <- L2Norm(YY_SVD$v)


    pre <- Z0
    cnt <- 0 
    delta <- INF
    rd_delta <- c()
    pb <- progress_bar$new(
          format = "  Iterating [:bar] :percent eta: :eta",
          total = num.iteration, clear = FALSE, width= 60)

    for(cnt in seq(1,num.iteration,1)){

        if (any(!is.finite(Z0)))  stop("infinite or missing values in 'Z0'")
        # rownames(Z0) <- rownames(X)
        #Z0 <- ScaleData(object = as.matrix(Z0), do.scale=TRUE, do.center=TRUE, verbose = FALSE)
        #Z0 <- colScale(Z0)
        out1 <- estZ(A=X, B=Z0, AA_SVD = XX_SVD, k = num.D)

        tp <- t(as.matrix(out1$Z))
        # rownames(tp) <- colnames(Y)
        #Z0 <- ScaleData(object = tp , do.scale=TRUE, do.center=TRUE, verbose = FALSE)
        #Z0 <- colScale(Z0)
        #Z0 <- t(Z0)

        if (any(!is.finite(Z0)))  stop("infinite or missing values in 'Z0'")
        out2 <- estZ(A=t(Y), B=t(Z0), AA_SVD = YY_SVD, k = num.E)
        Z0 <- t(out2$Z)

        delta <- norm(Z0-pre, type="F")/norm(pre)
        pre <- Z0 
        pb$tick()
        rd_delta <- c(rd_delta, delta)
        print(delta)

        if(save==TRUE){
            out<-list("cell_u" = out1$coembeding$u, 
                  "cell_r" = out1$coembeding$v,
                  "feature_s" = out2$coembeding$u,
                  "feature_v" = out2$coembeding$v,
                  "Z" = Z0,
                  "delta" = rd_delta)
            saveRDS(out, file=paste(output,"/iteration",cnt,".rds",sep=""))
        }

        if(delta < tolerance){break}

    }
    message("\n")
    current_time <- Sys.time()
    if(delta >= tolerance){
        message(paste(current_time, " The decomposition is not converged. The output may not be optimal.
                 You can increase num.iteration again."))
    }
    else{
        message(paste(current_time, " Done! The decomposition is converged."))
    }

    rownames(out1$coembeding$u) <- colnames(X)
    rownames(out1$coembeding$v) <- colnames(Y)
    rownames(out2$coembeding$u) <- rownames(X)
    rownames(out2$coembeding$v) <- rownames(Y)
    rownames(Z0) <- rownames(X)
    colnames(Z0) <- colnames(Y)
    
    out<-list("cell_u" = out1$coembeding$u, 
              "cell_r" = out1$coembeding$v,
              "feature_s" = out2$coembeding$u,
              "feature_v" = out2$coembeding$v,
              "Z" = Z0,
              "delta" = rd_delta)
    return(out)
}


crossprod_e <- function(x= X, y=Y){

    tp <- crossprod(x = x, y=y)
    diag(tp) <- diag(tp) + 0
    return(tp)
}

estZ <- function(A = NULL, B = NULL, AA_SVD = NULL, k = k){

    INF <- 10^(10)

    AB_SVD <- irlba(crossprod_e(x = A, y = B), k)
    BB_SVD <- irlba(crossprod_e(x = B, y = B), k)
    ss_SVD <- irlba(crossprod_e(x = t(AB_SVD$v), y = t(AB_SVD$v)), k)

    #AB_SVD$u <- L2Norm(AB_SVD$u)
    #AB_SVD$v <- L2Norm(AB_SVD$v)
    #BB_SVD$u <- L2Norm(BB_SVD$u)
    #ss_SVD$u <- L2Norm(ss_SVD$u)
    #BB_SVD$v <- L2Norm(BB_SVD$v)
    #ss_SVD$v <- L2Norm(ss_SVD$v)


    lst<-list()

    lst[[1]]<-A

    lst[[2]]<-AA_SVD$u
    tp <- AA_SVD$d
    tp[tp<=0] <- INF
    tp <- 1/sqrt(tp)
    lst[[3]]<-diag(tp)    # update 
    lst[[4]]<-t(AA_SVD$v)

    lst[[5]]<-AB_SVD$u

    lst[[6]]<-t(AB_SVD$v)

    lst[[7]]<-ss_SVD$u
    tp <- ss_SVD$d
    tp[tp!=0] <- 1/tp[tp!=0]
    lst[[8]]<-diag(tp)    # update 
    lst[[9]]<-t(ss_SVD$u)

    lst[[10]]<-BB_SVD$u
    tp<-BB_SVD$d
    tp[tp<=0] <- INF
    tp <- sqrt(tp)
    lst[[11]]<-diag(tp)   # update
    lst[[12]]<-t(BB_SVD$u)

    Z0 <- mulitCrossprod(lst)
    out <- list()
    out$Z <- Z0
    out$coembeding <- AB_SVD
    return(out)

}



feature_match <- function(dt1, dt2,  band.width=10){
    out <- cdist(dt1, dt2)
    out <- as.matrix(out)
    for( i in seq(1, dim(out)[2],1)){
        out[,i] <- exp(0-band.width*out[,i]/sd(out[,i]))
        out[,i] <- out[,i]/sum(out[,i])
    }
    out[out<10^(-7)] <- 0
    out <- Matrix(out, sparse = TRUE)
    return(out)
}