R/scAI_model.R

#' The scAI Class
#'
#' The scAI object is created from paired single-cell transcriptomic and epigenomic data.
#' It takes a list of two data matrices as input, with genes/loci in rows and cells in columns; rownames and colnames must be provided.
#' The class provides functions for data preprocessing, integrative analysis, and visualization.
#'
#' The key slots used in the scAI object are described below.
#'
#' @slot raw.data List of raw data matrices, one per dataset (Genes/loci should be in rows and cells in columns)
#' @slot norm.data List of normalized matrices (genes/loci by cells)
#' @slot agg.data Aggregated epigenomic data within similar cells
#' @slot scale.data List of scaled matrices
#' @slot pData data frame storing the information associated with each cell
#' @slot var.features List of informative features to be used, one giving informative genes and the other giving informative loci
#' @slot fit List of inferred low-rank matrices, including W1, W2, H, Z, R
#' @slot fit.variedK List of inferred low-rank matrices when varying the rank K
#' @slot embed List of the reduced 2D coordinates, one per method, e.g., t-SNE/FIt-SNE/umap
#' @slot identity A factor defining the cell identity
#' @slot cluster List of consensus clustering results
#' @slot options List of parameters used throughout analysis
#'
#' @exportClass scAI
#' @importFrom Rcpp evalCpp
#' @importFrom methods setClass
#' @useDynLib scAI
scAI <- methods::setClass("scAI",
                          slots = c(raw.data = "list",
                                    norm.data = "list",
                                    agg.data = "matrix",
                                    scale.data = "list",
                                    pData = "data.frame",
                                    var.features = "list",
                                    fit = "list",
                                    fit.variedK = "list",
                                    embed = "list",
                                    identity = "factor",
                                    cluster = "list",
                                    options = "list")
)
#' show method for scAI
#'
#' @param object scAI object
#' @docType methods
#'
setMethod(f = "show", signature = "scAI", definition = function(object) {
  cat("An object of class", class(object), "\n", length(object@raw.data), "datasets.\n")
  invisible(x = NULL)
})



#' Create a new scAI object
#'
#' @param raw.data List of raw data matrices: paired single-cell transcriptomic and epigenomic data
#' @param do.sparse whether to store the data in sparse format
#'
#' @return an initialized scAI object
#' @export
#'
#' @examples
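#' \dontrun{
#' # A minimal usage sketch; `rna.counts` and `atac.counts` are hypothetical
#' # genes/loci x cells matrices sharing the same cell barcodes.
#' scAI.obj <- create_scAIobject(raw.data = list(RNA = rna.counts, ATAC = atac.counts))
#' }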
#' @importFrom methods as new
create_scAIobject <- function(raw.data, do.sparse = TRUE) {
  if (do.sparse) {
    raw.data <- lapply(raw.data, function(x) {
      as(as.matrix(x), "dgCMatrix")
    })
  }
  object <- methods::new(Class = "scAI",
                         raw.data = raw.data)
  return(object)
}



#' Preprocess the raw.data, including quality control and normalization
#'
#' @param object scAI object
#' @param assay List of assay names to be normalized
#' @param minFeatures Filter out cells with fewer than minFeatures expressed features
#' @param minCells Filter out features expressed in fewer than minCells cells
#' @param minCounts Filter out cells with total counts < minCounts
#' @param maxCounts Filter out cells with total counts > maxCounts
#' @param libararyflag Whether to do library-size normalization
#' @param logNormalize Whether to do log transformation
#'
#' @return an scAI object with the normalized data in the norm.data slot
#' @export
#'
#' @examples
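#' \dontrun{
#' # Sketch, assuming `scAI.obj` was built by create_scAIobject() from paired
#' # matrices named "RNA" and "ATAC":
#' scAI.obj <- preprocessing(scAI.obj, assay = list("RNA", "ATAC"),
#'                           minFeatures = 200, minCells = 3)
#' }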
preprocessing <- function(object, assay = list("RNA", "ATAC"), minFeatures = 200, minCells = 3, minCounts = NULL, maxCounts = NULL, libararyflag = TRUE, logNormalize = TRUE) {
    if (is.null(assay)) {
        for (i in 1:length(object@raw.data)) {
            object@norm.data[[i]] <- object@raw.data[[i]]
        }

    } else {

        for (i in 1:length(assay)) {
            iniData <- object@raw.data[[assay[[i]]]]
            print(dim(iniData))

            if (is.data.frame(iniData)) {
                iniData <- as.matrix(iniData)
            }
            # filter cells that have features less than #minFeatures
            msum <- Matrix::colSums(iniData != 0)
            proData <- iniData[, msum >= minFeatures]
            # filter cells that have UMI counts less than #minCounts
            if (!is.null(minCounts)) {
                proData <- proData[, Matrix::colSums(proData) >= minCounts]
            }

            # filter cells that have total counts higher than maxCounts
            if (!is.null(maxCounts)) {
                proData <- proData[, Matrix::colSums(proData) <= maxCounts]
            }

            # filter genes that only express less than #minCells cells
            proData <- proData[Matrix::rowSums(proData != 0) >= minCells, ]
            # normalization: we employ a global-scaling method that normalizes the measurements for each cell by the total counts and multiplies this by a scale factor (10,000 by default)
            if (libararyflag) {
                proData <- sweep(proData, 2, Matrix::colSums(proData), FUN = `/`) * 10000
            }
            if (logNormalize) {
                proData = log(proData + 1)
            }
            object@norm.data[[assay[[i]]]] <- proData

        }
    }
    if (length(assay) == 2) {
        X1 <- object@norm.data[[assay[[1]]]]
        X2 <- object@norm.data[[assay[[2]]]]
        cell.keep <- intersect(colnames(X1), colnames(X2))
        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
        object@norm.data[[assay[[2]]]] <- X2[, cell.keep]
    } else if (length(assay) == 1) {
        X1 <- object@norm.data[[assay[[1]]]]
        assay2 <- setdiff(names(object@raw.data), assay[[1]])
        X2 <- object@raw.data[[assay2]]
        cell.keep <- intersect(colnames(X1), colnames(X2))
        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
        object@norm.data[[assay2]] <- X2[, cell.keep]
    }
  names(object@norm.data) <- names(object@raw.data)

    return(object)
}



#' Add cell information into the pData slot
#'
#' @param object scAI object
#' @param pdata cell information to be added
#' @param pdata.name the name of the column to be assigned
#'
#' @return an scAI object with the updated pData slot
#' @export
#'
#' @examples
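#' \dontrun{
#' # Sketch: attach hypothetical cell labels (`labels`, one entry per cell, in
#' # the same order as the columns of the data) as a pData column.
#' scAI.obj <- addpData(scAI.obj, pdata = labels, pdata.name = "celltype")
#' }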
addpData <- function(object, pdata, pdata.name = NULL) {
  if (is.null(x = pdata.name) && is.atomic(x = pdata)) {
    stop("'pdata.name' must be provided for atomic pdata types (e.g. vectors)")
  }
  if (inherits(x = pdata, what = c("matrix", "Matrix"))) {
    pdata <- as.data.frame(x = pdata)
  }
  if (is.atomic(x = pdata)) {
    # wrap atomic vectors in a data frame so that the pData slot remains a data.frame
    pdata <- as.data.frame(x = pdata)
  }
  if (is.null(x = pdata.name)) {
    pdata.name <- names(pdata)
  } else {
    names(pdata) <- pdata.name
  }
  object@pData <- pdata
  return(object)
}



#' Run the scAI model
#'
#' @param object scAI object
#' @param K An integer indicating the rank of the inferred factors
#' @param do.fast whether to use the Python implementation for running the scAI optimization model
#' @param nrun Number of times to repeat the run
#' @param hvg.use1 whether to use highly variable genes for the RNA-seq data
#' @param hvg.use2 whether to use highly variable loci for the ATAC-seq data
#' @param keep_all Whether to keep the results from all runs
#' @param s Probability of the Bernoulli distribution
#' @param alpha model parameter
#' @param lambda model parameter
#' @param gamma model parameter
#' @param maxIter An integer indicating the maximum number of iterations
#' @param stop_rule Stop rule to be used
#' @param init List of the initialized low-rank matrices
#' @param rand.seed An integer indicating the seed
#'
#' @return an scAI object with the inferred low-rank matrices stored in the fit slot (or, when keep_all = TRUE, a list containing the results of all runs)
#' @export
#'
#' @examples
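#' \dontrun{
#' # Sketch with the default model parameters; the rank K is data-dependent
#' # (see selectK() for guidance):
#' scAI.obj <- run_scAI(scAI.obj, K = 8, nrun = 5)
#' }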
#' @importFrom foreach foreach "%dopar%"
#' @importFrom parallel makeForkCluster makeCluster detectCores
#' @importFrom doParallel registerDoParallel
#' @importFrom reticulate r_to_py source_python
run_scAI <- function(object, K, do.fast = FALSE, nrun = 5L, hvg.use1 = FALSE, hvg.use2 = FALSE, keep_all = FALSE, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500L, stop_rule = 1L, init = NULL, rand.seed = 1L) {
    if (!is.null(init)) {
        W1.init = init$W1.init
        W2.init = init$W2.init
        H.init = init$H.init
        Z.init = init$Z.init
        R.init = init$R.init
    } else {
        R.init = NULL
        W1.init = NULL
        W2.init = NULL
        H.init = NULL
        Z.init = NULL
    }
    oldw <- options(warn = -1)
    on.exit(options(oldw), add = TRUE)
    # Calculate the number of cores
    numCores <- min(parallel::detectCores(), nrun)
    cl <- tryCatch({
        parallel::makeForkCluster(numCores)
    }, error = function(e) {
        parallel::makeCluster(numCores)
    })
    doParallel::registerDoParallel(cl)
    if (hvg.use1) {
        X1 <- as.matrix(object@norm.data[[1]][object@var.features[[1]], ])
    } else {
        X1 <- as.matrix(object@norm.data[[1]])
    }
    if (hvg.use2) {
        X2 <- as.matrix(object@norm.data[[2]][object@var.features[[2]], ])
    } else {
        X2 <- as.matrix(object@norm.data[[2]])
    }

    if (!do.fast) {
      outs <- foreach(i = 1:nrun, .packages = c("Matrix")) %dopar% {
        set.seed(rand.seed + i - 1)
        scAImodel(X1, X2, K = K, s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule,
                  R.init = R.init, W1.init = W1.init, W2.init = W2.init, H.init = H.init, Z.init = Z.init)
      }
    } else {
      barcodes <- colnames(X2)
      feature1 <- rownames(X1)
      feature2 <- rownames(X2)
      names_com <- paste0("factor", seq_len(K))

      path <- paste(system.file(package="scAI"), "scAImodel_py.py", sep="/")
      reticulate::source_python(path)
      X1py = r_to_py(X1)
      X2py = r_to_py(X2)
      R.initpy = r_to_py(R.init); W1.initpy = r_to_py(W1.init); W2.initpy = r_to_py(W2.init); H.initpy = r_to_py(H.init); Z.initpy = r_to_py(Z.init)
      K = as.integer(K); maxIter = as.integer(maxIter)
      outs <- pbapply::pbsapply(
        X = 1:nrun,
        FUN = function(x) {
          seed = as.integer(rand.seed + x - 1)
          ptm = Sys.time()
          results = scAImodel_py(X1 = X1py, X2 = X2py, K = K, S = s, Alpha = alpha, Lambda = lambda, Gamma = gamma, Maxiter = maxIter, Stop_rule = stop_rule,
                                 Seed = seed, W1 = W1.initpy, W2 = W2.initpy, H = H.initpy, Z = Z.initpy,R = R.initpy)
          execution.time = Sys.time() - ptm
          names(results) <- c("W1", "W2", "H", "Z", "R")
          attr(results$W1, "dimnames") <- list(feature1, names_com)
          attr(results$W2, "dimnames") <- list(feature2, names_com)
          attr(results$H, "dimnames") <- list(names_com, barcodes)
          attr(results$Z, "dimnames") <- list(barcodes, barcodes)
          results$options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time)
          return(results)
        },
        simplify = FALSE
      )
    }

    objs <- foreach(i = 1:nrun, .combine = c) %dopar% {
        W1 <- outs[[i]]$W1
        W2 <- outs[[i]]$W2
        sum(cor(as.matrix(W1))) + sum(cor(as.matrix(W2)))
    }
    parallel::stopCluster(cl)
    N <- ncol(X1)
    C <- base::matrix(0, N, N)
    for (i in seq_len(nrun)) {
        H <- outs[[i]]$H
        H <- sweep(H, 2, colSums(H), FUN = `/`)
        clusIndex <- apply(H, 2, which.max)
        # compute the consensus matrix
        adjMat <- clust2Mat(clusIndex)
        C <- C + adjMat
    }
    CM <- C/nrun

    if (!keep_all) {
        message(sprintf("The best seed is %d", which.min(objs)))
        outs_final <- outs[[which.min(objs)]]
        #object@agg.data <- outs_final$agg.data
        W = list(W1 = outs_final$W1, W2 = outs_final$W2)
        names(W) <- names(object@norm.data)
        object@fit <- list(W = W, H = outs_final$H, Z = outs_final$Z, R = outs_final$R)
        object <- getAggregatedData(object)
        object@cluster$consensus <- CM
        object@options$cost <- objs
        object@options$paras <- outs_final$options
        object@options$paras$nrun <- nrun
        object@options$best.seed <- which.min(objs)
        return(object)
    } else {
        outs_final <- list()
        outs_final$best <- outs[[which.min(objs)]]
        outs_final$best$consensus <- CM
        outs_final$nruns <- outs
        outs_final$options$cost <- objs
        return(outs_final)
    }
}



#' Solving the optimization problem in scAI
#'
#' @param X1 Single-cell transcriptomic data matrix (norm.data)
#' @param X2 Single-cell epigenomic data matrix (norm.data)
#' @param K Rank of inferred factors
#' @param s Probability of Bernoulli distribution
#' @param alpha model parameter
#' @param lambda model parameter
#' @param gamma model parameter
#' @param maxIter Maximum number of iterations
#' @param stop_rule Stop rule to be used
#' @param R.init initialization of R
#' @param W1.init initialization of W1
#' @param W2.init initialization of W2
#' @param H.init initialization of H
#' @param Z.init initialization of Z
#'
#' @return a list containing the inferred low-rank matrices (W1, W2, H, Z, R) and the options used
#' @export
#'
#' @examples
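#' \dontrun{
#' # Sketch on small random inputs (hypothetical sizes), calling the solver directly:
#' X1 <- matrix(runif(100 * 50), 100, 50)  # genes x cells
#' X2 <- matrix(runif(200 * 50), 200, 50)  # loci x cells
#' out <- scAImodel(X1, X2, K = 3, maxIter = 50)
#' str(out$H)
#' }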
#' @importFrom stats rbinom runif
#' @importFrom rfunctions crossprodcpp
scAImodel <- function(X1, X2, K, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500, stop_rule = 1,
                 R.init = NULL, W1.init = NULL, W2.init = NULL, H.init = NULL, Z.init = NULL) {
  # Initialization W1,W2,H and Z
  p <- nrow(X1)
  n <- ncol(X1)
  q = nrow(X2)
  if (is.null(W1.init)) {
    W1.init = matrix(runif(p * K), p, K)
  }
  if (is.null(W2.init)) {
    W2.init = matrix(runif(q * K), q, K)
  }
  if (is.null(H.init)) {
    H.init = matrix(runif(K * n), K, n)
  }
  if (is.null(Z.init)) {
    Z.init = matrix(runif(n * n), n, n)
  }
  if (is.null(R.init)) {
    R.init = matrix(rbinom(n * n, 1, s), n, n)
  }
  W1 = W1.init
  W2 = W2.init
  H = H.init
  Z = Z.init
  R = R.init

  # start the clock to measure the execution time
  ptm = proc.time()
  eps <- .Machine$double.eps
  onesM_K <- matrix(1, K, K)

  XtX2 <- crossprodcpp(X2)
  index <- which(R == 0)
  for (iter in 1:maxIter) {
    # normalize H
    H = H/rowSums(H)
    # update W1
    HHt <- tcrossprod(H)
    X1Ht <- eigenMapMattcrossprod(X1, H)
    W1HHt <- eigenMapMatMult(W1, HHt)
    W1 <- W1 * X1Ht/(W1HHt + eps)

    # update W2
    ZR <- Z
    ZR[index] <- 0
    ZRHt <- eigenMapMattcrossprod(ZR, H)
    X2ZRHt <- eigenMapMatMult(X2, ZRHt)
    W2HHt <- eigenMapMatMult(W2, HHt)
    W2 = W2 * X2ZRHt/(W2HHt + eps)

    # update H
    W1tX1 <- eigenMapMatcrossprod(W1, X1)
    W2tX2 <- eigenMapMatcrossprod(W2, X2)
    W2tX2ZR <- eigenMapMatMult(W2tX2, ZR)
    HZZt <- eigenMapMatMult(H, Z + t(Z))
    W1tW1 <- crossprodcpp(W1)
    W2tW2 <- crossprodcpp(W2)
    temp1 <- H * (alpha * W1tX1 + W2tX2ZR + lambda * HZZt)
    temp2 <- eigenMapMatMult(alpha * W1tW1 + W2tW2 + 2 * lambda * HHt + gamma * onesM_K, H)
    H <- temp1/(temp2 + eps)

    # update Z
    HtH <- crossprodcpp(H)
    X2tW2H <- eigenMapMatcrossprod(W2tX2, H)
    RX2tW2H = X2tW2H
    RX2tW2H[index] = 0
    XtX2ZR <- eigenMapMatMult(XtX2, ZR)
    XtX2ZRR = XtX2ZR
    XtX2ZRR[index] = 0
    Z = Z * (RX2tW2H + lambda * HtH)/(XtX2ZRR + lambda * Z + eps)

    if (stop_rule == 2) {
      obj = alpha * norm(X1 - W1 %*% H, "F")^2 + norm(X2 %*% (Z * R) - W2 %*% H, "F")^2 + lambda * norm(Z - HtH, "F")^2 + gamma * sum(colSums(H) * colSums(H))
      if (iter > 1 && ((obj_old - obj)/obj_old < 10^(-6)) || iter == maxIter) {
        break
      }
      obj_old = obj
    }
  }
  # compute the execution time
  execution.time = proc.time() - ptm

  barcodes <- colnames(X2)
  feature1 <- rownames(X1)
  feature2 <- rownames(X2)
  names_com <- paste0("factor", seq_len(K))
  attr(W1, "dimnames") <- list(feature1, names_com)
  attr(W2, "dimnames") <- list(feature2, names_com)
  attr(H, "dimnames") <- list(names_com, barcodes)
  attr(Z, "dimnames") <- list(barcodes, barcodes)

  outs <- list(W1 = W1, W2 = W2, H = H, Z = Z, R = R,
               options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time))
  return(outs)

}


#' Generate the aggregated epigenomic data
#'
#' @param object an scAI object after running run_scAI
#' @param group cell group information, if available; when provided, the epigenomic data are aggregated based on these groups instead of the cell-cell similarity matrix learned by scAI
#'
#' @return an scAI object with the aggregated epigenomic data stored in the agg.data slot
#' @export
#'
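#' @examples
#' \dontrun{
#' # Sketch: aggregate the epigenomic profiles using the cell-cell similarity
#' # matrix learned by run_scAI()
#' scAI.obj <- getAggregatedData(scAI.obj)
#' }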
getAggregatedData <- function(object, group = NULL) {
  if (is.null(group)) {
    Z <-  object@fit$Z
  } else {
    if (is.factor(group) | is.character(group)) {
      group <- as.numeric(factor(group))
    }
    Z <- clust2Mat(group)
    diag(Z) <- 1
    object@fit$Z <- Z
  }
  R <- object@fit$R
  ZR <- Z * R
  ZR <- sweep(ZR, 2, colSums(ZR), FUN = `/`)
  X2 <- as.matrix(object@norm.data[[2]])
  X2agg <- eigenMapMatMult(X2, ZR)
  X2agg <- sweep(X2agg, 2, colSums(X2agg), FUN = `/`) * 10000
  X2agg <- log(1 + X2agg)
  attr(X2agg, "dimnames") <- list(rownames(object@norm.data[[2]]), colnames(object@norm.data[[2]]))
  object@agg.data <- X2agg
  return(object)
}



#' Perform dimensional reduction
#'
#' Dimension reduction by PCA, t-SNE or UMAP
#' @param object scAI object
#' @param return.object whether to return the scAI object
#' @param data.use input data
#' @param do.scale whether to scale the data
#' @param do.center whether to center the data when scaling
#' @param method Method of dimensional reduction, one of "pca", "tsne", "FItsne" and "umap"
#' @param rand.seed Set a random seed. By default, sets the seed to 42.
#' @param perplexity perplexity parameter in tsne
#' @param theta parameter in tsne
#' @param check_duplicates parameter in tsne

#' @param FItsne.path File path of FIt-SNE
#' @param dim.embed dimensions of t-SNE embedding
#' @param dim.use num of PCs used for t-SNE
#'
#' @param do.fast whether to do fast PCA
#' @param dimPC the number of components to keep in PCA
#' @param weight.by.var whether to weight the PC scores by the variance
#'
#' @param n.neighbors This determines the number of neighboring points used in
#' local approximations of manifold structure. Larger values will result in more
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
#' @param n.components The dimension of the space to embed into.
#' @param distance This determines the choice of metric used to measure distance in the input space.
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
#' @param learning.rate The initial learning rate for the embedding optimization.
#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
#' that should be assumed to be connected at a local level. The higher this value the more connected
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight being given to negative samples.
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.

#'
#' @return an scAI object with the cell coordinates stored in the embed slot (or the coordinates themselves when return.object = FALSE)
#' @export
#'
#' @examples
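#' \dontrun{
#' # Sketch: embed cells with UMAP using the factor loading matrix H (the default data.use)
#' scAI.obj <- reducedDims(scAI.obj, method = "umap")
#' }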
#' @importFrom Rtsne Rtsne
#' @importFrom reticulate py_module_available py_set_seed import
#'
reducedDims <- function(object, data.use = object@fit$H, do.scale = TRUE, do.center = TRUE, return.object = TRUE, method = "umap",
dim.embed = 2, dim.use = NULL, perplexity = 30, theta = 0.5, check_duplicates = FALSE, rand.seed = 42L,
FItsne.path = NULL,
dimPC = 40,do.fast = TRUE, weight.by.var = TRUE,
n.neighbors = 30L, n.components = 2L, distance = "correlation",n.epochs = NULL,learning.rate = 1.0,min.dist = 0.3,spread = 1.0,set.op.mix.ratio = 1.0,local.connectivity = 1L,
repulsion.strength = 1,negative.sample.rate = 5,a = NULL,b = NULL) {

    data.use <- as.matrix(data.use)
    if (do.scale) {
        data.use <- t(scale(t(data.use), center = do.center, scale = TRUE))
        data.use[is.na(data.use)] <- 0
    }
    if (!is.null(dim.use)) {
        data.use = object@embed$pca[, dim.use]
    }
    if (method == "pca") {
        cell_coords <- runPCA(data.use, do.fast = do.fast, dimPC = dimPC, seed.use = rand.seed, weight.by.var = weight.by.var)
        object@embed$pca <- cell_coords

    } else if (method == "tsne") {
        set.seed(rand.seed)
        cell_coords <- Rtsne::Rtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = check_duplicates)$Y
        rownames(cell_coords) <- rownames(t(data.use))
        object@embed$tsne <- cell_coords

    } else if (method == "FItsne") {
        if (!exists("FItsne")) {
            if (is.null(FItsne.path)) {
                stop("Please pass in path to FIt-SNE directory as FItsne.path.")
            }
            source(paste0(FItsne.path, "/fast_tsne.R"), chdir = T)
        }
        cell_coords <- fftRtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = check_duplicates, rand_seed = rand.seed)
        rownames(cell_coords) <- rownames(t(data.use))
        object@embed$FItsne <- cell_coords

    } else if (method == "umap") {
        cell_coords <- runUMAP(data.use,
        n.neighbors = n.neighbors,
        n.components = n.components,
        distance = distance,
        n.epochs = n.epochs,
        learning.rate = learning.rate,
        min.dist = min.dist,
        spread = spread,
        set.op.mix.ratio = set.op.mix.ratio,
        local.connectivity = local.connectivity,
        repulsion.strength = repulsion.strength,
        negative.sample.rate = negative.sample.rate,
        a = a,
        b = b,
        seed.use = rand.seed
        )
        object@embed$umap <- cell_coords

    } else {
        stop("Error: unrecognized dimensionality reduction method.")
    }
    if (return.object) {
        return(object)
    } else {
        return(cell_coords)
    }

}


#' Dimension reduction using PCA
#'
#' @param data.use input data
#' @param do.fast whether to do fast PCA
#' @param dimPC the number of components to keep
#' @param seed.use set a seed
#' @param weight.by.var whether to weight the PC scores by the variance
#' @importFrom stats prcomp
#' @importFrom irlba irlba
#' @return matrix of cell coordinates in PC space
#' @export
#'
#' @examples
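#' \dontrun{
#' # Sketch on random data (features x cells, with hypothetical cell names):
#' X <- matrix(rnorm(100 * 30), 100, 30, dimnames = list(NULL, paste0("cell", 1:30)))
#' pcs <- runPCA(X, dimPC = 10)
#' }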
runPCA <- function(data.use, do.fast = TRUE, dimPC = 50, seed.use = 42, weight.by.var = TRUE) {
  set.seed(seed = seed.use)
  if (do.fast) {
    dimPC <- min(dimPC, nrow(data.use) - 1)
    pca.res <- irlba::irlba(t(data.use), nv = dimPC)
    sdev <- pca.res$d/sqrt(max(1, ncol(data.use) - 1))
    if (weight.by.var){
      pc.scores <- pca.res$u %*% diag(pca.res$d)
    } else {
      pc.scores <- pca.res$u
    }
  } else {
    dimPC <- min(dimPC, nrow(data.use) - 1)
    pca.res <- stats::prcomp(x = t(data.use), rank. = dimPC)
    sdev <- pca.res$sdev
    if (weight.by.var) {
      pc.scores <- pca.res$x %*% diag(pca.res$sdev[1:dimPC]^2)
    } else {
      pc.scores <- pca.res$x
    }
  }
  rownames(pc.scores) <- colnames(data.use)
  colnames(pc.scores) <- paste0('PC', 1:ncol(pc.scores))
  cell_coords <- pc.scores
  return(cell_coords)
}

#' Perform dimension reduction using UMAP
#'
#' @param data.use input data
#' @param n.neighbors This determines the number of neighboring points used in
#' local approximations of manifold structure. Larger values will result in more
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
#' @param n.components The dimension of the space to embed into.
#' @param distance This determines the choice of metric used to measure distance in the input space.
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
#' @param learning.rate The initial learning rate for the embedding optimization.
#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
#' that should be assumed to be connected at a local level. The higher this value the more connected
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight being given to negative samples.
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
#' @param seed.use Set a random seed. By default, sets the seed to 42.
#' @param metric.kwds A dictionary of arguments to pass on to the metric, such as the p value for Minkowski distance
#' @param angular.rp.forest Whether to use an angular random projection forest to initialise the
#' approximate nearest neighbor search. This can be faster, but is mostly only useful for metrics that
#' use an angular style distance, such as cosine, correlation etc. In the case of those metrics angular forests will be chosen automatically.
#' @param verbose Controls verbosity
#' @details This function is modified from the Seurat package.
#' @return matrix of cell coordinates in UMAP space
#' @export
#' @importFrom reticulate py_module_available py_set_seed import
#' @examples
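#' \dontrun{
#' # Sketch (requires the python 'umap-learn' module); `data.use` is a
#' # hypothetical features x cells matrix with cell names as colnames:
#' emb <- runUMAP(data.use, n.neighbors = 30L, min.dist = 0.3)
#' }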
runUMAP <- function(
  data.use,
  n.neighbors = 30L,
  n.components = 2L,
  distance = "correlation",
  n.epochs = NULL,
  learning.rate = 1.0,
  min.dist = 0.3,
  spread = 1.0,
  set.op.mix.ratio = 1.0,
  local.connectivity = 1L,
  repulsion.strength = 1,
  negative.sample.rate = 5,
  a = NULL,
  b = NULL,
  seed.use = 42L,
  metric.kwds = NULL,
  angular.rp.forest = FALSE,
  verbose = TRUE){
  if (!reticulate::py_module_available(module = 'umap')) {
    stop("Cannot find UMAP, please install through pip (e.g. pip install umap-learn or reticulate::py_install(packages = 'umap-learn')).")
  }
  set.seed(seed.use)
  reticulate::py_set_seed(seed.use)
  umap_import <- reticulate::import(module = "umap", delay_load = TRUE)
  umap <- umap_import$UMAP(
    n_neighbors = as.integer(n.neighbors),
    n_components = as.integer(n.components),
    metric = distance,
    n_epochs = n.epochs,
    learning_rate = learning.rate,
    min_dist = min.dist,
    spread = spread,
    set_op_mix_ratio = set.op.mix.ratio,
    local_connectivity = local.connectivity,
    repulsion_strength = repulsion.strength,
    negative_sample_rate = negative.sample.rate,
    a = a,
    b = b,
    metric_kwds = metric.kwds,
    angular_rp_forest = angular.rp.forest,
    verbose = verbose
  )
  Rumap <- umap$fit_transform
  umap_output <- Rumap(t(data.use))
  colnames(umap_output) <- paste0('UMAP', 1:ncol(umap_output))
  rownames(umap_output) <- colnames(data.use)
  return(umap_output)
}




#' Identify enriched features in each factor
#'
#' Rank features in each factor by factor loading analysis
#' @param object scAI object
#' @param assay Name of assay to be analyzed
#' @param features a vector of features
#' @param cutoff.W Threshold of feature loading values
#' @param cutoff.H Threshold of cell loading values
#' @param thresh.pc Threshold of the percent of cells enriched in one factor
#' @param thresh.fc Threshold of Fold Change
#' @param thresh.p Threshold of p-values
#' @param n.top Number of top features to be returned
#' @importFrom stats sd wilcox.test
#' @importFrom dplyr top_n slice %>%
#' @importFrom future nbrOfWorkers
#' @importFrom future.apply future_sapply
#' @importFrom pbapply pbsapply
#' @importFrom stats p.adjust
#'
#' @return a list containing all the identified factor markers (markers.all) and the top n.top markers per factor (markers.top)
#' @export
#'
#' @examples
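#' \dontrun{
#' # Sketch: rank enriched genes per factor after run_scAI()
#' markers.RNA <- identifyFactorMarkers(scAI.obj, assay = "RNA")
#' head(markers.RNA$markers.top)
#' }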

identifyFactorMarkers <- function(object, assay, features = NULL, cutoff.W = 0.5, cutoff.H = 0.5,
                                  thresh.pc = 0.05, thresh.fc = 0.25, thresh.p = 0.05, n.top = 10) {
    if (assay == "RNA") {
        X <- as.matrix(object@norm.data[[assay]])
    } else {
        X <- as.matrix(object@agg.data)
    }
    H <- object@fit$H
    W <- object@fit$W[[assay]]
    if (is.null(features)) {
        features <- row.names(W)
    }

    H <- sweep(H, 2, colSums(H), FUN = `/`)
    K = ncol(W)
    lib_W <- base::rowSums(W)
    lib_W[lib_W == 0] <- 1
    lib_W[lib_W < mean(lib_W) - 5 * sd(lib_W)] <- 1  #  omit the nearly null rows
    W <- sweep(W, 1, lib_W, FUN = `/`)
    MW <- base::colMeans(W)
    sW <- apply(W, 2, sd)
    # candidate markers for each factor
    IndexW_record <- vector("list", K)
    for (j in 1:K) {
        IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff.W * sW[j])
    }

    my.sapply <- ifelse(
    test = future::nbrOfWorkers() == 1,
    yes = pbapply::pbsapply,
    no = future.apply::future_sapply
    )

    # divided cells into two groups
    mH <- apply(H, 1, mean)
    sH <- apply(H, 1, sd)
    IndexH_record1 <- vector("list", K)
    IndexH_record2 <- vector("list", K)
    for (i in 1:K) {
        IndexH_record1[[i]] <- which(H[i, ] > mH[i] + cutoff.H * sH[i])
        IndexH_record2[[i]] <- base::setdiff(c(1:ncol(H)), IndexH_record1[[i]])
    }

    # identify factor-specific markers
    factor_markers = vector("list", K)
    markersTopn = vector("list", K)
    factors <- c()
    Features <- c()
    Pvalues <- c()
    Log2FC <- c()
    for (i in 1:K) {
        data1 <- X[IndexW_record[[i]], IndexH_record1[[i]]]
        data2 <- X[IndexW_record[[i]], IndexH_record2[[i]]]
        idx1 <- which(base::rowSums(data1 != 0) > thresh.pc * ncol(data1))  # at least expressed in thresh.pc% cells in one group
        FC <- log2(base::rowMeans(data1)/base::rowMeans(data2))
        idx2 <- which(FC > thresh.fc)
        pvalues <- my.sapply(
        X = 1:nrow(x = data1),
        FUN = function(x) {
            return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
        }
        )

        idx3 <- which(pvalues < thresh.p)
        idx = intersect(intersect(idx1, idx2), idx3)

        # order
        FC <- FC[idx]
        c = sort(FC, decreasing = T, index.return = T)$ix
        ri = IndexW_record[[i]]

        Pvalues <- c(Pvalues, pvalues[idx[c]])
        Log2FC <- c(Log2FC, FC[c])
        factors <- c(factors, rep(i, length(c)))
        Features <- c(Features, features[ri[idx[c]]])

    }
    # stopCluster(cl)
    markers.all <- data.frame(factors = factors, features = Features, pvalues = Pvalues, log2FC = Log2FC,
                              stringsAsFactors = FALSE)
    rownames(markers.all) <- make.unique(Features)
    markers.top <- markers.all %>% dplyr::group_by(factors) %>% top_n(n.top, log2FC) %>% dplyr::slice(1:n.top)

    markers = list(markers.all = markers.all, markers.top = markers.top)

    return(markers)
}



#' Compute the 2D coordinates of embedded cells, genes, loci and factors for the VscAI visualization
#'
#' @param object scAI object
#' @param genes.embed A vector of genes to be embedded
#' @param loci.embed A vector of loci to be embedded
#' @param alpha.embed Parameter controlling the distance between the cells and the factors
#' @param snn.embed Number of neighbors
#'
#' @return an scAI object with the VscAI coordinates (cells, factors, genes, loci) in the embed slot
#' @export
#'
#' @examples
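#' \dontrun{
#' # Sketch: embed cells, factors and a few genes of interest with VscAI
#' # ("Gene1"/"Gene2" are hypothetical gene names present in the data)
#' scAI.obj <- getEmbeddings(scAI.obj, genes.embed = c("Gene1", "Gene2"))
#' }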
#' @importFrom Matrix Matrix
getEmbeddings <- function(object, genes.embed = NULL, loci.embed = NULL, alpha.embed = 1.9, snn.embed = 5) {
  H <- object@fit$H
  W1 <- object@fit$W[[1]]
  W2 <- object@fit$W[[2]]
  Z <- object@fit$Z

  if (nrow(H) < 3 & length(object@fit.variedK) == 0) {
    print("VscAI needs at least three factors for embedding. Now rerun scAI with rank being 3...")
    outs <- run_scAI(object, K = 3, nrun = object@options$paras$nrun,
                     s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
                     maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
    W <- outs@fit$W
    H <- outs@fit$H
    Z <- outs@fit$Z
    W1 <- W[[1]]
    W2 <- W[[2]]
    object@fit.variedK$W <- W
    object@fit.variedK$H <- H
    object@fit.variedK$Z <- Z
  } else if (nrow(H) < 3 & length(object@fit.variedK) != 0) {
    print("Using the previous calculated factors for embedding.")
    W1 <- object@fit.variedK$W[[1]]
    W2 <- object@fit.variedK$W[[2]]
    H <- object@fit.variedK$H
    Z <- object@fit.variedK$Z

  }

  nmf.scores <- H

  Zsym <- Z/max(Z)
  Zsym[Zsym < 10^(-6)] <- 0
  Zsym <- (Zsym + t(Zsym))/2
  diag(Zsym) <- 1
  rownames(Zsym) <- colnames(H)
  colnames(Zsym) <- rownames(Zsym)

  alpha.exp <- alpha.embed  # Increase this > 1.0 to move the cells closer to the factors. Values > 2 start to distort the data.
  snn.exp <- snn.embed  # Lower this < 1.0 to move similar cells closer to each other
  n_pull <- nrow(H)  # The number of factors pulling on each cell. Must be at least 3.

  Zsym <- Matrix(data = Zsym, sparse = TRUE)
  snn <- Zsym
  swne.embedding <- swne::EmbedSWNE(nmf.scores, snn, alpha.exp = alpha.exp, snn.exp = snn.exp, n_pull = n_pull, proj.method = "sammon", dist.use = "cosine")
  # project cells and factors
  factor_coords <- swne.embedding$H.coords
  cell_coords <- swne.embedding$sample.coords
  rownames(factor_coords) <- rownames(H)
  rownames(cell_coords) <- colnames(H)

  # project genes onto the low dimension space by VscAI
  if (is.null(genes.embed)) {
    genes.embed <- rownames(W1)
  } else {
    genes.embed <- as.character(as.vector(as.matrix((genes.embed))))
  }

  swne.embedding <- swne::EmbedFeatures(swne.embedding, W1, genes.embed, n_pull = n_pull)
  gene_coords <- swne.embedding$feature.coords
  rownames(gene_coords) <- gene_coords$name

  # project loci
  if (is.null(loci.embed)) {
    loci.embed <- rownames(W2)
  } else {
    loci.embed <- as.character(as.vector(as.matrix((loci.embed))))
  }

  swne.embedding <- swne::EmbedFeatures(swne.embedding, W2, loci.embed, n_pull = n_pull)
  loci_coords <- swne.embedding$feature.coords
  rownames(loci_coords) <- loci_coords$name

  object@embed$VscAI$cells <- cell_coords
  object@embed$VscAI$factors <- factor_coords
  object@embed$VscAI$genes <- gene_coords
  object@embed$VscAI$loci <- loci_coords
  return(object)
}



#' Identify cell clusters
#'
#' @param object scAI object
#' @param partition.type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
#' @param seed.use set seed
#' @param n.iter number of iteration
#' @param initial.membership Parameters to pass to the Python leidenalg function; defaults to initial_membership=None
#' @param weights defaults weights=None
#' @param node.sizes Parameters to pass to the Python leidenalg function
#' @param resolution A parameter controlling the coarseness of the clusters
#' @param K Number of clusters if performing hierarchical clustering of the consensus matrix
#' @return an scAI object with the cluster assignments stored in the identity slot and in pData
#' @export
#'
#' @examples
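#' \dontrun{
#' # Sketch: Leiden clustering on the factor matrix H
#' scAI.obj <- identifyClusters(scAI.obj, resolution = 1)
#' # or hierarchical clustering of the consensus matrix into K clusters
#' scAI.obj <- identifyClusters(scAI.obj, K = 5)
#' }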
#' @importFrom stats as.dist cophenetic cor cutree dist hclust
identifyClusters <- function(object, resolution = 1, partition.type = "RBConfigurationVertexPartition",
                             seed.use = 42L,n.iter = 10L,
                             initial.membership = NULL, weights = NULL, node.sizes = NULL,
                             K = NULL) {
  if (is.null(K) & !is.null(resolution)) {
    data.use <- object@fit$H
    data.use <- as.matrix(data.use)
    data.use <- t(scale(t(data.use), center = TRUE, scale = TRUE))
    snn <- swne::CalcSNN(data.use, k = 20, prune.SNN = 1/15)
    idents <- runLeiden(SNN = snn, resolution = resolution, partition_type = partition.type,
    seed.use = seed.use, n.iter = n.iter,
    initial.membership = initial.membership, weights = weights, node.sizes = node.sizes)
  } else {
    CM <- object@cluster$consensus
    d <- as.dist(1 - CM)
    hc <- hclust(d, "ave")
    idents <- cutree(hc, k = K)
    coph <- cophenetic(hc)
    cs <- cor(d, coph)
    object@cluster$cluster.stability <- cs
  }
  names(idents) <- rownames(object@pData)
  object@identity <- factor(idents)
  object@pData$cluster <- factor(idents)
  return(object)
}


#' Run the Leiden clustering algorithm
#'
#' This code is modified from Tom Kelly (https://github.com/TomKellyGenetics/leiden); we added more parameters (seed.use and n.iter) to run the Python version, and we also take care of the singleton issue after running the Leiden algorithm.
#' @description Implements the Leiden clustering algorithm in R using reticulate to run the Python version. Requires the python "leidenalg" and "igraph" modules to be installed. Returns a vector of partition indices.
#' @param SNN An adjacency matrix compatible with \code{\link[igraph]{igraph}} object.
#' @param seed.use set seed
#' @param n.iter number of iteration
#' @param initial.membership Parameters to pass to the Python leidenalg function; defaults to initial_membership=None
#' @param node.sizes Parameters to pass to the Python leidenalg function
#' @param partition_type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
#' @param resolution A parameter controlling the coarseness of the clusters
#' @param weights defaults weights=None
#' @return A partition of clusters as a vector of integers
#'
#' @keywords graph network igraph mvtnorm simulation
#' @importFrom reticulate import r_to_py
#' @export

runLeiden <- function(SNN = matrix(), resolution = 1, partition_type = c(
  'RBConfigurationVertexPartition',
  'ModularityVertexPartition',
  'RBERVertexPartition',
  'CPMVertexPartition',
  'MutableVertexPartition',
  'SignificanceVertexPartition',
  'SurpriseVertexPartition'),
seed.use = 42L,
n.iter = 10L,
initial.membership = NULL, weights = NULL, node.sizes = NULL) {
  if (!reticulate::py_module_available(module = 'leidenalg')) {
    stop("Cannot find Leiden algorithm, please install through pip (e.g. pip install leidenalg).")
  }

  #import python modules with reticulate
  leidenalg <- import("leidenalg", delay_load = TRUE)
  ig <- import("igraph", delay_load = TRUE)

  resolution_parameter <- resolution
  initial_membership <- initial.membership
  node_sizes <- node.sizes
  #convert matrix input (corrects for sparse matrix input)
  adj_mat <- as.matrix(SNN)

  #compute weights if non-binary adjacency matrix given
  is_pure_adj <- all(as.logical(adj_mat) == adj_mat)
  if (is.null(weights) && !is_pure_adj) {
    #assign weights to edges (without dependency on igraph)
    weights <- t(adj_mat)[t(adj_mat)!=0]
    #remove zeroes from rows of matrix and return vector of length edges
  }

  ##convert to python numpy.ndarray, then a list
  adj_mat_py <- r_to_py(adj_mat)
  adj_mat_py <- adj_mat_py$tolist()

  #convert graph structure to a Python compatible object
  GraphClass <- if (!is.null(weights) && !is_pure_adj){
    ig$Graph$Weighted_Adjacency
  } else {
    ig$Graph$Adjacency
  }
  snn_graph <- GraphClass(adj_mat_py)

  #compute partitions
  partition_type <- match.arg(partition_type)
  part <- switch(
    EXPR = partition_type,
    'RBConfigurationVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$RBConfigurationVertexPartition,
      initial_membership = initial.membership, weights = weights,
      resolution_parameter = resolution,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'ModularityVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$ModularityVertexPartition,
      initial_membership = initial.membership, weights = weights,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'RBERVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$RBERVertexPartition,
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
      resolution_parameter = resolution_parameter,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'CPMVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$CPMVertexPartition,
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
      resolution_parameter = resolution,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'MutableVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$MutableVertexPartition,
      initial_membership = initial.membership,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'SignificanceVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$SignificanceVertexPartition,
      initial_membership = initial.membership, node_sizes = node.sizes,
      resolution_parameter = resolution,
      n_iterations = n.iter,
      seed = seed.use
    ),
    'SurpriseVertexPartition' = leidenalg$find_partition(
      snn_graph,
      leidenalg$SurpriseVertexPartition,
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
      n_iterations = n.iter,
      seed = seed.use
    ),
    stop("please specify a partition type as a string out of those documented")
  )
  partition <- part$membership+1
  idents <- partition

  if (min(table(idents)) == 1) {
    idents <- assignSingletons(idents, SNN)
  }
  idents <- factor(idents)
  names(idents) <- row.names(SNN)
  return(idents)
}

# Group single cells that make up their own cluster in with the cluster they are most connected to.
#
# @param idents  clustering result
# @param SNN     SNN graph
# @return        Returns scAI object with all singletons merged with most connected cluster

assignSingletons <- function(idents, SNN) {
  # identify singletons
  singletons <- c()
  for (cluster in unique(idents)) {
    if (length(which(idents %in% cluster)) == 1) {
      singletons <- append(x = singletons, values = cluster)
    }
  }
  #singletons = names(table(idents))[which(table(idents)==1)]
  # calculate connectivity of singletons to other clusters, add singleton to cluster it is most connected to
  cluster_names <- unique(x = idents)
  cluster_names <- setdiff(x = cluster_names, y = singletons)
  connectivity <- vector(mode="numeric", length = length(x = cluster_names))
  names(x = connectivity) <- cluster_names
  for (i in singletons) {
    print(i)
    for (j in cluster_names) {
      subSNN = SNN[
        which(idents %in% i),
        which(idents %in% j)
        ]
      if (is.object(x = subSNN)) {
        connectivity[j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
      } else {
        connectivity[j] <- mean(x = subSNN)
      }
    }
    m <- max(connectivity, na.rm = T)
    mi <- which(x = connectivity == m, arr.ind = TRUE)
    closest_cluster <- sample(x = names(x = connectivity[mi]), 1)
    idents[which(idents %in% i)] <- closest_cluster
  }
  if (length(x = singletons) > 0) {
    message(paste(
      length(x = singletons),
      "singletons identified.",
      length(x = unique(idents)),
      "final clusters."
    ))
  }
  return(idents)
}


#' Convert a membership vector into an adjacency matrix
#'
#' @param memb Membership vector
#'
#' @return an N x N adjacency matrix in which entry (i, j) is 1 if cells i and j belong to the same cluster (diagonal set to 0)
#' @export
#'
#' @examples
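#' # Two clusters of two cells each; same-cluster pairs get 1, the diagonal 0
#' clust2Mat(c(1, 1, 2, 2))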
clust2Mat <- function(memb) {
  N <- length(memb)
  return(as.numeric(outer(memb, memb, FUN = "==")) - outer(1:N, 1:N, "=="))
}


#' Identify markers in each cell cluster
#'
#' @param object scAI object
#' @param assay Name of assay to be analyzed
#' @param features a vector of features
#' @param use.agg whether to use the aggregated epigenomic data
#' @param test.use which test to use ("bimod" or "wilcox")
#' @param thresh.pc Threshold of the percent of cells enriched in one cluster
#' @param thresh.fc Threshold of Fold Change
#' @param thresh.p Threshold of p-values
#' @importFrom future nbrOfWorkers
#' @importFrom pbapply pbsapply
#' @importFrom future.apply future_sapply
#' @importFrom stats sd wilcox.test
#' @importFrom dplyr top_n slice
#' @importFrom stats p.adjust
#'
#' @return a data frame of the identified markers for each cluster
#' @export
#'
#' @examples
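#' \dontrun{
#' # Sketch: markers of each cluster in the RNA assay, after identifyClusters()
#' markers <- identifyClusterMarkers(scAI.obj, assay = "RNA", test.use = "bimod")
#' }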
identifyClusterMarkers <- function(object, assay, features = NULL, use.agg = TRUE, test.use = "bimod",
thresh.pc = 0.25, thresh.fc = 0.25, thresh.p = 0.01) {
    if (assay == "RNA") {
        X <- object@norm.data[[assay]]
    } else {
        if (use.agg) {
            X <- object@agg.data
        } else {
            X <- object@norm.data[[assay]]
        }
    }
    if (is.null(features)) {
        features.use <- row.names(X)
    } else {
        features.use <- intersect(features, row.names(X))
    }
    data.use <- X[features.use,]
    data.use <- as.matrix(data.use)

    groups <- object@identity
    labels <- groups
    level.use <- levels(labels)
    numCluster <- length(level.use)

    my.sapply <- ifelse(
    test = future::nbrOfWorkers() == 1,
    yes = pbapply::pbsapply,
    no = future.apply::future_sapply
    )


    mean.fxn <- function(x) {
        return(log(x = mean(x = expm1(x = x)) + 1))
    }
    genes.de <- vector("list", length = numCluster)
    for (i in 1:numCluster) {
        features <- features.use
        cell.use1 <- which(labels %in% level.use[i])
        cell.use2 <- base::setdiff(1:length(labels), cell.use1)

        # feature selection (based on percentages)
        thresh.min <- 0
        pct.1 <- round(
            x = rowSums(x = data.use[features, cell.use1, drop = FALSE] > thresh.min) /
                length(x = cell.use1),
            digits = 3
        )
        pct.2 <- round(
            x = rowSums(x = data.use[features, cell.use2, drop = FALSE] > thresh.min) /
                length(x = cell.use2),
            digits = 3
        )
        data.alpha <- cbind(pct.1, pct.2)
        colnames(x = data.alpha) <- c("pct.1", "pct.2")
        alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = max)
        names(x = alpha.min) <- rownames(x = data.alpha)
        features <- names(x = which(x = alpha.min > thresh.pc))
        if (length(x = features) == 0) {
            stop("No features pass thresh.pc threshold")
        }

        # feature selection (based on average difference)
        data.1 <- apply(X = data.use[features, cell.use1, drop = FALSE], MARGIN = 1, FUN = mean.fxn)
        data.2 <- apply(X = data.use[features, cell.use2, drop = FALSE], MARGIN = 1, FUN = mean.fxn)
        FC <- (data.1 - data.2)
        features.diff <- names(x = which(x = FC > thresh.fc))
        features <- intersect(x = features, y = features.diff)
        if (length(x = features) == 0) {
            stop("No features pass thresh.fc threshold")
        }

        data1 <- data.use[features, cell.use1, drop = FALSE]
        data2 <- data.use[features, cell.use2, drop = FALSE]

        if (test.use == "wilcox") {
            pvalues <- unlist(
            x = my.sapply(
            X = 1:nrow(x = data1),
            FUN = function(x) {
                return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
            }
            )
            )

        } else if (test.use == 'bimod') {
            pvalues <- unlist(
            x = my.sapply(
            X = 1:nrow(x = data1),
            FUN = function(x) {
                return(DifferentialLRT(
                x = as.numeric(x = data1[x,]),
                y = as.numeric(x = data2[x,])
                ))
            }
            )
            )
        }

        pval.adj = stats::p.adjust(
        p = pvalues,
        method = "bonferroni",
        n = nrow(X)
        )
        genes.de[[i]] <- data.frame(clusters = level.use[i], features = as.character(rownames(data1)), pvalues = pvalues, pval_adj = pval.adj, logFC = FC[features], data.alpha[features,])
    }

    markers.all <- data.frame()
    for (i in 1:numCluster) {
        gde <- genes.de[[i]]
        gde <- gde[order(gde$pvalues, -gde$logFC), ]
        gde <- subset(gde, subset = pvalues < thresh.p)
        if (nrow(gde) > 0) {
            markers.all <- rbind(markers.all, gde)
        }
    }
    markers.all$features <- as.character(markers.all$features)
    return(markers.all)
}

# function to run the McDavid et al. DE test
#' likelihood ratio test
#'
#' @param x a vector
#' @param y a vector
#' @param xmin threshold for the values in the vector
#'
#'
#' @importFrom stats pchisq
DifferentialLRT <- function(x, y, xmin = 0) {
  lrtX <- bimodLikData(x = x, xmin = xmin)
  lrtY <- bimodLikData(x = y, xmin = xmin)
  lrtZ <- bimodLikData(x = c(x, y), xmin = xmin)
  lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
  return(pchisq(q = lrt_diff, df = 3, lower.tail = F))
}

#' likelihood ratio test
#' @importFrom stats sd dnorm
#' @param x a vector
#' @param xmin threshold for the values in the vector

bimodLikData <- function(x, xmin = 0) {
  x1 <- x[x <= xmin]
  x2 <- x[x > xmin]
  xal <- length(x = x2) / length(x = x)
  xal[xal > 1 - 1e-5] <- 1 - 1e-5
  xal[xal < 1e-5] <- 1e-5
  likA <- length(x = x1) * log(x = 1 - xal)
  if (length(x = x2) < 2) {
    mysd <- 1
  } else {
    mysd <- sd(x = x2)
  }
  likB <- length(x = x2) *
    log(x = xal) +
    sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
  return(likA + likB)
}


#' Infer regulatory relationships
#'
#' @param object scAI object
#' @param gene.use genes to be inferred
#' @param candinate_loci candidate loci, e.g., the output of identifyFactorMarkers() on the epigenomic assay
#' @param cutoff_H threshold of coefficient values
#' @param cutoff the difference of correlation to be considered as significant
#' @param thresh_corr threshold of correlation coefficient
#'
#' @return a list, one element per gene, giving for each factor the linked loci and the intensity (correlation) of each link
#' @export
#'
#' @examples
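#' \dontrun{
#' # Sketch: candidate loci come from identifyFactorMarkers() on the epigenomic
#' # assay; "Gene1"/"Gene2" are hypothetical genes present in the RNA data
#' loci.markers <- identifyFactorMarkers(scAI.obj, assay = "ATAC")
#' regs <- inferRegulations(scAI.obj, gene.use = c("Gene1", "Gene2"),
#'                          candinate_loci = loci.markers)
#' }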
inferRegulations <- function(object, gene.use, candinate_loci, cutoff_H = 0.5, cutoff = 0.1, thresh_corr = 0.2) {
  H <- as.matrix(object@fit$H)
  H <- sweep(H, 2, colSums(H), FUN = `/`)
  K = nrow(H)
  mH <- apply(H, 1, mean)
  sH <- apply(H, 1, sd)
  IndexH_record <- vector("list", K)
  for (i in 1:K) {
    IndexH_record[[i]] <- which(H[i, ] > mH[i] + cutoff_H * sH[i])
  }
  gene.use <- as.character(gene.use)
  X1 <- object@norm.data[[1]]
  X1 <- X1[gene.use, ]
  X2a <- object@agg.data

  regulations <- vector("list", length(gene.use))
  names(regulations) <- gene.use
  for (i in 1:length(gene.use)) {
    regulations_i = vector("list", K)
    names(regulations_i) <- rownames(H)
    for (j in 1:K) {
      loci_j <- candinate_loci$markers.all$features[candinate_loci$markers.all$factors == j]
      # compute the correlation between
      x1 <- X1[i, ]
      x2a <- X2a[loci_j, ]
      cors1 <- cor(x1, t(x2a))

      # set the values of this gene and its candidate loci to zero
      X1_new <- X1
      X2a_new <- X2a
      X1_new[gene.use[i], IndexH_record[[j]]] <- 0
      X2a_new[loci_j, IndexH_record[[j]]] <- 0
      x1_new <- X1_new[gene.use[i], ]
      x2a_new <- X2a_new[loci_j, ]
      cors2 <- cor(x1_new, t(x2a))
      cors3 <- cor(x1, t(x2a_new))
      cors1[is.na(cors1)] <- 0
      cors2[is.na(cors2)] <- 0
      cors3[is.na(cors3)] <- 0
      D <- rbind(cors1 - cors2, cors1 - cors3)
      flag <- (rowSums(abs(D) > cutoff) > 0) & abs(cors1) > thresh_corr
      regulations_i[[j]]$link <- as.character(loci_j[flag])
      regulations_i[[j]]$intensity <- cors1[flag]
    }
    regulations[[i]] <- regulations_i
  }
  return(regulations)
}



#' Select highly variable features
#'
#' @param object scAI object
#' @param assay Name of assay
#' @param do.plot Whether to show the plot
#' @param do.text Whether to add feature names to the plot
#' @param x.low.cutoff The minimum expression level
#' @param x.high.cutoff The maximum expression level
#' @param y.cutoff The minimum Fano factor value
#' @param y.high.cutoff The maximum Fano factor value
#' @param num.bin Number of bins
#' @param pch.use Shape of dots in ggplot
#' @param col.use Color of dots
#' @param cex.text.use Size of text
#'
#' @return an scAI object with the selected features stored in the var.features slot
#' @export
#'
#' @examples
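#' \dontrun{
#' # Sketch: flag highly variable genes in the RNA assay
#' scAI.obj <- selectFeatures(scAI.obj, assay = "RNA")
#' }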
#' @importFrom graphics smoothScatter text
selectFeatures <- function(object, assay = "RNA", do.plot = TRUE, do.text = TRUE,
x.low.cutoff = 0.01, x.high.cutoff = 3.5, y.cutoff = 0.5, y.high.cutoff = Inf,
num.bin = 20, pch.use = 16, col.use = "black", cex.text.use = 0.5) {
    # This function is modified from Seurat Package
    data <- object@norm.data[[assay]]
    genes.use <- rownames(data)

    gene.mean <- apply(X = data, MARGIN = 1, FUN = ExpMean)
    gene.dispersion <- apply(X = data, MARGIN = 1, FUN = LogVMR)
    gene.dispersion[is.na(x = gene.dispersion)] <- 0
    gene.mean[is.na(x = gene.mean)] <- 0
    names(x = gene.mean) <- genes.use
    data_x_bin <- cut(x = gene.mean, breaks = num.bin)
    names(x = data_x_bin) <- names(x = gene.mean)
    mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = mean)
    sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = sd)
    gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)])/sd_y[as.numeric(x = data_x_bin)]
    gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
    names(x = gene.dispersion.scaled) <- names(x = gene.mean)

    mv.df <- data.frame(gene.mean, gene.dispersion, gene.dispersion.scaled)
    rownames(x = mv.df) <- rownames(x = data)
    hvg.info <- mv.df
    names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(hvg.info)
    pass.cutoff <- names(x = gene.mean)[which(x = ((gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff)) & (gene.dispersion.scaled > y.cutoff) & (gene.dispersion.scaled < y.high.cutoff))]
    if (do.plot) {
        smoothScatter(x = gene.mean, y = gene.dispersion.scaled, pch = pch.use, cex = 0.5, col = col.use, xlab = "Average expression", ylab = "Dispersion", nrpoints = Inf)
    }
    if (do.text) {
        text(x = gene.mean[pass.cutoff], y = gene.dispersion.scaled[pass.cutoff], labels = pass.cutoff, cex = cex.text.use)
    }
    hvg.info <- hvg.info[order(hvg.info$gene.dispersion, decreasing = TRUE), ]

    if (length(object@var.features) == 0) {
        for (i in 1:length(object@raw.data)) {
            object@var.features[[i]] <- vector("character")
        }
        names(object@var.features) <- names(object@raw.data)
    }

    object@var.features[[assay]] <- pass.cutoff
    return(object)

}

#' Find chromosome regions of genes
#'
#' @param genes a given set of genes
#' @param species mouse or human
#' @importFrom biomaRt useMart getBM
#' @export
#'
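#' @examples
#' \dontrun{
#' # Sketch (queries Ensembl via biomaRt, so it needs internet access):
#' loci <- searchGeneRegions(c("Sox2", "Pou5f1"), species = "mouse")
#' }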
searchGeneRegions <- function(genes, species = "mouse") {
    if (species == "mouse"){
        mouse = biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
        loci <- biomaRt::getBM(attributes = c( "mgi_symbol","chromosome_name",'start_position','end_position'), filters = "mgi_symbol", values = genes, mart = mouse)
    } else{
        human = biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
        loci <- biomaRt::getBM(attributes = c( "hgnc_symbol","chromosome_name",'start_position','end_position'), filters = "hgnc_symbol", values = genes, mart = human)
    }
    return(loci)
}

#' Find the nearby loci in a given loci set for a given set of genes
#'
#' @param genes a given set of genes
#' @param loci the background loci set
#' @param width the width from TSS
#' @param species mouse or human
#' @importFrom bedr bedr.sort.region in.region
#' @export
#'
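#' @examples
#' \dontrun{
#' # Sketch: `all.loci` is a hypothetical character vector of regions such as
#' # "chr1:3000000-3000500"; keep loci within 50 kb of the genes (requires bedtools)
#' nearby <- findNearbyLoci(genes = "Sox2", loci = all.loci, width = 50000)
#' }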

findNearbyLoci <- function(genes, loci, width = 50000, species = "mouse") {
    gene.loci <- searchGeneRegions(genes, species)
    gene.loci <- gene.loci[gene.loci$chromosome_name %in% c(1:22, "X","Y"), ]
    gene.loci$start_position <- gene.loci$start_position-min(width, gene.loci$start_position)
    gene.loci$end_position <- gene.loci$end_position + width
    gene.loci.bed <- paste0("chr",gene.loci$chromosome_name, ":", gene.loci$start_position, "-",gene.loci$end_position)
    gene.loci.bed.sort <- bedr::bedr.sort.region(gene.loci.bed);
    # keep only the loci located on standard chromosomes
    loci <- loci[grepl(paste(paste0("chr", c(1:22, "X","Y"), ":"), collapse="|"), loci)]

    loci.sort <- bedr::bedr.sort.region(loci);
    is.region <- bedr::in.region(loci.sort, gene.loci.bed.sort);
    loci.nearby <- loci.sort[is.region]
    return(loci.nearby)
}

#' Select number of the rank
#'
#' @param object scAI object
#' @param rangeK A predefined range of K
#'
#' @return a ggplot object showing the stability score (cophenetic correlation coefficient) for each K
#' @export
#' @importFrom ggplot2 ggplot aes geom_line geom_point theme_classic labs theme element_text
#'
#' @examples
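#' \dontrun{
#' # Sketch: scan ranks 2 to 10 and plot the cophenetic stability score
#' selectK(scAI.obj, rangeK = 2:10)
#' }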
selectK <- function(object, rangeK = c(2:15)) {
  coph <- vector("double", length(rangeK))
  i <- 0
  for (k in rangeK) {
    i <- i+1
    outs <- run_scAI(object, K = k, nrun = object@options$paras$nrun,
                     s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
                     maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
    CM <- outs@cluster$consensus
    d1 <- dist(CM)
    hc <- hclust(d1, method = "average")
    d2 <- cophenetic(hc)
    coph[i] <- cor(d1, d2)
  }

  df <- data.frame(k = rangeK, Coph = coph)
  gg <- ggplot(df, aes(x = k, y = Coph)) + geom_line(size = 1) +
    geom_point() +
    theme_classic() + scAI_theme_opts() +
    theme(text = element_text(size = 10)) +
    labs(x = 'K', y = 'Stability score (Coph)') +
    theme(legend.position = "right", legend.title = NULL)
  gg
}



#' Reorder features according to the loading values
#'
#' @param W Basis matrix
#' @param cutoff Threshold of the loading values
#'
#' @return a list containing the reordered basis matrix (W) and the row indices of the features enriched in each factor (index_record)
#' @export
#'
#' @examples
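#' W <- matrix(runif(50 * 3), 50, 3)
#' out <- reorderFeatures(W, cutoff = 0.5)
#' str(out$index_record)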
reorderFeatures <- function(W, cutoff = 0.5) {
  M <- nrow(W)
  K <- ncol(W)
  MW <- colMeans(W)
  sW <- apply(W, 2, sd)
  # collect the enriched features of each factor, skipping features already assigned to an earlier factor
  IndexW_record <- vector("list", K)
  assigned <- integer(0)
  for (j in seq_len(K)) {
    idx <- which(W[, j] > MW[j] + cutoff * sW[j])
    IndexW_record[[j]] <- setdiff(idx, assigned)
    assigned <- c(assigned, IndexW_record[[j]])
  }
  # place the enriched features of each factor first, followed by the remaining features
  ordering <- c(assigned, setdiff(seq_len(M), assigned))
  W <- W[ordering, , drop = FALSE]
  results <- list()
  results[["W"]] <- W
  results[["index_record"]] <- IndexW_record
  return(results)
}

#' Calculate the variance to mean ratio of logged values
#'
#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
#' log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the VMR in log-space
#'
#' @importFrom stats var
#'
#' @export
#'
#' @examples
#' LogVMR(x = c(1, 2, 3))
#'
LogVMR <- function(x) {
    return(log(x = var(x = exp(x = x) - 1) / mean(x = exp(x = x) - 1)))
}

#' Calculate the mean of logged values
#'
#' Calculate mean of logged values in non-log space (return answer in log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the mean in log-space
#'
#' @export
#'
#' @examples
#' ExpMean(x = c(1, 2, 3))
#'
ExpMean <- function(x) {
    return(log(x = mean(x = exp(x = x) - 1) + 1))
}



#' Convert a sparse matrix to a dense matrix
#'
#' @param mat A sparse matrix
#' @export
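#' @examples
#' \dontrun{
#' # Sketch: densify a small random sparse matrix (compiles the Rcpp kernel on
#' # the first call)
#' m <- Matrix::rsparsematrix(5, 4, density = 0.3)
#' as_matrix(m)
#' }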

as_matrix <- function(mat){
  Rcpp::sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
    // [[Rcpp::export]]
    NumericMatrix asMatrix(NumericVector rp,
    NumericVector cp,
    NumericVector z,
    int nrows,
    int ncols){

        int k = z.size() ;

        NumericMatrix mat(nrows, ncols);  // numeric, so non-integer values (e.g. normalized data) are preserved

        for (int i = 0; i < k; i++){
            mat(rp[i],cp[i]) = z[i];
        }

        return mat;
    }
    ' )

    row_pos <- mat@i
    col_pos <- findInterval(seq(mat@x)-1,mat@p[-1])

    tmp <- asMatrix(rp = row_pos, cp = col_pos, z = mat@x,
    nrows =  mat@Dim[1], ncols = mat@Dim[2])

    row.names(tmp) <- mat@Dimnames[[1]]
    colnames(tmp) <- mat@Dimnames[[2]]
    return(tmp)
}