#' The scAI Class
#'
#' The scAI object is created from paired single-cell transcriptomic and epigenomic data.
#' It takes a list of two digital data matrices as input, with genes/loci in rows and cells in columns. Row names and column names should be included.
#' The class provides functions for data preprocessing, integrative analysis, and visualization.
#'
#' The key slots used in the scAI object are described below.
#'
#' @slot raw.data List of raw data matrices, one per dataset (Genes/loci should be in rows and cells in columns)
#' @slot norm.data List of normalized matrices (genes/loci by cells)
#' @slot agg.data Aggregated epigenomic data within similar cells
#' @slot scale.data List of scaled matrices
#' @slot pData data frame storing the information associated with each cell
#' @slot var.features List of informative features to be used, one giving informative genes and the other giving informative loci
#' @slot fit List of inferred low-rank matrices, including W1, W2, H, Z, R
#' @slot fit.variedK List of inferred low-rank matrices when varying the rank K
#' @slot embed List of the reduced 2D coordinates, one per method, e.g., t-SNE/FIt-SNE/UMAP
#' @slot identity A factor defining the cell identity
#' @slot cluster List of consensus clustering results
#' @slot options List of parameters used throughout analysis
#'
#' @exportClass scAI
#' @importFrom Rcpp evalCpp
#' @importFrom methods setClass
#' @useDynLib scAI
scAI <- methods::setClass("scAI",
slots = c(raw.data = "list",
norm.data = "list",
agg.data = "matrix",
scale.data = "list",
pData = "data.frame",
var.features = "list",
fit = "list",
fit.variedK = "list",
embed = "list",
identity = "factor",
cluster = "list",
options = "list")
)
#' show method for scAI
#'
#' @param object an scAI object
#' @docType methods
#'
setMethod(f = "show", signature = "scAI", definition = function(object) {
cat("An object of class", class(object), "\n", length(object@raw.data), "datasets.\n")
invisible(x = NULL)
})
#' Create a new scAI object
#'
#' @param raw.data List of raw data matrices: paired single-cell transcriptomic and epigenomic data
#' @param do.sparse whether to store the data in sparse matrix format
#'
#' @return an scAI object
#' @export
#'
#' @examples
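#' \dontrun{
#' # A minimal usage sketch; `rna.counts` and `atac.counts` are hypothetical
#' # genes/loci-by-cells matrices sharing cell barcodes in their column names.
#' scai <- create_scAIobject(raw.data = list(RNA = rna.counts, ATAC = atac.counts))
#' }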
#' @importFrom methods as new
create_scAIobject <- function(raw.data, do.sparse = TRUE) {
if (do.sparse) {
# store each matrix in compressed sparse column format to save memory
raw.data <- lapply(raw.data, function(x) {
as(as.matrix(x), "dgCMatrix")
})
}
object <- methods::new(Class = "scAI", raw.data = raw.data)
return(object)
}
#' Preprocess the raw data, including quality control and normalization
#'
#' @param object scAI object
#' @param assay List of assay names to be normalized
#' @param minFeatures Filter out cells with fewer than minFeatures expressed features
#' @param minCells Filter out features expressed in fewer than minCells cells
#' @param minCounts Filter out cells with total counts < minCounts
#' @param maxCounts Filter out cells with total counts > maxCounts
#' @param libararyflag Whether to perform library size normalization
#' @param logNormalize Whether to perform log transformation
#'
#' @return an scAI object with normalized data in the norm.data slot
#' @export
#'
#' @examples
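#' \dontrun{
#' # A sketch of a typical call, assuming `scai` is an scAI object whose
#' # raw.data list is named "RNA" and "ATAC" (hypothetical object).
#' scai <- preprocessing(scai, assay = list("RNA", "ATAC"),
#'                       minFeatures = 200, minCells = 3)
#' }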
preprocessing <- function(object, assay = list("RNA", "ATAC"), minFeatures = 200, minCells = 3, minCounts = NULL, maxCounts = NULL, libararyflag = TRUE, logNormalize = TRUE) {
if (is.null(assay)) {
for (i in 1:length(object@raw.data)) {
object@norm.data[[i]] <- object@raw.data[[i]]
}
} else {
for (i in 1:length(assay)) {
iniData <- object@raw.data[[assay[[i]]]]
print(dim(iniData))
if (is.data.frame(iniData)) {
iniData <- as.matrix(iniData)
}
# filter out cells with fewer than minFeatures detected features
msum <- Matrix::colSums(iniData != 0)
proData <- iniData[, msum >= minFeatures]
# filter out cells with total counts less than minCounts
if (!is.null(minCounts)) {
proData <- proData[, Matrix::colSums(proData) >= minCounts]
}
# filter out cells with total counts greater than maxCounts
if (!is.null(maxCounts)) {
proData <- proData[, Matrix::colSums(proData) <= maxCounts]
}
# filter out features expressed in fewer than minCells cells
proData <- proData[Matrix::rowSums(proData != 0) >= minCells, ]
# normalization: a global-scaling method that normalizes each cell's measurements
# by its total counts and multiplies by a scale factor (10,000 by default)
if (libararyflag) {
proData <- sweep(proData, 2, Matrix::colSums(proData), FUN = `/`) * 10000
}
if (logNormalize) {
proData <- log(proData + 1)
}
object@norm.data[[assay[[i]]]] <- proData
}
}
if (length(assay) == 2) {
X1 <- object@norm.data[[assay[[1]]]]
X2 <- object@norm.data[[assay[[2]]]]
cell.keep <- intersect(colnames(X1), colnames(X2))
object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
object@norm.data[[assay[[2]]]] <- X2[, cell.keep]
} else if (length(assay) == 1) {
X1 <- object@norm.data[[assay[[1]]]]
assay2 <- setdiff(names(object@raw.data), assay[[1]])
X2 <- object@raw.data[[assay2]]
cell.keep <- intersect(colnames(X1), colnames(X2))
object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
object@norm.data[[assay2]] <- X2[, cell.keep]
}
names(object@norm.data) <- names(object@raw.data)
return(object)
}
#' Add cell information into the pData slot
#'
#' @param object scAI object
#' @param pdata Cell information to be added
#' @param pdata.name Name of the column to be assigned
#'
#' @return an scAI object with updated pData
#' @export
#'
#' @examples
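#' \dontrun{
#' # A sketch: attach per-cell labels stored in a hypothetical vector `labels`
#' # (one entry per cell, in the same order as the cells).
#' scai <- addpData(scai, pdata = data.frame(label = labels), pdata.name = "label")
#' }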
addpData <- function(object, pdata, pdata.name = NULL) {
if (is.null(x = pdata.name) && is.atomic(x = pdata)) {
stop("'pdata.name' must be provided for atomic pdata types (eg. vectors)")
}
if (inherits(x = pdata, what = c("matrix", "Matrix"))) {
pdata <- as.data.frame(x = pdata)
}
if (is.null(x = pdata.name)) {
pdata.name <- names(pdata)
} else {
names(pdata) <- pdata.name
}
object@pData <- pdata
return(object)
}
#' Run the scAI model
#'
#' @param object scAI object
#' @param K An integer indicating the rank of the inferred factors
#' @param do.fast whether to use the Python implementation of the scAI optimization
#' @param nrun Number of times to repeat the optimization
#' @param hvg.use1 whether to use highly variable genes for the RNA-seq data
#' @param hvg.use2 whether to use highly variable loci for the ATAC-seq data
#' @param keep_all Whether to keep the results from all runs
#' @param s Probability of the Bernoulli distribution
#' @param alpha model parameter
#' @param lambda model parameter
#' @param gamma model parameter
#' @param maxIter An integer indicating the maximum number of iterations
#' @param stop_rule Stop rule to be used
#' @param init List of initialized low-rank matrices
#' @param rand.seed An integer indicating the seed
#'
#' @return an scAI object with the inferred low-rank matrices in the fit slot, or a list of all runs if keep_all = TRUE
#' @export
#'
#' @examples
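#' \dontrun{
#' # A sketch: factorize a preprocessed scAI object `scai` (hypothetical) with
#' # rank 5, repeating the optimization five times and keeping the best run.
#' scai <- run_scAI(scai, K = 5, nrun = 5)
#' str(scai@fit$H)  # factor-by-cell coefficient matrix
#' }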
#' @importFrom foreach foreach "%dopar%"
#' @importFrom parallel makeForkCluster makeCluster detectCores
#' @importFrom doParallel registerDoParallel
#' @importFrom reticulate r_to_py source_python
run_scAI <- function(object, K, do.fast = FALSE, nrun = 5L, hvg.use1 = FALSE, hvg.use2 = FALSE, keep_all = FALSE, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500L, stop_rule = 1L, init = NULL, rand.seed = 1L) {
if (!is.null(init)) {
W1.init = init$W1.init
W2.init = init$W2.init
H.init = init$H.init
Z.init = init$Z.init
R.init = init$R.init
} else {
R.init = NULL
W1.init = NULL
W2.init = NULL
H.init = NULL
Z.init = NULL
}
options(warn = -1)
# Calculate the number of cores
numCores <- min(parallel::detectCores(), nrun)
cl <- tryCatch({
parallel::makeForkCluster(numCores)
}, error = function(e) {
parallel::makeCluster(numCores)
})
doParallel::registerDoParallel(cl)
if (hvg.use1) {
X1 <- as.matrix(object@norm.data[[1]][object@var.features[[1]], ])
} else {
X1 <- as.matrix(object@norm.data[[1]])
}
if (hvg.use2) {
X2 <- as.matrix(object@norm.data[[2]][object@var.features[[2]], ])
} else {
X2 <- as.matrix(object@norm.data[[2]])
}
if (!do.fast) {
outs <- foreach(i = 1:nrun, .packages = c("Matrix")) %dopar% {
set.seed(rand.seed + i - 1)
scAImodel(X1, X2, K = K, s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule,
R.init = R.init, W1.init = W1.init, W2.init = W2.init, H.init = H.init, Z.init = Z.init)
}
} else {
barcodes <- colnames(X2)
feature1 <- rownames(X1)
feature2 <- rownames(X2)
names_com <- paste0("factor", seq_len(K))
path <- paste(system.file(package="scAI"), "scAImodel_py.py", sep="/")
reticulate::source_python(path)
X1py = r_to_py(X1)
X2py = r_to_py(X2)
R.initpy = r_to_py(R.init); W1.initpy = r_to_py(W1.init); W2.initpy = r_to_py(W2.init); H.initpy = r_to_py(H.init); Z.initpy = r_to_py(Z.init)
K = as.integer(K); maxIter = as.integer(maxIter)
outs <- pbapply::pbsapply(
X = 1:nrun,
FUN = function(x) {
seed = as.integer(rand.seed + x - 1)
ptm = Sys.time()
results = scAImodel_py(X1 = X1py, X2 = X2py, K = K, S = s, Alpha = alpha, Lambda = lambda, Gamma = gamma, Maxiter = maxIter, Stop_rule = stop_rule,
Seed = seed, W1 = W1.initpy, W2 = W2.initpy, H = H.initpy, Z = Z.initpy,R = R.initpy)
execution.time = Sys.time() - ptm
names(results) <- c("W1", "W2", "H", "Z", "R")
attr(results$W1, "dimnames") <- list(feature1, names_com)
attr(results$W2, "dimnames") <- list(feature2, names_com)
attr(results$H, "dimnames") <- list(names_com, barcodes)
attr(results$Z, "dimnames") <- list(barcodes, barcodes)
results$options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time)
return(results)
},
simplify = FALSE
)
}
objs <- foreach(i = 1:nrun, .combine = c) %dopar% {
W1 <- outs[[i]]$W1
W2 <- outs[[i]]$W2
sum(cor(as.matrix(W1))) + sum(cor(as.matrix(W2)))
}
parallel::stopCluster(cl)
N <- ncol(X1)
C <- base::matrix(0, N, N)
for (i in seq_len(nrun)) {
H <- outs[[i]]$H
H <- sweep(H, 2, colSums(H), FUN = `/`)
clusIndex <- apply(H, 2, which.max)
# compute the consensus matrix
adjMat <- clust2Mat(clusIndex)
C <- C + adjMat
}
CM <- C/nrun
if (!keep_all) {
message(sprintf("The best seed is %d", which.min(objs)))
outs_final <- outs[[which.min(objs)]]
#object@agg.data <- outs_final$agg.data
W = list(W1 = outs_final$W1, W2 = outs_final$W2)
names(W) <- names(object@norm.data)
object@fit <- list(W = W, H = outs_final$H, Z = outs_final$Z, R = outs_final$R)
object <- getAggregatedData(object)
object@cluster$consensus <- CM
object@options$cost <- objs
object@options$paras <- outs_final$options
object@options$paras$nrun <- nrun
object@options$best.seed <- which.min(objs)
return(object)
} else {
outs_final <- list()
outs_final$best <- outs[[which.min(objs)]]
outs_final$best$consensus <- CM
outs_final$nruns <- outs
outs_final$options$cost <- objs
return(outs_final)
}
}
#' Solving the optimization problem in scAI
#'
#' @param X1 Single-cell transcriptomic data matrix (norm.data)
#' @param X2 Single-cell epigenomic data matrix (norm.data)
#' @param K Rank of inferred factors
#' @param s Probability of Bernoulli distribution
#' @param alpha model parameter
#' @param lambda model parameter
#' @param gamma model parameter
#' @param maxIter Maximum number of iterations
#' @param stop_rule Stop rule to be used
#' @param R.init initialization of R
#' @param W1.init initialization of W1
#' @param W2.init initialization of W2
#' @param H.init initialization of H
#' @param Z.init initialization of Z
#'
#' @return a list containing the inferred low-rank matrices (W1, W2, H, Z, R) and the options used
#' @export
#'
#' @examples
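#' \dontrun{
#' # A sketch: run a single optimization directly on two normalized matrices
#' # `X1` (genes by cells) and `X2` (loci by the same cells), both hypothetical.
#' out <- scAImodel(X1, X2, K = 5)
#' names(out)  # "W1" "W2" "H" "Z" "R" "options"
#' }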
#' @importFrom stats rbinom runif
#' @importFrom rfunctions crossprodcpp
scAImodel <- function(X1, X2, K, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500, stop_rule = 1,
R.init = NULL, W1.init = NULL, W2.init = NULL, H.init = NULL, Z.init = NULL) {
# Initialization W1,W2,H and Z
p <- nrow(X1)
n <- ncol(X1)
q = nrow(X2)
if (is.null(W1.init)) {
W1.init = matrix(runif(p * K), p, K)
}
if (is.null(W2.init)) {
W2.init = matrix(runif(q * K), q, K)
}
if (is.null(H.init)) {
H.init = matrix(runif(K * n), K, n)
}
if (is.null(Z.init)) {
Z.init = matrix(runif(n * n), n, n)
}
if (is.null(R.init)) {
R.init = matrix(rbinom(n * n, 1, s), n, n)
}
W1 = W1.init
W2 = W2.init
H = H.init
Z = Z.init
R = R.init
# start the clock to measure the execution time
ptm = proc.time()
eps <- .Machine$double.eps
onesM_K <- matrix(1, K, K)
XtX2 <- crossprodcpp(X2)
index <- which(R == 0)
for (iter in 1:maxIter) {
# normalize H
H = H/rowSums(H)
# update W1
HHt <- tcrossprod(H)
X1Ht <- eigenMapMattcrossprod(X1, H)
W1HHt <- eigenMapMatMult(W1, HHt)
W1 <- W1 * X1Ht/(W1HHt + eps)
# update W2
ZR <- Z
ZR[index] <- 0
ZRHt <- eigenMapMattcrossprod(ZR, H)
X2ZRHt <- eigenMapMatMult(X2, ZRHt)
W2HHt <- eigenMapMatMult(W2, HHt)
W2 = W2 * X2ZRHt/(W2HHt + eps)
# update H
W1tX1 <- eigenMapMatcrossprod(W1, X1)
W2tX2 <- eigenMapMatcrossprod(W2, X2)
W2tX2ZR <- eigenMapMatMult(W2tX2, ZR)
HZZt <- eigenMapMatMult(H, Z + t(Z))
W1tW1 <- crossprodcpp(W1)
W2tW2 <- crossprodcpp(W2)
temp1 <- H * (alpha * W1tX1 + W2tX2ZR + lambda * HZZt)
temp2 <- eigenMapMatMult(alpha * W1tW1 + W2tW2 + 2 * lambda * HHt + gamma * onesM_K, H)
H <- temp1/(temp2 + eps)
# update Z
HtH <- crossprodcpp(H)
X2tW2H <- eigenMapMatcrossprod(W2tX2, H)
RX2tW2H = X2tW2H
RX2tW2H[index] = 0
XtX2ZR <- eigenMapMatMult(XtX2, ZR)
XtX2ZRR = XtX2ZR
XtX2ZRR[index] = 0
Z = Z * (RX2tW2H + lambda * HtH)/(XtX2ZRR + lambda * Z + eps)
if (stop_rule == 2) {
obj = alpha * norm(X1 - W1 %*% H, "F")^2 + norm(X2 %*% (Z * R) - W2 %*% H, "F")^2 + lambda * norm(Z - HtH, "F")^2 + gamma * sum(colSums(H) * colSums(H))
if (iter > 1 && ((obj_old - obj)/obj_old < 10^(-6)) || iter == maxIter) {
break
}
obj_old = obj
}
}
# compute the execution time
execution.time = proc.time() - ptm
barcodes <- colnames(X2)
feature1 <- rownames(X1)
feature2 <- rownames(X2)
names_com <- paste0("factor", seq_len(K))
attr(W1, "dimnames") <- list(feature1, names_com)
attr(W2, "dimnames") <- list(feature2, names_com)
attr(H, "dimnames") <- list(names_com, barcodes)
attr(Z, "dimnames") <- list(barcodes, barcodes)
outs <- list(W1 = W1, W2 = W2, H = H, Z = Z, R = R,
options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time))
return(outs)
}
#' Generate the aggregated epigenomic data
#'
#' @param object an scAI object after running run_scAI
#' @param group cell group information, if available; when provided, the epigenomic data are aggregated based on these groups instead of the cell-cell similarity matrix learned by scAI
#'
#' @return an scAI object with the aggregated epigenomic data in the agg.data slot
#' @export
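#' @examples
#' \dontrun{
#' # A sketch, assuming `scai` is an scAI object returned by run_scAI;
#' # aggregation uses the learned cell-cell similarity matrix by default.
#' scai <- getAggregatedData(scai)
#' }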
#'
getAggregatedData <- function(object, group = NULL) {
if (is.null(group)) {
Z <- object@fit$Z
} else {
if (is.factor(group) | is.character(group)) {
group <- as.numeric(factor(group))
}
Z <- clust2Mat(group)
diag(Z) <- 1
object@fit$Z <- Z
}
R <- object@fit$R
ZR <- Z * R
ZR <- sweep(ZR, 2, colSums(ZR), FUN = `/`)
X2 <- as.matrix(object@norm.data[[2]])
X2agg <- eigenMapMatMult(X2, ZR)
X2agg <- sweep(X2agg, 2, colSums(X2agg), FUN = `/`) * 10000
X2agg <- log(1 + X2agg)
attr(X2agg, "dimnames") <- list(rownames(object@norm.data[[2]]), colnames(object@norm.data[[2]]))
object@agg.data <- X2agg
return(object)
}
#' Perform dimensional reduction
#'
#' Dimension reduction by PCA, t-SNE or UMAP
#' @param object scAI object
#' @param return.object whether return scAI object
#' @param data.use input data
#' @param do.scale whether scale the data
#' @param do.center whether scale and center the data
#' @param method Method of dimensional reduction: one of "pca", "tsne", "FItsne" or "umap"
#' @param rand.seed Set a random seed. By default, sets the seed to 42.
#' @param perplexity perplexity parameter in tsne
#' @param theta parameter in tsne
#' @param check_duplicates parameter in tsne
#' @param FItsne.path File path of FIt-SNE
#' @param dim.embed dimensions of t-SNE embedding
#' @param dim.use num of PCs used for t-SNE
#'
#' @param do.fast whether do fast PCA
#' @param dimPC the number of components to keep in PCA
#' @param weight.by.var whether use weighted pc.scores
#'
#' @param n.neighbors This determines the number of neighboring points used in
#' local approximations of manifold structure. Larger values will result in more
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
#' @param n.components The dimension of the space to embed into.
#' @param distance This determines the choice of metric used to measure distance in the input space.
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
#' @param learning.rate The initial learning rate for the embedding optimization.
#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
#' that should be assumed to be connected at a local level. The higher this value the more connected
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight being given to negative samples.
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
#'
#' @return an scAI object with the embedding coordinates stored in the embed slot, or the coordinates themselves if return.object = FALSE
#' @export
#'
#' @examples
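#' \dontrun{
#' # A sketch: embed cells with UMAP based on the inferred cell loading matrix H
#' # of a hypothetical fitted scAI object `scai`.
#' scai <- reducedDims(scai, method = "umap")
#' head(scai@embed$umap)
#' }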
#' @importFrom Rtsne Rtsne
#' @importFrom reticulate py_module_available py_set_seed import
#'
reducedDims <- function(object, data.use = object@fit$H, do.scale = TRUE, do.center = TRUE, return.object = TRUE, method = "umap",
dim.embed = 2, dim.use = NULL, perplexity = 30, theta = 0.5, check_duplicates = FALSE, rand.seed = 42L,
FItsne.path = NULL,
dimPC = 40,do.fast = TRUE, weight.by.var = TRUE,
n.neighbors = 30L, n.components = 2L, distance = "correlation",n.epochs = NULL,learning.rate = 1.0,min.dist = 0.3,spread = 1.0,set.op.mix.ratio = 1.0,local.connectivity = 1L,
repulsion.strength = 1,negative.sample.rate = 5,a = NULL,b = NULL) {
data.use <- as.matrix(data.use)
if (do.scale) {
data.use <- t(scale(t(data.use), center = do.center, scale = TRUE))
data.use[is.na(data.use)] <- 0
}
if (!is.null(dim.use)) {
data.use = object@embed$pca[, dim.use]
}
if (method == "pca") {
cell_coords <- runPCA(data.use, do.fast = do.fast, dimPC = dimPC, seed.use = rand.seed, weight.by.var = weight.by.var)
object@embed$pca <- cell_coords
} else if (method == "tsne") {
set.seed(rand.seed)
cell_coords <- Rtsne::Rtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = check_duplicates)$Y
rownames(cell_coords) <- rownames(t(data.use))
object@embed$tsne <- cell_coords
} else if (method == "FItsne") {
if (!exists("FItsne")) {
if (is.null(fitsne.path)) {
stop("Please pass in path to FIt-SNE directory as FItsne.path.")
}
source(paste0(fitsne.path, "/fast_tsne.R"), chdir = T)
}
cell_coords <- fftRtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = F, rand_seed = rand.seed)
rownames(cell_coords) <- rownames(t(data.use))
object@embed$FItsne <- cell_coords
} else if (method == "umap") {
cell_coords <- runUMAP(data.use,
n.neighbors = n.neighbors,
n.components = n.components,
distance = distance,
n.epochs = n.epochs,
learning.rate = learning.rate,
min.dist = min.dist,
spread = spread,
set.op.mix.ratio = set.op.mix.ratio,
local.connectivity = local.connectivity,
repulsion.strength = repulsion.strength,
negative.sample.rate = negative.sample.rate,
a = a,
b = b,
seed.use = rand.seed
)
object@embed$umap <- cell_coords
} else {
stop("Error: unrecognized dimensionality reduction method.")
}
if (return.object) {
return(object)
} else {
return(cell_coords)
}
}
#' Dimension reduction using PCA
#'
#' @param data.use input data
#' @param do.fast whether to run fast (truncated) PCA via irlba
#' @param dimPC the number of components to keep
#' @param seed.use set a seed
#' @param weight.by.var whether to weight the PC scores by the variance
#' @importFrom stats prcomp
#' @importFrom irlba irlba
#' @return a matrix of cell coordinates in PC space (cells by PCs)
#' @export
#'
#' @examples
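#' # A runnable sketch on random data: 100 features by 50 cells.
#' set.seed(1)
#' mat <- matrix(rnorm(5000), nrow = 100,
#'               dimnames = list(paste0("g", 1:100), paste0("c", 1:50)))
#' pcs <- runPCA(mat, dimPC = 10)
#' dim(pcs)  # 50 cells by 10 PCs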
runPCA <- function(data.use, do.fast = TRUE, dimPC = 50, seed.use = 42, weight.by.var = TRUE) {
set.seed(seed = seed.use)
if (do.fast) {
dimPC <- min(dimPC, nrow(data.use) - 1)
pca.res <- irlba::irlba(t(data.use), nv = dimPC)
sdev <- pca.res$d/sqrt(max(1, ncol(data.use) - 1))
if (weight.by.var){
pc.scores <- pca.res$u %*% diag(pca.res$d)
} else {
pc.scores <- pca.res$u
}
} else {
dimPC <- min(dimPC, nrow(data.use) - 1)
pca.res <- stats::prcomp(x = t(data.use), rank. = dimPC)
sdev <- pca.res$sdev
if (weight.by.var) {
pc.scores <- pca.res$x %*% diag(pca.res$sdev[1:dimPC]^2)
} else {
pc.scores <- pca.res$x
}
}
rownames(pc.scores) <- colnames(data.use)
colnames(pc.scores) <- paste0('PC', 1:ncol(pc.scores))
cell_coords <- pc.scores
return(cell_coords)
}
#' Perform dimension reduction using UMAP
#'
#' @param data.use input data
#' @param n.neighbors This determines the number of neighboring points used in
#' local approximations of manifold structure. Larger values will result in more
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
#' @param n.components The dimension of the space to embed into.
#' @param distance This determines the choice of metric used to measure distance in the input space.
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
#' @param learning.rate The initial learning rate for the embedding optimization.
#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
#' that should be assumed to be connected at a local level. The higher this value the more connected
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight being given to negative samples.
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
#' @param seed.use Set a random seed. By default, sets the seed to 42.
#' @param metric.kwds A dictionary of arguments to pass on to the metric, such as the p value for Minkowski distance
#' @param angular.rp.forest Whether to use an angular random projection forest to initialise the
#' approximate nearest neighbor search. This can be faster, but is mostly only useful for metrics that
#' use an angular style distance such as cosine, correlation etc. In the case of those metrics angular forests will be chosen automatically.
#' @param verbose Controls verbosity
#' @details This function is modified from the Seurat package.
#' @return a matrix of cell coordinates in UMAP space (cells by components)
#' @export
#' @importFrom reticulate py_module_available py_set_seed import
#' @examples
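#' \dontrun{
#' # A sketch; requires the Python 'umap-learn' module available to reticulate.
#' # `scai` is a hypothetical fitted scAI object.
#' emb <- runUMAP(scai@fit$H, n.neighbors = 30, min.dist = 0.3)
#' }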
runUMAP <- function(
data.use,
n.neighbors = 30L,
n.components = 2L,
distance = "correlation",
n.epochs = NULL,
learning.rate = 1.0,
min.dist = 0.3,
spread = 1.0,
set.op.mix.ratio = 1.0,
local.connectivity = 1L,
repulsion.strength = 1,
negative.sample.rate = 5,
a = NULL,
b = NULL,
seed.use = 42L,
metric.kwds = NULL,
angular.rp.forest = FALSE,
verbose = TRUE){
if (!reticulate::py_module_available(module = 'umap')) {
stop("Cannot find UMAP, please install through pip (e.g. pip install umap-learn or reticulate::py_install(packages = 'umap-learn')).")
}
set.seed(seed.use)
reticulate::py_set_seed(seed.use)
umap_import <- reticulate::import(module = "umap", delay_load = TRUE)
umap <- umap_import$UMAP(
n_neighbors = as.integer(n.neighbors),
n_components = as.integer(n.components),
metric = distance,
n_epochs = n.epochs,
learning_rate = learning.rate,
min_dist = min.dist,
spread = spread,
set_op_mix_ratio = set.op.mix.ratio,
local_connectivity = local.connectivity,
repulsion_strength = repulsion.strength,
negative_sample_rate = negative.sample.rate,
a = a,
b = b,
metric_kwds = metric.kwds,
angular_rp_forest = angular.rp.forest,
verbose = verbose
)
Rumap <- umap$fit_transform
umap_output <- Rumap(t(data.use))
colnames(umap_output) <- paste0('UMAP', 1:ncol(umap_output))
rownames(umap_output) <- colnames(data.use)
return(umap_output)
}
#' Identify enriched features in each factor
#'
#' Rank features in each factor by Factor loading analysis
#' @param object scAI object
#' @param assay Name of assay to be analyzed
#' @param features a vector of features
#' @param cutoff.W Threshold of feature loading values
#' @param cutoff.H Threshold of cell loading values
#' @param thresh.pc Threshold of the percent of cells enriched in one factor
#' @param thresh.fc Threshold of Fold Change
#' @param thresh.p Threshold of p-values
#' @param n.top Number of top features to be returned
#' @importFrom stats sd wilcox.test
#' @importFrom dplyr top_n slice
#' @importFrom future nbrOfWorkers
#' @importFrom future.apply future_sapply
#' @importFrom pbapply pbsapply
#' @importFrom stats p.adjust
#' @importFrom dplyr %>%
#'
#' @return a list containing all identified factor-specific markers (markers.all) and the top n.top markers per factor (markers.top)
#' @export
#'
#' @examples
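#' \dontrun{
#' # A sketch, assuming `scai` is a fitted scAI object with an "RNA" assay.
#' markers <- identifyFactorMarkers(scai, assay = "RNA", n.top = 10)
#' head(markers$markers.top)
#' }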
identifyFactorMarkers <- function(object, assay, features = NULL, cutoff.W = 0.5, cutoff.H = 0.5,
thresh.pc = 0.05, thresh.fc = 0.25, thresh.p = 0.05, n.top = 10) {
if (assay == "RNA") {
X <- as.matrix(object@norm.data[[assay]])
} else {
X <- as.matrix(object@agg.data)
}
H <- object@fit$H
W <- object@fit$W[[assay]]
if (is.null(features)) {
features <- row.names(W)
}
H <- sweep(H, 2, colSums(H), FUN = `/`)
K = ncol(W)
lib_W <- base::rowSums(W)
lib_W[lib_W == 0] <- 1
lib_W[lib_W < mean(lib_W) - 5 * sd(lib_W)] <- 1 # omit the nearly null rows
W <- sweep(W, 1, lib_W, FUN = `/`)
MW <- base::colMeans(W)
sW <- apply(W, 2, sd)
# candidate markers for each factor
IndexW_record <- vector("list", K)
for (j in 1:K) {
IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff.W * sW[j])
}
my.sapply <- ifelse(
test = future::nbrOfWorkers() == 1,
yes = pbapply::pbsapply,
no = future.apply::future_sapply
)
# divided cells into two groups
mH <- apply(H, 1, mean)
sH <- apply(H, 1, sd)
IndexH_record1 <- vector("list", K)
IndexH_record2 <- vector("list", K)
for (i in 1:K) {
IndexH_record1[[i]] <- which(H[i, ] > mH[i] + cutoff.H * sH[i])
IndexH_record2[[i]] <- base::setdiff(c(1:ncol(H)), IndexH_record1[[i]])
}
# identify factor-specific markers
factor_markers = vector("list", K)
markersTopn = vector("list", K)
factors <- c()
Features <- c()
Pvalues <- c()
Log2FC <- c()
for (i in 1:K) {
data1 <- X[IndexW_record[[i]], IndexH_record1[[i]]]
data2 <- X[IndexW_record[[i]], IndexH_record2[[i]]]
idx1 <- which(base::rowSums(data1 != 0) > thresh.pc * ncol(data1)) # at least expressed in thresh.pc% cells in one group
FC <- log2(base::rowMeans(data1)/base::rowMeans(data2))
idx2 <- which(FC > thresh.fc)
pvalues <- my.sapply(
X = 1:nrow(x = data1),
FUN = function(x) {
return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
}
)
idx3 <- which(pvalues < thresh.p)
idx = intersect(intersect(idx1, idx2), idx3)
# order
FC <- FC[idx]
c = sort(FC, decreasing = T, index.return = T)$ix
ri = IndexW_record[[i]]
Pvalues <- c(Pvalues, pvalues[ri[idx[c]]])
Log2FC <- c(Log2FC, FC[c])
factors <- c(factors, rep(i, length(c)))
Features <- c(Features, features[ri[idx[c]]])
}
# stopCluster(cl)
# build the data frame directly so that pvalues and log2FC stay numeric
markers.all <- data.frame(factors = factors, features = Features, pvalues = Pvalues, log2FC = Log2FC, stringsAsFactors = FALSE)
rownames(markers.all) <- make.unique(Features)
markers.top <- markers.all %>% dplyr::group_by(factors) %>% top_n(n.top, log2FC) %>% dplyr::slice(1:n.top)
markers = list(markers.all = markers.all, markers.top = markers.top)
return(markers)
}
#' Compute the 2D coordinates of embedded cells, genes, loci and factors for VscAI visualization
#'
#' @param object scAI object
#' @param genes.embed A vector of genes to be embedded
#' @param loci.embed A vector of loci to be embedded
#' @param alpha.embed Parameter controlling the distance between the cells and the factors
#' @param snn.embed Number of neighbors
#'
#' @return an scAI object with the VscAI coordinates (cells, factors, genes, loci) stored in the embed slot
#' @export
#'
#' @examples
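#' \dontrun{
#' # A sketch: embed cells, factors and a few illustrative genes with VscAI;
#' # the gene names here are placeholders.
#' scai <- getEmbeddings(scai, genes.embed = c("GeneA", "GeneB"))
#' head(scai@embed$VscAI$cells)
#' }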
#' @importFrom Matrix Matrix
getEmbeddings <- function(object, genes.embed = NULL, loci.embed = NULL, alpha.embed = 1.9, snn.embed = 5) {
H <- object@fit$H
W1 <- object@fit$W[[1]]
W2 <- object@fit$W[[2]]
Z <- object@fit$Z
if (nrow(H) < 3 & length(object@fit.variedK) == 0) {
print("VscAI needs at least three factors for embedding. Now rerun scAI with rank being 3...")
outs <- run_scAI(object, K = 3, nrun = object@options$paras$nrun,
s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
W <- outs@fit$W
H <- outs@fit$H
Z <- outs@fit$Z
W1 <- W[[1]]
W2 <- W[[2]]
object@fit.variedK$W <- W
object@fit.variedK$H <- H
object@fit.variedK$Z <- Z
} else if (nrow(H) < 3 & length(object@fit.variedK) != 0) {
print("Using the previous calculated factors for embedding.")
W1 <- object@fit.variedK$W[[1]]
W2 <- object@fit.variedK$W[[2]]
H <- object@fit.variedK$H
Z <- object@fit.variedK$Z
}
nmf.scores <- H
Zsym <- Z/max(Z)
Zsym[Zsym < 10^(-6)] <- 0
Zsym <- (Zsym + t(Zsym))/2
diag(Zsym) <- 1
rownames(Zsym) <- colnames(H)
colnames(Zsym) <- rownames(Zsym)
alpha.exp <- alpha.embed # Increase this > 1.0 to move the cells closer to the factors. Values > 2 start to distort the data.
snn.exp <- snn.embed # Lower this < 1.0 to move similar cells closer to each other
n_pull <- nrow(H) # The number of factors pulling on each cell. Must be at least 3.
Zsym <- Matrix(data = Zsym, sparse = TRUE)
snn <- Zsym
swne.embedding <- swne::EmbedSWNE(nmf.scores, snn, alpha.exp = alpha.exp, snn.exp = snn.exp, n_pull = n_pull, proj.method = "sammon", dist.use = "cosine")
# project cells and factors
factor_coords <- swne.embedding$H.coords
cell_coords <- swne.embedding$sample.coords
rownames(factor_coords) <- rownames(H)
rownames(cell_coords) <- colnames(H)
# project genes onto the low dimension space by VscAI
if (is.null(genes.embed)) {
genes.embed <- rownames(W1)
} else {
genes.embed <- as.character(as.vector(as.matrix((genes.embed))))
}
swne.embedding <- swne::EmbedFeatures(swne.embedding, W1, genes.embed, n_pull = n_pull)
gene_coords <- swne.embedding$feature.coords
rownames(gene_coords) <- gene_coords$name
# project loci
if (is.null(loci.embed)) {
loci.embed <- rownames(W2)
} else {
loci.embed <- as.character(as.vector(as.matrix((loci.embed))))
}
swne.embedding <- swne::EmbedFeatures(swne.embedding, W2, loci.embed, n_pull = n_pull)
loci_coords <- swne.embedding$feature.coords
rownames(loci_coords) <- loci_coords$name
object@embed$VscAI$cells <- cell_coords
object@embed$VscAI$factors <- factor_coords
object@embed$VscAI$genes <- gene_coords
object@embed$VscAI$loci <- loci_coords
return(object)
}
#' Identify cell clusters
#'
#' @param object scAI object
#' @param partition.type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
#' @param seed.use set seed
#' @param n.iter number of iterations
#' @param initial.membership Parameter to pass to the Python leidenalg function; defaults to NULL
#' @param weights Parameter to pass to the Python leidenalg function; defaults to NULL
#' @param node.sizes Parameter to pass to the Python leidenalg function
#' @param resolution A parameter controlling the coarseness of the clusters
#' @param K Number of clusters if performing hierarchical clustering of the consensus matrix
#' @return an scAI object with cluster assignments stored in the identity slot and the pData slot
#' @export
#'
#' @examples
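#' \dontrun{
#' # A sketch: Leiden clustering on the cell loading matrix of a fitted
#' # scAI object `scai` (hypothetical).
#' scai <- identifyClusters(scai, resolution = 1)
#' table(scai@identity)
#' }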
#' @importFrom stats as.dist cophenetic cor cutree dist hclust
#' @importFrom dplyr %>%
identifyClusters <- function(object, resolution = 1, partition.type = "RBConfigurationVertexPartition",
seed.use = 42L,n.iter = 10L,
initial.membership = NULL, weights = NULL, node.sizes = NULL,
K = NULL) {
if (is.null(K) & !is.null(resolution)) {
data.use <- object@fit$H
data.use <- as.matrix(data.use)
data.use <- t(scale(t(data.use), center = TRUE, scale = TRUE))
snn <- swne::CalcSNN(data.use, k = 20, prune.SNN = 1/15)
idents <- runLeiden(SNN = snn, resolution = resolution, partition_type = partition.type,
seed.use = seed.use, n.iter = n.iter,
initial.membership = initial.membership, weights = weights, node.sizes = node.sizes)
} else {
CM <- object@cluster$consensus
d <- as.dist(1 - CM)
hc <- hclust(d, "ave")
idents<- hc %>% cutree(k = K)
coph <- cophenetic(hc)
cs <- cor(d, coph)
object@cluster$cluster.stability <- cs
}
names(idents) <- rownames(object@pData)
object@identity <- factor(idents)
object@pData$cluster <- factor(idents)
return(object)
}
#' Run Leiden clustering algorithm
#' This code is modified from Tom Kelly (https://github.com/TomKellyGenetics/leiden), where we added more parameters (seed.use and n.iter) to run the Python version. In addition, we also take care of the singleton issue after running the Leiden algorithm.
#' @description Implements the Leiden clustering algorithm in R using reticulate to run the Python version. Requires the python "leidenalg" and "igraph" modules to be installed. Returns a vector of partition indices.
#' @param SNN An adjacency matrix compatible with \code{\link[igraph]{igraph}} object.
#' @param seed.use set seed
#' @param n.iter number of iteration
#' @param initial.membership Parameter to pass to the Python leidenalg function; defaults to NULL
#' @param node.sizes Parameters to pass to the Python leidenalg function
#' @param partition_type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
#' @param resolution A parameter controlling the coarseness of the clusters
#' @param weights Parameter to pass to the Python leidenalg function; defaults to NULL
#' @return A partition of clusters as a vector of integers
#'
#' @keywords graph network igraph mvtnorm simulation
#' @importFrom reticulate import r_to_py
#' @export
runLeiden <- function(SNN = matrix(), resolution = 1, partition_type = c(
'RBConfigurationVertexPartition',
'ModularityVertexPartition',
'RBERVertexPartition',
'CPMVertexPartition',
'MutableVertexPartition',
'SignificanceVertexPartition',
'SurpriseVertexPartition'),
seed.use = 42L,
n.iter = 10L,
initial.membership = NULL, weights = NULL, node.sizes = NULL) {
if (!reticulate::py_module_available(module = 'leidenalg')) {
stop("Cannot find Leiden algorithm, please install through pip (e.g. pip install leidenalg).")
}
#import python modules with reticulate
leidenalg <- import("leidenalg", delay_load = TRUE)
ig <- import("igraph", delay_load = TRUE)
resolution_parameter <- resolution
initial_membership <- initial.membership
node_sizes <- node.sizes
#convert matrix input (corrects for sparse matrix input)
adj_mat <- as.matrix(SNN)
#compute weights if non-binary adjacency matrix given
is_pure_adj <- all(as.logical(adj_mat) == adj_mat)
if (is.null(weights) && !is_pure_adj) {
#assign weights to edges (without dependency on igraph)
weights <- t(adj_mat)[t(adj_mat)!=0]
#remove zeroes from rows of matrix and return vector of length edges
}
##convert to python numpy.ndarray, then a list
adj_mat_py <- r_to_py(adj_mat)
adj_mat_py <- adj_mat_py$tolist()
#convert graph structure to a Python compatible object
GraphClass <- if (!is.null(weights) && !is_pure_adj){
ig$Graph$Weighted_Adjacency
} else {
ig$Graph$Adjacency
}
snn_graph <- GraphClass(adj_mat_py)
#compute partitions
partition_type <- match.arg(partition_type)
part <- switch(
EXPR = partition_type,
'RBConfigurationVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$RBConfigurationVertexPartition,
initial_membership = initial.membership, weights = weights,
resolution_parameter = resolution,
n_iterations = n.iter,
seed = seed.use
),
'ModularityVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$ModularityVertexPartition,
initial_membership = initial.membership, weights = weights,
n_iterations = n.iter,
seed = seed.use
),
'RBERVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$RBERVertexPartition,
initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
resolution_parameter = resolution_parameter,
n_iterations = n.iter,
seed = seed.use
),
'CPMVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$CPMVertexPartition,
initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
resolution_parameter = resolution,
n_iterations = n.iter,
seed = seed.use
),
'MutableVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$MutableVertexPartition,
initial_membership = initial.membership,
n_iterations = n.iter,
seed = seed.use
),
'SignificanceVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$SignificanceVertexPartition,
initial_membership = initial.membership, node_sizes = node.sizes,
resolution_parameter = resolution,
n_iterations = n.iter,
seed = seed.use
),
'SurpriseVertexPartition' = leidenalg$find_partition(
snn_graph,
leidenalg$SurpriseVertexPartition,
initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
n_iterations = n.iter,
seed = seed.use
),
stop("please specify a partition type as a string out of those documented")
)
partition <- part$membership+1
idents <- partition
if (min(table(idents)) == 1) {
idents <- assignSingletons(idents, SNN)
}
idents <- factor(idents)
names(idents) <- row.names(SNN)
return(idents)
}
# Group single cells that make up their own cluster in with the cluster they are most connected to.
#
# @param idents clustering result
# @param SNN SNN graph
# @return Returns scAI object with all singletons merged with most connected cluster
assignSingletons <- function(idents, SNN) {
# identify singletons
singletons <- c()
for (cluster in unique(idents)) {
if (length(which(idents %in% cluster)) == 1) {
singletons <- append(x = singletons, values = cluster)
}
}
#singletons = names(table(idents))[which(table(idents)==1)]
# calculate connectivity of singletons to other clusters, add singleton to cluster it is most connected to
cluster_names <- unique(x = idents)
cluster_names <- setdiff(x = cluster_names, y = singletons)
connectivity <- vector(mode="numeric", length = length(x = cluster_names))
names(x = connectivity) <- cluster_names
for (i in singletons) {
print(i)
for (j in cluster_names) {
subSNN = SNN[
which(idents %in% i),
which(idents %in% j)
]
if (is.object(x = subSNN)) {
connectivity[j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
} else {
connectivity[j] <- mean(x = subSNN)
}
}
m <- max(connectivity, na.rm = T)
mi <- which(x = connectivity == m, arr.ind = TRUE)
closest_cluster <- sample(x = names(x = connectivity[mi]), 1)
idents[which(idents %in% i)] <- closest_cluster
}
if (length(x = singletons) > 0) {
message(paste(
length(x = singletons),
"singletons identified.",
length(x = unique(idents)),
"final clusters."
))
}
return(idents)
}
#' Convert a membership vector into an adjacency matrix
#'
#' @param memb Membership vector
#'
#' @return a binary adjacency matrix whose (i, j) entry is 1 when cells i and j are in the same cluster, with zero diagonal
#' @export
#'
#' @examples
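#' # Two clusters over five cells; entry (i, j) is 1 when cells i and j
#' # share a cluster, and the diagonal is zero.
#' clust2Mat(c(1, 1, 2, 2, 2))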
clust2Mat <- function(memb) {
N <- length(memb)
return(as.numeric(outer(memb, memb, FUN = "==")) - outer(1:N, 1:N, "=="))
}
#' Identify markers in each cell cluster
#'
#' @param object scAI object
#' @param assay Name of assay to be analyzed
#' @param features a vector of features
#' @param use.agg whether to use the aggregated epigenomic data
#' @param test.use which test to use ("bimod" or "wilcox")
#' @param thresh.pc Threshold of the percent of cells enriched in one cluster
#' @param thresh.fc Threshold of Fold Change
#' @param thresh.p Threshold of p-values
#' @importFrom future nbrOfWorkers
#' @importFrom pbapply pbsapply
#' @importFrom future.apply future_sapply
#' @importFrom stats sd wilcox.test
#' @importFrom dplyr top_n slice
#' @importFrom stats p.adjust
#'
#' @return a data frame of the identified cluster markers
#' @export
#'
#' @examples
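#' \dontrun{
#' # A sketch, assuming clusters were assigned by identifyClusters on the
#' # hypothetical object `scai`.
#' markers <- identifyClusterMarkers(scai, assay = "RNA")
#' head(markers)
#' }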
identifyClusterMarkers <- function(object, assay, features = NULL, use.agg = TRUE, test.use = "bimod",
thresh.pc = 0.25, thresh.fc = 0.25, thresh.p = 0.01) {
if (assay == "RNA") {
X <- object@norm.data[[assay]]
} else {
if (use.agg) {
X <- object@agg.data
} else {
X <- object@norm.data[[assay]]
}
}
if (is.null(features)) {
features.use <- row.names(X)
} else {
features.use <- intersect(features, row.names(X))
}
data.use <- X[features.use,]
data.use <- as.matrix(data.use)
groups <- object@identity
labels <- groups
level.use <- levels(labels)
numCluster <- length(level.use)
my.sapply <- ifelse(
test = future::nbrOfWorkers() == 1,
yes = pbapply::pbsapply,
no = future.apply::future_sapply
)
mean.fxn <- function(x) {
return(log(x = mean(x = expm1(x = x)) + 1))
}
genes.de <- vector("list", length = numCluster)
for (i in 1:numCluster) {
features <- features.use
cell.use1 <- which(labels %in% level.use[i])
cell.use2 <- base::setdiff(1:length(labels), cell.use1)
# feature selection (based on percentages)
thresh.min <- 0
pct.1 <- round(
x = rowSums(x = data.use[features, cell.use1, drop = FALSE] > thresh.min) /
length(x = cell.use1),
digits = 3
)
pct.2 <- round(
x = rowSums(x = data.use[features, cell.use2, drop = FALSE] > thresh.min) /
length(x = cell.use2),
digits = 3
)
data.alpha <- cbind(pct.1, pct.2)
colnames(x = data.alpha) <- c("pct.1", "pct.2")
alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = max)
names(x = alpha.min) <- rownames(x = data.alpha)
features <- names(x = which(x = alpha.min > thresh.pc))
if (length(x = features) == 0) {
stop("No features pass thresh.pc threshold")
}
# feature selection (based on average difference)
data.1 <- apply(X = data.use[features, cell.use1, drop = FALSE],MARGIN = 1,FUN = mean.fxn)
data.2 <- apply(X = data.use[features, cell.use2, drop = FALSE],MARGIN = 1,FUN = mean.fxn)
FC <- (data.1 - data.2)
features.diff <- names(x = which(x = FC > thresh.fc))
features <- intersect(x = features, y = features.diff)
if (length(x = features) == 0) {
stop("No features pass thresh.fc threshold")
}
data1 <- data.use[features, cell.use1, drop = FALSE]
data2 <- data.use[features, cell.use2, drop = FALSE]
if (test.use == "wilcox") {
pvalues <- unlist(
x = my.sapply(
X = 1:nrow(x = data1),
FUN = function(x) {
return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
}
)
)
} else if (test.use == 'bimod') {
pvalues <- unlist(
x = my.sapply(
X = 1:nrow(x = data1),
FUN = function(x) {
return(DifferentialLRT(
x = as.numeric(x = data1[x,]),
y = as.numeric(x = data2[x,])
))
}
)
)
}
pval.adj = stats::p.adjust(
p = pvalues,
method = "bonferroni",
n = nrow(X)
)
genes.de[[i]] <- data.frame(clusters = level.use[i], features = as.character(rownames(data1)), pvalues = pvalues, pval_adj = pval.adj, logFC = FC[features], data.alpha[features,])
}
markers.all <- data.frame()
for (i in 1:numCluster) {
gde <- genes.de[[i]]
gde <- gde[order(gde$pvalues, -gde$logFC), ]
gde <- subset(gde, subset = pvalues < thresh.p)
if (nrow(gde) > 0) {
markers.all <- rbind(markers.all, gde)
}
}
markers.all$features <- as.character(markers.all$features)
return(markers.all)
}
# function to run the McDavid et al. DE test
#' likelihood ratio test
#'
#' @param x a vector
#' @param y a vector
#' @param xmin threshold for the values in the vector
#'
#'
#' @importFrom stats pchisq
DifferentialLRT <- function(x, y, xmin = 0) {
lrtX <- bimodLikData(x = x)
lrtY <- bimodLikData(x = y)
lrtZ <- bimodLikData(x = c(x, y))
lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
return(pchisq(q = lrt_diff, df = 3, lower.tail = F))
}
#' bimodal likelihood of a vector (helper for the likelihood ratio test)
#' @importFrom stats sd dnorm
#' @param x a vector
#' @param xmin threshold for the values in the vector
bimodLikData <- function(x, xmin = 0) {
x1 <- x[x <= xmin]
x2 <- x[x > xmin]
xal <- length(x = x2) / length(x = x)
xal[xal > 1 - 1e-5] <- 1 - 1e-5
xal[xal < 1e-5] <- 1e-5
likA <- length(x = x1) * log(x = 1 - xal)
if (length(x = x2) < 2) {
mysd <- 1
} else {
mysd <- sd(x = x2)
}
likB <- length(x = x2) *
log(x = xal) +
sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
return(likA + likB)
}
#' Infer regulatory relationships
#'
#' @param object scAI object
#' @param gene.use genes whose regulatory loci are to be inferred
#' @param candinate_loci candidate loci, e.g., the factor-specific markers returned by identifyFactorMarkers
#' @param cutoff_H threshold of coefficient values
#' @param cutoff the difference in correlation required to call a link significant
#' @param thresh_corr threshold of the correlation coefficient
#'
#' @return a nested list (genes by factors) of inferred gene-loci links and their correlation intensities
#' @export
#'
#' @examples
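#' \dontrun{
#' # A sketch: link a gene to candidate loci, where `loci.markers` is the
#' # hypothetical output of identifyFactorMarkers on the epigenomic assay.
#' reg <- inferRegulations(scai, gene.use = "GeneA", candinate_loci = loci.markers)
#' }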
inferRegulations <- function(object, gene.use, candinate_loci, cutoff_H = 0.5, cutoff = 0.1, thresh_corr = 0.2) {
H <- as.matrix(object@fit$H)
H <- sweep(H, 2, colSums(H), FUN = `/`)
K = nrow(H)
mH <- apply(H, 1, mean)
sH <- apply(H, 1, sd)
IndexH_record <- vector("list", K)
for (i in 1:K) {
IndexH_record[[i]] <- which(H[i, ] > mH[i] + cutoff_H * sH[i])
}
gene.use <- as.character(gene.use)
X1 <- object@norm.data[[1]]
X1 <- X1[gene.use, ]
X2a <- object@agg.data
regulations <- vector("list", length(gene.use))
names(regulations) <- gene.use
for (i in 1:length(gene.use)) {
regulations_i = vector("list", K)
names(regulations_i) <- rownames(H)
for (j in 1:K) {
loci_j <- candinate_loci$markers.all$features[candinate_loci$markers.all$factors == j]
# compute the correlation between the gene and its candidate loci
x1 <- X1[i, ]
x2a <- X2a[loci_j, ]
cors1 <- cor(x1, t(x2a))
# set the values of this gene and its candidate loci to zero
X1_new <- X1
X2a_new <- X2a
X1_new[gene.use[i], IndexH_record[[j]]] <- 0
X2a_new[loci_j, IndexH_record[[j]]] <- 0
x1_new <- X1_new[gene.use[i], ]
x2a_new <- X2a_new[loci_j, ]
cors2 <- cor(x1_new, t(x2a))
cors3 <- cor(x1, t(x2a_new))
cors1[is.na(cors1)] <- 0
cors2[is.na(cors2)] <- 0
cors3[is.na(cors3)] <- 0
D <- rbind(cors1 - cors2, cors1 - cors3)
flag <- (rowSums(abs(D) > cutoff) > 0) & abs(cors1) > thresh_corr
regulations_i[[j]]$link <- as.character(loci_j[flag])
regulations_i[[j]]$intensity <- cors1[flag]
}
regulations[[i]] <- regulations_i
}
return(regulations)
}
#' Select highly variable features
#'
#' @param object scAI object
#' @param assay Name of assay
#' @param do.plot Whether to show the plot
#' @param do.text Whether to add feature names to the plot
#' @param x.low.cutoff The minimum expression level
#' @param x.high.cutoff The maximum expression level
#' @param y.cutoff The minimum dispersion (Fano factor) value
#' @param y.high.cutoff The maximum dispersion (Fano factor) value
#' @param num.bin Number of bins
#' @param pch.use Shape of dots in the plot
#' @param col.use Color of dots
#' @param cex.text.use Size of text
#'
#' @return an scAI object with the selected features stored in the var.features slot
#' @export
#'
#' @examples
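#' \dontrun{
#' # A sketch: flag highly variable genes in the RNA assay of a hypothetical
#' # preprocessed scAI object `scai`.
#' scai <- selectFeatures(scai, assay = "RNA", y.cutoff = 0.5)
#' length(scai@var.features$RNA)
#' }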
#' @importFrom graphics smoothScatter text
selectFeatures <- function(object, assay = "RNA", do.plot = TRUE, do.text = TRUE,
x.low.cutoff = 0.01, x.high.cutoff = 3.5, y.cutoff = 0.5, y.high.cutoff = Inf,
num.bin = 20, pch.use = 16, col.use = "black", cex.text.use = 0.5) {
# This function is modified from Seurat Package
data <- object@norm.data[[assay]]
genes.use <- rownames(data)
gene.mean <- apply(X = data, MARGIN = 1, FUN = ExpMean)
gene.dispersion <- apply(X = data, MARGIN = 1, FUN = LogVMR)
gene.dispersion[is.na(x = gene.dispersion)] <- 0
gene.mean[is.na(x = gene.mean)] <- 0
names(x = gene.mean) <- genes.use
data_x_bin <- cut(x = gene.mean, breaks = num.bin)
names(x = data_x_bin) <- names(x = gene.mean)
mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = mean)
sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = sd)
gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)])/sd_y[as.numeric(x = data_x_bin)]
gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
names(x = gene.dispersion.scaled) <- names(x = gene.mean)
mv.df <- data.frame(gene.mean, gene.dispersion, gene.dispersion.scaled)
rownames(x = mv.df) <- rownames(x = data)
hvg.info <- mv.df
names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(hvg.info)
pass.cutoff <- names(x = gene.mean)[which(x = ((gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff)) & (gene.dispersion.scaled > y.cutoff) & (gene.dispersion.scaled < y.high.cutoff))]
if (do.plot) {
smoothScatter(x = gene.mean, y = gene.dispersion.scaled, pch = pch.use, cex = 0.5, col = col.use, xlab = "Average expression", ylab = "Dispersion", nrpoints = Inf)
}
if (do.text) {
text(x = gene.mean[pass.cutoff], y = gene.dispersion.scaled[pass.cutoff], labels = pass.cutoff, cex = cex.text.use)
}
hvg.info <- hvg.info[order(hvg.info$gene.dispersion, decreasing = TRUE), ]
if (length(object@var.features) == 0) {
for (i in 1:length(object@raw.data)) {
object@var.features[[i]] <- vector("character")
}
names(object@var.features) <- names(object@raw.data)
}
object@var.features[[assay]] <- pass.cutoff
return(object)
}
#' Find chromosome regions of genes
#'
#' @param genes a given set of genes
#' @param species mouse or human
#' @importFrom biomaRt useMart getBM
#' @export
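#' @examples
#' \dontrun{
#' # Requires internet access to Ensembl via biomaRt; gene symbols are illustrative.
#' searchGeneRegions(c("Sox2", "Nanog"), species = "mouse")
#' }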
#'
searchGeneRegions <- function(genes, species = "mouse") {
if (species == "mouse"){
mouse = biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
loci <- biomaRt::getBM(attributes = c( "mgi_symbol","chromosome_name",'start_position','end_position'), filters = "mgi_symbol", values = genes, mart = mouse)
} else{
human = biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
loci <- biomaRt::getBM(attributes = c( "hgnc_symbol","chromosome_name",'start_position','end_position'), filters = "hgnc_symbol", values = genes, mart = human)
}
return(loci)
}
#' find the nearby loci in a given loci set for a given set of genes
#'
#' @param genes a given set of genes
#' @param loci the background loci set
#' @param width the flanking width (bp) added upstream and downstream of each gene
#' @param species mouse or human
#' @importFrom bedr bedr.sort.region in.region
#' @export
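#' @examples
#' \dontrun{
#' # A sketch; requires biomaRt (internet) and the bedtools binary used by bedr.
#' # `scai` is a hypothetical scAI object whose ATAC loci are named "chr1:100-500" etc.
#' findNearbyLoci(c("Sox2"), loci = rownames(scai@norm.data$ATAC), width = 50000)
#' }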
#'
findNearbyLoci <- function(genes, loci, width = 50000, species = "mouse") {
gene.loci <- searchGeneRegions(genes, species)
gene.loci <- gene.loci[gene.loci$chromosome_name %in% c(1:22, "X","Y"), ]
gene.loci$start_position <- gene.loci$start_position - pmin(width, gene.loci$start_position)
gene.loci$end_position <- gene.loci$end_position + width
gene.loci.bed <- paste0("chr",gene.loci$chromosome_name, ":", gene.loci$start_position, "-",gene.loci$end_position)
gene.loci.bed.sort <- bedr::bedr.sort.region(gene.loci.bed)
# keep only loci on the standard chromosomes
loci <- loci[grepl(paste(paste0("chr", c(1:22, "X","Y"), ":"), collapse="|"), loci)]
loci.sort <- bedr::bedr.sort.region(loci)
is.region <- bedr::in.region(loci.sort, gene.loci.bed.sort)
loci.nearby <- loci.sort[is.region]
return(loci.nearby)
}
#' Select the rank K of the factorization
#'
#' @param object scAI object
#' @param rangeK A predefined range of K
#'
#' @return a ggplot object showing the stability score (cophenetic correlation coefficient) for each K
#' @export
#' @importFrom ggplot2 ggplot aes geom_line geom_point theme_classic labs theme element_text
#'
#' @examples
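#' \dontrun{
#' # A sketch: scan ranks 2 to 8 on a hypothetical scAI object `scai` that has
#' # already been fitted once (so that object@options$paras is populated).
#' selectK(scai, rangeK = 2:8)
#' }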
selectK <- function(object, rangeK = c(2:15)) {
coph <- vector("double", length(rangeK))
i <- 0
for (k in rangeK) {
i <- i+1
outs <- run_scAI(object, K = k, nrun = object@options$paras$nrun,
s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
CM <- outs@cluster$consensus
d1 <- dist(CM)
hc <- hclust(d1, method = "average")
d2 <- cophenetic(hc)
coph[i] <- cor(d1, d2)
}
df <- data.frame(k = rangeK, Coph = coph)
gg <- ggplot(df, aes(x = k, y = Coph)) + geom_line(size = 1) +
geom_point() +
theme_classic() + labs(x = 'K', y = 'Stability score (Coph)') +
scAI_theme_opts() + theme(text = element_text(size = 10)) +
theme(legend.position = "right")
gg
}
#' Reorder features according to the loading values
#'
#' @param W Basis matrix
#' @param cutoff Threshold of the loading values
#'
#' @return
#' @export
#'
#' @examples
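#' \dontrun{
#' # A sketch: group factor-specific features together in a basis matrix
#' # (features by factors) from a hypothetical fitted scAI object.
#' outs <- reorderFeatures(scai@fit$W$RNA, cutoff = 0.5)
#' }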
reorderFeatures <- function(W, cutoff = 0.5) {
M <- nrow(W)
K <- ncol(W)
MW <- colMeans(W)
sW <- apply(W, 2, sd)
# identify the features loading highly on each factor
IndexW_record <- vector("list", K)
IndexW_record[[1]] <- which(W[, 1] > MW[1] + cutoff * sW[1])
A <- setdiff(1:M, IndexW_record[[1]])
for (j in 2:K) {
IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff * sW[j])
# number of rows already assigned to previous factors
s <- 0
for (k in 1:(j - 1)) {
s <- s + length(IndexW_record[[k]])
}
# drop features already placed in an earlier block
ir2 <- which(IndexW_record[[j]] %in% 1:s)
if (length(ir2) > 0) {
IndexW_record[[j]] <- IndexW_record[[j]][-ir2]
}
n <- length(IndexW_record[[j]])
A <- setdiff(1:M, union(1:s, IndexW_record[[j]]))
W0 <- W
if (n > 0) {
W[(s + 1):(s + n), ] <- W0[IndexW_record[[j]], ]
}
if (s + n < M) {
W[(s + n + 1):M, ] <- W0[A, ]
}
}
results <- list()
results[["W"]] <- W
results[["index_record"]] <- IndexW_record
return(results)
}
#' Calculate the variance to mean ratio of logged values
#'
#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
#' log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the VMR in log-space
#'
#' @importFrom stats var
#'
#' @export
#'
#' @examples
#' LogVMR(x = c(1, 2, 3))
#'
LogVMR <- function(x) {
return(log(x = var(x = exp(x = x) - 1) / mean(x = exp(x = x) - 1)))
}
#' Calculate the mean of logged values
#'
#' Calculate mean of logged values in non-log space (return answer in log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the mean in log-space
#'
#' @export
#'
#' @examples
#' ExpMean(x = c(1, 2, 3))
#'
ExpMean <- function(x) {
return(log(x = mean(x = exp(x = x) - 1) + 1))
}
#' Convert a sparse matrix to a dense matrix
#'
#' @param mat A sparse matrix
#' @export
as_matrix <- function(mat){
Rcpp::sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix asMatrix(NumericVector rp,
NumericVector cp,
NumericVector z,
int nrows,
int ncols){
int k = z.size() ;
// use a NumericMatrix so non-integer values (e.g. normalized data) are not truncated
NumericMatrix mat(nrows, ncols);
for (int i = 0; i < k; i++){
mat(rp[i],cp[i]) = z[i];
}
return mat;
}
' )
row_pos <- mat@i
col_pos <- findInterval(seq(mat@x)-1,mat@p[-1])
tmp <- asMatrix(rp = row_pos, cp = col_pos, z = mat@x,
nrows = mat@Dim[1], ncols = mat@Dim[2])
row.names(tmp) <- mat@Dimnames[[1]]
colnames(tmp) <- mat@Dimnames[[2]]
return(tmp)
}