#' Get marker genes of each cluster for huge-size single-cell datasets
#'
#' This is an version of finding marker genes for huge-size single-cell datasets (e.g., the size of the scRNA-seq data is so huge that the expression matrix can not be read into R; instead, a list of block-wise expression matrices is read into R.) This also corresponds to SHARP_unlimited(). Detailed specifications can be found in get_marker_genes().
#'
#' @param scExp the list of original block-wise expression matrices
#'
#' @param y the clustering results after running SHARP.R
#'
#' @param n.cores number of cores to be used. The default is (n-1) cores, where n is the number of cores in your local computer or server.
#'
#' @return a matrix where each row represents each selected marker gene and each column represnts one property of the gene, including its cluster, p-value and AUROC.
#'
#' @examples
#'
#' y = SHARP(scExp)
#' sginfo = get_marker_genes_unlimited(scExp, y)
#'
#' @import ROCR
#'
#' @import doParallel
#'
#' @import data.table
#'
#' @export
get_marker_genes_unlimited <- function(scExp, y, theta, auc, pvalue, n.cores){
start_time <- Sys.time() #we exclude the time for loading the input matrix
if (missing(n.cores)) {
# number of cores to be used, the default is to use all but one cores
n.cores = detectCores() - 1
}
registerDoParallel(n.cores)
if(missing(theta)){
theta = 1e-5
}
if(missing(auc)){
auc = 0.85
}
if(missing(pvalue)){
pvalue = 0.01
}
ncells = y$N.cells#number of cells
N.cluster = y$N.pred_cluster
# ##remove those unnecessary (no changes) genes
cat("Preprocessing...\n")
len = length(scExp)
D = nrow(scExp[[1]])#number of genes
cat("Number of all genes:", D, "\n")
####remove those genes whose expressions are 0 across all cells
# q = foreach(i = 1:len, .combine= c)%dopar%{
# a = scExp[[i]]
# f1 = unique(a@i)+1#0-based index
# f2 = setdiff(1:D, f1)
# return(f2)
# }
q = foreach(a = scExp, .combine= c)%dopar%{
# a = scExp[[i]]
f1 = unique(a@i)+1#0-based index
f2 = setdiff(1:D, f1)
return(f2)
}
f = table(q)#statistics of repeated numbers
x = which(f==len)#all zeros in different lists
nx = as.numeric(names(x))#selected all-zero elements
ny = setdiff(1:D, nx)#non-zero genes
D = length(ny)
#
# nw = list()
# nw = foreach(i = 1:len)%dopar%{
# z = scExp[[i]][ny, ]
# return(z)
# }
# rm("scExp")
# gc()
# scExp = nw
# rm("nw")
# gc()
# my = scExp
#
# # my = my[apply(my,1,function(x) sd(x)!=0),]
# # my <- t(scale(t(my)))
#
# # mat = as.matrix(my)
# # mat = my
#
# # D = nrow(mat)#number of genes
# D = 10
cat("Number of valid genes:", D, "\n")
gnames = rownames(scExp[[1]])[ny]
uy = y$unique_pred_clusters#unique
if(max(as.numeric(uy)) > length(uy)){#in case we use a larger value than the number of unique elements
y$pred_clusters = match(y$pred_clusters, uy)#we use the index instead
}
# dt <- data.frame(yy = t(mat), ig = y$pred_clusters)#gene expression; cluster
label = as.numeric(y$pred_clusters)
rm("y")
gc()
dt <- data.frame(ig = label)#gene expression; cluster
# outdir = "tmp"
# if(!dir.exists(outdir)){#the folder to store the results
# dir.create(outdir)
# }else{
# unlink(paste0(outdir, "/*"))
# }
#
# pn = 1
# k = ceiling(D/pn)
# # D = 1000
# foreach(i = 1:D)%dopar%{
# # if(i!=k){
# # gi = ((i-1)*pn+1):i*pn
# # }else{
# # gi = ((i-1)*pn+1):D
# # }
#
# r = unlist(sapply(1:len, function(x) scExp[[x]][i,]))#the rank of the expression for the i-th gene across single cells
# # s1 = which(r!=0)
# # s2 = r[s1]
# # names(s2) = s1
# fn = paste(outdir, "/t", i, ".rds", sep = "")
# saveRDS(r, fn)
# }
# # rm(scExp)
# # rm(y)
# gc()
# allfiles = list.files(scExp_dir, full.names = TRUE)#include the full path
# len = length(allfiles)
cat("Checking genes...\n")
# foreach(i = 1:len)%dopar%{
# k = paste0("d",i)
# assign(k, scExp[[i]])
#
# return(NULL)
# }
# rm("scExp")
# gc()
#
#
# foreach(j = 1:D)%:%foreach(i = 1:len)%dopar%{
# k = paste0("d",i,"g",j)
# assign(k, scExp[[i]][j,])
# return(NULL)
# }
# rm("scExp")
# gc()
# D = 10
# total <- D
# # create progress bar
# pb <- txtProgressBar(min = 0, max = total, style = 3)
######parallel
# theta = 1e-5
# myenv = new.env()
# myenv = foreach(i = 1:20)%dopar%{
# # dn = paste0("d", i)
# unlist(sapply(1:len, function(x) scExp[[x]][i,]))
# }
# g5 = matrix(0, ncol = 5, nrow = D)
# ginfo = foreach(i = 1:D)%dopar%{
g5 = foreach(i = 1:D, .combine = "rbind")%dopar%{
tt = character(5)
# ginfo = foreach(i = 1:length(ny), .combine = c)%dopar%{
# cat("\nGene", ny[i], "\n")
# r = rank(mat[i, ])#gene rank
# cat("Collect the", i, "-th gene expression across 1.3 million cells...\n")
# eval(parse(text = paste0("mat <- myenv[[", i,"]]")))
# fn = paste(outdir, "/t", i, ".rds", sep = "")
# mat = readRDS(allfiles[i])
# r = rank(mat)
# r = rank(unlist(sapply(1:len, function(x) scExp[[x]][i,])))#the rank of the expression for the i-th gene across single cells
dd = unlist(sapply(1:len, function(x) scExp[[x]][ny[i],]))
dp = length(which(dd!=0))/length(dd)
gn = rownames(scExp[[1]])[ny[i]]#gene name
cat("Number", i, "Gene", ny[i], "---- Non-zero percentage:", dp, "\n")
if(dp > theta){
r = rank(dd)
# s1 = which(dd!=0)
# dt1 = dt[s1]
# s2 = aggregate(r1~dt1, FUN = mean)
# cat("Get average expression on the", i,"-th across different clusters...\n")
s = aggregate(r~dt$ig, FUN = mean)#average over one gene across cells
# cat("Get the cell index of the maximum expression...\n")
icluster = which.max(s$r)#the cluster with the maximum value
x1 = r[as.numeric(dt$ig) == icluster]
x2 = r[as.numeric(dt$ig) != icluster]
p = wilcox.test(x1, x2)$p.value
pred <- prediction(r, as.numeric(dt$ig) == icluster)
auc <- unlist(performance(pred, "auc")@y.values)
}else{
auc = icluster = p = 0
}
tt = c(auc, icluster, p, dp, gn)#character
# update progress bar
# setTxtProgressBar(pb, i)
# cat(c(auc, icluster, p), "\n")
return(tt)
}
# ginfo = matrix(ginfo, nrow = 3)
###############
# close(pb)
ginfo = g5[, 1:4]#collect those info
storage.mode(ginfo) = "numeric"#convert to numeric
ginfo = data.frame(ginfo)
rownames(ginfo) = g5[, 5]
# ginfo <- data.frame(matrix(ginfo, ncol = 5, byrow = T))#it is not necessarily to be a data frame
colnames(ginfo) = c("auc", "icluster", "pvalue", "sparsity")
# cat(ginfo, "\n")
nr = nrow(ginfo)
cat("Number of genes checked:", nr, "\n")
# exmat = scExp[[1]]
# rownames(ginfo) = gnames[1:nr]
ginfo = ginfo[ginfo$sparsity > theta, ]#remove low-expressed genes
ginfo = ginfo[!is.nan(ginfo$pvalue), ]#remove those corresponding to p-value == NaN
ginfo$pvalue = p.adjust(ginfo$pvalue, method = "holm")
###in case some clusters do not have any qualified marker genes, we need to lower the AUC threshold (0.85)
dt = data.table(ginfo)
dt0 = dt[, list(maxauc = max(auc), minpvalue = min(pvalue)), by = icluster]
##adjusted AUC and p-value
# adpvalue = max(0.01, max(dt0$minpvalue))
adpvalue = pvalue
adauc = min(auc, min(dt0$maxauc))
# adauc = auc
x1 = which(ginfo$pvalue < adpvalue)
x2 = which(ginfo$auc > adauc)
# x1 = which(ginfo$pvalue < 0.01)
#
# x2 = which(ginfo$auc > 0.85)
xx = intersect(x1, x2)
mginfo = ginfo[xx, ]#selected genes with p-values and their clusters
nsgene = nrow(mginfo)
cat("Number of selected marker genes:", nsgene, "\n")
#the corresponding expressions of marker genes
# dd = foreach(i = 1:nsgene, .combine = "rbind")%dopar%{
# unlist(sapply(1:len, function(x) scExp[[x]][rownames(mginfo[i, ]),]))
# }
dd = foreach(i = 1:len, .combine = "cbind")%dopar%{
colnames(scExp[[i]]) = NULL#remove the cell id
x = scExp[[i]][rownames(mginfo), , drop = FALSE]#even for one row, it is still regarded as a matrix with the nrow == 1
return(x)
}
# dd = Matrix
# adpvalue = max(mginfo$pvalue)
# adauc = min(mginfo$auc)
end_time <- Sys.time()
t <- difftime(end_time, start_time, units = "mins") #difference time in minutes
cat("Running time:", t ,"minutes\n")
cat("-----------------------------------------------------------------------\n")
cat("The adjusted p-avalue is", adpvalue, "and the adjusted AUROC is", adauc, "\n")
res = list()
res$mginfo = mginfo
res$mat = dd
res$label = label
return(res)
}
# rankcells <- function(x){
# n = length(x)
# s1 = which(x!=0)#indices of non-zero cells
# s2 = setdiff(1:n, s1)#indices of zero cells
# ns2 = length(s2)#number of zeros
#
# y = numeric(n)
# y[s2] = (1+ns2)/2#ranking for those zeros
# y[s1] = rank(x[s1]) + ns2#ranking for those non-zeros
#
# return(y)
# }
# c2t <- function(w, n.cores){
# registerDoParallel(n.cores)
# len = length(w)
#
# s = list()
# s = foreach(i = 1:len)%dopar%{
# as(w[[i]], "dgTMatrix")
# }
# stopImplicitCluster()
# }
#
# sp2 <- function(s){
# f = cbind(s@i, s@j, s@x)
#
# return(f)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.