#' Conduct Louvin Clustering on single dataset
#'
#' Conduct Louvin Clustering on single dataset
#' @param count
#'
#' @return result.cluster
#' @import Seurat
#' @export
#'
#' @examples
#'
Clustering <- function(count){
rownames(count) <- as.character(1:dim(count)[1])
colnames(count) <- as.character(1:dim(count)[2])
sink('NUL')
sink(stdout(), type = "message")
seurat <- CreateSeuratObject(counts = count)
seurat <- NormalizeData(seurat)
seurat <- ScaleData(seurat)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
seurat <- RunPCA(seurat)
seurat <- FindNeighbors(seurat, dims = 1:10)
seurat <- FindClusters(seurat, resolution = 0.5)
result.cluster <- seurat@meta.data[["seurat_clusters"]]
sink(NULL, type="message")
sink()
return(result.cluster)
}
#' Conduct cell clustering on mutiple datasets
#'
#' Conduct Louvin Clustering on datasets generated by doublet-detection methods.
#' @param count.list.all A list of scRNA-seq data matrices after removing doublets by different methods with single removal rate.
#'
#' @return A list of clustering result (number of clusters) under different removal rates.
#' @export
#'
#' @examples
#' result.cluster.all <- Clustering.All(count.list.all = data.removal.all$count)
#'
Clustering.All <- function(count.list.all){
result.cluster.all <- list()
for(name in names(count.list.all)){
cat("\nNumber of clusters: ", name, '\n', file = stderr())
count.list <- count.list.all[[name]]
for(method in names(count.list)){
cat("Detection method: ", method, '\n', file = stderr())
count <- count.list[[method]]; dim(count)
result.cluster <- Clustering(count = count); table(result.cluster)
result.cluster.all[[name]][[method]] <- result.cluster
}
}
return(result.cluster.all)
}
#' Conduct cell clustering on mutiple datasets and removal rates
#'
#' Conduct Louvin Clustering on datasets generated with different doublet removal rates and doublet-detection methods.
#' @param data.removal.all.rate A list of scRNA-seq data matrices after removing doublets by different methods with different removal rates.
#'
#' @return A list of clustering result (number of clusters) under different removal rates.
#' @export
#'
#' @examples
#' result.cluster.all.rate <- Clustering.All.Rate(data.removal.all.rate)
#'
Clustering.All.Rate <- function(data.removal.all.rate){
result.cluster.all.rate <- list()
cat('\nConducting Louving Clustering on datasets with:\n', file = stderr())
for(rate in names(data.removal.all.rate)){
cat('\nDoublet rate ', rate, '\n', file = stderr())
count.list.all <- data.removal.all.rate[[rate]]$count
result.cluster.all <- Clustering.All(count.list.all)
result.cluster.all.rate[[rate]] <- result.cluster.all
}
return(result.cluster.all.rate)
}
#' Calculate singlet rates
#'
#' Calculate singlet rates in correctly identified clusters for different doublet-detection methods.
#' @param table.cluster A data frame with clustering result.
#' @param result.cluster.all.rate A list of clustering result (number of clusters) under different removal rates.
#' @param data.removal.all.rate A list of scRNA-seq data matrices after removing doublets by different methods with different removal rates.
#'
#' @return A data frame with singlet rates in each correctly identified cluster.
#' @export
#'
#' @examples
#' table.cluster.quality <- Clustering.Quality(table.cluster, result.cluster.all.rate, data.removal.all.rate)
#'
Clustering.Quality <- function(table.cluster, result.cluster.all.rate, data.removal.all.rate){
result <- data.frame()
table.cluster.correct <- table.cluster[table.cluster$cluster_correct==table.cluster$Louvain_clustering,]
for(i in 1:nrow(table.cluster.correct)){
#i=1
rate <- table.cluster.correct[i, 'removal_rate']; rate
correct <- table.cluster.correct[i, 'cluster_correct']; correct
method <- table.cluster.correct[i, 'method']; method
label <- data.removal.all.rate[[rate]]$label[[correct]][[method]]; table(label)
cluster <- result.cluster.all.rate[[rate]][[correct]][[method]]; table(cluster)
singlet.rate.vector <- c()
for(c in 0:(nlevels(cluster)-1)){
singlet.rate <- mean(label[which(cluster==c)]==0)
singlet.rate.vector <- c(singlet.rate.vector, singlet.rate)
}
result.correct <- rep(correct, nlevels(cluster))
result.method <- rep(method, nlevels(cluster))
result.dataframe.temp <- data.frame(method=result.method, value=singlet.rate.vector, correct=result.correct)
result.dataframe.temp
result <- rbind(result, result.dataframe.temp)
}
return(result)
}
#' Find DE genes
#'
#' Conduct DE gene analysis on a list of datasets by user-specified methods.
#' @param data.removal.list A list of scRNA-seq datasets with cell type annotations.
#' @param method.de A name of DE method: "wilcox", "bimod", "t", "poisson", "negbinom", "LR", or "MAST".
#'
#' @return A list of DE gene indices identified for each doublet-detection method.
#' @import Seurat
#' @export
#'
#' @examples
#' table.DE.all <- data.frame()
#' for(DE.method in c('MAST', 'wilcox', 'bimod')){
#' DE.list <- FindDE(data.removal.list = data.removal.list, method.de = DE.method)
#' DE.acc.list <- FindDEACC(DE.list=DE.list, DE.truth=data.de$gene.de, gene.all=rownames(data.de$count))
#' table.DE <- ListToDataframe(l = DE.acc.list, type = 'barplot')
#' table.DE[['DE_method']] <- DE.method
#' table.DE.all <- rbind(table.DE.all, table.DE)
#' }
#'
FindDE <- function(data.removal.list, method.de='wilcox'){
cat("\nConduct DE analysis by", method.de, '...\n', file = stderr())
result.de <- list()
for(method in names(data.removal.list)){
count <- data.removal.list[[method]]$count; dim(count)
label.cluster <- data.removal.list[[method]]$label; table(label.cluster)
cat(method, ' Data ...\n', file = stderr())
sink('NUL')
sink(stdout(), type = "message")
seurat <- CreateSeuratObject(counts = count)
#count.method <- seurat.clean[["RNA"]]@counts; dim(count.method)
seurat <- NormalizeData(seurat)
seurat <- ScaleData(seurat)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
seurat <- RunPCA(seurat)
Idents(seurat) <- as.factor(label.cluster); levels(seurat)
marker <- FindMarkers(seurat, ident.1 = '0', ident.2 = "1", test.use = method.de)
table.de <- marker[marker$p_val_adj <= 0.05,]; dim(table.de)
de <- rownames(table.de)
result.de[[method]] <- de
sink(NULL, type="message")
sink()
}
return(result.de)
}
#' Calculate precision, recall, and TNR of identified DE genes
#'
#' Calculate precision, recall, and TNR of identified DE genes across different doublet-detection methods and DE methods.
#' @param DE.list A list of identified DE genes.
#' @param DE.truth A list of ground-truth DE genes.
#' @param gene.all A vector of all gene names.
#'
#' @return A list of precision, recall, and TNR across different doublet-detection methods and DE methods.
#' @export
#'
#' @examples
#' table.DE.all <- data.frame()
#' for(DE.method in c('MAST', 'wilcox', 'bimod')){
#' DE.list <- FindDE(data.removal.list = data.removal.list, method.de = DE.method)
#' DE.acc.list <- FindDEACC(DE.list=DE.list, DE.truth=data.de$gene.de, gene.all=rownames(data.de$count))
#' table.DE <- ListToDataframe(l = DE.acc.list, type = 'barplot')
#' table.DE[['DE_method']] <- DE.method
#' table.DE.all <- rbind(table.DE.all, table.DE)
#' }
#'
FindDEACC <- function(DE.list, DE.truth, gene.all){
ACC.list <- list()
for(method in names(DE.list)){
#method <- names(DE.list)[1]; method
DE.result <- DE.list[[method]]
tp <- length(intersect(DE.result, DE.truth)); tp
fp <- length(setdiff(DE.result, DE.truth)); fp
fn <- length(setdiff(DE.truth, DE.result)); fn
nde.truth <- setdiff(gene.all, DE.truth); length(nde.truth)
nde.result <- setdiff(gene.all, DE.result); length(nde.result)
tn <- length(intersect(nde.truth, nde.result)); tn
precision <- tp / (tp + fp); precision
recall <- tp / (tp + fn); recall
tnr <- tn / (tn + fn); tnr
ACC.list[[method]] <- list(precision=precision, recall=recall, tnr=tnr)
}
return(ACC.list)
}
#' Conduct cell trajectory inference
#'
#' Conduct cell trajectory inference by Slingshot on one dataset and draw the trajectory.
#' @param count A scRNA-seq data matrix.
#' @param label A vector of doublet annotations (0/1).
#' @param title A title of the visualization.
#'
#' @return No return values.
#' @import SingleCellExperiment
#' @import slingshot
#' @import mclust
#' @import SummarizedExperiment
#' @export
#'
#' @examples
#' data.trajectory <- readRDS('.../sim_psudotime_bifurcating.rds')
#' count <- data.trajectory$count
#' label <- data.trajectory$label
#' FindTrajectory(count=count, label=label, title='Contaminated Data')
#'
FindTrajectory <- function(count, label, title){
cat('\nInfering cell trajectory by slingshot...\n', file = stderr())
cat('saveing trajectory plot to current working directory...\n', file = stderr())
sce <- SingleCellExperiment(assays = List(counts = count))
FQnorm <- function(count){
rk <- apply(count,2,rank,ties.method='min')
count.sort <- apply(count,2,sort)
refdist <- apply(count.sort,1,median)
norm <- apply(rk,2,function(r){ refdist[r] })
rownames(norm) <- rownames(count)
return(norm)
}
assays(sce)$norm <- FQnorm(assays(sce)$counts)
pca <- prcomp(t(log1p(assays(sce)$norm)), scale. = FALSE)
embedding.pca <- pca$x[,1:2]
reducedDims(sce) <- SimpleList(PCA = embedding.pca)
label.cluster <- Mclust(embedding.pca)$classification
colData(sce)$GMM <- label.cluster
sce <- slingshot(sce, clusterLabels = 'GMM', reducedDim = 'PCA')
embedding.pca <- as.data.frame(embedding.pca)
embedding.pca$type <- as.numeric(label)
embedding.pca$type <- ifelse(embedding.pca$type==0, 'singlet', 'doublet')
embedding.pca$type <- as.factor(embedding.pca$type)
if(length(table(embedding.pca$type))==1){
palette(c("grey","red"))
}else{
palette(c("red","grey"))
}
plot(embedding.pca$PC1, embedding.pca$PC2, col = embedding.pca$type, pch=16, asp = 0,
main=title, xlab='', ylab='', xaxt='n', yaxt='n')
lines(SlingshotDataSet(sce), lwd=5, col='black')
legend(x="bottomright",legend=c("Doublet","Singlet"),col=c("red","grey"), lwd=1, lty=c(NA,NA),
pch=c(19,19), cex=1.3)
png(paste(title, ' trajectory.png', sep = ''))
plot(embedding.pca$PC1, embedding.pca$PC2, col = embedding.pca$type, pch=16, asp = 0,
main=title, xlab='', ylab='', xaxt='n', yaxt='n')
lines(SlingshotDataSet(sce), lwd=5, col='black')
legend(x="bottomright",legend=c("Doublet","Singlet"),col=c("red","grey"), lwd=1, lty=c(NA,NA),
pch=c(19,19), cex=1.3)
dev.off()
}
#' Infer temporally DE genes and calculate corresponding accuracy
#'
#' Infer temporally DE genes by Slingshot and GAM model on one dataset. Calculate precision, recall, and TNR of identified doublets.
#' @param count A scRNA-seq data matrix.
#' @param gene.de A vector of ground-truth temporally DE genes.
#'
#' @return A list of precision, recall, and TNR of identified temporally DE genes.
#'
#' @import SingleCellExperiment
#' @import slingshot
#' @import mclust
#' @import gam
#' @import S4Vectors
#' @import SummarizedExperiment
#' @export
#'
#' @examples
#' data.trajectory <- readRDS('.../sim_temporally_DE.rds')
#' count <- data.trajectory$count
#' gene.de <- data.trajectory$gene.de
#' de.temp.list <- FindTempDE(count=count, gene.de=gene.de)
#'
FindTempDE <- function(count, gene.de){
cat('\nInfering temporal DE genes by slingshot...\n', file = stderr())
sce <- SingleCellExperiment(assays = List(counts = count))
FQnorm <- function(count){
rk <- apply(count,2,rank,ties.method='min')
count.sort <- apply(count,2,sort)
refdist <- apply(count.sort,1,median)
norm <- apply(rk,2,function(r){ refdist[r] })
rownames(norm) <- rownames(count)
return(norm)
}
assays(sce)$norm <- FQnorm(assays(sce)$counts)
pca <- prcomp(t(log1p(assays(sce)$norm)), scale. = FALSE)
embedding.pca <- pca$x[,1:2]
reducedDims(sce) <- SimpleList(PCA = embedding.pca)
label.cluster <- Mclust(embedding.pca)$classification
colData(sce)$GMM <- label.cluster
sce <- slingshot(sce, clusterLabels = 'GMM', reducedDim = 'PCA')
t <- sce$slingPseudotime_1; t
Y <- log1p(assays(sce)$norm); dim(Y)
gam.pval <- apply(Y,1,function(z){
d <- data.frame(z=z, t=t)
tmp <- suppressWarnings(gam(z ~ lo(t), data=d))
p <- summary(tmp)[3][[1]][2,3]
p
})
p <- .05
findDE <- names(gam.pval[gam.pval<=p]); length(findDE)
findnonDE <- names(gam.pval[gam.pval>p]); length(findnonDE)
nonDE <- rownames(count)[1:500]
DE <- gene.de
tp <- length(intersect(findDE, DE)); tp
fp <- length(setdiff(findDE, DE)); fp
fn <- length(setdiff(findnonDE, nonDE)); fn
tn <- length(intersect(findnonDE, nonDE)); tn
precision <- tp/(tp+fp); precision
recall <- tp/(tp+fn); recall
tnr <- tn/(tn+fp); tnr
de.list <- list(precision=precision, recall=recall, tnr=tnr)
return(de.list)
}
#' Infer temporally DE genes and calculate corresponding DE genes for multiple methods
#'
#' Infer temporally DE genes by Slingshot and GAM model on multiple datasets. Calculate corresponding precision, recall, and TNR of identified doublets.
#' @param data.list A list of scRNA-seq count matrix.
#' @param gene.de A vector of ground-truth temporally DE genes.
#'
#' @return A list of precision, recall, and TNR of identified temporally DE genes on multiple datasets.
#' @export
#'
#' @examples
#' doublet.list <- FindDoublets(score.list = score.list, rate = 0.2)
#' data.removal.list <- RemoveDoublets.Method(count = count, label = label, doublet.list = doublet.list)
#' data.removal.list[['Clean Data']] <- list(count=count.clean, label=label.clean)
#' de.temp.result.all <- FindTempDE.All(data.list=data.removal.list, gene.de=gene.de)
#'
FindTempDE.All <- function(data.list, gene.de){
de.temp.list.all <- list()
for(method in names(data.list)){
cat('\n')
cat(method, "Dataset", file = stderr())
count <- data.list[[method]]$count
de.temp.list <- FindTempDE(count = count, gene.de = gene.de)
de.temp.list.all[[method]] <- de.temp.list
}
return(de.temp.list.all)
}
#' Calculate concatenated AUPRC or AUROC
#'
#' Concatenate doublet scores of different batches. Calculate AUPRC or AUROC on concatenated doublet scores.
#' @param score.list.all A list of doublet scores of each batch.
#' @param label.list A list of doublet annotations of each batch.
#' @param type A measurement type of "AUPRC" or "AUROC".
#' @return A list of AUPRC or AUROC of different batches.
#' @export
#'
#' @examples
#'
FindDistributedAUC <- function(score.list.all, label.list, type){
score.list.merge <- do.call(Map, c(c, score.list.all))
label.merge <- as.numeric(unlist(label.list))
auc.list <- FindAUC(score.list = score.list.merge, label = label.merge, type=type)
return(auc.list)
}
#' Calculate detection accuracy under distributed computing
#'
#' Randomly split one dataset into user-specified batches. Perform doublet detection on each batch by different methods to calculate doublet scores.
#' Concatenate doublet scores per batch. Calclulate AUPRC or AUROC on concatenated doublet scores.
#' @param count A scRNA-seq data matrix.
#' @param label A vector of doublet annotations (0/1).
#' @param batches A vector of batch numbers.
#' @param type A string of measurement "AUPRC" or "AUROC".
#' @return A list of measurement across different batch numbers and methods.
#' @export
#'
#' @examples
#' data.list <- ReadData(path = ".../real_datasets")
#' count <- data.list$count$`pbmc-ch`
#' label <- data.list$label$`pbmc-ch`
#' label <- ifelse(label == 'doublet', 1, 0)
#' auc.list.batch <- FindDistributedAUC.All(count = count, label = label, method=methods, batches = 2:10, type='AUPRC')
#'
FindDistributedAUC.All <- function(count, label, method, batches, type){
auc.list.batch <- list()
for(batch in batches){
cat('\nbatch number', batch, '\n', file = stderr())
data.split <- SplitData(count=count, label=label, batch=batch)
score.list.all <- FindScores.All(count.list = data.split$count, methods = method)
auc.list <- FindDistributedAUC(score.list.all=score.list.all, label.list=data.split$label, type=type)
auc.list.batch[[as.character(batch)]] <- auc.list
}
return(auc.list.batch)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.