DoubletCollection: Find doublets in scRNA-seq data

Documented in Clustering Clustering.All Clustering.All.Rate Clustering.Quality FindDE FindDEACC FindDistributedAUC FindDistributedAUC.All FindTempDE FindTempDE.All FindTrajectory

#' Conduct Louvin Clustering on single dataset
#'
#' Conduct Louvin Clustering on single dataset
#' @param count
#'
#' @return result.cluster
#' @import Seurat
#' @export
#'
#' @examples
#'
Clustering <- function(count){
  rownames(count) <- as.character(1:dim(count)[1])
  colnames(count) <- as.character(1:dim(count)[2])

  sink('NUL')
  sink(stdout(), type = "message")
  seurat <- CreateSeuratObject(counts = count)
  seurat <- NormalizeData(seurat)
  seurat <- ScaleData(seurat)
  seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
  seurat <- RunPCA(seurat)
  seurat <- FindNeighbors(seurat, dims = 1:10)
  seurat <- FindClusters(seurat, resolution = 0.5)
  result.cluster <- seurat@meta.data[["seurat_clusters"]]
  sink(NULL, type="message")
  sink()
  return(result.cluster)
}

#' Conduct cell clustering on mutiple datasets
#'
#' Conduct Louvin Clustering on datasets generated by doublet-detection methods.
#' @param count.list.all A list of scRNA-seq data matrices after removing doublets by different methods with single removal rate.
#'
#' @return A list of clustering result (number of clusters) under different removal rates.
#' @export
#'
#' @examples
#' result.cluster.all <- Clustering.All(count.list.all = data.removal.all$count)
#'
Clustering.All <- function(count.list.all){
  result.cluster.all <- list()
  for(name in names(count.list.all)){
    cat("\nNumber of clusters: ", name, '\n', file = stderr())
    count.list <- count.list.all[[name]]
    for(method in names(count.list)){
      cat("Detection method: ", method, '\n', file = stderr())
      count <- count.list[[method]]; dim(count)
      result.cluster <- Clustering(count = count); table(result.cluster)
      result.cluster.all[[name]][[method]] <- result.cluster
    }
  }
  return(result.cluster.all)
}


#' Conduct cell clustering on mutiple datasets and removal rates
#'
#' Conduct Louvin Clustering on datasets generated with different doublet removal rates and doublet-detection methods.
#' @param data.removal.all.rate A list of scRNA-seq data matrices after removing doublets by different methods with different removal rates.
#'
#' @return A list of clustering result (number of clusters) under different removal rates.
#' @export
#'
#' @examples
#' result.cluster.all.rate <- Clustering.All.Rate(data.removal.all.rate)
#'
Clustering.All.Rate <- function(data.removal.all.rate){
  result.cluster.all.rate <- list()
  cat('\nConducting Louving Clustering on datasets with:\n', file = stderr())
  for(rate in names(data.removal.all.rate)){
    cat('\nDoublet rate ', rate, '\n', file = stderr())
    count.list.all <- data.removal.all.rate[[rate]]$count
    result.cluster.all <- Clustering.All(count.list.all)
    result.cluster.all.rate[[rate]] <- result.cluster.all
  }
  return(result.cluster.all.rate)
}


#' Calculate singlet rates
#'
#' Calculate singlet rates in correctly identified clusters for different doublet-detection methods.
#' @param table.cluster A data frame with clustering result.
#' @param result.cluster.all.rate A list of clustering result (number of clusters) under different removal rates.
#' @param data.removal.all.rate A list of scRNA-seq data matrices after removing doublets by different methods with different removal rates.
#'
#' @return A data frame with singlet rates in each correctly identified cluster.
#' @export
#'
#' @examples
#' table.cluster.quality <- Clustering.Quality(table.cluster, result.cluster.all.rate, data.removal.all.rate)
#'
Clustering.Quality <- function(table.cluster, result.cluster.all.rate, data.removal.all.rate){
  result <- data.frame()
  table.cluster.correct <- table.cluster[table.cluster$cluster_correct==table.cluster$Louvain_clustering,]
  for(i in 1:nrow(table.cluster.correct)){
    #i=1
    rate <- table.cluster.correct[i, 'removal_rate']; rate
    correct <- table.cluster.correct[i, 'cluster_correct']; correct
    method <- table.cluster.correct[i, 'method']; method
    label <- data.removal.all.rate[[rate]]$label[[correct]][[method]]; table(label)
    cluster <- result.cluster.all.rate[[rate]][[correct]][[method]]; table(cluster)
    singlet.rate.vector <- c()
    for(c in 0:(nlevels(cluster)-1)){
      singlet.rate <- mean(label[which(cluster==c)]==0)
      singlet.rate.vector <- c(singlet.rate.vector, singlet.rate)
    }
    result.correct <- rep(correct, nlevels(cluster))
    result.method <- rep(method, nlevels(cluster))
    result.dataframe.temp <- data.frame(method=result.method, value=singlet.rate.vector, correct=result.correct)
    result.dataframe.temp
    result <- rbind(result, result.dataframe.temp)
  }
  return(result)
}


#' Find DE genes
#'
#' Conduct DE gene analysis on a list of datasets by user-specified methods.
#' @param data.removal.list A list of scRNA-seq datasets with cell type annotations.
#' @param method.de A name of DE method: "wilcox", "bimod", "t", "poisson", "negbinom", "LR", or "MAST".
#'
#' @return A list of DE gene indices identified for each doublet-detection method.
#' @import Seurat
#' @export
#'
#' @examples
#' table.DE.all <- data.frame()
#' for(DE.method in c('MAST', 'wilcox', 'bimod')){
#'  DE.list <- FindDE(data.removal.list = data.removal.list, method.de = DE.method)
#'  DE.acc.list <- FindDEACC(DE.list=DE.list, DE.truth=data.de$gene.de, gene.all=rownames(data.de$count))
#'  table.DE <- ListToDataframe(l = DE.acc.list, type = 'barplot')
#'  table.DE[['DE_method']] <- DE.method
#'  table.DE.all <- rbind(table.DE.all, table.DE)
#' }
#'
FindDE <- function(data.removal.list, method.de='wilcox'){

  cat("\nConduct DE analysis by", method.de, '...\n', file = stderr())
  result.de <- list()

  for(method in names(data.removal.list)){
    count <- data.removal.list[[method]]$count; dim(count)
    label.cluster <- data.removal.list[[method]]$label; table(label.cluster)
    cat(method, ' Data ...\n', file = stderr())

    sink('NUL')
    sink(stdout(), type = "message")
    seurat <- CreateSeuratObject(counts = count)
    #count.method <- seurat.clean[["RNA"]]@counts; dim(count.method)
    seurat <- NormalizeData(seurat)
    seurat <- ScaleData(seurat)
    seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
    seurat <- RunPCA(seurat)
    Idents(seurat) <- as.factor(label.cluster); levels(seurat)
    marker <- FindMarkers(seurat, ident.1 = '0', ident.2 = "1", test.use = method.de)
    table.de <- marker[marker$p_val_adj <= 0.05,]; dim(table.de)
    de <- rownames(table.de)
    result.de[[method]] <- de

    sink(NULL, type="message")
    sink()
  }
  return(result.de)
}


#' Calculate precision, recall, and TNR of identified DE genes
#'
#' Calculate precision, recall, and TNR of identified DE genes across different doublet-detection methods and DE methods.
#' @param DE.list A list of identified DE genes.
#' @param DE.truth A list of ground-truth DE genes.
#' @param gene.all A vector of all gene names.
#'
#' @return A list of precision, recall, and TNR across different doublet-detection methods and DE methods.
#' @export
#'
#' @examples
#' table.DE.all <- data.frame()
#' for(DE.method in c('MAST', 'wilcox', 'bimod')){
#'  DE.list <- FindDE(data.removal.list = data.removal.list, method.de = DE.method)
#'  DE.acc.list <- FindDEACC(DE.list=DE.list, DE.truth=data.de$gene.de, gene.all=rownames(data.de$count))
#'  table.DE <- ListToDataframe(l = DE.acc.list, type = 'barplot')
#'  table.DE[['DE_method']] <- DE.method
#'  table.DE.all <- rbind(table.DE.all, table.DE)
#' }
#'
FindDEACC <- function(DE.list, DE.truth, gene.all){
  ACC.list <- list()
  for(method in names(DE.list)){
    #method <- names(DE.list)[1]; method
    DE.result <- DE.list[[method]]
    tp <- length(intersect(DE.result, DE.truth)); tp
    fp <- length(setdiff(DE.result, DE.truth)); fp
    fn <- length(setdiff(DE.truth, DE.result)); fn
    nde.truth <- setdiff(gene.all, DE.truth); length(nde.truth)
    nde.result <- setdiff(gene.all, DE.result); length(nde.result)
    tn <- length(intersect(nde.truth, nde.result)); tn
    precision <- tp / (tp + fp); precision
    recall <- tp / (tp + fn); recall
    tnr <- tn / (tn + fn); tnr
    ACC.list[[method]] <- list(precision=precision, recall=recall, tnr=tnr)
  }
  return(ACC.list)
}


#' Conduct cell trajectory inference
#'
#' Conduct cell trajectory inference by Slingshot on one dataset and draw the trajectory.
#' @param count A scRNA-seq data matrix.
#' @param label A vector of doublet annotations (0/1).
#' @param title A title of the visualization.
#'
#' @return No return values.
#' @import SingleCellExperiment
#' @import slingshot
#' @import mclust
#' @import SummarizedExperiment
#' @export
#'
#' @examples
#' data.trajectory <- readRDS('.../sim_psudotime_bifurcating.rds')
#' count <- data.trajectory$count
#' label <- data.trajectory$label
#' FindTrajectory(count=count, label=label, title='Contaminated Data')
#'
FindTrajectory <- function(count, label, title){
  cat('\nInfering cell trajectory by slingshot...\n', file = stderr())
  cat('saveing trajectory plot to current working directory...\n', file = stderr())
  sce <- SingleCellExperiment(assays = List(counts = count))
  FQnorm <- function(count){
    rk <- apply(count,2,rank,ties.method='min')
    count.sort <- apply(count,2,sort)
    refdist <- apply(count.sort,1,median)
    norm <- apply(rk,2,function(r){ refdist[r] })
    rownames(norm) <- rownames(count)
    return(norm)
  }
  assays(sce)$norm <- FQnorm(assays(sce)$counts)
  pca <- prcomp(t(log1p(assays(sce)$norm)), scale. = FALSE)
  embedding.pca <- pca$x[,1:2]
  reducedDims(sce) <- SimpleList(PCA = embedding.pca)
  label.cluster <- Mclust(embedding.pca)$classification
  colData(sce)$GMM <- label.cluster
  sce <- slingshot(sce, clusterLabels = 'GMM', reducedDim = 'PCA')
  embedding.pca <- as.data.frame(embedding.pca)
  embedding.pca$type <- as.numeric(label)
  embedding.pca$type <- ifelse(embedding.pca$type==0, 'singlet', 'doublet')
  embedding.pca$type <- as.factor(embedding.pca$type)
  if(length(table(embedding.pca$type))==1){
    palette(c("grey","red"))
  }else{
    palette(c("red","grey"))
  }

  plot(embedding.pca$PC1, embedding.pca$PC2, col = embedding.pca$type, pch=16, asp = 0,
       main=title, xlab='', ylab='', xaxt='n', yaxt='n')
  lines(SlingshotDataSet(sce), lwd=5, col='black')
  legend(x="bottomright",legend=c("Doublet","Singlet"),col=c("red","grey"), lwd=1, lty=c(NA,NA),
         pch=c(19,19), cex=1.3)

  png(paste(title, ' trajectory.png', sep = ''))
  plot(embedding.pca$PC1, embedding.pca$PC2, col = embedding.pca$type, pch=16, asp = 0,
       main=title, xlab='', ylab='', xaxt='n', yaxt='n')
  lines(SlingshotDataSet(sce), lwd=5, col='black')
  legend(x="bottomright",legend=c("Doublet","Singlet"),col=c("red","grey"), lwd=1, lty=c(NA,NA),
         pch=c(19,19), cex=1.3)
  dev.off()
}


#' Infer temporally DE genes and calculate corresponding accuracy
#'
#' Infer temporally DE genes by Slingshot and GAM model on one dataset. Calculate precision, recall, and TNR of identified doublets.
#' @param count A scRNA-seq data matrix.
#' @param gene.de A vector of ground-truth temporally DE genes.
#'
#' @return A list of precision, recall, and TNR of identified temporally DE genes.
#'
#' @import SingleCellExperiment
#' @import slingshot
#' @import mclust
#' @import gam
#' @import S4Vectors
#' @import SummarizedExperiment
#' @export
#'
#' @examples
#' data.trajectory <- readRDS('.../sim_temporally_DE.rds')
#' count <- data.trajectory$count
#' gene.de <- data.trajectory$gene.de
#' de.temp.list <- FindTempDE(count=count, gene.de=gene.de)
#'
FindTempDE <- function(count, gene.de){
  cat('\nInfering temporal DE genes by slingshot...\n', file = stderr())
  sce <- SingleCellExperiment(assays = List(counts = count))
  FQnorm <- function(count){
    rk <- apply(count,2,rank,ties.method='min')
    count.sort <- apply(count,2,sort)
    refdist <- apply(count.sort,1,median)
    norm <- apply(rk,2,function(r){ refdist[r] })
    rownames(norm) <- rownames(count)
    return(norm)
  }
  assays(sce)$norm <- FQnorm(assays(sce)$counts)
  pca <- prcomp(t(log1p(assays(sce)$norm)), scale. = FALSE)
  embedding.pca <- pca$x[,1:2]
  reducedDims(sce) <- SimpleList(PCA = embedding.pca)
  label.cluster <- Mclust(embedding.pca)$classification
  colData(sce)$GMM <- label.cluster
  sce <- slingshot(sce, clusterLabels = 'GMM', reducedDim = 'PCA')

  t <- sce$slingPseudotime_1; t
  Y <- log1p(assays(sce)$norm); dim(Y)
  gam.pval <- apply(Y,1,function(z){
    d <- data.frame(z=z, t=t)
    tmp <- suppressWarnings(gam(z ~ lo(t), data=d))
    p <- summary(tmp)[3][[1]][2,3]
    p
  })
  p <- .05
  findDE <- names(gam.pval[gam.pval<=p]); length(findDE)
  findnonDE <- names(gam.pval[gam.pval>p]); length(findnonDE)
  nonDE <- rownames(count)[1:500]
  DE <- gene.de
  tp <- length(intersect(findDE, DE)); tp
  fp <- length(setdiff(findDE, DE)); fp
  fn <- length(setdiff(findnonDE, nonDE)); fn
  tn <- length(intersect(findnonDE, nonDE)); tn

  precision <- tp/(tp+fp); precision
  recall <- tp/(tp+fn); recall
  tnr <- tn/(tn+fp); tnr

  de.list <- list(precision=precision, recall=recall, tnr=tnr)
  return(de.list)
}


#' Infer temporally DE genes and calculate corresponding DE genes for multiple methods
#'
#' Infer temporally DE genes by Slingshot and GAM model on multiple datasets. Calculate corresponding precision, recall, and TNR of identified doublets.
#' @param data.list A list of scRNA-seq count matrix.
#' @param gene.de A vector of ground-truth temporally DE genes.
#'
#' @return A list of precision, recall, and TNR of identified temporally DE genes on multiple datasets.
#' @export
#'
#' @examples
#' doublet.list <- FindDoublets(score.list = score.list, rate = 0.2)
#' data.removal.list <- RemoveDoublets.Method(count = count, label = label, doublet.list = doublet.list)
#' data.removal.list[['Clean Data']] <- list(count=count.clean, label=label.clean)
#' de.temp.result.all <- FindTempDE.All(data.list=data.removal.list, gene.de=gene.de)
#'
FindTempDE.All <- function(data.list, gene.de){
  de.temp.list.all <- list()
  for(method in names(data.list)){
    cat('\n')
    cat(method, "Dataset", file = stderr())
    count <- data.list[[method]]$count
    de.temp.list <- FindTempDE(count = count, gene.de = gene.de)
    de.temp.list.all[[method]] <- de.temp.list
  }
  return(de.temp.list.all)
}


#' Calculate concatenated AUPRC or AUROC
#'
#' Concatenate doublet scores of different batches. Calculate AUPRC or AUROC on concatenated doublet scores.
#' @param score.list.all A list of doublet scores of each batch.
#' @param label.list A list of doublet annotations of each batch.
#' @param type A measurement type of "AUPRC" or "AUROC".
#' @return A list of AUPRC or AUROC of different batches.
#' @export
#'
#' @examples
#'
FindDistributedAUC <- function(score.list.all, label.list, type){
  score.list.merge <- do.call(Map, c(c, score.list.all))
  label.merge <- as.numeric(unlist(label.list))
  auc.list <- FindAUC(score.list = score.list.merge, label = label.merge, type=type)
  return(auc.list)
}


#' Calculate detection accuracy under distributed computing
#'
#' Randomly split one dataset into user-specified batches. Perform doublet detection on each batch by different methods to calculate doublet scores.
#' Concatenate doublet scores per batch. Calclulate AUPRC or AUROC on concatenated doublet scores.
#' @param count A scRNA-seq data matrix.
#' @param label A vector of doublet annotations (0/1).
#' @param batches A vector of batch numbers.
#' @param type A string of measurement "AUPRC" or "AUROC".
#' @return A list of measurement across different batch numbers and methods.
#' @export
#'
#' @examples
#' data.list <- ReadData(path = ".../real_datasets")
#' count <- data.list$count$`pbmc-ch`
#' label <- data.list$label$`pbmc-ch`
#' label <- ifelse(label == 'doublet', 1, 0)
#' auc.list.batch <- FindDistributedAUC.All(count = count, label = label, method=methods, batches = 2:10, type='AUPRC')
#'
FindDistributedAUC.All <- function(count, label, method, batches, type){
  auc.list.batch <- list()
  for(batch in batches){
    cat('\nbatch number', batch, '\n', file = stderr())
    data.split <- SplitData(count=count, label=label, batch=batch)
    score.list.all <- FindScores.All(count.list = data.split$count, methods = method)
    auc.list <- FindDistributedAUC(score.list.all=score.list.all, label.list=data.split$label, type=type)
    auc.list.batch[[as.character(batch)]] <- auc.list
  }
  return(auc.list.batch)
}

xnnba1984/DoubletCollection documentation built on Dec. 10, 2022, 11:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

xnnba1984/DoubletCollection
Find doublets in scRNA-seq data

R/Downstream.R
In xnnba1984/DoubletCollection: Find doublets in scRNA-seq data

Defines functions FindDistributedAUC.All FindDistributedAUC FindTempDE.All FindTempDE FindTrajectory FindDEACC FindDE Clustering.Quality Clustering.All.Rate Clustering.All Clustering

Documented in Clustering Clustering.All Clustering.All.Rate Clustering.Quality FindDE FindDEACC FindDistributedAUC FindDistributedAUC.All FindTempDE FindTempDE.All FindTrajectory

R Package Documentation

Browse R Packages

We want your feedback!

xnnba1984/DoubletCollection Find doublets in scRNA-seq data

R/Downstream.R In xnnba1984/DoubletCollection: Find doublets in scRNA-seq data

Defines functions FindDistributedAUC.All FindDistributedAUC FindTempDE.All FindTempDE FindTrajectory FindDEACC FindDE Clustering.Quality Clustering.All.Rate Clustering.All Clustering

Documented in Clustering Clustering.All Clustering.All.Rate Clustering.Quality FindDE FindDEACC FindDistributedAUC FindDistributedAUC.All FindTempDE FindTempDE.All FindTrajectory

R Package Documentation

Browse R Packages

We want your feedback!

xnnba1984/DoubletCollection
Find doublets in scRNA-seq data

R/Downstream.R
In xnnba1984/DoubletCollection: Find doublets in scRNA-seq data