Here, we becnhmark several factor analyses methods on single cell RNA sequencing (scRNA-Seq) data obtained from human human brain samples (Darmanis et. al., 2015). All of these methods are used to uncover variation associated with different cell types and compared against each other. This dataset is included in an R data package ("iasvaExamples") containing data examples for IA-SVA (https://github.com/dleelab/iasvaExamples). To install the 'iasvaExamples' package, follow the instructions provided in the GitHub page.
rm(list=ls()) library(iasva) library(iasvaExamples) library(sva) library(RUVSeq) library(Rtsne) library(pheatmap) library(corrplot) library(DescTools) #pcc i.e., Pearson's contingency coefficient library(RColorBrewer) library(zinbwave) library(nnet) library(SummarizedExperiment) color.vec <- brewer.pal(8, "Set1") set.seed(3532379) # To make the task more challenging we selected #1000 genes to be used in downstream analyses. num.genes <- 1000
data("Darmanis_Brain_scRNAseq_Read_Counts") data("Darmanis_Brain_scRNAseq_Annotations") ls() counts.total <- Darmanis_Brain_scRNAseq_Read_Counts anns.total <- Darmanis_Brain_scRNAseq_Annotations dim(anns.total) dim(counts.total) summary(anns.total) table(anns.total$tissue, anns.total$cell_type) ContCoef(table(anns.total$tissue, anns.total$cell_type)) table(anns.total$age, anns.total$cell_type) ContCoef(table(anns.total$age, anns.total$cell_type)) table(anns.total$sample_name, anns.total$cell_type) ContCoef(table(anns.total$sample_name, anns.total$cell_type)) table(anns.total$c1_chip_id, anns.total$cell_type) ContCoef(table(anns.total$c1_chip_id, anns.total$cell_type))
The annotations describing samples and experimental settings are stored in "anns" and the read counts information is stored in "counts".
For the downstream analyses, select neurons, oligodendrcytes and astrocytes from 2 cell types and 8 individuals.
# Select neurons and astrocytes for downstream analyses selected.cells <- intersect(which(anns.total$tissue=="cortex"), which(anns.total$cell_type=="neurons"|anns.total$cell_type=="astrocytes")) counts <- counts.total[, selected.cells] anns <- droplevels(anns.total[selected.cells,]) table(anns$tissue, anns$cell_type) table(anns$tissue, anns$age) table(anns$age, anns$sample_name) table(anns$age, anns$c1_chip_id) table(anns$tissue, anns$c1_chip_id) table(anns$cell_type, anns$sample_name) table(anns$cell_type, anns$tissue) dim(counts) dim(anns) dropout.rate <- sum(counts==0)/(dim(counts)[1]*dim(counts)[2])*100 dropout.rate ## Filter out lowly-expressed genes filter = apply(counts, 1, function(x) length(x[x>5])>=3) counts = as.matrix(counts[filter,]) dim(counts) dropout.rate <- sum(counts==0)/(dim(counts)[1]*dim(counts)[2])*100 dropout.rate # To make this task more challenging for benchamarking purposes # we only used randomly selected 1000 genes in downstream analyses rand.gene.list <- sample(1:nrow(counts), num.genes) counts <- counts[rand.gene.list,] Cell_Type <- anns$cell_type Tissue <- anns$tissue Cell_Type_Num <- as.integer(Cell_Type) Samples <- as.integer(anns$sample_name)
It is well known that the geometric library size (i.e., library size of log-transfromed read counts) or proportion of expressed genes in each cell explains very large portion of variability of scRNA-Seq data (Hicks et. al. 2015 BioRxiv, McDavid et. al. 2016 Nature Biotechnology). Frequently, the first principal component of log-transformed scRNA-Seq read counts is highly correlated with the geometric library size. (r > 0.9). Here, we calculate it and show the high correlation. Later this vector will be used as an known factor in IA-SVA algorithm.
Geo_Lib_Size <- colSums(log(counts+1)) barplot(Geo_Lib_Size, xlab="Cell") lcounts <- log(counts + 1) pca.res = svd(lcounts - rowMeans(lcounts))$v cor(Geo_Lib_Size, pca.res[,1]) cor(Geo_Lib_Size, Cell_Type_Num)
Library size, tissue, and sample ID are used as covariates in all factor analyses except for PCA. 3 SVs are captured with each method and the SV that has the maximum correlation with the cell type assignments is selected among these 3. This is especially important for a fair comparison of PCA.
mod1 = model.matrix(~Geo_Lib_Size+anns$sample_name) mod0 = cbind(mod1[,1]) summ_exp <- SummarizedExperiment(assays = counts) ### IA-SVA start_time <- Sys.time() iasva.res<- iasva(summ_exp, mod1[,-1],verbose=FALSE, permute=FALSE, num.sv=3) end_time <- Sys.time() iasva.time <- start_time - end_time iasva.sv <- iasva.res$sv plot(iasva.sv[,1], iasva.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2") pairs(iasva.sv, col=Cell_Type, main="IA-SVA") max.cor.iasva <- which.is.max(c(abs(cor(iasva.sv[,1], Cell_Type_Num)), abs(cor(iasva.sv[,2], Cell_Type_Num)), abs(cor(iasva.sv[,3], Cell_Type_Num)))) max.cor.iasva ## PCA ldat0 = log(counts + 1) start_time <- Sys.time() pca.sv = svd(ldat0 - rowMeans(ldat0))$v end_time <- Sys.time() pca.time <- start_time - end_time plot(pca.sv[,1], pca.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2") pairs(pca.sv[,1:3], col=Cell_Type, main="PCA") max.cor.pca <- which.is.max(c(abs(cor(pca.sv[,1], Cell_Type_Num)), abs(cor(pca.sv[,2], Cell_Type_Num)), abs(cor(pca.sv[,3], Cell_Type_Num)))) max.cor.pca ## USVA start_time <- Sys.time() usva.sv = svaseq(counts,mod1,mod0,n.sv=3)$sv end_time <- Sys.time() usva.time <- start_time - end_time plot(usva.sv[,1], usva.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2") pairs(usva.sv[,1:3], col=Cell_Type, main="USVA") max.cor.usva <- which.is.max(c (abs(cor(usva.sv[,1], Cell_Type_Num)), abs(cor(usva.sv[,2], Cell_Type_Num)), abs(cor(usva.sv[,3], Cell_Type_Num)))) max.cor.usva ### RUVres design <- model.matrix(~Geo_Lib_Size+anns$sample_name) start_time <- Sys.time() y <- DGEList(counts=counts) y <- calcNormFactors(y, method="TMM") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) res <- residuals(fit, type="deviance") set <- betweenLaneNormalization(counts, which="upper") genes = rep(TRUE,dim(counts)[1]) ruvres.sv = RUVr(set,genes,k=3,res,isLog = T)$W end_time <- Sys.time() ruvres.time <- start_time - end_time plot(ruvres.sv[,1], ruvres.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2") pairs(ruvres.sv[,1:3], col=Cell_Type) max.cor.ruvres <- which.is.max (c (abs(cor(ruvres.sv[,1], Cell_Type_Num)), abs(cor(ruvres.sv[,2], Cell_Type_Num)), abs(cor(ruvres.sv[,3], Cell_Type_Num)))) max.cor.ruvres ### RUVemp start_time <- Sys.time() y <- DGEList(counts=counts) y <- calcNormFactors(y, method="TMM") y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) fit <- glmFit(y, design) lrt <- glmLRT(fit, coef=2) emp.genes =rank(lrt$table$LR) > 400 ruvemp.sv <- RUVg(counts, emp.genes, k=3)$W end_time <- Sys.time() ruvemp.time <- start_time - end_time plot(ruvemp.sv[,1], ruvemp.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2") pairs(ruvemp.sv[,1:3], col=Cell_Type) max.cor.ruvemp <- which.is.max(c (abs(cor(ruvemp.sv[,1], Cell_Type_Num)), abs(cor(ruvemp.sv[,2], Cell_Type_Num)), abs(cor(ruvemp.sv[,3], Cell_Type_Num)))) max.cor.ruvemp ### ZINB WAVE se.data <- SummarizedExperiment(assays=list(counts=counts)) se.data$LS <- Geo_Lib_Size se.data$SN <- anns$sample_name se.data$Tissue <- anns$tissue # use Geometric library size + Tissue + sample is as covariates # suggested epsilon parameter is used for the analyses start_time <- Sys.time() zinb_cov <- zinbFit(se.data, K=3, X="~LS+SN", epsilon=1000) W <- getW(zinb_cov) end_time <- Sys.time() zinb.time <- start_time - end_time colnames(W) <- paste0("W", 1:3) pairs(W,col=Cell_Type) max.cor.zinb <- which.is.max (c (abs(cor(W[,1], Cell_Type_Num)), abs(cor(W[,2], Cell_Type_Num)), abs(cor(W[,3], Cell_Type_Num)))) max.cor.zinb
Compare results from different methods to the true cell type assignments using correlation plots.
## Plot correlation between true cell_type and estimates (SV1) celltype.est = cbind(Cell_Type_Num, iasva.sv[,max.cor.iasva], pca.sv[,max.cor.pca],usva.sv[,max.cor.usva], ruvres.sv[,max.cor.ruvres], ruvemp.sv[,max.cor.ruvemp], W[,max.cor.zinb]) colnames(celltype.est) = c("Cell Type","IA-SVA","PCA","USVA", "RUVres","RUVemp", "ZINB-WAVE") corr = abs(cor(celltype.est)) print(corr) cols = colorRampPalette(c(color.vec[2],"white",color.vec[1])) corrplot.mixed(corr, lower="ellipse", upper="number", mar=c(0,0,1,0)) num.cells <- dim(anns)[1] pdf(paste0("Brain_scRNASeq_neuron_astro_",num.cells,"_cells.pdf"), height=6, width=6) corrplot.mixed(corr, lower="ellipse", upper="number", mar=c(0,0,1,0)) dev.off() time.seconds <- c(iasva.time,pca.time,usva.time,ruvres.time,ruvemp.time,zinb.time) names(time.seconds) <- c("IA-SVA","PCA","USVA","RUVres","RUVemp", "ZINB-WAVE") time.seconds barplot(t(as.matrix(abs(time.seconds))), las =2, ylab = "Run time (seconds)") pdf(paste0("Brain_scRNASeq_neuron_astro_",num.cells,"_cells_run_time.pdf"), height=6, width=6) barplot(t(as.matrix(abs(time.seconds))), las =2, ylab = "Run time (seconds)") dev.off()
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.