library(knitr) opts_chunk$set( warning = FALSE, message = FALSE, fig.path = 'figure/', cache.path = 'cache/', cache = TRUE, dev = 'png', dpi=100, fig.width=6, fig.height=3 )
To demonstrate the utility of MUDAN
, we will analyze two 10X PBMC datasets from Zheng et al.
The data are already provided to you as counts matrices. We will load and subsample the data.
library(MUDAN) data("pbmcA") data("pbmcB") print(dim(pbmcA)) print(dim(pbmcB)) set.seed(0) ## subsample cells for smaller dataset to run faster vi <- sample(colnames(pbmcA), 2000) pbmcA <- pbmcA[, vi] vi <- sample(colnames(pbmcB), 2000) pbmcB <- pbmcB[, vi]
First, we will combine the two datasets without batch correction and perform regular dimensionality reduction with PCA and visualization with tSNE.
## filter and combine v1 <- rownames(pbmcA)[rowSums(pbmcA)>0] v2 <- rownames(pbmcB)[rowSums(pbmcB)>0] genes.int <- intersect(v1, v2) length(genes.int) cd <- cbind(pbmcA[genes.int,], pbmcB[genes.int,]) group <- factor(colnames(cd) %in% colnames(pbmcA), labels=c('pbmcA', 'pbmcB')) names(group) <- colnames(cd) ## see separation by batch if we cluster all together myMudanObject <- Mudan$new("comb", cd, ncores=4) myMudanObject$libSizeNormalize() myMudanObject$varianceNormalize(plot=FALSE) myMudanObject$dimensionalityReduction(nGenes = 1000, nPcs = 30, maxit=1000) myMudanObject$getStandardEmbedding(plot=FALSE) myMudanObject$communityDetection(reductionType='pcs', communityName="Infomap", communityMethod=igraph::cluster_infomap, k=70) par(mfrow=c(1,2), mar=c(5,5,15,5)) plotEmbedding(myMudanObject$emb[['PCA']], groups=group, main="comb PCA", show.legend=TRUE) myMudanObject$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='detected community', show.legend=TRUE)
Indeed, we see prominent batch/patient differences. Traditionally, we would normalize out batch specific differences with batch correction using as using combat
.
## apply combat batch correction library(sva) myMudanObjectbc <- Mudan$new("combbc", cd, ncores=4) myMudanObjectbc$libSizeNormalize() myMudanObjectbc$varianceNormalize(plot=FALSE) cdbc <- sva::ComBat(dat=as.matrix(myMudanObjectbc$matnorm), batch=group[colnames(myMudanObjectbc$matnorm)]) ## remove batch effect myMudanObjectbc$matnorm <- cdbc myMudanObjectbc$dimensionalityReduction(nGenes = 1000, nPcs = 30, maxit=1000) myMudanObjectbc$getStandardEmbedding(plot=FALSE) myMudanObjectbc$communityDetection(reductionType='pcs', communityName="Infomap", communityMethod=igraph::cluster_infomap, k=60) par(mfrow=c(1,2)) plotEmbedding(myMudanObjectbc$emb[['PCA']], groups=group, main="comb batch normalized PCA", show.legend=TRUE) myMudanObjectbc$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='detected community', show.legend=TRUE)
However, we will show later that this batch correction "over-corrects", removing true biological signal and obscures our ability to identify the appropriate cell subtypes.
Let's first analyze each dataset separately.
## analyze pbmcA by itself myMudanObject1 <- Mudan$new("pbmcA", pbmcA, ncores=4) myMudanObject1$libSizeNormalize() myMudanObject1$varianceNormalize(plot=FALSE) myMudanObject1$dimensionalityReduction(nGenes = 1000, nPcs = 30, maxit=1000) myMudanObject1$getStandardEmbedding(plot=FALSE) myMudanObject1$communityDetection(reductionType='pcs', communityName="Infomap", communityMethod=igraph::cluster_infomap, k=30) myMudanObject1$modelCommunity(communityName="Infomap") myMudanObject1$getMudanEmbedding(plot=FALSE) par(mfrow=c(1,2)) myMudanObject1$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='pbmcA PCA', show.legend=TRUE) myMudanObject1$plot(reductionType='pcs', communityName="Infomap", embeddingType="MUDAN", main='pbmcA MUDAN', show.legend=TRUE)
## analyze pbmcB by itself myMudanObject2 <- Mudan$new("pbmcB", pbmcB, ncores=4) myMudanObject2$libSizeNormalize() myMudanObject2$varianceNormalize(plot=FALSE) myMudanObject2$dimensionalityReduction(nGenes = 1000, nPcs = 30, maxit=1000) myMudanObject2$getStandardEmbedding(plot=FALSE) myMudanObject2$communityDetection(reductionType='pcs', communityName="Infomap", communityMethod=igraph::cluster_infomap, k=30) myMudanObject2$modelCommunity(communityName="Infomap") myMudanObject2$getMudanEmbedding(plot=FALSE) par(mfrow=c(1,2)) myMudanObject2$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='pbmcB PCA', show.legend=TRUE) myMudanObject2$plot(reductionType='pcs', communityName="Infomap", embeddingType="MUDAN", main='pbmcB MUDAN', show.legend=TRUE)
Now, we will learn from each dataset analyzed individually and project the combined dataset.
## project and combine (need to clean up into function) v1 <- rownames(pbmcA)[rowSums(pbmcA)>30] v2 <- rownames(pbmcB)[rowSums(pbmcB)>30] genes.int <- intersect(v1, v2) length(genes.int) m1 <- as.matrix(myMudanObject1$mat[genes.int,]) m2 <- as.matrix(myMudanObject2$mat[genes.int,]) mall <- cbind(m1, m2) combat <- sva::ComBat(dat=mall, batch=group[colnames(mall)]) ## remove batch effect m1c <- combat[, colnames(m1)] m2c <- combat[, colnames(m2)] ## maximize axis of separation nGenes <- 500 vargenes <- getVariableGenes(myMudanObject1$matnorm[genes.int,], nGenes) model1 <- modelLda(mat=m1c[vargenes,], com=myMudanObject1$com[['pcs']][['Infomap']]) vargenes <- getVariableGenes(myMudanObject2$matnorm[genes.int,], nGenes) model2 <- modelLda(mat=m2c[vargenes,], com=myMudanObject2$com[['pcs']][['Infomap']]) ## project lds1 <- predict(model1, data.frame(t(combat)))$x lds2 <- predict(model2, data.frame(t(combat)))$x ## get embedding on projections reduction <- cbind(lds1, lds2) emb <- Rtsne::Rtsne(reduction, is_distance=FALSE, perplexity=60, verbose=TRUE, num_threads=2)$Y rownames(emb) <- rownames(reduction) ## look at new embedding with old annotations par(mfrow=c(1,2)) plotEmbedding(emb, groups=myMudanObject1$com[['pcs']][['Infomap']], show.legend=TRUE, main='annotations from pbmcA only analysis') plotEmbedding(emb, groups=myMudanObject2$com[['pcs']][['Infomap']], show.legend=TRUE, main='annotations from pbmcB only analysis')
We can get new annotations from the projection and compare with our previous results.
## get new annotation com <- getComMembership(t(reduction), k=70, method=igraph::cluster_infomap) par(mfrow=c(1,3)) plotEmbedding(emb, groups=com) plotEmbedding(myMudanObject1$emb[['PCA']], groups=com[rownames(myMudanObject1$emb[['PCA']])]) plotEmbedding(myMudanObject2$emb[['PCA']], groups=com[rownames(myMudanObject2$emb[['PCA']])]) ## plot all together par(mfrow=c(1,2)) myMudanObject1$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='pbmcA ind') plotEmbedding(myMudanObject1$emb[['PCA']], groups=com[rownames(myMudanObject1$emb[['PCA']])], main='pbmcA comb') myMudanObject2$plot(reductionType='pcs', communityName="Infomap", embeddingType="PCA", main='pbmcB ind') plotEmbedding(myMudanObject2$emb[['PCA']], groups=com[rownames(myMudanObject2$emb[['PCA']])], main='pbmcB comb') ## some groups more prominent in pbmcB now separated in pbmcA and vice versa par(mfrow=c(1,2)) plotEmbedding(emb, groups=com, main="comb MUDAN", show.legend=TRUE) plotEmbedding(emb, groups=group, main="comb MUDAN", show.legend=TRUE) plotEmbedding(myMudanObject$emb[['PCA']], groups=com, main="comb PCA", show.legend=TRUE) plotEmbedding(myMudanObject$emb[['PCA']], groups=group, main="comb PCA", show.legend=TRUE) plotEmbedding(myMudanObjectbc$emb[['PCA']], groups=com, main="batch corrected comb PCA", show.legend=TRUE) plotEmbedding(myMudanObjectbc$emb[['PCA']], groups=group, main="batch corrected comb PCA", show.legend=TRUE)
Replot with shuffled colors.
par(mfrow=c(1,3)) plotEmbedding(emb, groups=com, main="comb MUDAN", shuffle.colors=TRUE) plotEmbedding(myMudanObject$emb[['PCA']], groups=com, main="comb PCA", shuffle.colors=TRUE) plotEmbedding(myMudanObjectbc$emb[['PCA']], groups=com, main="batch corrected comb PCA", shuffle.colors=TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.