Differential expression analysis in mutant mice is confounded by differing abundances of different cell populations.
To do a fair DE analysis, here we match each mutant cell, to it's nearest WT cell in tSNE space. Each WT cell can only be matched to one mutant cell, and the distance between the cells must be less than 3 tSNE units.
library(data.table) library(ggplot2) library(testisAtlas) load2("../data/cache") load_component_orderings() matched_cells_CNP <- match_cells2(test_group = "CNP") # remove cells with no WT match matched_cells_CNP <- matched_cells_CNP[!is.na(matched_cells_CNP[,2]),] str(matched_cells_CNP) head(matched_cells_CNP) hist(as.numeric(matched_cells_CNP[,3]), breaks=50)
For example:
ggplot(cell_data[order(cell == matched_cells_CNP[500,2])], aes(Tsne1_QC1, Tsne2_QC1, color=cell == matched_cells_CNP[500,2])) + geom_point(size=0.5) + scale_color_manual(values=c("grey","red")) + theme(legend.position = "bottom") + ggtitle("Mutant Cell") ggplot(cell_data[order(cell == matched_cells_CNP[500,1])], aes(Tsne1_QC1, Tsne2_QC1, color=cell == matched_cells_CNP[500,1])) + geom_point(size=0.5) + scale_color_manual(values=c("grey","red")) + theme(legend.position = "bottom") + ggtitle("Nearest WT Cell")
We can then do a DE analysis:
cnp_vs_wt <- data.table(cnp_mean = Matrix::colMeans(data[matched_cells_CNP[,1],]), wt_mean = Matrix::colMeans(data[matched_cells_CNP[,2],]), gene=colnames(data), pval=pVals) cnp_vs_wt2 <- data.table(cnp_mean = colMeans(SDAresults$scores[matched_cells_CNP[,1],] %*% SDAresults$loadings[[1]]), wt_mean = colMeans(SDAresults$scores[matched_cells_CNP[,2],] %*% SDAresults$loadings[[1]]), gene=colnames(SDAresults$loadings[[1]])) hist(cnp_vs_wt$cnp_mean, breaks=100) cnp_vs_wt[,FC := (cnp_mean+0.1)/(wt_mean+0.1)] cnp_vs_wt[,FC2 := (cnp_mean+0.5)/(wt_mean+0.5)] cnp_vs_wt2[,FC2 := (cnp_mean+0.5)/(wt_mean+0.5)] hist(log(cnp_vs_wt$FC2), breaks=100) cnp_vs_wt[order(-FC2)][1:50] cnp_vs_wt[order(FC2)][1:50]
ggplot(cnp_vs_wt, aes(wt_mean, cnp_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=cnp_vs_wt[abs(log(FC2))>0.4 | gene=="Ubb"]) ggplot(cnp_vs_wt, aes(log(wt_mean*cnp_mean), log(wt_mean/cnp_mean), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=cnp_vs_wt[abs(log(FC2))>0.4 | gene=="Ubb"])
ggplot(cnp_vs_wt, aes(wt_mean, cnp_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=cnp_vs_wt[pval<1e-20]) ggplot(cnp_vs_wt, aes(FC2, -log(pval), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=cnp_vs_wt[pval<1e-50])
Do any of the components correlate with these results?
Which components have the highest ranked loadings for these genes?
barplot(table(factor(sapply(cnp_vs_wt[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes") barplot(table(factor(sapply(cnp_vs_wt[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes") barplot(table(factor(sapply(cnp_vs_wt2[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes (Imputed)") barplot(table(factor(sapply(cnp_vs_wt2[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes (Imputed)")
Both the over and underexpressed genes many of them are ranked highest in component 22 positive and negative respectively. So it appears component 22 represents CNP differential expression!
# do any components correlate qplot(sapply(1:50, function(i) cor(SDAresults$loadings[[1]][i,],log(cnp_vs_wt$FC2))), paste0("C",1:50))
There is a pretty good overlap with the loadings in this component, and the results from this analysis. This component was previsouly assigned "odd" given the funky cell scores - I'm still not sure what to make of the low WT group or why the FACS groups are lower - but the CNP cells are indeed higher than the rest.
print_loadings_scores(22)
ggplot(gene_expression_pseudotime(c("Rps7","Tpt1","Fau","Eif1"))[group %in% c("WT","CNP")], aes(-PseudoTime, value, colour=group)) + geom_point(alpha=0.5, size=0.25) + geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se=FALSE) + ylab("Predicted Gene Expression") + xlab("Pseudotime") + facet_wrap(~Gene)
matched_cells_Cul4a <- match_cells2(test_group = "Cul4a") #matched_cells_Cul4a[,1] <- cell_data[group=="Cul4a"]$cell str(matched_cells_Cul4a) sum(is.na(matched_cells_Cul4a[,2])) # remove cells with no WT match matched_cells_Cul4a <- matched_cells_Cul4a[!is.na(matched_cells_Cul4a[,2]),] str(matched_cells_Cul4a) head(matched_cells_Cul4a)
Cul4a_vs_wt <- data.table(Cul4a_mean = Matrix::colMeans(data[matched_cells_Cul4a[,1],]), wt_mean = Matrix::colMeans(data[matched_cells_Cul4a[,2],]), gene=colnames(data)) Cul4a_vs_wt2 <- data.table(Cul4a_mean = colMeans(SDAresults$scores[matched_cells_Cul4a[,1],] %*% SDAresults$loadings[[1]]), wt_mean = colMeans(SDAresults$scores[matched_cells_Cul4a[,2],] %*% SDAresults$loadings[[1]]), gene=colnames(SDAresults$loadings[[1]])) hist(Cul4a_vs_wt$Cul4a_mean, breaks=100) Cul4a_vs_wt[,FC := (Cul4a_mean+0.1)/(wt_mean+0.1)] Cul4a_vs_wt[,FC2 := (Cul4a_mean+0.5)/(wt_mean+0.5)] Cul4a_vs_wt2[,FC2 := (Cul4a_mean+0.5)/(wt_mean+0.5)] hist(log(Cul4a_vs_wt$FC2), breaks=100) Cul4a_vs_wt[order(-FC2)][1:50] Cul4a_vs_wt[order(FC2)][1:50]
ggplot(Cul4a_vs_wt, aes(wt_mean, Cul4a_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Cul4a_vs_wt[abs(log(FC2))>0.45 | gene=="Ubb"]) ggplot(Cul4a_vs_wt, aes(log(wt_mean*Cul4a_mean), log(wt_mean/Cul4a_mean), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Cul4a_vs_wt[abs(log(FC2))>0.45 | gene=="Ubb"])
barplot(table(factor(sapply(Cul4a_vs_wt[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes") barplot(table(factor(sapply(Cul4a_vs_wt[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes") barplot(table(factor(sapply(Cul4a_vs_wt2[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes (Imputed)") barplot(table(factor(sapply(Cul4a_vs_wt2[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes (Imputed)")
Confirmation that C25 represents Cul4a KO diff expression (& a bit of 9 perhaps)
# do any components correlate qplot(sapply(1:50, function(i) cor(SDAresults$loadings[[1]][i,],log(Cul4a_vs_wt$FC2))), paste0("C",1:50))
matched_cells_Mlh3 <- match_cells2(test_group = "Mlh3") str(matched_cells_Mlh3) sum(is.na(matched_cells_Mlh3[,2])) # remove cells with no WT match matched_cells_Mlh3 <- matched_cells_Mlh3[!is.na(matched_cells_Mlh3[,2]),] str(matched_cells_Mlh3)
group_a = matched_cells_Mlh3[,1] group_b = matched_cells_Mlh3[,2] pVals <- apply( data, 2, function(x) { wilcox.test( x[group_a], x[group_b] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr")
Mlh3_vs_wt <- data.table(Mlh3_mean = Matrix::colMeans(data[matched_cells_Mlh3[,1],]), wt_mean = Matrix::colMeans(data[matched_cells_Mlh3[,2],]), gene=colnames(data), pval=pVals) hist(Mlh3_vs_wt$Mlh3_mean, breaks=100) Mlh3_vs_wt[,FC2 := (Mlh3_mean+0.5)/(wt_mean+0.5)] hist(log(Mlh3_vs_wt$FC2), breaks=100) Mlh3_vs_wt[order(-FC2)][1:50] Mlh3_vs_wt[order(FC2)][1:50]
ggplot(Mlh3_vs_wt, aes(wt_mean, Mlh3_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Mlh3_vs_wt[abs(log(FC2))>0.35 | gene=="Ubb"]) ggplot(Mlh3_vs_wt, aes(log(wt_mean * Mlh3_mean), log(Mlh3_mean / wt_mean), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Mlh3_vs_wt[abs(log(FC2))>0.35 | gene=="Ubb"])
ggplot(Mlh3_vs_wt, aes(FC2, -log(pval), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Mlh3_vs_wt[order(pval)][1:100])
barplot(table(factor(sapply(Mlh3_vs_wt[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes") barplot(table(factor(sapply(Mlh3_vs_wt[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes")
# do any components correlate qplot(sapply(1:50, function(i) cor(SDAresults$loadings[[1]][i,],log(Mlh3_vs_wt$FC2))), paste0("C",1:50))
Verify results with expression over pseudotime
ggplot(gene_expression_pseudotime("Prm1"), aes(-PseudoTime, value, colour=group)) + geom_point(alpha=0.2, size=0.25) + geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se=FALSE, colour="black") + ylab("Predicted Gene Expression") + xlab("Pseudotime") + facet_wrap(~group) + theme(legend.position = "none") ggplot(gene_expression_pseudotime("Prm1")[group %in% c("Mlh3","WT")], aes(-PseudoTime, value, colour=group, group=group)) + geom_point(alpha=0.5, size=0.25) + geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se=FALSE) + ylab("Predicted Gene Expression") + xlab("Pseudotime") ggplot(gene_expression_pseudotime("mt-Rnr2"), aes(-PseudoTime, value, colour=group)) + geom_point(alpha=0.2, size=0.25) + geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se=FALSE, colour="black") + ylab("Predicted Gene Expression") + xlab("Pseudotime") + facet_wrap(~group) + theme(legend.position = "none") ggplot(gene_expression_pseudotime("mt-Rnr2")[group %in% c("Mlh3","WT")], aes(-PseudoTime, value, colour=group, group=group)) + geom_point(alpha=0.5, size=0.25) + geom_smooth(method = "gam", formula = y ~ s(x, k = 50), se=FALSE) + ylab("Predicted Gene Expression") + xlab("Pseudotime")
matched_cells_Hormad1 <- match_cells2(test_group = "Hormad1") str(matched_cells_Hormad1) head(matched_cells_Hormad1) sum(is.na(matched_cells_Hormad1[,2])) # remove cells with no WT match matched_cells_Hormad1 <- matched_cells_Hormad1[!is.na(matched_cells_Hormad1[,2]),] str(matched_cells_Hormad1)
group_a = matched_cells_Hormad1[,1] group_b = matched_cells_Hormad1[,2] pVals <- apply( data, 2, function(x) { wilcox.test( x[group_a], x[group_b] )$p.value } ) # multiple testing correction pVals <- p.adjust(pVals, method = "fdr")
Hormad1_vs_wt2 <- data.table(Hormad1_mean = colMeans(SDAresults$scores[matched_cells_Hormad1[,1],] %*% SDAresults$loadings[[1]]), wt_mean = colMeans(SDAresults$scores[matched_cells_Hormad1[,2],] %*% SDAresults$loadings[[1]]), gene=colnames(SDAresults$loadings[[1]])) Hormad1_vs_wt <- data.table(Hormad1_mean = Matrix::colMeans(data[matched_cells_Hormad1[,1],]), wt_mean = Matrix::colMeans(data[matched_cells_Hormad1[,2],]), gene=colnames(data), pval=pVals) hist(Hormad1_vs_wt$Hormad1_mean, breaks=100) Hormad1_vs_wt[,FC := (Hormad1_mean+0.1)/(wt_mean+0.1)] Hormad1_vs_wt[,FC2 := (Hormad1_mean+0.5)/(wt_mean+0.5)] Hormad1_vs_wt2[,FC2 := (Hormad1_mean+0.5)/(wt_mean+0.5)] hist(log(Hormad1_vs_wt$FC2), breaks=100) Hormad1_vs_wt[order(FC2)][1:50] Hormad1_vs_wt[order(-FC2)][1:50]
ggplot(Hormad1_vs_wt, aes(wt_mean, Hormad1_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Hormad1_vs_wt[abs(log(FC2))>0.7 | gene %in% c("Hormad1","Ubb")]) ggplot(Hormad1_vs_wt, aes(log(wt_mean*Hormad1_mean), log(wt_mean/Hormad1_mean), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Hormad1_vs_wt[abs(log(FC2))>0.7 | gene %in% c("Hormad1","Ubb")])
ggplot(Hormad1_vs_wt2, aes(wt_mean, Hormad1_mean, label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Hormad1_vs_wt2[abs(log(FC2))>0.7 | gene %in% c("Hormad1","Ubb")]) + ggtitle("Imputed scatter plot") ggplot(Hormad1_vs_wt2, aes(log(wt_mean*Hormad1_mean), log(wt_mean/Hormad1_mean), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Hormad1_vs_wt2[abs(log(FC2))>0.7 | gene %in% c("Hormad1","Ubb")]) + ggtitle("Imputed MA plot")
AD_genes <- c("Apoe","Trem2","Bin1","Ms4a6d","Tyrobp","Cd33","Mef2c","Spi1","H2-Eb1","Clu","Picalm","Abca7","Cd2ap","Zcwpw1","Nme8","Inpp5d","Sorl1","Rin3","Ptk2b") others <- c("Akap9","Celf1","App") ggplot(Hormad1_vs_wt, aes(FC2, -log(pval), label=gene)) + geom_point(alpha=0.5, size=0.5) + geom_label_repel(data=Hormad1_vs_wt[order(pval)][1:100]) tmp <- which(Mlh3_vs_wt[order(pval)]$gene %in% c(AD_genes, others)) data.table(tmp, Mlh3_vs_wt[order(pval)]$gene[tmp])
DE genes match well with component 29, especially the underexpessed (previously named "SPG batch??"), and to a lesser extent 43 (ribosomal) for overexpressed genes.
ranks <- apply(SDAresults$loadings[[1]], 1, function(x) rank(abs(x))) ranksN <- apply(SDAresults$loadings[[1]], 1, function(x) rank(x)) ranksP <- apply(SDAresults$loadings[[1]], 1, function(x) rank(-x)) colnames(ranksP) <- paste0(colnames(ranksP), "P") colnames(ranksN) <- paste0(colnames(ranksN), "N") ranks <- cbind(ranksN, ranksP) barplot(table(factor(sapply(Hormad1_vs_wt2[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes") barplot(table(factor(sapply(Hormad1_vs_wt2[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes") qplot(sapply(1:50, function(i) cor(SDAresults$loadings[[1]][i,],log(Hormad1_vs_wt$FC2))), paste0("C",1:50))
Only 6 wild type macrophages, so any inference of DE cells is shakey
library(ggplot2) matched_cells_MQ2 <- match_cells2(cell_subset = cell_data[V11>5]$cell, swap_groups = T) ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=V11>5)) + geom_point(size=0.5) + theme(legend.position = "bottom") + ggtitle("Mutant Cell") matched_cells_MQ2 MQ_vs_wt <- data.table(WT_mean = Matrix::colMeans(data[matched_cells_MQ2[,1],]), mutant_mean = Matrix::colMeans(data[matched_cells_MQ2[,2],]), gene=colnames(data)) MQ_vs_wt2 <- data.table(mutant_mean = colMeans(SDAresults$scores[matched_cells_MQ2[,1],] %*% SDAresults$loadings[[1]]), WT_mean = colMeans(SDAresults$scores[matched_cells_MQ2[,2],] %*% SDAresults$loadings[[1]]), gene=colnames(SDAresults$loadings[[1]])) hist(MQ_vs_wt$WT_mean, breaks=100, main="Not Imputed") hist(MQ_vs_wt2$WT_mean, breaks=100, main="Imputed") MQ_vs_wt[,FC2 := (mutant_mean+0.5)/(WT_mean+0.5)] MQ_vs_wt2[,FC2 := (mutant_mean+0.5)/(WT_mean+0.5)] hist(log(MQ_vs_wt$FC2), breaks=100, main="Not Imputed") hist(log(MQ_vs_wt2$FC2), breaks=100, main="Imputed") MQ_vs_wt2[order(-FC2)][1:50] MQ_vs_wt2[order(FC2)][1:50] plot(MQ_vs_wt$WT_mean, MQ_vs_wt$mutant_mean, main="Not Imputed") plot(MQ_vs_wt2$WT_mean, MQ_vs_wt2$mutant_mean, main="Imputed") AD_genes <- c("Apoe","Trem2","Bin1","Ms4a6d","Tyrobp","Cd33","Mef2c","Spi1","H2-Eb1","Clu","Picalm","Abca7","Cd2ap","Zcwpw1","Nme8","Inpp5d","Sorl1","Rin3","Ptk2b") ggplot(MQ_vs_wt2[!gene %in% AD_genes], aes(WT_mean, mutant_mean)) + geom_point(size=0.5, alpha=0.5) + scale_color_brewer(palette = "Set1")+ geom_abline(slope = 1,intercept = 0) + ggtitle("Non Alzheimers genes") ggplot(MQ_vs_wt2[gene %in% AD_genes], aes(WT_mean, mutant_mean)) + geom_point(size=0.5, alpha=0.5) + scale_color_brewer(palette = "Set1")+ geom_abline(slope = 1,intercept = 0) + ggtitle("Alzheimers genes") MQ_vs_wt2[gene %in% AD_genes][order(-FC2)]
The underexpressed genes have broad low enrichment, mostly leydig?
Overexpressed genes mostly in late components.
barplot(table(factor(sapply(MQ_vs_wt2[order(FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 under-expressed genes") barplot(table(factor(sapply(MQ_vs_wt2[order(-FC2)][gene %in% colnames(SDAresults$loadings[[1]])][1:100]$gene, function(x) which.max(abs(ranks[x,]))), levels=1:100, labels = c(paste0(1:50,"N"),paste0(1:50,"P")))), las=2, cex.names=0.5, main="Highest ranked component for top 100 over-expressed genes")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.