This is an analysis of mouse testis scRNA dropseq data from Don Conrad's lab. Here we use SDA and t-SNE to cluster and developmentally order the single cells.
SDA decomposes a matrix (e.g. digital gene expression matrix) into a number of components represented by two matrices. The column vectors of the first matrix tell you how much a given component is active in each cell/individual. The row vectors of the second matrix tell you which genes are active in a given component.
library(data.table) library(ggplot2) library(ggrepel) library(ggforce) library(Matrix) library(sinsynthr) library(SDAtools) library(NMF) # devtools::install_github("renozao/NMF", "devel") nmf.options(grid.patch=TRUE) # stop blank pages appearing library(RColorBrewer) # colour pallet for dendrogram library(viridis) library(grid) library(gridExtra) library(animation) library(testisAtlas)
raw_data <- readRDS("../data/merged-mouse_counts_tsne-QC_low-mt.rds") raw_data <- t(raw_data) data <- normaliseDGE(raw_data, center = FALSE, scale = TRUE, threshold = 10, min_library_size = 200, gene_subset = (2/3)) library_size <- Matrix::colSums(raw_data[,rownames(data)]) # Issue warning if data changes stopifnot(digest::digest(data) == "a4a7ae105f7ec26f98ae7934db7b015f") stopifnot(digest::digest(library_size) == "3b73df05a6e928be9a878a959f4dcaf5") saveRDS(list(dge=data, library_size=library_size), "../data/merged_mouse_normalised_tsne-QC_V3.rds") # save a cache for easy reopening
1- Matrix::nnzero(data)/prod(dim(data)) median(cell_data$library_size)
High library cells tend to be on the outside
Looking at marker genes we can see position on the tSNE correlates with developmental time
data <- readRDS("../data/merged_mouse_normalised_tsne-QC_V3.rds") library_size <- data$library_size data <- data$dge # PCA #devtools::install_github("gabraham/flashpca/flashpcaR") library(flashpcaR) pcaresult <- flashpca(as.matrix(data), ndim=250, stan="center", verbose=T, seed=42) # tSNE set.seed(42) tsne_data <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 100, max_iter=2000) set.seed(24) tsne_data2 <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 100, max_iter=2000) save(pcaresult, file="../data/conradV3/merged_mouse_pca_QCtsne.rds") save(tsne_data, tsne_data2, file="../data/conradV3/merged_mouse_tsne_QCtsne.rds")
load("../data/merged_mouse_tsne_QCtsne.rds") cell_data <- data.table(tsne_data$Y) setnames(cell_data, c("Tsne1","Tsne2")) cell_data$cell <- rownames(data) cell_data$experiment <- gsub("[A-Z]+\\.","",rownames(data)) cell_data$group <- gsub("_.*","",cell_data$experiment) cell_data$library_size <- library_size setkey(cell_data, "cell") mito_genes <- grep("mt-",colnames(data), value=T)
ggplot(cell_data, aes(Tsne1, Tsne2, color=log(library_size))) + geom_point() + scale_colour_distiller(palette="YlOrRd", direction=1) + theme(legend.position = "bottom") + ggtitle("t-SNE - library size") ggplot(cell_data, aes(Tsne1, Tsne2, color=experiment)) + geom_point(size=0.5) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey")) #scale_colour_brewer(palette="Paired") ggplot(cell_data, aes(Tsne1, Tsne2, color=group)) + geom_point(size=0.5) + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom") + ggtitle("t-SNE - experimental group")
" C1qa / (Lyz2, Tyrobp, Pf4, C1qb), Ptprc (CD45/LCA)/Lcp1, Dcn (Adamts5, Col1a2, Gsn,) Cyp11a1, Hsd3b6, Cyp17a1, Agt, Hsd17b3, Star Gfra1, Ccnd2 Sall4, Zbtb16, Nanos3 Uchl1, Dmrt1, Crabp1, Ccna2 Ctcfl, Esx1, Pou4f1 Prdm9, Prss50, Zcwpw1, Dmc1, Ugt8a, Inca1 Ccnb3, Rad51ap2, Meiob Rhox2d, A830018L16Rik, Rhox2h, `Fthl17-ps3`, Dnajc12 (Tex13c2 Piwil1, Tdrd1, Tdrd5 Pou5f2 = Sprm1, 4930581F22Rik, Ccna1, Aurka Ssxb1, Gm3453, Prss46 `1700001F09Rik` Acrv1 Spaca1 Lyzl6 Lrrd1 Klf5 `1700122O11Rik` Saxo1 Tex29 Tbc1d23 Fam71b Hemgn Cd46 Cd55 Fam71a Tnp2 Tnp1 Prm1 Prm2 Smcp Oaz3 " marker_genes <- c("C1qa", "Ptprc","Dcn","Cyp11a1","Gsta3","Aard","Trim7","Gfra1","Nanos1","Sall4","Uchl1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Pou5f2","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2","Smcp") marker_genes <- c("Csf1r", "Dcn","Cyp11a1","Aard","Trim7","Gfra1","Nanos1","Esx1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Ccna1","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2")
grid.arrange(grobs=create_grob_list(fn = print_raw_tsne, input = marker_genes), nrow=3, heights = c(1,1,1))
# are the KO genes still expressed? ggplot(cell_data, aes(Tsne1, Tsne2, color=Cul4a)) + geom_point(size=1) + scale_colour_distiller(palette="YlOrRd", direction=1) + theme(legend.position = "bottom") group_vector <- gsub("_.*","",gsub("[A-Z]+\\.","",rownames(data))) qplot(1:20322, raw_data["Cul4a",], col=group_vector, geom="point") + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom") qplot(1:20322, raw_data["Cul4b",], col=group_vector, geom="point") + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom") # CNP is Knock In # qplot(1:20322, raw_data["Cnp",], col=group_vector, geom="point") + # scale_color_manual(values = brewer.pal(11,"Paired")) + # theme(legend.position = "bottom") qplot(1:20322, raw_data["Hormad1",], col=group_vector, geom="point") + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom") qplot(1:20322, raw_data["Mlh3",], col=group_vector, geom="point") + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom")
# Compare to Principal Components pca_dt <- data.table(pcaresult$projection[,1:10]) #data.table(pca_result$x[,1:10], keep.rownames = T) setnames(pca_dt, paste0("PC",1:10)) cell_data <- cbind(cell_data[cell==rownames(data)], pca_dt) grid.arrange(grobs=list(print_pca("PC1"), print_pca("PC2"), print_pca("PC3"), print_pca("PC4"), print_pca("PC5"), print_pca("PC6"), print_pca("PC7")), nrow=3, heights = c(1,1,1)) ggplot(cell_data, aes(PC1, PC2, color=experiment)) + geom_point() + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2)))
Load SDA factorisation and check:
A basic clustering of the components by maximum score and maximum loading shows two main clusters. We will see that the high score components have a single high score so it seems that these components represent overfitting and are therefore not as interesting.
export_data(as.matrix(data), name = "merged_mouse_V3_SDA") run_SDA(sda_location = "../../SDA/build/sda", out = "../results/conradV3_sda_1", data = "V3_SDAmerged_mouse_V3_SDA.data", num_comps = 50, max_iter = 10000, save_freq = 1000, set_seed = "79151 17351", N = 20322)
# load SDA factorisation and helper functions SDAresults <- load_results(results_folder = "../data/SDA/conradV3_sda_1/", data_path = "../data/count_matrices/") rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50) rownames(SDAresults$pips[[1]]) <- paste0("V",1:50) str(SDAresults) check_convergence(SDAresults) loading_distribution(SDAresults) density_data <- data.table("loading"=c(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5], SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5]), "type"=c(rep("Slab (PIP>0.5) ",sum(SDAresults$pips[[1]]>0.5)), rep("Spike (PIP<0.5)",sum(SDAresults$pips[[1]]<0.5)))) sparsity_plot <- ggplot(density_data, aes(loading, colour=type)) + stat_density(bw=1e-3, n=2^11, geom="line", position = "identity") + facet_zoom(xlim=c(-0.1,0.1), ylim=c(0,15)) + scale_color_brewer(palette = "Set1") + theme_bw() + theme(legend.title=element_blank(), legend.position = "top") + labs(x="Gene Loading",y="Density") + scale_x_continuous(labels = function(x) as.character(x)) saveRDS(sparsity_plot, "../data/plots/sparsity_plot.rds") density_data <- as.data.table(density(SDAresults$loadings[[1]], n=5000)[c("x","y")]) ggplot(density_data, aes(x, y)) + geom_line()+ facet_zoom(xy = y<1e5 & abs(x)<0.1) scores_distribution(SDAresults) # Sparsity of Gene Loadings mean(SDAresults$pips[[1]]<0.5) # max loadings with low PIP max(abs(SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5])) # range of loadings with low PIP quantile(SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5], c(0.005,0.995)) # mean loading PIP>0.5 mean(abs(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5])) # Max loading max(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5]) pdf("../results/SDA_diagnostics.pdf", width = 12, height = 10) plot_grid( plot_free_energy_change(SDAresults), plot_PIP_change(SDAresults), plot_maximums(SDAresults), PIP_distribution(SDAresults), labels = "AUTO") dev.off()
rna_locations <- load_gene_locations(colnames(SDAresults$loadings[[1]]), name="merge_mouse_V3b", path = "../data/") rna_locations[,.N,by=chromosome_name][order(-N)] # LOAD SDA into data.table merge_sda4 <- data.table(SDAresults$scores, keep.rownames = T) setnames(merge_sda4, "rn", "cell") setkey(merge_sda4, cell) setkey(cell_data, cell) cell_data <- merge(cell_data, merge_sda4) #merge_sda3$library_size <- library_size[merge_sda3$cell]
Some components are uninteresting, wheras others have spatially (hence peseudotemporally) varying patterns. These components tend to cluster in sections around the outside of the t-SNE plot and so be ordered in terms of spacial position.:
# # odd - 13 # odd_components <- c(1,4,6,9,8,10,14,22,25,41,46,16,21) # grid.arrange(grobs=create_grob_list(fn = print_tsne, input = odd_components), nrow=3, heights = c(1,1,1)) # # # somatic - 12 # somatic_components <- c(3,11,24,26,32,37,40,45,48,43,49,19) # grid.arrange(grobs=create_grob_list(fn = print_tsne, input = somatic_components), nrow=3, heights = c(1,1,1)) # # # - 25 # meiotic_components <- c(31,50,27,33,7,29,2,5,44, 12,13,23,47,38,42,39,20,28,30,35,15,17,36,18,34) #grid.arrange(grobs=create_grob_list(fn = print_tsne, input = meiotic_components), nrow=4, heights = c(1,1,1,1))
load_component_orderings() QC_components <- paste0("V",component_order_dt[QC_fail==F]$component_number)
The first heatmap shows again that some components only have a single high loading in one cell. Removing these components you can start to see more structure and relationships between the components.
# cell scores clustered_heatmap(asinh(cell_data[,QC_components, with=FALSE]), "Cells can have multiple components active (Asinh Transformed)", col_lab=NA) #annotation.col=cell_data[,.(Prm2, Insl3, Nasp, Uchl1, Acrv1)], aheatmap(cor(cell_data[,QC_components, with=FALSE]), breaks=0, main="Correlation between components, by Cell Score", hclustfun="ward.D2", labRow = component_order_dt[QC_fail==F]$name) cor_scores <- cor(cell_data[,QC_components, with=FALSE]) rownames(cor_scores) <- component_order_dt[QC_fail==F]$name Heatmap(cor_scores) # Gene loadings aheatmap(SDAresults$loadings[[1]][QC_components,], breaks=0, labCol=NA, Rowv=NA, main="Overview of genes loadings in each component", hclustfun="ward.D") aheatmap(SDAresults$pips[[1]][QC_components,], breaks=0, labCol=NA, Rowv=NA, main="Overview of genes active (PIP>0.99) in each component", hclustfun="ward.D") tmp <- SDAresults$pips[[1]][QC_components,]>0.99 mode(tmp) <- "numeric" aheatmap(tmp, breaks=0, labCol=NA, Rowv=NA, main="Overview of genes active (PIP>0.99) in each component", hclustfun="ward.D") # every gene is in at least one component qplot(seq_along(SDAresults$loadings[[1]][2,]), apply(SDAresults$pips[[1]], 2, max)) + labs(x="Gene", y="Maximum PIP",title="Every gene is in at least one component") qplot(seq_along(SDAresults$loadings[[1]][2,]), sort(apply(SDAresults$loadings[[1]], 2, max))) + labs(x="Gene", y="Max Loading", title="Maximum loading for each gene") tmp <- cor(t(SDAresults$loadings[[1]][QC_components,])) tmp[tmp==1] <- NA aheatmap(tmp, breaks=0, main="Most components have low correlations by gene loadings", na.color="grey")
head(sort(SDAresults$scores[,4])) #sc_correlations <- sapply(rownames(data), function(x) cor(data[x,], SDAresults$loadings[[1]][4,], method = "spearman")) #saveRDS(sc_correlations, "../data/sc_correlations.rds") #head(sort(sc_correlations, F)) # basically the same as the cell scores, and takes ages so commented out c4scores <- qplot(1:length(SDAresults$scores[,4]),sort(SDAresults$scores[,4],T), geom="point") + labs(x="Sorted Cell Index", y="Component 4 Cell Loading") v4_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][4,], label_both = FALSE, max.items = 10, label.size = 3, hide_unknown = T, gene_locations = gene_annotations, ) + ylab("Gene Loading (Component 4)") cor1 <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",4,drop=F] %*% SDAresults$loadings[[1]][4,]), alpha=I(0.3), stroke=0) + annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], -SDAresults$loadings[[1]][4,]),3))) + labs(x="Raw (Normalised) Gene Expression", y="Predicted gene expresstion using component 4", title = "Component 4 only Prediction vs Data for highest cell") cor2 <- qplot(data["ATCCTCATCCAA.Mlh3_1190",], c(SDAresults$scores["ATCCTCATCCAA.Mlh3_1190",4,drop=F] %*% SDAresults$loadings[[1]][4,]), alpha=I(0.3), stroke=0) + annotate("text", x=1.7, y=1, label=paste("Correlation:",signif(cor(data["ATCCTCATCCAA.Mlh3_1190",], -SDAresults$loadings[[1]][4,]),3))) + labs(x="Raw (Normalised) Gene Expression", y="Predicted gene expresstion using component 4", title = "Component 4 only Prediction vs Data for 2nd highest cell") sans4 <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",-c(4),drop=F] %*% SDAresults$loadings[[1]][-c(4),]), alpha=I(0.3), ylim = c(-2,10.5), stroke=0) + annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",-c(4),drop=F] %*% SDAresults$loadings[[1]][-c(4),])),3))) + labs(x="Raw (Normalised) Gene Expression", y="Predicted Gene Expression without C4", title = "Prediction vs Data without Component 4") allcomps <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",,drop=F] %*% SDAresults$loadings[[1]]), alpha=I(0.3), ylim = c(-2,10.5), stroke=0) + annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",,drop=F] %*% SDAresults$loadings[[1]][,])),3))) + labs(x="Raw (Normalised) Gene Expression", y="Predicted Gene Expression", title = "Prediction vs Data with all components") pdf("../results/single_cell_component.pdf", width=15, height=15) cowplot::plot_grid(c4scores, v4_loadings_plot, cor1, cor2, sans4, allcomps, ncol=2, labels = "AUTO") dev.off()
Not the same
component_subset <- c("V31","V33","V2","V5","V38","V23","V13","V42","V20","V30","V35","V15","V17","V18") is_negative <- diag(c(1,-1,-1,-1,1,1,-1,-1,-1,1,1,-1,1,-1)) rownames(is_negative) <- component_subset sign_corrected_loadings_subset <- is_negative %*% SDAresults$loadings[[1]][component_subset,] top_20_genes <- sapply(component_subset, function(x) (names(sort(sign_corrected_loadings_subset[x,],T))[1:10])) avoid_overlap <- function(x) { ind <- seq_along(x) %% 2 == 0 x[ind] <- paste0(x[ind],"--------------") #———————— x[!ind] <- paste0(x[!ind],"-") #- x } # tmp2 <- t(scale(t(sign_corrected_loadings_subset), center = F, scale = T))[,unique(c(top_20_genes))] #sign_corrected_loadings_subset[,unique(c(top_20_genes))] # colnames(tmp2) <- avoid_overlap(colnames(tmp2)) # do heatmap in ggplot so can easily combine figures genes_melt <- melt(data.table(t(tmp2), keep.rownames = T), id.vars = "rn", variable.name = "Component", value.name = "Scaled_Gene_Loading") setnames(genes_melt, "rn","Gene") genes_melt$Gene <- factor(genes_melt$Gene, colnames(tmp2)) top_genes_plot <- ggplot(genes_melt, aes(Component, Gene, fill=Scaled_Gene_Loading))+ geom_tile() + scale_fill_gradientn(colours = log_colour_scale(range(genes_melt$Scaled_Gene_Loading), asymetric = T)) + theme_minimal()+ theme(legend.position = "bottom", axis.ticks.y=element_blank(), axis.text.y=element_text(size=11, margin=margin(0,-6,0,0)), panel.grid.major.y = element_blank() )+ ylab(NULL) top_genes_plot saveRDS(top_genes_plot, "../data/plots/top_genes_plot.rds") library(UpSetR) top_250_genes <- sapply(component_subset, function(x) (names(sort(-abs(SDAresults$loadings[[1]][x,])))[1:250])) upset(fromList(as.list(data.table(top_250_genes))), order.by = "freq", nsets = 10)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.