In MyersGroup/testisAtlas: testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

This is an analysis of mouse testis scRNA dropseq data from Don Conrad's lab. Here we use SDA and t-SNE to cluster and developmentally order the single cells.

SDA decomposes a matrix (e.g. digital gene expression matrix) into a number of components represented by two matrices. The column vectors of the first matrix tell you how much a given component is active in each cell/individual. The row vectors of the second matrix tell you which genes are active in a given component.

Key Findings

SDA is able to decompose the expression matrix into components which reflect real biological clusters of cells and their associated expressed genes
The location of cells and their components in t-SNE space reflects the developmental timing
Genes within a component have similar gene expression profile through pseudotime which may reflect shared regulatory mechanisms

library(data.table)
library(ggplot2)
library(ggrepel)
library(ggforce)
library(Matrix)
library(sinsynthr)
library(SDAtools)
library(NMF) # devtools::install_github("renozao/NMF", "devel")
nmf.options(grid.patch=TRUE) # stop blank pages appearing
library(RColorBrewer) # colour pallet for dendrogram
library(viridis)
library(grid)
library(gridExtra)
library(animation)

library(testisAtlas)

Renormalise & Reduce Dimensions

raw_data <- readRDS("../data/merged-mouse_counts_tsne-QC_low-mt.rds")
raw_data <- t(raw_data)

data <- normaliseDGE(raw_data,
                     center = FALSE,
                     scale = TRUE,
                     threshold = 10,
                     min_library_size = 200,
                     gene_subset = (2/3))

library_size <- Matrix::colSums(raw_data[,rownames(data)])

# Issue warning if data changes
stopifnot(digest::digest(data) == "a4a7ae105f7ec26f98ae7934db7b015f")
stopifnot(digest::digest(library_size) == "3b73df05a6e928be9a878a959f4dcaf5")

saveRDS(list(dge=data, library_size=library_size), "../data/merged_mouse_normalised_tsne-QC_V3.rds") # save a cache for easy reopening

1- Matrix::nnzero(data)/prod(dim(data))
median(cell_data$library_size)

Raw tSNE

High library cells tend to be on the outside

Looking at marker genes we can see position on the tSNE correlates with developmental time

data <- readRDS("../data/merged_mouse_normalised_tsne-QC_V3.rds")
library_size <- data$library_size
data <- data$dge

# PCA
#devtools::install_github("gabraham/flashpca/flashpcaR")
library(flashpcaR)
pcaresult <- flashpca(as.matrix(data), ndim=250, stan="center", verbose=T, seed=42)

# tSNE
set.seed(42)
tsne_data <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 100, max_iter=2000)

set.seed(24)
tsne_data2 <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 100, max_iter=2000)


save(pcaresult, file="../data/conradV3/merged_mouse_pca_QCtsne.rds")
save(tsne_data, tsne_data2, file="../data/conradV3/merged_mouse_tsne_QCtsne.rds")

load("../data/merged_mouse_tsne_QCtsne.rds")

cell_data <- data.table(tsne_data$Y)
setnames(cell_data, c("Tsne1","Tsne2"))
cell_data$cell <- rownames(data)
cell_data$experiment <- gsub("[A-Z]+\\.","",rownames(data))
cell_data$group <- gsub("_.*","",cell_data$experiment)
cell_data$library_size <- library_size
setkey(cell_data, "cell")

mito_genes <- grep("mt-",colnames(data), value=T)

ggplot(cell_data, aes(Tsne1, Tsne2, color=log(library_size))) +
  geom_point() +
  scale_colour_distiller(palette="YlOrRd", direction=1) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - library size")

ggplot(cell_data, aes(Tsne1, Tsne2, color=experiment)) +
  geom_point(size=0.5) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment") +
  scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey"))
#scale_colour_brewer(palette="Paired")

ggplot(cell_data, aes(Tsne1, Tsne2, color=group)) +
  geom_point(size=0.5) +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experimental group")

Marker Genes

"
C1qa / (Lyz2, Tyrobp, Pf4, C1qb), 
Ptprc (CD45/LCA)/Lcp1,
Dcn (Adamts5, Col1a2, Gsn,)
Cyp11a1, Hsd3b6, Cyp17a1, Agt, Hsd17b3, Star
Gfra1, Ccnd2
Sall4, Zbtb16, Nanos3
Uchl1, Dmrt1, Crabp1, Ccna2
Ctcfl, Esx1, Pou4f1
Prdm9, Prss50, Zcwpw1, Dmc1, Ugt8a, Inca1
Ccnb3, Rad51ap2, Meiob
Rhox2d, A830018L16Rik, Rhox2h, `Fthl17-ps3`, Dnajc12 (Tex13c2
Piwil1, Tdrd1, Tdrd5
Pou5f2 = Sprm1, 4930581F22Rik, Ccna1, Aurka
Ssxb1, Gm3453, Prss46 `1700001F09Rik`
Acrv1 Spaca1 Lyzl6
Lrrd1 Klf5 `1700122O11Rik` Saxo1 Tex29 Tbc1d23
Fam71b Hemgn Cd46 Cd55 Fam71a
Tnp2 Tnp1 Prm1
Prm2
Smcp Oaz3
"

marker_genes <- c("C1qa", "Ptprc","Dcn","Cyp11a1","Gsta3","Aard","Trim7","Gfra1","Nanos1","Sall4","Uchl1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Pou5f2","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2","Smcp")

marker_genes <- c("Csf1r", "Dcn","Cyp11a1","Aard","Trim7","Gfra1","Nanos1","Esx1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Ccna1","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2")

grid.arrange(grobs=create_grob_list(fn = print_raw_tsne, input = marker_genes), nrow=3, heights = c(1,1,1))

# are the KO genes still expressed?

ggplot(cell_data, aes(Tsne1, Tsne2, color=Cul4a)) +
  geom_point(size=1) +
  scale_colour_distiller(palette="YlOrRd", direction=1) +
  theme(legend.position = "bottom")

group_vector <- gsub("_.*","",gsub("[A-Z]+\\.","",rownames(data)))

qplot(1:20322, raw_data["Cul4a",], col=group_vector, geom="point") +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom")

qplot(1:20322, raw_data["Cul4b",], col=group_vector, geom="point") +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom")

# CNP is Knock In
# qplot(1:20322, raw_data["Cnp",], col=group_vector, geom="point") +
#   scale_color_manual(values = brewer.pal(11,"Paired")) +
#   theme(legend.position = "bottom")

qplot(1:20322, raw_data["Hormad1",], col=group_vector, geom="point") +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom")

qplot(1:20322, raw_data["Mlh3",], col=group_vector, geom="point") +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom")

# Compare to Principal Components

pca_dt <- data.table(pcaresult$projection[,1:10]) #data.table(pca_result$x[,1:10], keep.rownames = T)
setnames(pca_dt, paste0("PC",1:10))
cell_data <- cbind(cell_data[cell==rownames(data)], pca_dt)


grid.arrange(grobs=list(print_pca("PC1"),
                        print_pca("PC2"),
                        print_pca("PC3"),
                        print_pca("PC4"),
                        print_pca("PC5"),
                        print_pca("PC6"),
                        print_pca("PC7")),
            nrow=3, heights = c(1,1,1))

ggplot(cell_data, aes(PC1, PC2, color=experiment)) +
  geom_point() +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment") +
  scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2)))

SDA Run Statistics

Load SDA factorisation and check:

That the run converged
That PIPs (posterior inclusion probability: [approximate] posterior probability that a gene contributes to a given component) behave as expected (i.e. close to 0 or 1)
That the gene loadings are indeed sparse (and the distribution of cell scores)
Which components contribute the most to the variability of the original data

A basic clustering of the components by maximum score and maximum loading shows two main clusters. We will see that the high score components have a single high score so it seems that these components represent overfitting and are therefore not as interesting.

export_data(as.matrix(data), name = "merged_mouse_V3_SDA")

run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_1",
        data = "V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "79151 17351",
        N = 20322)

# load SDA factorisation and helper functions
SDAresults <- load_results(results_folder = "../data/SDA/conradV3_sda_1/", data_path = "../data/count_matrices/")
rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50)
rownames(SDAresults$pips[[1]]) <- paste0("V",1:50)
str(SDAresults)

check_convergence(SDAresults)
loading_distribution(SDAresults)

density_data <- data.table("loading"=c(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5],
                                       SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5]),
                                       "type"=c(rep("Slab (PIP>0.5)         ",sum(SDAresults$pips[[1]]>0.5)),
                                                rep("Spike (PIP<0.5)",sum(SDAresults$pips[[1]]<0.5))))

sparsity_plot <- ggplot(density_data, aes(loading, colour=type)) + 
    stat_density(bw=1e-3, n=2^11, geom="line", position = "identity") +
    facet_zoom(xlim=c(-0.1,0.1), ylim=c(0,15)) + 
    scale_color_brewer(palette = "Set1") + 
    theme_bw() +
    theme(legend.title=element_blank(), legend.position = "top") +
    labs(x="Gene Loading",y="Density") +
    scale_x_continuous(labels = function(x) as.character(x))

saveRDS(sparsity_plot, "../data/plots/sparsity_plot.rds")

density_data <- as.data.table(density(SDAresults$loadings[[1]], n=5000)[c("x","y")])
ggplot(density_data, aes(x, y)) + geom_line()+ facet_zoom(xy = y<1e5 & abs(x)<0.1)

scores_distribution(SDAresults)

# Sparsity of Gene Loadings
mean(SDAresults$pips[[1]]<0.5)

# max loadings with low PIP
max(abs(SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5]))

# range of loadings with low PIP
quantile(SDAresults$loadings[[1]][SDAresults$pips[[1]]<0.5], c(0.005,0.995))

# mean loading PIP>0.5
mean(abs(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5]))

# Max loading
max(SDAresults$loadings[[1]][SDAresults$pips[[1]]>0.5])

pdf("../results/SDA_diagnostics.pdf", width = 12, height = 10)
plot_grid(
plot_free_energy_change(SDAresults),
plot_PIP_change(SDAresults),
plot_maximums(SDAresults),
PIP_distribution(SDAresults), labels = "AUTO")
dev.off()

rna_locations <- load_gene_locations(colnames(SDAresults$loadings[[1]]), name="merge_mouse_V3b", path = "../data/")
rna_locations[,.N,by=chromosome_name][order(-N)]

# LOAD SDA into data.table
merge_sda4 <- data.table(SDAresults$scores, keep.rownames = T)
setnames(merge_sda4, "rn", "cell")
setkey(merge_sda4, cell)
setkey(cell_data, cell)
cell_data <- merge(cell_data, merge_sda4)
#merge_sda3$library_size <- library_size[merge_sda3$cell]

Component scores in tSNE

Some components are uninteresting, wheras others have spatially (hence peseudotemporally) varying patterns. These components tend to cluster in sections around the outside of the t-SNE plot and so be ordered in terms of spacial position.:

# # odd - 13
# odd_components <- c(1,4,6,9,8,10,14,22,25,41,46,16,21)
# grid.arrange(grobs=create_grob_list(fn = print_tsne, input = odd_components), nrow=3, heights = c(1,1,1))
# 
# # somatic - 12
# somatic_components <- c(3,11,24,26,32,37,40,45,48,43,49,19)
# grid.arrange(grobs=create_grob_list(fn = print_tsne, input = somatic_components), nrow=3, heights = c(1,1,1))
# 
# # - 25
# meiotic_components <- c(31,50,27,33,7,29,2,5,44, 12,13,23,47,38,42,39,20,28,30,35,15,17,36,18,34)
#grid.arrange(grobs=create_grob_list(fn = print_tsne, input = meiotic_components), nrow=4, heights = c(1,1,1,1))

load_component_orderings()

QC_components <- paste0("V",component_order_dt[QC_fail==F]$component_number)

Components Overview

The first heatmap shows again that some components only have a single high loading in one cell. Removing these components you can start to see more structure and relationships between the components.

# cell scores
clustered_heatmap(asinh(cell_data[,QC_components, with=FALSE]), "Cells can have multiple components active (Asinh Transformed)", col_lab=NA)
#annotation.col=cell_data[,.(Prm2, Insl3, Nasp, Uchl1, Acrv1)],
aheatmap(cor(cell_data[,QC_components, with=FALSE]),
         breaks=0,
         main="Correlation between components, by Cell Score",
         hclustfun="ward.D2",
         labRow = component_order_dt[QC_fail==F]$name)
cor_scores <- cor(cell_data[,QC_components, with=FALSE])
rownames(cor_scores) <- component_order_dt[QC_fail==F]$name
Heatmap(cor_scores)


# Gene loadings
aheatmap(SDAresults$loadings[[1]][QC_components,], breaks=0, labCol=NA, Rowv=NA, main="Overview of genes loadings in each component", hclustfun="ward.D")
aheatmap(SDAresults$pips[[1]][QC_components,], breaks=0, labCol=NA,  Rowv=NA, main="Overview of genes active (PIP>0.99) in each component", hclustfun="ward.D")
tmp <- SDAresults$pips[[1]][QC_components,]>0.99
mode(tmp) <- "numeric"
aheatmap(tmp, breaks=0, labCol=NA,  Rowv=NA, main="Overview of genes active (PIP>0.99) in each component", hclustfun="ward.D")

# every gene is in at least one component
qplot(seq_along(SDAresults$loadings[[1]][2,]), apply(SDAresults$pips[[1]], 2, max)) + labs(x="Gene", y="Maximum PIP",title="Every gene is in at least one component")
qplot(seq_along(SDAresults$loadings[[1]][2,]), sort(apply(SDAresults$loadings[[1]], 2, max))) + labs(x="Gene", y="Max Loading", title="Maximum loading for each gene")

tmp <- cor(t(SDAresults$loadings[[1]][QC_components,]))
tmp[tmp==1] <- NA
aheatmap(tmp, breaks=0, main="Most components have low correlations by gene loadings", na.color="grey")

Single Cell Component

head(sort(SDAresults$scores[,4]))

#sc_correlations <- sapply(rownames(data), function(x) cor(data[x,], SDAresults$loadings[[1]][4,], method = "spearman"))
#saveRDS(sc_correlations, "../data/sc_correlations.rds")
#head(sort(sc_correlations, F))
# basically the same as the cell scores, and takes ages so commented out

c4scores <- qplot(1:length(SDAresults$scores[,4]),sort(SDAresults$scores[,4],T), geom="point") + labs(x="Sorted Cell Index", y="Component 4 Cell Loading")

v4_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][4,], label_both = FALSE, max.items = 10, label.size = 3, hide_unknown = T, gene_locations = gene_annotations, ) +
  ylab("Gene Loading (Component 4)")

cor1 <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",4,drop=F] %*% SDAresults$loadings[[1]][4,]), alpha=I(0.3), stroke=0) + 
  annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], -SDAresults$loadings[[1]][4,]),3))) +
  labs(x="Raw (Normalised) Gene Expression", y="Predicted gene expresstion using component 4", title = "Component 4 only Prediction vs Data for highest cell")

cor2 <- qplot(data["ATCCTCATCCAA.Mlh3_1190",], c(SDAresults$scores["ATCCTCATCCAA.Mlh3_1190",4,drop=F] %*% SDAresults$loadings[[1]][4,]), alpha=I(0.3), stroke=0) + 
  annotate("text", x=1.7, y=1, label=paste("Correlation:",signif(cor(data["ATCCTCATCCAA.Mlh3_1190",], -SDAresults$loadings[[1]][4,]),3))) +
  labs(x="Raw (Normalised) Gene Expression", y="Predicted gene expresstion using component 4", title = "Component 4 only Prediction vs Data for 2nd highest cell")

sans4 <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",-c(4),drop=F] %*% SDAresults$loadings[[1]][-c(4),]), alpha=I(0.3), ylim = c(-2,10.5), stroke=0) + 
  annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",-c(4),drop=F] %*% SDAresults$loadings[[1]][-c(4),])),3))) +
  labs(x="Raw (Normalised) Gene Expression", y="Predicted Gene Expression without C4", title = "Prediction vs Data without Component 4")

allcomps <- qplot(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",,drop=F] %*% SDAresults$loadings[[1]]), alpha=I(0.3), ylim = c(-2,10.5), stroke=0) + 
  annotate("text", x=1.7, y=8, label=paste("Correlation:",signif(cor(data["ATCCTCACCAAN.Mlh3_1190",], c(SDAresults$scores["ATCCTCACCAAN.Mlh3_1190",,drop=F] %*% SDAresults$loadings[[1]][,])),3))) +
  labs(x="Raw (Normalised) Gene Expression", y="Predicted Gene Expression", title = "Prediction vs Data with all components")


pdf("../results/single_cell_component.pdf", width=15, height=15)
cowplot::plot_grid(c4scores, v4_loadings_plot, cor1, cor2, sans4, allcomps, ncol=2, labels = "AUTO")
dev.off()

Top Genes by component

Not the same

component_subset <- c("V31","V33","V2","V5","V38","V23","V13","V42","V20","V30","V35","V15","V17","V18")
is_negative <- diag(c(1,-1,-1,-1,1,1,-1,-1,-1,1,1,-1,1,-1))

rownames(is_negative) <- component_subset

sign_corrected_loadings_subset <- is_negative %*% SDAresults$loadings[[1]][component_subset,]

top_20_genes <- sapply(component_subset, function(x) (names(sort(sign_corrected_loadings_subset[x,],T))[1:10]))

avoid_overlap <- function(x) 
{
  ind <- seq_along(x) %% 2 == 0
  x[ind] <- paste0(x[ind],"--------------") #————————
  x[!ind] <- paste0(x[!ind],"-") #-
  x
}
#                              
tmp2 <- t(scale(t(sign_corrected_loadings_subset), center = F, scale = T))[,unique(c(top_20_genes))] #sign_corrected_loadings_subset[,unique(c(top_20_genes))] #

colnames(tmp2) <- avoid_overlap(colnames(tmp2))

# do heatmap in ggplot so can easily combine figures
genes_melt <- melt(data.table(t(tmp2), keep.rownames = T),
                   id.vars = "rn",
                   variable.name = "Component",
                   value.name = "Scaled_Gene_Loading")

setnames(genes_melt, "rn","Gene")
genes_melt$Gene <- factor(genes_melt$Gene, colnames(tmp2))


top_genes_plot <- ggplot(genes_melt, aes(Component, Gene, fill=Scaled_Gene_Loading))+
  geom_tile() +
  scale_fill_gradientn(colours = log_colour_scale(range(genes_melt$Scaled_Gene_Loading), asymetric = T)) +
  theme_minimal()+
  theme(legend.position = "bottom",
        axis.ticks.y=element_blank(),
        axis.text.y=element_text(size=11, margin=margin(0,-6,0,0)),
        panel.grid.major.y = element_blank()
        )+
  ylab(NULL)
top_genes_plot

saveRDS(top_genes_plot, "../data/plots/top_genes_plot.rds")

library(UpSetR)

top_250_genes <- sapply(component_subset, function(x) (names(sort(-abs(SDAresults$loadings[[1]][x,])))[1:250]))

upset(fromList(as.list(data.table(top_250_genes))), order.by = "freq", nsets = 10)