In MyersGroup/testisAtlas: testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

tSNE on SDA subset

We can get rid of some of the batch effects by running tSNE on the SDA scores rather than counts directly

set.seed(1234) # s42? 28 - AV2?
tsne_data3 <- Rtsne::Rtsne(SDAresults$scores[,-QC_fail_components], verbose=TRUE, pca=FALSE, perplexity = 50, max_iter=1000)

# PCA_onSDA <- prcomp(SDAresults$scores)
# 
# ggplot(data.table(PCA_onSDA$x[,1:4]), aes(PC1,PC2)) +geom_point()

# add tsne to cell_data
tsne_data_dt <- data.table(tsne_data3$Y)
setnames(tsne_data_dt, c("Tsne1_QC1", "Tsne2_QC1"))
tsne_data_dt$cell <- rownames(SDAresults$scores)
setkey(tsne_data_dt, cell)
cell_data[,Tsne1_QC1 := NULL]
cell_data[,Tsne2_QC1 := NULL]
setkey(cell_data, cell)
cell_data <- merge(cell_data, tsne_data_dt)

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=log(library_size))) +
  geom_point(size=0.25) +
  scale_color_viridis(direction=-1) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - library size")

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=experiment)) +
  geom_point(size=0.5) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment") +
  scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey"))
#scale_colour_brewer(palette="Paired")

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=group)) +
  geom_point(size=0.5) +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment")

Components

component_tsne <- plot_grid(plotlist = create_grob_list(fn = print_tsne, input = c(11,32,40,26,37,45,31,33,2,5,38,13,42,39,20,30,35,15,17,18,34)), nrow=3)
component_tsne

pdf("../results/component_tsne.pdf", width=24, height=13.5)
component_tsne
dev.off()


component_tsne <- plot_grid(plotlist = create_grob_list(fn = function(x) print_tsne(x, point_size = 0.5), input = component_order_dt[!grep("Single",name)][order(-pseudotime_average, na.last = F)]$component_number), nrow=6)

pdf("../results/component_tsne_all.pdf", width=25, height=20)
component_tsne
dev.off()

Odd components:

grid.arrange(grobs=create_grob_list(fn = print_tsne, input = QC_fail_components), nrow=3, heights = c(1,1,1))

Marker Genes

marker_genes <- c("Csf1r", "Dcn","Cyp11a1","Aard","Trim7","Gfra1","Nanos1","Esx1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Ccna1","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2")

marker_tsne <- plot_grid(plotlist = create_grob_list(fn = function(x){print_tsne(x,point_size = 0.25, predict = T) + scale_color_gradient2(mid="lightgrey", high="midnightblue", low=muted("red"))}, input = marker_genes), ncol = 5)

pdf("../results/markers.pdf", width=9, height=6)
marker_tsne
dev.off()

Check Tsne is stable over different seeds

Tsne different seeds

library(SDAtools)
library(ggplot2)
library(data.table)

SDAresults <- load_results(results_folder = "../results/conradV3_sda_1/", data_path = "../data/conradV3/")
rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50)
rownames(SDAresults$pips[[1]]) <- paste0("V",1:50)
str(SDAresults)

do_tsne <- function(i){
  seed <- sample(10^6,1)
  set.seed(seed)
  tsne_data5 <- Rtsne::Rtsne(SDAresults$scores[,-QC_fail_components], verbose=TRUE, pca=FALSE, perplexity = 50, max_iter=1000)
  tsne_data5$seed <- seed
  return(tsne_data5)
}

QC_fail_components <- c(22,6,25,29,12,28,41,1,46,4,8,14,9,43)

tsne_repo_rand_seeds <- mclapply(1:20, function(x) do_tsne(x), mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE)

saveRDS(tsne_repo_rand_seeds, "../data/tmp_cache/tsne_repo_rand_seeds.rds")

tsne_repo_rand_seeds <- readRDS("../data/tmp_cache/tsne_repo_rand_seeds.rds")

#c(262279, 609512, 225666, 140187, 857033, 857033, 880642, 940864, 293105, 648670)

selected <- which(sapply(tsne_repo_rand_seeds, function(x) x$seed) %in% c(225666, 262279, 648670)) #880642,
rotations <- c(-165, -50,  45) #95,

Uniform Manifold Approximation and Projection (UMAP)

Compared to t-SNE it preserves as much of the local, and more of the global data structure, with a shorter runtime.

# devtools::install_github("jlmelville/uwot")
library(uwot)

set.seed(116189)
# nb need to set threads to 1 for repoduciability to work
sda_umap <- umap(SDAresults$scores[,-QC_fail_components], n_neighbors = 100, n_threads = 1)
colnames(sda_umap) <- c("Umap1","Umap2")

saveRDS(sda_umap, file = "../data/tmp_cache/umap.rds")

str(sda_umap)

cell_data$Umap1 <- NULL
cell_data$Umap2 <- NULL

cell_data <- merge(cell_data,
                   cbind(data.table(sda_umap),
                         cell=rownames(SDAresults$scores))
)
saveRDS(cell_data, file = "../data/cache/cell_data.rds")

print_tsne("PseudoTime",dim="Umap1",dim2="Umap2")


print_tsne(5,dim="Umap1",dim2="Umap2")
print_tsne(20,dim="Umap1",dim2="Umap2")
print_tsne(34,dim="Umap1",dim2="Umap2")

print_tsne(38,dim="Umap1",dim2="Umap2") # X activation
print_tsne(48,dim="Umap1",dim2="Umap2") # odd pachytene clump


print_tsne(9,dim="Umap1",dim2="Umap2") # respiration

print_tsne(22,dim="Umap1",dim2="Umap2") # ribosomal

print_tsne("Prdm9",dim="Umap1",dim2="Umap2")

Create figure

tsne_repo_plots <- vector("list", 4)

for(i in 1:3){
tsne_repo_plots[[i]] <- ggplot(data.table(rotate_matrix(tsne_repo_rand_seeds[[selected[i]]]$Y, rotations[i])), aes(V1, V2, colour=cell_data$PseudoTime[match(rownames(SDAresults$scores), cell_data$cell)])) +
  geom_point(size=0.1) +
  scale_color_viridis() +
  theme(legend.position = "none") +
  labs(x="tSNE 1", y="tSNE 2") +
  ggtitle(paste0("Random Seed: ",tsne_repo_rand_seeds[[selected[i]]]$seed," (rotated ",rotations[i],"°)"))
}

tsne_repo_plots[[4]] <- ggplot(cell_data, aes(Umap1, -Umap2, colour=PseudoTime)) +
  geom_point(size=0.1) +
  scale_color_viridis() +
  theme(legend.position = "none") +
  labs(x="UMAP 1", y="-UMAP 2") +
  ggtitle("UMAP, Random Seed: 116189")

png("../results/tsne_random_seeds.png", width = 3000, height = 2400, res=300)
plot_grid(plotlist = tsne_repo_plots, labels = "AUTO")
dev.off()

Clustering Components

by transposing cell scores matrix

Clustering Components with tSNE

set.seed(123)
tsne_scores <- Rtsne::Rtsne(abs(t(SDAresults$scores)), verbose=TRUE, pca=FALSE, perplexity=2, max_iter=5000)

set.seed(12345)
comp_clusters <- data.table(tsne_scores$Y,
           Component = paste0("C",sort(component_labels)," ", names(sort(component_labels))),
           cluster = kmeans(tsne_scores$Y, 7, nstart = 3)$cluster)
comp_clusters <- comp_clusters[data.table(cluster=c(1,2,3,4,5,6,7), Cluster=c("Spermatogonia","Leptotene/Zygotene","Somatic","Somatic (Sertoli)","Pachytene","Spermiogenesis","Round Spermatid (Acrosomal)")), on="cluster"]

pdf("../results/tSNE_Components.pdf", width = 20, height=11)
ggplot(comp_clusters, aes(V1, V2)) +
  geom_mark_ellipse(aes(fill = Cluster, label=Cluster),label.buffer = unit(25, 'mm'), label.fontsize=17) +
  geom_point() +
  geom_label_repel(aes(label=Component)) +
  scale_fill_brewer(palette = "Set1", guide=F) +
  labs(x="tSNE1", y="tSNE2") +
  expand_limits(y=-110)
dev.off()

Clustering Components with UMAP

library(uwot)

set.seed(116189)
# nb need  to set threads to 1 for repoduciability to work
sda_umap <- umap(abs(t(SDAresults$scores)), n_neighbors = 2, n_threads = 1)
colnames(sda_umap) <- c("Umap1","Umap2")

comp_clusters <- data.table(sda_umap,
           Component = paste0("C",sort(component_labels)," ", names(sort(component_labels))),
           cluster = kmeans(sda_umap, 9, nstart = 3)$cluster)


ggplot(comp_clusters, aes(Umap1, Umap2)) +
  geom_point(aes(colour=factor(cluster))) +
  geom_label_repel(aes(label=Component)) +
  scale_fill_brewer(palette = "Set1", guide=F) +
  labs(x="tSNE1", y="tSNE2")