tSNE on SDA subset

We can get rid of some of the batch effects by running tSNE on the SDA scores rather than counts directly

set.seed(1234) # s42? 28 - AV2?
tsne_data3 <- Rtsne::Rtsne(SDAresults$scores[,-QC_fail_components], verbose=TRUE, pca=FALSE, perplexity = 50, max_iter=1000)

# PCA_onSDA <- prcomp(SDAresults$scores)
# 
# ggplot(data.table(PCA_onSDA$x[,1:4]), aes(PC1,PC2)) +geom_point()

# add tsne to cell_data
tsne_data_dt <- data.table(tsne_data3$Y)
setnames(tsne_data_dt, c("Tsne1_QC1", "Tsne2_QC1"))
tsne_data_dt$cell <- rownames(SDAresults$scores)
setkey(tsne_data_dt, cell)
cell_data[,Tsne1_QC1 := NULL]
cell_data[,Tsne2_QC1 := NULL]
setkey(cell_data, cell)
cell_data <- merge(cell_data, tsne_data_dt)

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=log(library_size))) +
  geom_point(size=0.25) +
  scale_color_viridis(direction=-1) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - library size")

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=experiment)) +
  geom_point(size=0.5) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment") +
  scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey"))
#scale_colour_brewer(palette="Paired")

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=group)) +
  geom_point(size=0.5) +
  scale_color_manual(values = brewer.pal(11,"Paired")) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment")

Components

component_tsne <- plot_grid(plotlist = create_grob_list(fn = print_tsne, input = c(11,32,40,26,37,45,31,33,2,5,38,13,42,39,20,30,35,15,17,18,34)), nrow=3)
component_tsne

pdf("../results/component_tsne.pdf", width=24, height=13.5)
component_tsne
dev.off()


component_tsne <- plot_grid(plotlist = create_grob_list(fn = function(x) print_tsne(x, point_size = 0.5), input = component_order_dt[!grep("Single",name)][order(-pseudotime_average, na.last = F)]$component_number), nrow=6)

pdf("../results/component_tsne_all.pdf", width=25, height=20)
component_tsne
dev.off()

Odd components:

grid.arrange(grobs=create_grob_list(fn = print_tsne, input = QC_fail_components), nrow=3, heights = c(1,1,1))

Marker Genes

marker_genes <- c("Csf1r", "Dcn","Cyp11a1","Aard","Trim7","Gfra1","Nanos1","Esx1","Ctcfl","Prdm9","Ccnb3","Rhox2d","Piwil1","Ccna1","Ssxb1","Acrv1","Lrrd1","Fam71b","Tnp2","Prm2")

marker_tsne <- plot_grid(plotlist = create_grob_list(fn = function(x){print_tsne(x,point_size = 0.25, predict = T) + scale_color_gradient2(mid="lightgrey", high="midnightblue", low=muted("red"))}, input = marker_genes), ncol = 5)

pdf("../results/markers.pdf", width=9, height=6)
marker_tsne
dev.off()

Check Tsne is stable over different seeds

Tsne different seeds

library(SDAtools)
library(ggplot2)
library(data.table)

SDAresults <- load_results(results_folder = "../results/conradV3_sda_1/", data_path = "../data/conradV3/")
rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50)
rownames(SDAresults$pips[[1]]) <- paste0("V",1:50)
str(SDAresults)

do_tsne <- function(i){
  seed <- sample(10^6,1)
  set.seed(seed)
  tsne_data5 <- Rtsne::Rtsne(SDAresults$scores[,-QC_fail_components], verbose=TRUE, pca=FALSE, perplexity = 50, max_iter=1000)
  tsne_data5$seed <- seed
  return(tsne_data5)
}

QC_fail_components <- c(22,6,25,29,12,28,41,1,46,4,8,14,9,43)

tsne_repo_rand_seeds <- mclapply(1:20, function(x) do_tsne(x), mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE)

saveRDS(tsne_repo_rand_seeds, "../data/tmp_cache/tsne_repo_rand_seeds.rds")

tsne_repo_rand_seeds <- readRDS("../data/tmp_cache/tsne_repo_rand_seeds.rds")

#c(262279, 609512, 225666, 140187, 857033, 857033, 880642, 940864, 293105, 648670)

selected <- which(sapply(tsne_repo_rand_seeds, function(x) x$seed) %in% c(225666, 262279, 648670)) #880642,
rotations <- c(-165, -50,  45) #95,

Uniform Manifold Approximation and Projection (UMAP)

Compared to t-SNE it preserves as much of the local, and more of the global data structure, with a shorter runtime.

# devtools::install_github("jlmelville/uwot")
library(uwot)

set.seed(116189)
# nb need to set threads to 1 for repoduciability to work
sda_umap <- umap(SDAresults$scores[,-QC_fail_components], n_neighbors = 100, n_threads = 1)
colnames(sda_umap) <- c("Umap1","Umap2")

saveRDS(sda_umap, file = "../data/tmp_cache/umap.rds")

str(sda_umap)

cell_data$Umap1 <- NULL
cell_data$Umap2 <- NULL

cell_data <- merge(cell_data,
                   cbind(data.table(sda_umap),
                         cell=rownames(SDAresults$scores))
)
saveRDS(cell_data, file = "../data/cache/cell_data.rds")

print_tsne("PseudoTime",dim="Umap1",dim2="Umap2")


print_tsne(5,dim="Umap1",dim2="Umap2")
print_tsne(20,dim="Umap1",dim2="Umap2")
print_tsne(34,dim="Umap1",dim2="Umap2")

print_tsne(38,dim="Umap1",dim2="Umap2") # X activation
print_tsne(48,dim="Umap1",dim2="Umap2") # odd pachytene clump


print_tsne(9,dim="Umap1",dim2="Umap2") # respiration

print_tsne(22,dim="Umap1",dim2="Umap2") # ribosomal

print_tsne("Prdm9",dim="Umap1",dim2="Umap2")

Create figure

tsne_repo_plots <- vector("list", 4)

for(i in 1:3){
tsne_repo_plots[[i]] <- ggplot(data.table(rotate_matrix(tsne_repo_rand_seeds[[selected[i]]]$Y, rotations[i])), aes(V1, V2, colour=cell_data$PseudoTime[match(rownames(SDAresults$scores), cell_data$cell)])) +
  geom_point(size=0.1) +
  scale_color_viridis() +
  theme(legend.position = "none") +
  labs(x="tSNE 1", y="tSNE 2") +
  ggtitle(paste0("Random Seed: ",tsne_repo_rand_seeds[[selected[i]]]$seed," (rotated ",rotations[i],"°)"))
}

tsne_repo_plots[[4]] <- ggplot(cell_data, aes(Umap1, -Umap2, colour=PseudoTime)) +
  geom_point(size=0.1) +
  scale_color_viridis() +
  theme(legend.position = "none") +
  labs(x="UMAP 1", y="-UMAP 2") +
  ggtitle("UMAP, Random Seed: 116189")

png("../results/tsne_random_seeds.png", width = 3000, height = 2400, res=300)
plot_grid(plotlist = tsne_repo_plots, labels = "AUTO")
dev.off()

Clustering Components

by transposing cell scores matrix

Clustering Components with tSNE

set.seed(123)
tsne_scores <- Rtsne::Rtsne(abs(t(SDAresults$scores)), verbose=TRUE, pca=FALSE, perplexity=2, max_iter=5000)

set.seed(12345)
comp_clusters <- data.table(tsne_scores$Y,
           Component = paste0("C",sort(component_labels)," ", names(sort(component_labels))),
           cluster = kmeans(tsne_scores$Y, 7, nstart = 3)$cluster)
comp_clusters <- comp_clusters[data.table(cluster=c(1,2,3,4,5,6,7), Cluster=c("Spermatogonia","Leptotene/Zygotene","Somatic","Somatic (Sertoli)","Pachytene","Spermiogenesis","Round Spermatid (Acrosomal)")), on="cluster"]

pdf("../results/tSNE_Components.pdf", width = 20, height=11)
ggplot(comp_clusters, aes(V1, V2)) +
  geom_mark_ellipse(aes(fill = Cluster, label=Cluster),label.buffer = unit(25, 'mm'), label.fontsize=17) +
  geom_point() +
  geom_label_repel(aes(label=Component)) +
  scale_fill_brewer(palette = "Set1", guide=F) +
  labs(x="tSNE1", y="tSNE2") +
  expand_limits(y=-110)
dev.off()

Clustering Components with UMAP

library(uwot)

set.seed(116189)
# nb need  to set threads to 1 for repoduciability to work
sda_umap <- umap(abs(t(SDAresults$scores)), n_neighbors = 2, n_threads = 1)
colnames(sda_umap) <- c("Umap1","Umap2")

comp_clusters <- data.table(sda_umap,
           Component = paste0("C",sort(component_labels)," ", names(sort(component_labels))),
           cluster = kmeans(sda_umap, 9, nstart = 3)$cluster)


ggplot(comp_clusters, aes(Umap1, Umap2)) +
  geom_point(aes(colour=factor(cluster))) +
  geom_label_repel(aes(label=Component)) +
  scale_fill_brewer(palette = "Set1", guide=F) +
  labs(x="tSNE1", y="tSNE2")


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.