#library(ggplot2)
#library(testisAtlas)
#library(SDAtools)

#devtools::load_all("~/Dropbox/Github/ggplot2/")
library(ggplot2)
library(SDAtools) #devtools::load_all("~/Dropbox/Github/SDAtools/")
library(MoitfFinder) #devtools::load_all("~/Dropbox/Github/MotifFinder/")
pkgload::load_all()


library(data.table)
library(cowplot)
library(gridExtra)
library(ggforce)
library(ggrepel)
library(RColorBrewer)
library(ggnewscale)


load2("../data/cache")

load_component_orderings()

Figure 3

GWAS recombination enrichment

#from https://science.sciencemag.org/content/sci/363/6425/eaau1043.full.pdf

## TABLE 5A, Coding & Splice site
## Inlcuded in our list:
# Syce1, ok
# Ccnb1ip1, ok, GWS MATERNAL & JOINT, Medium paternal
# Meiob, ok, GWS MATERNAL, weak paternal, but expressed ok zyg
# Fbxo47, ok GWS JOINT
# H2bfm, ok, mainly Hormad / X, but a little in L&Z
# Msh4, Ok, GWS MATERNAL & JOINT, weak paternal
# Hfm1, ok
# Rnf212, ok
# C14orf39, ok,  = 4930447C04Rik, aka SIX6OS1
# Rad21l1, ok,  = RAD21l
# Hormad1, ok
# Prdm9, ok
# Hus1b, ok, GWS joint & MATERNAL
# Sycp3, ok
# EAPP, ok, MATERNAL, - Supp says intergentic, main says ms -see below?, GWS MATERNAL, weak paternal
# Syce2, ok, MATERNAL & JOINT, weak paternal
# Ctcfl, ok, next to Spo11
# Hsf2bp, ok
# Smc1b, ok

## TABLE 5A Not included in our list:
# MSFD7 (aka Slc49a3) = Mfsd7a, no detected expression, Previously CPLX1?? ~ 100kb away
# CT45A9, no ortohlogue, many similar genes nearby (inc. SAGE1 aka Ints6l -testis specific, sertoli/LZ)
# MAPT, 4444 in LD, in inversion HsInv0573 (hg18:chr17:40928985-42139672, hg38:chr17:45495836-46707123 hg19:chr17:43573202-44784489)
# ANHX, no mouse homologue?, MATERNAL & JOINT only, v. testis specific
# Prame, one to many orthologue to mouse
# Fancb, undetecteable expression

## TABLE 5B
## NON CODING
# KLHL21, ok - 3'utr, GWS MATERNAL only, joint medium
# intergenic / Rnf212, ok (also in coding list)
# NFKBIL1 intergenic, DDX39B-AS1, MATERNAL
# Seh1l upstream , GWS MATERNAL, joint medium
# AP1M2, intergenic, GWS MATERNAL, joint medium
# intergeic, MATERNAL  gene desert (BMP2)
# Rnf212, covered above
# SMEK1, ok - 3' UTR, = Ppp4r3a, GWS MATERNAL & JOINT
# C17orf104, ok - (aka Meioc) - paper "Meioc maintains an extended meiotic prophase I in mice."
# ANXA9, intergenic, GWS MATERNAL, medium paternal
# Acyp2, syn, GWS MATERNAL & JOINT, maybe next door gene mouse=Tspyl1?
# Fam178b, intergenic, GWS MATERNAL & JOINT, medium paternal
# C11orf80, ok 3'UTR = Gm960 in mouse, GWS MATERNAL & JOINT
# Sycp2, ok - 3'UTR
# One intergenic near Eapp
# One intergenic near Fancb
# PASD1 no orthologue

gwas_genes <- c("Syce1","Ccnb1ip1","Meiob","Fbxo47","H2bfm","Msh4","Hfm1","Rnf212","4930447C04Rik","Rad21l","Hormad1","Prdm9","Hus1b","Sycp3","Eapp","Syce2","Ctcfl","Hsf2bp","Smc1b","Klhl21","Ppp4r3a","Gm960","Sycp2","Meioc")

# Kong et al : Rnf212, Cplx, Ccnb1ip1, C14orf39 = 4930447C04Rik, aka SIX6OS1, Smek1 = Ppp4r3a, Rad21l, Mh4, Ccdc42, Prdm9

gwas_genes_rr_enrich <- component_enrichment(gwas_genes, threshold = 300)
c5_enrich_plot <- manhatten_plot(gwas_genes_rr_enrich[!component %in% gsub("V","",half_exclusion_comps)], legend_position = c(0.20,0.7))
head(gwas_genes_rr_enrich[!component %in% gsub("V","",half_exclusion_comps)][order(p.value)])

tmp <- single_component_enrichment(SDAresults$loadings[[1]][5,], pos=F, genes=gwas_genes)$ranks
ggplot(data.table(rank=tmp, gene=names(tmp)), aes(rank, 1:24, label=gene))+ geom_vline(xintercept = 0, col="red") + geom_point() + xlim(-4000,19264) + geom_label_repel(force = 10, nudge_x = -1000) + xlab("Rank in Component 5") + ylab("")

Enrichment rug plot

tmp <- data.table(loading = sort(-SDAresults$loadings[[1]][5,],T),
                  gene = names(sort(-SDAresults$loadings[[1]][5,],T)))
tmp$rank <- seq_along(tmp$loading)
tmp[gene %in% gwas_genes, KO := "Recombination GWAS hits"]

c5_rug_plot <- ggplot(tmp, aes(rank, -loading,colour=KO)) +
    geom_point(colour="black") +
    geom_rug(data=tmp[gene %in% gwas_genes], sides = "b", length=0.15) +
    scale_y_reverse() +
    ylab("Component\n 5 Gene Loading") + xlab("Rank of gene by loading") +
    scale_color_brewer(palette = "Set1")+
    guides(colour = guide_legend(override.aes = list(size=5))) +
    theme(legend.position = c(0.4,0.6)) + labs(colour=NULL)

c5_rug_plot

Loadings V5

v5_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][5,], label_both = FALSE, max.items = 20, label.size = 3.5,
                                     gene_locations = gene_annotations, hide_unknown = T,
                                     highlight_genes = gwas_genes,label_genes = c("Smc1b","Gm960","Hfm1","Sycp2","Meioc","Meiob","Sycp3")) +
  ylab("Gene Loadings (Component 5)") +
  scale_y_reverse()

v5_loadings_plot

tSNE v5

tsne_plot <- print_tsne(5, curve = T, stages = T, point_size = 0.75) + ggtitle("")
tsne_plot
matrix_plot <- ggdraw() + draw_image("../../figures/matrix_decomp_cells.png", clip="on")

sparsity_plot <- readRDS("../data/plots/sparsity_plot.rds")

gene_loading_sparsity <- readRDS("../data/plots/gene_loading_sparsity.rds")


left <- plot_grid(matrix_plot, tsne_plot, c5_enrich_plot,
                  rel_heights = c(0.4,1.5,0.6), ncol=1,
                  labels = c("A","D","F"), label_size = 25)

sparsity <- plot_grid(sparsity_plot, gene_loading_sparsity, ncol=2, rel_widths = c(5,4), labels = c("B","C"), label_size = 25)

right <- plot_grid(sparsity, v5_loadings_plot, 
                   rel_heights = c(0.6, 1), ncol=1,
                   labels = c("","E"), label_size = 25)

pdf(height=10, width=13, file="../results/Figure_SDA_Intro.pdf")
plot_grid(left, right, ncol=2)
dev.off()



structure <- readRDS("../data/plots/structure_plot.rds")

top_genes_plot <- readRDS("../data/plots/top_genes_plot.rds")

GO_plot <-  readRDS("../data/plots/GO_plot.rds")

scores_pseudotime <- readRDS("../data/plots/scores_pseudotime.rds")

left <- plot_grid(scores_pseudotime, top_genes_plot,
                   rel_heights = c(1,2), ncol=1,
                   labels = c("A","C"), label_size = 25)
right <- plot_grid(structure, GO_plot,
                   rel_heights = c(3,4), ncol=1,
                   labels = c("B","D"), label_size = 25)

pdf(height=16, width=15, file="../results/Figure_SDA_Overlaping_Scores.pdf")
plot_grid(left, right, ncol=2, rel_widths = c(4,5))
dev.off()
leg <- ggplot(data.table(x=1,y=1,z="Recombination GWAS Hit (Halldorsson et al 2019)"), aes(x,y, colour=z)) + geom_point() + scale_colour_manual(values='red', name="") + theme_minimal()

pdf("../results/C5_loadings.pdf", width = 8.3, height = 5)
plot_grid(v5_loadings_plot, get_legend(leg), ncol=1, rel_heights = c(1,0.1))
dev.off()


v5_loadings_plot + get_legend(leg)

scale_colour_manual(name="", values=c(myline1="red", myline2="blue", myline3="purple"))

Sparsity by component

ggplot(data.table(Sparsity=apply(SDAresults$pips[[1]], 1, function(x) mean(x<0.5)),
           Component=factor(names(sort(component_labels)), levels=unique(names(sort(component_labels))[component_order_all]))),
       aes(Component, Sparsity)) +
  geom_point() +
  coord_flip()

Figure 9

9A Hormad1 Scores & Loadings

v38_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][38,], label_both = FALSE, max.items = 10, label.size = 3, hide_unknown = T, gene_locations = gene_annotations) +
  ylab("Gene Loading (Component 38)")

9B Rhox2h Hormad vs WT

tmp <- gene_expression_pseudotime(c("Rhox2h"))
tmp[group=="Hormad1", Hormad1_KO := TRUE]
tmp[group!="Hormad1", Hormad1_KO := FALSE]

Hormad1_gxe <- ggplot(tmp[PseudoTime>14500], aes(-PseudoTime, value, colour=Hormad1_KO)) +
  geom_point(size=0.5) + 
  theme_minimal() + 
  ylab("Rhox2h Gene Expression") +
  xlab("Pseudotime") +
  theme(legend.position = c(0.8,0.7),
        legend.background = element_rect(colour="black")) + 
  scale_color_brewer(palette = "Set1",name="Hormad1 KO?")

pdf(height=5, width=5*1.7, file="../results/Rhox2h.pdf")
Hormad1_gxe
dev.off()

tmp <- gene_expression_pseudotime(c("A830018L16Rik","Rhox2h"))
tmp[group=="Hormad1", Hormad1_KO := TRUE]
tmp[group!="Hormad1", Hormad1_KO := FALSE]

Hormad1_gxe2 <- ggplot(tmp[PseudoTime>14500], aes(-PseudoTime, value, colour=Hormad1_KO)) +
  geom_point(size=0.5) + 
  theme_minimal() + 
  ylab("Gene Expression") +
  xlab("Pseudotime") +
  theme(legend.position = c(0.8,0.3),
        legend.background = element_rect(colour="black"),
        strip.text.x = element_text(size = 12)) + 
  scale_color_brewer(palette = "Set1",name="Hormad1 KO?") +
  facet_wrap(~Gene, ncol=1)

pdf(height=5, width=6*1.7, file="../results/Hormad1_two.pdf")
plot_grid(v38_loadings_plot, Hormad1_gxe2)
dev.off()

9C Cul4a Scores & Loadings

v25_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][25,], label_both = FALSE, max.items = 10, label.size = 3, hide_unknown = T, gene_locations = gene_annotations) +
  ylab("Gene Loading (Component 25)") + scale_y_reverse()

9D MQ enrichment

# from alzheimers.Rmd
AZ_enrich_plot <- readRDS("../data/plots/AZ_enrich_plot.rds")

Fig 9 Combined

c38 <- plot_grid(v38_loadings_plot + scale_size_continuous(range = c(1, 3)),
                 plot_cell_scores("V38", point_size = 1),
                 ncol = 2, rel_widths = c(3,1))
c25 <- plot_grid(v25_loadings_plot + scale_size_continuous(range = c(1, 3)),
                 plot_cell_scores("V25") + scale_x_reverse(),
                 ncol = 2, rel_widths = c(3,1))

pdf(height=5, width=5*1.7, file="../results/C38.pdf")
c38
dev.off()

#c11 <- plot_grid(v11_loadings_plot, v11_scores, ncol = 2, rel_widths = c(3,1))

set.seed(87162)
pdf(width = 12, height = 6, file = "../results/Figure_9_AtoD.pdf")
plot_grid(plotlist = list(c38,
                       Hormad1_gxe,
                       c25,
                       AZ_enrich_plot + theme(axis.text.x=element_text(size=4))),
          rel_widths = c(3,2),
          labels = "AUTO")
dev.off()

Ernst meadian counts

median: 18,499

download.file("https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-6946/E-MTAB-6946.processed.3.zip", 
                destfile = "../data/previous_studies/Ernst/cellranger_metadata.zip")
unzip("../data/previous_studies/Ernst/cellranger_metadata.zip") 
ernst <- fread("../data/previous_studies/Ernst/cell_metadata.txt")
str(ernst)
median(ernst$total_counts)
mean(ernst$total_counts)

Batch effects

Hormad

tmp <- data.table(SDAresults$scores, cell=rownames(SDAresults$scores))
tmp$Hormad <- grepl("Hormad",tmp$cell)
tmp$WT <- grepl("WT",tmp$cell)

tmp$experiment <- unlist(strsplit(tmp$cell, split = "\\."))[c(F,T)]

tmp$cell_id <- 1:nrow(tmp)

ggplot(tmp, aes(cell_id, V12, colour=experiment)) + geom_point() + theme(axis.text.x = element_blank())

ggplot(tmp[grepl("Hormad",experiment)], aes(cell_id, V12, colour=experiment)) +
  geom_point() + theme(axis.text.x = element_blank())

MiSeq

# MiSeq = SP*, WT 28117, MLH3-KO
tmp[,MiSeq := FALSE]
tmp[grepl("SPC.|SPD.|28117",experiment), MiSeq := TRUE]

# By date jittered
tmp[grepl("WT",experiment)] %>%
   mutate(experiment = factor(experiment, levels = unique(tmp[WT==T]$experiment)[c(9,1:8)])) %>%
ggplot(aes(V22, experiment, colour=experiment)) +
  geom_jitter(size=1, alpha=0.5, stroke=0) +
  scale_color_brewer(palette = "Set1") + 
  theme_minimal() +
  theme(legend.position = "none", axis.title.y=element_blank() ) +
  xlab(paste0("Component ","V22","\n Cell Score"))

# MiSeq
tmp %>%
  mutate(experiment = factor(experiment, levels = tmp[,mean(V22),by=experiment][order(V1)]$experiment)) %>%
ggplot(aes(V22, experiment, colour=MiSeq)) +
  geom_jitter(size=1, alpha=0.5, stroke=0) +
  scale_color_brewer(palette = "Set1") + 
  theme_minimal() +
  theme(axis.title.y=element_blank() ) +
  xlab(paste0("Component ","V22","\n Cell Score"))

# MiSeq aggregated
ggplot(tmp,aes(V22, MiSeq, colour=MiSeq)) +
  geom_jitter(size=1, alpha=0.5, stroke=0) +
  scale_color_brewer(palette = "Set1") + 
  theme_minimal() +
  theme(axis.title.y=element_blank() ) +
  xlab(paste0("Component ","V22","\n Cell Score"))

# MiSeq Density plot
ggplot(tmp, aes(V22, colour=MiSeq)) +
  geom_density() +
  scale_color_brewer(palette = "Set1") + 
  theme_minimal() +
  theme(axis.title.y=element_blank() ) +
  xlab(paste0("Component ","V22","\n Cell Score"))


rpsl_genes <- grep("Rps|Rpl", names(SDAresults$loadings[[1]][22,]), value=T)
rpsl_genes <- rpsl_genes[!grepl("-ps",rpsl_genes)]
manhatten_plot(component_enrichment(rpsl_genes, threshold = 300), topn = 5)
v22_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][22,], label_both = FALSE, max.items = 20, label.size = 4, hide_unknown = T, gene_locations = gene_annotations) +
  ylab("Gene Loading (Component 22)") +
  scale_y_reverse() +
  scale_size_continuous(range = c(1, 3))

v22_cell_scores <- tmp %>%
  mutate(experiment = factor(experiment, levels = tmp[,mean(V22),by=experiment][order(V1)]$experiment)) %>%
ggplot(aes(V22, experiment, colour=MiSeq)) +
  geom_jitter(size=1, alpha=0.5, stroke=0) +
  scale_color_brewer(palette = "Set1") + 
  theme_minimal() +
  theme(axis.title.y=element_blank() ) +
  xlab(paste0("Component ","V22","\n Cell Score"))

v12_cell_scores <- ggplot(tmp,aes(V12, experiment, colour=grepl("Hormad",experiment))) +
  geom_jitter(size=1, alpha=0.5, stroke=0) +
  scale_color_brewer(palette = "Set1", guide=F) + 
  theme_minimal() +
  theme(axis.title.y=element_blank()) +
  xlab(paste0("Component ","V12","\n Cell Score"))

v12_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][12,], label_both = FALSE, max.items = 20, label.size = 4, hide_unknown = T, gene_locations = gene_annotations) +
  ylab("Gene Loading (Component 12)") +
  scale_y_reverse() +
  scale_size_continuous(range = c(1, 3))

ETC enrichment

#GO_data <- readRDS("../data/GO_enrichment_dt.rds")

# unique(ETC_genes$"MGI Gene/Marker ID")[unique(ETC_genes$"MGI Gene/Marker ID") %in% colnames(SDAresults$loadings[[1]])]
# 
# ETC_genes <- fread("../data/GO_term_summary_20190318_211630.txt")
# unique(ETC_genes$"MGI Gene/Marker ID")

etc_genes <- grep("Uqc|Cox|Ndu",colnames(SDAresults$loadings[[1]]),value = T)
etc_genes <- grep("-ps",etc_genes, invert = T,value = T)

library(biomaRt)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org", dataset = "mmusculus_gene_ensembl")
mapTab2 <- getBM(attributes = c("external_gene_name",'description'),
                filter = "external_gene_name", values = etc_genes, mart = ensembl, uniqueRows=TRUE)
etc_genes_clean <- data.table(mapTab2)[!grepl("assembly|like",description)]$external_gene_name
etc_genes_assemble <- data.table(mapTab2)[grepl("assembly",description)]$external_gene_name

etc_enrich <- component_enrichment(etc_genes_clean)


etc_enrich_plot <- manhatten_plot(etc_enrich[!component %in% gsub("V","",half_exclusion_comps)], topn=1, legend_position=c(0.25,0.6), repel_force = 10) + ggtitle("Enrichment of electron transport\n chain proton pump genes")

set.seed(142)
v9_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][9,], label_both = FALSE, label.repel = 2,  max.items = 20, label.size = 4, hide_unknown = T, gene_locations = gene_annotations, highlight_genes = etc_genes_clean) +
  ylab("Gene Loading (Component 9)") +
  scale_y_reverse() +
  scale_size_continuous(range = c(1, 3)) + expand_limits(y=-0.3)

pdf("../results/S6B_NonMeiotic_Components.pdf", height = 12, width=15)
plot_grid(v22_loadings_plot, v22_cell_scores,
          v12_loadings_plot, v12_cell_scores,
          v9_loadings_plot, etc_enrich_plot,
          rel_widths = c(2,1),
          ncol=2,
          labels = "AUTO")
dev.off()


  pdf(height=6, width=6*(16/9), file="../results/MiSeq.pdf")
  plot_grid(v22_loadings_plot, v22_cell_scores, rel_widths = c(3,2))
  dev.off()

  pdf(height=6, width=6*(16/9), file="../results/ETC.pdf")
  plot_grid(v9_loadings_plot, etc_enrich_plot, rel_widths = c(3,2))
  dev.off()

Spermatogonia

tricolour_tsne(c("Gfra1","Upp1","Id4")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Id4")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Fli1")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Utf1")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Glis3")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Batf")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Foxf1")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Rhox10")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Lin28a")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")
tricolour_tsne(c("Gfra1","Upp1","Zbtb12")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Fbxl14")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Furin")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Dvl3")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Csrp1")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Lmtk2")) +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

Check for age components

mouse_ages <- data.table(
  experiment=c("CNP_2016-12-15", "CNP_2017-01-05", "Cul4a_2017-03-28", "Cul4a_2017-03-30", 
"Hormad1_1_A", "Hormad1_2_A", "Hormad1_3_A", "Mlh3_1190", "Mlh3_1191", 
"Mlh3_1192", "Mlh3_300", "Mlh3_800", "SPCII_1", "SPCI_1", "SPD_1", 
"SPG_1", "WT_2016-11-08", "WT_2016-11-10", "WT_2016-11-11", "WT_2016-11-15", 
"WT_2016-11-16", "WT_2016-11-17", "WT_2016-11-21", "WT_2016-11-22", 
"WT_28117", "mj_3"),
  mouse_age=c(
    # CNP
2 * month + 14,
2 * month + 16,

  # Cul4a
1 * month + 18,
1 * month + 21,

# Hormad
4 * month + 3,
4 * month + 3,
4 * month + 0,

# Mlh3
2 * month + 21,
2 * month + 22,
2 * month + 23,

8 * month + 30,
8 * month + 30,

#SPC
2 * month + 21,
2 * month + 21,
2 * month + 21,
0 * month + 7,

#WT
5 * month + 5,
5 * month + 7,
3 * month + 27,
4 * month + 0,
3 * month + 1,
3 * month + 2,
9 * month + 14,
7 * month + 21,

4 * month + 0,

#MJ
4 * month + 2)
)

tmp2 <- data.table(SDAresults$scores, cell=rownames(SDAresults$scores))
tmp2 <- melt(tmp2, id="cell")
tmp2$experiment <- unlist(strsplit(tmp$cell,split = "\\."))[c(F,T)]

tmp2 <- tmp2[mouse_ages, on="experiment"]
tmp2[order(mouse_age), cell_id_byage:= 1:nrow(tmp2)]

age_comp_means <- tmp2[,median(value),by=c("experiment","mouse_age","variable")]
age_comp_means[,cor(mouse_age, V1, method='spearman'),by=variable][order(abs(V1))]

qplot(rank(age_comp_means[variable=="V44"]$mouse_age), rank(age_comp_means[variable=="V44"]$V1))
for(i in 1:50){
  print(
    ggplot(tmp, aes(cell_id_byage, get(paste0("V",i)), colour=experiment)) +
  geom_point() + theme(axis.text.x = element_blank(), legend.position = "none")
  )
}

Sox30 enrichment

sox30 <- readxl::read_xlsx("~/Downloads/TableS1.xlsx", sheet = "s1.4", range = "D2:D62")
str(sox30$`Class 2 target genes (60)`)

sox30_subset <- sox30$`Class 2 target genes (60)`[sox30$`Class 2 target genes (60)` %in% colnames(SDAresults$loadings[[1]])]

sox30DEG <- readxl::read_xlsx("~/Downloads/TableS1.xlsx", sheet = "s1.1", skip = 1)
sox30DEG <- data.table(sox30DEG)[order(p_value)][1:500]$Genes
sox30DEG <- sox30DEG[sox30DEG %in% colnames(SDAresults$loadings[[1]])]
sox30_enrich <- component_enrichment(sox30_subset)
sox30DEG_enrich <- component_enrichment(sox30DEG)

manhatten_plot(sox30_enrich, legend_position = c(0.2,0.6))
manhatten_plot(sox30DEG_enrich, legend_position = c(0.2,0.6))

Exerpimental group on tSNE

tmp <- cell_data
tmp[, group := factor(group, levels=c("WT","SPG","SPCI","SPCII","SPD","Hormad1","Mlh3","Cul4a","CNP"))]

tnse_groups <- ggplot(tmp[group!="mj"], aes(Tsne1_QC1, Tsne2_QC1, colour=group)) +
        geom_point(data = transform(tmp, group = NULL), colour = "grey85", size=0.5, stroke=0) + 
        geom_point(size=0.5, stroke=0) +
        scale_color_manual(values = c(brewer.pal(7,"Dark2"),"red", "black")) +
        facet_wrap(~group) +
        labs(x="t-SNE 1", y="t-SNE 2") +
        theme_minimal() +
        theme(legend.position = "none")

pdf("../results/tsne_groups.pdf", width=16/2, height=9/2)
tnse_groups
dev.off()

Mins markers_from_FIG1

marker_genes2 <- c("Dazl", "Dmrt1","Sycp3","Shcbp1l","Cdca8","Aurka","Nxt1","Acrv1","Lyzl1","Hemgn","Prm1","Tssk6","Oaz3","Aard","Clu","Defb19","Ptgds","Insl3","Lcn2","Col1a2")

pdf("../results/markers_from_FIG1.pdf", width=6, height=9)
plot_grid(plotlist = create_grob_list(fn = function(x){print_tsne(x,point_size = 0.1, predict = T)}, input = marker_genes2), ncol = 4, label_size=16)
dev.off()

Fig 4 S3

4S3 AB; C38 vs Trip13 & Hormad1 KO genes

# from Meiotic_components.Rmd
trip13_enrich_plot <- readRDS("../data/plots/trip13_enrich_plot.rds")
c38_rug_plot <- readRDS("../data/plots/c83_rug_plot.rds")

hormad <- plot_grid(c38_rug_plot,
          trip13_enrich_plot,
          labels = "AUTO",
          label_size = 25,
          nrow = 2)

pdf(height=6.5, width=6*1.7, file="../results/hormad_enrich.pdf")
hormad
dev.off()

4S3 CD; MSCI

# from MSCI.Rmd
msci_tsne <- readRDS("../data/plots/msci_tsne.rds")

msci <- plot_grid(print_tsne(38, curve = T, point_size = 0.5) + ggtitle(""),
                  msci_tsne+ ggtitle(""),
                  labels=c("C","D"),
                  label_size = 25)

pdf(height=6, width=6*1.7, file="../results/msci_tsne.pdf")
msci
dev.off()

4S3 F Macrophages

# Macrophages plot inc Hormad enrichment
C11_MQ_plot <- readRDS("../data/plots/C11_MQ_plot.rds")

4S3 G Leydig Cells

c40_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][40,], label_both = FALSE, max.items = 10, label.size = 4,
                                     gene_locations = gene_annotations, hide_unknown = T,
                                     highlight_genes = c("Cyp17a1","Hsd3b6","Star","Cyp11a1","Hsd17b3"),
                                     label_genes = c("Cyp17a1","Hsd3b6","Star","Cyp11a1","Hsd17b3")) +
  ylab("Gene Loadings (Component 40)") +
  scale_size_continuous(range = c(1, 3))

mito_plot <- ggdraw() + draw_image("../results/testosterone-synth_wdoi.png", clip="on")

leydig <- plot_grid(c40_loadings_plot, mito_plot, rel_widths = c(2,1))

Fig4S3 Combined

pdf("../results/Fig_S6_top.pdf", height=13, width=20)
plot_grid(hormad, msci, C11_MQ_plot, leydig,
          nrow = 2,
          labels = c("","","E","F"),
          label_size = 25,
          scale=0.9)
dev.off()

For talks

loadings_plot <- genome_loadings(SDAresults$loadings[[1]][38,], label_both = FALSE, max.items = 10, label.size = 4, hide_unknown = T) +
  ylab("Gene Loadings (Component 38)") + theme_minimal() + theme(legend.position = "none")

loadings_plot

pdf(height=5, width=5*1.7, file="../results/C38.pdf")
loadings_plot
dev.off()

pdf(height=5, width=5*1.7, file="../results/C5_tsne.pdf")
tsne_plot + theme(legend.position = "right" ) + guides(colour = guide_legend(override.aes = list(ncol=1)))
dev.off()

pdf(height=5, width=5*1.7, file="../results/C5_loadings.pdf")
v5_loadings_plot
dev.off()

pdf(height=5, width=5*1.7, file="../results/C5_enrich.pdf")
c5_enrich_plot
dev.off()

pdf(height=5, width=5*1.7, file="../results/sparsity.pdf")
sparsity_plot
dev.off()

jpeg("../results/component_tsne_all_presentation.jpeg", height=3000, width=3000*1.7, res = 200)
component_tsne
dev.off()

pdf(height=5, width=5*1.7, file="../results/Prdm9_impute.pdf")
imputed_vs_raw(c("Prdm9"), facet = F) + theme_minimal(base_size = 18) + theme(legend.position = "none") + ggtitle("Prdm9")
dev.off()

imputed_vs_raw(c("Piwil1"), facet = F) + theme_minimal(base_size = 18) + theme(legend.position = "none") + ggtitle("Piwil1")
imputed_vs_raw(c("Ssxb1"), facet = F) + theme_minimal(base_size = 18) + theme(legend.position = "none") + ggtitle("Ssxb1")

pdf(height=5, width=5*1.7, file="../results/Zfp37_impute.pdf")
imputed_vs_raw(c("Zfp37"), facet = F) + theme_minimal(base_size = 18) + theme(legend.position = "none") + ggtitle("Zfp37")
dev.off()

pdf(height=5, width=5*1.7, file="../results/Gapdhs_impute.pdf")
imputed_vs_raw(c("Gapdhs"), facet = F) + theme_minimal(base_size = 18) + theme(legend.position = "none") + ggtitle("Gapdhs")
dev.off()

pdf(height=5, width=5*1.7, file="../results/Lrrd1_impute.pdf")
imputed_vs_raw(c("Lrrd1"), facet = F) + theme_minimal(base_size = 16) + theme(legend.position = "none") + ggtitle("Lrrd1")
dev.off()

GO_data <- readRDS("../data/GO_enrichment.rds")

library(ggrepel)
pdf(height=5, width=5*1.7, file="../results/V5N_enrich.pdf")
go_volcano_plot(component = "V5N", label_size = 3, OR_threshold = 10)
dev.off()
component_tsne <- plot_grid(plotlist = create_grob_list(fn = function(x) print_tsne(x) + theme(plot.title = element_text(size=24)), input = c(11,32,40,45,38,31,5,13,42,20,30,35,15,17,18)), nrow=3)

pdf("../results/component_tsne_sel.pdf", width=24, height=13.5)
component_tsne 
dev.off()

Other Random Plots

Just Chromosome 1 to show sparsity

devtools::load_all("~/Dropbox/Github/SDAtools/")
genome_loadings(SDAresults$loadings[[1]][5,], label_both = FALSE, max.items = 100, label.size = 4, hide_unknown = T, gene_locations = gene_annotations) + xlim(0,2e8)

Compare Soh et al Ovary Pprophase genes

"A Gene Regulatory Program for Meiotic Prophase in the Fetal Ovary" https://doi.org/10.1371/journal.pgen.1005531

Enriched in 5N

library(readxl)
fetal_ovary <- read_excel("../data/previous_studies/Soh/journal.pgen.1005531.s008.XLSX", sheet = 1)

union_ovary <- as.character(fetal_ovary$Row.names[fetal_ovary$Row.names %in% colnames(SDAresults$loadings[[1]])])

str(union_ovary)

ovary_enrich <- component_enrichment(union_ovary)

library(ggrepel)
ggplot(ovary_enrich[!Component %in% gsub("V","",ordering[half_exclusions])], aes(Component, -log(p.value,10), label=Component)) +
  geom_point() +
  geom_label_repel(data=ovary_enrich[-log(p.value,10) > 15]) +
  geom_hline(yintercept = -log(0.05/100,10), col="red", size=0.2) +
  ggtitle("Manhattan Plot - Enrichment for Soh et al Meiotic Prophase genes in Ovary") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

SDA example

Current schematic doesn't use real data? But it's hard to colour each column/row seperately to match the components across matrices in R. Also hard to represent both positive and negative with a single colour per comopnent..

library(ComplexHeatmap)

tmp <- simulate_2D_data(n_individuals = 20, n_variables = 15, n_components = 5, sparsity = 0.7)

Heatmap(tmp$Y, 
        cluster_rows = F, cluster_columns = F,
        col = circlize::colorRamp2(c(-3, 0, 3), c("#E41A1C", "white", "#377EB8")),
        rect_gp = gpar(col = "grey", lty = 1, lwd = 2),
        show_heatmap_legend = F,
        column_title = "Genes",
        row_title = "Cells")

Heatmap(tmp$A, 
        cluster_rows = F, cluster_columns = F,
        col = circlize::colorRamp2(c(-3, 0, 3), c("#E41A1C", "white", "#377EB8")),
        rect_gp = gpar(col = "grey", lty = 1, lwd = 2),
        show_heatmap_legend = F,
        width = unit(3.5, "cm"),
        column_title = "Components",
        row_title = "Cells")
Heatmap(tmp$X, 
        cluster_rows = F, cluster_columns = F,
        col = circlize::colorRamp2(c(-3, 0, 3), c("#E41A1C", "white", "#377EB8")),
        rect_gp = gpar(col = "grey", lty = 1, lwd = 2),
        show_heatmap_legend = F,
        column_title = "Genes",
        row_title = "Components")

Spermatogonia candidate markers + Ctcfl

sg_genes <- c("Gfra1","Id4","Nanos3","Glis3","Sipa1","Zbtb16","Ctcfl","Egr4","Upp1","Afp","Lin28a","Foxf1","Foxo1","Rhox10","Fbxo32","Morc1","Irf4","Dppa4","Dctd","Pramef12","Erap1","Ccdc141","Sohlh1","Nefm","Sbk1","Prtg")

Spermatogonia_plot <- ggplot(melt_genes(sg_genes, predict = T)[PseudoTime>15500], aes(-PseudoTime, Expression, group=Gene, colour=Gene=="Ctcfl")) +
  geom_point(size=0.1) +
  geom_smooth(method = "gam",formula = y ~ s(x, k = 50),  se=FALSE, colour="black") +
  ylab("Gene Expression") + xlab("Pseudotime") +
  theme_minimal() +
   facet_wrap(~Gene, scales = "free_y") +
   theme(legend.position ="none") +
  ggtitle("Early Spermatogonial Genes, with Pre-leptotene Ctcfl for reference")

Spermatogonia_plot

pdf(width = 8, file="../results/Spermatogonia_candidate_markers.pdf")
Spermatogonia_plot
plot_grid(plotlist = create_grob_list(fn = function(x){print_tsne(x,point_size = 0.1)}, input = sg_genes), ncol = 4, label_size=16)
dev.off()

Counts / Lib Size

data <- readRDS("../data/merged_mouse_normalised_tsne-QC_V3.rds")
library_size <- data$library_size
data <- data$dge


retina <- fread("../data/previous_studies/mccarroll/GSM1626798_P14Retina_6_gene_exon.dge.summary.txt")
retina2 <- fread("../data/previous_studies/mccarroll/GSM1626794_P14Retina_2_gene_exon.dge.summary.txt")

retina <- rbind(retina2, retina)

bipolar <- fread("../data/previous_studies/biopolar/Bipolar_neurons_gene_exon.dge.summary.txt")

pbmc10x <- fread("../data/previous_studies/10X_genomics/293t/293t_10X_gene_exon.dge.summary.txt")

lib_sizes <- rbind(data.table(counts = library_size, exp="conrad"),
                   data.table(counts = retina$NUM_TRANSCRIPTS, exp="mccarrol"),
                   data.table(counts = bipolar$NUM_TRANSCRIPTS, exp="bipolar"),
                   data.table(counts = pbmc10x$NUM_TRANSCRIPTS, exp="293t10x"))

cell_sums <- lib_sizes[order(-counts)]

cell_sums[, cumulative_reads := cumsum(counts), by=exp]
cell_sums[, cell_index := seq_len(.N), by=exp]


ggplot(cell_sums, aes(cell_index, cumulative_reads, colour = exp)) + 
  geom_line() +
  theme(legend.position="bottom")


ggplot(cell_sums, aes(counts)) + geom_histogram(bins=100) + facet_wrap(~exp, scales = "free_y") + xlim(0,5000)
ggplot(cell_sums, aes(log(counts))) + geom_histogram(bins=100) + facet_wrap(~exp)


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.