library(SDAtools)
library(data.table)
library(ggplot2)
library(cowplot)
library(viridis)
library(gridExtra)
library(scales)
library(RColorBrewer)
library(NMF) # devtools::install_github("renozao/NMF", "devel")
nmf.options(grid.patch=TRUE) # stop blank pages appearing

library(testisAtlas)
load2("../data/cache")
load_component_orderings()

MSCI through Pseudotime

Meiotic sex chromosome inactivation, is a transcriptional silencing during pachytene. We can see this effect in the component gene loadings as previously discussed (Leptotene/Zygotene component section 5.2). Here we show it is also visible directly in the gene expression by using the pseudotime ordering.

# ouch 
rna_locations2 <- load_gene_locations(name="merge_mouse_V3_old", path = "../data/")

# Plot gene expression X&Y
sex_genes <- rna_locations2[chromosome_name %in% c("X","Y")]
sex_genes <- sex_genes[gene_symbol %in% colnames(SDAresults$loadings[[1]])]$gene_symbol

X_genes <- rna_locations2[chromosome_name %in% c("X")]
X_genes <- X_genes[gene_symbol %in% colnames(SDAresults$loadings[[1]])]$gene_symbol

Y_genes <- rna_locations2[chromosome_name %in% c("Y")]
Y_genes <- Y_genes[gene_symbol %in% colnames(SDAresults$loadings[[1]])]$gene_symbol

autosomal_genes <- rna_locations2[!chromosome_name %in% c("X","Y","Un")]
autosomal_genes <- autosomal_genes[gene_symbol %in% colnames(SDAresults$loadings[[1]])]$gene_symbol

sex_autosome <- data.table(autosomal = Matrix::rowSums(data[,autosomal_genes]),
                            sex = Matrix::rowSums(data[,sex_genes]),
                            cell = rownames(data))

# autosomal_sum <- cell_data[, .(autosomal = Reduce(`+`, .SD), cell, PseudoTime, somatic4,  V5,  Tsne1_QC1, Tsne2_QC1, library_size), .SDcols = autosomal_genes ] # somatic
# sex_sum <- cell_data[, .(sex = Reduce(`+`, .SD), cell), .SDcols = sex_genes ] 
# setkey(autosomal_sum, cell)
# setkey(sex_sum, cell)
# sex_autosome <- merge(sex_sum, autosomal_sum)
# sex_autosome[order(PseudoTime), PseudoTime_rank:=1:nrow(cell_data)]

#cell_data[somatic4==FALSE][order(-PseudoTime),(sex_predictions / autosomal_predictions)<0.01]
msci_cells <- sex_autosome[(sex + 1) / (autosomal + 1) <0.001]$cell

stopifnot(is.null(cell_data$sex)) # already merged
cell_data[,sex:= NULL]
cell_data[,autosomal:= NULL]
cell_data <- merge(cell_data, sex_autosome[,.(cell, sex, autosomal)])
cell_data[,msci_ratio:=(sex + 1) / (autosomal + 1)]
# plot heatmap of genes with at least 200 cells expression >2
#[names(which(rowSums(abs(sex_expression_matrix)>2)>100)), ] Rowv=NA,

library(bigmemory)
options(bigmemory.allow.dimnames=TRUE)
#predicted_expression <- as.big.matrix(SDAresults$scores %*% SDAresults$loadings[[1]], backingpath="../data/count_matrices/", backingfile = "predicted_expression.big")
predicted_expression <- attach.big.matrix("../data/count_matrices/predicted_expression.big.desc")

autosomal_predictions <- rowSums(predicted_expression[,autosomal_genes])
sex_predictions <- rowSums(predicted_expression[,sex_genes])
X_predictions <- rowSums(predicted_expression[,X_genes])
Y_predictions <- rowSums(predicted_expression[,Y_genes])

#autosomal_sum <- rowSums(data[,autosomal_genes])
#sex_sum <- rowSums(data[,sex_genes])
#X_sum <- rowSums(data[,X_genes])
#Y_sum <- rowSums(data[,Y_genes])

#msci_sums <- data.table(sex_sum, autosomal_sum, Y_sum, X_sum, cell=names(autosomal_sum))

msci_predictions <- data.table(sex_predictions, autosomal_predictions, Y_predictions, X_predictions, cell=names(autosomal_predictions))
setkey(msci_predictions, cell)
cell_data <- merge(msci_predictions, cell_data, by="cell")

Normalised Expresion Heatmap

#cell_data$library.size <- log(cell_data$library_size,100)

sex_expression_matrix <- t(scale(as.matrix(cell_data[order(-PseudoTime)][somatic4==FALSE, sex_genes, with=FALSE])))
#sex_expression_matrix <- t(scale(as.matrix(cell_data[order(-PseudoTime)][somatic4==FALSE, sex_genes, with=FALSE])))
#sex_expression_matrix[sex_expression_matrix > 2.5] <- 2.5
#sex_expression_matrix[sex_expression_matrix < (-2.5)] <- (-2.5)

sex_expression_matrix <- asinh(sex_expression_matrix)

#[,order(-cell_data[somatic4==FALSE]$PseudoTime)]
aheatmap(sex_expression_matrix[names(which(rowSums(sex_expression_matrix>2)>200)), ],
         breaks=0,
         hclustfun = "ward.D",
         Colv=NA,
         labCol = NA,
         annCol = cell_data[somatic4==FALSE][order(-PseudoTime), .(msci_cell = msci_ratio<0.005)],
         main="Sex chromosome gene expression over pseudotime",
         color=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)))

#annColors = list(msci_ratio = rev(colorRampPalette(brewer.pal(9, "YlOrRd"))(100))),

Predicted Expresion Heatmap

tmp <- data.table(predicted_expression[,sex_genes], keep.rownames = T)
setnames(tmp, "rn", "cell")
setkey(tmp, "cell")
tmp <- merge(tmp, cell_data[, .(cell, PseudoTime, V38, somatic4,  Tsne1_QC1, Tsne2_QC1)], by="cell")

sex_expression_matrix <- t(scale(as.matrix(tmp[order(-PseudoTime)][somatic4==FALSE, sex_genes, with=FALSE])))

sex_expression_matrix <- asinh(sex_expression_matrix)

#sex_expression_matrix[sex_expression_matrix > 2.5] <- 2.5
#sex_expression_matrix[sex_expression_matrix < (-2.5)] <- (-2.5)

test <- viridis_pal(alpha = 1, begin = 0, end = 1, direction = 1, option = "D")

aheatmap(sex_expression_matrix[names(which(rowSums(sex_expression_matrix>2)>200)), ],
         hclustfun = "ward.D",
         Colv=NA,
         labCol = NA, labRow = NA, legend = NA, annLegend = FALSE,
         annCol = cell_data[somatic4==FALSE][order(-PseudoTime),(sex_predictions / autosomal_predictions)<0.01],
         color=rev(colorRampPalette(brewer.pal(11, "RdYlBu"))(100)),
         file="../results/MSCI_heatmap.png", width = 9, height = 6)

Alternative Colour Scheme:

aheatmap(sex_expression_matrix[names(which(rowSums(sex_expression_matrix>2)>200)), ],
         hclustfun = "ward.D",
         Colv=NA,
         labCol = NA, labRow = NA, legend = NA, annLegend = FALSE,
         annCol = cell_data[somatic4==FALSE][order(-PseudoTime), .(msci_cell = msci_ratio<0.005)],
         main="Sex chromosome gene expression over pseudotime",
         color=test(100))

Sex-Autosome Ratio

# ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, (sex + 1) / (autosomal + 1), colour = sex==0, group=NA)) +
#   geom_point(size=0.5,alpha=0.5) +
#   geom_smooth() +
#   ggtitle("Ratio of Sex to Autosomal chromosome expression through pseudotime") +
#   geom_hline(yintercept = 0.05, linetype="dashed") +
#   theme(legend.position = "bottom")

curve_data <- load_curve_data(principal_curves, "df_9")

msci_pseudotime <- ggplot(curve_data[cell_data, on="PseudoTime"][somatic4==FALSE], aes(-PseudoTime, sex_predictions / autosomal_predictions)) +
  geom_point(size=0.25,alpha=0.5, aes(colour = V38)) +
  scale_color_viridis(direction=-1, guide=guide_colourbar("C38 Cell Scores\n(Hormad1 KO)", title.position = "top")) +
  labs(y="Ratio of total sex to autosomal expression") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  new_scale_color()+
  geom_line(aes(y=-0.015, colour=Stage), size=5) +
  scale_colour_brewer(palette = "Set1")+
  guides(colour = guide_legend(ncol = 4, title.position = "top")) +
  theme(legend.position = "bottom")

msci_pseudotimeX <- ggplot(curve_data[cell_data, on="PseudoTime"][somatic4==FALSE], aes(-PseudoTime, X_predictions / autosomal_predictions)) +
  geom_point(size=0.25,alpha=0.5, aes(colour = V38)) +
  scale_color_viridis(direction=-1, guide=guide_colourbar("C38 Cell Scores\n(Hormad1 KO)", title.position = "top")) +
  labs(y="Ratio of total X to autosomal expression") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  new_scale_color()+
  geom_line(aes(y=-0.015, colour=Stage), size=5) +
  scale_colour_brewer(palette = "Set1")+
  guides(colour = guide_legend(ncol = 4, title.position = "top")) +
  theme(legend.position = "bottom")

msci_pseudotimeY <- ggplot(curve_data[cell_data, on="PseudoTime"][somatic4==FALSE], aes(-PseudoTime, Y_predictions / autosomal_predictions)) +
  geom_point(size=0.25,alpha=0.5, aes(colour = V38)) +
  scale_color_viridis(direction=-1, guide=guide_colourbar("C38 Cell Scores\n(Hormad1 KO)", title.position = "top")) +
  labs(y="Ratio of total Y to autosomal expression") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  new_scale_color()+
  geom_line(aes(y=-0.002, colour=Stage), size=5) +
  scale_colour_brewer(palette = "Set1")+
  guides(colour = guide_legend(ncol = 4, title.position = "top")) +
  theme(legend.position = "bottom")

#V38<1

msci_pseudotime

msci_pseudotime_sans38 

saveRDS(msci_pseudotime, "../data/plots/msci_pseudotime.rds")
saveRDS(msci_pseudotimeX, "../data/plots/msci_pseudotimeX.rds")
saveRDS(msci_pseudotimeY, "../data/plots/msci_pseudotimeY.rds")

Sex-Autosome Ratio on tSNE

msci_tsne <- ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=sex_predictions / autosomal_predictions)) +
  geom_point(size=0.25) +
  scale_color_viridis(direction=-1) +
  theme_minimal() +
  theme(legend.position = "bottom", legend.key.width=unit(1,"cm")) +
  labs(x="t-SNE 1", y="t-SNE 2",color = "Ratio of total sex to autosomal expression")

msci_tsne

saveRDS(msci_tsne,"../data/tmp_cache/msci_tsne.rds")

# ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=log((sex + 1) / (autosomal + 1)))) +
#   geom_point(size=0.5) +
#   scale_color_viridis(direction=-1) +
#   theme(legend.position = "bottom") +
#   ggtitle("t-SNE - MSCI")
ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=(sex_predictions / autosomal_predictions)<0.01)) +
    geom_point(size=0.25) +
    theme_minimal() +
    theme(legend.position = "bottom", legend.key.width=unit(1,"cm")) +
    labs(x="t-SNE 1", y="t-SNE 2",color = "Ratio of total sex to autosomal expression < 0.01") +
  scale_color_brewer(palette = "Set1")

X VS Y

Modify Real Data plot X & Y sim through time

Let's use our real data to simulate a case in which there is obvious non sharing of sex transcripts

cell_data[,Y_predictions_sim := Y_predictions]
cell_data[,X_predictions_sim := X_predictions]

set.seed(42)
# start at peudotime 10k
n=10000
cell_data[PseudoTime<=n, male_simulated := sample(c(T,F), n, T)]

# gradually remove X chrom from males
cell_data[PseudoTime<=n & male_simulated, Y_predictions_sim := Y_predictions]
n2 = nrow(cell_data[PseudoTime<=n & male_simulated])
cell_data[PseudoTime<=n & male_simulated, X_predictions_sim := (X_predictions)*(PseudoTime/n) + rnorm(n2,0,3)]

# gradually remove Y chrom from females
cell_data[PseudoTime<=n & !male_simulated, X_predictions_sim := X_predictions]
n2 = nrow(cell_data[PseudoTime<=n & !male_simulated])
cell_data[PseudoTime<=n & !male_simulated, Y_predictions_sim := (Y_predictions)*(PseudoTime/n) + rnorm(n2,0,0.5)]

The X chromosome expression very clearly splits into two populations, 1) the males which do not express it (perhaps there's some left over transcripts, it's not clear how fast they should decay), and 2) the females which do.

For the Y chromosome we have the inverse situation, although it's not as clear as the expression is allready very low even in males.

X_nonsharing_prediction <- ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, X_predictions_sim/autosomal_predictions, colour = male_simulated)) +
  geom_point(alpha=0.3, size=1, stroke=0) +
  ylab("Ratio of Predicted total X Chromosome\n
       Expression to Autosomal Expression") + 
  theme_minimal() + theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(size=10, alpha=1),title="Is Simulated Male?"))

X_nonsharing_prediction

ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, Y_predictions_sim/autosomal_predictions, colour = male_simulated)) +
    geom_point(alpha=0.3, size=1, stroke=0) + theme(legend.position = "bottom")
str(Y_genes)
str(X_genes)

ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, asinh(Y_predictions / X_predictions), colour = V38)) +
  geom_point(alpha=0.5, size=0.5) +
  ylim(-0.25,0.25) +
  ggtitle("Ratio of Y to X chromosome expression through pseudotime") +
  scale_color_viridis(direction=-1) +
  theme_minimal()

ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, Y_predictions, colour = V38)) +
  geom_point(alpha=0.5, size=0.5) +
  scale_color_viridis(direction=-1) +
  scale_y_sqrt(limits=c(0,40)) +
  ggtitle("Y chromosome expression through pseudotime") +
  theme_minimal()

ggplot(cell_data[somatic4==FALSE], aes(-PseudoTime, X_predictions, colour = V38)) +
  geom_point(alpha=0.5, size=0.5) +
  scale_color_viridis(direction=-1) +
  scale_y_sqrt() +
  ggtitle("X chromosome expression through pseudotime") +
  theme_minimal()

ggplot(melt_genes(X_genes[1]), aes(-PseudoTime, Expression)) + geom_point()

Expression of X escapees

de Cruz et al. detected a switch off between LZ and PS. They also found some genes upregulated between LZ and PS, which they say 'escaped MSCI' however we find no mRNA genes escape MSCI and the genes they identified are just the ones reactivated post MSCI. This highlights the limitations of the low resolution bulk profiling - using just 4 fractions.

We find that the previsouly proposed MSCI escapees are expressed strongly but not during MSCI (shown as a shaded reigon)

x_escapees <- readxl::read_excel("../data/previous_studies/Cruz/12864_2016_2618_MOESM4_ESM.xlsx")

genes <- x_escapees$`Gene Symbol`[x_escapees$`Gene Symbol` %in% colnames(data)]

# "Cypt2-ps" -> Cypt2

#4930468A15Rik -> Cldn34d
#Mtap7d3 -> Map7d3
#Gm14483 -> H2al1a
#Gm5382 -> H2al1n
#4930557A04Rik -> H2al3
#Gm4937 -> Cldn34b3

# Gm14484 -> H2al1e, not detected
#Gm10096, not detected
#Gm14477 -> H2al1c, not detected

genes <- c(genes,"Cypt2","Cldn34d","Map7d3","H2al1a","H2al1n","H2al3","Cldn34b3")

plot_pseudotime_expression_panel(genes, ncol = 5, title = "Smoothed gene expression over pseudotime - MSCI 'X-escapees'", highlight_reigon = c(-15500,-12500), point_size = 0.1)
X_escapees_plot <- ggplot(curve_data[melt_genes(genes)[!is.na(PseudoTime)], on="PseudoTime"], aes(-PseudoTime, Expression, group=Gene)) +
    stat_smooth(geom="line", method = "gam", formula = y ~ s(x, bs = "ad"), se=FALSE, colour="black") +
    ylab("Gene Expression") + xlab("Pseudotime") +
    theme_minimal() +
    theme(legend.position = "none") +
    new_scale_color()+
    geom_line(aes(y=-0.1, colour=Stage), size=5) +
    scale_colour_brewer(palette = "Set1")+
    theme(legend.position = "none") # + facet_wrap(~Gene, scales='free_y')

X_escapees_plot

saveRDS(X_escapees_plot,"../data/plots/X_escapees_plot.rds")

X escapees In Context

Adding all X&Y genes those previously predicted to escape look just like the rest of the post-MSCI expressed genes

#plot(sex_autosome$PseudoTime, sex_autosome$PseudoTime_rank)
#merge_sda3_melt2[variable %in% sex_genes]
ggplot(melt_genes(sex_genes), aes(-PseudoTime, abs(value), group=variable, colour=variable %in% x_escapees$`Gene Symbol`)) +
  geom_line(stat="smooth", method = "gam", formula = y ~ s(x, k = 20),alpha=0.5) +
  scale_color_manual(values=c("black","red")) +
  ylab("Gene Expression") + xlab("Pseudotime") +
  ylim(0,NA) +
  ggtitle("Expression over pseudotime for all Sex chromosome genes ('escapees' in red)") +
  theme(legend.position = "none") +
  geom_rect(data=data.frame(xmin=-15500, xmax=-12500, ymin=-Inf, ymax=Inf),
            aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),
              fill="grey20",
              colour=NA,
              alpha=0.2,
              inherit.aes = FALSE)

Let's look a bit closer are those little bumps during MSCI:

It seems that some genes have expression during MSCI e.g. Tsx, Cited1, Etd etc.

MSCI_cells <- cell_data[-PseudoTime>(-15000) & -PseudoTime<(-12500)]$cell

mean_msci <- Matrix::colMeans(data[MSCI_cells,sex_genes])

top_msci <- names(head(sort(mean_msci,T),20))

plot_pseudotime_expression_panel(top_msci, ncol = 5, title = "Smoothed gene expression over pseudotime - MSCI 'X-escapees'", highlight_reigon = c(-15500,-12500), point_size = 0.1)

ggplot(melt_genes(top_msci), aes(-PseudoTime, value, group=variable)) +
  ylab("Gene Expression") + xlab("Pseudotime") +
  ggtitle("Expression over pseudotime for 'X-escapee' genes") +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_rect(data=data.frame(xmin=-15500, xmax=-12500, ymin=-Inf, ymax=Inf),
            aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),
              fill="grey20",
              colour=NA,
              alpha=0.2,
              inherit.aes = FALSE) +
  facet_wrap(~variable, scales="free_y") +
  geom_smooth(method = "gam",formula = y ~ s(x, k = 50),  se=FALSE, colour="black")

This is less of a problem with the new somatic cell definition

However, on tSNE these genes don't look active during MSCI, but highly expressed in neighbouring sertoli cells. So it seems the sertoli contaminated our pseudotime slightly. The only genes here that look like they could be legitamately escaping MSCI are those with low disperse expression like Gm6472, Gm14586, Gm5644, Gm8692, Rps24-ps3 - ALL of which are pseudogenes.

grid.arrange(grobs=create_grob_list(fn = function(x){print_tsne(x,point_size = 0.1)}, input = top_msci), nrow=4, heights = c(1,1,1,1))

A reminder of which cells are masked as somatic in pseudotime assignment. You can see the problem cells, but it's hard to remove these cells without manually defining a reigon.

ggplot(cell_data, aes(Tsne1_QC1, Tsne2_QC1, color=somatic4)) +
  geom_point(size=0.25) +
  theme(legend.position = "bottom") +
  ggtitle("t-SNE - experiment")

Just in case all genes are underexpressed during pachytene, here's a null sample - genes from the autosome.

ggplot(melt_genes(sample(autosomal_genes, length(sex_genes))), aes(-PseudoTime, abs(value), group=variable)) +
  geom_line(stat="smooth", method = "gam", formula = y ~ s(x, k = 20),alpha=0.5) +
  ylab("Gene Expression") + xlab("Pseudotime") +
  ylim(0,NA) +
  ggtitle("Expression over pseudotime for (equal size sample of) Autosomal chromosome genes") +
  theme(legend.position = "none")

# attempt both on same plot with different colours
# ggplot(merge_sda3_melt[variable %in% sample(autosomal_genes, length(sex_genes))], aes(-PseudoTime, abs(value), group=variable)) +
#     stat_smooth(geom='line', alpha=0.3, se=FALSE) +
#   stat_smooth(data=merge_sda3_melt[variable %in% sex_genes], geom='line', alpha=0.3, se=FALSE, colour="red") +
#   ylab("Gene Expression") + xlab("Pseudotime") +
#   ylim(0,NA) +
#   ggtitle("Expression over pseudotime for (equal size sample of) Autosomal chromosome genes") +
#   theme(legend.position = "none")
v42_loadings_plot <- genome_loadings(SDAresults$loadings[[1]][42,], label_both = FALSE, max.items = 20, label.size = 4, hide_unknown = T, gene_locations = gene_annotations) +
  ylab("Gene Loading (Component 42)") +
  scale_y_reverse() +
  scale_size_continuous(range = c(1, 3))

Publication Figure

Figure 2 (MSCI ratio & Escape genes)

msci_pseudotimeY <- readRDS("../data/plots/msci_pseudotimeY.rds")
msci_pseudotimeX <- readRDS("../data/plots/msci_pseudotimeX.rds")
X_escapees_plot <- readRDS("../data/plots/X_escapees_plot.rds")

right <- plot_grid(X_nonsharing_prediction,
                   v42_loadings_plot,
                   ncol=1, labels = c("B","E"))

left <- plot_grid(msci_pseudotimeX,
                  plot_grid(X_escapees_plot,
                            print_tsne(42, point_size = 0.3) + ggtitle(""),
                            ncol=2, labels = c("C","D")),
                  ncol=1, rel_heights = c(3,2),
                  labels = c("A",""))

pdf(width = 15, height = 10, file = "../results/Figure_6_MSCI.pdf")
plot_grid(left, right, ncol=2)
dev.off() 

pdf(width = 15, height = 9, file = "../results/Figure_6_Supplement.pdf")
plot_grid(plotlist = list(msci_pseudotimeY,
                       X_escapees_plot + facet_wrap(~Gene, scales='free_y')),
          labels = c("A","B"), rel_widths = c(2,3))
dev.off()


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.