NNMF

library(data.table)
data <- fread("../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data")

dimnames_conradV3 <- readRDS("../data/conradV3/V3_SDAmerged_mouse_V3_SDA_dimnames.rds")
data <- as.matrix(data)
rownames(data) <- dimnames_conradV3[[1]]
colnames(data) <- dimnames_conradV3[[2]]



Sys.time()
nnmf_decomp <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5)
Sys.time()
nnmf_decomp

saveRDS(nnmf_decomp,"../data/conradV3/nnmf_decomp_v1.rds")

#nnmf_decomp$run.time["user.self"]/60
# 27.5 mins

gc()

# Try with different Beta values for regularisation

Sys.time()
nnmf_decomp2 <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, beta = c(0,0,10))
Sys.time()

saveRDS(nnmf_decomp2,"../data/conradV3/nnmf_decomp_v2.rds")


Sys.time()
nnmf_decomp3 <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, beta = c(0,0,1e4))
Sys.time()

saveRDS(nnmf_decomp3,"../data/conradV3/nnmf_decomp_v3.rds")

Sys.time()
nnmf_decomp4 <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, beta = c(0,0,1e3))
Sys.time()

saveRDS(nnmf_decomp4,"../data/conradV3/nnmf_decomp_v4.rds")


Sys.time()
nnmf_decomp5 <- NNLM::nnmf(as.matrix(data), 100, rel.tol = 1e-5)
Sys.time()

saveRDS(nnmf_decomp5,"../data/conradV3/nnmf_decomp_v5.rds")


Sys.time()
nnmf_decomp <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, n.threads = 15, loss="mkl")
Sys.time()
nnmf_decomp

saveRDS(nnmf_decomp,"../data/conradV3/nnmf_decomp_v6.rds")

[1] "2019-02-26 22:27:55 GMT" nnmf_decomp <- nnmf(as.matrix(data), 50, rel.tol = 1e-5)

0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----| ********|

Sys.time() [1] "2019-02-26 22:55:02 GMT" nnmf_decomp Non-negative matrix factorization: Algorithm: Sequential coordinate-wise descent Loss: Mean squared error MSE: 0.6380184 MKL: 0.3526576 Target: 0.3190092 Rel. tol.: 9.69e-06 Total epochs: 1605

Interation: 39

Running time: user system elapsed 1653.837 194.547 1622.978

[1] "2019-02-27 19:35:29 GMT"

nnmf_decomp2 <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, beta = c(0,0,10)) 0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----| ********| Sys.time() [1] "2019-02-27 20:11:48 GMT"

Sys.time() [1] "2019-03-26 17:38:30 GMT"

nnmf_decomp <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5) 0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----|

********|

Sys.time() [1] "2019-03-26 18:16:34 GMT" gc() used (Mb) gc trigger (Mb) max used (Mb) Ncells 427117 22.9 878160 46.9 878160 46.9 Vcells 394326611 3008.5 1129838049 8620.0 1371726427 10465.5

Sys.time() [1] "2019-03-27 02:06:05 GMT"

nnmf_decomp5 <- NNLM::nnmf(as.matrix(data), 100, rel.tol = 1e-5) 0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----| ********| Sys.time() [1] "2019-03-27 03:14:12 GMT"

with 15 threads

[1] "2019-04-05 15:49:08 BST"

nnmf_decomp <- NNLM::nnmf(as.matrix(data), 50, rel.tol = 1e-5, n.threads = 15) 0% 10 20 30 40 50 60 70 80 90 100% [----|----|----|----|----|----|----|----|----|----| ********| Sys.time() [1] "2019-04-05 15:59:28 BST" nnmf_decomp Non-negative matrix factorization: Algorithm: Sequential coordinate-wise descent Loss: Mean squared error MSE: 0.6379496 MKL: 0.3527002 Target: 0.3189748 Rel. tol.: 7.92e-06 Total epochs: 1856

Interation: 45

Running time: user system elapsed 1897.522 164.514 618.858

dimnames_conradV3 <- readRDS("../data/conradV3/V3_SDAmerged_mouse_V3_SDA_dimnames.rds")
rownames(nnmf_decomp$W) <- dimnames_conradV3[[1]]
colnames(nnmf_decomp$W) <- paste0("V",1:50)

colnames(nnmf_decomp$H) <- dimnames_conradV3[[2]]
rownames(nnmf_decomp$H) <- paste0("V",1:50)

ICA

ICA_decomp <- fastICA::fastICA(data, n.comp = 50, verbose = TRUE)

# a <- Sys.time()
# ICA_decomp <- fastICA::fastICA(data, n.comp = 50, verbose = TRUE, method="C")
# b <- Sys.time()

# Error in fastICA::fastICA(as.matrix(normalised_data), n.comp = 50, verbose = TRUE,  :
#   svd on 19262 x 197201408 exceeds Fortran indexing limits

saveRDS(ICA_decomp,"../data/conradV3/ICA_decomp_v1.rds")

PCA

library(flashpcaR)
pcaresult <- flashpca(as.matrix(data), ndim=50, stan="none", verbose=T, seed=42, do_loadings=T, divisor = "none")

rownames(pcaresult$loadings) <- dimnames_conradV3[[2]]
colnames(pcaresult$loadings) <- paste0("V",1:50)

rownames(pcaresult$vectors) <- dimnames_conradV3[[1]]
colnames(pcaresult$vectors) <- paste0("V",1:50)

rownames(pcaresult$projection) <- rownames(pcaresult$vectors)
colnames(pcaresult$projection) <- paste0("V",1:50)


saveRDS(pcaresult, "../data/conradV3/pcaresultV1.rds")

Load results

nnmf_decomp <- readRDS("../data/alt_factorisations/nnmf_decomp_v1.rds")

nnmf_decomp2 <- readRDS("../data/alt_factorisations/nnmf_decomp_v2.rds")

nnmf_decomp3 <- readRDS("../data/alt_factorisations/nnmf_decomp_v3.rds")

nnmf_decomp4 <- readRDS("../data/alt_factorisations/nnmf_decomp_v4.rds")

nnmf_decomp5 <- readRDS("../data/alt_factorisations/nnmf_decomp_v5.rds")


rownames(nnmf_decomp2$H) <- paste0("V",1:50)
rownames(nnmf_decomp3$H) <- paste0("V",1:50)
rownames(nnmf_decomp4$H) <- paste0("V",1:50)
rownames(nnmf_decomp5$H) <- paste0("V",1:100)


pcaresult <- readRDS("../data/alt_factorisations/pcaresultV1.rds")

# X = SA
ica_decomp <- readRDS("../data/alt_factorisations/ICA_decomp_v1.rds")
ica_decomp$X <- NULL
rownames(ica_decomp$K) <- colnames(ica_decomp$A) <- colnames(nnmf_decomp$H)
colnames(ica_decomp$K) <- rownames(ica_decomp$A) <- paste0("V",1:50)

compare sparsity of gene loadings

threshold = exp(seq(-10,0,0.1))

loading_sparsity <- data.table(threshold,
                               t(sapply(threshold, function(x) c("NNMF"=mean(abs(nnmf_decomp$H)/max(abs(nnmf_decomp$H)) < x),
                                                                "SDA"=mean(abs(SDAresults$loadings[[1]])/max(abs(SDAresults$loadings[[1]])) < x),
                                                                "PCA"=mean(abs(pcaresult$loadings)/max(abs(pcaresult$loadings)) < x),
                                                                "ICA"=mean(abs(ica_decomp$A)/max(abs(ica_decomp$A)) < x)))))


loading_sparsity <- melt(loading_sparsity, id.vars = "threshold", variable.name = "Method", value.name = "Fraction")

gene_loading_sparsity <- ggplot(loading_sparsity, aes(threshold, Fraction, colour=Method)) +
  geom_line() +
  ylab("Fraction of absolute gene\n loadings below threshold") + 
  xlab("Scaled Threshold \n(threshold * max(abs(loadings)))") +
  scale_x_continuous(trans = reverselog_trans())+
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "none") + 
  geom_text_repel(data=loading_sparsity[threshold==threshold[45]], aes(label = Method),
                  nudge_x = c(0.5,0.5,-0.5,0.5), nudge_y = 0.02,
                  na.rm = TRUE, segment.colour='darkgrey', colour="black")

gene_loading_sparsity

saveRDS(gene_loading_sparsity, "../data/plots/gene_loading_sparsity.rds")

pdf("../results/other_factorisations/gene_loading_sparsity.pdf", width = 5, height = 5)
gene_loading_sparsity
dev.off()

Non log scale version

gene_loading_sparsity + scale_x_continuous() + coord_flip()

Cell score sparcity

NMF is highest as expected

threshold = exp(seq(-12,0,0.1))

cellscore_sparsity <- data.table(threshold,
                               t(sapply(threshold, function(x) c("PCA"=mean(abs(pcaresult$projection)/max(abs(pcaresult$projection)) < x),
                                                                 "ICA"=mean(abs(ica_decomp$S)/max(abs(ica_decomp$S)) < x),
                                                                 "SDA"=mean(abs(SDAresults$scores)/max(abs(SDAresults$scores)) < x),
                                                                 "NNMF"=mean(abs(nnmf_decomp$W)/max(abs(nnmf_decomp$W)) < x)))))

cellscore_sparsity <- melt(cellscore_sparsity, id.vars = "threshold", variable.name = "Method", value.name = "Fraction")

cellscore_sparsity_p <- ggplot(cellscore_sparsity, aes(threshold, Fraction, colour=Method)) +
  geom_line() +
  ylab("Fraction of absolute cell scores below threshold") + 
  xlab("Scaled Threshold (t * max(abs(cell_score)))") +
  scale_x_continuous(trans = reverselog_trans())+
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "none")+
  geom_text_repel(data=cellscore_sparsity[threshold==threshold[70]], aes(label = Method),
                  nudge_x = c(-0.5,0.5,1,-1.5),
                  na.rm = TRUE, segment.colour='darkgrey', colour="black")

cellscore_sparsity_p

pdf("../results/other_factorisations/cell_score_sparsity.pdf", width = 5, height = 5)
cellscore_sparsity_p
dev.off()

Compare Prdm9 Component Loadings

Find which component is the Prdm9 one

sort(nnmf_decomp$H[,"Prdm9"])
sort(ica_decomp$A[,"Prdm9"])
sort(pcaresult$loadings["Prdm9",])

head(sort(-nnmf_decomp$H[50,]),20)
head(sort(ica_decomp$A[15,]),20)
head(sort(pcaresult$loadings[,"V14"], F),20)

Compare Leptotene Loadings

SDAtools::genome_loadings(SDAresults$loadings[[1]][5,], label_both = T, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (SDA Component 5)") +
  scale_y_reverse()

genome_loadings(nnmf_decomp$H["V50",], label_both = FALSE, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (NNMF Component 50)")

SDAtools::genome_loadings(pcaresult$loadings[,"V14"], label_both = T, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (PCA Component 14)") +
  scale_y_reverse()

SDAtools::genome_loadings(ica_decomp$A[15,], label_both = F, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (ICA Component 15)") +
  scale_y_reverse()

Compare X Chrom Activation Loadings

SDAtools::genome_loadings(SDAresults$loadings[[1]][38,], label_both = T, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (SDA Component 38)")

genome_loadings(nnmf_decomp$H["V3",], label_both = FALSE, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab("Gene Loadings (NNMF Component 3)")

All NMF Loadings

To check if anything interesting looking

for(i in 1:50){
  print(genome_loadings(nnmf_decomp$H[i,], label_both = FALSE, max.items = 20, label.size = 3,
                                     gene_locations = gene_annotations, hide_unknown = T) +
  ylab(i))
}

compare NNMF L1 sparsity

Seems to just make whole components 0? not what I wanted

plot(rowSums(nnmf_decomp$H))
plot(rowSums(nnmf_decomp2$H))
plot(rowSums(nnmf_decomp3$H))

plot(nnmf_decomp$H[,"Prdm9"])
plot(nnmf_decomp3$H[,"Prdm9"])

plot(sort(nnmf_decomp$H[50,]), pch='.')
plot(sort(nnmf_decomp3$H[28,]), pch='.')

plot(sort(rowSums(nnmf_decomp$H)))
plot(sort(rowSums(nnmf_decomp3$H)))

Imputation

Very similar predictions

plot(sda_predict(c("Prdm9"))$Prdm9, nmf_predict(c("Prdm9"))$Prdm9, pch=".")
plot(sda_predict(c("Prdm9"))$Prdm9, pca_predict(c("Prdm9"))$Prdm9, pch=".")
plot(sda_predict(c("Prm1"))$Prm1, pca_predict(c("Prm1"))$Prm1, pch=".")
plot(sda_predict(c("Acrv1"))$Acrv1, pca_predict(c("Acrv1"))$Acrv1, pch=".")
ggplot(cell_data[nmf_predict(c("Prdm9"))], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3) + ggtitle("NNMF")
ggplot(cell_data[sda_predict(c("Prdm9"))], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3) + ggtitle("SDA")
ggplot(cell_data[pca_predict(c("Prdm9"))], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3) + ggtitle("PCA")

test PCA implementaitons

C <- chol(S <- toeplitz(.9 ^ (0:31)))
set.seed(17)
X <- matrix(rnorm(32000), 1000, 32)
Z <- X %*% C
pZ <- prcomp(Z, tol = 0.1)


pZ2 <- flashpca(Z, ndim=14, stan="none", verbose=T, seed=42, do_loadings=T, divisor = "none")

plot(pZ$rotation[,1], -pZ2$loadings[,1]);abline(0,1,col='red')
all.equal(pZ$rotation[,1], -pZ2$loadings[,1])

plot(pZ$x[,1], -pZ2$projection[,1]);abline(0,1,col='red')
plot(pZ$x[,1], -pZ2$vectors[,1])
all.equal(pZ$x[,1], -pZ2$projection[,1])

Correlations

compare_factorisations(nnmf_decomp$H, SDAresults$loadings[[1]], names=c("NNMF","SDA"))
compare_factorisations(t(pcaresult$loadings), SDAresults$loadings[[1]], names=c("PCA","SDA"))
compare_factorisations(ica_decomp$A, SDAresults$loadings[[1]], names=c("ICA","SDA"))

100 componentns

compare_factorisations(SDAresults$loadings[[1]],nnmf_decomp5$H, names=c("SDA","NNMF"), method="pearson")

Pearson

named_loadings <- SDAresults$loadings[[1]]

rownames(named_loadings) <- paste0("Mix",1:50," ",names(sort(component_labels))) #paste0("Mix_",1:50)
pdf("../results/other_factorisations/NNMF_original.pdf", height=9, width=10)
compare_factorisations(nnmf_decomp$H, named_loadings, names=c("NNMF","SDA"), method = "pearson")
dev.off()

compare_factorisations(t(pcaresult$loadings), SDAresults$loadings[[1]], names=c("PCA","SDA"), method = "pearson")
compare_factorisations(ica_decomp$A, SDAresults$loadings[[1]], names=c("ICA","SDA"), method = "pearson")

Rotated NNMF

rot_NNMF <- vegan::procrustes(t(SDAresults$loadings[[1]]), t(nnmf_decomp$H))

colnames(rot_NNMF$Yrot) <- paste0("V",1:50)


pdf("../results/other_factorisations/NNMF_rotated.pdf", height=9, width=10)
compare_factorisations(t(rot_NNMF$Yrot),
                       named_loadings,
                       c("NNMF Rotated","SDA"),
                       method = "pearson")
dev.off()

Rotated SDA

rot_named_loadings <- vegan::procrustes(t(nnmf_decomp$H), t(named_loadings))

colnames(rot_named_loadings$Yrot) <- paste0("V",1:50)


pdf("../results/other_factorisations/SDA_rotated.pdf", height=9, width=10)
compare_factorisations(t(rot_named_loadings$Yrot),
                       nnmf_decomp$H,
                       c("SDA Rotated","NNMF"),
                       method = "pearson")
dev.off()

GGplot version

corrs <- compare_factorisations(t(rot_NNMF$Yrot),
                       named_loadings,
                       c("NNMF Rotated","SDA"),
                       method = "pearson", return="cor")

heatmapgrob <- ggplot(melt(corrs[50:1,], value.name="Pearson_correlation"), aes(`NNMF Rotated`,SDA)) +
  geom_tile(aes(fill=Pearson_correlation)) +
  scale_fill_viridis() +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90), legend.position = "bottom")

saveRDS(heatmapgrob, "../data/plots/heatmapgrob.rds")

Compare Promax

sda_load_promax <- promax(t(SDAresults$loadings[[1]]))
nnmf_load_promax <- promax(t(nnmf_decomp$H))

##


compare_factorisations(t(nnmf_load_promax$loadings),
                       t(sda_load_promax$loadings),
                       c("NNMF","SDA"),
                       method = "pearson")

# compare promax to original

compare_factorisations(t(sda_load_promax$loadings),
                       named_loadings,
                       c("NNMF","SDA"),
                       method = "pearson")

compare_factorisations(t(nnmf_load_promax$loadings),
                       nnmf_decomp$H,
                       c("NNMF","SDA"),
                       method = "pearson")

plot(sda_load_promax$loadings[,"V5"],SDAresults$loadings[[1]]["V5",])
plot(nnmf_load_promax$loadings[,"V50"],nnmf_decomp$H["V50",])

plot(rot_NNMF$Yrot[,"V5"], SDAresults$loadings[[1]]["V5",])
plot(nnmf_decomp$H["V50",],SDAresults$loadings[[1]]["V5",])

Expression predictions

Correlate imputed expression NNMF vs SDA

gene_cor <- numeric(length = length(colnames(SDAresults$loadings[[1]])))

for(i in seq_along(gene_cor)){
  gene_cor[i] <- cor(nmf_predict(c(colnames(SDAresults$loadings[[1]])[i]))[,2][[1]],
                     sda_predict(c(colnames(SDAresults$loadings[[1]])[i]))[,2][[1]])
}

names(gene_cor) <- colnames(SDAresults$loadings[[1]])

saveRDS(gene_cor, "../data/imputation/NNMF_vs_SDA_Gene_correlations.rds")

gene_cor <- readRDS("../data/imputation/NNMF_vs_SDA_Gene_correlations.rds")

manhatten_plot(component_enrichment(names(head(sort(gene_cor),500))), topn = 14)
manhatten_plot(component_enrichment(names(head(sort(gene_cor),500)), loadings=nnmf_decomp$H), topn = 14)

Gene Correlation without single cell components

named_SDAresults <- SDAresults

rownames(named_SDAresults$loadings[[1]]) <- paste0("Mix",1:50," ",names(sort(component_labels)))
colnames(named_SDAresults$scores) <- paste0("Mix",1:50," ",names(sort(component_labels)))

sc_components <- grep("Single",rownames(named_SDAresults$loadings[[1]]))

named_SDAresults$loadings[[1]] <- named_SDAresults$loadings[[1]][-sc_components,]
named_SDAresults$scores <- named_SDAresults$scores[,-sc_components]

gene_cor <- numeric(length = length(colnames(SDAresults$loadings[[1]])))

for(i in seq_along(gene_cor)){
  gene_cor[i] <- cor(nmf_predict(c(colnames(SDAresults$loadings[[1]])[i]))[,2][[1]],
                     sda_predict(c(colnames(SDAresults$loadings[[1]])[i]), factorisation = named_SDAresults)[,2][[1]])
}

names(gene_cor) <- colnames(SDAresults$loadings[[1]])

saveRDS(gene_cor, "../data/imputation/NNMF_vs_SDA_Gene_correlations_sans_single_cell.rds")

gene_cor <- readRDS("../data/imputation/NNMF_vs_SDA_Gene_correlations_sans_single_cell.rds")

plot(sort(gene_cor))
head(sort(gene_cor))
str(rank(gene_cor)["Gfra1"])
manhatten_plot(component_enrichment(names(head(sort(gene_cor),500))), topn = 7)
manhatten_plot(component_enrichment(names(head(sort(gene_cor),500)),
                                    loadings=nnmf_decomp$H, orderSDA = F), topn = 0)

Blood

blood_SDA <- tricolour_tsne(c("Cd3g","Csf1r")) +
  facet_zoom(xlim=c(-10,0), ylim=c(-6,-1)) +
  ggtitle("SDA Imputed Expression") + labs(y="t-SNE 2", x="t-SNE 1")

blood_NNMF <- tricolour_tsne(c("Cd3g","Csf1r"), predict = "NNMF") +
  facet_zoom(xlim=c(-10,0), ylim=c(-6,-1)) +
  ggtitle("NNMF Imputed Expression") + labs(y="t-SNE 2", x="t-SNE 1")

blood_SDA
blood_NNMF

pdf("../results/other_factorisations/blood_SDA.pdf", width = 7, height = 5)
blood_SDA
dev.off()

pdf("../results/other_factorisations/blood_NNMF.pdf", width = 7, height = 5)
blood_NNMF
dev.off()

Scatter Plot

corss <- rbind(sda_predict(c("Cd3g","Csf1r")), nmf_predict(c("Cd3g","Csf1r")))
corss$Method <- "NNMF"
corss[1:(nrow(corss)/2)]$Method <- "SDA"

corss$rgb <- ternaryColours(corss, c("Cd3g","Csf1r"))

ggplot(corss, aes(Csf1r, Cd3g, colour=rgb)) + scale_colour_identity() + geom_point() + facet_wrap(~Method, scales = "free")

blood_SDA_vs_NNMF <- ggplot(corss, aes(Csf1r, Cd3g, colour=Method)) +
  geom_point(alpha=0.5) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() + theme(legend.position = "bottom")

blood_SDA_vs_NNMF

pdf("../results/other_factorisations/blood_SDA_vs_NNMF.pdf", width = 5, height = 5)
blood_SDA_vs_NNMF
dev.off()

Stem Cells

tricolour_tsne(c("Gfra1","Upp1","Crabp1")) +
  facet_zoom(xlim=c(-5,5), ylim=c(35,43), zoom.size = 2) +
  ggtitle("SDA Prediction")

tricolour_tsne(c("Gfra1","Upp1","Crabp1"), predict = "NNMF") +
  facet_zoom(xlim=c(-5,5), ylim=c(35,43), zoom.size = 2) +
  ggtitle("NNMF Prediction")

gonia_SDA <- tricolour_tsne(c("Gfra1","Lin28a","Nanos1")) +
  facet_zoom(xlim=c(-5,5), ylim=c(36,43), zoom.size = 2) +
  ggtitle("SDA Imputed Expression") + labs(y="t-SNE 2", x="t-SNE 1")

gonia_SDA_zoom <- tricolour_tsne(c("Gfra1","Lin28a","Nanos1")) +
  xlim(c(-4,5)) + ylim(c(36,44)) +
  ggtitle("SDA Imputed Expression") + labs(y="t-SNE 2", x="t-SNE 1")

gonia_NNMF <- tricolour_tsne(c("Gfra1","Lin28a","Nanos1"), predict = "NNMF") +
  facet_zoom(xlim=c(-4,5), ylim=c(36,44), zoom.size = 2) +
  ggtitle("NNMF Imputed Expression") + labs(y="t-SNE 2", x="t-SNE 1")
# ccnd2 or gfra1
# upp1 or foxf1
# ccna2 or crabp1

gonia_SDA
gonia_NNMF

pdf("../results/other_factorisations/Spermatogonia_SDA.pdf", width = 7, height = 5)
gonia_SDA
dev.off()

pdf("../results/other_factorisations/Spermatogonia_NNMF.pdf", width = 7, height = 5)
gonia_NNMF
dev.off()

Scatter plot

corss <- rbind(sda_predict(c("Gfra1","Upp1")), nmf_predict(c("Gfra1","Upp1")))
corss$Method <- "NNMF"
corss[1:(nrow(corss)/2)]$Method <- "SDA"

corss$rgb <- ternaryColours(corss, c("Gfra1","Upp1"))

ggplot(corss, aes(Upp1, Gfra1, colour=rgb)) + scale_colour_identity() + geom_point() + facet_wrap(~Method, scales = "free")

ggplot(corss, aes(Upp1, Gfra1, colour=Method)) +
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()
corss <- rbind(sda_predict(c("Igfbp5","Upp1")), nmf_predict(c("Igfbp5","Upp1")))
corss$Method <- "NNMF"
corss[1:(nrow(corss)/2)]$Method <- "SDA"

ggplot(corss, aes(Upp1, Igfbp5, colour=Method)) +
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()

corss <- rbind(sda_predict(c("Gm14019","Upp1")), nmf_predict(c("Gm14019","Upp1")))
corss$Method <- "NNMF"
corss[1:(nrow(corss)/2)]$Method <- "SDA"

ggplot(corss, aes(Upp1, Gm14019, colour=Method)) +
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()

100 component NNMF versions

tricolour_tsne(c("Cd3g","Csf1r"), predict = "NNMF2") +
  facet_zoom(xlim=c(-10,0), ylim=c(-6,-1)) +
  ggtitle("NNMF 100C Prediction")

tricolour_tsne(c("Gfra1","Upp1","Crabp1"), predict = "NNMF2") +
  facet_zoom(xlim=c(-5,3), ylim=c(35,43), zoom.size = 2) +
  ggtitle("NNMF 100C Prediction")

Compare components not expression

(with expression as colour)

#plot(nnmf_decomp$W[,"V4"][cell_data[order(experiment)]$cell], pch=".")

tmp <- tricolour_tsne(c("Cd3g","Csf1r"), returndf = T, predict = F)

ggplot(tmp, aes(V3,V11,colour=rgb)) +
  geom_point() +
  scale_colour_identity() +
  theme_dark() +
  annotate("text", Inf, Inf, hjust = 2, vjust = 2, label = "Cd3g", colour="red", fontface="bold") +
  annotate("text", Inf, Inf, hjust = 2, vjust = 4, label = "Csf1r", colour="blue", fontface="bold")

tmp$NNMF_V4 <- nnmf_decomp$W[,"V4"][cell_data$cell]
tmp$NNMF_V11 <- nnmf_decomp$W[,"V11"][cell_data$cell]
tmp$NNMF_V34 <- nnmf_decomp$W[,"V34"][cell_data$cell]

ggplot(tmp[order(Cd3g/2+Csf1r)], aes(NNMF_V4,NNMF_V11,colour=rgb)) +
  geom_point() +
  scale_colour_identity() +
  theme_dark() +
  annotate("text", Inf, Inf, hjust = 2, vjust = 2, label = "Cd3g", colour="red", fontface="bold") +
  annotate("text", Inf, Inf, hjust = 2, vjust = 4, label = "Csf1r", colour="blue", fontface="bold")
ggplot(tmp[order(Cd3g/2+Csf1r)], aes(NNMF_V4,NNMF_V11,colour=rgb)) +
  geom_point() +
  scale_colour_identity() +
  theme_dark() +
  annotate("text", Inf, Inf, hjust = 2, vjust = 2, label = "Cd3g", colour="red", fontface="bold") +
  annotate("text", Inf, Inf, hjust = 2, vjust = 4, label = "Csf1r", colour="blue", fontface="bold")

tmp$NNMF_Crgb <- ternaryColours(tmp, c("NNMF_V4","NNMF_V11","NNMF_V34"))

ggplot(tmp, aes(Tsne1_QC1, Tsne2_QC1)) +
  geom_point(aes(colour=NNMF_Crgb), stroke=0, size=1) +
  scale_color_identity()  +
  theme_dark() +
  facet_zoom(xlim=c(-10,0), ylim=c(-6,-1), zoom.size = 1)

ggplot(tmp[order(Cd3g)], aes(Tsne1_QC1, Tsne2_QC1)) +
  geom_point(aes(colour=Cd3g), stroke=0, size=1) +
  scale_color_viridis(direction = -1) +
  facet_zoom(xlim=c(-10,0), ylim=c(-6,-1), zoom.size = 1)
tmp$V3L <- asinh(tmp$V3)
tmp$V11L <- asinh(tmp$V11)
tmp$SDA_Crgb <- ternaryColours(tmp, c("V11L","V3L"))

ggplot(tmp, aes(Tsne1_QC1, Tsne2_QC1)) +
  geom_point(aes(colour=SDA_Crgb), stroke=0, size=1) +
  scale_color_identity()  +
  theme_dark() +
  facet_zoom(xlim=c(-9.5,0), ylim=c(-5.5,-1), zoom.size = 1) +
  annotate("text", Inf, Inf, hjust = 1.5, vjust = 2, label = "SDA_C11", colour="red", fontface="bold") +
  annotate("text", Inf, Inf, hjust = 1.5, vjust = 4, label = "SDA_C3", colour="blue", fontface="bold")
tricolour_tsne(c("Cd52","Tyrobp"), predict = F, ptsize = 1)  + facet_zoom(xlim=c(-10,0), ylim=c(-6,-1), zoom.size = 1)

Enrichments

ETC

etc_enrich_nnmf[order(p.value)][1:5]
etc_enrich[order(p.value)][1:5]

manhatten_plot(etc_enrich, legend_position = c(0.2,0.8))
manhatten_plot(etc_enrich_nnmf, legend_position = c(0.2,0.8))

plot(single_component_enrichment(nnmf_decomp$H["V46",], etc_genes_clean, threshold = 500)$ranks)
plot(single_component_enrichment(SDAresults$loadings[[1]][9,], etc_genes_clean, threshold = 500, pos = F)$ranks)

plot(single_component_enrichment(nnmf_decomp$H["V46",], etc_genes_clean, threshold = 500)$ranks, 
     single_component_enrichment(SDAresults$loadings[[1]][9,], etc_genes_clean, threshold = 500, pos = F)$ranks)
nnmf_ranks <- single_component_enrichment(nnmf_decomp$H["V46",], etc_genes_clean, threshold = 500)$ranks
sda_ranks <- single_component_enrichment(SDAresults$loadings[[1]][9,], etc_genes_clean, threshold = 500, pos = F)$ranks

plot(nnmf_ranks, sda_ranks)
abline(0,1)

plot(nnmf_ranks /
     sda_ranks,
     ylab="Ratio rank NNMF/SDA")
abline(h=1)

qplot(1:length(nnmf_ranks), nnmf_ranks[order(names(nnmf_ranks))] /
     sda_ranks[order(names(sda_ranks))],
     ylab="Ratio paired rank NNMF/SDA") + scale_y_log10() + geom_hline(yintercept = 1)

Stra8 Enrichment

kojima_stra8_enrich <- component_enrichment(kojima_stra8_chip)
kojima_stra8_enrich_nnmf <- component_enrichment(kojima_stra8_chip, loadings_matrix = nnmf_decomp$H, orderSDA = F)

kojima_stra8_enrich[order(p.value)][1:5]
kojima_stra8_enrich_nnmf[order(p.value)][1:5]

manhatten_plot(kojima_stra8_enrich, legend_position = c(0.2,0.8))
manhatten_plot(kojima_stra8_enrich_nnmf, legend_position = c(0.2,0.8))

plot(single_component_enrichment(SDAresults$loadings[[1]][5,], kojima_stra8_chip, threshold = 500, pos = F)$ranks)
plot(single_component_enrichment(nnmf_decomp$H["V50",], kojima_stra8_chip, threshold = 500)$ranks)

MSCA

Meiotic Sex Chromosome Activation in Hormad1 KO

In NNMF this effect bleeds into the WT cells

Compare Hormad1KO Cell Scores

# plot(SDAresults$scores[cell_data[order(-PseudoTime)]$cell,"V38"],col=rgb(0,0,0,1-as.double(grepl("Hormad",cell_data[order(-PseudoTime)]$experiment))),xlim=c(0,5000),ylim=c(-5,20))
# plot(nnmf_decomp$W[cell_data[order(-PseudoTime)]$cell,"V3"],col=rgb(0,0,0,1-as.double(grepl("Hormad",cell_data[order(-PseudoTime)]$experiment))),xlim=c(0,5000))
# 
# plot(SDAresults$scores[cell_data[order(-PseudoTime)]$cell,"V38"],col=1+as.double(grepl("Hormad",cell_data[order(-PseudoTime)]$experiment)),xlim=c(0,5000),ylim=c(-5,20))
# plot(nnmf_decomp$W[cell_data[order(-PseudoTime)]$cell,"V3"],col=1+as.double(grepl("Hormad",cell_data[order(-PseudoTime)]$experiment)),xlim=c(0,5000))

tmp <- data.table("NNMF"=nnmf_decomp$W[cell_data[order(-PseudoTime)]$cell,"V3"],
           "SDA"=SDAresults$scores[cell_data[order(-PseudoTime)]$cell,"V38"],
           "Hormad1KO"=grepl("Hormad",cell_data[order(-PseudoTime)]$experiment),
           "PseudoTime" = cell_data[order(-PseudoTime)]$PseudoTime,
           "cell" = cell_data[order(-PseudoTime)]$cell,
           "msci_ratio" = cell_data[order(-PseudoTime)]$msci_ratio)

tmp[Hormad1KO==TRUE, Hormad_Genotype := "Hormad -/-"]
tmp[Hormad1KO==FALSE, Hormad_Genotype := "Hormad WT"]

tmp <- melt(tmp[PseudoTime>14500], id.vars = c("cell","msci_ratio","PseudoTime","Hormad1KO","Hormad_Genotype"), variable.name = "Method", value.name = "Cell_Score")

hormad_cell_score <- ggplot(tmp, aes(-PseudoTime, Cell_Score, colour=Method)) +
  geom_point(alpha=0.5, stroke=0) +
  facet_grid(Hormad_Genotype~Method, scales="free_y") +
  scale_color_brewer(palette = "Set1", guide=FALSE) +
  geom_smooth(colour="black") +
  theme_bw()

pdf("../results/other_factorisations/MSCA_Cell_Score.pdf", width = 7)
hormad_cell_score
dev.off()

Compare Rhox2h Prediction

hormad_gene <- "Rhox2h"

cell_data[grepl("Hormad",experiment), Hormad_Genotype := "Hormad -/-"]
cell_data[!grepl("Hormad",experiment), Hormad_Genotype := "Hormad WT"]

tmp <- cell_data[nmf_predict(hormad_gene,name_extension = "_NNMF")][sda_predict(hormad_gene, name_extension = "_SDA")]

tmp <- melt(tmp[PseudoTime>14500],id.vars = c("cell","PseudoTime","Hormad_Genotype"), measure.vars = paste0(hormad_gene,"_",c("NNMF","SDA")), variable.name = "Method", value.name = hormad_gene)

tmp[,Method := gsub(paste0(hormad_gene,"_"),"",Method)]

Rhox2h_prediction <- ggplot(tmp, aes(-PseudoTime, get(hormad_gene), colour=Method)) +
  geom_point(alpha=0.5, stroke=0) +
  facet_grid(Hormad_Genotype~Method, scales="free_y") +
  scale_color_brewer(palette = "Set1", guide=FALSE) +
  geom_smooth(colour="black") +
  ylab(paste(hormad_gene,"Imputed Expression")) +
  scale_x_continuous(breaks = c(-15000,-16000)) +
  theme_bw()

Rhox2h_prediction

pdf("../results/other_factorisations/MSCA_Rhox2h.pdf", width = 7)
Rhox2h_prediction
dev.off()

With 100 NNMF components

Better

cell_data[grepl("Hormad1_3",experiment), Genotype := "2"]
cell_data[!grepl("Hormad1_3",experiment), Genotype := "1"]

tmp <- cell_data[nmf_predict(c("Rhox2h"),nnmf_decomp5, name_extension = "_NNMF")][sda_predict(c("Rhox2h"), name_extension = "_SDA")]

tmp <- melt(tmp[PseudoTime>14500],id.vars = c("cell","PseudoTime","Hormad_Genotype"), measure.vars = c("Rhox2h_NNMF","Rhox2h_SDA"), variable.name = "Method", value.name = "Rhox2h")

#tmp[,Method := gsub("Rhox2h_","",Method)]

Rhox2h_prediction <- ggplot(tmp, aes(-PseudoTime, Rhox2h, colour=Method)) +
  geom_point(alpha=0.5) +
  facet_grid(Hormad_Genotype~Method, scales="free_y") +
  scale_color_brewer(palette = "Set1") +
  geom_smooth(colour="black") +
  theme_bw()

Rhox2h_prediction

R FA

too slow didn't finish

library(data.table)
data <- fread("../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data")

dimnames_conradV3 <- readRDS("../data/conradV3/V3_SDAmerged_mouse_V3_SDA_dimnames.rds")
data <- as.matrix(data)
rownames(data) <- dimnames_conradV3[[1]]
colnames(data) <- dimnames_conradV3[[2]]

rfa <- factanal(data, factors = 50, rotation = "promax")


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.