In MyersGroup/testisAtlas: testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

SDA is deterministic given a random starting configuration, so we test multiple random seeds to ensure stability of the results.

Baseline

This is the result used in the rest of the analysis and we will compare the results of other runs to this.

We will compare two components per run, a large one (pachytene - marked by Nasp), and a smaller one (leptotene - marked by Prdm9).

library(SDAtools)

run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_1",
        data = "V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "79151 17351",
        N = 20322)

results1 <- load_results(results_folder = "../data/conradV3/conradV3_sda_1/", data_path = "../data/conradV3/")
rownames(results1$loadings[[1]]) <- paste0("V",1:50)
rownames(results1$pips[[1]]) <- paste0("V",1:50)

str(results1)
max(results1$free_energy)
check_convergence(results1)

highest_components(results1, "Nasp")
highest_genes(results1, component = 42)

highest_components(results1, "Prdm9")
highest_genes(results1, component = 5)

results_1k <- load_results(results_folder = "../data/SDA/conradV3_sda_1/", iteration = 1000, data_path = "../data/count_matrices/")
rownames(results_1k$loadings[[1]]) <- paste0("V",1:50)
str(results_1k)

pdf("../results/1k_vs_10k.pdf")
compare_factorisations(SDAresults$loadings[[1]],
                       results_1k$loadings[[1]],
                       method = "pearson",
                       names=c("10k Iterations","1k Iterations"))
dev.off()

Different Seed 1

# try different seeds to test consistancy
run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_5",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "82145 34369",
        N = 20322)

results5 <- load_results(results_folder = "../data/conradV3/conradV3_sda_5/", data_path = "../data/conradV3/")
rownames(results5$loadings[[1]]) <- paste0("V",1:50)
rownames(results5$pips[[1]]) <- paste0("V",1:50)

str(results5)
max(results5$free_energy)
check_convergence(results5)
loading_distribution(results5)

highest_components(results5, "Nasp")
plot(results1$loadings[[1]][42,], results5$loadings[[1]][5,])
highest_genes(results5, component = 5)

highest_components(results5, "Prdm9")
plot(results1$loadings[[1]][5,], results5$loadings[[1]][26,])
highest_genes(results5, component = 26)

Different Seed 2

run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_6",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "69808 6210",
        N = 20322)

results6 <- load_results(results_folder = "../data/conradV3/conradV3_sda_6/", data_path = "../data/conradV3/")
rownames(results6$loadings[[1]]) <- paste0("V",1:50)
rownames(results6$pips[[1]]) <- paste0("V",1:50)

str(results6)
max(results6$free_energy)
check_convergence(results6)

highest_components(results6, "Nasp")
plot(results1$loadings[[1]][42,], results6$loadings[[1]][45,])
highest_genes(results6, component = 45)

highest_components(results6, "Prdm9")
plot(results1$loadings[[1]][5,], results6$loadings[[1]][49,])
highest_genes(results6, component = 49)

Different Seed 3

run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_7",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "31111 30486",
        N = 20322)

results7 <- load_results(results_folder = "../data/conradV3/conradV3_sda_7/", data_path = "../data/conradV3/")
rownames(results7$loadings[[1]]) <- paste0("V",1:50)
rownames(results7$pips[[1]]) <- paste0("V",1:50)

str(results7)
max(results7$free_energy)
check_convergence(results7)

highest_components(results7, "Nasp")
plot(results1$loadings[[1]][42,], results7$loadings[[1]][10,])
highest_genes(results7, component = 10)

highest_components(results7, "Prdm9")
plot(results1$loadings[[1]][5,], results7$loadings[[1]][5,])
highest_genes(results7, component = 5)

Different Seed 4

run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_8",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "79874 65015",
        N = 20322)

results8 <- load_results(results_folder = "../data/conradV3/conradV3_sda_8/", data_path = "../data/conradV3/")
rownames(results8$loadings[[1]]) <- paste0("V",1:50)
rownames(results8$pips[[1]]) <- paste0("V",1:50)

str(results8)
max(results8$free_energy)
check_convergence(results8)

highest_components(results8, "Lyar")
plot(results1$loadings[[1]][42,], results8$loadings[[1]][39,])
highest_genes(results8, component = 39)

highest_components(results8, "Prdm9")
plot(results1$loadings[[1]][5,], results8$loadings[[1]][30,])
highest_genes(results8, component = 30)

75 Components

results2 <- load_results(results_folder = "../data/SDA/conradV3_sda_2/", data_path = "../data/count_matrices//")
rownames(results2$loadings[[1]]) <- paste0("V",1:75)
rownames(results2$pips[[1]]) <- paste0("V",1:75)

str(results2)
max(results2$free_energy)
check_convergence(results2)

plot_maximums(results1, labels = F)
plot_maximums(results2, labels = F)

highest_components(results2, "Acrv1")
plot(results1$loadings[[1]][30,], results2$loadings[[1]][39,])
highest_genes(results2, component = 39)

highest_components(results2, "Prdm9")
plot(results1$loadings[[1]][5,], results2$loadings[[1]][45,])
highest_genes(results2, component = 45)


compare_factorisations(results1$loadings[[1]], results2$loadings[[1]], method = "pearson")

compare_factorisations(results1, results2, method = "pearson")

results2_rot <- rotate_SDA(results2, results1)

200 Components

# try with different seed & higher components
# higher components
# run_SDA(sda_location = "../../SDA/build/sda",
#         out = "../results/conradV3_sda_2",
#         data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
#         num_comps = 75,
#         max_iter = 10000,
#         save_freq = 1000,
#         set_seed = "73453 98253",
#         N = 20322,
#         eigen_parallel = TRUE,
#         num_blocks = 128,
#         num_openmp_threads = 12)


run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_10",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 100,
        max_iter = 5000,
        save_freq = 500,
        set_seed = "79151 17351",
        N = 20322,
        eigen_parallel = TRUE,
          num_blocks = 128,
          num_openmp_threads = 12)


# Even more components
run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_3",
        data = "../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data",
        num_comps = 200,
        max_iter = 5000,
        save_freq = 500,
        set_seed = "79151 17351",
        N = 20322,
        eigen_parallel = TRUE,
          num_blocks = 128,
          num_openmp_threads = 12)

results3 <- load_results(results_folder = "../data/SDA/conradV3_sda_10/", data_path = "../data/count_matrices/")
rownames(results3$loadings[[1]]) <- paste0("V",1:100)
rownames(results3$pips[[1]]) <- paste0("V",1:100)

str(results3)
max(results3$free_energy)
check_convergence(results3)

highest_components(results3, "Lyar")
plot(results1$loadings[[1]][42,], results3$loadings[[1]][157,])
highest_genes(results3, component = 157)

highest_components(results3, "Prdm9")
plot(results1$loadings[[1]][5,], results3$loadings[[1]][138,])
highest_genes(results3, component = 138)

Centered asinh

Using a different normalisation Asinh is like log when x is large but works when x=0

library(data.table)
library(ggplot2)
tmp <- data.table(x=seq(-15,50,0.01),
                  log=log(seq(-15,50,0.01)),
                  log1p=log1p(seq(-15,50,0.01)),
                  asinh=asinh(seq(-15,50,0.01)),
                  sqrt=sqrt(seq(-15,50,0.01)))

ggplot(melt(tmp[x<20], id.vars = "x"), aes(x, value, colour=variable)) +geom_line() + theme_minimal() + geom_abline(intercept = 0,slope=1) + ylim(0,NA) + xlim(0,NA)

export_data(as.matrix(data), name = "merged_mouse_V3_SDA_asinh_c", path="../data/conradV3/")
run_SDA(sda_location = "../../SDA/build/sda",
        out = "../results/conradV3_sda_4",
        data = "../data/conradV3/merged_mouse_V3_SDA_asinh_c.data",
        num_comps = 50,
        max_iter = 5000,
        save_freq = 500,
        set_seed = "79151 17351",
        N = 20322,
        eigen_parallel = TRUE,
          num_blocks = 128,
          num_openmp_threads = 12)

results4 <- load_results(results_folder = "../data/conradV3/conradV3_sda_4/", data_path = "../data/conradV3/")
rownames(results4$loadings[[1]]) <- paste0("V",1:50)
rownames(results4$pips[[1]]) <- paste0("V",1:50)

str(results4)
max(results4$free_energy)
check_convergence(results4)

highest_components(results4, "Lyar")
plot(results1$loadings[[1]][42,], results4$loadings[[1]][47,])
highest_genes(results4, component = 47)

highest_components(results4, "Prdm9")
plot(results1$loadings[[1]][5,], results4$loadings[[1]][49,])
highest_genes(results4, component = 49)

MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MyersGroup/testisAtlas
testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

In MyersGroup/testisAtlas: testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

Baseline

Different Seed 1

Different Seed 2

Different Seed 3

Different Seed 4

75 Components

200 Components

Centered asinh

R Package Documentation

Browse R Packages

We want your feedback!

MyersGroup/testisAtlas testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

In MyersGroup/testisAtlas: testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq

Baseline

Different Seed 1

Different Seed 2

Different Seed 3

Different Seed 4

75 Components

200 Components

Centered asinh

R Package Documentation

Browse R Packages

We want your feedback!

MyersGroup/testisAtlas
testisAtlas: R functions and analysis scripts used in the analysis of testis single cell RNAseq