# Global knitr chunk options for this vignette: collapse source and output
# together, and prefix every output line with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
In this vignette, we'll use GCAE and its example_tiny
example data, to train and use a GCAE neural network.
Here are the steps:
First we determine if GCAE is installed.
To determine if GCAE is installed, we'll first load gcaer:
library(gcaer)
These are the GCAE options:
# Create the GCAE options with their default values ...
gcae_options <- create_gcae_options()

# ... and show them to the reader.
gcae_options
Now gcaer
can detect if GCAE is installed:
# `good_to_go` guards every later chunk, so the vignette still builds on
# machines without GCAE.
good_to_go <- is_gcae_installed(gcae_options = gcae_options)

if (!good_to_go) {
  message("GCAE is not installed")
  message("Tip: use 'gcaer::install_gcae()'")
}
The GCAE options allow a user to select a GCAE version at a custom location. Or: the GCAE options allow a user to run GenoCAE.
In this tutorial, we'll be using a standard GCAE setup:
# Create the test GCAE setup, using the labels file shipped with gcaer as
# the superpopulations input, then show the setup.
gcae_setup <- create_test_gcae_setup(
  superpops = get_gcaer_filename("gcae_input_files_1_labels.csv")
)

gcae_setup
The GCAE setup allows a user to specify:

* which data to work on
* the GCAE convolutional autoencoder architecture
Or: the GCAE setup allows a user to define how GenoCAE learns/trains/etc.
The GCAE setup contains the settings for a GCAE run, such as the auto-encoder's specification, training options and data options:
datadir
: folder where the input data is stored

data
: base filename of the data files in the datadir

model_id
: the setup of the auto-encoder

train_opts_id
: the setup of the training of the auto-encoder

data_opts_id
: the setup of the ?file storage? of the auto-encoder

pheno_model_id
: the setup of the phenotypic prediction
neural network, trained on the latent space of the auto-encoder

trainedmodeldir
: folder to store the trained model

Here we take a look at how each of these is set up.
datadir
and data
The datadir
is the name of the folder that holds the GCAE input files.
# Show the folder that holds the GCAE input files.
if (good_to_go) {
  gcae_setup$datadir
}
In this setup, we will use the files in that folder that have
the same base filename as data
(`r gcae_setup$data`
in this case).
Here are the files that match:
# List the files in the data folder whose names match the data base filename.
if (good_to_go) {
  list.files(
    path = gcae_setup$datadir,
    pattern = gcae_setup$data
  )
}
These are the full paths of the files we will use:
# Collect the GCAE input filenames for this setup and show them.
if (good_to_go) {
  gcae_input_filenames <- get_gcae_input_filenames(gcae_setup = gcae_setup)
  gcae_input_filenames
}
# Each of the four chunks below follows the same recipe: look up the file
# that belongs to one ID of the GCAE setup, check that the file exists and
# show its raw content.

# The auto-encoder model
if (good_to_go) {
  model_filename <- get_gcae_model_filename(
    model_id = gcae_setup$model_id
  )
  testthat::expect_true(file.exists(model_filename))
  readr::read_lines(model_filename)
}

# The training options
if (good_to_go) {
  train_opts_filename <- get_gcae_train_opts_filename(
    gcae_setup$train_opts_id
  )
  testthat::expect_true(file.exists(train_opts_filename))
  readr::read_lines(train_opts_filename)
}

# The data options
if (good_to_go) {
  data_opts_filename <- get_gcae_data_opts_filename(
    gcae_setup$data_opts_id
  )
  testthat::expect_true(file.exists(data_opts_filename))
  readr::read_lines(data_opts_filename)
}

# The phenotype prediction model
if (good_to_go) {
  pheno_model_id_filename <- get_gcae_pheno_model_filename(
    gcae_setup$pheno_model_id
  )
  testthat::expect_true(file.exists(pheno_model_id_filename))
  readr::read_lines(pheno_model_id_filename)
}
We read all the files in at once:
# Read all GCAE input files in one go and show which tables we got.
if (good_to_go) {
  gcae_input_data <- read_gcae_input_files(gcae_input_filenames)
  names(gcae_input_data)
}
This is the .fam
table:
# Placeholder, so the inline text after this chunk still renders when GCAE
# is absent.
n_individuals <- "[unknown]"

if (good_to_go) {
  # One row per individual in the .fam table
  n_individuals <- nrow(gcae_input_data$fam_table)
  knitr::kable(
    utils::head(gcae_input_data$fam_table)
  )
}
This table shows us the `r n_individuals`
individuals.
This is the .bim
file:
# Placeholder, so the inline text after this chunk still renders when GCAE
# is absent.
n_snps <- "[unknown]"

if (good_to_go) {
  # One row per SNP in the .bim table
  n_snps <- nrow(gcae_input_data$bim_table)
  knitr::kable(
    utils::head(gcae_input_data$bim_table)
  )
}
This table shows us the `r n_snps`
SNPs.
This is the .bed
file:
if (good_to_go) {
  # The .bed table is checked to have one column per individual
  # and one row per SNP.
  testthat::expect_equal(
    n_individuals,
    ncol(gcae_input_data$bed_table)
  )
  testthat::expect_equal(
    n_snps,
    nrow(gcae_input_data$bed_table)
  )

  # Show the first rows of only the first ten columns.
  knitr::kable(
    utils::head(gcae_input_data$bed_table[, 1:10])
  )
}
Here we train the auto-encoder:
# Train for a small number of epochs, saving at every epoch.
if (good_to_go) {
  epochs <- 3
  train_filenames <- gcae_train(
    gcae_setup = gcae_setup,
    epochs = epochs,
    save_interval = 1,
    gcae_options = gcae_options
  )
}

# Parse the files created by the training.
if (good_to_go) {
  train_results <- parse_train_filenames(train_filenames)
}
Training times:
# Line plot of the training time (in seconds) per epoch.
if (good_to_go) {
  ggplot2::ggplot(
    data = train_results$train_times_table,
    mapping = ggplot2::aes(x = epoch, y = train_time_sec)
  ) +
    ggplot2::geom_line()
}
Losses from training:
# Line plot of the training loss per epoch.
if (good_to_go) {
  ggplot2::ggplot(
    data = train_results$losses_from_train_t_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_t)
  ) +
    ggplot2::geom_line()
}
Losses from validation:
# Line plot of the validation loss per epoch.
if (good_to_go) {
  ggplot2::ggplot(
    data = train_results$losses_from_train_v_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_v)
  ) +
    ggplot2::geom_line()
}
# NOTE(review): `good_to_go` is forced to FALSE here, so this chunk and
# every later chunk guarded by `good_to_go` is skipped, even when GCAE is
# installed. Confirm this is intentional before removing it.
good_to_go <- FALSE

# Project the data with the trained auto-encoder.
if (good_to_go) {
  project_filenames <- gcae_project(gcae_setup = gcae_setup)
}
Parse the results:
# Parse the files created by the projection.
if (good_to_go) {
  project_results <- parse_project_files(project_filenames)
}
Show the results:
# Line plot of the projection loss per epoch, with the y axis limited to
# the range zero to one.
if (good_to_go) {
  ggplot2::ggplot(
    data = project_results$losses_from_project_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_project)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0.0, 1.0))
}

# Line plot of the genotype concordance per epoch, with the same y axis
# limits.
if (good_to_go) {
  ggplot2::ggplot(
    data = project_results$genotype_concordances_table,
    mapping = ggplot2::aes(x = epoch, y = genotype_concordance)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0.0, 1.0))
}
Here we let GCAE create all plots:
# Let GCAE create all its plots.
#
# The original call passed `superpops = superpops`, but no variable named
# `superpops` is created anywhere in the visible vignette, so the chunk
# would fail with "object 'superpops' not found" as soon as it ran.
# Pass the same labels file that was used to create `gcae_setup` instead.
# NOTE(review): assumes no hidden chunk defines `superpops` -- confirm.
if (good_to_go) {
  plot_filenames <- gcae_plot(
    superpops = get_gcaer_filename("gcae_input_files_1_labels.csv"),
    gcae_setup = gcae_setup,
    gcae_options = gcae_options
  )
}
These plots are 8 PDFs and 2 CSVs.
Here we collect the PDF filenames:
# Keep only the filenames that end in ".pdf".
if (good_to_go) {
  pdf_plot_filenames <- stringr::str_subset(
    string = plot_filenames,
    pattern = "\\.pdf$"
  )
  # At least `epochs + 5` PDFs are expected.
  testthat::expect_true(length(pdf_plot_filenames) >= epochs + 5)
}
We have to plot the 8 PDFs one-by-one as a for-loop won't work here:
# Show one PDF plot in the vignette.
#
# Prints the PDF's base filename, renders the first page of the PDF to a
# bitmap, saves that bitmap as a temporary PNG and returns the result of
# `knitr::include_graphics()` for that PNG.
#
# The returned value must reach the top level of a chunk to be displayed,
# which is why the eight calls below are written out one by one instead
# of being put in a for-loop.
#
# @param pdf_plot_filename path to the PDF file to show
# @return the value of `knitr::include_graphics()` for the temporary PNG
show_pdf_plot <- function(pdf_plot_filename) {
  cat(basename(pdf_plot_filename))
  bitmap <- pdftools::pdf_render_page(pdf_plot_filename, page = 1)
  temp_png_filename <- tempfile(fileext = ".png")
  png::writePNG(bitmap, temp_png_filename)
  knitr::include_graphics(temp_png_filename)
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[1])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[2])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[3])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[4])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[5])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[6])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[7])
}

if (good_to_go) {
  show_pdf_plot(pdf_plot_filenames[8])
}
Here we collect the two CSV filenames:
# Keep only the filenames that end in ".csv"; exactly two are expected.
if (good_to_go) {
  csv_plot_filenames <- stringr::str_subset(
    string = plot_filenames,
    pattern = "\\.csv$"
  )
  testthat::expect_equal(length(csv_plot_filenames), 2)
}
Now the first CSV:
# Print the name of the first CSV file and show its first rows as a table.
if (good_to_go) {
  csv_plot_filename <- csv_plot_filenames[1]
  cat(basename(csv_plot_filename))
  knitr::kable(
    utils::head(
      readr::read_csv(csv_plot_filename)
    )
  )
}
Now the second CSV:
# Print the name of the second CSV file and show its first rows as a table.
if (good_to_go) {
  csv_plot_filename <- csv_plot_filenames[2]
  cat(basename(csv_plot_filename))
  knitr::kable(
    utils::head(
      readr::read_csv(csv_plot_filename)
    )
  )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.