# Chunk defaults for this vignette: 'collapse' and the "#>" output prefix
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

In this vignette, we'll use GCAE and its `example_tiny` example data to train and use a GCAE neural network.

Here are the steps:

Setup GCAE

First we determine if GCAE is installed.

To determine if GCAE is installed, we'll first load gcaer:

library(gcaer)

These are the GCAE options:

# Create the GCAE options, without overriding any argument
gcae_options <- create_gcae_options()
# Show the options
gcae_options

Now gcaer can detect if GCAE is installed:

# Check whether GCAE is installed, given the GCAE options.
# The later chunks only run their GCAE code when 'good_to_go' is TRUE
good_to_go <- is_gcae_installed(gcae_options = gcae_options)
if (!good_to_go) {
  # Tell the reader that GCAE is absent and how to install it
  message("GCAE is not installed")
  message("Tip: use 'gcaer::install_gcae()'")
}

The GCAE options allow a user to select a GCAE version at a custom location. Or: the GCAE options allow a user to run GenoCAE.

In this tutorial, we'll be using a standard GCAE setup:

# Create the test GCAE setup, using the gcaer labels file as 'superpops'
gcae_setup <- create_test_gcae_setup(
  superpops = get_gcaer_filename("gcae_input_files_1_labels.csv")
)
# Show the setup
gcae_setup

The GCAE setup allows a user to specify:

* which data to work on
* the GCAE convolutional autoencoder architecture

Or: the GCAE setup allows a user to define how GenoCAE learns/trains/etc.

Determine the GCAE setup

See the GCAE setup

The GCAE setup contains the settings for a GCAE run, such as the auto-encoder's specification, training options and data options.

Here we take a look at how each of these is set up.

datadir and data

The `datadir` is the name of the folder that holds the GCAE input files.

if (good_to_go) {
  # Show the 'datadir' element of the GCAE setup
  gcae_setup$datadir
}

In this setup, we will use the files in that folder that have the same base filename as `data` (`r gcae_setup$data` in this case). Here are the files that match:

if (good_to_go) {
  # List the files in 'datadir' whose names match 'data'
  list.files(path = gcae_setup$datadir, pattern = gcae_setup$data)
}

These are the full paths of the files we will use:

if (good_to_go) {
  # Get the input filenames that belong to this GCAE setup, then show them
  gcae_input_filenames <- get_gcae_input_filenames(gcae_setup = gcae_setup)
  gcae_input_filenames
}

The model ID

if (good_to_go) {
  # Get the filename that belongs to the setup's model ID
  model_filename <- get_gcae_model_filename(model_id = gcae_setup$model_id)
  # Check that this file exists
  testthat::expect_true(file.exists(model_filename))
  # Show the lines of text in that file
  readr::read_lines(model_filename)
}

train_opts_id

if (good_to_go) {
  # Get the filename that belongs to the setup's training options ID
  train_opts_filename <- get_gcae_train_opts_filename(gcae_setup$train_opts_id)
  # Check that this file exists
  testthat::expect_true(file.exists(train_opts_filename))
  # Show the lines of text in that file
  readr::read_lines(train_opts_filename)
}

data_opts_id

if (good_to_go) {
  # Get the filename that belongs to the setup's data options ID
  data_opts_filename <- get_gcae_data_opts_filename(gcae_setup$data_opts_id)
  # Check that this file exists
  testthat::expect_true(file.exists(data_opts_filename))
  # Show the lines of text in that file
  readr::read_lines(data_opts_filename)
}

pheno_model_id

if (good_to_go) {
  # Get the filename that belongs to the setup's phenotype model ID
  pheno_model_id_filename <- get_gcae_pheno_model_filename(
    gcae_setup$pheno_model_id
  )
  # Check that this file exists
  testthat::expect_true(file.exists(pheno_model_id_filename))
  # Show the lines of text in that file
  readr::read_lines(pheno_model_id_filename)
}

See the genetic data

We read all the files in at once:

if (good_to_go) {
  # Read all GCAE input files into one object
  gcae_input_data <- read_gcae_input_files(gcae_input_filenames)
  # Show the names of the elements that were read
  names(gcae_input_data)
}

This is the .fam table:

# Placeholder value, kept when the GCAE code below does not run
n_individuals <- "[unknown]"
if (good_to_go) {
  # The number of rows in the .fam table
  n_individuals <- nrow(gcae_input_data$fam_table)
  # Show only the first rows of the .fam table
  knitr::kable(utils::head(gcae_input_data$fam_table))
}

This table shows us the first of the `r n_individuals` individuals.

This is the .bim file:

# Placeholder value, kept when the GCAE code below does not run
n_snps <- "[unknown]"
if (good_to_go) {
  # The number of rows in the .bim table
  n_snps <- nrow(gcae_input_data$bim_table)
  # Show only the first rows of the .bim table
  knitr::kable(utils::head(gcae_input_data$bim_table))
}

This table shows us the first of the `r n_snps` SNPs.

This is the .bed file:

if (good_to_go) {
  # The .bed table is expected to have as many columns as there are
  # individuals, and as many rows as there are SNPs
  testthat::expect_equal(n_individuals, ncol(gcae_input_data$bed_table))
  testthat::expect_equal(n_snps, nrow(gcae_input_data$bed_table))
  # Show only the first rows of the first ten columns
  knitr::kable(utils::head(gcae_input_data$bed_table[, seq_len(10)]))
}

Training

Here we train the auto-encoder:

if (good_to_go) {
  # Number of epochs to train; also used further on, when counting the plots
  epochs <- 3
  train_filenames <- gcae_train(
    gcae_options = gcae_options,
    gcae_setup = gcae_setup,
    epochs = epochs,
    save_interval = 1
  )
  # Parse the files that were created by the training
  train_results <- parse_train_filenames(train_filenames)
}

Training times:

if (good_to_go) {
  # Line plot of 'train_time_sec' against 'epoch'
  ggplot2::ggplot(
    data = train_results$train_times_table,
    mapping = ggplot2::aes(x = epoch, y = train_time_sec)
  ) +
    ggplot2::geom_line()
}

Losses from training:

if (good_to_go) {
  # Line plot of 'losses_from_train_t' against 'epoch'
  ggplot2::ggplot(
    data = train_results$losses_from_train_t_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_t)
  ) +
    ggplot2::geom_line()
}

Losses from validation:

if (good_to_go) {
  # Line plot of 'losses_from_train_v' against 'epoch'
  ggplot2::ggplot(
    data = train_results$losses_from_train_v_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_v)
  ) +
    ggplot2::geom_line()
}

Project

# Switch off all code below: every later chunk only runs
# when 'good_to_go' is TRUE.
# NOTE(review): presumably a temporary measure -- confirm before removing
good_to_go <- FALSE

if (good_to_go) {
  # Run the GCAE projection for this setup and keep the resulting filenames
  project_filenames <- gcae_project(gcae_setup = gcae_setup)
}

Parse the results:

if (good_to_go) {
  # Parse the files that were created by the projection
  project_results <- parse_project_files(project_filenames)
}

Show the results:

# The two plots are kept in separate top-level expressions,
# so that each plot is the visible value of its own expression
if (good_to_go) {
  # Line plot of 'losses_from_project' against 'epoch', y axis from 0 to 1
  ggplot2::ggplot(
    data = project_results$losses_from_project_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_project)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0, 1))
}
if (good_to_go) {
  # Line plot of 'genotype_concordance' against 'epoch', y axis from 0 to 1
  ggplot2::ggplot(
    data = project_results$genotype_concordances_table,
    mapping = ggplot2::aes(x = epoch, y = genotype_concordance)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0, 1))
}

Show neural network's performance

Here we let GCAE create all plots:

if (good_to_go) {
  # Let GCAE create all plots and keep the filenames of the created files.
  #
  # No variable named 'superpops' is defined in this vignette, so the
  # labels file is passed directly: it is the same file that was used as
  # 'superpops' when 'gcae_setup' was created
  plot_filenames <- gcae_plot(
    superpops = get_gcaer_filename("gcae_input_files_1_labels.csv"),
    gcae_setup = gcae_setup,
    gcae_options = gcae_options
  )
}

These plots are 8 PDFs and 2 CSVs.

Here we collect the PDF filenames:

if (good_to_go) {
  # Keep only the filenames that end in '.pdf'
  pdf_plot_filenames <- stringr::str_subset(plot_filenames, "\\.pdf$")
  # Expect at least 'epochs + 5' PDFs
  testthat::expect_true(length(pdf_plot_filenames) >= epochs + 5)
}

We have to plot the 8 PDFs one-by-one as a for-loop won't work here:

# Show the first page of a PDF file.
#
# Prints the base name of the PDF, renders page 1 of the PDF to a bitmap,
# saves that bitmap to a temporary PNG file and returns the result of
# knitr::include_graphics() for that PNG file.
#
# pdf_plot_filename: path to the PDF file to show
show_pdf_first_page <- function(pdf_plot_filename) {
  cat(basename(pdf_plot_filename))
  bitmap <- pdftools::pdf_render_page(pdf_plot_filename, page = 1)
  temp_png_filename <- tempfile(fileext = ".png")
  png::writePNG(bitmap, temp_png_filename)
  knitr::include_graphics(temp_png_filename)
}

# The PDFs are shown one-by-one, each in its own top-level expression,
# instead of in a for-loop
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[1])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[2])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[3])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[4])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[5])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[6])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[7])
}
if (good_to_go) {
  show_pdf_first_page(pdf_plot_filenames[8])
}

Here we collect the two CSV filenames:

if (good_to_go) {
  # Keep only the filenames that end in '.csv'
  csv_plot_filenames <- stringr::str_subset(plot_filenames, "\\.csv$")
  # Expect exactly two CSV files
  testthat::expect_equal(length(csv_plot_filenames), 2)
}

Now the first CSV:

if (good_to_go) {
  # Take the first CSV file
  csv_plot_filename <- csv_plot_filenames[1]
  # Print its base name
  cat(basename(csv_plot_filename))
  # Show only the first rows of the CSV file as a table
  knitr::kable(utils::head(readr::read_csv(csv_plot_filename)))
}

Now the second CSV:

if (good_to_go) {
  # Take the second CSV file
  csv_plot_filename <- csv_plot_filenames[2]
  # Print its base name
  cat(basename(csv_plot_filename))
  # Show only the first rows of the CSV file as a table
  knitr::kable(utils::head(readr::read_csv(csv_plot_filename)))
}


richelbilderbeek/gcaer documentation built on March 25, 2024, 3:08 p.m.