In richelbilderbeek/gcaer: Interface to GCAE

# Chunk defaults for this vignette: collapse source and output into one
# block and prefix every output line with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

In this vignette, we'll use GCAE and its example_tiny example data, to train and use a GCAE neural network.

Here are the steps:

Setup GCAE
See the genetic data
See the GCAE model
Use GCAE for training

Setup GCAE

First we determine if GCAE is installed.

To determine if GCAE is installed, we'll first load gcaer:

library(gcaer)

These are the GCAE options:

# Create the default GCAE options. The outer parentheses make the
# assignment visible, so the options are also shown.
(gcae_options <- create_gcae_options())

Now gcaer can detect if GCAE is installed:

# The later chunks check this flag before running any GCAE-dependent code.
good_to_go <- is_gcae_installed(gcae_options = gcae_options)
if (!good_to_go) {
  # One message() call per line of advice
  invisible(
    lapply(
      c("GCAE is not installed", "Tip: use 'gcaer::install_gcae()'"),
      message
    )
  )
}

The GCAE options allow a user to select a GCAE version at a custom location. Or: the GCAE options allow a user to run GenoCAE.

In this tutorial, we'll be using a standard GCAE setup:

# Create the test GCAE setup, using the labels file that ships with gcaer
# as the superpopulations file. The outer parentheses make the assignment
# visible, so the setup is also shown.
(
  gcae_setup <- create_test_gcae_setup(
    superpops = get_gcaer_filename("gcae_input_files_1_labels.csv")
  )
)

The GCAE setup allows a user to specify:

* which data to work on
* the GCAE convolutional autoencoder architecture

Or: the GCAE setup allows a user to define how GenoCAE learns/trains/etc.

Determine the GCAE setup

See the GCAE setup

The GCAE setup contains all settings of a GCAE run, such as the auto-encoder's specification, training options and data options:

datadir: folder where the input data is stored
data: base filename of the data files in the datadir
model_id: the setup of the auto-encoder
train_opts_id: the setup of the training of the auto-encoder
data_opts_id: the setup of the ?file storage? of the auto-encoder
pheno_model_id: the setup of the phenotypic prediction neural network, trained on the latent space of the auto-encoder
trainedmodeldir: folder to store the trained model

Here we take a look at how each of these is set up.

`datadir` and `data`

The datadir is the name of the folder that holds the GCAE input files.

# Show the folder that holds the GCAE input files
if (good_to_go) gcae_setup[["datadir"]]

In this setup, we will use the files in that folder that have the same base filename as data (`r gcae_setup$data` in this case). Here are the files that match:

if (good_to_go) {
  # gcae_setup$data is used as a regular expression on the file names
  list.files(path = gcae_setup$datadir, pattern = gcae_setup$data)
}

These are the full paths of the files we will use:

if (good_to_go) {
  # Parenthesised assignment: store the full paths and show them as well
  (gcae_input_filenames <- get_gcae_input_filenames(gcae_setup = gcae_setup))
}

The model ID

if (good_to_go) {
  # Find the file that belongs to the model ID of this setup
  model_filename <- get_gcae_model_filename(model_id = gcae_setup$model_id)
  # The file must be present before its lines are shown
  testthat::expect_true(object = file.exists(model_filename))
  readr::read_lines(file = model_filename)
}

train_opts_id

if (good_to_go) {
  # Find the file that belongs to the training options ID of this setup
  train_opts_filename <- get_gcae_train_opts_filename(
    gcae_setup$train_opts_id
  )
  # The file must be present before its lines are shown
  testthat::expect_true(object = file.exists(train_opts_filename))
  readr::read_lines(file = train_opts_filename)
}

data_opts_id

if (good_to_go) {
  # Find the file that belongs to the data options ID of this setup
  data_opts_filename <- get_gcae_data_opts_filename(
    gcae_setup$data_opts_id
  )
  # The file must be present before its lines are shown
  testthat::expect_true(object = file.exists(data_opts_filename))
  readr::read_lines(file = data_opts_filename)
}

pheno_model_id

if (good_to_go) {
  # Find the file that belongs to the phenotype model ID of this setup
  pheno_model_id_filename <- get_gcae_pheno_model_filename(
    gcae_setup$pheno_model_id
  )
  # The file must be present before its lines are shown
  testthat::expect_true(object = file.exists(pheno_model_id_filename))
  readr::read_lines(file = pheno_model_id_filename)
}

See the genetic data

We read all the files in at once:

if (good_to_go) {
  # Read all input files at once, then show the names of what was read
  gcae_input_data <- read_gcae_input_files(gcae_input_filenames)
  names(x = gcae_input_data)
}

This is the .fam table:

if (good_to_go) {
  # The number of rows of the .fam table is the number of individuals
  n_individuals <- nrow(gcae_input_data$fam_table)
  knitr::kable(x = utils::head(x = gcae_input_data$fam_table))
} else {
  # Placeholder for the text below when GCAE is not installed
  n_individuals <- "[unknown]"
}

This table shows the first few of the `r n_individuals` individuals.

This is the .bim file:

if (good_to_go) {
  # The number of rows of the .bim table is the number of SNPs
  n_snps <- nrow(gcae_input_data$bim_table)
  knitr::kable(x = utils::head(x = gcae_input_data$bim_table))
} else {
  # Placeholder for the text below when GCAE is not installed
  n_snps <- "[unknown]"
}

This table shows the first few of the `r n_snps` SNPs.

This is the .bed file:

if (good_to_go) {
  # The .bed table is expected to have one column per individual
  # and one row per SNP
  testthat::expect_equal(n_individuals, ncol(gcae_input_data$bed_table))
  testthat::expect_equal(n_snps, nrow(gcae_input_data$bed_table))
  # Show at most the first 10 individuals: a hard-coded 1:10 fails with
  # 'subscript out of bounds' when the data has fewer than 10 columns.
  # drop = FALSE keeps the table two-dimensional, even for a single column.
  knitr::kable(
    utils::head(
      gcae_input_data$bed_table[
        ,
        seq_len(min(10, ncol(gcae_input_data$bed_table))),
        drop = FALSE
      ]
    )
  )
}

Training

Here we train the auto-encoder:

if (good_to_go) {
  # A short training run, saving after every epoch
  epochs <- 3
  train_filenames <- gcae_train(
    gcae_setup = gcae_setup,
    epochs = epochs,
    save_interval = 1,
    gcae_options = gcae_options
  )

  # Parse the files created by the training into tables
  train_results <- parse_train_filenames(train_filenames)
}

Training times:

if (good_to_go) {
  # Line plot of train_time_sec against epoch
  ggplot2::ggplot(
    data = train_results$train_times_table,
    mapping = ggplot2::aes(x = epoch, y = train_time_sec)
  ) +
    ggplot2::geom_line()
}

Losses from training:

if (good_to_go) {
  # Line plot of losses_from_train_t against epoch
  ggplot2::ggplot(
    data = train_results$losses_from_train_t_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_t)
  ) +
    ggplot2::geom_line()
}

Losses from validation:

if (good_to_go) {
  # Line plot of losses_from_train_v against epoch
  ggplot2::ggplot(
    data = train_results$losses_from_train_v_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_train_v)
  ) +
    ggplot2::geom_line()
}

Project

# NOTE(review): from here on good_to_go is forced to FALSE, so none of the
# chunks below run, even when GCAE is installed. Confirm this is intended.
good_to_go <- FALSE

# Project using the GCAE setup
if (good_to_go) project_filenames <- gcae_project(gcae_setup = gcae_setup)

Parse the results:

# Parse the files created by the projection
if (good_to_go) project_results <- parse_project_files(project_filenames)

Show the results:

if (good_to_go) {
  # Line plot of losses_from_project against epoch, y-axis fixed to [0, 1]
  ggplot2::ggplot(
    data = project_results$losses_from_project_table,
    mapping = ggplot2::aes(x = epoch, y = losses_from_project)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0, 1))
}

if (good_to_go) {
  # Line plot of genotype_concordance against epoch, y-axis fixed to [0, 1]
  ggplot2::ggplot(
    data = project_results$genotype_concordances_table,
    mapping = ggplot2::aes(x = epoch, y = genotype_concordance)
  ) +
    ggplot2::geom_line() +
    ggplot2::scale_y_continuous(limits = c(0, 1))
}

Show neural network's performance

Here we let GCAE create all plots:

if (good_to_go) {
  # Let GCAE create all plots.
  # No variable called 'superpops' is assigned anywhere in this vignette,
  # so the superpopulations file is looked up here; it is the same labels
  # file that was used to create the GCAE setup.
  plot_filenames <- gcae_plot(
    superpops = get_gcaer_filename("gcae_input_files_1_labels.csv"),
    gcae_setup = gcae_setup,
    gcae_options = gcae_options
  )
}

These plots are 8 PDFs and 2 CSVs.

Here we collect the PDF filenames:

if (good_to_go) {
  # Keep only the plot files whose name ends with '.pdf'
  pdf_plot_filenames <- stringr::str_subset(plot_filenames, "\\.pdf$")
  # Expect at least five PDFs, plus one per epoch
  testthat::expect_true(epochs + 5 <= length(pdf_plot_filenames))
}

We have to plot the 8 PDFs one-by-one as a for-loop won't work here:

# Render the first page of a PDF plot to a temporary PNG file.
#
# Prints the base name of the PDF file, then returns the path of the PNG
# file, ready to be passed to knitr::include_graphics().
render_pdf_plot_as_png <- function(pdf_plot_filename) {
  cat(basename(pdf_plot_filename))
  bitmap <- pdftools::pdf_render_page(pdf_plot_filename, page = 1)
  temp_png_filename <- tempfile(fileext = ".png")
  png::writePNG(bitmap, temp_png_filename)
  temp_png_filename
}

# One top-level call per plot instead of a for-loop: the image is only
# shown when knitr::include_graphics() is the visible value of the
# expression.
if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[1]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[2]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[3]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[4]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[5]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[6]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[7]))
}

if (good_to_go) {
  knitr::include_graphics(render_pdf_plot_as_png(pdf_plot_filenames[8]))
}

Here we collect the two CSV filenames:

if (good_to_go) {
  # Keep only the plot files whose name ends with '.csv'
  csv_plot_filenames <- stringr::str_subset(plot_filenames, "\\.csv$")
  # Exactly two CSV files are expected
  testthat::expect_equal(
    object = length(csv_plot_filenames),
    expected = 2
  )
}

Now the first CSV:

if (good_to_go) {
  # Print the name of the first CSV file, then show its first rows
  csv_plot_filename <- csv_plot_filenames[1]
  cat(basename(csv_plot_filename))
  knitr::kable(
    x = utils::head(x = readr::read_csv(file = csv_plot_filename))
  )
}

Now the second CSV:

if (good_to_go) {
  # Print the name of the second CSV file, then show its first rows
  csv_plot_filename <- csv_plot_filenames[2]
  cat(basename(csv_plot_filename))
  knitr::kable(
    x = utils::head(x = readr::read_csv(file = csv_plot_filename))
  )
}

richelbilderbeek/gcaer documentation built on March 25, 2024, 3:08 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

richelbilderbeek/gcaer
Interface to GCAE

In richelbilderbeek/gcaer: Interface to GCAE

Setup GCAE

Determine the GCAE setup

See the GCAE setup

`datadir` and `data`

The model ID

train_opts_id

data_opts_id

pheno_model_id

See the genetic data

Training

Project

Show neural network's performance

R Package Documentation

Browse R Packages

We want your feedback!

richelbilderbeek/gcaer Interface to GCAE

In richelbilderbeek/gcaer: Interface to GCAE

Setup GCAE

Determine the GCAE setup

See the GCAE setup

datadir and data

The model ID

train_opts_id

data_opts_id

pheno_model_id

See the genetic data

Training

Project

Show neural network's performance

R Package Documentation

Browse R Packages

We want your feedback!

richelbilderbeek/gcaer
Interface to GCAE

`datadir` and `data`