R/fct_ETL.R

Defines functions find_possible_meta_columns load_project_data prep_allele_table prep_alignment_table prep_indel_table prep_qc_barplot_table prep_qc_table etl

Documented in etl find_possible_meta_columns prep_alignment_table prep_allele_table prep_indel_table prep_qc_barplot_table prep_qc_table

#' ETL
#' @description Run a pipeline to prepare all datasets required by the app
#' @param .project character, the project id. Only data belonging
#' to this project will be prepared for loading into the app
#' @param .project_data list, a list of datasets
#' @details In 'production' mode, the app will load required datasets after the user
#' chooses a project id string. For testing individual functions,
#' we can supply a data object directly to the function.
#'
#' @return A list of three lists:
#'  - `$meta`, a list of one data frame with the metadata corresponding to `.project`
#'  - `$data`, a list of six data frames with the raw data corresponding to `.project`
#'  - `$app`,  a list of four tables merging or transforming objects from `$data`
#'     to be used by app modules
#'
#' @export
etl <- function(.project) {
  .project_data <- load_project_data(.project = .project)

  qc_table <- prep_qc_table(.project_data = .project_data, .project = NULL)
  qc_barplot <- prep_qc_barplot_table(.project_data = .project_data, .project = NULL)
  indels <- prep_indel_table(.project_data = .project_data, .project = NULL)
  align <- prep_alignment_table(.project_data = .project_data, .project = NULL)
  allele <- prep_allele_table(.project_data = .project_data, .project = NULL)

  app_tables <-
    list(
      AppQCTable = qc_table
      , AppQCBarplot = qc_barplot
      , AppIndels = indels
      , AppAlign = align
      , AppAllele = allele
    )

  append(.project_data, list(app = app_tables))
}

#' @describeIn etl Merge CRISPRSummary and QCTable to calculate the percent
#' of usable reads to display in the [read qc module](mod_read_qc_ui)
#' @export
prep_qc_table <-
  function(.project_data = NULL,
           .project = NULL) {
    if (all(is.null(.project_data), is.null(.project))) {
      stop("Please provide either a project id string, or project data object")
    }

    if (is.null(.project_data)) {
      .project_data <- load_project_data(.project)
    }

    tmp_df <-
      dplyr::left_join(.project_data$data$CRISPRSummary,
                       .project_data$data$QCTable,
                       by = "SampleName")

    if ("ExperimentId" %in% names(tmp_df)) {
      join_by <- c("ExperimentId", "SampleName")
    } else {
      join_by <- "SampleName"
    }

    dplyr::left_join(tmp_df, .project_data$data$SampleMeta, by = join_by) %>%
      dplyr::select(dplyr::all_of(names(.project_data$data$SampleMeta)),
                    TargetId,
                    SampleName,
                    dplyr::everything()) %>%
      dplyr::mutate(PercentUsableTotalReads = as.numeric(UsableReadsTotal) / as.numeric(NumberOfReads) * 100) %>%
      dplyr::mutate(PercentUsableTotalReads = round(PercentUsableTotalReads, 2))
  }

#' @describeIn etl Merge the SampleMeta and EditingFreq datasets to generate
#' a barplot of the percent modified/unmodified reads per sample to display in
#' the [read qc barplot module](mod_read_qc_barplot_ui)
#' @export
prep_qc_barplot_table <- function(.project_data = NULL, .project = NULL) {

  if (all(is.null(.project_data), is.null(.project))) {
    stop("Please provide either a project id string, or project data object")
  }

  if (is.null(.project_data)) {
    .project_data <- load_project_data(.project)
  }

  .project_data$data$SampleMeta %>%
    dplyr::mutate_all(as.character) %>%
    dplyr::left_join(
      x = .project_data$data$EditingFreq,
      y = .,
      by = c("ExperimentId", "SampleName")) %>%
    dplyr::mutate(TargetId = as.character(TargetId))
}

#' @describeIn etl Merge the SampleMeta and IndelFreq datasets to generate
#' a scatter plot of the indel species distribution, later displayed in the
#' the [indel distributon module](mod_indel_distribution_ui)
#' @export
prep_indel_table <- function(.project_data = NULL, .project = NULL) {
  if (all(is.null(.project_data), is.null(.project))) {
    stop("Please provide either a project id string, or project data object")
  }

  if (is.null(.project_data)) {
    .project_data <- load_project_data(.project)
  }

  dplyr::left_join(
    x = .project_data$data$IndelFreq,
    y = .project_data$data$SampleMeta,
    by = c("ExperimentId", "SampleName")
  )
}

#' @describeIn etl Transform the AlleleAlignFreq table for loading into the app,
#' to display in the [sequence alignment module](mod_seq_alignment_ui)
#'
#' @export
prep_alignment_table <- function(.project_data = NULL, .project = NULL) {

  if (all(is.null(.project_data), is.null(.project))) {
    stop("Please provide either a project id string, or project data object")
  }

  if (is.null(.project_data)) {
    .project_data <- load_project_data(.project)
  }

  .project_data$data$AlleleAlignFreq %>%
    dplyr::left_join(., .project_data$data$SampleMeta, by = c("ExperimentId", "SampleName")) %>%
    dplyr::mutate(PctReads = round(as.numeric(PctReads), 2)) %>%
    dplyr::mutate(NumberOfReads = round(as.numeric(NumberOfReads), 2))
}

#' @describeIn etl Transform the AlleleAlignFreq table for loading into the app,
#' to display in the [allele frequency module](mod_allele_frequency_ui)
#'
#' @export
prep_allele_table <- function(.project_data = NULL, .project = NULL) {

  if (all(is.null(.project_data), is.null(.project))) {
    stop("Please provide either a project id string, or project data object")
  }

  if (is.null(.project_data)) {
    .project_data <- load_project_data(.project)
  }

  group_by_list <- c(
    names(.project_data$data$SampleMeta),
    "TargetId",
    "SampleName",
    "GdnaId",
    "MutationType"
  ) %>% unique()

  .project_data$data$AlleleAlignFreq %>%
    dplyr::left_join(.,
                     .project_data$data$SampleMeta,
                     by = c("ExperimentId", "SampleName")) %>%
    dplyr::mutate(MutationType = ifelse(MutationType == "NA", NA_character_, MutationType)) %>%
    dplyr::group_by(!!!rlang::syms(group_by_list)) %>%
    dplyr::mutate(PctReads = as.numeric(PctReads)) %>%
    dplyr::summarise_at("PctReads", mean, na.rm = TRUE) %>%
    tidyr::drop_na(MutationType) %>%
    tidyr::pivot_wider(names_from = MutationType, values_from = PctReads) %>%
    dplyr::mutate(GrandTotal = sum(Indel, `Indel+Subs`, Subs, na.rm = TRUE)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate_if(is.numeric, round, 2) %>%
    dplyr::mutate_if(is.factor, as.character)
}

#' load_project_data
#' @importFrom parallel detectCores
#' @noRd
load_project_data <- function(.project) {
  if (golem::app_prod()) {
    print("prod")
    meta_data <- readRDS("inst/META/AllExperimentMetaData.RDS")
    data_data <- qs::qread(paste0("inst/DATA/", .project, ".qs"))
    # meta_data <-
    #   system.file("AllExperimentMetaData.RDS",
    #               package = "gene.editing.dash",
    #               mustWork = TRUE) %>% readRDS()
    #
    # data_data <-
    #   system.file(paste0(.project, ".RDS"),
    #               package = "gene.editing.dash",
    #               mustWork = TRUE) %>% readRDS()
  } else {
    meta_data <- readRDS("inst/META/AllExperimentMetaData.RDS")
    data_data <- qs::qread(file = paste0("inst/DATA/", .project, ".qs"), nthreads = parallel::detectCores()-1)
  }
  list(
    meta = meta_data,
    data = data_data
    )
}

#' find possible meta columns
#' @description Try to coerce to numeric, and grab columns that fail as
#' possible categorical variables that can be used to build menus for
#' faceting or color-coding
#' @param .data the table used in a module where we wish to generate menus of column
#' name options
#' @export
find_possible_meta_columns <- function(.data) {
  suppressWarnings(
    .data %>%
      dplyr::mutate_if(is.factor, as.character) %>%
      purrr::map(as.numeric) %>%
      purrr::keep( ~ all(is.na(.))) %>%
      names()
  )
}
teofiln/gene.editing.dash documentation built on Feb. 21, 2022, 12:59 a.m.