R/read-tbl.R

Defines functions filter_tbl check_named read_tbl_helper read_tbl_ref_alt_cov read_tbl_haplotype read_tbl_genotype read_tbl_coverage read_tbl_alternate read_tbl_reference

Documented in read_tbl_alternate read_tbl_coverage read_tbl_genotype read_tbl_haplotype read_tbl_ref_alt_cov read_tbl_reference

#------------------------------------------------
#' Read MIPTools tables
#'
#' @description
#' The `read_tbl_*()` family of functions is designed to read data tables
#' generated by the software program
#' [`MIPTools`](https://github.com/bailey-lab/MIPTools). Data is read lazily
#' using the [`vroom` package](https://vroom.r-lib.org/index.html). Data can be
#' filtered, retaining all rows that satisfy the conditions. To be retained, the
#' row in question must produce a value of `TRUE` for all conditions. Note that
#' when a condition evaluates to NA, the row will be dropped.
#'
#' @section Data structure:
#' Input data must contain six rows of metadata. The metadata can vary depending
#' on what type of file is read, but typically contains information about the
#' location of a mutation. The remaining rows represent the data for each sample
#' sequenced.
#'
#' @section Useful filter functions:
#' The [dplyr::filter()] function is employed to subset the rows of the data by
#' applying the expressions in `...` to the column values to determine which
#' rows should be retained.
#'
#' There are many functions and operators that are useful when constructing the
#' expressions used to filter the data:
#'
#' * [`==`], [`>`], [`>=`], etc.
#' * [`&`], [`|`], [`!`], [xor()]
#' * [is.na()]
#' * [`between()`][dplyr::between()], [`near()`][dplyr::near()]
#'
#' @param .tbl File path to the table.
#' @param ... <[`data-masking`][dplyr::dplyr_data_masking]> Filtering
#'   expressions. Expressions must return a logical value. If multiple
#'   expressions are included, they are combined with the `&` operator. Only
#'   rows for which all conditions evaluate to `TRUE` are kept.
#' @param .col_select Columns to include in the results. Columns can be selected
#'   using one or more selection expressions as in
#'   [`dplyr::select()`][dplyr::select()]. Use `c()` or `list()` to use more
#'   than one expression. See [`?tidyselect::language`][tidyselect::language]
#'   for details on available selection options.
#' @param .tbl_ref File path to the reference table.
#' @param .tbl_alt File path to the alternate table.
#' @param .tbl_cov File path to the coverage table.
#' @param chrom `r lifecycle::badge("deprecated")` The chromosome(s) to filter
#'   to.
#' @param gene `r lifecycle::badge("deprecated")` The gene(s) to filter to.
#'
#' @return
#' A [`tibble()`][tibble::tibble-package] subclass. Each function defines a
#' unique subclass to store the data. Data typically contains the sample,
#' associated metadata, and the value of interest.
#'
#' @seealso [vroom::vroom()] [dplyr::filter()]
#' @name read-tbl
#' @aliases read_tbl
#' @examples
#' # Get path to example file
#' ref_file <- miplicorn_example("reference_AA_table.csv")
#' alt_file <- miplicorn_example("alternate_AA_table.csv")
#' cov_file <- miplicorn_example("coverage_AA_table.csv")
#' ref_file
#'
#' # Input sources -------------------------------------------------------------
#' # Read from a path
#' read_tbl_reference(ref_file)
#'
#' # You can also use paths directly
#' # read_tbl_alternate("alternate_AA_table.csv")
#'
#' # Read entire file ----------------------------------------------------------
#' read_tbl_coverage(cov_file)
#'
#' # Data filtering ------------------------------------------------------------
#' # Filtering by one criterion
#' read_tbl_reference(ref_file, gene == "atp6")
#'
#' # Filtering by multiple criteria within a single logical expression
#' read_tbl_alternate(alt_file, gene == "atp6" & targeted == "Yes")
#' read_tbl_coverage(cov_file, gene == "atp6" | targeted == "Yes")
#'
#' # When multiple expressions are used, they are combined using &
#' read_tbl_reference(ref_file, gene == "atp6", targeted == "Yes")
#'
#' # Read multiple files together ----------------------------------------------
#' read_tbl_ref_alt_cov(ref_file, alt_file, cov_file)
NULL

#' @rdname read-tbl
#' @export
read_tbl_reference <- function(.tbl, ...) {
  tbl <- read_tbl_helper(.tbl, ..., .name = "ref_umi_count")
  new_ref_tbl(tbl)
}

#' @rdname read-tbl
#' @export
read_tbl_alternate <- function(.tbl, ...) {
  tbl <- read_tbl_helper(.tbl, ..., .name = "alt_umi_count")
  new_alt_tbl(tbl)
}

#' @rdname read-tbl
#' @export
read_tbl_coverage <- function(.tbl, ...) {
  tbl <- read_tbl_helper(.tbl, ..., .name = "coverage")
  new_cov_tbl(tbl)
}

#' @rdname read-tbl
#' @export
read_tbl_genotype <- function(.tbl, ...) {
  tbl <- read_tbl_helper(.tbl, ..., .name = "genotype")

  # Check genotype column has correct values
  if (!has_genotype_vals(tbl$genotype)) {
    cli_abort(c(
      "Invalid genotype values detected.",
      "i" = "Please review the input file.",
      "i" = "Allowed values are: -1, 0, 1, 2, or NA."
    ))
  }

  # Assign class
  new_geno_tbl(tbl)
}

#' @rdname read-tbl
#' @export
read_tbl_haplotype <- function(.tbl, ..., .col_select = NULL) {
  dots <- enquos(..., .ignore_empty = "all")
  check_named(dots)

  if (empty_file(.tbl)) {
    return(tibble::tibble())
  }

  # Read table
  data <- .tbl %>%
    vroom::vroom(show_col_types = FALSE, col_select = {{ .col_select }}) %>%
    janitor::clean_names() %>%
    dplyr::relocate(sample = sample_id)

  # In some cases, the `chrom` column appears twice in the dataset, so we remove
  # the last occurrence.
  chrom_cols <- colnames(dplyr::select(data, dplyr::starts_with("chrom")))
  if (length(chrom_cols) == 2) {
    data <- dplyr::select(data, !chrom_cols[-1])
  }

  # Filter the data based on conditions specified
  tbl <- filter_tbl(data, ...)

  # Assign class
  new_hap_tbl(tbl)
}

#' @rdname read-tbl
#' @export
read_tbl_ref_alt_cov <- function(.tbl_ref,
                                 .tbl_alt,
                                 .tbl_cov,
                                 ...,
                                 chrom = deprecated(),
                                 gene = deprecated()) {
  # Deprecated chrom
  if (lifecycle::is_present(chrom)) {
    lifecycle::deprecate_warn(
      when = "0.1.0",
      what = "read(chrom)",
      details = "Please use the `...` argument instead to filter data."
    )
  }

  # Deprecated gene
  if (lifecycle::is_present(gene)) {
    lifecycle::deprecate_warn(
      when = "0.1.0",
      what = "read(gene)",
      details = "Please use the `...` argument instead to filter data."
    )
  }

  # Error message if multiple criteria selected
  if (lifecycle::is_present(chrom) && lifecycle::is_present(gene)) {
    cli_abort(c(
      "Multiple filtering criteria selected.",
      "x" = "Cannot filter on both `chrom` and `gene`.",
      "i" = "Select only one piece of information to filter on."
    ))
  }

  # Error if any file is empty
  if (purrr::some(list(.tbl_ref, .tbl_alt, .tbl_cov), empty_file)) {
    empty <- purrr::detect(list(.tbl_ref, .tbl_alt, .tbl_cov), empty_file)
    cli_abort(c(
      "Unable to read files.",
      "x" = '"{empty}" is an empty file.'
    ))
  }

  # Read in the three files
  if (lifecycle::is_present(chrom) || lifecycle::is_present(gene)) {
    tables <- purrr::pmap(
      list(
        file = c(.tbl_ref, .tbl_alt, .tbl_cov),
        name = c("ref_umi_count", "alt_umi_count", "coverage")
      ),
      deprec_read_file,
      chrom = chrom,
      gene = gene
    )
  } else {
    tables <- purrr::pmap(
      list(
        .tbl = c(.tbl_ref, .tbl_alt, .tbl_cov),
        .name = c("ref_umi_count", "alt_umi_count", "coverage")
      ),
      read_tbl_helper,
      ...,
      call = rlang::call2("read_tbl_ref_alt_cov")
    )
  }

  # Determine overlapping columns
  by <- purrr::reduce(purrr::map(tables, colnames), intersect)

  # Combine three tibbles together
  comb_tbls <- purrr::reduce(tables, dplyr::full_join, by = by)

  # Assign class
  new_ref_alt_cov_tbl(comb_tbls)
}

# Helper function used to read reference, alternate, and coverage tables
read_tbl_helper <- function(.tbl, ..., .name = "value", call = caller_env()) {
  dots <- enquos(..., .ignore_empty = "all")
  check_named(dots, call = call)
  # dplyr:::check_filter(dots, error_call = call)

  if (empty_file(.tbl)) {
    return(tibble::tibble())
  }

  # Read in complete header
  header <- .tbl %>%
    vroom::vroom(col_names = FALSE, show_col_types = FALSE, n_max = 6) %>%
    tibble::rownames_to_column() %>%
    tidyr::pivot_longer(-rowname) %>%
    tidyr::pivot_wider(
      names_from = rowname,
      values_from = value
    ) %>%
    janitor::row_to_names(1) %>%
    janitor::clean_names()

  # Filter the header based on conditions specified
  filter_header <- filter_tbl(header, ..., call = call)

  # Extract which columns of data we are interested in
  col_select <- filter_header[[1]] %>%
    stringr::str_extract("\\d+") %>%
    as.numeric()

  # Read in entire data set but select only columns we are interested in
  data <- vroom::vroom(
    file = .tbl,
    col_names = FALSE,
    col_select = c(1, dplyr::all_of(col_select)),
    show_col_types = FALSE,
    .name_repair = "universal"
  )

  # Take the transpose of our matrix, making rows columns and columns rows
  t_data <- data %>%
    tibble::rownames_to_column() %>%
    tidyr::pivot_longer(-rowname) %>%
    tidyr::pivot_wider(
      names_from = rowname,
      values_from = value
    ) %>%
    # Assign the column names of our tibble and clean them up
    dplyr::select(-name) %>%
    janitor::row_to_names(1)

  # We only want to clean the names of the metadata. We want to leave the
  # sample IDs unchanged.
  colnames(t_data)[1:6] <- janitor::make_clean_names(colnames(t_data)[1:6])

  # Convert our data to a long format
  t_data %>%
    tidyr::pivot_longer(
      cols = -c(1:6),
      names_to = "sample",
      values_to = "value"
    ) %>%
    dplyr::relocate(sample) %>%
    dplyr::mutate(value = as.numeric(value)) %>%
    dplyr::rename({{ .name }} := value)
}

# Check for the presence of named non-logical arguments
check_named <- function(dots, call = caller_env()) {
  named <- rlang::have_name(dots)

  named_non_logical <- purrr::keep(dots[named], function(x) {
    !rlang::is_logical(rlang::quo_get_expr(x))
  })

  if (!rlang::is_empty(named_non_logical)) {
    name <- names(named_non_logical[1])
    expr <- rlang::quo_get_expr(named_non_logical[[1]])
    cli_abort(
      c(
        "Input `{name}` is named.",
        "i" = "This usually means that you've used `=` instead of `==`.",
        "i" = "Did you mean `{name} == {as_label(expr)}`?"
      ),
      call = call
    )
  }
}

# Filter the table based on conditions specified
filter_tbl <- function(.tbl, ..., call = caller_env()) {
  tryCatch(
    dplyr::filter(.tbl, ...),
    error = function(e) {
      e <- rlang::catch_cnd(dplyr::filter(.tbl, ...))
      msg <- e$message %>%
        stringr::str_replace_all(c(
          "filter" = "read_tbl_*",
          "comparison" = "Comparison"
        )) %>%
        stringr::str_c(".")
      objects <- stringr::str_c("'", colnames(.tbl)[-1], "'")
      cli_abort(
        c(msg, "i" = "Filter using the column{?s} {objects}."),
        parent = NA,
        call = call
      )
    }
  )
}
bailey-lab/miplicorn documentation built on March 19, 2023, 7:40 p.m.