#------------------------------------------------
#' Read MIPTools tables
#'
#' @description
#' The `read_tbl_*()` family of functions is designed to read data tables
#' generated by the software program
#' [`MIPTools`](https://github.com/bailey-lab/MIPTools). Data is read lazily
#' using the [`vroom` package](https://vroom.r-lib.org/index.html). Data can be
#' filtered, retaining all rows that satisfy the conditions. To be retained, the
#' row in question must produce a value of `TRUE` for all conditions. Note that
#' when a condition evaluates to NA, the row will be dropped.
#'
#' @section Data structure:
#' Input data must contain six rows of metadata. The metadata can vary depending
#' on what type of file is read, but typically contains information about the
#' location of a mutation. The remaining rows represent the data for each sample
#' sequenced.
#'
#' @section Useful filter functions:
#' The [dplyr::filter()] function is employed to subset the rows of the data by
#' applying the expressions in `...` to the column values to determine which
#' rows should be retained.
#'
#' There are many functions and operators that are useful when constructing the
#' expressions used to filter the data:
#'
#' * [`==`], [`>`], [`>=`], etc.
#' * [`&`], [`|`], [`!`], [xor()]
#' * [is.na()]
#' * [`between()`][dplyr::between()], [`near()`][dplyr::near()]
#'
#' @param .tbl File path to the table.
#' @param ... <[`data-masking`][dplyr::dplyr_data_masking]> Filtering
#' expressions. Expressions must return a logical value. If multiple
#' expressions are included, they are combined with the `&` operator. Only
#' rows for which all conditions evaluate to `TRUE` are kept.
#' @param .col_select Columns to include in the results. Columns can be selected
#' using one or more selection expressions as in
#' [`dplyr::select()`][dplyr::select()]. Use `c()` or `list()` to use more
#' than one expression. See [`?tidyselect::language`][tidyselect::language]
#' for details on available selection options.
#' @param .tbl_ref File path to the reference table.
#' @param .tbl_alt File path to the alternate table.
#' @param .tbl_cov File path to the coverage table.
#' @param chrom `r lifecycle::badge("deprecated")` The chromosome(s) to filter
#' to.
#' @param gene `r lifecycle::badge("deprecated")` The gene(s) to filter to.
#'
#' @return
#' A [`tibble()`][tibble::tibble-package] subclass. Each function defines a
#' unique subclass to store the data. Data typically contains the sample,
#' associated metadata, and the value of interest.
#'
#' @seealso [vroom::vroom()] [dplyr::filter()]
#' @name read-tbl
#' @aliases read_tbl
#' @examples
#' # Get path to example file
#' ref_file <- miplicorn_example("reference_AA_table.csv")
#' alt_file <- miplicorn_example("alternate_AA_table.csv")
#' cov_file <- miplicorn_example("coverage_AA_table.csv")
#' ref_file
#'
#' # Input sources -------------------------------------------------------------
#' # Read from a path
#' read_tbl_reference(ref_file)
#'
#' # You can also use paths directly
#' # read_tbl_alternate("alternate_AA_table.csv")
#'
#' # Read entire file ----------------------------------------------------------
#' read_tbl_coverage(cov_file)
#'
#' # Data filtering ------------------------------------------------------------
#' # Filtering by one criterion
#' read_tbl_reference(ref_file, gene == "atp6")
#'
#' # Filtering by multiple criteria within a single logical expression
#' read_tbl_alternate(alt_file, gene == "atp6" & targeted == "Yes")
#' read_tbl_coverage(cov_file, gene == "atp6" | targeted == "Yes")
#'
#' # When multiple expressions are used, they are combined using &
#' read_tbl_reference(ref_file, gene == "atp6", targeted == "Yes")
#'
#' # Read multiple files together ----------------------------------------------
#' read_tbl_ref_alt_cov(ref_file, alt_file, cov_file)
NULL
#' @rdname read-tbl
#' @export
read_tbl_reference <- function(.tbl, ...) {
tbl <- read_tbl_helper(.tbl, ..., .name = "ref_umi_count")
new_ref_tbl(tbl)
}
#' @rdname read-tbl
#' @export
read_tbl_alternate <- function(.tbl, ...) {
tbl <- read_tbl_helper(.tbl, ..., .name = "alt_umi_count")
new_alt_tbl(tbl)
}
#' @rdname read-tbl
#' @export
read_tbl_coverage <- function(.tbl, ...) {
tbl <- read_tbl_helper(.tbl, ..., .name = "coverage")
new_cov_tbl(tbl)
}
#' @rdname read-tbl
#' @export
read_tbl_genotype <- function(.tbl, ...) {
tbl <- read_tbl_helper(.tbl, ..., .name = "genotype")
# Check genotype column has correct values
if (!has_genotype_vals(tbl$genotype)) {
cli_abort(c(
"Invalid genotype values detected.",
"i" = "Please review the input file.",
"i" = "Allowed values are: -1, 0, 1, 2, or NA."
))
}
# Assign class
new_geno_tbl(tbl)
}
#' @rdname read-tbl
#' @export
read_tbl_haplotype <- function(.tbl, ..., .col_select = NULL) {
dots <- enquos(..., .ignore_empty = "all")
check_named(dots)
if (empty_file(.tbl)) {
return(tibble::tibble())
}
# Read table
data <- .tbl %>%
vroom::vroom(show_col_types = FALSE, col_select = {{ .col_select }}) %>%
janitor::clean_names() %>%
dplyr::relocate(sample = sample_id)
# In some cases, the `chrom` column appears twice in the dataset, so we remove
# the last occurrence.
chrom_cols <- colnames(dplyr::select(data, dplyr::starts_with("chrom")))
if (length(chrom_cols) == 2) {
data <- dplyr::select(data, !chrom_cols[-1])
}
# Filter the data based on conditions specified
tbl <- filter_tbl(data, ...)
# Assign class
new_hap_tbl(tbl)
}
#' @rdname read-tbl
#' @export
read_tbl_ref_alt_cov <- function(.tbl_ref,
.tbl_alt,
.tbl_cov,
...,
chrom = deprecated(),
gene = deprecated()) {
# Deprecated chrom
if (lifecycle::is_present(chrom)) {
lifecycle::deprecate_warn(
when = "0.1.0",
what = "read(chrom)",
details = "Please use the `...` argument instead to filter data."
)
}
# Deprecated gene
if (lifecycle::is_present(gene)) {
lifecycle::deprecate_warn(
when = "0.1.0",
what = "read(gene)",
details = "Please use the `...` argument instead to filter data."
)
}
# Error message if multiple criteria selected
if (lifecycle::is_present(chrom) && lifecycle::is_present(gene)) {
cli_abort(c(
"Multiple filtering criteria selected.",
"x" = "Cannot filter on both `chrom` and `gene`.",
"i" = "Select only one piece of information to filter on."
))
}
# Error if any file is empty
if (purrr::some(list(.tbl_ref, .tbl_alt, .tbl_cov), empty_file)) {
empty <- purrr::detect(list(.tbl_ref, .tbl_alt, .tbl_cov), empty_file)
cli_abort(c(
"Unable to read files.",
"x" = '"{empty}" is an empty file.'
))
}
# Read in the three files
if (lifecycle::is_present(chrom) || lifecycle::is_present(gene)) {
tables <- purrr::pmap(
list(
file = c(.tbl_ref, .tbl_alt, .tbl_cov),
name = c("ref_umi_count", "alt_umi_count", "coverage")
),
deprec_read_file,
chrom = chrom,
gene = gene
)
} else {
tables <- purrr::pmap(
list(
.tbl = c(.tbl_ref, .tbl_alt, .tbl_cov),
.name = c("ref_umi_count", "alt_umi_count", "coverage")
),
read_tbl_helper,
...,
call = rlang::call2("read_tbl_ref_alt_cov")
)
}
# Determine overlapping columns
by <- purrr::reduce(purrr::map(tables, colnames), intersect)
# Combine three tibbles together
comb_tbls <- purrr::reduce(tables, dplyr::full_join, by = by)
# Assign class
new_ref_alt_cov_tbl(comb_tbls)
}
# Helper function used to read reference, alternate, and coverage tables
read_tbl_helper <- function(.tbl, ..., .name = "value", call = caller_env()) {
dots <- enquos(..., .ignore_empty = "all")
check_named(dots, call = call)
# dplyr:::check_filter(dots, error_call = call)
if (empty_file(.tbl)) {
return(tibble::tibble())
}
# Read in complete header
header <- .tbl %>%
vroom::vroom(col_names = FALSE, show_col_types = FALSE, n_max = 6) %>%
tibble::rownames_to_column() %>%
tidyr::pivot_longer(-rowname) %>%
tidyr::pivot_wider(
names_from = rowname,
values_from = value
) %>%
janitor::row_to_names(1) %>%
janitor::clean_names()
# Filter the header based on conditions specified
filter_header <- filter_tbl(header, ..., call = call)
# Extract which columns of data we are interested in
col_select <- filter_header[[1]] %>%
stringr::str_extract("\\d+") %>%
as.numeric()
# Read in entire data set but select only columns we are interested in
data <- vroom::vroom(
file = .tbl,
col_names = FALSE,
col_select = c(1, dplyr::all_of(col_select)),
show_col_types = FALSE,
.name_repair = "universal"
)
# Take the transpose of our matrix, making rows columns and columns rows
t_data <- data %>%
tibble::rownames_to_column() %>%
tidyr::pivot_longer(-rowname) %>%
tidyr::pivot_wider(
names_from = rowname,
values_from = value
) %>%
# Assign the column names of our tibble and clean them up
dplyr::select(-name) %>%
janitor::row_to_names(1)
# We only want to clean the names of the metadata. We want to leave the
# sample IDs unchanged.
colnames(t_data)[1:6] <- janitor::make_clean_names(colnames(t_data)[1:6])
# Convert our data to a long format
t_data %>%
tidyr::pivot_longer(
cols = -c(1:6),
names_to = "sample",
values_to = "value"
) %>%
dplyr::relocate(sample) %>%
dplyr::mutate(value = as.numeric(value)) %>%
dplyr::rename({{ .name }} := value)
}
# Check for the presence of named non-logical arguments
check_named <- function(dots, call = caller_env()) {
named <- rlang::have_name(dots)
named_non_logical <- purrr::keep(dots[named], function(x) {
!rlang::is_logical(rlang::quo_get_expr(x))
})
if (!rlang::is_empty(named_non_logical)) {
name <- names(named_non_logical[1])
expr <- rlang::quo_get_expr(named_non_logical[[1]])
cli_abort(
c(
"Input `{name}` is named.",
"i" = "This usually means that you've used `=` instead of `==`.",
"i" = "Did you mean `{name} == {as_label(expr)}`?"
),
call = call
)
}
}
# Filter the table based on conditions specified
filter_tbl <- function(.tbl, ..., call = caller_env()) {
tryCatch(
dplyr::filter(.tbl, ...),
error = function(e) {
e <- rlang::catch_cnd(dplyr::filter(.tbl, ...))
msg <- e$message %>%
stringr::str_replace_all(c(
"filter" = "read_tbl_*",
"comparison" = "Comparison"
)) %>%
stringr::str_c(".")
objects <- stringr::str_c("'", colnames(.tbl)[-1], "'")
cli_abort(
c(msg, "i" = "Filter using the column{?s} {objects}."),
parent = NA,
call = call
)
}
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.