# Nothing
#' @title Preprocessing and postprocessing of input immune repertoire files
#'
#' @details
#' This collection of "maker" functions generates common preprocessing and
#' postprocessing function steps tailored for immune repertoire data.
#' Each `make_*` function returns a new function that can then be applied
#' to a dataset.
#'
#' These functions are designed to be flexible components in constructing
#' custom data processing workflows.
#'
#' @details
#' The functions generated by these factories typically expect a `dataset`
#' (e.g., a `duckplyr` with annotations) as their first argument
#' and may accept additional arguments via `...` (though often unused in the
#' predefined steps).
#'
#' - `make_default_preprocessing()` and `make_default_postprocessing()` assemble
#' a list of such processing functions.
#' - The individual `make_exclude_columns()`, `make_productive_filter()`, and
#' `make_barcode_prefix()` functions create specific transformation steps.
#'
#' These steps are often used when reading data to standardize formats, filter
#' unwanted records, or enrich information like cell barcodes. They are designed
#' to gracefully handle cases where an operation is not applicable (e.g., a specified
#' column is not found) by issuing a warning and returning the dataset unmodified.
#'
#' @section Functions:
#' * `make_default_preprocessing()`: Creates a default list of preprocessing
#' functions suitable for "airr" or "10x" formatted data. This typically
#' includes steps to exclude unnecessary columns and filter for productive sequences.
#' * `make_default_postprocessing()`: Creates a default list of postprocessing
#' functions, such as adding a prefix to cell barcodes.
#' * `make_exclude_columns()`: Creates a function that, when applied to a
#' dataset, removes a specified set of columns.
#' * `make_productive_filter()`: Creates a function that filters a dataset
#' to retain only rows where sequences are marked as productive, based on
#' a specified column and set of "truthy" values.
#' * `make_barcode_prefix()`: Creates a function that prepends a prefix
#' (sourced from a specified column in the dataset) to the cell barcodes.
#'
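#' @param format For `make_default_preprocessing()`, a character string specifying
#' the input data format: `"default"` (the default), `"airr"`, or `"10x"`.
#' This determines which set of columns is excluded: `imd_drop_cols("universal")`
#' for `"default"`, `imd_drop_cols("airr")` for `"airr"`, and
#' `imd_drop_cols("10x")` for `"10x"`. The values considered "productive" are
#' the same for every format.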
#' @param cols For `make_exclude_columns()`, a character vector of column names
#' to be removed from the dataset. Defaults to `imd_drop_cols("airr")`.
#' If empty, the returned function will not remove any columns.
#' @param col_name For `make_productive_filter()`, a character vector of potential
#' column names that indicate sequence productivity (e.g., `"productive"`).
#' The first matching column found in the dataset will be used.
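#' @param truthy For `make_productive_filter()`, a value or vector of values
#' that signify a productive sequence in the `col_name` column.
#' Defaults to the logical `TRUE`. Values are converted with `as.character()`
#' and compared with the column's values as strings, so a character vector of
#' accepted spellings (e.g., `c("true", "TRUE", "True", "t", "T", "1")`) can be
#' given; `make_default_preprocessing()` passes such a vector for every format.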
#' @param prefix_col For `make_barcode_prefix()`, the name of the column in the
#' dataset that contains the prefix string to be added to each cell barcode.
#' Defaults to `"Prefix"`. The barcode column itself is identified internally
#' via `imd_schema("barcode")`.
#'
#' @return
#' Each `make_*` function returns a *new function*. This returned function takes
#' a `dataset` as its first argument and `...` for any additional arguments,
#' and performs the specific processing step.
#' `make_default_preprocessing()` and `make_default_postprocessing()` return a
#' *named list* of such functions.
#'
#' @seealso
#' [read_repertoires()]
#'
#' @concept processing
#' @rdname preprocess_postprocess
#' @export
make_default_preprocessing <- function(format = c("default", "airr", "10x")) {
  # Assemble the default named list of preprocessing steps for a given input
  # format: one step that drops format-specific columns and one that keeps
  # only productive rows.
  format <- match.arg(format)

  # Spellings accepted as "productive"; shared by every format.
  productive_values <- c(
    "TRUE", "True", "true", "T", "t",
    "YES", "Yes", "yes", "Y", "y",
    "1"
  )

  # The format only decides which drop-column set is requested; the
  # "default" format maps onto the "universal" set.
  drop_key <- switch(format,
    default = "universal",
    airr = "airr",
    "10x" = "10x"
  )

  list(
    exclude_columns = make_exclude_columns(imd_drop_cols(drop_key)),
    filter_nonproductive = make_productive_filter(truthy = productive_values)
  )
}
#' @rdname preprocess_postprocess
#' @export
make_default_postprocessing <- function() {
  # Assemble the default named list of postprocessing steps; currently a
  # single step that prefixes cell barcodes.
  steps <- list()
  steps$prefix_barcodes <- make_barcode_prefix()
  steps
}
#' @rdname preprocess_postprocess
#' @export
make_exclude_columns <- function(cols = imd_drop_cols("airr")) {
  # Build a step that drops the columns named in `cols` from a dataset.
  # Names absent from the dataset are ignored (any_of), and an empty `cols`
  # makes the step a no-op.
  function(dataset, ...) {
    # Nothing to drop: hand the dataset back untouched.
    if (!length(cols)) {
      return(dataset)
    }
    select(dataset, -any_of(cols))
  }
}
#' @rdname preprocess_postprocess
#' @export
make_productive_filter <- function(col_name = c("productive"),
                                   truthy = TRUE) {
  # Build a step that keeps only the rows whose productivity column matches
  # one of the `truthy` values.
  #
  # col_name: one or more candidate column names, in priority order. The
  #   first candidate that exists in the dataset is the one filtered on.
  # truthy:   value(s) marking a productive row; compared as strings.
  #
  # Returns a function(dataset, ...) that yields the filtered dataset, or the
  # dataset unchanged (after a cli warning) when no candidate column exists.
  checkmate::assert_character(col_name, any.missing = FALSE, min.len = 1)

  # Convert once here rather than on every call; this also forces the lazily
  # evaluated `truthy` argument at construction time.
  truthy <- as.character(truthy)

  fun <- function(dataset, ...) {
    # Candidates first, so the caller's order decides which column wins.
    matched <- intersect(col_name, colnames(dataset))
    if (length(matched) == 0) {
      cli::cli_alert_warning("No columns with the productive specification found; skipping the filtering")
      return(dataset)
    }

    # Exactly one column is filtered on, even if several candidates exist.
    source_col <- rlang::sym(matched[[1]])
    # Temporary helper column holding the text form of the flag.
    flag_col <- rlang::sym(paste0("imd_", matched[[1]]))

    # NOTE(review): concatenating with "" presumably coerces the flag to text
    # so it can be compared with the character `truthy` values -- confirm
    # against the definition of `dd`.
    dataset <- dataset |>
      mutate(!!flag_col := dd$concat(!!source_col, ""))

    # A single accepted value uses `==`; several use `%in%`.
    if (length(truthy) == 1) {
      dataset <- dataset |> filter(!!flag_col == truthy)
    } else {
      dataset <- dataset |> filter(!!flag_col %in% truthy)
    }

    # Drop the helper column so the output schema matches the input.
    dataset |> select(-!!flag_col)
  }
  fun
}
#' @rdname preprocess_postprocess
#' @export
make_barcode_prefix <- function(prefix_col = "Prefix") {
  # Build a step that prepends the value of a prefix column to the barcode
  # column (whose name comes from imd_schema("barcode")). When none of the
  # `prefix_col` candidates exists in the dataset, the step warns and returns
  # the dataset unchanged.
  checkmate::assert_character(prefix_col)

  function(dataset, ...) {
    # First candidate present in the dataset, or NA when there is none.
    hit <- intersect(prefix_col, colnames(dataset))[1]

    if (is.na(hit)) {
      # `{prefix_col}` is interpolated by cli from the factory argument.
      cli::cli_alert_warning("No column '{prefix_col}' with barcode prefixes found in the data; skipping the barcode processing")
      return(dataset)
    }

    barcode_col <- imd_schema("barcode")
    prefix_sym <- rlang::sym(hit)
    barcode_sym <- rlang::sym(barcode_col)

    # Overwrite the barcode column in place with prefix + barcode.
    mutate(
      dataset,
      {{ barcode_col }} := dd$concat(!!prefix_sym, !!barcode_sym)
    )
  }
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.