R/taxa_counts.R
In rsahmi: Single-Cell Analysis of Host-Microbiome Interactions

Documented in taxa_counts

#' Quantitation of microbes
#' @description After identifying true taxa, reads assigned to those taxa are
#' extracted and then undergo a series of filters. The cell barcode and UMI are
#' used to demultiplex the reads and create a barcode x taxa counts matrix. The
#' full taxonomic classification of all resulting barcodes and the number of
#' counts assigned to each clade are tabulated.
#'
#' @param umi_list A list of polars [DataFrame][polars::DataFrame_class] for UMI
#' data returned by [prep_dataset].
#' @param samples A character of sample identifier for each element in
#' `umi_list`.
#' @seealso <https://github.com/sjdlabgroup/SAHMI>
#' @return A polars [DataFrame][polars::DataFrame_class].
#' @examples 
#' \dontrun{
#' # `umi_list` should be the output of all samples from `prep_dataset()`, and 
#' # filtered by `slsd()` and `blsd()`
#' taxa_counts(umi_list, basename(names(umi_list)))
#' }
#' @export
taxa_counts <- function(umi_list, samples = NULL) {
    use_polars()
    columns <- c("sample", "barcode", "taxid", "taxa", "rank")
    if (is.list(umi_list)) {
        if (!is.null(samples) && length(samples) != length(umi_list)) {
            cli::cli_abort(
                "{.arg samples} must have the same length of {.arg umi_list}"
            )
        }
        samples <- samples %||% names(umi_list) %||% seq_along(umi_list)
        umi_list <- .mapply(function(umi, sample) {
            umi$with_columns(sample = pl$lit(sample))
        }, list(umi = umi_list, sample = samples), NULL)
        umi <- pl$concat(umi_list, how = "vertical")
    } else if (polars::is_polars_df(umi_list)) {
        if (!is.null(samples)) {
            if (!is_scalar(samples)) {
                cli::cli_abort(
                    "{.arg samples} must be a scalar string for a single umi"
                )
            }
            umi <- umi_list$with_columns(sample = pl$lit(samples))
        } else {
            columns <- setdiff(columns, "sample")
            umi <- umi_list
        }
    } else {
        cli::cli_abort(paste(
            "{.arg umi_list} must be a polars {.cls DataFrame}",
            "or a list of polars {.cls DataFrame}"
        ))
    }

    # create barcode umi data -----------------------------------
    counts <- umi$lazy()$
        with_columns(pl$col(pl$List(pl$String))$list$join("|"))$
        group_by(c(columns, "taxon", "ranks"))$
        agg(pl$col("umi")$n_unique())$
        with_columns(pl$col("ranks", "taxon")$str$split("|"))$
        #     with_columns(
        #     pl$col("ranks", "taxon")$list$eval(
        #         pl$int_range(0L, pl$element()$len())$add(1L)
        #     )$name$suffix("_len")
        # )$
        #     explode("^.+_len$")$
        #     with_columns(
        #     pl$col("ranks")$list$slice(0L, length = pl$col("ranks_len")),
        #     pl$col("taxon")$list$slice(0L, length = pl$col("taxon_len"))
        # )$
        #     drop("^.+_len$")$
        with_row_index("index")$
        explode(pl$col("ranks", "taxon"))$
        with_columns(
        # A rank code, indicating (U)nclassified, (R)oot, (D)omain,
        # (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or
        # (S)pecies. Taxa that are not at any of these 10 ranks have a rank
        # code that is formed by using the rank code of the closest ancestor
        # rank with a number indicating the distance from that rank. E.g.,
        # "G2" is a rank code indicating a taxon is between genus and
        # species and the grandparent taxon is at the genus rank.
        pl$col("ranks")$
            str$replace("^R", "root")$
            str$replace("^D", "domain")$
            str$replace("^K", "kingdom")$
            str$replace("^P", "phylum")$
            str$replace("^C", "class")$
            str$replace("^O", "order")$
            str$replace("^F", "family")$
            str$replace("^G", "genus")$
            str$replace("^S", "species")
    )$
        collect()$
        pivot(
        values = "taxon", index = c("index", columns, "umi"),
        columns = "ranks"
    )$
        drop("index")

    # relocate columns ----------------------------
    columns <- list(pl$col(columns))
    all_nms <- counts$columns
    for (taxa in c("root", "domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")) {
        pattern <- sprintf("^%s\\d*$", taxa)
        if (any(grepl(pattern, all_nms, perl = TRUE))) {
            columns <- c(columns, list(pl$col(pattern)))
        }
    }
    counts$select(c(columns, list(pl$col("umi")$alias("counts"))))
}