speedyseq: Faster implementations of phyloseq functions

Documented in tax_glom tip_glom tree_glom

# Attribution: The documentation and coding strategy for `tax_glom()` and
# `tip_glom()` are modified from phyloseq
# https://github.com/joey711/phyloseq/blob/master/R/transform_filter-methods.

#' Agglomerate taxa using taxonomy.
#'
#' This method merges species that have the same taxonomy at a certain
#' taxonomic rank. Its approach is analogous to [tip_glom()], but
#' uses categorical data instead of a tree. In principal, other categorical
#' data known for all taxa could also be used in place of taxonomy, but for the
#' moment, this must be stored in the `taxonomyTable` of the data. Also,
#' columns/ranks to the right of the rank chosen to use for agglomeration will
#' be replaced with `NA`, because they should be meaningless following
#' agglomeration.
#'
#' This is the speedyseq reimplementation of `phyloseq::tax_glom()`. It should
#' produce results that are identical to phyloseq up to taxon order. 
#'
#' If `x` is a phyloseq object with a phylogenetic tree, then the new taxa will
#' be ordered as they are in the tree. Otherwise, the taxa order can be
#' controlled by the `reorder` argument, which behaves like the `reorder`
#' argument in [base::rowsum()]. `reorder = FALSE` will keep taxa in
#' the original order determined by when the member of each group first appears
#' in `taxa_names(x)`; `reorder = TRUE` will order new taxa alphabetically
#' according to taxonomy (string of concatenated rank values).
#'
#' Acknowledgements: Documentation and general strategy derived from
#' `phyloseq::tax_glom()`.
#'
#' @param physeq (Required). [phyloseq-class()] or [tax_table()].
#' @param taxrank A character string specifying the taxonomic level that you
#' want to agglomerate over. Should be among the results of
#' `rank_names(physeq)`. The default value is `rank_names(physeq)[1]`, which
#' may agglomerate too broadly for a given experiment. You are strongly
#' encouraged to try different values for this argument.
#' @param NArm (Optional). Logical, length equal to one. Default is `TRUE`.
#' CAUTION. The decision to prune (or not) taxa for which you lack categorical
#' data could have a large effect on downstream analysis.  You may want to
#' re-compute your analysis under both conditions, or at least think carefully
#' about what the effect might be and the reasons explaining the absence of
#' information for certain taxa. In the case of taxonomy, it is often a result
#' of imprecision in taxonomic designation based on short phylogenetic
#' sequences and a patchy system of nomenclature.  If this seems to be an issue
#' for your analysis, think about also trying the nomenclature-agnostic
#' [tip_glom()] method if you have a phylogenetic tree available.
#' @param bad_empty (Optional). Character vector. Default: `c(NA, "", " ",
#' "\t")`. Defines the bad/empty values that should be ignored and/or
#' considered unknown. They will be removed from the internal agglomeration
#' vector derived from the argument to `tax`, and therefore agglomeration will
#' not combine taxa according to the presence of these values in `tax`.
#' Furthermore, the corresponding taxa can be optionally pruned from the output
#' if `NArm` is set to `TRUE`.
#' @param reorder Logical specifying whether to reorder the taxa by taxonomy
#' strings or keep initial order. Ignored if `physeq` has a phylogenetic tree.
#' 
#' @return A taxonomically-agglomerated, optionally-pruned, object with class
#' matching the class of `physeq`.
#'
#' @seealso
#' [phyloseq::tax_glom()]
#' 
#' [tip_glom()]
#' 
#' [prune_taxa()]
#' 
#' [merge_taxa_vec()]
#' 
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' # print the available taxonomic ranks
#' colnames(tax_table(GlobalPatterns))
#' # agglomerate at the Family taxonomic rank
#' (x1 <- tax_glom(GlobalPatterns, taxrank="Family"))
#' # How many taxa before/after agglomeration?
#' ntaxa(GlobalPatterns); ntaxa(x1)
tax_glom <- function(physeq, 
                     taxrank = rank_names(physeq)[1],
                     NArm = TRUE, 
                     bad_empty = c(NA, "", " ", "\t"),
                     reorder = FALSE) {
  if (is.null(access(physeq, "tax_table"))) {
    stop("`tax_glom()` requires that `physeq` contain a taxonomy table")
  }
  if (!taxrank[1] %in% rank_names(physeq)) {
    stop("Bad `taxrank` argument. Must be among the values of `rank_names(physeq)`")
  }
  rank_idx <- which(rank_names(physeq) %in% taxrank[1])
  # if NArm is TRUE, remove taxa whose value for taxrank is in bad_empty
  if (NArm) {
    bad_taxa <- tax_table(physeq)[, rank_idx] %in% bad_empty
    physeq <- prune_taxa(!bad_taxa, physeq)
  }
  # For each taxon, make a string containing its full taxonomy, which will
  # define the groups to be merged
  tax_strings <- apply(
    tax_table(physeq)[, 1:rank_idx, drop = FALSE],
    1, 
    function(x) {paste(x, collapse=";")}
  )
  # Merge taxa with speedyseq's vectorized merging function
  physeq <- merge_taxa_vec(physeq, tax_strings, tax_adjust = 0L, 
    reorder = reorder)
  # "Empty" the taxonomy values to the right of the rank, using NA_character_.
  if (rank_idx < length(rank_names(physeq))) {
    bad_ranks <- seq(rank_idx + 1, length(rank_names(physeq)))
    tax_table(physeq)[, bad_ranks] <- NA_character_
  }
  physeq
}

#' Agglomerate taxa using phylogeny-derived distances.
#' 
#' All tips of the tree separated by a phylogenetic cophenetic distance smaller
#' than `h` will be agglomerated into one taxon.
#' 
#' Can be used to create a non-trivial OTU Table, if a phylogenetic tree is
#' available.
#'
#' By default, simple "greedy" single-linkage clustering is used. It is
#' possible to specify different clustering approaches by setting `hcfun` and
#' its parameters in `...`. In particular, complete-linkage clustering appears
#' to be used more commonly for OTU clustering applications.
#'
#' The merged taxon is named according to the "archetype" defined as the the
#' most abundant taxon (having the largest value of `taxa_sums(physeq)`. The
#' tree and refseq objects are pruned to the archetype taxa.
#'
#' Speedyseq note: [stats::hclust()] is faster than the default `hcfun`; set
#' `method = "average"` to get equivalent clustering.
#'
#' Acknowledgements: Documentation and general strategy derived from
#' `phyloseq::tip_glom()`.
#'
#' @param physeq (Required). A [phyloseq-class()], containing a phylogenetic
#' tree. Alternatively, a phylogenetic tree [ape::phylo()] will also work.
#' @param h (Optional). Numeric scalar of the height where the tree should be
#' cut. This refers to the tree resulting from hierarchical clustering of the
#' distance matrix, not the original phylogenetic tree. Default value is `0.2`.
#' @param hcfun (Optional). A function. The (agglomerative, hierarchical)
#' clustering function to use. The default is [cluster::agnes()] for phyloseq
#' compatiblity.
#' @param tax_adjust 0: no adjustment; 1: phyloseq-compatible adjustment; 2:
#' conservative adjustment (see [merge_taxa_vec()] for details)
#' @param ... (Optional). Additional named arguments to pass to `hcfun`. 
#'
#' @return An instance of the [phyloseq-class()].  Or alternatively, a
#' [phylo()] object if the `physeq` argument was just a tree.  In the
#' expected-use case, the number of OTUs will be fewer (see [ntaxa()]), after
#' merging OTUs that are related enough to be called the same OTU. 
#'
#' @seealso 
#'
#' [phyloseq::tip_glom()]
#'
#' [tree_glom()] for direct phylogenetic merging
#' 
#' [merge_taxa_vec()]
#' 
#' [cluster::agnes()]
#' 
#' [stats::hclust()]
#' 
#' [castor::get_all_pairwise_distances()]
#' 
#' [ape::phylo()]
#'
#' @export
#'
#' @examples 
#' data("esophagus")
#' esophagus <- prune_taxa(taxa_names(esophagus)[1:25], esophagus)
#' plot_tree(esophagus, label.tips="taxa_names", size="abundance", 
#'   title="Before tip_glom()")
#' plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", 
#'   size="abundance", title="After tip_glom()")
#' 
#' # *speedyseq only:* Demonstration of different `tax_adjust` behaviors
#' data(GlobalPatterns)
#' 
#' set.seed(20190421)
#' ps <- prune_taxa(sample(taxa_names(GlobalPatterns), 2e2), GlobalPatterns)
#'
#' ps1 <- tip_glom(ps, 0.1, tax_adjust = 1)
#' ps2 <- tip_glom(ps, 0.1, tax_adjust = 2)
#' tax_table(ps1)[c(108, 136, 45),]
#' tax_table(ps2)[c(108, 136, 45),]
tip_glom <- function(physeq, 
                     h = 0.2, 
                     # hcfun = hclust, # Need method = "average" for compat
                     hcfun = cluster::agnes,
                     tax_adjust = 1L,
                     ...) {
  tree <- access(physeq, "phy_tree")
  if (is.null(tree)) {
    stop("`tip_glom()` requires that physeq contain a phylogenetic tree")
  }
  d <- castor::get_all_pairwise_distances(
    tree,
    only_clades = taxa_names(tree),
    check_input = FALSE
  )
  rownames(d) <- colnames(d) <- taxa_names(tree)
  d <- stats::as.dist(d)
  psclust <- stats::cutree(stats::as.hclust(hcfun(d, ...)), h = h)
  merge_taxa_vec(physeq, psclust, tax_adjust = tax_adjust)
}

#' Agglomerate taxa at a given phylogenetic resolution.
#'
#' This function merges taxa that are sufficiently closely related in the tree
#' in `phy_tree(physeq)`. It is similar to `tip_glom()`, except that it uses
#' the tree directly, rather than as the basis for hierarchical clustering.
#'
#' This function uses [castor::collapse_tree_at_resolution()] from
#' the castor package to determine the groups of taxa to be collapsed; see this
#' function for details about the `resolution` and `criterion` parameters.
#'
#' New taxa are named according to the most abundant taxon of each group. The
#' tree, reference sequences, and (by default) the taxonomy table reflect these
#' "archetype" taxa.
#'
#' @param physeq [phyloseq-class()] or [ape::phylo()].
#' @param resolution Phylogenetic resolution at which to merge taxa. 
#' @param criterion Criterion for determining whether to collapse taxa. See
#' [castor::collapse_tree_at_resolution()] for details.
#' @param tax_adjust 0: no adjustment; 1: phyloseq-compatible adjustment; 2:
#' conservative adjustment (see [merge_taxa_vec()] for details)
#'
#' @return A `physeq` object with new taxa reflecting the phylogenetically
#' merged groups.
#'
#' @seealso
#' [castor::collapse_tree_at_resolution()] for more information about the
#' `resolution` and `criterion` parameters.
#'
#' [merge_taxa_vec()] for more about `tax_adjust` and general merging behavior
#'
#' [tip_glom()] for indirect phylogenetic merging
#'
#' [tax_glom()] for merging using taxonomy
#' 
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' ps1 <- subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
#' ntaxa(ps1)
#' ps2 <- tree_glom(ps1, 0.05)
#' ntaxa(ps2)
#' 
#' suppressPackageStartupMessages(library(dplyr))
#' suppressPackageStartupMessages(library(ggtree))
#' suppressPackageStartupMessages(library(cowplot))
#'
#' plot1 <- phy_tree(ps1) %>%
#'   ggtree +
#'   geom_tiplab() +
#'   geom_label(aes(x = branch, label = round(branch.length, 4)))
#' plot2 <- phy_tree(ps2) %>%
#'   ggtree +
#'   geom_tiplab() +
#'   geom_label(aes(x = branch, label = round(branch.length, 4)))
#' plot_grid(plot1, plot2)
tree_glom <- function(physeq, 
                      resolution, 
                      criterion = "max_tip_depth",
                      tax_adjust = 1L) {
  tree <- access(physeq, "phy_tree")
  if (is.null(tree)) {
    stop("`tree_glom()` requires that `physeq` contain a phylogenetic tree")
  }

  # Find the groups of taxa to collapse ---------------------------------------
  # Collapse the tree (results include info about the collapsed groups)
  collapsed <- castor::collapse_tree_at_resolution(
    tree, 
    resolution, 
    by_edge_count = FALSE, 
    shorten = FALSE,
    # `rename_collapsed_nodes = TRUE` can give errors
    rename_collapsed_nodes = FALSE,
    criterion = criterion
  )

  # If no collapsed nodes, return physeq as is
  if (length(collapsed$collapsed_nodes) == 0)
    return(physeq)

  # Data frame mapping collapsed tips to group names
  collapsed_tbl <- tibble::tibble(
    collapsed_node = collapsed$collapsed_nodes
    ) %>%
    dplyr::mutate(
      collapsed_tip = purrr::map(collapsed_node,
        ~castor::get_subtree_at_node(tree, .)$subtree$tip.label
      )
    ) %>%
    tidyr::unnest(collapsed_tip) %>%
    # Name the group by the first taxon, rather than the node name, to ensure no
    # name conflicts. Archetypes are computed in merge_taxa_vec later on.
    dplyr::group_by(collapsed_node) %>%
    dplyr::mutate(
      group = utils::head(collapsed_tip, 1)
    ) %>%
    dplyr::ungroup()

  # Compute merged ps object --------------------------------------------------
  # Will use merge_taxa_vec with a `group` vector reflecting the
  # phylogenetically similar groups; this will automatically prune the
  # reference sequences and tree to the archetypes
  group <- tibble::tibble(taxon = taxa_names(physeq)) %>%
    dplyr::left_join(collapsed_tbl, 
      by = c("taxon" = "collapsed_tip")) %>%
    dplyr::mutate(group = ifelse(is.na(group), taxon, group)) %>%
    dplyr::select(taxon, group) %>%
    tibble::deframe()
  merge_taxa_vec(physeq, group, tax_adjust = tax_adjust)
}