R/ps_filter.R

Defines functions tax_filter_zeros ps_filter

Documented in ps_filter

#' Filter phyloseq samples by sample_data variables
#'
#' Keep only samples with sample_data matching one or more conditions.
#' By default this function also removes taxa which never appear in any of
#' the remaining samples, i.e. taxa whose abundances sum to zero across those
#' samples. You can prevent this taxa filtering with .keep_all_taxa = TRUE.
#'
#' Use ps_filter as you would use dplyr::filter(), but with a phyloseq object!
#'
#' @param ps phyloseq object
#' @param ...
#' passed directly to dplyr::filter (see examples and ?dplyr::filter)
#' @param .target which slot of phyloseq to use for filtering by,
#' currently only "sample_data" supported
#' @param .keep_all_taxa if FALSE (the default),
#' remove taxa which are no longer present in the dataset after filtering
#'
#' @return phyloseq object (with filtered sample_data)
#' @export
#'
#' @seealso \code{\link[dplyr]{filter}} explains better how to give arguments to this function
#' @seealso \code{\link{tax_filter}} for filtering taxa (not samples)
#'
#' @examples
#' library(phyloseq)
#' library(dplyr)
#'
#' data("enterotype", package = "phyloseq")
#' enterotype
#' sample_data(enterotype)[1:10, 1:5]
#'
#' # keep only samples with seqtech not equal to sanger
#' ps1 <- ps_filter(enterotype, SeqTech != "Sanger")
#' ps1
#' sample_data(ps1)[1:10, 1:5]
#'
#' # keep only samples with no NAs in any variables
#' ps2 <- enterotype %>% ps_filter(!if_any(everything(), is.na))
#' ps2
#' sample_data(ps2)[1:8, 1:8]
#'
#' # ps2 is equivalent to dropping samples with incomplete sample_variables and tax_filtering 0s
#' ps3 <- enterotype %>%
#'   ps_drop_incomplete() %>%
#'   tax_filter(undetected = 0, use_counts = FALSE)
#' # we needed to set a low detection threshold because this example data is proportions
#' identical(ps2, ps3) # TRUE
#'
#' # function will give warning if some of the otu_values are negative
#' # (which may happen when filtering data that has e.g. clr-transformed taxa abundances)
#' # as it attempts to discard any taxa that become always absent/0 after filtering (by default)
#' # set .keep_all_taxa = TRUE to avoid this filtering behaviour, which is unwanted in this case
#' enterotype %>%
#'   tax_transform("clr") %>%
#'   ps_get() %>%
#'   ps_filter(SeqTech == "Sanger", .keep_all_taxa = TRUE)
ps_filter <- function(ps,
                      ...,
                      .target = "sample_data",
                      .keep_all_taxa = FALSE) {
  check_is_phyloseq(ps, argName = "ps")

  # Guard: sample_data is the only slot that can currently be filtered on.
  # TODO: see if it is useful to facilitate
  # filtering by variables in other phyloseq slots
  if (!identical(.target, "sample_data")) {
    stop("Only .target = 'sample_data', has been implemented so far.")
  }

  # Apply the user's dplyr::filter() conditions to the sample metadata,
  # then write the surviving rows back into the phyloseq object.
  # NOTE(review): presumably the sample_data<- replacement is what drops the
  # filtered-out samples from the other slots too -- confirm in phyloseq docs.
  kept_samples <- dplyr::filter(samdatAsDataframe(ps), ...)
  phyloseq::sample_data(ps) <- kept_samples

  # Caller asked to keep every taxon (anything other than a literal FALSE
  # disables the pruning step), so hand the object back as it is.
  if (!isFALSE(.keep_all_taxa)) {
    return(ps)
  }

  # Otherwise discard taxa whose counts (or relative abundances) now sum to
  # zero across the remaining samples.
  tax_filter_zeros(ps)
}

# Internal helper (used by ps_filter here, and also by ps_join).
# Drops every taxon whose abundances sum to zero across all samples,
# i.e. those where phyloseq::taxa_sums(ps) == 0.
# Emits an explanatory warning first if the otu_table holds any negative
# values, because a zero sum is then no longer a reliable sign of absence.
tax_filter_zeros <- function(ps) {
  # Negative abundances (e.g. after a clr transformation) make the
  # "sums to zero" criterion questionable, so alert the user before pruning.
  has_negative_values <- any(phyloseq::otu_table(ps) < 0)
  if (has_negative_values) {
    warning(paste0(
      "Removing taxa whose abundance across filtered samples is equal to zero.",
      "\nThis may not result in the desired outcome, ",
      "as some values in the otu_table are negative.",
      "\nAvoid performing transformations, ",
      "e.g. clr, before using `ps_filter()`, or set .keep_all_taxa = TRUE "
    ))
  }

  # Logical mask over taxa: TRUE for taxa to keep (non-zero total abundance).
  taxa_to_keep <- phyloseq::taxa_sums(ps) != 0
  phyloseq::prune_taxa(taxa = taxa_to_keep, x = ps)
}
david-barnett/microViz documentation built on April 17, 2025, 4:25 a.m.