R/phyloseq_filter.R

Defines functions phyloseq_filter_top_taxa_range phyloseq_filter_top_taxa phyloseq_filter_sample_wise_abund_trim phyloseq_filter_prevalence phyloseq_filter_taxa_tot_fraction phyloseq_filter_taxa_rel_abund phyloseq_richness_filter

Documented in phyloseq_filter_prevalence phyloseq_filter_sample_wise_abund_trim phyloseq_filter_taxa_rel_abund phyloseq_filter_taxa_tot_fraction phyloseq_filter_top_taxa phyloseq_filter_top_taxa_range phyloseq_richness_filter

#' @title Remove samples from phyloseq object that have less than n taxa
#'
#' @param physeq A phyloseq-class object
#' @param mintaxa Minimum number of taxa that should be present in a sample (default, 10)
#'
#' @return Trimmed phyloseq object (All samples will have >= N taxa)
#' @export
#'
#' @examples
#' data("esophagus")
#' esophagus
#' phyloseq_richness_filter(esophagus, mintaxa = 30)
#' phyloseq_richness_filter(esophagus, mintaxa = 100)
#'
phyloseq_richness_filter <- function(physeq, mintaxa = 10){

  ## Estimate number of OTUs per sample
  sp <- phyloseq::estimate_richness(physeq, measures = "Observed")
  samples_to_keep <- rownames(sp)[ which(sp$Observed >= mintaxa) ]

  if(length(samples_to_keep) == 0){
    stop("All samples will be removed.\n")
  }

  if(length(samples_to_keep) == phyloseq::nsamples(physeq)){
    cat("All samples will be preserved\n")
    res <- physeq
  }

  if(length(samples_to_keep) < phyloseq::nsamples(physeq)){
    res <- phyloseq::prune_samples(samples = samples_to_keep, x = physeq)
  }

  return(res)
}



#' @title Remove taxa with small mean relative abundance.
#'
#' @param physeq A phyloseq-class object
#' @param frac The minimum cutoff for the relative OTU abundance
#' @details This function searches for taxa with small mean relative abundance and removes them. Result will be returned with original counts in the abundance table.
#' @return Phyloseq object with a subset of taxa.
#' @export
#'
#' @examples
#' data("esophagus")
#' phyloseq_filter_taxa_rel_abund(esophagus, frac = 0.01)
#' phyloseq_filter_taxa_rel_abund(esophagus, frac = 0.1)
#'
phyloseq_filter_taxa_rel_abund <- function(physeq, frac = 1e-4){

  # require(phyloseq)

  ## Transform OTU counts to relative abundance
  rel <- phyloseq::transform_sample_counts(physeq, function(x) x / sum(x) )

  ## Filter OTUs
  rel.subs <- phyloseq::filter_taxa(rel, function(x){ mean(x) > frac }, prune = FALSE)

  ## if prune = TRUE
  # tn <- taxa_names(rel.subs)              # OTUs to preserve
  # tr <- setdiff(taxa_names(physeq), tn)   # OTUs to remove

  ## Taxa to remove
  tr <- names(rel.subs)[ which(rel.subs == FALSE) ]

  ## If all taxa should be removed
  if(length(tr) == phyloseq::ntaxa(physeq)){
    stop("Error: all taxa will be removed with the specified 'frac' cutoff.\n")
  }

  ## If there is nothing to remove
  if(length(tr) == 0){
    res <- physeq
    cat("Warning: no taxa removed.\n")
  }

  ## Keep taxa which satisfies the truncation threshold
  if(length(tr) > 0){
    res <- phyloseq::prune_taxa(taxa = rel.subs, physeq)
  }

  return(res)
}




#' @title Remove taxa with abundance less then a certain fraction of total abundance.
#'
#' @param physeq A phyloseq-class object
#' @param frac The minimum cutoff for the OTU abundance in the table. This number is a fraction, not a percent.
#' @details
#' If frac = 0.0001, this will retain all OTU's that have at least a 0.01\% total abundance in the OTU table.
#' If you wanted to retain OTUs with at least 1\% total abundance, you must specify, 0.01.
#'
#' @return Phyloseq object with a subset of taxa.
#' @export
#' @seealso http://qiime.org/scripts/filter_otus_from_otu_table.html
#' @examples
#' data("esophagus")
#' phyloseq_filter_taxa_tot_fraction(esophagus, frac = 0.01)
#'
phyloseq_filter_taxa_tot_fraction <- function(physeq, frac = 0.01){

  # require(phyloseq)

  ## Estimate total abundance of OTUs
  tot <- sum(phyloseq::taxa_sums(physeq))

  ## Remove OTUs
  res <- phyloseq::filter_taxa(physeq, function(x){ ( sum(x)/tot ) > frac }, prune = TRUE)
  return(res)
}




#' @title Filter low-prevalence OTUs.
#' @description This function will remove taxa (OTUs) with low prevalence, where prevalence is the fraction of total samples in which an OTU is observed.
#' @param physeq A phyloseq-class object
#' @param prev.trh Prevalence threshold (default, 0.05 = 5\% of samples)
#' @param abund.trh Abundance threshold (default, NULL)
#' @param threshold_condition Indicates type of prevalence and abundance conditions, can be "OR" (default) or "AND"
#' @param abund.type Character string indicating which type of OTU abundance to take into account for filtering ("total", "mean", or "median")
#' @details
#' Abundance threshold defines if the OTU should be preserved if its abundance is larger than threshold (e.g., >= 50 reads).
#' Parameter "threshold_condition" indicates whether OTU should be kept if it occurs in many samples AND/OR it has high abundance.
#' @return  Phyloseq object with a subset of taxa.
#' @seealso \code{\link{phyloseq_prevalence_plot}}
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' GlobalPatterns  # 19216 taxa
#'
#' # OTUs that are found in at least 5% of samples
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.05, abund.trh = NULL)  # 15389 taxa
#'
#' # The same, but if total OTU abundance is >= 10 reads it'll be preserved too
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.05, abund.trh = 10, threshold_condition = "OR")  # 15639 taxa
#'
#' # Include only taxa with more than 10 reads (on average) in at least 10% samples
#' phyloseq_filter_prevalence(GlobalPatterns, prev.trh = 0.1, abund.trh = 10, abund.type = "mean", threshold_condition = "AND")  # 4250 taxa
#'
phyloseq_filter_prevalence <- function(physeq, prev.trh = 0.05, abund.trh = NULL, threshold_condition = "OR", abund.type = "total"){

  ## Threshold validation
  if(prev.trh > 1 | prev.trh < 0){ stop("Prevalence threshold should be non-negative value in the range of [0, 1].\n") }
  if(!is.null(abund.trh)){ 
    if(abund.trh <= 0){ stop("Abundance threshold should be non-negative value larger 0.\n") }
  }

  # ## Check for the low-prevalence species (compute the total and average prevalences of the features in each phylum)
  # prevdf_smr <- function(prevdf){
  #   plyr::ddply(prevdf, "Phylum", function(df1){ 
  #     data.frame(
  #       Average = mean(df1$Prevalence),
  #       Total = sum(df1$Prevalence))
  #     })
  # }
  # prevdf_smr( prevalence(physeq) )

  ## Check the prevalence threshold
  # phyloseq_prevalence_plot(prevdf, physeq)

  ## Define prevalence threshold as % of total samples
  ## This function is located in 'phyloseq_prevalence_plot.R' file
  prevalenceThreshold <- prev.trh * phyloseq::nsamples(physeq)

  ## Calculate prevalence (number of samples with OTU) and OTU total abundance
  prevdf <- prevalence(physeq)

  ## Get the abundance type
  if(abund.type == "total") { prevdf$AbundFilt <- prevdf$TotalAbundance }
  if(abund.type == "mean")  { prevdf$AbundFilt <- prevdf$MeanAbundance }
  if(abund.type == "median"){ prevdf$AbundFilt <- prevdf$MedianAbundance }

  ## Which taxa to preserve
  if(is.null(abund.trh)) { tt <- prevdf$Prevalence >= prevalenceThreshold }
  if(!is.null(abund.trh)){
    ## Keep OTU if it either occurs in many samples OR it has high abundance
    if(threshold_condition == "OR"){
      tt <- (prevdf$Prevalence >= prevalenceThreshold | prevdf$AbundFilt >= abund.trh)
    }

    ## Keep OTU if it occurs in many samples AND it has high abundance
    if(threshold_condition == "AND"){
      tt <- (prevdf$Prevalence >= prevalenceThreshold & prevdf$AbundFilt >= abund.trh)
    }
  }

  ## Extract names for the taxa we whant to keep
  keepTaxa <- prevdf$Taxa[ tt ]

  ## Execute prevalence filter
  res <- phyloseq::prune_taxa(taxa = keepTaxa, x = physeq)
  return(res)
}



#' @title Filter rare OTUs based on minimum abundance threshold.
#' @description This function performs sample-wise OTU abundance trimming.
#' @param physeq A phyloseq-class object
#' @param minabund Abundance threshold (default, 10)
#' @param relabund Logical; perform trimming based on relative abundances (default, FALSE)
#' @param rm_zero_OTUs Logical, remove OTUs with zero total abundance
#' @details 
#' OTUs can be considered as rare if they comprise fewer than X (e.g., 10) sequences within a sample. 
#' This function is intented to censore OTU abundance (unsing an arbitrary threshold) on a sample-wise basis. 
#' 
#' Trimming can be performed based on relative abundances of OTUs within a sample (`relabund = TRUE`), but the orginal OTU count will be returned. 
#' For this purpose `minabund` parameter should be provided in a range of (0,1] (e.g., use `minabund = 0.1, relabund = TRUE` to remove OTUs with relative abundance < 10% in each sample).
#' 
#' @return Phyloseq object with a filtered data.
#' @export
#'
#' @examples
#' # Load data
#' data(GlobalPatterns)
#'
#' # Trim GlobalPatterns data (19216 taxa) by removing OTUs with less that 10 reads
#' GP1 <- phyloseq_filter_sample_wise_abund_trim(GlobalPatterns, minabund = 10) # 10605 taxa
#'
#' # Trim GlobalPatterns data by removing OTUs with relative abundance less than 1%
#' GP2 <- phyloseq_filter_sample_wise_abund_trim(GlobalPatterns, minabund = 0.01, relabund = TRUE) # 258 taxa
#'
#' # Compare raw and trimmed data
#' phyloseq_summary(GlobalPatterns, GP1, GP2, cols = c("GlobalPatterns", "Trimmed 10 reads", "Trimmed 1 percent"))
#'
phyloseq_filter_sample_wise_abund_trim <- function(physeq, minabund = 10, relabund = FALSE, rm_zero_OTUs = TRUE){

  ## Censore OTU abundance
  if(relabund == FALSE){     # trim based on absolute OTU counts

    res <- phyloseq::transform_sample_counts(physeq, function(OTU, ab = minabund){ ifelse(OTU <= ab,  0, OTU) })

  } else {                   # trim based on relative abundances within sample, but return original counts

    if(!minabund > 0 & minabund <= 1){
      stop("Error: for relative abundance trimmin 'minabund' should be in (0,1] interval.\n")
    }

    ## Convert data to relative abundances
    res <- phyloseq_standardize_otu_abundance(physeq, method = "total")

    ## Remove relative abundances less than the threshold value
    res <- phyloseq::transform_sample_counts(res, function(OTU, ab = minabund){ ifelse(OTU <= ab,  0, OTU) })

    ## Sample sums and data orientation
    smps <- phyloseq::sample_sums(physeq)
    if(phyloseq::taxa_are_rows(physeq) == TRUE){
      mar <- 2
    } else {
      mar <- 1
    }

    ## Convert back to counts by multiplying relative abundances by sample sums
    phyloseq::otu_table(res) <- phyloseq::otu_table(
      sweep(x = phyloseq::otu_table(res), MARGIN = mar, STATS = smps, FUN = `*`),
      taxa_are_rows = phyloseq::taxa_are_rows(physeq))
  }

  ## Remove zero-OTUs
  if(rm_zero_OTUs == TRUE){
    res <- phyloseq::prune_taxa(taxa_sums(res) > 0, res)
  }
  return(res)
}



#' @title Extract the most abundant taxa.
#' @param physeq A phyloseq-class object
#' @param perc Percentage of the most abundant taxa to retain
#' @param n Number of the most abundant taxa to retain (this argument will override perc argument)
#' @return Phyloseq object with a filtered data.
#' @export
#'
#' @examples
#'
phyloseq_filter_top_taxa <- function(physeq, perc = 10, n = NULL){

  ## Arguments validation
  if(perc <= 0 | perc > 100){ stop("Error: percentage should be in 1-100 range.\n") }

  ## Get total abundances for all taxa
  taxx <- sort(phyloseq::taxa_sums(physeq), decreasing = TRUE)

  ## Find how many taxa to preserve (if percentage is specified)
  if(is.null(n)){
    n <- phyloseq::ntaxa(physeq) * perc / 100
    n <- floor(n)
  }

  ## Extract names for the taxa that should be preserved
  keepTaxa <- names(taxx)[1:n]

  ## Extract this taxa
  physeq_pruned <- phyloseq::prune_taxa(keepTaxa, physeq)

  return(physeq_pruned)
}



#' @title Check the range of the top-taxa filtering values to determine the optimal threshold.
#' @description This function performs taxa filtering by retaining the most abundant taxa.
#' A range of abundance percentages (5 - 95\%) will be explored.
#' @param physeq A phyloseq-class object
#' @param show_plot Logical; if TRUE, shows the plot on screen
#' @return ggplot-object.
#' @export
#'
#' @examples
#'
phyloseq_filter_top_taxa_range <- function(physeq, show_plot = TRUE){
  percs <- seq(5, 95, 5)

  fr <- plyr::mlply(.data = data.frame(perc = percs), .fun = function(...){ phyloseq_filter_top_taxa(physeq, ...) })
  names(fr) <- percs

  fr_tab <- plyr::ldply(.data = fr, .fun = function(z){
    sz <- phyloseq::sample_sums(z)
    res <- data.frame(Sample = names(sz), Preserved = sz)
    return(res)
  })

  pp <- ggplot(data = fr_tab, aes(x = perc, y = Preserved, group = Sample)) +   # color = Sample
    geom_vline(xintercept=75, color="grey", linetype = "longdash") +
    geom_line() +
    geom_point() +
    labs(x = "Number of most abundant taxa retained, %", y = "Percentage of total sample abundance") +
    theme(legend.position = "none")

  if(show_plot == TRUE){ print(pp) }
  invisible(pp)
}
vmikk/metagMisc documentation built on Feb. 14, 2024, 2:29 a.m.