R/microbiome_functions.R

Defines functions physeq.create

Documented in physeq.create

#Microbiome Tools

# Load Packages -----------------------------------------------------------
library(tidyverse)
library(scales)
library(phyloseq)
library(cowplot)
library(gridGraphics)

# Function: Create phyloseq object ----------------------------------------


#' Create phyloseq object
#'
#' Function to create phyloseq obejct, with additional quality control measures to ensure input dataframes are correctly aligned for phyloseq.
#'
#' @param asvdf Dataframe containing ASV sequence count data for each sample. Rows are sequences and column are samples. Rownames must be set to sequence ID..
#' @param taxdf Dataframe contaningg taxonomic classification of each ASV.  Rows are sequences and columns are samples. Rownames must be set to sequence ID.
#' @param mapdf Dataframe containing sample meta data (e.g clinical variables). Rows are samples and columns are metadata variables (e.g age, sex, clinical variables, etc.)
#' @param ID.col #String. Name of the column in mapdf that contains sample IDs. (E.g \code{"sample.ids"})
#' @param export.asvsp #\code{TRUE} or \code{FALSE} whether to export a dataframe that contains ASV sequences and their corresponding species ID (spID) auto-generated by phyloseq
#' @param export.wd #Directory path as a string (e.g \code{"~/output"}. Location of directory where dataframe will be exported if export.asvp = TRUE.
#'
#' @return Phyloseq object.
#' @export
#'
#' @examples dat <- physeq.create(asvdf = asv, taxdf = taxanomy, mapdf = metadata, ID.col = "sample_ID", export.asvsp = FALSE)
#'
physeq.create <- function(asvdf, #Dataframe; Rownames = sequences (e.g CCTGTT...) . Columns = sequencing counts.
                          taxdf, #Dataframe; Rownames = sequences (e.g CCTGTT...). Columns = Sequence taxonomic identification (Kingdom, Phylum, Class, Order, Family Genus)
                          mapdf, #Dataframe; No specific rownames required. Rows = Samples, Columns = Sample metadata (e.g depression scores, sex, age, etc.)
                          ID.col, #name of the column containing study IDs
                          export.asvsp = TRUE, #export CSV file that maps phyloseq assigned species ID (sp####) to ASV sequences (CCTGGT...)
                          export.wd  = getwd() #directory exports will be saved
                          )
  {
  # === Directory ===
  #Get original directory
  og.dir <- getwd()

  #Set new export directory
  setwd(export.wd)

  #=== Wrangling ===
  #Standardize ID column
  mapdf <- mapdf %>% dplyr::rename(StudyID = paste(ID.col))

  #=== asvdf ===
  #Remove asvdf rownames
  seqs = rownames(asvdf)
  rownames(asvdf) = NULL

  # === Taxdf  ===
  #Unit test: taxdf is aligned with asvdf
  qc.physeq1 <- all(rownames(taxdf) == seqs) #pass = TRUE
  if(qc.physeq1 == FALSE){
    print("Error: taxdf and asvdf rows are not aligned")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }



  #Remove taxdf rownames for physeq formatting
  taxdf.asv <- taxdf #save ASV key
  rownames(taxdf) = NULL #remove rows

  # Unit test:
  #Taxa nrow = ASV seq nrow
  qc.physeq2 <- nrow(taxdf) == nrow(asvdf) #pass = TRUE
  if(qc.physeq2 == FALSE){
    print("Error: taxdf and asvdf have different number of rows")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }





  #=== mapdf ===
  #Unit test:
  #nrow of mapdf$StudyID is equal to ncol of asvdf
  #Checks that there are same number of StudyIDs in mapdf and asvdf
  qc.physeq3 <- nrow(mapdf) == ncol(asvdf)  #pass = TRUE
  if(qc.physeq3 == FALSE){
    print("Error: mismatched number of samples between asvdf and mapdf" )
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }


  #Unit test:
  #Set of StudyIDs in asvdf (cols) are also in mapdf (rows)
  qc.physeq4 <- all(colnames(asvdf) %in% mapdf$StudyID) %>% all #pass = TRUE
  qc.physeq5 <- all(mapdf$StudyID %in% colnames(asvdf)) %>% all #pass = TRUE
  if(qc.physeq4 == FALSE | qc.physeq5 == FALSE){
    print("Error: mismatched sample IDs between asvdf and mapdf" )
    print("Samples in asvdf but not mapdf:")
    qc.df <- data.frame(colnames(asvdf)) %>% filter(!(colnames.asvdf. %in% mapdf$StudyID)) #show all samples in asvdf except those shared between asvdf and mapdf
    print(qc.df)

    print("Samples in mapdf but not asvdf:")
    qc.df <- data.frame(mapdf$StudyID) %>% filter(!(mapdf.StudyID %in% colnames(asvdf))) #show all samples in mapdf except those shared between asvdf and mapdf

    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }

  #StudyID becomes rownames
  rownames(mapdf) = mapdf$StudyID


  #Unit test:
  #Set of StudyIDs in asvdf (cols) are also in mapdf (rownames)
  qc.physeq6 <- all(rownames(mapdf) %in% colnames(asvdf)) %>% all #pass = TRUE
  qc.physeq7 <- all(colnames(asvdf) %in% rownames(mapdf)) %>% all #pass = TRUE
  if(qc.physeq6 == FALSE | qc.physeq7 == FALSE){
    print("Error: set of asvdf colnames is different from set of mapdf rownames" )
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }

  #=== Create phyloseq object ===
  # Convert the ASV and taxonomy data frames to matrices.
  tax_mat = as.matrix(taxdf)
  asv_mat = as.matrix(asvdf)

  dat = phyloseq(otu_table(asv_mat, taxa_are_rows = TRUE), tax_table(tax_mat), sample_data(mapdf))
  str(dat)




  # === Export ASV -> phyloseq sp ID# conversion key ===
  taxdf.asv #taxa df without rows (ASV sequences) removed

  if(export.asvsp == TRUE){
    #Unit test:
    #Physeq and taxdf taxonomic assignments are in the same order
    qc.asv_sp <- (dat %>% tax_table() %>% data.frame() %>% replace(is.na(.), "UNKNOWN") == taxdf %>% replace(is.na(.), "UNKNOWN")) %>%
      as.logical %>% as.data.frame() %>% dplyr::rename(qc = ".")
    if(unique(qc.asv_sp) %>% length != 1 | unique(qc.asv_sp)[1,1] != TRUE){
      print("Error: taxdf.asv and phyloseq object are not aligned.")
      print("Running UNDECLARED to kill script.")
      UNDECLARED()

  }

    #Create asv_sp key
    print(paste("Phyloseq taxa table (dat) and ASV sequence taxa df (taxdf.asv) aligned with original taxa df (taxdf) :", unique(qc.asv_sp)))
    print(paste0("ASV to phyloseq species ID (sp) conversion key created and exported to: ", export.wd, "/asv_spCode.csv"))
    sp_asv <- (taxdf.asv %>% rownames) %>% data.frame() %>% dplyr::rename(asv_seq = ".") %>% mutate(sp = dat %>% tax_table %>% rownames()) %>% relocate(sp, asv_seq)

    #Export
    write.csv(sp_asv, "asv_spCode.csv", row.names = FALSE)
  }

  #=== Remove host mitochondrial sequences ===
  dat2 = subset_taxa(dat,
                     !(Kingdom == 'Eukaryota' |
                         is.na(Phylum) |
                         (!is.na(Family) &
                            Family == 'Mitochondria')))


  # === Directory ===
  #Reset working directory
  setwd(og.dir)

  # === Return phyloseq object ===
  return(dat2)
}



# Function: Change taxa names (sp ID#, full taxonomic name) ------------------------------------------------
#Get sp ID and taxonomic identification from phyloseq object (df)

#' Species ID taxonomoc classification
#'
#' Add full taxonomic classification to species ID auto-generated by Phyloseq
#' @param physeq Phyloseq object
#' @param taxa.col String of taxonomic levels to include in classification. Taxonomic levels must exist in the input phyloseq object. If no input given, defaults to all levels in phyloseq object.
#' @param remove TRUE or FALSE. Are there characters that need to be removed after the spID in the taxa rowname to isolate the spID ? If TRUE, spID is extracted (e.g sp1_bacteria_bacteroidetes extracted to sp1)
#' @param remove.delimiter Character string that separates sp ID from remaining OTU name characters (e.g sp1_bacteria_bacteroidetes delimiter is "_")
#'
#'
#' @return Returns the phyloseq object with taxonomic classification as part of the taxa name (i.e. phyloseq OTU table rownames now contain taxonomy).
#' @export
#'
#' @examples sp_taxa_ID(dat) #Add full taxonomic label to rownames (i.e. sp###_Kingdom_Phylum_Class_Order_Family_Genus)
#' sp_taxa_ID(physeq = dat, taxa.col = c("Family, "Genus)) #Add full taxonomic label to rownames (i.e. sp###_Family_Genus)
sp_taxa_ID <- function(physeq, #phyloseq object
                       taxa.col = NA, #Character vector; taxonomic levels to be included in phyloseq taxa ID (e.g c("Family", "Genus")). If no input given, defaults to all levels in physeq object
                       remove = FALSE, #If TRUE, sp ID needs to be extracted from other information (e.g sp1_bacteria_bacteroidetes extracted to sp)
                       remove.delimiter = "_" #character to separate sp ID from remaining OTU name (e.g sp1_bacteria_bacteroidetes delimiter is "_")
                       )
{
  asvtab <- physeq %>% tax_table %>% data.frame %>% rownames_to_column(var = "OTU")

  #Isolate spID if remove = TRUE
  if(remove == TRUE){
    delim <- paste0(remove.delimiter, ".*")
    asvtab <- asvtab %>% mutate(OTU = gsub(x = OTU, pattern = delim, replacement = ""))
  }


  #Set taxa names if not givene()
  if(is.na(taxa.col[1])){
    taxa.col <- rank_names(physeq)
  }

  tname.paste <- c("OTU", taxa.col)


  #List of phyloseq sp IDs and their taxonomic assignment
  asvtaxa <- asvtab[,c("OTU", taxa.col)] %>% unique() %>%
    mutate(tname = do.call(paste, c(asvtab[tname.paste], sep = "_")), #Create new col tname with OTU and full taxonomic details
           OTU_order = parse_number(OTU) %>% as.numeric()) %>%
    dplyr::select(-OTU_order)


  #Unit test:
  #asvtaxa ASV (colname = OTU) aligns with phyloseq object ASV rownames


  if(remove == FALSE){
    qc.taxa_names <- (asvtaxa$OTU == (tax_table(physeq) %>% row.names())) %>% data.frame() %>% filter(FALSE %in% .) %>% nrow()}

  if(remove == TRUE){
    qc.taxa_names <- ((asvtaxa$OTU %>% gsub(pattern = delim, replacement = "")) == (tax_table(physeq) %>% row.names() %>% gsub(pattern = delim, replacement = ""))) %>%
      data.frame() %>% filter(FALSE %in% .) %>% nrow()
  }


  if(qc.taxa_names == 0){
    print("QC passed, full ASV taxonomic names will be added to phyloseq object")
  }

  if(qc.taxa_names != 0){
    print("Error: OTU order is not aligned. Running undeclared to kill script")
    UNDECLARED()
  }

  #Add new taxa names to phyloseq object
  taxa_names(physeq) <- asvtaxa$tname
  print(head(asvtaxa$tname)) #print example of new taxa names for user


  return(physeq)
}



full_taxonomic_name <- function(physeq #input phyloseq object
                                )
  #Returns dataframe containing the full taxonomic name for each spID
  #Assumes that at minimum, the spID can be found in the rownames
{
  tax.df <- as(access(physeq, "tax_table"), "matrix") %>% data.frame
  tax.df$taxonomic.name <- do.call(paste, c(tax.df[1:ncol(tax.df)], sep = "_"))
  out <- tax.df %>% select(taxonomic.name) %>% mutate(spID = gsub(x = rownames(.), pattern = "_.*", replacement = "")) %>% relocate(spID, .before = taxonomic.name)
  return(out)
}




# Functions: relative abundance   -------------------------------------------
#phyloseq counts -> phyloseq relative abundance and wide ASV dataframe
#' Convert counts to relative abundance.
#'
#' Converts taxa counts stored in phyloseq object into taxa relative abundance
#' @param physeq input phyloseq object
#' @param warning.in \code{TRUE} or \code{FALSE}. If total relative abundance for a sample does not equal 1, should an error be issued?
#'
#' @return Returns both a phyloseq object and the OTU table as a dataframe where counts are replaced by relative abundance.
#' @export
#'
#' @examples
#'
physeq.convert.rel <- function(physeq, warning.in = TRUE)
  {
  dat_rel <- transform_sample_counts(physeq,function(x) x/sum(x))

  rel <- physeqrel.dataframe(dat_rel, warning.in)
  return(list(dat_rel, rel))
}



#phyloseq relative abundance -> wide ASV dataframe
physeqrel.dataframe <- function(physeq, #relative abundance phyloseq object
                                warning = TRUE #if relative abundance is not 1, issue a warning instead of killing script. TRUE = issue warning, FALSE = kill script.
                                )
  #input relative abundance phyloseq object
  #ouput relative abundance ASV table (rel)
  {

  #Unit test: is it relative abundance data?
  qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum
  if(qc > 1){
    print("ERROR: first sample looks like count data. Input must be relative abundance data")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }



  #Psmelt relative abundanc physeq
  rel <- psmelt(physeq)


  #Unit test:
  #Total relative abundance sums to 1
  #Returns warning. Script will continue even if test fails, unless warning = FALSE
  qc <- rel %>% group_by(Sample) %>% dplyr::summarize(total_abundance = sum(Abundance))
  qc.min <- min(qc$total_abundance) %>% round(digits = 10) %>% trunc
  qc.max <- max(qc$total_abundance) %>% round(digits = 10) %>% trunc
  if(qc.min != 1 | qc.max != 1){
    print(paste0("Min total rel abund:", qc.min, ". Max total rel abund: ", qc.max))
    if(warning == TRUE){
    warning("Relative abundance does not sum to 1 for all samples.")
    print("Total relative abundance summary:")
    print(summary(qc))
    }

    if(warning == FALSE){
      print("Relative abundance does not sum to 1 for all samples.")
      print("Total relative abundance summary:")
      print(summary(qc))

      print("Running UNDECLARED to kill script.")
      UNDECLARED()
    }

  }

  rel <- rel %>% dplyr::select(c(OTU,Sample, Abundance, StudyID)) %>%
    arrange(Sample, desc(Abundance))%>%
    group_by(Sample) %>%
    tidyr::pivot_wider(values_from = Abundance, names_from = OTU) %>%
    replace(is.na(.), 0) %>% ungroup() %>% dplyr::select(-Sample)

  return(rel)
}

#Get mean relative abundance for each taxa
rel.mean.extract <- function(rel){
  relmean <- rel %>% select(-StudyID) %>% summarise_all(mean) %>% mutate(stat = "mean") %>%
    pivot_longer(!stat, names_to = "OTU", values_to = "mean") %>% dplyr::select(-stat)

  return(relmean)
}

#Get taxa prevalence
rel.prev.extract <- function(rel){
  relprev <- rel %>% dplyr::select(-StudyID) %>%
    mutate_all(function(x) (ifelse(x>0, 1, 0))) %>% #converts taxa presence as binary (1 = yes, 0 = no)
    summarise_all(mean) %>%
    mutate(stat = "prev") %>% pivot_longer(!stat, names_to = "OTU", values_to = "prev") %>% dplyr::select(-stat)

  return(relprev)
}

#Combine mean relative abundance and prevalence into one dataframe
rel.criteria.extract <- function(relmean, relprev){
  rel.criteria <- relmean %>% left_join(relprev, by = "OTU") %>% dplyr::rename(mean_relabund = mean)
  summary(rel.criteria %>% dplyr::select(-OTU))
  return(rel.criteria)
}


#Mean relative abundance and prevalence data exploration
mra.prev.vis <- function(rel.criteria, #output from rel.criteria.extract
                         export_prefix  #file prefix
)
  #Exports density and scatter plots of prevalence and mean relative abundance to files in the current wd
  {
  density(log10(rel.criteria$mean_relabund)) %>% plot(main = "Log 10 Mean Relative Abundance") #density
  p1 <- recordPlot()
  ggdraw(p1)

  density(rel.criteria$prev) %>% plot(main = "Prevalence")
  p2 <- recordPlot()
  ggdraw(p2)



  #Scatter
  p <- ggplot(rel.criteria, aes(x = prev, y = mean_relabund)) + geom_point() +
    theme_classic() + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                                    labels = trans_format("log10", math_format(10^.x))) +
    annotation_logticks(sides = "l")

  #Export
  tiff(paste0(export_prefix, "_mra_prev.tiff"), height = 500, width = 1800, res = 90)
  print(cowplot::plot_grid(p1, p2, p, ncol = 3))
  dev.off()
}





#' Relative abundance Summary and Conversion
#'
#' A comprehensive relative abundance function for phyloseq objects. Includes output of relative abundance summary statistics (mean relative abundance and prevalence), both as a dataframe and with an added option for plotting. The function accepts input of count or relative abundance data.
#'
#' If count data is input, it will be first converted to relative abundance. The relative abundance phyloseq object and OTU table will also be returned.
#'
#' @param physeq Input phyloseq object
#' @param type One of \code{"counts"} or \code{"relabund"}. Describes whether input phyloseq object currently contains raw counts or relative abundance.
#' @param vis.export \code{TRUE} or \code{FALSE}. If \code{"TRUE"}, the function save plots to the working directory describing distribution of taxa relative abundance and prevalence.
#' @param export_prefix.in Provide a prefix (string) for summary plots if vis.export = \code{TRUE}
#'
#' @return List containing [1] mean relative abundance and prevalence for each taxa (dataframe) [2] Relative abundance OTU table (dataframe) [3] Relative abundance phyloseq object.
#' If vis.export = TRUE Will also export summary plots of mean relative abundance and prevalence distributions to working directory
#' @export
#'
#' @examples
#' physeq.mra.prevalence(physeq = dat, type = "counts", vis.export = TRUE, export_prefix.in = "test_counts") #with export
#' physeq.mra.prevalence(physeq = dat_rel, type = "relabund") #input phyloseq object with relative abundance
physeq.mra.prevalence <- function(physeq,
                                  type, #one of: "counts" or "relabund"
                                  vis.export = TRUE, #export mra/prev density function and scatter plots
                                  export_prefix.in #prefix for export file names
                                  )
  #Summary function that uses other microbiome relative abundance functions
  #Input phyloseq object of type: "counts" or "relabund"
  #Outputs taxa mean relative abundance and taxa prevalence (rel.criteria [[1]]), rel abund ASV table [[2]], and abundance phyloseq object (dat_rel [[3]])
  #Option to export mean relative abundance and prevalence visualization plots (mra/prev probability density functions/distribution, scatter plots)
  {
  #Unit test: type has valid input
  if(type != "counts" & type != "relabund"){
    print("Invalid type input.")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()

  }

  #quality control object for determining if relative abundance or count data was input
  qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum


  if(type == "counts"){
    #Unit test: is it count data?
    if(qc < 1){
      print("ERROR: first sample looks like relative abundance data. Current input is count data")
      print("Running UNDECLARED to kill script.")
      UNDECLARED()

    }

    #Convert phyloseq counts to relative abundance phyloseq and ASV table
    rel.results <- physeq.convert.rel(physeq = physeq)

    #Extract mean relative abundance and prevalence
    rel <- rel.results[[2]]
    relmean <- rel.mean.extract(rel = rel)
    relprev <- rel.prev.extract(rel = rel)
    rel.criteria <- rel.criteria.extract(relmean, relprev)

    #dat_rel
    dat_rel <- rel.results[[1]]
  }
  if(type == "relabund"){

    #Unit test: is it relative abundance data?
    if(qc > 1){
      print("ERROR: first sample looks like count data. Current input is relative abundance data")
      print("Running UNDECLARED to kill script.")
      UNDECLARED()
    }

    #Extract mean relative abundance and prevalence
    rel <- physeqrel.dataframe(physeq)
    relmean <- rel.mean.extract(rel = rel)
    relprev <- rel.prev.extract(rel = rel)
    rel.criteria <- rel.criteria.extract(relmean, relprev)

    #dat_rel
    dat_rel <- physeq
  }


  #Visualize and export current mra and prevalence
  if(vis.export == TRUE){
    mra.prev.vis(rel.criteria = rel.criteria, export_prefix = export_prefix.in)
  }

  return(list(rel.criteria, #1 = mean relative abundance and prevalence of each taxa
              rel, #2 = relative abundance ASV table
              dat_rel #3 = relative abundance phyloseq object
              ))
}

# Function: Glom algorithm -----------------------------------------------
#' Retain resolve agglomeration
#'
#' Execute retain-resolve agglomeration on ASV count data stored in phyloseq object. \cr \cr
#' Filter are required for at the ASV and genus-level for:
#' \enumerate{
#' \item mean relative abundance + prevalence
#' \item {prevalence only}
#' } \cr
#' Users wishing to set thresholds for only one criteria \emph{(e.g mra + prev \strong{or} prev only)} can set the parameters for the unused filter to 0. Thresholds for ASV and genus-level filtering may need to be optimized for each dataset using summary plots exported when \code{export = TRUE}. \cr \cr
#' \emph{\strong{Exporting quality control data to file (\code{export = TRUE}) is highly recommend.}} \cr
#'
#'
#' @param physeq input phyloseq object containing taxa counts
#' @param mra_prev.ASV threshold for mean relative abundance + prevalence filter. Both minimum values must be met for ASV taxa to be retained. Input numeric vector: \code{c(mra, prev)}. Default is 0.001 mean relative abundance and 0.1 prevalence.
#' @param prev.ASV threshold for prevalence filter for ASV to be retained. Input numeric value. Default is 0.5 prevalence.
#' @param mra_prev.genus threshold for mean relative abundance + prevalence filter. Both minimum values must be met for agglomerated genus-level taxa to be retained. Input numeric vector: \code{c(mra, prev)}. Default is 0.0001 mean relative abundance and 0.1 prevalence.
#' @param prev.genus threshold for prevalence filter for agglomerated genus-level taxa to be retained. Input numeric value. Default is 0.5 prevalence.
#' @param export \code{TRUE} or \code{FALSE}. If \code{TRUE}, quality control data and additional phyloseq objects will be exported to a specified directory.
#' @param dir  Input filepath as string for storage location of exported files if export = \code{TRUE}.
#'
#' @return
#' \strong{Phyloseq objects:} \cr
#' dat_glom: retain-resolved ASV and  genus-level taxa count data. Other taxa is not included. \cr \cr
#' dat_glom_aldex: retain-resolved ASV and  genus-level taxa count data. Other taxa classification is included such that total abundances are unchanged from original count data after genus-level filtering.
#' Recommended use for downstream applications where calculations use total abundance (e.g CLR transformation, relative abundance). \cr \cr
#' dat_glom_rel: retain-resolved ASV and  genus-level taxa relative abundance data. Calculated with dat_glom_aldex. Other taxa has been removed after relative abundance calculation.  \cr \cr
#' \cr
#' \strong{Dataframes:} \cr
#' glom.spID.taxonomt: key for taxonomic level (ASV or genus) of each taxonomic ID (e.g )
#'
#' @export
#'
#' @examples
#' output <- retain.resolve_genus(physeq = dat, export = TRUE, dir = "filepath/export_files") #use default thresholds
#'
#' output <- retain.resolve_genus(physeq = dat, export = TRUE, dir = "~filepath/export_files", mra_prev.ASV = c(0.005, 0.25), prev.ASV = 0.75, mra_prev.genus = c(0.001, 0.1), prev.genus = 0.5) #manually set thresholds for genus and ASV-level filters
#'

retain.resolve_genus <- function(physeq, #count phyloseq object
                                 dir = NA,
                                 export, #TRUE or FALSE
                                 mra_prev.ASV = c(0.001, 0.1), #threshold for mean relative abundance [1] AND prevalence [2]. Taxa must pass both criteria for retention. ASV level filter.
                                 prev.ASV = 0.5, #threshold for prevalence-only. ASV level filter.
                                 mra_prev.genus = c(0.0001, 0.1), #threshold for mean relative abundance [1] AND prevalence [2]. Taxa must pass both criteria for retention. genus level filter. ,
                                 prev.genus = 0.5 #threshold for prevalence-only. Genus level filter.
                                 )

  #If taxa pass relabund_prev or prev filters, they will be retained.
  {
  print("QC: Assessing user input.")

  # ====== Unit tests: User input ======
  #Unit test: was export input as a logical variable?
  if(class(export) != "logical"){
    print("Error: export must be TRUE or FALSE")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()

  }


  #Unit test: if user wants to export data, is there a working directory provided?
  if(export == TRUE & is.na(dir)){
    print("Error: please provide a directory path to export files.")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }


  #Unit test: did user input count data?
  qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum
  if(qc < 1){
    print("ERROR: first sample looks like relative abundance data. Only count data is accepted")
    print("Running UNDECLARED to kill script.")
    UNDECLARED()
  }



  #Unit test: is relative abundance 1 for input physeq?
    #If total relative abundance is not 1, will kill script
  physeq.convert.rel(physeq = physeq,
                     warning.in = FALSE)



  # ====== Set new directory and create subdirectories ======

  #dir = grandparent directory
  #retain_resolve.path-resolve = parent directory
  print("Prepare: Creating export directories")

  #Create retain-resolve (parent)
  if(export == TRUE){
    og.dir <- getwd()
    setwd(dir)
    newdir <- "retain-resolve"
    dir.create(newdir)
    retain_resolve.path <- paste0(dir, "/", newdir)
    setwd(retain_resolve.path)

    print(retain_resolve.path)


    #Abundance
    newdir <- "TotalAbundance"
    dir.create(newdir)
    abund_diff.path <- paste0(retain_resolve.path, "/", newdir)

    #Vis
    newdir <- "plot_AbundSummary"
    dir.create(newdir)
    vis.path <- paste0(retain_resolve.path, "/", newdir)

    #Taxa lists
    newdir <- "taxa_lists"
    dir.create(newdir)
    taxa_lists.path <- paste0(retain_resolve.path, "/", newdir)


    #mra_prev
    newdir <- "mra_prevalence"
    dir.create(newdir)
    mra_prev.path <- paste0(retain_resolve.path, "/", newdir)
  }

  # ====== Unfiltered dataset - mra and prevalence ======
  print("Step 1: Create dat_empty. Phyloseq object with absent taxa (mean relative abundance = 0) removed.")
  dat <- physeq

  if(export == TRUE){
    setwd(vis.path)
  }
  mra.prev.results <- physeq.mra.prevalence(dat, type = "counts", vis.export = export, export_prefix.in = "original")

  if(export == TRUE){
    setwd(retain_resolve.path)
  }
  #mra.prev.results[[1]] = mean relative abundance and prevalence of each taxa (rel.criteria)
  #mra.prev.results[[2]] = relative abundance ASV table (rel)
  #mra.prev.results[[3]] = relative abundance phyloseq object (dat_rel)

  rel.criteria <- mra.prev.results[[1]]
  rel <-  mra.prev.results[[2]]
  dat_rel_unfiltered <- mra.prev.results[[3]]


  #Export and save unfiltered data
  if(export == TRUE){
    setwd(mra_prev.path)
    write.csv(file = "mraprev_original.csv", x = rel.criteria, row.names = FALSE)
    setwd(retain_resolve.path)
  }

  #Save original (unfiltered/non-agglomerated) taxa mra and prevalence as object
  mraprev_original <- rel.criteria


  # === Identify and remove empty taxa ===
  #List of empty taxa (prev = 0)
  emptytaxa <- rel.criteria %>% filter(prev == 0) %>% pull(OTU)
  emptytaxa.df <- rel %>% dplyr::select(all_of(emptytaxa)) %>% summarise_all(mean) %>% mutate(dummy = "dummy") %>% pivot_longer(!dummy, names_to = "OTU", values_to = "mean_abund") %>% select(-dummy)
  summary(emptytaxa.df %>% dplyr::select(-OTU))



  #Pruned phyloseq object - empty taxa removed
  emptytaxa.prune <- rel.criteria %>% filter(!(OTU %in% emptytaxa))
  dat_empty <- prune_taxa(emptytaxa.prune$OTU, dat)

  #Log results
  log.taxa <- data.frame("level" = "empty",
                         "pass" = rel.criteria %>% filter(!(OTU %in% emptytaxa)) %>% nrow(),
                         "fail" = length(emptytaxa))



  #Extract mean relative abundance and prevalence - empty taxa removed
  if(export == TRUE){
    setwd(vis.path)
  }
  mra.prev.results <- physeq.mra.prevalence(dat_empty,
                                            type = "counts", vis.export = export, export_prefix.in = "rm-empty")
  if(export == TRUE){
    setwd(retain_resolve.path)
  }

  #mra and prevalence for each ASV
  rel.criteria <- mra.prev.results[[1]]

  #Save dat_empty_rel
  dat_empty_rel <- mra.prev.results[[3]]






  # ====== Identify and separate ASVs that pass/fail criteria ======
  print("Step 2: Identify ASVs that pass or fail criteria.")

  pass <- rel.criteria %>% filter(mean_relabund > mra_prev.ASV[1] & prev > mra_prev.ASV[2] | prev > prev.ASV) %>% dplyr::select(OTU) #Taxa that do not need to be glommed (meet criteria)
  fail <- rel.criteria %>% filter(!OTU %in% pass$OTU & !OTU %in% emptytaxa.df$OTU) %>% dplyr::select(OTU) #Taxa that require glom (below abund/prevalence thresholds) & with empty taxa removed

  #Log results
  log.taxa.asv <- data.frame("level" = "ASV",
                             "pass" = nrow(pass),
                             "fail" = nrow(fail))
  log.taxa <- rbind(log.taxa, log.taxa.asv)

  #Pass criteria phyloseq
  dat_g1_pass <- prune_taxa(pass$OTU, dat_empty)

  #Fail criteria phyloseq
  dat_g1_fail <- prune_taxa(fail$OTU, dat_empty)

  #Export pass/failed taxa
  fail.tax.preglom <- as(access(dat_g1_fail, "tax_table"), "matrix")[, 6] %>% as.data.frame() %>% dplyr::rename(genus = ".")
  pass.tax <- as(access(dat_g1_pass, "tax_table"), "matrix")[, 6] %>% as.data.frame() %>% dplyr::rename(genus = ".")

  if(export == TRUE){
    setwd(taxa_lists.path)
    write.csv(x = fail.tax.preglom, file = "failASV.csv")
    write.csv(x = pass.tax, file = "passASV.csv")
    setwd(retain_resolve.path)
  }

  # ====== Resolve to genus ======
  print("Step 3: Agglomerate failed ASVs to genus.")

  #Glom
  dat_g1_fail_glom <- tax_glom(dat_g1_fail, NArm = FALSE, taxrank = "Genus")

  #Postglom taxa list
  tax.postglom <- dat_g1_fail_glom %>% tax_table %>% as.data.frame() %>% mutate(taxa_full = paste(Kingdom, Phylum, Class, Order, Family, Genus, sep = "_")) %>% rownames_to_column("sp")

  #Export post-glom taxa list
  if(export == TRUE){
    setwd(taxa_lists.path)
    write.csv(tax.postglom, "retainresolve_taxonomy_prefilter.csv", row.names = FALSE)
    setwd(retain_resolve.path)
  }


  #Merge retained ASVs and resolved genus'
  dat_retres_prefilter <- merge_phyloseq(dat_g1_pass, dat_g1_fail_glom) #relative abundance should sum to 1 for each sample

  # ====== Unit tests: Total abundance post-glom ======
  #Abundance difference between dat_empty and dat_resolved_unfiltered
  abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
                      dat_retres_prefilter %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
  colnames(abund_diff) <- c("original_abund", "glom_abund")
  abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))

  print("Abundance difference between original dataframe (dat_empty) and unfiltered retain-resolved: ")
  print(abund_diff %>% head)
  print("Note: Other = count difference between total sums")

  if(export == TRUE){
    setwd(abund_diff.path)
    write.csv(x = abund_diff, file = "abundDiff_glomprefilter.csv")
    setwd(retain_resolve.path)
  }

  #Save prefilter agglomerated abundance difference to R object
  abundDiff_glomprefilter <- abund_diff

  #Unit test: is abund_diff 0 for all samples? I.e. no counts went missing during the glom.
  qc.abund_diff <- abund_diff %>% filter(Other != 0)
  if(nrow(qc.abund_diff) != 0){
    print("Error: at least 1 sample has a different total abundance after agglomerating")
    print("Samples of concern:")
    print(qc.abund.diff)

    print("Running UNDECLARED to kill script.")
    UNDECLARED()

  }


  # ====== Filter genus that fail criteria ======
  print("Step 4: Filter genus-level taxa.")
  #Merged ASV and genus relative abundance mra and prevalence
  if(export == TRUE){
    setwd(vis.path)
  }
  mra.prev.results <- physeq.mra.prevalence(physeq = dat_retres_prefilter,
                                            type = "counts",
                                            vis.export = export,
                                            export_prefix.in = "glom_prefilter")
  if(export == TRUE){
    setwd(retain_resolve.path)
  }

  rel.criteria <- mra.prev.results[[1]] %>% #relative abundance is calculated with both ASV and genus-level taxa
    filter(OTU %in% (dat_g1_fail_glom %>% tax_table %>% data.frame %>% rownames)) #but only include taxa that are genus-level for physeq pruning

  #Export rel.criteria
  if(export == TRUE){
    setwd(mra_prev.path)
    write.csv(x = rel.criteria, file = "mraprev_glomgenus.csv", row.names = FALSE)
    setwd(retain_resolve.path)
  }

  #Save mra and prevalence dataset for all taxa (ASV and genus-level) after genus-level glom of non-retained ASVs
  mraprev_glomgenus <- rel.criteria

  #Identify genus-level taxa that pass/fail genus-level criteria
  pass <- rel.criteria %>% filter(mean_relabund > mra_prev.genus[1] & prev > mra_prev.genus[2] | prev > prev.genus)
  fail <- rel.criteria %>% filter(!OTU %in% pass$OTU)


  log.taxa.genus <- data.frame("level" = "genus",
                             "pass" = nrow(pass),
                             "fail" = nrow(fail))
  log.taxa <- rbind(log.taxa, log.taxa.genus)


  #Filter genus-agglomerated taxa; pass criteria only
  dat_g2_pass <- prune_taxa(pass$OTU, dat_g1_fail_glom)

  #Export log.taxa
  if(export == TRUE){
    write.csv(x = log.taxa, file = "ntaxa_glom_log.csv", row.names = FALSE)
  }


  # ====== ASV-level and genus-level taxa: identification lists ======
  print("Step 5: Save whether taxa (identified by phyloseq unique species ID) are ASV-level or genus-level taxa")
  genus.taxa <- data.frame("taxa" = dat_g2_pass %>% tax_table %>% rownames,
                           "level" = rep("genus", nrow(dat_g2_pass %>% tax_table)))
  asv.taxa <- data.frame("taxa" = dat_g1_pass %>% tax_table %>% rownames,
                         "level" = rep("ASV", nrow(dat_g1_pass %>% tax_table)))
  glom.spID.levels <- rbind(asv.taxa, genus.taxa)

  if(export == TRUE){
    write.csv(glom.spID.levels, "glom_spID_levels.csv", row.names = FALSE)
  }

  print("Step 6: Identify filtered ASVs that belong to resolve genus-level taxa")
  ResolvedGenus.ASVs <- left_join(full_taxonomic_name(dat_g1_fail) %>% rownames_to_column(var = "ASV") %>% rename(ASV.spID = spID), #failed ASVs
                                  full_taxonomic_name(dat_g1_fail_glom) %>% rownames_to_column(var = "ResolvedGenus") %>% rename(ResolvedGenus.spID = spID),
                                  by = "taxonomic.name") %>%
    filter(taxonomic.name %in% (full_taxonomic_name(dat_g2_pass) %>% pull(taxonomic.name))) %>% # filter to only include passed genera
    arrange(ResolvedGenus.spID)

  if(export == TRUE){
    setwd(taxa_lists.path)
    write.csv(x = ResolvedGenus.ASVs, file = "ResolvedGenus_ASVs.csv", row.names = FALSE)
    setwd(retain_resolve.path)
  }





  # ====== Filtered and merged retain-resolve ASV and genus-level phyloseq object ======
  print("Step 7: Final retained resolve phyloseq object (dat_glom).")
  # === Merge ASV and genus-level phyloseq objects ===
  dat_glom <- merge_phyloseq(dat_g1_pass, #ASV-level pass
                             dat_g2_pass #genus-level pass; filtered
                             )

  # === Identify abundance differences ===
    #Total relative abundance will no longer sum to 1 because some counts are lost from genus filtering
  abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
                      dat_glom %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
  colnames(abund_diff) <- c("original_abund", "glom_abund")
  abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))

  print("Abundance difference between original dataframe (dat_empty) and filtered retain-resolved: ")
  print(abund_diff %>% head)
  print("Note: Other = count difference between total sums")

  #Export abundance differences
  if(export == TRUE){
    setwd(abund_diff.path)
    write.csv(x = abund_diff, file = "abundDiff_glom_postfilter.csv")
    setwd(retain_resolve.path)
  }

  #Save post-filtered total abundance difference R object
  abundDiff_glom_postfilter <- abund_diff

  # ====== Other taxa ======
  print("Step 8: Add Other taxanomic ID for calculations and analyses that require actual total abundance - CLR, relabund. Phyloseq object = dat_glom_aldex.")
  #Add "Other" taxa label that accounts for filtered counts
    #Required for relative abundance and CLR/aldex
  #mapdf
  mapdf.other <- dat %>% sample_data %>% data.frame

  #ASV df
  other_taxa <- abund_diff %>% dplyr::select(Other) %>% t() #other = other taxa
  asvdf.other <- other_taxa %>% as.matrix()
  row.names(asvdf.other) <- "other"

  #Taxa df
  taxdf.other <- data.frame(c("Other", "Other", "Other", "Other", "Other", "Other")) %>% t() %>% data.frame()
  row.names(taxdf.other) <- "other"
  colnames(taxdf.other) <-  colnames(physeq %>% tax_table %>% data.frame)
  taxdf.other <- taxdf.other %>% as.matrix()

  #Create phyloseq object
  dat_other <- phyloseq(otu_table(asvdf.other, taxa_are_rows = TRUE), tax_table(taxdf.other), sample_data(mapdf.other))


  #Merge phyloseq other
  dat_glom_aldex <- merge_phyloseq(dat_glom, #Filtered ASVs and Genus
                                   dat_other) #Lost taxa

  #Unit test: dat_glom_aldex total counts = dat_empty total counts
  abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
                      dat_glom_aldex %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
  colnames(abund_diff) <- c("original_abund", "glom_abund")
  abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))

  print("Abundance difference between original dataframe (dat_empty) and filtered retain-resolved with Other taxa: ")
  print(abund_diff %>% head)
  print("Note: Other = count difference between total abundance sums")

  #Export abundance differences
  if(export == TRUE){
    setwd(abund_diff.path)
    write.csv(x = abund_diff, file = "abundDiff_glompostfilter_other.csv")
    setwd(retain_resolve.path)
  }

  qc.abund_diff <- abund_diff %>% filter(Other != 0)
  if(nrow(qc.abund_diff) != 0){
    print("Error: at least 1 sample has a different total abundance. Step: dat_glom_aldex")
    print("Samples of concern:")
    print(qc.abund.diff)

    print("Running UNDECLARED to kill script.")
    UNDECLARED()

  }

  #Save total abundance difference between original and postfilter glom with Other taxa. R object.
  abundDiff_glom_postfilter_other <- abund_diff


  # ====== Relative abundance phyloseq objects ======
  print("Step 9: Relative abundance conversions - (dat_empty_rel, dat_rel_glom).")


  # === dat_rel_glom ===
  if(export == TRUE){
    setwd(vis.path)
  }

  mra.prev.results <- physeq.mra.prevalence(physeq = dat_glom_aldex,
                                            type = "counts",
                                            vis.export = export,
                                            export_prefix.in = "glom-postfilter-other")
  if(export == TRUE){
    setwd(retain_resolve.path)
    }

  dat_rel_glom <- mra.prev.results[[3]]

  #Remove other taxa
  rm.other <- dat_rel_glom %>% tax_table %>% data.frame %>% rownames_to_column(var = "OTU") %>% filter(OTU != "other") %>% pull(OTU)
  dat_rel_glom <- prune_taxa(rm.other, dat_rel_glom)



  # ====== Export phyloseq objects to file ======
  if(export == TRUE){
    print("Step 10: Export physeq objects")
    #Set wd
    setwd(dir)
    newdir <- "Physeq_glommed_objects"
    dir.create(newdir)
    setwd(paste0(dir, "/", newdir))

    print(paste0("Export to:",  paste0(dir, "/", newdir)))
    #Save
    save(dat_retres_prefilter, dat_glom, dat_rel_glom, dat_glom_aldex,
         dat_empty, dat_empty_rel, dat, file = "physeq_ResolveGenus.RData")

    # === Return to original wd ===
    setwd(og.dir)
  }




  # === Final comments ===
  if(export == TRUE){
    print(paste("All results exported to: ", dir))
    print(paste0("Phyloseq objects exported to sub-directory: ", dir, "/Physeq_glommed_objects"))
    print(paste("Retain-resolve algorithim performance and statistics exported to: ", dir, "/retain-resolve"))
  }

  print(log.taxa)

  #Return
  out <- list(dat_glom, dat_glom_aldex, dat_rel_glom, glom.spID.levels %>% as_tibble)
  names(out) <- c("dat_glom", "dat_glom_aldex", "dat_rel_glom", "glom.spID.taxonomy")
  return(out)
}
SarahAsbury/retainresolve documentation built on Feb. 20, 2023, 7:48 a.m.