#Microbiome Tools
# Load Packages -----------------------------------------------------------
library(tidyverse)
library(scales)
library(phyloseq)
library(cowplot)
library(gridGraphics)
# Function: Create phyloseq object ----------------------------------------
#' Create phyloseq object
#'
#' Function to create phyloseq obejct, with additional quality control measures to ensure input dataframes are correctly aligned for phyloseq.
#'
#' @param asvdf Dataframe containing ASV sequence count data for each sample. Rows are sequences and column are samples. Rownames must be set to sequence ID..
#' @param taxdf Dataframe contaningg taxonomic classification of each ASV. Rows are sequences and columns are samples. Rownames must be set to sequence ID.
#' @param mapdf Dataframe containing sample meta data (e.g clinical variables). Rows are samples and columns are metadata variables (e.g age, sex, clinical variables, etc.)
#' @param ID.col #String. Name of the column in mapdf that contains sample IDs. (E.g \code{"sample.ids"})
#' @param export.asvsp #\code{TRUE} or \code{FALSE} whether to export a dataframe that contains ASV sequences and their corresponding species ID (spID) auto-generated by phyloseq
#' @param export.wd #Directory path as a string (e.g \code{"~/output"}. Location of directory where dataframe will be exported if export.asvp = TRUE.
#'
#' @return Phyloseq object.
#' @export
#'
#' @examples dat <- physeq.create(asvdf = asv, taxdf = taxanomy, mapdf = metadata, ID.col = "sample_ID", export.asvsp = FALSE)
#'
physeq.create <- function(asvdf, #Dataframe; Rownames = sequences (e.g CCTGTT...) . Columns = sequencing counts.
taxdf, #Dataframe; Rownames = sequences (e.g CCTGTT...). Columns = Sequence taxonomic identification (Kingdom, Phylum, Class, Order, Family Genus)
mapdf, #Dataframe; No specific rownames required. Rows = Samples, Columns = Sample metadata (e.g depression scores, sex, age, etc.)
ID.col, #name of the column containing study IDs
export.asvsp = TRUE, #export CSV file that maps phyloseq assigned species ID (sp####) to ASV sequences (CCTGGT...)
export.wd = getwd() #directory exports will be saved
)
{
# === Directory ===
#Get original directory
og.dir <- getwd()
#Set new export directory
setwd(export.wd)
#=== Wrangling ===
#Standardize ID column
mapdf <- mapdf %>% dplyr::rename(StudyID = paste(ID.col))
#=== asvdf ===
#Remove asvdf rownames
seqs = rownames(asvdf)
rownames(asvdf) = NULL
# === Taxdf ===
#Unit test: taxdf is aligned with asvdf
qc.physeq1 <- all(rownames(taxdf) == seqs) #pass = TRUE
if(qc.physeq1 == FALSE){
print("Error: taxdf and asvdf rows are not aligned")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Remove taxdf rownames for physeq formatting
taxdf.asv <- taxdf #save ASV key
rownames(taxdf) = NULL #remove rows
# Unit test:
#Taxa nrow = ASV seq nrow
qc.physeq2 <- nrow(taxdf) == nrow(asvdf) #pass = TRUE
if(qc.physeq2 == FALSE){
print("Error: taxdf and asvdf have different number of rows")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#=== mapdf ===
#Unit test:
#nrow of mapdf$StudyID is equal to ncol of asvdf
#Checks that there are same number of StudyIDs in mapdf and asvdf
qc.physeq3 <- nrow(mapdf) == ncol(asvdf) #pass = TRUE
if(qc.physeq3 == FALSE){
print("Error: mismatched number of samples between asvdf and mapdf" )
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Unit test:
#Set of StudyIDs in asvdf (cols) are also in mapdf (rows)
qc.physeq4 <- all(colnames(asvdf) %in% mapdf$StudyID) %>% all #pass = TRUE
qc.physeq5 <- all(mapdf$StudyID %in% colnames(asvdf)) %>% all #pass = TRUE
if(qc.physeq4 == FALSE | qc.physeq5 == FALSE){
print("Error: mismatched sample IDs between asvdf and mapdf" )
print("Samples in asvdf but not mapdf:")
qc.df <- data.frame(colnames(asvdf)) %>% filter(!(colnames.asvdf. %in% mapdf$StudyID)) #show all samples in asvdf except those shared between asvdf and mapdf
print(qc.df)
print("Samples in mapdf but not asvdf:")
qc.df <- data.frame(mapdf$StudyID) %>% filter(!(mapdf.StudyID %in% colnames(asvdf))) #show all samples in mapdf except those shared between asvdf and mapdf
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#StudyID becomes rownames
rownames(mapdf) = mapdf$StudyID
#Unit test:
#Set of StudyIDs in asvdf (cols) are also in mapdf (rownames)
qc.physeq6 <- all(rownames(mapdf) %in% colnames(asvdf)) %>% all #pass = TRUE
qc.physeq7 <- all(colnames(asvdf) %in% rownames(mapdf)) %>% all #pass = TRUE
if(qc.physeq6 == FALSE | qc.physeq7 == FALSE){
print("Error: set of asvdf colnames is different from set of mapdf rownames" )
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#=== Create phyloseq object ===
# Convert the ASV and taxonomy data frames to matrices.
tax_mat = as.matrix(taxdf)
asv_mat = as.matrix(asvdf)
dat = phyloseq(otu_table(asv_mat, taxa_are_rows = TRUE), tax_table(tax_mat), sample_data(mapdf))
str(dat)
# === Export ASV -> phyloseq sp ID# conversion key ===
taxdf.asv #taxa df without rows (ASV sequences) removed
if(export.asvsp == TRUE){
#Unit test:
#Physeq and taxdf taxonomic assignments are in the same order
qc.asv_sp <- (dat %>% tax_table() %>% data.frame() %>% replace(is.na(.), "UNKNOWN") == taxdf %>% replace(is.na(.), "UNKNOWN")) %>%
as.logical %>% as.data.frame() %>% dplyr::rename(qc = ".")
if(unique(qc.asv_sp) %>% length != 1 | unique(qc.asv_sp)[1,1] != TRUE){
print("Error: taxdf.asv and phyloseq object are not aligned.")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Create asv_sp key
print(paste("Phyloseq taxa table (dat) and ASV sequence taxa df (taxdf.asv) aligned with original taxa df (taxdf) :", unique(qc.asv_sp)))
print(paste0("ASV to phyloseq species ID (sp) conversion key created and exported to: ", export.wd, "/asv_spCode.csv"))
sp_asv <- (taxdf.asv %>% rownames) %>% data.frame() %>% dplyr::rename(asv_seq = ".") %>% mutate(sp = dat %>% tax_table %>% rownames()) %>% relocate(sp, asv_seq)
#Export
write.csv(sp_asv, "asv_spCode.csv", row.names = FALSE)
}
#=== Remove host mitochondrial sequences ===
dat2 = subset_taxa(dat,
!(Kingdom == 'Eukaryota' |
is.na(Phylum) |
(!is.na(Family) &
Family == 'Mitochondria')))
# === Directory ===
#Reset working directory
setwd(og.dir)
# === Return phyloseq object ===
return(dat2)
}
# Function: Change taxa names (sp ID#, full taxonomic name) ------------------------------------------------
#Get sp ID and taxonomic identification from phyloseq object (df)
#' Species ID taxonomoc classification
#'
#' Add full taxonomic classification to species ID auto-generated by Phyloseq
#' @param physeq Phyloseq object
#' @param taxa.col String of taxonomic levels to include in classification. Taxonomic levels must exist in the input phyloseq object. If no input given, defaults to all levels in phyloseq object.
#' @param remove TRUE or FALSE. Are there characters that need to be removed after the spID in the taxa rowname to isolate the spID ? If TRUE, spID is extracted (e.g sp1_bacteria_bacteroidetes extracted to sp1)
#' @param remove.delimiter Character string that separates sp ID from remaining OTU name characters (e.g sp1_bacteria_bacteroidetes delimiter is "_")
#'
#'
#' @return Returns the phyloseq object with taxonomic classification as part of the taxa name (i.e. phyloseq OTU table rownames now contain taxonomy).
#' @export
#'
#' @examples sp_taxa_ID(dat) #Add full taxonomic label to rownames (i.e. sp###_Kingdom_Phylum_Class_Order_Family_Genus)
#' sp_taxa_ID(physeq = dat, taxa.col = c("Family, "Genus)) #Add full taxonomic label to rownames (i.e. sp###_Family_Genus)
sp_taxa_ID <- function(physeq, #phyloseq object
taxa.col = NA, #Character vector; taxonomic levels to be included in phyloseq taxa ID (e.g c("Family", "Genus")). If no input given, defaults to all levels in physeq object
remove = FALSE, #If TRUE, sp ID needs to be extracted from other information (e.g sp1_bacteria_bacteroidetes extracted to sp)
remove.delimiter = "_" #character to separate sp ID from remaining OTU name (e.g sp1_bacteria_bacteroidetes delimiter is "_")
)
{
asvtab <- physeq %>% tax_table %>% data.frame %>% rownames_to_column(var = "OTU")
#Isolate spID if remove = TRUE
if(remove == TRUE){
delim <- paste0(remove.delimiter, ".*")
asvtab <- asvtab %>% mutate(OTU = gsub(x = OTU, pattern = delim, replacement = ""))
}
#Set taxa names if not givene()
if(is.na(taxa.col[1])){
taxa.col <- rank_names(physeq)
}
tname.paste <- c("OTU", taxa.col)
#List of phyloseq sp IDs and their taxonomic assignment
asvtaxa <- asvtab[,c("OTU", taxa.col)] %>% unique() %>%
mutate(tname = do.call(paste, c(asvtab[tname.paste], sep = "_")), #Create new col tname with OTU and full taxonomic details
OTU_order = parse_number(OTU) %>% as.numeric()) %>%
dplyr::select(-OTU_order)
#Unit test:
#asvtaxa ASV (colname = OTU) aligns with phyloseq object ASV rownames
if(remove == FALSE){
qc.taxa_names <- (asvtaxa$OTU == (tax_table(physeq) %>% row.names())) %>% data.frame() %>% filter(FALSE %in% .) %>% nrow()}
if(remove == TRUE){
qc.taxa_names <- ((asvtaxa$OTU %>% gsub(pattern = delim, replacement = "")) == (tax_table(physeq) %>% row.names() %>% gsub(pattern = delim, replacement = ""))) %>%
data.frame() %>% filter(FALSE %in% .) %>% nrow()
}
if(qc.taxa_names == 0){
print("QC passed, full ASV taxonomic names will be added to phyloseq object")
}
if(qc.taxa_names != 0){
print("Error: OTU order is not aligned. Running undeclared to kill script")
UNDECLARED()
}
#Add new taxa names to phyloseq object
taxa_names(physeq) <- asvtaxa$tname
print(head(asvtaxa$tname)) #print example of new taxa names for user
return(physeq)
}
full_taxonomic_name <- function(physeq #input phyloseq object
)
#Returns dataframe containing the full taxonomic name for each spID
#Assumes that at minimum, the spID can be found in the rownames
{
tax.df <- as(access(physeq, "tax_table"), "matrix") %>% data.frame
tax.df$taxonomic.name <- do.call(paste, c(tax.df[1:ncol(tax.df)], sep = "_"))
out <- tax.df %>% select(taxonomic.name) %>% mutate(spID = gsub(x = rownames(.), pattern = "_.*", replacement = "")) %>% relocate(spID, .before = taxonomic.name)
return(out)
}
# Functions: relative abundance -------------------------------------------
#phyloseq counts -> phyloseq relative abundance and wide ASV dataframe
#' Convert counts to relative abundance.
#'
#' Converts taxa counts stored in phyloseq object into taxa relative abundance
#' @param physeq input phyloseq object
#' @param warning.in \code{TRUE} or \code{FALSE}. If total relative abundance for a sample does not equal 1, should an error be issued?
#'
#' @return Returns both a phyloseq object and the OTU table as a dataframe where counts are replaced by relative abundance.
#' @export
#'
#' @examples
#'
physeq.convert.rel <- function(physeq, warning.in = TRUE)
{
dat_rel <- transform_sample_counts(physeq,function(x) x/sum(x))
rel <- physeqrel.dataframe(dat_rel, warning.in)
return(list(dat_rel, rel))
}
#phyloseq relative abundance -> wide ASV dataframe
physeqrel.dataframe <- function(physeq, #relative abundance phyloseq object
warning = TRUE #if relative abundance is not 1, issue a warning instead of killing script. TRUE = issue warning, FALSE = kill script.
)
#input relative abundance phyloseq object
#ouput relative abundance ASV table (rel)
{
#Unit test: is it relative abundance data?
qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum
if(qc > 1){
print("ERROR: first sample looks like count data. Input must be relative abundance data")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Psmelt relative abundanc physeq
rel <- psmelt(physeq)
#Unit test:
#Total relative abundance sums to 1
#Returns warning. Script will continue even if test fails, unless warning = FALSE
qc <- rel %>% group_by(Sample) %>% dplyr::summarize(total_abundance = sum(Abundance))
qc.min <- min(qc$total_abundance) %>% round(digits = 10) %>% trunc
qc.max <- max(qc$total_abundance) %>% round(digits = 10) %>% trunc
if(qc.min != 1 | qc.max != 1){
print(paste0("Min total rel abund:", qc.min, ". Max total rel abund: ", qc.max))
if(warning == TRUE){
warning("Relative abundance does not sum to 1 for all samples.")
print("Total relative abundance summary:")
print(summary(qc))
}
if(warning == FALSE){
print("Relative abundance does not sum to 1 for all samples.")
print("Total relative abundance summary:")
print(summary(qc))
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
}
rel <- rel %>% dplyr::select(c(OTU,Sample, Abundance, StudyID)) %>%
arrange(Sample, desc(Abundance))%>%
group_by(Sample) %>%
tidyr::pivot_wider(values_from = Abundance, names_from = OTU) %>%
replace(is.na(.), 0) %>% ungroup() %>% dplyr::select(-Sample)
return(rel)
}
#Get mean relative abundance for each taxa
rel.mean.extract <- function(rel){
relmean <- rel %>% select(-StudyID) %>% summarise_all(mean) %>% mutate(stat = "mean") %>%
pivot_longer(!stat, names_to = "OTU", values_to = "mean") %>% dplyr::select(-stat)
return(relmean)
}
#Get taxa prevalence
rel.prev.extract <- function(rel){
relprev <- rel %>% dplyr::select(-StudyID) %>%
mutate_all(function(x) (ifelse(x>0, 1, 0))) %>% #converts taxa presence as binary (1 = yes, 0 = no)
summarise_all(mean) %>%
mutate(stat = "prev") %>% pivot_longer(!stat, names_to = "OTU", values_to = "prev") %>% dplyr::select(-stat)
return(relprev)
}
#Combine mean relative abundance and prevalence into one dataframe
rel.criteria.extract <- function(relmean, relprev){
rel.criteria <- relmean %>% left_join(relprev, by = "OTU") %>% dplyr::rename(mean_relabund = mean)
summary(rel.criteria %>% dplyr::select(-OTU))
return(rel.criteria)
}
#Mean relative abundance and prevalence data exploration
mra.prev.vis <- function(rel.criteria, #output from rel.criteria.extract
export_prefix #file prefix
)
#Exports density and scatter plots of prevalence and mean relative abundance to files in the current wd
{
density(log10(rel.criteria$mean_relabund)) %>% plot(main = "Log 10 Mean Relative Abundance") #density
p1 <- recordPlot()
ggdraw(p1)
density(rel.criteria$prev) %>% plot(main = "Prevalence")
p2 <- recordPlot()
ggdraw(p2)
#Scatter
p <- ggplot(rel.criteria, aes(x = prev, y = mean_relabund)) + geom_point() +
theme_classic() + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
annotation_logticks(sides = "l")
#Export
tiff(paste0(export_prefix, "_mra_prev.tiff"), height = 500, width = 1800, res = 90)
print(cowplot::plot_grid(p1, p2, p, ncol = 3))
dev.off()
}
#' Relative abundance Summary and Conversion
#'
#' A comprehensive relative abundance function for phyloseq objects. Includes output of relative abundance summary statistics (mean relative abundance and prevalence), both as a dataframe and with an added option for plotting. The function accepts input of count or relative abundance data.
#'
#' If count data is input, it will be first converted to relative abundance. The relative abundance phyloseq object and OTU table will also be returned.
#'
#' @param physeq Input phyloseq object
#' @param type One of \code{"counts"} or \code{"relabund"}. Describes whether input phyloseq object currently contains raw counts or relative abundance.
#' @param vis.export \code{TRUE} or \code{FALSE}. If \code{"TRUE"}, the function save plots to the working directory describing distribution of taxa relative abundance and prevalence.
#' @param export_prefix.in Provide a prefix (string) for summary plots if vis.export = \code{TRUE}
#'
#' @return List containing [1] mean relative abundance and prevalence for each taxa (dataframe) [2] Relative abundance OTU table (dataframe) [3] Relative abundance phyloseq object.
#' If vis.export = TRUE Will also export summary plots of mean relative abundance and prevalence distributions to working directory
#' @export
#'
#' @examples
#' physeq.mra.prevalence(physeq = dat, type = "counts", vis.export = TRUE, export_prefix.in = "test_counts") #with export
#' physeq.mra.prevalence(physeq = dat_rel, type = "relabund") #input phyloseq object with relative abundance
physeq.mra.prevalence <- function(physeq,
type, #one of: "counts" or "relabund"
vis.export = TRUE, #export mra/prev density function and scatter plots
export_prefix.in #prefix for export file names
)
#Summary function that uses other microbiome relative abundance functions
#Input phyloseq object of type: "counts" or "relabund"
#Outputs taxa mean relative abundance and taxa prevalence (rel.criteria [[1]]), rel abund ASV table [[2]], and abundance phyloseq object (dat_rel [[3]])
#Option to export mean relative abundance and prevalence visualization plots (mra/prev probability density functions/distribution, scatter plots)
{
#Unit test: type has valid input
if(type != "counts" & type != "relabund"){
print("Invalid type input.")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#quality control object for determining if relative abundance or count data was input
qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum
if(type == "counts"){
#Unit test: is it count data?
if(qc < 1){
print("ERROR: first sample looks like relative abundance data. Current input is count data")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Convert phyloseq counts to relative abundance phyloseq and ASV table
rel.results <- physeq.convert.rel(physeq = physeq)
#Extract mean relative abundance and prevalence
rel <- rel.results[[2]]
relmean <- rel.mean.extract(rel = rel)
relprev <- rel.prev.extract(rel = rel)
rel.criteria <- rel.criteria.extract(relmean, relprev)
#dat_rel
dat_rel <- rel.results[[1]]
}
if(type == "relabund"){
#Unit test: is it relative abundance data?
if(qc > 1){
print("ERROR: first sample looks like count data. Current input is relative abundance data")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Extract mean relative abundance and prevalence
rel <- physeqrel.dataframe(physeq)
relmean <- rel.mean.extract(rel = rel)
relprev <- rel.prev.extract(rel = rel)
rel.criteria <- rel.criteria.extract(relmean, relprev)
#dat_rel
dat_rel <- physeq
}
#Visualize and export current mra and prevalence
if(vis.export == TRUE){
mra.prev.vis(rel.criteria = rel.criteria, export_prefix = export_prefix.in)
}
return(list(rel.criteria, #1 = mean relative abundance and prevalence of each taxa
rel, #2 = relative abundance ASV table
dat_rel #3 = relative abundance phyloseq object
))
}
# Function: Glom algorithm -----------------------------------------------
#' Retain resolve agglomeration
#'
#' Execute retain-resolve agglomeration on ASV count data stored in phyloseq object. \cr \cr
#' Filter are required for at the ASV and genus-level for:
#' \enumerate{
#' \item mean relative abundance + prevalence
#' \item {prevalence only}
#' } \cr
#' Users wishing to set thresholds for only one criteria \emph{(e.g mra + prev \strong{or} prev only)} can set the parameters for the unused filter to 0. Thresholds for ASV and genus-level filtering may need to be optimized for each dataset using summary plots exported when \code{export = TRUE}. \cr \cr
#' \emph{\strong{Exporting quality control data to file (\code{export = TRUE}) is highly recommend.}} \cr
#'
#'
#' @param physeq input phyloseq object containing taxa counts
#' @param mra_prev.ASV threshold for mean relative abundance + prevalence filter. Both minimum values must be met for ASV taxa to be retained. Input numeric vector: \code{c(mra, prev)}. Default is 0.001 mean relative abundance and 0.1 prevalence.
#' @param prev.ASV threshold for prevalence filter for ASV to be retained. Input numeric value. Default is 0.5 prevalence.
#' @param mra_prev.genus threshold for mean relative abundance + prevalence filter. Both minimum values must be met for agglomerated genus-level taxa to be retained. Input numeric vector: \code{c(mra, prev)}. Default is 0.0001 mean relative abundance and 0.1 prevalence.
#' @param prev.genus threshold for prevalence filter for agglomerated genus-level taxa to be retained. Input numeric value. Default is 0.5 prevalence.
#' @param export \code{TRUE} or \code{FALSE}. If \code{TRUE}, quality control data and additional phyloseq objects will be exported to a specified directory.
#' @param dir Input filepath as string for storage location of exported files if export = \code{TRUE}.
#'
#' @return
#' \strong{Phyloseq objects:} \cr
#' dat_glom: retain-resolved ASV and genus-level taxa count data. Other taxa is not included. \cr \cr
#' dat_glom_aldex: retain-resolved ASV and genus-level taxa count data. Other taxa classification is included such that total abundances are unchanged from original count data after genus-level filtering.
#' Recommended use for downstream applications where calculations use total abundance (e.g CLR transformation, relative abundance). \cr \cr
#' dat_glom_rel: retain-resolved ASV and genus-level taxa relative abundance data. Calculated with dat_glom_aldex. Other taxa has been removed after relative abundance calculation. \cr \cr
#' \cr
#' \strong{Dataframes:} \cr
#' glom.spID.taxonomt: key for taxonomic level (ASV or genus) of each taxonomic ID (e.g )
#'
#' @export
#'
#' @examples
#' output <- retain.resolve_genus(physeq = dat, export = TRUE, dir = "filepath/export_files") #use default thresholds
#'
#' output <- retain.resolve_genus(physeq = dat, export = TRUE, dir = "~filepath/export_files", mra_prev.ASV = c(0.005, 0.25), prev.ASV = 0.75, mra_prev.genus = c(0.001, 0.1), prev.genus = 0.5) #manually set thresholds for genus and ASV-level filters
#'
retain.resolve_genus <- function(physeq, #count phyloseq object
dir = NA,
export, #TRUE or FALSE
mra_prev.ASV = c(0.001, 0.1), #threshold for mean relative abundance [1] AND prevalence [2]. Taxa must pass both criteria for retention. ASV level filter.
prev.ASV = 0.5, #threshold for prevalence-only. ASV level filter.
mra_prev.genus = c(0.0001, 0.1), #threshold for mean relative abundance [1] AND prevalence [2]. Taxa must pass both criteria for retention. genus level filter. ,
prev.genus = 0.5 #threshold for prevalence-only. Genus level filter.
)
#If taxa pass relabund_prev or prev filters, they will be retained.
{
print("QC: Assessing user input.")
# ====== Unit tests: User input ======
#Unit test: was export input as a logical variable?
if(class(export) != "logical"){
print("Error: export must be TRUE or FALSE")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Unit test: if user wants to export data, is there a working directory provided?
if(export == TRUE & is.na(dir)){
print("Error: please provide a directory path to export files.")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Unit test: did user input count data?
qc <- physeq %>% otu_table %>% data.frame %>% pull(1) %>% sum
if(qc < 1){
print("ERROR: first sample looks like relative abundance data. Only count data is accepted")
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Unit test: is relative abundance 1 for input physeq?
#If total relative abundance is not 1, will kill script
physeq.convert.rel(physeq = physeq,
warning.in = FALSE)
# ====== Set new directory and create subdirectories ======
#dir = grandparent directory
#retain_resolve.path-resolve = parent directory
print("Prepare: Creating export directories")
#Create retain-resolve (parent)
if(export == TRUE){
og.dir <- getwd()
setwd(dir)
newdir <- "retain-resolve"
dir.create(newdir)
retain_resolve.path <- paste0(dir, "/", newdir)
setwd(retain_resolve.path)
print(retain_resolve.path)
#Abundance
newdir <- "TotalAbundance"
dir.create(newdir)
abund_diff.path <- paste0(retain_resolve.path, "/", newdir)
#Vis
newdir <- "plot_AbundSummary"
dir.create(newdir)
vis.path <- paste0(retain_resolve.path, "/", newdir)
#Taxa lists
newdir <- "taxa_lists"
dir.create(newdir)
taxa_lists.path <- paste0(retain_resolve.path, "/", newdir)
#mra_prev
newdir <- "mra_prevalence"
dir.create(newdir)
mra_prev.path <- paste0(retain_resolve.path, "/", newdir)
}
# ====== Unfiltered dataset - mra and prevalence ======
print("Step 1: Create dat_empty. Phyloseq object with absent taxa (mean relative abundance = 0) removed.")
dat <- physeq
if(export == TRUE){
setwd(vis.path)
}
mra.prev.results <- physeq.mra.prevalence(dat, type = "counts", vis.export = export, export_prefix.in = "original")
if(export == TRUE){
setwd(retain_resolve.path)
}
#mra.prev.results[[1]] = mean relative abundance and prevalence of each taxa (rel.criteria)
#mra.prev.results[[2]] = relative abundance ASV table (rel)
#mra.prev.results[[3]] = relative abundance phyloseq object (dat_rel)
rel.criteria <- mra.prev.results[[1]]
rel <- mra.prev.results[[2]]
dat_rel_unfiltered <- mra.prev.results[[3]]
#Export and save unfiltered data
if(export == TRUE){
setwd(mra_prev.path)
write.csv(file = "mraprev_original.csv", x = rel.criteria, row.names = FALSE)
setwd(retain_resolve.path)
}
#Save original (unfiltered/non-agglomerated) taxa mra and prevalence as object
mraprev_original <- rel.criteria
# === Identify and remove empty taxa ===
#List of empty taxa (prev = 0)
emptytaxa <- rel.criteria %>% filter(prev == 0) %>% pull(OTU)
emptytaxa.df <- rel %>% dplyr::select(all_of(emptytaxa)) %>% summarise_all(mean) %>% mutate(dummy = "dummy") %>% pivot_longer(!dummy, names_to = "OTU", values_to = "mean_abund") %>% select(-dummy)
summary(emptytaxa.df %>% dplyr::select(-OTU))
#Pruned phyloseq object - empty taxa removed
emptytaxa.prune <- rel.criteria %>% filter(!(OTU %in% emptytaxa))
dat_empty <- prune_taxa(emptytaxa.prune$OTU, dat)
#Log results
log.taxa <- data.frame("level" = "empty",
"pass" = rel.criteria %>% filter(!(OTU %in% emptytaxa)) %>% nrow(),
"fail" = length(emptytaxa))
#Extract mean relative abundance and prevalence - empty taxa removed
if(export == TRUE){
setwd(vis.path)
}
mra.prev.results <- physeq.mra.prevalence(dat_empty,
type = "counts", vis.export = export, export_prefix.in = "rm-empty")
if(export == TRUE){
setwd(retain_resolve.path)
}
#mra and prevalence for each ASV
rel.criteria <- mra.prev.results[[1]]
#Save dat_empty_rel
dat_empty_rel <- mra.prev.results[[3]]
# ====== Identify and separate ASVs that pass/fail criteria ======
print("Step 2: Identify ASVs that pass or fail criteria.")
pass <- rel.criteria %>% filter(mean_relabund > mra_prev.ASV[1] & prev > mra_prev.ASV[2] | prev > prev.ASV) %>% dplyr::select(OTU) #Taxa that do not need to be glommed (meet criteria)
fail <- rel.criteria %>% filter(!OTU %in% pass$OTU & !OTU %in% emptytaxa.df$OTU) %>% dplyr::select(OTU) #Taxa that require glom (below abund/prevalence thresholds) & with empty taxa removed
#Log results
log.taxa.asv <- data.frame("level" = "ASV",
"pass" = nrow(pass),
"fail" = nrow(fail))
log.taxa <- rbind(log.taxa, log.taxa.asv)
#Pass criteria phyloseq
dat_g1_pass <- prune_taxa(pass$OTU, dat_empty)
#Fail criteria phyloseq
dat_g1_fail <- prune_taxa(fail$OTU, dat_empty)
#Export pass/failed taxa
fail.tax.preglom <- as(access(dat_g1_fail, "tax_table"), "matrix")[, 6] %>% as.data.frame() %>% dplyr::rename(genus = ".")
pass.tax <- as(access(dat_g1_pass, "tax_table"), "matrix")[, 6] %>% as.data.frame() %>% dplyr::rename(genus = ".")
if(export == TRUE){
setwd(taxa_lists.path)
write.csv(x = fail.tax.preglom, file = "failASV.csv")
write.csv(x = pass.tax, file = "passASV.csv")
setwd(retain_resolve.path)
}
# ====== Resolve to genus ======
print("Step 3: Agglomerate failed ASVs to genus.")
#Glom
dat_g1_fail_glom <- tax_glom(dat_g1_fail, NArm = FALSE, taxrank = "Genus")
#Postglom taxa list
tax.postglom <- dat_g1_fail_glom %>% tax_table %>% as.data.frame() %>% mutate(taxa_full = paste(Kingdom, Phylum, Class, Order, Family, Genus, sep = "_")) %>% rownames_to_column("sp")
#Export post-glom taxa list
if(export == TRUE){
setwd(taxa_lists.path)
write.csv(tax.postglom, "retainresolve_taxonomy_prefilter.csv", row.names = FALSE)
setwd(retain_resolve.path)
}
#Merge retained ASVs and resolved genus'
dat_retres_prefilter <- merge_phyloseq(dat_g1_pass, dat_g1_fail_glom) #relative abundance should sum to 1 for each sample
# ====== Unit tests: Total abundance post-glom ======
#Abundance difference between dat_empty and dat_resolved_unfiltered
abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
dat_retres_prefilter %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
colnames(abund_diff) <- c("original_abund", "glom_abund")
abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))
print("Abundance difference between original dataframe (dat_empty) and unfiltered retain-resolved: ")
print(abund_diff %>% head)
print("Note: Other = count difference between total sums")
if(export == TRUE){
setwd(abund_diff.path)
write.csv(x = abund_diff, file = "abundDiff_glomprefilter.csv")
setwd(retain_resolve.path)
}
#Save prefilter agglomerated abundance difference to R object
abundDiff_glomprefilter <- abund_diff
#Unit test: is abund_diff 0 for all samples? I.e. no counts went missing during the glom.
qc.abund_diff <- abund_diff %>% filter(Other != 0)
if(nrow(qc.abund_diff) != 0){
print("Error: at least 1 sample has a different total abundance after agglomerating")
print("Samples of concern:")
print(qc.abund.diff)
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
# ====== Filter genus that fail criteria ======
print("Step 4: Filter genus-level taxa.")
#Merged ASV and genus relative abundance mra and prevalence
if(export == TRUE){
setwd(vis.path)
}
mra.prev.results <- physeq.mra.prevalence(physeq = dat_retres_prefilter,
type = "counts",
vis.export = export,
export_prefix.in = "glom_prefilter")
if(export == TRUE){
setwd(retain_resolve.path)
}
rel.criteria <- mra.prev.results[[1]] %>% #relative abundance is calculated with both ASV and genus-level taxa
filter(OTU %in% (dat_g1_fail_glom %>% tax_table %>% data.frame %>% rownames)) #but only include taxa that are genus-level for physeq pruning
#Export rel.criteria
if(export == TRUE){
setwd(mra_prev.path)
write.csv(x = rel.criteria, file = "mraprev_glomgenus.csv", row.names = FALSE)
setwd(retain_resolve.path)
}
#Save mra and prevalence dataset for all taxa (ASV and genus-level) after genus-level glom of non-retained ASVs
mraprev_glomgenus <- rel.criteria
#Identify genus-level taxa that pass/fail genus-level criteria
pass <- rel.criteria %>% filter(mean_relabund > mra_prev.genus[1] & prev > mra_prev.genus[2] | prev > prev.genus)
fail <- rel.criteria %>% filter(!OTU %in% pass$OTU)
log.taxa.genus <- data.frame("level" = "genus",
"pass" = nrow(pass),
"fail" = nrow(fail))
log.taxa <- rbind(log.taxa, log.taxa.genus)
#Filter genus-agglomerated taxa; pass criteria only
dat_g2_pass <- prune_taxa(pass$OTU, dat_g1_fail_glom)
#Export log.taxa
if(export == TRUE){
write.csv(x = log.taxa, file = "ntaxa_glom_log.csv", row.names = FALSE)
}
# ====== ASV-level and genus-level taxa: identification lists ======
print("Step 5: Save whether taxa (identified by phyloseq unique species ID) are ASV-level or genus-level taxa")
genus.taxa <- data.frame("taxa" = dat_g2_pass %>% tax_table %>% rownames,
"level" = rep("genus", nrow(dat_g2_pass %>% tax_table)))
asv.taxa <- data.frame("taxa" = dat_g1_pass %>% tax_table %>% rownames,
"level" = rep("ASV", nrow(dat_g1_pass %>% tax_table)))
glom.spID.levels <- rbind(asv.taxa, genus.taxa)
if(export == TRUE){
write.csv(glom.spID.levels, "glom_spID_levels.csv", row.names = FALSE)
}
print("Step 6: Identify filtered ASVs that belong to resolve genus-level taxa")
ResolvedGenus.ASVs <- left_join(full_taxonomic_name(dat_g1_fail) %>% rownames_to_column(var = "ASV") %>% rename(ASV.spID = spID), #failed ASVs
full_taxonomic_name(dat_g1_fail_glom) %>% rownames_to_column(var = "ResolvedGenus") %>% rename(ResolvedGenus.spID = spID),
by = "taxonomic.name") %>%
filter(taxonomic.name %in% (full_taxonomic_name(dat_g2_pass) %>% pull(taxonomic.name))) %>% # filter to only include passed genera
arrange(ResolvedGenus.spID)
if(export == TRUE){
setwd(taxa_lists.path)
write.csv(x = ResolvedGenus.ASVs, file = "ResolvedGenus_ASVs.csv", row.names = FALSE)
setwd(retain_resolve.path)
}
# ====== Filtered and merged retain-resolve ASV and genus-level phyloseq object ======
print("Step 7: Final retained resolve phyloseq object (dat_glom).")
# === Merge ASV and genus-level phyloseq objects ===
dat_glom <- merge_phyloseq(dat_g1_pass, #ASV-level pass
dat_g2_pass #genus-level pass; filtered
)
# === Identify abundance differences ===
#Total relative abundance will no longer sum to 1 because some counts are lost from genus filtering
abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
dat_glom %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
colnames(abund_diff) <- c("original_abund", "glom_abund")
abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))
print("Abundance difference between original dataframe (dat_empty) and filtered retain-resolved: ")
print(abund_diff %>% head)
print("Note: Other = count difference between total sums")
#Export abundance differences
if(export == TRUE){
setwd(abund_diff.path)
write.csv(x = abund_diff, file = "abundDiff_glom_postfilter.csv")
setwd(retain_resolve.path)
}
#Save post-filtered total abundance difference R object
abundDiff_glom_postfilter <- abund_diff
# ====== Other taxa ======
print("Step 8: Add Other taxanomic ID for calculations and analyses that require actual total abundance - CLR, relabund. Phyloseq object = dat_glom_aldex.")
#Add "Other" taxa label that accounts for filtered counts
#Required for relative abundance and CLR/aldex
#mapdf
mapdf.other <- dat %>% sample_data %>% data.frame
#ASV df
other_taxa <- abund_diff %>% dplyr::select(Other) %>% t() #other = other taxa
asvdf.other <- other_taxa %>% as.matrix()
row.names(asvdf.other) <- "other"
#Taxa df
taxdf.other <- data.frame(c("Other", "Other", "Other", "Other", "Other", "Other")) %>% t() %>% data.frame()
row.names(taxdf.other) <- "other"
colnames(taxdf.other) <- colnames(physeq %>% tax_table %>% data.frame)
taxdf.other <- taxdf.other %>% as.matrix()
#Create phyloseq object
dat_other <- phyloseq(otu_table(asvdf.other, taxa_are_rows = TRUE), tax_table(taxdf.other), sample_data(mapdf.other))
#Merge phyloseq other
dat_glom_aldex <- merge_phyloseq(dat_glom, #Filtered ASVs and Genus
dat_other) #Lost taxa
#Unit test: dat_glom_aldex total counts = dat_empty total counts
abund_diff <- cbind(dat_empty %>% otu_table %>% data.frame() %>% colSums() %>% data.frame(),
dat_glom_aldex %>% otu_table %>% data.frame() %>% colSums() %>% data.frame())
colnames(abund_diff) <- c("original_abund", "glom_abund")
abund_diff <- abund_diff %>% mutate(Other = original_abund - glom_abund) %>% mutate(percent_diff = round(Other/original_abund * 100, 2))
print("Abundance difference between original dataframe (dat_empty) and filtered retain-resolved with Other taxa: ")
print(abund_diff %>% head)
print("Note: Other = count difference between total abundance sums")
#Export abundance differences
if(export == TRUE){
setwd(abund_diff.path)
write.csv(x = abund_diff, file = "abundDiff_glompostfilter_other.csv")
setwd(retain_resolve.path)
}
qc.abund_diff <- abund_diff %>% filter(Other != 0)
if(nrow(qc.abund_diff) != 0){
print("Error: at least 1 sample has a different total abundance. Step: dat_glom_aldex")
print("Samples of concern:")
print(qc.abund.diff)
print("Running UNDECLARED to kill script.")
UNDECLARED()
}
#Save total abundance difference between original and postfilter glom with Other taxa. R object.
abundDiff_glom_postfilter_other <- abund_diff
# ====== Relative abundance phyloseq objects ======
print("Step 9: Relative abundance conversions - (dat_empty_rel, dat_rel_glom).")
# === dat_rel_glom ===
if(export == TRUE){
setwd(vis.path)
}
mra.prev.results <- physeq.mra.prevalence(physeq = dat_glom_aldex,
type = "counts",
vis.export = export,
export_prefix.in = "glom-postfilter-other")
if(export == TRUE){
setwd(retain_resolve.path)
}
dat_rel_glom <- mra.prev.results[[3]]
#Remove other taxa
rm.other <- dat_rel_glom %>% tax_table %>% data.frame %>% rownames_to_column(var = "OTU") %>% filter(OTU != "other") %>% pull(OTU)
dat_rel_glom <- prune_taxa(rm.other, dat_rel_glom)
# ====== Export phyloseq objects to file ======
if(export == TRUE){
print("Step 10: Export physeq objects")
#Set wd
setwd(dir)
newdir <- "Physeq_glommed_objects"
dir.create(newdir)
setwd(paste0(dir, "/", newdir))
print(paste0("Export to:", paste0(dir, "/", newdir)))
#Save
save(dat_retres_prefilter, dat_glom, dat_rel_glom, dat_glom_aldex,
dat_empty, dat_empty_rel, dat, file = "physeq_ResolveGenus.RData")
# === Return to original wd ===
setwd(og.dir)
}
# === Final comments ===
if(export == TRUE){
print(paste("All results exported to: ", dir))
print(paste0("Phyloseq objects exported to sub-directory: ", dir, "/Physeq_glommed_objects"))
print(paste("Retain-resolve algorithim performance and statistics exported to: ", dir, "/retain-resolve"))
}
print(log.taxa)
#Return
out <- list(dat_glom, dat_glom_aldex, dat_rel_glom, glom.spID.levels %>% as_tibble)
names(out) <- c("dat_glom", "dat_glom_aldex", "dat_rel_glom", "glom.spID.taxonomy")
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.