#' @title Average relative OTU abundances.
#' @description This function implements OTU abundance averaging following CoDa (Compositional Data Analysis) workflow.
#' @param physeq A phyloseq-class object
#' @param avg_type Averaging type ("aldex" for ALDEx2-based averaging , "acomp" for Aitchison CoDa approach; "arithmetic" for simple arithmetic mean)
#' @param acomp_zero_impute Character ("CZM", "GBM","SQ","BL") or NULL; indicating weather to perform replacement of 0 abundance values with an estimate of the probability that the zero is not 0 (implemented only for avg_type = "acomp"; see \code{\link[zCompositions]{cmultRepl}})
#' @param aldex_samples The number of Monte-Carlo Dirichlet instances to generate (see \code{\link[ALDEx2]{aldex.clr}})
#' @param aldex_denom Character ("all", "iqlr", "lvha"), indicating which features to use as the denominator for the geometric mean calculation (see \code{\link[ALDEx2]{aldex.clr}})
#' @param group Variable name in \code{\link[phyloseq]{sample_data}}) which defines sample groups for averaging (default is NULL)
#' @param drop_group_zero Logical; indicating weather OTUs with zero abundance withing a group of samples should be removed
#' @param verbose Logical; if TRUE (default), informational messages will be shown on screen
#' @param progress Name of the progress bar to use ("none" or "text"; see \code{\link[plyr]{create_progress_bar}})
#' @param ... Additional arguments may be passed to \code{\link[zCompositions]{cmultRepl}}
#' @details
#' Typical OTU abundance tables in metagenomic analysis usually has different
#' sampling effort for different samples (which is an artifact of the sequencing procedure).
#' The total number of reads is meaningless and distance between OTU compositions is
#' on the relative scale (e.g., OTUs with 1 and 2 reads in one sample are so far as OTUs
#' with 10 and 20 reads in the other samples). Therefore such OTU tables represents
#' closed compositions and requires a special treatment within Aitchison geometry framework.
#'
#' With ALDEx2-based approach (avg_type = "aldex") it is possible to take into
#' account per-OTU technical variation within each sample using Monte-Carlo instances
#' drawn from the Dirichlet distribution (see Fernandes et al., 2013).
#' As the result the expected average of the OTU portions will be estimated.
#'
#' Zero OTU abundance could be due to the insufficient number of reads. However,
#' it is possible to replace the zero counts with an expected value.
#' Bayesian-multiplicative (BM) replacement of count zeros is implemented in
#' \code{\link[zCompositions]{cmultRepl}} function of zCompositions package.
#' Sevral methods are supported: geometric Bayesian multiplicative (zero_impute = "GBM"),
#' count zero multiplicative (zero_impute = "CZM", default), Bayes-Laplace BM (zero_impute = "BL"),
#' or square root BM (zero_impute = "SQ").
#' In case of structural zeroes in OTU abundance table (e.g., absence of OTU
#' within a group assumes that it is not observed due to some biological pattern
#' and is not caused by a detection limit) "drop_group_zero" argument may be set
#' to "TRUE" to avoid zero replacement.
#'
#' @return phyloseq object with OTU relative abundance averaged over samples (all together or within a group).
#' @export
#' @seealso \code{\link[ALDEx2]{aldex.clr}}, \code{\link[compositions]{acomp}}, \code{\link[zCompositions]{cmultRepl}}
#' @references
#' Gloor GB, Macklaim JM, Pawlowsky-Glahn V and Egozcue JJ (2017) Microbiome Datasets Are Compositional: And This Is Not Optional. Front. Microbiol. 8:2224. doi: 10.3389/fmicb.2017.02224
#' Martin-Fernandez JA, Barcelo-Vidal C, Pawlowsky-Glahn V (2003) Dealing With Zeros and Missing Values in Compositional Data Sets Using Nonparametric Imputation. Mathematical Geology 35:3. doi: 10.1023/A:1023866030544
#' Fernandes AD, Macklaim JM, Linn TG, Reid G, Gloor GB (2013) ANOVA-Like Differential Expression (ALDEx) Analysis for Mixed Population RNA-Seq. PLOS ONE 8(7): e67019. doi: 10.1371/journal.pone.0067019
#'
#' @examples
#'
phyloseq_average <- function(physeq, avg_type = "aldex",
acomp_zero_impute = NULL, aldex_samples = 128, aldex_denom = "all",
group = NULL, drop_group_zero = FALSE, verbose = TRUE, progress = NULL, ...){
# require(compositions) # for Aitchison CoDa approach
# require(zCompositions) # for Bayesian-multiplicative replacement
# require(plyr)
## Add warning for zero imputation and non-CoDa averaging
if(!is.null(acomp_zero_impute) & avg_type != "acomp"){
warning("Warning: imputations of zeros is implemented only for CoDa approach.\n")
zero_impute <- NULL
}
## Progress indicator
if(verbose == TRUE){
if(is.null(progress)){ progress <- "text" }
} else {
progress <- "none"
}
## Average througth the all samples
if(is.null(group)){
res <- OTU_average(physeq, avg_type = avg_type,
acomp_zero_impute = acomp_zero_impute,
aldex_samples = aldex_samples, aldex_denom = aldex_denom,
verbose = verbose)
} ## End of single group
## Average by group
if(!is.null(group)){
## Backup phylogenetic tree and remove it from the main object
with_phy_tree <- phyloseq::phy_tree(physeq, errorIfNULL = F)
if(!is.null(with_phy_tree)){
tree <- phyloseq::phy_tree(physeq)
physeq@phy_tree <- NULL
}
## Split data by group
ph_gr <- phyloseq_sep_variable(physeq, variable = group, drop_zeroes = drop_group_zero)
## Average OTU proportions within each group
res <- plyr::llply(.data = ph_gr, .fun = OTU_average,
avg_type = avg_type, acomp_zero_impute = acomp_zero_impute,
aldex_samples = aldex_samples, aldex_denom = aldex_denom,
verbose = verbose, .progress = progress)
## Give the group names to the averaged proportions
for(i in 1:length(res)){
phyloseq::sample_names(res[[i]]) <- names(res)[i]
}
## Combine results
# do.call(merge_phyloseq, res) # doesn't work
res_mrg <- res[[1]]
for(j in 2:length(res)){
res_mrg <- phyloseq::merge_phyloseq(res_mrg, res[[j]])
}
res <- res_mrg
rm(res_mrg)
## Restore phylogenetic tree
if(!is.null(with_phy_tree)){
phyloseq::phy_tree(res) <- tree
}
} ## End of multiple groups
## Add averaging details as attributes to the resulting list
attr(res, which = "Average_type") <- avg_type
attr(res, which = "Average_ByGroup") <- group
attr(res, which = "Average_GroupZerosRemoved") <- drop_group_zero
if(avg_type == "acomp"){
attr(res, which = "Average_ZeroImputationMethod") <- acomp_zero_impute
} else {
attr(res, which = "Average_ZeroImputationMethod") <- NULL
}
if(avg_type == "aldex"){
attr(res, which = "Average_AldexMCSamples") <- aldex_samples
attr(res, which = "Average_AldexDenom") <- aldex_denom
} else {
attr(res, which = "Average_AldexMCSamples") <- NULL
attr(res, which = "Average_AldexDenom") <- NULL
}
return(res)
}
## Function to average OTU relative abundances
OTU_average <- function(x, avg_type = "aldex",
acomp_zero_impute = NULL, aldex_samples = 128, aldex_denom = "all",
result = "phyloseq", verbose = TRUE, ...){
# x = phyloseq object
# avg_type = averaging type ("aldex" for ALDEx2-based averaging, "acomp" for Aitchison CoDa approach; "arithmetic" for simple arithmetic mean)
# acomp_zero_impute = NULL or character indicating the method of zero imputation ("CZM" or "GBM")
# aldex_samples = The number of Monte-Carlo Dirichlet instances to generate
# aldex_denom = which features to use as the denominator for the geometric mean calculation ("all", "iqlr", "lvha")
# result = resulting object ("phyloseq" or "matrix")
# verbose = logical; shows warnings
## Replace aldex_denom = "zero" with aldex_denom = "all"
if(avg_type == "aldex" & aldex_denom == "zero"){
print("Warning: ALDEx2 denom 'zero' works only for multiple groups, forcing aldex_denom = 'all'.\n")
aldex_denom <- "all"
}
## Remove sample metadata
if(!is.null(phyloseq::sample_data(x, errorIfNULL = FALSE))){
x@sam_data <- NULL
}
## Extract OTU abundance table
otus <- as.data.frame(phyloseq::otu_table(x))
## Show some warnings
if(verbose == TRUE){
# How many zeros are in the table? If more than 15% - rise a warning
nz <- sum(otus == 0)
if(nz > nrow(otus)*ncol(otus)*0.15){
warning("Warning: there are more than 15% of zeroes in OTU table. Consider some additional data filtering.\n")
}
## How many OTUs are with zero total abundance?
oz <- phyloseq::taxa_sums(x) == 0
if(any(oz)){
warning("Warning: there are ", sum(oz), " OTUs with zero total abundance.\n")
}
}
## Transpose OTU abundance table (samples must be ROWS from this step!)
if(phyloseq::taxa_are_rows(x) == TRUE){
otus <- t(otus)
}
## Acomp-based averaging
if(avg_type == "acomp"){
## Replace 0 values with an estimate of the probability that the zero is not 0
if(!is.null(acomp_zero_impute) & any(otus == 0)){
otus <- try(
zCompositions::cmultRepl(otus, label=0, method=acomp_zero_impute, output="prop", suppress.print = TRUE, ...) # output="counts" [zCompositions]
)
# Methods:
# CZM = multiplicative simple replacement
# GBM = Geometric Bayesian multiplicative
if(class(otus) %in% "try-error"){
stop("Error in multiplicative zero replacement, try to use other methods (e.g., 'zero_impute = CZM').\n")
}
}
## Transform the data using the the centred log-ratio (Aitchison compositions)
otucomp <- compositions::acomp(otus) # [compositions]
## Average proportions
# TO DO: add possibilty to specify a robust estimator ('robust = TRUE')
otuavg <- try( suppressWarnings( compositions::mean.acomp(otucomp, robust = F) ), silent = T)
if(class(otuavg) %in% "try-error"){ stop("Error: method acomp failed.\n") }
otuavg <- as.matrix(otuavg) # it will be transposed here
colnames(otuavg) <- "Average" # rename average proporion column
}
## ALDEx2-based averaging (thanks to Thom Quinn for the advice on these steps)
if(avg_type == "aldex"){
## ALDEx2 requires OTUs as rows and samples as columns
otus <- t(otus)
## Generate Monte Carlo samples of the Dirichlet distribution for each sample.
## Convert each instance using the centred log-ratio transform
ald <- try( ALDEx2::aldex.clr(reads = otus, conds = rep("a", ncol(otus)),
mc.samples = aldex_samples, denom = aldex_denom, verbose = verbose, useMC = FALSE) )
if(class(ald) %in% "try-error"){ stop("Error in ALDEx2::aldex.clr. Try to use another 'denom'.\n") }
## Extract the Monte Carlo Dirochlet instances
mc <- ALDEx2::getMonteCarloInstances(ald)
k <- ALDEx2::numMCInstances(ald) # number of samples
logratio <- 0 # placeholder
## Extract each Monte Carlo instance
## And add i-th log-ratio transformation to cumulative sum
## Based on propr code by Thom Quinn
## https://github.com/tpq/propr/blob/90a96e6e24397dcd50e89b25af00edc0fcf56197/R/aldex2propr.R#L70
for(i in 1:k){
mci_lr <- t(sapply(mc, function(x) x[, i]))
logratio <- logratio + mci_lr
}
## Average log-ratios
logratio <- logratio / k
## Functions
closure <- function(x){ x/sum(x) } # divide proportions by sums
geomean <- function(x){ prod(x)^(1/length(x)) } # geometric mean
## Get the portions back
## ALDEx2 uses binary logarithm (log2) instead of natural logarithm, so for the back-transformation of log-ratios use 2^x instead of exp(x)
mci_means <- t(apply(logratio, MARGIN = 1, FUN = function(x){ closure(2^x) }))
## Average OTU proportions across the samples
otuavg <- apply(mci_means, MARGIN = 2, FUN = geomean)
otuavg <- closure(otuavg)
otuavg <- as.matrix(otuavg)
colnames(otuavg) <- "Average"
}
## Simple arithmetic averaging (in case if there are a lot of zeros and CoDa gives strange results)
if(avg_type == "arithmetic"){
## Convert counts to proportions
if(any(rowSums(otus) > 1)){ ## check that values are not proportions
k <- .Machine$double.eps
tmp <- pmax(k, apply(otus, MARGIN = 1, sum, na.rm = TRUE))
otus <- sweep(otus, MARGIN = 1, tmp, "/")
}
## Arithmetic averaging of proportions
otuavg <- colMeans(otus, na.rm = TRUE)
otuavg <- as.matrix(otuavg)
colnames(otuavg) <- "Average" # rename average proporion column
}
## Back-transpose OTU abundances if neccesary
# if(taxa_are_rows(x) == FALSE){
# otuavg <- t(otuavg)
# }
## Replace original counts with the average relative abundance
if(result == "phyloseq"){
otu_table(x) <- phyloseq::otu_table(otuavg, taxa_are_rows = TRUE)
return(x)
}
## Just return matrix with averaged OTU proportions
if(result == "matrix"){
return(otuavg)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.