#' @title Estimate standardized effect sizes (SES) of phylogenetic diversity metrics (PD, MPD, MNTD, VPD) using randomization-based approach.
#' @description This function utilizes randomization-based approach to computet SES values
#' (in contrast to the exact standardisation approach used in \code{\link{phyloseq_phylo_div}} with 'standardize' option),
#' which could be more computation intensive. Note that despite more freedom in defining the null-model with randomization-based approach
#' it is debatable that this method can provide a reasonable approximation by selecting only a limited number of possible combinations
#' (especially for large phylogenetic trees).
#'
#' Currently only non-abundance-weighted estimates are implemented.
#' @param physeq A phyloseq-class object (phylogenetic tree is required)
#' @param measures Character vector with diversity indices names ("PD", "MPD", "MNTD", "VPD")
#' @param null_model Character string indicating which null model to use (for the supported list of models see \code{\link[picante]{ses.pd}})
#' @param package Which package to use for diversity estimation - "picante" (default) or "PhyloMeasures" (could be faster)
#' @param abundance_weighted Logical; should diversity metrics be weighted by species abundance? (default = FALSE)
#' @param nsim Number of randomizations for null-distribution generation
#' @param swapiter Number of iterations for independentswap or trialswap algorithms
#' @param verbose Logical; if TRUE, progress messages from the function will be printed
#' @param ... Additional arguments currently are not implemented
#'
#' @details
#' Available metrics include:
#' \itemize{
#' \item PD - Faith's phylogenetic diversity (total length of all the phylogenetic branches for a given set of taxa);
#' \item MPD - Mean pairwise distance between the tips from a phylogenetic tree;
#' \item MNTD - Mean nearest taxon distance;
#' \item VPD - Variance of the pairwise distances.
#' }
#' Values of SES above zero indicate that the species pool of a habitat is more
#' diverse than the regional species pool.
#' @return Data frame.
#' @references
#' Webb C., Ackerly D., McPeek M., Donoghue M. (2002) Phylogenies and community ecology. Annual Review of Ecology and Systematics, 33, 475-505.
#'
#' @export
#' @seealso \code{\link{phyloseq_phylo_div}}, \code{\link{phyloseq_randomize}},
#' \code{\link[picante]{randomizeMatrix}}, \code{\link[picante]{ses.pd}},
#' \code{\link[picante]{ses.mpd}}, \code{\link[picante]{ses.mntd}},
#' \code{\link[PhyloMeasures]{pd.query}}, \code{\link[PhyloMeasures]{mpd.query}},
#' \code{\link[PhyloMeasures]{mntd.query}}
#' @examples
#' # Load data
#' data(esophagus)
#'
#' # Estimate phylogenetic diversity index (PDI)
#' phyloseq_phylo_ses(esophagus, measures = "PD", null_model = "taxa.labels", nsim = 100, verbose = F)
#'
#' # Estimate phylogenetic diversity, mean pairwise distance,
#' # mean nearest taxon distance and variance of phylogenetic distances;
#' # use independent swap algorithm to generate the null distribution
#' phyloseq_phylo_ses(esophagus,
#' measures = c("PD", "MPD", "MNTD", "VPD"),
#' null_model = "independentswap",
#' nsim = 200, swapiter = 200) # NB. increase the number of iterations!
#'
phyloseq_phylo_ses <- function(physeq, measures = c("PD", "MPD", "MNTD", "VPD"),
null_model = "taxa.labels", package = "picante", abundance_weighted = FALSE,
nsim = 1000, swapiter = 1000, verbose = TRUE, ...){
# require(plyr)
# require(picante) # for null models
# require(PhyloMeasures) # for phylogenetic diversity
## Data validation
if( is.null(phyloseq::phy_tree(physeq, errorIfNULL=F)) ){
stop("Phylogenetic tree is missing in physeq.\n")
}
if( package == "PhyloMeasures" & abundance_weighted == TRUE ){
stop("Abundance-weighted estimates are not supported with PhyloMeasures package.\n")
}
if( is.null(null_model) ){
stop("Null model should be specified.\n")
}
if( length(null_model) != 1 ){
stop("Only a single null model should be specified.\n")
}
## Print when analysis started
if(verbose == TRUE){
progr <- "text"
cat("SES analysis started at ", format(Sys.time(), "%X"), "\n")
} else {
progr <- "none"
}
## Check the orientation of the OTU table
trows <- phyloseq::taxa_are_rows(physeq)
## Extact OTU table
comm <- as(object = phyloseq::otu_table(physeq), Class = "matrix")
## Transpose OTU table (species should be columns for picante::randomizeMatrix)
if(trows == TRUE){ comm <- t(comm) }
## Scale OTU abundance to presence/absence
if(abundance_weighted == FALSE){
comm <- ifelse(comm > 0, 1, 0)
}
## Extact phylogenetic tree
phy <- phyloseq::phy_tree(physeq)
## Function to estimate diversity with PhyloMeasures or picante
pdiv <- function(comm, phy, pdiv_measures, method = "picante", abund_wei){
## Get pairwise distances from a phylogenetic tree
if(method == "picante" | "VPD" %in% pdiv_measures){ dis <- ape::cophenetic.phylo(phy) }
## Initialize results
rez <- vector("list")
## Estimate diversity with PhyloMeasures
if(method == "PhyloMeasures"){
if("PD" %in% pdiv_measures){
rez <- c(rez, list(PD = PhyloMeasures::pd.query(tree = phy, matrix = comm) ))
}
if("MPD" %in% pdiv_measures){
rez <- c(rez, list(MPD = PhyloMeasures::mpd.query(tree = phy, matrix = comm) ))
}
if("MNTD" %in% pdiv_measures){
rez <- c(rez, list(MNTD = PhyloMeasures::mntd.query(tree = phy, matrix = comm) ))
}
}
## Estimate diversity with picante
if(method == "picante"){
if("PD" %in% pdiv_measures){
if(abund_wei == FALSE){
rez <- c(rez, list(PD = picante::pd(samp = comm, tree = phy, include.root = F)$PD ))
}
if(abund_wei == TRUE){
rez <- c(rez, list(PD = pd_weighted(samp = comm, tree = phy) ))
}
}
if("MPD" %in% pdiv_measures){
rez <- c(rez, list(MPD = picante::mpd(samp = comm, dis = dis, abundance.weighted = abund_wei) ))
}
if("MNTD" %in% pdiv_measures){
rez <- c(rez, list(MNTD = picante::mntd(samp = comm, dis = dis, abundance.weighted = abund_wei) ))
}
}
## Estimate VPD with internal function
if("VPD" %in% pdiv_measures){
rez <- c(rez, list(VPD = vpd(samp = comm, dis = dis, abundance.weighted = abund_wei) ))
}
rez <- do.call("cbind", rez)
rez <- data.frame(SampleID = rownames(comm), rez)
return(rez)
}
## Observed diversity
div.obs <- pdiv(comm, phy, pdiv_measures = measures, method = package, abund_wei = abundance_weighted)
## Function to randomize the observed data
null_mod_fun <- function(comm, phy, model, iter = swapiter){
# model = models from picante::randomizeMatrix
# iter = number of iterations for independentswap or trialswap algorithms
if(model == "taxa.labels"){
res_comm <- comm
res_phy <- picante::tipShuffle(phy)
}
if(model %in% c("richness", "sample.pool")){ # same models?
res_comm <- picante::randomizeMatrix(comm, null.model="richness")
res_phy <- phy
}
if(model == "frequency"){
res_comm <- picante::randomizeMatrix(comm, null.model="frequency")
res_phy <- phy
}
if(model == "frequency"){
res_comm <- picante::randomizeMatrix(comm, null.model="frequency")
res_phy <- phy
}
if(model == "phylogeny.pool"){
res_comm <- picante::randomizeMatrix(comm, null.model="richness")
res_phy <- picante::tipShuffle(phy)
}
if(model == "independentswap"){
res_comm <- picante::randomizeMatrix(comm, null.model="independentswap", iterations = iter)
res_phy <- phy
}
if(model == "trialswap"){
res_comm <- picante::randomizeMatrix(comm, null.model="trialswap", iterations = iter)
res_phy <- phy
}
## Return list with comm and phy
rez <- list()
rez$comm <- res_comm
rez$phy <- res_phy
return(rez)
}
## Simulate multiple randomized communities
## TO DO ------- add parallel option here
if(verbose == T){ cat("..Randomizing data with '", null_model, "' algorithm\n", sep = "") }
nmods <- plyr::rlply(.n = nsim, .expr = null_mod_fun(comm, phy, model = null_model), .progress = progr)
## Esimate diversity metrics for each of the randomized communities
## TO DO ------- add parallel option here
if(verbose == T){ cat("..Estimating phylogenetic diversity for the randomized data\n") }
div.rnd <- plyr::ldply(
.data = nmods,
.fun = function(z, ...){ pdiv(comm = z$comm, phy = z$phy, ...) },
pdiv_measures = measures,
method = package,
abund_wei = abundance_weighted,
.progress = progr)
## Summarize null-distribution for each community (mean, SD and rank)
if(verbose == T){ cat("..Estimating effect size\n") }
rnd_mean <- plyr::ddply(.data=div.rnd, .variables="SampleID", .fun=plyr::numcolwise(mean, na.rm = T))
rnd_sd <- plyr::ddply(.data=div.rnd, .variables="SampleID", .fun=plyr::numcolwise(sd, na.rm = T))
rnd_rank <- plyr::ddply(.data = rbind(div.obs, div.rnd), .variables = "SampleID", plyr::numcolwise(.fun = function(z){ rank(z)[1] }))
## Rename columns
colnames(rnd_mean)[-1] <- paste(colnames(rnd_mean)[-1], ".rand.mean", sep="")
colnames(rnd_sd)[-1] <- paste(colnames(rnd_sd)[-1], ".rand.sd", sep="")
colnames(rnd_rank)[-1] <- paste(colnames(rnd_rank)[-1], ".rank", sep="")
## Reorder observed diversity estimates to match the randomized order
div.obs <- div.obs[match(x = div.obs$SampleID, table = rnd_mean$SampleID), ]
## Merge results
res <- cbind(div.obs, rnd_mean[-1], rnd_sd[-1], rnd_rank[-1])
## Estimate Z-score and P-value
if("PD" %in% measures){
res$PD.z <- with(res, (PD - PD.rand.mean)/PD.rand.sd )
res$PD.p <- with(res, PD.rank / (nsim + 1) )
}
if("MPD" %in% measures){
res$MPD.z <- with(res, (MPD - MPD.rand.mean)/MPD.rand.sd )
res$MPD.p <- with(res, MPD.rank / (nsim + 1) )
}
if("MNTD" %in% measures){
res$MNTD.z <- with(res, (MNTD - MNTD.rand.mean)/MNTD.rand.sd )
res$MNTD.p <- with(res, MNTD.rank / (nsim + 1) )
}
if("VPD" %in% measures){
res$VPD.z <- with(res, (VPD - VPD.rand.mean)/VPD.rand.sd )
res$VPD.p <- with(res, VPD.rank / (nsim + 1) )
}
## Ending progress message
if(verbose == TRUE){
cat("..Done\n")
cat("Analysis finished at ", format(Sys.time(), "%X"), "\n")
}
return(res)
}
## Internal function to estimate variance of pairwise distances separating taxa in a community
## Based on picante::mpd
vpd <- function(samp, dis, abundance.weighted=FALSE){
# x = community data matrix (species as columns)
# y = interspecific distance matrix (matrix or dist class) or phylogenetic tree of class phylo
## Convert phylogenetic tree to the pairwise distances
if("phylo" %in% class(dis)){ dis <- ape::cophenetic.phylo(dis) }
if("dist" %in% class(dis)){ dis <- as.matrix(dis) }
N <- dim(samp)[1]
res <- numeric(N)
for(i in 1:N){
## Subset data
sppInSample <- names(samp[i, samp[i, ] > 0])
if(length(sppInSample) > 1){
sample.dis <- dis[sppInSample, sppInSample]
if(abundance.weighted == TRUE){
sample.weights <- t(as.matrix(samp[i, sppInSample, drop=FALSE])) %*% as.matrix(samp[i, sppInSample, drop=FALSE])
res[i] <- weighted.var(sample.dis, sample.weights)
} else {
res[i] <- var(sample.dis[lower.tri(sample.dis)])
}
}
else{
res[i] <- NA
}
}
return(res)
}
# library(picante)
# data(phylocom)
# vpd(phylocom$sample, phylocom$phylo, abundance.weighted=FALSE)
# vpd(phylocom$sample, phylocom$phylo, abundance.weighted=TRUE)
## Internal function to estimate weighted variance
weighted.var <- function(x, w = NULL, normwt = FALSE, na.rm = FALSE){
# x = numeric vector
# w = numeric vector of weights or NULL
# normwt = Logical, to make weights sum to length(x) after deletion of NAs
# na.rm = Logical, remove NAs
## If no weights are provied - just return ordinary variance
if(is.null(w)){ return(var(x, na.rm = na.rm)) }
## Validate data
if(length(x) != length(w)){ stop("Error: 'x' and 'w' must have the same length.\n")}
## Remove missing values
if(na.rm == TRUE){
nas <- is.na(x)
if(any(nas)){
w <- w[which(nas)]
x <- x[which(nas)]
}
}
## Normalize weights
if(normwt == TRUE){ w <- w * length(x) / sum(w) }
sumw <- sum(w)
wtm <- sum(w * x) / sumw ## Weighted mean
res <- sum(w *(x-wtm)^2)*(sumw / (sumw^2-sum(w^2)))
# res <- (sum(w*x^2) * sumw - sum(w*x)^2) / (sumw^2 - sum(w^2))
# res <- sum(w*((x - wtm)^2)) / (sumw - 1)
return(res)
}
# x <- runif(500); wts <- sample(1:6, 500, TRUE)
# weighted.var(x, wts)
## Abundance weighted calculation of Faiths PD index
## based on lefse_0.5::weighted.faith by Nathan G. Swenson
## https://github.com/NGSwenson/lefse_0.5/commit/74814e2f4f8a1f0244da8e8a48bb17897718db67
pd_weighted <- function(samp, tree){
## Function for a single sample
wpd <- function(smp){
## Extract the names of species in a community with an abundance greater than zero
## and make a pruned phylogeny for that community
tmp.tree <- geiger::treedata(tree, smp[smp > 0], warnings=F)$phy
## Create empty branches matrix
branches <- matrix(NA, nrow = nrow(tmp.tree$edge), ncol = 4)
## Fill first two columns of the matrix with node numbers defining each edge
branches[,1:2] <- tmp.tree$edge
## Fill the third column with the length of each branch
branches[,3] <- tmp.tree$edge.length
get.leaves <- function(x){ leaves.node <- geiger::tips(tmp.tree, x[2]) }
## Retrieve species names subtended by each branch (i.e. ## row) in the branches matrix
leaves <- apply(X = branches, MARGIN = 1, FUN = get.leaves)
## Calculate the mean abundance (Ai) for species across each set of leaves
for(i in 1:length(leaves)){
branches[i, 4] <- mean(smp[leaves[[i]]], na.rm = T)
}
## Calculated the Weighted Faith’s Index
rez <- nrow(tmp.tree$edge) * ((sum(branches[,3] * branches[,4])) / sum(branches[,4]))
return(rez)
}
## Estimate weithed PD for each sample
res <- apply(X = samp, MARGIN = 1, FUN = wpd)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.