require(parallel)
options(mc.cores= 2)
phylodiv <- function(physeq, theta = 0) {
## Args:
## - physeq: phyloseq class object, from which phylogeny and abundance data are extracted
## - theta: parameter that determines the balance in the Balance Weighted Phylogenetic Diversity (see McCoy and Matsen, 2013)
## Theta = 0 corresponds to Faith's PD
count_to_prop <- function(x) {x/sum(x)}
physeq <- transform_sample_counts(physeq, count_to_prop)
x <- as(otu_table(physeq), "matrix")
if (taxa_are_rows(physeq)) { x <- t(x) }
phy <- phy_tree(physeq)
## Construct incidence matrix of the tree
incidence <- incidenceMatrix(phy)
## Order incidence matrix according to community tables
incidence <- incidence[colnames(x), ]
## Create community phylogeny matrix by multiplying (community x edge matrix)
## where cpm_{ij} gives the abundance of OTUs originating from branch j in community i.
cpm <- x %*% incidence
## Convert to incidence matrix (0/1) and multiply by edge length to obtain PD per community.
if (theta == 0) {
cpm[cpm > 0] <- 1
} else {
cpm <- pmin(cpm^theta, (1-cpm)^theta)
}
pd <- cpm %*% phy$edge.length
## Add sample data information
if (!is.null(sample_data(physeq, FALSE))) {
sdf <- as(sample_data(physeq), "data.frame")
sdf$pd <- as.vector(pd)
pd <- sdf
}
return (pd)
}
ggpdrare <- function(physeq, step = 10, label = NULL, color = NULL,
log = TRUE,
replace = FALSE, se = TRUE, plot = TRUE, parallel = FALSE) {
## Args:
## - physeq: phyloseq class object, from which abundance data are extracted
## - step: Step size for sample size in rarefaction curves
## - label: Default `NULL`. Character string. The name of the variable
## to map to text labels on the plot. Similar to color option
## but for plotting text.
## - color: (Optional). Default ‘NULL’. Character string. The name of the
## variable to map to colors in the plot. This can be a sample
## variable (among the set returned by
## ‘sample_variables(physeq)’ ) or taxonomic rank (among the set
## returned by ‘rank_names(physeq)’).
## Finally, The color scheme is chosen automatically by
## ‘link{ggplot}’, but it can be modified afterward with an
## additional layer using ‘scale_color_manual’.
## - log: (Otional). Default 'TRUE'. Logical value. Should sample size
## be represented using a log10 scale?
## - replace: If TRUE, population are treated as of infinite size, with probabilities of occurence
## of a taxa computed from the (finite size) community data
## - se : Logical, should standard error be computed in addition to expected pd
## - plot: Logical, should the graphic be plotted.
x <- as(otu_table(physeq), "matrix")
if (taxa_are_rows(physeq)) { x <- t(x) }
phy <- phy_tree(physeq)
## Construct incidence matrix of the tree
incidence <- incidenceMatrix(phy)
nedges <- nrow(phy$edge)
## Order incidence matrix according to community tables and create
## community phylogenetic matrix
incidence <- incidence[colnames(x), ]
cpm <- x %*% incidence
if (se) {
cat("Preliminary computations for se, may take some time\n")
cpm.var <- array(NA, dim = c(nedges, nedges, nrow(x)))
cpm.var.fun <- function(i) {
union.clade <- incidence[, i] + incidence
union.clade[union.clade > 0] <- 1
union.clade <- t(x %*% union.clade)
## union.clade[s, j] is the number of individuals in subtrees
## generated by cutting branches i (from outer loop) and j
## in sample s
return(union.clade)
}
for (i in seq_len(nedges)) {
if (i %% 100 == 0) {
cat(paste("Cutting edge", i, "out of", nedges), sep = "\n")
}
cpm.var[i, , ] <- cpm.var.fun(i)
}
## Deprecated code, need to work on a better parallel version
## if (parallel) {
## cpm.var <- mclapply(seq_len(nedges), cpm.var.fun, mc.preschedule = TRUE)
## } else {
## cpm.var <- lapply(seq_len(nedges), cpm.var.fun)
## }
## cpm.var <- do.call(rbind, cpm.var)
## dim(cpm.var) <- c(nedges, nedges, nrow(x))
dimnames(cpm.var) <- list(phy$edge[, 2], phy$edge[, 2], rownames(x))
}
## Compute overall Phylogenetic Diversity
pd <- (0 + (cpm > 0) ) %*% phy$edge.length
## Transform community matrices to frequency data
tot <- rowSums(x)
nr <- nrow(x)
## Rarefy phylogenetic diversity for one sample (i)
pdrare <- function(i) {
cat(paste("rarefying sample", rownames(x)[i]), sep = "\n")
## Simplify matrices and tree to remove unnecessary computations.
edges.to.keep <- cpm[i, ] > 0
branch.lengths <- phy$edge.length[edges.to.keep]
cpm.i <- cpm[i, edges.to.keep]
if (se) {
cpm.var.i <- cpm.var[ edges.to.keep, edges.to.keep, i]
}
## sequence of sample sizes
n <- seq(1, tot[i], by = step)
if (n[length(n)] != tot[i])
n <- c(n, tot[i])
## Mean and variance of pd for different sample sizes
## Start with mean
if (replace) {
## Expected cpm
cpm.rare <- 1 - t(outer((1 - cpm.i/tot[i]), n, "^"))
} else {
## use lchoose instead of choose for numeric stability
cpm.rare <- outer(tot[i] - cpm.i, n, lchoose)
cpm.rare <- sweep(cpm.rare, 2, lchoose(tot[i], n), FUN = "-")
cpm.rare <- t(1 - exp(cpm.rare))
}
pd.rare <- as.vector(cpm.rare %*% branch.lengths)
## Continue with se, if necessary
if (se) {
cat(paste("Compute se for sample", rownames(x)[i], ", may take some time"), sep = "\n")
## Variance of cpm, computed via a loop to optimize memory use
centering <- (1 - cpm.rare) %*% branch.lengths
pd.rare.var <- rep(NA, length(n))
for (index in seq_along(n)) {
size <- n[index]
if (replace) {
cpm.var.rare <- (1 - cpm.var.i/tot[i])^size
} else {
## use lchoose instead of choose for numeric stability
cpm.var.rare <- lchoose(tot[i] - cpm.var.i, size) - lchoose(tot[i], size)
cpm.var.rare <- exp(cpm.var.rare)
}
pd.var <- t(branch.lengths) %*% cpm.var.rare %*% branch.lengths - centering[index]^2
pd.rare.var[index] <- pd.var
}
pd.rare <- data.frame(pd.rare = pd.rare, se = sqrt(pd.rare.var))
}
return(data.frame(pd.rare, Size = n, Sample = rownames(x)[i]))
}
if (parallel) {
out <- mclapply(seq_len(nr), pdrare, mc.preschedule = FALSE)
} else {
out <- lapply(seq_len(nr), pdrare)
}
df <- do.call(rbind, out)
## Get sample data
if (!is.null(sample_data(physeq, FALSE))) {
sdf <- as(sample_data(physeq), "data.frame")
sdf$Sample <- rownames(sdf)
data <- merge(df, sdf, by = "Sample")
labels <- data.frame(x = tot, y = pd, Sample = rownames(x))
labels <- merge(labels, sdf, by = "Sample")
}
## Add, any custom-supplied plot-mapped variables
if( length(color) > 1 ){
data$color <- color
names(data)[names(data)=="color"] <- deparse(substitute(color))
color <- deparse(substitute(color))
}
if( length(label) > 1 ){
labels$label <- label
names(labels)[names(labels)=="label"] <- deparse(substitute(label))
label <- deparse(substitute(label))
}
p <- ggplot(data = data, aes_string(x = "Size", y = "pd.rare", group = "Sample", color = color))
if (log) {
p <- p + scale_x_log10()
}
p <- p + labs(x = "Sample Size (# reads)", y = "Phylogenetic Diversity")
if (!is.null(label)) {
p <- p + geom_text(data = labels, aes_string(x = "x", y = "y", label = label, color = color),
size = 4, hjust = 0)
}
p <- p + geom_line()
if (se) {
p <- p + geom_ribbon(aes_string(ymin = "pd.rare - se", ymax = "pd.rare + se",
color = NULL, fill = color), alpha = 0.2)
}
if (plot) {
plot(p)
}
invisible(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.