phyloseq-extended:

require(parallel)
options(mc.cores= 2)

phylodiv <- function(physeq, theta = 0) {
    ## Args:
    ## - physeq: phyloseq class object, from which phylogeny and abundance data are extracted
    ## - theta: parameter that determines the balance in the Balance Weighted Phylogenetic Diversity (see McCoy and Matsen, 2013)
    ##          Theta = 0 corresponds to Faith's PD
    count_to_prop <- function(x) {x/sum(x)}
    physeq <- transform_sample_counts(physeq, count_to_prop)
    x <- as(otu_table(physeq), "matrix")
    if (taxa_are_rows(physeq)) { x <- t(x) }
    phy <- phy_tree(physeq)

    ## Construct incidence matrix of the tree
    incidence <- incidenceMatrix(phy)

    ## Order incidence matrix according to community tables
    incidence <- incidence[colnames(x), ]

    ## Create community phylogeny matrix by multiplying (community x edge matrix)
    ## where cpm_{ij} gives the abundance of OTUs originating from branch j in community i.
    cpm <- x %*% incidence
    ## Convert to incidence matrix (0/1) and multiply by edge length to obtain PD per community.
    if (theta == 0) {
      cpm[cpm > 0] <- 1
    } else {
      cpm <- pmin(cpm^theta, (1-cpm)^theta)
    }
    pd <-  cpm %*% phy$edge.length

    ## Add sample data information
    if (!is.null(sample_data(physeq, FALSE))) {
        sdf <- as(sample_data(physeq), "data.frame")
        sdf$pd <- as.vector(pd)
        pd <- sdf
    }

    return (pd)
}


ggpdrare <- function(physeq, step = 10, label = NULL, color = NULL,
                     log = TRUE,
                     replace = FALSE, se = TRUE, plot = TRUE, parallel = FALSE) {
    ## Args:
    ## - physeq: phyloseq class object, from which abundance data are extracted
    ## - step: Step size for sample size in rarefaction curves
    ## - label: Default `NULL`. Character string. The name of the variable
    ##          to map to text labels on the plot. Similar to color option
    ##          but for plotting text.
    ## - color: (Optional). Default ‘NULL’. Character string. The name of the
    ##          variable to map to colors in the plot. This can be a sample
    ##          variable (among the set returned by
    ##          ‘sample_variables(physeq)’ ) or taxonomic rank (among the set
    ##          returned by ‘rank_names(physeq)’).
    ##          Finally, The color scheme is chosen automatically by
    ##          ‘link{ggplot}’, but it can be modified afterward with an
    ##          additional layer using ‘scale_color_manual’.
    ## - log:   (Otional). Default 'TRUE'. Logical value. Should sample size
    ##          be represented using a log10 scale?
    ## - replace: If TRUE, population are treated as of infinite size, with probabilities of occurence
    ##            of a taxa computed from the (finite size) community data
    ## - se  : Logical, should standard error be computed in addition to expected pd
    ## - plot:  Logical, should the graphic be plotted.
    x <- as(otu_table(physeq), "matrix")
    if (taxa_are_rows(physeq)) { x <- t(x) }
    phy <- phy_tree(physeq)

    ## Construct incidence matrix of the tree
    incidence <- incidenceMatrix(phy)

    nedges <- nrow(phy$edge)
    ## Order incidence matrix according to community tables and create
    ## community phylogenetic matrix
    incidence <- incidence[colnames(x), ]
    cpm <- x %*% incidence
    if (se) {
        cat("Preliminary computations for se, may take some time\n")
        cpm.var <- array(NA, dim = c(nedges, nedges, nrow(x)))
        cpm.var.fun <- function(i) {
            union.clade <- incidence[, i] + incidence
            union.clade[union.clade > 0] <- 1
            union.clade <- t(x %*% union.clade)
            ## union.clade[s, j] is the number of individuals in subtrees
            ## generated by cutting branches i (from outer loop) and j
            ## in sample s
            return(union.clade)
        }
        for (i in seq_len(nedges)) {
            if (i %% 100 == 0) {
                cat(paste("Cutting edge", i, "out of", nedges), sep = "\n")
            }
            cpm.var[i, , ] <- cpm.var.fun(i)
        }
        ## Deprecated code, need to work on a better parallel version
        ## if (parallel) {
        ##     cpm.var <- mclapply(seq_len(nedges), cpm.var.fun, mc.preschedule = TRUE)
        ## } else {
        ##     cpm.var <- lapply(seq_len(nedges), cpm.var.fun)
        ## }
        ## cpm.var <- do.call(rbind, cpm.var)
        ## dim(cpm.var) <- c(nedges, nedges, nrow(x))
        dimnames(cpm.var) <- list(phy$edge[, 2], phy$edge[, 2], rownames(x))
    }

    ## Compute overall Phylogenetic Diversity
    pd <-  (0 + (cpm > 0) ) %*% phy$edge.length


    ## Transform community matrices to frequency data
    tot <- rowSums(x)
    nr <- nrow(x)
    ## Rarefy phylogenetic diversity for one sample (i)
    pdrare <- function(i) {
        cat(paste("rarefying sample", rownames(x)[i]), sep = "\n")
        ## Simplify matrices and tree to remove unnecessary computations.
        edges.to.keep <- cpm[i, ] > 0
        branch.lengths <- phy$edge.length[edges.to.keep]
        cpm.i <- cpm[i, edges.to.keep]
        if (se) {
            cpm.var.i <- cpm.var[ edges.to.keep, edges.to.keep, i]
        }
        ## sequence of sample sizes
        n <- seq(1, tot[i], by = step)
        if (n[length(n)] != tot[i])
            n <- c(n, tot[i])
        ## Mean and variance of pd for different sample sizes
        ## Start with mean
        if (replace) {
            ## Expected cpm
            cpm.rare <- 1 - t(outer((1 - cpm.i/tot[i]), n, "^"))
        } else {
            ## use lchoose instead of choose for numeric stability
            cpm.rare <-  outer(tot[i] - cpm.i, n, lchoose)
            cpm.rare <- sweep(cpm.rare, 2, lchoose(tot[i], n), FUN = "-")
            cpm.rare <- t(1 - exp(cpm.rare))
        }
        pd.rare <- as.vector(cpm.rare %*% branch.lengths)
        ## Continue with se, if necessary
        if (se) {
            cat(paste("Compute se for sample", rownames(x)[i], ", may take some time"), sep = "\n")
            ## Variance of cpm, computed via a loop to optimize memory use
            centering <-  (1 - cpm.rare) %*% branch.lengths
            pd.rare.var <- rep(NA, length(n))
            for (index in seq_along(n)) {
                size <- n[index]
                if (replace) {
                    cpm.var.rare <- (1 - cpm.var.i/tot[i])^size
                } else {
                    ## use lchoose instead of choose for numeric stability
                    cpm.var.rare <- lchoose(tot[i] - cpm.var.i, size) - lchoose(tot[i], size)
                    cpm.var.rare <- exp(cpm.var.rare)
                }
                pd.var <- t(branch.lengths) %*% cpm.var.rare %*% branch.lengths - centering[index]^2
                pd.rare.var[index] <- pd.var
            }
            pd.rare <- data.frame(pd.rare = pd.rare, se = sqrt(pd.rare.var))
        }
        return(data.frame(pd.rare, Size = n, Sample = rownames(x)[i]))
    }

    if (parallel) {
        out <- mclapply(seq_len(nr), pdrare, mc.preschedule = FALSE)
    } else {
        out <- lapply(seq_len(nr), pdrare)
    }
    df <- do.call(rbind, out)

    ## Get sample data
    if (!is.null(sample_data(physeq, FALSE))) {
        sdf <- as(sample_data(physeq), "data.frame")
        sdf$Sample <- rownames(sdf)
        data <- merge(df, sdf, by = "Sample")
        labels <- data.frame(x = tot, y =  pd, Sample = rownames(x))
        labels <- merge(labels, sdf, by = "Sample")
    }

    ## Add, any custom-supplied plot-mapped variables
    if( length(color) > 1 ){
        data$color <- color
        names(data)[names(data)=="color"] <- deparse(substitute(color))
        color <- deparse(substitute(color))
    }
    if( length(label) > 1 ){
        labels$label <- label
        names(labels)[names(labels)=="label"] <- deparse(substitute(label))
        label <- deparse(substitute(label))
    }

    p <- ggplot(data = data, aes_string(x = "Size", y = "pd.rare", group = "Sample", color = color))
    if (log) {
        p <- p + scale_x_log10()
    }
    p <- p + labs(x = "Sample Size (# reads)", y = "Phylogenetic Diversity")
    if (!is.null(label)) {
        p <- p + geom_text(data = labels, aes_string(x = "x", y = "y", label = label, color = color),
                           size = 4, hjust = 0)
    }
    p <- p + geom_line()
    if (se) {
        p <- p + geom_ribbon(aes_string(ymin = "pd.rare - se", ymax = "pd.rare + se",
                                        color = NULL, fill = color), alpha = 0.2)
    }
    if (plot) {
        plot(p)
    }
    invisible(p)
}