R/Diversity.R

Defines functions plotDiversityTest plotDiversityCurve plotAbundanceCurve betaDiversity alphaDiversity helperTest helperBeta helperAlpha inferRarefiedDiversity calcDiversity estimateAbundance bootstrapAbundance countClones inferCompleteAbundance adjustObservedAbundance inferUnseenAbundance inferUnseenCount inferRarefiedCoverage calcChao1Coverage calcCoverage

Documented in alphaDiversity calcCoverage calcDiversity countClones estimateAbundance plotAbundanceCurve plotDiversityCurve plotDiversityTest

# Clonal diversity analysis

#' @include Classes.R
NULL

#### Coverage functions ####

#' Calculate sample coverage
#' 
#' \code{calcCoverage} calculates the sample coverage estimate, a measure of sample 
#' completeness, for varying orders using the method of Chao et al, 2015, falling back 
#' to the Chao1 method in the first order case.
#'
#' @param    x  numeric vector of abundance counts.
#' @param    r  coverage order to calculate.
#' 
#' @return   The sample coverage of the given order \code{r}.
#' 
#' @references
#' \enumerate{
#'   \item  Chao A. Nonparametric Estimation of the Number of Classes in a Population. 
#'            Scand J Stat. 1984 11, 265270.
#'   \item  Chao A, et al. Unveiling the species-rank abundance distribution by 
#'            generalizing the Good-Turing sample coverage theory. 
#'            Ecology. 2015 96, 11891201.
#' }
#' 
#' @seealso  
#' Used by \link{alphaDiversity}.
#'           
#' @examples
#' # Calculate clone sizes
#' clones <- countClones(ExampleDb, groups="sample_id")
#' 
#' # Calculate 1first order coverage for a single sample
#' calcCoverage(clones$seq_count[clones$sample_id == "+7d"])
#'
#' @export
calcCoverage <- function(x, r=1) {
    # Use traditional calculation for 1st order coverage
    if (r == 1) { return(calcChao1Coverage(x)) }
    
    # Use general form for 2nd order and higher coverage
    x <- x[x >= 1]
    n <- sum(x)
    fr <- sum(x == r)
    fs <- sum(x == r + 1)
    
    if (fr == 0) {
        stop("Cannot calculate coverage of order ", r, ". No abundance data with count=", r, ".")
    }
    if (fs == 0) {
        stop("Cannot calculate coverage of order ", r, ". No abundance data with count=", r + 1, ".")
    }
    
    a <- factorial(r)*fr / sum(x[x >= r]^r)
    b <- ((n - r)*fr / ((n - r)*fr + (r + 1)*fs))^r
    rC <- 1 - a*b
    
    return(rC)
}


# Calculate first order coverage
#
# @param    x  a numeric vector of species abundance as counts
#
# @returns  Coverage estimate.
calcChao1Coverage <- function(x) {
    x <- x[x >= 1]
    n <- sum(x)
    f1 <- sum(x == 1)
    f2 <- sum(x == 2)
    
    if (f2 > 0) {
        rC1 <- 1 - (f1 / n) * (((n - 1) * f1) / ((n - 1) * f1 + 2 * f2))
    } else {
        rC1 <- 1 - (f1 / n) * (((n - 1) * (f1 - 1)) / ((n - 1) * (f1 - 1) + 2))
    }
    
    return(rC1)
}


# Calculates diversity under rarefaction
# 
# Calculates Hill numbers under rarefaction
#
# @param    x  vector of observed abundance counts.
# @param    m  the sequence count to rarefy to.
#
# @return   The first order coverage estimate
inferRarefiedCoverage <- function(x, m) {
    x <- x[x >= 1]
    n <- sum(x)
    if (m > n) {
        stop("m must be <= the total count of observed sequences.")
    }
    
    # Unrarefied case
    if (m == n) {
        return(calcCoverage(x, r=1))
    }
    
    # Calculate rarefied coverage
    # TODO: Read up on this and fix
    #rC1 <- iNEXT:::Chat.Ind(x, m)
    y <- x[(n - x) >= m]
    rC1 <- 1 - sum(y/n * exp(lgamma(n - y + 1) - lgamma(n - m - y + 1) - lgamma(n) + lgamma(n - m)))
    
    return(rC1)
}


#### Abundance functions ####

# Calculate undetected species
# 
# Calculates the lower bound of undetected species counts using the Chao1 estimator.
#
# @param    x  vector of observed abundance counts.
# 
# @return   The count of undetected species.
inferUnseenCount <- function(x) {
    x <- x[x >= 1]
    n <- sum(x)
    f1 <- sum(x == 1)
    f2 <- sum(x == 2)
    
    if (f2 > 0) {
        f0 <- ceiling(((n - 1) * f1^2) / (n * 2 * f2))
    } else {
        f0 <- ceiling(((n - 1) * f1 * (f1 - 1)) / (n * 2))
    }
    
    return(f0)
}


# Define undetected species relative abundances
#
# @param    x  vector of detected species abundance counts.
# 
# @return   An adjusted detected species relative abundance distribution.
inferUnseenAbundance <- function(x) {
    x <- x[x >= 1]
    
    # Coverage
    rC1 <- calcCoverage(x, r=1)
    
    # Unseen count
    f0 <- inferUnseenCount(x)
    
    # Assign unseen relative abundance
    p <- rep((1 - rC1) / f0, f0)
    
    return(p)
}


# Adjustement to observed relative abundances
#
# @param    x  vector of observed abundance counts
#
# @return   An adjusted observed species relative abundance distribution.
adjustObservedAbundance <- function(x) {
    x <- x[x >= 1]
    n <- sum(x)
    
    # Coverage
    rC1 <- calcCoverage(x, r=1)
    
    # Calculate tuning parameter
    lambda <- (1 - rC1) / sum(x/n * exp(-x))
    
    # Define adjusted relative abundance
    p <- x/n * (1 -  lambda * exp(-x))
    
    return(p)
}


# Combined unseen inferrence and observed abundance adjustment
#
# @param    x  named vector of observed abundance counts by clone.
#
# @return   A vector containing the complete inferred abundance distribution.
#           Unseen species will be denote by a clone name starting with "U".
inferCompleteAbundance <- function(x) {
    # Infer complete abundance distribution
    p1 <- adjustObservedAbundance(x)
    p2 <- inferUnseenAbundance(x)
    names(p2) <- if (length(p2) > 0) { paste0("U", 1:length(p2)) } else { NULL }
    
    return(c(p1, p2))
}

#' Tabulates clones sizes
#' 
#' \code{countClones} determines the number of sequences and total copy number of 
#' clonal groups.
#'
#' @param    data    data.frame with columns containing clonal assignments.
#' @param    groups  character vector defining \code{data} columns containing grouping 
#'                   variables. If \code{groups=NULL}, then do not group data.
#' @param    copy    name of the \code{data} column containing copy numbers for each 
#'                   sequence. If this value is specified, then total copy abundance
#'                   is determined by the sum of copy numbers within each clonal group.
#' @param    clone   name of the \code{data} column containing clone identifiers.
#' @param    remove_na    removes rows with \code{NA} values in the clone column if \code{TRUE} and issues a warning. 
#'                        Otherwise, keeps those rows and considers \code{NA} as a clone in the final counts 
#'                        and relative abundances.
#' 
#' @return   A data.frame summarizing clone counts and frequencies with columns:
#'           \itemize{
#'             \item \code{clone_id}:    clone identifier. This is the default column
#'                                       name, specified with \code{clone='clone_id'}.
#'                                       If the function call uses Change-O 
#'                                       formatted data and \code{clone='CLONE'}, this
#'                                       column will have name \code{CLONE}.
#'             \item \code{seq_count}:   total number of sequences for the clone.
#'             \item \code{seq_freq}:    frequency of the clone as a fraction of the total
#'                                       number of sequences within each group.
#'             \item \code{copy_count}:  sum of the copy counts in the \code{copy} column.
#'                                       Only present if the \code{copy} argument is 
#'                                       specified.
#'             \item \code{copy_freq}:   frequency of the clone as a fraction of the total
#'                                       copy number within each group. Only present if 
#'                                       the \code{copy} argument is specified.
#'           }
#'           Also includes additional columns specified in the \code{groups} argument.
#' 
#' @examples
#' # Without copy numbers
#' clones <- countClones(ExampleDb, groups="sample_id")
#'
#' # With copy numbers and multiple groups
#' clones <- countClones(ExampleDb, groups=c("sample_id", "c_call"), copy="duplicate_count")
#' 
#' @export
countClones <- function(data, groups=NULL, copy=NULL, clone="clone_id", remove_na=TRUE) {
    # Check input
    check <- checkColumns(data, c(clone, copy, groups))
    if (check != TRUE) { 
        warning(check) # instead of throwing an error and potentially disrupting a workflow
    }

    # Handle NAs
    if (remove_na) {
        bool_na <- is.na(data[, clone])
        if (any(bool_na)) {
            if  (!all(bool_na)){
                msg <- paste0("NA(s) found in ", sum(bool_na), " row(s) of the ", clone, 
                            " column and excluded from tabulation")
                warning(msg)
            }
            data <- data[!bool_na, ]
        }
    }
    
    # Tabulate clonal abundance
    if (is.null(copy)) {
        clone_tab <- data %>% 
            group_by(!!!rlang::syms(c(groups, clone))) %>%
            dplyr::summarize(seq_count=n()) %>%
            dplyr::mutate(seq_freq=!!rlang::sym("seq_count")/sum(!!rlang::sym("seq_count"), na.rm=TRUE)) %>%
            dplyr::arrange(desc(!!rlang::sym("seq_count")))
    } else {
        clone_tab <- data %>% 
            group_by(!!!rlang::syms(c(groups, clone))) %>%
            dplyr::summarize(seq_count=length(.data[[clone]]),
                              copy_count=sum(.data[[copy]], na.rm=TRUE)) %>%
            dplyr::mutate(seq_freq=!!rlang::sym("seq_count")/sum(!!rlang::sym("seq_count"), na.rm=TRUE),
                          copy_freq=!!rlang::sym("copy_count")/sum(!!rlang::sym("copy_count"), na.rm=TRUE)) %>%
            dplyr::arrange(desc(!!rlang::sym("copy_count"))) 
    }
    return(clone_tab)
}


# Perform boostrap abundance calculation
# 
# @param    x       named vector of observed abundance values.
# @param    n       number of samples to draw from the estimate complete abundance distribution.
# @param    nboot   number of bootstrap realizations.
# @param    method  complete abundance inferrence method. 
#                   One of "before", "after" or "none" for complete abundance distribution
#                   inferrence before sampling, after sampling, or uncorrected, respectively.
# 
# @return   A matrix of bootstrap results.
bootstrapAbundance <- function(x, n, nboot=200, method="before") {
    ## DEBUG
    # x=abund_obs; method="before"
    # Check argumets
    method <- match.arg(method)
  
    if (method == "before") {
        # Calculate estimated complete abundance distribution
        p <- inferCompleteAbundance(x)
        # Bootstrap abundance
        boot_mat <- rmultinom(nboot, n, p) / n
    } else if (method == "after") {
        # Calculate estimated complete abundance distribution
        p <- x / sum(x, na.rm=TRUE)
        boot_sam <- rmultinom(nboot, n, p)
        boot_list <- apply(boot_sam, 2, inferCompleteAbundance)
        
        # Convert to matrix
        boot_names <- unique(unlist(sapply(boot_list, names)))
        boot_mat <- matrix(0, nrow=length(boot_names), ncol=nboot)
        rownames(boot_mat) <- boot_names
        for (i in 1:nboot) {
            boot_mat[names(boot_list[[i]]), i] <- boot_list[[i]]
        } 
    } else if (method == "none") {
        # Raw sampling of input
        p <- x / sum(x, na.rm=TRUE)
        boot_sam <- rmultinom(nboot, n, p)
    } else {
        stop("Invalid method: ", method)
    }
      
    return(boot_mat)
}

#' Estimates the complete clonal relative abundance distribution
#' 
#' \code{estimateAbundance} estimates the complete clonal relative abundance distribution 
#' and confidence intervals on clone sizes using bootstrapping.
#' 
#' @param    data      data.frame with Change-O style columns containing clonal assignments.
#' @param    clone     name of the \code{data} column containing clone identifiers.
#' @param    copy      name of the \code{data} column containing copy numbers for each 
#'                     sequence. If \code{copy=NULL} (the default), then clone abundance
#'                     is determined by the number of sequences. If a \code{copy} column
#'                     is specified, then clone abundances is determined by the sum of 
#'                     copy numbers within each clonal group.
#' @param    group     name of the \code{data} column containing group identifiers. 
#'                     If \code{NULL} then no grouping is performed and the \code{group} 
#'                     column of the output will contain the value \code{NA} for each row.
#' @param    min_n     minimum number of observations to sample.
#'                     A group with less observations than the minimum is excluded.
#' @param    max_n     maximum number of observations to sample. If \code{NULL} then no 
#'                     maximum is set.
#' @param    uniform   if \code{TRUE} then uniformly resample each group to the same 
#'                     number of observations. If \code{FALSE} then allow each group to
#'                     be resampled to its original size or, if specified, \code{max_size}.
#' @param    ci        confidence interval to calculate; the value must be between 0 and 1.
#' @param    nboot     number of bootstrap realizations to generate.
#' @param    progress  if \code{TRUE} show a progress bar. 
#' 
#' @return   A \link{AbundanceCurve} object summarizing the abundances.
#'           
#' @references
#' \enumerate{
#'   \item  Chao A. Nonparametric Estimation of the Number of Classes in a Population. 
#'            Scand J Stat. 1984 11, 265270.
#'   \item  Chao A, et al. Rarefaction and extrapolation with Hill numbers: 
#'            A framework for sampling and estimation in species diversity studies. 
#'            Ecol Monogr. 2014 84:45-67.
#'   \item  Chao A, et al. Unveiling the species-rank abundance distribution by 
#'            generalizing the Good-Turing sample coverage theory. 
#'            Ecology. 2015 96, 11891201.
#' }
#' 
#' @seealso  
#' See \link{plotAbundanceCurve} for plotting of the abundance distribution.
#' See \link{alphaDiversity} for a similar application to clonal diversity.
#'           
#' @examples
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#'
#' @export
estimateAbundance <- function(data, clone="clone_id", copy=NULL, group=NULL, 
                              min_n=30, max_n=NULL, uniform=TRUE, ci=0.95, nboot=200,
                              progress=FALSE) {
    
    # TODO:
    # Add alakazam style cellIdColumn=NULL, locusColumn="locus", locusValues=c("IGH")
    # similar to distToNearest
    # filter based on locusValues
    # if cellIdColumn
    #    for rows that have unique cell_id, ok
    #    if rows have cell_id not unique, count only once
    # if not cellIdColumn, count heavy chains (locusValues will be IGH)
    # If mixed bulk and sc do calculation but raise warning because different type of abundances
    
    ## DEBUG
    # data=ExampleDb; group="sample_id"; clone="clone_id"; copy=NULL; min_n=1; max_n=NULL; ci=0.95; uniform=F; nboot=100
    # copy="duplicate_count"
    # group=NULL

    # Hack for visibility of dplyr variables
    #. <- NULL
    
    # Check input
    if (!is.data.frame(data)) {
        stop("Input data is not a data.frame")
    }
    
    # Check columns that are reported are real columns (can be NULL)
    check <- checkColumns(data, c(clone, copy, group))
    if (check != TRUE) { stop(check) }
    
    # Set confidence interval
    ci_z <- ci + (1 - ci) / 2
    ci_x <- qnorm(ci_z)
    
    # Tabulate clonal abundance
    count_col <- if (!is.null(copy)) { "copy_count" } else { "seq_count" }
    clone_tab <- countClones(data, copy=copy, clone=clone, groups=group) %>%
        dplyr::mutate(clone_count=!!rlang::sym(count_col))

    # Tabulate group sizes
    if (!is.null(group)) {
        # Summarize groups
        group_tab <- clone_tab %>%
            group_by(!!rlang::sym(group)) %>%
            dplyr::summarize(count=sum(!!rlang::sym("clone_count"), na.rm=TRUE)) %>%
            rename(group=!!rlang::sym(group))
    } else {
        group_tab <- data.frame(v="All", count=sum(clone_tab$clone_count, na.rm=T))
        names(group_tab)[1] <- "group"
    }
    group_all <- as.character(group_tab$group)
    group_tab <- group_tab[group_tab$count >= min_n, ]
    group_keep <- as.character(group_tab$group)
    
    # Set number of sampled sequence
    if (uniform) {
        nsam <- min(group_tab$count, max_n)
        nsam <- setNames(rep(nsam, length(group_keep)), group_keep)
    } else {
        nsam <- if (is.null(max_n)) { group_tab$count } else { pmin(group_tab$count, max_n) }
        nsam <- setNames(nsam, group_keep)
    }
    
    # Warn if groups removed
    if (length(group_keep) < length(group_all)) {
        warning("Not all groups passed threshold min_n=", min_n, ".", 
                " Excluded: ", paste(setdiff(group_all, group_keep), collapse=", "))
    }
    
    # Generate abundance bootstrap
    if (progress) { 
        pb <- progressBar(length(group_keep))
    }
    boot_list <- list()
    abund_list <- list()
    for (g in group_keep) {
        n <- nsam[g]
        
        # Extract abundance vector
        if (!is.null(group)) {
            abund_obs <- clone_tab$clone_count[clone_tab[[group]] == g]
            names(abund_obs) <- clone_tab[[clone]][clone_tab[[group]] == g]
        } else {
            # Extract abundance vector
            abund_obs <- clone_tab$clone_count
            names(abund_obs) <- clone_tab[[clone]]
        } 
        
        # Infer complete abundance distribution
        boot_mat <- bootstrapAbundance(abund_obs, n, nboot=nboot, method="before")
        
        # Assign confidence intervals based on variance of bootstrap realizations
        p_mean <- apply(boot_mat, 1, mean)
        p_sd <- apply(boot_mat, 1, sd)
        p_err <- ci_x * p_sd
        p_lower <- pmax(p_mean - p_err, 0)
        p_upper <- p_mean + p_err
        
        # Assemble and sort abundance data.frame
	    abund_df <- tibble::tibble(!!clone := rownames(boot_mat), p=p_mean, p_sd=p_sd,
	                           lower=p_lower, upper=p_upper) %>%
	        dplyr::arrange(desc(!!rlang::sym("p"))) %>%
	        dplyr::mutate(rank=1:n())
			
        # Save summary
        abund_list[[g]] <- abund_df
        
        # Save bootstrap
        boot_list[[g]] <- as.data.frame(boot_mat) %>%
          tibble::rownames_to_column(clone)
        
        if (progress) { pb$tick() }
    }
    id_col <- "group"
    if (!is.null(group)) { id_col <- group }
    abundance_df <- as.data.frame(bind_rows(abund_list, .id=id_col))
    bootstrap_df <- as.data.frame(bind_rows(boot_list, .id=id_col))
    
    # Create a new diversity object with bootstrap
    abund_obj <- new("AbundanceCurve",
                     bootstrap=bootstrap_df, 
                     abundance=abundance_df,
                     clone_by=clone,
                     group_by=id_col,
                     #groups=if_else(is.null(group), as.character(NA), group_keep),
                     groups=group_keep,
                     n=nsam, 
                     nboot=nboot, 
                     ci=ci)
    
    return(abund_obj)
}

#### Diversity functions ####

#' Calculate the diversity index
#' 
#' \code{calcDiversity} calculates the clonal diversity index for a vector of diversity 
#' orders. 
#'
#' @param    p  numeric vector of clone (species) counts or proportions.
#' @param    q  numeric vector of diversity orders.
#' 
#' @return   A vector of diversity scores \eqn{D} for each \eqn{q}.
#' 
#' @details
#' This method, proposed by Hill (Hill, 1973), quantifies diversity as a smooth function 
#' (\eqn{D}) of a single parameter \eqn{q}. Special cases of the generalized diversity 
#' index correspond to the most popular diversity measures in ecology: species richness 
#' (\eqn{q = 0}), the exponential of the Shannon-Weiner index (\eqn{q} approaches \eqn{1}), the 
#' inverse of the Simpson index (\eqn{q = 2}), and the reciprocal abundance of the largest 
#' clone (\eqn{q} approaches \eqn{+\infty}). At \eqn{q = 0} different clones weight equally, 
#' regardless of their size. As the parameter \eqn{q} increase from \eqn{0} to \eqn{+\infty} 
#' the diversity index (\eqn{D}) depends less on rare clones and more on common (abundant) 
#' ones, thus encompassing a range of definitions that can be visualized as a single curve. 
#' 
#' Values of \eqn{q < 0} are valid, but are generally not meaningful. The value of \eqn{D} 
#' at \eqn{q=1} is estimated by \eqn{D} at \eqn{q=0.9999}. 
#'
#' @references
#' \enumerate{
#'   \item  Hill M. Diversity and evenness: a unifying notation and its consequences. 
#'            Ecology. 1973 54(2):427-32.
#' }
#' 
#' @seealso  Used by \link{alphaDiversity}.
#' 
#' @examples
#' # May define p as clonal member counts
#' p <- c(1, 1, 3, 10)
#' q <- c(0, 1, 2)
#' calcDiversity(p, q)
#'
#' # Or proportional abundance
#' p <- c(1/15, 1/15, 1/5, 2/3)
#' calcDiversity(p, q)
#' 
#' @export
calcDiversity <- function(p, q) {
    # Add jitter to q=1
    q[q == 1] <- 0.9999
    # Remove zeros
    p <- p[p > 0]
    # Convert p to proportional abundance
    p <- p / sum(p)
    # Calculate D for each q
    D <- sapply(q, function(x) sum(p^x)^(1 / (1 - x)))
    
    return(D)
}

# Calculate the inferred diversity index
# 
# \code{calcInferredDiversity} calculates the clonal diversity index for a vector of diversity 
# orders with a correction for the presence of unseen species. Does not take proportional abundance.
#
# @param    p  numeric vector of clone (species) counts.
# @param    q  numeric vector of diversity orders.
# 
# @return   A vector of diversity scores \eqn{D} for each \eqn{q}.
# 
# @details
# This method, proposed by Hill (Hill, 1973), quantifies diversity as a smooth function 
# (\eqn{D}) of a single parameter \eqn{q}. Special cases of the generalized diversity 
# index correspond to the most popular diversity measures in ecology: species richness 
# (\eqn{q = 0}), the exponential of the Shannon-Weiner index (\eqn{q} approaches \eqn{1}), the 
# inverse of the Simpson index (\eqn{q = 2}), and the reciprocal abundance of the largest 
# clone (\eqn{q} approaches \eqn{+\infty}). At \eqn{q = 0} different clones weight equally, 
# regardless of their size. As the parameter \eqn{q} increase from \eqn{0} to \eqn{+\infty} 
# the diversity index (\eqn{D}) depends less on rare clones and more on common (abundant) 
# ones, thus encompassing a range of definitions that can be visualized as a single curve. 
# 
# Values of \eqn{q < 0} are valid, but are generally not meaningful. The value of \eqn{D} 
# at \eqn{q=1} is estimated by \eqn{D} at \eqn{q=0.9999}. 
# 
# An adjusted detected species relative abundance distribution is applied before calculating diversity.
#
# @references
# \enumerate{
#   \item  Hill M. Diversity and evenness: a unifying notation and its consequences. 
#            Ecology. 1973 54(2):427-32.
# }
# 
# @seealso  Used by \link{alphaDiversity}
# 
# @examples
# # May define p as clonal member counts
# p <- c(1, 1, 3, 10)
# q <- c(0, 1, 2)
# calcInferredDiversity(p, q)
#
# 
# @export
# calcInferredDiversity <- function(p, q) {
#    # Correct abundance
#     .infer <- function(y) {
#         # Infer complete abundance distribution
#         p1 <- adjustObservedAbundance(y)
#         p2 <- inferUnseenAbundance(y)
#         names(p2) <- if (length(p2) > 0) { paste0("U", 1:length(p2)) } else { NULL }
#         return(c(p1, p2))
#     }
#    
#    # Correct abundance
#    p <- .infer(p)
#     # Add jitter to q=1
#     q[q == 1] <- 0.9999
#     # Remove zeros
#     p <- p[p > 0]
#     # Convert p to proportional abundance
#     p <- p / sum(p)
#     # Calculate D for each q
#     D <- sapply(q, function(x) sum(p^x)^(1 / (1 - x)))
#     
#     return(D)
# }


# Calculates diversity under rarefaction
# 
# Calculates Hill numbers under rarefaction
#
# @param    x  vector of observed abundance counts.
# @param    q  numeric vector of diversity orders.
# @param    m  the sequence count to rarefy to.
#
# @return   A vector of diversity scores \eqn{D} for each \eqn{q}.
inferRarefiedDiversity <- function(x, q, m) {
    x <- x[x >= 1]
    n <- sum(x)
    if (m > n) {
        stop("m must be <= the total count of observed sequences.")
    }
    q[q == 1] <- 0.9999
    
    # Tabulate frequency counts from 1:n
    fk_n <- tabulate(x, nbins=n)
    
    # Calculate estimated fk(m)
    fk_m <- sapply(1:m, function(k) sum(exp(lchoose(k:m, k) + 
                                            lchoose(n - k:m, m - k) - 
                                            lchoose(n, m))*fk_n[k:m]))
    
    # Calculate diversity
    D <- sapply(q, function(r) sum((1:m / m)^r * fk_m)^(1 / (1 - r)))
    
    return(D)
}


# Helper function for computing alpha diversity from bootrstrap outputs
#
# \code{helperAlpha} divides a set of bootstrapped clones by group annotation,
# and computes the diversity of each set. 
#
# @param    boot_output  data.frame from\link{AbundanceCurve} object containing bootstrapped clonal 
#                        abundance curves.
# @param    q            vector of Hill Diversity indices to test for diversity calculations.
# @param    clone        name of the \code{boot_output} column containing clone identifiers.
# @param    group        name of the \code{boot_output} column containing grouping information for 
#                        diversity calculation.
#
# @return   data.frame containing diversity calculations for each bootstrap iteration.
helperAlpha <- function(boot_output, q, clone="clone_id", group=NULL) {
    ## DEBUG
    # abundance <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
    # clone <- abundance@clone_by
    # group <- abundance@group_by
  
    # Compute diversity from a column of each bootstrap
    output <- boot_output %>% 
        dplyr::ungroup() %>%
        dplyr::select(-one_of(c(clone, group))) %>%
        as.matrix() %>% 
        apply(2, calcDiversity, q=q) %>%
        data.frame() %>% 
        mutate(q=q)

    return(output)
}


# Helper function for computing beta diversity from bootrstrap outputs
#
# \code{helperBeta} divides a set of bootstrapped clones by group annotation,
# and computes the alpha diversity. Group annotations are then ignored and 
# gamma diversity is computed. A multiplicative beta diversity is used corresponding
# to the gamma diversity divided by the average alpha diversity of each group.
#
# @param    boot_output   data.frame from\link{AbundanceCurve} object containing bootstrapped clonal abundance curves.
# @param    q             vector of Hill Diversity indices to test for diversity calculations.
# @param    ci_z          numeric value corresponding to confidence interval for calculating beta diversity.
# @param    clone         name of the \code{boot_output} column containing clone identifiers.
# @param    group         name of the \code{boot_output} column containing grouping information for diversity 
#                         calculation.
#
# @return   data.frame containing diversity calculations for each bootstrap iteration.
helperBeta <- function(boot_output, q, ci_x, clone="clone_id", group="group") { 
    # Hack for visibility of dplyr variables
    . <- NULL
        
    # Compute gamma diversity metrics
    gamma <- boot_output %>%
        dplyr::group_by(!!rlang::sym(clone)) %>%
        dplyr::select(-one_of(c(group))) %>%
        dplyr::summarize_all(sum) %>%
        dplyr::do(helperAlpha(., q=q, clone=clone)) %>%
        tidyr::gather(key="n", value="gamma", -!!rlang::sym("q")) %>%
        dplyr::mutate(gamma=as.numeric(!!rlang::sym("gamma")))

    # Compute alpha diversity metrics
    alpha <- boot_output %>%
        dplyr::group_by(!!rlang::sym(group)) %>%
        dplyr::do(helperAlpha(., q=q, clone=clone, group=group)) %>%
        dplyr::group_by(!!rlang::sym("q")) %>%
        dplyr::select(-one_of(c(group))) %>%
        dplyr::summarize_all(mean) %>%
        tidyr::gather(key="n", value="alpha", -!!rlang::sym("q")) %>%
        dplyr::mutate(alpha=as.numeric(!!rlang::sym("alpha")))

    # Perform comparisons of alpha and gamma to extract beta
    beta <- bind_cols(gamma, alpha) %>%
        dplyr::group_by(!!rlang::sym("q")) %>%
        dplyr::mutate(X=!!rlang::sym("gamma") / !!rlang::sym("alpha")) %>%
        dplyr::summarize(d=mean(!!rlang::sym("X"), na.rm=TRUE),
                         d_sd=sd(!!rlang::sym("X"), na.rm=TRUE)) %>%
        dplyr::mutate(d_lower=pmax(!!rlang::sym("d") - !!rlang::sym("d_sd") * ci_x, 0), 
                      d_upper=!!rlang::sym("d") + !!rlang::sym("d_sd") * ci_x)

    return(beta)
}

# Helper function for computing statistical significance
#
# \code{helperTest} computes the pairwise statistical significance of differences
# in bootstrapped diversity values between two sets defined by the group column. 
# A p-value is computed using the ECDF distribution as the frequency of bootstrap iterations
# for which no difference is observed. 
#
# @param    div_df  data.frame from\link{DiversityCurve} object containing bootstrapped 
#                   diversity curves.
# @param    group   name of the \code{boot_output} column containing grouping information 
#                   for diversity calculation.
# @param    q       vector of Hill Diversity indices to test for diversity calculations.
#
# @return   data.frame containing test results for each value of q.
helperTest <- function(div_df, q, group="group") {
    # Hack for visibility of dplyr variables
    #. <- NULL
    
    # Pairwise test
    group_pairs <- combn(unique(div_df[[group]]), 2, simplify=F)
    pvalue_list <- list()
    for (group_pair in group_pairs) {
        pair_list <- list()
        for(q_i in q) {
            
            # Currently just testing for one diversity order
            mat1 <- div_df %>%
                dplyr::filter(!!rlang::sym(group) == group_pair[1], !!rlang::sym("q") == q_i) %>%
                dplyr::select(-one_of(c(group, "q"))) %>% unlist()
            mat2 <- div_df %>%
                dplyr::filter(!!rlang::sym(group) == group_pair[2], !!rlang::sym("q") == q_i) %>%
                dplyr::select(-one_of(c(group, "q"))) %>% unlist()

            if (mean(mat1) >= mean(mat2)) { 
                g_delta <- mat1 - mat2 
            } else { 
                g_delta <- mat2 - mat1 
            }  

            # Compute p-value from ecdf
            p <- ecdf(g_delta)(0)
            p <- ifelse(p <= 0.5, p * 2, (1 - p) * 2)
            
            pair_list[[as.character(q_i)]] <- list(delta_mean=mean(g_delta), 
                                                   delta_sd=sd(g_delta), 
                                                   pvalue=p)
        }
        pvalue_list[[paste(group_pair, collapse=" != ")]] <- bind_rows(pair_list, .id="q")

    }
    test_df <- bind_rows(pvalue_list, .id="test")

    return(test_df)
}


#' Calculate clonal alpha diversity
#'
#' \code{alphaDiversity} takes in a data.frame or \link{AbundanceCurve} and computes
#' diversity scores (\eqn{D}) over an interval of diversity orders (\eqn{q}).
#' 
#' @param    data      data.frame with Change-O style columns containing clonal assignments or
#'                     a \link{AbundanceCurve} generate by \link{estimateAbundance} object 
#'                     containing a previously calculated bootstrap distributions of clonal abundance.
#' @param    min_q     minimum value of \eqn{q}.
#' @param    max_q     maximum value of \eqn{q}.
#' @param    step_q    value by which to increment \eqn{q}.
#' @param    ci        confidence interval to calculate; the value must be between 0 and 1.
#' @param    ...       additional arguments to pass to \link{estimateAbundance}. Additional arguments
#'                     are ignored if a \link{AbundanceCurve} is provided as input.
#' 
#' @return   A \link{DiversityCurve} object summarizing the diversity scores.
#' 
#' @references
#' \enumerate{
#'   \item  Hill M. Diversity and evenness: a unifying notation and its consequences. 
#'            Ecology. 1973 54(2):427-32.
#'   \item  Chao A. Nonparametric Estimation of the Number of Classes in a Population. 
#'            Scand J Stat. 1984 11, 265270.
#'   \item  Chao A, et al. Rarefaction and extrapolation with Hill numbers: 
#'            A framework for sampling and estimation in species diversity studies. 
#'            Ecol Monogr. 2014 84:45-67.
#'   \item  Chao A, et al. Unveiling the species-rank abundance distribution by 
#'            generalizing the Good-Turing sample coverage theory. 
#'            Ecology. 2015 96, 11891201.
#' }
#'  
#' @seealso  See \link{calcDiversity} for the basic calculation and 
#'           \link{DiversityCurve} for the return object. 
#'           See \link{plotDiversityCurve} for plotting the return object.
#' 
#' @details
#' Clonal diversity is calculated using the generalized diversity index (Hill numbers) 
#' proposed by Hill (Hill, 1973). See \link{calcDiversity} for further details.
#'
#' To generate a smooth curve, \eqn{D} is calculated for each value of \eqn{q} from
#' \code{min_q} to \code{max_q} incremented by \code{step_q}.  When \code{uniform=TRUE}
#' variability in total sequence counts across unique values in the \code{group} column 
#' is corrected by repeated resampling from the estimated complete clonal distribution to a 
#' common number of sequences. The complete clonal abundance distribution that is resampled 
#' from is inferred by using the Chao1 estimator to infer the number of unseen clones, 
#' followed by applying the relative abundance correction and unseen clone frequencies 
#' described in Chao et al, 2015.
#' 
#' The diversity index (\eqn{D}) for each group is the mean value of over all resampling 
#' realizations. Confidence intervals are derived using the standard deviation of the 
#' resampling realizations, as described in Chao et al, 2015.
#'
#' Significance of the difference in diversity index (\code{D}) between groups is tested by 
#' constructing a bootstrap delta distribution for each pair of unique values in the 
#' \code{group} column. The bootstrap delta distribution is built by subtracting the diversity 
#' index \code{Da} in group \code{a} from the corresponding value \eqn{Db} in group \code{b}, 
#' for all bootstrap realizations, yielding a distribution of \code{nboot} total deltas; where 
#' group \code{a} is the group with the greater mean \code{D}. The p-value for hypothesis 
#' \code{Da  !=  Db} is the value of \code{P(0)} from the empirical cumulative distribution 
#' function of the bootstrap delta distribution, multiplied by 2 for the two-tailed correction.
#' 
#' Note, this method may inflate statistical significance when clone sizes are uniformly small,
#' such as when most clones sizes are 1, sample size is small, and \code{max_n} is near
#' the total count of the smallest data group. Use caution when interpreting the results 
#' in such cases.
#'
#' @examples
#' # Group by sample identifier in two steps
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#' div <- alphaDiversity(abund, step_q=1, max_q=10)
#' plotDiversityCurve(div, legend_title="Sample")
#'                    
#' # Grouping by isotype rather than sample identifier in one step
#' div <- alphaDiversity(ExampleDb, group="c_call", min_n=40, step_q=1, max_q=10, 
#'                       nboot=100)
#' plotDiversityCurve(div, legend_title="Isotype")
#'
#' @export
alphaDiversity <- function(data, min_q=0, max_q=4, step_q=0.1, ci=0.95, ...) {
    # Hack for visibility of dplyr variables
    . <- NULL
    
    # Check input object and call estimateAbundance if required
    if (is(data, "AbundanceCurve")) {
        abundance <- data
    } else if (is(data, "data.frame")) {
        abundance <- estimateAbundance(data, ci=0.95, ...)
    } else {
        stop("Input must be either a data.frame or AbundanceCurve object.")
    }
    
    # Set diversity orders and confidence interval
    ci_z <- ci + (1 - ci) / 2
    ci_x <- qnorm(ci_z)
    q <- seq(min_q, max_q, step_q)
    if (!(0 %in% q)) { q <- c(0, q) }
    
    # Set grouping variables
    clone <- abundance@clone_by
    group <- abundance@group_by
        
    # Compute diversity metric for bootstrap instances
    boot_df <- abundance@bootstrap %>%
        dplyr::group_by(!!rlang::sym(group)) %>%
        dplyr::do(helperAlpha(., q=q, clone=clone, group=group)) %>%
        dplyr::ungroup()
    
    # Summarize diversity
    div_df <- boot_df %>%
        tidyr::gather(key="n", value="X", -one_of(c(group, "q"))) %>%
        dplyr::mutate(X=as.numeric(!!rlang::sym("X"))) %>%
        dplyr::group_by(!!!rlang::syms(c(group, "q"))) %>%
        dplyr::summarize(d=mean(!!rlang::sym("X"), na.rm=TRUE),
                         d_sd=sd(!!rlang::sym("X"), na.rm=TRUE)) %>%
        dplyr::mutate(d_lower=pmax(!!rlang::sym("d") - !!rlang::sym("d_sd") * ci_x, 0), 
                      d_upper=!!rlang::sym("d") + !!rlang::sym("d_sd") * ci_x)
    
    # Compute evenness
    div_qi <- div_df %>%
        filter(!!rlang::sym("q") == 0) %>%
        select(one_of(c(group, "d")))
    div_df <- div_df %>%
        dplyr::right_join(div_qi, by=group, suffix=c("", "_0")) %>%
        mutate(e=!!rlang::sym("d")/!!rlang::sym("d_0"), 
               e_lower=!!rlang::sym("d_lower")/!!rlang::sym("d_0"), 
               e_upper=!!rlang::sym("d_upper")/!!rlang::sym("d_0")) %>%
        select(-!!rlang::sym("d_0"))
    
    # Test
    if (length(abundance@groups) > 1) {
        test_df <- helperTest(boot_df, q=q, group=group)
    } else {
        test_df <- NULL
    }

    # Build return object
    group_set <- unique(div_df[[group]])
    div_obj <- new("DiversityCurve",
                   diversity=div_df, 
                   tests=test_df,
                   method="alpha",
                   group_by=group,
                   groups=group_set,
                   q=q,  
                   n=abundance@n,
                   ci=ci)
        
   return(div_obj) 
}

# Calculates the pairwise beta diversity
# 
# \code{betaDiversity} takes in a data.frame or \link{AbundanceCurve} and computes
# the multiplicative beta diversity across a range of Hill diversity indices.
# 
# @param    data         data.frame with Change-O style columns containing clonal assignments or
#                        an \link{AbundanceCurve} object generate by \link{estimateAbundance}.
#                        containing a previously calculated bootstrap distributions of clonal abundance.
# @param    comparisons  named list of comparisons between group members for computing beta diversity.
# @param    min_q        minimum value of \eqn{q}.
# @param    max_q        maximum value of \eqn{q}.
# @param    step_q       value by which to increment \eqn{q}.
# @param    ci           confidence interval to calculate; the value must be between 0 and 1.
# @param    ...          additional arguments to pass to \link{estimateAbundance}. Additional arguments
#                        are ignored if a \link{AbundanceCurve} is provided as input.
# 
# @return   A \link{DiversityCurve} object summarizing the diversity scores.
# 
# @details
# Beta diversity or the comparative difference between two samples as quantified using Hill
# diversity indices proposed by Jost (Jost, 2007).
# 
# Briefly, the alpha and gamma diversity components are calculated for each comparison.
# Alpha diversity is calculated as the average hill diversity across each independent sample
# while Gamma diversity is calculated as the total diversity without distinguishing between
# samples. Beta diversity is computed as Gamma/Alpha.
# 
# Diversity is calculated on the estimated clonal abundance distribution with a correction
# for unseen species much like the calculation for alpha diversity \link{alphaDiversity}.
# A smooth curve is generated in the same manner as in \link{alphaDiversity}.
# Confidence intervals are derived using the standard deviation of the resampling realizations.
# 
# \enumerate{
#   \item  Hill M. Diversity and evenness: a unifying notation and its consequences.
#            Ecology. 1973 54(2):427-32.
#   \item  Jost L. Partitioning Diversity Into Independent Alpha and Beta Components.
#            Ecology. 2007 88(10):2427–2439.
#   \item  Jost L, et al. Partitioning diversity for conservation analyses.
#            Diversity Distrib. 2010 16(1):65–76
# }
# 
# @examples
# div <- betaDiversity(ExampleDb, comparisons=list("TIME"=c("-1h", "+7d")), group="sample_id",
#                      min_n=40, step_q=1, max_q=10, nboot=100)
# 
# plotDiversityCurve(div, legend_title="Isotype")
# 
# @export
betaDiversity <- function(data, comparisons, min_q=0, max_q=4, step_q=0.1, ci=0.95, ...) {
    # Hack for visibility of dplyr variables
    . <- NULL
    
    if (!is.list(comparisons) || is.null(names(comparisons))) {
      stop("'comparisons' must be a named list")
    }
    
    # Check input object and call estimateAbundance if required
    if (is(data, "AbundanceCurve")) {
        abundance <- data
    } else if (is(data, "data.frame")) {
        abundance <- estimateAbundance(data, ci=0.95, ...)
    } else {
        stop("Input must be either a data.frame or AbundanceCurve object.")
    }
        
    # Set diversity orders and confidence interval
    ci_z <- ci + (1 - ci) / 2
    ci_x <- qnorm(ci_z)
    q <- seq(min_q, max_q, step_q)
    if (!(0 %in% q)) { q <- c(0, q) }
        
    # Compute pairwise beta diversity for bootstrap instances
    beta_diversity_list <- list()

    for (comparison in names(comparisons)) {
        beta_diversity_list[[comparison]] <- abundance@bootstrap %>%
            dplyr::ungroup() %>%
            dplyr::filter(.[[abundance@group_by]] %in% comparisons[[comparison]]) %>%
            dplyr::do(helperBeta(., q=q, clone=abundance@clone_by, group=abundance@group_by, ci_x=ci_x))
    }

    # Generate summary diversity output
    div_df <- bind_rows(beta_diversity_list, .id = "comparison")
    
    # Beta groups
    group_set <- unique(div_df[["comparison"]])
    
    # Compute evenness
    div_qi <- div_df %>%
        filter(!!rlang::sym("q") == 0) %>%
        select(one_of(c("comparison", "D")))

    div <- div_df %>%
        right_join(div_qi, by = "comparison", suffix = c("", "_0")) %>%
        mutate(d = !!rlang::sym("d")/!!rlang::sym("d_0"), 
            e_lower = !!rlang::sym("d_lower")/!!rlang::sym("d_0"), 
            e_upper = !!rlang::sym("d_upper")/!!rlang::sym("d_0")) %>%
        select(-!!rlang::sym("d_0"))
        
    # Test
    if (length(group_set) > 1) {
       test_df <- helperTest(div_df, q=q, group="comparison")
    } else {
       test_df <- NULL
    }
    
    # Build return object
    div_obj <- new("DiversityCurve",
                   diversity=div, 
                   tests=test_df,
                   method="beta",
                   group_by="comparison",
                   groups=group_set,
                   n=abundance@n,
                   q=q,  
                   ci=ci)
        
    return(div_obj)
}


#### Plotting functions ####

#' Plot a clonal abundance distribution
#' 
#' \code{plotAbundanceCurve} plots the results from estimating the complete clonal 
#' relative abundance distribution. The distribution is plotted as a log rank abundance 
#' distribution.
#' 
#' @param    data          \link{AbundanceCurve} object returned by \link{estimateAbundance}.
#' @param    colors        named character vector whose names are values in the 
#'                         \code{group} column of \code{data} and whose values are 
#'                         colors to assign to those group names.
#' @param    main_title    string specifying the plot title.
#' @param    legend_title  string specifying the legend title.
#' @param    xlim          numeric vector of two values specifying the 
#'                         \code{c(lower, upper)} x-axis limits. The lower x-axis 
#'                         value must be >=1.
#' @param    ylim          numeric vector of two values specifying the 
#'                         \code{c(lower, upper)} y-axis limits. The limits on the 
#'                         abundance values are expressed as fractions of 1: use
#'                         c(0,1) to set the lower and upper limits to 0\% and 100\%.
#' @param    annotate      string defining whether to added values to the group labels 
#'                         of the legend. When \code{"none"} (default) is specified no
#'                         annotations are added. Specifying (\code{"depth"}) adds 
#'                         sequence counts to the labels.
#' @param    silent        if \code{TRUE} do not draw the plot and just return the ggplot2 
#'                         object; if \code{FALSE} draw the plot.
#' @param    ...           additional arguments to pass to ggplot2::theme.
#' 
#' @return   A \code{ggplot} object defining the plot.
#' 
#' @seealso  
#' See \link{AbundanceCurve} for the input object and \link{estimateAbundance} for
#' generating the input abundance distribution. Plotting is performed with \link{ggplot}.
#'           
#' @examples
#' # Estimate abundance by sample and plot
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#' plotAbundanceCurve(abund, legend_title="Sample")
#' 
#' @export
plotAbundanceCurve <- function(data, colors=NULL, main_title="Rank Abundance", 
                               legend_title=NULL, xlim=NULL, ylim=NULL, 
                               annotate=c("none", "depth"),
                               silent=FALSE, ...) {
    
    # Check if abundance is in data
    if (is.null(data@abundance)) { stop("Missing abundance data.") }
    
    # Validate abundance limits
    if (!is.null(xlim)) {
        if (xlim[1]<1) {
            stop("The lower x-axis xlim value must be >=1.")
        }
        max_xlim <- max(data@abundance$rank, na.rm = T)
        if (xlim[2]>max_xlim) {
            message("The largest x-axis value is ",max_xlim,".")
        }
    }    

    # Check arguments
    annotate <- match.arg(annotate)
    
    # Define group label annotations
    if (all(is.na(data@groups)) || length(data@groups) == 1) {
        group_labels <- NA  
    } else if (annotate == "none") {
        group_labels <- setNames(data@groups, data@groups)
    } else if (annotate == "depth") {
        group_labels <- setNames(paste0(data@groups, " (N=", data@n, ")"),
                                 data@groups)
    }

    # Stupid hack for check NOTE about `.x` in math_format
    .x <- NULL
    
    if (!all(is.na(group_labels))) {
        # Define grouped plot
        p1 <- ggplot(data@abundance, aes(x=!!rlang::sym("rank"), 
                                         y=!!rlang::sym("p"), 
                                         group=!!rlang::sym(data@group_by))) + 
            ggtitle(main_title) + 
            baseTheme() + 
            xlab("Rank") +
            ylab("Abundance") +
            scale_x_log10(
                          breaks=scales::trans_breaks("log10", function(x) 10^x),
                          labels=scales::trans_format("log10", scales::math_format(10^.x))) +
            scale_y_continuous(labels=scales::percent) +
            geom_ribbon(aes(ymin=!!rlang::sym("lower"), 
                            ymax=!!rlang::sym("upper"), 
                            fill=!!rlang::sym(data@group_by)), alpha=0.4) +
            geom_line(aes(color=!!rlang::sym(data@group_by)))
        
        # Set colors and legend
        if (!is.null(colors)) {
            p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors) +
                scale_fill_manual(name=legend_title, labels=group_labels, values=colors)
        } else {
            p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels) +
                scale_fill_discrete(name=legend_title, labels=group_labels)
        }
    } else {
        # Set color
        if (!is.null(colors) & length(colors) == 1) {
            line_color <- colors
        } else {
            line_color <- "black"
        }
        # Define plot
        p1 <- ggplot(data@abundance, aes(x=!!rlang::sym("rank"), 
                                         y=!!rlang::sym("p"))) + 
            ggtitle(main_title) + 
            baseTheme() + 
            xlab("Rank") +
            ylab("Abundance") +
            scale_x_log10(
                          breaks=scales::trans_breaks("log10", function(x) 10^x),
                          labels=scales::trans_format("log10", scales::math_format(10^.x))) +
            scale_y_continuous(labels=scales::percent) +
            geom_ribbon(aes(ymin=!!rlang::sym("lower"), 
                            ymax=!!rlang::sym("upper")), 
                        fill=line_color, alpha=0.4) +
            geom_line(color=line_color)
    }
    # Add additional theme elements
    p1 <- p1 + 
        coord_cartesian(xlim=xlim,ylim=ylim) +
        do.call(theme, list(...))
    
    # Plot
    if (!silent) { plot(p1) }
    
    invisible(p1)
}


#' Plot the results of alphaDiversity
#' 
#' \code{plotDiversityCurve} plots a \code{DiversityCurve} object.
#'
#' @param    data            \link{DiversityCurve} object returned by 
#'                           \link{alphaDiversity}.
#' @param    colors          named character vector whose names are values in the 
#'                           \code{group} column of the \code{data} slot of \code{data},
#'                           and whose values are colors to assign to those group names.
#' @param    main_title      string specifying the plot title.
#' @param    legend_title    string specifying the legend title.
#' @param    log_x           if \code{TRUE} then plot \eqn{q} on a log scale;
#'                           if \code{FALSE} plot on a linear scale.
#' @param    log_y           if \code{TRUE} then plot the diversity/evenness scores 
#'                           on a log scale; if \code{FALSE} plot on a linear scale.
#' @param    xlim            numeric vector of two values specifying the 
#'                           \code{c(lower, upper)} x-axis limits.
#' @param    ylim            numeric vector of two values specifying the 
#'                           \code{c(lower, upper)} y-axis limits.
#' @param    annotate        string defining whether to added values to the group labels 
#'                           of the legend. When \code{"none"} (default) is specified no
#'                           annotations are added. Specifying (\code{"depth"}) adds 
#'                           sequence counts to the labels.
#' @param    score           one of \code{"diversity"} or \code{"evenness"} specifying which
#'                           score to plot on the y-asis.
#' @param    silent          if \code{TRUE} do not draw the plot and just return the ggplot2 
#'                           object; if \code{FALSE} draw the plot.
#' @param    ...             additional arguments to pass to ggplot2::theme.
#'
#' @return   A \code{ggplot} object defining the plot.
#' 
#' @seealso  See \link{alphaDiversity} and \link{alphaDiversity} for generating 
#'           \link{DiversityCurve} objects for input. Plotting is performed with \link{ggplot}.
#' 
#' @examples
#' # Calculate diversity
#' div <- alphaDiversity(ExampleDb, group="sample_id", nboot=100)
#' 
#' # Plot diversity
#' plotDiversityCurve(div, legend_title="Sample")
#'
#' # Plot diversity
#' plotDiversityCurve(div, legend_title="Sample", score="evenness")
#' 
#' @export
plotDiversityCurve <- function(data, colors=NULL, main_title="Diversity", 
                               legend_title="Group", log_x=FALSE, log_y=FALSE,
                               xlim=NULL, ylim=NULL, annotate=c("none", "depth"), 
                               score=c("diversity", "evenness"),
                               silent=FALSE, ...) {
    # Check arguments
    annotate <- match.arg(annotate)
    score <- match.arg(score)
    
    # Define group label annotations
    if (all(is.na(data@groups)) || length(data@groups) == 1) {
        group_labels <- NA  
    } else if (annotate == "none") {
        group_labels <- setNames(data@groups, data@groups)
    } else if (annotate == "depth") {
        group_labels <- setNames(paste0(data@groups, " (n=", data@n, ")"),
                                 data@groups)
    }
    
    # Define y-axis scores
    if (score == "diversity") {
        y_value <- "d"
        y_min <- "d_lower"
        y_max <- "d_upper"
        y_label <- expression(''^q * D)
    } else if (score == "evenness") {
        y_value <- "e"
        y_min <- "e_lower"
        y_max <- "e_upper"
        y_label <- expression(''^q * e)
    }
    
    # Stupid hack for check NOTE about `.x` in math_format
    .x <- NULL
    
    if (!all(is.na(group_labels))) {
        # Define grouped plot
        p1 <- ggplot(data@diversity, aes(x=q, y=!!rlang::sym(y_value), 
                                         group=!!rlang::sym(data@group_by))) + 
            ggtitle(main_title) + 
            baseTheme() + 
            xlab('q') +
            ylab(y_label) +
            geom_ribbon(aes(ymin=!!rlang::sym(y_min), ymax=!!rlang::sym(y_max), 
                            fill=!!rlang::sym(data@group_by)), alpha=0.4) +
            geom_line(aes(color=!!rlang::sym(data@group_by)))
    
        # Set colors and legend
        if (!is.null(colors)) {
            p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors) +
                scale_fill_manual(name=legend_title, labels=group_labels, values=colors)
        } else {
            p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels) +
                scale_fill_discrete(name=legend_title, labels=group_labels)
        }
    } else {
        # Set color
        if (!is.null(colors) & length(colors) == 1) {
          line_color <- colors
        } else {
          line_color <- "black"
        }
      
        # Define ungrouped plot
        p1 <- ggplot(data@diversity, aes(x=q, y=!!rlang::sym(y_value))) + 
            ggtitle(main_title) + 
            baseTheme() + 
            xlab('q') +
            ylab(y_label) +
            geom_ribbon(aes(ymin=!!rlang::sym(y_min), ymax=!!rlang::sym(y_max)), fill=line_color, alpha=0.4) +
            geom_line(color=line_color)
    }
    
    # Set x-axis style
    if (log_x) {
        p1 <- p1 + scale_x_continuous(trans=scales::log2_trans(),
                                      breaks=scales::trans_breaks('log2', function(x) 2^x),
                                      labels=scales::trans_format('log2', scales::math_format(2^.x)))
    } else {
        p1 <- p1 + scale_x_continuous()
    }
    
    # Set y-axis style
    if (log_y) {
        p1 <- p1 + scale_y_continuous(trans=scales::log2_trans(),
                                      breaks=scales::trans_breaks('log2', function(x) 2^x),
                                      labels=scales::trans_format('log2', scales::math_format(2^.x)))
    } else {
        p1 <- p1 + scale_y_continuous()
    }
    
    # Add additional theme elements
    p1 <- p1 + 
        coord_cartesian(xlim=xlim,ylim=ylim) +
        do.call(theme, list(...))

    # Plot
    if (!silent) { plot(p1) }
    
    invisible(p1)
}


#' Plot the results of diversity testing
#' 
#' \code{plotDiversityTest} plots summary data for a \code{DiversityCurve} object 
#' with mean and a line range indicating plus/minus one standard deviation.
#'
#' @param    data            \link{DiversityCurve} object returned by 
#'                           \link{alphaDiversity}.
#' @param    q               diversity order to plot the test for.
#' @param    colors          named character vector whose names are values in the 
#'                           \code{group} column of the \code{data} slot of \code{data},
#'                           and whose values are colors to assign to those group names.
#' @param    main_title      string specifying the plot title.
#' @param    legend_title    string specifying the legend title.
#' @param    log_d           if \code{TRUE} then plot the diversity scores \eqn{D} 
#'                           on a log scale; if \code{FALSE} plot on a linear scale.
#' @param    annotate        string defining whether to added values to the group labels 
#'                           of the legend. When \code{"none"} (default) is specified no
#'                           annotations are added. Specifying (\code{"depth"}) adds 
#'                           sequence counts to the labels.
#' @param    silent          if \code{TRUE} do not draw the plot and just return the ggplot2 
#'                           object; if \code{FALSE} draw the plot.
#' @param    ...             additional arguments to pass to ggplot2::theme.
#'
#' @return   A \code{ggplot} object defining the plot.
#' 
#' @seealso  See \link{alphaDiversity} for generating input.
#'           Plotting is performed with \link{ggplot}.
#' 
#' @examples
#' # Calculate diversity
#' div <- alphaDiversity(ExampleDb, group="sample_id", min_q=0, max_q=2, step_q=1, nboot=100)
#' 
#' # Plot results at q=0 (equivalent to species richness)
#' plotDiversityTest(div, 0, legend_title="Sample")
#' 
#' # Plot results at q=2 (equivalent to Simpson's index)
#' plotDiversityTest(div, q=2, legend_title="Sample")
#' 
#' @export
plotDiversityTest <- function(data, q, colors=NULL, main_title="Diversity", legend_title="Group", 
                              log_d=FALSE, annotate=c("none", "depth"), silent=FALSE, ...) {
    # Stupid hack for check NOTE about `.x` in math_format
    .x <- NULL
    
    # Check arguments
    annotate <- match.arg(annotate)
    
    # Check if abundance is in data
    if (is.null(data@tests)) { 
        stop("Test data missing from input object.")
    }

    # Check if q is in data
    if (!(q %in% data@q)) { 
        stop("Test for order q=", q, " not found in input object.") 
    }

    # Define group label annotations
    if (annotate == "none") {
        group_labels <- setNames(data@groups, data@groups)
    } else if (annotate == "depth") {
        group_labels <- setNames(paste0(data@groups, " (N=", data@n, ")"),
                                 data@groups)
    }
    # Define plot values
    df <- data@diversity %>%
        dplyr::filter(!!rlang::sym("q") == !!rlang::enquo(q)) %>%
        dplyr::mutate(lower=!!rlang::sym("d") - !!rlang::sym("d_sd"), 
                      upper=!!rlang::sym("d") + !!rlang::sym("d_sd"))
    
    # Define base plot elements
    p1 <- ggplot(df, aes(x=!!rlang::sym(data@group_by))) + 
        ggtitle(main_title) + 
        baseTheme() + 
        xlab("") +
        ylab(bquote("Mean " ^ .(q) * D %+-% "SD")) +
        geom_linerange(aes(ymin=!!rlang::sym("lower"), 
                           ymax=!!rlang::sym("upper"), 
                           color=!!rlang::sym(data@group_by)), alpha=0.8) +
        geom_point(aes(y=!!rlang::sym("d"), color=!!rlang::sym(data@group_by)))
    
    # Set colors and legend
    if (!is.null(colors)) {
        p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors)
    } else {
        p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels)
    }

    # Set x-axis style
    if (log_d) {
        p1 <- p1 + scale_y_continuous(trans=scales::log2_trans(),
                                      breaks=scales::trans_breaks('log2', function(x) 2^x),
                                      labels=scales::trans_format('log2', scales::math_format(2^.x)))
    } else {
        p1 <- p1 + scale_y_continuous()
    }

    # Add additional theme elements
    p1 <- p1 + do.call(theme, list(...))
    
    # Plot
    if (!silent) { plot(p1) }
    
    invisible(p1)
}

Try the alakazam package in your browser

Any scripts or data that you put into this service are public.

alakazam documentation built on Sept. 30, 2023, 9:07 a.m.