Nothing
# Clonal diversity analysis
#' @include Classes.R
NULL
#### Coverage functions ####
#' Calculate sample coverage
#'
#' \code{calcCoverage} calculates the sample coverage estimate, a measure of sample
#' completeness, for varying orders using the method of Chao et al, 2015, falling back
#' to the Chao1 method in the first order case.
#'
#' @param x numeric vector of abundance counts.
#' @param r coverage order to calculate.
#'
#' @return The sample coverage of the given order \code{r}.
#'
#' @references
#' \enumerate{
#' \item Chao A. Nonparametric Estimation of the Number of Classes in a Population.
#' Scand J Stat. 1984 11, 265270.
#' \item Chao A, et al. Unveiling the species-rank abundance distribution by
#' generalizing the Good-Turing sample coverage theory.
#' Ecology. 2015 96, 11891201.
#' }
#'
#' @seealso
#' Used by \link{alphaDiversity}.
#'
#' @examples
#' # Calculate clone sizes
#' clones <- countClones(ExampleDb, groups="sample_id")
#'
#' # Calculate 1first order coverage for a single sample
#' calcCoverage(clones$seq_count[clones$sample_id == "+7d"])
#'
#' @export
calcCoverage <- function(x, r=1) {
# Use traditional calculation for 1st order coverage
if (r == 1) { return(calcChao1Coverage(x)) }
# Use general form for 2nd order and higher coverage
x <- x[x >= 1]
n <- sum(x)
fr <- sum(x == r)
fs <- sum(x == r + 1)
if (fr == 0) {
stop("Cannot calculate coverage of order ", r, ". No abundance data with count=", r, ".")
}
if (fs == 0) {
stop("Cannot calculate coverage of order ", r, ". No abundance data with count=", r + 1, ".")
}
a <- factorial(r)*fr / sum(x[x >= r]^r)
b <- ((n - r)*fr / ((n - r)*fr + (r + 1)*fs))^r
rC <- 1 - a*b
return(rC)
}
# Calculate first order coverage
#
# @param x a numeric vector of species abundance as counts
#
# @returns Coverage estimate.
calcChao1Coverage <- function(x) {
x <- x[x >= 1]
n <- sum(x)
f1 <- sum(x == 1)
f2 <- sum(x == 2)
if (f2 > 0) {
rC1 <- 1 - (f1 / n) * (((n - 1) * f1) / ((n - 1) * f1 + 2 * f2))
} else {
rC1 <- 1 - (f1 / n) * (((n - 1) * (f1 - 1)) / ((n - 1) * (f1 - 1) + 2))
}
return(rC1)
}
# Calculates diversity under rarefaction
#
# Calculates Hill numbers under rarefaction
#
# @param x vector of observed abundance counts.
# @param m the sequence count to rarefy to.
#
# @return The first order coverage estimate
inferRarefiedCoverage <- function(x, m) {
x <- x[x >= 1]
n <- sum(x)
if (m > n) {
stop("m must be <= the total count of observed sequences.")
}
# Unrarefied case
if (m == n) {
return(calcCoverage(x, r=1))
}
# Calculate rarefied coverage
# TODO: Read up on this and fix
#rC1 <- iNEXT:::Chat.Ind(x, m)
y <- x[(n - x) >= m]
rC1 <- 1 - sum(y/n * exp(lgamma(n - y + 1) - lgamma(n - m - y + 1) - lgamma(n) + lgamma(n - m)))
return(rC1)
}
#### Abundance functions ####
# Calculate undetected species
#
# Calculates the lower bound of undetected species counts using the Chao1 estimator.
#
# @param x vector of observed abundance counts.
#
# @return The count of undetected species.
inferUnseenCount <- function(x) {
x <- x[x >= 1]
n <- sum(x)
f1 <- sum(x == 1)
f2 <- sum(x == 2)
if (f2 > 0) {
f0 <- ceiling(((n - 1) * f1^2) / (n * 2 * f2))
} else {
f0 <- ceiling(((n - 1) * f1 * (f1 - 1)) / (n * 2))
}
return(f0)
}
# Define undetected species relative abundances
#
# @param x vector of detected species abundance counts.
#
# @return An adjusted detected species relative abundance distribution.
inferUnseenAbundance <- function(x) {
x <- x[x >= 1]
# Coverage
rC1 <- calcCoverage(x, r=1)
# Unseen count
f0 <- inferUnseenCount(x)
# Assign unseen relative abundance
p <- rep((1 - rC1) / f0, f0)
return(p)
}
# Adjustement to observed relative abundances
#
# @param x vector of observed abundance counts
#
# @return An adjusted observed species relative abundance distribution.
adjustObservedAbundance <- function(x) {
x <- x[x >= 1]
n <- sum(x)
# Coverage
rC1 <- calcCoverage(x, r=1)
# Calculate tuning parameter
lambda <- (1 - rC1) / sum(x/n * exp(-x))
# Define adjusted relative abundance
p <- x/n * (1 - lambda * exp(-x))
return(p)
}
# Combined unseen inferrence and observed abundance adjustment
#
# @param x named vector of observed abundance counts by clone.
#
# @return A vector containing the complete inferred abundance distribution.
# Unseen species will be denote by a clone name starting with "U".
inferCompleteAbundance <- function(x) {
# Infer complete abundance distribution
p1 <- adjustObservedAbundance(x)
p2 <- inferUnseenAbundance(x)
names(p2) <- if (length(p2) > 0) { paste0("U", 1:length(p2)) } else { NULL }
return(c(p1, p2))
}
#' Tabulates clones sizes
#'
#' \code{countClones} determines the number of sequences and total copy number of
#' clonal groups.
#'
#' @param data data.frame with columns containing clonal assignments.
#' @param groups character vector defining \code{data} columns containing grouping
#' variables. If \code{groups=NULL}, then do not group data.
#' @param copy name of the \code{data} column containing copy numbers for each
#' sequence. If this value is specified, then total copy abundance
#' is determined by the sum of copy numbers within each clonal group.
#' @param clone name of the \code{data} column containing clone identifiers.
#' @param remove_na removes rows with \code{NA} values in the clone column if \code{TRUE} and issues a warning.
#' Otherwise, keeps those rows and considers \code{NA} as a clone in the final counts
#' and relative abundances.
#'
#' @return A data.frame summarizing clone counts and frequencies with columns:
#' \itemize{
#' \item \code{clone_id}: clone identifier. This is the default column
#' name, specified with \code{clone='clone_id'}.
#' If the function call uses Change-O
#' formatted data and \code{clone='CLONE'}, this
#' column will have name \code{CLONE}.
#' \item \code{seq_count}: total number of sequences for the clone.
#' \item \code{seq_freq}: frequency of the clone as a fraction of the total
#' number of sequences within each group.
#' \item \code{copy_count}: sum of the copy counts in the \code{copy} column.
#' Only present if the \code{copy} argument is
#' specified.
#' \item \code{copy_freq}: frequency of the clone as a fraction of the total
#' copy number within each group. Only present if
#' the \code{copy} argument is specified.
#' }
#' Also includes additional columns specified in the \code{groups} argument.
#'
#' @examples
#' # Without copy numbers
#' clones <- countClones(ExampleDb, groups="sample_id")
#'
#' # With copy numbers and multiple groups
#' clones <- countClones(ExampleDb, groups=c("sample_id", "c_call"), copy="duplicate_count")
#'
#' @export
countClones <- function(data, groups=NULL, copy=NULL, clone="clone_id", remove_na=TRUE) {
# Check input
check <- checkColumns(data, c(clone, copy, groups))
if (check != TRUE) {
warning(check) # instead of throwing an error and potentially disrupting a workflow
}
# Handle NAs
if (remove_na) {
bool_na <- is.na(data[, clone])
if (any(bool_na)) {
if (!all(bool_na)){
msg <- paste0("NA(s) found in ", sum(bool_na), " row(s) of the ", clone,
" column and excluded from tabulation")
warning(msg)
}
data <- data[!bool_na, ]
}
}
# Tabulate clonal abundance
if (is.null(copy)) {
clone_tab <- data %>%
group_by(!!!rlang::syms(c(groups, clone))) %>%
dplyr::summarize(seq_count=n()) %>%
dplyr::mutate(seq_freq=!!rlang::sym("seq_count")/sum(!!rlang::sym("seq_count"), na.rm=TRUE)) %>%
dplyr::arrange(desc(!!rlang::sym("seq_count")))
} else {
clone_tab <- data %>%
group_by(!!!rlang::syms(c(groups, clone))) %>%
dplyr::summarize(seq_count=length(.data[[clone]]),
copy_count=sum(.data[[copy]], na.rm=TRUE)) %>%
dplyr::mutate(seq_freq=!!rlang::sym("seq_count")/sum(!!rlang::sym("seq_count"), na.rm=TRUE),
copy_freq=!!rlang::sym("copy_count")/sum(!!rlang::sym("copy_count"), na.rm=TRUE)) %>%
dplyr::arrange(desc(!!rlang::sym("copy_count")))
}
return(clone_tab)
}
# Perform boostrap abundance calculation
#
# @param x named vector of observed abundance values.
# @param n number of samples to draw from the estimate complete abundance distribution.
# @param nboot number of bootstrap realizations.
# @param method complete abundance inferrence method.
# One of "before", "after" or "none" for complete abundance distribution
# inferrence before sampling, after sampling, or uncorrected, respectively.
#
# @return A matrix of bootstrap results.
bootstrapAbundance <- function(x, n, nboot=200, method="before") {
## DEBUG
# x=abund_obs; method="before"
# Check argumets
method <- match.arg(method)
if (method == "before") {
# Calculate estimated complete abundance distribution
p <- inferCompleteAbundance(x)
# Bootstrap abundance
boot_mat <- rmultinom(nboot, n, p) / n
} else if (method == "after") {
# Calculate estimated complete abundance distribution
p <- x / sum(x, na.rm=TRUE)
boot_sam <- rmultinom(nboot, n, p)
boot_list <- apply(boot_sam, 2, inferCompleteAbundance)
# Convert to matrix
boot_names <- unique(unlist(sapply(boot_list, names)))
boot_mat <- matrix(0, nrow=length(boot_names), ncol=nboot)
rownames(boot_mat) <- boot_names
for (i in 1:nboot) {
boot_mat[names(boot_list[[i]]), i] <- boot_list[[i]]
}
} else if (method == "none") {
# Raw sampling of input
p <- x / sum(x, na.rm=TRUE)
boot_sam <- rmultinom(nboot, n, p)
} else {
stop("Invalid method: ", method)
}
return(boot_mat)
}
#' Estimates the complete clonal relative abundance distribution
#'
#' \code{estimateAbundance} estimates the complete clonal relative abundance distribution
#' and confidence intervals on clone sizes using bootstrapping.
#'
#' @param data data.frame with Change-O style columns containing clonal assignments.
#' @param clone name of the \code{data} column containing clone identifiers.
#' @param copy name of the \code{data} column containing copy numbers for each
#' sequence. If \code{copy=NULL} (the default), then clone abundance
#' is determined by the number of sequences. If a \code{copy} column
#' is specified, then clone abundances is determined by the sum of
#' copy numbers within each clonal group.
#' @param group name of the \code{data} column containing group identifiers.
#' If \code{NULL} then no grouping is performed and the \code{group}
#' column of the output will contain the value \code{NA} for each row.
#' @param min_n minimum number of observations to sample.
#' A group with less observations than the minimum is excluded.
#' @param max_n maximum number of observations to sample. If \code{NULL} then no
#' maximum is set.
#' @param uniform if \code{TRUE} then uniformly resample each group to the same
#' number of observations. If \code{FALSE} then allow each group to
#' be resampled to its original size or, if specified, \code{max_size}.
#' @param ci confidence interval to calculate; the value must be between 0 and 1.
#' @param nboot number of bootstrap realizations to generate.
#' @param progress if \code{TRUE} show a progress bar.
#'
#' @return A \link{AbundanceCurve} object summarizing the abundances.
#'
#' @references
#' \enumerate{
#' \item Chao A. Nonparametric Estimation of the Number of Classes in a Population.
#' Scand J Stat. 1984 11, 265270.
#' \item Chao A, et al. Rarefaction and extrapolation with Hill numbers:
#' A framework for sampling and estimation in species diversity studies.
#' Ecol Monogr. 2014 84:45-67.
#' \item Chao A, et al. Unveiling the species-rank abundance distribution by
#' generalizing the Good-Turing sample coverage theory.
#' Ecology. 2015 96, 11891201.
#' }
#'
#' @seealso
#' See \link{plotAbundanceCurve} for plotting of the abundance distribution.
#' See \link{alphaDiversity} for a similar application to clonal diversity.
#'
#' @examples
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#'
#' @export
estimateAbundance <- function(data, clone="clone_id", copy=NULL, group=NULL,
min_n=30, max_n=NULL, uniform=TRUE, ci=0.95, nboot=200,
progress=FALSE) {
# TODO:
# Add alakazam style cellIdColumn=NULL, locusColumn="locus", locusValues=c("IGH")
# similar to distToNearest
# filter based on locusValues
# if cellIdColumn
# for rows that have unique cell_id, ok
# if rows have cell_id not unique, count only once
# if not cellIdColumn, count heavy chains (locusValues will be IGH)
# If mixed bulk and sc do calculation but raise warning because different type of abundances
## DEBUG
# data=ExampleDb; group="sample_id"; clone="clone_id"; copy=NULL; min_n=1; max_n=NULL; ci=0.95; uniform=F; nboot=100
# copy="duplicate_count"
# group=NULL
# Hack for visibility of dplyr variables
#. <- NULL
# Check input
if (!is.data.frame(data)) {
stop("Input data is not a data.frame")
}
# Check columns that are reported are real columns (can be NULL)
check <- checkColumns(data, c(clone, copy, group))
if (check != TRUE) { stop(check) }
# Set confidence interval
ci_z <- ci + (1 - ci) / 2
ci_x <- qnorm(ci_z)
# Tabulate clonal abundance
count_col <- if (!is.null(copy)) { "copy_count" } else { "seq_count" }
clone_tab <- countClones(data, copy=copy, clone=clone, groups=group) %>%
dplyr::mutate(clone_count=!!rlang::sym(count_col))
# Tabulate group sizes
if (!is.null(group)) {
# Summarize groups
group_tab <- clone_tab %>%
group_by(!!rlang::sym(group)) %>%
dplyr::summarize(count=sum(!!rlang::sym("clone_count"), na.rm=TRUE)) %>%
rename(group=!!rlang::sym(group))
} else {
group_tab <- data.frame(v="All", count=sum(clone_tab$clone_count, na.rm=T))
names(group_tab)[1] <- "group"
}
group_all <- as.character(group_tab$group)
group_tab <- group_tab[group_tab$count >= min_n, ]
group_keep <- as.character(group_tab$group)
# Set number of sampled sequence
if (uniform) {
nsam <- min(group_tab$count, max_n)
nsam <- setNames(rep(nsam, length(group_keep)), group_keep)
} else {
nsam <- if (is.null(max_n)) { group_tab$count } else { pmin(group_tab$count, max_n) }
nsam <- setNames(nsam, group_keep)
}
# Warn if groups removed
if (length(group_keep) < length(group_all)) {
warning("Not all groups passed threshold min_n=", min_n, ".",
" Excluded: ", paste(setdiff(group_all, group_keep), collapse=", "))
}
# Generate abundance bootstrap
if (progress) {
pb <- progressBar(length(group_keep))
}
boot_list <- list()
abund_list <- list()
for (g in group_keep) {
n <- nsam[g]
# Extract abundance vector
if (!is.null(group)) {
abund_obs <- clone_tab$clone_count[clone_tab[[group]] == g]
names(abund_obs) <- clone_tab[[clone]][clone_tab[[group]] == g]
} else {
# Extract abundance vector
abund_obs <- clone_tab$clone_count
names(abund_obs) <- clone_tab[[clone]]
}
# Infer complete abundance distribution
boot_mat <- bootstrapAbundance(abund_obs, n, nboot=nboot, method="before")
# Assign confidence intervals based on variance of bootstrap realizations
p_mean <- apply(boot_mat, 1, mean)
p_sd <- apply(boot_mat, 1, sd)
p_err <- ci_x * p_sd
p_lower <- pmax(p_mean - p_err, 0)
p_upper <- p_mean + p_err
# Assemble and sort abundance data.frame
abund_df <- tibble::tibble(!!clone := rownames(boot_mat), p=p_mean, p_sd=p_sd,
lower=p_lower, upper=p_upper) %>%
dplyr::arrange(desc(!!rlang::sym("p"))) %>%
dplyr::mutate(rank=1:n())
# Save summary
abund_list[[g]] <- abund_df
# Save bootstrap
boot_list[[g]] <- as.data.frame(boot_mat) %>%
tibble::rownames_to_column(clone)
if (progress) { pb$tick() }
}
id_col <- "group"
if (!is.null(group)) { id_col <- group }
abundance_df <- as.data.frame(bind_rows(abund_list, .id=id_col))
bootstrap_df <- as.data.frame(bind_rows(boot_list, .id=id_col))
# Create a new diversity object with bootstrap
abund_obj <- new("AbundanceCurve",
bootstrap=bootstrap_df,
abundance=abundance_df,
clone_by=clone,
group_by=id_col,
#groups=if_else(is.null(group), as.character(NA), group_keep),
groups=group_keep,
n=nsam,
nboot=nboot,
ci=ci)
return(abund_obj)
}
#### Diversity functions ####
#' Calculate the diversity index
#'
#' \code{calcDiversity} calculates the clonal diversity index for a vector of diversity
#' orders.
#'
#' @param p numeric vector of clone (species) counts or proportions.
#' @param q numeric vector of diversity orders.
#'
#' @return A vector of diversity scores \eqn{D} for each \eqn{q}.
#'
#' @details
#' This method, proposed by Hill (Hill, 1973), quantifies diversity as a smooth function
#' (\eqn{D}) of a single parameter \eqn{q}. Special cases of the generalized diversity
#' index correspond to the most popular diversity measures in ecology: species richness
#' (\eqn{q = 0}), the exponential of the Shannon-Weiner index (\eqn{q} approaches \eqn{1}), the
#' inverse of the Simpson index (\eqn{q = 2}), and the reciprocal abundance of the largest
#' clone (\eqn{q} approaches \eqn{+\infty}). At \eqn{q = 0} different clones weight equally,
#' regardless of their size. As the parameter \eqn{q} increase from \eqn{0} to \eqn{+\infty}
#' the diversity index (\eqn{D}) depends less on rare clones and more on common (abundant)
#' ones, thus encompassing a range of definitions that can be visualized as a single curve.
#'
#' Values of \eqn{q < 0} are valid, but are generally not meaningful. The value of \eqn{D}
#' at \eqn{q=1} is estimated by \eqn{D} at \eqn{q=0.9999}.
#'
#' @references
#' \enumerate{
#' \item Hill M. Diversity and evenness: a unifying notation and its consequences.
#' Ecology. 1973 54(2):427-32.
#' }
#'
#' @seealso Used by \link{alphaDiversity}.
#'
#' @examples
#' # May define p as clonal member counts
#' p <- c(1, 1, 3, 10)
#' q <- c(0, 1, 2)
#' calcDiversity(p, q)
#'
#' # Or proportional abundance
#' p <- c(1/15, 1/15, 1/5, 2/3)
#' calcDiversity(p, q)
#'
#' @export
calcDiversity <- function(p, q) {
# Add jitter to q=1
q[q == 1] <- 0.9999
# Remove zeros
p <- p[p > 0]
# Convert p to proportional abundance
p <- p / sum(p)
# Calculate D for each q
D <- sapply(q, function(x) sum(p^x)^(1 / (1 - x)))
return(D)
}
# Calculate the inferred diversity index
#
# \code{calcInferredDiversity} calculates the clonal diversity index for a vector of diversity
# orders with a correction for the presence of unseen species. Does not take proportional abundance.
#
# @param p numeric vector of clone (species) counts.
# @param q numeric vector of diversity orders.
#
# @return A vector of diversity scores \eqn{D} for each \eqn{q}.
#
# @details
# This method, proposed by Hill (Hill, 1973), quantifies diversity as a smooth function
# (\eqn{D}) of a single parameter \eqn{q}. Special cases of the generalized diversity
# index correspond to the most popular diversity measures in ecology: species richness
# (\eqn{q = 0}), the exponential of the Shannon-Weiner index (\eqn{q} approaches \eqn{1}), the
# inverse of the Simpson index (\eqn{q = 2}), and the reciprocal abundance of the largest
# clone (\eqn{q} approaches \eqn{+\infty}). At \eqn{q = 0} different clones weight equally,
# regardless of their size. As the parameter \eqn{q} increase from \eqn{0} to \eqn{+\infty}
# the diversity index (\eqn{D}) depends less on rare clones and more on common (abundant)
# ones, thus encompassing a range of definitions that can be visualized as a single curve.
#
# Values of \eqn{q < 0} are valid, but are generally not meaningful. The value of \eqn{D}
# at \eqn{q=1} is estimated by \eqn{D} at \eqn{q=0.9999}.
#
# An adjusted detected species relative abundance distribution is applied before calculating diversity.
#
# @references
# \enumerate{
# \item Hill M. Diversity and evenness: a unifying notation and its consequences.
# Ecology. 1973 54(2):427-32.
# }
#
# @seealso Used by \link{alphaDiversity}
#
# @examples
# # May define p as clonal member counts
# p <- c(1, 1, 3, 10)
# q <- c(0, 1, 2)
# calcInferredDiversity(p, q)
#
#
# @export
# calcInferredDiversity <- function(p, q) {
# # Correct abundance
# .infer <- function(y) {
# # Infer complete abundance distribution
# p1 <- adjustObservedAbundance(y)
# p2 <- inferUnseenAbundance(y)
# names(p2) <- if (length(p2) > 0) { paste0("U", 1:length(p2)) } else { NULL }
# return(c(p1, p2))
# }
#
# # Correct abundance
# p <- .infer(p)
# # Add jitter to q=1
# q[q == 1] <- 0.9999
# # Remove zeros
# p <- p[p > 0]
# # Convert p to proportional abundance
# p <- p / sum(p)
# # Calculate D for each q
# D <- sapply(q, function(x) sum(p^x)^(1 / (1 - x)))
#
# return(D)
# }
# Calculates diversity under rarefaction
#
# Calculates Hill numbers under rarefaction
#
# @param x vector of observed abundance counts.
# @param q numeric vector of diversity orders.
# @param m the sequence count to rarefy to.
#
# @return A vector of diversity scores \eqn{D} for each \eqn{q}.
inferRarefiedDiversity <- function(x, q, m) {
x <- x[x >= 1]
n <- sum(x)
if (m > n) {
stop("m must be <= the total count of observed sequences.")
}
q[q == 1] <- 0.9999
# Tabulate frequency counts from 1:n
fk_n <- tabulate(x, nbins=n)
# Calculate estimated fk(m)
fk_m <- sapply(1:m, function(k) sum(exp(lchoose(k:m, k) +
lchoose(n - k:m, m - k) -
lchoose(n, m))*fk_n[k:m]))
# Calculate diversity
D <- sapply(q, function(r) sum((1:m / m)^r * fk_m)^(1 / (1 - r)))
return(D)
}
# Helper function for computing alpha diversity from bootrstrap outputs
#
# \code{helperAlpha} divides a set of bootstrapped clones by group annotation,
# and computes the diversity of each set.
#
# @param boot_output data.frame from\link{AbundanceCurve} object containing bootstrapped clonal
# abundance curves.
# @param q vector of Hill Diversity indices to test for diversity calculations.
# @param clone name of the \code{boot_output} column containing clone identifiers.
# @param group name of the \code{boot_output} column containing grouping information for
# diversity calculation.
#
# @return data.frame containing diversity calculations for each bootstrap iteration.
helperAlpha <- function(boot_output, q, clone="clone_id", group=NULL) {
## DEBUG
# abundance <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
# clone <- abundance@clone_by
# group <- abundance@group_by
# Compute diversity from a column of each bootstrap
output <- boot_output %>%
dplyr::ungroup() %>%
dplyr::select(-one_of(c(clone, group))) %>%
as.matrix() %>%
apply(2, calcDiversity, q=q) %>%
data.frame() %>%
mutate(q=q)
return(output)
}
# Helper function for computing beta diversity from bootrstrap outputs
#
# \code{helperBeta} divides a set of bootstrapped clones by group annotation,
# and computes the alpha diversity. Group annotations are then ignored and
# gamma diversity is computed. A multiplicative beta diversity is used corresponding
# to the gamma diversity divided by the average alpha diversity of each group.
#
# @param boot_output data.frame from\link{AbundanceCurve} object containing bootstrapped clonal abundance curves.
# @param q vector of Hill Diversity indices to test for diversity calculations.
# @param ci_z numeric value corresponding to confidence interval for calculating beta diversity.
# @param clone name of the \code{boot_output} column containing clone identifiers.
# @param group name of the \code{boot_output} column containing grouping information for diversity
# calculation.
#
# @return data.frame containing diversity calculations for each bootstrap iteration.
helperBeta <- function(boot_output, q, ci_x, clone="clone_id", group="group") {
# Hack for visibility of dplyr variables
. <- NULL
# Compute gamma diversity metrics
gamma <- boot_output %>%
dplyr::group_by(!!rlang::sym(clone)) %>%
dplyr::select(-one_of(c(group))) %>%
dplyr::summarize_all(sum) %>%
dplyr::do(helperAlpha(., q=q, clone=clone)) %>%
tidyr::gather(key="n", value="gamma", -!!rlang::sym("q")) %>%
dplyr::mutate(gamma=as.numeric(!!rlang::sym("gamma")))
# Compute alpha diversity metrics
alpha <- boot_output %>%
dplyr::group_by(!!rlang::sym(group)) %>%
dplyr::do(helperAlpha(., q=q, clone=clone, group=group)) %>%
dplyr::group_by(!!rlang::sym("q")) %>%
dplyr::select(-one_of(c(group))) %>%
dplyr::summarize_all(mean) %>%
tidyr::gather(key="n", value="alpha", -!!rlang::sym("q")) %>%
dplyr::mutate(alpha=as.numeric(!!rlang::sym("alpha")))
# Perform comparisons of alpha and gamma to extract beta
beta <- bind_cols(gamma, alpha) %>%
dplyr::group_by(!!rlang::sym("q")) %>%
dplyr::mutate(X=!!rlang::sym("gamma") / !!rlang::sym("alpha")) %>%
dplyr::summarize(d=mean(!!rlang::sym("X"), na.rm=TRUE),
d_sd=sd(!!rlang::sym("X"), na.rm=TRUE)) %>%
dplyr::mutate(d_lower=pmax(!!rlang::sym("d") - !!rlang::sym("d_sd") * ci_x, 0),
d_upper=!!rlang::sym("d") + !!rlang::sym("d_sd") * ci_x)
return(beta)
}
# Helper function for computing statistical significance
#
# \code{helperTest} computes the pairwise statistical significance of differences
# in bootstrapped diversity values between two sets defined by the group column.
# A p-value is computed using the ECDF distribution as the frequency of bootstrap iterations
# for which no difference is observed.
#
# @param div_df data.frame from\link{DiversityCurve} object containing bootstrapped
# diversity curves.
# @param group name of the \code{boot_output} column containing grouping information
# for diversity calculation.
# @param q vector of Hill Diversity indices to test for diversity calculations.
#
# @return data.frame containing test results for each value of q.
helperTest <- function(div_df, q, group="group") {
# Hack for visibility of dplyr variables
#. <- NULL
# Pairwise test
group_pairs <- combn(unique(div_df[[group]]), 2, simplify=F)
pvalue_list <- list()
for (group_pair in group_pairs) {
pair_list <- list()
for(q_i in q) {
# Currently just testing for one diversity order
mat1 <- div_df %>%
dplyr::filter(!!rlang::sym(group) == group_pair[1], !!rlang::sym("q") == q_i) %>%
dplyr::select(-one_of(c(group, "q"))) %>% unlist()
mat2 <- div_df %>%
dplyr::filter(!!rlang::sym(group) == group_pair[2], !!rlang::sym("q") == q_i) %>%
dplyr::select(-one_of(c(group, "q"))) %>% unlist()
if (mean(mat1) >= mean(mat2)) {
g_delta <- mat1 - mat2
} else {
g_delta <- mat2 - mat1
}
# Compute p-value from ecdf
p <- ecdf(g_delta)(0)
p <- ifelse(p <= 0.5, p * 2, (1 - p) * 2)
pair_list[[as.character(q_i)]] <- list(delta_mean=mean(g_delta),
delta_sd=sd(g_delta),
pvalue=p)
}
pvalue_list[[paste(group_pair, collapse=" != ")]] <- bind_rows(pair_list, .id="q")
}
test_df <- bind_rows(pvalue_list, .id="test")
return(test_df)
}
#' Calculate clonal alpha diversity
#'
#' \code{alphaDiversity} takes in a data.frame or \link{AbundanceCurve} and computes
#' diversity scores (\eqn{D}) over an interval of diversity orders (\eqn{q}).
#'
#' @param data data.frame with Change-O style columns containing clonal assignments or
#' a \link{AbundanceCurve} generate by \link{estimateAbundance} object
#' containing a previously calculated bootstrap distributions of clonal abundance.
#' @param min_q minimum value of \eqn{q}.
#' @param max_q maximum value of \eqn{q}.
#' @param step_q value by which to increment \eqn{q}.
#' @param ci confidence interval to calculate; the value must be between 0 and 1.
#' @param ... additional arguments to pass to \link{estimateAbundance}. Additional arguments
#' are ignored if a \link{AbundanceCurve} is provided as input.
#'
#' @return A \link{DiversityCurve} object summarizing the diversity scores.
#'
#' @references
#' \enumerate{
#' \item Hill M. Diversity and evenness: a unifying notation and its consequences.
#' Ecology. 1973 54(2):427-32.
#' \item Chao A. Nonparametric Estimation of the Number of Classes in a Population.
#' Scand J Stat. 1984 11, 265270.
#' \item Chao A, et al. Rarefaction and extrapolation with Hill numbers:
#' A framework for sampling and estimation in species diversity studies.
#' Ecol Monogr. 2014 84:45-67.
#' \item Chao A, et al. Unveiling the species-rank abundance distribution by
#' generalizing the Good-Turing sample coverage theory.
#' Ecology. 2015 96, 11891201.
#' }
#'
#' @seealso See \link{calcDiversity} for the basic calculation and
#' \link{DiversityCurve} for the return object.
#' See \link{plotDiversityCurve} for plotting the return object.
#'
#' @details
#' Clonal diversity is calculated using the generalized diversity index (Hill numbers)
#' proposed by Hill (Hill, 1973). See \link{calcDiversity} for further details.
#'
#' To generate a smooth curve, \eqn{D} is calculated for each value of \eqn{q} from
#' \code{min_q} to \code{max_q} incremented by \code{step_q}. When \code{uniform=TRUE}
#' variability in total sequence counts across unique values in the \code{group} column
#' is corrected by repeated resampling from the estimated complete clonal distribution to a
#' common number of sequences. The complete clonal abundance distribution that is resampled
#' from is inferred by using the Chao1 estimator to infer the number of unseen clones,
#' followed by applying the relative abundance correction and unseen clone frequencies
#' described in Chao et al, 2015.
#'
#' The diversity index (\eqn{D}) for each group is the mean value of over all resampling
#' realizations. Confidence intervals are derived using the standard deviation of the
#' resampling realizations, as described in Chao et al, 2015.
#'
#' Significance of the difference in diversity index (\code{D}) between groups is tested by
#' constructing a bootstrap delta distribution for each pair of unique values in the
#' \code{group} column. The bootstrap delta distribution is built by subtracting the diversity
#' index \code{Da} in group \code{a} from the corresponding value \eqn{Db} in group \code{b},
#' for all bootstrap realizations, yielding a distribution of \code{nboot} total deltas; where
#' group \code{a} is the group with the greater mean \code{D}. The p-value for hypothesis
#' \code{Da != Db} is the value of \code{P(0)} from the empirical cumulative distribution
#' function of the bootstrap delta distribution, multiplied by 2 for the two-tailed correction.
#'
#' Note, this method may inflate statistical significance when clone sizes are uniformly small,
#' such as when most clones sizes are 1, sample size is small, and \code{max_n} is near
#' the total count of the smallest data group. Use caution when interpreting the results
#' in such cases.
#'
#' @examples
#' # Group by sample identifier in two steps
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#' div <- alphaDiversity(abund, step_q=1, max_q=10)
#' plotDiversityCurve(div, legend_title="Sample")
#'
#' # Grouping by isotype rather than sample identifier in one step
#' div <- alphaDiversity(ExampleDb, group="c_call", min_n=40, step_q=1, max_q=10,
#' nboot=100)
#' plotDiversityCurve(div, legend_title="Isotype")
#'
#' @export
alphaDiversity <- function(data, min_q=0, max_q=4, step_q=0.1, ci=0.95, ...) {
# Hack for visibility of dplyr variables
. <- NULL
# Check input object and call estimateAbundance if required
if (is(data, "AbundanceCurve")) {
abundance <- data
} else if (is(data, "data.frame")) {
abundance <- estimateAbundance(data, ci=0.95, ...)
} else {
stop("Input must be either a data.frame or AbundanceCurve object.")
}
# Set diversity orders and confidence interval
ci_z <- ci + (1 - ci) / 2
ci_x <- qnorm(ci_z)
q <- seq(min_q, max_q, step_q)
if (!(0 %in% q)) { q <- c(0, q) }
# Set grouping variables
clone <- abundance@clone_by
group <- abundance@group_by
# Compute diversity metric for bootstrap instances
boot_df <- abundance@bootstrap %>%
dplyr::group_by(!!rlang::sym(group)) %>%
dplyr::do(helperAlpha(., q=q, clone=clone, group=group)) %>%
dplyr::ungroup()
# Summarize diversity
div_df <- boot_df %>%
tidyr::gather(key="n", value="X", -one_of(c(group, "q"))) %>%
dplyr::mutate(X=as.numeric(!!rlang::sym("X"))) %>%
dplyr::group_by(!!!rlang::syms(c(group, "q"))) %>%
dplyr::summarize(d=mean(!!rlang::sym("X"), na.rm=TRUE),
d_sd=sd(!!rlang::sym("X"), na.rm=TRUE)) %>%
dplyr::mutate(d_lower=pmax(!!rlang::sym("d") - !!rlang::sym("d_sd") * ci_x, 0),
d_upper=!!rlang::sym("d") + !!rlang::sym("d_sd") * ci_x)
# Compute evenness
div_qi <- div_df %>%
filter(!!rlang::sym("q") == 0) %>%
select(one_of(c(group, "d")))
div_df <- div_df %>%
dplyr::right_join(div_qi, by=group, suffix=c("", "_0")) %>%
mutate(e=!!rlang::sym("d")/!!rlang::sym("d_0"),
e_lower=!!rlang::sym("d_lower")/!!rlang::sym("d_0"),
e_upper=!!rlang::sym("d_upper")/!!rlang::sym("d_0")) %>%
select(-!!rlang::sym("d_0"))
# Test
if (length(abundance@groups) > 1) {
test_df <- helperTest(boot_df, q=q, group=group)
} else {
test_df <- NULL
}
# Build return object
group_set <- unique(div_df[[group]])
div_obj <- new("DiversityCurve",
diversity=div_df,
tests=test_df,
method="alpha",
group_by=group,
groups=group_set,
q=q,
n=abundance@n,
ci=ci)
return(div_obj)
}
# Calculates the pairwise beta diversity
#
# \code{betaDiversity} takes in a data.frame or \link{AbundanceCurve} and computes
# the multiplicative beta diversity across a range of Hill diversity indices.
#
# @param data data.frame with Change-O style columns containing clonal assignments or
# an \link{AbundanceCurve} object generate by \link{estimateAbundance}.
# containing a previously calculated bootstrap distributions of clonal abundance.
# @param comparisons named list of comparisons between group members for computing beta diversity.
# @param min_q minimum value of \eqn{q}.
# @param max_q maximum value of \eqn{q}.
# @param step_q value by which to increment \eqn{q}.
# @param ci confidence interval to calculate; the value must be between 0 and 1.
# @param ... additional arguments to pass to \link{estimateAbundance}. Additional arguments
# are ignored if a \link{AbundanceCurve} is provided as input.
#
# @return A \link{DiversityCurve} object summarizing the diversity scores.
#
# @details
# Beta diversity or the comparative difference between two samples as quantified using Hill
# diversity indices proposed by Jost (Jost, 2007).
#
# Briefly, the alpha and gamma diversity components are calculated for each comparison.
# Alpha diversity is calculated as the average hill diversity across each independent sample
# while Gamma diversity is calculated as the total diversity without distinguishing between
# samples. Beta diversity is computed as Gamma/Alpha.
#
# Diversity is calculated on the estimated clonal abundance distribution with a correction
# for unseen species much like the calculation for alpha diversity \link{alphaDiversity}.
# A smooth curve is generated in the same manner as in \link{alphaDiversity}.
# Confidence intervals are derived using the standard deviation of the resampling realizations.
#
# \enumerate{
# \item Hill M. Diversity and evenness: a unifying notation and its consequences.
# Ecology. 1973 54(2):427-32.
# \item Jost L. Partitioning Diversity Into Independent Alpha and Beta Components.
# Ecology. 2007 88(10):2427–2439.
# \item Jost L, et al. Partitioning diversity for conservation analyses.
# Diversity Distrib. 2010 16(1):65–76
# }
#
# @examples
# div <- betaDiversity(ExampleDb, comparisons=list("TIME"=c("-1h", "+7d")), group="sample_id",
# min_n=40, step_q=1, max_q=10, nboot=100)
#
# plotDiversityCurve(div, legend_title="Isotype")
#
# @export
betaDiversity <- function(data, comparisons, min_q=0, max_q=4, step_q=0.1, ci=0.95, ...) {
# Hack for visibility of dplyr variables
. <- NULL
if (!is.list(comparisons) || is.null(names(comparisons))) {
stop("'comparisons' must be a named list")
}
# Check input object and call estimateAbundance if required
if (is(data, "AbundanceCurve")) {
abundance <- data
} else if (is(data, "data.frame")) {
abundance <- estimateAbundance(data, ci=0.95, ...)
} else {
stop("Input must be either a data.frame or AbundanceCurve object.")
}
# Set diversity orders and confidence interval
ci_z <- ci + (1 - ci) / 2
ci_x <- qnorm(ci_z)
q <- seq(min_q, max_q, step_q)
if (!(0 %in% q)) { q <- c(0, q) }
# Compute pairwise beta diversity for bootstrap instances
beta_diversity_list <- list()
for (comparison in names(comparisons)) {
beta_diversity_list[[comparison]] <- abundance@bootstrap %>%
dplyr::ungroup() %>%
dplyr::filter(.[[abundance@group_by]] %in% comparisons[[comparison]]) %>%
dplyr::do(helperBeta(., q=q, clone=abundance@clone_by, group=abundance@group_by, ci_x=ci_x))
}
# Generate summary diversity output
div_df <- bind_rows(beta_diversity_list, .id = "comparison")
# Beta groups
group_set <- unique(div_df[["comparison"]])
# Compute evenness
div_qi <- div_df %>%
filter(!!rlang::sym("q") == 0) %>%
select(one_of(c("comparison", "D")))
div <- div_df %>%
right_join(div_qi, by = "comparison", suffix = c("", "_0")) %>%
mutate(d = !!rlang::sym("d")/!!rlang::sym("d_0"),
e_lower = !!rlang::sym("d_lower")/!!rlang::sym("d_0"),
e_upper = !!rlang::sym("d_upper")/!!rlang::sym("d_0")) %>%
select(-!!rlang::sym("d_0"))
# Test
if (length(group_set) > 1) {
test_df <- helperTest(div_df, q=q, group="comparison")
} else {
test_df <- NULL
}
# Build return object
div_obj <- new("DiversityCurve",
diversity=div,
tests=test_df,
method="beta",
group_by="comparison",
groups=group_set,
n=abundance@n,
q=q,
ci=ci)
return(div_obj)
}
#### Plotting functions ####
#' Plot a clonal abundance distribution
#'
#' \code{plotAbundanceCurve} plots the results from estimating the complete clonal
#' relative abundance distribution. The distribution is plotted as a log rank abundance
#' distribution.
#'
#' @param data \link{AbundanceCurve} object returned by \link{estimateAbundance}.
#' @param colors named character vector whose names are values in the
#' \code{group} column of \code{data} and whose values are
#' colors to assign to those group names.
#' @param main_title string specifying the plot title.
#' @param legend_title string specifying the legend title.
#' @param xlim numeric vector of two values specifying the
#' \code{c(lower, upper)} x-axis limits. The lower x-axis
#' value must be >=1.
#' @param ylim numeric vector of two values specifying the
#' \code{c(lower, upper)} y-axis limits. The limits on the
#' abundance values are expressed as fractions of 1: use
#' c(0,1) to set the lower and upper limits to 0\% and 100\%.
#' @param annotate string defining whether to added values to the group labels
#' of the legend. When \code{"none"} (default) is specified no
#' annotations are added. Specifying (\code{"depth"}) adds
#' sequence counts to the labels.
#' @param silent if \code{TRUE} do not draw the plot and just return the ggplot2
#' object; if \code{FALSE} draw the plot.
#' @param ... additional arguments to pass to ggplot2::theme.
#'
#' @return A \code{ggplot} object defining the plot.
#'
#' @seealso
#' See \link{AbundanceCurve} for the input object and \link{estimateAbundance} for
#' generating the input abundance distribution. Plotting is performed with \link{ggplot}.
#'
#' @examples
#' # Estimate abundance by sample and plot
#' abund <- estimateAbundance(ExampleDb, group="sample_id", nboot=100)
#' plotAbundanceCurve(abund, legend_title="Sample")
#'
#' @export
plotAbundanceCurve <- function(data, colors=NULL, main_title="Rank Abundance",
legend_title=NULL, xlim=NULL, ylim=NULL,
annotate=c("none", "depth"),
silent=FALSE, ...) {
# Check if abundance is in data
if (is.null(data@abundance)) { stop("Missing abundance data.") }
# Validate abundance limits
if (!is.null(xlim)) {
if (xlim[1]<1) {
stop("The lower x-axis xlim value must be >=1.")
}
max_xlim <- max(data@abundance$rank, na.rm = T)
if (xlim[2]>max_xlim) {
message("The largest x-axis value is ",max_xlim,".")
}
}
# Check arguments
annotate <- match.arg(annotate)
# Define group label annotations
if (all(is.na(data@groups)) || length(data@groups) == 1) {
group_labels <- NA
} else if (annotate == "none") {
group_labels <- setNames(data@groups, data@groups)
} else if (annotate == "depth") {
group_labels <- setNames(paste0(data@groups, " (N=", data@n, ")"),
data@groups)
}
# Stupid hack for check NOTE about `.x` in math_format
.x <- NULL
if (!all(is.na(group_labels))) {
# Define grouped plot
p1 <- ggplot(data@abundance, aes(x=!!rlang::sym("rank"),
y=!!rlang::sym("p"),
group=!!rlang::sym(data@group_by))) +
ggtitle(main_title) +
baseTheme() +
xlab("Rank") +
ylab("Abundance") +
scale_x_log10(
breaks=scales::trans_breaks("log10", function(x) 10^x),
labels=scales::trans_format("log10", scales::math_format(10^.x))) +
scale_y_continuous(labels=scales::percent) +
geom_ribbon(aes(ymin=!!rlang::sym("lower"),
ymax=!!rlang::sym("upper"),
fill=!!rlang::sym(data@group_by)), alpha=0.4) +
geom_line(aes(color=!!rlang::sym(data@group_by)))
# Set colors and legend
if (!is.null(colors)) {
p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors) +
scale_fill_manual(name=legend_title, labels=group_labels, values=colors)
} else {
p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels) +
scale_fill_discrete(name=legend_title, labels=group_labels)
}
} else {
# Set color
if (!is.null(colors) & length(colors) == 1) {
line_color <- colors
} else {
line_color <- "black"
}
# Define plot
p1 <- ggplot(data@abundance, aes(x=!!rlang::sym("rank"),
y=!!rlang::sym("p"))) +
ggtitle(main_title) +
baseTheme() +
xlab("Rank") +
ylab("Abundance") +
scale_x_log10(
breaks=scales::trans_breaks("log10", function(x) 10^x),
labels=scales::trans_format("log10", scales::math_format(10^.x))) +
scale_y_continuous(labels=scales::percent) +
geom_ribbon(aes(ymin=!!rlang::sym("lower"),
ymax=!!rlang::sym("upper")),
fill=line_color, alpha=0.4) +
geom_line(color=line_color)
}
# Add additional theme elements
p1 <- p1 +
coord_cartesian(xlim=xlim,ylim=ylim) +
do.call(theme, list(...))
# Plot
if (!silent) { plot(p1) }
invisible(p1)
}
#' Plot the results of alphaDiversity
#'
#' \code{plotDiversityCurve} plots a \code{DiversityCurve} object.
#'
#' @param data \link{DiversityCurve} object returned by
#' \link{alphaDiversity}.
#' @param colors named character vector whose names are values in the
#' \code{group} column of the \code{data} slot of \code{data},
#' and whose values are colors to assign to those group names.
#' @param main_title string specifying the plot title.
#' @param legend_title string specifying the legend title.
#' @param log_x if \code{TRUE} then plot \eqn{q} on a log scale;
#' if \code{FALSE} plot on a linear scale.
#' @param log_y if \code{TRUE} then plot the diversity/evenness scores
#' on a log scale; if \code{FALSE} plot on a linear scale.
#' @param xlim numeric vector of two values specifying the
#' \code{c(lower, upper)} x-axis limits.
#' @param ylim numeric vector of two values specifying the
#' \code{c(lower, upper)} y-axis limits.
#' @param annotate string defining whether to added values to the group labels
#' of the legend. When \code{"none"} (default) is specified no
#' annotations are added. Specifying (\code{"depth"}) adds
#' sequence counts to the labels.
#' @param score one of \code{"diversity"} or \code{"evenness"} specifying which
#' score to plot on the y-asis.
#' @param silent if \code{TRUE} do not draw the plot and just return the ggplot2
#' object; if \code{FALSE} draw the plot.
#' @param ... additional arguments to pass to ggplot2::theme.
#'
#' @return A \code{ggplot} object defining the plot.
#'
#' @seealso See \link{alphaDiversity} and \link{alphaDiversity} for generating
#' \link{DiversityCurve} objects for input. Plotting is performed with \link{ggplot}.
#'
#' @examples
#' # Calculate diversity
#' div <- alphaDiversity(ExampleDb, group="sample_id", nboot=100)
#'
#' # Plot diversity
#' plotDiversityCurve(div, legend_title="Sample")
#'
#' # Plot diversity
#' plotDiversityCurve(div, legend_title="Sample", score="evenness")
#'
#' @export
plotDiversityCurve <- function(data, colors=NULL, main_title="Diversity",
legend_title="Group", log_x=FALSE, log_y=FALSE,
xlim=NULL, ylim=NULL, annotate=c("none", "depth"),
score=c("diversity", "evenness"),
silent=FALSE, ...) {
# Check arguments
annotate <- match.arg(annotate)
score <- match.arg(score)
# Define group label annotations
if (all(is.na(data@groups)) || length(data@groups) == 1) {
group_labels <- NA
} else if (annotate == "none") {
group_labels <- setNames(data@groups, data@groups)
} else if (annotate == "depth") {
group_labels <- setNames(paste0(data@groups, " (n=", data@n, ")"),
data@groups)
}
# Define y-axis scores
if (score == "diversity") {
y_value <- "d"
y_min <- "d_lower"
y_max <- "d_upper"
y_label <- expression(''^q * D)
} else if (score == "evenness") {
y_value <- "e"
y_min <- "e_lower"
y_max <- "e_upper"
y_label <- expression(''^q * e)
}
# Stupid hack for check NOTE about `.x` in math_format
.x <- NULL
if (!all(is.na(group_labels))) {
# Define grouped plot
p1 <- ggplot(data@diversity, aes(x=q, y=!!rlang::sym(y_value),
group=!!rlang::sym(data@group_by))) +
ggtitle(main_title) +
baseTheme() +
xlab('q') +
ylab(y_label) +
geom_ribbon(aes(ymin=!!rlang::sym(y_min), ymax=!!rlang::sym(y_max),
fill=!!rlang::sym(data@group_by)), alpha=0.4) +
geom_line(aes(color=!!rlang::sym(data@group_by)))
# Set colors and legend
if (!is.null(colors)) {
p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors) +
scale_fill_manual(name=legend_title, labels=group_labels, values=colors)
} else {
p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels) +
scale_fill_discrete(name=legend_title, labels=group_labels)
}
} else {
# Set color
if (!is.null(colors) & length(colors) == 1) {
line_color <- colors
} else {
line_color <- "black"
}
# Define ungrouped plot
p1 <- ggplot(data@diversity, aes(x=q, y=!!rlang::sym(y_value))) +
ggtitle(main_title) +
baseTheme() +
xlab('q') +
ylab(y_label) +
geom_ribbon(aes(ymin=!!rlang::sym(y_min), ymax=!!rlang::sym(y_max)), fill=line_color, alpha=0.4) +
geom_line(color=line_color)
}
# Set x-axis style
if (log_x) {
p1 <- p1 + scale_x_continuous(trans=scales::log2_trans(),
breaks=scales::trans_breaks('log2', function(x) 2^x),
labels=scales::trans_format('log2', scales::math_format(2^.x)))
} else {
p1 <- p1 + scale_x_continuous()
}
# Set y-axis style
if (log_y) {
p1 <- p1 + scale_y_continuous(trans=scales::log2_trans(),
breaks=scales::trans_breaks('log2', function(x) 2^x),
labels=scales::trans_format('log2', scales::math_format(2^.x)))
} else {
p1 <- p1 + scale_y_continuous()
}
# Add additional theme elements
p1 <- p1 +
coord_cartesian(xlim=xlim,ylim=ylim) +
do.call(theme, list(...))
# Plot
if (!silent) { plot(p1) }
invisible(p1)
}
#' Plot the results of diversity testing
#'
#' \code{plotDiversityTest} plots summary data for a \code{DiversityCurve} object
#' with mean and a line range indicating plus/minus one standard deviation.
#'
#' @param data \link{DiversityCurve} object returned by
#' \link{alphaDiversity}.
#' @param q diversity order to plot the test for.
#' @param colors named character vector whose names are values in the
#' \code{group} column of the \code{data} slot of \code{data},
#' and whose values are colors to assign to those group names.
#' @param main_title string specifying the plot title.
#' @param legend_title string specifying the legend title.
#' @param log_d if \code{TRUE} then plot the diversity scores \eqn{D}
#' on a log scale; if \code{FALSE} plot on a linear scale.
#' @param annotate string defining whether to added values to the group labels
#' of the legend. When \code{"none"} (default) is specified no
#' annotations are added. Specifying (\code{"depth"}) adds
#' sequence counts to the labels.
#' @param silent if \code{TRUE} do not draw the plot and just return the ggplot2
#' object; if \code{FALSE} draw the plot.
#' @param ... additional arguments to pass to ggplot2::theme.
#'
#' @return A \code{ggplot} object defining the plot.
#'
#' @seealso See \link{alphaDiversity} for generating input.
#' Plotting is performed with \link{ggplot}.
#'
#' @examples
#' # Calculate diversity
#' div <- alphaDiversity(ExampleDb, group="sample_id", min_q=0, max_q=2, step_q=1, nboot=100)
#'
#' # Plot results at q=0 (equivalent to species richness)
#' plotDiversityTest(div, 0, legend_title="Sample")
#'
#' # Plot results at q=2 (equivalent to Simpson's index)
#' plotDiversityTest(div, q=2, legend_title="Sample")
#'
#' @export
plotDiversityTest <- function(data, q, colors=NULL, main_title="Diversity", legend_title="Group",
log_d=FALSE, annotate=c("none", "depth"), silent=FALSE, ...) {
# Stupid hack for check NOTE about `.x` in math_format
.x <- NULL
# Check arguments
annotate <- match.arg(annotate)
# Check if abundance is in data
if (is.null(data@tests)) {
stop("Test data missing from input object.")
}
# Check if q is in data
if (!(q %in% data@q)) {
stop("Test for order q=", q, " not found in input object.")
}
# Define group label annotations
if (annotate == "none") {
group_labels <- setNames(data@groups, data@groups)
} else if (annotate == "depth") {
group_labels <- setNames(paste0(data@groups, " (N=", data@n, ")"),
data@groups)
}
# Define plot values
df <- data@diversity %>%
dplyr::filter(!!rlang::sym("q") == !!rlang::enquo(q)) %>%
dplyr::mutate(lower=!!rlang::sym("d") - !!rlang::sym("d_sd"),
upper=!!rlang::sym("d") + !!rlang::sym("d_sd"))
# Define base plot elements
p1 <- ggplot(df, aes(x=!!rlang::sym(data@group_by))) +
ggtitle(main_title) +
baseTheme() +
xlab("") +
ylab(bquote("Mean " ^ .(q) * D %+-% "SD")) +
geom_linerange(aes(ymin=!!rlang::sym("lower"),
ymax=!!rlang::sym("upper"),
color=!!rlang::sym(data@group_by)), alpha=0.8) +
geom_point(aes(y=!!rlang::sym("d"), color=!!rlang::sym(data@group_by)))
# Set colors and legend
if (!is.null(colors)) {
p1 <- p1 + scale_color_manual(name=legend_title, labels=group_labels, values=colors)
} else {
p1 <- p1 + scale_color_discrete(name=legend_title, labels=group_labels)
}
# Set x-axis style
if (log_d) {
p1 <- p1 + scale_y_continuous(trans=scales::log2_trans(),
breaks=scales::trans_breaks('log2', function(x) 2^x),
labels=scales::trans_format('log2', scales::math_format(2^.x)))
} else {
p1 <- p1 + scale_y_continuous()
}
# Add additional theme elements
p1 <- p1 + do.call(theme, list(...))
# Plot
if (!silent) { plot(p1) }
invisible(p1)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.