R/bootstraping.R

Defines functions diversity_ci diversity_boot diversity_stats aboot

Documented in aboot diversity_boot diversity_ci diversity_stats

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#
# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate 
# students at Oregon State University; Jonah C. Brooks, undergraduate student at
# Oregon State University; and Dr. Nik Grünwald, an employee of USDA-ARS.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for educational, research and non-profit purposes, without fee, 
# and without a written agreement is hereby granted, provided that the statement
# above is incorporated into the material, giving appropriate attribution to the
# authors.
#
# Permission to incorporate this software into commercial products may be
# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for 
# Commercialization and Corporate Development.
#
# The software program and documentation are supplied "as is", without any
# accompanying services from the USDA or the University. USDA ARS or the 
# University do not warrant that the operation of the program will be 
# uninterrupted or error-free. The end-user understands that the program was 
# developed for research purposes and is advised not to rely exclusively on the 
# program for any reason.
#
# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY 
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, 
# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF 
# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY 
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY 
# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE
# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#==============================================================================#
#' Calculate a dendrogram with bootstrap support using any distance applicable 
#' to genind or genclone objects.
#' 
#' @param x a [genind-class][genind], [genpop-class][genpop],
#'   [genclone-class][genclone], [genlight-class], [snpclone-class] or
#'   \link{matrix} object.
#'   
#' @param strata a formula specifying the strata to be used to convert x to a
#'   genclone object if x is a genind object. Defaults to `NULL`. See details.
#' 
#' @param tree a text string or function that can calculate a tree from a 
#'   distance matrix. Defaults to "upgma". Note that you must load the package 
#'   with the function for it to work.
#'   
#' @param distance a character or function defining the distance to be applied 
#'   to x. Defaults to [nei.dist()].
#'   
#' @param sample An integer representing the number of bootstrap replicates 
#'   Default is 100.
#'   
#' @param cutoff An integer from 0 to 100 setting the cutoff value to return the
#'   bootstrap values on the nodes. Default is 0.
#'   
#' @param showtree If `TRUE` (Default), a dendrogram will be plotted. If 
#'   `FALSE`, nothing will be plotted.
#'   
#' @param missing any method to be used by [missingno()]: "mean" 
#'   (default), "zero", "loci", "genotype", or "ignore".
#'   
#' @param mcutoff a value between 0 (default) and 1 defining the percentage of 
#'   tolerable missing data if the `missing` parameter is set to "loci" or 
#'   "genotype". This should only be set if the distance metric can handle 
#'   missing data.
#'   
#' @param quiet if `FALSE` (default), a progress bar will be printed to 
#'   screen.
#'   
#' @param root is the tree rooted? This is a parameter passed off to 
#'   [ape::boot.phylo()]. If the `tree` parameter returns a 
#'   rooted tree (like UPGMA), this should be `TRUE`, otherwise (like 
#'   neighbor-joining), it should be false. When set to `NULL` (default), 
#'   the tree is considered rooted if [ape::is.ultrametric()] is true.
#'   
#' @param ... any parameters to be passed off to the distance method.
#'   
#' @return an object of class [ape::phylo()].
#'   
#' @details This function automates the process of bootstrapping genetic data to
#'   create a dendrogram with bootstrap support on the nodes. It will randomly
#'   sample with replacement the loci of a `gen` (genind/genpop) object or the columns of a 
#'   numeric matrix, **assuming that all loci/columns are independent**. The 
#'   process of randomly sampling `gen` objects with replacement is carried out
#'   through the use of an internal class called 
#'   [bootgen-class]. This is necessary due to the fact that columns in
#'   the genind matrix are defined as alleles and are thus interrelated. This 
#'   function will specifically bootstrap loci so that results are biologically 
#'   relevant. With this function, the user can also define a custom distance to
#'   be performed on the genind or genclone object. If you have a data frame-like
#'   object where all of the columns are independent or pairs of columns are
#'   independent, then it may be simpler to use [ape::boot.phylo()] to calculate
#'   your bootstrap support values. 
#'   
#'   \subsection{the strata argument}{
#'   There is an argument called `strata`. This argument is useful for when
#'   you want to bootstrap by populations from a [adegenet::genind()]
#'   object. When you specify strata, the genind object will be converted to
#'   [adegenet::genpop()] with the specified strata.
#'   }
#'   
#' @note [prevosti.dist()] and [diss.dist()] are exactly the
#'   same, but [diss.dist()] scales better for large numbers of 
#'   individuals (n > 125) at the cost of required memory. 
#'   \subsection{missing data}{
#'   Missing data is not allowed by many of the distances. Thus, one of 
#'   the first steps of this function is to treat missing data by setting it to 
#'   the average allele frequency in the data set. If you are using a distance 
#'   that can handle missing data (Prevosti's distance), you can set 
#'   `missing = "ignore"` to allow the distance function to handle any 
#'   missing data. See [missingno()] for details on missing 
#'   data.}
#'   \subsection{Bruvo's Distance}{
#'   While calculation of Bruvo's distance 
#'   is possible with this function, it is optimized in the function 
#'   [bruvo.boot()].}
#'   
#' @seealso [nei.dist()] [edwards.dist()] 
#'   [rogers.dist()] [reynolds.dist()] 
#'   [prevosti.dist()] [diss.dist()] 
#'   [bruvo.boot()] [ape::boot.phylo()] 
#'   [adegenet::dist.genpop()] [dist()]
#'   [bootgen2genind()] [bootgen-class]
#'   
#' @references
#' Kamvar ZN, Brooks JC and Grünwald NJ (2015) Novel R tools for analysis of 
#' genome-wide population genetic data with emphasis on clonality. 
#' Frontiers in Genetics 6:208. \doi{10.3389/fgene.2015.00208}
#' @export
#' @md
#' @keywords bootstrap
#' @aliases bootstrap
#' @examples
#' 
#' data(nancycats)
#' nan9 <- popsub(nancycats, 9)
#' 
#' set.seed(9999)
#' # Generate a tree using nei's distance
#' neinan <- aboot(nan9, dist = nei.dist)
#' 
#' set.seed(9999)
#' # Generate a tree using custom distance
#' bindist <- function(x) dist(tab(x), method = "binary")
#' binnan <- aboot(nan9, dist = bindist)
#' 
#' \dontrun{
#' # Distances from other packages.
#' #
#' # Sometimes, distance functions from other packages will have the constraint
#' # that the incoming data MUST be genind. Internally, aboot uses the 
#' # bootgen class ( class?bootgen ) to shuffle loci, and will throw an error
#' # The function bootgen2genind helps fix that. Here's an example of a function
#' # that expects a genind class from above
#' bindist <- function(x){
#'   stopifnot(is.genind(x))
#'   dist(tab(x), method = "binary")
#' }
#' #
#' # Fails:
#' # aboot(nan9, dist = bindist)
#' ## Error: is.genind(x) is not TRUE
#' #
#' # Add bootgen2genind to get it working!
#' # Works:
#' aboot(nan9, dist = function(x) bootgen2genind(x) %>% bindist)
#' 
#' # AFLP data
#' data(Aeut)
#' 
#' # Nei's distance
#' anei <- aboot(Aeut, dist = nei.dist, sample = 1000, cutoff = 50)
#' 
#' # Rogers' distance
#' arog <- aboot(Aeut, dist = rogers.dist, sample = 1000, cutoff = 50)
#' 
#' # This can also be run on genpop objects
#' strata(Aeut) <- other(Aeut)$population_hierarchy[-1]
#' Aeut.gc <- as.genclone(Aeut)
#' setPop(Aeut.gc) <- ~Pop/Subpop
#' Aeut.pop <- genind2genpop(Aeut.gc)
#' set.seed(5000)
#' aboot(Aeut.pop, sample = 1000) # compare to Grunwald et al. 2006
#' 
#' # You can also use the strata argument to convert to genpop inside the function.
#' set.seed(5000)
#' aboot(Aeut.gc, strata = ~Pop/Subpop, sample = 1000)
#' 
#' # And genlight objects 
#' # From glSim:
#' ## 1,000 non structured SNPs, 100 structured SNPs
#' x <- glSim(100, 1e3, n.snp.struc=100, ploid=2)
#' aboot(x, distance = bitwise.dist)
#' 
#' # Utilizing other tree methods
#' 
#' library("ape")
#' 
#' aboot(Aeut.pop, tree = fastme.bal, sample = 1000)
#' 
#' # Utilizing options in other tree methods
#' 
#' myFastME <- function(x) fastme.bal(x, nni = TRUE, spr = FALSE, tbr = TRUE)
#' aboot(Aeut.pop, tree = myFastME, sample = 1000)
#' 
#' }
#==============================================================================#
aboot <- function(x, strata = NULL, tree = "upgma", distance = "nei.dist", 
                  sample = 100, cutoff = 0, showtree = TRUE, missing = "mean", 
                  mcutoff = 0, quiet = FALSE, root = NULL, ...){
  if (!is.null(strata)){
    if (!is.genind(x)){
      warning("Sorry, the strata argument can only be used with genind objects.")
    } else {
      x <- genind2genpop(x, pop = strata, quiet = TRUE, process.other = FALSE)
    }
  }
  if (is.matrix(x)){
    if (is.null(rownames(x))) rownames(x) <- .genlab("", nrow(x))
    if (is.null(colnames(x))) colnames(x) <- .genlab("L", ncol(x))
    xboot <- x
  } else if (!is(x, "genlight") && x@type == "PA") {
    xboot           <- x@tab
    colnames(xboot) <- locNames(x)
    rownames(xboot) <- if (is.genpop(x)) popNames(x) else indNames(x)
  } else if (is(x, "gen")) {
    if (is.genind(x)) {
      if (missing %in% c("loci", "geno", "ignore")){
        x <- missingno(x, missing, quiet = quiet, cutoff = mcutoff)
        missing <- "asis"  
      } 
    }
    distname <- as.character(substitute(distance))
    if (length(distname) == 1 && distname %in% "diss.dist"){
      if (missing == "mean"){
        missing <- "asis"
        warning("Sorry, missing = 'mean' is incompatible with diss.dist(), setting missing to 'asis'.", 
                call. = FALSE)
      }
      xboot <- new("bootgen", x, na = missing, freq = FALSE)
    } else {
      xboot <- new("bootgen", x, na = missing, freq = TRUE)
    }
  } else if (is(x, "genlight")){
    xboot <- x
    my_dists <- c("diss.dist", "nei.dist", "prevosti.dist", "edwards.dist", 
                  "reynolds.dist", "rogers.dist", "provesti.dist")
    wrong_dist <- any(as.character(substitute(distance)) %in% my_dists)
    if (wrong_dist){
      warning("Sorry, distance from genlight objects can only be calculated by bitwise.dist.")
      distance <- bitwise.dist
    }
  } else {
    stop("Sorry, x must be a genind, genpop, or genlight object.")
  }
  treefunk <- tree_generator(tree, distance, ...)
  xtree    <- treefunk(xboot)
  if (any(xtree$edge.len < 0)){
    xtree <- fix_negative_branch(xtree)
    warning(negative_branch_warning())
  }
  treechar <- paste(substitute(tree), collapse = "")
  if (is.null(root)) {
    root <- ape::is.ultrametric(xtree)
  }
  nodelabs <- boot.phylo(xtree, xboot, treefunk, B = sample, rooted = root, 
                         quiet = quiet)
  nodelabs <- (nodelabs/sample)*100
  nodelabs <- ifelse(nodelabs >= cutoff, nodelabs, NA)
  if (!is.genpop(x)){
    if (is.matrix(x)){
      xtree$tip.label <- rownames(x)
    } else if (!is.null(indNames(x))) {
      xtree$tip.label <- indNames(x)
    }
  } else {
    xtree$tip.label <- popNames(x)
  }
  xtree$node.label <- nodelabs
  if (showtree) {
    poppr.plot.phylo(xtree, treechar, root)
  }
  return(xtree)
}

#==============================================================================#
#' Produce a table of diversity statistics
#' 
#' Calculate diversity statistics on a matrix containing counts of multilocus
#' genotypes per population.
#' 
#' @param z a table of integers representing counts of MLGs (columns) per 
#'   population (rows)
#'   
#' @param H logical whether or not to calculate Shannon's index
#' @param G logical whether or not to calculate Stoddart and Taylor's index (aka
#'   inverse Simpson's index).
#' @param lambda logical whether or not to calculate Simpson's index
#' @param E5 logical whether or not to calculate Evenness
#' @param ... any functions that can be calculated on a vector or matrix of 
#'   genotype counts.
#'   
#' @return a numeric matrix giving statistics (columns) for each population 
#'   (rows).
#'   
#' @details This function will calculate any diversity statistic for counts of 
#'   multilocus genotypes per population. This does not count allelic diversity.
#'   The calculations of H, G, and lambda are all performed by 
#'   [vegan::diversity()]. E5 is calculated as \deqn{E_{5} = 
#'   \frac{(1/\lambda) - 1}{e^{H} - 1}}{(G - 1)/(exp(H) - 1)}.
#'   
#' @export
#' @md
#' @seealso [diversity_boot()] [diversity_ci()]
#'   [poppr()]
#' @author Zhian N. Kamvar
#' @examples
#' library(poppr)
#' data(Pinf)
#' tab <- mlg.table(Pinf, plot = FALSE)
#' diversity_stats(tab)
#' \dontrun{
#' # Example using the poweRlaw package to calculate the negative slope of the
#' # Pareto distribution.
#'
#' library("poweRlaw")
#' power_law_beta <- function(x){
#'   xpow <- displ(x[x > 0])                 # Generate the distribution
#'   xpow$setPars(estimate_pars(xpow))       # Estimate the parameters
#'   xdat <- plot(xpow, draw = FALSE)        # Extract the data
#'   xlm <- lm(log(y) ~ log(x), data = xdat) # Run log-log linear model for slope
#'   return(-coef(xlm)[2])
#' }
#' 
#' Beta <- function(x){
#'   x <- drop(as.matrix(x))
#'   if (length(dim(x)) > 1){
#'     res <- apply(x, 1, power_law_beta)
#'   } else {
#'     res <- power_law_beta(x)
#'   }
#'   return(res)
#' }
#'
#' diversity_stats(tab, B = Beta)
#' }
#==============================================================================#
diversity_stats <- function(z, H = TRUE, G = TRUE, lambda = TRUE, E5 = TRUE, ...){
  E.5 <- function(x){
    H <- vegan::diversity(x)
    G <- vegan::diversity(x, "inv")
    (G - 1)/(exp(H) - 1)
  }
  BASIC_FUNS <- list(H = vegan::diversity,
                     G = function(x) vegan::diversity(x, "inv"),
                     lambda = function(x) vegan::diversity(x, "simp"),
                     E.5 = E.5)
  FUNS   <- c(BASIC_FUNS, list(...))
  STATS <- names(FUNS)
  STATS <- STATS[c(H, G, lambda, E5, rep(TRUE, length(list(...))))]
  
  # Indicator to determine if the input is from diversity_boot
  boot <- is.null(dim(z))
  
  nrows <- ifelse(boot, 1, nrow(z))
  if (boot){
    dims <- list(NULL, Index = STATS)
  } else {
    dims <- list(Pop = rownames(z), Index = STATS)
  }
  
  mat <- matrix(nrow = nrows, ncol = length(STATS), dimnames = dims)
  for (i in STATS){
    if (i == "E.5" && H && G){
      mat[, i] <- (mat[, "G"] - 1)/(exp(mat[, "H"]) - 1)
    } else {
      mat[, i] <- FUNS[[i]](z)
    }
  }
  return(drop(mat))
}

#==============================================================================#
#' Perform a bootstrap analysis on diversity statistics
#' 
#' This function is intended to perform bootstrap statistics on a matrix of
#' multilocus genotype counts in different populations. Results from this 
#' function should be interpreted carefully as the default statistics are known
#' to have a downward bias. See the details for more information.
#' 
#' 
#' @param tab a table produced from the \pkg{poppr} function 
#'   [mlg.table()]. MLGs in columns and populations in rows
#' @param n an integer > 0 specifying the number of bootstrap replicates to 
#'   perform (corresponds to `R` in the function [boot::boot()].
#' @param n.boot an integer specifying the number of samples to be drawn in each
#'   bootstrap replicate. If `n.boot` < 2 (default), the number of samples 
#'   drawn for each bootstrap replicate will be equal to the number of samples in
#'   the data set. 
#' @param n.rare a sample size at which all resamplings should be performed. 
#'   This should be no larger than the smallest sample size. Defaults to 
#'   `NULL`, indicating that each population will be sampled at its own 
#'   size.
#' @inheritParams diversity_stats
#' @param ... other parameters passed on to [boot::boot()] and 
#'   [diversity_stats()].
#'   
#' @return a list of objects of class "boot".
#' @seealso [diversity_stats()] for basic statistic calculation, 
#'    [diversity_ci()] for confidence intervals and plotting, and 
#'    [poppr()]. For bootstrap sampling:
#'    [stats::rmultinom()] [boot::boot()]
#'    
#' @details 
#'   Bootstrapping is performed in three ways:
#'   \itemize{
#'     \item if `n.rare` is a number greater than zero, then bootstrapping
#'     is performed by randomly sampling without replacement *n.rare*
#'     samples from the data.
#'     
#'     \item if `n.boot` is greater than 1, bootstrapping is performed by
#'     sampling n.boot samples from a multinomial distribution weighted by the
#'     proportion of each MLG in the data.
#'     
#'     \item if `n.boot` is less than 2, bootstrapping is performed by 
#'     sampling N samples from a multinomial distribution weighted by the
#'     proportion of each MLG in the data.
#'   }
#'   
#'   \subsection{Downward Bias}{
#'     When sampling with replacement, the diversity statistics here present a 
#'     downward bias partially due to the small number of samples in the data. 
#'     The result is that the mean of the bootstrapped samples will often be 
#'     much lower than the observed value. Alternatively, you can increase the
#'     sample size of the bootstrap by increasing the size of `n.boot`. Both
#'     of these methods should be taken with caution in interpretation. There
#'     are several R packages freely available that will calculate and perform
#'     bootstrap estimates of Shannon and Simpson diversity metrics (eg.
#'     \pkg{entropart}, \pkg{entropy}, \pkg{simboot}, and
#'     \pkg{EntropyEstimation}. These packages also offer unbiased estimators of
#'     Shannon and Simpson diversity. Please take care when attempting to
#'     interpret the results of this function.
#'   }
#' @export
#' @md
#' @author Zhian N. Kamvar
#' @examples
#' library(poppr)
#' data(Pinf)
#' tab <- mlg.table(Pinf, plot = FALSE)
#' diversity_boot(tab, 10L)
#' \dontrun{
#' # This can be done in a parallel fashion (OSX uses "multicore", Windows uses "snow")
#' system.time(diversity_boot(tab, 10000L, parallel = "multicore", ncpus = 4L))
#' system.time(diversity_boot(tab, 10000L))
#' }
#' @importFrom boot boot boot.ci norm.ci
#==============================================================================#
diversity_boot <- function(tab, n, n.boot = 1L, n.rare = NULL, H = TRUE, 
                           G = TRUE, lambda = TRUE, E5 = TRUE, ...){
  if (!is.null(n.rare)){
    FUN <- rare_sim_boot
    mle <- n.rare
  } else {
    FUN <- multinom_boot
    mle <- n.boot
  }
  res <- apply(tab, 1, boot_per_pop, rg = FUN, n = n, mle = mle, H = H, G = G, 
               lambda = lambda, E5 = E5, ...)
  return(res)
}

#==============================================================================#
#' Perform bootstrap statistics, calculate, and plot confidence intervals.
#' 
#' This function is for calculating bootstrap statistics and their confidence 
#' intervals. It is important to note that the calculation of confidence 
#' intervals is not perfect (See Details). Please be cautious when interpreting 
#' the results.
#' 
#' @param tab a [genind()], [genclone()],
#'   [snpclone()], OR a matrix produced from 
#'   [poppr::mlg.table()].
#' @param n an integer defining the number of bootstrap replicates (defaults to 
#'   1000).
#' @param n.boot an integer specifying the number of samples to be drawn in each
#'   bootstrap replicate. If `n.boot` < 2 (default), the number of samples 
#'   drawn for each bootstrap replicate will be equal to the number of samples
#'   in the data set. See Details.
#' @param ci the percent for confidence interval.
#' @param total argument to be passed on to [poppr::mlg.table()] if 
#'   `tab` is a genind object.
#' @param rarefy if `TRUE`, bootstrapping will be performed on the smallest
#'   population size or the value of `n.rare`, whichever is larger. 
#'   Defaults to `FALSE`, indicating that bootstrapping will be performed 
#'   respective to each population size.
#' @param n.rare an integer specifying the smallest size at which to resample 
#'   data. This is only used if `rarefy = TRUE`.
#' @param plot If `TRUE` (default), boxplots will be produced for each 
#'   population, grouped by statistic. Colored dots will indicate the observed 
#'   value.This plot can be retrieved by using `p <- last_plot()` from the 
#'   \pkg{ggplot2} package.
#' @param raw if `TRUE` (default) a list containing three elements will be 
#'   returned
#' @param center if `TRUE` (default), the confidence interval will be 
#'   centered around the observed statistic. Otherwise, if `FALSE`, the 
#'   confidence interval will be bias-corrected normal CI as reported from 
#'   [boot::boot.ci()]
#' @param ... parameters to be passed on to [boot::boot()] and 
#'   [diversity_stats()]
#'   
#' @return \subsection{raw = TRUE}{
#' 
#'    - **obs** a matrix with observed statistics in columns,
#'   populations in rows
#'    - **est** a matrix with estimated statistics in columns,
#'   populations in rows
#'    - **CI** an array of 3 dimensions giving the lower and upper
#'   bound, the index measured, and the population.
#'    - **boot** a list containing the output of
#'   [boot::boot()] for each population.
#'   }
#' 
#' \subsection{raw = FALSE}{ a data frame with the statistic observations,
#' estimates, and confidence intervals in columns, and populations in rows. Note
#' that the confidence intervals are converted to characters and rounded to
#' three decimal places. }
#' 
#' @details 
#'   \subsection{Bootstrapping}{
#'   For details on the bootstrapping procedures, see 
#'   [diversity_boot()]. Default bootstrapping is performed by 
#'   sampling \strong{N} samples from a multinomial distribution weighted by the
#'   relative multilocus genotype abundance per population where \strong{N} is
#'   equal to the number of samples in the data set. If \strong{n.boot} > 2,
#'   then \strong{n.boot} samples are taken at each bootstrap replicate. When
#'   `rarefy = TRUE`, then samples are taken at the smallest population
#'   size without replacement. This will provide confidence intervals for all
#'   but the smallest population.
#'   }
#'   \subsection{Confidence intervals}{
#'   Confidence intervals are derived from the function 
#'   [boot::norm.ci()]. This function will attempt to correct for bias
#'   between the observed value and the bootstrapped estimate. When `center
#'   = TRUE` (default), the confidence interval is calculated from the
#'   bootstrapped distribution and centered around the bias-corrected estimate
#'   as prescribed in Marcon (2012). This method can lead to undesirable
#'   properties, such as the confidence interval lying outside of the maximum
#'   possible value. For rarefaction, the confidence interval is simply
#'   determined by calculating the percentiles from the bootstrapped
#'   distribution. If you want to calculate your own confidence intervals, you
#'   can use the results of the permutations stored in the `$boot` element
#'   of the output.
#'   }
#'   \subsection{Rarefaction}{
#'   Rarefaction in the sense of this function is simply sampling a subset of 
#'   the data at size **n.rare**. The estimates derived from this method
#'   have straightforward interpretations and allow you to compare diversity
#'   across populations since you are controlling for sample size.
#'   }
#'   \subsection{Plotting}{ Results are plotted as boxplots with point
#'   estimates. If there is no rarefaction applied, confidence intervals are
#'   displayed around the point estimates. The boxplots represent the actual
#'   values from the bootstrapping and will often appear below the estimates and
#'   confidence intervals. 
#'   }
#' @note 
#'   \subsection{Confidence interval calculation}{ Almost all of the statistics 
#'   supplied here have a maximum when all genotypes are equally represented. 
#'   This means that bootstrapping the samples will always be downwardly biased.
#'   In many cases, the confidence intervals from the bootstrapped distribution 
#'   will fall outside of the observed statistic. The reported confidence 
#'   intervals here are reported by assuming the variance of the bootstrapped 
#'   distribution is the same as the variance around the observed statistic. As
#'   different statistics have different properties, there will not always be
#'   one clear method for calculating confidence intervals. A suggestion for
#'   correction in Shannon's index is to center the CI around the observed
#'   statistic (Marcon, 2012), but there are theoretical limitations to this.
#'   For details, see <https://stats.stackexchange.com/q/156235/49413>.
#'   }
#' 
#'   \subsection{User-defined functions}{ 
#'   While it is possible to use custom functions with this, there are three
#'   important things to remember when using these functions:
#'   
#'     1. The function must return a single value. 
#'     2. The function must allow for both matrix and vector inputs 
#'     3. The function name cannot match or partially match any arguments 
#'     from [boot::boot()]
#'    
#'   Anonymous functions are okay \cr(e.g. `function(x)
#'   vegan::rarefy(t(as.matrix(x)), 10)`).
#'   }
#' @export
#' @md
#' @seealso [diversity_boot()] [diversity_stats()]
#'   [poppr()] [boot::boot()] [boot::norm.ci()]
#'   [boot::boot.ci()]
#' @author Zhian N. Kamvar
#' @references
#' Marcon, E., Herault, B., Baraloto, C. and Lang, G. (2012). The Decomposition 
#' of Shannon’s Entropy and a Confidence Interval for Beta Diversity.
#' *Oikos* 121(4): 516-522.
#' 
#' @examples
#' library(poppr)
#' data(Pinf)
#' diversity_ci(Pinf, n = 100L)
#' \dontrun{
#' # With pretty results
#' diversity_ci(Pinf, n = 100L, raw = FALSE)
#' 
#' # This can be done in a parallel fasion (OSX uses "multicore", Windows uses "snow")
#' system.time(diversity_ci(Pinf, 10000L, parallel = "multicore", ncpus = 4L))
#' system.time(diversity_ci(Pinf, 10000L))
#' 
#' # We often get many requests for a clonal fraction statistic. As this is 
#' # simply the number of observed MLGs over the number of samples, we 
#' # recommended that people calculate it themselves. With this function, you
#' # can add it in:
#' 
#' CF <- function(x){
#'  x <- drop(as.matrix(x))
#'  if (length(dim(x)) > 1){
#'    res <- rowSums(x > 0)/rowSums(x)
#'  } else {
#'    res <- sum(x > 0)/sum(x)
#'  }
#'  return(res)
#' }
#' # Show pretty results
#' 
#' diversity_ci(Pinf, 1000L, CF = CF, center = TRUE, raw = FALSE)
#' diversity_ci(Pinf, 1000L, CF = CF, rarefy = TRUE, raw = FALSE)
#' }
#' 
#==============================================================================#
diversity_ci <- function(tab, n = 1000, n.boot = 1L, ci = 95, total = TRUE, 
                         rarefy = FALSE, n.rare = 10, plot = TRUE, raw = TRUE, 
                         center = TRUE, ...){
  allowed_objects <- c("genind", "genclone", "snpclone")
  if (!is.matrix(tab) & inherits(tab, allowed_objects)){
    tab <- mlg.table(tab, total = total, plot = FALSE)
  }
  rareval <- NULL
  if (rarefy){
    rareval <- max(min(rowSums(tab)), n.rare)
    msg     <- paste("\nSamples for rarefaction:", rareval)
    message(msg)
  } else {
    if (center == TRUE){
      msg <- paste("\nConfidence Intervals have been centered around observed",
                   "statistic.\nPlease see ?diversity_ci for details.\n")
      message(msg)
    } else if (n.boot > 2){
      msg <- paste("\nTake caution in interpreting the results.\nWhile sampling",
                   n.boot, "samples will produce a confidence interval,\nit",
                   "might be unclear what the correct interpretation should be.\n")
      message(msg)
    }
    
  }
  res  <- diversity_boot(tab, n, n.rare = rareval, n.boot = n.boot, ...)
  orig <- get_boot_stats(res)
  statnames <- colnames(orig)
  CI  <- get_all_ci(res, ci = ci, index_names = statnames, 
                    center = if (rarefy) FALSE else center,
                    btype =  if (rarefy) "percent" else "normal",
                    bci =    if (rarefy) FALSE else TRUE)
  est <- get_boot_se(res, "mean")
  out <- list(obs = orig, est = est, CI = CI, boot = res)
  if (plot) {
    boot_plot(res = res, 
              orig = orig,  
              CI = if (rarefy) NULL else CI
              )
  }
  out <- if (raw) out else do.call("pretty_info", out)
  return(out)
}

Try the poppr package in your browser

Any scripts or data that you put into this service are public.

poppr documentation built on May 29, 2024, 5:54 a.m.