repvar: Extract Samples to Represent All Variables

Documented in rpv_stats

#' Calculate Entropy Statistics
#'
#' Because it's possible to have multiple results with a minimum number of
#' samples, one way of assessing their importance is to calculate how
#' distributed the alleles are among the samples. This can be done with entropy
#' statistics
#'
#' @param tab a numeric matrix
#' @param f a factor that is the same length as the number of columns in `tab`.
#'   this is used to split the matrix up by groups for analysis.
#'
#' @return a data frame with three columns: `eH`, `G`, `E5`, `lambda`, and
#'   `missing`
#' @details This function caluclates four statistics from your data using
#'   variable counts.
#'
#'   * eH: The exponentiation of shannon's entropy: `exp(sum(-x * log(x)))`
#'   (Shannon, 1948)
#'   * G : Stoddart and Taylor's index, or inverse Simpson's index: `1/sum(x^2)`
#'   (Stoddart and Taylor, 1988; Simpson, 1949)
#'   * E5: Evenness (5) the ratio between the above two estimates:
#'   `(G - 1)/(eH - 1)` (Pielou, 1975)
#'   * lambda: Unbiased Simpson's index: `(n/(n-1))*(1 - sum(x^2))`
#'   * missing: the percent missing data out of the total number of cells.
#'
#'   Both G and eH can be thought of as the number of equally abundant variables
#'   to acheive the same observed diversity. Both G and eH give different weight
#'   to variables based on their abundance, so we use evenness to describe how
#'   uniform this distribution is.
#'
#'   Note that this version of Evenness is different than Shannon's Evenness,
#'   which is `H/ln(S)` where S is the number of variables (in our case).
#'
#'   If a vector of factors is supplied, the columns of the matrix is first
#'   split by this factor and each statistic calculated on each level.
#'
#' @note The calculations within this function are derived from the vegan and
#'   poppr R packages.
#'
#' @export
#' @references
#'
#' Claude Elwood Shannon. A mathematical theory of communication. Bell Systems
#' Technical Journal, 27:379-423,623-656, 1948
#'
#' Simpson, E. H. Measurement of diversity. Nature 163: 688, 1949
#' doi:10.1038/163688a0
#'
#' J.A. Stoddart and J.F. Taylor. Genotypic diversity: estimation and prediction
#' in samples. Genetics, 118(4):705-11, 1988.
#'
#' E.C. Pielou. Ecological Diversity. Wiley, 1975.
#'
#' @examples
#'
#' # Calculate statistics for the whole data set -----------------------------
#'
#' data(monilinia)
#' rpv_stats(monilinia)
#'
#' # Use a grouping factor for variables -------------------------------------
#' # Each variable in this data set represents and allele that is one of
#' # thirteen loci. If we wanted a table across all loci individually, we can
#' # group by locus name.
#'
#' f <- gsub("[.][0-9]+", "", colnames(monilinia))
#' f <- factor(f, levels = unique(f))
#' colMeans(emon <- rpv_stats(monilinia, f = f)) # average entropy across loci
#' emon
#'
#' # calculating entropy for minimum sets ------------------------------------
#'
#' set.seed(1999)
#' i <- rpv_find(monilinia, n = 150, cut = TRUE, progress = FALSE)
#' colMeans(emon1 <- rpv_stats(monilinia[i[[1]], ], f = f))
#' colMeans(emon2 <- rpv_stats(monilinia[i[[2]], ], f = f))
#'
rpv_stats <- function(tab, f = NULL) {
  if (!is.null(f)) {
    if (length(f) != ncol(tab)) {
      stop("f must be the same number of columns as tab")
    }
    if (!is.factor(f)) {
      warning("f is not a factor. I am coercing f to be a factor in order")
      f <- factor(f, levels = unique(f))
    }
    groups <- split.matrix(tab, f)
    res    <- lapply(groups, rpv_stats, f = NULL)
    resdf  <- do.call("rbind", res)
    return(resdf)
  }
  tabsums <- colSums(tab, na.rm = TRUE)
  n       <- sum(tabsums)
  x       <- tabsums/(sum(tabsums))
  sx2     <- sum(x * x)
  H       <- exp(sum(-x * log(x)))
  G       <- 1/(sx2)
  lambda  <- (n/(n - 1)) * (1 - sx2)
  E5      <- (G - 1)/(H - 1)
  missing <- sum(is.na(tab))/length(tab)
  data.frame(eH = H, G = G, E5 = E5, lambda = lambda, missing = missing)
}

split.matrix <- function(x, f, ...){
  tmp <- split.default(x, f[col(x)], ...)
  lapply(tmp, matrix, nrow = nrow(x))
}