Phenoflow_package: Advanced analysis of microbial flow cytometry data

Documented in Diversity_16S

#' Diversity function for 16S amplicon data
#'
#' This function calculates Hill diversity metrics from 16S amplicon data (in phyloseq format).
#' D0 (richness) is calculated with three methods: 1) Observed richness, 2) Chao1 estimator
#' 3) breakaway method (Willis and Bunge 2016). D1 (exponential of Shannon entropy)
#' and D2 (inverse Simpson index) are respectively Hill order 1 and 2.
#' Errors for D1 and D2 are calculated by bootstrapping.
#' 
#' @param x phyloseq object generated by the phyloseq package
#' @param R Number of bootstraps to conduct. Defaults to 999
#' @param brea TRUE/FALSE if breakaway method for D0 estimation should be used.
#' Defaults to TRUE. This method fails easily if you don't have atleast 6 contiguous
#' frequencies.
#' @param thresh Minimum sample size required to perform Chao1 estimation.
#' @param parallel Should the calculation be parallelized? Defaults to FALSE
#' @param ncores How many cores should be used in case of parallel computation?
#' Defaults to 2.
#' @keywords diversity, fcm, alpha
#' @importFrom foreach %dopar%
#' @importFrom stats sd
#' @examples 
#' # First install phyloseq if you haven't yet
#' if(requireNamespace("phyloseq",quietly=TRUE)){
#' library(phyloseq)
#' } else {
#' source("https://bioconductor.org/biocLite.R")
#' biocLite("phyloseq")
#' library(phyloseq)
#' }
#' # Load data (V3-V4 amplicon data from doi: 10.1111/2041-210X.12607)
#' data(physeq_test)
#' # Opting for three bootstraps, because this can take some time.
#' Diversity_16S(phyloseq::prune_samples(phyloseq::sample_names(physeq_test) == "1", physeq_test), R=3)
#' @export

Diversity_16S <- function(x, R = 999, brea = TRUE, thresh = 200, parallel = FALSE, 
                          ncores = 2) {
  if (!requireNamespace("phyloseq", quietly = TRUE)) {
    stop("Phyloseq package needed for this function to work. Please install it.",
         call. = FALSE)
  }
  cat("\t**WARNING** this functions assumes that rows are samples and columns
      \tare taxa in your phyloseq object, please verify.\n")
  
  # Matrix for storing data
  DIV <- matrix(nrow = phyloseq::nsamples(x), ncol = 10)
  row.names(DIV) <- phyloseq::sample_names(x)
  
  # Diversity functions
  D0.boot <- function(x) sum(x != 0)
  D2.boot <- function(x) 1/sum((x)^2)
  D1.boot <- function(x) exp(-sum(x * log(x)))
  
  if(parallel == FALSE){
    # Start resampling
    for (i in 1:phyloseq::nsamples(x)) {
      temp.D0 <- c()
      temp.D1 <- c()
      temp.D2 <- c()
      temp.phy <- phyloseq::prune_samples(x = x, samples = phyloseq::sample_names(x)[i])
      cat(paste0(date(), "\tCalculating diversity for sample ",i,"/",phyloseq::nsamples(x)," --- ",phyloseq::sample_names(x)[i], "\n"))
      for (j in 1:R) {
        temp <- phyloseq::rarefy_even_depth(temp.phy, verbose = FALSE, replace = TRUE)
        # Calculate frequencies
        temp <- data.frame(phyloseq::transform_sample_counts(temp, fun = function(x) x/sum(x))@otu_table)
        # Calculate Diversities
        temp.D0 <- c(temp.D0, D0.boot(temp))
        temp.D1 <- c(temp.D1, D1.boot(temp))
        temp.D2 <- c(temp.D2, D2.boot(temp))
        # Store diversities at the end of resampling run
        if (j == R) {
          DIV[i, 1] <- mean(temp.D0)
          DIV[i, 2] <- stats::sd(temp.D0)
          DIV[i, 7] <- mean(temp.D1)
          DIV[i, 8] <- stats::sd(temp.D1)
          DIV[i, 9] <- mean(temp.D2)
          DIV[i, 10] <- stats::sd(temp.D2)
          remove(temp.D0, temp.D1, temp.D2)
          # cat(paste0(date(), "\tDone with sample ", phyloseq::sample_names(x)[i], "\n"))
        }
      }
      # Perform breakaway for richness estimation
      temp <- t(matrix(temp.phy@otu_table))
      temp <- temp[temp != 0]
      temp <- data.frame(table(temp))
      temp <- apply(temp, 2, FUN = function(x) as.integer(x))
      if(brea==TRUE && phyloseq::sample_sums(temp.phy) > thresh){
        rich <- breakaway::breakaway(temp, print = FALSE, plot = FALSE, answers = TRUE, force=TRUE)
        if(!is.null(rich)){
          DIV[i, 3] <- rich$est
          DIV[i, 4] <- rich$seest
        } else {
          DIV[i, 3] <- NA
          DIV[i, 4] <- NA
        }
      }
      if(phyloseq::sample_sums(temp.phy) >= thresh){
        rich.chao <- breakaway::chao1(temp, output = FALSE, answers = TRUE)
        DIV[i, 5] <- rich.chao$est
        DIV[i, 6] <- rich.chao$seest
      } else {
        DIV[i, 5] <- NA
        DIV[i, 6] <- NA
      }
    }
  } else {
    # Initiate/register multiple cores
    cl <- parallel::makeCluster(ncores)
    doParallel::registerDoParallel(cl)
    cat(date(), "\tUsing", ncores, "cores for calculations\n")
    # Start resampling
    for (i in 1:phyloseq::nsamples(x)) {
      temp.D0 <- c()
      temp.D1 <- c()
      temp.D2 <- c()
      temp.phy <- phyloseq::prune_samples(x = x, samples = phyloseq::sample_names(x)[i])
      cat(paste0(date(), "\tCalculating diversity for sample ", i, "/", phyloseq::nsamples(x)," --- ", phyloseq::sample_names(x)[i], "\n"))
      
      # Parallelize diversity calculations 
      tmp <- foreach::foreach(j = 1:R, .combine = rbind) %dopar% {
        temp <- phyloseq::rarefy_even_depth(temp.phy, verbose = FALSE, replace = TRUE)
        # Calculate frequencies
        temp <- data.frame(phyloseq::transform_sample_counts(temp, fun = function(x) x/sum(x))@otu_table)
        # Calculate Diversities
        cbind(D0.boot(temp), D1.boot(temp), D2.boot(temp))
      }
      
      DIV[i, c(1,7,9)] <- colMeans(tmp)
      DIV[i, c(2,8,10)] <- apply(tmp, 2, stats::sd)
      # if (i > 1) DIV <- rbind(DIV, matrix(c(colMeans(tmp), apply(tmp, 2, FUN = sd)), nrow = 1))
      
      # Perform breakaway for richness estimation
      temp <- t(matrix(temp.phy@otu_table))
      temp <- temp[temp != 0]
      temp <- data.frame(table(temp))
      temp <- apply(temp, 2, FUN = function(x) as.integer(x))
      if(brea==TRUE && phyloseq::sample_sums(temp.phy) > thresh){
        rich <- breakaway::breakaway(temp, print = FALSE, plot = FALSE, answers = TRUE, force=TRUE)
        if(!is.null(rich)){
          DIV[i, 3] <- rich$est
          DIV[i, 4] <- rich$seest
        } else {
          DIV[i, 3] <- NA
          DIV[i, 4] <- NA
        }
      }
      if(phyloseq::sample_sums(temp.phy) >= thresh){
        rich.chao <- breakaway::chao1(temp, output = FALSE, answers = TRUE)
        DIV[i, 5] <- rich.chao$est
        DIV[i, 6] <- rich.chao$seest
      } else {
        DIV[i, 5] <- NA
        DIV[i, 6] <- NA
      }
      cat(paste0(date(), "\tDone with sample ", phyloseq::sample_names(x)[i], "\n"))
    }
    
    if(parallel == TRUE){
      cat(date(), "\tClosing workers\n")
      parallel::stopCluster(cl)
    }
  }
 
  colnames(DIV) = c("D0", "sd.D0", "D0.bre" , "sd.D0.bre", "D0.chao", "sd.D0.chao", "D1", "sd.D1", "D2", 
                    "sd.D2")
  cat(date(), "\tDone with all", phyloseq::nsamples(x), "samples\n")
  return(DIV)
}