DivNet: Diversity Estimation in Networked Ecological Communities

Documented in divnet

#' divnet
#'
#' @param W An abundance table with taxa as columns and samples as rows; or a phyloseq object.
#' @param X The covariate matrix, with samples as rows and variables as columns. Defaults to NULL (sample_names are the covariates). Instead of specifying \code{X}, you can specify this information using the argument \code{formula}. If you specify a \code{formula} and \code{W} is a phyloseq object, this argument will be ignored.
#' @param fitted_model object produced by fit_aitchison. Defaults to NULL.
#' @param tuning settings for tuning the MC-MH algorithm. Options include NULL (defaults to "fast"), "fast", "careful" or a named list with components EMiter (number of EM iterations; 6 for fast, 10 for careful), EMburn (number of EM iterations to burn; 3 for fast, 5 for careful), MCiter (number of MC iterations; 500 for fast, 1000 for careful), MCburn (number of MC iterations to burn; 250 for fast, 500 for careful) and stepsize (variance used for MH samples; 0.01 for both fast and careful)
#' @param perturbation Perturbation magnitude for zero values when calculating logratios.
#' @param network How to estimate network. Defaults to NULL (the default), "default" (generalised inverse, aka naive). Other options include "diagonal", "stars" (requires glasso and SpiecEasi to be installed), or a function that you want to use to estimate the network
#' @param base The column index of the base taxon in the columns of W, or the name of the taxon (must be a column name of W, or a taxon name if W is a phyloseq object). If NULL, will use `pick_base` to choose a taxon. If no taxa are observed in all samples, an error will be thrown. In that case, we recommend trying a number of different highly abundant taxa to confirm the results are robust to the taxon choice.
#' @param ncores Number of cores to use for parallelization
#' @param variance method to get variance of estimates. Current options are "parametric" for parametric bootstrap, "nonparametric" for nonparametric bootstrap, and "none" for no variance estimates
#' @param B Number of bootstrap iterations for estimating the variance.
#' @param nsub Number of subsamples for nonparametric bootstrap. Defaults to half the number of observed samples.
#' @param formula an object of class \code{formula}: a symbolic description of the model to be fitted; a means of constructing \code{X} via \code{stats::model.matrix}. If \code{W} is a phyloseq object, the formula should refer to variables stored in sample_data. If \code{W} is not a phyloseq object, \code{X} should be a data frame containing columns referred to in your formula. Formula references must match column names found in the sample data from \code{W} or \code{X}. Optional, defaults to \code{NULL}.
#' @param ... Additional parameters to be passed to the network function
#'
#' @importFrom breakaway make_design_matrix
#' @importFrom magrittr "%>%"
#' @importFrom phyloseq otu_table
#' @importFrom phyloseq sample_names
# #' @importFrom breakaway alpha_estimate
# #' @importFrom breakaway alpha_estimates
# #' @importClassesFrom phyloseq phyloseq
# #' @importClassesFrom breakaway alpha_estimate alpha_estimates
#'
#' @import phyloseq
#' @import breakaway
#'
#' @author Amy Willis
#'
#' @export
divnet <-  function(W,
                    X = NULL,
                    fitted_model = NULL,
                    tuning = NULL,
                    perturbation = NULL,
                    network = NULL,
                    base = NULL,
                    ncores = NULL,
                    variance = "parametric",
                    B = 5,
                    nsub = NULL,
                    formula = NULL,
                    ...) {

  if (!is.null(formula)) {
    if ("phyloseq" %in% class(W)) {
      X <- data.frame(phyloseq::sample_data(W))
      X <- stats::model.matrix(object = formula, data = X)
    } else {
      # get model.matrix
      X <- data.frame(X)
      X <- stats::model.matrix(object = formula, data = X)
    }
  }

  if ("phyloseq" %in% class(W)) {

    input_data <- W

    W <- input_data %>% otu_table %>% as.matrix
    suppressWarnings({class(W) <- "matrix"})

    if (phyloseq::taxa_are_rows(input_data)) W <- W %>% t

    samples_names <- input_data %>% sample_names

    # make the design matrix
    if (is.character(X)) {
      X <- breakaway::make_design_matrix(input_data, X)
    } else if (is.null(X)) {
      xx <- input_data %>% sample_data %>% rownames
      X <- model.matrix(~xx)
      #X <- matrix(1, ncol = 1, nrow = nrow(W))
    }
  } else if ("otu_table" %in% class(W)) {

    input_data <- W
    W <- input_data %>% as.matrix
    suppressWarnings({class(W) <- "matrix"})

    if (phyloseq::taxa_are_rows(input_data)) W <- W %>% t

    samples_names <- input_data %>% sample_names

  } else {
    samples_names <- rownames(W)
  }

  # autogenerate sample names
  if (is.null(samples_names)) {
    samples_names <- paste0("sample_", 1:nrow(W))
  }

  if (is.null(X)) {
    #X <- matrix(1, ncol=1, nrow=nrow(W))
    X <- model.matrix(~samples_names)
  }

  # remove taxa that weren't observed
  # yes, this is a good idea
  if (any(colSums(W) == 0)) {
    message("Removing absent taxa!")
    W <- W[ , which(colSums(W) > 0)]
  }

  if (nrow(W) == 1) {
    stop("DivNet requires more than 1 sample")
  }
  if (ncol(W) == 2) {
    stop("Cannot fit a network model with 2 taxa")
  }

  if ("character" %in% class(base)) {
    if (base %in% colnames(W)) {
      base <- which(base == colnames(W))
    } else {
      stop("base not found in taxon names. base taxon may be unobserved in all samples, or you may have a typo.")
    }
  }


  if (is.null(fitted_model)) {
    fitted_model <- fit_aitchison(W,
                                  X = X,
                                  tuning = tuning,
                                  perturbation = perturbation,
                                  network = network,
                                  base = base,
                                  ncores = ncores,
                                  ...)
  }
  zz <- fitted_model$fitted_z
  output_list <- get_diversities(zz, samples_names)

  base <- fitted_model$base

  if (variance == "parametric") {

    # resample from models
    parametric_list <- replicate(B,
                                 parametric_variance(fitted_model,
                                                     W = W,
                                                     X = X,
                                                     tuning = tuning,
                                                     perturbation = perturbation,
                                                     network = network,
                                                     base = base,
                                                     ncores = ncores,
                                                     ...),
                                 simplify=F)

    variance_estimates <- get_diversity_variance(parametric_list, samples_names)

    for(i in 1:length(variance_estimates)) {
      output_list[[names(variance_estimates)[i]]] <-  variance_estimates[[i]]
    }

    # Add variance to alpha_diversity class
    for(i in 1:nrow(W)) {
      output_list$shannon[[i]]$error <- sqrt(variance_estimates$`shannon-variance`)[i]
      output_list$shannon[[i]]$interval <- c(output_list$shannon[[i]]$estimate - 2*output_list$shannon[[i]]$error,
                                             output_list$shannon[[i]]$estimate + 2*output_list$shannon[[i]]$error)
      output_list$shannon[[i]]$interval_type <- "symmetric"

      output_list$simpson[[i]]$error <- sqrt(variance_estimates$`simpson-variance`)[i]
      output_list$simpson[[i]]$interval <- c(output_list$simpson[[i]]$estimate - 2*output_list$simpson[[i]]$error,
                                             output_list$simpson[[i]]$estimate + 2*output_list$simpson[[i]]$error)
      output_list$simpson[[i]]$interval_type <- "symmetric"
    }
    variance_estimates$`shannon-variance` <- variance_estimates$`simpson-variance` <- NULL

  } else if (variance == "nonparametric") {
    if (is.null(nsub)) nsub <- ceiling(dim(W)[1]/2)

    nonparametric_list <- replicate(B,
                                    nonparametric_variance(W = W,
                                                           X = X,
                                                           tuning = tuning,
                                                           perturbation = perturbation,
                                                           network = network,
                                                           base = base,
                                                           ncores = ncores,
                                                           nsub = nsub,
                                                           ...),
                                    simplify=F)
    variance_estimates <- get_diversity_variance(nonparametric_list, samples_names)

    for(i in 1:length(variance_estimates)) {
      output_list[[names(variance_estimates)[i]]] <-  variance_estimates[[i]]
    }
  } else if (variance == 0 | variance == "none") {
    # do nothing, no warning
  } else {
    warning("No variance estimate is computed")
  }

  output_list[["X"]] <- X
  output_list[["fitted_z"]] <- zz

  class(output_list) <- c("diversityEstimates", class(output_list))

  output_list
}