#' Identify differentially-abundant and differentially-variable taxa
#'
#' @param formula an object of class \code{formula} without the response: a symbolic description of the model to be fitted to the abundance
#' @param phi.formula an object of class \code{formula} without the response: a symbolic description of the model to be fitted to the dispersion
#' @param formula_null Formula for mean under null, without response
#' @param phi.formula_null Formula for overdispersion under null, without response
#' @param data a data frame containing the OTU table, or \code{phyloseq} object containing the variables in the models
#' @param link link function for abundance covariates, defaults to \code{"logit"}
#' @param phi.link link function for dispersion covariates, defaults to \code{"logit"}
#' @param test Character. Hypothesis testing procedure to use. One of \code{"Wald"}, \code{"LRT"} (likelihood ratio test), or \code{"Rao"}.
#' @param boot Boolean. Defaults to \code{FALSE}. Indicator of whether or not to use parametric bootstrap algorithm. (See \code{\link{pbWald}} and \code{\link{pbLRT}}).
#' @param B Optional integer. Number of bootstrap iterations. Ignored if \code{boot} is \code{FALSE}. Otherwise, defaults to \code{1000}.
#' @param sample_data Data frame or matrix. Defaults to \code{NULL}. If \code{data} is a data frame or matrix, this must be included as covariates/sample data.
#' @param taxa_are_rows Boolean. Optional. If \code{data} is a data frame or matrix, this indicates whether taxa are rows. Defaults to \code{TRUE}.
#' @param filter_discriminant Boolean. Defaults to \code{TRUE}. If \code{FALSE}, discriminant taxa will not be filtered out.
#' @param fdr_cutoff Integer. Defaults to \code{0.05}. Desired false discovery rate.
#' @param fdr Character. Defaults to \code{"fdr"}. False discovery rate control method, see \code{\link{p.adjust}} for more options.
#' @param full_output Boolean. Opetional. Defaults to \code{FALSE}. Indicator of whether to include full \code{bbdml} model output for all taxa.
#' @param inits Optional initializations for model fit using \code{formula} and \code{phi.formula} as rows of a matrix. Defaults to \code{NULL}.
#' @param inits_null Optional initializations for model fit using \code{formula_null} and \code{phi.formula_null} as rows of a matrix. Defaults to \code{NULL}.
#' @param try_only Optional numeric. Will try only the \code{try_only} taxa, specified either via numeric input or character taxa names. Useful for speed when troubleshooting. Defaults to \code{NULL}, testing all taxa.
#' @param verbose Boolean. Defaults to \code{FALSE}; print status updates for long-running analyses
#' @param robust Should robust standard errors be used? If not, model-based standard errors are used. Logical, defaults to \code{FALSE}.
#' @param ... Optional additional arguments for \code{\link{bbdml}}
#'
#' @details See package vignette for details and example usage. Make sure the number of columns in all of the initializations are correct! \code{inits} probably shouldn't match \code{inits_null}. To use a contrast matrix, see \code{\link{contrastsTest}}.
#'
#' @return An object of class \code{differentialTest}. List with elements \code{p} containing the p-values, \code{p_fdr} containing the p-values after false discovery rate control, \code{significant_taxa} containing the taxa names of the statistically significant taxa, \code{significant_models} containing a list of the model fits for the significant taxa, \code{all_models} containing a list of the model fits for all taxa, \code{restrictions_DA} containing a list of covariates that were tested for differential abundance, \code{restrictions_DV} containing a list of covariates that were tested for differential variability, \code{discriminant_taxa_DA} containing the taxa for which at least one covariate associated with the abundance was perfectly discriminant, \code{discriminant_taxa_DV} containing the taxa for which at least one covariate associated with the dispersion was perfectly discriminant, \code{data} containing the data used to fit the models. If \code{full_output = TRUE}, it will also include \code{full_output}, a list of all model output from \code{bbdml}.
#'
#'
#' @examples
#'
#' # data frame example
#' data(soil_phylum_small_sample)
#' data(soil_phylum_small_otu)
#' da_analysis <- differentialTest(formula = ~ DayAmdmt,
#' phi.formula = ~ DayAmdmt,
#' formula_null = ~ 1,
#' phi.formula_null = ~ DayAmdmt,
#' test = "Wald", boot = FALSE,
#' data = soil_phylum_small_otu,
#' sample_data = soil_phylum_small_sample,
#' fdr_cutoff = 0.05,
#' try_only = 1:5)
#'
#' # phyloseq example (only run if you have phyloseq installed)
#' \dontrun{
#' data_phylo <- phyloseq::phyloseq(phyloseq::sample_data(soil_phylum_small_sample),
#' phyloseq::otu_table(soil_phylum_small_otu, taxa_are_rows = TRUE))
#' da_analysis <- differentialTest(formula = ~ DayAmdmt,
#' phi.formula = ~ DayAmdmt,
#' formula_null = ~ 1,
#' phi.formula_null = ~ DayAmdmt,
#' test = "Wald", boot = FALSE,
#' data = data_phylo,
#' fdr_cutoff = 0.05,
#' try_only = 1:5)
#' }
#'
#' @export
differentialTest <- function(formula, phi.formula,
formula_null, phi.formula_null,
data,
link = "logit",
phi.link = "logit",
test,
boot = FALSE,
B = 1000,
sample_data = NULL,
taxa_are_rows = TRUE,
filter_discriminant = TRUE,
fdr_cutoff = 0.05,
fdr = "fdr",
full_output = FALSE,
inits = NULL,
inits_null = NULL,
try_only = NULL,
verbose = FALSE,
robust = FALSE,
...) {
# Record call
call <- match.call(expand.dots = TRUE)
# Record mu link
link <- match.arg(link, choices = "logit")
# Record phi link
phi.link <- match.arg(phi.link, choices = c("fishZ", "logit"))
# Convert phyloseq objects
if ("phyloseq" %in% class(data)) {
if (requireNamespace("phyloseq", quietly = TRUE)) {
# Set up response
taxanames <- phyloseq::taxa_names(data)
sample_data <- data.frame(phyloseq::sample_data(data))
} else {
warn_phyloseq()
}
} else if (is.matrix(data) || is.data.frame(data)) {
# # use phyloseq
# OTU <- phyloseq::otu_table(data, taxa_are_rows = taxa_are_rows)
#
# # Make sample data
# sampledata <- phyloseq::sample_data(data.frame(
# sample_data,
# row.names = phyloseq::sample_names(OTU)
# ))
#
# # Make phyloseq object
# data <- phyloseq::phyloseq(OTU, sampledata)
# # Set up response
# taxanames <- phyloseq::taxa_names(data)
if (taxa_are_rows) {
data <- t(data)
}
taxanames <- colnames(data)
M <- rowSums(data)
} else {
stop("Input must be either data frame, matrix, or phyloseq object!")
}
# Set up output
pvals <- perfDisc_DA <- perfDisc_DV <- rep(NA, length(taxanames))
model_summaries <- rep(list(NA), length(taxanames))
full_outputs <- rep(list(NA), length(taxanames))
# check to make sure inits is of the same length
if (!is.null(inits)) {
ncol1 <- ncol(stats::model.matrix(object = formula, data = sample_data))
ncol2 <- ncol(stats::model.matrix(object = phi.formula, data = sample_data))
if (length(inits) != ncol1 + ncol2) {
stop("inits must match number of regression parameters in formula and phi.formula!")
}
}
# inits_null_mu
if (!is.null(inits_null)) {
ncol1 <- ncol(stats::model.matrix(object = formula_null, data = sample_data))
ncol2 <- ncol(stats::model.matrix(object = phi.formula_null, data = sample_data))
if (length(inits_null) != ncol1 + ncol2) {
stop("init_null must match number of regression parameters in formula_null and phi.formula_null!")
}
}
restrict_ind <- 0
if (is.null(try_only)) {
try_only <- 1:length(taxanames)
}
if (is.character(try_only)) {
try_only <- which(taxanames %in% try_only)
}
# Loop through OTU/taxa
for (i in try_only) {
if (verbose) print(paste0(" ------- Fitting ", taxanames[i], ", taxa ", i, " of ", length(try_only), " -------"))
# Subset data to only select that taxa
if ("phyloseq" %in% class(data)) {
data_i <- convert_phylo(data, select = taxanames[i])
} else {
response_i <- data.frame(W = data[, taxanames[i]], M = M)
data_i <- cbind(response_i, sample_data)
}
if (sum(data_i$W) == 0) {
perfDisc_DA[i] <- TRUE
perfDisc_DV[i] <- TRUE
} else {
# Update formula to match
formula_i <- stats::update(formula, cbind(W, M - W) ~ .)
formula_null_i <- stats::update(formula_null, cbind(W, M - W) ~ .)
# Fit unrestricted model
mod <- suppressWarnings(try(bbdml(formula = formula_i, phi.formula = phi.formula,
data = data_i, link = link, phi.link = phi.link,
inits = inits, robust = robust, ...), silent = TRUE))
#print(" -- model --")
#print(mod)
# Fit restricted model
mod_null <- suppressWarnings(try(bbdml(formula = formula_null_i, phi.formula = phi.formula_null,
data = data_i, link = link, phi.link = phi.link,
inits = inits_null, robust = robust, ...), silent = TRUE))
#print(" -- null model --")
#print(mod_null)
if (!("try-error" %in% c(class(mod), class(mod_null)))) {
if (restrict_ind == 0) {
restricts <- getRestrictionTerms(mod = mod, mod_null = mod_null)
restrict_ind <- 1
}
# If both models fit, otherwise keep as NA
model_summaries[[i]] <- suppressWarnings(summary(mod))
if (full_output) {
full_outputs[[i]] <- suppressWarnings(mod)
}
if (test == "Wald") {
if (boot) {
tmp <- try(pbWald(mod = mod, mod_null = mod_null, B = B, robust = robust), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
} else {
tmp <- try(waldchisq(mod = mod, mod_null = mod_null, robust = robust), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
}
} else if (test == "Rao") {
if (boot) {
tmp <- try(pbRao(mod = mod, mod_null = mod_null, B = B), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
} else {
tmp <- try(raotest(mod = mod, mod_null = mod_null), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
}
} else if (test == "LRT") {
if (mod$has_noninteger) stop("Amy needs to think about whether you can test via LRTs with non-integer data first! \n We would recommend robust Wald testing instead. \n If you really, really, really want to LRT with non-integer data, \n please post an issue to GitHub and Amy will think about this for you.")
if (boot) {
tmp <- try(pbLRT(mod = mod, mod_null = mod_null, B = B), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
} else {
tmp <- try(lrtest(mod = mod, mod_null = mod_null), silent = TRUE)
if (!inherits(tmp, "try-error")) {
pvals[i] <- tmp
}
}
} else {
stop("Invalid test argument?")
}
perfDisc_DA[i] <- mod$sep_da
perfDisc_DV[i] <- mod$sep_dv
}
}
}
ind_disc_da <- which(perfDisc_DA == TRUE)
ind_disc_dv <- which(perfDisc_DV == TRUE)
disc_vec_da <- taxanames[ind_disc_da]
disc_vec_dv <- taxanames[ind_disc_dv]
ind_disc <- union(ind_disc_da, ind_disc_dv)
if (test == "Wald") {
if (filter_discriminant && length(ind_disc) > 0) {
# Want to keep same length, rest will ignore NAs
pvals[ind_disc] <- NA
}
}
if (all(is.na(pvals))) {
message("All p-values are NA! \n
There are a number of reasons why this could happen, including
- All models failed to converge because your model is overspecified
(e.g., because your design matrix X is not full rank, or because your sample size
is not large enough to estimate all parameters)
- You misspelled an argument. Double-check your values for the arguments (e.g., `link`, `phi.link`,
`method`, etc.) to makes sure that they follow the specified options.
- Other reasons not listed here. \n
We *strongly recommend* running `bbdml` on a single taxon (especially before posting an issue on GitHub). \n
If a error message was thrown by `bbdml`, it is reproduced below for a single taxon.
If no errors were thrown by `bbdml`, and the issue is instead in `differentialTest`, no output is shown. \n")
i <- (try_only[!(try_only %in% ind_disc)])[1]
if ("phyloseq" %in% class(data)) {
data_i <- convert_phylo(data, select = taxanames[i])
} else {
response_i <- data.frame(W = data[, taxanames[i]], M = M)
data_i <- cbind(response_i, sample_data)
}
formula_i <- stats::update(formula, cbind(W, M - W) ~ .)
formula_null_i <- stats::update(formula_null, cbind(W, M - W) ~ .)
mod <- bbdml(formula = formula_i, phi.formula = phi.formula,
data = data_i, link = link, phi.link = phi.link,
inits = inits, ...)
mod_null <- bbdml(formula = formula_null_i, phi.formula = phi.formula_null,
data = data_i, link = link, phi.link = phi.link,
inits = inits_null, ...)
print(tmp)
}
post_fdr <- stats::p.adjust(pvals, method = fdr)
names(pvals) <- names(post_fdr) <- taxanames
# Record significant taxa
signif_vec <- taxanames[which(post_fdr < fdr_cutoff)]
signif_models <- model_summaries[which(post_fdr < fdr_cutoff)]
restricts_mu <- setdiff(colnames(stats::model.matrix(object = formula, data = sample_data)),
colnames(stats::model.matrix(object = formula_null, data = sample_data)))
restricts_phi <- setdiff(colnames(stats::model.matrix(object = phi.formula, data = sample_data)),
colnames(stats::model.matrix(object = phi.formula_null, data = sample_data)))
attr(restricts_mu, "index") <- restricts$mu
attr(restricts_phi, "index") <- restricts$phi
if (!full_output) {
full_outputs <- NULL
}
structure(
list("p" = pvals, "p_fdr" = post_fdr,
"significant_taxa" = signif_vec,
"significant_models" = signif_models,
"all_models" = model_summaries,
"restrictions_DA" = restricts_mu,
"restrictions_DV" = restricts_phi,
"discriminant_taxa_DA" = disc_vec_da,
"discriminant_taxa_DV" = disc_vec_dv,
"data" = data,
"full_output" = full_outputs),
class = "differentialTest"
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.