R/testing.R

Defines functions combinatorial_association association iter_lm iter_clr iter_voom iter_limma unshrink iter_deseq2 check_variables

Documented in association combinatorial_association

# Functions for testing associations between the microbiome
# and exogenous variables

check_variables <- function(variable, counts, meta, confounders) {
    good <- !is.na(meta[[variable]])
    # Check for constant variables
    sds <- sapply(c(variable, confounders),
                 function(co) sd(as.numeric(meta[good, co])))
    if (any(is.na(sds) | sds < 1e-6)) {
        flog.info("Detected variable or confounder with zero variation.")
        return(NULL)
    }
    if (is.null(confounders)) {
        ref_model <- ~ 1
    } else {
        ref_model <- reformulate(confounders)
        num_conf <- sapply(confounders, function(co) as.numeric(meta[[co]]))
        corrs <- cor(as.numeric(meta[[variable]]), num_conf,
                     use = "pairwise.complete.obs")
        if (any(abs(corrs) > 0.9)) {
            flog.info(
                paste("Variable `%s` is correlated with the confounders.",
                       "Skipping it."),
                variable)
            return(NULL)
        }
    }
    if (sum(good) < length(confounders) + 2) {
        flog.info("Not enough degrees of freedom. Skipping variable `%s`.",
                  variable)
        return(NULL)
    }
    return(list(good = good, ref_model = ref_model))
}

iter_deseq2 <- function(variable, counts, meta, confounders, shrink, tax) {
    is_reg <- !is.factor(meta[[variable]])
    check <- check_variables(variable, counts, meta, confounders)
    if (is.null(check)) {
        return(NULL)
    } else {
        good <- check$good
        ref_model <- check$ref_model
    }
    if (ncol(counts) < 50) {
        fit_type <- "mean"
    } else {
        fit_type <- "local"
    }
    formula <- reformulate(c(confounders, variable))
    dds <- suppressMessages(
        DESeqDataSetFromMatrix(t(counts[good, ]), meta[good, ],
        design = formula))
    dds <- suppressMessages(
        DESeq(dds, test = "LRT", parallel = FALSE, quiet = TRUE,
              fitType = fit_type, reduced = ref_model,
              sfType = "poscounts"))
    res <- results(dds)
    if (shrink) {
        res <- lfcShrink(dds, coef = length(resultsNames(dds)),
                            res = res, quiet = TRUE)
    }
    res <- as.data.table(res)
    set(res, j = ifelse(is.na(tax), "variant", tax),
        value = colnames(counts))
    set(res, j = "variable", value = variable)
    set(res, j = "coef", value = resultsNames(dds)[length(resultsNames(dds))])
    if (is_reg) {
        n <- sum(good)
    } else {
        levs <- levels(meta[[variable]])
        n <- min(sum(meta[good, variable] == levs[1]),
                 sum(meta[good, variable] == levs[length(levs)]))
    }
    set(res, j = "n_eff", value = n)
    return(res)
}

unshrink <- function(fit, df) {
    dm <- dimnames(fit$t)
    fit$t <- fit$coefficients / fit$stdev.unscaled / fit$sigma
    dimnames(fit$t) <- dm
    fit$p.value <- 2 * pt(-abs(fit$t), df = df)
    dimnames(fit$p.value) <- dm
    return(fit)
}

iter_limma <- function(variable, counts, meta, confounders, shrink, tax,
                       strategy="clr") {
    is_reg <- !is.factor(meta[[variable]])
    check <- check_variables(variable, counts, meta, confounders)
    if (is.null(check)) {
        return(NULL)
    } else {
        good <- check$good
        ref_model <- check$ref_model
    }
    dformula <- reformulate(c(confounders, variable))
    design <- model.matrix(dformula, data = meta)
    if (strategy == "voom") {
        norm_counts <- t(suppressMessages(normalize(counts[good, ])))
        model <- voom(norm_counts, design, plot=FALSE)
    } else if (strategy == "clr") {
        model <- apply(
            counts[good, ], 2,
            function(x) log(x + 0.5) - mean(log(x + 0.5))
        ) %>% t()
        shrink <- FALSE
    } else {
        model <- t(counts[good, ])
    }

    fit <- lmFit(model, design)
    fit <- eBayes(fit)

    if (!shrink) {
        fit <- unshrink(fit, ncol(model) - ncol(design))
    }
    res <- topTable(fit, coef=ncol(design), sort.by="none", number=Inf)
    res <- as.data.table(res)
    names(res) <- c("log2FoldChange", "baseMean", "t", "pvalue", "padj", "B")
    res$baseMean <- 2 ^ res$baseMean
    set(res, j = ifelse(is.na(tax), "variant", tax),
        value = colnames(counts))
    set(res, j = "variable", value = variable)
    if (is_reg) {
        n <- sum(good)
    } else {
        levs <- levels(meta[[variable]])
        n <- min(sum(meta[good, variable] == levs[1]),
                 sum(meta[good, variable] == levs[length(levs)]))
    }
    set(res, j = "n_eff", value = n)
    return(res)
}

iter_voom <- function(...) {
    iter_limma(..., strategy = "voom")
}

iter_clr <- function(...) {
    iter_limma(..., strategy = "clr")
}

iter_lm <- function(...) {
    iter_limma(..., strategy = "lm")
}

#' Build a configuration for the alignment workflows.
#'
#' This can be saved and passed on to others to ensure reproducibility.
#'
#' @param ... Any arguments are used to update the default configuration. See
#'  the example below. Optional.
#' @return A list with the parameters used in the long read alignment
#'  workflow.
#' @export
#' @examples
#'  config <- config_association(variable = "glucose")
config_association <- config_builder(list(
    variables = NULL,
    taxa_rank = "genus",
    confounders = NULL,
    min_abundance = 10,
    in_samples = 0.1,
    presence_threshold = 1,
    independent_weighting = TRUE,
    standardize = TRUE,
    shrink = TRUE,
    method = "deseq2",
    threads = getOption("mc.cores", 1)
))

#' Run differential association tests between taxa counts and exogenous
#' factors.
#'
#' @param ps A phyloseq object containing the taxa counts.
#' @param ... A configuration as returned by \code{\link{config_association}}.
#' @return An artifact containing the results.
#' @examples
#'  NULL
#'
#' @export
#' @importFrom data.table set
#' @importFrom phyloseq sample_data
#' @importFrom DESeq2 DESeqDataSetFromMatrix DESeq results lfcShrink
#'  resultsNames estimateSizeFactors
#' @importFrom limma voom eBayes lmFit topTable
#' @importFrom stats model.matrix
association <- function(ps, ...) {
    config <- config_parser(list(...), config_association)
    meta <- as(sample_data(ps), "data.frame")
    if (!is.null(config$confounders)) {
        missing_conf <- apply(sample_data(ps)[, config$confounders], 1,
                              function(row) any(is.na(row)))
        ps <- prune_samples(!missing_conf, ps)
    }
    if ((!requireNamespace("IHW", quietly = TRUE)) &&
          config$independent_weighting) {
        stop("independent weighting requires the IHW package!")
    }
    if (config$standardize) meta <- standardize(meta)
    if (is.null(config$variables)) {
        variables <- names(meta)
    } else {
        variables <- config$variables
    }
    variables <- variables[!(variables %in% config$confounders)]
    counts <- as.matrix(taxa_count(ps, lev = config$taxa_rank))
    meta <- meta[rownames(counts), , drop = FALSE]
    too_rare <- (
        colSums(counts >= config$presence_threshold) /
        nrow(counts)) < config$in_samples
    too_few <- colMeans(counts) < config$min_abundance
    counts <- counts[, !(too_rare | too_few)]
    if (config$method == "deseq2") {
        iter <- iter_deseq2
    } else if (config$method == "voom") {
        iter <- iter_voom
    } else if (config$method == "clr") {
        iter <- iter_clr
    } else if (config$method == "lm") {
        iter <- iter_lm
    } else {
        stop("`%s` is not a recognized method :/", config$method)
    }

    apfun <- parse_threads(config$threads)
    report_at <- max(ceiling(length(variables) / 20),
                     ceiling(5e4 / nrow(counts)))
    tests <- apfun(1:length(variables), function(i) {
        v <- variables[i]
        test <- iter(v, counts = counts, meta = meta,
                     confounders = config$confounders,
                     shrink = config$shrink, tax = config$taxa_rank)
        if (i %% report_at == 0) {
            flog.info("Finished running %d/%d tests (%d%%).",
                      i * ncol(counts), length(variables) * ncol(counts),
                      floor(100 * i / length(variables)))
        }
        return(test)
    }) %>% rbindlist()

    if (length(variables) > 1) {
        if (config$independent_weighting) {
            weights <- IHW::ihw(pvalue ~ baseMean, tests, alpha = 0.05)
            set(tests, j = "padj", value = IHW::adj_pvalues(weights))
        } else {
            set(tests, j = "padj", value =
                p.adjust(tests$pvalue, method = "BH"))
        }
    }

    return(tests)
}


#' Run differential association tests between between all combinations of a
#' factor variable with DESeq2. Can be used as post-hoc test for regression.
#'
#' @param ps A phyloseq object containing the taxa counts.
#' @param variable The factor variable to permute.
#' @param tax The taxa level on which to run differential tests. Defaults to
#'  genus.
#' @param confounders A character vector containing the confounders that should
#'  be used.
#' @param min_abundance Minimum required number of average counts for a taxa.
#' @param in_samples Taxa must be present in at least this fraction of samples.
#' @param independent_weighting Whether to adjust p values by independent
#'  weighting or normal Benjamini-Hochberg.
#'  factors.
#' @param standardize Whether to standardize continuous variables to a mean
#'  of zero and a variance of 1. If True log fold changes for those variables
#'  denote are relative to a change of one standard deviation in the variable
#'  value.
#' @param shrink Whether to return shrunken log fold changes. Defaults to true.
#' @return A data.table containing the results.
#' @examples
#'  NULL
#'
#' @export
#' @importFrom data.table set
#' @importFrom stats glm p.adjust influence reformulate
#' @importFrom utils combn
#' @importFrom phyloseq sample_data
combinatorial_association <- function(ps, variable, tax = "genus",
                        confounders = NULL, min_abundance = 10,
                        in_samples = 0.1,
                        independent_weighting = TRUE, standardize = TRUE,
                        shrink = TRUE) {
    if (!is.null(confounders)) {
        missing_conf <- apply(sample_data(ps)[, confounders], 1,
                              function(row) any(is.na(row)))
        ps <- prune_samples(!missing_conf, ps)
    }
    if ((!requireNamespace("IHW", quietly = TRUE)) && independent_weighting) {
        stop("independent weighting requires the IHW package!")
    }
    meta <- as(sample_data(ps), "data.frame")
    if (standardize) meta <- standardize(meta)
    if (!is.factor(meta[[variable]])) {
        stop("variable must be a factor.")
    }
    levs <- levels(meta[[variable]])
    if (length(levs) <= 2) {
        stop("variable must have more than 2 levels.")
    }
    counts <- as.matrix(taxa_count(ps, lev = tax))
    meta <- meta[rownames(counts), ]
    too_rare <- (
        colSums(counts >= min_abundance) /
        nrow(counts)) < in_samples
    too_few <- colMeans(counts) < min_abundance
    counts <- counts[, !(too_rare | too_few)]

    good <- !is.na(meta[[variable]])
    if (is.null(confounders)) {
            ref_model <- ~ 1
        } else {
            ref_model <- reformulate(confounders)
        }
    dds <- DESeqDataSetFromMatrix(t(counts[good, ]), meta[good, ],
        design = reformulate(c(confounders, variable)))
    dds <- estimateSizeFactors(dds, type = "poscount")
    dds <- DESeq(dds, test = "LRT", parallel = TRUE, quiet = TRUE,
                 fitType = "local", reduced = ref_model)
    combinations <- combn(length(levs):1, 2)
    tests <- pbapply(combinations, 2, function(co) {
        name <- paste0(variable, levs[co[1]], "_vs_", levs[co[2]])
        res <- results(dds, contrast = c(variable, levs[co[1]], levs[co[2]]))
        if (shrink) {
            res <- lfcShrink(dds,
                             contrast = c(variable, levs[co[1]], levs[co[2]]),
                             res = res, type = "normal")
        }
        res <- as.data.table(res)
        set(res, j = tax, value = colnames(counts))
        set(res, j = "variable", value = name)
        n <- min(table(meta[[variable]])[co])
        set(res, j = "n_eff", value = n)
        return(res)
    })
    tests <- rbindlist(tests)

    if (independent_weighting) {
        weights <- IHW::ihw(pvalue ~ baseMean, tests, alpha = 0.05)
        set(tests, j = "padj", value = IHW::adj_pvalues(weights))
    } else {
        set(tests, j = "padj", value = p.adjust(tests$pvalue, method = "BH"))
    }

    return(tests)
}
Gibbons-Lab/mbtools documentation built on Jan. 28, 2024, 11:08 a.m.