#' Perform differential analysis using ALDEx2
#'
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param group character, the variable to set the group
#' @param taxa_rank character to specify taxonomic rank to perform
#' differential analysis on. Should be one of
#' `phyloseq::rank_names(phyloseq)`, or "all" means to summarize the taxa by
#' the top taxa ranks (`summarize_taxa(ps, level = rank_names(ps)[1])`), or
#' "none" means perform differential analysis on the original taxa
#' (`taxa_names(phyloseq)`, e.g., OTU or ASV).
#' @param transform character, the methods used to transform the microbial
#' abundance. See [`transform_abundances()`] for more details. The
#' options include:
#' * "identity", return the original data without any transformation
#' (default).
#' * "log10", the transformation is `log10(object)`, and if the data contains
#' zeros the transformation is `log10(1 + object)`.
#' * "log10p", the transformation is `log10(1 + object)`.
#' @param norm the methods used to normalize the microbial abundance data. See
#' [`normalize()`] for more details.
#' Options include:
#' * "none": do not normalize.
#' * "rarefy": random subsampling counts to the smallest library size in the
#' data set.
#' * "TSS": total sum scaling, also referred to as "relative abundance", the
#' abundances were normalized by dividing the corresponding sample library
#' size.
#' * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#' The scaling factor is then derived using a weighted trimmed mean over the
#' differences of the log-transformed gene-count fold-change between the
#' sample and the reference.
#' * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#' using the geometric mean of the gene-specific abundances over all
#' samples. The scaling factors are then calculated as the median of the
#' gene counts ratios between the samples and the reference.
#' * "CSS": cumulative sum scaling, calculates scaling factors as the
#' cumulative sum of gene abundances up to a data-derived threshold.
#' * "CLR": centered log-ratio normalization.
#' * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param norm_para arguments passed to specific normalization methods
#' @param method test method, options include: "t.test" and "wilcox.test"
#' for two groups comparison, "kruskal" and "glm_anova" for multiple groups
#' comparison.
#' @param p_adjust method for multiple test correction, default `none`,
#' for more details see [stats::p.adjust].
#' @param pvalue_cutoff cutoff of p value, default 0.05.
#' @param mc_samples integer, the number of Monte Carlo samples to use for
#' underlying distributions estimation, 128 is usually sufficient.
#' @param denom character string, specifiy which features used to as the
#' denominator for the geometric mean calculation. Options are:
#' * "all", with all features.
#' * "iqlr", accounts for data with systematic variation and centers the
#' features on the set features that have variance that is between the lower
#' and upper quartile of variance.
#' * "zero", a more extreme case where there are many non-zero features in
#' one condition but many zeros in another. In this case the geometric mean
#' of each group is calculated using the set of per-group non-zero features.
#' * "lvha", with house keeping features.
#' @param paired logical, whether to perform paired tests, only worked for
#' method "t.test" and "wilcox.test".
#' @export
#' @references Fernandes, A.D., Reid, J.N., Macklaim, J.M. et al. Unifying the
#' analysis of high-throughput sequencing datasets: characterizing RNA-seq,
#' 16S rRNA gene sequencing and selective growth experiments by compositional
#' data analysis. Microbiome 2, 15 (2014).
#' @seealso [`ALDEx2::aldex()`]
#' @return a [`microbiomeMarker-class`] object.
#' @examples
#' data(enterotypes_arumugam)
#' ps <- phyloseq::subset_samples(
#' enterotypes_arumugam,
#' Enterotype %in% c("Enterotype 3", "Enterotype 2")
#' )
#' run_aldex(ps, group = "Enterotype")
run_aldex <- function(ps,
group,
taxa_rank = "all",
transform = c("identity", "log10", "log10p"),
norm = "none",
norm_para = list(),
method = c(
"t.test", "wilcox.test",
"kruskal", "glm_anova"
),
p_adjust = c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
),
pvalue_cutoff = 0.05,
mc_samples = 128,
denom = c("all", "iqlr", "zero", "lvha"),
paired = FALSE) {
stopifnot(inherits(ps, "phyloseq"))
ps <- check_rank_names(ps) %>%
check_taxa_rank( taxa_rank)
denom <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
p_adjust <- match.arg(
p_adjust,
c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
)
)
# trans method as argument test in ALDEx2::aldex
method <- match.arg(
method,
c("t.test", "wilcox.test", "kruskal", "glm_anova")
)
if (method %in% c("t.test", "wilcox.test")) {
test_method <- "t"
} else {
test_method <- "kw"
}
# check whether group is valid, write a function
sample_meta <- sample_data(ps)
meta_nms <- names(sample_meta)
if (!group %in% meta_nms) {
stop(
group, " are not contained in the `sample_data` of `ps`",
call. = FALSE
)
}
transform <- match.arg(transform, c("identity", "log10", "log10p"))
# preprocess phyloseq object
ps <- preprocess_ps(ps)
ps <- transform_abundances(ps, transform = transform)
# normalize the data
norm_para <- c(norm_para, method = norm, object = list(ps))
ps_normed <- do.call(normalize, norm_para)
ps_summarized <- pre_ps_taxa_rank(ps_normed, taxa_rank)
groups <- sample_meta[[group]]
abd <- abundances(ps_summarized, norm = TRUE)
test_fun <- ifelse(test_method == "t", aldex_t, aldex_kw)
test_para <- list(
reads = abd,
conditions = groups,
method = method,
mc_samples = mc_samples,
denom = denom,
p_adjust = p_adjust
)
if (test_method == "t") {
test_para <- c(test_para, paired = paired)
}
test_out <- tryCatch(
do.call(test_fun, test_para),
error = function(e) e
)
# check whether counts are integers
if (inherits(test_out, "error") &&
conditionMessage(test_out) == "not all reads are integers") {
warning(
"Not all reads are integers, the reads are ceiled to integers.\n",
" Raw reads is recommended from the ALDEx2 paper.",
call. = FALSE
)
test_para$reads <- ceiling(abd)
test_out <- do.call(test_fun, test_para)
}
sig_feature <- dplyr::filter(test_out, .data$padj <= pvalue_cutoff)
marker <- return_marker(sig_feature, test_out)
feature <- test_out$feature
tax <- matrix(feature) %>%
tax_table()
row.names(tax) <- row.names(abd)
mm <- microbiomeMarker(
marker_table = marker,
norm_method = get_norm_method(norm),
diff_method = paste0("ALDEx2_", method),
sam_data = sample_data(ps_summarized),
otu_table = otu_table(abd, taxa_are_rows = TRUE),
tax_table = tax
)
mm
}
# aldex t test, wilcox test
# In the original version of ALDEx2, each p value is corrected using the
# Benjamini-Hochberg method. Here, we add a new argument `p_adjust` to
# make aldex support for other correction methods.
aldex_t <- function(reads,
conditions,
mc_samples,
method = c("t.test", "wilcox.test"),
denom = c("all", "iqlr", "zero", "lvha"),
p_adjust = c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
),
paired = FALSE) {
method <- match.arg(method, c("t.test", "wilcox.test"))
demon <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
p_adjust <- match.arg(
p_adjust,
c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
)
)
conditions <- as.factor(conditions)
if (!inherits(reads, "aldex.clr")) {
reads_clr <- ALDEx2::aldex.clr(
reads = reads,
conds = as.character(conditions),
mc.samples = mc_samples,
denom = denom
)
feature <- row.names(reads)
} else {
reads_clr <- reads
feature <- row.names(reads@reads)
}
mc_instance <- reads_clr@analysisData
mc_instance_ldf <- convert_instance(mc_instance, mc_samples)
if (method == "t.test") {
pvalue <- purrr::map_dfc(
mc_instance_ldf,
t_fast,
group = conditions, paired = paired)
} else {
pvalue <- purrr::map_dfc(
mc_instance_ldf,
wilcox_fast,
group = conditions, paired = paired
)
}
padj_greater <- purrr::map_dfc(
pvalue,
\(x) p.adjust (2 * x, method = p_adjust)
)
padj_less <- purrr::map_dfc(
pvalue,
\(x) p.adjust (2 * (1 - x), method = p_adjust)
)
# making this into a two-sided test
pvalue_greater <-2 * pvalue
pvalue_less <- 2 * (1 - pvalue)
# making sure the max p-value is 1
pvalue_greater <- apply(pvalue_greater, c(1, 2), \(x) min(x, 1))
pvalue_less <- apply(pvalue_less, c(1, 2), \(x) min(x, 1))
# get the expected value of p value and adjusted p value
e_pvalue <- cbind(rowMeans(pvalue_greater), rowMeans(pvalue_less)) |>
apply(1, min)
e_padj <- cbind(rowMeans(padj_greater), rowMeans(padj_less)) |>
apply(1, min)
# effect size
ef <- ALDEx2::aldex.effect(
reads_clr,
include.sample.summary = FALSE,
verbose = FALSE
)
# enrich group
cds <- gsub("rab.win.", "", names(ef)[2:3])
ef <- ef$effect
enrich_group <- ifelse(ef > 0, cds[1], cds[2])
res <- data.frame(
feature = feature,
enrich_group = enrich_group,
ef_aldex = ef,
pvalue = e_pvalue,
padj = e_padj
)
res
}
# aldex kruskal-wallis test and glm anova statistics
#' @importFrom stats kruskal.test glm drop1
aldex_kw <- function(reads,
conditions,
method = c("kruskal", "glm_anova"),
mc_samples = 128,
denom = c("all", "iqlr", "zero", "lvha"),
p_adjust = c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
)) {
method <- match.arg(method, c("kruskal", "glm_anova"))
demon <- match.arg(denom, c("all", "iqlr", "zero", "lvha"))
p_adjust <- match.arg(
p_adjust,
c(
"none", "fdr", "bonferroni", "holm",
"hochberg", "hommel", "BH", "BY"
)
)
conditions <- as.factor(conditions)
if (!inherits(reads, "aldex.clr")) {
reads_clr <- ALDEx2::aldex.clr(
reads = reads,
conds = conditions,
mc.samples = mc_samples,
denom = denom
)
feature <- row.names(reads)
} else {
reads_clr <- reads
feature <- row.names(reads@reads)
}
mc_instance <- reads_clr@analysisData
# convert mc_instance to a list of data frame, each element represents a mc
# sample for all samples.
mc_instance_ldf <- convert_instance(mc_instance, mc_samples)
if (method == "kruskal") {
pvalue <- purrr::map_dfc(
mc_instance_ldf,
function(x) {
apply(
x, 1,
function(y) {
stats::kruskal.test(y, g = factor(conditions))[[3]]
}
)
}
)
} else {
pvalue <- purrr::map_dfc(
mc_instance_ldf,
function(x) {
apply(
x, 1,
function(y) {
stats::glm(as.numeric(y) ~ factor(conditions)) %>%
stats::drop1(test = "Chis") %>%
purrr::pluck(5, 2)
}
)
}
)
}
padj <- purrr::map_dfc(pvalue, p.adjust, method = p_adjust)
e_pvalue <- rowMeans(pvalue)
e_padj <- rowMeans(padj)
# f statistic
ef_F_statistic <- purrr::map_dfc(
mc_instance_ldf,
function(x) {
apply(
x, 1,
function(y) {
summary(aov(y ~ factor(conditions)))[[1]]$`F value`[1]
}
)
}
) %>%
rowMeans()
enrich_group <- get_aldex_kwglm_enrich_group(mc_instance_ldf, conditions)
res <- data.frame(
feature = feature,
enrich_group = enrich_group,
ef_F_statistic = ef_F_statistic,
pvalue = e_pvalue,
padj = e_padj
)
res
}
# enriched group for kw and glm anova
get_aldex_kwglm_enrich_group <- function(mc_instance_ldf, conditions) {
instance_split <- purrr::map(
mc_instance_ldf,
~ split(data.frame(t(.x)), conditions)
)
instance_mean <- purrr::map(
instance_split,
~ purrr::map_dfc(.x, colMeans)
)
instance_mean <- Reduce("+", instance_mean)
max_idx <- apply(instance_mean, 1, which.max)
enrich_group <- names(instance_mean)[max_idx]
enrich_group
}
# Each element of mc instances of a clr object represents all instances of a
# sample, this function convert mc instances to list data frames where each
# element represents a mc instance for all samples
convert_instance <- function(mc_instance, mc_samples) {
mc_instance_ldf <- purrr::map(
seq.int(mc_samples),
function(x) {
res <- purrr::map_dfc(mc_instance, function(y) y[, x])
names(res) <- names(mc_instance)
res
}
)
mc_instance_ldf
}
# fast test function modified from ALDEx2::t.fast
#' @importFrom stats pt
t_fast <- function(x, group, paired = FALSE) {
grp1 <- group == unique(group)[1]
grp2 <- group == unique(group)[2]
n1 <- sum(grp1)
n2 <- sum(grp2)
if (paired) {
# Order pairs for the mt.teststat function
if (n1 != n2) stop("Cannot pair uneven groups.")
idx1 <- which(grp1)
idx2 <- which(grp2)
paired_order <- unlist(
lapply(
seq_along(idx1),
function(i) c(idx1[i], idx2[i])
)
)
t <- multtest::mt.teststat(
x[, paired_order],
as.numeric(grp1)[paired_order],
test = "pairt",
nonpara = "n"
)
df <- length(idx1) - 1
} else {
t <- multtest::mt.teststat(x,
as.numeric(grp1),
test = "t",
nonpara = "n"
)
s1 <- apply(x[, grp1], 1, sd)
s2 <- apply(x[, grp2], 1, sd)
df <- ((s1^2 / n1 + s2^2 / n2)^2) / ((s1^2 / n1)^2 / (n1 - 1) +
(s2^2 / n2)^2 / (n2 - 1))
}
return(pt(t, df = df, lower.tail = FALSE))
}
# wilcox.fast function replaces wilcox.test
# * runs much faster
# * uses exact distribution for ties!
# * this differs from ?wilcox.test
# * optional paired test
# * equivalent to wilcox.test(..., correct = FALSE)
# * uses multtest
#' @importFrom stats psignrank pnorm pwilcox wilcox.test
wilcox_fast <- function(x, group, paired = FALSE) {
stopifnot(ncol(x) == length(group))
grp1 <- group == unique(group)[1]
grp2 <- group == unique(group)[2]
n1 <- sum(grp1)
n2 <- sum(grp2)
# Check for ties in i-th Monte-Carlo instance
xt <- t(x)
if (paired) {
any_ties <- any(
apply(xt[grp1, ] - xt[grp2, ], 2, function(y) length(unique(y))) !=
ncol(x) / 2
)
} else {
any_ties <- any(
apply(xt, 2, function(y) length(unique(y))) != ncol(x)
)
}
# Ties trigger slower, safer wilcox.test function
if (any_ties) {
res <- apply(
xt, 2,
function(i) {
wilcox.test(
i[grp1], i[grp2],
paired = paired,
alternative = "greater",
correct = FALSE
)$p.value
}
)
return(res)
}
if (paired) {
if (n1 != n2) stop("Cannot pair uneven groups.")
x_diff <- xt[grp1, ] - xt[grp2, ]
v <- apply(x_diff, 2, function(y) sum(rank(abs(y))[y > 0]))
topscore <- (n1 * (n1 + 1)) / 2
if (sum(grp1) < 50) {
# as per wilcox test, use exact -- ASSUMES NO TIES!!
res <- psignrank(v - 1, n1, lower.tail = FALSE)
} else { # Use normal approximation
v_std <- (v - topscore / 2) /
sqrt(n1 * (n1 + 1) * (2 * n1 + 1) / 24)
res <- pnorm(v_std, lower.tail = FALSE)
}
} else {
w_std <- multtest::mt.teststat(x, as.numeric(grp1), test = "wilcoxon")
if (sum(grp1) < 50 && sum(grp2) < 50) {
# as per wilcox test, use exact -- ASSUMES NO TIES!!
w_var <- sqrt((n1 * n2) * (n1 + n2 + 1) / 12)
w <- w_std * w_var + (n1 * n2) / 2
res <- pwilcox(w - 1, n1, n2, lower.tail = FALSE)
} else { # Use normal approximation
res <- pnorm(w_std, lower.tail = FALSE)
}
}
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.