knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA, fig.width = 6.25, fig.height = 5) library(tidyverse) library(microbiome) library(magrittr) library(qwraps2) library(ANCOMBC) library(DT) options(DT.options = list( initComplete = JS("function(settings, json) {", "$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});","}")))
Analysis of Compositions of Microbiomes with Bias Correction (ANCOM-BC) [@lin2020analysis] is a methodology of differential abundance (DA) analysis for microbial absolute abundance data. ANCOM-BC estimates the unknown sampling fractions, corrects the bias induced by their differences among samples, and identifies taxa that are differentially abundant according to the covariate of interest.
For more details, please refer to the ANCOM-BC paper.
Download package.
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ANCOMBC")
Load the package.
library(ANCOMBC)
The HITChip Atlas data set [@lahti2014tipping] is available via the microbiome R package [@lahti2017tools] in phyloseq [@mcmurdie2013phyloseq] format, and via Data Dryad in tabular format. This data set comes with 130 genus-like taxonomic groups across 1006 western adults with no reported health complications. Some subjects have also short time series.
data(atlas1006) # Subset to baseline pseq = subset_samples(atlas1006, time == 0) # Re-code the bmi group sample_data(pseq)$bmi_group = recode(sample_data(pseq)$bmi_group, underweight = "lean", lean = "lean", overweight = "overweight", obese = "obese", severeobese = "obese", morbidobese = "obese") # Re-code the nationality group sample_data(pseq)$nation = recode(sample_data(pseq)$nationality, Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", CentralEurope = "CE", EasternEurope = "EE") # Aggregate to phylum level phylum_data = aggregate_taxa(pseq, "Phylum")
options(qwraps2_markup = "markdown") summary_template = list("Age" = list("min" = ~ min(age, na.rm = TRUE), "max" = ~ max(age, na.rm = TRUE), "mean (sd)" = ~ mean_sd(age, na_rm = TRUE, show_n = "never")), "Gender" = list("F" = ~ n_perc0(sex == "female", na_rm = TRUE), "M" = ~ n_perc0(sex == "male", na_rm = TRUE), "NA" = ~ n_perc0(is.na(sex))), "Nationality" = list("Central Europe" = ~ n_perc0(nation == "CE", na_rm = TRUE), "Eastern Europe" = ~ n_perc0(nation == "EE", na_rm = TRUE), "Northern Europe" = ~ n_perc0(nation == "NE", na_rm = TRUE), "Southern Europe" = ~ n_perc0(nation == "SE", na_rm = TRUE), "US" = ~ n_perc0(nation == "US", na_rm = TRUE), "NA" = ~ n_perc0(is.na(nation))), "BMI" = list("Lean" = ~ n_perc0(bmi_group == "lean", na_rm = TRUE), "Overweight" = ~ n_perc0(bmi_group == "overweight", na_rm = TRUE), "Obese" = ~ n_perc0(bmi_group == "obese", na_rm = TRUE), "NA" = ~ n_perc0(is.na(bmi_group))) ) data_summary = summary_table(meta(pseq), summary_template) data_summary
The number of samples: r nsamples(phylum_data)
.
The number of phyla: r ntaxa(phylum_data)
.
out = ancombc(phyloseq = phylum_data, formula = "age + nation + bmi_group", p_adj_method = "holm", zero_cut = 0.90, lib_cut = 1000, group = "nation", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE) res = out$res res_global = out$res_global
Result from the ANCOM-BC log-linear model to determine taxa that are differentially abundant according to the covariate of interest. It contains: 1) coefficients; 2) standard errors; 3) test statistics; 4) p-values; 5) adjusted p-values; 6) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE).
tab_coef = res$beta col_name = c("Age", "EE - CE", "NE - CE", "SE - CE", "US - CE", "Oerweight - Lean", "Obese - Lean") colnames(tab_coef) = col_name tab_coef %>% datatable(caption = "Coefficients from the Primary Result") %>% formatRound(col_name, digits = 2)
tab_se = res$se colnames(tab_se) = col_name tab_se %>% datatable(caption = "SEs from the Primary Result") %>% formatRound(col_name, digits = 2)
tab_w = res$W colnames(tab_w) = col_name tab_w %>% datatable(caption = "Test Statistics from the Primary Result") %>% formatRound(col_name, digits = 2)
tab_p = res$p_val colnames(tab_p) = col_name tab_p %>% datatable(caption = "P-values from the Primary Result") %>% formatRound(col_name, digits = 2)
tab_q = res$q colnames(tab_q) = col_name tab_q %>% datatable(caption = "Adjusted p-values from the Primary Result") %>% formatRound(col_name, digits = 2)
tab_diff = res$diff_abn colnames(tab_diff) = col_name tab_diff %>% datatable(caption = "Differentially Abundant Taxa from the Primary Result")
Step 1: estimate sample-specific sampling fractions (log scale). Note that for each sample, if it contains missing values for any variable specified in the formula, the corresponding sampling fraction estimate for this sample will return NA since the sampling fraction is not estimable with the presence of missing values.
Step 2: adjust the log observed abundances by subtracting the estimated sampling fraction from log observed abundances of each sample.
samp_frac = out$samp_frac # Replace NA with 0 samp_frac[is.na(samp_frac)] = 0 # Add pesudo-count (1) to avoid taking the log of 0 log_obs_abn = log(abundances(phylum_data) + 1) # Adjust the log observed abundances log_obs_abn_adj = t(t(log_obs_abn) - samp_frac) # Show the first 6 samples round(log_obs_abn_adj[, 1:6], 2) %>% datatable(caption = "Bias-adjusted log observed abundances")
"Age" is a continuous variable.
df_fig1 = data.frame(res$beta * res$diff_abn, check.names = FALSE) %>% rownames_to_column("taxon_id") df_fig2 = data.frame(res$se * res$diff_abn, check.names = FALSE) %>% rownames_to_column("taxon_id") colnames(df_fig2)[-1] = paste0(colnames(df_fig2)[-1], "SD") df_fig = df_fig1 %>% left_join(df_fig2, by = "taxon_id") %>% transmute(taxon_id, age, ageSD) %>% filter(age != 0) %>% arrange(desc(age)) %>% mutate(group = ifelse(age > 0, "g1", "g2")) df_fig$taxon_id = factor(df_fig$taxon_id, levels = df_fig$taxon_id) p = ggplot(data = df_fig, aes(x = taxon_id, y = age, fill = group, color = group)) + geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.4)) + geom_errorbar(aes(ymin = age - ageSD, ymax = age + ageSD), width = 0.2, position = position_dodge(0.05), color = "black") + labs(x = NULL, y = "Log fold change", title = "Waterfall Plot for the Age Effect") + theme_bw() + theme(legend.position = "none", plot.title = element_text(hjust = 0.5), panel.grid.minor.y = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1)) p
Result from the ANCOM-BC global test to determine taxa that are differentially abundant between three or more groups of multiple samples. It contains: 1) test statistics; 2) p-values; 3) adjusted p-values; 4) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE).
tab_w = res_global[, "W", drop = FALSE] colnames(tab_w) = "Nationality" tab_w %>% datatable(caption = "Test Statistics from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2)
tab_p = res_global[, "p_val", drop = FALSE] colnames(tab_p) = "Nationality" tab_p %>% datatable(caption = "P-values from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2)
tab_q = res_global[, "q_val", drop = FALSE] colnames(tab_q) = "Nationality" tab_q %>% datatable(caption = "Adjusted p-values from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2)
tab_diff = res_global[, "diff_abn", drop = FALSE] colnames(tab_diff) = "Nationality" tab_diff %>% datatable(caption = "Differentially Abundant Taxa from the Global Test Result")
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.