ANCOM-BC

knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA, 
                      fig.width = 6.25, fig.height = 5)

library(tidyverse)
library(microbiome)
library(magrittr)
library(qwraps2)
library(ANCOMBC)
library(DT)
options(DT.options = list(
  initComplete = JS("function(settings, json) {",
  "$(this.api().table().header()).css({'background-color': 
  '#000', 'color': '#fff'});","}")))

1. Introduction

Analysis of Compositions of Microbiomes with Bias Correction (ANCOM-BC) [@lin2020analysis] is a methodology of differential abundance (DA) analysis for microbial absolute abundance data. ANCOM-BC estimates the unknown sampling fractions, corrects the bias induced by their differences among samples, and identifies taxa that are differentially abundant according to the covariate of interest.

For more details, please refer to the ANCOM-BC paper.

2. Installation

Download package.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("ANCOMBC")

Load the package.

library(ANCOMBC)

3. Running ANCOMBC

3.1 Intestinal microbiota profiling data

The HITChip Atlas data set [@lahti2014tipping] is available via the microbiome R package [@lahti2017tools] in phyloseq [@mcmurdie2013phyloseq] format, and via Data Dryad in tabular format. This data set comes with 130 genus-like taxonomic groups across 1006 western adults with no reported health complications. Some subjects have also short time series.

data(atlas1006) 
# Subset to baseline
pseq = subset_samples(atlas1006, time == 0)
# Re-code the bmi group
sample_data(pseq)$bmi_group = recode(sample_data(pseq)$bmi_group,
                                     underweight = "lean",
                                     lean = "lean",
                                     overweight = "overweight",
                                     obese = "obese",
                                     severeobese = "obese",
                                     morbidobese = "obese")
# Re-code the nationality group
sample_data(pseq)$nation = recode(sample_data(pseq)$nationality,
                                  Scandinavia = "NE",
                                  UKIE = "NE",
                                  SouthEurope = "SE",
                                  CentralEurope = "CE",
                                  EasternEurope = "EE")

# Aggregate to phylum level
phylum_data = aggregate_taxa(pseq, "Phylum")

3.2 Data summary

options(qwraps2_markup = "markdown")
summary_template =
  list("Age" =
       list("min" = ~ min(age, na.rm = TRUE),
            "max" = ~ max(age, na.rm = TRUE),
            "mean (sd)" = ~ mean_sd(age, na_rm = TRUE, 
                                    show_n = "never")),
       "Gender" =
       list("F" = ~ n_perc0(sex == "female", na_rm = TRUE),
            "M" = ~ n_perc0(sex == "male", na_rm = TRUE),
            "NA" = ~ n_perc0(is.na(sex))),
       "Nationality" =
       list("Central Europe" = ~ n_perc0(nation == "CE", na_rm = TRUE),
            "Eastern Europe" = ~ n_perc0(nation == "EE", na_rm = TRUE),
            "Northern Europe" = ~ n_perc0(nation == "NE", na_rm = TRUE),
            "Southern Europe" = ~ n_perc0(nation == "SE", na_rm = TRUE),
            "US" = ~ n_perc0(nation == "US", na_rm = TRUE),
            "NA" = ~ n_perc0(is.na(nation))),
       "BMI" =
       list("Lean" = ~ n_perc0(bmi_group == "lean", na_rm = TRUE),
            "Overweight" = ~ n_perc0(bmi_group == "overweight", 
                                     na_rm = TRUE),
            "Obese" = ~ n_perc0(bmi_group == "obese", na_rm = TRUE),
            "NA" = ~ n_perc0(is.na(bmi_group)))
       )
data_summary = summary_table(meta(pseq), summary_template)
data_summary
  1. The number of samples: r nsamples(phylum_data).

  2. The number of phyla: r ntaxa(phylum_data).

3.3 Running ancombc function

out = ancombc(phyloseq = phylum_data, formula = "age + nation + bmi_group", 
              p_adj_method = "holm", zero_cut = 0.90, lib_cut = 1000, 
              group = "nation", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, 
              max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE)

res = out$res
res_global = out$res_global

3.4 ANCOMBC primary result

Result from the ANCOM-BC log-linear model to determine taxa that are differentially abundant according to the covariate of interest. It contains: 1) coefficients; 2) standard errors; 3) test statistics; 4) p-values; 5) adjusted p-values; 6) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE).

3.41 Coefficients

tab_coef = res$beta
col_name = c("Age", "EE - CE", "NE - CE", "SE - CE", "US - CE", 
             "Oerweight - Lean", "Obese - Lean")
colnames(tab_coef) = col_name
tab_coef %>% datatable(caption = "Coefficients from the Primary Result") %>%
      formatRound(col_name, digits = 2)

3.42 SEs

tab_se = res$se
colnames(tab_se) = col_name
tab_se %>% datatable(caption = "SEs from the Primary Result") %>%
      formatRound(col_name, digits = 2)

3.43 Test statistics

tab_w = res$W
colnames(tab_w) = col_name
tab_w %>% datatable(caption = "Test Statistics from the Primary Result") %>%
      formatRound(col_name, digits = 2)

3.44 P-values

tab_p = res$p_val
colnames(tab_p) = col_name
tab_p %>% datatable(caption = "P-values from the Primary Result") %>%
      formatRound(col_name, digits = 2)

3.45 Adjusted p-values

tab_q = res$q
colnames(tab_q) = col_name
tab_q %>% datatable(caption = "Adjusted p-values from the Primary Result") %>%
      formatRound(col_name, digits = 2)

3.46 Differentially abundant taxa

tab_diff = res$diff_abn
colnames(tab_diff) = col_name
tab_diff %>% 
  datatable(caption = "Differentially Abundant Taxa 
            from the Primary Result")

3.47 Bias-adjusted abundances

Step 1: estimate sample-specific sampling fractions (log scale). Note that for each sample, if it contains missing values for any variable specified in the formula, the corresponding sampling fraction estimate for this sample will return NA since the sampling fraction is not estimable with the presence of missing values.

Step 2: adjust the log observed abundances by subtracting the estimated sampling fraction from log observed abundances of each sample.

samp_frac = out$samp_frac
# Replace NA with 0
samp_frac[is.na(samp_frac)] = 0 

# Add pesudo-count (1) to avoid taking the log of 0
log_obs_abn = log(abundances(phylum_data) + 1) 
# Adjust the log observed abundances
log_obs_abn_adj = t(t(log_obs_abn) - samp_frac)
# Show the first 6 samples
round(log_obs_abn_adj[, 1:6], 2) %>% 
  datatable(caption = "Bias-adjusted log observed abundances")

3.48 Visualizations for "age"

"Age" is a continuous variable.

df_fig1 = data.frame(res$beta * res$diff_abn, check.names = FALSE) %>% 
  rownames_to_column("taxon_id")
df_fig2 = data.frame(res$se * res$diff_abn, check.names = FALSE) %>% 
  rownames_to_column("taxon_id")
colnames(df_fig2)[-1] = paste0(colnames(df_fig2)[-1], "SD")
df_fig = df_fig1 %>% left_join(df_fig2, by = "taxon_id") %>%
  transmute(taxon_id, age, ageSD) %>%
  filter(age != 0) %>% arrange(desc(age)) %>%
  mutate(group = ifelse(age > 0, "g1", "g2"))
df_fig$taxon_id = factor(df_fig$taxon_id, levels = df_fig$taxon_id)

p = ggplot(data = df_fig, 
           aes(x = taxon_id, y = age, fill = group, color = group)) + 
  geom_bar(stat = "identity", width = 0.7, 
           position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = age - ageSD, ymax = age + ageSD), width = 0.2,
                position = position_dodge(0.05), color = "black") + 
  labs(x = NULL, y = "Log fold change", 
       title = "Waterfall Plot for the Age Effect") + 
  theme_bw() + 
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1))
p

3.5 ANCOMBC global test result

Result from the ANCOM-BC global test to determine taxa that are differentially abundant between three or more groups of multiple samples. It contains: 1) test statistics; 2) p-values; 3) adjusted p-values; 4) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE).

3.51 Test statistics

tab_w = res_global[, "W", drop = FALSE]
colnames(tab_w) = "Nationality"
tab_w %>% datatable(caption = "Test Statistics 
                    from the Global Test Result") %>%
      formatRound(c("Nationality"), digits = 2)

3.52 P-values

tab_p = res_global[, "p_val", drop = FALSE]
colnames(tab_p) = "Nationality"
tab_p %>% datatable(caption = "P-values 
                    from the Global Test Result") %>%
      formatRound(c("Nationality"), digits = 2)

3.53 Adjusted p-values

tab_q = res_global[, "q_val", drop = FALSE]
colnames(tab_q) = "Nationality"
tab_q %>% datatable(caption = "Adjusted p-values 
                    from the Global Test Result") %>%
      formatRound(c("Nationality"), digits = 2)

3.54 Differentially abundant taxa

tab_diff = res_global[, "diff_abn", drop = FALSE]
colnames(tab_diff) = "Nationality"
tab_diff %>% datatable(caption = "Differentially Abundant Taxa 
                       from the Global Test Result")

Session information

sessionInfo()

References



Try the ANCOMBC package in your browser

Any scripts or data that you put into this service are public.

ANCOMBC documentation built on March 11, 2021, 2 a.m.