knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

bacphene

The goal of bacphene is to generate a table of phenotype characteristics about your samples. Input is a table of Samples with counts per taxa.

The package utilizes the official BacDive R package that accesses bacterial phenotype data from bacdive.org.

Installation

You can install bacphene from the github repo. You also need to install the BacDive R package (if you want the full data - see below).

library(devtools)
install.packages("BacDive", repos="http://R-Forge.R-project.org")
install_github(repo = "scottdaniel/bacphene")

If you want to follow along with the demo, you will also need the following package(s):

install.packages("nlme")

After this, you may register at bacdive.org here.

Recommended

Edit your $HOME/.Renviron file (you can open in R with usethis::edit_r_environ()) and add your bacdive credentials like so:

DSMZ_API_USER=your_email@something.com
DSMZ_API_PASSWORD=your_password

If you do not register, you will not be able to download the full information on strains in BacDive. Instead, what you will get is access to three data-frames: bacdive_phenotypes, bacdive_susceptibility, and bacdive_enzymes. Which look like this:

library(bacphene)
head(bacdive_phenotypes)
head(bacdive_susceptibility)
head(bacdive_enzymes)

Demo

Our test data-set is from Shen et al. 2021 and contains taxonomic counts from metagenomic shotgun alignments. See ?Shen2021 for a full description of the columns. With our test data and bacdive we can ask a few questions:

Differences in aerobic status

library(tidyverse)

my_df <- Shen2021 %>%
  left_join(bacdive_phenotypes, by = "taxon") %>%
  mutate(prop = count / read_counts) %>%
  group_by(SampleID, aerobic_status) %>%
  filter(count > 0, !is.na(aerobic_status)) %>%
  summarise(n = n(), weighted_n = sum(prop) * n(), .groups = "drop") %>%
  ungroup()

sort_order <- my_df %>%
  group_by(SampleID) %>%
  mutate(prop = weighted_n / sum(weighted_n)) %>%
  filter(aerobic_status %in% "anaerobe") %>%
  arrange(prop) %>%
  pull(SampleID)

my_df %>%
  left_join(Shen2021 %>% select(SampleID, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SampleID = fct_relevel(SampleID, sort_order)) %>%
  mutate(aerobic_status = fct_relevel(aerobic_status, "anaerobe", "aerobe")) %>%

  ggplot(aes(x = SampleID, y = weighted_n, fill = aerobic_status)) +
  geom_bar(stat = "identity", position = "fill", color = "white", size = 0.2) +
  theme_bw() +
  theme(strip.text.x = element_text(size = 8),
        strip.background = element_blank(),
        axis.text.x = element_blank()) +
  facet_wrap(~SampleType, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set1") +
  labs(y = "Proportion of bacteria type weighted by abundance", x = "", fill = "Aerobic status", caption = "Each column represents an individual sample")

What are contributing most to these counts?

Top taxa in feces

library(pander)

top_taxa_feces <- Shen2021 %>%
  left_join(bacdive_phenotypes, by = "taxon") %>%
  filter(SampleType %in% "Feces") %>%
  group_by(SampleType, aerobic_status, taxon) %>%
  summarise(total_count = sum(count), .groups = "drop") %>%
  mutate(percentage = paste(round((total_count / sum(total_count))*100), "%")) %>%
  slice_max(order_by = total_count, n = 10)

top_taxa_feces %>% pander(split.table = Inf, split.cells = 20, digits = 2)

Top taxa in rectal swab

top_taxa_rectal <- Shen2021 %>%
  left_join(bacdive_phenotypes, by = "taxon") %>%
  filter(SampleType %in% "Rectal swab") %>%
  group_by(SampleType, aerobic_status, taxon) %>%
  summarise(total_count = sum(count), .groups = "drop") %>%
  mutate(percentage = paste(round((total_count / sum(total_count))*100), "%")) %>%
  slice_max(order_by = total_count, n = 10)

top_taxa_rectal %>% pander(split.table = Inf, split.cells = 20, digits = 2)

Is the aerobe / anaerobe ratio significantly different?

totals_df <- my_df %>%
  pivot_wider(-n, names_from = "aerobic_status", values_from = "weighted_n") %>%
  mutate(log_aerobe_ratio = log(aerobe / anaerobe)) %>%
  left_join(Shen2021 %>% select(SampleID, SampleType) %>% unique(), by = "SampleID")

my_lm <- lm(log_aerobe_ratio ~ SampleType, data = totals_df)

summary(my_lm)
Graph
totals_df %>%
  ggplot(aes(x = SampleType, y = log_aerobe_ratio)) +
  geom_boxplot()

Looks like there are slightly more aerobes compared to anaerobes in the rectal swabs but not enough to be statistically significant.

Differences in gram-staining

my_df <- Shen2021 %>%
  left_join(bacdive_phenotypes, by = "taxon") %>%
  mutate(prop = count / read_counts) %>%
  group_by(SampleID, gram_stain) %>%
  filter(count > 0, !is.na(gram_stain)) %>%
  summarise(n = n(), weighted_n = sum(prop) * n(), .groups = "drop") %>%
  ungroup()

sort_order <- my_df %>%
  group_by(SampleID) %>%
  mutate(prop = weighted_n / sum(weighted_n)) %>%
  filter(gram_stain %in% "negative") %>%
  arrange(prop) %>%
  pull(SampleID)

my_df %>%
  left_join(Shen2021 %>% select(SampleID, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SampleID = fct_relevel(SampleID, sort_order)) %>%
  mutate(gram_stain = fct_relevel(gram_stain, "negative", "positive")) %>%

  ggplot(aes(x = SampleID, y = weighted_n, fill = gram_stain)) +
  geom_bar(stat = "identity", position = "fill", color = "white", size = 0.2) +
  theme_bw() +
  theme(strip.text.x = element_text(size = 8),
        strip.background = element_blank(),
        axis.text.x = element_blank()) +
  facet_wrap(~SampleType, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2") +
  labs(y = "Proportion of bacteria type weighted by abundance", x = "", fill = "Gram stain", caption = "Each column represents an individual sample")

Is the gram-stain status significantly different?

totals_df <- my_df %>%
  pivot_wider(-n, names_from = "gram_stain", values_from = "weighted_n") %>%
  mutate(log_neggramstain_ratio = log(negative / positive)) %>%
  left_join(Shen2021 %>% select(SampleID, SampleType) %>% unique(), by = "SampleID")

my_lm <- lm(log_neggramstain_ratio ~ SampleType, data = totals_df)

summary(my_lm)
Graph
totals_df %>%
  ggplot(aes(x = SampleType, y = log_neggramstain_ratio)) +
  geom_boxplot()

Differences in antibiotic resistance

Here, we will look at how antibiotic resistance might differ between the types of liver disease. Namely, alcohol related and non-alcohol related. There might be differences between sample types so that variable will be looked at as well. We will limit ourselves to the top 10 antibiotics as ranked by the number of species tested. These are:

abx_subset <- bacdive_susceptibility %>%
  count(antibiotic) %>%
  slice_max(order_by = n, n = 10) 

abx_subset %>%
  pander()
my_df <- Shen2021 %>%
  left_join(bacdive_susceptibility %>% filter(antibiotic %in% abx_subset$antibiotic, value %in% "resistant"), by = "taxon") %>%
  mutate(prop = count / read_counts) %>%
  group_by(SampleID, antibiotic) %>%
  filter(count > 0, !is.na(antibiotic)) %>%
  summarise(n = n(), weighted_n = sum(prop) * n(), .groups = "drop") %>%
  ungroup()

sort_order <- my_df %>%
  group_by(SampleID) %>%
  mutate(prop = weighted_n / sum(weighted_n)) %>%
  filter(antibiotic %in% "ampicillin") %>%
  arrange(prop) %>%
  pull(SampleID)

my_df %>%
  left_join(Shen2021 %>% select(SampleID, ETOH_etiology, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SampleID = fct_relevel(SampleID, sort_order)) %>%

  ggplot(aes(x = SampleID, y = weighted_n, fill = antibiotic)) +
  geom_bar(stat = "identity", position = "fill", color = "white", size = 0.2) +
  theme_bw() +
  theme(strip.text.x = element_text(size = 8),
        strip.background = element_blank(),
        axis.text.x = element_blank()) +
  facet_wrap(SampleType~ETOH_etiology, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Paired") +
  labs(y = "Proportion of bacteria resistant to antibiotic\nweighted by abundance", x = "", fill = "Antibiotic", caption = "Each column represents an individual sample")

Is Vancomycin restistance different?

totals_df <- my_df %>%
  filter(antibiotic %in% "vancomycin") %>%
  pivot_wider(-n, names_from = "antibiotic", values_from = "weighted_n") %>%
  left_join(Shen2021 %>% select(SampleID, ETOH_etiology, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SubjectID = str_remove(SampleID, "\\.RS|\\.F"))

my_lm <- nlme::lme(vancomycin ~ SampleType + ETOH_etiology, data = totals_df, random = ~ 1 | SubjectID)

summary(my_lm)
Graph
totals_df %>%
  ggplot(aes(x = SampleType, y = vancomycin, color = ETOH_etiology)) +
  geom_boxplot() +
  labs(y = "Number of vancomycin resistant bacteria\nweighted by relative abundance")

Differences in Urease utilization

my_df <- Shen2021 %>%
  left_join(bacdive_enzymes %>% filter(value %in% "urease"), by = "taxon") %>%
  rename(enzyme = value) %>%
  mutate(prop = count / read_counts) %>%
  group_by(SampleID, enzyme, activity) %>%
  filter(count > 0, !is.na(enzyme)) %>%
  summarise(n = n(), weighted_n = sum(prop) * n(), .groups = "drop") %>%
  ungroup()

sort_order <- my_df %>%
  group_by(SampleID) %>%
  mutate(prop = weighted_n / sum(weighted_n)) %>%
  filter(activity %in% "+") %>%
  arrange(prop) %>%
  pull(SampleID)

my_df %>%
  left_join(Shen2021 %>% select(SampleID, ETOH_etiology, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SampleID = fct_relevel(SampleID, sort_order)) %>%

  ggplot(aes(x = SampleID, y = weighted_n, fill = activity)) +
  geom_bar(stat = "identity", position = "fill", color = "white", size = 0.2) +
  theme_bw() +
  theme(strip.text.x = element_text(size = 8),
        strip.background = element_blank(),
        axis.text.x = element_blank()) +
  facet_wrap(SampleType~ETOH_etiology, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2") +
  labs(y = "Proportion of urease utilization\nweighted by abundance", x = "", fill = "Activity", caption = "Each column represents an individual sample")

Is urease utilization significantly different?

totals_df <- my_df %>%
  pivot_wider(-n, names_from = "activity", values_from = "weighted_n") %>%
  mutate(log_activity_ratio = log(`+` / `-`)) %>%
  left_join(Shen2021 %>% select(SampleID, ETOH_etiology, SampleType) %>% unique(), by = "SampleID") %>%
  mutate(SubjectID = str_remove(SampleID, "\\.RS|\\.F"))

my_lm <- nlme::lme(log_activity_ratio ~ SampleType + ETOH_etiology, data = totals_df, random = ~ 1 | SubjectID)

summary(my_lm)

Graph

totals_df %>%
  ggplot(aes(x = SampleType, y = log_activity_ratio, color = ETOH_etiology)) +
  geom_boxplot() +
  labs(y = "Ratio of positive to negative urease activity for bacteria\nweighted by abundance")

References

When using BacDive for research please consider citing the following paper:

BacDive in 2019: bacterial phenotypic data for High-throughput biodiversity analysis. Reimer, L. C., Vetcininova, A., Sardà Carbasse, J., Söhngen, C., Gleim, D., Ebeling, C., Overmann, J. Nucleic Acids Research; database issue 2019.

Sample data set from: Shen, T.-C. D. et al. (2021) ‘The Mucosally-Adherent Rectal Microbiota Contains Features Unique to Alcohol-Related Cirrhosis’, Gut microbes, 13(1), p. 1987781.

To cite this package enter citation("bacphene") in your R console.



scottdaniel/bacphene documentation built on March 25, 2023, 12:51 p.m.