In cb-42/cbmbtools: R Functions for Facilitating Microbiome Analyses

geometry: margin = 1.25cm

This report was built with R version r getRversion().

knitr::opts_chunk$set(echo = FALSE, # whether code should be displayed in document
                      cache = TRUE, # whether to cache results of code chunks for future knits
                      fig.align = 'center', # alignment options: 'left', 'right', 'center', 'default'
                      error = TRUE # display error messages in document (TRUE) or stop render on error (FALSE)
                      )

# space for notes/to do list/goals that shouldn't be included in resulting report

# This section contains a list of typical libraries used and others (commented out) that could be useful in other situations

# I. universal data wrangling and visualization
library(tidyverse)      # ggplot, dplyr, purrr, forcats, stringr... and other packages. See https://tidyverse.tidyverse.org/

# II. formatting, plotting
library(scales)         # used in formatting relative abundance % labels; unit_format()
library(knitr)          # necessary for kable, for displaying nicer tables in html/pdf output
library(RColorBrewer)   # for generating custom palettes
# library(grid)         
# library(gridExtra)      # grid.arrange(); plot & table arranging
# library(gganimate)      # useful for comparing data across time points or changes in a single categorical variable

# III. machine learning & analysis packages
library(vegan)          # decostand(), metaMDS(), scores(), rda(), ordispider(), adonis()
library(MASS)           # isoMDS(); masks dplyr::select()
library(mvabund)
# library(randomForest)   # randomForest()
# library(caret)          # vast support for all flavors of predictive models

# IV. functions for automating and streamlining workflow
# if (!require(devtools)) install.packages("devtools")
# devtools::install_github("https://github.com/cb-42/cbmbtools")
# devtools::update_packages("cbmbtools")
library(cbmbtools)
library(testthat) # expect_identical() and related functions are useful for ensuring custom functions return expected output

# Space for keeping customized functions which may only be useful for this analysis, such as plotting or data wrangling with slight variations.

I. Data Preparation

# For ease of use, place the processed 16S data files of interest into the same directory as your current RStudio project

# I. Bacteria_unclassified removal (can be skipped if desired); Read in cons.taxonomy
# ___ Bacteria_unclassified of ___ total untrimmed OTUs (trim = .05%) 
otu_taxonomy <- load_tax("data.cons.taxonomy")
rem_bact <- as.character(otu_taxonomy[otu_taxonomy$Genus == "Bacteria_unclassified", "OTU"])

# Basic processing with selection of OTUs > 0.1 % of the population
# II. Read in opti_mcc.shared file
otu_good <- load_shared(shared = "data.shared")
# otu_good <- load_shared(shared = "data.shared", otu_vec = rem_bact, thresh = 0.05)

# Trim _S from end of rownames(otu_good); note that it may be useful to keep these well identifiers in some analyses
rownames(otu_good) <- str_remove(rownames(otu_good), "_S\\d+")

# It's a good idea to inspect the data regularly with tools such as dim(), str(), head(), tail(), summary()... This will help in catching errors earlier
dim(otu_good) # dim(otu_good)[1] observations (samples) x dim(otu_good)[2] features (OTUs above .1% population threshold)

# III. Subset otu_taxonomy to retain good/trimmed OTUs only
otu_good_taxonomy <- droplevels(otu_taxonomy[colnames(otu_good), ])

# otu_good_bact_un <- load_shared(shared = "data.shared", thresh = 0.05) # shared w/o bact_un removal, for comparison

# Load in metadata/environmental data, or create it

# Examples of metadata creation
otu_df <- data.frame(decostand(otu_good, "total") * 100, Sample_name = row.names(otu_good), stringsAsFactors = FALSE) %>%
  # Create Experiment from first string of characters prior to first "_"
  mutate(Experiment = factor(case_when(str_detect(Sample_name, "Exp\\d") ~ str_extract(Sample_name, "Exp\\d"),
                                       str_detect(Sample_name, "^AE\\d$") ~ "AE",
                                       str_detect(Sample_name, "^empty_|^Blank") ~ "Empty",
                                       str_detect(Sample_name, "^IsoCtrl_") ~ "IsoCtrl",
                                       str_detect(Sample_name, "^Water|^WATER|^WaterNeg") ~ "WaterNeg",
                                       str_detect(Sample_name, "^Mock|^ZymoMock") ~ "Mock"),
                             levels = c("Exp1", "Exp2", "AE", "Empty", "IsoCtrl", "WaterNeg", "Mock"))
         ) %>%
  # Create Day; account for empty_D1 and empty_D2 which refer to wells and not days
  mutate(Day = factor(case_when(str_detect(Sample_name, "_D\\d+_") & Experiment != "Empty" ~ str_extract(Sample_name, "D\\d+"))
                      ) # sum(is.na(otu_df$Day)) returns __ (as expected, controls have NA in this column)
         ) %>%
  # Create Tissue
  mutate(Tissue = factor(case_when(str_detect(Sample_name, "Cecum") ~ "Cecum",
                                  str_detect(Sample_name, "Colon") ~ "Colon",
                                  str_detect(Sample_name, "Feces") ~ "Feces",
                                  str_detect(Sample_name, "SID") ~ "SID",
                                  str_detect(Sample_name, "SII") ~ "SII",
                                  str_detect(Sample_name, "SIJ") ~ "SIJ",
                                  str_detect(Sample_name, "Stomach") ~ "Stomach"),
                        levels = c("Stomach", "SID", "SIJ", "SII", "Cecum", "Colon", "Feces")
                        ) # sum(is.na(otu_df$Tissue)) returns __ (as expected, controls have NA in this column)
  ) %>%
  # Create Mouse
  mutate(Mouse = as.numeric(str_extract(Sample_name, "(?<=M)\\d+")
              ) # sum(is.na(otu_df$Mouse)) returns __ (as expected, the controls have NA in this column)
         ) %>%
  # Alternative Mouse creation:
  # mutate(Mouse = fct_inseq(ifelse(!is.na(Tissue), str_extract(Sample_name, "\\d+$"), NA))
  # Create Sex
  mutate(Sex = factor(str_extract(Mouse, "[FM]$"))
         ) %>%
  # Create Cage/Group
  mutate(Cage = case_when((Group == "WT") & (Sex == "F") ~ 1,
                          (Group == "WT") & (Sex == "M") ~ 2,
                          (Group == "Y709F") & (Sex == "F") ~ 3,
                          (Group == "Y709F") & (Sex == "M") ~ 4
         )) %>%
  # Create Run
  mutate(Run = factor(str_extract(Sample_name, "(?<=N)\\d$"))
         ) %>%
  # metadata columns at front, followed by all of the count data
  dplyr::select(Sample_name:Run, everything())
  # note that the data is in wide format

# Optionally, subset to retain only samples of interest to this analysis
exp1_fec_df <- otu_df %>%
  dplyr::filter(Experiment %in% c("Exp1"), Tissue == "Feces") %>%
  droplevels() # discard unused factor levels

phy_df <- agg_otus(otu_df) # aggregate by Phylum, add metadata

phy_df <- phy_df %>%
  dplyr::select(Sample_name, colnames(phy_df[,2:19])[colSums(phy_df[,2:19]) > 0]) %>% # Retain only the phyla that were present in any of these samples
  inner_join(dplyr::select(otu_df, -contains("Otu")), by = "Sample_name") %>%
  dplyr::select(Sample_name, Experiment:Mouse, everything())

# 9 color palette - for 9 unique values in a categorical variable total; for PCA ordinations
set1cols <- brewer.pal(9,"Set1")

# 7 color viridis palette
# vir7 <- viridis::viridis(n=7)

# Determine abundance ranking of phyla within samples
phy_rank <- phy_df %>% 
  gather(key = "Phylum", value = "Perc", -c(dplyr::select(otu_df, -contains("Otu")) %>% colnames())) %>% 
  group_by(Phylum) %>%
  summarize(Mean_Perc = mean(Perc)) %>%
  arrange(desc(Mean_Perc)) %>%
  mutate(Phylum = fct_inorder(Phylum)) # Create ordered factor levels to aid in creating visualizations.

# One approach for a custom palette - this has limitations if more than around 12-15 Phyla levels are necessary
phy_pal <- scales::hue_pal(c = 125, l = 70, h.start=150)(length(levels(phy_rank$Phylum)))

After the data was processed using mothur(v.1.42.3), it was put through standardization methods for community ecology using the vegan package (version r packageVersion("vegan")) in R. Of total OTUs, were denoted as Bacteria_unclassified and were removed. Next, any OTU count that made up less than .05% of the total OTU count for any given sample was set to 0. Subsequently, any OTUs which had a value of 0 for sum of counts across all samples were removed from the data, leaving r dim(otu_good)[2] OTUs for further analysis. There are r dim(otu_good)[1] samples when including all controls and experiments.

# export normalized counts of samples and metadata
# export_norm_tax(norm_df = dplyr::filter(otu_df, Experiment == "Exp1", !is.na(Tissue)), digits = 2)

# Use knitr::kable to generate a table summarizing key information about the experiment, e.g. number of samples by treatments
otu_df %>%
  dplyr::filter(Experiment == "Exp1") %>%
  dplyr::select(Sample_name:Mouse) %>%
  # arrange(Housing, Cage, Sample_name) %>%
  knitr::kable(caption = "Exp1 Samples: Current metadata", align = "l")

\newpage

II. Analysis of controls

2.1 Mock

[Descriptive text here]

# example code
mock_gg <- dplyr::filter(otu_df, Experiment == "Zymo_Mock") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)

plot_ra(df = mock_gg, title = "Cross-Comparison of Mock Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.2 Water

[Descriptive text here]

# example code
water_gg <- dplyr::filter(otu_df, Experiment == "Water_Neg") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)

plot_ra(df = water_gg, title = "Cross-Comparison of Negative Water Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.3 AE

[Descriptive text here]

ae_gg <- filter(otu_df, Experiment == "AE") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)

plot_ra(df = ae_gg, title = "Cross-Comparison of AE Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.4 Empty wells

[Descriptive text here]

empty_gg <- filter(otu_df, Experiment == "Empty") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)

plot_ra(df = empty_gg, title = "Cross-Comparison of Empty Wells", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

III. Relative Abundance

3.1 RA by Sample

[Descriptive text here]

otu_df %>%
  dplyr::filter(Mouse == 1, Tissue == "Feces") %>%
  dropevels() %>%
  tsg_ind(tax_level = "Phylum") %>% # otu_vec could be used to order these within sample OTUs by the mean of a larger/different grouping
  droplevels()

plot_ra(yvar = "Percentage", title = paste0("Exp1, Tissue: ", pull(., Tissue), ", Mouse: ", pull(., Mouse)), fill = "Phylum", phy_vec = phy_rank$Phylum, fill_pal = phy_pal) +
  labs(subtitle = paste0("OTUs Ordered by: ___"))

\newpage

3.2 RA for Individual Treatment Group (unfaceted)

# aggregate data 
kin_agg <- dplyr::filter(samp_df, ExptGrp == "TG1") %>% 
  taxon_sort_gather() %>%
  droplevels()

# individual observations - for scatterplot
# kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>%
#   tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels()

plot_ra(df=kin_agg, title="Relative Abundance of WT18 samples", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)

\newpage

3.3 RA for all ExptGrps, Ordered by 1 ExptGrp (faceted)

# aggregate data
kin_agg <- samp_df %>% # Add filtering if needed
  taxon_sort_gather(facet_var = "ExptGrp", ord_val = "ExptGrp1") %>% 
  droplevels()

# individual observations - update as necessary
# kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>%
#   tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels()

plot_ra(df=kin_agg, title="Relative Abundance of All ExptGrps, ordered by ExptGrp1", facet_var = "ExptGrp", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)

\newpage

3.4 RA by ExptGrp, Aggregated by Phyla (unfaceted)

phy_filt <- phy_df %>%
  dplyr::filter(ExptGrp == "ExptGrp1") %>%
  droplevels()

taxon_order <- names(sort(colMeans(dplyr::select(phy_filt, -Sample_name:-Cage)), decreasing = TRUE)) # workaround for rank_tax() due to removing Phyla earlier in the workflow

phy_agg <- phy_filt %>% 
  dplyr::select(Sample_name:Cage, all_of(taxon_order)) %>%
  gather(key = Phylum, value = Percentage, -Sample_name:-Cage, factor_key = TRUE) %>% # update to pivot_longer
  group_by(Phylum) %>%
  dplyr::summarize(Mean_Perc = mean(Percentage), SEM = sem(Percentage)) %>%
  dplyr::filter(Mean_Perc > 0 ) %>%
  arrange(desc(Mean_Perc)) %>% 
  droplevels() # remove Phylum with 0 values

plot_ra(df=phy_agg,  title="ExptGrp1 samples, Aggregated by Phylum", fill="Phylum", taxon="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal)

\newpage

IV. PCA

[Descriptive text here]

# example: filter out controls
otu_good2 <- dplyr::filter(otu_df, !(Experiment %in% c("AE", "Empty", "IsoCtrl", "Water_Neg", "Zymo_Mock"))) %>%
  droplevels() # remove levels that are no longer present due to filtering

# PCA of all data (non-control)
otu_good2_hel <- decostand(dplyr::select(otu_good2, contains("Otu")), "hellinger")
otu_good2_pca <- rda(otu_good2_hel)

# (summary(otu_good2_pca)) # 0.3006 0.10995 0.06362 proportion explained by first 3 axes

4.1 Centroids

# set.seed(3881) # permanova
# adonis(otu_good2_hel~samp_df$Tissue, method="euclidean", permutations = 10000) # 9.999e-05 ***

plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)",
     main = "PCA of Exp1 Samples by Tissue", display = "sites")
points(otu_good2_pca, pch = 19, col = set1cols[as.numeric(samp_df$Tissue)])
vegan::ordispider(otu_good2_pca, samp_df$Tissue, label = TRUE) # Adds centroids/spiders
legend("topright", levels(as.factor(samp_df$Tissue)), pch = 19, col = set1cols, title = "Tissue")
legend("bottomright", legend = "Difference between Tissue Types:\n permanova: p < 0.0001")

\newpage

4.2 Individual Tissue ordination

This Ordination shows a subset of the points, belonging to a single tissue

plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)",
     main = "PCA of All Exp1 Samples: only Stomach shown", display = "sites")
points(otu_good2_pca, pch = 19, col = set1cols[1], select = samp_df$Tissue == "Stomach")
legend("topright", levels(as.factor(samp_df$Tissue))[1], pch = 19, col = set1cols[1])

\newpage

4.3 PCA of ___ Samples: Symbols by ExptGrp, color by Sex (no centroids)

# set.seed(2315) # permanova
# adonis(samp_all_hel~samp_df$Group, method="euclidean", permutations = 10000) # 9.999e-05 ***

plot(all_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (49.3% Explained)", ylab = "PC2 (10.9% Explained)",
     main = "PCA of ___ Samples by Group + Housing, color by Sex", display = "sites")
points(all_pca, pch = c(15, 0, 16, 1)[as.numeric(samp_df$ExptGrp)], col = set1cols[as.numeric(samp_df$Sex)]) # col = set1cols[as.numeric(samp_df$Housing)]
# vegan::ordispider(sl_all_pca, sl_samp_df$ExptGrp, label = TRUE)
legend("bottomright", legend = paste(rep(levels(samp_df$ExptGrp),times=2),rep(levels(samp_df$Sex),each=4),sep=", "),col=rep(set1cols[c(2,1)], each=4),pch=rep(c(15,0,16,1),times=2),bty="n",ncol=2,cex=0.7,pt.cex=0.7)
legend("topright", legend = "Difference between Group + Housing:\n permanova: p < 0.0001")

\newpage

4.4 Lines connecting samples belonging to each Mouse

# Create dataframe with samplenames and PC coords
pca_df <- data.frame(samp_df[,1:8], summary(samp_all_pca)$sites, row.names = NULL)

coords_df <- pca_df[, 1:10] %>%
  dplyr::select(Sample_name, Mouse, Housing, PC1, PC2) %>%
  dplyr::filter(Housing == "18C") %>%
  left_join(dplyr::filter(pca_df[, 1:10], Housing == "30C"), by = "Mouse")

ggplot(pca_df, aes(x = PC1, y = PC2, shape = ExptGrp)) + 
  geom_point(size = 2.5) + 
  geom_segment(data = coords_df, aes(x = PC1.x, xend = PC1.y, y = PC2.x, yend = PC2.y)) +
  theme_bw() +
  scale_shape_manual(values = c(15, 0, 16, 1)) +
  labs(title = "PCA of ___ Samples: Each pair of Mouse samples connected by lines", subtitle = "Difference between Group + Housing: permanova: p < 0.0001", x = "PC1 (49.3% Explained)", y = "PC2 (10.9% Explained)")

cb-42/cbmbtools documentation built on Jan. 9, 2021, 1:38 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cb-42/cbmbtools
R Functions for Facilitating Microbiome Analyses

In cb-42/cbmbtools: R Functions for Facilitating Microbiome Analyses

geometry: margin = 1.25cm

I. Data Preparation

II. Analysis of controls

2.1 Mock

2.2 Water

2.3 AE

2.4 Empty wells

III. Relative Abundance

3.1 RA by Sample

3.2 RA for Individual Treatment Group (unfaceted)

3.3 RA for all ExptGrps, Ordered by 1 ExptGrp (faceted)

3.4 RA by ExptGrp, Aggregated by Phyla (unfaceted)

IV. PCA

4.1 Centroids

4.2 Individual Tissue ordination

4.3 PCA of ___ Samples: Symbols by ExptGrp, color by Sex (no centroids)

4.4 Lines connecting samples belonging to each Mouse

R Package Documentation

Browse R Packages

We want your feedback!

cb-42/cbmbtools R Functions for Facilitating Microbiome Analyses

In cb-42/cbmbtools: R Functions for Facilitating Microbiome Analyses

geometry: margin = 1.25cm

I. Data Preparation

II. Analysis of controls

2.1 Mock

2.2 Water

2.3 AE

2.4 Empty wells

III. Relative Abundance

3.1 RA by Sample

3.2 RA for Individual Treatment Group (unfaceted)

3.3 RA for all ExptGrps, Ordered by 1 ExptGrp (faceted)

3.4 RA by ExptGrp, Aggregated by Phyla (unfaceted)

IV. PCA

4.1 Centroids

4.2 Individual Tissue ordination

4.3 PCA of ___ Samples: Symbols by ExptGrp, color by Sex (no centroids)

4.4 Lines connecting samples belonging to each Mouse

R Package Documentation

Browse R Packages

We want your feedback!

cb-42/cbmbtools
R Functions for Facilitating Microbiome Analyses