geometry: margin = 1.25cm

This report was built with R version r getRversion().

knitr::opts_chunk$set(echo = FALSE, # whether code should be displayed in document
                      cache = TRUE, # whether to cache results of code chunks for future knits
                      fig.align = 'center', # alignment options: 'left', 'right', 'center', 'default'
                      error = TRUE # display error messages in document (TRUE) or stop render on error (FALSE)
                      )
# space for notes/to do list/goals that shouldn't be included in resulting report
# This section contains a list of typical libraries used and others (commented out) that could be useful in other situations

# I. universal data wrangling and visualization
library(tidyverse)      # ggplot, dplyr, purrr, forcats, stringr... and other packages. See https://tidyverse.tidyverse.org/

# II. formatting, plotting
library(scales)         # used in formatting relative abundance % labels; unit_format()
library(knitr)          # necessary for kable, for displaying nicer tables in html/pdf output
library(RColorBrewer)   # for generating custom palettes
# library(grid)         
# library(gridExtra)      # grid.arrange(); plot & table arranging
# library(gganimate)      # useful for comparing data across time points or changes in a single categorical variable

# III. machine learning & analysis packages
library(vegan)          # decostand(), metaMDS(), scores(), rda(), ordispider(), adonis()
library(MASS)           # isoMDS(); masks dplyr::select()
library(mvabund)
# library(randomForest)   # randomForest()
# library(caret)          # vast support for all flavors of predictive models

# IV. functions for automating and streamlining workflow
# if (!require(devtools)) install.packages("devtools")
# devtools::install_github("https://github.com/cb-42/cbmbtools")
# devtools::update_packages("cbmbtools")
library(cbmbtools)
library(testthat) # expect_identical() and related functions are useful for ensuring custom functions return expected output
# Space for keeping customized functions which may only be useful for this analysis, such as plotting or data wrangling with slight variations.

I. Data Preparation

# For ease of use, place the processed 16S data files of interest into the same directory as your current RStudio project

# I. Bacteria_unclassified removal (can be skipped if desired); Read in cons.taxonomy
# ___ Bacteria_unclassified of ___ total untrimmed OTUs (trim = .05%) 
otu_taxonomy <- load_tax("data.cons.taxonomy")
rem_bact <- as.character(otu_taxonomy[otu_taxonomy$Genus == "Bacteria_unclassified", "OTU"])

# Basic processing with selection of OTUs > 0.1 % of the population
# II. Read in opti_mcc.shared file
otu_good <- load_shared(shared = "data.shared")
# otu_good <- load_shared(shared = "data.shared", otu_vec = rem_bact, thresh = 0.05)

# Trim _S from end of rownames(otu_good); note that it may be useful to keep these well identifiers in some analyses
rownames(otu_good) <- str_remove(rownames(otu_good), "_S\\d+")

# It's a good idea to inspect the data regularly with tools such as dim(), str(), head(), tail(), summary()... This will help in catching errors earlier
dim(otu_good) # dim(otu_good)[1] observations (samples) x dim(otu_good)[2] features (OTUs above .1% population threshold)

# III. Subset otu_taxonomy to retain good/trimmed OTUs only
otu_good_taxonomy <- droplevels(otu_taxonomy[colnames(otu_good), ])

# otu_good_bact_un <- load_shared(shared = "data.shared", thresh = 0.05) # shared w/o bact_un removal, for comparison
# Load in metadata/environmental data, or create it

# Examples of metadata creation
otu_df <- data.frame(decostand(otu_good, "total") * 100, Sample_name = row.names(otu_good), stringsAsFactors = FALSE) %>%
  # Create Experiment from first string of characters prior to first "_"
  mutate(Experiment = factor(case_when(str_detect(Sample_name, "Exp\\d") ~ str_extract(Sample_name, "Exp\\d"),
                                       str_detect(Sample_name, "^AE\\d$") ~ "AE",
                                       str_detect(Sample_name, "^empty_|^Blank") ~ "Empty",
                                       str_detect(Sample_name, "^IsoCtrl_") ~ "IsoCtrl",
                                       str_detect(Sample_name, "^Water|^WATER|^WaterNeg") ~ "WaterNeg",
                                       str_detect(Sample_name, "^Mock|^ZymoMock") ~ "Mock"),
                             levels = c("Exp1", "Exp2", "AE", "Empty", "IsoCtrl", "WaterNeg", "Mock"))
         ) %>%
  # Create Day; account for empty_D1 and empty_D2 which refer to wells and not days
  mutate(Day = factor(case_when(str_detect(Sample_name, "_D\\d+_") & Experiment != "Empty" ~ str_extract(Sample_name, "D\\d+"))
                      ) # sum(is.na(otu_df$Day)) returns __ (as expected, controls have NA in this column)
         ) %>%
  # Create Tissue
  mutate(Tissue = factor(case_when(str_detect(Sample_name, "Cecum") ~ "Cecum",
                                  str_detect(Sample_name, "Colon") ~ "Colon",
                                  str_detect(Sample_name, "Feces") ~ "Feces",
                                  str_detect(Sample_name, "SID") ~ "SID",
                                  str_detect(Sample_name, "SII") ~ "SII",
                                  str_detect(Sample_name, "SIJ") ~ "SIJ",
                                  str_detect(Sample_name, "Stomach") ~ "Stomach"),
                        levels = c("Stomach", "SID", "SIJ", "SII", "Cecum", "Colon", "Feces")
                        ) # sum(is.na(otu_df$Tissue)) returns __ (as expected, controls have NA in this column)
  ) %>%
  # Create Mouse
  mutate(Mouse = as.numeric(str_extract(Sample_name, "(?<=M)\\d+")
              ) # sum(is.na(otu_df$Mouse)) returns __ (as expected, the controls have NA in this column)
         ) %>%
  # Alternative Mouse creation:
  # mutate(Mouse = fct_inseq(ifelse(!is.na(Tissue), str_extract(Sample_name, "\\d+$"), NA))
  # Create Sex
  mutate(Sex = factor(str_extract(Mouse, "[FM]$"))
         ) %>%
  # Create Cage/Group
  mutate(Cage = case_when((Group == "WT") & (Sex == "F") ~ 1,
                          (Group == "WT") & (Sex == "M") ~ 2,
                          (Group == "Y709F") & (Sex == "F") ~ 3,
                          (Group == "Y709F") & (Sex == "M") ~ 4
         )) %>%
  # Create Run
  mutate(Run = factor(str_extract(Sample_name, "(?<=N)\\d$"))
         ) %>%
  # metadata columns at front, followed by all of the count data
  dplyr::select(Sample_name:Run, everything())
  # note that the data is in wide format

# Optionally, subset to retain only samples of interest to this analysis
exp1_fec_df <- otu_df %>%
  dplyr::filter(Experiment %in% c("Exp1"), Tissue == "Feces") %>%
  droplevels() # discard unused factor levels
phy_df <- agg_otus(otu_df) # aggregate by Phylum, add metadata

phy_df <- phy_df %>%
  dplyr::select(Sample_name, colnames(phy_df[,2:19])[colSums(phy_df[,2:19]) > 0]) %>% # Retain only the phyla that were present in any of these samples
  inner_join(dplyr::select(otu_df, -contains("Otu")), by = "Sample_name") %>%
  dplyr::select(Sample_name, Experiment:Mouse, everything())
# 9 color palette - for 9 unique values in a categorical variable total; for PCA ordinations
set1cols <- brewer.pal(9,"Set1")

# 7 color viridis palette
# vir7 <- viridis::viridis(n=7)

# Determine abundance ranking of phyla within samples
phy_rank <- phy_df %>% 
  gather(key = "Phylum", value = "Perc", -c(dplyr::select(otu_df, -contains("Otu")) %>% colnames())) %>% 
  group_by(Phylum) %>%
  summarize(Mean_Perc = mean(Perc)) %>%
  arrange(desc(Mean_Perc)) %>%
  mutate(Phylum = fct_inorder(Phylum)) # Create ordered factor levels to aid in creating visualizations.

# One approach for a custom palette - this has limitations if more than around 12-15 Phyla levels are necessary
phy_pal <- scales::hue_pal(c = 125, l = 70, h.start=150)(length(levels(phy_rank$Phylum)))

After the data was processed using mothur(v.1.42.3), it was put through standardization methods for community ecology using the vegan package (version r packageVersion("vegan")) in R. Of total OTUs, were denoted as Bacteria_unclassified and were removed. Next, any OTU count that made up less than .05% of the total OTU count for any given sample was set to 0. Subsequently, any OTUs which had a value of 0 for sum of counts across all samples were removed from the data, leaving r dim(otu_good)[2] OTUs for further analysis. There are r dim(otu_good)[1] samples when including all controls and experiments.

# export normalized counts of samples and metadata
# export_norm_tax(norm_df = dplyr::filter(otu_df, Experiment == "Exp1", !is.na(Tissue)), digits = 2)
# Use knitr::kable to generate a table summarizing key information about the experiment, e.g. number of samples by treatments
otu_df %>%
  dplyr::filter(Experiment == "Exp1") %>%
  dplyr::select(Sample_name:Mouse) %>%
  # arrange(Housing, Cage, Sample_name) %>%
  knitr::kable(caption = "Exp1 Samples: Current metadata", align = "l")

\newpage

II. Analysis of controls

2.1 Mock

[Descriptive text here]

# example code
mock_gg <- dplyr::filter(otu_df, Experiment == "Zymo_Mock") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = mock_gg, title = "Cross-Comparison of Mock Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.2 Water

[Descriptive text here]

# example code
water_gg <- dplyr::filter(otu_df, Experiment == "Water_Neg") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = water_gg, title = "Cross-Comparison of Negative Water Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.3 AE

[Descriptive text here]

ae_gg <- filter(otu_df, Experiment == "AE") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = ae_gg, title = "Cross-Comparison of AE Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

2.4 Empty wells

[Descriptive text here]

empty_gg <- filter(otu_df, Experiment == "Empty") %>%
  tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = empty_gg, title = "Cross-Comparison of Empty Wells", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")

\newpage

III. Relative Abundance

3.1 RA by Sample

[Descriptive text here]

otu_df %>%
  dplyr::filter(Mouse == 1, Tissue == "Feces") %>%
  dropevels() %>%
  tsg_ind(tax_level = "Phylum") %>% # otu_vec could be used to order these within sample OTUs by the mean of a larger/different grouping
  droplevels()

plot_ra(yvar = "Percentage", title = paste0("Exp1, Tissue: ", pull(., Tissue), ", Mouse: ", pull(., Mouse)), fill = "Phylum", phy_vec = phy_rank$Phylum, fill_pal = phy_pal) +
  labs(subtitle = paste0("OTUs Ordered by: ___"))

\newpage

3.2 RA for Individual Treatment Group (unfaceted)

# aggregate data 
kin_agg <- dplyr::filter(samp_df, ExptGrp == "TG1") %>% 
  taxon_sort_gather() %>%
  droplevels()

# individual observations - for scatterplot
# kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>%
#   tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels()

plot_ra(df=kin_agg, title="Relative Abundance of WT18 samples", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)

\newpage

3.3 RA for all ExptGrps, Ordered by 1 ExptGrp (faceted)

# aggregate data
kin_agg <- samp_df %>% # Add filtering if needed
  taxon_sort_gather(facet_var = "ExptGrp", ord_val = "ExptGrp1") %>% 
  droplevels()

# individual observations - update as necessary
# kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>%
#   tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels()

plot_ra(df=kin_agg, title="Relative Abundance of All ExptGrps, ordered by ExptGrp1", facet_var = "ExptGrp", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)

\newpage

3.4 RA by ExptGrp, Aggregated by Phyla (unfaceted)

phy_filt <- phy_df %>%
  dplyr::filter(ExptGrp == "ExptGrp1") %>%
  droplevels()

taxon_order <- names(sort(colMeans(dplyr::select(phy_filt, -Sample_name:-Cage)), decreasing = TRUE)) # workaround for rank_tax() due to removing Phyla earlier in the workflow

phy_agg <- phy_filt %>% 
  dplyr::select(Sample_name:Cage, all_of(taxon_order)) %>%
  gather(key = Phylum, value = Percentage, -Sample_name:-Cage, factor_key = TRUE) %>% # update to pivot_longer
  group_by(Phylum) %>%
  dplyr::summarize(Mean_Perc = mean(Percentage), SEM = sem(Percentage)) %>%
  dplyr::filter(Mean_Perc > 0 ) %>%
  arrange(desc(Mean_Perc)) %>% 
  droplevels() # remove Phylum with 0 values

plot_ra(df=phy_agg,  title="ExptGrp1 samples, Aggregated by Phylum", fill="Phylum", taxon="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal)

\newpage

IV. PCA

[Descriptive text here]

# example: filter out controls
otu_good2 <- dplyr::filter(otu_df, !(Experiment %in% c("AE", "Empty", "IsoCtrl", "Water_Neg", "Zymo_Mock"))) %>%
  droplevels() # remove levels that are no longer present due to filtering

# PCA of all data (non-control)
otu_good2_hel <- decostand(dplyr::select(otu_good2, contains("Otu")), "hellinger")
otu_good2_pca <- rda(otu_good2_hel)

# (summary(otu_good2_pca)) # 0.3006 0.10995 0.06362 proportion explained by first 3 axes

4.1 Centroids

# set.seed(3881) # permanova
# adonis(otu_good2_hel~samp_df$Tissue, method="euclidean", permutations = 10000) # 9.999e-05 ***

plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)",
     main = "PCA of Exp1 Samples by Tissue", display = "sites")
points(otu_good2_pca, pch = 19, col = set1cols[as.numeric(samp_df$Tissue)])
vegan::ordispider(otu_good2_pca, samp_df$Tissue, label = TRUE) # Adds centroids/spiders
legend("topright", levels(as.factor(samp_df$Tissue)), pch = 19, col = set1cols, title = "Tissue")
legend("bottomright", legend = "Difference between Tissue Types:\n permanova: p < 0.0001")

\newpage

4.2 Individual Tissue ordination

This Ordination shows a subset of the points, belonging to a single tissue

plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)",
     main = "PCA of All Exp1 Samples: only Stomach shown", display = "sites")
points(otu_good2_pca, pch = 19, col = set1cols[1], select = samp_df$Tissue == "Stomach")
legend("topright", levels(as.factor(samp_df$Tissue))[1], pch = 19, col = set1cols[1])

\newpage

4.3 PCA of ___ Samples: Symbols by ExptGrp, color by Sex (no centroids)

# set.seed(2315) # permanova
# adonis(samp_all_hel~samp_df$Group, method="euclidean", permutations = 10000) # 9.999e-05 ***

plot(all_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (49.3% Explained)", ylab = "PC2 (10.9% Explained)",
     main = "PCA of ___ Samples by Group + Housing, color by Sex", display = "sites")
points(all_pca, pch = c(15, 0, 16, 1)[as.numeric(samp_df$ExptGrp)], col = set1cols[as.numeric(samp_df$Sex)]) # col = set1cols[as.numeric(samp_df$Housing)]
# vegan::ordispider(sl_all_pca, sl_samp_df$ExptGrp, label = TRUE)
legend("bottomright", legend = paste(rep(levels(samp_df$ExptGrp),times=2),rep(levels(samp_df$Sex),each=4),sep=", "),col=rep(set1cols[c(2,1)], each=4),pch=rep(c(15,0,16,1),times=2),bty="n",ncol=2,cex=0.7,pt.cex=0.7)
legend("topright", legend = "Difference between Group + Housing:\n permanova: p < 0.0001")

\newpage

4.4 Lines connecting samples belonging to each Mouse

# Create dataframe with samplenames and PC coords
pca_df <- data.frame(samp_df[,1:8], summary(samp_all_pca)$sites, row.names = NULL)

coords_df <- pca_df[, 1:10] %>%
  dplyr::select(Sample_name, Mouse, Housing, PC1, PC2) %>%
  dplyr::filter(Housing == "18C") %>%
  left_join(dplyr::filter(pca_df[, 1:10], Housing == "30C"), by = "Mouse")

ggplot(pca_df, aes(x = PC1, y = PC2, shape = ExptGrp)) + 
  geom_point(size = 2.5) + 
  geom_segment(data = coords_df, aes(x = PC1.x, xend = PC1.y, y = PC2.x, yend = PC2.y)) +
  theme_bw() +
  scale_shape_manual(values = c(15, 0, 16, 1)) +
  labs(title = "PCA of ___ Samples: Each pair of Mouse samples connected by lines", subtitle = "Difference between Group + Housing: permanova: p < 0.0001", x = "PC1 (49.3% Explained)", y = "PC2 (10.9% Explained)")


cb-42/cbmbtools documentation built on Jan. 9, 2021, 1:38 a.m.