This report was built with R version r getRversion()
.
knitr::opts_chunk$set(echo = FALSE, # whether code should be displayed in document cache = TRUE, # whether to cache results of code chunks for future knits fig.align = 'center', # alignment options: 'left', 'right', 'center', 'default' error = TRUE # display error messages in document (TRUE) or stop render on error (FALSE) )
# space for notes/to do list/goals that shouldn't be included in resulting report
# This section contains a list of typical libraries used and others (commented out) that could be useful in other situations # I. universal data wrangling and visualization library(tidyverse) # ggplot, dplyr, purrr, forcats, stringr... and other packages. See https://tidyverse.tidyverse.org/ # II. formatting, plotting library(scales) # used in formatting relative abundance % labels; unit_format() library(knitr) # necessary for kable, for displaying nicer tables in html/pdf output library(RColorBrewer) # for generating custom palettes # library(grid) # library(gridExtra) # grid.arrange(); plot & table arranging # library(gganimate) # useful for comparing data across time points or changes in a single categorical variable # III. machine learning & analysis packages library(vegan) # decostand(), metaMDS(), scores(), rda(), ordispider(), adonis() library(MASS) # isoMDS(); masks dplyr::select() library(mvabund) # library(randomForest) # randomForest() # library(caret) # vast support for all flavors of predictive models # IV. functions for automating and streamlining workflow # if (!require(devtools)) install.packages("devtools") # devtools::install_github("https://github.com/cb-42/cbmbtools") # devtools::update_packages("cbmbtools") library(cbmbtools) library(testthat) # expect_identical() and related functions are useful for ensuring custom functions return expected output
# Space for keeping customized functions which may only be useful for this analysis, such as plotting or data wrangling with slight variations.
# For ease of use, place the processed 16S data files of interest into the same directory as your current RStudio project # I. Bacteria_unclassified removal (can be skipped if desired); Read in cons.taxonomy # ___ Bacteria_unclassified of ___ total untrimmed OTUs (trim = .05%) otu_taxonomy <- load_tax("data.cons.taxonomy") rem_bact <- as.character(otu_taxonomy[otu_taxonomy$Genus == "Bacteria_unclassified", "OTU"]) # Basic processing with selection of OTUs > 0.1 % of the population # II. Read in opti_mcc.shared file otu_good <- load_shared(shared = "data.shared") # otu_good <- load_shared(shared = "data.shared", otu_vec = rem_bact, thresh = 0.05) # Trim _S from end of rownames(otu_good); note that it may be useful to keep these well identifiers in some analyses rownames(otu_good) <- str_remove(rownames(otu_good), "_S\\d+") # It's a good idea to inspect the data regularly with tools such as dim(), str(), head(), tail(), summary()... This will help in catching errors earlier dim(otu_good) # dim(otu_good)[1] observations (samples) x dim(otu_good)[2] features (OTUs above .1% population threshold) # III. Subset otu_taxonomy to retain good/trimmed OTUs only otu_good_taxonomy <- droplevels(otu_taxonomy[colnames(otu_good), ]) # otu_good_bact_un <- load_shared(shared = "data.shared", thresh = 0.05) # shared w/o bact_un removal, for comparison
# Load in metadata/environmental data, or create it # Examples of metadata creation otu_df <- data.frame(decostand(otu_good, "total") * 100, Sample_name = row.names(otu_good), stringsAsFactors = FALSE) %>% # Create Experiment from first string of characters prior to first "_" mutate(Experiment = factor(case_when(str_detect(Sample_name, "Exp\\d") ~ str_extract(Sample_name, "Exp\\d"), str_detect(Sample_name, "^AE\\d$") ~ "AE", str_detect(Sample_name, "^empty_|^Blank") ~ "Empty", str_detect(Sample_name, "^IsoCtrl_") ~ "IsoCtrl", str_detect(Sample_name, "^Water|^WATER|^WaterNeg") ~ "WaterNeg", str_detect(Sample_name, "^Mock|^ZymoMock") ~ "Mock"), levels = c("Exp1", "Exp2", "AE", "Empty", "IsoCtrl", "WaterNeg", "Mock")) ) %>% # Create Day; account for empty_D1 and empty_D2 which refer to wells and not days mutate(Day = factor(case_when(str_detect(Sample_name, "_D\\d+_") & Experiment != "Empty" ~ str_extract(Sample_name, "D\\d+")) ) # sum(is.na(otu_df$Day)) returns __ (as expected, controls have NA in this column) ) %>% # Create Tissue mutate(Tissue = factor(case_when(str_detect(Sample_name, "Cecum") ~ "Cecum", str_detect(Sample_name, "Colon") ~ "Colon", str_detect(Sample_name, "Feces") ~ "Feces", str_detect(Sample_name, "SID") ~ "SID", str_detect(Sample_name, "SII") ~ "SII", str_detect(Sample_name, "SIJ") ~ "SIJ", str_detect(Sample_name, "Stomach") ~ "Stomach"), levels = c("Stomach", "SID", "SIJ", "SII", "Cecum", "Colon", "Feces") ) # sum(is.na(otu_df$Tissue)) returns __ (as expected, controls have NA in this column) ) %>% # Create Mouse mutate(Mouse = as.numeric(str_extract(Sample_name, "(?<=M)\\d+") ) # sum(is.na(otu_df$Mouse)) returns __ (as expected, the controls have NA in this column) ) %>% # Alternative Mouse creation: # mutate(Mouse = fct_inseq(ifelse(!is.na(Tissue), str_extract(Sample_name, "\\d+$"), NA)) # Create Sex mutate(Sex = factor(str_extract(Mouse, "[FM]$")) ) %>% # Create Cage/Group mutate(Cage = case_when((Group == "WT") & (Sex == "F") ~ 1, (Group == "WT") & (Sex == "M") ~ 2, (Group == "Y709F") & (Sex == "F") ~ 3, (Group == "Y709F") & (Sex == "M") ~ 4 )) %>% # Create Run mutate(Run = factor(str_extract(Sample_name, "(?<=N)\\d$")) ) %>% # metadata columns at front, followed by all of the count data dplyr::select(Sample_name:Run, everything()) # note that the data is in wide format # Optionally, subset to retain only samples of interest to this analysis exp1_fec_df <- otu_df %>% dplyr::filter(Experiment %in% c("Exp1"), Tissue == "Feces") %>% droplevels() # discard unused factor levels
phy_df <- agg_otus(otu_df) # aggregate by Phylum, add metadata phy_df <- phy_df %>% dplyr::select(Sample_name, colnames(phy_df[,2:19])[colSums(phy_df[,2:19]) > 0]) %>% # Retain only the phyla that were present in any of these samples inner_join(dplyr::select(otu_df, -contains("Otu")), by = "Sample_name") %>% dplyr::select(Sample_name, Experiment:Mouse, everything())
# 9 color palette - for 9 unique values in a categorical variable total; for PCA ordinations set1cols <- brewer.pal(9,"Set1") # 7 color viridis palette # vir7 <- viridis::viridis(n=7) # Determine abundance ranking of phyla within samples phy_rank <- phy_df %>% gather(key = "Phylum", value = "Perc", -c(dplyr::select(otu_df, -contains("Otu")) %>% colnames())) %>% group_by(Phylum) %>% summarize(Mean_Perc = mean(Perc)) %>% arrange(desc(Mean_Perc)) %>% mutate(Phylum = fct_inorder(Phylum)) # Create ordered factor levels to aid in creating visualizations. # One approach for a custom palette - this has limitations if more than around 12-15 Phyla levels are necessary phy_pal <- scales::hue_pal(c = 125, l = 70, h.start=150)(length(levels(phy_rank$Phylum)))
After the data was processed using mothur(v.1.42.3), it was put through standardization methods for community ecology using the vegan package (version r packageVersion("vegan")
) in R. Of total OTUs, were denoted as Bacteria_unclassified and were removed. Next, any OTU count that made up less than .05% of the total OTU count for any given sample was set to 0. Subsequently, any OTUs which had a value of 0 for sum of counts across all samples were removed from the data, leaving r dim(otu_good)[2]
OTUs for further analysis. There are r dim(otu_good)[1]
samples when including all controls and experiments.
# export normalized counts of samples and metadata # export_norm_tax(norm_df = dplyr::filter(otu_df, Experiment == "Exp1", !is.na(Tissue)), digits = 2)
# Use knitr::kable to generate a table summarizing key information about the experiment, e.g. number of samples by treatments otu_df %>% dplyr::filter(Experiment == "Exp1") %>% dplyr::select(Sample_name:Mouse) %>% # arrange(Housing, Cage, Sample_name) %>% knitr::kable(caption = "Exp1 Samples: Current metadata", align = "l")
\newpage
[Descriptive text here]
# example code mock_gg <- dplyr::filter(otu_df, Experiment == "Zymo_Mock") %>% tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = mock_gg, title = "Cross-Comparison of Mock Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")
\newpage
[Descriptive text here]
# example code water_gg <- dplyr::filter(otu_df, Experiment == "Water_Neg") %>% tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = water_gg, title = "Cross-Comparison of Negative Water Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")
\newpage
[Descriptive text here]
ae_gg <- filter(otu_df, Experiment == "AE") %>% tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = ae_gg, title = "Cross-Comparison of AE Control Replicates", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")
\newpage
[Descriptive text here]
empty_gg <- filter(otu_df, Experiment == "Empty") %>% tsg_ind(geo_mean = TRUE, ar_mean = TRUE)
plot_ra(df = empty_gg, title = "Cross-Comparison of Empty Wells", facet_var = "Sample_name", fill = "Sample_name", yvar = "Percentage")
\newpage
[Descriptive text here]
otu_df %>% dplyr::filter(Mouse == 1, Tissue == "Feces") %>% dropevels() %>% tsg_ind(tax_level = "Phylum") %>% # otu_vec could be used to order these within sample OTUs by the mean of a larger/different grouping droplevels() plot_ra(yvar = "Percentage", title = paste0("Exp1, Tissue: ", pull(., Tissue), ", Mouse: ", pull(., Mouse)), fill = "Phylum", phy_vec = phy_rank$Phylum, fill_pal = phy_pal) + labs(subtitle = paste0("OTUs Ordered by: ___"))
\newpage
# aggregate data kin_agg <- dplyr::filter(samp_df, ExptGrp == "TG1") %>% taxon_sort_gather() %>% droplevels() # individual observations - for scatterplot # kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>% # tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels() plot_ra(df=kin_agg, title="Relative Abundance of WT18 samples", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)
\newpage
# aggregate data kin_agg <- samp_df %>% # Add filtering if needed taxon_sort_gather(facet_var = "ExptGrp", ord_val = "ExptGrp1") %>% droplevels() # individual observations - update as necessary # kin_ind <- dplyr::filter(otu_df, Day == 0, Organ == "Feces", Treatment_group %in% c("Exp1", "Exp2")) %>% # tsg_ind(otu_vec = kin_agg$OTU) %>% droplevels() plot_ra(df=kin_agg, title="Relative Abundance of All ExptGrps, ordered by ExptGrp1", facet_var = "ExptGrp", fill="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal, error_bar=TRUE)
\newpage
phy_filt <- phy_df %>% dplyr::filter(ExptGrp == "ExptGrp1") %>% droplevels() taxon_order <- names(sort(colMeans(dplyr::select(phy_filt, -Sample_name:-Cage)), decreasing = TRUE)) # workaround for rank_tax() due to removing Phyla earlier in the workflow phy_agg <- phy_filt %>% dplyr::select(Sample_name:Cage, all_of(taxon_order)) %>% gather(key = Phylum, value = Percentage, -Sample_name:-Cage, factor_key = TRUE) %>% # update to pivot_longer group_by(Phylum) %>% dplyr::summarize(Mean_Perc = mean(Percentage), SEM = sem(Percentage)) %>% dplyr::filter(Mean_Perc > 0 ) %>% arrange(desc(Mean_Perc)) %>% droplevels() # remove Phylum with 0 values plot_ra(df=phy_agg, title="ExptGrp1 samples, Aggregated by Phylum", fill="Phylum", taxon="Phylum", phy_vec=phy_rank$Phylum, fill_pal=phy_pal)
\newpage
[Descriptive text here]
# example: filter out controls otu_good2 <- dplyr::filter(otu_df, !(Experiment %in% c("AE", "Empty", "IsoCtrl", "Water_Neg", "Zymo_Mock"))) %>% droplevels() # remove levels that are no longer present due to filtering # PCA of all data (non-control) otu_good2_hel <- decostand(dplyr::select(otu_good2, contains("Otu")), "hellinger") otu_good2_pca <- rda(otu_good2_hel) # (summary(otu_good2_pca)) # 0.3006 0.10995 0.06362 proportion explained by first 3 axes
# set.seed(3881) # permanova # adonis(otu_good2_hel~samp_df$Tissue, method="euclidean", permutations = 10000) # 9.999e-05 *** plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)", main = "PCA of Exp1 Samples by Tissue", display = "sites") points(otu_good2_pca, pch = 19, col = set1cols[as.numeric(samp_df$Tissue)]) vegan::ordispider(otu_good2_pca, samp_df$Tissue, label = TRUE) # Adds centroids/spiders legend("topright", levels(as.factor(samp_df$Tissue)), pch = 19, col = set1cols, title = "Tissue") legend("bottomright", legend = "Difference between Tissue Types:\n permanova: p < 0.0001")
\newpage
This Ordination shows a subset of the points, belonging to a single tissue
plot(otu_good2_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (38.1% Explained)", ylab = "PC2 (12.6% Explained)", main = "PCA of All Exp1 Samples: only Stomach shown", display = "sites") points(otu_good2_pca, pch = 19, col = set1cols[1], select = samp_df$Tissue == "Stomach") legend("topright", levels(as.factor(samp_df$Tissue))[1], pch = 19, col = set1cols[1])
\newpage
# set.seed(2315) # permanova # adonis(samp_all_hel~samp_df$Group, method="euclidean", permutations = 10000) # 9.999e-05 *** plot(all_pca, type = "n", font = 2, font.lab = 2, xlab = "PC1 (49.3% Explained)", ylab = "PC2 (10.9% Explained)", main = "PCA of ___ Samples by Group + Housing, color by Sex", display = "sites") points(all_pca, pch = c(15, 0, 16, 1)[as.numeric(samp_df$ExptGrp)], col = set1cols[as.numeric(samp_df$Sex)]) # col = set1cols[as.numeric(samp_df$Housing)] # vegan::ordispider(sl_all_pca, sl_samp_df$ExptGrp, label = TRUE) legend("bottomright", legend = paste(rep(levels(samp_df$ExptGrp),times=2),rep(levels(samp_df$Sex),each=4),sep=", "),col=rep(set1cols[c(2,1)], each=4),pch=rep(c(15,0,16,1),times=2),bty="n",ncol=2,cex=0.7,pt.cex=0.7) legend("topright", legend = "Difference between Group + Housing:\n permanova: p < 0.0001")
\newpage
# Create dataframe with samplenames and PC coords pca_df <- data.frame(samp_df[,1:8], summary(samp_all_pca)$sites, row.names = NULL) coords_df <- pca_df[, 1:10] %>% dplyr::select(Sample_name, Mouse, Housing, PC1, PC2) %>% dplyr::filter(Housing == "18C") %>% left_join(dplyr::filter(pca_df[, 1:10], Housing == "30C"), by = "Mouse") ggplot(pca_df, aes(x = PC1, y = PC2, shape = ExptGrp)) + geom_point(size = 2.5) + geom_segment(data = coords_df, aes(x = PC1.x, xend = PC1.y, y = PC2.x, yend = PC2.y)) + theme_bw() + scale_shape_manual(values = c(15, 0, 16, 1)) + labs(title = "PCA of ___ Samples: Each pair of Mouse samples connected by lines", subtitle = "Difference between Group + Housing: permanova: p < 0.0001", x = "PC1 (49.3% Explained)", y = "PC2 (10.9% Explained)")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.