This document shows an example microViz
analysis workflow using
example data atlas1006
from the microbiome
package.
knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) options(width = 100)
library(dplyr) library(ggplot2) library(patchwork) library(phyloseq) library(microbiome) library(microViz)
# get some example data data("atlas1006", package = "microbiome") # create a couple of numerical variables (often more useful than character strings) ps <- atlas1006 %>% ps_mutate( weight = recode( .x = bmi_group, morbidobese = 5, severeobese = 4, obese = 3, overweight = 2, lean = 1, underweight = 0 ), lean = if_else(weight < 2, true = 1, false = 0, missing = 0), female = if_else(sex == "female", true = 1, false = 0), extract_P = if_else(DNA_extraction_method == "p", true = 1, false = 0) ) %>% # only look at the baseline time point if multiple samples available # and drop samples with no DNA extraction method info ps_filter(time == 0, !is.na(DNA_extraction_method)) %>% # remove the sample data variables about time ps_select(-time) # add a couple of missing values to show how microViz handles missing data sample_data(ps)$female[c(3, 4)] <- NA # look at phyloseq object description ps
# this example data has a slightly odd tax_table because it comes from HITChip data (instead of sequencing) # the taxonomic assignment is done differently, so we need to ensure taxon names are not repeated across columns # it can otherwise be used the same as sequencing data for this example tax_table(ps) %>% head(3) ps <- ps %>% tax_prepend_ranks() tax_table(ps) %>% head(3) # replace any uninformative tax_table values ps <- ps %>% tax_fix() # look at the effect of removing rare Genera, e.g. how many Genera are present in at least 5% of samples? ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") # we will use this udring other analyses, but not overwrite the original phyloseq object as the # unfiltered set of taxa would be required if we were performing e.g. alpha diversity analyses
Ordination methods can help you to visualize variation in overall microbial ecosystem composition, and look at whether it might differ markedly between groups, e.g. weight.
Here is one option to try first:
tax_filter()
tax_agg()
tax_transform()
ord_calc()
ord_plot()
clr_pca <- ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") %>% tax_agg("Genus") %>% # aggregate taxa at Genus level tax_transform("clr") %>% # centered log ratio transformation ord_calc(method = "PCA") # Note: PCA is RDA without constraints (& ord_calc uses an RDA method to perform PCA)
clr_pca
clr_pca %>% ord_plot(colour = "weight", shape = "DNA_extraction_method", alpha = 0.7, size = 1.5) + scale_colour_viridis_c(option = "inferno", direction = -1, end = 0.8) + scale_shape_manual( values = c(o = "square open", r = "triangle open", p = "circle"), name = "DNA\nextraction\nmethod" )
Using a PCA ordination allows us reliably draw biplots, showing which taxa are associated with this major variation in sample microbiota composition as represented on axes PC1 and PC2.
pca <- clr_pca %>% ord_plot( colour = "weight", shape = "DNA_extraction_method", alpha = 0.7, size = 1, plot_taxa = 12:1, tax_vec_length = 0.3, taxon_renamer = function(x) stringr::str_remove_all(x, "^G: | [ae]t rel."), center = TRUE, tax_lab_style = tax_lab_style( type = "label", max_angle = 90, fontface = "bold", alpha = 0.8, size = 2 ) ) + scale_colour_viridis_c(option = "inferno", direction = -1, end = 0.8) + scale_shape_manual( values = c(o = "square open", r = "triangle open", p = "circle"), name = "DNA\nextraction\nmethod" ) + # essential for correct label angles coord_fixed(clip = "off", xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5))
pca
What about the actual compositions of these 800 samples?
iris <- clr_pca %>% ord_plot_iris( tax_level = "Genus", n_taxa = 15, anno_binary = "extract_P", anno_binary_style = list(y = 1.1, size = 0.5, alpha = 0.3, shape = 1), taxon_renamer = function(x) stringr::str_remove_all(x, "^G: | [ae]t rel.") ) iris
You can arrange both plots together with patchwork package.
pca / iris
What variables is the overall microbial community variation associated with?
ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") %>% tax_agg("Genus") %>% dist_calc("aitchison") %>% dist_permanova( variables = c("DNA_extraction_method", "weight", "female"), n_perms = 99, # this is a low number of permutations for speed, you should set more e.g. 9999 seed = 12345, complete_cases = TRUE, verbose = "max" )
(Partial) Constrained ordination can be useful to show microbial variation explained by predictors significant in PERMANOVA analyses.
# constraints need to be on the same or similar scales for comparability # so make binary variables and scale the weight variable ps <- ps %>% ps_mutate( wt_scaled = c(scale(weight, center = TRUE, scale = TRUE)), P = if_else(DNA_extraction_method == "p", 1, 0), R = if_else(DNA_extraction_method == "r", 1, 0), O = if_else(DNA_extraction_method == "o", 1, 0) )
constr_ord <- ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") %>% tax_agg("Genus") %>% tax_transform("clr") %>% ord_calc( method = "RDA", constraints = c("female", "wt_scaled", "P", "R", "O") )
constr_ord
ord_plot(constr_ord, plot_taxa = 10:1, colour = "DNA_extraction_method", shape = 1) + scale_color_brewer(palette = "Set2", name = "DNA\nextraction\nmethod")
As DNA extraction method dominates the plot above, we could try "partialing out" the variation explained by DNA extraction method, prior to constraining on the other factors of interest.
ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") %>% tax_agg("Genus") %>% tax_transform("clr") %>% ord_calc( method = "RDA", conditions = c("P", "R", "O"), constraints = c("female", "wt_scaled") ) %>% ord_plot(plot_taxa = 10:1, colour = "DNA_extraction_method", shape = 1) + scale_color_brewer(palette = "Set2", name = "DNA\nextraction\nmethod")
What are the effects of these factors on individual taxa? Let's use beta binomial regression models to find out. We will skip fitting models for genera here, only for speed of generating this example. You should include all ranks (which is the default) for the best use of taxatree_plots.
See the taxon modelling article for a more comprehensive look at the taxatree_* family of functions.
library(corncob) tt_models <- ps %>% tax_filter(min_prevalence = 5 / 100, tax_level = "Genus") %>% taxatree_models( ranks = c("Phylum", "Family"), variables = c("female", "wt_scaled", "P", "R"), type = "bbdml", verbose = "max" ) tt_stats <- taxatree_models2stats(tt_models, param = "mu")
Visualize the results compactly on a microViz taxonomic association tree.
tt_stats %>% taxatree_plotkey( node_size_range = c(2, 8), size_stat = list(mean = mean), rank == "Phylum", taxon_renamer = function(x) stringr::str_remove_all(x, "^.: | [ae]t rel.") )
tt_stats %>% taxatree_plots( node_size_range = c(1, 5), size_stat = list(mean = mean) ) %>% .[c("P", "R", "female", "wt_scaled")] %>% wrap_plots(., ncol = 2, guides = "collect")
This document is intended only to be an example of the kind of analyses and visualization you can do with microViz. The analysis of the atlas1006 data is not intended to be considered theoretically sound or biologically interpretable.
devtools::session_info()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.