Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = FALSE)
## ---- message=FALSE-----------------------------------------------------------
library(MMUPHin)
# tidyverse packages for utilities
library(magrittr)
library(dplyr)
library(ggplot2)
## ----Installation, eval = FALSE-----------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("MMUPHin")
## ----load data----------------------------------------------------------------
data("CRC_abd", "CRC_meta")
# CRC_abd is the feature (species) abundance matrix. Rows are features and
# columns are samples.
CRC_abd[1:5, 1, drop = FALSE]
# CRC_meta is the metadata data frame. Columns are samples.
CRC_meta[1, 1:5]
# A total of five studies are included
table(CRC_meta$studyID)
# The following were used to access and format the two objects
# library(curatedMetagenomicData)
# library(phyloseq)
# library(genefilter)
# datasets <- curatedMetagenomicData(
# c("FengQ_2015.metaphlan_bugs_list.stool" ,
# "HanniganGD_2017.metaphlan_bugs_list.stool",
# "VogtmannE_2016.metaphlan_bugs_list.stool",
# "YuJ_2015.metaphlan_bugs_list.stool",
# "ZellerG_2014.metaphlan_bugs_list.stool"),
# dryrun = FALSE)
# # Construct phyloseq object from the five datasets
# physeq <-
# # Aggregate the five studies into ExpressionSet
# mergeData(datasets) %>%
# # Convert to phyloseq object
# ExpressionSet2phyloseq() %>%
# # Subset samples to only CRC and controls
# subset_samples(study_condition %in% c("CRC", "control")) %>%
# # Subset features to species
# subset_taxa(!is.na(Species) & is.na(Strain)) %>%
# # Normalize abundances to relative abundance scale
# transform_sample_counts(function(x) x / sum(x)) %>%
# # Filter features to be of at least 1e-5 relative abundance in five samples
# filter_taxa(kOverA(5, 1e-5), prune = TRUE)
# CRC_abd <- otu_table(physeq)@.Data
# CRC_meta <- data.frame(sample_data(physeq))
# CRC_meta$studyID <- factor(CRC_meta$studyID)
## ----adjust_batch-------------------------------------------------------------
# The function call indicates for adjust_batch to correct for the effect
# of the batch variable, studyID, while controlling for the effect of the
# disease variable, study_condition. Many additional options are available
# through the control parameter, here we specify verbose=FALSE to avoid
# excessive messages, although they can often be helpful in practice!
fit_adjust_batch <- adjust_batch(feature_abd = CRC_abd,
batch = "studyID",
covariates = "study_condition",
data = CRC_meta,
control = list(verbose = FALSE))
# Note that adjust_batch returns a list of more than one components, and
# feature_abd_adj is the corrected feature abundance matrix. See
# help(adjust_batch) for the meaning of other components.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
## ----permanova----------------------------------------------------------------
library(vegan)
# adonis requires as input sample-versus-sample dissimilarities between
# microbial profiles
D_before <- vegdist(t(CRC_abd))
D_after <- vegdist(t(CRC_abd_adj))
# fix random seed as adonis runs randomized permutations
set.seed(1)
fit_adonis_before <- adonis(D_before ~ studyID, data = CRC_meta)
fit_adonis_after <- adonis(D_after ~ studyID, data = CRC_meta)
print(fit_adonis_before)
print(fit_adonis_after)
## ----lm_meta------------------------------------------------------------------
# lm_meta runs regression and meta-analysis models to identify consistent
# effects of the exposure (study_condition, i.e., disease) on feature_abd
# (microbial feature abundances). Batch variable (studyID) needs to be
# specified to identify different studies. Additional covariates to include in
# the regression model can be specified via covariates (here set to gender,
# age, BMI). Check help(lm_meta) for additional parameter options.
# Note the warnings: lm_meta can tell if a covariate cannot be meaningfully fit
# within a batch and will inform the user of such cases through warnings.
fit_lm_meta <- lm_meta(feature_abd = CRC_abd,
batch = "studyID",
exposure = "study_condition",
covariates = c("gender", "age", "BMI"),
data = CRC_meta,
control = list(verbose = FALSE))
# Again, lm_meta returns a list of more than one components.
# meta_fits provides the final meta-analytical testing results. See
# help(lm_meta) for the meaning of other components.
meta_fits <- fit_lm_meta$meta_fits
## ----significant differential abundant species--------------------------------
meta_fits %>%
filter(qval.fdr < 0.05) %>%
arrange(coef) %>%
mutate(feature = factor(feature, levels = feature)) %>%
ggplot(aes(y = coef, x = feature)) +
geom_bar(stat = "identity") +
coord_flip()
## ----discrete_discover--------------------------------------------------------
# First subset both feature abundance table and metadata to only control samples
control_meta <- subset(CRC_meta, study_condition == "control")
control_abd_adj <- CRC_abd_adj[, rownames(control_meta)]
# discrete_discover takes as input sample-by-sample dissimilarity measurements
# rather than abundance table. The former can be easily computed from the
# latter with existing R packages.
D_control <- vegdist(t(control_abd_adj))
fit_discrete <- discrete_discover(D = D_control,
batch = "studyID",
data = control_meta,
control = list(k_max = 8,
verbose = FALSE))
## ----visualize discrete structure---------------------------------------------
internal <- data.frame(
# By default, fit_discrete evaluates cluster numbers 2-10
K = 2:8,
statistic =
fit_discrete$internal_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"],
se =
fit_discrete$internal_se[, "ZellerG_2014.metaphlan_bugs_list.stool"],
type = "internal")
external <- data.frame(
# By default, fit_discrete evaluates cluster numbers 2-10
K = 2:8,
statistic =
fit_discrete$external_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"],
se =
fit_discrete$external_se[, "ZellerG_2014.metaphlan_bugs_list.stool"],
type = "external")
rbind(internal, external) %>%
ggplot(aes(x = K, y = statistic, color = type)) +
geom_point(position = position_dodge(width = 0.5)) +
geom_line(position = position_dodge(width = 0.5)) +
geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se),
position = position_dodge(width = 0.5), width = 0.5) +
ggtitle("Evaluation of discrete structure in control stool microbiome (ZellerG_2014)")
## ----discrete_discover vaginal------------------------------------------------
# library(curatedMetagenomicData)
# library(phyloseq)
# datasets <- curatedMetagenomicData(
# "*metaphlan_bugs_list.vagina*",
# dryrun = FALSE)
# # Construct phyloseq object from the five datasets
# physeq <-
# # Aggregate the five studies into ExpressionSet
# mergeData(datasets) %>%
# # Convert to phyloseq object
# ExpressionSet2phyloseq() %>%
# # Subset features to species
# subset_taxa(!is.na(Species) & is.na(Strain)) %>%
# # Normalize abundances to relative abundance scale
# transform_sample_counts(function(x) x / sum(x)) %>%
# # Filter features to be of at least 1e-5 relative abundance in two samples
# filter_taxa(kOverA(2, 1e-5), prune = TRUE)
# vaginal_abd <- otu_table(physeq)@.Data
# vaginal_meta <- data.frame(sample_data(physeq))
# vaginal_meta$studyID <- factor(vaginal_meta$studyID)
data("vaginal_abd", "vaginal_meta")
D_vaginal <- vegdist(t(vaginal_abd))
fit_discrete_vag <- discrete_discover(D = D_vaginal,
batch = "studyID",
data = vaginal_meta,
control = list(verbose = FALSE,
k_max = 8))
# Examine results for the larger study, HMP_2012
data.frame(
# By default, fit_discrete evaluates cluster numbers 2-10
K = 2:8,
statistic =
fit_discrete_vag$internal_mean[, "HMP_2012.metaphlan_bugs_list.vagina"],
se =
fit_discrete_vag$internal_se[, "HMP_2012.metaphlan_bugs_list.vagina"]) %>%
ggplot(aes(x = K, y = statistic)) +
geom_point() + geom_line() +
geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se),
width = 0.5) +
ggtitle("Evaluation of discrete structure in vaginal microbiome (HMP_2012)")
## ----continuos_structre-------------------------------------------------------
# Much like adjust_batch and lm_meta, continuous_discover also takes
# as input feature-by-sample abundances. control offers many tuning parameters
# and here we set one of them, var_perc_cutoff, to 0.5, which asks the method
# to include top principal components within each batch that in total explain
# at least 50% of the total variability in the batch. See
# help(continuosu_discover) for more details on the tuning parameters and
# their interpretations.
fit_continuous <- continuous_discover(feature_abd = control_abd_adj,
batch = "studyID",
data = control_meta,
control = list(var_perc_cutoff = 0.5,
verbose = FALSE))
## ----visualize continuous structure-------------------------------------------
# Examine top loadings
loading <- data.frame(feature = rownames(fit_continuous$consensus_loadings),
loading1 = fit_continuous$consensus_loadings[, 1])
loading %>%
dplyr::arrange(-abs(loading1)) %>%
dplyr::slice(1:20) %>%
dplyr::arrange(loading1) %>%
dplyr::mutate(feature = factor(feature, levels = feature)) %>%
ggplot(aes(x = feature, y = loading1)) +
geom_bar(stat = "identity") +
coord_flip() +
ggtitle("Features with top loadings")
# Ordinate the samples
mds <- cmdscale(d = D_control)
colnames(mds) <- c("Axis1", "Axis2")
as.data.frame(mds) %>%
dplyr::mutate(score1 = fit_continuous$consensus_scores[, 1]) %>%
ggplot(aes(x = Axis1, y = Axis2, color = score1)) +
geom_point() +
coord_fixed()
## ----sessioninfo--------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.