knitr::opts_chunk$set(echo = TRUE)
Here we analyze a subset of the ENIGMA data (Smith, M. and Rocha, A. et al. 2015).
The dataset includes water samples from 67 wells with well-characterized biochemical metadata (e.g., pH, heavy metal concentrations).
We'll download the processed OTU table and associated metadata from Zenodo and put them in the data/enigma_results folder.
Note that I've uploaded the OTU table to our repo as a gzipped file; make sure to gunzip it before knitting this document.
We did some preliminary correlation analysis and found that the metadata variables show a range of correlation with different OTUs: some metadata variables don't seem correlated with any OTUs, whereas others seem correlated with basically the entire community. We'll pick three of these metadata variables to illustrate our FDR correction methods in cases where we expect different numbers of true significant associations.
library(dplyr)
library(tidyr)
library(ggplot2)
library(magrittr)
library(SummarizedBenchmark)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/microbiome"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

enigma_dir <- file.path(datdir, "enigma_results")
dir.create(enigma_dir, showWarnings = FALSE)

# From their paper, we expect SO4 to have little correlation, pH
# to have high correlations, and Al to be in the middle
so4_result_file <- file.path(resdir, "enigma-so4-otus-results.rds")
so4_bench_file <- file.path(sbdir, "enigma-so4-otus-benchmark.rds")
so4_bench_file_abun <- file.path(sbdir, "enigma-so4-otus-abun-benchmark.rds")
so4_bench_file_uninf <- file.path(sbdir, "enigma-so4-otus-uninf-benchmark.rds")

ph_result_file <- file.path(resdir, "enigma-ph-otus-results.rds")
ph_bench_file <- file.path(sbdir, "enigma-ph-otus-benchmark.rds")
ph_bench_file_abun <- file.path(sbdir, "enigma-ph-otus-abun-benchmark.rds")
ph_bench_file_uninf <- file.path(sbdir, "enigma-ph-otus-uninf-benchmark.rds")

al_result_file <- file.path(resdir, "enigma-al-otus-results.rds")
al_bench_file <- file.path(sbdir, "enigma-al-otus-benchmark.rds")
al_bench_file_abun <- file.path(sbdir, "enigma-al-otus-abun-benchmark.rds")
al_bench_file_uninf <- file.path(sbdir, "enigma-al-otus-uninf-benchmark.rds")
We download the data from Zenodo if not already available.
if (!file.exists(file.path(enigma_dir, "enigma.metadata.txt"))) {
    download.file("https://zenodo.org/record/1455793/files/enigma.metadata.txt",
                  destfile = file.path(enigma_dir, "enigma.metadata.txt"))
}
if (!file.exists(file.path(enigma_dir, "enigma.otu_table_resampled_updated_r.txt.gz"))) {
    download.file("https://zenodo.org/record/1455793/files/enigma.otu_table_resampled_updated_r.txt.gz",
                  destfile = file.path(enigma_dir, "enigma.otu_table_resampled_updated_r.txt.gz"))
}
## load OTU table and metadata
otu <- read.table(file.path(enigma_dir, "enigma.otu_table_resampled_updated_r.txt.gz"),
                  sep = '\t', header = TRUE, row.names = 1)
meta <- read.csv(file.path(enigma_dir, "enigma.metadata.txt"), sep = '\t')

# Select only the 0.02 um samples
meta <- meta[endsWith(as.character(meta$Sample), "02"), ]

# Make metadata sample ID match OTU table
meta$Sample <- gsub("_", "\\.", as.character(meta$Sample))
meta$Sample <- substr(meta$Sample, 1, nchar(meta$Sample) - 3)

## keep only samples with both metadata and 16S data
keep_samples <- intersect(colnames(otu), meta$Sample)
otu <- otu[, keep_samples]
meta <- dplyr::filter(meta, Sample %in% keep_samples)

## Make sure samples in metadata and OTU table match
meta <- meta[match(colnames(otu), meta$Sample), ]
We'll do a little filtering here before calculating correlations.
We'll apply a minimum threshold of 10 reads per OTU summed across all samples, and we'll also remove OTUs detected in 1% or fewer of the samples.
After removing these shallow OTUs, we also get rid of any samples with too few reads.
We define the minimum number of reads per OTU in min_otu, the minimum detection proportion per OTU in minp_otu, and the minimum number of reads per sample in min_sample.
After we've removed any shallow OTUs and samples, we'll convert the OTU table to relative abundances.
min_otu <- 10
minp_otu <- 0.01
min_sample <- 100

## Remove OTUs w/ <= min reads, w/ <= min prop, samples w/ <= min reads
otu <- otu[rowSums(otu) > min_otu, ]
otu <- otu[rowSums(otu > 0) / ncol(otu) > minp_otu, ]
otu <- otu[, colSums(otu) > min_sample]

## Update metadata with new samples
meta <- dplyr::filter(meta, Sample %in% colnames(otu))

## Remove empty OTUs
otu <- otu[rowSums(otu) > 0, ]

## Convert to relative abundance
abun_otu <- t(t(otu) / rowSums(t(otu)))

## Add pseudo counts
zeroabun <- 0
abun_otu <- abun_otu + zeroabun
Next, we need to calculate the p-values for correlations between the abundance of each OTU and the metadata variables of interest (i.e., we correlate each OTU's abundances across wells with the metadata variable measured in those same wells). Let's start with "pH". We'll store the p-value results in a dataframe.
While we're at it, we'll also calculate the mean abundance and ubiquity (detection rate)
of each OTU. Later, we can assign their values to a new column called ind_covariate
for use in downstream steps.
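The per-OTU testing is done by test_microbiome_corr(), which is sourced from the helper scripts in ../R and is called in the chunk below. As a hypothetical sketch only (assuming a Spearman correlation per OTU; the real helper may use a different test or return additional columns), the core computation could look roughly like this:

## Hypothetical sketch only -- the real test_microbiome_corr() lives in ../R.
## For each OTU, correlate its relative abundance across wells with the metadata
## variable, and record the per-OTU covariates used later for stratification.
test_microbiome_corr_sketch <- function(abundance, meta_col, shift = 0) {
    out <- lapply(rownames(abundance), function(o) {
        x <- as.numeric(abundance[o, ]) - shift   # undo any pseudo count
        ct <- suppressWarnings(cor.test(x, meta_col, method = "spearman"))
        data.frame(otu = o,
                   pval = ct$p.value,
                   ubiquity = mean(x > 0),                # detection rate
                   mean_abun_present = mean(x[x > 0]))    # mean abundance when detected
    })
    do.call(rbind, out)
}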
if (!file.exists(ph_result_file)) {
    res <- test_microbiome_corr(abundance = abun_otu, meta_col = meta$pH,
                                shift = zeroabun)
    saveRDS(res, file = ph_result_file)
} else {
    res <- readRDS(ph_result_file)
}
Add random (uninformative) covariate.
set.seed(44318)
res$rand_covar <- rnorm(nrow(res))
Here we check whether the covariates do indeed look informative.
strat_hist(res, pvalue="pval", covariate="ubiquity", maxy=10, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="ubiquity")
strat_hist(res, pvalue="pval", covariate="mean_abun_present", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="mean_abun_present")
strat_hist(res, pvalue="pval", covariate="rand_covar", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="rand_covar")
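strat_hist() and rank_scatter() are plotting helpers sourced from ../R. For intuition, a minimal ggplot2 sketch of a covariate-stratified p-value histogram (hypothetical names, not the project helper) might look like:

## Minimal sketch (not the ../R helper): bin the covariate into quantile groups
## and facet the p-value histogram by group. An informative covariate shows an
## enrichment of small p-values in some strata and a flat histogram in others.
strat_hist_sketch <- function(df, pvalue = "pval", covariate = "ubiquity", nbins = 4) {
    brks <- quantile(df[[covariate]], probs = seq(0, 1, length.out = nbins + 1),
                     na.rm = TRUE)
    df$stratum <- cut(df[[covariate]], breaks = unique(brks), include.lowest = TRUE)
    ggplot(df, aes(x = .data[[pvalue]])) +
        geom_histogram(binwidth = 0.05, boundary = 0) +
        facet_wrap(~ stratum)
}
strat_hist_sketch(res, pvalue = "pval", covariate = "ubiquity")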
Let's use ubiquity as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = ubiquity)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
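initializeBenchDesign() comes from the helper scripts and registers the panel of FDR-controlling methods, from which ashq is then dropped. For intuition only, the classical corrections (assuming BH and Bonferroni are included in the panel, as is typical) reduce to base R's p.adjust(), while covariate-aware methods such as IHW additionally use ind_covariate. A hypothetical illustration, not the benchmark code itself:

## For intuition only -- the real methods are registered via initializeBenchDesign().
## Classical corrections applied directly to the same p-values:
q_bh   <- p.adjust(res$pval, method = "BH")
q_bonf <- p.adjust(res$pval, method = "bonferroni")
c(BH = sum(q_bh <= 0.05, na.rm = TRUE),
  bonferroni = sum(q_bonf <= 0.05, na.rm = TRUE))  # rejections at alpha = 0.05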
if (!file.exists(ph_bench_file)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = ph_bench_file)
} else {
    sb <- readRDS(ph_bench_file)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
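The default metrics are computed from the q-value assay. As a quick sanity check, the rejection counts that the plots below summarize can also be pulled directly from that assay (a sketch, assuming every method stores q-values there):

## Sketch: number of OTUs each method rejects at alpha = 0.05, taken straight
## from the "qvalue" assay (rows = OTUs, columns = methods).
colSums(assay(sb, "qvalue") <= 0.05, na.rm = TRUE)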
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use mean_abun_present as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = mean_abun_present)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(ph_bench_file_abun)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = ph_bench_file_abun)
} else {
    sb <- readRDS(ph_bench_file_abun)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use rand_covar as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = rand_covar)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(ph_bench_file_uninf)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = ph_bench_file_uninf)
} else {
    sb <- readRDS(ph_bench_file_uninf)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Next, we'll look at SO4 levels.
if (!file.exists(so4_result_file)) {
    res <- test_microbiome_corr(abundance = abun_otu, meta_col = meta$SO4_mgL,
                                shift = zeroabun)
    saveRDS(res, file = so4_result_file)
} else {
    res <- readRDS(so4_result_file)
}
Add random (uninformative) covariate.
set.seed(27349)
res$rand_covar <- rnorm(nrow(res))
Here we check whether the covariates do indeed look informative.
strat_hist(res, pvalue="pval", covariate="ubiquity", maxy=10, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="ubiquity")
strat_hist(res, pvalue="pval", covariate="mean_abun_present", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="mean_abun_present")
strat_hist(res, pvalue="pval", covariate="rand_covar", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="rand_covar")
Let's use ubiquity as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = ubiquity)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(so4_bench_file)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = so4_bench_file)
} else {
    sb <- readRDS(so4_bench_file)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use mean_abun_present as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = mean_abun_present)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(so4_bench_file_abun)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = so4_bench_file_abun)
} else {
    sb <- readRDS(so4_bench_file_abun)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use rand_covar as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = rand_covar)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(so4_bench_file_uninf)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = so4_bench_file_uninf)
} else {
    sb <- readRDS(so4_bench_file_uninf)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Finally, we'll look at Al levels.
if (!file.exists(al_result_file)) {
    res <- test_microbiome_corr(abundance = abun_otu, meta_col = meta$Al_mgL,
                                shift = zeroabun)
    saveRDS(res, file = al_result_file)
} else {
    res <- readRDS(al_result_file)
}

# remove OTUs with missing pvals
res <- res %>% filter(!is.na(pval))
Add random (uninformative) covariate.
set.seed(19474)
res$rand_covar <- rnorm(nrow(res))
Here we check whether the covariates do indeed look informative.
strat_hist(res, pvalue="pval", covariate="ubiquity", maxy=10, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="ubiquity")
strat_hist(res, pvalue="pval", covariate="mean_abun_present", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="mean_abun_present")
strat_hist(res, pvalue="pval", covariate="rand_covar", maxy=8, binwidth=0.05)
rank_scatter(res, pvalue="pval", covariate="rand_covar")
Let's use ubiquity as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = ubiquity)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(al_bench_file)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = al_bench_file)
} else {
    sb <- readRDS(al_bench_file)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use mean_abun_present as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = mean_abun_present)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(al_bench_file_abun)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = al_bench_file_abun)
} else {
    sb <- readRDS(al_bench_file_abun)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Let's use rand_covar as our ind_covariate.
res <- dplyr::mutate(res, ind_covariate = rand_covar)
First, we'll create an object of BenchDesign
class to hold the data and
add the benchmark methods to the BenchDesign
object. We remove ASH from
the default comparison.
Then, we'll construct the SummarizedBenchmark
object, which will run
the functions specified in each method (these are actually sourced in from the
helper scripts).
if (!file.exists(al_bench_file_uninf)) {
    bd <- initializeBenchDesign()
    bd <- dropBMethod(bd, "ashq")
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = al_bench_file_uninf)
} else {
    sb <- readRDS(al_bench_file_uninf)
}
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, as_fraction = FALSE, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Here we compare the method ranks for the different comparisons at alpha = 0.10.
plotMethodRanks(c(so4_bench_file, so4_bench_file_abun, so4_bench_file_uninf,
                  ph_bench_file, ph_bench_file_abun, ph_bench_file_uninf,
                  al_bench_file, al_bench_file_abun, al_bench_file_uninf),
                colLabels = c("SO4-ubiquity", "SO4-abun", "SO4-uninf",
                              "pH-ubiquity", "pH-abun", "pH-uninf",
                              "Al-ubiquity", "Al-abun", "Al-uninf"),
                alpha = 0.10, xlab = "Comparison")
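plotMethodRanks() is also a helper from ../R. As a rough sketch (not the helper itself), the underlying rejection counts at alpha = 0.10 could be tabulated per method across a few of the saved benchmark objects, assuming they all share the same methods and a "qvalue" assay:

## Rough sketch: rejections at alpha = 0.10 per method, one column per comparison.
bench_files <- c(pH = ph_bench_file, SO4 = so4_bench_file, Al = al_bench_file)
rej_counts <- sapply(bench_files, function(f) {
    sb_i <- readRDS(f)
    colSums(assay(sb_i, "qvalue") <= 0.10, na.rm = TRUE)
})
rej_counts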
sessionInfo()