knitr::opts_knit$set(progress = TRUE, verbose = TRUE) # Global chunk options knitr::opts_chunk$set( cache = TRUE, autodep = TRUE, include = TRUE, echo = TRUE, warning = TRUE, message = FALSE, fig.width = 6, fig.height = 4 )
Run with SAVE_FIGURES = TRUE
to save figures in figures/
.
SAVE_FIGURES = TRUE
Libraries and paths:
library(here) library(tidyverse) library(ggthemes) library(cowplot) library(ggbeeswarm) # Functions for bias estimation and calibration library(metacal); packageVersion("metacal") # This package devtools::load_all(here())
Plot setup:
# Options for `ggplot`: base_theme <- theme_tufte() + theme( text = element_text(size=9, family = ""), strip.text = element_text(size=9, family = ""), legend.position = "none" ) tax_theme <- theme( axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), axis.title.x = element_blank()) update_geom_defaults("point", list(size = 1)) update_geom_defaults("text", list(size = 2.5)) update_geom_defaults("hline", list(color = "grey")) update_geom_defaults("vline", list(color = "grey")) # Color-blind friendly colors from ColorBrewer2 colors.protocol <- c( H = "#1b9e77", W = "#d95f02", Q = "#7570b3", Actual = "black") # Labeller for facets when faceting by protocol labeller.protocol <- labeller( Protocol = function(p) paste("Protocol", p)) labeller.protocol.ref <- labeller( Protocol = function(p) { focal <- str_sub(p, 1, 1) ref <- str_sub(p, 2, 2) paste("Protocol", focal, "/", ref) } )
Sample metadata:
data("costea2017_sample_data") sam <- costea2017_sample_data sam
Expected mock community composition, obtained from the supplementary data (see
data-raw/costea2017.R
)
data("costea2017_mock_composition") mock <- costea2017_mock_composition
The FACS measurements are the "bacterial cells" columns; the measurements without endospores were used as the actual composition, and replicate measurements were averaged (personal communication with Paul Costea);
mock <- mock %>% select(Taxon, FACS = "bacterial cells in spike in Mix") %>% group_by(Taxon) %>% summarize_at(vars(FACS), mean) mock <- mock %>% arrange(desc(FACS)) mock
Note that the order of three species (V. cholerae, C. saccharolyticum, and Y.
pseudotuberculosis) differ from that of Costea2017 Figure 6. We reproduce their
figure below, showing that these species are likely mislabeled in their figure
and not in the mock
table. But a mislabeling would here not affect our main
conclusions.
Load the Metaphlan2 profiles:
data("costea2017_metaphlan2_profiles") profiles <- costea2017_metaphlan2_profiles profiles %>% corner
The samples are named here by their ENA run accession; let's rename to the new sample names.
new_names <- c("Clade", sam$Sample[match(colnames(profiles)[-1], sam$Run_accession)]) colnames(profiles) <- new_names
Two samples, QA and QB, are two extra fecal samples that were only sequenced by protocol Q and so we won't be using for our analysis. We also drop unneeded sample variables.
sam <- sam %>% filter(!(Sample %in% c("QA", "QB"))) %>% select(Sample, Protocol, Individual) profiles <- profiles[, c("Clade", sam$Sample)]
Taxonomy from the metaphlan2 clade names:
tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain") rank_letters <- c("k", "p", "c", "o", "f", "g", "s", "t") tax_pattern <- paste0("(?:", rank_letters, "__(\\w+))?") %>% paste(collapse = "\\|?") tax <- profiles %>% select(Clade) %>% extract(Clade, tax_ranks, regex = tax_pattern, remove = FALSE) tax
Species table with just the Metaphlan species identifiers
st <- profiles %>% left_join(tax, by = "Clade") %>% filter(!is.na(Species), is.na(Strain)) %>% select(-Strain) st
Species names have the form Genus_species
and are unique,
st$Species %>% head st$Species %>% anyDuplicated
The Genus_unclassified
catches cases where more reads were mapped to the
genus's markers than could be accounted for by species within the genus. We
will include these for measuring the family-level compositions but not the
spike-in compositions From here on, we will identify the Clade / Taxon with
this species identifier,
st <- st %>% mutate(Clade = Species) %>% rename(Taxon = Clade)
Next, join the metadata and tidy it into a tall format (one sample-taxon observation per row)
st <- st %>% gather("Sample", "Abundance", sam$Sample) %>% left_join(sam, by = "Sample") %>% select(Taxon, Sample, Abundance, colnames(sam), everything()) st
Metaphlan outputs estimates of the proportions * 100. We will renormalize to proportions summing to 1 instead.
st <- st %>% mutate_by(Sample, Abundance = close_elts(Abundance)) st %>% group_by(Sample) %>% summarize(sum(Abundance)) %>% {summary(.[,2])}
Output from the various kingdoms:
st %>% group_by(Kingdom, Sample) %>% summarize(Abundance = sum(Abundance)) %>% spread(Kingdom, Abundance) %>% print(n=Inf)
There is very little Archaea and Eukaryote abundance, and the vast majority of the viral abundance comes from a Salmonella phage that I suspect is integrated into the genome of the Salmonella enterica strain in the spike-in.
st %>% filter(Kingdom == "Viruses") %>% group_by(Taxon) %>% summarize(Abundance = mean(Abundance)) %>% arrange(desc(Abundance))
Bacteria form the vast majority of estimated non-viral abundance in each sample,
st %>% group_by(Kingdom, Sample) %>% summarize(Abundance = sum(Abundance)) %>% filter(Kingdom != "Viruses") %>% mutate_by(Sample, Abundance = close_elts(Abundance)) %>% filter(Kingdom == "Bacteria") %>% summarize_at("Abundance", list(Min = min, Mean = mean))
We will therefore restrict our analysis of sample composition to just Bacteria, renormalizing the abundances to sum to 1 after filtering.
st <- st %>% filter(Kingdom == "Bacteria") %>% mutate_by(Sample, Abundance = close_elts(Abundance))
Costea et al. report E. coli as a likely lab contaminant of the spike-in based on its appearance in the profiles of the mock-only samples (which they derived using mOTUs rather than Metaphlan2). So let's check what is at appreciable frequency in the mock-only samples that shouldn't be,
tb <- st %>% filter( Individual == "M", Abundance > 0, !(Taxon %in% mock$Taxon) ) %>% group_by(Family, Taxon) %>% summarize( Mean = mean(Abundance), Max = max(Abundance), ) %>% filter(Max > 1e-4) %>% arrange(desc(Mean)) tb
Here we see that most of the contaminant reads appear to be captured by
Shigella_flexneri
identifier. Given the closeness of S. flexnari with E.
coli, the estimated abundance for Escherichia_coli
may also be coming from
the contaminant strain. Since S. flexnari is generally much larger,
st %>% filter(Taxon %in% c("Shigella_flexneri", "Escherichia_coli")) %>% group_by(Individual, Taxon) %>% summarize_at(vars(Abundance), mean) %>% spread(Taxon, Abundance) %>% mutate(Escherichia_coli / Shigella_flexneri)
We will treat Shigella_flexneri
as the lab contaminant in our analysis,
contaminant <- "Shigella_flexneri"
Our analysis of the spike-in abundances below show that the ratio of
Shigella_flexneri
to other spike-in taxa is constant across samples,
indicating that Shigella_flexneri
reads in the fecal samples are primarily
from the contaminant and not native gut strains.
Above, we also see appreciable frequency of Salmonella_unclassified
, which is
due to the genus-level estimated proportion for Salmonella being larger than
the species-level estimates. Because the estimated proportions of
Escherichia_coli
, Escherichia_unclassified
, and Salmonella_unclassified
are all fairly small, they will not substantially affect the family-level
composition plot (Figure 4A). and so we will leave it in as a "native" taxon
for this analysis. However, because these abundances might be due to the
spike-in rather than native gut taxa, we will filter them out before
proceeding.
st0 <- st %>% filter(!(Taxon %in% c("Salmonella_unclassified", "Escherichia_coli", "Escherichia_unclassified"))) %>% mutate_by(Sample, Abundance = close_elts(Abundance))
Classify the taxa by source type:
st0 <- st0 %>% mutate(Source = case_when( Taxon %in% mock$Taxon ~ "Mock", Taxon == contaminant ~ "Contaminant", TRUE ~ "Native" ) )
View the proportions of native, mock, and contaminant for each sample:
tb.source <- st0 %>% group_by(Sample, Protocol, Individual, Source) %>% summarize(Abundance = sum(Abundance)) %>% ungroup() tb.source %>% spread(Source, Abundance) %>% print(n=Inf)
The contaminant is always a fairly small fraction of the total sample. It also generally appears in a consistent ratio with the mock for a given protocol:
tb.source %>% spread(Source, Abundance) %>% ggplot(aes(Sample, Contaminant / Mock, color = Protocol, label = Individual)) + scale_color_manual(values = colors.protocol) + geom_text() + coord_flip()
In samples Q2 and Q3, the spike-in makes up an unusually low fraction of the total (see table above), so the larger ratio in these samples may be due to chance. These observations suggest that the contaminant is being correctly identified.
Composition by source type:
p.source <- ggplot(tb.source, aes(y = Abundance, x = Individual, # x = factor(Individual, lvls.mock_prop), fill = Source)) + geom_bar(stat = "identity") + scale_y_continuous(breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) + geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), aes(y = y), sides = "l", inherit.aes = FALSE) + scale_fill_brewer(type = "qual") + facet_grid(. ~ Protocol, labeller = labeller.protocol) + labs(x = "Specimen", fill = "Taxon type", title = "Composition of different bacterial types") + base_theme + theme(legend.position = "right") p.source
Get the native taxa agglomerated to the Family level,
nat <- st0 %>% filter(Source == "Native") %>% group_by_at(vars(Sample, Protocol, Individual, Kingdom:Family)) %>% summarize(Abundance = sum(Abundance)) %>% ungroup
Let's restrict to Families within a minimal abundance to simplify the figure. First, lets check the prevalence of various families,
prev <- nat %>% filter(Individual != "M") %>% mutate_by(Sample, Proportion = close_elts(Abundance)) %>% group_by(Family) %>% summarize(Prev = sum(Proportion > 0), Min = min(Proportion), Median = median(Proportion), Mean = mean(Proportion), Max = max(Proportion)) prev %>% arrange(desc(Max)) %>% print(n=30)
Let's use the families with a proportion of at least 2%.
families <- prev %>% filter(Max > 0.02) %>% {.$Family} nat0 <- nat %>% filter(Family %in% families) %>% arrange(Phylum, Family)
Create a color scale for Family that distinguishes Phylum by hue.
# Color tibble ctb <- nat0 %>% select(Phylum, Family) %>% distinct %>% arrange(Phylum, Family) %>% group_by(Phylum) %>% nest(Family) %>% rowwise() %>% mutate(Families = list(unlist(data)), Num = length(Families)) ctb <- ctb %>% arrange(desc(Num)) %>% add_column(Palette = c("Blues", "Greens", "Reds", "Oranges")) ctb <- ctb %>% rowwise() %>% mutate(Colors = list(RColorBrewer::brewer.pal(Num, name = Palette))) last_n <- function(x, n) { len <- length(x) x[seq(len - n + 1, len)] } ctb <- ctb %>% rowwise() %>% mutate(Colors = list(last_n(Colors, Num))) ctb <- ctb %>% select(Phylum, Families, Colors) %>% unnest() %>% arrange(Phylum, Families) colors.families <- ctb$Colors names(colors.families) <- ctb$Families # RColorBrewer::display.brewer.all() # RColorBrewer::brewer.pal(5, name = "Blues")
We also want the mock and contaminant fractions in the final figure
comp <- bind_rows( nat0, tb.source %>% filter(Source != "Native") %>% rename(Family = Source) ) comp %>% filter(Sample == "W1") %>% select(Family, Abundance) %>% arrange(desc(Abundance)) comp %>% group_by(Sample) %>% summarize(sum= sum(Abundance)) %>% {min(.$sum)} colors.source <- c("#A9A9A9", "#D3D3D3") names(colors.source) <- c("Contaminant", "Mock") colors.comp <- c(colors.source, colors.families)
p.comp <- ggplot(comp, aes(x = factor(Individual, as.character(c(1:8, "M"))), y = Abundance, fill = factor(Family, names(colors.comp)))) + geom_bar(stat = "identity") + scale_y_continuous(limits=c(0,1), breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) + geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), aes(y = y), sides = "l", inherit.aes = FALSE) + scale_fill_manual(values = colors.comp) + facet_grid(. ~ Protocol, labeller = labeller.protocol) + labs(x = "Specimen", y = "Proportion", title = paste0("Composition of all bacteria") ) + base_theme + theme(legend.position = "right", legend.key.size = unit(0.11, "in"), legend.margin = margin(5,5,5,-5)) + # theme(legend.position = c(1, 0.5), legend.justification = c(1, 0.5), # legend.key.size = unit(0.11, "in"), # ) + labs(fill = "Source or\nFamily (Native)") p.comp
Next, we want to visualize relative abundances vs. actual composition (bias) of the mock spike-in (including the contaminant). Let's first confirm that all taxa were detected in all samples:
st0 %>% filter(Taxon %in% c(mock$Taxon, contaminant)) %>% {all(.$Abundance > 0)}
Now let's restrict to just the spike-in taxa. Since the 10 mock species appear
in all samples with the same known relative abundances, it is convenient to
normalize the abundances of all taxa to the geometric mean of the 10 mock
species (called Denom
below).
sts <- st0 %>% filter(Taxon %in% c(mock$Taxon, contaminant)) %>% rename(Observed = Abundance) %>% mutate(Taxon = ifelse(Taxon == contaminant, "Contaminant", Taxon)) denom <- sts %>% filter(Taxon %in% mock$Taxon) %>% group_by(Sample) %>% summarize(Denom = gm_mean(Observed)) sts <- sts %>% left_join(denom, by = c("Sample")) %>% mutate(Observed = Observed / Denom)
Next, add the actual relative abundances as determined by the FACS measurement.
mock0 <- mock %>% rename(Actual = FACS) %>% mutate(Actual = center_elts(Actual)) sts <- sts %>% left_join(mock0, by = "Taxon") %>% mutate(Error = Observed / Actual)
Also grab a version with just the mock taxa
stm <- sts %>% filter(Taxon %in% mock$Taxon)
Plot the observed and actual relative abundances.
p.step <- ggplot(sts, aes(x = factor(Taxon, c(mock$Taxon, "Contaminant")), y = Observed, color = Protocol, shape = (Individual == "M"))) + geom_quasirandom() + geom_rangeframe(color = "black", sides = "l") + scale_y_log10(labels = log_formatter) + scale_x_discrete(labels = tax_labeller) + scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) + scale_color_manual(values = colors.protocol) + coord_cartesian(clip = "off") + labs(title = "Composition of the spike-in", # y = "Abundance / gm. mean of non-contaminants", y = "Abundance / gm. mean", shape = "Sample type") + base_theme + tax_theme + theme(legend.position = "right", legend.margin = margin(5,0,5,-5)) # Add the actual abundances (as a stair plot like in Costea2017's Figure 6) # underneath the data points tb.step <- mock0 %>% mutate(x1 = rank(desc(Actual)) - 0.5, x2 = x1 + 0.5, x3 = ifelse(x2 == 10, 10.5, NA)) %>% gather("loc", "x", x1, x2, x3) %>% filter(!is.na(x)) %>% arrange(x) p.step$layers <- c( geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "Actual"), size = 0.3, color = "black", inherit.aes = FALSE), p.step$layers) # p.step <- p.step + # annotate("text", x = 10.6, y = 0.021, label = "Actual", # color = "black", hjust = 0) p.step <- p.step + guides( color = guide_legend(order = 1), shape = guide_legend(order = 2), linetype = guide_legend(order = 3)) p.step
plot_grid(p.comp, p.step, labels = c("A", "B"))
ggsave(here("figures", "costea-main.pdf"), width = 8, height = 3.5, units = "in", useDingbats = FALSE)
Estimate the bias within each protocol as the center (or compositional mean) of the observed composition errors. Because all samples have all taxa, we can do this very straightforwardly by taking geometric means:
bias <- stm %>% group_by(Protocol, Taxon) %>% summarize(Bias = gm_mean(Error)) %>% mutate(Bias = center_elts(Bias)) %>% ungroup bias %>% spread(Protocol, Bias)
The call to center_elts
normalizes the efficiencies within each protocol to
have a geometric mean of 1,
bias %>% group_by(Protocol) %>% summarize(gm_mean(Bias))
The elements of the Bias
column can be interpretted as the detection
efficiency relative to the average taxon for that protocol.
To confirm that the estimation worked, let's calibrate the relative abundances from the previous figure:
p.step.cal <- p.step p.step.cal$data <- p.step.cal$data %>% left_join(bias, by = c("Protocol", "Taxon")) %>% mutate(Observed = Observed / Bias) p.step.cal
Join the bias w/ the abundance table and compute compositional residuals.
stm <- stm %>% left_join(bias, by = c("Protocol", "Taxon")) %>% mutate( Residual = Error / Bias, Predicted = Actual * Bias ) # Note, `Residual` is already geometrically centered.
Let's also check that the bias predictions predict the taxon ratios:
gvs <- c("Protocol", "Individual", "Sample") ratios <- stm %>% mutate(Taxon = tax_abbrev(Taxon)) %>% compute_ratios(group_vars = gvs) %>% mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":")) ratios.pred <- ratios %>% select(Taxon.x, Taxon.y, Pair, Protocol, Actual, Bias, Predicted) %>% distinct
ggplot(ratios, aes(Pair, Error, label = Individual, color = Protocol)) + geom_point(data = ratios.pred, aes(Pair, Bias), inherit.aes = FALSE, shape = 3, size = 4, color = "black") + geom_text(position = position_jitter(width=0.25, height=0)) + geom_hline(yintercept = 1) + facet_grid(Protocol ~ ., labeller = labeller.protocol) + geom_rangeframe(sides = "l", color= "black") + scale_color_manual(values = colors.protocol) + scale_y_log10() + scale_x_discrete(labels = tax_labeller) + base_theme + tax_theme
Distribution of efficiencies for the three protocols:
ggplot(bias, aes(Bias, fill = Protocol)) + geom_dotplot() + scale_x_log10() + facet_grid(Protocol ~ ., labeller = labeller.protocol) + scale_fill_manual(values = colors.protocol) + scale_y_continuous(breaks = NULL) + geom_rangeframe() + base_theme
Let's also estimate the differential bias (including the contaminant):
dbias <- sts %>% select(Taxon, Observed, Protocol, Individual) %>% spread(Protocol, Observed) %>% group_by(Taxon) %>% summarize( HQ = gm_mean(H / Q), HW = gm_mean(H / W), QW = gm_mean(Q / W)) dbias <- dbias %>% gather("Protocol", "Bias", -Taxon) dbias %>% spread(Protocol, Bias)
For consistency we keep the denominator as just the 10 mock (non-contaminant) species.
What fraction of the compositional error (measured by squared Aitchison distance) is explained by our model for each protocol?
err <- stm %>% group_by(Protocol, Sample) %>% summarize( Error2 = anorm(Error)^2, Bias2 = anorm(Bias)^2, Resid2 = anorm(Error / Bias)^2 ) %>% summarize_at(vars(-Sample), mean) err <- err %>% mutate(Frac_Bias = Bias2 / Error2, Frac_Resid = Resid2 / Error2) err
Residuals, with the mock-only samples marked:
stm %>% mutate(Taxon = factor(Taxon, mock$Taxon)) %>% ggplot(aes(x = Taxon, y = Residual, shape = Individual == "M", color = Protocol)) + geom_quasirandom() + geom_hline(yintercept = 1) + geom_rangeframe(sides = "l", color = "black") + scale_y_log10() + scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) + scale_color_manual(values = colors.protocol) + facet_grid(Protocol ~ ., scales = "fixed") + base_theme + tax_theme
Note that the residuals are not independent across taxa; the high Residuals for L. plantarum and B. hansenii for protocol H are pushing down the others.
Get the bias estimates w/ estimated geometric standard errors -
reps <- stm %>% group_by(Protocol) %>% nest %>% mutate( Error_matrix = map(data, build_matrix, "Sample", "Taxon", "Error"), Bootreps = map(Error_matrix, bootrep_center, R = 4e3, method = "gm")) %>% unnest(Bootreps) %>% rename(Bias = Center) stats <- reps %>% group_by(Protocol, Taxon) %>% summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>% ungroup bias <- left_join(bias, stats, by = c("Protocol", "Taxon"))
As expected, the geometric means of the resamplings approximate the point estimate.
all.equal(bias$Bias, bias$gm_mean) bias <- bias %>% select(-gm_mean)
Create the SI figure showing the bias estimate with two standard errors:
r <- range(stm$Observed / stm$Actual) p1 <- ggplot(bias, aes(factor(Taxon, mock$Taxon), Bias, color = Protocol)) + geom_hline(yintercept = 1) + geom_quasirandom(data = stm, aes(y = Observed / Actual), alpha = 0.4) + geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2), fatten = 1.5) + scale_y_log10() + scale_color_manual(values = colors.protocol) + facet_wrap(~ Protocol, labeller = labeller.protocol) + geom_rangeframe(sides = "l", color = "black") + scale_y_log10(limits = r, labels = log_formatter) + scale_x_discrete(labels = tax_labeller) + labs(y = "Efficiency / gm. mean") + base_theme + tax_theme + theme(panel.spacing = unit(1, "lines")) p1
ggsave(here("figures", "costea-bias.pdf"), width = 9, height = 4.2, units = "in", useDingbats = FALSE)
Do this for differential bias as well. Note setting the denominator to just the mock taxa.
dreps <- sts %>% select(Taxon, Observed, Protocol, Individual) %>% spread(Protocol, Observed) %>% transmute( Taxon, Individual, HQ = H / Q, HW = H / W, QW = Q / W ) %>% gather("Protocol", "Error", -Taxon, -Individual) %>% group_by(Protocol) %>% nest %>% mutate( Error_matrix = map(data, build_matrix, "Individual", "Taxon", "Error"), Bootreps = map(Error_matrix, bootrep_center, R = 4e3, method = "gm", denom = mock$Taxon)) %>% unnest(Bootreps) %>% rename(Bias = Center) dstats <- dreps %>% group_by(Protocol, Taxon) %>% summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>% ungroup dbias <- left_join(dbias, dstats, by = c("Protocol", "Taxon")) all.equal(dbias$Bias, dbias$gm_mean) dbias <- dbias %>% select(-gm_mean)
p2 <- ggplot(dbias, aes(factor(Taxon, c(mock$Taxon, "Contaminant")), Bias)) + geom_hline(yintercept = 1) + # geom_quasirandom(data = sts, aes(y = Observed / Actual), alpha = 0.5) + geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2), fatten = 1.5) + scale_y_log10() + # scale_color_manual(values = colors.protocol) + facet_wrap(~ Protocol, labeller = labeller.protocol.ref) + geom_rangeframe(sides = "l", color = "black") + scale_y_log10(limits = r, labels = log_formatter) + scale_x_discrete(labels = tax_labeller) + labs(y = "Efficiency / gm. mean") + base_theme + tax_theme + theme(panel.spacing = unit(1, "lines")) p2
Here we wish to see how the noise in the bias estimate decreases with the number of control samples used. To do so, we will perform multinomial resampling of N control samples, for each N between 1 and 9 (the total number of samples). For a given N, the standard deviation of the bias estimate across resamples approximates the standard error in the bias estimate when N control samples are measured. We'll use Protocol H, which has an intermediate level of noise.
First, estimate the standard errors for each N:
mat <- stm %>% filter(Protocol == "H") %>% build_matrix(Sample, Taxon, Error) N <- 1:9 names(N) <- N reps.H <- map_dfr(N, ~bootrep_center(mat, R = 4e3, dist = "multinomial", N = ., method = "gm"), .id = "N") %>% rename(Bias = Center) reps.H.summary <- reps.H %>% group_by(N, Taxon) %>% summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>% mutate(Protocol = "H")
Then view how the dispersion decreases with N:
lvls.color <- reps.H.summary %>% filter(N == 3) %>% arrange(-gm_se) %>% .$Taxon values.color <- scales::seq_gradient_pal("black", "grey")(seq(0, 1, length.out = 10)) names(values.color) <- lvls.color r <- range(reps.H.summary$gm_se) names(r) <- r %>% round(2) p.se <- ggplot(reps.H.summary, aes(N, gm_se, color = fct_reorder(Taxon, -gm_se))) + geom_line(aes(group = Taxon)) + geom_point() + scale_y_continuous(limits = range(c(1, r)), ) + base_theme + scale_color_manual(values = values.color, labels = tax_labeller) + geom_rangeframe(color = "black") + theme(legend.position = "right", legend.margin = margin(5,10,5,-5)) + labs(x = "Number of controls", y = "Geometric standard error", title = "Standard error vs. number of control samples", color = "Taxon") p.se
The relative abundance plot provides useful context for understanding why the standard errors are much higher for two taxa:
pd <- filter(stm, Protocol == "H") pd0 <- pd %>% filter(Individual %in% c("5", "M")) p.step.H <- p.step %+% pd %+% aes(x = factor(Taxon, mock$Taxon), y = Observed, color = factor(Taxon, lvls.color), shape = NULL) + ggrepel::geom_text_repel(data = pd0, aes(label = Individual), direction = "x", size = 2, color = "black") + scale_color_manual(values = values.color, labels = tax_labeller) + labs(title = "Observed compositions", y = "Abundance / gm. mean", shape = "Sample type") + theme(legend.position = "none") # p.step.H
plot_grid(p.se, p.step.H, nrow = 1, labels = c("A", "B"), rel_widths = c(1.3, 1))
ggsave(here("figures", "costea-dispersion.pdf"), width = 8, height = 3.5, units = "in", useDingbats = FALSE)
Compute pairwise error summary statistics:
avg_errors <- ratios %>% # Take the geometric absolute values of the errors mutate_at(vars(Error, Bias, Residual), gm_abs) %>% # Average the errors for each pair of different taxa filter(Taxon.x != Taxon.y) %>% group_by(Protocol, Taxon.x, Taxon.y) %>% summarize_at(vars(Error, Bias, Residual), gm_mean) %>% # Average over pairs of taxa for each protocol group_by(Protocol) %>% summarize_at(vars(Error, Bias, Residual), gm_mean) %>% rename_at(vars(Error, Bias, Residual), paste0, ".avg") max_errors <- ratios %>% # Take the geometric absolute values of the errors mutate_at(vars(Error, Bias, Residual), gm_abs) %>% # Find the max fold error in a ratio for each protocol group_by(Protocol) %>% summarize_at(vars(Error, Bias, Residual), gm_range) %>% rename_at(vars(Error, Bias, Residual), paste0, ".max") pw_summary <- bind_cols(avg_errors, max_errors) %>% select(Protocol, Bias.max, Bias.avg, Residual.avg) pw_summary
Get ratios for the ratio-based statistics as in ratios
, but for the
differential error between protocols.
tb <- stm %>% select(Taxon, Observed, Protocol, Individual) %>% spread(Protocol, Observed) %>% transmute( Taxon, Individual, HQ = H / Q, HW = H / W, QW = Q / W ) %>% gather("Protocol", "Error", -Taxon, -Individual) %>% left_join(dbias, by = c("Protocol", "Taxon")) %>% select(-gm_se) %>% mutate(Residual = Error / Bias) gvs <- c("Protocol", "Individual") d_ratios <- tb %>% compute_ratios(group_vars = gvs) %>% mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))
Compute pairwise error summary statistics:
d_avg_errors <- d_ratios %>% # Take the geometric absolute values of the errors mutate_at(vars(Error, Bias, Residual), gm_abs) %>% # Average the errors for each pair of different taxa filter(Taxon.x != Taxon.y) %>% group_by(Protocol, Taxon.x, Taxon.y) %>% summarize_at(vars(Error, Bias, Residual), gm_mean) %>% # Average over pairs of taxa for each protocol group_by(Protocol) %>% summarize_at(vars(Error, Bias, Residual), gm_mean) %>% rename_at(vars(Error, Bias, Residual), paste0, ".avg") d_max_errors <- d_ratios %>% # Take the geometric absolute values of the errors mutate_at(vars(Error, Bias, Residual), gm_abs) %>% # Find the max fold error in a ratio for each protocol group_by(Protocol) %>% summarize_at(vars(Error, Bias, Residual), gm_range) %>% rename_at(vars(Error, Bias, Residual), paste0, ".max") d_pw_summary <- bind_cols(d_avg_errors, d_max_errors) %>% select(Protocol, Bias.max, Bias.avg, Residual.avg) d_pw_summary
# Function to create string for formatting numbers with glue::glue() fmt_string <- function(var, digits) { paste0("{format(", var, ", digits = ", digits, ")}") } bias_tab <- bind_rows(bias, dbias) %>% select(-gm_se) %>% mutate(Bias = glue::glue(fmt_string("Bias", 1))) %>% mutate_all(as.character) %>% spread(Protocol, Bias) %>% mutate_all(~ifelse(is.na(.), "---", .)) pw_summary_tab <- bind_rows(pw_summary, d_pw_summary) %>% mutate( Bias.max = glue::glue(fmt_string("Bias.max", 1)), Bias.avg = glue::glue(fmt_string("Bias.avg", 2)), Residual.avg = glue::glue(fmt_string("Residual.avg", 2)) ) %>% mutate_all(as.character) %>% gather("Taxon", "Value", -Protocol) %>% spread(Protocol, Value) lvls.taxon <- c(mock$Taxon, "Contaminant", "Bias.max", "Bias.avg", "Residual.avg") lbls.taxon <- c(mock$Taxon, "Contaminant", "Max pairwise bias", "Avg. pairwise bias", "Avg. pairwise noise") bias_tab0 <- bind_rows(bias_tab, pw_summary_tab) %>% select(Taxon, "H", "Q", "W", "HQ", "HW", "QW") %>% mutate( Taxon = factor(Taxon, lvls.taxon, lbls.taxon), ) %>% arrange(Taxon) bias_tab0
Latex version for the manuscript:
tex <- bias_tab0 %>% rename_at(vars(HQ, HW, QW), ~ {str_sub(., 2, 1) <- "/"; .}) %>% mutate( Taxon = kableExtra::cell_spec( str_replace(Taxon, "_", " "), "latex", italic = Taxon %in% mock$Taxon) ) %>% knitr::kable(format="latex", booktabs = TRUE, linesep = "", escape = FALSE, align = c("l", rep("r", 6))) %>% kableExtra::add_header_above(c(" ", "Protocol" = 3, "Protocol/Reference" = 3)) %>% kableExtra::row_spec(length(mock$Taxon) + 1, extra_latex_after = "\\midrule") # tex # clipr::write_clip(tex)
Choose Individuals (specimens) to use for estimating bias.
set.seed(20190201) individuals <- as.character(1:8) estimation_set <- base::sample(individuals, 3, replace = FALSE) stm0 <- stm %>% select(-Bias, -Residual) %>% mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval")) sam <- sam %>% mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))
Estimate bias from the estimation samples:
bias_est <- stm0 %>% filter(Set == "Est") %>% group_by(Protocol, Taxon) %>% summarize(Bias = gm_mean(Error)) %>% mutate(Bias = center_elts(Bias)) %>% ungroup
Compare the bias estimated from the estimation set and that from all specimens:
bind_rows(list(All = bias, Est = bias_est), .id = "Specimens") %>% ggplot(aes(Taxon, Bias, color = Protocol, shape = Specimens)) + geom_quasirandom() + scale_y_log10() + scale_shape_manual(breaks = c("All", "Est"), values = c(16, 1)) + scale_color_manual(values = colors.protocol) + coord_flip()
Bias is consistent with the estimate using all specimens, though this would vary for the higher-variance protocols H and Q for some estimation sets.
Also estimate the differential bias to protocol W. First, add a column with the "Reference" measurement; then repeat bias estimation with the reference measurement instead of the Actual abundance.
stmW <- stm0 %>% select(Taxon, Protocol, Individual, Set, Observed) %>% spread(Protocol, Observed) %>% mutate(Actual = W) %>% gather("Protocol", "Observed", H, Q, W) %>% mutate(Error = Observed / Actual) biasW_est <- stmW %>% filter(Set == "Est") %>% group_by(Protocol, Taxon) %>% summarize(BiasW = gm_mean(Error)) %>% mutate(BiasW = center_elts(BiasW)) biasW_est %>% spread(Protocol, BiasW)
Calibrate all samples
stm0 <- stm0 %>% left_join(bias_est, by = c("Protocol", "Taxon")) %>% left_join(biasW_est, by = c("Protocol", "Taxon")) %>% mutate_by(Sample, Calibrated_to_Actual = close_elts(Observed / Bias), Calibrated_to_W = close_elts(Observed / BiasW) )
Prep for the PCA: we first need to Clr transform each relative abundance vector.
stm1 <- stm0 %>% gather("Type", "Abundance", Observed, Actual, Calibrated_to_Actual, Calibrated_to_W) %>% select(Taxon:Individual, Set:Abundance) %>% group_by(Sample, Type) %>% mutate(Clr = clr(Abundance)) %>% ungroup
Get a matrix with samples as rows to use for the PCA.
temp <- stm1 %>% unite("Sample_Type", Sample, Type, sep = ":") %>% select(Sample_Type, Taxon, Clr) %>% spread(Taxon, Clr) mat <- temp %>% select(-Sample_Type) %>% as("matrix") rownames(mat) <- temp$Sample_Type corner(mat)
Note that each sample (protocol + individual) appears four times, as Observed, Actual, Calibrated to Actual, and Calibrated to W, and currently I'm just doing a simple PCA on all of these, since our goal is mainly to illustrate the effect of calibration rather than make quantitative claims about variation. Next we run the PCA and make a tibble for plotting,
pcx <- prcomp(mat) tb <- as_tibble(pcx$x[,c(1,2)], rownames='Sample_Type') %>% separate(Sample_Type, c("Sample", "Type"), sep = ":") %>% left_join(sam, by = "Sample") tb <- tb %>% mutate(PC1 = -PC1, PC2 = PC2)
Nearly 90% of the variance is covered by the first two PCs, with about 5.2x as much variation on PC1 than PC2:
fracvar <- pcx$sdev^2/sum(pcx$sdev^2) round(fracvar, 3) fracvar[1] / fracvar[2] percvar <- round(100 * fracvar, 0) strvar <- paste0("PC", seq(fracvar), " [", percvar, "%]")
PCA figure that will form the right half of the main text figure:
facet_labeller <- function (tb) { tb %>% mutate_all(str_replace_all, "_", " ") %>% mutate_all(str_replace, " ", "\n") } type.lvls <- c("Observed", "Calibrated_to_Actual", "Calibrated_to_W") actual <- tb %>% filter(Type == "Actual", Sample == "H1") %>% select(PC1, PC2) %>% expand(Type = factor(type.lvls, type.lvls), PC1, PC2) tb0 <- tb %>% filter(Type != "Actual") %>% mutate(Type = factor(Type, type.lvls)) tb.breaks <- tribble( ~PC1, ~PC2, # min(tb0$PC1), min(tb0$PC2), # actual$PC1[1], actual$PC2[1], # max(tb0$PC1), max(tb0$PC2), ) tb.range <- tribble( ~PC1, ~PC2, min(tb0$PC1), min(tb0$PC2), max(tb0$PC1), max(tb0$PC2), ) p.pca <- ggplot(tb0, aes(PC1, PC2, color = Protocol, shape = Set)) + geom_point(data = actual, aes(PC1, PC2), shape = 3, size = 4, color = "black") + geom_point() + facet_grid(Type ~ ., labeller = facet_labeller) + geom_rangeframe(data = tb.range, aes(PC1, PC2), color = "black", inherit.aes = FALSE) + scale_x_continuous(breaks = tb.breaks$PC1, labels = NULL) + scale_y_continuous(breaks = tb.breaks$PC2, labels = NULL) + scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) + scale_color_manual(values = colors.protocol) + base_theme + theme(strip.text.y = element_text(angle=0, size = 9)) + # theme(axis.line = element_line()) + labs(title = "Sample ordination", x = strvar[1], y = strvar[2]) # Label the protocols in the "Observed" plot labtb <- tribble( ~x, ~y, ~Protocol, -4, 1.5, "H", -0.5, 1.3, "Q", -1.85, -0.4, "W", 2.3, 0.7, "Actual" ) %>% mutate(Type = factor("Observed", type.lvls)) p.pca <- p.pca + geom_text(data = labtb, aes(x, y, label = Protocol, color = Protocol), size = 3, inherit.aes = FALSE, show.legend = FALSE) p.pca
Figure showing the relative abundances, which will form the left half of the main text figure:
stm1 <- stm1 %>% group_by(Sample, Type) %>% mutate(Abundance = center_elts(Abundance)) %>% ungroup tbrf <- tibble(Abundance = range(stm1$Abundance)) p.step.cal <- stm1 %>% filter(Type %in% type.lvls) %>% mutate(Taxon = factor(Taxon, mock$Taxon), Type = factor(Type, type.lvls)) %>% ggplot(aes(x = Taxon, y = Abundance, color = Protocol, shape = Set)) + geom_quasirandom() + geom_rangeframe(data = tbrf, aes(y = Abundance), color = "black", sides = "l", inherit.aes = FALSE) + scale_color_manual(values = colors.protocol) + scale_y_log10(labels = log_formatter) + scale_x_discrete(labels = tax_labeller) + coord_cartesian(clip = "off") + labs(title = "Taxon relative abundances", y = "Abundance / gm. mean") + facet_grid(Type ~ .) + scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) + scale_color_manual(values = colors.protocol) + base_theme + tax_theme + theme(strip.text.y = element_blank()) # Add the actual abundances (as a stair plot like in Costea2017's Figure 6) # underneath the data points tb.step.cal <- tb.step %>% expand(Type = factor(type.lvls, type.lvls), nesting(x, Actual)) p.step.cal$layers <- c( geom_step(data = tb.step.cal, aes(x = x, y = Actual), size = 0.3, color = "black", inherit.aes = FALSE), p.step.cal$layers) # p.step.cal
plot_grid(p.step.cal + theme(plot.margin = margin(5,20,5,5)), p.pca + theme(legend.position = "bottom", legend.box = "vertical", legend.margin = margin(), legend.spacing = unit(0, "in"), ), rel_widths = c(1, 1.1) )
# ggsave(here("figures", "costea-calibration.pdf"), # width = 6.5, height = 6.5, units = "in", useDingbats = FALSE) ggsave(here("figures", "costea-calibration.pdf"), width = 8, height = 8, units = "in", useDingbats = FALSE)
Get tree for the spike-in taxa:
data(costea2017_metaphlan2_tree) costea2017_metaphlan2_tree$tip.label %>% head tree <- costea2017_metaphlan2_tree tree$tip.label <- tree$tip.label %>% str_extract("(?<=s__).+") spikein_taxa <- c(mock$Taxon, contaminant) tree <- ape::keep.tip(tree, spikein_taxa) tree
Plot tree along with differential bias to Protocol W. Bias is shown relative to the 10 mock taxa, with 2 geometric standard errors.
gt <- ggtree::ggtree(tree, layout = "rectangular") + theme(plot.margin = margin(5, 0, 5, 5)) gtb <- dbias %>% mutate(Taxon = str_replace(Taxon, "Contaminant", contaminant)) %>% left_join(gt$data, by = c("Taxon" = "label")) %>% filter(Protocol %in% c("HW", "QW")) %>% mutate_at("Protocol", str_sub, 1, 1)
pr <- gtb %>% ggplot(aes(Bias, fct_reorder(Taxon, y), color = Protocol)) + # geom_point() + geom_vline(xintercept = 1) + ggstance::geom_pointrangeh(aes(xmin = Bias / gm_se^2, xmax = Bias * gm_se^2), fatten = 1.0, position = position_jitter(width = 0, height = 0.1) ) + scale_color_manual(values = colors.protocol) + base_theme + scale_y_discrete(labels = tax_labeller) + scale_x_log10(labels = log_formatter) + labs(y = "", x = "Differential efficiency rel. to Protocol W") + theme(legend.position = "right", plot.margin = margin(5, 5, 5, 0), axis.text.y = element_text(size=9, family = "") )
plot_grid(gt, pr, align = "h", rel_widths = c(0.6, 1))
ggsave(here("figures", "costea-tree.png"), width = 6, height = 4, units = "in", dpi=300) ggsave(here("figures", "costea-tree.pdf"), width = 6, height = 4, units = "in", useDingbats = FALSE)
mock.gathered <- costea2017_mock_composition %>% select(Taxon, FACS = "bacterial cells in spike in Mix", OD = "Bacterial OD in sample", ) %>% group_by(Taxon) %>% summarize_at(vars(FACS, OD), mean) %>% gather("Protocol", "Observed", -Taxon) %>% mutate_by(Protocol, Observed = center_elts(Observed)) stm %>% group_by(Protocol, Taxon) %>% summarize(Observed = gm_mean(Observed)) %>% ungroup %>% bind_rows(mock.gathered) %>% filter(Protocol != "FACS") %>% mutate( Taxon = factor(Taxon, mock$Taxon), Protocol = factor(Protocol, c("W", "Q", "H", "OD")) ) %>% ggplot(aes(Taxon, Observed, fill = Protocol)) + geom_col(position = "dodge") + geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "FACS"), size = 0.3, color = "black", inherit.aes = FALSE) + scale_fill_manual(values = c(colors.protocol, OD = "grey")) + scale_y_log10() + base_theme + tax_theme
The abundances don't match exactly. This is expected, given that Costea et al. used mOTU to measure abundances. However, the general pattern is highly consistent with their figure, including the OD measurements, except for a swapping in the taxon labels on the x-axis for the three species V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis. This strongly suggests that the mock species are correctly labeled here and are labeled incorrectly in their Figure 6.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.