R setup

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
# Global chunk options
knitr::opts_chunk$set(
    cache = TRUE, autodep = TRUE,
    include = TRUE, echo = TRUE,
    warning = TRUE, message = FALSE, 
    fig.width = 6, fig.height = 4
)

Run with SAVE_FIGURES = TRUE to save figures in figures/.

SAVE_FIGURES = TRUE

Libraries and paths:

library(here)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")
# This package
devtools::load_all(here())

Plot setup:

# Options for `ggplot`:
base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        strip.text = element_text(size=9, family = ""),
        legend.position = "none"
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())
update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("vline", list(color = "grey"))
# Color-blind friendly colors from ColorBrewer2
colors.protocol <- c(
    H = "#1b9e77", 
    W = "#d95f02", 
    Q = "#7570b3", 
    Actual = "black")
# Labeller for facets when faceting by protocol
labeller.protocol <- labeller(
    Protocol = function(p) paste("Protocol", p))
labeller.protocol.ref <- labeller(
    Protocol = function(p) {
        focal <- str_sub(p, 1, 1)
        ref <- str_sub(p, 2, 2)
        paste("Protocol", focal, "/", ref)
    }
)

Data setup

Sample metadata:

data("costea2017_sample_data")
sam <- costea2017_sample_data
sam

Expected mock community composition, obtained from the supplementary data (see data-raw/costea2017.R)

data("costea2017_mock_composition")
mock <- costea2017_mock_composition

The FACS measurements are the "bacterial cells" columns; the measurements without endospores were used as the actual composition, and replicate measurements were averaged (personal communication with Paul Costea);

mock <- mock %>%
    select(Taxon, FACS = "bacterial cells in spike in Mix") %>%
    group_by(Taxon) %>%
    summarize_at(vars(FACS), mean)
mock <- mock %>%
    arrange(desc(FACS))
mock

Note that the order of three species (V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis) differ from that of Costea2017 Figure 6. We reproduce their figure below, showing that these species are likely mislabeled in their figure and not in the mock table. But a mislabeling would here not affect our main conclusions.

Load the Metaphlan2 profiles:

data("costea2017_metaphlan2_profiles")
profiles <- costea2017_metaphlan2_profiles
profiles %>% corner

The samples are named here by their ENA run accession; let's rename to the new sample names.

new_names <- c("Clade", 
    sam$Sample[match(colnames(profiles)[-1], sam$Run_accession)])
colnames(profiles) <- new_names

Two samples, QA and QB, are two extra fecal samples that were only sequenced by protocol Q and so we won't be using for our analysis. We also drop unneeded sample variables.

sam <- sam %>%
    filter(!(Sample %in% c("QA", "QB"))) %>%
    select(Sample, Protocol, Individual)
profiles <- profiles[, c("Clade", sam$Sample)]

Taxonomy from the metaphlan2 clade names:

tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
    "Species", "Strain")
rank_letters <- c("k", "p", "c", "o", "f", "g", "s", "t")
tax_pattern  <- paste0("(?:", rank_letters, "__(\\w+))?") %>%
    paste(collapse = "\\|?")
tax <- profiles %>%
    select(Clade) %>%
    extract(Clade, tax_ranks, regex = tax_pattern, remove = FALSE)
tax

Species table with just the Metaphlan species identifiers

st <- profiles %>%
    left_join(tax, by = "Clade") %>%
    filter(!is.na(Species), is.na(Strain)) %>%
    select(-Strain)
st

Species names have the form Genus_species and are unique,

st$Species %>% head
st$Species %>% anyDuplicated

The Genus_unclassified catches cases where more reads were mapped to the genus's markers than could be accounted for by species within the genus. We will include these for measuring the family-level compositions but not the spike-in compositions From here on, we will identify the Clade / Taxon with this species identifier,

st <- st %>%
    mutate(Clade = Species) %>%
    rename(Taxon = Clade)

Next, join the metadata and tidy it into a tall format (one sample-taxon observation per row)

st <- st %>%
    gather("Sample", "Abundance", sam$Sample) %>%
    left_join(sam, by = "Sample") %>%
    select(Taxon, Sample, Abundance, colnames(sam), everything())
st

Metaphlan outputs estimates of the proportions * 100. We will renormalize to proportions summing to 1 instead.

st <- st %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))
st %>%
    group_by(Sample) %>%
    summarize(sum(Abundance)) %>%
    {summary(.[,2])}

Output from the various kingdoms:

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    spread(Kingdom, Abundance) %>%
    print(n=Inf)

There is very little Archaea and Eukaryote abundance, and the vast majority of the viral abundance comes from a Salmonella phage that I suspect is integrated into the genome of the Salmonella enterica strain in the spike-in.

st %>%
    filter(Kingdom == "Viruses") %>%
    group_by(Taxon) %>% 
    summarize(Abundance = mean(Abundance)) %>%
    arrange(desc(Abundance))

Bacteria form the vast majority of estimated non-viral abundance in each sample,

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    filter(Kingdom != "Viruses") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance)) %>%
    filter(Kingdom == "Bacteria") %>%
    summarize_at("Abundance", list(Min = min, Mean = mean))

We will therefore restrict our analysis of sample composition to just Bacteria, renormalizing the abundances to sum to 1 after filtering.

st <- st %>%
    filter(Kingdom == "Bacteria") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Composition of the samples

Identifying the lab contaminant

Costea et al. report E. coli as a likely lab contaminant of the spike-in based on its appearance in the profiles of the mock-only samples (which they derived using mOTUs rather than Metaphlan2). So let's check what is at appreciable frequency in the mock-only samples that shouldn't be,

tb <- st %>%
    filter(
        Individual == "M", Abundance > 0, 
        !(Taxon %in% mock$Taxon)
        ) %>%
    group_by(Family, Taxon) %>%
    summarize(
        Mean = mean(Abundance),
        Max = max(Abundance),
        ) %>%
    filter(Max > 1e-4) %>%
    arrange(desc(Mean))
tb

Here we see that most of the contaminant reads appear to be captured by Shigella_flexneri identifier. Given the closeness of S. flexnari with E. coli, the estimated abundance for Escherichia_coli may also be coming from the contaminant strain. Since S. flexnari is generally much larger,

st %>%
    filter(Taxon %in% c("Shigella_flexneri", "Escherichia_coli")) %>%
    group_by(Individual, Taxon) %>%
    summarize_at(vars(Abundance), mean) %>%
    spread(Taxon, Abundance) %>%
    mutate(Escherichia_coli / Shigella_flexneri)

We will treat Shigella_flexneri as the lab contaminant in our analysis,

contaminant <- "Shigella_flexneri"

Our analysis of the spike-in abundances below show that the ratio of Shigella_flexneri to other spike-in taxa is constant across samples, indicating that Shigella_flexneri reads in the fecal samples are primarily from the contaminant and not native gut strains.

Above, we also see appreciable frequency of Salmonella_unclassified, which is due to the genus-level estimated proportion for Salmonella being larger than the species-level estimates. Because the estimated proportions of Escherichia_coli, Escherichia_unclassified, and Salmonella_unclassified are all fairly small, they will not substantially affect the family-level composition plot (Figure 4A). and so we will leave it in as a "native" taxon for this analysis. However, because these abundances might be due to the spike-in rather than native gut taxa, we will filter them out before proceeding.

st0 <- st %>%
    filter(!(Taxon %in% c("Salmonella_unclassified", "Escherichia_coli",
                "Escherichia_unclassified"))) %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Taxa by source (Mock, Contaminant, Native)

Classify the taxa by source type:

st0 <- st0 %>%
    mutate(Source = case_when(
            Taxon %in% mock$Taxon ~ "Mock", 
            Taxon == contaminant ~ "Contaminant",
            TRUE ~ "Native"
            )
        )

View the proportions of native, mock, and contaminant for each sample:

tb.source <- st0 %>%
    group_by(Sample, Protocol, Individual, Source) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup()
tb.source %>%
    spread(Source, Abundance) %>%
    print(n=Inf)

The contaminant is always a fairly small fraction of the total sample. It also generally appears in a consistent ratio with the mock for a given protocol:

tb.source %>%
    spread(Source, Abundance) %>%
    ggplot(aes(Sample, Contaminant / Mock, color = Protocol, 
            label = Individual)) +
    scale_color_manual(values = colors.protocol) + 
    geom_text() +
    coord_flip()

In samples Q2 and Q3, the spike-in makes up an unusually low fraction of the total (see table above), so the larger ratio in these samples may be due to chance. These observations suggest that the contaminant is being correctly identified.

Composition by source type:

p.source <- ggplot(tb.source,
    aes(y = Abundance, 
        x = Individual,
        # x = factor(Individual, lvls.mock_prop), 
        fill = Source)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_brewer(type = "qual") +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", fill = "Taxon type", 
        title = "Composition of different bacterial types") +
    base_theme +
    theme(legend.position = "right")
p.source 

Native taxa composition

Get the native taxa agglomerated to the Family level,

nat <- st0 %>%
    filter(Source == "Native") %>%
    group_by_at(vars(Sample, Protocol, Individual, Kingdom:Family)) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup

Let's restrict to Families within a minimal abundance to simplify the figure. First, lets check the prevalence of various families,

prev <- nat %>%
    filter(Individual != "M") %>%
    mutate_by(Sample, Proportion = close_elts(Abundance)) %>%
    group_by(Family) %>%
    summarize(Prev = sum(Proportion > 0), 
        Min = min(Proportion), 
        Median = median(Proportion), 
        Mean = mean(Proportion), 
        Max = max(Proportion))
prev %>%
    arrange(desc(Max)) %>%
    print(n=30)

Let's use the families with a proportion of at least 2%.

families <- prev %>%
    filter(Max > 0.02) %>%
    {.$Family}
nat0 <- nat %>%
    filter(Family %in% families) %>%
    arrange(Phylum, Family)

Create a color scale for Family that distinguishes Phylum by hue.

# Color tibble
ctb <- nat0 %>%
    select(Phylum, Family) %>%
    distinct %>%
    arrange(Phylum, Family) %>%
    group_by(Phylum) %>%
    nest(Family) %>%
    rowwise() %>%
    mutate(Families = list(unlist(data)),
        Num = length(Families))
ctb <- ctb %>%
    arrange(desc(Num)) %>%
    add_column(Palette = c("Blues", "Greens", "Reds", "Oranges"))
ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(RColorBrewer::brewer.pal(Num, name = Palette)))

last_n <- function(x, n) {
    len <- length(x)
    x[seq(len - n + 1, len)]
}

ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(last_n(Colors, Num)))

ctb <- ctb %>%
    select(Phylum, Families, Colors) %>%
    unnest() %>%
    arrange(Phylum, Families)

colors.families <-  ctb$Colors
names(colors.families) <- ctb$Families
# RColorBrewer::display.brewer.all()
# RColorBrewer::brewer.pal(5, name = "Blues")

We also want the mock and contaminant fractions in the final figure

comp <- bind_rows(
    nat0,
    tb.source %>% filter(Source != "Native") %>% rename(Family = Source)
)
comp %>%
    filter(Sample == "W1") %>%
    select(Family, Abundance) %>%
    arrange(desc(Abundance))
comp %>%
    group_by(Sample) %>%
    summarize(sum= sum(Abundance)) %>%
    {min(.$sum)}
colors.source <- c("#A9A9A9", "#D3D3D3")
names(colors.source) <- c("Contaminant", "Mock")
colors.comp <- c(colors.source, colors.families)
p.comp <- ggplot(comp,
    aes(x = factor(Individual, as.character(c(1:8, "M"))),
        y = Abundance, 
        fill = factor(Family, names(colors.comp)))) +
    geom_bar(stat = "identity") +
    scale_y_continuous(limits=c(0,1), breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_manual(values = colors.comp) +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", y = "Proportion",
        title = paste0("Composition of all bacteria")
        ) +
    base_theme +
    theme(legend.position = "right", legend.key.size = unit(0.11, "in"),
        legend.margin = margin(5,5,5,-5)) + 
    # theme(legend.position = c(1, 0.5), legend.justification = c(1, 0.5),
    #     legend.key.size = unit(0.11, "in"),
    #     ) +
    labs(fill = "Source or\nFamily (Native)")
p.comp

Relative abundances and bias in the mock spike-in

Next, we want to visualize relative abundances vs. actual composition (bias) of the mock spike-in (including the contaminant). Let's first confirm that all taxa were detected in all samples:

st0 %>%
    filter(Taxon %in% c(mock$Taxon, contaminant)) %>%
    {all(.$Abundance > 0)}

Now let's restrict to just the spike-in taxa. Since the 10 mock species appear in all samples with the same known relative abundances, it is convenient to normalize the abundances of all taxa to the geometric mean of the 10 mock species (called Denom below).

sts <- st0 %>%
    filter(Taxon %in% c(mock$Taxon, contaminant)) %>%
    rename(Observed = Abundance) %>%
    mutate(Taxon = ifelse(Taxon == contaminant, "Contaminant", Taxon))
denom <- sts %>%
    filter(Taxon %in% mock$Taxon) %>%
    group_by(Sample) %>%
    summarize(Denom = gm_mean(Observed))
sts <- sts %>%
    left_join(denom, by = c("Sample")) %>%
    mutate(Observed = Observed / Denom)

Next, add the actual relative abundances as determined by the FACS measurement.

mock0 <- mock %>%
    rename(Actual = FACS) %>%
    mutate(Actual = center_elts(Actual))
sts <- sts %>%
    left_join(mock0, by = "Taxon") %>%
    mutate(Error = Observed / Actual)

Also grab a version with just the mock taxa

stm <- sts %>%
    filter(Taxon %in% mock$Taxon)

Plot the observed and actual relative abundances.

p.step <- ggplot(sts,
    aes(x = factor(Taxon, c(mock$Taxon, "Contaminant")), y = Observed,
        color = Protocol, shape = (Individual == "M"))) + 
    geom_quasirandom() +
    geom_rangeframe(color = "black", sides = "l") +
    scale_y_log10(labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
    scale_color_manual(values = colors.protocol) + 
    coord_cartesian(clip = "off") +
    labs(title = "Composition of the spike-in",
        # y = "Abundance / gm. mean of non-contaminants",
        y = "Abundance / gm. mean",
        shape = "Sample type") +
    base_theme + 
    tax_theme + 
    theme(legend.position = "right", legend.margin = margin(5,0,5,-5))
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step <- mock0 %>%
    mutate(x1 = rank(desc(Actual)) - 0.5, x2 = x1 + 0.5,
        x3 = ifelse(x2 == 10, 10.5, NA)) %>%
    gather("loc", "x", x1, x2, x3) %>%
    filter(!is.na(x)) %>%
    arrange(x)
p.step$layers <- c(
    geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "Actual"), 
        size = 0.3, color = "black", inherit.aes = FALSE),
    p.step$layers)
# p.step <- p.step +
#     annotate("text", x = 10.6, y = 0.021, label = "Actual",
#         color = "black", hjust = 0)
p.step <- p.step +
    guides(
        color = guide_legend(order = 1),
        shape = guide_legend(order = 2),
        linetype = guide_legend(order = 3))
p.step

Main text Figure 3

plot_grid(p.comp, p.step, labels = c("A", "B"))
ggsave(here("figures", "costea-main.pdf"),
    width = 8, height = 3.5, units = "in", useDingbats = FALSE)

Estimate bias

Estimate the bias within each protocol as the center (or compositional mean) of the observed composition errors. Because all samples have all taxa, we can do this very straightforwardly by taking geometric means:

bias <- stm %>%
    group_by(Protocol, Taxon) %>%
    summarize(Bias = gm_mean(Error)) %>%
    mutate(Bias = center_elts(Bias)) %>%
    ungroup
bias %>%
    spread(Protocol, Bias)

The call to center_elts normalizes the efficiencies within each protocol to have a geometric mean of 1,

bias %>%
    group_by(Protocol) %>%
    summarize(gm_mean(Bias))

The elements of the Bias column can be interpretted as the detection efficiency relative to the average taxon for that protocol.

To confirm that the estimation worked, let's calibrate the relative abundances from the previous figure:

p.step.cal <- p.step
p.step.cal$data <- p.step.cal$data %>%
    left_join(bias, by = c("Protocol", "Taxon")) %>%
    mutate(Observed = Observed / Bias)
p.step.cal

Join the bias w/ the abundance table and compute compositional residuals.

stm <- stm %>%
    left_join(bias, by = c("Protocol", "Taxon")) %>%
    mutate(
        Residual = Error / Bias,
        Predicted = Actual * Bias
    )
# Note, `Residual` is already geometrically centered.

Let's also check that the bias predictions predict the taxon ratios:

gvs <- c("Protocol", "Individual", "Sample")
ratios <- stm %>%
    mutate(Taxon = tax_abbrev(Taxon)) %>%
    compute_ratios(group_vars = gvs) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))
ratios.pred <- ratios %>%
    select(Taxon.x, Taxon.y, Pair, Protocol, Actual, Bias, Predicted) %>%
    distinct
ggplot(ratios, aes(Pair, Error, 
        label = Individual, color = Protocol)) +
    geom_point(data = ratios.pred,
        aes(Pair, Bias), inherit.aes = FALSE,
        shape = 3, size = 4, color = "black") +
    geom_text(position = position_jitter(width=0.25, height=0)) +
    geom_hline(yintercept = 1) +
    facet_grid(Protocol ~ ., labeller = labeller.protocol) +
    geom_rangeframe(sides = "l", color= "black") + 
    scale_color_manual(values = colors.protocol) + 
    scale_y_log10() +
    scale_x_discrete(labels = tax_labeller) + 
    base_theme +
    tax_theme

Distribution of efficiencies for the three protocols:

ggplot(bias, aes(Bias, fill = Protocol)) +
    geom_dotplot() +
    scale_x_log10() +
    facet_grid(Protocol ~ ., labeller = labeller.protocol) +
    scale_fill_manual(values = colors.protocol) +
    scale_y_continuous(breaks = NULL) +
    geom_rangeframe() +
    base_theme

Let's also estimate the differential bias (including the contaminant):

dbias <- sts %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    group_by(Taxon) %>%
    summarize(
        HQ = gm_mean(H / Q),
        HW = gm_mean(H / W),
        QW = gm_mean(Q / W))
dbias <- dbias %>%
    gather("Protocol", "Bias", -Taxon)
dbias %>%
    spread(Protocol, Bias)

For consistency we keep the denominator as just the 10 mock (non-contaminant) species.

Fraction of squared Aitchison error due to bias

What fraction of the compositional error (measured by squared Aitchison distance) is explained by our model for each protocol?

err <- stm %>%
    group_by(Protocol, Sample) %>%
    summarize(
        Error2 = anorm(Error)^2,
        Bias2 = anorm(Bias)^2,
        Resid2 = anorm(Error / Bias)^2
        ) %>%
    summarize_at(vars(-Sample), mean)
err <- err %>% 
    mutate(Frac_Bias = Bias2 / Error2, Frac_Resid = Resid2 / Error2)
err

Residuals

Residuals, with the mock-only samples marked:

stm %>%
    mutate(Taxon = factor(Taxon, mock$Taxon)) %>%
    ggplot(aes(x = Taxon, y = Residual, shape = Individual == "M",
            color = Protocol)) +
    geom_quasirandom() +
    geom_hline(yintercept = 1) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10() +
    scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
    scale_color_manual(values = colors.protocol) + 
    facet_grid(Protocol ~ ., scales = "fixed") +
    base_theme +
    tax_theme

Note that the residuals are not independent across taxa; the high Residuals for L. plantarum and B. hansenii for protocol H are pushing down the others.

Bootstrap standard errors

Get the bias estimates w/ estimated geometric standard errors -

reps <- stm %>%
    group_by(Protocol) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, "Sample", "Taxon", "Error"),
        Bootreps = map(Error_matrix, bootrep_center, R = 4e3, 
            method = "gm")) %>%
    unnest(Bootreps) %>%
    rename(Bias = Center)
stats <- reps %>%
    group_by(Protocol, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    ungroup
bias <- left_join(bias, stats, by = c("Protocol", "Taxon"))

As expected, the geometric means of the resamplings approximate the point estimate.

all.equal(bias$Bias, bias$gm_mean)
bias <- bias %>% select(-gm_mean)

Create the SI figure showing the bias estimate with two standard errors:

r <- range(stm$Observed / stm$Actual)
p1 <- ggplot(bias, aes(factor(Taxon, mock$Taxon), Bias, color = Protocol)) +
    geom_hline(yintercept = 1) +
    geom_quasirandom(data = stm, aes(y = Observed / Actual), alpha = 0.4) +
    geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
        fatten = 1.5) +
    scale_y_log10() +
    scale_color_manual(values = colors.protocol) + 
    facet_wrap(~ Protocol, labeller = labeller.protocol) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10(limits = r, labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    labs(y = "Efficiency / gm. mean") +
    base_theme +
    tax_theme +
    theme(panel.spacing = unit(1, "lines"))
p1
ggsave(here("figures", "costea-bias.pdf"),
    width = 9, height = 4.2, units = "in", useDingbats = FALSE)

Do this for differential bias as well. Note setting the denominator to just the mock taxa.

dreps <- sts %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    transmute(
        Taxon, Individual,
        HQ = H / Q, 
        HW = H / W,
        QW = Q / W
    ) %>%
    gather("Protocol", "Error", -Taxon, -Individual) %>%
    group_by(Protocol) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, "Individual", "Taxon", "Error"),
        Bootreps = map(Error_matrix, bootrep_center, R = 4e3, method = "gm",
            denom = mock$Taxon)) %>%
    unnest(Bootreps) %>%
    rename(Bias = Center)
dstats <- dreps %>%
    group_by(Protocol, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    ungroup
dbias <- left_join(dbias, dstats, by = c("Protocol", "Taxon"))
all.equal(dbias$Bias, dbias$gm_mean)
dbias <- dbias %>% select(-gm_mean)
p2 <- ggplot(dbias, aes(factor(Taxon, c(mock$Taxon, "Contaminant")), Bias)) +
    geom_hline(yintercept = 1) +
    # geom_quasirandom(data = sts, aes(y = Observed / Actual), alpha = 0.5) +
    geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
        fatten = 1.5) +
    scale_y_log10() +
    # scale_color_manual(values = colors.protocol) + 
    facet_wrap(~ Protocol, labeller = labeller.protocol.ref) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10(limits = r, labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    labs(y = "Efficiency / gm. mean") +
    base_theme +
    tax_theme +
    theme(panel.spacing = unit(1, "lines"))
p2

How does the standard error decrease with sample size?

Here we wish to see how the noise in the bias estimate decreases with the number of control samples used. To do so, we will perform multinomial resampling of N control samples, for each N between 1 and 9 (the total number of samples). For a given N, the standard deviation of the bias estimate across resamples approximates the standard error in the bias estimate when N control samples are measured. We'll use Protocol H, which has an intermediate level of noise.

First, estimate the standard errors for each N:

mat <- stm %>%
    filter(Protocol == "H") %>%
    build_matrix(Sample, Taxon, Error)
N <- 1:9
names(N) <- N
reps.H <- map_dfr(N, ~bootrep_center(mat, R = 4e3, dist = "multinomial",
        N = ., method = "gm"),
    .id = "N") %>%
    rename(Bias = Center)
reps.H.summary <- reps.H %>%
    group_by(N, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    mutate(Protocol = "H")

Then view how the dispersion decreases with N:

lvls.color <- reps.H.summary %>%
    filter(N == 3) %>%
    arrange(-gm_se) %>%
    .$Taxon
values.color <- scales::seq_gradient_pal("black", "grey")(seq(0, 1, length.out = 10))
names(values.color) <- lvls.color
r <- range(reps.H.summary$gm_se)
names(r) <- r  %>% round(2)
p.se <- ggplot(reps.H.summary, aes(N, gm_se, 
        color = fct_reorder(Taxon, -gm_se))) +
    geom_line(aes(group = Taxon)) +
    geom_point() +
    scale_y_continuous(limits = range(c(1, r)), ) +
    base_theme +
    scale_color_manual(values = values.color, labels = tax_labeller) +
    geom_rangeframe(color = "black") +
    theme(legend.position = "right", legend.margin = margin(5,10,5,-5)) +
    labs(x = "Number of controls", y = "Geometric standard error", 
        title = "Standard error vs. number of control samples", 
        color = "Taxon")
p.se

The relative abundance plot provides useful context for understanding why the standard errors are much higher for two taxa:

pd <- filter(stm, Protocol == "H")
pd0 <- pd %>%
    filter(Individual %in% c("5", "M"))
p.step.H <- p.step %+% 
    pd %+%
    aes(x = factor(Taxon, mock$Taxon), y = Observed, 
        color = factor(Taxon, lvls.color), shape = NULL) +
    ggrepel::geom_text_repel(data = pd0, aes(label = Individual), 
        direction = "x", size = 2, color = "black") +
    scale_color_manual(values = values.color, labels = tax_labeller) +
    labs(title = "Observed compositions",
        y = "Abundance / gm. mean",
        shape = "Sample type") +
    theme(legend.position = "none")
# p.step.H
plot_grid(p.se, p.step.H, nrow = 1, labels = c("A", "B"),
    rel_widths = c(1.3, 1))
ggsave(here("figures", "costea-dispersion.pdf"),
    width = 8, height = 3.5, units = "in", useDingbats = FALSE)

Table with estimated bias and summary statistics

Compute pairwise-error summary statistics

Error vs. actual compositions

Compute pairwise error summary statistics:

avg_errors <- ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Average the errors for each pair of different taxa
    filter(Taxon.x != Taxon.y) %>%
    group_by(Protocol, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    # Average over pairs of taxa for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".avg")
max_errors <- ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Find the max fold error in a ratio for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_range) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".max")
pw_summary <- bind_cols(avg_errors, max_errors) %>%
    select(Protocol, Bias.max, Bias.avg, Residual.avg)
pw_summary

Error between protocols

Get ratios for the ratio-based statistics as in ratios, but for the differential error between protocols.

tb <- stm %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    transmute(
        Taxon, Individual,
        HQ = H / Q, 
        HW = H / W,
        QW = Q / W
    ) %>%
    gather("Protocol", "Error", -Taxon, -Individual) %>%
    left_join(dbias, by = c("Protocol", "Taxon")) %>%
    select(-gm_se) %>%
    mutate(Residual = Error / Bias)
gvs <- c("Protocol", "Individual")
d_ratios <- tb %>%
    compute_ratios(group_vars = gvs) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))

Compute pairwise error summary statistics:

d_avg_errors <- d_ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Average the errors for each pair of different taxa
    filter(Taxon.x != Taxon.y) %>%
    group_by(Protocol, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    # Average over pairs of taxa for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".avg")
d_max_errors <- d_ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Find the max fold error in a ratio for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_range) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".max")
d_pw_summary <- bind_cols(d_avg_errors, d_max_errors) %>%
    select(Protocol, Bias.max, Bias.avg, Residual.avg)
d_pw_summary

Combine with bias estimates in a single table

# Function to create string for formatting numbers with glue::glue()
fmt_string <- function(var, digits) {
    paste0("{format(", var, ", digits = ", digits, ")}")
}
bias_tab <- bind_rows(bias, dbias) %>%
    select(-gm_se) %>%
    mutate(Bias = glue::glue(fmt_string("Bias", 1))) %>%
    mutate_all(as.character) %>%
    spread(Protocol, Bias) %>%
    mutate_all(~ifelse(is.na(.), "---", .))
pw_summary_tab <- bind_rows(pw_summary, d_pw_summary) %>%
    mutate(
        Bias.max = glue::glue(fmt_string("Bias.max", 1)),
        Bias.avg = glue::glue(fmt_string("Bias.avg", 2)),
        Residual.avg = glue::glue(fmt_string("Residual.avg", 2))
        ) %>%
    mutate_all(as.character) %>%
    gather("Taxon", "Value", -Protocol) %>%
    spread(Protocol, Value)
lvls.taxon <- c(mock$Taxon, "Contaminant", "Bias.max", "Bias.avg",
    "Residual.avg")
lbls.taxon <- c(mock$Taxon, "Contaminant", "Max pairwise bias", 
    "Avg. pairwise bias", "Avg. pairwise noise")
bias_tab0 <- bind_rows(bias_tab, pw_summary_tab) %>%
    select(Taxon, "H", "Q", "W", "HQ", "HW", "QW") %>%
    mutate(
        Taxon = factor(Taxon, lvls.taxon, lbls.taxon),
        ) %>%
    arrange(Taxon)
bias_tab0

Latex version for the manuscript:

tex <- bias_tab0 %>%
    rename_at(vars(HQ, HW, QW), ~ {str_sub(., 2, 1) <- "/"; .}) %>%
    mutate(
        Taxon = kableExtra::cell_spec(
            str_replace(Taxon, "_", " "), 
            "latex", 
            italic = Taxon %in% mock$Taxon)
        ) %>%
    knitr::kable(format="latex", booktabs = TRUE, linesep = "",
        escape = FALSE, align = c("l", rep("r", 6))) %>%
    kableExtra::add_header_above(c(" ", "Protocol" = 3, 
            "Protocol/Reference" = 3)) %>%
    kableExtra::row_spec(length(mock$Taxon) + 1, extra_latex_after = "\\midrule")
# tex
# clipr::write_clip(tex)

Calibration

Choose Individuals (specimens) to use for estimating bias.

set.seed(20190201)
individuals <- as.character(1:8)
estimation_set <- base::sample(individuals, 3, replace = FALSE)
stm0 <- stm %>%
    select(-Bias, -Residual) %>%
    mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))
sam <- sam %>%
    mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))

Estimate bias from the estimation samples:

bias_est <- stm0 %>%
    filter(Set == "Est") %>%
    group_by(Protocol, Taxon) %>%
    summarize(Bias = gm_mean(Error)) %>%
    mutate(Bias = center_elts(Bias)) %>%
    ungroup

Compare the bias estimated from the estimation set and that from all specimens:

bind_rows(list(All = bias, Est = bias_est), .id = "Specimens") %>%
    ggplot(aes(Taxon, Bias, color = Protocol, shape = Specimens)) + 
    geom_quasirandom() +
    scale_y_log10() +
    scale_shape_manual(breaks = c("All", "Est"), values = c(16, 1)) +
    scale_color_manual(values = colors.protocol) + 
    coord_flip()

Bias is consistent with the estimate using all specimens, though this would vary for the higher-variance protocols H and Q for some estimation sets.

Also estimate the differential bias to protocol W. First, add a column with the "Reference" measurement; then repeat bias estimation with the reference measurement instead of the Actual abundance.

stmW <- stm0 %>%
    select(Taxon, Protocol, Individual, Set, Observed) %>%
    spread(Protocol, Observed) %>%
    mutate(Actual = W) %>%
    gather("Protocol", "Observed", H, Q, W) %>%
    mutate(Error = Observed / Actual)
biasW_est <- stmW %>%
    filter(Set == "Est") %>%
    group_by(Protocol, Taxon) %>%
    summarize(BiasW = gm_mean(Error)) %>%
    mutate(BiasW = center_elts(BiasW))
biasW_est %>%
    spread(Protocol, BiasW)

Calibrate all samples

stm0 <- stm0 %>%
    left_join(bias_est, by = c("Protocol", "Taxon")) %>%
    left_join(biasW_est, by = c("Protocol", "Taxon")) %>%
    mutate_by(Sample, 
        Calibrated_to_Actual = close_elts(Observed / Bias),
        Calibrated_to_W = close_elts(Observed / BiasW)
    )

Sample ordination: Compositional PCA

Prep for the PCA: we first need to Clr transform each relative abundance vector.

stm1 <- stm0 %>%
    gather("Type", "Abundance", 
        Observed, Actual, Calibrated_to_Actual, Calibrated_to_W) %>%
    select(Taxon:Individual, Set:Abundance) %>%
    group_by(Sample, Type) %>%
    mutate(Clr = clr(Abundance)) %>%
    ungroup

Get a matrix with samples as rows to use for the PCA.

temp <- stm1 %>%
    unite("Sample_Type", Sample, Type, sep = ":") %>%
    select(Sample_Type, Taxon, Clr) %>%
    spread(Taxon, Clr)
mat <- temp %>%
    select(-Sample_Type) %>%
    as("matrix")
rownames(mat) <- temp$Sample_Type
corner(mat)

Note that each sample (protocol + individual) appears four times, as Observed, Actual, Calibrated to Actual, and Calibrated to W, and currently I'm just doing a simple PCA on all of these, since our goal is mainly to illustrate the effect of calibration rather than make quantitative claims about variation. Next we run the PCA and make a tibble for plotting,

pcx <- prcomp(mat)
tb <- as_tibble(pcx$x[,c(1,2)], rownames='Sample_Type') %>%
    separate(Sample_Type, c("Sample", "Type"), sep = ":") %>%
    left_join(sam, by = "Sample")
tb <- tb %>%
    mutate(PC1 = -PC1, PC2 = PC2)

Nearly 90% of the variance is covered by the first two PCs, with about 5.2x as much variation on PC1 than PC2:

fracvar <- pcx$sdev^2/sum(pcx$sdev^2)
round(fracvar, 3)
fracvar[1] / fracvar[2]
percvar <- round(100 * fracvar, 0)
strvar <- paste0("PC", seq(fracvar), " [", percvar, "%]")

PCA figure that will form the right half of the main text figure:

facet_labeller <- function (tb) {
    tb %>% 
        mutate_all(str_replace_all, "_", " ") %>%
        mutate_all(str_replace, " ", "\n")
}
type.lvls <- c("Observed", "Calibrated_to_Actual", "Calibrated_to_W")
actual <- tb %>%
    filter(Type == "Actual", Sample == "H1") %>%
    select(PC1, PC2) %>%
    expand(Type = factor(type.lvls, type.lvls), PC1, PC2)
tb0 <- tb %>%
    filter(Type != "Actual") %>%
    mutate(Type = factor(Type, type.lvls))
tb.breaks <- 
    tribble(
        ~PC1, ~PC2,
        # min(tb0$PC1), min(tb0$PC2),
        # actual$PC1[1], actual$PC2[1],
        # max(tb0$PC1), max(tb0$PC2),
        )
tb.range <- 
    tribble(
        ~PC1, ~PC2,
        min(tb0$PC1), min(tb0$PC2),
        max(tb0$PC1), max(tb0$PC2),
        )
p.pca <- ggplot(tb0, aes(PC1, PC2, color = Protocol, shape = Set)) +
    geom_point(data = actual, aes(PC1, PC2),
        shape = 3, size = 4, color = "black") +
    geom_point() +
    facet_grid(Type ~ ., labeller = facet_labeller) +
    geom_rangeframe(data = tb.range, aes(PC1, PC2), 
        color = "black", inherit.aes = FALSE) + 
    scale_x_continuous(breaks = tb.breaks$PC1, labels = NULL) +
    scale_y_continuous(breaks = tb.breaks$PC2, labels = NULL) +
    scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
    scale_color_manual(values = colors.protocol) + 
    base_theme +
    theme(strip.text.y = element_text(angle=0, size = 9)) +
    # theme(axis.line = element_line()) +
    labs(title = "Sample ordination", x = strvar[1], y = strvar[2])
# Label the protocols in the "Observed" plot
labtb <- tribble(
    ~x,     ~y,   ~Protocol,
    -4,    1.5,   "H",
    -0.5,  1.3,   "Q",
    -1.85, -0.4,  "W",
    2.3,    0.7,   "Actual"
    ) %>%
    mutate(Type = factor("Observed", type.lvls))
p.pca <- p.pca +
    geom_text(data = labtb,
        aes(x, y, label = Protocol, color = Protocol), 
        size = 3, inherit.aes = FALSE, show.legend = FALSE)
p.pca

Relative abundances before and after calibration

Figure showing the relative abundances, which will form the left half of the main text figure:

stm1 <- stm1 %>% 
    group_by(Sample, Type) %>%
    mutate(Abundance = center_elts(Abundance)) %>%
    ungroup
tbrf <- tibble(Abundance = range(stm1$Abundance))
p.step.cal <- stm1 %>% 
    filter(Type %in% type.lvls) %>%
    mutate(Taxon = factor(Taxon, mock$Taxon),
        Type = factor(Type, type.lvls)) %>%
    ggplot(aes(x = Taxon, y = Abundance, color = Protocol, shape = Set)) + 
    geom_quasirandom() +
    geom_rangeframe(data = tbrf, aes(y = Abundance), 
        color = "black", sides = "l", inherit.aes = FALSE) +
    scale_color_manual(values = colors.protocol) + 
    scale_y_log10(labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    coord_cartesian(clip = "off") +
    labs(title = "Taxon relative abundances",
        y = "Abundance / gm. mean") + 
    facet_grid(Type ~ .) +
    scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
    scale_color_manual(values = colors.protocol) + 
    base_theme + 
    tax_theme +
    theme(strip.text.y = element_blank())
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step.cal <- tb.step %>%
    expand(Type = factor(type.lvls, type.lvls), nesting(x, Actual))
p.step.cal$layers <- c(
    geom_step(data = tb.step.cal, aes(x = x, y = Actual), 
        size = 0.3, color = "black", inherit.aes = FALSE),
    p.step.cal$layers)
# p.step.cal

Main text figure

plot_grid(p.step.cal + theme(plot.margin = margin(5,20,5,5)),
    p.pca + 
        theme(legend.position = "bottom", legend.box = "vertical",
            legend.margin = margin(),
            legend.spacing = unit(0, "in"),
        ),
    rel_widths = c(1, 1.1)
)
# ggsave(here("figures", "costea-calibration.pdf"),
#     width = 6.5, height = 6.5, units = "in", useDingbats = FALSE)
ggsave(here("figures", "costea-calibration.pdf"),
    width = 8, height = 8, units = "in", useDingbats = FALSE)

Bias vs. phylogeny

Get tree for the spike-in taxa:

data(costea2017_metaphlan2_tree)
costea2017_metaphlan2_tree$tip.label %>% head
tree <- costea2017_metaphlan2_tree
tree$tip.label <- tree$tip.label %>%
    str_extract("(?<=s__).+")
spikein_taxa <- c(mock$Taxon, contaminant)
tree <- ape::keep.tip(tree, spikein_taxa)
tree

Plot tree along with differential bias to Protocol W. Bias is shown relative to the 10 mock taxa, with 2 geometric standard errors.

gt <- ggtree::ggtree(tree, layout = "rectangular") +
    theme(plot.margin = margin(5, 0, 5, 5))
gtb <- dbias %>%
    mutate(Taxon = str_replace(Taxon, "Contaminant", contaminant)) %>%
    left_join(gt$data, by = c("Taxon" = "label")) %>%
    filter(Protocol %in% c("HW", "QW")) %>%
    mutate_at("Protocol", str_sub, 1, 1)
pr <- gtb %>%
    ggplot(aes(Bias, fct_reorder(Taxon, y), color = Protocol)) +
    # geom_point() +
    geom_vline(xintercept = 1) +
    ggstance::geom_pointrangeh(aes(xmin = Bias / gm_se^2, 
            xmax = Bias * gm_se^2), 
        fatten = 1.0, position = position_jitter(width = 0, height = 0.1)
        ) +
    scale_color_manual(values = colors.protocol) +
    base_theme +
    scale_y_discrete(labels = tax_labeller) +
    scale_x_log10(labels = log_formatter) +
    labs(y = "", x = "Differential efficiency rel. to Protocol W") +
    theme(legend.position = "right", plot.margin = margin(5, 5, 5, 0),
        axis.text.y = element_text(size=9, family = "")
    )
plot_grid(gt, pr, align = "h", rel_widths = c(0.6, 1))
ggsave(here("figures", "costea-tree.png"),
    width = 6, height = 4, units = "in", dpi=300)
ggsave(here("figures", "costea-tree.pdf"),
    width = 6, height = 4, units = "in", useDingbats = FALSE)

Reproduce Costea et al.'s Figure 6A

mock.gathered <- costea2017_mock_composition %>%
    select(Taxon, 
        FACS = "bacterial cells in spike in Mix",
        OD = "Bacterial OD in sample",
        ) %>%
    group_by(Taxon) %>%
    summarize_at(vars(FACS, OD), mean) %>%
    gather("Protocol", "Observed", -Taxon) %>%
    mutate_by(Protocol, Observed = center_elts(Observed))
stm %>%
    group_by(Protocol, Taxon) %>%
    summarize(Observed = gm_mean(Observed)) %>%
    ungroup %>%
    bind_rows(mock.gathered) %>%
    filter(Protocol != "FACS") %>%
    mutate(
        Taxon = factor(Taxon, mock$Taxon),
        Protocol = factor(Protocol, c("W", "Q", "H", "OD"))
        ) %>%
    ggplot(aes(Taxon, Observed, fill = Protocol)) +
    geom_col(position = "dodge") +
    geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "FACS"), 
        size = 0.3, color = "black", inherit.aes = FALSE) +
    scale_fill_manual(values = c(colors.protocol, OD = "grey")) +
    scale_y_log10() +
    base_theme + 
    tax_theme

The abundances don't match exactly. This is expected, given that Costea et al. used mOTU to measure abundances. However, the general pattern is highly consistent with their figure, including the OD measurements, except for a swapping in the taxon labels on the x-axis for the three species V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis. This strongly suggests that the mock species are correctly labeled here and are labeled incorrectly in their Figure 6.

Session info

sessionInfo()


mikemc/2019-bias-manuscript documentation built on Sept. 26, 2019, 6:04 p.m.