R setup

knitr::opts_knit$set(progress=TRUE, verbose=TRUE)
# Global chunk options
knitr::opts_chunk$set(cache=TRUE, autodep = TRUE,
    include=TRUE, echo=TRUE,
    warning=TRUE, message=FALSE, 
    fig.width=5, fig.height=4)
library(here)
library(tidyverse)
library(ggthemes)
# library(cowplot)
# library(ggbeeswarm)
library(metacal)
# This package
devtools::load_all(here())
base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        legend.position = "none"
        )
base_theme0 <- theme_grey() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank()
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())

update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))

colors.taxa <- c(
    T1 = "#66c2a5", 
    T2 = "#fc8d62", 
    T3 = "#8da0cb"
    )

Example: three-species communities

We will consider the three taxa in the main text Figures 1 and 2, with bias equal to (1, 18, 6). We consider three samples; samples S1 and S2 in the main text, and a third sample S3 that switches the abundance of the second and third taxa in sample S3. First we create a data frame with the actual compositions and the bias.

tb <- tibble(
    Taxon = c("T1", "T2", "T3"), 
    Bias = c(1, 18, 6),
    S1 = c(1, 1, 1),
    S2 = c(1, 1/15, 4/15),
    S3 = c(1, 4/15, 1/15),
    ) 
print(tb)

Note that we have expressed bias and the relative abundances as relative to taxon 1. Then, we gather these relative abundances into a tidy form, use the bias to calculate the observed abundances, and also obtain the proportions from the relative abundances.

tb <- tb %>%
    gather("Sample", "Actual", S1:S3) %>%
    mutate(Observed = Actual * Bias) %>%
    gather("Type", "Abundance", Actual, Observed) %>%
    mutate_by(c(Sample, Type), Proportion = close_elts(Abundance)) %>%
    select(Sample, Taxon, Bias, everything())
tb

Error in ratios vs. error in proportions

The proportions used to label Figure 2:

tb0 <- tb %>%
    select(-Abundance) %>%
    spread(Type, Proportion) %>%
    arrange(Sample, Taxon)
tb0 %>%
    knitr::kable(digits = 2, format = "pandoc")

We can view the measurement error across the three samples with bar plots, as done in main text Figure 2, and also in terms of the ratios to Taxon 1:

p.props <- ggplot(tb, aes(x = Type, y = Proportion, fill = Taxon)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = colors.taxa) +
    facet_wrap(~Sample) +
    scale_y_continuous(breaks = c(0, 0.5, 1)) +
    base_theme +
    theme(strip.text = element_text(size = 9), axis.title.x = element_blank(),
        legend.position = "right")
p.ratios <- ggplot(tb, aes(x = Type, y = Abundance, color = Taxon)) +
    geom_path(aes(group = Taxon)) +
    geom_point() +
    facet_wrap(~Sample) +
    scale_y_log10() +
    scale_color_manual(values = colors.taxa) +
    base_theme +
    labs(y = "Ratio to Taxon 1") +
    theme(strip.text = element_text(size = 9), axis.title.x = element_blank(),
        legend.position = "right")
cowplot::plot_grid(p.props, p.ratios, ncol = 1, align = "v", axis = "lr",
    labels = c("A", "B"))
ggsave(here("figures", "compositional_dependence_multipanel.pdf"),
    width = 4.5, height = 5.5, units = "in", useDingbats = FALSE)

From this figure, we can see that the error in the proportions differs for the three samples, but the error in ratios is consistent.

Calculate the compositional error and the fold-error in proportions:

err <- tb %>%
    select(-Abundance) %>%
    spread(Type, Proportion) %>%
    arrange(Sample, Taxon) %>%
    mutate(Error = Observed / Actual) %>%
    mutate_by(Sample, 
        Error.T1 = Error / Error[1],
    )
err %>%
    knitr::kable(digits = 2, format = "pandoc")

The Error column shows the fold error in the Proportions, while the Error.T1 column divides this by the Error of taxon T1 in each sample and equals the Bias relative to T1. We can see that the fold-error error in proportions of a given taxon varies across samples, but when divided by the fold-error in taxon T1, is consistent.

The sample mean efficiencies are

sme <- err %>%
    group_by(Sample) %>%
    summarize(SME = sum(Actual * Bias) / sum(Actual))
sme

Check that the observed proportions are given by the equation from the main text---

err0 <- left_join(err, sme, by = "Sample")
err0 %>%
    mutate(Observed0 = Actual * Bias / SME) %>%
    {all.equal(.$Observed, .$Observed0)}

Analysis of compositional differences between samples

We get very different pictures comparing the three samples based on the Actual or the Observed proportions.

p.props <- ggplot(tb, aes(x = Sample, y = Proportion, fill = Taxon)) +
    geom_bar(stat = "identity") +
    facet_wrap(~Type) +
    scale_y_continuous(breaks = c(0, 0.5, 1), labels = c(0, 0.5, 1)) +
    scale_fill_manual(values = colors.taxa) +
    base_theme +
    theme(strip.text = element_text(size = 9))

p.ratios <- ggplot(tb, aes(x = Sample, y = Abundance, color = Taxon)) +
    geom_path(aes(group = Taxon)) +
    geom_point() +
    facet_wrap(~Type) +
    scale_y_log10() +
    scale_color_manual(values = colors.taxa) +
    base_theme +
    labs(y = "Ratio to Taxon 1") +
    theme(strip.text = element_text(size = 9))

cowplot::plot_grid(p.props, p.ratios, ncol = 1)
ggsave(here("figures", "community_clustering.pdf"),
    width = 3, height = 5.5, units = "in", useDingbats = FALSE)

The lower panel shows that the change in ratios between samples is consistent whether using the actual and observed profiles.

From this figure, it is apparent that Sample S1 is the most even and thus has the highest diversity by any metric that values evenness (such as the Shannon or Inverse Simpson indices), but but sample S2 is observed to be the most even and thus to have the highest diversity.

We can also see that Samples S1 and S2 are more similar compositionally by the Bray-Curtis (BC) community similarity, but that samples S1 and S3 appear to be the most similar. This is easy to see because BC similarity, when applied to proportions, simply sums up the amount of shared proportion of each taxon.

B <- c(1, 18, 6)
A1 <- c(1, 1, 1)
A2 <- c(1, 1/15, 4/15)
A3 <- c(1, 4/15, 1/15)
O1 <- close_elts(A1 * B)
O2 <- close_elts(A2 * B)
O3 <- close_elts(A3 * B)

The BC index gives a different clustering when applied to the actual and observed compositions.

# Actual
xydist(A1, A2, method = "bray")
xydist(A1, A3, method = "bray")
xydist(A2, A3, method = "bray")
# Observed
xydist(O1, O2, method = "bray")
xydist(O1, O3, method = "bray")
xydist(O2, O3, method = "bray")

The Aitchison distances are invariant to bias, although sample 1 is equidistant from samples 2 and 3, so clustering isn't possible in this toy example.

# Actual
xydist(A1, A2, method = "aitchison")
xydist(A1, A3, method = "aitchison")
xydist(A2, A3, method = "aitchison")
# Observed
xydist(O1, O2, method = "aitchison")
xydist(O1, O3, method = "aitchison")
xydist(O2, O3, method = "aitchison")

Taxonomic aggregation

The theoretical bias invariance of ratio-based analyses does not hold under the common practice of taxonomically agglomerating abundances prior to analysis. For example, suppose the first two taxa are in phylum P1 and the third taxon is in phylum P2.

tb1 <- tb %>%
    mutate(Phylum = ifelse(Taxon < "T3", "P1", "P2")) %>%
    group_by(Sample, Type, Phylum) %>%
    summarize(Proportion = sum(Proportion)) %>%
    ungroup
tb1

Consider the ratio of phylum 1 to phylum 2,

tb1 <- tb1 %>%
    spread(Phylum, Proportion) %>%
    mutate(Ratio = P1 / P2) %>%
    select(-P1, -P2) %>%
    spread(Sample, Ratio)
tb1

From Community 1 to Community 2, the ratio of P1 to P2 increases by a factor of 4/2 = 2x but is observed to decrease by factor of 1.38/3.17 = 0.44x.

Session info

sessionInfo()


mikemc/2019-bias-manuscript documentation built on Sept. 26, 2019, 6:04 p.m.