knitr::opts_knit$set(progress=TRUE, verbose=TRUE) # Global chunk options knitr::opts_chunk$set(cache=TRUE, autodep = TRUE, include=TRUE, echo=TRUE, warning=TRUE, message=FALSE, fig.width=5, fig.height=4)

library(here) library(tidyverse) library(ggthemes) # library(cowplot) # library(ggbeeswarm) library(metacal) # This package devtools::load_all(here())

base_theme <- theme_tufte() + theme( text = element_text(size=9, family = ""), legend.position = "none" ) base_theme0 <- theme_grey() + theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank() ) tax_theme <- theme( axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), axis.title.x = element_blank()) update_geom_defaults("point", list(size = 1)) update_geom_defaults("text", list(size = 2.5)) update_geom_defaults("hline", list(color = "grey")) colors.taxa <- c( T1 = "#66c2a5", T2 = "#fc8d62", T3 = "#8da0cb" )

We will consider the three taxa in the main text Figures 1 and 2, with bias
equal to `(1, 18, 6)`

. We consider three samples; samples S1 and S2 in the main
text, and a third sample S3 that switches the abundance of the second and third
taxa in sample S3. First we create a data frame with the actual compositions
and the bias.

tb <- tibble( Taxon = c("T1", "T2", "T3"), Bias = c(1, 18, 6), S1 = c(1, 1, 1), S2 = c(1, 1/15, 4/15), S3 = c(1, 4/15, 1/15), ) print(tb)

Note that we have expressed bias and the relative abundances as relative to taxon 1. Then, we gather these relative abundances into a tidy form, use the bias to calculate the observed abundances, and also obtain the proportions from the relative abundances.

tb <- tb %>% gather("Sample", "Actual", S1:S3) %>% mutate(Observed = Actual * Bias) %>% gather("Type", "Abundance", Actual, Observed) %>% mutate_by(c(Sample, Type), Proportion = close_elts(Abundance)) %>% select(Sample, Taxon, Bias, everything()) tb

The proportions used to label Figure 2:

tb0 <- tb %>% select(-Abundance) %>% spread(Type, Proportion) %>% arrange(Sample, Taxon) tb0 %>% knitr::kable(digits = 2, format = "pandoc")

We can view the measurement error across the three samples with bar plots, as done in main text Figure 2, and also in terms of the ratios to Taxon 1:

p.props <- ggplot(tb, aes(x = Type, y = Proportion, fill = Taxon)) + geom_bar(stat = "identity") + scale_fill_manual(values = colors.taxa) + facet_wrap(~Sample) + scale_y_continuous(breaks = c(0, 0.5, 1)) + base_theme + theme(strip.text = element_text(size = 9), axis.title.x = element_blank(), legend.position = "right") p.ratios <- ggplot(tb, aes(x = Type, y = Abundance, color = Taxon)) + geom_path(aes(group = Taxon)) + geom_point() + facet_wrap(~Sample) + scale_y_log10() + scale_color_manual(values = colors.taxa) + base_theme + labs(y = "Ratio to Taxon 1") + theme(strip.text = element_text(size = 9), axis.title.x = element_blank(), legend.position = "right") cowplot::plot_grid(p.props, p.ratios, ncol = 1, align = "v", axis = "lr", labels = c("A", "B"))

ggsave(here("figures", "compositional_dependence_multipanel.pdf"), width = 4.5, height = 5.5, units = "in", useDingbats = FALSE)

From this figure, we can see that the error in the proportions differs for the three samples, but the error in ratios is consistent.

Calculate the compositional error and the fold-error in proportions:

err <- tb %>% select(-Abundance) %>% spread(Type, Proportion) %>% arrange(Sample, Taxon) %>% mutate(Error = Observed / Actual) %>% mutate_by(Sample, Error.T1 = Error / Error[1], ) err %>% knitr::kable(digits = 2, format = "pandoc")

The `Error`

column shows the fold error in the Proportions, while the
`Error.T1`

column divides this by the Error of taxon T1 in each sample and
equals the Bias relative to T1. We can see that the fold-error error in
proportions of a given taxon varies across samples, but when divided by the
fold-error in taxon T1, is consistent.

The sample mean efficiencies are

sme <- err %>% group_by(Sample) %>% summarize(SME = sum(Actual * Bias) / sum(Actual)) sme

Check that the observed proportions are given by the equation from the main text---

err0 <- left_join(err, sme, by = "Sample") err0 %>% mutate(Observed0 = Actual * Bias / SME) %>% {all.equal(.$Observed, .$Observed0)}

We get very different pictures comparing the three samples based on the Actual or the Observed proportions.

p.props <- ggplot(tb, aes(x = Sample, y = Proportion, fill = Taxon)) + geom_bar(stat = "identity") + facet_wrap(~Type) + scale_y_continuous(breaks = c(0, 0.5, 1), labels = c(0, 0.5, 1)) + scale_fill_manual(values = colors.taxa) + base_theme + theme(strip.text = element_text(size = 9)) p.ratios <- ggplot(tb, aes(x = Sample, y = Abundance, color = Taxon)) + geom_path(aes(group = Taxon)) + geom_point() + facet_wrap(~Type) + scale_y_log10() + scale_color_manual(values = colors.taxa) + base_theme + labs(y = "Ratio to Taxon 1") + theme(strip.text = element_text(size = 9)) cowplot::plot_grid(p.props, p.ratios, ncol = 1)

ggsave(here("figures", "community_clustering.pdf"), width = 3, height = 5.5, units = "in", useDingbats = FALSE)

The lower panel shows that the change in ratios between samples is consistent whether using the actual and observed profiles.

From this figure, it is apparent that Sample S1 is the most even and thus has the highest diversity by any metric that values evenness (such as the Shannon or Inverse Simpson indices), but but sample S2 is observed to be the most even and thus to have the highest diversity.

We can also see that Samples S1 and S2 are more similar compositionally by the Bray-Curtis (BC) community similarity, but that samples S1 and S3 appear to be the most similar. This is easy to see because BC similarity, when applied to proportions, simply sums up the amount of shared proportion of each taxon.

B <- c(1, 18, 6) A1 <- c(1, 1, 1) A2 <- c(1, 1/15, 4/15) A3 <- c(1, 4/15, 1/15) O1 <- close_elts(A1 * B) O2 <- close_elts(A2 * B) O3 <- close_elts(A3 * B)

The BC index gives a different clustering when applied to the actual and observed compositions.

# Actual xydist(A1, A2, method = "bray") xydist(A1, A3, method = "bray") xydist(A2, A3, method = "bray") # Observed xydist(O1, O2, method = "bray") xydist(O1, O3, method = "bray") xydist(O2, O3, method = "bray")

The Aitchison distances are invariant to bias, although sample 1 is equidistant from samples 2 and 3, so clustering isn't possible in this toy example.

# Actual xydist(A1, A2, method = "aitchison") xydist(A1, A3, method = "aitchison") xydist(A2, A3, method = "aitchison") # Observed xydist(O1, O2, method = "aitchison") xydist(O1, O3, method = "aitchison") xydist(O2, O3, method = "aitchison")

The theoretical bias invariance of ratio-based analyses does not hold under the common practice of taxonomically agglomerating abundances prior to analysis. For example, suppose the first two taxa are in phylum P1 and the third taxon is in phylum P2.

tb1 <- tb %>% mutate(Phylum = ifelse(Taxon < "T3", "P1", "P2")) %>% group_by(Sample, Type, Phylum) %>% summarize(Proportion = sum(Proportion)) %>% ungroup tb1

Consider the ratio of phylum 1 to phylum 2,

tb1 <- tb1 %>% spread(Phylum, Proportion) %>% mutate(Ratio = P1 / P2) %>% select(-P1, -P2) %>% spread(Sample, Ratio) tb1

From Community 1 to Community 2, the ratio of P1 to P2 increases by a factor of 4/2 = 2x but is observed to decrease by factor of 1.38/3.17 = 0.44x.

```
sessionInfo()
```

mikemc/2019-bias-manuscript documentation built on Sept. 26, 2019, 6:04 p.m.

Embedding an R snippet on your website

Add the following code to your website.

For more information on customizing the embed code, read Embedding Snippets.