knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(scadsplants)
library(SPECIES)

## Set up the cache and config
db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", "drake-cache.sqlite"))
cache <- storr::storr_dbi("datatable", "keystable", db)

Developing a rarefaction protocol

datnames <- cached(cache = cache)[ which(substr(cached(cache = cache), 1, 3) == "dat")]
datnames <- datnames[ !grepl(datnames, pattern = "singles")]

dat <- readd(datnames[2], character_only = T,  cache = cache)

all_dats <- lapply(datnames, readd, cache = cache, character_only = T)

all_ests <- lapply(all_dats, FUN = function(a_dat) return(try(add_singletons(a_dat))))
all_ests_max <- lapply(all_dats, FUN = function(a_dat) return(try(add_singletons(a_dat, use_max = TRUE))))
all_ests <- all_ests[ which(!is.na(all_ests))]

alldat <- bind_rows(all_dats)
allests <- bind_rows(all_ests)
allestsmax <- bind_rows(all_ests_max)

allests <- allests %>%
  group_by(sim, source, season, year, treatment) %>%
  summarize(n0 = sum(abund),
         s0 = n()) %>%
  ungroup() %>%
  select(season, year, s0, n0) %>%
  rename(est_n0 = n0,
         est_s0 = s0)

allestsmax <- allestsmax %>%
  group_by(sim, source, season, year, treatment) %>%
  summarize(n0 = sum(abund),
         s0 = n()) %>%
  ungroup() %>%
  select(season, year, s0, n0) %>%
  rename(est_n0_max = n0,
         est_s0_max = s0)

alldat <- alldat %>%
  group_by(sim, source, season, year, treatment) %>%
  summarize(n0 = sum(abund),
         s0 = n()) %>%
  ungroup() %>%
  select(season, year, s0, n0)

alldat <- left_join(alldat, allests, by = c("season", "year")) %>%
  left_join(allestsmax, by = c("season", "year"))

The x-axis is the actual number of species and the y-axis is the mean of the ACE and Chao estimators implemented in vegan::estimateR. The black line is the 1:1 line and the blue line is the 1:1.1 line, which represents the 10% I initially added. This is often sufficient but also often conservative. The red dots are the mean maximum estimates obtained using the estimate + the standard error.

ests_plots <- ggplot(data = alldat, aes(x = s0, y = est_s0)) +
  geom_point(aes(x = s0, y = est_s0)) +
  geom_point(aes(x = s0, y = est_s0_max), color = "red")+
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 0, slope = 1.1, color = "blue") +
  theme_bw() +
  facet_wrap(vars(season), ncol = 2, strip.position = "top") +
  ylim(0, max(alldat$est_s0_max, na.rm = T) + 5)

ests_plots

est_datnames <- cached(cache = cache)[ which(substr(cached(cache = cache), 1, 3) == "dat")]
est_datnames <- est_datnames[ grepl(est_datnames, pattern = "singles")]

all_est_cached <- lapply(est_datnames, readd, cache = cache, character_only = T)

names(all_est_cached) <- est_datnames

all_est_cached <- all_est_cached[ which(!is.na(all_est_cached))]

all_est_cached <- bind_rows(all_est_cached, .id = "est_datname")


est_statevars <- all_est_cached %>%
  select(season, year, sim, rank, abund, est_datname) %>%
  group_by(season, year, sim, est_datname) %>%
  summarize(cache_n0 = sum(abund), cache_s0 = n()) %>%
  ungroup() %>%
  select(-sim, -est_datname)


alldat <- alldat %>%
  left_join(est_statevars, by = c("season", "year"))

alldat <- alldat %>%
  mutate(s0_match = est_s0_max == cache_s0,
         n0_match = est_n0_max == cache_n0)

sum(alldat$s0_match, na.rm = T)
sum(alldat$n0_match, na.rm = T)

sum(is.na(alldat$s0_match))
loadd(di_long_df, cache = cache)

di_statevars <- di_long_df %>%
  filter(source == "observed") %>%
  group_by(source, season, year, treatment, singletons) %>%
  summarize(s0 = n(),
            n0 = sum(abund)) %>%
  ungroup()

alldat_statevars_singletons <- alldat_statevars %>%
  select(-s0, -n0) %>%
  mutate(singletons = "singletons") %>%
  rename(S = cache_s0, N = cache_n0)

alldat_statevars <- alldat_statevars %>%
  select(-cache_s0, -cache_n0) %>%
  mutate(singletons = "true") %>%
  rename(S= s0, N = n0)

alldat_statevars <- bind_rows(alldat_statevars, alldat_statevars_singletons)

ranked_di_df <- ranked_di_df %>%
  filter(source == "observed") %>%
  select(-sim) %>%
  mutate(fs_name = substr(di_name, 4, nchar(di_name))) %>%
  left_join(alldat_statevars, by = c("season", "year", "singletons"))
sv <- ranked_di_df %>%
  select(singletons, season, year, S, N) %>%
  distinct()

sv_singles <- filter(sv, singletons == "singletons") %>%
  select(-singletons) %>%
  rename(S_est = S, N_est = N)

sv_wide <- filter(sv, singletons == "true") %>%
  select(-singletons) %>%
  left_join(sv_singles, by = c("season", "year")) %>%
  mutate(s_change = S_est - S) %>%
  mutate(s_change_prop = s_change / S)

Sensitivity of percentile to adding singletons

singletons_1to1_dat <- filter(ranked_di_long, source == "observed", variable %in% c("skew_percentile", "shannon_percentile", "simpson_percentile"), singletons == "true") %>%
  select(-di_name) %>%
  left_join(filter(ranked_di_long, source == "observed", variable %in% c("skew_percentile", "shannon_percentile", "simpson_percentile"), singletons == "singletons"), by = c("sim", "year", "season", "treatment", "source", "variable")) %>%
  select(-sim, -di_name) %>%
  rename(true_value = value.x,
         singletons_value = value.y) %>%
  select(-singletons.x, -singletons.y) %>%
  filter(!is.na(singletons_value))
di_df <- read.csv(here::here("analysis", "drake", "di_df.csv"), stringsAsFactors = F)

fs_size_info <- di_df %>%
  filter(sim != -99, singletons == "true") %>%
  select(season, year, treatment) %>%
  group_by(season, year, treatment) %>%
  summarize(n_elements = n()) %>%
  ungroup() %>%
  left_join(sv_wide, by = c("season", "year"))

Does adding singletons change our conclusions about weirdness?

conclusions_dat <- ranked_di_df %>%
  select(season, year, source, singletons, simpson_percentile, skew_percentile) %>%
  distinct() %>%
  filter(source == "observed",
         !is.na(simpson_percentile), 
         !is.na(skew_percentile)) %>%
  select(-source) %>%
  mutate(skew_outlier_95 = skew_percentile >= 95,
         skew_outlier_97p5 = skew_percentile >= 97.5,
         simpson_outlier_5 = simpson_percentile <= 5,
         simpson_outlier_2p5 = simpson_percentile <= 2.5) %>%
  tidyr::gather(-season, -year, -singletons, key = "variable", value = "value")

singletons_conclusions <- filter(conclusions_dat, singletons == "singletons") %>%
  select(-singletons) %>%
  rename(singletons_value = value)

conclusions_dat2 <- filter(conclusions_dat, singletons == "true") %>%
  select(-singletons) %>%
  left_join(singletons_conclusions, by = c("year", "season", "variable")) %>%
  filter(!is.na(singletons_value)) %>%
  mutate(singletons_change = singletons_value != value) %>%
  left_join(sv_wide, by = c("year", "season")) %>%
  filter(s_change > 0) %>%
  left_join(select(fs_size_info, season, year, n_elements), by = c("season", "year"))

This is a plot of true-value (x) vs. singletons-added (y) percentiles for skewness and Simpson's. Color corresponds to the number of species added because of the estimation. I have removed points where the estimate was equal to the actual value. The black line is the 1:1 line.

conclusions_1to1_plot <- ggplot(data = filter(conclusions_dat2, grepl(variable, pattern = "percentile")), aes(x = value, y = singletons_value, color = s_change_prop)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, alpha  = .2) +
  theme_bw() +
  facet_wrap(vars(variable)) +
    scale_color_viridis_c(option = "magma", begin = .3, end = .9) + ggtitle("Percentiles 1:1") +
  xlab("Percentile from raw data") +
  ylab("Percentile with added singletons")

conclusions_1to1_plot

Let's zoom in on the regions of these plots where there are actually data points.

This is Simpson's. We've zoomed in to the bottom left corner. The purple guidelines are the 2.5 percentile and the blue ones are the 5 percentile. Adding singletons consistently drives the percentile value down, i.e. more extreme. It never changes whether we think the vector is "weird" to 5%, but it does push two samples from "not weird" to "weird" at 2.5%.

simpson_1to1_plot <- ggplot(data = filter(conclusions_dat2, variable == "simpson_percentile"), aes(x = value, y = singletons_value, color = s_change_prop)) +
  geom_point(data = filter(conclusions_dat2, n_elements > 2, variable == "simpson_percentile")) +
  geom_point(data = filter(conclusions_dat2, n_elements < 5, variable == "simpson_percentile"), size = .8) +
  geom_abline(intercept = 0, slope = 1, alpha = .2) +
  geom_hline(yintercept = 2.5, color = "purple", alpha = .5) +
  geom_hline(yintercept = 5, color = "blue", alpha = .5) +
  geom_vline(xintercept = 2.5, color = "purple", alpha = .5) +
  geom_vline(xintercept = 5, color = "blue", alpha = .5) +
  ylim(0, 7) +
  xlim(0, 7) +
  theme_bw() +
  scale_color_viridis_c(option = "magma", begin = .3, end = .9) +
  ggtitle("Simpson 1:1") + 
  xlab("Percentile from raw data") +
  ylab("Percentile with added singletons")
suppressWarnings(print(simpson_1to1_plot))

This is skewness. We've zoomed to the top right. Again, the purple lines are the 97.5, and the blue are 95. The grey is 1:1. Again, adding singletons pushes the sample to a higher, more extreme, percentile. It basically doesn't change our conclusions if we use the 95% mark (except for the one that jumps from 80 to 97.4!), but it does change our conclusions to the 97.5%. For the 97.5, two samples that are not outliers become outliers if we add singletons.

There's also one just on 95/97.5 line, which I'm not counting above.

skew_1to1_plot <- ggplot(data = filter(conclusions_dat2, variable == "skew_percentile"), aes(x = value, y = singletons_value, color = s_change_prop)) +
  geom_point(data = filter(conclusions_dat2, n_elements > 2, variable == "skew_percentile")) +
    geom_point(data = filter(conclusions_dat2, n_elements < 5, variable == "skew_percentile"), size = .8) +
  geom_abline(intercept = 0, slope = 1, alpha = .2) +
  geom_hline(yintercept = 97.5, color = "purple", alpha = .5) +
  geom_hline(yintercept = 95, color = "blue", alpha = .5) +
  geom_vline(xintercept = 97.5, color = "purple", alpha = .5) +
  geom_vline(xintercept = 95, color = "blue", alpha = .5) +
  ylim(80, 100) +
  xlim(80, 100) +
  theme_bw() +
  scale_color_viridis_c(option = "magma", begin = .3, end = .9) +
  ggtitle("Skewness 1:1")  +
  xlab("Percentile from raw data") +
  ylab("Percentile with added singletons")
suppressWarnings(print(skew_1to1_plot))

Sometimes skewness is super small! Zooming in to the bottom left of the skewness plot, it turns out we have 1 point that is 0 and 0. This community has only 2 unique draws from the feasible set, out of 10000, so don't put a lot of stock in it.

skew_1to1_plot_bottom <- ggplot(data = filter(conclusions_dat2, variable == "skew_percentile"), aes(x = value, y = singletons_value, color = s_change_prop)) +
  geom_point(data = filter(conclusions_dat2, n_elements > 2, variable == "skew_percentile")) +
    geom_point(data = filter(conclusions_dat2, n_elements < 5, variable == "skew_percentile"), size = .8) +
  geom_abline(intercept = 0, slope = 1, alpha = .2) +
  geom_hline(yintercept = 2.5, color = "purple", alpha = .5) +
  geom_hline(yintercept = 5, color = "blue", alpha = .5) +
  geom_vline(xintercept = 2.5, color = "purple", alpha = .5) +
  geom_vline(xintercept = 5, color = "blue", alpha = .5) +
  ylim(0, 7) +
  xlim(0, 7) +
  theme_bw() +
  scale_color_viridis_c(option = "magma", begin = .3, end = .9) +
  ggtitle("Skewness 1:1 - low values")
suppressWarnings(print(skew_1to1_plot_bottom))


diazrenata/scadsplants documentation built on Nov. 14, 2019, 2:42 p.m.