In diazrenata/cats: CATS

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(cats)


## Set up the cache and config
db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", "drake-cache-bbs.sqlite"))
cache <- storr::storr_dbi("datatable", "keystable", db)

loadd(all_results, cache = cache)

alldat <- as.list(cached(cache = cache)[ which(substr(cached(cache = cache), 0, 6) == "s_spab")])
alldat <- lapply(alldat, FUN = readd, cache = cache, character_only = T)
allsv <- lapply(alldat, FUN = get_statevars_ts)
allsv <- bind_rows(allsv)

all_results <- left_join(all_results, allsv, by = c("site", "singletons", "dat", "source", "sim", "timestep"))

all_results <- all_results %>%
  group_by(site, singletons, dat, source) %>%
  mutate(mean_skew = mean(skew_percentile),
         sd_skew = sd(skew_percentile),
         mean_simpson = mean(simpson_percentile),
         sd_simpson = sd(simpson_percentile),
         med_skew = median(skew_percentile),
         med_simpson = median(simpson_percentile),
         skew_25 = quantile(skew_percentile, probs = .25),
         simpson_75 = quantile(simpson_percentile, probs = .75),
         skew_high = 100 * mean(skew_percentile >= 95),
         simpson_low = 100*mean(simpson_percentile <= 5)) %>%
  ungroup()

Nsamples

The maximum number of samples is r max(all_results$nsamples).

The minimum number of samples is r min(all_results$nsamples).

nsamples_hist <- ggplot(data = all_results, aes(x = nsamples)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(vars(site, singletons), ncol = 2, scales = "free_y") +
  xlim(-10, max(all_results$nsamples) + 10) +
  theme_bw() +
  ggtitle("Nsamples")
#nsamples_hist

s0_ts <- ggplot(data = filter(all_results, nsamples == max(all_results$nsamples)), aes(x = timestep, y = s0, color = singletons)) + 
  geom_line() +
  theme_bw() +
  facet_wrap(vars(site), ncol = 1) +
  scale_color_viridis_d(end =.8) +
  ggtitle("S") +
  theme(legend.position = "none")


skew_ts <- ggplot(data = filter(all_results, nsamples == max(all_results$nsamples)), aes(x = timestep, y = skew_percentile, color = singletons, group = singletons)) +
  geom_line() +
  geom_label(aes(x = 1980, y = 50, label = signif(mean_skew, 3)), position = position_dodge(15)) +
  geom_label(aes(x = 1980, y = 15, label = signif(sd_skew, 2)), position = position_dodge(15), size = 3) +
  theme_bw() +
  facet_wrap(vars(site), ncol = 1) +
  scale_color_viridis_d(end =.8) +
  ggtitle("Skew") +
  ylim(0, 100)+
  theme(legend.position = "none")

# gridExtra::grid.arrange(grobs = list(skew_ts, s0_ts), ncol = 2)


simpson_ts <- ggplot(data = filter(all_results, nsamples == max(all_results$nsamples)), aes(x = timestep, y = simpson_percentile, color = singletons)) +
  geom_line() +
  theme_bw() +
  geom_label(aes(x = 1980, y = 50, label = signif(mean_simpson, 3)), position = position_dodge(25)) +
  geom_label(aes(x = 1980, y = 15, label = signif(sd_simpson, 2)), position = position_dodge(25), size = 3) +

  facet_wrap(vars(site), ncol = 1) +
  scale_color_viridis_d(end =.8) +
  ggtitle("Simpson") +
  ylim(0, 100)+
  theme(legend.position = "none")


# gridExtra::grid.arrange(grobs = list(simpson_ts, s0_ts), ncol = 2)

Percentile values by site

These are histograms of the skewness/simpson's percentile values of all years for each site. The purple dots are the x-value (percentile value) for 2009.

Questions here are:

How pointy/broad are these histograms? (More rigorous metrics later, this is just a gestalt)
Is 2009 generally an outlier, or not?

skew_hist <- ggplot(data = filter(all_results, !singletons), aes(x = skew_percentile)) +
  geom_histogram(binwidth = 5, pad = T) +
  # geom_label(aes(x = 10, y = 5, label = round(skew_high, digits = 1))) +
  geom_point(data = filter(all_results, !singletons, timestep == 2009), aes(x = skew_percentile, y = 5), color = "purple") +
  theme_bw() +
  facet_wrap(vars(site))
skew_hist


simpson_hist <- ggplot(data = filter(all_results, !singletons), aes(x = simpson_percentile)) +
  geom_histogram(binwidth = 5, pad = T) +
  #  geom_label(aes(x = 10, y = 5, label = round(simpson_low, digits = 1))) +
  geom_point(data = filter(all_results, !singletons, timestep == 2009), aes(x = simpson_percentile, y = 5), color = "purple") +

  theme_bw() +
  facet_wrap(vars(site))
simpson_hist

My gleanings from these:

2009 is not particularly weird, but there is a lot of variation that you simply won't capture by pulling out one year.
Simpson's is consistently more extreme than skewness; this is consistent with scadsplants, scadsanalysis.
This needs to be addressed more rigorously, but it looks like sites with more extreme centers-of-mass have less general variation than sites that do not? This is probably constrained because the percentile scale has hard ends at 0 and 100.

skew_lowerq <- all_results %>%
  filter(!singletons) %>%
  mutate(lowerq = med_skew - skew_25) %>%
  select(site, lowerq) %>%
  distinct() %>%
  arrange((lowerq)) %>%
  mutate(lowerq_rank = row_number()) %>%
  right_join(filter(all_results, !singletons), by = "site")

skew_whiskers <- ggplot(data = skew_lowerq, aes(x = skew_percentile, y = lowerq_rank)) +
  geom_point(aes(x = med_skew, y= lowerq_rank)) +
  geom_errorbarh(aes(xmin = skew_25, xmax = med_skew, y= lowerq_rank)) +
  geom_point(aes(x = skew_percentile, y = lowerq_rank), alpha = .2, size = 1) +
  theme_bw()

skew_whiskers


simpson_upperq <- all_results %>%
  filter(!singletons) %>%
  mutate(upperq = simpson_75 - med_simpson) %>%
  select(site, upperq) %>%
  distinct() %>%
  arrange((upperq)) %>%
  mutate(upperq_rank = row_number()) %>%
  right_join(filter(all_results, !singletons), by = "site")

simpson_whiskers <- ggplot(data = simpson_upperq, aes(x = simpson_percentile, y = upperq_rank)) +
  geom_point(aes(x = med_simpson, y= upperq_rank)) +
  geom_errorbarh(aes(xmin = simpson_75, xmax = med_simpson, y= upperq_rank)) +
  geom_point(aes(x = simpson_percentile, y = upperq_rank), alpha = .2, size = 1) +
  theme_bw()

simpson_whiskers