In diazrenata/scadsanalysis: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(scadsanalysis)

if(params$on_hpg) {

  all_di <- list() 
  datasets <- c("bbs", "fia_short", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  #datasets <- c("fia", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  for(i in 1:length(datasets)) {

    this_dataset = datasets[i]
    if(this_dataset == "misc_abund_short") {
      cache_loc = "miscabund"
    } else if(this_dataset == "fia_short") {

      cache_loc = "fia"
    } else {
      cache_loc = this_dataset
    }

    ## Set up the cache and config
    db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite")))
    cache <- storr::storr_dbi("datatable", "keystable", db)

    all_di[[i]] <- readd(all_di_obs, cache = cache)

    all_di[[i]] <- filter(all_di[[i]], source == "observed")

    this_dat <- readd(paste0("dat_s_dat_", this_dataset), cache = cache, character_only = T)
    sv <- get_statevars(this_dat)
    all_di[[i]] <- left_join(all_di[[i]], sv, by = c("sim", "site", "source", "singletons", "dat"))


    DBI::dbDisconnect(db)
    rm(cache)
    rm(db)
  }


  all_di <- bind_rows(all_di)

  write.csv(all_di, "all_di.csv", row.names = F)
} else {
  all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F)
}

skewhists <- ggplot(data = all_di, aes(x = skew_percentile)) +
  geom_histogram(binwidth = 5, alpha = .3, boundary = 0) +
  geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) +
  theme_bw() +
  facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") +
  ggtitle("Skewness") +
  geom_vline(xintercept = 95, color = "red")
skewhists


simpsonhists <- ggplot(data = all_di, aes(x = simpson_percentile)) +
  geom_histogram(binwidth = 5, alpha = .3, boundary = 0) +
  geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) +
  theme_bw() +
  facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") +
  ggtitle("Simpson") +
  geom_vline(xintercept = 5, color = "red")

simpsonhists

Observations from these histograms:

FIA really struggles to get 2500 samples
Portal is the most concentrated of all the datasets.
Then MCDB and misc, then gentry, then bbs. FIA is hard to place but looks idiosyncratic.
Curious about how variation in %ile maps on to S and N, particularly for Gentry and BBS.
Skewness is less extreme than Simpson (more density in the in-between).
Singletons doesn't appear to be terrifically impactful. From now on filtering to unaltered vectors.
I think it's best to filter to at least 2000 samples.

Some summary statistics:

di_summary <- all_di %>%
  filter(nsamples >= 2000) %>%
  group_by(dat, singletons) %>%
  summarize(mean_skew = mean(skew_percentile, na.rm=T),
            sd_skew = sd(skew_percentile, na.rm = T),
            mean_simpson = mean(simpson_percentile, na.rm = T),
            sd_simpson = sd(simpson_percentile, na.rm = T),
            mean_nsamples = mean(nsamples),
            nsites = length(unique(site)),
            prop_skew_high = mean(skew_percentile >= 95),
            prop_simpson_low = mean(simpson_percentile <= 5),
            med_skew = median(skew_percentile, na.rm = T),
            med_simpson = median(simpson_percentile, na.rm = T),
            skew_25 = quantile(skew_percentile, probs = .25),
            simpson_75 = quantile(simpson_percentile, probs = .75)) %>%
  ungroup()


skew_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = skew_percentile, group = singletons)) +
  # geom_point() +
  # geom_errorbar(aes(x = singletons, ymin = skew_25, ymax = med_skew)) +
  # geom_point(data = all_di, aes(x = singletons, y = skew_percentile), size = .5, alpha = .05) +
  # geom_label(aes(x = singletons, y = 10, label = signif(med_skew, digits = 4))) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(dat)) +
  geom_hline(yintercept = 100, linetype = 2, color = "red") +
  ylim(0, 110) +
  ggtitle("Skewness, >= 2000 samples")
skew_boxplot


simpson_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = simpson_percentile, group = singletons)) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(dat)) +
  geom_hline(yintercept = 0, linetype = 2, color = "red") +
  ylim(-10, 100) +
  ggtitle("Simpson's, >= 2000 samples")
simpson_boxplot

print(select(di_summary, dat, singletons, mean_nsamples, nsites, prop_skew_high, prop_simpson_low))

skew_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_skew_high, color = singletons, size = as.factor(nsites))) +
  geom_point() +
  theme_bw() +
  ggtitle("Skew") +
  scale_color_viridis_d(end = .8) +
  ylim(0, 1) +
  geom_hline(yintercept = .05, color = "red") +
  theme(legend.position = "none")

simpson_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_simpson_low, color = singletons, size = as.factor(nsites))) +
  geom_point() +
  theme_bw() +
  ggtitle("Simpson") +
  scale_color_viridis_d(end = .8)+
  ylim(0, 1) +
    geom_hline(yintercept = .05, color = "red") +
  theme(legend.position = "none")

gridExtra::grid.arrange(grobs = list(skew_summary_plot, simpson_summary_plot), nrow = 1)

all_di <- filter(all_di, !singletons)

skew_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = skew_percentile)) +
  geom_point(alpha = .1, size = .5) +
  geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free", ncol = 2) + 
  ggtitle("Skewness") +
  scale_color_viridis_c(option = "plasma", end= .75)

skew_s_n_maps


simpson_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = simpson_percentile)) +
  geom_point(alpha =  .1, size = .5) +
  geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free", ncol = 2)  + 
  ggtitle("Simpson") +
  scale_color_viridis_c(option = "plasma", end= .75, direction = -1)

simpson_s_n_maps

The less extreme (low skew, high simpson) vectors appear vaguely collected in the lower right for Gentry and BBS. Those are the regions with relatively high N/S, aka low average abundance, aka a relatively small feasible set.

Conversely, for MCDB and Misc, it almost looks like the very least skewed elements are way down at tiny abundances?

Do percentiles scan with S/N?

all_di <- mutate(all_di, avg_abund = n0/s0)

skew_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = skew_percentile)) +
  geom_point(alpha = .3) +
  theme_bw() +
  ggtitle("Skew") +
  facet_wrap(vars(dat), scales = "free")

skew_avgabund


simpson_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = simpson_percentile)) +
  geom_point(alpha = .3) +
  theme_bw() +
  ggtitle("Simpson") +
  facet_wrap(vars(dat), scales = "free")

simpson_avgabund

There is perhaps a constraint, but that might be because you rarely get v high average abundances. You seem to rarely get low skewness or high Simpson at high average abundance, which scans with the heatmaps.

Do Simpson and skewness correspond?

ss_plot <- ggplot(data = all_di, aes(x= skew_percentile, y = simpson_percentile, color = dat)) +
  geom_point(alpha = .1) +
  scale_color_viridis_d(end = .8) +
  theme_bw() +
  ggtitle("Skew v Simp")

ss_plot

ss_facet_plot <- ss_plot +
  facet_wrap(vars(dat))

ss_facet_plot

There's a weak relationship between Simpson and skewness, but a lot of noise. They are not substitutable.

            Why is Gentry so strangely bimodal? It's like a U, when all the others tend towards one end or the other.

Why are BBS and mammals less frequently squished than Portal?

Looking at range of skewness and Simpsons:

skew_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = skew_range, y = skew_percentile, color = dat)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm", se =F) +
  theme_bw() +
  scale_color_viridis_d(end = .8) +
  facet_wrap(vars(dat), scales = "free") + 
  ggtitle("Skewness")

skew_range_hist



simpson_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = simpson_range, y = simpson_percentile, color = dat)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm", se =F) +

  theme_bw() +
  scale_color_viridis_d(end = .8) +
  facet_wrap(vars(dat), scales = "free") +
  ggtitle("Simpson")

simpson_range_hist

I added the ranges because I thought the percentiles might be constrained somewhat by the range of values represented in the feasible set. It doesn't look to me like there is a strong relationship between range and %ile.

mcdb_di <- filter(all_di, dat == "mcdb")

mcdb_sites <- read.csv(here::here("working-data/site_data/mammal_community_db_sites.csv"), stringsAsFactors = F)

mcdb_sites <- mcdb_sites %>%
  select(site_id, n_years, study_duration) %>%
  rename(site = site_id) %>%
  mutate(site = as.character(site))

mcdb_di <- left_join(mcdb_di, mcdb_sites, by = "site")

nyears_hist <- ggplot(data = distinct(select(mcdb_di, site, n_years)), aes(x = n_years)) +
  geom_histogram(pad = T, boundary = 0) +
  theme_bw() +
  ggtitle("Nyears in MCDB")

nyears_hist
# 
# duration_hist <-  ggplot(data = distinct(select(mcdb_di, site, study_duration)), aes(x = study_duration)) +
#   geom_histogram(pad = T, boundary = 0) +
#   theme_bw() +
#   ggtitle("Duration in MCDB")
# 
# duration_hist

nyears_skew <- ggplot(data = mcdb_di, aes(x = n_years, y = skew_percentile)) +
  geom_point(alpha = .2) +
  #facet_wrap(vars(singletons)) +
  theme_bw() +
  ggtitle("Skew v nyears (MCDB)")

nyears_skew


nyears_simpson <- ggplot(data = mcdb_di, aes(x = n_years, y = simpson_percentile)) +
  geom_point(alpha = .2) +
  #facet_wrap(vars(singletons)) +
  theme_bw() +
  ggtitle("Simpson v nyears (MCDB)")

nyears_simpson