knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(scadsanalysis)

if(params$on_hpg) {

  all_di <- list() 
  datasets <- c("bbs", "fia_short", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  #datasets <- c("fia", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  for(i in 1:length(datasets)) {

    this_dataset = datasets[i]
    if(this_dataset == "misc_abund_short") {
      cache_loc = "miscabund"
    } else if(this_dataset == "fia_short") {

      cache_loc = "fia"
    } else {
      cache_loc = this_dataset
    }

    ## Set up the cache and config
    db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite")))
    cache <- storr::storr_dbi("datatable", "keystable", db)

    all_di[[i]] <- readd(all_di_obs, cache = cache)

    all_di[[i]] <- filter(all_di[[i]], source == "observed")

    this_dat <- readd(paste0("dat_s_dat_", this_dataset), cache = cache, character_only = T)
    sv <- get_statevars(this_dat)
    all_di[[i]] <- left_join(all_di[[i]], sv, by = c("sim", "site", "source", "singletons", "dat"))


    DBI::dbDisconnect(db)
    rm(cache)
    rm(db)
  }


  all_di <- bind_rows(all_di)

  write.csv(all_di, "all_di.csv", row.names = F)
} else {
  all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F)
}
skewhists <- ggplot(data = all_di, aes(x = skew_percentile)) +
  geom_histogram(binwidth = 5, alpha = .3, boundary = 0) +
  geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) +
  theme_bw() +
  facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") +
  ggtitle("Skewness") +
  geom_vline(xintercept = 95, color = "red")
skewhists


simpsonhists <- ggplot(data = all_di, aes(x = simpson_percentile)) +
  geom_histogram(binwidth = 5, alpha = .3, boundary = 0) +
  geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) +
  theme_bw() +
  facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") +
  ggtitle("Simpson") +
  geom_vline(xintercept = 5, color = "red")

simpsonhists

Observations from these histograms:

Some summary statistics:

di_summary <- all_di %>%
  filter(nsamples >= 2000) %>%
  group_by(dat, singletons) %>%
  summarize(mean_skew = mean(skew_percentile, na.rm=T),
            sd_skew = sd(skew_percentile, na.rm = T),
            mean_simpson = mean(simpson_percentile, na.rm = T),
            sd_simpson = sd(simpson_percentile, na.rm = T),
            mean_nsamples = mean(nsamples),
            nsites = length(unique(site)),
            prop_skew_high = mean(skew_percentile >= 95),
            prop_simpson_low = mean(simpson_percentile <= 5),
            med_skew = median(skew_percentile, na.rm = T),
            med_simpson = median(simpson_percentile, na.rm = T),
            skew_25 = quantile(skew_percentile, probs = .25),
            simpson_75 = quantile(simpson_percentile, probs = .75)) %>%
  ungroup()


skew_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = skew_percentile, group = singletons)) +
  # geom_point() +
  # geom_errorbar(aes(x = singletons, ymin = skew_25, ymax = med_skew)) +
  # geom_point(data = all_di, aes(x = singletons, y = skew_percentile), size = .5, alpha = .05) +
  # geom_label(aes(x = singletons, y = 10, label = signif(med_skew, digits = 4))) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(dat)) +
  geom_hline(yintercept = 100, linetype = 2, color = "red") +
  ylim(0, 110) +
  ggtitle("Skewness, >= 2000 samples")
skew_boxplot


simpson_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = simpson_percentile, group = singletons)) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(dat)) +
  geom_hline(yintercept = 0, linetype = 2, color = "red") +
  ylim(-10, 100) +
  ggtitle("Simpson's, >= 2000 samples")
simpson_boxplot

print(select(di_summary, dat, singletons, mean_nsamples, nsites, prop_skew_high, prop_simpson_low))

skew_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_skew_high, color = singletons, size = as.factor(nsites))) +
  geom_point() +
  theme_bw() +
  ggtitle("Skew") +
  scale_color_viridis_d(end = .8) +
  ylim(0, 1) +
  geom_hline(yintercept = .05, color = "red") +
  theme(legend.position = "none")

simpson_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_simpson_low, color = singletons, size = as.factor(nsites))) +
  geom_point() +
  theme_bw() +
  ggtitle("Simpson") +
  scale_color_viridis_d(end = .8)+
  ylim(0, 1) +
    geom_hline(yintercept = .05, color = "red") +
  theme(legend.position = "none")

gridExtra::grid.arrange(grobs = list(skew_summary_plot, simpson_summary_plot), nrow = 1)
all_di <- filter(all_di, !singletons)

skew_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = skew_percentile)) +
  geom_point(alpha = .1, size = .5) +
  geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free", ncol = 2) + 
  ggtitle("Skewness") +
  scale_color_viridis_c(option = "plasma", end= .75)

skew_s_n_maps


simpson_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = simpson_percentile)) +
  geom_point(alpha =  .1, size = .5) +
  geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free", ncol = 2)  + 
  ggtitle("Simpson") +
  scale_color_viridis_c(option = "plasma", end= .75, direction = -1)

simpson_s_n_maps

The less extreme (low skew, high simpson) vectors appear vaguely collected in the lower right for Gentry and BBS. Those are the regions with relatively high N/S, aka low average abundance, aka a relatively small feasible set.

Conversely, for MCDB and Misc, it almost looks like the very least skewed elements are way down at tiny abundances?

Do percentiles scan with S/N?

all_di <- mutate(all_di, avg_abund = n0/s0)

skew_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = skew_percentile)) +
  geom_point(alpha = .3) +
  theme_bw() +
  ggtitle("Skew") +
  facet_wrap(vars(dat), scales = "free")

skew_avgabund


simpson_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = simpson_percentile)) +
  geom_point(alpha = .3) +
  theme_bw() +
  ggtitle("Simpson") +
  facet_wrap(vars(dat), scales = "free")

simpson_avgabund

There is perhaps a constraint, but that might be because you rarely get v high average abundances. You seem to rarely get low skewness or high Simpson at high average abundance, which scans with the heatmaps.

Do Simpson and skewness correspond?

ss_plot <- ggplot(data = all_di, aes(x= skew_percentile, y = simpson_percentile, color = dat)) +
  geom_point(alpha = .1) +
  scale_color_viridis_d(end = .8) +
  theme_bw() +
  ggtitle("Skew v Simp")

ss_plot

ss_facet_plot <- ss_plot +
  facet_wrap(vars(dat))

ss_facet_plot

There's a weak relationship between Simpson and skewness, but a lot of noise. They are not substitutable.

            Why is Gentry so strangely bimodal? It's like a U, when all the others tend towards one end or the other.

Why are BBS and mammals less frequently squished than Portal?

Looking at range of skewness and Simpsons:

skew_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = skew_range, y = skew_percentile, color = dat)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm", se =F) +
  theme_bw() +
  scale_color_viridis_d(end = .8) +
  facet_wrap(vars(dat), scales = "free") + 
  ggtitle("Skewness")

skew_range_hist



simpson_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = simpson_range, y = simpson_percentile, color = dat)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm", se =F) +

  theme_bw() +
  scale_color_viridis_d(end = .8) +
  facet_wrap(vars(dat), scales = "free") +
  ggtitle("Simpson")

simpson_range_hist

I added the ranges because I thought the percentiles might be constrained somewhat by the range of values represented in the feasible set. It doesn't look to me like there is a strong relationship between range and %ile.

mcdb_di <- filter(all_di, dat == "mcdb")

mcdb_sites <- read.csv(here::here("working-data/site_data/mammal_community_db_sites.csv"), stringsAsFactors = F)

mcdb_sites <- mcdb_sites %>%
  select(site_id, n_years, study_duration) %>%
  rename(site = site_id) %>%
  mutate(site = as.character(site))

mcdb_di <- left_join(mcdb_di, mcdb_sites, by = "site")

nyears_hist <- ggplot(data = distinct(select(mcdb_di, site, n_years)), aes(x = n_years)) +
  geom_histogram(pad = T, boundary = 0) +
  theme_bw() +
  ggtitle("Nyears in MCDB")

nyears_hist
# 
# duration_hist <-  ggplot(data = distinct(select(mcdb_di, site, study_duration)), aes(x = study_duration)) +
#   geom_histogram(pad = T, boundary = 0) +
#   theme_bw() +
#   ggtitle("Duration in MCDB")
# 
# duration_hist

nyears_skew <- ggplot(data = mcdb_di, aes(x = n_years, y = skew_percentile)) +
  geom_point(alpha = .2) +
  #facet_wrap(vars(singletons)) +
  theme_bw() +
  ggtitle("Skew v nyears (MCDB)")

nyears_skew


nyears_simpson <- ggplot(data = mcdb_di, aes(x = n_years, y = simpson_percentile)) +
  geom_point(alpha = .2) +
  #facet_wrap(vars(singletons)) +
  theme_bw() +
  ggtitle("Simpson v nyears (MCDB)")

nyears_simpson


diazrenata/scadsanalysis documentation built on May 14, 2021, 6:59 p.m.