knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(scadsanalysis)

if(params$on_hpg) {

  all_di <- list() 
  datasets <- c("bbs", "fia_short", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  #datasets <- c("fia", "gentry", "mcdb", "misc_abund_short", "portal_plants")

  for(i in 1:length(datasets)) {

    this_dataset = datasets[i]
    if(this_dataset == "misc_abund_short") {
      cache_loc = "miscabund"
    } else if(this_dataset == "fia_short") {

      cache_loc = "fia"
    } else {
      cache_loc = this_dataset
    }

    ## Set up the cache and config
    db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite")))
    cache <- storr::storr_dbi("datatable", "keystable", db)

    all_di[[i]] <- readd(all_di_obs, cache = cache)

    all_di[[i]] <- filter(all_di[[i]], source == "observed")

    this_dat <- readd(paste0("dat_s_dat_", this_dataset), cache = cache, character_only = T)
    sv <- get_statevars(this_dat)
    all_di[[i]] <- left_join(all_di[[i]], sv, by = c("sim", "site", "source", "singletons", "dat"))


    DBI::dbDisconnect(db)
    rm(cache)
    rm(db)
  }


  all_di <- bind_rows(all_di)

  write.csv(all_di, "all_di.csv", row.names = F)
} else {
  all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F)
}

Number of draws in feasible set

For low values of S and N, we expect to get considerably fewer than 2500 unique draws from the feasible set. Small feasible sets present a couple of problems for this percentile approach:

It's not clear a priori what actual values of S and N will push us into small-community problems. Again, beyond the really small communities, it really depends on the ratio. Early on I just filtered out anything that gave fewer than 2000 unique draws. This has the advantage that we only end up comparing %ile values with reasonably comparable precision, but it might artificially exclude an important region on S*N space.

ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = log(nsamples))) +
  geom_point(data = filter(all_di, nsamples >= 2000), color = "grey", alpha = .25) +
  geom_point(data = filter(all_di, nsamples <2000), alpha = .1) + 
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)

ggplot(data=all_di, aes(x = log(s0), y = log(n0), color = log(n0/s0))) +    geom_point(alpha = .5) + 
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)
# ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = log(nsamples))) +
#    geom_point(data = filter(all_di, nsamples <2000), alpha = .1) + 
#  theme_bw() +
#  scale_color_viridis_c(option = "plasma", end = .9)
# 


ggplot(data = all_di, aes(x = log(n0/s0), y = nsamples, color = log(s0))) +
  geom_point(alpha = .1) +
  theme_bw() +
  geom_hline(yintercept = 2000, color = "black") +
  scale_color_viridis_c(option = "plasma", end =.9)

ggplot(data = all_di, aes(x = log(n0/s0), y = nsamples, color = log(n0))) +
  geom_point(alpha = .1) +
  theme_bw() +
  geom_hline(yintercept = 2000, color = "black") +
  scale_color_viridis_c(option = "plasma", end =.9)




ggplot(data = filter(all_di, log(n0/s0) <= 2), aes(x = log(n0/s0), y = nsamples, color = log(s0))) +
  geom_point(alpha = .1) +
  theme_bw() +
  geom_hline(yintercept = 2000, color = "black") +
  scale_color_viridis_c(option = "plasma", end =.9)

ggplot(data = filter(all_di, log(n0/s0) <= 2), aes(x = log(n0/s0), y = nsamples, color = log(n0))) +
  geom_point(alpha = .1) +
  theme_bw() +
  geom_hline(yintercept = 2000, color = "black") +
  scale_color_viridis_c(option = "plasma", end =.9)

ggplot(data = all_di, aes(x = log(n0/s0))) +
  geom_histogram(alpha = .3) +
  geom_histogram(data = filter(all_di, nsamples <= 2000)) +
  theme_bw()

In the first plot, the grey region is nsamples >= 2000. In the histogram, the light grey is all the samples and the bold is samples where nsamples <= 2000.

Looking at log(n0/s0) vs. nsamples,

Where the datasets fall

ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = dat)) + 
  geom_point(alpha = .2) +
  scale_color_viridis_d() +
  theme_bw() +
  geom_point(data = filter(all_di, dat == "I don't exist!")) # for a legend with alpha = 1

ggplot(data = all_di, aes(x = nsamples)) +
  geom_histogram() +
  theme_bw()+
  facet_wrap(vars(dat), scales = "free_y") +
  geom_vline(xintercept = 2000)


ggplot(data = all_di, aes(x = log(n0/s0))) +
  geom_histogram(alpha = .3) +
  geom_histogram(data = filter(all_di, nsamples <= 2000)) +
  theme_bw()+
  facet_wrap(vars(dat), scales = "free_y")

The datasets overlap but do occupy broadly different regions of S*N space. The arm - which is the weirdest region of the percentile values - is, unfortunately, 100% one dataset (Gentry).

For the vast majority of S*N space here, I don't think we can really use dataset as a predictor of percentile, because of the way they occupy different parts of what we know to be a very important range of variation.

We could narrow in on the region of maximum overlap/minimum variation and make comparison based on the sites only within that.

FIA, dramatically more than any other dataset, struggles to get even 2000 samples.

Percentile value vs. range of variation of FS

ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = simpson_mean)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  theme(legend.position = "top") +
  ggtitle("Simpson mean")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = simpson_sd)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Simpson sd")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = simpson_range)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Simpson range")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = simpson_percentile)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Simpson percentile")


ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = simpson_mean)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  theme(legend.position = "top") +
  ggtitle("Simpson mean")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = simpson_sd)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Simpson sd")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = simpson_range)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Simpson range")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = simpson_percentile)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Simpson percentile")

ggplot(data = filter(all_di, nsamples >= 2000, singletons == F), aes(x = log(n0/s0), y = skew_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()

So for Simpson's, the mean, sd, and range of values within the feasible set all clearly vary with S and N. There seems to be an edge situation with the percentiles. The really high values are all out along the arm and, to a lesser extent, where N/S is relatively small. This is not a region of pronounced variation in the FS characteristics. Also, there is variation that clearly does not map on to the gradients in the FS characteristics.

ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = skew_mean)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1) +
  theme(legend.position = "top") +
  ggtitle("Skew mean")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = skew_sd)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Skew sd")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = skew_range)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Skew range")
ggplot(data = all_di, aes(x = log(s0), y = log(n0), color = skew_percentile)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Skew percentile")


ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = skew_mean)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1) +
  theme(legend.position = "top") +
  ggtitle("Skew mean")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = skew_sd)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Skew sd")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = skew_range)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9, direction = -1)+
  theme(legend.position = "top") +
  ggtitle("Skew range")
ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0), color = skew_percentile)) +
  geom_point(alpha = .07) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9)+
  theme(legend.position = "top") +
  ggtitle("Skew percentile")

ggplot(data = filter(all_di, nsamples >= 2000, singletons == F), aes(x = log(n0/s0), y = simpson_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()

There are similar, if less strong, gradients in skewness over the range of S and N; it's particularly low up the low N/S arm. The percentile values are not as different in that arm than the rest of the space (unlike with Simpson's). The variation in skewness percentile doesn't appear to track the gradients in the characteristics of the feasible set. Maybe it does a little? But no argument that there's a lot of variation over and above.

I feel like it might be good to find some way to test these statements quantitatively, but it's very tricky. Absolutely everything depends on S and N and moves in weird nonlinear ways.

These scatterplots are good for seeing where the variation is but not the density; there's a lot of points on top of each other down at 0/up at 100.

Overall percentile results

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = skew_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = simpson_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

Here, the dotted lines mark the 1%; at random, percentile values should be uniformly distributed with 1% per bin on these histograms.

Both Simpson's and skewness are disproportiately in the extremes: from about the 75th percentile on up for skewness, and maybe the 10th percentile and below for evenness.

It's not ubiquitous! Often things are unremarkable compared to the feasible set. That said, more often than we'd expect, real distributions are highly skewed/highly uneven compared to their feasible sets.

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = simpson_range, y = simpson_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = simpson_mean, y = simpson_percentile)) +
  geom_point(alpha = .05) +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = simpson_sd, y = simpson_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = skew_range, y = skew_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = skew_mean, y = skew_percentile)) +
  geom_point(alpha = .05) +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = skew_sd, y = skew_percentile)) +
  geom_point(alpha = .1) +
  theme_bw()

Effect of singletons

singletons_effect <- all_di %>%
  select(dat, nsamples, site, singletons, skew_percentile, simpson_percentile) %>%
  tidyr::pivot_wider(names_from = singletons, values_from = c(skew_percentile, simpson_percentile, nsamples)) %>%
  mutate(skewdiff = skew_percentile_TRUE - skew_percentile_FALSE,
         simpdiff= simpson_percentile_TRUE - simpson_percentile_FALSE) %>%
  filter(nsamples_TRUE >= 2000, nsamples_FALSE >= 2000)

ggplot(data = singletons_effect, aes(x = skew_percentile_FALSE, y = skew_percentile_TRUE)) +
  geom_point(alpha = .2) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green")

ggplot(data = singletons_effect, aes(x = simpson_percentile_FALSE, y = simpson_percentile_TRUE)) +
  geom_point(alpha = .2) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green")

The rarefaction-inflated datasets are strongly // the raw vectors. They have more extreme skewness and evenness values, relative to their feasible sets, than the raw vectors. This is almost always true for evenness, with a little more noise in the skewness signal. But either way, very strong.

Broken out by dataset

make_dat_hist <- function(datname, varname, data) {

  if(varname == "skew") {

    dat_hist <- ggplot(data = filter(data, nsamples >= 2000, singletons == FALSE, dat == datname), aes(x = skew_percentile)) 
  } else if (varname == "simpson") {
    dat_hist <- ggplot(data = filter(data, nsamples >= 2000, singletons == FALSE, dat == datname), aes(x = simpson_percentile)) 
  }

  dat_hist <- dat_hist +
    geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
    theme_bw() +
    geom_hline(yintercept = .01, linetype = 3) +
    ggtitle(paste0(datname, ": ", varname))

  return(dat_hist)
}

simp_hists <- lapply(as.list(unique(all_di$dat)), FUN = make_dat_hist, varname = "simpson", data = all_di)

skew_hists <- lapply(as.list(unique(all_di$dat)), FUN = make_dat_hist, varname = "skew", data = all_di)

gridExtra::grid.arrange(grobs = simp_hists)

gridExtra::grid.arrange(grobs = skew_hists)

Evennes is consistently more concentrated in the extremes than skewness.

Gentry has a weird U going on, where it has a lot of weirdly low/high values. All the others are concentrated as low (evenness) or high (skew). BBS and FIA have the most that are in the intermediate zone.

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = simpson_range)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = simpson_mean)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = simpson_sd)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = simpson_percentile)) +
  geom_boxplot() +
  theme_bw()
# Removing low n/s
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = simpson_range)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = simpson_mean)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = simpson_sd)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = simpson_percentile)) +
  geom_boxplot() +
  theme_bw()

ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = skew_range)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = skew_mean)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = skew_sd)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE), aes(x = dat, y = skew_percentile)) +
  geom_boxplot() +
  theme_bw()
# Removing low n/s
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = skew_range)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = skew_mean)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = skew_sd)) +
  geom_boxplot() +
  theme_bw()
ggplot(data = filter(all_di, nsamples >= 2000, singletons == FALSE, log(n0/s0) >= 1.5), aes(x = dat, y = skew_percentile)) +
  geom_boxplot() +
  theme_bw()

Effect of singletons

singletons_effect <- all_di %>%
  select(dat, nsamples, site, singletons, skew_percentile, simpson_percentile) %>%
  tidyr::pivot_wider(names_from = singletons, values_from = c(skew_percentile, simpson_percentile, nsamples)) %>%
  mutate(skewdiff = skew_percentile_TRUE - skew_percentile_FALSE,
         simpdiff= simpson_percentile_TRUE - simpson_percentile_FALSE) %>%
  filter(nsamples_TRUE >= 2000, nsamples_FALSE >= 2000)

ggplot(data = singletons_effect, aes(x = skew_percentile_FALSE, y = skew_percentile_TRUE)) +
  geom_point(alpha = .2) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  facet_wrap(vars(dat))

ggplot(data = singletons_effect, aes(x = simpson_percentile_FALSE, y = simpson_percentile_TRUE)) +
  geom_point(alpha = .2) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  facet_wrap(vars(dat))

There's some fuzz, most pronouncedly for BBS and FIA. Those are also the ones with 1) the most points and 2) the most fuzz/uniform-distributed percentile values.

MACD

cache_loc = "macdb"

## Set up the cache and config
db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite")))
cache <- storr::storr_dbi("datatable", "keystable", db)

all_di_macd <- readd(all_di_manip, cache = cache) 
macd_dat <- readd(dat_s_dat_macdb, cache = cache)

DBI::dbDisconnect(db)
rm(cache)

macd_dat <- macd_dat %>%
  group_by(site, dat, singletons, sim, source) %>%
  summarize(s0 = max(rank),
            n0 = sum(abund)) %>%
  ungroup()

all_di_macd <- left_join(all_di_macd, macd_dat, by = c("site", "dat", "singletons", "sim", "source"))

ggplot(data = all_di, aes(x = log(s0), y = log(n0))) + geom_point(alpha = .1) + geom_point(data = all_di_macd, aes(x = log(s0), y = log(n0)), color = "blue", alpha = .9) + theme_bw()


ggplot(data = filter(all_di, nsamples >= 2000), aes(x = log(s0), y = log(n0))) + geom_point(alpha = .1) + geom_point(data = filter(all_di_macd, nsamples >= 2000), aes(x = log(s0), y = log(n0)), color = "blue", alpha = .9) + theme_bw()

Nsamples

ggplot(data = all_di_macd, aes(x = nsamples)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  geom_vline(xintercept = 2000)

all_di_macd <- filter(all_di_macd, nsamples >= 2000)

ggplot(data = all_di_macd, aes(x = skew_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

ggplot(data = all_di_macd, aes(x = simpson_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

By treatment - overall

ggplot(data = all_di_macd, aes(x = skew_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free_y")

ggplot(data = all_di_macd, aes(x = simpson_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free_y")
macd_comparisons <- read.csv(here::here("working-data", "macdb_data", "orderedcomparisons.csv"), header = F, stringsAsFactors = F)

colnames(macd_comparisons) <- c("studyID", "control", "site")
macd_comparisons <- macd_comparisons %>%
  mutate(site = as.character(site), control = as.character(control))

cc_di <- all_di_macd %>%
  filter(singletons == FALSE, treatment == "comparison") %>%
  left_join(macd_comparisons, by = c("studyID", "site")) %>%
  select(dat, site, skew_percentile, simpson_percentile, studyID, control) %>%
  rename(comparison = site) %>%
  mutate(ctrl_skew_percentile = NA,
         ctrl_simpson_percentile = NA)

for(i in 1:nrow(cc_di)) {

  control_row <- filter(all_di_macd, site == cc_di$control[i], singletons == F) %>%
    unique()
  if(nrow(control_row) >0 ) {
    cc_di$ctrl_simpson_percentile[i] <- control_row$simpson_percentile
    cc_di$ctrl_skew_percentile[i] <- control_row$skew_percentile
  }
}

cc_di <- cc_di %>%
  mutate(simpson_change = simpson_percentile - ctrl_simpson_percentile,
         skew_change = skew_percentile - ctrl_skew_percentile)

ggplot(data = cc_di, aes(x = skew_percentile, y = ctrl_skew_percentile)) +
  geom_point(alpha = .5) +
  #  xlim(0, 100) +
  # ylim(0, 100) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green")

ggplot(data = cc_di, aes(x = simpson_percentile, y = ctrl_skew_percentile)) +
  geom_point(alpha = .5) +
  xlim(0, 100) +
  ylim(0, 100) +
  theme_bw() +
  geom_abline(intercept = 0, slope = 1, color = "green")


ggplot(data = cc_di, aes(x = simpson_change)) +
  geom_histogram() +
  theme_bw()

ggplot(data = cc_di, aes(x = skew_change)) +
  geom_histogram() +
  theme_bw()

ggplot(data = cc_di, aes(x = ctrl_simpson_percentile, y = simpson_change)) +
  geom_point(alpha = .3) +
  geom_hline(yintercept = 0) +
  theme_bw()


ggplot(data = cc_di, aes(x = ctrl_skew_percentile, y = skew_change)) +
  geom_point(alpha = .3) +
  geom_hline(yintercept = 0) +
  theme_bw()


print(t.test(cc_di$skew_percentile, cc_di$ctrl_skew_percentile, paired = T)
)
print(t.test(cc_di$simpson_percentile, cc_di$ctrl_simpson_percentile, paired = T)
)

print(wilcox.test(cc_di$skew_percentile, cc_di$ctrl_skew_percentile, paired = T))
print(wilcox.test(cc_di$simpson_percentile, cc_di$ctrl_simpson_percentile, paired = T))

Change is going to be bounded at 100 and 0: you can't go up or down from there. (Another argument for increasing the number of samples?)

Portal plant manips

cache_loc = "portal_plants_manip"

## Set up the cache and config
db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite")))
cache <- storr::storr_dbi("datatable", "keystable", db)

all_di_p<- readd(all_di_manip, cache = cache) 

DBI::dbDisconnect(db)
rm(cache)

Nsamples, singletons

ggplot(data = all_di_p, aes(x = nsamples)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  geom_vline(xintercept = 2000)

all_di_p <- filter(all_di_p, nsamples >= 2000)

ggplot(data = all_di_p, aes(x = skew_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

ggplot(data = all_di_p, aes(x = simpson_percentile)) +
  geom_histogram(bins = 100, aes(y = stat(count / sum(count)))) +
  theme_bw() +
  geom_hline(yintercept = .01, linetype = 3)

pp_singletons_effect <- all_di_p %>%
  select(singletons, skew_percentile, simpson_percentile, year, plot, treatment, season) %>%
  tidyr::pivot_wider(names_from = singletons, values_from = c("skew_percentile", "simpson_percentile"))

ggplot(data = pp_singletons_effect, aes(x = skew_percentile_FALSE, y  = skew_percentile_TRUE)) +
  geom_point(alpha = .1) +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  theme_bw()


ggplot(data = pp_singletons_effect, aes(x = simpson_percentile_FALSE, y  = simpson_percentile_TRUE)) +
  geom_point(alpha = .1) +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  theme_bw()

By treatment, season

ggplot(data = filter(all_di_p, singletons == F), aes(x = skew_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(season, treatment), scales = "free_y")

ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = skew_percentile)) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(season))

ggplot(data = filter(all_di_p, singletons == F), aes(x = simpson_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(season, treatment), scales = "free_y")

ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = simpson_percentile)) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(vars(season))

By year

ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = skew_percentile)) +
  geom_boxplot() +
  theme_bw() +
  facet_grid(rows = vars(year), cols = vars(season), scales = "free_y")
ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = simpson_percentile)) +
  geom_boxplot() +
  theme_bw() +
  facet_grid(rows = vars(year), cols = vars(season), scales = "free_y")

Trying median

pp_median <- all_di_p %>%
  filter(singletons == F) %>%
  group_by(season, year, treatment) %>%
  summarize(skew_percentile = median(skew_percentile),
            simpson_percentile = median(simpson_percentile)) %>%
  ungroup() %>%
  tidyr::pivot_wider(names_from = treatment, values_from = c(skew_percentile, simpson_percentile))

sk_x <- ggplot(data = pp_median, aes(x = skew_percentile_control, y = skew_percentile_exclosure)) +
  geom_point(alpha = .5) +
  facet_wrap(vars(season)) +
  theme_bw() +
  ggtitle("Skew Ctrl v Exclosure") +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  xlim(40, 100) +
  ylim(40, 100)

sk_r <- ggplot(data = pp_median, aes(x = skew_percentile_control, y = skew_percentile_removal)) +
  geom_point(alpha = .5) +
  facet_wrap(vars(season)) +
  theme_bw() +
  ggtitle("Skew Ctrl v Removal") +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  xlim(40, 100) +
  ylim(40, 100)

gridExtra::grid.arrange(grobs = list(sk_x, sk_r), nrow = 2)


sp_x <- ggplot(data = pp_median, aes(x = simpson_percentile_control, y = simpson_percentile_exclosure)) +
  geom_point(alpha = .5) +
  facet_wrap(vars(season)) +
  theme_bw() +
  ggtitle("Simpson Ctrl v Exclosure") +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  xlim(0, 40) +
  ylim(0, 40)

sp_r <- ggplot(data = pp_median, aes(x = simpson_percentile_control, y = simpson_percentile_removal)) +
  geom_point(alpha = .5) +
  facet_wrap(vars(season)) +
  theme_bw() +
  ggtitle("Simpson Ctrl v Removal") +
  geom_abline(intercept = 0, slope = 1, color = "green") +
  xlim(0, 40) +
  ylim(0, 40)

gridExtra::grid.arrange(grobs = list(sp_x, sp_r), nrow = 2)


diazrenata/scadsanalysis documentation built on May 14, 2021, 6:59 p.m.