In diazrenata/scadsanalysis: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)

all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F)

all_di <- all_di %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)),
         log_nsamples = log(nsamples)) %>%
  mutate(prop_found = exp(log_nsamples - log_nparts)) %>%
  mutate(found_all = prop_found==1)

There is some weird behavior where, for datasets where all samples were found, we do not see the disproportionately common extreme %ile values. I see a couple of reasons for this: - very small communities are naturally bizarre and not super meaningful - there's something squirrely with %ile

Distribution of percentile values

Skewness

Here is the overall distribution of skewness, and if we split based on whether we found all the samples:

ggplot(filter(all_di, singletons == F, s0 >= 3), aes(x = skew_percentile, y =..count.. / sum(..count..))) +
  geom_histogram(bins = 100) +
  theme_bw() +
  geom_hline(yintercept = .01)
ggplot(filter(all_di, singletons == F, s0 >= 3), aes(x = skew_percentile)) +
  theme_bw() +
  geom_histogram(bins = 100) +
  facet_wrap(vars(found_all), scales = "free_y")

When we found all the samples, the percentiles are more evenly distributed. I do not read much into the spike at 0 for those communities, because skewness is bizarre for very small communities.

Here is how skewness maps with S and N, filtered to communities where we found all samples:

ggplot(filter(all_di, singletons == F, found_all, s0 >= 3), aes(x = log(s0), y = log(n0), color = skew_percentile)) +
  geom_point(alpha = .5) +
  theme_bw() +
  theme(legend.position = "top") + scale_color_viridis_c(option = "plasma", end = .9)


# ggplot(filter(all_di, singletons == F), aes(x = log(n0/s0), y = skew_percentile, color = prop_found)) +
#   geom_point() +
#   theme_bw() +
#   scale_color_viridis_c(option = "plasma", end = .9, direction = -1) +
#   theme(legend.position = "top")

Here is how the histogram changes as we bin the lognparts into 2s:

all_di <- all_di %>%
  mutate(nparts_binned = as.factor(ceiling(log_nparts / 2) * 2))

for(i in 1:nrow(all_di)) {
  if(as.numeric(levels(all_di$nparts_binned)[as.numeric(all_di$nparts_binned[i])]) >= 25) {
    all_di$nparts_binned[i] <- 26
  }
}

ggplot(filter(all_di, !singletons, s0 >= 3), aes(x = skew_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(nparts_binned), scales = "free_y")

Simpson

Here is the overall evenness distribution, and split by whether we found 'em all:

ggplot(filter(all_di, singletons == F, s0 >= 2), aes(x = simpson_percentile, y =..count.. / sum(..count..))) +
  geom_histogram(bins = 100) +
  theme_bw() +
  geom_hline(yintercept = .01)

ggplot(filter(all_di, singletons == F, s0 >= 2), aes(x = simpson_percentile)) +
  theme_bw() +
  geom_histogram(bins = 100) +
  facet_wrap(vars(found_all))

Simpson is less evenly distributed than skewness. Again, where we found them all, we don't see the disproportionately common low percentile values.

Here is how Simpson behaves in S and N space where we found them all:

ggplot(filter(all_di, singletons == F, found_all, s0 >= 2), aes(x = log(s0), y = log(n0), color = simpson_percentile)) +
  geom_point(alpha = .5) +
  theme_bw() +
  theme(legend.position = "top") + scale_color_viridis_c(option = "plasma", end = .9)

Here is how the histogram changes as we bin the lognparts into 5s:

ggplot(filter(all_di, !singletons, s0 >= 2), aes(x = simpson_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(nparts_binned), scales = "free_y")

!!!!! OK SO!!!!!

It is NOT ABOUT whether we found all the samples. It is that the %iles look pretty UNIFORMLY DISTRIBUTED for communities with up to somewhere from r exp(10) to r exp(15) elements in the feasible set. Then the extremes emerge: high for skew and low for Simpson.

We expect issues of small N, but not for communities even so large. So if there are even 150 elements in the FS, we expect to be able to identify some kind of central tendency.

all_di <-  mutate(all_di, more_than_10 = log_nparts >= 10)
ggplot(filter(all_di, !singletons), aes(x = log(s0), y = log(n0), color = dat)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_d() +
  facet_wrap(vars(more_than_10), scales = "fixed") +
  theme(legend.position = "top")

Communities with more than r exp(10) parts in the FS account for about r 100*mean(filter(all_di, !singletons)$more_than_10)% of communities.

all_di <-  mutate(all_di, more_than_10 = log_nparts >= 10)
ggplot(filter(all_di, !singletons), aes(x = (s0), y = (n0), color = more_than_10)) +
  geom_point() +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free")

filter(all_di, !singletons, s0 >= 3) %>%
  group_by(dat) %>%
  summarize(prop_more_than_10 = mean(more_than_10),
            nsites = length(unique(site))) %>%
  ungroup()

for(i in 1:nrow(all_di)) {
  if(as.numeric(levels(all_di$nparts_binned)[as.numeric(all_di$nparts_binned[i])]) >= 50) {
    all_di$nparts_binned[i] <- 50
  }
}

all_di$nparts_binned <- as.factor(as.numeric(as.character(all_di$nparts_binned)))

ggplot(filter(all_di, !singletons, s0 >= 3), aes(x = simpson_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(nparts_binned, dat), scales = "free_y", ncol = length(unique(all_di$dat)), drop = F)


ggplot(filter(all_di, !singletons, s0 >= 3), aes(x = skew_percentile)) +
  geom_histogram(bins = 100) +
  theme_bw() +
  facet_wrap(vars(nparts_binned, dat), scales = "free_y", ncol = length(unique(all_di$dat)), drop = F)