In diazrenata/scadsplants: Constraints on SADs of Portal Plants

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(scadsplants)

# ## Set up the cache and config
# db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", "drake-cache.sqlite"))
# cache <- storr::storr_dbi("datatable", "keystable", db)

di_df <- read.csv(here::here("analysis", "drake", "di_df.csv"), stringsAsFactors = F)

Distribution of skewnesses

di_df <- di_df %>%
  mutate(singletons = ifelse(grepl(di_name, pattern = "singles"), "singletons", "true"))

skew_violins <- ggplot(data = di_df, aes(x = singletons, y = skew, color = season)) +
  geom_violin(data = filter(di_df, source == "sampled")) +
  geom_point(data = filter(di_df, source == "observed")) +
  facet_wrap(c("year", "season"), ncol = 5, strip.position = "top") +
  scale_color_viridis_d(end = .8, direction = -1) +
  theme_bw()

skew_violins

2009 is problematic because the communities are very small (S = 1 and 3, N = 2 and 5 respectively). Similarly, winter of 2000 had 12 individuals of 2 species. Notably, winter 1996 had 34 individuals of 3 species, and this appears to be enough to get some interesting variation going.

Shannon violins

shannon_violins <- ggplot(data = di_df, aes(x = singletons, y = shannon, color = season)) +
  geom_violin(data = filter(di_df, source == "sampled")) +
  geom_point(data = filter(di_df, source == "observed")) +
  facet_wrap(c("year", "season"), ncol = 5, strip.position = "top") +
  scale_color_viridis_d(end = .8, direction = -1) +
  theme_bw()

shannon_violins

Simpson violins

simpson_violins <- ggplot(data = di_df, aes(x = singletons, y = simpson, color = season)) +
  geom_violin(data = filter(di_df, source == "sampled")) +
  geom_point(data = filter(di_df, source == "observed")) +
  facet_wrap(c("year", "season"), ncol = 5, strip.position = "top") +
  scale_color_viridis_d(end = .8, direction = -1) +
  theme_bw()

simpson_violins

Frequency of diversity index percentiles

# 
# ranked_di_df <- di_df %>%
#   filter(source == "sampled") %>%
#   group_by(year, season, treatment, singletons) %>%
#   mutate_at(c("skew", "shannon", "simpson"), .funs = list("percentile" = get_percentiles)) %>%
#   ungroup()
# 
# observed_percentile <- di_df %>%
#   filter(source == "observed") %>%
#   left_join(select(ranked_di_df, -sim, -source, -skew_percentile, -shannon_percentile, -simpson_percentile, -di_name), by = c("year", "season", "treatment", "singletons")) %>%
#   group_by(year, season, treatment, singletons) %>%
#   mutate(skew_percentile = get_percentile(skew.x, a_vector = skew.y),
#          shannon_percentile = get_percentile(shannon.x, a_vector = shannon.y),
#          simpson_percentile = get_percentile(simpson.x, a_vector = simpson.y))  %>%
#   ungroup() %>%
#   select(year, season, treatment, source, singletons, skew_percentile, shannon_percentile, simpson_percentile,
#          skew.x, shannon.x, simpson.x, di_name) %>%
#   rename(skew = skew.x,
#          shannon = shannon.x,
#          simpson = simpson.x) %>%
#   distinct()
# 
# 
# ranked_di_df <- bind_rows(ranked_di_df, observed_percentile)
# 
# ranked_di_long <- ranked_di_df %>%
#   tidyr::gather(-sim, -year, -season, -treatment, -source, -di_name, -singletons, key = "variable", value = "value")
# 

load(here::here("analysis", "reports", "ranked_dis.Rds"))

di_hists <- ggplot(data = filter(ranked_di_long, source == "observed", singletons == "true", variable %in% c("skew_percentile", "shannon_percentile", "simpson_percentile")), aes(x = season, y= value,color = season)) +
  geom_violin() +
  geom_jitter(alpha = .7, height = 0, width = .2) +
  theme_bw() +
  scale_color_viridis_d(end = .8, direction = -1) +
  facet_wrap(~variable, strip.position = "top")

di_hists

#save(ranked_di_df, ranked_di_long, file = here::here("analysis", "reports", "ranked_dis.Rds"))

Simpson's vs. skewness percentile values

simp_skew <- ranked_di_df %>%
  group_by(season, year, treatment, di_name) %>%
  mutate(ndraws = n() - 1) %>%
  ungroup() %>%
  filter(source == "observed") %>%
  mutate(alpha = max(0.05, ndraws / max(ndraws)))

simp_skew_percentile_plot <- ggplot(data = simp_skew, aes(x = skew_percentile, y = simpson_percentile, color = season, alpha = alpha)) +
  geom_point() +
  geom_abline(slope = -1, intercept = 100) +
  theme_bw() +
  scale_color_viridis_d(end = .8) +
  ylim(0, 100) +
  xlim(0, 100)

simp_skew_percentile_plot

Zoomed to the very weirdest

simp_skew_percentile_plot_zoomed <- simp_skew_percentile_plot +
  xlim(80, 100) +
  ylim(0, 20)

simp_skew_percentile_plot_zoomed

print("How often is the Simpson percentile 0?")
sum(simp_skew$simpson_percentile == 0)
print("How often is the skew percentile 100?")
sum(simp_skew$skew_percentile == 100, na.rm = T)

There's some relationship, but Simpson's drives to zero much earlier than the skewness percentile gets to 100. This squares with violin plots from earlier. Simpson's is more sensitive/less nuanced than skewness.

Would our conclusions change using Simpson's vs. skewness?

conclusions <- ranked_di_df %>%
  filter(source == "observed", singletons == "true") %>%
  mutate(simp_outlier_2p5 = simpson_percentile <= 2.5,
         skew_outlier_97p5 = skew_percentile >= 97.5,
         simp_outlier_5 = simpson_percentile <=5,
         skew_outlier_95 = skew_percentile >= 97.5) %>%
  select(season, year, simp_outlier_2p5, skew_outlier_97p5, simp_outlier_5, skew_outlier_95, skew_percentile, simpson_percentile) %>%
  tidyr::gather(-season, -year, key = "variable", value = "value") %>%
  mutate(value = as.logical(value)) %>%
  mutate(variable = as.factor(variable)) %>%
  mutate(variable = factor(variable, levels = levels(variable)[c(1, 2, 5, 4,3, 6)]))


conclusions_plot <- ggplot(data = filter(conclusions, grepl(variable, pattern = "outlier"), !is.na(value)), aes(x = year, y = season, color = value)) +
  geom_point() +
  scale_color_viridis_d(option = "magma", end = .8) +
  facet_wrap(vars(variable), nrow = 2, dir = "h") + 
  theme_bw()
conclusions_plot

Orange is outliers. The Simpson non-outliers are a subset of the skew non-outliers. Distinguishing between a 5% threshold and a 2.5% threshold for outliers (vaguely one-sided v two-sided - but these are always one-sided) only changes one point.

diazrenata/scadsplants documentation built on Nov. 14, 2019, 2:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com