In diazrenata/scadsanalysis: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(grid)


all_di <- read.csv(here::here("analysis", "rev_prototyping", "all_di.csv"), stringsAsFactors = F)

all_di <- all_di %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)),
         log_nsamples = log(nsamples),
         log_s0 = log(s0),
         log_n0 = log(n0)) %>%
  filter(n0 != s0,
         s0 != 1,
         !singletons,
         n0 != (s0 + 1)) %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat),
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat)) %>%
  mutate(Dataset = dat,
         Dataset = ifelse(Dataset == "fia", "FIA", Dataset),
         Dataset = ifelse(Dataset == "bbs", "Breeding Bird Survey", Dataset),
         Dataset = ifelse(Dataset == "mcdb", "Mammal Communities", Dataset),
         Dataset = ifelse(Dataset == "gentry", "Gentry", Dataset),
         Dataset = ifelse(Dataset == "misc_abund", "Misc. Abundance", Dataset))

Illustrations of 95% interval (Figure 1)

To show the 95% interval, we need to load the distribution of shape metric values from the samples from the feasible set for a few communities. See rov_metric.md.

library(drake)

db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", "drake-cache-net.sqlite"))
cache <- storr::storr_dbi("datatable", "keystable", db)
cache$del(key = "lock", namespace = "session")

net_summary <- readd(all_di_summary, cache = cache)

net_summary <- net_summary %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)))

example_fs <- readd(fs_s_44_n_13360, cache = cache)

example_di <- readd(di_fs_s_44_n_13360, cache =cache)

example_fs <- example_fs %>%
  left_join(example_di) %>%
  left_join(net_summary)

example_fs2 <- readd(fs_s_7_n_71, cache = cache)

example_di2 <- readd(di_fs_s_7_n_71, cache =cache)

example_fs2 <- example_fs2 %>%
  left_join(example_di2) %>%
  left_join(net_summary)



example_fs3 <- readd(fs_s_4_n_34, cache = cache)

example_di3 <- readd(di_fs_s_4_n_34, cache =cache)

example_fs3 <- example_fs3 %>%
  left_join(example_di3) %>%
  left_join(net_summary)


breadth_plots <- list(

  ggplot(example_fs3, aes(rank, abund, group = sim, color = skew)) +
    geom_line(alpha = .25) +
    theme_bw() +
    scale_color_viridis_c(option = "plasma", end = .8) +
    ggtitle("Small community", subtitle = paste0("S = ", (example_fs3$s0), "; N = ", (example_fs3$n0[1]))) +
    theme(legend.position = "right") +
    xlab("Rank") +
    ylab("Abundance"),

  ggplot(example_fs3, aes(skew)) +
  #  geom_density() +
    geom_histogram(bins = 50) +
    theme_bw() +
    geom_vline(xintercept = c(example_fs3$skew_95[1], example_fs3$skew_min[1]), color = "red") +
    ggtitle("", subtitle = paste0("Breadth index: ", round((example_fs3$skew_95_ratio_1t[1]), 2))) +
    xlab("Skewness") +
    ylab("Count"),

  ggplot(example_fs2, aes(rank, abund, group = sim, color = skew)) +
    geom_line(alpha = .1) +
    theme_bw() +
    scale_color_viridis_c(option = "plasma", end = .8) +
    ggtitle("Medium community", subtitle = paste0("S = ", (example_fs2$s0), "; N = ", (example_fs2$n0[1]))) +
    theme(legend.position = "right")+
    xlab("Rank") +
    ylab("Abundance"),

  ggplot(example_fs2, aes(skew)) +
  #  geom_density() +
    geom_histogram(bins = 50) +
    theme_bw() +
    geom_vline(xintercept = c(example_fs2$skew_95[1], example_fs2$skew_min[1]), color = "red") +
    ggtitle("", subtitle =  paste0("Breadth index: ", round((example_fs2$skew_95_ratio_1t[1]), 2)))+
    xlab("Skewness") +
    ylab("Count"),


  ggplot(example_fs, aes(rank, abund, group = sim, color = skew)) +
    geom_line(alpha = .1) +
    theme_bw() +
    scale_color_viridis_c(option = "plasma", end = .8) +
    ggtitle("Large community", subtitle = paste0("S = ", (example_fs$s0), "; N = ", (example_fs$n0[1]))) +
    theme(legend.position = "right") +
    ylim(0, 5000) + # Remove a very few very very uneven SADs that make the scale too big to be interpretable
    theme(axis.text.y = element_text(size = 6, angle = 60))+
    xlab("Rank") +
    ylab("Abundance"),

  ggplot(example_fs, aes(skew)) +
   # geom_density() +
    geom_histogram(bins = 50) +
    theme_bw() +
    geom_vline(xintercept = c(example_fs$skew_95[1], example_fs$skew_min[1]), color = "red") +
    ggtitle("", subtitle =  paste0("Breadth index: ", round((example_fs$skew_95_ratio_1t[1]), 2)))+
    xlab("Skewness") +
    ylab("Count")

)

fig_1 <- gridExtra::grid.arrange(grobs = breadth_plots, ncol = 2, top = textGrob("Figure 1", gp = gpar(fill = "white")))

plot(fig_1)


DBI::dbDisconnect(db)
rm(cache)

Skewness and evenness histograms by dataset (Figure 2)

fig_2 <- gridExtra::grid.arrange(grobs = list(

  ggplot(filter(all_di, s0 > 2, nparts > 20), aes(skew_percentile_excl)) +
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 95, color = "red") +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) + ggtitle("Skewness") +
    xlab("") +
    ylab(""),# +
  #  xlab("Percentile rank of observed value relative to feasible set") +
    #ylab("Number of communities") ,

  ggplot(filter(all_di, nparts > 20), aes(simpson_percentile)) + 
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 5, color = "red") +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) + ggtitle("Evenness") + 
    xlab("") +
    ylab("")#+
    #xlab("Percentile rank of observed value relative to feasible set") +
  #  ylab("Number of communities")
), ncol = 2,
top = textGrob("Figure 2", gp = gpar(fill = "white")), left = textGrob("Number of communities", rot = 90, gp = gpar(fill = "black")),
bottom = textGrob("Percentile rank of observed value relative to feasible set"), gp = gpar(fill = "white"))

Breadth index by dataset (Figure 3)

fig_3 <- gridExtra::grid.arrange(grobs = list(
  ggplot(filter(all_di, s0 > 2, nparts > 20), aes(x= skew_95_ratio_1t)) +
    geom_histogram() +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) +
    xlab("") +
    ylab("") +
    ggtitle("Skewness") +
    xlim(0, 1),

  ggplot(filter(all_di, nparts > 20), aes(x = simpson_95_ratio_1t)) +
    geom_histogram() +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) +
    xlab("") +
    ylab("") +
    ggtitle("Evenness") +
    xlim(0, 1)
), ncol = 2,
top = textGrob("Figure 3", gp = gpar(fill = "white")),
left = "Number of communities", 
bottom = "Breadth index (ranges 0-1, with 1 being very broad)")

plot(fig_3)

Comparison of FIA and comparably sized communities (Figure 4)

fia_max_s = max(filter(all_di, dat == "fia")$s0)
fia_max_n = max(filter(all_di, dat == "fia")$n0)

fia = filter(all_di, dat == "fia")

small_di <- all_di %>%
  filter(s0 <= fia_max_s,
         n0 <= fia_max_n) %>%
  mutate(fia_yn = ifelse(dat == "fia", "fia", "other datasets"),
         right_corner = FALSE)

for(i in 1:nrow(small_di)) {
  if(small_di$dat[i] != "fia") {
    fia_match_s0 = filter(fia, s0 >= small_di$s0[i])

    max_n0_for_this_s0 = max(fia_match_s0$n0)

    small_di$right_corner[i] = small_di$n0[i] > max_n0_for_this_s0

  }
}


small_di <- small_di %>%
  filter(!right_corner)
not_fia <- filter(small_di, fia_yn != "fia")
fia = filter(small_di, dat == "fia")

small_di_s_n <- small_di %>%
  select(s0, n0) %>%
  distinct() %>%
  mutate(in_fia = FALSE,
         in_not_fia = FALSE)

for(i in 1:nrow(small_di_s_n)) {

  this_s0 = small_di_s_n$s0[i]
  this_n0 = small_di_s_n$n0[i]

  fia_matches = filter(fia, s0 == this_s0, n0 == this_n0)
  not_fia_matches = filter(not_fia, s0 == this_s0, n0 == this_n0)

  small_di_s_n$in_fia[i] = nrow(fia_matches) > 0 
  small_di_s_n$in_not_fia[i] = nrow(not_fia_matches) > 0 


}

small_di_s_n <- filter(small_di_s_n, in_fia, in_not_fia)

set.seed(1977)

subsamples <- list()

for(i in 1:nrow(small_di_s_n)) {

  this_s0 = small_di_s_n$s0[i]
  this_n0 = small_di_s_n$n0[i]

  fia_matches = filter(fia, s0 == this_s0, n0 == this_n0)

  not_fia_matches = filter(not_fia, s0 == this_s0, n0 == this_n0)

  n_to_draw = min(nrow(fia_matches), nrow(not_fia_matches))

  fia_draw = sample.int(n = nrow(fia_matches), size = n_to_draw, replace = F)
  not_fia_draw = sample.int(n = nrow(not_fia_matches), size = n_to_draw, replace = F)

  subsamples[[i]] <- (bind_rows(fia_matches[fia_draw,], not_fia_matches[not_fia_draw, ]))

}

sub_di <- bind_rows(subsamples) %>%
  mutate(Dataset = ifelse(fia_yn == "fia", "FIA", "Other datasets"))

simpson_ks <- filter(sub_di, nparts > 40) %>%
  select(simpson_95_ratio_1t, Dataset, simpson_percentile, s0, n0) %>% 
  group_by(Dataset, s0, n0) %>%
  mutate(pairing = dplyr::row_number()) %>%
  ungroup() %>%
  tidyr::pivot_wider(id_cols = c(s0, n0, pairing), names_from = Dataset, values_from = c(simpson_95_ratio_1t, simpson_percentile))

simpson_breadth <- ks.test(simpson_ks$simpson_95_ratio_1t_FIA, simpson_ks$`simpson_95_ratio_1t_Other datasets`)

simpson_percent <- ks.test(simpson_ks$simpson_percentile_FIA, simpson_ks$`simpson_percentile_Other datasets`)
simpson_breadth
simpson_percent

skewness_ks <- filter(sub_di, nparts > 40) %>%
  select(skew_95_ratio_1t, Dataset, skew_percentile, s0, n0) %>% 
  group_by(Dataset, s0, n0) %>%
  mutate(pairing = dplyr::row_number()) %>%
  ungroup() %>%
  tidyr::pivot_wider(id_cols = c(s0, n0, pairing), names_from = Dataset, values_from = c(skew_95_ratio_1t, skew_percentile))

skewness_breadth <- ks.test(skewness_ks$skew_95_ratio_1t_FIA, skewness_ks$`skew_95_ratio_1t_Other datasets`)

skewness_percent <- ks.test(skewness_ks$skew_percentile_FIA, skewness_ks$`skew_percentile_Other datasets`)
skewness_breadth
skewness_percent


fig_4 <- gridExtra::grid.arrange(grobs = list(
  ggplot(filter(sub_di, nparts > 20), aes(skew_95_ratio_1t)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Breadth index") +
    ylab("Number of communities") +
    ggtitle("Breadth index", subtitle = "Skewness"),

  ggplot(filter(sub_di, nparts > 20), aes(simpson_95_ratio_1t)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Breadth index") +
    ylab("") +
    ggtitle("", subtitle = "Evenness"),

  ggplot(filter(sub_di, nparts > 20), aes(skew_percentile_excl)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Percentile rank") +
    ylab("Number of communities") +
    ggtitle("Percentile rank", subtitle = "Skewness"),


  ggplot(filter(sub_di, nparts > 20), aes(simpson_percentile)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Percentile rank") +
    ylab("")+
    ggtitle("", subtitle = "Evenness")), ncol = 2, top = textGrob("Figure 4", gp = gpar(fill = "white")))

plot(fig_4)

pdf("figure_1.pdf", title = "Figure 1", bg = "white", paper = "letter", width = 6.5, height = 6.5)
gridExtra::grid.arrange(grobs = breadth_plots, ncol = 2, top = textGrob("Figure 1", gp = gpar(fill = "white")))
dev.off()

pdf("figure_2.pdf", title = "Figure 2", bg = "white", paper = "letter", width = 6.5, height = 6.5)
 gridExtra::grid.arrange(grobs = list(

  ggplot(filter(all_di, s0 > 2, nparts > 20), aes(skew_percentile_excl)) +
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 95, color = "red") +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) + ggtitle("Skewness") +
    xlab("") +
    ylab(""),# +
  #  xlab("Percentile rank of observed value relative to feasible set") +
    #ylab("Number of communities") ,

  ggplot(filter(all_di, nparts > 20), aes(simpson_percentile)) + 
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 5, color = "red") +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) + ggtitle("Evenness") + 
    xlab("") +
    ylab("")#+
    #xlab("Percentile rank of observed value relative to feasible set") +
  #  ylab("Number of communities")
), ncol = 2,
top = textGrob("Figure 2", gp = gpar(fill = "white")), left = textGrob("Number of communities", rot = 90, gp = gpar(fill = "black")),
bottom = textGrob("Percentile rank of observed value relative to feasible set"), gp = gpar(fill = "white"))
 dev.off()

pdf("figure_3.pdf", title = "Figure 3", bg = "white", paper = "letter", width = 6.5, height = 6.5)
gridExtra::grid.arrange(grobs = list(
  ggplot(filter(all_di, s0 > 2, nparts > 20), aes(x= skew_95_ratio_1t)) +
    geom_histogram() +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) +
    xlab("") +
    ylab("") +
    ggtitle("Skewness") +
    xlim(0, 1),

  ggplot(filter(all_di, nparts > 20), aes(x = simpson_95_ratio_1t)) +
    geom_histogram() +
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y", ncol = 1) +
    xlab("") +
    ylab("") +
    ggtitle("Evenness") +
    xlim(0, 1)
), ncol = 2,
top = textGrob("Figure 3", gp = gpar(fill = "white")),
left = "Number of communities", 
bottom = "Breadth index (ranges 0-1, with 1 being very broad)")

dev.off()

pdf("figure_4.pdf", title = "Figure 4", bg = "white", paper = "letter", width = 6.5, height = 6.5)
 gridExtra::grid.arrange(grobs = list(
  ggplot(filter(sub_di, nparts > 20), aes(skew_95_ratio_1t)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Breadth index") +
    ylab("Number of communities") +
    ggtitle("Breadth index", subtitle = "Skewness"),

  ggplot(filter(sub_di, nparts > 20), aes(simpson_95_ratio_1t)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Breadth index") +
    ylab("") +
    ggtitle("", subtitle = "Evenness"),

  ggplot(filter(sub_di, nparts > 20), aes(skew_percentile_excl)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Percentile rank") +
    ylab("Number of communities") +
    ggtitle("Percentile rank", subtitle = "Skewness"),


  ggplot(filter(sub_di, nparts > 20), aes(simpson_percentile)) +
    geom_histogram()+
    theme_bw() +
    facet_wrap(vars(Dataset), scales = "free_y")  +
    xlab("Percentile rank") +
    ylab("")+
    ggtitle("", subtitle = "Evenness")), ncol = 2, top = textGrob("Figure 4", gp = gpar(fill = "white")))
dev.off()