In diazrenata/scadsanalysis: What the Package Does (One Line, Title Case)

We compare the results for very small communities (fewer than 2000 possible SADs in the feasible set) to larger communities within each dataset.

knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(grid)
theme_set(theme_bw())

all_di <- read.csv(here::here("analysis", "reports", "submission2", "all_di.csv"), stringsAsFactors = F)

all_ct <-  read.csv(here::here("analysis", "reports", "submission2", "all_ct.csv"), stringsAsFactors = F)


#fia_ct <- read.csv(here::here("fia_cts.csv"))

#all_ct <- rbind(all_ct, fia_ct)
all_ct <- all_ct %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat),
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat))

all_di <- all_di %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)),
         log_nsamples = log(nsamples),
         log_s0 = log(s0),
         log_n0 = log(n0)) %>%
  filter(n0 != s0,
         s0 != 1,
         !singletons,
         n0 != (s0 + 1)) %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat),
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat)) %>%
  mutate(Dataset = dat,
         Dataset = ifelse(Dataset == "fia", "FIA", Dataset),
         Dataset = ifelse(Dataset == "bbs", "Breeding Bird Survey", Dataset),
         Dataset = ifelse(Dataset == "mcdb", "Mammal Communities", Dataset),
         Dataset = ifelse(Dataset == "gentry", "Gentry", Dataset),
         Dataset = ifelse(Dataset == "misc_abund", "Misc. Abundance", Dataset)) %>%
  filter(nparts > 20) %>%
  left_join(all_ct) 

all_di <- all_di %>%
  group_by_all() %>%
  mutate(real_po_percentile_mean = mean(real_po_percentile, real_po_percentile_excl),
         skew_percentile_mean = mean(skew_percentile, skew_percentile_excl),
         simpson_percentile_mean = mean(simpson_percentile,simpson_percentile_excl),
         shannon_percentile_mean = mean(shannon_percentile, shannon_percentile_excl),
         nsingletons_percentile_mean = mean(nsingletons_percentile, nsingletons_percentile_excl),) %>%
  ungroup()

all_di <- all_di %>%
  mutate(in_fia = ifelse(Dataset == "FIA", "FIA", "Other datasets"))

plot_percentile_hist <- function(di_df, col_name, plot_name, tails = 2, facetvar = "Dataset", min_s0 = 0) {

  if(tails == 2) {
    cutoff_percentiles = c(2.5, 97.5)
    min_nparts = 40
  } else if (tails== 1) {
    cutoff_percentiles = c(95)
    min_nparts = 20
  }

  di_df <- di_df %>%
    mutate(response = (di_df[[col_name]]),
           facetvar = di_df[[facetvar]])


  ggplot(filter(di_df, nparts > min_nparts, s0 >= min_s0), aes(response)) +
    geom_histogram(bins = 40, boundary = 100) +
    theme_bw() +
    xlab("") +
    ylab("") +
    geom_vline(xintercept = cutoff_percentiles, color = "red") +
    ggtitle( plot_name) +
    facet_wrap(vars(facetvar), ncol = 1, scales = "free_y")+
    theme(plot.title = element_text(size=10))



}

all_di <- all_di %>%
  mutate(`Observed \npercentile score` = ifelse(real_po_percentile_excl > 95, "High (> 95)", "Less than 95"))

plot_narrowness <- function(di_df, yvar, yvar_name) {

  if(grepl("sim_pos", yvar)) {
    min_nparts = 20
    ylabel = "Mean dissimilarity"
  } else {
    min_nparts = 40
    ylabel = "Breadth index"
  }

  di_df <- di_df %>%
    mutate(response = (di_df[[yvar]])) %>%
    filter(nparts > min_nparts,
           nparts < 10 ^ 50)


  ggplot(di_df, aes(nparts, response, color = Dataset)) +
    geom_point(data = filter(di_df, dat == "fia"), alpha = .1) +
    geom_point(data = filter(di_df, dat != "fia"), alpha = .1) +
    geom_point(data = filter(di_df, s0 < 0)) +
    scale_color_viridis_d(end = .9) +
    xlab("") +
    ylab(ylabel) +
    scale_x_log10() +
    ggtitle(yvar_name) +
    theme(legend.position = "none")+
    theme(plot.title = element_text(size=10))
}

plot_narrowness_legend <- function(di_df) {

  legend_df <- di_df %>%
    select(Dataset) %>%
    distinct() %>%
    mutate(ymark = dplyr::row_number())

  ggplot(legend_df, aes(1, ymark, color = Dataset)) +
    geom_label(aes(x = 2, y = ymark, label = Dataset)) +
    geom_point(size = 4) +
    scale_color_viridis_d(end = .9) +
    xlab("") +
    ylab("") +
    theme(legend.position = "none") +
    xlim(1, 3) + theme(
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ) +
    ggtitle("Dataset")

}
#
# gridExtra::grid.arrange(grobs = list(
#   plot_narrowness(all_di, "sim_pos_from_best", "Dissimilarity to central tendency"),
#   plot_narrowness_legend(all_di)),
#   ncol = 1)

plot_narrowness_hist <- function(di_df, col_name, plot_name, facetvar = "Dataset", min_s0 = 0) {

  if(grepl("sim_pos", col_name)) {
xvarname = "95th percentile \nof dissimilarity scores"
min_nparts = 40
  } else {
    min_nparts = 20
    xvarname = "Breadth index"
  }

  di_df <- di_df %>%
    mutate(response = (di_df[[col_name]]),
           facetvar = di_df[[facetvar]])


  ggplot(filter(di_df, nparts > min_nparts, s0 >= min_s0), aes(response)) +
    geom_histogram(bins = 40, boundary = 100) +
    theme_bw() +
    ylab("") +
    xlab(xvarname) +
    ggtitle( plot_name) +
    facet_wrap(vars(facetvar), ncol = 1, scales = "free_y")+
    theme(plot.title = element_text(size=10))

}

Only three of our datasets - Mammal Communities, Misc. Abundance, and FIA - have appreciable numbers of very small communities. We use 2000 possible SADs as a cutoff because it splits these datasets into reasonably large subsets of both "large" and "small" communities.

Proportions of communities in these datasets with fewer than 2000 possible SADs:

all_di <- all_di %>%
  mutate( `Number of elements` = ifelse(nparts < 2000, "Less than 2000", "More than 2000"))
diss_results <- all_di %>%
  filter(nparts > 20) %>%
  group_by(Dataset,  `Number of elements` ) %>%
  summarize(high_diss = mean(real_po_percentile_excl > 95),
            nsites = dplyr::n()) %>%
  ungroup() %>%
  mutate(`High dissimilarity` = paste0(signif(100 * high_diss, 2), "%; n = ", nsites)) %>%
  select(-nsites)

skew_results <- all_di %>%
  filter(nparts > 40, s0 > 2) %>%
  group_by(Dataset,  `Number of elements` ) %>%
  summarize(high_skew = mean(skew_percentile_excl > 97.5),
            low_skew = mean(skew_percentile < 2.5),
            nsites = dplyr::n()) %>%
  ungroup() %>%
  mutate(`High skew` = paste0(signif(100 * high_skew, 2), "%; n = ", nsites),
         `Low skew` = paste0(signif(100 * low_skew, 2), "%; n = ", nsites)) %>%
  select(-nsites)

metric_results <- all_di %>%
  filter(nparts > 40) %>%
  group_by(Dataset,  `Number of elements` ) %>%
  summarize(high_simpson = mean(simpson_percentile_excl > 97.5),
            low_simpson = mean(simpson_percentile < 2.5),
            high_shannon = mean(shannon_percentile_excl > 97.5),
            low_shannon = mean(shannon_percentile < 2.5),
            high_nsingletons = mean(nsingletons_percentile_excl > 97.5),
            low_nsingletons = mean(nsingletons_percentile < 2.5),
            nsites = dplyr::n()) %>%
  ungroup() %>%
  mutate(`High Simpson` = paste0(signif(100 * high_simpson, 2), "%; n = ", nsites),
         `Low Simpson` = paste0(signif(100 * low_simpson, 2), "%; n = ", nsites),
         `High Shannon` = paste0(signif(100 * high_shannon, 2), "%; n = ", nsites),
         `Low Shannon` = paste0(signif(100 * low_shannon, 2), "%; n = ", nsites),
         `High proportion of rare species` = paste0(signif(100 * high_nsingletons, 2), "%; n = ", nsites),
         `Low proportion of rare species` = paste0(signif(100 * low_nsingletons, 2), "%; n = ", nsites)) %>%
  select(-nsites)

all_results <- left_join(diss_results, skew_results) %>%
  left_join(metric_results)

cols1 <- c("Dataset", "Number of elements", "High dissimilarity", "High proportion of rare species", "High skew", "Low Simpson", "Low Shannon")


cols2 <- c("Dataset", "Number of elements", "Low proportion of rare species", "Low skew", "High Simpson", "High Shannon")
all_di %>%
  filter(dat %in% c("fia", "mcdb", "misc_abund")) %>%
  group_by(Dataset) %>%
  summarize(`Proportion with < 2000 possible SADs` = mean(nparts < 2000))

Proportions of communities with extreme percentile scores

The proportions of communities with >/< 2000 possible SADs with extreme percentile scores for each dataset.

For dissimilarity, approximately 5% of communities would have extreme scores by chance. For all other metrics, this would be approximately 2.5%.

In the direction of effects usually seen:

all_results %>%
  filter(Dataset %in% c("FIA", "Mammal Communities", "Misc. Abundance")) %>%
  select(cols1)

In the opposite direction:

all_results %>%
  filter(Dataset %in% c("FIA", "Mammal Communities", "Misc. Abundance")) %>%
  select(cols2)

Plots of breadth indices and percentile scores

Breadth indices (top) and percentile scores (bottom) for communities with fewer than 2000 possible SADs, or more than 2000 possible SADs, from FIA, Mammal Community, and Misc. Abundance datasets.

nhist <- plot_narrowness_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "sim_pos_from_best_95", "Dissimilarity to the central tendency") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3) +
  xlab("95th percentile of scores for dissimilarity to the central tendency \nfor elements of the feasible set") +
  ylab("Number of communities") +
  ggtitle("")

phist <- plot_percentile_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "real_po_percentile_mean", "Dissimilarity to the central tendency") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3) +
  xlab("Percentile rank of observed value relative to feasible set") +
  ylab("Number of communities") +
  ggtitle("")


nskew <- plot_narrowness_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "skew_95_ratio_2t", "Skewness") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Breadth index") +
  ylab("Number of communities") +
  ggtitle("")

pskew <- plot_percentile_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "skew_percentile_mean", "Skewness") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Percentile rank of observed value relative to feasible set") +
  ylab("Number of communities") +
  ggtitle("")

nrare <- plot_narrowness_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "nsingletons_95_ratio_2t", "Number of rare species") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Breadth index") +
  ylab("Number of communities") +
  ggtitle("")


prare <- plot_percentile_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "nsingletons_percentile_mean", "Number of rare species") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Percentile rank of observed value relative to feasible set") +
  ylab("Number of communities") +
  ggtitle("")

nsimp <- plot_narrowness_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "simpson_95_ratio_2t", "Simpson") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Breadth index") +
  ylab("Number of communities") +
  ggtitle("")


psimp <- plot_percentile_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "simpson_percentile_mean", "Simpson") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Percentile rank of observed value relative to feasible set") +
  ylab("Number of communities") +
  ggtitle("")


nshan <- plot_narrowness_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "shannon_95_ratio_2t", "Shannon") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Breadth index") +
  ylab("Number of communities") +
  ggtitle("")


pshan <- plot_percentile_hist(filter(all_di, dat %in% c("fia", "mcdb", "misc_abund")), "shannon_percentile_mean", "Shannon") +
  facet_wrap(vars(`Number of elements`, Dataset), scales = "free_y", nrow = 2, ncol = 3)+
  xlab("Percentile rank of observed value relative to feasible set") +
  ylab("Number of communities") +
  ggtitle("")



gridExtra::grid.arrange(grobs = list(nhist, phist), nrow = 2, top = textGrob("Dissimilarity to the central tendency"))


gridExtra::grid.arrange(grobs = list(nskew, pskew), nrow = 2, top = textGrob("Skewness"))


gridExtra::grid.arrange(grobs = list(nrare, prare), nrow = 2, top = textGrob("Number of rare species"))


gridExtra::grid.arrange(grobs = list(nsimp, psimp), nrow = 2, top = textGrob("Simpson"))


gridExtra::grid.arrange(grobs = list(nshan, pshan), nrow = 2, top = textGrob("Shannon"))