knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
library(grid)
theme_set(theme_bw())

all_di <- read.csv(here::here("analysis", "reports", "submission2", "all_di.csv"), stringsAsFactors = F)

all_ct <-  read.csv(here::here("analysis", "reports", "submission2", "all_ct.csv"), stringsAsFactors = F)


#fia_ct <- read.csv(here::here("fia_cts.csv"))

#all_ct <- rbind(all_ct, fia_ct)
all_ct <- all_ct %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat),
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat))

all_di <- all_di %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)),
         log_nsamples = log(nsamples),
         log_s0 = log(s0),
         log_n0 = log(n0)) %>%
  filter(n0 != s0,
         s0 != 1,
         !singletons,
         n0 != (s0 + 1)) %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat),
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat)) %>%
  mutate(Dataset = dat,
         Dataset = ifelse(Dataset == "fia", "FIA", Dataset),
         Dataset = ifelse(Dataset == "bbs", "Breeding Bird Survey", Dataset),
         Dataset = ifelse(Dataset == "mcdb", "Mammal Communities", Dataset),
         Dataset = ifelse(Dataset == "gentry", "Gentry", Dataset),
         Dataset = ifelse(Dataset == "misc_abund", "Misc. Abundance", Dataset)) %>%
  filter(nparts > 20) %>%
  left_join(all_ct) 

all_di <- all_di %>%
  group_by_all() %>%
  mutate(real_po_percentile_mean = mean(real_po_percentile, real_po_percentile_excl),
         skew_percentile_mean = mean(skew_percentile, skew_percentile_excl),
         simpson_percentile_mean = mean(simpson_percentile,simpson_percentile_excl),
         shannon_percentile_mean = mean(shannon_percentile, shannon_percentile_excl),
         nsingletons_percentile_mean = mean(nsingletons_percentile, nsingletons_percentile_excl),) %>%
  ungroup()

all_di <- all_di %>%
  mutate(in_fia = ifelse(Dataset == "FIA", "FIA", "Other datasets"))
plot_narrowness <- function(di_df, yvar, yvar_name) {

  if(grepl("sim_pos", yvar)) {
    min_nparts = 20
    ylabel = "Mean dissimilarity"
  } else {
    min_nparts = 40
    ylabel = "Breadth index"
  }

  di_df <- di_df %>%
    mutate(response = (di_df[[yvar]])) %>%
    filter(nparts > min_nparts,
           nparts < 10 ^ 50)


  ggplot(di_df, aes(nparts, response, color = Dataset)) +
    geom_point(data = filter(di_df, dat == "fia"), alpha = .1) +
    geom_point(data = filter(di_df, dat != "fia"), alpha = .1) +
    geom_point(data = filter(di_df, s0 < 0)) +
    scale_color_viridis_d(end = .9) +
    xlab("") +
    ylab(ylabel) +
    scale_x_log10() +
    ggtitle(yvar_name) +
    theme(legend.position = "none")+
    theme(plot.title = element_text(size=10))
}

plot_narrowness_legend <- function(di_df) {

  legend_df <- di_df %>%
    select(Dataset) %>%
    distinct() %>%
    mutate(ymark = dplyr::row_number())

  ggplot(legend_df, aes(1, ymark, color = Dataset)) +
    geom_label(aes(x = 2, y = ymark, label = Dataset)) +
    geom_point(size = 4) +
    scale_color_viridis_d(end = .9) +
    xlab("") +
    ylab("") +
    theme(legend.position = "none") +
    xlim(1, 3) + theme(
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ) +
    ggtitle("Dataset")

}
#
# gridExtra::grid.arrange(grobs = list(
#   plot_narrowness(all_di, "sim_pos_from_best", "Dissimilarity to central tendency"),
#   plot_narrowness_legend(all_di)),
#   ncol = 1)
plot_narrowness_hist <- function(di_df, col_name, plot_name, facetvar = "Dataset", min_s0 = 0) {

  if(grepl("sim_pos", col_name)) {
    xvarname = "95th percentile \nof dissimilarity scores"
    min_nparts = 40
  } else {
    min_nparts = 20
    xvarname = "Breadth index \n"
  }

  di_df <- di_df %>%
    mutate(response = (di_df[[col_name]]),
           facetvar = di_df[[facetvar]])


  ggplot(filter(di_df, nparts > min_nparts, s0 >= min_s0), aes(response)) +
    geom_histogram(bins = 40, boundary = 100) +
    theme_bw() +
    ylab("") +
    xlab(xvarname) +
    ggtitle( plot_name) +
    facet_wrap(vars(facetvar), ncol = 1, scales = "free_y")+
    theme(plot.title = element_text(size=10))

}
fig_2 <- gridExtra::grid.arrange(grobs = list(

  plot_narrowness_hist(all_di, "sim_pos_from_best_95", "Dissimilarity to the \ncentral tendency", facetvar = "Dataset"),
  plot_narrowness_hist(all_di, "nsingletons_95_ratio_2t", "\nNumber of rare species", facetvar = "Dataset"),
  plot_narrowness_hist(all_di, "skew_95_ratio_2t", "\nSkewness", facetvar = "Dataset", min_s0 = 3),
  plot_narrowness_hist(all_di, "simpson_95_ratio_2t", "\nSimpson evenness", facetvar = "Dataset"),
  plot_narrowness_hist(all_di, "shannon_95_ratio_2t", "\nShannon diversity", facetvar = "Dataset")
), ncol = 5,
 left = textGrob("Number of communities", rot = 90, gp = gpar(fill = "black")), gp = gpar(fill = "white"))

Figure S6. Partially because of the uneven distribution of S and N among the different datasets, the narrowness of the feasible sets - defined either as the 95th percentile of scores for the dissimilarity of samples from the feasible set compared to the central tendency of the feasible set, or using a breadth index for specific metrics - varies among different datasets. In particular, the FIA dataset, and subsets of the Mammal Community and Miscellaneous Abundance databases, often have highly variable, broadly-defined statistical baselines derived from the feasible set.



diazrenata/scadsanalysis documentation built on May 14, 2021, 6:59 p.m.