knitr::opts_chunk$set(echo = FALSE) library(drake) library(dplyr) library(ggplot2) library(scadsanalysis) if(params$on_hpg) { all_di <- list() datasets <- c("bbs", "fia_short", "gentry", "mcdb", "misc_abund_short", "portal_plants") #datasets <- c("fia", "gentry", "mcdb", "misc_abund_short", "portal_plants") for(i in 1:length(datasets)) { this_dataset = datasets[i] if(this_dataset == "misc_abund_short") { cache_loc = "miscabund" } else if(this_dataset == "fia_short") { cache_loc = "fia" } else { cache_loc = this_dataset } ## Set up the cache and config db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite"))) cache <- storr::storr_dbi("datatable", "keystable", db) all_di[[i]] <- readd(all_di_obs, cache = cache) all_di[[i]] <- filter(all_di[[i]], source == "observed") this_dat <- readd(paste0("dat_s_dat_", this_dataset), cache = cache, character_only = T) sv <- get_statevars(this_dat) all_di[[i]] <- left_join(all_di[[i]], sv, by = c("sim", "site", "source", "singletons", "dat")) DBI::dbDisconnect(db) rm(cache) rm(db) } all_di <- bind_rows(all_di) write.csv(all_di, "all_di.csv", row.names = F) } else { all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F) }
skewhists <- ggplot(data = all_di, aes(x = skew_percentile)) + geom_histogram(binwidth = 5, alpha = .3, boundary = 0) + geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) + theme_bw() + facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") + ggtitle("Skewness") + geom_vline(xintercept = 95, color = "red") skewhists simpsonhists <- ggplot(data = all_di, aes(x = simpson_percentile)) + geom_histogram(binwidth = 5, alpha = .3, boundary = 0) + geom_histogram(binwidth = 5, data = filter(all_di, nsamples == 2500), boundary = 0) + theme_bw() + facet_grid(rows = vars(dat), cols = vars(singletons), scales = "free_y") + ggtitle("Simpson") + geom_vline(xintercept = 5, color = "red") simpsonhists
Observations from these histograms:
Some summary statistics:
di_summary <- all_di %>% filter(nsamples >= 2000) %>% group_by(dat, singletons) %>% summarize(mean_skew = mean(skew_percentile, na.rm=T), sd_skew = sd(skew_percentile, na.rm = T), mean_simpson = mean(simpson_percentile, na.rm = T), sd_simpson = sd(simpson_percentile, na.rm = T), mean_nsamples = mean(nsamples), nsites = length(unique(site)), prop_skew_high = mean(skew_percentile >= 95), prop_simpson_low = mean(simpson_percentile <= 5), med_skew = median(skew_percentile, na.rm = T), med_simpson = median(simpson_percentile, na.rm = T), skew_25 = quantile(skew_percentile, probs = .25), simpson_75 = quantile(simpson_percentile, probs = .75)) %>% ungroup() skew_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = skew_percentile, group = singletons)) + # geom_point() + # geom_errorbar(aes(x = singletons, ymin = skew_25, ymax = med_skew)) + # geom_point(data = all_di, aes(x = singletons, y = skew_percentile), size = .5, alpha = .05) + # geom_label(aes(x = singletons, y = 10, label = signif(med_skew, digits = 4))) + geom_boxplot() + theme_bw() + facet_wrap(vars(dat)) + geom_hline(yintercept = 100, linetype = 2, color = "red") + ylim(0, 110) + ggtitle("Skewness, >= 2000 samples") skew_boxplot simpson_boxplot <- ggplot(data = filter(all_di, nsamples >= 2000), aes(x = singletons, y = simpson_percentile, group = singletons)) + geom_boxplot() + theme_bw() + facet_wrap(vars(dat)) + geom_hline(yintercept = 0, linetype = 2, color = "red") + ylim(-10, 100) + ggtitle("Simpson's, >= 2000 samples") simpson_boxplot print(select(di_summary, dat, singletons, mean_nsamples, nsites, prop_skew_high, prop_simpson_low)) skew_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_skew_high, color = singletons, size = as.factor(nsites))) + geom_point() + theme_bw() + ggtitle("Skew") + scale_color_viridis_d(end = .8) + ylim(0, 1) + geom_hline(yintercept = .05, color = "red") + theme(legend.position = "none") simpson_summary_plot <- ggplot(data = di_summary, aes(x = dat, y = prop_simpson_low, color = singletons, size = as.factor(nsites))) + geom_point() + theme_bw() + ggtitle("Simpson") + scale_color_viridis_d(end = .8)+ ylim(0, 1) + geom_hline(yintercept = .05, color = "red") + theme(legend.position = "none") gridExtra::grid.arrange(grobs = list(skew_summary_plot, simpson_summary_plot), nrow = 1)
all_di <- filter(all_di, !singletons) skew_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = skew_percentile)) + geom_point(alpha = .1, size = .5) + geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) + theme_bw() + facet_wrap(vars(dat), scales = "free", ncol = 2) + ggtitle("Skewness") + scale_color_viridis_c(option = "plasma", end= .75) skew_s_n_maps simpson_s_n_maps <- ggplot(data = all_di, aes(x = s0, y = n0, color = simpson_percentile)) + geom_point(alpha = .1, size = .5) + geom_point(data = filter(all_di, nsamples >= 2000), alpha = .2) + theme_bw() + facet_wrap(vars(dat), scales = "free", ncol = 2) + ggtitle("Simpson") + scale_color_viridis_c(option = "plasma", end= .75, direction = -1) simpson_s_n_maps
The less extreme (low skew, high simpson) vectors appear vaguely collected in the lower right for Gentry and BBS. Those are the regions with relatively high N/S, aka low average abundance, aka a relatively small feasible set.
Conversely, for MCDB and Misc, it almost looks like the very least skewed elements are way down at tiny abundances?
Do percentiles scan with S/N?
all_di <- mutate(all_di, avg_abund = n0/s0) skew_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = skew_percentile)) + geom_point(alpha = .3) + theme_bw() + ggtitle("Skew") + facet_wrap(vars(dat), scales = "free") skew_avgabund simpson_avgabund <- ggplot(data = all_di, aes(x = avg_abund, y = simpson_percentile)) + geom_point(alpha = .3) + theme_bw() + ggtitle("Simpson") + facet_wrap(vars(dat), scales = "free") simpson_avgabund
There is perhaps a constraint, but that might be because you rarely get v high average abundances. You seem to rarely get low skewness or high Simpson at high average abundance, which scans with the heatmaps.
Do Simpson and skewness correspond?
ss_plot <- ggplot(data = all_di, aes(x= skew_percentile, y = simpson_percentile, color = dat)) + geom_point(alpha = .1) + scale_color_viridis_d(end = .8) + theme_bw() + ggtitle("Skew v Simp") ss_plot ss_facet_plot <- ss_plot + facet_wrap(vars(dat)) ss_facet_plot
There's a weak relationship between Simpson and skewness, but a lot of noise. They are not substitutable.
Why is Gentry so strangely bimodal? It's like a U, when all the others tend towards one end or the other.
Why are BBS and mammals less frequently squished than Portal?
Looking at range of skewness and Simpsons:
skew_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = skew_range, y = skew_percentile, color = dat)) + geom_point(alpha = .2) + geom_smooth(method = "lm", se =F) + theme_bw() + scale_color_viridis_d(end = .8) + facet_wrap(vars(dat), scales = "free") + ggtitle("Skewness") skew_range_hist simpson_range_hist <- ggplot(data = filter(all_di, nsamples > 2000), aes(x = simpson_range, y = simpson_percentile, color = dat)) + geom_point(alpha = .2) + geom_smooth(method = "lm", se =F) + theme_bw() + scale_color_viridis_d(end = .8) + facet_wrap(vars(dat), scales = "free") + ggtitle("Simpson") simpson_range_hist
I added the ranges because I thought the percentiles might be constrained somewhat by the range of values represented in the feasible set. It doesn't look to me like there is a strong relationship between range and %ile.
mcdb_di <- filter(all_di, dat == "mcdb") mcdb_sites <- read.csv(here::here("working-data/site_data/mammal_community_db_sites.csv"), stringsAsFactors = F) mcdb_sites <- mcdb_sites %>% select(site_id, n_years, study_duration) %>% rename(site = site_id) %>% mutate(site = as.character(site)) mcdb_di <- left_join(mcdb_di, mcdb_sites, by = "site") nyears_hist <- ggplot(data = distinct(select(mcdb_di, site, n_years)), aes(x = n_years)) + geom_histogram(pad = T, boundary = 0) + theme_bw() + ggtitle("Nyears in MCDB") nyears_hist # # duration_hist <- ggplot(data = distinct(select(mcdb_di, site, study_duration)), aes(x = study_duration)) + # geom_histogram(pad = T, boundary = 0) + # theme_bw() + # ggtitle("Duration in MCDB") # # duration_hist nyears_skew <- ggplot(data = mcdb_di, aes(x = n_years, y = skew_percentile)) + geom_point(alpha = .2) + #facet_wrap(vars(singletons)) + theme_bw() + ggtitle("Skew v nyears (MCDB)") nyears_skew nyears_simpson <- ggplot(data = mcdb_di, aes(x = n_years, y = simpson_percentile)) + geom_point(alpha = .2) + #facet_wrap(vars(singletons)) + theme_bw() + ggtitle("Simpson v nyears (MCDB)") nyears_simpson
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.