knitr::opts_chunk$set(echo = FALSE) library(drake) library(dplyr) library(ggplot2) all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = F) all_di <- all_di %>% mutate(log_nparts = log(gmp:::as.double.bigz(nparts)), log_nsamples = log(nsamples)) %>% mutate(prop_found = exp(log_nsamples - log_nparts)) %>% mutate(found_all = prop_found==1)
Here is where our communities fall in S and N space:
ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = dat)) + geom_point(alpha = .8) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_d()
Here is how that translates into the size of the feasible set:
ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = log_nparts)) + geom_point(alpha = .5) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(option = "magma", end = .9, begin = .1)
Note that the color scale is log transformed, so the largest communities have e^r max(all_di$log_nparts)
, or r exp(max(all_di$log_nparts))
, elements in the feasible set!
Here is how the size of the feasible set maps on to N/S. It increases with n0/s0 and s0.
ggplot(filter(all_di, singletons == F), aes(x = log(n0 / s0), y = log_nparts, color = log(s0))) + theme_bw() + geom_point(alpha = .2) + theme(legend.position = "top") + scale_color_viridis_c()
Here is how many samples we are achieving:
ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = log_nsamples)) + geom_point(alpha = .3) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(option = "magma", end = .9)
Only in small communities do we get appreciably fewer than r max(all_di$nsamples)
samples.
Here is how the number of samples we're getting compares to the size of the feasible set:
ggplot(filter(all_di, singletons == F), aes(x = prop_found)) + theme_bw() + geom_histogram(bins = 100) ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = prop_found)) + geom_point(alpha = .7) + geom_point(data = filter(all_di, singletons == F, prop_found == 1), color = "grey") + theme_bw() + scale_color_viridis_c(option = "magma", end = .9, direction = -1) #+ #theme(legend.position = "top") ggplot(filter(all_di, singletons == F), aes(x = prop_found, y =nsamples)) + theme_bw() + geom_vline(xintercept = c((.95), (.1))) + geom_point(alpha = .3) + geom_point(data = filter(all_di, singletons == F, prop_found == 1), color = "grey") prop_found_ecdf <- ecdf(filter(all_di,singletons == F)$prop_found) pf_df <- data.frame(prop_found = seq(0, 1, by = 0.001), cdf = prop_found_ecdf(seq(0, 1, by = 0.001))) ggplot(data = pf_df, aes(x = prop_found, y = cdf)) + geom_line() + theme_bw()
For about r 100 * mean(filter(all_di, singletons == F)$prop_found == 1)
% of sites, we found all the elements of the FS. The vast majority of this is FIA - here is what happens if we take out FIA:
ggplot(filter(all_di, singletons == F, dat != "fia_short"), aes(x = prop_found)) + theme_bw() + geom_histogram(bins = 100) ggplot(filter(all_di, singletons == F, dat != "fia_short"), aes(x = log(s0), y = log(n0), color = prop_found)) + geom_point(alpha = .7) + geom_point(data = filter(all_di, singletons == F, dat != "fia_short", prop_found == 1), color = "grey") + theme_bw() + scale_color_viridis_c(option = "magma", end = .9, direction = -1)# + #theme(legend.position = "top") ggplot(filter(all_di, singletons == F, dat != "fia_short"), aes(x = prop_found, y =nsamples)) + theme_bw() + geom_vline(xintercept = c((.95), (.1))) + geom_point(alpha = .3) + geom_point(data = filter(all_di, singletons == F, prop_found == 1, dat != "fia_short"), color = "grey") prop_found_ecdf <- ecdf(filter(all_di,singletons == F, dat != "fia_short")$prop_found) pf_df <- data.frame(prop_found = seq(0, 1, by = 0.001), cdf = prop_found_ecdf(seq(0, 1, by = 0.001))) ggplot(data = pf_df, aes(x = prop_found, y = cdf)) + geom_line() + theme_bw()
Without FIA, we find all the samples about r 100 * mean(filter(all_di, singletons == F, dat != "fia_short")$prop_found == 1)
% of the time.
Here is the overall distribution of skewness, and if we split based on whether we found all the samples:
ggplot(filter(all_di, singletons == F), aes(x = skew_percentile, y =..count.. / sum(..count..))) + geom_histogram(bins = 100) + theme_bw() + geom_hline(yintercept = .01) ggplot(filter(all_di, singletons == F), aes(x = skew_percentile)) + theme_bw() + geom_histogram(bins = 100) + facet_wrap(vars(found_all), scales = "free_y")
When we found all the samples, the percentiles are more evenly distributed. I do not read much into the spike at 0 for those communities, because skewness is bizarre for very small communities.
Here is how skewness maps with S and N:
ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = skew_percentile)) + geom_point(alpha = .5) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(option = "plasma", end = .9) # ggplot(filter(all_di, singletons == F), aes(x = log(n0/s0), y = skew_percentile, color = prop_found)) + # geom_point() + # theme_bw() + # scale_color_viridis_c(option = "plasma", end = .9, direction = -1) + # theme(legend.position = "top") # # ggplot(filter(all_di, singletons == F), aes(x = log(n0 / s0), y = skew_percentile, color = log(s0))) + # geom_point(alpha = .2) + # theme_bw() + # theme(legend.position = "top") + scale_color_viridis_c()
The very low skewness values are down in the very small and very weird communities. There may be variation along S and N elsewhere, but it is hard to parse.
Here is the overall evenness distribution, and split by whether we found 'em all:
ggplot(filter(all_di, singletons == F), aes(x = simpson_percentile, y =..count.. / sum(..count..))) + geom_histogram(bins = 100) + theme_bw() + geom_hline(yintercept = .01) ggplot(filter(all_di, singletons == F), aes(x = simpson_percentile)) + theme_bw() + geom_histogram(bins = 100) + facet_wrap(vars(found_all))
Simpson is less evenly distributed than skewness. Again, where we found them all, we don't see the disproportionately common low percentile values.
Here is how Simpson behaves in S and N space:
# ggplot(filter(all_di, singletons == F), aes(x = log(n0 / s0), y = simpson_percentile, color = log(s0))) + # geom_point(alpha = .2) + # theme_bw() + # theme(legend.position = "top") + scale_color_viridis_c() ggplot(filter(all_di, singletons == F), aes(x = log(s0), y = log(n0), color = simpson_percentile)) + geom_point(alpha = .5) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(option = "plasma", end = .9) # ggplot(filter(all_di, singletons == F), aes(x = log(s0/n0), y = simpson_percentile, color = prop_found)) + # geom_point() + # theme_bw() + # scale_color_viridis_c(option = "plasma", end = .9, direction = -1) + # theme(legend.position = "top")
There is unusual behavior where S is large and N/S is relatively small (log N/S <= 1.5), where evenness is unusually high.
For both skew and evenness, we do not see non-extreme percentile values in large communities:
ggplot(filter(all_di, singletons == F), aes(x = log(n0 / s0), y = skew_percentile, color = log(n0))) + geom_point(alpha = .1) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(end = .9) ggplot(filter(all_di, singletons == F), aes(x = log(n0 / s0), y = simpson_percentile, color = log(n0))) + geom_point(alpha = .1) + theme_bw() + theme(legend.position = "top") + scale_color_viridis_c(end = .9)
Here is how singletons change percentiles, broken out by whether or not we found all the samples:
singletons_effect <- all_di %>% select(dat, nsamples, site, singletons, skew_percentile, simpson_percentile, found_all) %>% tidyr::pivot_wider(names_from = singletons, values_from = c(skew_percentile, simpson_percentile, nsamples, found_all)) %>% mutate(skewdiff = skew_percentile_TRUE - skew_percentile_FALSE, simpdiff= simpson_percentile_TRUE - simpson_percentile_FALSE, found_all = (found_all_TRUE + found_all_FALSE) == 2) ggplot(data = singletons_effect, aes(x = skew_percentile_FALSE, y = skew_percentile_TRUE)) + geom_point(alpha = .1) + theme_bw() + geom_abline(intercept = 0, slope = 1, color = "green") + facet_wrap(vars(found_all)) ggplot(data = singletons_effect, aes(x = simpson_percentile_FALSE, y = simpson_percentile_TRUE)) + geom_point(alpha = .1) + theme_bw() + geom_abline(intercept = 0, slope = 1, color = "green") + facet_wrap(vars(found_all))
The rarefaction-inflated datasets are strongly // the raw vectors. They have more extreme skewness and evenness values, relative to their feasible sets, than the raw vectors. This is almost always true for evenness, with a little more noise in the skewness signal. But either way, very strong.
cache_loc = "macdb" ## Set up the cache and config db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite"))) cache <- storr::storr_dbi("datatable", "keystable", db) all_di_macd <- readd(all_di_obs, cache = cache) all_di_macd_manip <- readd(all_di_manip, cache = cache) macd_dat <- readd(dat_s_dat_macdb, cache = cache) DBI::dbDisconnect(db) rm(cache) all_di_macd <- all_di_macd %>% mutate(log_nparts = log(gmp:::as.double.bigz(nparts)), log_nsamples = log(nsamples)) %>% mutate(prop_found = exp(log_nsamples - log_nparts)) %>% mutate(found_all = prop_found==1) # # all_di_macd_manip <- all_di_macd_manip %>% # mutate(log_nparts = log(gmp:::as.double.bigz(nparts)), # log_nsamples = log(nsamples)) %>% # mutate(prop_found = exp(log_nsamples - log_nparts)) %>% # mutate(found_all = prop_found==1) ggplot(data = all_di, aes(x = log(s0), y = log(n0))) + geom_point(alpha = .1) + geom_point(data = all_di_macd, aes(x = log(s0), y = log(n0)), color = "blue", alpha = .9) + theme_bw() ggplot(data = filter(all_di, !singletons), aes(x = log(s0), y = log(n0))) + geom_point(alpha = .1) + geom_point(data = filter(all_di_macd, !singletons), aes(x = log(s0), y = log(n0)), color = "blue", alpha = .9) + theme_bw()
Here are the distributions of skew and evenness, overall.
ggplot(data = all_di_macd, aes(x = skew_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(found_all), scales = "free_y") ggplot(data = all_di_macd, aes(x = simpson_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(found_all), scales = "free_y")
Here is how manipulation affects things:
ggplot(data = all_di_macd_manip, aes(x = skew_percentile, y = ctrl_skew_percentile)) + geom_point(alpha = .5) + # xlim(0, 100) + # ylim(0, 100) + theme_bw() + geom_abline(intercept = 0, slope = 1, color = "green") ggplot(data = all_di_macd_manip, aes(x = simpson_percentile, y = ctrl_skew_percentile)) + geom_point(alpha = .5) + xlim(0, 100) + ylim(0, 100) + theme_bw() + geom_abline(intercept = 0, slope = 1, color = "green") ggplot(data = all_di_macd_manip, aes(x = simpson_change)) + geom_histogram() + theme_bw() ggplot(data = all_di_macd_manip, aes(x = skew_change)) + geom_histogram() + theme_bw() ggplot(data = all_di_macd_manip, aes(x = ctrl_simpson_percentile, y = simpson_change)) + geom_point(alpha = .3) + geom_hline(yintercept = 0) + theme_bw() ggplot(data = all_di_macd_manip, aes(x = ctrl_skew_percentile, y = skew_change)) + geom_point(alpha = .3) + geom_hline(yintercept = 0) + theme_bw() print(t.test(all_di_macd_manip$skew_percentile, all_di_macd_manip$ctrl_skew_percentile, paired = T) ) print(t.test(all_di_macd_manip$simpson_percentile, all_di_macd_manip$ctrl_simpson_percentile, paired = T) ) print(wilcox.test(all_di_macd_manip$skew_percentile, all_di_macd_manip$ctrl_skew_percentile, paired = T)) print(wilcox.test(all_di_macd_manip$simpson_percentile, all_di_macd_manip$ctrl_simpson_percentile, paired = T))
Change is going to be bounded at 100 and 0: you can't go up or down from there. (Another argument for increasing the number of samples?)
cache_loc = "portal_plants_manip" ## Set up the cache and config db <- DBI::dbConnect(RSQLite::SQLite(), here::here("analysis", "drake", paste0("drake-cache-", cache_loc, ".sqlite"))) cache <- storr::storr_dbi("datatable", "keystable", db) all_di_p<- readd(all_di_manip, cache = cache) all_di_p <- all_di_p %>% mutate(log_nparts = log(gmp:::as.double.bigz(nparts)), log_nsamples = log(nsamples)) %>% mutate(prop_found = exp(log_nsamples - log_nparts)) %>% mutate(found_all = prop_found==1) DBI::dbDisconnect(db) rm(cache)
Nsamples, singletons
# # ggplot(data = all_di_p, aes(x = nsamples)) + # geom_histogram(bins = 100) + # theme_bw() + # geom_vline(xintercept = 2000) # # all_di_p <- filter(all_di_p, nsamples >= 2000) ggplot(data = all_di_p, aes(x = skew_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(found_all), scales = "free_y") ggplot(data = all_di_p, aes(x = simpson_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(found_all), scales = "free_y") # # pp_singletons_effect <- all_di_p %>% # select(singletons, skew_percentile, simpson_percentile, year, plot, treatment, season) %>% # tidyr::pivot_wider(names_from = singletons, values_from = c("skew_percentile", "simpson_percentile")) # # ggplot(data = pp_singletons_effect, aes(x = skew_percentile_FALSE, y = skew_percentile_TRUE)) + # geom_point(alpha = .1) + # geom_abline(intercept = 0, slope = 1, color = "green") + # theme_bw() # # # ggplot(data = pp_singletons_effect, aes(x = simpson_percentile_FALSE, y = simpson_percentile_TRUE)) + # geom_point(alpha = .1) + # geom_abline(intercept = 0, slope = 1, color = "green") + # theme_bw()
By treatment, season
ggplot(data = filter(all_di_p, singletons == F), aes(x = skew_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(season, treatment), scales = "free_y") ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = skew_percentile)) + geom_boxplot() + theme_bw() + facet_wrap(vars(season)) ggplot(data = filter(all_di_p, singletons == F), aes(x = simpson_percentile)) + geom_histogram(bins = 100) + theme_bw() + facet_wrap(vars(season, treatment), scales = "free_y") ggplot(data = filter(all_di_p, singletons == F), aes(x = treatment, y = simpson_percentile)) + geom_boxplot() + theme_bw() + facet_wrap(vars(season))
Median
pp_median <- all_di_p %>% filter(singletons == F) %>% group_by(season, year, treatment) %>% summarize(skew_percentile = median(skew_percentile), simpson_percentile = median(simpson_percentile)) %>% ungroup() %>% tidyr::pivot_wider(names_from = treatment, values_from = c(skew_percentile, simpson_percentile)) sk_x <- ggplot(data = pp_median, aes(x = skew_percentile_control, y = skew_percentile_exclosure)) + geom_point(alpha = .5) + facet_wrap(vars(season)) + theme_bw() + ggtitle("Skew Ctrl v Exclosure") + geom_abline(intercept = 0, slope = 1, color = "green") + xlim(40, 100) + ylim(40, 100) sk_r <- ggplot(data = pp_median, aes(x = skew_percentile_control, y = skew_percentile_removal)) + geom_point(alpha = .5) + facet_wrap(vars(season)) + theme_bw() + ggtitle("Skew Ctrl v Removal") + geom_abline(intercept = 0, slope = 1, color = "green") + xlim(40, 100) + ylim(40, 100) gridExtra::grid.arrange(grobs = list(sk_x, sk_r), nrow = 2) sp_x <- ggplot(data = pp_median, aes(x = simpson_percentile_control, y = simpson_percentile_exclosure)) + geom_point(alpha = .5) + facet_wrap(vars(season)) + theme_bw() + ggtitle("Simpson Ctrl v Exclosure") + geom_abline(intercept = 0, slope = 1, color = "green") + xlim(0, 40) + ylim(0, 40) sp_r <- ggplot(data = pp_median, aes(x = simpson_percentile_control, y = simpson_percentile_removal)) + geom_point(alpha = .5) + facet_wrap(vars(season)) + theme_bw() + ggtitle("Simpson Ctrl v Removal") + geom_abline(intercept = 0, slope = 1, color = "green") + xlim(0, 40) + ylim(0, 40) gridExtra::grid.arrange(grobs = list(sp_x, sp_r), nrow = 2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.