knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(dplyr)
library(ggplot2)
knitr::opts_chunk$set(fig.width=4, fig.height=3, warning = FALSE, message = FALSE) 
# Read in the site-level results (percentiles, feasible set sizes, etc.) for all datasets
all_di <- read.csv(here::here("analysis", "reports", "all_di.csv"), stringsAsFactors = FALSE)

# Log-transform key quantities and restrict to the communities used in the manuscript
all_di <- all_di %>%
  mutate(log_nparts = log(gmp:::as.double.bigz(nparts)), # feasible set size, stored as a big integer
         log_nsamples = log(nsamples),
         log_s0 = log(s0),
         log_n0 = log(n0)) %>%
  filter(n0 > s0,
         !singletons,
         dat %in% c("bbs", "fia_short", "fia_small", "gentry", "mcdb", "misc_abund_short")) %>%
  mutate(dat = ifelse(grepl(dat, pattern = "fia"), "fia", dat), # combine the two FIA subsets
         dat = ifelse(dat == "misc_abund_short", "misc_abund", dat))

The FIA sites stand out from the other datasets in that, overall, they do not show markedly extreme values. This is especially the case for skewness:

ggplot(filter(all_di, s0 > 2, skew_unique > 20), aes(skew_percentile_excl)) +
  geom_histogram(bins = 20) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free_y")

ggplot(filter(all_di, simpson_unique > 20), aes(simpson_percentile)) +
  geom_histogram(bins = 20) +
  theme_bw() +
  facet_wrap(vars(dat), scales = "free_y")

One possible explanation is that the FIA sites tend to have small S and N, which translates into small feasible sets and potentially broad distributions of expected values for skewness and evenness.
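To make the first possibility concrete, here is a minimal sketch of how the feasible set (the set of partitions of N individuals into S nonzero species abundances) grows with S and N. This is illustrative only, not part of the analysis pipeline, and count_partitions is an ad hoc helper rather than a package function.

# Number of partitions of n individuals into exactly k positive parts, i.e. the
# feasible set size, using the recurrence p(n, k) = p(n - 1, k - 1) + p(n - k, k)
count_partitions <- function(n, k) {
  p <- matrix(0, nrow = n + 1, ncol = k + 1) # p[i + 1, j + 1] = partitions of i into exactly j parts
  p[1, 1] <- 1                               # one way to partition 0 into 0 parts
  for (j in 1:k) {
    for (i in j:n) {
      p[i + 1, j + 1] <- p[i, j] + p[i - j + 1, j + 1]
    }
  }
  p[n + 1, k + 1]
}

count_partitions(20, 4)   # a small, FIA-like community
count_partitions(200, 20) # a larger community has a vastly larger feasible set

Even over modest gradients in S and N, this grows very quickly, so FIA-sized communities have far fewer possible shapes their SAD can take.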

It is also possible that the FIA datasets differ qualitatively from other datasets, and that this drives the difference.

We may be able to disentangle these possibilities by comparing the FIA sites to sites from the other datasets that are of a similar size.

If we compare the FIA sites to sites from other datasets that are of a similar size, the comparison is limited by the fact that our sites are nonuniformly distributed in S-by-N space, or equivalently in the number of elements in their feasible sets. Specifically, the FIA sites are concentrated towards especially small feasible sets compared to the sites from "other datasets", even the ones that fall within the general range of sizes found in FIA.

# Restrict to the region of S-by-N space occupied by FIA
fia_max_s = max(filter(all_di, dat == "fia")$s0)
fia_max_n = max(filter(all_di, dat == "fia")$n0)

fia = filter(all_di, dat == "fia")

small_di <- all_di %>%
  filter(s0 <= fia_max_s,
         n0 <= fia_max_n) %>%
  mutate(fia_yn = ifelse(dat == "fia", "fia", "other datasets"),
         right_corner = FALSE)

# Flag non-FIA sites in the "upper right corner" of S-by-N space: sites whose n0
# exceeds the largest n0 found among FIA sites with at least as many species
for(i in 1:nrow(small_di)) {
  if(small_di$dat[i] != "fia") {
    fia_match_s0 = filter(fia, s0 >= small_di$s0[i])

    max_n0_for_this_s0 = max(fia_match_s0$n0)

    small_di$right_corner[i] = small_di$n0[i] > max_n0_for_this_s0

  }
}


small_di <- small_di %>%
  filter(!right_corner)

ggplot(small_di, aes(x = s0, y = n0, color = log_nparts)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .7) +
  facet_wrap(vars(fia_yn)) +
  theme(legend.position = "top")

ggplot(small_di, aes(log_nparts)) +
  geom_histogram(bins = 30) +
  theme_bw() +
  facet_wrap(vars(fia_yn), scales = "free_y")

This means that an overall histogram for FIA would mostly represent sites with log feasible set sizes in roughly the 4-7 range, while one for the "other datasets" would represent a broader spread, roughly from 3 to 16. Given that we suspect the number of parts (the size of the feasible set) is quite important, this is a sticky problem.
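One way to quantify this imbalance (a quick check, not part of the formal analysis) is to compare the quartiles of log_nparts between the two groups:

# Quartiles of log feasible set size for FIA vs. the other datasets
small_di %>%
  group_by(fia_yn) %>%
  summarize(q25 = quantile(log_nparts, 0.25),
            median = median(log_nparts),
            q75 = quantile(log_nparts, 0.75))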

We can subsample FIA so there is equal representation of all S and N combinations between FIA and "other datasets".

Some details on the subsampling:

# Split the size-restricted data into FIA and non-FIA sites
not_fia <- filter(small_di, fia_yn != "fia")
fia <- filter(small_di, dat == "fia")

# For each S, N combination in the size-restricted data, record whether it
# occurs in FIA and whether it occurs in the other datasets
small_di_s_n <- small_di %>%
  select(s0, n0) %>%
  distinct() %>%
  mutate(in_fia = FALSE,
         in_not_fia = FALSE)

for(i in 1:nrow(small_di_s_n)) {

  this_s0 = small_di_s_n$s0[i]
  this_n0 = small_di_s_n$n0[i]

  fia_matches = filter(fia, s0 == this_s0, n0 == this_n0)
  not_fia_matches = filter(not_fia, s0 == this_s0, n0 == this_n0)

  small_di_s_n$in_fia[i] = nrow(fia_matches) > 0
  small_di_s_n$in_not_fia[i] = nrow(not_fia_matches) > 0

}

# Keep only the S, N combinations present in both groups
small_di_s_n <- filter(small_di_s_n, in_fia, in_not_fia)

set.seed(1977)

subsamples <- list()

# For each shared S, N combination, draw equal numbers of FIA and non-FIA sites
# (the smaller of the two group sizes), sampling without replacement
for(i in 1:nrow(small_di_s_n)) {

  this_s0 = small_di_s_n$s0[i]
  this_n0 = small_di_s_n$n0[i]

  fia_matches = filter(fia, s0 == this_s0, n0 == this_n0)
  not_fia_matches = filter(not_fia, s0 == this_s0, n0 == this_n0)

  n_to_draw = min(nrow(fia_matches), nrow(not_fia_matches))

  fia_draw = sample.int(n = nrow(fia_matches), size = n_to_draw, replace = FALSE)
  not_fia_draw = sample.int(n = nrow(not_fia_matches), size = n_to_draw, replace = FALSE)

  subsamples[[i]] <- bind_rows(fia_matches[fia_draw, ], not_fia_matches[not_fia_draw, ])

}

sub_di <- bind_rows(subsamples) %>%
   mutate(Dataset = ifelse(fia_yn == "fia", "Forest Inventory and Analysis", "Other datasets"))

ggplot(small_di, aes(log(s0), log(n0))) +
  geom_point(color = "gray") +
  theme_bw() +
  geom_point(data = sub_di, alpha = 1)

ggplot(sub_di, aes(log_nparts)) +
  geom_histogram() +
  theme_bw()

The dark dots are the sites for which we can find at least one exact match in s0 and n0 between a FIA site and a site from another dataset. The histogram is the distribution of feasible set sizes (log_nparts) represented in the subsample.
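As a quick check (illustrative, not part of the formal analysis), we can count how many distinct S-by-N combinations the matched subsample spans:

# Number of distinct s0, n0 combinations represented in the subsample
n_distinct(sub_di$s0, sub_di$n0)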

Here are the results for that subsample:

ggplot(filter(sub_di, skew_unique > 20), aes(skew_percentile_excl)) +
  geom_histogram()+
  theme_bw() +
  facet_wrap(vars(Dataset), scales = "free_y")  +
  xlab("Percentile rank of observed skewness value relative to feasible set") +
  ylab("Number of communities")

ggplot(filter(sub_di, simpson_unique > 20), aes(simpson_percentile)) +
  geom_histogram()+
  theme_bw() +
  facet_wrap(vars(Dataset), scales = "free_y")  +
  xlab("Percentile rank of observed evenness value relative to feasible set") +
  ylab("Number of communities")

# Summary table: proportion of extreme communities in each group, restricted to
# communities with skew_unique / simpson_unique of at least 20
sub_di %>%
  filter(s0 > 2) %>%
  mutate(skew_percentile_excl = ifelse(skew_unique >= 20, skew_percentile_excl, NA),
         simpson_percentile = ifelse(simpson_unique >= 20, simpson_percentile, NA)) %>%
  group_by(Dataset) %>%
  summarize(prop_skew_high = mean(skew_percentile_excl > 95, na.rm = TRUE),
            prop_even_low = mean(simpson_percentile < 5, na.rm = TRUE),
            nsites_skew = sum(!is.na(skew_percentile_excl)),
            nsites_even = sum(!is.na(simpson_percentile))) %>%
  rename("Proportion of communities with skewness above 95th percentile" = prop_skew_high,
         "Number of communities analyzed for skewness" = nsites_skew,
         "Proportion of communities with evenness below 5th percentile" = prop_even_low,
         "Number of communities analyzed for evenness" = nsites_even)

For the subsample that is directly comparable, there is no discernible difference between FIA and the other datasets in the distribution of percentile values. This is a very small subsample relative to all the sites in both groups: FIA has 20,000 sites overall, and we are only looking at around 375 of them. However, given how sensitive results aggregated in this way are to the relative frequency of S-by-N combinations (and therefore feasible set sizes) in the sample, I believe this is the most robust way we can ask whether being from FIA causes detectably different outcomes.
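For reference, the counts behind these numbers can be checked directly (again, a quick check rather than new analysis):

# Sites per group in the matched subsample, and FIA sites in the filtered data
count(sub_di, Dataset)
nrow(filter(all_di, dat == "fia"))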

The absence of a difference here implicates the distribution of community sizes (S and N, and therefore feasible set sizes) in FIA as driving the difference in pattern. In the main manuscript, we document that the 95% ratios for both skewness and evenness decrease over large gradients in S and N. In the self_similarity supplement, we show that this is also the case for other metrics of self-similarity.


