# data-raw/DATASET.R

### This is a short script to make the subsetted data from Roswell et al. 2019, PLoS One (Dryad: https://doi.org/10.5061/dryad.c3rr6q1), used for figures in Roswell et al. 2020, Oikos.
library(dplyr)
library(tidyr)


#################################
#read in data from .csv on Dryad
################################
XerDate <- read.csv("https://datadryad.org/stash/downloads/file_stream/93833") %>%
  dplyr::filter(keep == "Y" &
           bee_species != "sp" &
           bee_species != "species" &
           bee_genus != "sandwasp" &
           bee_genus != "sand wasp" &
           bee_genus != "Anacrabro") %>% #note non-bee accidentally uploaded
  tidyr::unite(col="sr", site, sampling_round, remove = FALSE) %>%
  tidyr::unite(col="siteday",site, sday, remove = FALSE) %>%
  droplevels()

XerDay <- XerDate %>%
  dplyr::group_by(sr, sampling_round, site) %>%
  dplyr::mutate(sday1 = dplyr::dense_rank(sday)) # sday1 ranks sampling days within each site-round

###############################
## Subset PLoS One dataset ####
###############################

# use the old "Rounding" sampler because the default sample() algorithm changed in R 3.6.0; this keeps the subsample reproducible
set.seed(5062, sample.kind = "Rounding")
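# An optional, hedged alternative for portability: on R < 3.6.0, set.seed() has no
# sample.kind argument, but the old "Rounding" sampler was the only behavior there,
# so a plain set.seed() call reproduces the same draws:
# if (getRversion() >= "3.6.0") {
#   set.seed(5062, sample.kind = "Rounding")
# } else {
#   set.seed(5062)
# }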

makedat1 <- XerDay %>%
  dplyr::group_by(sr, sday) %>%
  dplyr::do(dplyr::filter(., length(unique(boutstart)) > 4)) %>%
  # drop dates with insufficient sampling effort (fewer than 5 bouts)
  dplyr::do(dplyr::filter(.
                          , boutstart %in% sample(unique(boutstart)
                                                   , size=5
                                                  #select 5 bouts from each day of sampling at each site (equal effort)
                                                   , replace = FALSE))) %>%
  dplyr::ungroup()
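
# Sanity check (an optional sketch): after subsampling, every retained
# site-round x day combination should contribute exactly five sampling bouts.
makedat1 %>%
  dplyr::group_by(sr, sday) %>%
  dplyr::summarize(n_bouts = dplyr::n_distinct(boutstart), .groups = "drop") %>%
  dplyr::count(n_bouts) # expect a single row with n_bouts == 5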


################################
# create package dataset beeObs
################################
beeObs <- makedat1 %>%
  dplyr::select(uniqueID, bee, start=boutstart, sr, sday1, site, sampling_round) %>%
  # FOUR SITE-ROUNDS SELECTED TO DEMONSTRATE PITFALLS IN DIVERSITY COMPARISON
  dplyr::filter(sr %in% c(
    "IAS_3"
    , "Cold Soil_5"
    , "Lord Stirling_4"
    , "Fox Hill_5"
    ))


####################################
# create package dataset beeAbunds
#####################################

# summarize beeObs into a species-by-site-round abundance table
beeAbunds <- beeObs %>% #summary table used in MS
  dplyr::group_by(sr, bee) %>%
  dplyr::summarize(abund = n()) %>%
  tidyr::spread(sr, abund, fill = 0)
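
# Quick check (an optional sketch): total specimens per site-round. The largest of
# these totals is presumably what motivates the upper bound of the individual-based
# rarefaction sample sizes below (sz runs up to 745).
colSums(beeAbunds[, -1])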




############################################################
# generate individual-based rarefaction diversity estimates
############################################################
# this takes a long time to run, so we run it once here and export only the
# data it creates with the package

sz <- 1:745 #sample sizes from 1 to maximum observed total abundance at a site
nrep <- 200 # 200 replicates is large enough because we only use means across replicates

tictoc::tic() #time this
nc <- parallel::detectCores() - 1 #nc is number of cores to use in parallel processing
future::plan(strategy = future::multisession, workers = nc) # set up parallel workers (future::multiprocess is deprecated)


div_ests <- purrr::map_dfr(1:nrep, function(k){
  furrr::future_map_dfr(sz, function(x){
    purrr::map(2:length(beeAbunds), function(sitio){ # iterate over the site-round columns
      f <- beeAbunds %>% dplyr::select(dplyr::all_of(sitio))
      if(sum(f) > x){
        comm = dplyr::pull(f) %>%
          subsam(size = x)
        return(data.frame(obs_est(comm)
                          , site = names(f)))
      }
    })
  })
})

tictoc::toc() #507 seconds, not bad.
mean_narm <- function(x){mean(x, na.rm = TRUE)} # NA-safe mean

###########################
# create dataset div_ests for package
##############################

# summarize rarefaction data
mean_ests <- div_ests %>%
  dplyr::group_by(site, individuals) %>%
  dplyr::summarize_all(mean_narm)


###########################
# sample-based rarefaction
###########################

maxeff <- 14 # maximum number of transects (sampling bouts) per site-round
reps <- 9999
tictoc::tic() #time this
# nc <- parallel::detectCores() - 1 #nc is number of cores to use in parallel processing
future::plan(strategy = future::multisession, workers = nc) # set up parallel workers (future::multiprocess is deprecated)

show_ests <- furrr::future_map_dfr(1:reps, function(x){
  purrr::map_dfr(1:maxeff, function(i){
    sam <- beeObs %>%
      dplyr::group_by(sr) %>%
      dplyr::do(filter(., start %in%
                         sample(unique(.$start)
                                , size = maxeff - i + 1
                                , replace = FALSE)))
    subcom <- sam %>% dplyr::group_by(bee, sr) %>%
      dplyr::summarize(abund = n()) %>%
      tidyr::spread(key = sr
                    , value = abund
                    , fill = 0) %>%
      as.data.frame()
    basicdat <- purrr::map_dfr(apply(subcom[, -1], MARGIN = 2, obs_est), rbind)
    return(data.frame(basicdat, site = names(subcom[, -1]), transects = maxeff - i + 1))
  })
})

tictoc::toc() #Under 2 hours with only 4 cores

effort_means <- show_ests %>%
  dplyr::group_by(site, transects) %>%
  dplyr::summarize_at(.vars = c("n"
                              , "coverage"
                              , "obsrich"
                              , "chaorich"
                              , "obsshan"
                              , "chaoshan"
                              , "obssimp"
                              , "chaosimp"), .funs = mean)


# gather to long format for a cleaner plot; reshaping is more involved now that asymptotic estimates are included alongside observed ones
effort_rare <- effort_means %>% dplyr::rename(
  "richness_observed" = obsrich
  , "richness_asymptotic" = chaorich
  , "Hill-Shannon_observed" = obsshan
  , "Hill-Shannon_asymptotic" = chaoshan
  , "Hill-Simpson_observed" = obssimp
  , "Hill-Simpson_asymptotic" = chaosimp
  , size = n
  , effort = transects) %>%
  tidyr::gather(key = "ell"
                , value = "diversity"
                , "richness_observed"
                , "richness_asymptotic"
                , "Hill-Shannon_observed"
                , "Hill-Shannon_asymptotic"
                , "Hill-Simpson_observed"
                , "Hill-Simpson_asymptotic") %>%
  tidyr::gather(key = method
                , value = xax
                , effort
                , size
                , coverage) %>%
  tidyr::separate(ell, c("divind", "etype"), sep = "_")

effort_rare$divind <- factor(effort_rare$divind
                          , levels = c("richness"
                                       , "Hill-Shannon"
                                       , "Hill-Simpson")
                          , labels = c("richness"
                                     , "Hill-Shannon"
                                     , "Hill-Simpson"))
effort_rare$method <- factor(effort_rare$method
                          , levels = c("effort"
                                     , "size"
                                     , "coverage"))
effort_rare$etype <- factor(effort_rare$etype
                         , levels = c("observed"
                                      , "asymptotic"))

####################################################
### make data for testing the estimators ##########
#####################################################

# simulate SADs for analysis
SADs_list <- purrr::map(c("lnorm", "gamma"), function(distr){
  purrr::map(c(100, 200), function(rich){
    purrr::map(c(0.1, 0.15, 0.25, 0.5, 0.75, 0.85), function(simp_Prop){ # simp_Prop is evenness
      fit_SAD(rich = rich, simpson = simp_Prop*(rich-1)+1, distr = distr)
    })
  })
})

nreps <- 9999

ss_to_test <- floor(10^seq(1, 3, 0.2)) # 11 sample sizes from 10 to 1000, evenly spaced on a log scale

flatsads <- purrr::flatten(purrr::flatten(SADs_list))
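
# Sanity check (an optional sketch): 2 distributions x 2 richness levels x 6 evenness
# values should yield 24 fitted SADs, matching the hard-coded 1:24 index used below.
stopifnot(length(flatsads) == 24)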

nc <- parallel::detectCores() - 1
future::plan(strategy = "multisession", workers = nc)
# options <- furrr::furrr_options(seed = TRUE)

tictoc::tic()
compare_ests <- purrr::map_dfr(1:24, function(SAD){
  purrr::map_dfr(-1:1, function(ell){
    truth = as.numeric(flatsads[[SAD]][[2]][2-ell])
    truep = flatsads[[SAD]][[3]]
    dinfo = data.frame(t(flatsads[[SAD]][[1]]))
    furrr::future_map_dfr(.x=1:nreps, .f=function(nrep){
      purrr::map_dfr(ss_to_test, function(ss){
        subsam = sample_infinite(truep, ss)
        chaoest = Chao_Hill_abu(subsam, ell)
        naive = rarity(subsam, ell)
        gods = GUE(subsam, truep, ell)
        return(dplyr::bind_cols(data.frame(truth, chaoest, naive, gods, n=ss, ell, SAD), dinfo))
      })
    }
    # , .options= options
    )
  })
})
tictoc::toc() # ~1hr on 2.9 GHz i7 quad core processor
# helper functions: NA-safe sum, root mean square (NA-safe), and NA-safe mean
nasum <- function(x){sum(x, na.rm = TRUE)}
rootms <- function(x){sqrt(nasum(((x)^2)/length(x)))}
namean <- function(x){mean(x, na.rm = TRUE)}
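
# A minimal worked example of the helpers (hypothetical input): nasum() drops NAs from
# the sum, but rootms() still divides by the full length(x), so for example
# rootms(c(3, 4, NA)) equals sqrt((9 + 16) / 3), roughly 2.89.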

errs <- compare_ests %>%
  dplyr::group_by(ell, distribution, fitted.parameter, n, SAD) %>%
  dplyr::mutate(godsError = gods - truth
                , naiveError = naive - truth
                , chaoError = chaoest - truth) %>%
  dplyr::summarize_at(.vars = c("godsError", "naiveError", "chaoError")
                      , .funs = c(rootms, namean)) %>%
  tidyr::pivot_longer(godsError_fn1:chaoError_fn2
                      , names_to = c("estimator", ".value")
                      , names_sep = "_") %>%
  dplyr::rename(rmse = fn1, bias = fn2)


#############################################################
## make data to show bias and variability for vignette #####
############################################################

# generate SADs

evenness <- c(0.1, 0.3, 0.5)
rich <- 100
sample_sizes <- floor(10^seq(1.5, 3, 0.5)) # i.e., 31, 100, 316, and 1000 individuals
# generate SADs with 100 spp and varying evenness, parametric distributions
SADS <- purrr::flatten(
  purrr::map(
    evenness, function(simp_evenness){
      purrr::map(
        c("lnorm", "gamma", "invgamma"), function(dist){
          fit_SAD(rich = rich
                  , simpson = simp_evenness*(rich-1)+1
                  , distr = dist
                  , int_lwr = ifelse(dist == "invgamma", 1e-2, 1e-4))
        })
    }))

# a warning alerts us that, with these parameters, the rare species in the gamma
# SADs are supremely rare


# set the number of replicate samples
reps <- 999

# set future plan

future::plan(strategy = "multisession", workers = parallel::detectCores()-1)

sample_data <- purrr::map_dfr(SADS, function(mySAD){
  evenness = (mySAD[[2]][3]-1)/(mySAD[[2]][1]-1)
  distribution = mySAD[[1]][1]
  purrr::map_dfr(sample_sizes, function(SS){
    sample_abundances = replicate(reps, sample_infinite(ab = mySAD[[3]]
                                                        , size = SS))
    furrr::future_map_dfr(seq(-1.5, 1.5, 0.05), function(ell){
      true_diversity = rarity(ab = mySAD[[3]], l = ell)
      sample_diversity = sapply(1:reps
                                , function(rep){
                                  rarity(ab = sample_abundances[, rep]
                                         , l = ell)})
      estimated_diversity = sapply(1:reps, function(rep){
        Chao_Hill_abu(sample_abundances[, rep], l = ell)})
      return(data.frame(evenness
                        , distribution
                        , SS
                        , ell
                        , true_diversity
                        , sample_diversity
                        , estimated_diversity
                        , row.names = NULL))
    })
  })
})

err_plot_data <- sample_data %>%
  dplyr::group_by(evenness, distribution, ell, SS) %>%
  dplyr::summarize(sample_sdlog = sd(ifelse(sample_diversity == 0
                                            , 0, Ln(sample_diversity)))
                   , estimator_sdlog = sd(ifelse(estimated_diversity == 0
                                                  , 0, Ln(estimated_diversity)))
                   , sample_bias = mean(Ln(sample_diversity) -
                                          mean(Ln(sample_diversity)))
                   , naive_bias = mean(Ln(sample_diversity) -
                                            Ln(true_diversity))
                   , estimator_bias = mean(Ln(estimated_diversity) -
                                             Ln(true_diversity))
                   , sample_rmsle = rmsle(sample_diversity)
                   , naive_rmsle = rmsle(sample_diversity, Ln(true_diversity))
                   , estimator_rmsle = rmsle(estimated_diversity
                                             , Ln(true_diversity))
                   , true_diversity = mean(true_diversity)
                   , mean_sample = mean(sample_diversity)
                   , mean_asymptotic = mean(estimated_diversity)) %>%
  dplyr::mutate(evenness = as.factor(round(evenness, 2))) %>%
  dplyr::rename(sample_size = SS) %>%
  dplyr::ungroup()


#############################################
#### turn into package data ##############
#######################################
usethis::use_data(errs, overwrite = TRUE)
usethis::use_data(beeObs, overwrite = TRUE)
usethis::use_data(beeAbunds, overwrite = TRUE)
usethis::use_data(mean_ests, overwrite = TRUE)
usethis::use_data(effort_rare, overwrite = TRUE)
usethis::use_data(err_plot_data, overwrite = TRUE)