# TRUE only when the environment variable LOCAL is set to exactly "true".
IS_LOCAL <- identical(Sys.getenv("LOCAL"), "true")

# Global knitr chunk options for this vignette.
# NOTE(review): the enclosing call was missing from the extracted source; it is
# restored here as the standard knitr::opts_chunk$set() wrapper -- confirm.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  purl = IS_LOCAL
)


The basic info we want for each dataset is:

For the latter two, it's probably easier to rely on external packages to retrieve the info, so the summaries will first focus on the remaining entries.

BioTime summaries

Define a function that generates the summary for a single BioTime dataset. Note that we store the summary in a tibble, so that summaries can be assembled easily across datasets.

# Build a summary tibble for one BioTime dataset.
#
# Arguments:
#   dataset_id  id of the BioTime dataset to summarize (default 10)
#   ...         additional arguments passed on to get_biotime_data()
#               NOTE(review): the original call was truncated after
#               `dataset_id = dataset_id,`; forwarding `...` is a
#               reconstruction -- confirm against the upstream source.
#
# Returns:
#   A tibble with the dataset id, the species table (as a list-column), the
#   start and end time (as character), the sampling frequency, and the
#   spatial extent (number of locations, central lat/long, and lat/long range).
summarize_biotime_data <- function(dataset_id = 10, ...)
{
    dat <- get_biotime_data(dataset_id = dataset_id, ...)

    summ_info <- tibble(dataset_id = dataset_id)

    # determine species info
    # (wrapped in a list so the whole table occupies a single cell)
    summ_info$species_table <- I(list(dat$metadata$species_table))

    # determine start and end time
    # annual datasets are summarized by `year`, all others by `date`
    if (dat$metadata$is_annual_sampling) {
        var <- "year"
    } else {
        var <- "date"
    }
    summ_info$start_time <- as.character(min(dat$covariates[[var]], na.rm = TRUE))
    summ_info$end_time <- as.character(max(dat$covariates[[var]], na.rm = TRUE))

    # determine sampling frequency
    if (dat$metadata$is_annual_sampling) { # missing month data in raw values
        summ_info$sampling_frequency <- "annual (or less)"
    } else if (length(unique(dat$covariates$year)) >= NROW(dat$covariates)) {
        # no year occurs more than once, so sampling is at most once per year
        summ_info$sampling_frequency <- "annual (or less)"
    } else {
        summ_info$sampling_frequency <- "subannual"
    }

    # determine spatial extent
    summ_info$number_locations <- dat$metadata$number_lat_long
    summ_info$lat <- dat$metadata$cent_lat
    summ_info$long <- dat$metadata$cent_long
    summ_info$lat_min <- min(dat$covariates$latitude, na.rm = TRUE)
    summ_info$lat_max <- max(dat$covariates$latitude, na.rm = TRUE)
    summ_info$long_min <- min(dat$covariates$longitude, na.rm = TRUE)
    summ_info$long_max <- max(dat$covariates$longitude, na.rm = TRUE)

    # return the summary explicitly; otherwise the function would yield the
    # value of the last assignment instead of the tibble
    summ_info
}


Now repeat for all the datasets within BioTime

# Look up the ids of all available BioTime datasets.
biotime_dataset_ids <- get_biotime_dataset_ids()

# Summarize each dataset and row-bind the per-dataset results into one table.
biotime_summaries <- map_dfr(
    .x = biotime_dataset_ids,
    .f = summarize_biotime_data
)

Extract just the dataset-level info and save it, so that the vignette can be built without first downloading and processing the BioTime data.

# Keep only the dataset-level columns by dropping the species list-column.
# NOTE(review): the right-hand side of this pipe was missing from the extracted
# source; `select(-species_table)` is a reconstruction based on the
# surrounding text -- confirm against the upstream source.
dataset_info <- biotime_summaries %>%
    select(-species_table)

# Save the summary into the package's inst/ folder.
saveRDS(dataset_info, here::here("inst/biotime_dataset_info.RDS"))

# Read the summary back from the installed MATSS package and display it.
# NOTE(review): system.file() looks in the installed package, so this may not
# be the file written just above until the package is reinstalled -- confirm.
dataset_info_file <- system.file("biotime_dataset_info.RDS", 
                                 package = "MATSS", mustWork = TRUE)
dataset_info <- readRDS(dataset_info_file)
knitr::kable(dataset_info, digits = 2)

Extract out just the taxa:

# Expand the per-dataset species tables into one long table of taxa.
# The list-column is named explicitly: calling unnest() without specifying
# columns is deprecated in tidyr >= 1.0 and emits a warning.
species_info <- biotime_summaries %>%
    unnest(species_table) %>%
    select(dataset_id, genus, species, genus_species)

# knitr::kable(species_info)

Search for duplicates

weecology/MATSS-pipeline documentation built on May 16, 2020, 1:54 p.m.