# create.R
# In ecocomDP: Tools to Create, Use, and Convert ecocomDP Data

## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options for this vignette: fold each chunk's source and
# its output into a single block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## ----setup--------------------------------------------------------------------
library(ecocomDP)

## ----eval=FALSE---------------------------------------------------------------
#  # -----------------------------------------------------------------------------
#  # This function converts source dataset "knb-lter-hfr.118" (archived in the EDI
#  # Data Repository) to ecocomDP dataset "edi.193" (also archived in EDI)
#  #
#  # Arguments:
#  #
#  # path        Where the ecocomDP tables will be written
#  # source_id   Identifier of the source dataset
#  # derived_id  Identifier of the derived dataset
#  # url         The URL by which the derived tables and metadata can be accessed
#  #             by a data repository. This argument is used when automating the
#  #             repository publication step, but not used when manually
#  #             publishing.
#  #
#  # Value:
#  #
#  # tables      (.csv) ecocomDP tables
#  # metadata    (.xml) EML metadata for tables
#  #
#  # Details:
#  #             This function facilitates automated updates to the derived
#  #             "edi.193" whenever new data are added to the source
#  #             "knb-lter-hfr.118". The framework executing this maintenance
#  #             routine is hosted on a remote server and jumps into action
#  #             whenever an update notification is received for
#  #             "knb-lter-hfr.118". The maintenance routine parses the
#  #             notification to get the arguments to create_ecocomDP().
#  #
#  # Landing page to source dataset "knb-lter-hfr.118":
#  # https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-hfr&identifier=118
#  # Landing page to derived dataset "edi.193":
#  # https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=193
#  # -----------------------------------------------------------------------------
#  
#  # Libraries used by this function
#  
#  library(ecocomDP)
#  library(xml2)
#  library(magrittr)
#  library(data.table)
#  library(lubridate)
#  library(tidyr)
#  library(dplyr)
#  library(EDIutils)       # remotes::install_github("EDIorg/EDIutils")
#  library(taxonomyCleanr) # remotes::install_github("EDIorg/taxonomyCleanr")
#  
#  create_ecocomDP <- function(path,
#                              source_id,
#                              derived_id,
#                              url = NULL) {
#  
#    # Read source dataset -------------------------------------------------------
#  
#    # The source dataset is about ant communities and their functional traits
#    # changing in response to an invasive species. Observations are made across
#    # habitat types within the Harvard Experimental Forest. The dataset consists
#    # of a primary table listing abundances at sites through time, and an
#    # ancillary table listing physical and functional traits of observed species.
#  
#    # Read the source dataset from EDI
#  
#    eml <- EDIutils::api_read_metadata(source_id)
#    data <- EDIutils::read_tables(
#      eml = eml,
#      strip.white = TRUE,
#      na.strings = "",
#      convert.missing.value = TRUE,
#      add.units = TRUE)
#  
#    ants <- data$`hf118-01-ants.csv`
#    traits <- data$`hf118-02-functional-traits.csv`
#    ants$date <- ymd(ants$date)
#  
#    # Join and flatten the source dataset ---------------------------------------
#  
#    # Joining all source data and relevant metadata into one big flat table
#    # simplifies parsing into ecocomDP tables and facilitates referential
#    # integrity in the process.
#  
#    # Remove duplicate data from the ancillary table and join on species "code"
#  
#    traits <- traits %>% select(-genus, -species)
#    traits <- traits %>% rename(code = species.code)
#    wide <- left_join(ants, traits, by = "code")
#  
#    # Convert wide format to "flat" format. This is the wide form but gathered on
#    # core observation variables, which often number > 1 in source datasets. This
#    # "flat" table is the "widest" form that ecocomDP datasets can be
#    # consistently returned to by the ecocomDP::flatten_data() function, and is
#    # the input format required by the "create table" helpers we'll meet
#    # shortly. This dataset only has one core observation variable,
#    # "abundance", so gathering really only entails a change of column names.
#  
#    wide <- wide %>% rename(value_abundance = abundance)
#    flat <- pivot_longer(
#      wide,
#      cols = matches("abundance"),
#      names_to = c(".value", "variable_name"),
#      names_sep = '\\_')
#  
#    # We're now in a good place to begin adding columns of the ecocomDP tables
#    # we can create from this source dataset. We'll begin with the observation
#    # table.
#  
#    # Add columns for the observation table -------------------------------------
#  
#    # The frequency and timing of surveys (events) varied throughout the history
#    # of this dataset and are uniquely identifiable by grouping sample dates by
#    # year and month.
#  
#    flat$event_id <- flat %>% group_by(month = floor_date(flat$date, "month"),
#                                       year) %>% group_indices()
#    flat <- flat %>% arrange(event_id)
#  
#    # Observations are made in plots, which are nested in blocks. Unique
#    # combinations of these form a location
#  
#    flat$location_id <- flat %>% group_by(plot, block) %>% group_indices()
#  
#    # Each row of the flattened source dataset represents an observation of taxa
#    # abundance and should have a unique ID for reference
#  
#    flat$observation_id <- seq(nrow(flat))
#  
#    # Add columns for the location table ----------------------------------------
#  
#    # Ideally, the source dataset would include latitude, longitude, and
#    # elevation for each location_id, but all we have are coordinates for the
#    # area encompassing all sampling locations. The best we can do here is use
#    # the middle of the bounding box and mean of the bounding elevations.
#  
#    geocov <- xml_find_all(eml, ".//geographicCoverage")
#    north <- xml_double(xml_find_all(geocov, './/northBoundingCoordinate'))
#    east <- xml_double(xml_find_all(geocov, './/eastBoundingCoordinate'))
#    south <- xml_double(xml_find_all(geocov, './/southBoundingCoordinate'))
#    west <- xml_double(xml_find_all(geocov, './/westBoundingCoordinate'))
#    elev_max <- xml_double(xml_find_all(geocov, './/altitudeMaximum'))
#    elev_min <- xml_double(xml_find_all(geocov, './/altitudeMinimum'))
#  
#    flat$latitude <- mean(c(north, south))
#    flat$longitude <- mean(c(east, west))
#    flat$elevation <- mean(c(elev_max, elev_min))
#  
#    # Add columns for the taxon table -------------------------------------------
#  
#    # Taxonomic entities of this dataset are comprised of unique genus and
#    # species pairs
#  
#    flat <- flat %>%
#      mutate(taxon_name = trimws(paste(genus, species))) %>%
#      select(-genus, -species)
#  
#    flat$taxon_id <- flat %>% group_by(taxon_name) %>% group_indices()
#  
#    # While not required, resolving taxonomic entities to an authority system
#    # improves the discoverability and interoperability of the ecocomDP dataset.
#    # We can resolve taxa by sending names through taxonomyCleanr for direct
#    # matches against the Integrated Taxonomic Information System
#    # (ITIS; https://www.itis.gov/).
#  
#    taxa_resolved <- taxonomyCleanr::resolve_sci_taxa(
#      x = unique(flat$taxon_name),
#      data.sources = 3)
#  
#    taxa_resolved <- taxa_resolved %>%
#      select(taxa, rank, authority, authority_id) %>%
#      rename(taxon_rank = rank,
#             taxon_name = taxa,
#             authority_system = authority,
#             authority_taxon_id = authority_id)
#  
#    flat <- left_join(flat, taxa_resolved, by = "taxon_name")
#  
#    # Add columns for the dataset_summary table ---------------------------------
#  
#    dates <- flat$date %>% stats::na.omit() %>% sort()
#  
#    # Use the calc_*() helper functions for consistency
#  
#    flat$package_id <- derived_id
#    flat$original_package_id <- source_id
#    flat$length_of_survey_years <- ecocomDP::calc_length_of_survey_years(dates)
#    flat$number_of_years_sampled <- ecocomDP::calc_number_of_years_sampled(dates)
#    flat$std_dev_interval_betw_years <-
#      ecocomDP::calc_std_dev_interval_betw_years(dates)
#    flat$max_num_taxa <- length(unique(flat$taxon_name))
#    flat$geo_extent_bounding_box_m2 <-
#      ecocomDP::calc_geo_extent_bounding_box_m2(west, east, north, south)
#  
#    # Odds and ends -------------------------------------------------------------
#  
#    # Rename source columns with an ecocomDP equivalent (date to datetime)
#  
#    flat <- flat %>% rename(datetime = date)
#  
#    # The hard work is done! The flat table contains all the source data and
#    # more! We can now use the "create" functions to parse this table into the
#    # ecocomDP tables.
#  
#    # Parse flat into ecocomDP tables -------------------------------------------
#  
#    # Each ecocomDP table has an associated "create" function. Begin with the
#    # core required tables.
#  
#    observation <- ecocomDP::create_observation(
#      L0_flat = flat,
#      observation_id = "observation_id",
#      event_id = "event_id",
#      package_id = "package_id",
#      location_id = "location_id",
#      datetime = "datetime",
#      taxon_id = "taxon_id",
#      variable_name = "variable_name",
#      value = "value",
#      unit = "unit")
#  
#    location <- ecocomDP::create_location(
#      L0_flat = flat,
#      location_id = "location_id",
#      location_name = c("block", "plot"),
#      latitude = "latitude",
#      longitude = "longitude",
#      elevation = "elevation")
#  
#    taxon <- ecocomDP::create_taxon(
#      L0_flat = flat,
#      taxon_id = "taxon_id",
#      taxon_rank = "taxon_rank",
#      taxon_name = "taxon_name",
#      authority_system = "authority_system",
#      authority_taxon_id = "authority_taxon_id")
#  
#    dataset_summary <- ecocomDP::create_dataset_summary(
#      L0_flat = flat,
#      package_id = "package_id",
#      original_package_id = "original_package_id",
#      length_of_survey_years = "length_of_survey_years",
#      number_of_years_sampled = "number_of_years_sampled",
#      std_dev_interval_betw_years = "std_dev_interval_betw_years",
#      max_num_taxa = "max_num_taxa",
#      geo_extent_bounding_box_m2 = "geo_extent_bounding_box_m2")
#  
#    # Create the ancillary ecocomDP tables. These are optional, but should be
#    # included if possible.
#  
#    observation_ancillary <- ecocomDP::create_observation_ancillary(
#      L0_flat = flat,
#      observation_id = "observation_id",
#      variable_name = c("trap.type", "trap.num", "moose.cage"))
#  
#    location_ancillary <- ecocomDP::create_location_ancillary(
#      L0_flat = flat,
#      location_id = "location_id",
#      variable_name = "treatment")
#  
#    taxon_ancillary <- ecocomDP::create_taxon_ancillary(
#      L0_flat = flat,
#      taxon_id = "taxon_id",
#      variable_name = c(
#        "subfamily", "hl", "rel", "rll", "colony.size",
#        "feeding.preference", "nest.substrate", "primary.habitat",
#        "secondary.habitat", "seed.disperser", "slavemaker.sp",
#        "behavior", "biogeographic.affinity", "source"),
#      unit = c("unit_hl", "unit_rel", "unit_rll"))
#  
#    # Create the variable_mapping table. This is optional but highly recommended
#    # as it provides unambiguous definitions to variables and facilitates
#    # integration with other ecocomDP datasets.
#  
#    variable_mapping <- ecocomDP::create_variable_mapping(
#      observation = observation,
#      observation_ancillary = observation_ancillary,
#      location_ancillary = location_ancillary,
#      taxon_ancillary = taxon_ancillary)
#  
#    i <- variable_mapping$variable_name == 'abundance'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/individualCount'
#    variable_mapping$mapped_label[i] <- 'individualCount'
#  
#    i <- variable_mapping$variable_name == 'treatment'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00000506'
#    variable_mapping$mapped_label[i] <- 'Manipulative experiment'
#  
#    i <- variable_mapping$variable_name == 'trap.type'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00001591'
#    variable_mapping$mapped_label[i] <- 'type of trap'
#  
#    i <- variable_mapping$variable_name == 'hl'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#    variable_mapping$mapped_label[i] <- 'measurementType'
#  
#    i <- variable_mapping$variable_name == 'rel'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#    variable_mapping$mapped_label[i] <- 'measurementType'
#  
#    i <- variable_mapping$variable_name == 'rll'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#    variable_mapping$mapped_label[i] <- 'measurementType'
#  
#    i <- variable_mapping$variable_name == 'colony.size'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00000311'
#    variable_mapping$mapped_label[i] <- 'Population'
#  
#    i <- variable_mapping$variable_name == 'feeding.preference'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#    variable_mapping$mapped_label[i] <- 'behavior'
#  
#    i <- variable_mapping$variable_name == 'primary.habitat'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#    variable_mapping$mapped_label[i] <- 'type of habitat'
#  
#    i <- variable_mapping$variable_name == 'secondary.habitat'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#    variable_mapping$mapped_label[i] <- 'type of habitat'
#  
#    i <- variable_mapping$variable_name == 'seed.disperser'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#    variable_mapping$mapped_label[i] <- 'behavior'
#  
#    i <- variable_mapping$variable_name == 'slavemaker.sp'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#    variable_mapping$mapped_label[i] <- 'behavior'
#  
#    i <- variable_mapping$variable_name == 'behavior'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#    variable_mapping$mapped_label[i] <- 'behavior'
#  
#    i <- variable_mapping$variable_name == 'biogeographic.affinity'
#    variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#    variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#    variable_mapping$mapped_label[i] <- 'type of habitat'
#  
#    i <- variable_mapping$variable_name == 'source'
#    variable_mapping$mapped_system[i] <- 'Darwin Core'
#    variable_mapping$mapped_id[i] <- 'http://purl.org/dc/terms/references'
#    variable_mapping$mapped_label[i] <- 'references'
#  
#    # Write tables to file
#  
#    ecocomDP::write_tables(
#      path = path,
#      observation = observation,
#      location = location,
#      taxon = taxon,
#      dataset_summary = dataset_summary,
#      observation_ancillary = observation_ancillary,
#      location_ancillary = location_ancillary,
#      taxon_ancillary = taxon_ancillary,
#      variable_mapping = variable_mapping)
#  
#    # Validate tables -----------------------------------------------------------
#  
#    # Validation checks ensure the derived set of tables complies with the
#    # ecocomDP model. Any issues at this point should be addressed in the lines
#    # of code above, the tables rewritten, and another round of validation run,
#    # to be certain the fix worked.
#  
#    issues <- ecocomDP::validate_data(path = path)
#  
#    # Create metadata -----------------------------------------------------------
#  
#    # Before publishing the derived ecocomDP dataset, we need to describe it. The
#    # create_eml() function does this all for us. It knows the structure of the
#    # ecocomDP model and applies standardized table descriptions and mixes in
#    # important elements of the source dataset metadata for purposes of
#    # communication and provenance tracking.
#  
#    # Convert "dataset level keywords" listed in the source to "dataset level
#    # annotations" in the derived. The predicate "is about" is used, which
#    # results in an annotation that reads "This dataset is about 'species
#    # abundance'", "This dataset is about an ecological 'Community'", etc. All
#    # source datasets involving a human induced manipulative experiment, not a
#    # natural disturbance/experiment, should include the "Manipulative
#    # experiment" annotation below to enable searching on this term.
#  
#    dataset_annotations <- c(
#      `species abundance` =
#        "http://purl.dataone.org/odo/ECSO_00001688",
#      Community =
#        "http://purl.dataone.org/odo/ECSO_00000310",
#      `Manipulative experiment` =
#        "http://purl.dataone.org/odo/ECSO_00000506",
#      `level of ecological disturbance` =
#        "http://purl.dataone.org/odo/ECSO_00002588",
#      `type of ecological disturbance` =
#        "http://purl.dataone.org/odo/ECSO_00002589")
#  
#    # Add contact information for the author of this script and dataset
#  
#    additional_contact <- data.frame(
#      givenName = 'Colin',
#      surName = 'Smith',
#      organizationName = 'Environmental Data Initiative',
#      electronicMailAddress = 'ecocomdp@gmail.com',
#      stringsAsFactors = FALSE)
#  
#    # Create EML metadata
#  
#    eml <- ecocomDP::create_eml(
#      path = path,
#      source_id = source_id,
#      derived_id = derived_id,
#      is_about = dataset_annotations,
#      script = "create_ecocomDP.R",
#      script_description =
#        "A function for converting knb-lter-hfr.118 to ecocomDP",
#      contact = additional_contact,
#      user_id = 'ecocomdp',
#      user_domain = 'EDI',
#      basis_of_record = "HumanObservation")
#  
#  }

## ----eval=FALSE---------------------------------------------------------------
#  
#  # Create directory for tables and metadata
#  
#  mypath <- paste0(tempdir(), "/edi_193")
#  dir.create(mypath)
#  
#  # Create ecocomDP dataset "edi.193.5" from source dataset "knb-lter-hfr.118.33"
#  
#  create_ecocomDP(
#    path = mypath,
#    source_id = "knb-lter-hfr.118.33",
#    derived_id = "edi.193.5")
#  #> Retrieving EML for data package knb-lter-hfr.118.33
#  #>  [0%] Downloaded 0 bytes...
#  #>  [0%] Downloaded 0 bytes...
#  #>
#  #> Searching ITIS for "Aphaenogaster picea"
#  #> Searching ITIS for "Camponotus novaeboracensis"
#  #> Searching ITIS for "Aphaenogaster fulva"
#  #> Searching ITIS for "Temnothorax longispinosus"
#  #> Searching ITIS for "Stenemma impar"
#  #> Searching ITIS for "Stenemma diecki"
#  #> Searching ITIS for "Camponotus pennsylvanicus"
#  #> Searching ITIS for "Lasius americanus"
#  #> Searching ITIS for "Myrmica punctiventris"
#  #> Searching ITIS for "Lasius nearcticus"
#  #> Searching ITIS for "Formica subaenescens"
#  #> Searching ITIS for "Lasius umbratus"
#  #> Searching ITIS for "Formica subsericea"
#  #> Searching ITIS for "Formica aserva"
#  #> Searching ITIS for "Formica neogagates"
#  #> Searching ITIS for "Camponotus nearcticus"
#  #> Searching ITIS for "Ponera pennsylvanica"
#  #> Searching ITIS for "Stenamma brevicorne"
#  #> Searching ITIS for "Lasius claviger"
#  #> Searching ITIS for "Stenamma impar"
#  #> Searching ITIS for "Temnothorax lognispinosus"
#  #> Searching ITIS for "Stenamma diecki"
#  #> Searching ITIS for "Camponotus herculeanus"
#  #> Searching ITIS for "Lasius speculiventris"
#  #> Searching ITIS for "Stenamma schmitti"
#  #> Searching ITIS for "Lasius neoniger"
#  #> Searching ITIS for "Camponotus pennsylvanica"
#  #> Searching ITIS for "Tapinoma sessile"
#  #> Searching ITIS for "Myrmica AF-smi"
#  #> Searching ITIS for "Formica neorufibarbis"
#  #> Searching ITIS for "Myrmica incompleta"
#  #> Searching ITIS for "Formica argentea"
#  #> Searching ITIS for "Myrmica AF-scu"
#  #> Searching ITIS for "Formica dolosa"
#  #> Searching ITIS for "Formica subintegra"
#  #> Searching ITIS for "Formica incerta"
#  #> Searching ITIS for "Myrmica nearctica"
#  #> Searching ITIS for "Formica pergandei"
#  #> Searching ITIS for "Formica lasioides"
#  #> Searching ITIS for "Myrmica pinetorum"
#  #> Searching ITIS for "Leptothorax canadensis"
#  #> Searching ITIS for "Myrmica detritinodis"
#  #> Searching ITIS for "Myrmecina americana"
#  #> Searching ITIS for "Crematogaster lineolata"
#  #> Searching ITIS for "Lasius interjectus"
#  #> Searching ITIS for "Camponotus chromaiodes"
#  #> Searching ITIS for "Formica pallidefulva"
#  #> Searching ITIS for "Temnothorax ambiguus"
#  #> Searching ITIS for "Lasius subglaber"
#  #> Searching ITIS for "Formica rubicunda"
#  #> Searching ITIS for "Lasius brevicornis"
#  #> Searching ITIS for "Lasius aphidicolus"
#  #> Searching ITIS for "Formica integra"
#  #>
#  #> Writing tables to file:
#  #>   observation
#  #>   location
#  #>   taxon
#  #>   dataset_summary
#  #>   observation_ancillary
#  #>   location_ancillary
#  #>   taxon_ancillary
#  #>   variable_mapping
#  #>
#  #> Validating edi_193:
#  #>   Required tables
#  #>   Column names
#  #>   Required columns
#  #>   Column classes
#  #>   Datetime formats
#  #>   Primary keys
#  #>   Composite keys
#  #>   Referential integrity
#  #>   Latitude and longitude format
#  #>   Latitude and longitude range
#  #>   Elevation
#  #>   variable_mapping
#  #>
#  #> Creating EML for derived data package (edi.193.5)
#  #> Reading EML of L0 data package knb-lter-hfr.118.33
#  #> Creating EML of L1 data package edi.193.5
#  #> Updating:
#  #> <eml>
#  #>   <dataset>
#  #>     <alternateIdentifier>
#  #>     <title>
#  #>     <pubDate>
#  #>     <keywordSet>
#  #>     <contact>
#  #>     <methods>
#  #>     <dataTable>
#  #>     <otherEntity>
#  #>     <annotations>
#  #> </eml>
#  #> Writing EML
#  #> Validating EML
#  #>   Validation passed :)
#  #> Done.
#  
#  # The output directory (mypath) now contains a valid set of ecocomDP tables
#  # and metadata, which is ready for upload to EDI (or any other EML based
#  # repository)
#  dir(mypath)
#  #>  [1] "create_ecocomDP.R"
#  #>  [2] "dataset_summary.csv"
#  #>  [3] "edi.193.5.xml"
#  #>  [4] "location.csv"
#  #>  [5] "location_ancillary.csv"
#  #>  [6] "observation.csv"
#  #>  [7] "observation_ancillary.csv"
#  #>  [8] "taxon.csv"
#  #>  [9] "taxon_ancillary.csv"
#  #> [10] "variable_mapping.csv"
#