##############################################################################################
# @describeIn map_neon_data_to_ecocomDP This method will retrieve density data for MACROINVERTEBRATE from neon.data.product.id DP1.20120.001 from the NEON data portal and map to the ecocomDP format
##############################################################################################
map_neon.ecocomdp.20120.001.001 <- function(
neon.data.list,
neon.data.product.id ="DP1.20120.001",
...){
#NEON target taxon group is MACROINVERTEBRATE
neon_method_id <- "neon.ecocomdp.20120.001.001"
# make sure neon.data.list matches the method
if(!any(grepl(
neon.data.product.id %>% gsub("^DP1\\.","",.) %>% gsub("\\.001$","",.),
names(neon.data.list)))) stop(
"This dataset does not appeaer to be sourced from NEON ",
neon.data.product.id,
" and cannot be mapped using method ",
neon_method_id)
# get all tables for this data product for the specified sites in my_site_list, store them in a list called all_tabs
all_tabs <- neon.data.list
# extract the table with the field data from the all_tabs list of tables
if("inv_fieldData" %in% names(all_tabs)){
inv_fielddata <- all_tabs$inv_fieldData %>%
dplyr::filter(!is.na(sampleID))
# known problem with dupes published in the inv_fieldData table as of 2021-02-18
# this anticipated to be fixed in data release next year (Jan 2022)
# use sampleID as primary key, keep the first uid associated with any sampleID that has multiple uids
de_duped_uids <- inv_fielddata %>%
dplyr::group_by(sampleID) %>%
dplyr::summarise(n_recs = length(uid),
n_unique_uids = length(unique(uid)),
uid = dplyr::first(uid))
inv_fielddata <- de_duped_uids %>%
dplyr::select(uid, sampleID) %>%
dplyr::left_join(inv_fielddata)
}else{
# if no data, return an empty list
warning(paste0(
"WARNING: No field data available for NEON data product ",
neon.data.product.id, " for the dates and sites selected."))
return(list())
}
# extract the table with the taxonomy data from all_tabls list of tables
if("inv_taxonomyProcessed" %in% names(all_tabs)){
inv_taxonomyProcessed <- all_tabs$inv_taxonomyProcessed
}else{
# if no data, return an empty list
warning(paste0(
"WARNING: No taxon count data available for NEON data product ",
neon.data.product.id, " for the dates and sites selected."))
return(list())
}
# observation ----
my_package_id <- paste0(
neon_method_id, ".", format(Sys.time(), "%Y%m%d%H%M%S"))
# Make the observation table.
# start with inv_taxonomyProcessed
# NOTE: the observation_id = uuid for record in NEON's inv_taxonomyProcessed table
table_observation_raw <- inv_taxonomyProcessed %>%
dplyr::filter(targetTaxaPresent == "Y") %>%
dplyr::distinct() %>%
# add counts for taxa (e.g., different size classes) in same sample together
dplyr::group_by(sampleID, acceptedTaxonID, scientificName, taxonRank, ) %>%
dplyr::summarize(
estimatedTotalCount = sum(estimatedTotalCount, na.rm = TRUE),
individualCount = sum(individualCount),
subsamplePercent = subsamplePercent %>% unique() %>% paste(collapse = "|"),
identificationReferences = identificationReferences %>% unique() %>% paste(collapse = "|"),
laboratoryName = laboratoryName %>% unique() %>% paste(collapse = "|"),
publicationDate = publicationDate %>% unique() %>% paste(collapse = "|"),
release = release %>% unique() %>% paste(collapse = "|")
) %>%
# Join the columns selected above with two columns from inv_fielddata (the two columns are sampleID and benthicArea)
dplyr::left_join(
inv_fielddata %>%
dplyr::select(
sampleID, benthicArea,
collectDate,
namedLocation,
eventID, habitatType,
samplerType, substratumSizeClass,
ponarDepth, snagLength, snagDiameter,
remarks)) %>%
# some new columns called 'variable_name', 'value', and 'unit', and assign values for all rows in the table.
# variable_name and unit are both assigned the same text strint for all rows.
dplyr::ungroup() %>%
dplyr::mutate(
observation_id = paste0("obs_",1:nrow(.)),
neon_sample_id = sampleID,
variable_name = 'density',
value = estimatedTotalCount / benthicArea,
unit = 'count per square meter') %>%
# rename some columns
dplyr::rename(
event_id = sampleID,
neon_event_id = eventID,
datetime = collectDate,
location_id = namedLocation,
taxon_id = acceptedTaxonID) %>%
# make a new column called package_id, assign it NA for all rows
dplyr::mutate(package_id = my_package_id) %>%
# filter out invalid records
dplyr::filter(
!is.na(value),
value >= 0,
is.finite(value))
table_observation <- table_observation_raw %>%
dplyr::select(observation_id,
event_id,
package_id,
location_id,
datetime,
taxon_id,
variable_name,
value,
unit) %>%
dplyr::distinct()
# ancillary observation table ----
table_observation_ancillary <- make_neon_ancillary_observation_table(
obs_wide = table_observation_raw,
ancillary_var_names = c(
"observation_id",
"sampleID",
"neon_event_id",
"individualCount",
"subsamplePercent",
"estimatedTotalCount",
"benthicArea",
"habitatType",
"samplerType", "substratumSizeClass",
"ponarDepth", "snagLength", "snagDiameter",
"laboratoryName",
"remarks",
"release",
"publicationDate"))
# location ----
# get relevant location info from the data
table_location_raw <- inv_fielddata %>%
dplyr::select(domainID, siteID, namedLocation, decimalLatitude,
aquaticSiteType,
decimalLongitude, elevation) %>%
dplyr::distinct() %>%
dplyr::filter(namedLocation %in% table_observation$location_id)
# create a location table, which has the lat long for each NEON site included in the data set
# start with the inv_fielddata table and pull out latitude, longitude, and elevation for each NEON site that occurs in the data
table_location <- make_neon_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "namedLocation"))
table_location_ancillary <- make_neon_ancillary_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "namedLocation"),
ancillary_var_names = c("namedLocation","aquaticSiteType"))
# taxon ----
# create a taxon table, which describes each taxonID that appears in the data set
# start with inv_taxonomyProcessed
# This approach takes too long
# my_dots <- list(...)
#
# if("token" %in% names(my_dots)){
# my_token <- my_dots$token
# }else{
# my_token <- NA
# }
#
# # get macroinvert taxon table from NEON
# neon_inv_taxon_table <- neonOS::getTaxonList(
# taxonType = "MACROINVERTEBRATE",
# token = my_token) %>%
# dplyr::filter(taxonID %in% table_observation$taxon_id)
#
# table_taxon <- neon_inv_taxon_table %>%
# dplyr::select(taxonID, taxonRank, scientificName, nameAccordingToID) %>%
# dplyr::distinct() %>%
# dplyr::rename(taxon_id = taxonID,
# taxon_rank = taxonRank,
# taxon_name = scientificName,
# authority_system = nameAccordingToID) %>%
# dplyr::select(taxon_id,
# taxon_rank,
# taxon_name,
# authority_system)
# create a taxon table, which describes each taxonID that appears in the data set
# start with inv_taxonomyProcessed
table_taxon <- inv_taxonomyProcessed %>%
# keep only the coluns listed below
dplyr::select(acceptedTaxonID, taxonRank, scientificName, identificationReferences) %>%
# remove rows with duplicate information
dplyr::distinct() %>%
# rename some columns
dplyr::rename(taxon_id = acceptedTaxonID,
taxon_rank = taxonRank,
taxon_name = scientificName,
authority_system = identificationReferences) %>%
# concatenate different references for same taxonID
dplyr::group_by(taxon_id, taxon_rank, taxon_name) %>%
dplyr::summarize(
authority_system = paste(authority_system, collapse = "; ")) %>%
dplyr::filter(taxon_id %in% table_observation$taxon_id)
# make dataset_summary -- required table ----
years_in_data <- table_observation$datetime %>% lubridate::year()
years_in_data %>% ordered()
table_dataset_summary <- data.frame(
package_id = table_observation$package_id[1],
original_package_id = neon.data.product.id,
length_of_survey_years = max(years_in_data) - min(years_in_data) + 1,
number_of_years_sampled = years_in_data %>% unique() %>% length(),
std_dev_interval_betw_years = years_in_data %>%
unique() %>% sort() %>% diff() %>% stats::sd(),
max_num_taxa = table_taxon$taxon_id %>% unique() %>% length()
)
# return ----
# list of tables to be returned, with standardized names for elements
out_list <- list(
location = table_location,
location_ancillary = table_location_ancillary,
taxon = table_taxon,
observation = table_observation,
observation_ancillary = table_observation_ancillary,
dataset_summary = table_dataset_summary)
# return out_list -- this is output from this function
return(out_list)
} #END of function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.