# Validate metadata template content
#
# @param fun.name
# (character) Name calling function.
# @param x
# (named list) The data and metadata object returned by \code{template_arguments()}.
#
# @return
# \item{template_issues}{Any issues found in the validation process and
# returned as a character vector. This object can be saved for later use
# and viewed in a human readable form with \code{issues()}}.
#
# @details
# Each template has a unique set of metadata providing valuable
# information about corresponding data objects. This function checks
# for metadata content according to metadata best practices and the needs
# of downstream processes in the EMLassemblyline. Each set of checks is
# applied within the scope of the calling function (i.e. \code{fun.name}),
# which each has a unique set of needs and constraints. If the needs of
# down stream processes are not met, then the template content is modified
# and a message returned to the user.
#
# All issues are compiled into a single report and returned to the user
# with a warning. This approach of not "failing fast" allows the user to
# address several issues at once.
#
validate_templates <- function(fun.name, x) {
message("Checking inputs")
attr_tmp <- read_template_attributes()
# Called from annotate_eml() ------------------------------------------------
if (fun.name == "annotate_eml") {
# Initialize object for collecting issue messages
issues <- c()
# Annotations
# validate_annotation(x)
# r <- validate_annotation(x)
# issues <- c(issues, r$issues)
# x <- r$x
# Return
if (!is.null(issues)) {
list2env(
list(template_issues = issues),
if(is.null(options("eal.env"))) {
.GlobalEnv
} else {
options("eal.env")[[1]]
}
)
warning(
"Input issues found. Use issues() to see them.",
call. = FALSE)
}
return(x)
}
# Called from make_eml() ----------------------------------------------------
if (fun.name == 'make_eml'){
# Initialize object for collecting issue messages
issues <- c()
# Taxonomic coverage
r <- validate_taxonomic_coverage(x)
issues <- c(issues, r$issues)
x <- r$x
# Return
if (!is.null(issues)) {
list2env(
list(template_issues = issues),
if(is.null(options("eal.env"))) {
.GlobalEnv
} else {
options("eal.env")[[1]]
}
)
warning(
"Template issues found. Use issues() to see them.",
call. = FALSE)
}
return(x)
}
}
# Validate the taxonomic_coverage template
#
# @param x
# (named list) The data and metadata object returned by
# \code{template_arguments()}.
#
# @return
# \item{issues}{(character) Descriptions of issues found in the template
# grouped by "required" and "optional" criteria.}
# \item{x}{(named list) If "required" issues are found, then the template
# is removed.}
#
# @details
# Checks performed by this function:
# \itemize{
# \item Template column names
# \item Taxonomic authority
# \item Complete entries
# }
#
# Manipulations performed by this function:
# \itemize{
# \item Use raw names when a resolved name is missing. This ensures the taxon is not ommitted from the taxonomic coverage metadata.
# }
#
validate_taxonomic_coverage <- function(x) {
# Objects for catching required and optional issues
required_issues <- c()
optional_issues <- c()
if (any(names(x$template) == "taxonomic_coverage.txt")) {
# Default manipulation - Use raw names when a resolved name is missing
missing <- x$template$taxonomic_coverage.txt$content$name_resolved == ""
x$template$taxonomic_coverage.txt$content$name_resolved[missing] <-
x$template$taxonomic_coverage.txt$content$name[missing]
# authority_system is supported
r <- validate_taxonomic_coverage_authority_system(x)
optional_issues <- c(optional_issues, r)
# Relax constraints on this check to match those of
# taxonomyCleanr::make_taxonomicCoverage()
# Complete entries are required
# r <- validate_taxonomic_coverage_completeness(x)
# required_issues <- c(required_issues, r)
}
# Compile issues
if (!is.null(required_issues)) {
required_issues <- paste0(
"\n",
"Taxonomic coverage (Required) - Taxonomic coverage will be dropped ",
"from the EML until these issues are fixed:\n",
paste(
paste0(seq_along(required_issues), ". "),
required_issues,
collapse = "\n"),
"\n")
}
if (!is.null(optional_issues)) {
optional_issues <- paste0(
"\n",
"Taxonomic coverage (Optional):\n",
paste(
paste0(seq_along(optional_issues), ". "),
optional_issues,
collapse = "\n"),
"\n")
}
issues <- c(required_issues, optional_issues)
# Drop the taxonomic coverage template if required issues are found
if (!is.null(required_issues)) {
x$template$taxonomic_coverage.txt <- NULL
}
# Return
list(issues = issues, x = x)
}
# Check authority_system values in the taxonomic coverage template
#
# @param x
# (list) The data and metadata object returned by
# \code{template_arguments()}.
#
# @return
# \item{error}{If column names are invalid}
# \item{NULL}{If no issues were found}
#
validate_taxonomic_coverage_authority_system <- function(x) {
authorities_supported <- c(
'Catalogue of Life', 'ITIS', 'Integrated Taxonomic Information System',
'https://www.itis.gov/', 'https://www.itis.gov', 'https://itis.gov',
'itis', 'World Register of Marine Species', 'WORMS', 'https://www.marinespecies.org/',
'http://www.marinespecies.org', 'http://marinespecies.org',
'http://www.marinespecies.org/', 'https://www.marinespecies.org',
'https://marinespecies.org', 'worms', 'GBIF Backbone Taxonomy', 'GBIF',
'gbif', 'https://gbif.org', 'Tropicos - Missouri Botanical Garden')
authorities_found <-
x$template$taxonomic_coverage.txt$content$authority_system
unsupported_authorities <- !((authorities_found %in% authorities_supported) |
(authorities_found == ""))
if (any(unsupported_authorities)) {
paste0(
"Taxa resolved to unsupported authorities cannot be expanded into ",
"their full taxonomic classifications. Unsupported authorities found ",
"for entries: ",
paste(which(unsupported_authorities), collapse = ", "), ". ",
"Supported authorities are: ITIS, WORMS, GBIF")
}
}
# Check completeness of the taxonomic coverage template
#
# @param x
# (list) The data and metadata object returned by
# \code{template_arguments()}.
#
# @return
# \item{error}{If column names are invalid}
# \item{NULL}{If no issues were found}
#
validate_taxonomic_coverage_completeness <- function(x) {
authorities <- x$template$taxonomic_coverage.txt$content$authority_system
ids <- x$template$taxonomic_coverage.txt$content$authority_id
incomplete_entries <- !((authorities != "")) ==
((ids != "") & (!is.na(ids)))
if (any(incomplete_entries)) {
paste0(
"Missing inputs. A taxonomic authority and corresponding identifier is ",
"missing from entries: ",
paste(which(incomplete_entries), collapse = ", "))
}
}
# Helper functions ------------------------------------------------------------
# View issues to function inputs
#
# @return
# A message listing any issues found in EMLassemblyline function inputs (arguments and templates).
#
# @details
# Validation functions \code{validate_arguments()} and \code{validate_templates()} return a list of issues to the global
# environment in \code{argument_issues} and \code{template_issues} list objects, respectively. The \code{view_issues()} function wraps these issues in \code{message()} to provide a human readable form.
#
issues <- function() {
arg <- exists(
"argument_issues",
if(is.null(options("eal.env"))) {
.GlobalEnv
} else {
options("eal.env")[[1]]
})
tmp <- exists(
"template_issues",
if(is.null(options("eal.env"))) {
.GlobalEnv
} else {
options("eal.env")[[1]]
})
if (arg & tmp) {
message(c(argument_issues, template_issues))
} else if (arg) {
message(argument_issues)
} else if (tmp) {
message(template_issues)
} else {
message("No issues found")
}
}
read_template_attributes <- function() {
data.table::fread(
system.file('extdata', 'template_characteristics.txt', package = 'ecocomDP'),
fill = TRUE,
blank.lines.skip = TRUE)
}
# Annotate an EML file (.xml) or an emld list object
#
# @param annotations
# (character; data frame) Path or data frame of the annotations.txt
# template for annotating \code{eml.in}. Create annotations.txt with
# \code{template_annotations()}.
# @param eml.in
# (EML; emld list) Path to, or emld list of, the EML file to be annotated.
# Accepted versions of the emld list object can be created by
# \code{EMLassemblyline::read_eml()} or \code{make_eml()}.
# @param eml.out
# (character) Path of the annotated EML file to be written.
#
# @return
# (EML; emld list) If the input to \code{eml.in} is an EML file then the
# output will be an EML file. The emld list object is always returned.
#
# @note
# All annotated elements are assigned ids and their annotations are
# placed both immediately under the parent element (subject) when
# supported and within the /eml/annotations node through id+reference
# pairs. This redundant approach supports variation in where EML metadata
# consumers harvest this information and supports annotation of EML
# elements requiring id+references.
#
annotate_eml <- function(
annotations = NULL,
eml.in = NULL,
eml.out = NULL) {
# Validate arguments --------------------------------------------------------
validate_arguments(
fun.name = 'annotate_eml',
fun.args = as.list(environment()))
if (is.character(eml.in) | is.list(eml.in)) {
message("Annotating EML ...")
}
# Read metadata templates ---------------------------------------------------
if (is.character(annotations)) {
x <- EAL_template_arguments(
path = dirname(annotations)
)$x
} else if (is.data.frame(annotations)) {
x <- EAL_template_arguments()$x
x$template$annotations.txt$content <- annotations
}
# Validate templates --------------------------------------------------------
# Don't rerun validate_templates() if it has already been run by make_eml().
# Validation can take minutes.
if (!any(stringr::str_detect(deparse(sys.calls()), "make_eml"))) {
x <- validate_templates("annotate_eml", x)
}
# Continue if annotations.txt hasn't been dropped after validation
if (!is.null(x$template$annotations.txt)) {
# Set parameters ----------------------------------------------------------
# Read EML
if (is.character(eml.in)) {
eml <- EAL_read_eml(
path = dirname(eml.in),
eml = basename(eml.in))
} else {
eml <- eml.in
}
# Replace blank cells with NAs and remove incomplete cases to make
# annotations data easier to use and to preempt assigning a subject an
# incomplete annotation.
anno <- x$template$annotations.txt$content
anno[anno == ""] <- NA_character_
anno <- anno[stats::complete.cases(anno), ]
# ResponsibleParty elements often recur throughout an EML record, each of
# which requires a unique ID. To facilitate expansion of ResponsibleParty
# from the annotations.txt template, create an empty copy of the
# annotations data frame (anno) and fill it with ResponsibleParty elements
# gathered from the EML (rp). Then replace the ResponsibleParty of anno
# with rp and add to the /eml/datset/annotations element within the EML.
rp <- anno[0, ]
# Convert annotations into the list structure expected by EML::
anno_ls <- lapply(
seq(length(anno$id)),
function(k) {
list(
propertyURI = list(
anno$predicate_uri[k],
label = anno$predicate_label[k]),
valueURI = list(
anno$object_uri[k],
label = anno$object_label[k]))
})
# Annotate ----------------------------------------------------------------
# Match annotatable elements (subjects) to their annotations listed in
# the annotations.txt template. Target EML sub-trees where the subjects
# occur, extract their values and, when not unique, their context, then
# use these as keys to annotations.txt from which IDs and predicate
# + object annotations are retrieved and assigned to the EML. Annotatable
# elements only supported through ID references are added to
# /eml/dataset/annotations (e.g. ResponsibleParty).
annotate_element <- function(element) {
if (element == "dataset") {
if (!is.null(eml$dataset)) {
sub_i <- element
eml$dataset$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$annotation <- anno_ls[anno$subject == sub_i]
}
} else if (element == "dataTable") {
if (!is.null(eml$dataset$dataTable)) {
for (k in seq_along(eml$dataset$dataTable)) {
sub_i <- eml$dataset$dataTable[[k]]$physical$objectName
if (sub_i != "") {
eml$dataset$dataTable[[k]]$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$dataTable[[k]]$annotation <- anno_ls[anno$subject == sub_i]
}
for (m in seq_along(eml$dataset$dataTable[[k]]$attributeList$attribute)) {
sub_i <- eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$attributeName
con_i <- eml$dataset$dataTable[[k]]$physical$objectName
use_i <- (anno$subject == sub_i) & (anno$context == con_i)
eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$id <- unique(anno$id[use_i])
eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$annotation <- anno_ls[use_i]
}
}
}
} else if (element == "otherEntity") {
if (!is.null(eml$dataset$otherEntity)) {
for (k in seq_along(eml$dataset$otherEntity)) {
sub_i <- eml$dataset$otherEntity[[k]]$physical$objectName
if (sub_i != "") {
eml$dataset$otherEntity[[k]]$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$otherEntity[[k]]$annotation <- anno_ls[anno$subject == sub_i]
}
}
}
}
eml
}
# Assign IDs and annotations. Note: ResponsibleParty annotations are
# added to /eml/dataset/annotations (below).
eml <- annotate_element("dataset")
eml <- annotate_element("dataTable")
eml <- annotate_element("otherEntity")
eml <- annotate_element("ResponsibleParty")
# Replace ResponsibleParty in anno with expanded ResponsibleParty in rp and
# update anno_ls so all annotations listed in /eml/dataset/annotations will
# have the correct ID pairing.
#
# (A patch for EML::) IDs within the annotations template uses a
# preceding "/" (e.g. /dataTable/attribute) that EML:: fails to include in
# the id attribute of annotatable elements but does include in the
# references attribute of /eml/dataset/annotations/annotation. Remove
# preceding "/" from anno$id until this is fixed in EML::.
anno <- anno[anno$element != "/ResponsibleParty", ]
anno <- rbind(anno, rp)
anno$id <- stringr::str_remove(anno$id, "^/")
anno_ls <- lapply(
seq(length(anno$id)),
function(k) {
list(
propertyURI = list(
anno$predicate_uri[k],
label = anno$predicate_label[k]),
valueURI = list(
anno$object_uri[k],
label = anno$object_label[k]))
})
# Create the /dataset/annotations node and append below.
annotations <- list(
annotation = lapply(
seq_along(anno_ls),
function(k) {
list(
references = anno$id[k],
propertyURI = list(
label = anno_ls[[k]]$propertyURI$label,
propertyURI = anno_ls[[k]]$propertyURI[[1]]),
valueURI = list(
label = anno_ls[[k]]$valueURI$label,
valueURI = anno_ls[[k]]$valueURI[[1]]))
}))
# Remove unused references from the /dataset/annotations node or the EML will
# be invalid
rmatch <- function(x, name) {
pos <- match(name, names(x))
if (!is.na(pos)) return(x[[pos]])
for (el in x) {
if (methods::is(el, "list")) {
# if (class(el) == "list") {
out <- Recall(el, name)
if (!is.null(out)) return(out)
}
}
}
references <- unlist(
lapply(
annotations$annotation,
rmatch,
"references"))
eml_flattened <- unlist(eml)
eml_ids <- eml_flattened[
stringr::str_detect(names(eml_flattened), "id")]
annotation_ids <- stringr::str_remove(
eml_ids[
stringr::str_detect(eml_ids, "/")],
"^/")
annotations_clean <- annotations$annotation[
references %in% annotation_ids]
# Add the /eml/datset/annotations element created and cleaned above
eml$annotations$annotation <- annotations_clean
# Return ------------------------------------------------------------------
# If writing to file, then update packageId, schemaLocation, and EML version.
if (is.character(eml.in)) {
eml$packageId <- basename(tools::file_path_sans_ext(eml.out))
eml$schemaLocation <- "eml://ecoinformatics.org/eml-2.2.0 http://nis.lternet.edu/schemas/EML/eml-2.2.0/xsd/eml.xsd"
emld::eml_version("eml-2.2.0")
EML::write_eml(eml, eml.out)
}
if (is.character(eml.in) | is.list(eml.in)) {
message("Done.")
}
eml
}
}
# Annotate an EML element
#
# This is a helper function for \code{annotate_eml()}.
#
# @param element
# (character) Element to be annotated
# @param eml
# (emld list) EML to be annotated
# @param anno
# (data.frame) The annotations.txt template
# @param rp
# (data.frame) The responsibleParty table
#
#
# @return
# \item{eml}{emld list object being annotated}
#
# @details
# Match annotatable elements (subjects) to their annotations listed in
# the annotations.txt template. Target EML sub-trees where the subjects
# occur, extract their values and, when not unique, their context, then
# use these as keys to annotations.txt from which IDs and predicate +
# object annotations are retrieved and assigned to the EML. Annotatable
# elements only supported through ID references are added to
# /eml/dataset/annotations (e.g. ResponsibleParty).
#
annotate_element <- function(element, eml, anno, rp) {
if (element == "dataset") {
if (!is.null(eml$dataset)) {
sub_i <- element
eml$dataset$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$annotation <- anno_ls[anno$subject == sub_i]
}
} else if (element == "dataTable") {
if (!is.null(eml$dataset$dataTable)) {
for (k in seq_along(eml$dataset$dataTable)) {
sub_i <- eml$dataset$dataTable[[k]]$physical$objectName
if (sub_i != "") {
eml$dataset$dataTable[[k]]$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$dataTable[[k]]$annotation <- anno_ls[anno$subject == sub_i]
}
for (m in seq_along(eml$dataset$dataTable[[k]]$attributeList$attribute)) {
sub_i <- eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$attributeName
con_i <- eml$dataset$dataTable[[k]]$physical$objectName
use_i <- (anno$subject == sub_i) & (anno$context == con_i)
eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$id <- unique(anno$id[use_i])
eml$dataset$dataTable[[k]]$attributeList$attribute[[m]]$annotation <- anno_ls[use_i]
}
}
}
} else if (element == "otherEntity") {
if (!is.null(eml$dataset$otherEntity)) {
for (k in seq_along(eml$dataset$otherEntity)) {
sub_i <- eml$dataset$otherEntity[[k]]$physical$objectName
if (sub_i != "") {
eml$dataset$otherEntity[[k]]$id <- unique(anno$id[anno$subject == sub_i])
eml$dataset$otherEntity[[k]]$annotation <- anno_ls[anno$subject == sub_i]
}
}
}
} else if (element == "ResponsibleParty") {
# Responsible party --------------------------------------
if (!is.null(eml$dataset$creator)) {
r <- assign_responsible_party_id(eml$dataset$creator, anno, rp)
eml$dataset$creator <- r$responsible.party
rp <- r$rp
}
if (!is.null(eml$dataset$contact)) {
r <- assign_responsible_party_id(eml$dataset$contact, anno, rp)
eml$dataset$contact <- r$responsible.party
rp <- r$rp
}
if (!is.null(eml$dataset$associatedParty)) {
eml$dataset$associatedParty <- lapply(
eml$dataset$associatedParty,
function(k) {
append_rp(k)
})
}
if (!is.null(eml$dataset$project$personnel)) {
eml$dataset$project$personnel <- lapply(
eml$dataset$project$personnel,
function(k) {
append_rp(k)
})
}
if (!is.null(eml$dataset$project$relatedProject)) {
eml$dataset$project$relatedProject <- lapply(
eml$dataset$project$relatedProject,
function(k) {
k$personnel <- lapply(
k$personnel,
function(m) {
append_rp(m)
})
k
})
}
}
eml
}
# Assign ResponsibleParty IDs and append recurrences to the rp data frame
#
# This is a helper function for \code{annotate_eml()}
#
# @param responsible.party
# (emld list) A ResponsibleParty EML node in the emld list format.
# @param annotations.template
# (data.frame) The annotations template created within
# \code{annotate_eml()}.
#
# @return
# \item{responsible.party}{(emld list) Annotated responsible party node}
# \item{rp}{(data.frame) the rp object}
#
append_rp <- function(responsible.party, annotations.template) {
subject <- paste(
c(
unlist(responsible.party$individualName$givenName),
responsible.party$individualName$surName),
collapse = " ")
message(subject)
if ((subject != "") & any(annotations.template$subject == subject)) {
responsible.party$id <- paste0(
unique(
annotations.template$id[
annotations.template$subject == subject]),
" ",
uuid::UUIDgenerate(use.time = TRUE))
rp <- suppressWarnings(
data.frame(
id = responsible.party$id,
annotations.template[
annotations.template$subject == subject,
c("element", "context", "subject", "predicate_label",
"predicate_uri", "object_label", "object_uri")
]))
list(
responsible.party = responsible.party,
rp = rp)
}
}
# Update the rp object with ID information
#
# This is a helper function for \code{annotate_eml()}
#
# @param responsible.party
# (emld list) A ResponsibleParty EML node in the emld list format.
# @param anno
# (data.frame) The annotations template created within
# \code{annotate_eml()}.
# @param rp
# (data.frame) A growing set of responsible party metadata to be used
# in \code{annotate_eml()}
#
# @return
# \item{responsible.party}{(emld list) Annotated responsible party node}
# \item{rp}{(data.frame) the rp object}
#
assign_responsible_party_id <- function(responsible.party, anno, rp) {
r <- lapply(
responsible.party,
function(k) {
append_rp(
responsible.party = k,
annotations.template = anno)
})
list(
responsible.party = lapply(r, `[[`, 1),
rp = data.table::rbindlist(lapply(r, `[[`, 2)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.