# ------ 1. A DWCTERM CLASS TO HOLD INFORMATION ON DARWIN CORE TERMS ------
#' DWCTerm object
#' @description R6 class to hold information on DWC terms
#' @importFrom R6 R6Class
#' @export
#' @format \code{\link{R6Class}} object.
DwCTerm <- R6::R6Class("DwCTerm",
# ====== 1.1. Define private members of the terms class ======
private = list(
termName = character(),
namespaceName = character(),
termIRI = character(),
termVersionIRI = character(),
dateModified = character(),
label = character(),
isReplacedBy = character(),
definition = character(),
notes = character(),
type = character(),
examples = character(),
termInformationLN = character(),
execCommitteeDecisions = character(),
miscInformation = character(),
termDef = character(),
vocabularyURI = character()
),
public = list(
# ====== 1.2. Initialise a DwCTerm object ======
#' Initialise a DwCTerm object
#'
#' @param termName A \code{character} scalar containing the name of the term
#' @param namespaceName A \code{character} scalar containing the name of the namespace containing the term
#' @param termIRI A \code{character} scalar containing the Internationalized Resource Identifier (IRI) of the definition of the term
#' @param termVersionIRI A \code{character} scalar containing the Internationalized Resource Identifier (IRI) of the definition of the current version of the term
#' @param dateModified A \code{character} scalar containing date information of the last modification of the term definition
#' @param label A \code{character} scalar containing a label for the term (used in casual language)
#' @param isReplacedBy A \code{character} scalar containing the Internationalized Resource Identifier (IRI) of the definition of the term that replaced this term (used
#' if the term is deprecated)
#' @param definition A \code{character} scalar containing a brief description of the purpose of the term
#' @param notes A \code{character} vector containing extra information about the usage of the term
#' @param type A \code{character} scalar containing the type of term defined
#' @param examples A \code{character} vector giving a set of example values for the term
#' @param termInformationLN A \code{characer} vector giving supplementary information for the term provided by the \url{https://livingnorway.no/}{Living Norway} initiative
#' @param execCommitteeDecisions A \code{character} vector of links to decisions made by executive committees on the usage of the term
#' @param miscInformation A \code{character} vector providing supplementary information for the term
#' @param termDef A \code{character} scalar providing information about the location of the body in charge of definition of the term
#' @param vocabularyURI A \code{character} scalar containing the Unified Resource Identifier (URI) for a vocabulary that the possible values for this term are based on
#'
#' @return A new DwCTerm object
initialize = function(termName, namespaceName = character(), termIRI = character(), termVersionIRI = character(),
dateModified = character(), label = character(), isReplacedBy = character(), definition = character(), notes = character(),
type = character(), examples = character(), termInformationLN = character(), execCommitteeDecisions = character(),
miscInformation = character(), termDef = character(), vocabularyURI = character()) {
# Utility function to ensure that character scalar have the correct formatting
characterScalarTest <- function(paramText, inVar) {
outVal <- tryCatch(as.character(inVar), error = function(err) {
stop("error encountered whilst processing the ", paramText, " parameter: ", err)
})
if(length(outVal) > 2) {
warning(paramText, " has length greater than one: only the first element will be used")
outVal <- outVal[1]
} else if(length(outVal) > 0) {
if(is.na(outVal) || outVal == "") {
outVal <- character()
}
}
outVal
}
# Utility function to ensure that character vectors have the correct formatting
characterVectorTest <- function(paramText, inVar) {
outVal <- tryCatch(as.character(inVar), error = function(err) {
stop("error encountered whilst processing the ", paramText, "parameter: ", err)
})
# Remove the empty strings from the output value
if(length(outVal) >= 1) {
outVal <- outVal[!is.na(outVal) & outVal != ""]
}
outVal
}
# Sanity check the term name
private$termName <- characterScalarTest("term name", termName)
if(length(private$termName) < 1) {
stop("error encountered whilst processing the term name parameter: invalid value provided")
}
# Sanity check the namespace
private$namespaceName <- characterScalarTest("namespace name", namespaceName)
if(length(private$namespaceName) > 0) {
if(!grepl("/$", private$namespaceName, perl = TRUE)) {
private$namespaceName <- paste(private$namespaceName, "/", sep = "")
}
}
# Sanity check the term IRI
private$termIRI <- characterScalarTest("term IRI", termIRI)
if(length(private$termIRI) < 0) {
private$termIRI <- self$getQualifiedName()
}
# Sanity check the term version IRI
private$termVersionIRI <- characterScalarTest("term version IRI", termVersionIRI)
# Sanity check the date modified
private$dateModified <- characterScalarTest("date modified", dateModified)
# Sanity check the label
private$label <- characterScalarTest("label", label)
# Sanity check the is replaced by
private$isReplacedBy <- characterScalarTest("\"is replaced by\"", isReplacedBy)
# Sanity check the definition
private$definition <- characterScalarTest("definition", definition)
# Sanity check the notes
private$notes <- characterVectorTest("notes", notes)
# Sanity check the type
private$type <- characterScalarTest("type", type)
# Sanity check the examples
private$examples <- characterVectorTest("examples", examples)
# Sanity check the Living Norway term information
private$termInformationLN <- characterVectorTest("Living Norway term information", termInformationLN)
# Sanity check the executive dicisions
private$execCommitteeDecisions <- characterVectorTest("executive committee decisiosn", execCommitteeDecisions)
# Sanity check the miscellaneous information
private$miscInformation <- characterVectorTest("miscellaneous information", miscInformation)
# Sanity check the term definition source
private$termDef <- characterScalarTest("term definition source", termDef)
# Sanity check the vocabulary term
private$vocabularyURI <- characterScalarTest("vocabulary URI", vocabularyURI)
# Return the object invisibly
invisible(self)
},
# ====== 1.3. Retrieve the qualified name of the term ======
#' Retrieve the qualified name of the term (the term name with the associated namespace)
#'
#' @return The qualified name of the term
#'
getQualifiedName = function() {
outVal <- private$termName
if(length(private$namespaceName) > 0)
{ # Include the namespace informtion if it exists
outVal <- paste(private$namespaceName, private$termName, sep = "")
}
outVal
},
# ====== 1.4. Assess whether the term is deprecated ======
#' Return the deprecation status of the term
#'
#' @return A \code{logical} scalar that is \code{TRUE} if the term is deprecated and
#' \code{FALSE} otherwise
#'
isDeprecated = function() {
deprTextSearch <- function(inText) {
any(
grepl("This term is deprecated and should no longer be used", inText, fixed = TRUE) |
grepl("This extension has been DEPRECATED", inText, fixed = TRUE)
)
}
outVal <- FALSE
if(length(private$isReplacedBy) > 0) {
outVal <- TRUE
}
if(length(private$miscInformation) > 0 && !outVal) {
outVal <- deprTextSearch(private$miscInformation)
}
if(length(private$notes) > 0 && !outVal) {
outVal <- deprTextSearch(private$miscInformation)
}
outVal
},
# ====== 1.5. Override print function for the term ======
#' Print the term information
#'
print = function(...) {
cat(self$getQualifiedName())
if(length(private$label) > 0) {
cat(" -", private$label)
}
if(self$isDeprecated()) {
cat(" (DEPRECATED)")
}
cat("\n")
if(length(private$definition) > 0) {
cat(private$definition, "\n", sep = "")
}
cat("\n")
if(length(private$termDef) > 0) {
cat("\tDefined in: ", private$termDef, "\n", sep = "")
}
if(length(private$termIRI) > 0) {
cat("\tIRI: ", private$termIRI, "\n", sep = "")
}
if(length(private$termVersionIRI) > 0) {
cat("\tVersion IRI: ", private$termVersionIRI, "\n", sep = "")
}
if(length(private$type) > 0) {
cat("\tType: ", private$type, "\n", sep = "")
}
if(length(private$dateModified) > 0) {
cat("\tDate modified: ", private$dateModified, "\n", sep = "")
}
if(length(private$isReplacedBy) > 0) {
cat("\tIs replaced by: ", private$isReplacedBy, "\n", sep = "")
}
if(length(private$notes) > 0) {
cat("\tNotes:\n", paste("\t\t", private$notes, sep = "", collapse = "\n"), "\n", sep = "")
}
if(length(private$execCommitteeDecisions) > 0) {
cat("\tExecutive committee decisions:\n", paste("\t\t", private$execCommitteeDecisions, sep = "", collapse = "\n"), "\n", sep = "")
}
if(length(private$examples) > 0) {
cat("\tExamples:\n", paste("\t\t", private$examples, sep = "", collapse = "\n"), "\n", sep = "")
}
if(length(private$miscInformation) > 0) {
cat("\tMiscellaneous information:\n", paste("\t\t", private$miscInformation, sep = "", collapse = "\n"), "\n", sep = "")
}
if(length(private$termInformationLN) > 0) {
cat("\tLiving Norway supplementary information:\n", paste("\t\t", private$termInformationLN, sep = "", collapse = "\n"), "\n", sep = "")
}
if(length(private$vocabularyURI) > 0) {
cat("\tVocabulary URI:\n", paste("\t\t", private$vocabularyURI, sep = "", collapse = "\n"), "\n", sep = "")
}
invisible(self)
},
# ====== 1.6. Retrieve the term name ======
#' Retrieve the term name
#'
#' @return A \code{character} scalar containing the term name
#'
getTermName = function() {
private$termName
},
# ====== 1.7. Retrieve the namespace name ======
#' Retrieve the namespace name
#'
#' @return A \code{character} scalar containing the namespace name
#'
getNamespaceName = function() {
private$namespaceName
},
# ====== 1.8. Retrieve the term IRI ======
#' Retrieve the term IRI
#'
#' @return A \code{character} scalar containing the term IRI
#'
getTermIRI = function() {
private$termIRI
},
# ====== 1.9. Retrieve the term version IRI ======
#' Retrieve the term version IRI
#'
#' @return A \code{character} scalar containing the term version IRI
#'
getTermVersionIRI = function() {
private$termVersionIRI
},
# ====== 1.10. Retrieve the modification date ======
#' Retrieve the modification date
#'
#' @return A \code{character} scalar containing the modification date
#'
getDateModified = function() {
private$dateModified
},
# ====== 1.11. Retrieve the term label ======
#' Retrieve the label
#'
#' @return A \code{character} scalar containing the label
#'
getLabel = function() {
private$label
},
# ====== 1.12. Retrieve the replacement IRI ======
#' Retrieve the replacement IRI
#'
#' @return A \code{character} scalar containing the IRI of the replacement term
#'
getIsReplacedBy = function() {
private$isReplacedBy
},
# ====== 1.13. Retrieve the definition ======
#' Retrieve the definition
#'
#' @return A \code{character} scalar containing the term definition
#'
getDefinition = function() {
private$definition
},
# ====== 1.14. Retrieve the term notes ======
#' Retrieve the term notes
#'
#' @return A \code{character} vector containing the term notes
#'
getNotes = function() {
private$notes
},
# ====== 1.15. Retrieve the type of the term ======
#' Retrieve the type of the term
#'
#' @return A \code{character} scalar containing the term type
getType = function() {
private$type
},
# ====== 1.16. Retrieve examples of the use of the term ======
#' Retrieve examples of the use of the term
#'
#' @return A \code{character} vector containing examples of use of the term
#'
getExamples = function() {
self$examples
},
# ====== 1.17. Retrieve Living Norway supplementary information ======
#' Retrieve supplementary information about the term provided by Living Norway
#'
#' @return A \code{character} vector containing the Living Norway supplementary information
#'
getTermInformationLN = function() {
self$termInformationLN
},
# ====== 1.18. Retrieve executive committee decisions ======
#' Retrieve executive committee decisions about the use of the term
#'
#' @return A \code{character} vector containing the executive committee decisions
#'
getExecCommitteeDecisions = function() {
self$execCommitteeDecisions
},
# ====== 1.19. Retrieve miscellaneous information ======
#' Retrieve miscellaneous information on the usage of the term
#'
#' @return A \code{character} vector containing the miscellaneous information
#'
getMiscInformation = function() {
self$miscInformation
},
# ====== 1.20. Retrieve the source of the term definition ======
#' Retrieve the source of the term definition
#'
#' @return A \code{character} scalar containing the source of the term definition
#'
getTermDef = function() {
self$termDef
},
# ===== 1.21. Retrieve the URI of the vocabulary definition ======
#' Retrieve the vocabulary URI
#'
#' @return A \code{character} scalar containing the URI where the vocabulary for the
#' term is described
#'
getVocabularyURI = function() {
self$vocabularyURI
}
)
)
# ------ 2. FUNCTION TO RETRIEVE DATA FRAME OF DARWIN CORE TERMS ------
#' Retrieve terms used by Darwin core
#'
#' @param includeExtensions A \code{logical} scalar that, if \code{TRUE}, instructs the fuction to also
#' include terms used in registered GBIF extensions to Darwin Core. If \code{FALSE}, only the terms specified by
#' the Darwin core standard is included
#' @param includeDeprecated A \code{logical} scalar that, if \code{TRUE}, instructs the function to also
#' include terms in the Dawin Core standard that are deprecated
#'
#' @return A \code{list} of \code{DwCTerm} objects representing the terms defined by the Darwin core
#' standard
#'
#' @seealso \code{\link[retrieveDwCClassSpecifications]{retrieveDwCClassSpecifications}}
#' \code{\link[DwCTerm]{DwCTerm}}
#' @author Joseph D. Chipperfield, \email{joechip90@@googlemail.com}
#'
retrieveDwCTermSpecifications <- function(includeExtensions = TRUE, includeDeprecated = FALSE) {
# TODO: add in some error handling if a conection to the server can't be made
# ====== 2.1. Retrieve terms from Darwin core standard =====
# Download terms defined by the Darwin Core standard
termNodes <- xml2::xml_find_all(xml2::read_html("https://dwc.tdwg.org/list/"), "//h2[@id=\"4-vocabulary\"]/following-sibling::table")
termList <- lapply(X = termNodes, FUN = function(curNode) {
# Initialise an output data.frame for the term entry
outList <- data.frame(
# Retrieve the name of the term as defined (and its namespace)
termName = gsub("^.*\\:([\\w\\-]+)\\s*$", "\\1", xml2::xml_text(xml2::xml_find_first(curNode, ".//thead")),perl = TRUE),
namespaceName = switch(
gsub("^.*\\s([\\w\\-]+)\\:[\\w\\-]+\\s*$", "\\1", xml2::xml_text(xml2::xml_find_first(curNode, ".//thead")), perl = TRUE),
"dwc" = "http://rs.tdwg.org/dwc/terms/",
"dwciri" = "http://rs.tdwg.org/dwc/iri/",
"dc" = "http://purl.org/dc/elements/1.1/",
"dcterms" = "http://purl.org/dc/terms/",
""
),
termIRI = "",
termVersionIRI = "",
dateModified = "",
label = "",
isReplacedBy = "",
definition = "",
notes = "",
type = "",
examples = "",
termInformationLN = "",
execCommitteeDecisions = "",
miscInformation = "",
termDef = "https://dwc.tdwg.org/",
vocabularyURI = "",
stringsAsFactors = FALSE
)
# Look through the table of attributes for the term and populate the list
for(infoNode in xml2::xml_find_all(curNode, ".//tbody/tr")) {
colNodes <- xml2::xml_find_all(infoNode, ".//td")
attrName <- xml2::xml_text(colNodes[1])
attrVal <- xml2::xml_text(colNodes[2])
colName <- switch(attrName,
"Term IRI" = "termIRI",
"Term version IRI" = "termVersionIRI",
"Modified" = "dateModified",
"Label" = "label",
"Is replaced by" = "isReplacedBy",
"Definition" = "definition",
"Notes" = "notes",
"Type" = "type",
"Examples" = "examples",
"Executive Committee decision" = "execCommitteeDecisions",
"miscInformation")
delimToUse <- "|||"
if(outList[, colName] == "") {
outList[, colName] <- attrVal
} else {
outList[, colName] <- paste(outList[, colName], attrVal, sep = delimToUse)
}
}
outList[1, ] <- ifelse(outList[1, ] == "", NA, outList[1, ])
DwCTerm$new(termName = outList$termName, namespaceName = outList$namespaceName, termIRI = outList$termIRI, termVersionIRI = outList$termVersionIRI,
dateModified = outList$dateModified, label = outList$label, isReplacedBy = outList$isReplacedBy, definition = outList$definition, notes = strsplit(outList$notes, delimToUse, fixed = TRUE)[[1]],
type = outList$type, examples = strsplit(outList$examples, delimToUse, fixed = TRUE)[[1]], termInformationLN = strsplit(outList$termInformationLN, delimToUse, fixed = TRUE)[[1]],
execCommitteeDecisions = strsplit(outList$execCommitteeDecisions, delimToUse, fixed = TRUE)[[1]], miscInformation = strsplit(outList$miscInformation, delimToUse, fixed = TRUE)[[1]],
termDef = outList$termDef, vocabularyURI = outList$vocabularyURI)
})
# ====== 2.2. Retrieve GBIF extensions terms ======
# If requested, download terms defined in the GBIF registered extensions
if(tryCatch(as.logical(includeExtensions)[1], error = function(err) {
stop("error encountered during processing of extensions inclusion parameter: ", err)
})) {
# Function to read all the terms specified at a given URL
retrieveXMLSpecs <- function(specAddress) {
# Function to read the GBIF XML file
readGBIFXML <- function(specAddress) {
curDoc <- xml2::read_xml(specAddress)
# Class name
className <- switch(xml2::xml_attr(curDoc, "rowType"),
"http://data.ggbn.org/schemas/ggbn/terms/Cloning" = "Cloning", # Cloning class has the 'Amplification' name. Manually need to over-ride that here
xml2::xml_attr(curDoc, "name"))
# Convert each of the child nodes of the xml specification to terms objects
append(lapply(X = xml2::xml_children(curDoc), FUN = function(curNode, curDoc) {
DwCTerm$new(
termName = xml2::xml_attr(curNode, "name"),
namespaceName = xml2::xml_attr(curNode, "namespace"),
termIRI = xml2::xml_attr(curNode, "rowType"),
termVersionIRI = xml2::xml_attr(curNode, "rowType"),
dateModified = xml2::xml_attr(curDoc, "issued"),
label = xml2::xml_attr(curNode, "name"),
isReplacedBy = "",
definition = xml2::xml_attr(curNode, "description"),
notes = xml2::xml_attr(curNode, "description"),
type = xml2::xml_name(curNode),
examples = xml2::xml_attr(curNode, "examples"),
termInformationLN = "",
execCommitteeDecisions = "",
miscInformation = paste("GBIF sub-class designation: ", ifelse(is.na(xml2::xml_attr(curNode, "group")), "unknown", xml2::xml_attr(curNode, "group")), sep = ""),
vocabularyURI = xml2::xml_attr(curNode, "thesaurus")
)
}, curDoc = curDoc), list(
# Convert the countaining class to a a terms object
DwCTerm$new(
termName = className,
namespaceName = xml2::xml_attr(curDoc, "namespace"),
termIRI = xml2::xml_attr(curDoc, "rowType"),
termVersionIRI = xml2::xml_attr(curDoc, "rowType"),
dateModified = xml2::xml_attr(curDoc, "issued"),
label = className,
isReplacedBy = "",
definition = xml2::xml_attr(curDoc, "description"),
notes = xml2::xml_attr(curDoc, "description"),
type = "class",
examples = "",
termInformationLN = "",
miscInformation = paste("GBIF core/extension class"),
vocabularyURI = xml2::xml_attr(curDoc, "relation")
)
))
}
# Retrieve the link nodes for all the entries in the specification page
if(!grepl("/$", specAddress, perl = TRUE)) {
specAddress <- paste(specAddress, "/", sep = "")
}
aNodes <- xml2::xml_find_all(xml2::read_html(specAddress), "//td/a")
aLinks <- sapply(X = aNodes[2:length(aNodes)], FUN = xml2::xml_attr, attr = "href")
# Go through each of the links and process each entry
unlist(lapply(X = aLinks, FUN = function(curLink, curBaseAddress) {
outVals <- list()
if(grepl("\\.xml$", curLink, perl = TRUE)) {
# If the link is to a XML file then scrape the property information from it
outVals <- readGBIFXML(paste(curBaseAddress, curLink, sep = ""))
} else if(grepl("/$", curLink, perl = TRUE)) {
# If the link is to another folder then call the function recursively
outVals <- retrieveXMLSpecs(paste(curBaseAddress, curLink, sep = ""))
}
outVals
}, curBaseAddress = specAddress))
}
# Retrieve the term specifications for both the core and extension elements
GBIFSpecs <- append(
retrieveXMLSpecs("https://rs.gbif.org/core/"),
retrieveXMLSpecs("https://rs.gbif.org/extension/")
)
# Remove those entries in the GBIF specifications that are already in the Darwin Core specification and append those to in the
# Darwin core specification
termList <- append(termList, GBIFSpecs[sapply(X = GBIFSpecs, FUN = function(curSpec, termList) {
!any(curSpec$getQualifiedName() == sapply(X = termList, FUN = function(curTerm) { curTerm$getQualifiedName() }))
}, termList = termList)])
}
# ====== 2.3. Process the outputs ======
# Use qualified names to index the list
names(termList) <- sapply(X = termList, FUN = function(curOb) {
curOb$getQualifiedName()
})
if(tryCatch(as.logical(includeDeprecated)[1] == FALSE, error = function(err) {
stop("error encountered during processing of depracation inclusion parameter: ", err)
})) {
# If the terms list is to exclude depracted terms then remove them from the output
termList <- termList[!sapply(X = termList, FUN = function(curOb) { curOb$isDeprecated() })]
}
termList
}
# ------ 3. FUNCTION TO RETRIEVE LIST OF DARWIN CORE CLASSES ------
#' Retrieve classes and their related terms used by Darwin core
#'
#' @param includeDeprecated A \code{logical} scalar that, if \code{TRUE}, instructs the function to also
#' include terms in the Dawin Core standard that are depracated
#'
#' @return A \code{list} containing one element per class. Each element is itself a \code{list} with the
#' following named elements:
#' \itemize{
#' \item{termInfo}{A \code{DwCTerm} object containing the information of the class term}
#' \item{compositeTerms}{A \code{list} of \code{DwCTerm} objects for each term that is associated with
#' the class}
#' }
#'
#' @seealso \code{\link[retrieveDwCTermSpecifications]{retrieveDwCTermSpecifications}}
#' \code{\link[DwCTerm]{DwCTerm}}
#' @author Joseph D. Chipperfield, \email{joechip90@@googlemail.com}
#'
retrieveDwCClassSpecifications <- function(includeDeprecated = FALSE) {
# TODO: add in some error handling if a connection to the server can't be made
# ====== 3.1. Retrieve terms from the Darwin core standard ======
termList <- retrieveDwCTermSpecifications(FALSE, includeDeprecated)
# ====== 3.2. Retrieve classes from the Darwin core standard ======
classList <- termList[sapply(X = termList, FUN = function(curOb) { curOb$getType() == "Class" })]
# Download the landuage terms defined by the Darwin Core standard
langNodes <- xml2::xml_find_all(xml2::read_html("https://dwc.tdwg.org/list/"), "//p[preceding-sibling::h3[@id=\"31-index-by-term-name\"] and following-sibling::h3[@id=\"32-index-by-label\"]]")
setNames(lapply(X = classList, FUN = function(curClassInfo, langNodes, termList) {
# Initialise an output list
outList <- list(
termInfo = curClassInfo,
compositeTerms = list()
)
# Lookup the HTML node containing the specification for the class
nodeIndex <- which(xml2::xml_text(langNodes) == outList$termInfo$getLabel())
if(length(nodeIndex) > 0) {
# Get the terms associated with the class
termLabels <- strsplit(xml2::xml_text(langNodes)[nodeIndex[1] + 1], "\\s*\\|\\s*", perl = TRUE)[[1]]
termLabels <- termLabels[termLabels %in% names(termList)]
outList$compositeTerms <- termList[termLabels]
}
outList
}, langNodes = langNodes, termList = termList), names(classList))
}
# ------ 4. FUNCTION TO RETRIEVE LIST OF GBIF REGISTERED CLASSES ------
#' Retrieve classes and their related terms used by GBIF
#'
#' @param classOption A \code{character} scalar that if set to \code{"core"} returns only the classes and the associated terms of GBIF's accepted core types.
#' If set to \code{"extension"} returns only the classes and associated terms of GBIF's \url{https://tools.gbif.org/dwca-validator/extensions.do}{registered extensions}.
#' \code{"all"} (the default) returns all of GBIF's registered class types.
#' @param includeDeprecated A \code{logical} scalar that, if \code{TRUE}, instructs the function to also include terms in the GBIF classes that are depracated
#'
#' @return A \code{list} containing one element per class. Each element is itself a \code{list} with the
#' following named elements:
#' \itemize{
#' \item{termInfo}{A \code{DwCTerm} object containing the information of the class term}
#' \item{compositeTerms}{A \code{list} of \code{DwCTerm} objects for each term that is associated with
#' the class}
#' }
#'
#' @seealso \code{\link[retrieveDwCTermSpecifications]{retrieveDwCTermSpecifications}}
#' \code{\link[DwCTerm]{DwCTerm}}
#' @author Joseph D. Chipperfield, \email{joechip90@@googlemail.com}
#' @export
retrieveGBIFClassSpecifications <- function(classOption = "all", includeDeprecated = FALSE) {
# ====== 4.1. Retrieve terms from the GBIF list of used terms ======
termList <- retrieveDwCTermSpecifications(TRUE, includeDeprecated)
# ====== 4.2. Assign terms to their respective GBIF classes ======
# Sanity test the class option input
inClassOption <- tryCatch(tolower(as.character(classOption)), error = function(err) {
stop("error encountered processing the class option parameter: ", err)
})
if(length(inClassOption) > 1) {
warning("class option parameter length greater than one: only the first element will be used")
inClassOption <- inClassOption[1]
} else if(length(inClassOption) == 0) {
stop("error encountered processing the class option parameter: parameter has length 0")
}
# The URLs to check the class structure from
urlCheck <- c("https://rs.gbif.org/core/", "https://rs.gbif.org/extension/")
if(inClassOption != "all") {
if(inClassOption == "core") {
urlCheck <- urlCheck[1]
} else if(inClassOption == "extension") {
urlCheck <- urlCheck[2]
} else {
stop("error encountered processing the class option parameter: values must be \"all\", \"core\", or \"extension\"")
}
}
# Function to find the classes contained in the current URL
findGBIFClasses <- function(specAddress, termList) {
# Function to generate a GBIF class from a list of terms
createGBIFClass <- function(specAddress, termList) {
# Function to add a trailing "/" if there isn't one
addTrailingDir <- function(inText) {
outText <- as.character(inText)
if(!grepl("\\/$", outText, perl = TRUE)) {
outText <- paste(outText, "/", sep = "")
}
outText
}
curDoc <- xml2::read_xml(specAddress)
# Get the qualified names of each of the members of the class
memberTerms <- sapply(X = xml2::xml_children(curDoc), FUN = function(curNode) {
paste(addTrailingDir(xml2::xml_attr(curNode, "namespace")), xml2::xml_attr(curNode, "name"), sep = "")
})
memberTerms <- memberTerms[memberTerms %in% names(termList)]
# Format the output object into a list of terms for the class
list(
termInfo = termList[[paste(addTrailingDir(xml2::xml_attr(curDoc, "namespace")),
switch(specAddress,
"https://rs.gbif.org/extension/ggbn/cloning.xml" = "Cloning", # Cloning class been given an incorrect name in the GBIF API so need to over-ride it here
xml2::xml_attr(curDoc, "name")
),
sep = "")]],
compositeTerms = termList[memberTerms]
)
}
# Retrieve the link nodes for all the entries in the specification page
if(!grepl("/$", specAddress, perl = TRUE)) {
specAddress <- paste(specAddress, "/", sep = "")
}
aNodes <- xml2::xml_find_all(xml2::read_html(specAddress), "//td/a")
aLinks <- sapply(X = aNodes[2:length(aNodes)], FUN = xml2::xml_attr, attr = "href")
# Retrieve the links that are directories
dirLinks <- aLinks[grepl("/$", aLinks, perl = TRUE)]
# For the links that are XML files: make sure the links are only the most recently defined
xmlLinks <- aLinks[grepl("\\.xml$", aLinks, perl = TRUE)]
xmlLinks <- sapply(X = unique(gsub("_*\\d\\d\\d\\d[_-]\\d\\d[_-]\\d\\d_*", "", xmlLinks, perl = TRUE)), FUN = function(curLink, xmlLinks) {
possLinks <- xmlLinks[curLink == gsub("_*\\d\\d\\d\\d[_-]\\d\\d[_-]\\d\\d_*", "", xmlLinks, perl = TRUE)]
if(length(possLinks) > 1) {
# If the there are multiple possible links for the class definition then use the most recent definition
curDates <- strptime(
gsub("^.*(\\d\\d\\d\\d)[_-](\\d\\d)[_-](\\d\\d).*$", "\\1-\\2-\\3", possLinks, perl = TRUE),
"%Y-%m-%d")
possLinks <- possLinks[which.max(ifelse(is.na(curDates), -Inf, as.double(as.POSIXlt(curDates))))]
}
if(length(possLinks) == 0) {
possLinks <- curLink
}
possLinks
}, xmlLinks = xmlLinks)
# Go through each of the links and process each entry
do.call(c, lapply(X = c(dirLinks, xmlLinks), FUN = function(curLink, curBaseAddress, termList) {
outVals <- list()
if(grepl("\\.xml$", curLink, perl = TRUE)) {
# If the link is to a XML file then scrape the property information from it
outVals <- list(createGBIFClass(paste(curBaseAddress, curLink, sep = ""), termList))
} else if(grepl("/$", curLink, perl = TRUE)) {
# If the link is to another folder then call the function recursively
outVals <- findGBIFClasses(paste(curBaseAddress, curLink, sep = ""), termList)
}
outVals
}, curBaseAddress = specAddress, termList = termList))
}
outList <- do.call(c, lapply(X = urlCheck, FUN = findGBIFClasses, termList = termList))
outList <- outList[sapply(X = outList, FUN = function(curEl) {!is.null(curEl$termInfo)})]
names(outList) <- sapply(X = outList, FUN = function(curEl) {curEl$termInfo$getQualifiedName()})
outList
}
# TODO: document helper function
isDwCTerm <- function(inOb) {
any(class(inOb) == "DwCTerm")
}
# TODO: document GBIF core class retrieval function
#' Class retrieval function
#' @export
#
getGBIFCoreClasses <- function() {
setNames(lapply(X = GBIFCoreClassList, FUN = function(curClass) {curClass$termInfo}),
paste("GBIF", sapply(X = GBIFCoreClassList, FUN = function(curClass) {curClass$termInfo$getTermName()}), sep = ""))
}
# TODO: document GBIF extension class retrieval function
#' Extention retrieval function
#' @export
getGBIFExtensionClasses <- function() {
setNames(lapply(X = GBIFExtClassList, FUN = function(curClass) {curClass$termInfo}),
paste("GBIF", sapply(X = GBIFExtClassList, FUN = function(curClass) {curClass$termInfo$getTermName()}), sep = ""))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.