# Get taxonomic authority
# @description
# Use fuzzy searching in the Global Names Resolver to correct spelling
# and locate appropriate authorities.
# @param taxon
# A character string representation of the taxon to search on.
# @param data.source
# A numeric ID corresponding to the data source (i.e. taxonomic authority)
# you'd like to query. Run `view_taxa_authorities()` to get valid data source
# options and IDs.
# @return
# \itemize{
# \item resolved_name - Resolved taxon name.
# \item authority - Name of the authority searched.
# \item score - Relative match score provided by the authority.
# }
txcl_get_authority <- function(taxon, data.source){
  # Look up the title of the requested data source so the user can be told
  # which authority is being queried
  gnr_list <- txcl_load_gnr_datasources()
  use_i <- gnr_list[ , 'id'] == data.source
  message('Searching ', gnr_list[use_i, 'title'],' for "',taxon,'"')
  # Use the Global Names Resolver to see if the taxon can be found in
  # data.source. Only the single best match from that source is requested.
  resp <- suppressWarnings(
    taxize::gnr_resolve(sci = taxon,
                        data_source_ids = as.character(data.source),
                        resolve_once = TRUE,
                        canonical = TRUE,
                        best_match_only = TRUE))
  # Parse the response and keep the first item found. When nothing was found
  # every field is NA so callers can test the result with is.na().
  if (nrow(resp) != 0) {
    res <- list(resolved_name = resp$matched_name2[1],
                authority = gnr_list$title[use_i],
                score = resp$score[1])
  } else {
    res <- list(resolved_name = NA_character_,
                authority = NA_character_,
                score = NA_character_)
  }
  # Return the list (resolved_name, authority, score)
  res
}
# Get the taxonomic classification hierarchy for taxa resolved to supported authorities
# @param taxa.clean
# (character) Taxa names
# @param authority
# (character) Authority \code{taxa.clean} have been resolved to, otherwise \code{NA}. Supported authorities include: "ITIS", "WORMS", "GBIF".
# @param authority.id
# (character) ID of \code{taxa.clean} within the \code{authority}, otherwise \code{NA}
# @param rank
# (character) Rank (e.g. "Genus", "Species") of \code{taxa.clean}, otherwise \code{NA}. This is useful when \code{taxa.clean} can't be resolved to an \code{authority} and the rank must be manually defined.
# @param path
# (character) Path of the directory containing taxa_map.csv.
# @return
# (list) For \code{taxa.clean} resolved to a supported authority, each item in the list is a classification hierarchy (also a list), including one or more common names
# (only when \code{authority} is ITIS or WORMS) and authority IDs for each rank-value pair. For \code{taxa.clean} not resolved to a supported authority, each item is listed as defined in
# the \code{taxa.clean}, \code{authority}, and \code{authority.id} arguments.
# @details
# Only taxa resolved to supported authorities can be expanded into a full taxonomic classification with common names. Taxa resolved to unsupported authorities, or not resolved at all, will be listed as
# is defined in the \code{taxa.clean}, \code{authority}, and \code{authority.id} arguments.
# Supported authorities are recognized by a controlled set of representations.
# \itemize{
# \item ITIS can be: "ITIS", "itis", "Integrated Taxonomic Information System", or "https://www.itis.gov/".
# \item WORMS can be: "WORMS", "worms", "World Register of Marine Species", or "https://www.marinespecies.org/".
# \item GBIF can be: "GBIF", "gbif", "GBIF Backbone Taxonomy", or "https://gbif.org".
# }
txcl_get_classification <- function(taxa.clean,
authority = NA,
authority.id = NA,
rank = NA,
path = NULL) {
# Overview: the inputs are split into "supported" rows (authority found in
# cw$human.readable) and "unsupported" rows (everything else). Each group is
# restructured into lists of taxonRankName / taxonRankValue / taxonId (plus
# commonName for supported rows) and the two groups are concatenated.
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - several opening calls and closing braces/parentheses are absent
# (e.g. the values of machine.readable, the call that receives sci_id/db, the
# leading arguments of apply(), the bodies of the tryCatch error handlers).
# Confirm against the upstream source before editing.
# NOTE(review): `path` is not referenced in the lines visible here.
# Parameterize --------------------------------------------------------------
# cw is a crosswalk between the human readable authority names accepted in
# `authority` and a machine readable counterpart (looked up via match() below)
cw <- data.frame(
human.readable = c(
'Catalogue of Life',
'Integrated Taxonomic Information System',
'World Register of Marine Species',
'GBIF Backbone Taxonomy',
'Tropicos - Missouri Botanical Garden'),
machine.readable = c(
stringsAsFactors = F)
# Rows whose authority is listed in the crosswalk; the authority is replaced
# by its machine readable form and duplicate rows are dropped
supported <- unique.data.frame(
taxa.clean = taxa.clean[authority %in% cw$human.readable],
authority = cw$machine.readable[
match(authority[authority %in% cw$human.readable], cw$human.readable)],
authority.id = authority.id[authority %in% cw$human.readable],
rank = rank[authority %in% cw$human.readable],
stringsAsFactors = FALSE))
# All remaining rows, kept exactly as supplied (duplicates dropped)
unsupported <- unique.data.frame(
taxa.clean = taxa.clean[!(authority %in% cw$human.readable)],
authority = authority[!(authority %in% cw$human.readable)],
authority.id = authority.id[!(authority %in% cw$human.readable)],
rank = rank[!(authority %in% cw$human.readable)],
stringsAsFactors = FALSE))
# Supported authorities -----------------------------------------------------
# Only query when at least one taxon resolved to a supported authority
if (nrow(supported) != 0) {
message("Retrieving classifications")
# NOTE(review): the function called with sci_id/db is not visible here
classifications <- suppressMessages(
sci_id = supported$authority.id,
db = supported$authority)))
# Move "false supported" to "unsupported" so taxa won't be lost from the
# returned object. This can happen when the authority + ID pair doesn't
# resolve due to inaccuracies
# i flags the classifications that came back as NA
i <- is.na(classifications)
unsupported <- rbind(
taxa.clean = supported$taxa.clean[i],
authority = supported$authority[i],
authority.id = supported$authority.id[i],
rank = supported$rank[i],
stringsAsFactors = FALSE))
# Keep only the rows (and classifications) that did resolve
supported <- supported[!i, ]
classifications <- classifications[!i]
# Get all common names and restructure for annotation. Add the authority
# system identifier and common names (there may be more than one) to each
# name + rank pair. This will be used by txcl_make_taxonomicCoverage() to
# annotate the output EML metadata.
# mapply pairs each classification with the authority it came from
output_supported <- mapply(
FUN = function(classification, authority) {
restructured_classification <- apply(
function(row) {
# Set Authority system identifier (i.e. provider)
if (authority == "itis") {
provider <- "https://itis.gov"
} else if (authority == "worms") {
provider <- "https://marinespecies.org"
} else if (authority == "gbif") {
provider <- "https://gbif.org"
} else {
provider <- NA_character_
# Get common name(s). Only English language is supported (currently).
# GBIF doesn't support common name fetching (currently).
if (authority == "itis") {
# ITIS: look up common names by the row's id and keep the English ones
names_common <- tryCatch({
r <- ritis::common_names(row[["id"]])
r$commonName[r$language == "English"])
}, error = function(e) {
} else if (authority == "worms") {
# WORMS: the id is converted to numeric for the lookup; English only
names_common <- tryCatch({
r <- worrms::wm_common_id(as.numeric(row[["id"]]))
r$vernacular[r$language == "English"])
}, error = function(e) {
} else {
names_common <- NULL
# Restructure
# One list per rank: name, value, common names (as a list, so there can
# be several) and the provider + whitespace-trimmed ID
classification <- list(
taxonRankName = row[["rank"]],
taxonRankValue = row[["name"]],
commonName = as.list(names_common),
taxonId = list(
provider = provider,
taxonId = trimws(row[["id"]])))
classification = classifications,
authority = supported$authority,
# Drop the names mapply attached to the result
output_supported <- unname(output_supported)
} else {
output_supported <- NULL
# Unsupported authorities ---------------------------------------------------
if (nrow(unsupported) != 0) {
# Build classification-like records from the values supplied by the caller
classifications <- unname(
name = unsupported$taxa.clean,
rank = unsupported$rank,
id = unsupported$authority.id,
stringsAsFactors = FALSE),
# List names and ranks "as is" and restructure for annotation. Add the
# authority system identifier to each name + rank pair. This will be used
# by txcl_make_taxonomicCoverage() to annotate the output EML metadata.
output_unsupported <- mapply(
FUN = function(classification, authority) {
restructured_classification <- apply(
function(row) {
# Set Authority system identifier (i.e. provider)
# NOTE(review): as written this condition is TRUE for any non-NA
# authority, including "" - confirm whether empty strings should be
# treated as missing instead
if (!is.na(authority) | ("" %in% authority)) {
provider <- authority
} else {
provider <- NA_character_
# Restructure
# Same shape as for supported taxa, but without commonName
classification <- list(
taxonRankName = row[["rank"]],
taxonRankValue = row[["name"]],
taxonId = list(
provider = provider,
taxonId = trimws(row[["id"]])))
classification = classifications,
authority = unsupported$authority,
} else {
output_unsupported <- NULL
# Combine -------------------------------------------------------------------
# Supported results first, then unsupported; a NULL side contributes nothing
return(c(output_supported, output_unsupported))
# Get taxonomic identifiers
# @description
# Get a taxonomic identifier for a taxon name and corresponding authority.
# @param taxon
# A character string specifying taxon to get the ID for.
# @param authority
# A character string specifying the authority from which to get the ID.
# @return
# \itemize{
# \item taxon_id - An authority ID for the taxon.
# \item taxon_rank - The taxonomic rank of the taxon.
# }
txcl_get_id <- function(taxon, authority){
# Look up the authority ID and rank of `taxon` in the authority named by
# `authority`. Each supported authority has its own branch below; a branch
# only runs when `authority` is non-NA and equals that authority's title.
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces and some lookup calls (e.g. the expression
# wrapped by as.data.frame() in the ITIS branch and the arguments of
# taxize::get_ids_()) are absent. Confirm against the upstream source.
# Defaults returned when no branch finds a match
taxon_id <- NA_character_
taxon_rank <- NA_character_
# Get ID and rank from taxon and authority ----------------------------------
# Get authority and query for ID and rank
# # Catalogue of Life
# if ((!is.na(authority)) & (authority == 'Catalogue of Life')){
#   response <- taxize::get_ids_(
#     taxon,
#     'col'
#   )
#   if (nrow(response[[1]][[1]]) > 0){
#     response <- as.data.frame(response$col)
#     use_i <- response[ , 2] == taxon
#     response <- response[use_i, ]
#     if (nrow(response) > 0){
#       taxon_id <- as.character(response[1, 1])
#       taxon_rank <- response[1, 3]
#     } else {
#       taxon_id <- NA_character_
#       taxon_rank <- NA_character_
#     }
#   } else {
#     taxon_id <- NA_character_
#     taxon_rank <- NA_character_
#   }
# }
# ITIS
if ((!is.na(authority)) & (authority == 'ITIS')) {
response <- as.data.frame(
if (nrow(response) > 0) {
# Keep only exact matches on the scientific name
use_i <- response[ , 'scientificName'] == taxon
response <- response[use_i, ]
if (nrow(response) > 0) {
# First exact match: the TSN is the ID, the rank is fetched from the TSN
taxon_id <- as.character(response[1, 'tsn'])
taxon_rank <- taxize::itis_taxrank(as.numeric(taxon_id))
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
# World Register of Marine Species
if ((!is.na(authority)) & (authority == 'World Register of Marine Species')){
response <- taxize::get_wormsid_(
searchtype = 'scientific',
accepted = F,
ask = F,
messages = F)
if (!is.null(response[[1]])) {
response <- as.data.frame(response[[1]])
# Exact match on the second column of the response
use_i <- response[ , 2] == taxon
response <- response[use_i, ]
if (nrow(response) > 0) {
# NOTE(review): unlike the other branches this takes AphiaID from every
# matching row rather than row 1 - confirm this is intended
taxon_id <- as.character(response[ , 'AphiaID'])
# Rank is read from the last row of the classification
response <- taxize::classification(taxon_id, db = 'worms')
response <- as.data.frame(response[[1]])
taxon_rank <- response[nrow(response), 2]
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
# GBIF Backbone Taxonomy
if ((!is.na(authority)) & (authority == 'GBIF Backbone Taxonomy')){
response <- taxize::get_ids_(
if (nrow(response[[1]][[1]]) > 0){
response <- as.data.frame(response[[1]][[1]])
# Exact match on the sixth column of the response
use_i <- response[ , 6] == taxon
response <- response[use_i, ]
if (nrow(response) > 0){
taxon_id <- as.character(response[1, 'usagekey'])
# Rank is read from the last row of the classification
response <- taxize::classification(taxon_id, db = 'gbif')
response <- as.data.frame(response[[1]])
taxon_rank <- response[nrow(response), 2]
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
# Tropicos - Missouri Botanical Garden
if ((!is.na(authority)) & (authority == 'Tropicos - Missouri Botanical Garden')){
response <- taxize::get_ids_(
if (nrow(response[[1]][[1]]) > 0){
response <- as.data.frame(response[[1]][[1]])
# Exact match on the second column of the response
use_i <- response[ , 2] == taxon
response <- response[use_i, ]
if (nrow(response) > 0){
taxon_id <- as.character(response[1, 'nameid'])
response <- taxize::tax_rank(taxon_id, db = 'tropicos')
taxon_rank <- response[[1]]
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
# Return --------------------------------------------------------------------
# Both variables are initialised at the top of the function, so the exists()
# guards are defensive only; the is.null() guards replace a NULL left behind
# by a lookup with NA
if (!exists('taxon_id')){
taxon_id <- NA_character_
if (!exists('taxon_rank')){
taxon_rank <- NA_character_
if (is.null(taxon_id)){
taxon_id <- NA_character_
if (is.null(taxon_rank)){
taxon_rank <- NA_character_
# Result: list with elements taxon_id and taxon_rank
list('taxon_id' = taxon_id,
'taxon_rank' = taxon_rank)
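# Get taxonomic identifiers for common names
# @description
# Get a taxonomic identifier for a common name and corresponding authority.
# @param taxon
# A character string specifying the common name to get the ID for.
# @param authority
# The ID of the authority (data source) from which to get the ID. It is
# matched against the `id` column returned by `txcl_load_gnr_datasources()`.
# @return
# \itemize{
# \item taxon_id - An authority ID for the taxon.
# \item taxon_rank - The rank of the taxon ('Common' when a match is found).
# \item taxon_authority - The authority in which the taxon was matched.
# \item taxon_clean - The matched taxon name.
# }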
txcl_get_id_common <- function(taxon, authority){
# Look up an authority ID for the common name `taxon`. Only the ITIS branch
# is active; the other authorities are commented out below.
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces and the lookup call that receives
# sci_com/searchtype/ask are absent. Confirm against the upstream source.
# Defaults returned when no match is found
taxon_id <- NA_character_
taxon_rank <- NA_character_
taxon_authority <- NA_character_
taxon_clean <- NA_character_
# Match authority -----------------------------------------------------------
# `authority` arrives as a data source ID and is replaced by the matching
# title from the GNR data source list
gnr_list <- txcl_load_gnr_datasources()
use_i <- authority == gnr_list[ , 'id']
authority <- gnr_list[use_i, 'title']
# Get ID and rank from taxon and authority ----------------------------------
# Get authority and query for ID and rank
# # ITIS
if ((!is.na(authority)) & (authority == 'ITIS')){
response <- as.data.frame(
sci_com = taxon,
searchtype = 'common',
ask = FALSE))
if (nrow(response) > 0){
# Case-insensitive exact match on the third column of the response
use_i <- tolower(response[ , 3]) == tolower(taxon)
response <- response[use_i, ]
if (nrow(response) > 0){
# First match: column 1 is taken as the ID; the rank is the literal
# 'Common' and the input name is reported as the clean name
taxon_id <- as.character(response[1, 1])
taxon_rank <- 'Common'
taxon_authority <- authority
taxon_clean <- taxon
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
taxon_authority <- NA_character_
taxon_clean <- NA_character_
} else {
taxon_id <- NA_character_
taxon_rank <- NA_character_
taxon_authority <- NA_character_
taxon_clean <- NA_character_
# # World Register of Marine Species
# if ((!is.na(authority)) & (authority == 'World Register of Marine Species')){
#   response <- as.data.frame(
#     get_wormsid_(
#       query = taxon,
#       searchtype = 'common',
#       ask = F
#     )
#   )
#   if (!is.null(response[[1]])){
#     response <- as.data.frame(response[[1]])
#     use_i <- response[ , 2] == taxon
#     response <- response[use_i, ]
#     if (nrow(response) > 0){
#       taxon_id <- as.character(response[ , 'AphiaID'])
#       response <- taxize::classification(taxon_id, db = 'worms')
#       response <- as.data.frame(response[[1]])
#       taxon_rank <- response[nrow(response), 2]
#     } else {
#       taxon_id <- NA_character_
#       taxon_rank <- NA_character_
#     }
#   } else {
#     taxon_id <- NA_character_
#     taxon_rank <- NA_character_
#   }
# }
# # Tropicos - Missouri Botanical Garden
# if ((!is.na(authority)) & (authority == 'Tropicos - Missouri Botanical Garden')){
#   response <- as.data.frame(
#     tp_search(
#       commonname = taxon
#     )
#   )
#   if (nrow(response[[1]][[1]]) > 0){
#     response <- as.data.frame(response[[1]][[1]])
#     use_i <- response[ , 2] == taxon
#     response <- response[use_i, ]
#     if (nrow(response) > 0){
#       taxon_id <- as.character(response[1, 'nameid'])
#       response <- taxize::tax_rank(taxon_id, db = 'tropicos')
#       taxon_rank <- response[[1]]
#     } else {
#       taxon_id <- NA_character_
#       taxon_rank <- NA_character_
#     }
#   } else {
#     taxon_id <- NA_character_
#     taxon_rank <- NA_character_
#   }
# }
# # Encyclopedia of life
# if ((!is.na(authority)) & (authority == 'EOL')){
#   response <- suppressMessages(as.data.frame(
#     taxize::eol_search(
#       terms = taxon,
#       exact = T
#     )
#   ))
#   if (nrow(response) > 0){
#     if (nrow(response) > 0){
#       taxon_id <- as.character(response[1, 'pageid'])
#       taxon_rank <- 'common'
#       taxon_authority <- authority
#       taxon_clean <- taxon
#     } else {
#       taxon_id <- NA_character_
#       taxon_rank <- NA_character_
#       taxon_authority <- NA_character_
#       taxon_clean <- NA_character_
#     }
#   } else {
#     taxon_id <- NA_character_
#     taxon_rank <- NA_character_
#     taxon_authority <- NA_character_
#     taxon_clean <- NA_character_
#   }
# }
# Return --------------------------------------------------------------------
# All four variables are initialised at the top of the function, so the
# exists() guards are defensive only
if (!exists('taxon_id')){
taxon_id <- NA_character_
if (!exists('taxon_rank')){
taxon_rank <- NA_character_
# NOTE(review): the next two guards test taxon_authority / taxon_clean but
# assign taxon_id / taxon_rank - looks like a copy-paste slip; confirm
if (!exists('taxon_authority')){
taxon_id <- NA_character_
if (!exists('taxon_clean')){
taxon_rank <- NA_character_
# Replace any NULL left behind by a lookup with NA
if (is.null(taxon_id)){
taxon_id <- NA_character_
if (is.null(taxon_rank)){
taxon_rank <- NA_character_
if (is.null(taxon_authority)){
taxon_authority <- NA_character_
if (is.null(taxon_clean)){
taxon_clean <- NA_character_
# Result: list with elements taxon_id, taxon_rank, taxon_authority and
# taxon_clean
list('taxon_id' = taxon_id,
'taxon_rank' = taxon_rank,
'taxon_authority' = taxon_authority,
'taxon_clean' = taxon_clean)
# Load and fix GNR Datasources
# @return (data.frame) GNR datasources from \code{taxize::gnr_datasources()}
# @details This fixes bugs in taxize which otherwise produce inconsistent datasource names (e.g. "Integrated Taxonomic Information SystemITIS" rather than expected "ITIS")
txcl_load_gnr_datasources <- function() {
  # Fetch the list of Global Names Resolver data sources as a plain data frame
  gnr_list <- as.data.frame(taxize::gnr_datasources())
  # Force the title of data source 3 to the literal "ITIS", which is the value
  # the other functions in this file compare against. %in% is used so that a
  # missing id can never produce an NA subscript.
  gnr_list$title[gnr_list$id %in% "3"] <- "ITIS"
  # Return the data frame explicitly: the value of the assignment above is
  # just the string "ITIS", not the table callers index by id and title.
  gnr_list
}
# Make taxonomicCoverage EML node
# @param taxa.clean
# (character) Taxa names as they appear in your dataset
# @param authority
# (character) Authority \code{taxa.clean} have been resolved to. Supported authorities include: "ITIS", "WORMS", "GBIF". For unsupported authorities, list the home page URL. For unresolved taxa use \code{NA}.
# @param authority.id
# (character) ID of \code{taxa.clean} within the \code{authority}, otherwise \code{NA}
# @param rank
# (character) Rank (e.g. "Genus", "Species") of \code{taxa.clean}, otherwise \code{NA}. This is useful when \code{taxa.clean} can't be resolved to an \code{authority} and the rank must be manually defined.
# @param path
# (character) Path of the directory to which taxonomicCoverage.xml will be written. Can also be the path of the directory containing taxa_map.csv, if using as inputs to this function.
# @param write.file
# (logical) Whether taxonomicCoverage.xml should be written to file. Default is \code{TRUE}.
# @return
# \item{emld list}{The taxonomicClassification EML node for use in constructing EML metadata with the EML R library.}
# \item{taxonomicCoverage.xml}{If \code{write.file = TRUE}.}
# @details This function uses \code{txcl_get_classification()} to expand taxa, resolved to supported authorities, into full taxonomic classification. Each level of classification is
# accompanied by an annotation (listing the \code{authority} and \code{authority.id}) and common names (only when \code{authority} is ITIS or WORMS). Taxa resolved to unsupported authorities, or not resolved at
# all, will be listed as is defined in the \code{taxa.clean}, \code{authority}, and \code{authority.id} arguments.
# @note The name of this function is a bit misleading. The return value is actually a list of taxonomicClassification nodes, which occur immediately below taxonomicCoverage
# (i.e. ../taxonomicCoverage/taxonomicClassification).
txcl_make_taxonomicCoverage <- function(
authority = NA,
authority.id = NA,
rank = NA,
write.file = TRUE){
# NOTE(review): the body below uses `taxa.clean` and `path`, but neither
# appears in the signature as it stands in this file; closing braces, the
# path validation and the call that receives eml/file are also absent.
# Confirm against the upstream source before editing.
message('Creating <taxonomicCoverage>')
# Not all taxonomic authority systems are supported by this function.
# Testing and reporting of supported authorities in the function
# documentation is needed. Additionally, the valid inputs to authority and
# authority ID need to be defined in the function documentation so users can
# manually supply this information if necessary.
# Validate arguments --------------------------------------------------------
# A path is required when writing to file
if (missing(path) & isTRUE(write.file)){
stop('Input argument "path" is required when writing data to file.')
# The path must be valid
if (!missing(path)){
# Load data -----------------------------------------------------------------
# Remove any blank or missing taxa otherwise txcl_get_classification() will throw
# errors
# The same mask is applied to every input vector so they stay aligned
missing_names <- is.na(taxa.clean) | taxa.clean == ""
taxa.clean <- taxa.clean[!missing_names]
authority <- authority[!missing_names]
authority.id <- authority.id[!missing_names]
rank <- rank[!missing_names]
# Create taxonomicCoverage --------------------------------------------------
# This method supports EML annotation and more than one common name per taxon
# rank and thus requires a nested list structure not currently supported by
# EML::set_taxonomicCoverage().
# Retrieve taxonomic hierarchy and common names when possible
classifications <- txcl_get_classification(
taxa.clean = taxa.clean,
authority = authority,
authority.id = authority.id,
rank = rank,
path = path)
# Recursively convert classifications into the nested structure expected by
# EML::write_eml().
taxonomic_coverage <- txcl_set_taxonomic_coverage(classifications)
# Write to file -------------------------------------------------------------
# Only when requested; the target is taxonomicCoverage.xml inside `path`
if (isTRUE(write.file)){
message('Writing taxonomicCoverage.xml')
eml = taxonomic_coverage,
file = paste0(path, '/taxonomicCoverage.xml'))
# Return object -------------------------------------------------------------
# Optimize match
# @description
# Optimize the taxon match to an authority based on completeness of
# returned information. A complete return contains both an authority
# name and an authority ID for a taxon.
# @param x
# (character) A character string specifying the taxon.
# @param data.sources
# (numeric) A numeric vector of values specifying the authorities to search across.
# Run `view_taxa_authorities()` to get valid data source options and IDs.
# @return
# \itemize{
# \item taxa_clean - Resolved name for input taxon.
# \item rank - Rank of the input taxon.
# \item authority - Best authority match for input taxon.
# \item authority_id - Corresponding authority ID for the input taxon.
# \item score - Authority match score for input taxon.
# }
txcl_optimize_match <- function(x, data.sources){
# Try each data source in order and stop at the first one that yields an
# authority ID for `x`.
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces, the call wrapping txcl_get_id() inside try()
# and the final return are absent, so the exact nesting of the assignments
# below cannot be confirmed from here.
# Initialize output
# One all-NA row per data source; every column is character
output <- data.frame(
taxa_clean = rep(NA_character_, length(data.sources)),
rank = rep(NA_character_, length(data.sources)),
authority = rep(NA_character_, length(data.sources)),
authority_id = rep(NA_character_, length(data.sources)),
score = rep(NA_character_, length(data.sources)),
stringsAsFactors = FALSE)
# Iterate overall sources and stop on the first match
# j indexes data.sources; the loop ends once j passes the last source
j <- 1
while (j != (length(data.sources)+1)) {
# Does taxon resolve to data.sources[j]?
# try() keeps an error for this source from aborting the whole search
gnrr <- try(
txcl_get_authority(taxon = x,
data.source = as.character(data.sources[j])),
silent = TRUE)
# Try for ID and rank if GNR didn't error, then add results
if (!("try-error" %in% class(gnrr))) {
id <- try(
txcl_get_id(taxon = x, authority = gnrr$authority)),
silent = TRUE)
if (!("try-error" %in% class(id))) {
output$authority_id[j] <- id$taxon_id
output$rank[j] <- id$taxon_rank
# The input name itself is recorded as the clean name
output$taxa_clean[j] <- x
output$authority[j] <- gnrr$authority
output$score[j] <- gnrr$score
# Continue with next data.source if no results were found in this one
if (!is.na(output$authority_id[j])) {
# A match was found: jump j past the end to terminate the loop
j <- length(data.sources) + 1
} else {
j <- j + 1
# Return the first match otherwise NAs
# Rows with an authority ID are kept; with none, the first row is kept
resolved <- !is.na(output$authority_id)
if (any(resolved)) {
output <- output[resolved, ]
} else {
output <- output[1, ]
# Optimize match common
# @description
# Optimize the common taxon match to an authority based on completeness of
# returned information. A complete return contains both an authority
# name and an authority ID for a taxon.
# @param x
# (character) A character string specifying the taxon.
# @param data.sources
# (Numeric) A numeric vector of values specifying the authorities to search across.
# Run `view_taxa_authorities()` to get valid data source options and IDs.
# @return
# \itemize{
# \item taxa_clean - Resolved name for input taxon.
# \item rank - Rank of the input taxon.
# \item authority - Best authority match for input taxon.
# \item authority_id - Corresponding authority ID for the input taxon.
# \item score - Authority match score for input taxon.
# }
txcl_optimize_match_common <- function(x, data.sources){
# Try each data source in order and stop at the first one that yields an
# authority ID for the common name `x`.
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces, the call inside try() that receives
# taxon/authority and the container of the returned values are absent.
# Confirm against the upstream source before editing.
# One all-NA row per data source; every column is character
output <- data.frame(
taxa_clean = rep(NA_character_, length(data.sources)),
rank = rep(NA_character_, length(data.sources)),
authority = rep(NA_character_, length(data.sources)),
authority_id = rep(NA_character_, length(data.sources)),
stringsAsFactors = F)
# j indexes data.sources; the loop ends once j passes the last source
j <- 1
while (j != (length(data.sources)+1)){
# Resolve ID, and rank
out_id <- try(
taxon = x,
authority = data.sources[j])),
silent = TRUE)
# On error fall back to an all-NA result so the parsing below still works
if (methods::is(out_id, "try-error")) {
out_id <- list(
'taxon_id' = NA_character_,
'taxon_rank' = NA_character_,
'taxon_authority' = NA_character_,
'taxon_clean' = NA_character_)
# Parse results into output data frame
output[j, 'taxa_clean'] <- out_id[['taxon_clean']]
output[j, 'rank'] <- out_id[['taxon_rank']]
output[j, 'authority'] <- out_id[['taxon_authority']]
output[j, 'authority_id'] <- out_id[['taxon_id']]
# Stop if a successful match has been made to save redundant effort
if (!is.na(output[j, 'authority_id'])) {
# Jump j past the end to terminate the loop
j <- length(data.sources) + 1
} else {
j <- j + 1
# Get best match
# Preference order: rows with an authority ID, then the first row with an
# authority but no ID, then simply the first row
if (sum(is.na(output[ , 'authority_id'])) == nrow(output)){
if (sum(is.na(output[ , 'authority'])) != nrow(output)){
output <- output[!is.na(output[ , 'authority']), ]
output <- output[1, ]
} else {
output <- output[1, ]
} else {
output <- output[!is.na(output[ , 'authority_id']), ]
# Return
# The four fields of the first remaining row, in the order taxa_clean,
# rank, authority, authority_id
output[1, 'taxa_clean'],
output[1, 'rank'],
output[1, 'authority'],
output[1, 'authority_id']
# Resolve common names to an authority
# @description
# Resolve taxa to preferred authorities and get associated IDs.
# @param x
# (character) A vector of taxa names.
# @param data.sources
# (numeric) An ordered vector of authority IDs to be queried (
# \code{view_taxa_authorities()} lists currently supported authorities).
# Taxa are resolved to authorities in the order listed. If an authority
# and ID match cannot be made, then the next will be queried.
# @param path
# (character) Path to directory containing taxa_map.csv. This tracks
# provenance throughout the data cleaning process. Create this file with
# \code{create_taxa_map()}.
# @return
# (data frame; taxa_map.csv) If using \code{x}, then a data frame
# containing the input taxa, accepted taxa name, rank, authority,
# authority ID, and score are returned. If using \code{path}, then an
# updated version of taxa_map.csv will be returned to \code{path} and
# a data frame of taxa_map.csv to the R environment.
txcl_resolve_comm_taxa <- function(x = NULL, data.sources, path = NULL){
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces, the argument of file.exists(), the code that
# loads `taxa_map`, the arguments of lapply(), the matrix() calls feeding
# data.frame(), the colnames() values and the final return are absent.
# Confirm against the upstream source before editing.
# Check arguments ---------------------------------------------------------
# `x` and `path` are mutually exclusive inputs
if (!is.null(x) & !is.null(path)) {
stop('Both "path" and "x" arguments are not allowed. Select one or the other.')
# Without `x` a path is needed
if (is.null(x)) {
if (is.null(path)) {
stop('Input argument "path" is missing!')
use_i <- file.exists(
if (!isTRUE(use_i)) {
stop('taxa_map.csv is missing! Create it with initialize_taxa_map.R.')
if (missing(data.sources)) {
stop('Input argument "data.sources" is missing!')
# Every requested data source must be marked 'supported' for common name
# resolution by view_taxa_authorities()
authorities <- view_taxa_authorities()
authorities <- authorities[authorities$resolve_comm_taxa == 'supported', ]
use_i <- as.character(data.sources) %in% as.character(authorities$id)
if (sum(use_i) != length(use_i)){
stop('Input argument "data.sources" contains unsupported data source IDs!')
# Create taxa list ----------------------------------------------------------
if (!is.null(path)) {
# One row per taxa_map row; `index` remembers the taxa_map row number
taxa_list <- data.frame(
index = seq(nrow(taxa_map)),
taxa = rep(NA_character_, nrow(taxa_map)),
stringsAsFactors = F)
# Start from the raw name, then let taxa_trimmed and finally
# taxa_replacement override it wherever they are not NA
taxa_list$taxa <- taxa_map$taxa_raw
use_i <- !is.na(taxa_map$taxa_trimmed)
taxa_list$taxa[use_i] <- taxa_map$taxa_trimmed[use_i]
use_i <- !is.na(taxa_map$taxa_replacement)
taxa_list$taxa[use_i] <- taxa_map$taxa_replacement[use_i]
# Blank out taxa that already have a taxa_clean value
use_i <- is.na(taxa_map$taxa_clean)
taxa_list$taxa[!use_i] <- NA
# Keep only rows that have no authority ID yet and were not removed
use_i <- is.na(taxa_map$authority_id) & is.na(taxa_map$taxa_removed)
taxa_list <- taxa_list[use_i, ]
} else {
# Taxa supplied directly through `x`
taxa_list <- data.frame(
index = seq(length(x)),
taxa = rep(NA_character_, length(x)),
stringsAsFactors = F)
taxa_list$taxa <- x
# Optimize match ------------------------------------------------------------
r <- lapply(
data.sources = data.sources)
# Update taxa_map.csv -------------------------------------------------------
if (!is.null(path)) {
r <- data.frame(
nrow = length(r),
byrow = T),
stringsAsFactors = F)
colnames(r) <- c(
# Results are keyed by the unique taxa names, joined back onto taxa_list and
# written into the corresponding taxa_map rows via `index`
r$taxa <- unique(taxa_list$taxa)
rj <- dplyr::full_join(r, taxa_list, by = "taxa")
taxa_map[rj$index, c("taxa_clean", "rank", "authority", "authority_id", "score")] <-
dplyr::select(rj, -taxa, -index)
# Ranks are stored in title case
taxa_map$rank <- stringr::str_to_title(taxa_map$rank)
} else {
r <- data.frame(
nrow = length(r),
byrow = T),
stringsAsFactors = F)
colnames(r) <- c(
# Without a taxa_map file the result is the taxa list with the match
# columns bound alongside
taxa_map <- cbind(taxa_list, r)
# Return --------------------------------------------------------------------
# Resolve scientific names to an authority
# @description
# Resolve taxa to preferred authorities and get associated IDs.
# @param x
# (character) A vector of taxa names.
# @param data.sources
# (numeric) An ordered vector of authority IDs to be queried (
# \code{view_taxa_authorities()} lists currently supported authorities).
# Taxa are resolved to authorities in the order listed. If an authority
# and ID match cannot be made, then the next will be queried.
# @param path
# (character) Path to directory containing taxa_map.csv. This tracks
# provenance throughout the data cleaning process. Create this file with
# \code{create_taxa_map()}.
# @return
# (data frame; taxa_map.csv) If using \code{x}, then a data frame
# containing the input taxa, accepted taxa name, rank, authority,
# authority ID, and score are returned. If using \code{path}, then an
# updated version of taxa_map.csv will be returned to \code{path} and
# a data frame of taxa_map.csv to the R environment.
txcl_resolve_sci_taxa <- function(x = NULL, data.sources, path = NULL){
# NOTE(review): this definition is not syntactically complete as it stands in
# this file - closing braces, the code that loads `taxa_map`, the arguments
# of lapply(), the matrix() calls feeding data.frame(), the colnames() values
# and the final return are absent. Confirm against the upstream source.
# Validate arguments --------------------------------------------------------
# Argument checks are delegated to validate_arguments(), which receives the
# function name and the current argument values
validate_arguments("txcl_resolve_sci_taxa", as.list(environment()))
# Create taxa list ----------------------------------------------------------
if (!is.null(path)) {
# One row per taxa_map row; `index` remembers the taxa_map row number
taxa_list <- data.frame(
index = seq(nrow(taxa_map)),
taxa = rep(NA_character_, nrow(taxa_map)),
stringsAsFactors = F)
# Start from the raw name, then let taxa_trimmed and finally
# taxa_replacement override it wherever they are not NA
taxa_list$taxa <- taxa_map$taxa_raw
use_i <- !is.na(taxa_map$taxa_trimmed)
taxa_list$taxa[use_i] <- taxa_map$taxa_trimmed[use_i]
use_i <- !is.na(taxa_map$taxa_replacement)
taxa_list$taxa[use_i] <- taxa_map$taxa_replacement[use_i]
# Skip rows that were removed or already have an authority ID
use_i <- !is.na(taxa_map$taxa_removed) | !is.na(taxa_map$authority_id)
taxa_list <- taxa_list[!use_i, ]
} else {
# Taxa supplied directly through `x`
taxa_list <- data.frame(
index = seq(length(x)),
taxa = rep(NA_character_, length(x)),
stringsAsFactors = F)
taxa_list$taxa <- x
# Optimize match ------------------------------------------------------------
r <- lapply(
data.sources = data.sources)
# Update taxa_map.csv -----------------------------------------------------
if (!is.null(path)) {
r <- data.frame(
nrow = length(r),
byrow = T),
stringsAsFactors = F)
colnames(r) <- c(
# Results are keyed by the unique taxa names, joined back onto taxa_list and
# written into the corresponding taxa_map rows via `index`
r$taxa <- unique(taxa_list$taxa)
rj <- dplyr::full_join(r, taxa_list, by = "taxa")
taxa_map[rj$index, c("taxa_clean", "rank", "authority", "authority_id", "score")] <-
dplyr::select(rj, -taxa, -index)
# Ranks are stored in title case
taxa_map$rank <- stringr::str_to_title(taxa_map$rank)
} else {
r <- data.frame(
nrow = length(r),
byrow = T),
stringsAsFactors = F)
colnames(r) <- c(
# Without a taxa_map file the result is the taxa list with the match
# columns bound alongside
taxa_map <- cbind(taxa_list, r)
# Return --------------------------------------------------------------------
# Create the taxonomicCoverage EML node
# @param sci_names
# (list) Object returned by \code{txcl_get_classification()}.
# @return
# (list) A list with a single element, \code{taxonomicClassification},
# holding one nested taxonomicClassification list for each item of
# \code{sci_names}.
txcl_set_taxonomic_coverage <- function(sci_names) {
  # Convert one classification (an ordered list of rank records) into a
  # nested list: the first record becomes the node and the remaining records
  # are nested beneath it under taxonomicClassification.
  pop <- function(taxa) {
    # Fields are copied in a fixed order; a field absent from the record
    # (e.g. commonName for unsupported authorities) is kept as a NULL entry
    node <- list(
      taxonRankName = taxa[[1]]$taxonRankName,
      taxonRankValue = taxa[[1]]$taxonRankValue,
      taxonId = taxa[[1]]$taxonId,
      commonName = taxa[[1]]$commonName)
    # Recurse only while lower ranks remain, so the deepest node carries no
    # taxonomicClassification child
    if (length(taxa) > 1) {
      node$taxonomicClassification <- pop(taxa[-1])
    }
    node
  }
  # One nested classification per input taxon
  taxa <- lapply(
    sci_names,
    function(sci_name) {
      pop(sci_name)
    })
  # Wrap the results in the element name used for the EML node
  return(list(taxonomicClassification = taxa))
}
# View taxonomic authorities
# @description
# List taxonomic authorities supported by the taxa resolving functions.
# @details
# View taxonomic authorities supported by `txcl_resolve_sci_taxa()` and
# `txcl_resolve_comm_taxa()`.
# @return
# (data frame) Taxonomic authorities and corresponding identifiers, with
# columns `id`, `authority`, `resolve_sci_taxa` and `resolve_comm_taxa`.
view_taxa_authorities <- function(){
  # Fail early, naming the absent packages, when any suggested package that
  # the taxonomy functions in this file call is not installed
  suggs <- c("ritis", "taxize", "worrms") # suggested packages
  suggsmissing <- !vapply(suggs, requireNamespace, logical(1), quietly = TRUE)
  if (any(suggsmissing)) {
    stop("Packages ", paste(suggs, collapse = ", "), " are required for ",
         "running view_taxa_authorities(). Packages ",
         paste(suggs[suggsmissing], collapse = ", "), " are not installed.",
         call. = FALSE)
  }
  # Get GNR datasources -----------------------------------------------------
  gnr_list <- txcl_load_gnr_datasources()
  # Mark supported databases ------------------------------------------------
  # Data sources not listed in `known` keep NA in all three columns and are
  # dropped further down
  gnr_list$return_to_user <- NA
  gnr_list$resolve_taxa <- NA
  gnr_list$resolve_common <- NA
  # One row per authority this file knows about: the GNR title to match, the
  # label shown to the user, and whether scientific / common name resolution
  # is supported
  known <- data.frame(
    title = c(
      'ITIS',
      'EOL',
      'Tropicos - Missouri Botanical Garden',
      'GBIF Backbone Taxonomy',
      'World Register of Marine Species'),
    return_to_user = c(
      'Integrated Taxonomic Information System (ITIS)',
      'Encyclopedia of Life (EOL)',
      'Tropicos - Missouri Botanical Garden',
      'Global Biodiversity Information Facility (GBIF)',
      'World Register of Marine Species (WORMS)'),
    resolve_taxa = c(
      'supported', 'not supported', 'supported', 'supported', 'supported'),
    resolve_common = c(
      'supported', 'not supported', 'not supported', 'not supported',
      'not supported'),
    stringsAsFactors = FALSE)
  for (k in seq_len(nrow(known))) {
    # %in% never yields NA, so a data source with a missing title cannot
    # break the subscripted assignments below
    use_i <- gnr_list[ , 'title'] %in% known$title[k]
    gnr_list[use_i, 'return_to_user'] <- known$return_to_user[k]
    gnr_list[use_i, 'resolve_taxa'] <- known$resolve_taxa[k]
    gnr_list[use_i, 'resolve_common'] <- known$resolve_common[k]
  }
  # Keep only the recognised authorities and expose user-facing column names
  use_i <- !is.na(gnr_list$return_to_user)
  taxonomic_authorities <- gnr_list[
    use_i, c('id', 'return_to_user', 'resolve_taxa', 'resolve_common')]
  colnames(taxonomic_authorities) <- c(
    'id', 'authority', 'resolve_sci_taxa', 'resolve_comm_taxa')
  # Reset row names so they run 1..n after subsetting
  rownames(taxonomic_authorities) <- NULL
  # Return
  taxonomic_authorities
}
