#' Internal function. Find a potential matching candidate for misspelled names
#'
#' This function searches for a potential match candidate within an orthographic
#' distance of each misspelled names. Only names within an orthographic distance
#' specified in 'max_distance' are returned. It was built based on
#' the 'suggest.name' function of the flora package.
#'
#' @param sci_name character string. Scientific name to match a taxonomic
#' database available in the 'taxadb' package.
#' @param max_distance numeric. Value between 0 and 1 specifying the maximum
#' distance between the scientific names and names suggested by a fuzzy
#' matching. Values close to 1 indicate that only a few differences between
#' scientific names and name suggested are allowed. Default = 0.9.
#' @param provider character string. A database where the valid will be
#' searched. The options are those provided by the 'taxadb' package.
#' @param db_version numeric string. A database version (eg. 2002).
#' @param rank_name character string. Taxonomic rank name (e.g. "Plantae",
#' "Animalia", "Aves", "Carnivora". Default = NULL.
#' @param rank character string. A taxonomic rank to filter the database.
#' Options available are: "kingdom", "phylum", "class", "order", "family", and
#' "genus". Default = NULL.
#' @param parallel logical. If TRUE (Default) to run in parallel.
#' @param ncores numeric. Number of cores to run in parallel. Default = 2.
#'
#' @details This function looks for scientific names in a reference taxonomic
#' database ('db'). Higher taxonomic rank information and the first letter of
#' names supplied are used to filter the reference database. Then, it is
#' calculated a string distance among original names and candidate names.
#' String distance is calculated by optimal string alignment (restricted
#' Damerau-Levenshtein distance) that counts the number of deletions,
#' insertions, substitutions, and transpositions of adjacent characters. It
#' ranges from 0 to 1, being 1 an indicative of a perfect match. If a
#' candidate name is found and the string distance is equal or higher than the
#' distance informed in ‘max_distance’, a suggested name is returned.
#' Otherwise, names are returned as NA. To reduce the number of both false
#' positives and negatives, the ‘suggest_distance’ is 0.9, by default, which
#' means that only a few differences between original and candidate names are
#' allowed. When multiple candidate names have an identical matching distance
#' for the original name, the name with the lowest alphabetical sort order is
#' presented as the best match.
#'
#' @return A three-column data.frame containing original name, names suggested,
#' and string distance between original and suggested (candidates) names.
#'
#' @importFrom dplyr filter pull
#' @importFrom parallel makeCluster stopCluster
#' @importFrom taxadb taxa_tbl
#' @importFrom foreach %dopar%
#' @importFrom utils head
#'
#' @noRd
#'
#' @examples
#' \donttest{
#' x <- c("Cebus apela", "Puma concolar")
#' bdc_suggest_names_taxadb(
#' x,
#' max_distance = 0.75,
#' provider = "gbif",
#' rank_name = "Plantae",
#' rank = "kingdom",
#' parallel = TRUE,
#' ncores = 2
#' )
#' }
bdc_suggest_names_taxadb <-
function(sci_name,
max_distance = suggestion_distance,
provider = db,
db_version = db_version,
rank_name = NULL,
rank = NULL,
parallel = TRUE,
ncores = 2) {
suggestion_distance <- db <- . <- .data <- scientificName <- NULL
# FIXME: set a env var for now
# REVIEW: https://github.com/ropensci/taxadb/issues/91
# Sys.setenv("CONTENTID_REGISTRIES" = "https://hash-archive.carlboettiger.info")
# Sys.setenv("TAXADB_DRIVER"="MonetDBLite")
# Get first letter of all scientific names
first_letter <-
unique(sapply(sci_name, function(i) {
strsplit(i, "")[[1]][1]
},
USE.NAMES = FALSE
))
name_to_case_check <- suppressMessages(taxadb::taxa_tbl(provider
# , version = getOption("taxadb_default_provider", db_version)
)) %>%
dplyr::filter(!is.na(scientificName)) %>%
utils::head(1) %>%
pull(scientificName) %>%
stringr::str_split(., "")
# if (lower_case) {
# first_letter <- tolower(first_letter)
# } else {
first_letter <- toupper(first_letter)
# }
# Should taxonomic database be filtered according to a taxonomic rank name?
if (!is.null(rank_name) & !is.null(rank)) {
species_first_letter <-
suppressMessages(taxadb::taxa_tbl(provider)) %>%
dplyr::filter(., .data[[rank]] == rank_name | is.na(.data[[rank]])) %>%
dplyr::pull(scientificName) %>%
grep(paste0("^", first_letter, collapse = "|"), ., value = TRUE)
} else if (is.null(rank_name) & !is.null(rank)) {
stop("Please, provide both 'rank_name' and 'rank' arguments")
} else if (!is.null(rank_name) & is.null(rank)) {
stop("Please, provide both 'rank_name' and 'rank' arguments")
} else {
species_first_letter <-
suppressMessages(taxadb::taxa_tbl(provider)) %>%
dplyr::pull(scientificName) %>%
grep(paste0("^", first_letter, collapse = "|"), ., value = TRUE)
}
# Should parallel processing be used?
if (parallel == TRUE) {
# setup parallel backend to use many processors
cl <- parallel::makeCluster(ncores) # not to overload your computer
doParallel::registerDoParallel(cl)
sug_dat <-
foreach::foreach(
i = sci_name,
.combine = rbind, .export = "bdc_return_names"
) %dopar% {
bdc_return_names(i, max_distance, species_first_letter)
} # end foreach
parallel::stopCluster(cl) # stop cluster
} else {
sug_dat <-
data.frame(
original = character(length(sci_name)),
suggested = character(length(sci_name)),
distance = numeric(length(sci_name))
)
for (i in seq_along(sci_name)) {
sug_dat[i, ] <-
bdc_return_names(sci_name[i], max_distance, species_first_letter)
}
}
return(sug_dat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.