#' Match two name lists using the Leipzig Catalogue of Vascular Plants (LCVP)
#'
#'
#' Matches and compares two name lists based on the taxonomic resolution of
#' plant taxa names listed in the "Leipzig Catalogue of Vascular Plants" (LCVP).
#'
#' @param splist1 A character vector specifying the reference input taxon to be matched.
#' Each element including genus and specific epithet and, potentially,
#' infraspecific rank, infraspecific name, and author name. Only valid characters are allowed
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#' @param splist2 A character vector specifying the input taxon to match splist1.
#' Each element including genus and specific epithet and, potentially, infraspecific rank,
#' infraspecific name, and author name. Only valid characters are allowed
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#'@param max_distance It represents the maximum string distance allowed for a
#' match when comparing the submitted name with the closest name matches in the
#' LCVP. The distance used is a generalized Levenshtein distance that indicates
#' the total number of insertions, deletions, and substitutions allowed to
#' match the two names. It can be expressed as an integer or as the fraction of
#' the binomial name. For example, a name with length 10, and a max_distance =
#' 0.1, allow only one change (insertion, deletion, or substitution). A
#' max_distance = 2, allows two changes.
#'
#'@param genus_fuzzy If TRUE, the fuzzy match algorithm based on max_distance
#' will also be applied to the genus (note that this may considerably increase
#' computational time). If FALSE, fuzzy match will only apply to the epithet.
#'
#' @param grammar_check If \code{TRUE}, the algorithm will try to fix common
#' Latin grammar mistakes.
#'
#' @param include_all If \code{TRUE} (default), it will include all species in both
#' \code{splist1} and \code{splist2}. If \code{FALSE}, it will exclude species
#' only found in \code{splist2}.
#'
#' @param identify_dups If \code{TRUE} (default), a column is added indicating
#' the position of duplicated LCVP output names in the resulting data.frame.
#'
#'
#' @return
#' A data.frame with the following columns:
#'
#' \itemize{
#' \item{\emph{Species.List.1}}{: Taxa name list provided by the user in the
#' splist1.}
#' \item{\emph{Species.List.2}}{: Taxa name list provided by the user in the
#' splist2.}
#' \item{global.Id}{The fixed species id of the input taxon in the
#' Leipzig Catalogue of Vascular Plants (LCVP).}
#' \item{Input.Genus}{A
#' character vector. The input genus of the corresponding vascular plant
#' species name listed in LCVP.}
#' \item{Input.Epitheton}{A character vector.
#' The input epitheton of the corresponding vascular plant species name listed
#' in LCVP.}
#' \item{Rank}{A character vector. The taxonomic rank ("species",
#' subspecies: "subsp.", variety: "var.", subvariety: "subvar.", "forma", or
#' subforma: "subf.") of the corresponding vascular plant species name listed
#' in LCVP.}
#' \item{Input.Subspecies.Epitheton}{A character vector. If the
#' indicated rank is below species, the subspecies epitheton input of the
#' corresponding vascular plant species name listed in LCVP. If the rank is
#' "species", the input is "nil".}
#' \item{Input.Authors}{A character vector.
#' The taxonomic authority input of the corresponding vascular plant species
#' name listed in LCVP.}
#' \item{Status}{A character vector. Describes whether a
#' taxon is classified as ‘valid’, ‘synonym’, ‘unresolved’, ‘external’ or
#' ‘blanks’. The ‘unresolved’ rank means that the status of the plant name
#' could be either valid or synonym, but the information available does not
#' allow a definitive decision. ‘External’ is an extra rank which lists names
#' outside the scope of this publication but useful to keep on this updated
#' list. ‘Blanks’ means that the respective name exists in bibliography but it
#' is clear neither where it came from nor whether it is valid, synonym or
#' unresolved. (see the main text Freiberg et al. for more details)}
#' \item{globalId.of.Output.Taxon}{The fixed species id of the output taxon
#' in LCVP.}
#' \item{Output.Taxon}{A character vector. The list of the accepted
#' plant taxa names according to the LCVP.}
#' \item{Family}{A character vector.
#' The corresponding family name of the Input.Taxon, staying empty if the
#' Status is unresolved.}
#' \item{Order}{A character vector. The corresponding
#' order name of the Input.Taxon, staying empty if the Status is unresolved.}
#' \item{Literature}{A character vector. The bibliography used.}
#' \item{Comments}{A character vector. Further taxonomic comments.}
#' \item{\emph{Match.Position.2to1}}{: positions of the names in splist1 in
#' splist2. Can be used to reorder splist2 to match splist1.
#' }
#' \item{\emph{Duplicated.Output.Position}}{: If \code{identify_dups = TRUE}, it
#' indicates the position of duplicated names in the LCVP.Output.Taxon column.
#' This may occur if two inputs are now synonyms. It will output NA if there is
#' no duplicate for the species name.
#' }
#'
#' }
#' See \code{\link[LCVP:tab_lcvp]{LCVP:tab_lcvp}} for more details.
#'
#' If \code{include_all = TRUE}, all species will be included. Ordered based on the
#' \code{splist1}, and followed by non-matched names in \code{splist2}.
#' If \code{include_all = FALSE}, non-matched names in \code{splist2} are not
#' included.
#'
#' @author
#' Bruno Vilela & Alexander Ziska
#'
#' @seealso
#' \code{\link[lcvplants:lcvp_join]{lcvp_join}}
#'
#'
#' @references
#' Freiberg, M., Winter, M., Gentile, A. et al. LCVP, The Leipzig
#' catalogue of vascular plants, a new taxonomic reference list for all known
#' vascular plants. Sci Data 7, 416 (2020).
#' https://doi.org/10.1038/s41597-020-00702-z
#'
#' @keywords R-package nomenclature taxonomy vascular plants
#'
#' @examples
#' # Ensure that LCVP package is available before running the example.
#' # If it is not, see the `lcvplants` package vignette for details
#' # on installing the required data package.
#' if (requireNamespace("LCVP", quietly = TRUE)) { # Do not run this
#'
#' # Generate two lists of species name
#' splist1 <- sample(apply(LCVP::tab_lcvp[2:10, 2:3], 1, paste, collapse = " "))
#' splist2 <- sample(apply(LCVP::tab_lcvp[11:3, 2:3], 1, paste, collapse = " "))
#'
#' # Including all species in both lists
#' lcvp_match(splist1, splist2, include_all = TRUE)
#'
#' # Including only the species in the first list
#' matchLists <- lcvp_match(splist1, splist2, include_all = FALSE)
#' ## This can be used to quickly change positions in splist2 to match splist1
#' splist2[matchLists$Match.Position.2to1]
#'
#' }
#'@export
lcvp_match <- function(splist1,
                       splist2,
                       max_distance = 0.2,
                       genus_fuzzy = FALSE,
                       grammar_check = FALSE,
                       include_all = TRUE,
                       identify_dups = TRUE) {
  # Matches two name lists through the LCVP id that `lcvp_search()` resolves
  # for each input name, and returns one row per name in `splist1` (plus,
  # optionally, the names that occur only in `splist2`).
  hasData() # Check if LCVP is installed

  # Defensive: work on the names themselves, not on factor codes
  if (is.factor(splist1)) {
    splist1 <- as.character(splist1)
  }
  if (is.factor(splist2)) {
    splist2 <- as.character(splist2)
  }
  .names_check(splist1, "The first list of species name")
  .names_check(splist2, "The second list of species name")

  # Run the search on both lists with identical settings
  search1 <- lcvp_search(splist = splist1,
                         max_distance = max_distance,
                         genus_fuzzy = genus_fuzzy,
                         grammar_check = grammar_check)
  if (is.null(search1)) {
    stop(paste("No match found for splist1.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }
  search2 <- lcvp_search(splist = splist2,
                         max_distance = max_distance,
                         genus_fuzzy = genus_fuzzy,
                         grammar_check = grammar_check)
  if (is.null(search2)) {
    stop(paste("No match found for splist2.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }

  # Match on the resolved ids. `incomparables = NA` keeps an unresolved (NA)
  # id in the first list from being paired with an NA id in the second list.
  match_pos <- match(search1$global.Id,
                     search2$global.Id,
                     incomparables = NA)

  # Adjust output: one row per element of splist1. The first column of the
  # search result is dropped; `drop = FALSE` keeps a data.frame in all cases.
  sp2 <- splist2[match_pos]
  result <- data.frame(
    "Species.List.1" = splist1,
    "Species.List.2" = sp2,
    search1[, -1, drop = FALSE],
    "Match.Position.2to1" = match_pos,
    stringsAsFactors = FALSE
  )

  ## Include species only in second dataset
  if (include_all) {
    pos_no_match <- which(!(splist2 %in% sp2))
    if (length(pos_no_match) > 0) {
      # Build every extra row at once as a typed data.frame. Binding a
      # character vector row by row would coerce all columns (including
      # Match.Position.2to1) to character and copy the table on each step.
      extra_lines <- data.frame(
        "Species.List.1" = rep(NA_character_, length(pos_no_match)),
        "Species.List.2" = splist2[pos_no_match],
        search2[pos_no_match, -1, drop = FALSE],
        "Match.Position.2to1" = pos_no_match,
        stringsAsFactors = FALSE
      )
      result <- rbind(result, extra_lines)
      # Row names of the subset carry the splist2 positions; reset them so
      # the combined table is numbered sequentially.
      rownames(result) <- NULL
    }
  }

  if (identify_dups) {
    # NOTE(review): column 5 is assumed to hold the LCVP output taxon that
    # `.find_dups()` should inspect; confirm against the search result layout.
    result$Duplicated.Output.Position <- .find_dups(result, output_pos = 5)
  }
  return(result)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.