R/lcvp_match.R

Defines functions lcvp_match

Documented in lcvp_match

#' Match two name lists using the Leipzig Catalogue of Vascular Plants (LCVP)
#'
#'
#' Matches and compares two name lists based on the taxonomic resolution of 
#' plant taxa names listed in the "Leipzig Catalogue of Vascular Plants" (LCVP).
#'
#' @param splist1 A character vector specifying the reference input taxon to be matched.
#' Each element including genus and specific epithet and, potentially,
#' infraspecific rank, infraspecific name, and author name. Only valid characters are allowed 
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#' @param splist2 A character vector specifying the input taxon to match splist1.
#' Each element including genus and specific epithet and, potentially, infraspecific rank,
#' infraspecific name, and author name. Only valid characters are allowed 
#' (see \code{\link[base:validEnc]{base:validEnc}}).
#'
#'@param max_distance It represents the maximum string distance allowed for a
#'  match when comparing the submitted name with the closest name matches in the
#'  LCVP. The distance used is a generalized Levenshtein distance that indicates
#'  the total number of insertions, deletions, and substitutions allowed to
#'  match the two names. It can be expressed as an integer or as the fraction of
#'  the binomial name. For example, for a name of length 10, a max_distance =
#'  0.1 allows only one change (insertion, deletion, or substitution). A
#'  max_distance = 2 allows two changes.
#'  
#'@param genus_fuzzy If TRUE, the fuzzy match algorithm based on max_distance
#'  will also be applied to the genus (note that this may considerably increase
#'  computational time). If FALSE, fuzzy match will only apply to the epithet.
#'  
#'@param grammar_check If TRUE, the algorithm will try to fix common Latin 
#'grammar mistakes.
#' 
#' @param include_all If \code{TRUE} (default), it will include all species in both
#'  \code{splist1} and \code{splist2}. If \code{FALSE}, it will exclude species 
#'  only found in \code{splist2}.
#'  
#' @param identify_dups If \code{TRUE} (default), a column is added indicating 
#'  the position of duplicated LCVP output names in the resulting data.frame.
#'
#' 
#' @return 
#' A data.frame with the following columns:
#' 
#' \itemize{
#' \item{\emph{Species.List.1}}{: Taxa name list provided by the user in the 
#' splist1.}
#' \item{\emph{Species.List.2}}{: Taxa name list provided by the user in the 
#' splist2.}
#'   \item{global.Id}{The fixed species id of the input taxon in the
#'   Leipzig Catalogue of Vascular Plants (LCVP).} 
#'   \item{Input.Genus}{A
#'   character vector. The input genus of the corresponding vascular plant
#'   species name listed in LCVP.} 
#'   \item{Input.Epitheton}{A character vector.
#'   The input epitheton of the corresponding vascular plant species name listed
#'   in LCVP.} 
#'   \item{Rank}{A character vector. The taxonomic rank ("species",
#'   subspecies: "subsp.", variety: "var.", subvariety: "subvar.", "forma", or
#'   subforma: "subf.") of the corresponding vascular plant species name listed
#'   in LCVP.} 
#'   \item{Input.Subspecies.Epitheton}{A character vector. If the
#'   indicated rank is below species, the subspecies epitheton input of the
#'   corresponding vascular plant species name listed in LCVP. If the rank is
#'   "species", the input is "nil".} 
#'   \item{Input.Authors}{A character vector.
#'   The taxonomic authority input of the corresponding vascular plant species
#'   name listed in LCVP.} 
#'   \item{Status}{A character vector. Describes whether a
#'   taxon is classified as ‘valid’, ‘synonym’, ‘unresolved’, ‘external’ or
#'   ‘blanks’. The ‘unresolved’ rank means that the status of the plant name
#'   could be either valid or synonym, but the information available does not
#'   allow a definitive decision. ‘External’ is an extra rank which lists names
#'   outside the scope of this publication but useful to keep on this updated
#'   list. ‘Blanks’ means that the respective name exists in the bibliography
#'   but it is not clear whether it is valid, synonym or unresolved. (See the
#'   main text of Freiberg et al. for more details.)}
#'   \item{globalId.of.Output.Taxon}{The fixed species id of the output taxon
#'   in LCVP.} 
#'   \item{Output.Taxon}{A character vector. The list of the accepted
#'   plant taxa names according to the LCVP.} 
#'   \item{Family}{A character vector.
#'   The corresponding family name of the Input.Taxon, staying empty if the
#'   Status is unresolved.} 
#'   \item{Order}{A character vector. The corresponding
#'   order name of the Input.Taxon, staying empty if the Status is unresolved.}
#'   \item{Literature}{A character vector. The bibliography used.}
#'   \item{Comments}{A character vector. Further taxonomic comments.}  
#' \item{\emph{Match.Position.2to1}}{: positions of the names in splist1 in
#'  splist2. Can be used to reorder splist2 to match splist1.
#'  }
#' \item{\emph{Duplicated.Output.Position}}{: If \code{identify_dups = TRUE}, it 
#' indicates the position of duplicated names in the LCVP.Output.Taxon column. 
#' This may occur if two inputs are now synonyms. It will output NA if there is 
#' no duplicate for the species name.
#' }
#'  
#' }
#' See \code{\link[LCVP:tab_lcvp]{LCVP:tab_lcvp}} for more details.
#' 
#' If \code{include_all = TRUE}, all species will be included. Ordered based on the 
#' \code{splist1}, and followed by non-matched names in \code{splist2}. 
#' If \code{include_all = FALSE}, non-matched names in \code{splist2} are not 
#' included.
#' 
#' @author 
#' Bruno Vilela & Alexander Ziska
#' 
#' @seealso 
#' \code{\link[lcvplants:lcvp_join]{lcvp_join}}
#' 
#' 
#' @references 
#' Freiberg, M., Winter, M., Gentile, A. et al. LCVP, The Leipzig 
#' catalogue of vascular plants, a new taxonomic reference list for all known 
#' vascular plants. Sci Data 7, 416 (2020). 
#' https://doi.org/10.1038/s41597-020-00702-z 
#' 
#' @keywords R-package nomenclature taxonomy vascular plants
#' 
#' @examples
#' # Ensure that LCVP package is available before running the example.
#' # If it is not, see the `lcvplants` package vignette for details
#' # on installing the required data package.
#' if (requireNamespace("LCVP", quietly = TRUE)) { # Do not run this
#'
#' # Generate two lists of species name
#' splist1 <- sample(apply(LCVP::tab_lcvp[2:10, 2:3], 1, paste, collapse = " "))
#' splist2 <- sample(apply(LCVP::tab_lcvp[11:3, 2:3], 1, paste, collapse = " "))
#' 
#' # Including all species in both lists
#' lcvp_match(splist1, splist2, include_all = TRUE)
#' 
#' # Including only the species in the first list
#' matchLists <- lcvp_match(splist1, splist2, include_all = FALSE)
#' ## This can be used to quickly change positions in splist2 to match splist1
#' splist2[matchLists$Match.Position.2to1]
#' 
#' }
#'@export


lcvp_match <- function(splist1,
                       splist2,
                       max_distance = 0.2,
                       genus_fuzzy = FALSE,
                       grammar_check = FALSE,
                       include_all = TRUE, 
                       identify_dups = TRUE) {
  # Compare two name lists through their LCVP search results: an entry of
  # splist1 matches an entry of splist2 when both share the same `global.Id`.
  hasData() # Check if LCVP is installed
  # Defensive: work on plain character vectors rather than factors
  if (is.factor(splist1)) {
    splist1 <- as.character(splist1)
  }
  if (is.factor(splist2)) {
    splist2 <- as.character(splist2)
  }
  
  .names_check(splist1, "The first list of species name")
  .names_check(splist2, "The second list of species name")
  
  # Run the search for each list with identical settings; a NULL result is
  # reported to the user as "no match found"
  search1 <- lcvp_search(splist = splist1, 
                         max_distance = max_distance, 
                         genus_fuzzy = genus_fuzzy, 
                         grammar_check = grammar_check)
  if (is.null(search1)) {
    stop(paste("No match found for splist1.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }
  search2 <- lcvp_search(splist = splist2, 
                         max_distance = max_distance, 
                         genus_fuzzy = genus_fuzzy, 
                         grammar_check = grammar_check)
  if (is.null(search2)) {
    stop(paste("No match found for splist2.",
               "Try increasing the 'max_distance' argument."),
         call. = FALSE)
  }
  
  # For every entry of splist1, the position of the first entry of splist2
  # with the same id; NA ids are never treated as equal (incomparables = NA)
  match_pos <- match(search1$global.Id,
                     search2$global.Id,
                     incomparables = NA)
  
  # One row per splist1 entry: both submitted names, the search output for
  # splist1 (without its first column) and the matching position in splist2
  sp2 <- splist2[match_pos]
  result <- data.frame(
    "Species.List.1" = splist1,
    "Species.List.2" = sp2,
    search1[, -1, drop = FALSE],
    "Match.Position.2to1" = match_pos
  )
  
  ## Include species only in second dataset
  if (include_all) {
    pos_no_match <- which(!(splist2 %in% sp2))
    if (length(pos_no_match) > 0) {
      # Build all extra rows at once as a typed data.frame. Appending
      # character vectors row by row would coerce every column (including
      # Match.Position.2to1) to character and copy the table on each step.
      # row.names = NULL keeps the appended row names sequential.
      extra_rows <- data.frame(
        "Species.List.1" = NA_character_,
        "Species.List.2" = splist2[pos_no_match],
        search2[pos_no_match, -1, drop = FALSE],
        "Match.Position.2to1" = pos_no_match,
        row.names = NULL
      )
      result <- rbind(result, extra_rows)
    }
  }
  if (identify_dups) {
    result$Duplicated.Output.Position <- .find_dups(result, output_pos = 5)
  }
  result
}
idiv-biodiversity/lcvplants documentation built on Nov. 18, 2022, 3:39 a.m.