R/wcvp_match_fuzzy.R
In rWCVP: Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

Documented in edit_match phonetic_match wcvp_match_fuzzy

#' Fuzzy (approximate) matching to the WCVP.
#'
#' Fuzzy matching to names in the WCVP using phonetic matching and edit distance.
#' The WCVP can be loaded for matching from [rWCVPdata::wcvp_names](https://matildabrown.github.io/rWCVPdata/).
#'
#' @param names_df Data frame of names for matching.
#' @param wcvp_names Data frame of taxonomic names from WCVP version 7 or later.
#'   If `NULL` (the default), names will be loaded from [`rWCVPdata::wcvp_names`](https://matildabrown.github.io/rWCVPdata/).
#' @param name_col Character. The column in `names_df` that has the taxon name
#' for matching.
#' @param progress_bar Logical. Show progress bar when matching? Defaults to
#'  `TRUE`; should be changed to `FALSE` if used in a markdown report.
#'
#' @return Match results from WCVP bound to the original data from `names_df`.
#'
#' @details The `wcvp_match_fuzzy` function uses phonetic matching first and then finds
#' the closest match based on edit distance for any remaining names.
#'
#' @import dplyr
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \donttest{ # this example requires 'rWCVPdata'
#' if(requireNamespace("rWCVPdata")){
#' wcvp_names <- rWCVPdata::wcvp_names
#' wcvp_match_fuzzy(redlist_example, wcvp_names, "scientificName")
#' }
#' }
#'
#' @family name matching functions
#'
wcvp_match_fuzzy <- function(names_df, wcvp_names, name_col, progress_bar = TRUE) {
  # check for duplicated names
  if (sum(duplicated(names_df[[name_col]])) > 10) {
    cli::cli_alert_warning("Fuzzy matching is very slow for data with repeated names. \nWe recommend matching a unique set of names to the WCVP, \nthen attaching the resolved matches to the original data using e.g. `*_join` functions.")
    invisible(readline("Press [Enter] to continue or [Escape] to exit:"))
  }

  wcvp_species <- filter(wcvp_names, .data$taxon_rank != "Genus")

  phonetic_matches <-
    names_df %>%
    phonetic_match(wcvp_species, name_col = name_col) %>%
    filter(!is.na(.data$wcvp_id))

  matched_names <- phonetic_matches[[name_col]]

  unmatched <-
    names_df %>%
    filter(.data[[name_col]] %notin% matched_names)

  if (nrow(unmatched) == 0) {
    return(phonetic_matches)
  }

  if (progress_bar == TRUE) {
    edit_matches <- edit_match(unmatched, wcvp_species, name_col = name_col)
  } else {
    edit_matches <- suppressMessages(edit_match(unmatched, wcvp_species, name_col = name_col))
  }


  if (nrow(phonetic_matches) > 0) {
    return(
      phonetic_matches %>%
        bind_rows(edit_matches)
    )
  } else {
    return(edit_matches)
  }
}

#' @rdname wcvp_match_fuzzy
#'
#' @details Phonetic matching uses [phonics::metaphone] encoding with a maximum
#' code length of 20.
#'
#' @importFrom phonics metaphone
#' @importFrom RecordLinkage levenshteinSim
#' @importFrom utils adist
#' @import dplyr
#'
#' @export
#'
#' @examples
#' \donttest{ # this example requires 'rWCVPdata'
#' if(requireNamespace("rWCVPdata")){
#' wcvp_names <- rWCVPdata::wcvp_names
#' phonetic_match(redlist_example, wcvp_names, "scientificName")
#' }
#' }
#'
phonetic_match <- function(names_df, wcvp_names, name_col) {
  original_names <- colnames(names_df)
  wcvp_names$mp <- metaphone(wcvp_names$taxon_name, maxCodeLen = 20, clean = FALSE)

  matches <-
    names_df %>%
    mutate(sanitised_ = sanitise_names_(.data[[name_col]])) %>%
    mutate(mp = metaphone(.data$sanitised_, maxCodeLen = 20, clean = FALSE)) %>%
    left_join(wcvp_names, by = "mp", suffix = c("", "_wcvp_names"))

  matches <-
    matches %>%
    mutate(
      match_similarity = levenshteinSim(.data$sanitised_, .data$taxon_name),
      match_similarity = round(.data$match_similarity, 3)
    ) %>%
    rowwise() %>%
    mutate(match_edit_distance = adist(.data$sanitised_, .data$taxon_name)[, 1]) %>%
    ungroup()

  matches <-
    matches %>%
    mutate(match_type = case_when(
      .data$match_similarity > 0.9 ~ "Fuzzy (phonetic)",
      .data$match_similarity >= 0.75 ~ "Fuzzy (phonetic)",
      TRUE ~ NA_character_
    )) %>%
    mutate(
      match_similarity = ifelse(.data$match_similarity < 0.75, NA_real_, .data$match_similarity),
      match_edit_distance = ifelse(is.na(.data$match_type), NA_real_, .data$match_edit_distance)
    ) %>%
    add_count(.data[[name_col]]) %>%
    mutate(multiple_matches = .data$n > 1) %>%
    select(-"n")

  matches %>%
    format_output_(original_cols = original_names) %>%
    mutate(across(starts_with("wcvp_"), ~ ifelse(is.na(.data$match_type), NA, .x)))
}

#' @rdname wcvp_match_fuzzy
#'
#' @details Edit distance matching finds the closest match based on Levenshtein similarity,
#' calculated by [RecordLinkage::levenshteinSim].
#'
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom tidyr unnest
#' @importFrom cli cli_progress_along
#' @importFrom purrr map
#'
#' @export
#'
#' @examples
#' \donttest{ # this example requires 'rWCVPdata'
#' if(requireNamespace("rWCVPdata")){
#' wcvp_names <- rWCVPdata::wcvp_names
#' edit_match(redlist_example, wcvp_names, "scientificName")
#' }
#' }
#'
edit_match <- function(names_df, wcvp_names, name_col) {
  withr::local_options(list(cli.progress_show_after = 2, cli.progress_clear = FALSE))

  original_names <- colnames(names_df)
  names_df <-
    names_df %>%
    filter(!is.na(.data[[name_col]])) %>%
    mutate(sanitised_ = sanitise_names_(.data[[name_col]]))

  matches <-
    names_df %>%
    mutate(match_info = map(
      cli_progress_along(.data$sanitised_, "Matching"),
      ~ edit_match_name_(.data$sanitised_[.x], wcvp_names)
    ))

  matches <-
    matches %>%
    unnest("match_info")

  matches %>%
    left_join(
      wcvp_names,
      by = c("plant_name_id", "taxon_name"),
      suffix = c("", "_wcvp")
    ) %>%
    format_output_(original_cols = original_names)
}

#' Match a name to a lookup table using Levenshtein similarity (edit distance).
#'
#' @import dplyr
#' @importFrom RecordLinkage levenshteinSim
#' @importFrom utils adist
#' @importFrom stringr str_extract str_remove_all str_squish
#' @importFrom rlang .data
#'
#' @noRd
#'
edit_match_name_ <- function(name, lookup) {
  genus <- str_extract(name, "^[^ ]+")
  genus_lookup <-
    lookup %>%
    filter(.data$genus == genus)

  similarity <- levenshteinSim(name, genus_lookup$taxon_name)

  best_idx <- which(similarity > 0.9)

  if (length(best_idx) == 0) {
    best_idx <- which(similarity == max(similarity) & similarity > 0.75)
  }

  best_name_id <- genus_lookup$plant_name_id[best_idx]
  best_name <- genus_lookup$taxon_name[best_idx]

  if (length(best_name_id) == 0) {
    similarity <- levenshteinSim(name, lookup$taxon_name)
    best_idx <- which(similarity == max(similarity) & similarity > 0.75)
    best_name_id <- lookup$plant_name_id[best_idx]
    best_name <- lookup$taxon_name[best_idx]
  }

  if (length(best_name_id) == 0) {
    best_name_id <- NA_real_
    best_name <- NA_character_
    best_similarity <- NA_real_
    match_type <- NA_character_
  } else {
    best_similarity <- round(similarity[best_idx], 3)
    match_type <- "Fuzzy (edit distance)"
  }

  data.frame(
    taxon_name = best_name,
    plant_name_id = best_name_id,
    match_similarity = best_similarity,
    match_edit_distance = as.vector(adist(name, best_name)),
    match_type = match_type,
    multiple_matches = length(best_name_id) > 1
  )
}

Any scripts or data that you put into this service are public.

rWCVP documentation built on Feb. 16, 2023, 8:30 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rWCVP
Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

R/wcvp_match_fuzzy.R
In rWCVP: Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

Defines functions edit_match_name_ edit_match phonetic_match wcvp_match_fuzzy

Documented in edit_match phonetic_match wcvp_match_fuzzy

Try the rWCVP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rWCVP Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

R/wcvp_match_fuzzy.R In rWCVP: Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

Defines functions edit_match_name_ edit_match phonetic_match wcvp_match_fuzzy

Documented in edit_match phonetic_match wcvp_match_fuzzy

Try the rWCVP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rWCVP
Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants

R/wcvp_match_fuzzy.R
In rWCVP: Generating Summaries, Reports and Plots from the World Checklist of Vascular Plants