R/find_most_similar_string.R

Defines functions find_most_similar_string

Documented in find_most_similar_string

#' @title find_most_similar_string
#'
#' @description Returns a vector of most similar string in another vector.
#' Returns a vector of the same length as input vector '.s'.
#'
#' @param .s a character vector to be matched.
#' @param .t a character vector to be matched against.
#' @param max_dist the maximum string distance
#' @param ignore_case should case be ignored? Default is TRUE.
#' @param verbose should warnings be printed in the console.
#' @param feeling_lucky if multiple most similar strings are found. Should the first one be returned?
#' @param ... other arguments passed to utils::adist.
#'
#' @details Uses the generalized Levenshtein distance. For more information type \code{?utils::adist} in the console. the original source is https://github.com/davidsjoberg/similiars/blob/master/R/similiars.R
#'
#' @examples 1+1
#'
#'
#' @return a character vector
#'
#' @export
find_most_similar_string <- function(.s, .t, max_dist = Inf, verbose = TRUE, ignore_case = TRUE, feeling_lucky = FALSE, ...) {

    find_string_distance <- function(.s, .t, ignore_case = TRUE, ...) {
        if(any(!is.character(.t), !is.character(.s))) stop("'.s' and '.t' need to be character vectors")

        purrr::map(.s, function(.e) {
            .d <- utils::adist(.t, .e, ignore.case = ignore_case)
            dplyr::tibble(input_string = .e,
                          string = .t,
                          string_distance = .d[, 1]) |>
                dplyr::arrange(.data$string_distance)
        }) |>
            purrr::set_names(.s)
    }
    if(any(!is.character(.t), !is.character(.s))) stop("'.s' and '.t' need to be character vectors")
    .dfs <- find_string_distance(.s, .t, ignore_case = ignore_case)
    .dfs <- purrr::map(.dfs, function(.h) {.h |> dplyr::filter(.data$string_distance <= max_dist)})

    out <- purrr::map_chr(.dfs, ~{
        if(is.na(.x$input_string[1])) {return(NA_character_)}
        .x <- .x |> dplyr::filter(string_distance == min(string_distance))

        if( nrow(.x) > 1){
            if(feeling_lucky){

                if(verbose){
                    warning(paste0("No single most similar string found for '",
                                   .x$input_string[1],
                                   "'. Returning '", .x$string[1], "'. Other exactly similar strings were ",
                                   paste(paste0("'", .x$string[-1], "'"),
                                         collapse = ", "),
                                   "."))
                }
                return(.x$string[1])

            } else{
                if(verbose){

                    warning(paste0("No single most similar string found for '",
                                   .x$input_string[1],
                                   "'. Returning NA. Most similar strings were ",
                                   paste(paste0("'", .x$string, "'"),
                                         collapse = ", "),
                                   "."))
                }
                return(NA_character_)
            }
        }
        if( nrow(.x) == 0 ){
            if(verbose){
                warning(paste0("No similar string below threshold found for '", .x$input_string[1], "'. Returning NA.\n"))
            }
            return(NA_character_)
        }
        .x |> dplyr::pull(string)
    })
    out |> as.character()
}
fdzul/rgeomex documentation built on July 20, 2024, 7:57 p.m.