R/close_names.R

Defines functions find_close_strings display_close_names

Documented in display_close_names find_close_strings

#!/usr/bin/Rscript
# -*- coding: utf-8 -*-

################################ Description ###################################
# Title: Find close names
# Purpose: Find character strings that are close to each other, which are likely the result of misspellings
# Created on 2014-12-30
# by Joris Muller <joris.muller@jom.link>
# Licence: GPLv3 <http://www.gnu.org/licenses/>
################################################################################


#' @title Find close strings
#' @description Find close strings in a character vector (for example, the
#'   labels of a factor). It could be useful if you try to find misspelled
#'   words.
#' @param char A character vector (other inputs are coerced with
#'   \code{as.character}). Duplicated and missing values are ignored.
#' @param nb_subst A single number. Two words are reported as close when the
#'   distance computed by \code{stringdistmatrix} (with its default method) is
#'   strictly greater than 0 and strictly lower than \code{nb_subst}.
#' @return A named list with one element per unique word. Each element is a
#'   character vector of the other words close to it (\code{character(0)} when
#'   there are none). The result is always a list, whatever the number of
#'   close words found.
#' @author Joris Muller
#' @import stringdist
#' @export
#' @examples
#' chaine2 <- c("mainson", "maison", "cave", "caves", "Cave", "Hôpital", "Hopital", "Bachibouzouk")
#' the_list <- find_close_strings(chaine2)
#' print(the_list)
find_close_strings <- function(char, nb_subst = 3) {

    stopifnot(is.numeric(nb_subst), length(nb_subst) == 1L, !is.na(nb_subst))

    # Only keep the unique, non-missing words
    words <- unique(as.character(char))
    words <- words[!is.na(words)]

    # Nothing to compare: return an empty (named) list without computing
    # a distance matrix
    if (length(words) == 0L) {
        empty <- list()
        names(empty) <- character(0)
        return(empty)
    }

    # Calculate the distance matrix (one row and one column per word)
    dist_matrix <- stringdistmatrix(words, words)

    # Walk the rows with lapply() rather than apply(): apply() simplifies its
    # result to a vector or a matrix whenever every word has the same number
    # of neighbours, which breaks the "always a list" contract.
    close_list <- lapply(seq_along(words), function(i) {
        distances <- dist_matrix[i, ]
        # which() discards comparisons that evaluate to NA
        words[which(distances > 0 & distances < nb_subst)]
    })
    names(close_list) <- words

    close_list
}


#' @title Display close names
#' @description Turn the result of \code{find_close_strings} into
#'   human-readable sentences, one per word that has at least one close word.
#' @param close_list A named list as produced by \code{find_close_strings}:
#'   each name is a word, each element the words close to it.
#' @param sentence Text inserted between the tested word and the list of
#'   words close to it.
#' @return A character vector with one sentence per word having close words;
#'   words without any close word are skipped.
#' @author Joris Muller
#' @export
#' @examples
#' chaine2 <- c("mainson", "maison", "cave", "caves", "Cave", "Hôpital", "Hopital", "Bachibouzouk")
#' the_list <- find_close_strings(chaine2)
#' display_close_names(the_list)
display_close_names <- function(close_list, sentence = "is close to") {

    # Build the sentence for one word, or nothing if it has no close words
    describe_word <- function(word) {
        neighbours <- close_list[[word]]

        if (length(neighbours) == 0) {
            return(character())
        }

        quoted_neighbours <- paste0(neighbours, collapse = "', '")
        paste0("'", word, "' ", sentence, " '", quoted_neighbours, "'.")
    }

    sentences <- lapply(names(close_list), describe_word)

    # c(character(), ...) guarantees a character vector even when
    # no sentence was produced
    c(character(), unlist(sentences, use.names = FALSE))
}
jomuller/dfcheck documentation built on May 19, 2019, 7:26 p.m.