R/ri_fmatch.R

#' @title ri_fmatch function
#'
#'
#' @description
#' ver: 0.1.0
#'
#' Author: Dien Giau Bui (aka Richard)
#'
#' This function fuzzy-match 2 vector of strings and return the output vector. In this version, I allow only one key.
#'
#' We need two objects and the method to match:
#'
#' 1. name_old: is the string vector of name (that we need to match)
#'
#' 2. key: a data.frame with at least two variables: name and key (e.g, gvkey or cusip)
#'
#' 3. Method: default is "dl". Other methods:  "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex". Detail: ?stringdist::stringdist
#'
#' @param name_old a vector
#' @param key a data.frame
#' @return the matching between \code(name_old) and \code(key)
#' @export
#' @examples
#'
#' data("key")
#'
#' old <- name_old$name_old
#'
#' match <- ri_fmatch(old, key)
#'
ri_fmatch <- function(name_old, key, method_type = "dl") {
  #step 1: match
  library(parallel)
  require(stringdist)
  require(tidyverse)

  # Calculate the number of cores
  no_cores <- detectCores() - 1

  # Initiate cluster
  cl_par <- makeCluster(no_cores)
  clusterExport(cl = cl_par, varlist = c("key"))


  dl_match <- parLapply(cl_par, name_old,
                        function(x) {
                          key[stringdist::amatch(x, key$conm,
                                                 matchNA = FALSE, method = method_type,
                                                 maxDist = stringr::str_length(x)/3, nthread = getOption("sd_num_thread")),]
                        }
  )
  stopCluster(cl_par)

  #step 2: return the output
  dl_match %>% unlist() %>% matrix(., ncol = 2, byrow = T) -> A
  A <- as_tibble(A)
  A <- as_tibble(data.frame(name_old, A))
  A <- rename(A, name_old = name_old, name_matched = V1, key = V2)
  return(A)
}
diengiau/rifin documentation built on May 6, 2019, 6:01 p.m.