R/fuzzy_match.R
In fedmatch: Fast, Flexible, and User-Friendly Record Linkage Methods

Documented in fuzzy_match

#' Use string distances to match on names
#'
#' Use the \code{stringdist} package to perform a fuzzy match on two datasets.
#'
#' \code{stringdist} \code{amatch} computes string distances between every
#' pair of strings in two vectors, then picks the closest string pair for each
#' observation in the dataset. This is used by \code{fuzzy_match} to perform
#' a string distance-based match between two datasets. This process can take quite a long time,
#' for quicker matches try adjusting the \code{nthread} argument in \code{fuzzy_settings}.
#' The default fuzzy_settings are sensible starting points for company name matching,
#' but adjusting these can greatly change how the match performs.
#'
#' @param data1 data.frame. First to-merge dataset.
#' @param data2 data.frame. Second to-merge dataset.
#' @param by character string. Variables to merge on (common across data 1 and data 2). See \code{merge}
#' @param by.x character string. Variable to merge on in data1. See \code{merge}
#' @param by.y character string. Variable to merge on in data2. See \code{merge}
#' @param suffixes character vector with length==2. Suffix to add to like named variables after the merge. See \code{merge}
#' @param unique_key_1 character vector. Primary key of data1 that uniquely identifies each row (can be multiple fields)
#' @param unique_key_2 character vector. Primary key of data2 that uniquely identifies each row (can be multiple fields)
#' @param fuzzy_settings list of arguments to pass to to the fuzzy matching function. See \code{amatch}.
#' @return a data.table, the resultant merged data set, including all columns from both data sets.
#' @import data.table

fuzzy_match <- function(data1,
                        data2,
                        by = NULL,
                        by.x = NULL,
                        by.y = NULL,
                        suffixes,
                        unique_key_1,
                        unique_key_2,
                        fuzzy_settings = list(
                          method = "jw", p = 0.1, maxDist = 0.05,
                          matchNA = FALSE, nthread = getOption("sd_num_thread")
                        )) {

  # FOR TESTING
  # data1 <- data.table(
  #   name = c("abaf", "abe", "abgegh"),
  #   id1 = c(1:3)
  # )
  # data2 <- data.table(
  #   name = c("abc"),
  #   id2 = c(1)
  # )
  # by <- NULL
  # by.x <- "name"
  # by.y <- "name"
  # suffixes <- c("_1", "_2")
  # unique_key_1 <- "id1"
  # unique_key_2 <- "id2"
  # fuzzy_settings <- list(
  #   method = "jw", p = 0.1, maxDist = 0.9,
  #   matchNA = FALSE, nthread = getOption("sd_num_thread")
  # )
  # FOR TESTING
  # check that id's aren't the same
  if (unique_key_1 == unique_key_2) {
    stop("Error: unique_key_1 must not equal unique_key_2")
  }
  if (is.null(by) & (is.null(by.x) | is.null(by.y))) {
    stop("Error: either 'by' or both of 'by.x' and 'by.y' must not be NULL")
  }
  if (!is.null(by) & (!is.null(by.x) | !is.null(by.y))) {
    stop("both 'by' and ('by.x' or 'by.y') were supplied, this is not allowed")
  }
  if (!is.null(by)) {
    by.x <- by
    by.y <- by
  }

  # make sure data1 and data2 are data.table's and not data.frames
  data1 <- data.table(data1)
  data2 <- data.table(data2)
  # the amatch options
  if (fuzzy_settings[["method"]] %in% c(
    "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine",
    "jaccard", "jw", "soundex"
  )) {
    if (length(fuzzy_settings[["nthread"]]) <= 0) {
      fuzzy_settings[["nthread"]] <- 1
    }
    # print(fuzzy_settings[["nthread"]])
    match_indices <- do.call(stringdist::amatch, c(list(data1[[by.x]], data2[[by.y]]), fuzzy_settings))

    # our special weighted jaccard method
  } else if (fuzzy_settings[["method"]] == "wgt_jaccard") {
    if (length(fuzzy_settings[["nthread"]]) <= 0) {
      fuzzy_settings[["nthread"]] <- 1
    }
    # print(fuzzy_settings[["nthread"]])
    corpus <- build_corpus(data1[[by.x]], data2[[by.y]])
    jaccard_result <- wgt_jaccard(
      corpus$word, corpus$log_inv_freq,
      data1[[by.x]], data2[[by.y]], fuzzy_settings[["nthread"]]
    )
    # filter out matches that are under the minimum threshold
    jaccard_index <- jaccard_result$index
    jaccard_sim <- jaccard_result$similarity
    jaccard_index[which(jaccard_sim < 1 - fuzzy_settings[["maxDist"]])] <- NA

    # jaccard_index <- jaccard_index[which(jaccard_sim >= 1 - fuzzy_settings[["maxDist"]])]
    jaccard_sim <- jaccard_sim[which(jaccard_sim >= 1 - fuzzy_settings[["maxDist"]])]


    # this is done in C, so add 1 to the index to make it match
    match_indices <- jaccard_index + 1
    # score_table <- data.table(wgt_jaccard_sim = jaccard_sim)    # score_table <- data.table(wgt_jaccard_sim = jaccard_sim)
  } else {
    stop("Fuzzy 'method' must be one of those in stringdist::amatch's 'method' argument, or 'wgt_jaccard'.")
  }

  # the x-indices which are matched to y
  x_index <- seq(1, length(match_indices))[!is.na(match_indices)]
  y_index <- match_indices[!is.na(match_indices)]

  rows_from_1 <- copy(data1[x_index, c(by.x, unique_key_1), with = F])
  rows_from_2 <- copy(data2[y_index, c(by.y, unique_key_2), with = F])

  if (by.x == by.y) {
    setnames(rows_from_1, c(paste0(by.x, suffixes[[1]]), unique_key_1))
    setnames(rows_from_2, c(paste0(by.x, suffixes[[2]]), unique_key_2))
  } else {
    setnames(rows_from_1, c(by.x, unique_key_1))
    setnames(rows_from_2, c(by.y, unique_key_2))
  }
  matched_and_ids <- cbind(rows_from_1, rows_from_2)
  # if (fuzzy_settings[["method"]] == "wgt_jaccard") {
  #   matched_and_ids <- cbind(matched_and_ids, score_table)
  # }

  # data1[, c(by.x) := NULL]
  # data2[, c(by.y) := NULL]

  first_merge <- merge(data1[, !c(by.x), with = F], matched_and_ids, by = unique_key_1)

  final_merge <- merge(first_merge, data2[, !c(by.y), with = F], by = unique_key_2, suffixes = suffixes)
  # dedupe
  # keys_to_calc <- final_merge[, .N, c(unique_key_2)][N > 1][[unique_key_2]]
  # # do we need to dedupe?
  # if (length(keys_to_calc) != 0) {
  #   # include suffixes
  #   if (by.x == by.y) {
  #     if (fuzzy_settings[["method"]] != "wgt_jaccard") {
  #       setkeyv(final_merge, c(unique_key_2))
  #       no_compare <- final_merge[!data.table(keys_to_calc)]
  #       to_compare <- final_merge[data.table(keys_to_calc)]
  #       sim <- N <- NULL # due to NSE notes in R CMD check
  #       to_compare[, sim := 1 - do.call(
  #         stringdist::stringdist,
  #         c(
  #           list(to_compare[[paste0(by.x, suffixes[[1]])]]),
  #           list(to_compare[[paste0(by.y, suffixes[[2]])]]),
  #           fuzzy_settings[c("method", "p")]
  #         )
  #       )]
  #       to_compare <- to_compare[order(-sim), .SD[1], c(unique_key_2)]
  #       to_compare[, sim := NULL]
  #       final_merge <- rbind(to_compare, no_compare)
  #     } else {
  #       setkeyv(final_merge, c(unique_key_2))
  #       no_compare <- final_merge[!data.table(keys_to_calc)]
  #       to_compare <- final_merge[data.table(keys_to_calc)]
  #       rm(final_merge)
  #       corpus <- build_corpus(data1[[by.x]], data2[[by.y]])
  #       to_compare[, sim := 1 - wgt_jaccard_distance(to_compare[[paste0(by.x, suffixes[[1]])]],
  #         to_compare[[paste0(by.y, suffixes[[2]])]],
  #         corpus = corpus, nthreads = fuzzy_settings$nthread
  #       )]
  #
  #       to_compare <- to_compare[order(-sim), .SD[1], c(unique_key_2)]
  #       to_compare[, sim := NULL]
  #       final_merge <- rbind(to_compare, no_compare)
  #     }
  #   } else if (by.x != by.y) {
  #     # no suffixes needed
  #     keys_to_calc <- final_merge[, .N, c(unique_key_2)][N > 1][[unique_key_2]]
  #     if (fuzzy_settings[["method"]] != "wgt_jaccard") {
  #       setkeyv(final_merge, c(unique_key_2))
  #       no_compare <- final_merge[!data.table(keys_to_calc)]
  #       to_compare <- final_merge[data.table(keys_to_calc)]
  #       to_compare[, sim := 1 - do.call(
  #         stringdist::stringdist,
  #         c(
  #           list(to_compare[[by.x]]),
  #           list(to_compare[[by.y]]),
  #           fuzzy_settings[c("method", "p")]
  #         )
  #       )]
  #       to_compare <- to_compare[order(-sim), .SD[1], c(unique_key_2)]
  #       to_compare[, sim := NULL]
  #       final_merge <- rbind(to_compare, no_compare)
  #     } else {
  #       setkeyv(final_merge, c(unique_key_2))
  #       no_compare <- final_merge[!data.table(keys_to_calc)]
  #       to_compare <- final_merge[data.table(keys_to_calc)]
  #       corpus <- build_corpus(data1[[by.x]], data2[[by.y]])
  #       to_compare[, sim := 1 - wgt_jaccard_distance(to_compare[[by.x]],
  #         to_compare[[by.y]],
  #         corpus = corpus, nthreads = fuzzy_settings$nthread
  #       )]
  #       to_compare <- to_compare[order(-sim), .SD[1], c(unique_key_2)]
  #       to_compare[, sim := NULL]
  #       final_merge <- rbind(to_compare, no_compare)
  #     }
  #   }
  # }
  return(final_merge)
}