#' Value merging based on ngram fingerprints
#'
#' This function takes a character vector and makes edits and merges values
#' that are approximately equivalent yet not identical. It uses a two step
#' process, the first is clustering values based on their ngram fingerprint (described here
#' \url{https://openrefine.org/docs/technical-reference/clustering-in-depth}).
#' The second step is merging values based on approximate string matching of
#' the ngram fingerprints, using the [sd_lower_tri()] C function from the
#' package \code{stringdist}.
#'
#' @param vect Character vector, items to be potentially clustered and merged.
#' @param numgram Numeric value, indicating the number of characters that
#' will occupy each ngram token. Default value is 2.
#' @param ignore_strings Character vector, these strings will be ignored during
#' the merging of values within \code{vect}. Default value is NULL.
#' @param bus_suffix Logical, indicating whether the merging of records should
#' be insensitive to common business suffixes or not. Default value is TRUE.
#' @param edit_threshold Numeric value, indicating the threshold at which a
#' merge is performed, based on the sum of the edit values derived from
#' param \code{weight}. Default value is 1. If this parameter is
#' set to 0 or NA, then no approximate string matching will be done, and all
#' merging will be based on strings that have identical ngram fingerprints.
#' @param weight Numeric vector, indicating the weights to assign to
#' the four edit operations (see details below), for the purpose of
#' approximate string matching. Default values are
#' c(d = 0.33, i = 0.33, s = 1, t = 0.5). This parameter gets passed along
#' to the \code{stringdist} function. Must be either
#' a numeric vector of length four, or NA.
#' @param ... additional args to be passed along to the \code{stringdist}
#' function. The acceptable args are identical to those of
#' [stringdistmatrix()].
#'
#' @details The values of arg \code{weight} are edit distance values that
#' get passed to the \code{stringdist} edit distance function. The
#' param takes four arguments, each one is a specific type of edit, with
#' default penalty value.
#' \itemize{
#' \item d: deletion, default value is 0.33
#' \item i: insertion, default value is 0.33
#' \item s: substitution, default value is 1
#' \item t: transposition, default value is 0.5
#' }
#'
#' @return Character vector with similar values merged.
#' @export
#'
#' @examples
#' x <- c("Acme Pizza, Inc.", "ACME PIZA COMPANY", "Acme Pizzazza LLC")
#'
#' n_gram_merge(vect = x)
#'
#' # The performance of the approximate string matching can be ajusted using
#' # parameters 'weight' or 'edit_threshold'
#' n_gram_merge(vect = x,
#' weight = c(d = 0.4, i = 1, s = 1, t = 1))
#'
#' # Use parameter 'ignore_strings' to ignore specific strings during merging
#' # of values.
#' x <- c("Bakersfield Highschool", "BAKERSFIELD high",
#' "high school, bakersfield")
#' n_gram_merge(vect = x, ignore_strings = c("high", "school", "highschool"))
#'
n_gram_merge <- function(vect, numgram = 2, ignore_strings = NULL,
bus_suffix = TRUE, edit_threshold = 1,
weight = c(d = 0.33, i = 0.33, s = 1, t = 0.5), ...) {
# Input validation.
stopifnot(is.character(vect))
stopifnot(is.numeric(numgram))
stopifnot(is.numeric(edit_threshold) || is.na(edit_threshold))
stopifnot(is.logical(bus_suffix))
stopifnot(is.null(ignore_strings) || is.character(ignore_strings))
if (all(!is.na(weight))) {
if (!is.numeric(weight) && length(weight) != 4) {
stop("param 'weight' must be either a numeric vector with ",
"length four, or NA", call. = FALSE)
} else {
weight <- as.double(weight)
}
}
if ((!is.na(edit_threshold) && edit_threshold == 0) ||
numgram == 1) {
edit_threshold <- NA
}
edit_threshold_missing <- is.na(edit_threshold)
if (!edit_threshold_missing && any(is.na(weight))) {
stop("param 'weight' must not be NA if 'edit_threshold'is not NA",
call. = FALSE)
}
# If any args were passed via ellipsis, check to make sure they are valid
# stringdist args.
dots <- list(...)
dots_names <- names(dots)
if (any(c("a", "b") %in% dots_names)) {
stop("'stringdist' args 'a' and 'b' cannot be set manually",
call. = FALSE)
}
# Vector of valid arg names for stringdist.
sdm_args <- c("method", "useBytes", "weight", "q", "p", "bt", "useNames",
"nthread")
if (!all(dots_names %in% sdm_args)) {
bad_args <- paste(
dots_names[!dots_names %in% sdm_args],
collapse = ", "
)
stop(paste("these input arg(s) are invalid:", bad_args), call. = FALSE)
}
# More input validations for stringdist args.
if (!edit_threshold_missing) {
if (!"method" %in% dots_names) {
method <- 1L
} else {
sdm_methods <- c(osa = 0L, lv = 1L, dl = 2L, hamming = 3L, lcs = 4L,
qgram = 5L, cosine = 6L, jaccard = 7L, jw = 8L,
soundex = 9L)
if (!dots$method %in% names(sdm_methods)) {
stop(
sprintf("arg 'method' must be one of:\n%s",
paste(names(sdm_methods), collapse = ", ")),
call. = FALSE
)
}
method <- sdm_methods[dots$method]
}
if (!"nthread" %in% dots_names) {
nthread <- getOption("sd_num_thread")
} else {
stopifnot(is.numeric(dots$nthread) && dots$nthread > 0)
nthread <- as.integer(dots$nthread)
}
if (!"useBytes" %in% dots_names) {
useBytes <- FALSE
} else {
stopifnot(is.logical(dots$useBytes))
useBytes <- dots$useBytes
}
if (!"q" %in% dots_names) {
q <- 1
} else {
stopifnot(dots$q >= 0)
q <- as.integer(dots$q)
}
if (!"p" %in% dots_names) {
p <- 0
} else {
stopifnot(dots$p <= 0.25 && dots$p >= 0)
p <- as.double(dots$p)
}
if (!"bt" %in% dots_names) {
bt <- 0
} else {
stopifnot(is.numeric(dots$bt))
bt <- as.double(dots$bt)
}
}
# If ignore_strings is not NULL, make all values lower case then get uniques.
if (!is.null(ignore_strings)) {
ignore_strings <- unique(
cpp_tolower(ignore_strings[!is.na(ignore_strings)])
)
}
# If approx string matching is being used, then get ngram == 1 keys for all
# records.
univect <- cpp_unique(vect[!is.na(vect)])
if (!edit_threshold_missing) {
one_gram_keys <- get_fingerprint_ngram(univect, numgram = 1, bus_suffix,
ignore_strings)
} else {
one_gram_keys <- NULL
}
# Get ngram == numgram keys for all records.
n_gram_keys <- get_fingerprint_ngram(univect, numgram = numgram, bus_suffix,
ignore_strings)
# If approximate string matching is not being used, return output of
# ngram_merge_no_approx().
if (edit_threshold_missing) {
return(ngram_merge_no_approx(n_gram_keys, univect, vect))
}
# If approximate string matching is enabled, call ngram_merge_approx(). This
# will do the following:
# 1. Get initial clusters by finding all elements of n_gram_keys for which
# their associated one_gram_key has one or more matches within the entire
# list of one_gram_keys.
# 2. Create a stringdistmatrix for every initial cluster, then filter
# clusters based on the dist matrices.
# 3. For each remaining cluster, make mass edits to the values of vect
# related to that cluster. Return vect after mass edits have been made.
ngram_merge_approx(n_gram_keys, one_gram_keys, univect, vect, edit_threshold,
method, weight, p, bt, q, useBytes, nthread)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.