#' Fuzzy matching of cases between linelists
#'
#' This function matches cases between linelists on specified columns using
#' user-specified matching thresholds.
#'
#' @export
#'
#' @author Finlay Campbell (\email{finlaycampbell93@@gmail.com})
#'
#' @param x Linelist 1 as a dataframe.
#'
#' @param y Linelist 2 as a dataframe.
#'
#' @param by Linelist columns to match cases on. This can be a character vector
#' indicating column names found in both linelists, a 2-column integer matrix
#' indicating the pairs of columns to be matched in linelist 1 and linelist 2,
#' or a 2-column character matrix indicating the names of the columns to be
#' matched in linelist 1 and linelist 2.
#'
#' @param max_dist A numeric vector indicating the cutoff distance for fuzzy
#' matching of each column-pair. This can be a single value used for all
#' column-pairs, or a vector of values indicating the cutoff for each
#' column-pair. Distances between numeric columns are calculated as the
#' absolute difference between values, distances between Date columns are
#' calculated as the absolute difference in number of days and distances
#' between character columns are calculated using the \code{stringdist}
#' function from the \code{stringdist} package.
#'
#' @param match_fun An optional list of functions for customised evaluations of
#' matches. Each function must accept two vectors as arguments and return a
#' logical vector of the same length indicating whether a comparison is a
#' match or not. The list must be of the same length as max_dist.
#'
#' @param output If "index", returns a dataframe of matched indices between the
#' linelists. If "merged", returns a merged linelist.
#'
#' @param mode The type of join when returning a merged linelist. One of
#' "inner", "left", "right", "full", "semi", "anti".
#'
#' @return A dataframe of matching indices if output = "index", a merged
#' linelist if output = "merged".
#'
#' @importFrom fuzzyjoin fuzzy_join
#' @importFrom stringdist stringdist
#'
#' @examples
#' data(sample_linelists)
#' linelist_a <- sample_linelists$linelist_a
#' linelist_b <- sample_linelists$linelist_b
#'
#' ## examine linelists
#' head(linelist_a)
#' head(linelist_b)
#'
#' ## specify matching columns
#' by <- matrix(c("numeric_a", "numeric_b",
#' "character_a", "character_b",
#' "date_a", "date_b"),
#' ncol = 2, byrow = TRUE)
#'
#' ## define thresholds
#' max_dist <- c(5, 1, 5)
#'
#' ## find matching case indices
#' matches <- match_cases(linelist_a, linelist_b, by, max_dist)
#' head(matches)
#'
#' ## merge linelists
#' linelist <- match_cases(linelist_a, linelist_b, by, max_dist, output = "merged")
#' head(linelist)
#'
match_cases <- function(x, y, by,
max_dist = NULL,
match_fun = NULL,
output = c("index", "merged"),
mode = c("inner", "left", "right", "full", "semi", "anti")) {
## check by
col_match <- assert_by(x, y, by)
## check max_dist
max_dist <- assert_max_dist(max_dist, col_match)
## determine column classes
classes <- assert_col_class(x, y, col_match)
## generate matching functions
if(is.null(match_fun)) {
match_fun <- mapply(create_match_fun, classes, max_dist, USE.NAMES = FALSE)
}
match_fun <- assert_match_fun(match_fun, col_match)
## keep indices from original tables
x$x_index <- seq_len(nrow(x))
y$y_index <- seq_len(nrow(y))
names_x <- names(x)
names_y <- names(y)
names_diff <- names(x)[col_match[,1]] != names(y)[col_match[,2]]
## rename columns for joining
match_names <- names(x)[col_match[,1]]
names(y)[col_match[,2]] <- match_names
## join the dataframes
output <- match.arg(output)
mode <- match.arg(mode)
if(output == "index") mode <- "inner"
join <- fuzzy_join(x, y, match_names, match_fun, mode = mode)
## restore original column names that don't match
names(join)[col_match[names_diff, 1]] <- names_x[col_match[names_diff, 1]]
## restore y columns if available
if(!mode %in% c("semi", "anti")) {
names(join)[ncol(x) + col_match[names_diff, 2]] <- names_y[col_match[names_diff, 2]]
}
## format output and return
if(output == 'index') {
out <- stats::na.omit(join[c("x_index", "y_index")])
} else {
out <- join[-which(names(join) %in% c("x_index", "y_index"))]
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.