casematcher: Matching cases between linelists

Documented in match_cases

#' Fuzzy matching of cases between linelists
#'
#' This function matches cases between linelists on specified columns using
#' user-specified matching thresholds.
#'
#' @export
#'
#' @author Finlay Campbell (\email{finlaycampbell93@@gmail.com})
#'
#' @param x Linelist 1 as a dataframe.
#'
#' @param y Linelist 2 as a dataframe.
#'
#' @param by Linelist columns to match cases on. This can be a character vector
#'   indicating column names found in both linelists, a 2-column integer matrix
#'   indicating the pairs of columns to be matched in linelist 1 and linelist 2,
#'   or a 2-column character matrix indicating the names of the columns to be
#'   matched in linelist 1 and linelist 2.
#'
#' @param max_dist A numeric vector indicating the cutoff distance for fuzzy
#'   matching of each column-pair. This can be a single value used for all
#'   column-pairs, or a vector of values indicating the cutoff for each
#'   column-pair. Distances between numeric columns are calculated as the
#'   absolute difference between values, distances between Date columns are
#'   calculated as the absolute difference in number of days and distances
#'   between character columns are calculated using the \code{stringdist}
#'   function from the \code{stringdist} package.
#'
#' @param match_fun An optional list of functions for customised evaluations of
#'   matches. Each function must accept two vectors as arguments and return a
#'   logical vector of the same length indicating whether a comparison is a
#'   match or not. The list must be of the same length as max_dist.
#'
#' @param output If "index", returns a dataframe of matched indices between the
#'   linelists. If "merged", returns a merged linelist.
#'
#' @param mode The type of join when returning a merged linelist. One of
#'   "inner", "left", "right", "full", "semi", "anti".
#' 
#' @return A dataframe of matching indices if output = "index", a merged
#'   linelist if output = "merged".
#'
#' @importFrom fuzzyjoin fuzzy_join
#' @importFrom stringdist stringdist
#'
#' @examples
#' data(sample_linelists)
#' linelist_a <- sample_linelists$linelist_a
#' linelist_b <- sample_linelists$linelist_b
#' 
#' ## examine linelists
#' head(linelist_a)
#' head(linelist_b)
#'
#' ## specify matching columns
#' by <- matrix(c("numeric_a", "numeric_b",
#'                "character_a", "character_b",
#'                "date_a", "date_b"),
#'              ncol = 2, byrow = TRUE)
#'
#' ## define thresholds
#' max_dist <- c(5, 1, 5)
#'
#' ## find matching case indices
#' matches <- match_cases(linelist_a, linelist_b, by, max_dist)
#' head(matches)
#'
#' ## merge linelists
#' linelist <- match_cases(linelist_a, linelist_b, by, max_dist, output = "merged")
#' head(linelist)
#' 
match_cases <- function(x, y, by,
                        max_dist = NULL,
                        match_fun = NULL,
                        output = c("index", "merged"),
                        mode = c("inner", "left", "right", "full", "semi", "anti")) {

  ## check by
  col_match <- assert_by(x, y, by)

  ## check max_dist
  max_dist <- assert_max_dist(max_dist, col_match)

  ## determine column classes
  classes <- assert_col_class(x, y, col_match)

  ## generate matching functions
  if(is.null(match_fun)) {
    match_fun <- mapply(create_match_fun, classes, max_dist, USE.NAMES = FALSE)
  }

  match_fun <- assert_match_fun(match_fun, col_match)

  ## keep indices from original tables
  x$x_index <- seq_len(nrow(x))
  y$y_index <- seq_len(nrow(y))
  names_x <- names(x)
  names_y <- names(y)
  names_diff <- names(x)[col_match[,1]] != names(y)[col_match[,2]]
  
  ## rename columns for joining
  match_names <- names(x)[col_match[,1]]
  names(y)[col_match[,2]] <- match_names

  ## join the dataframes
  output <- match.arg(output)
  mode <- match.arg(mode)

  if(output == "index") mode <- "inner"
  join <- fuzzy_join(x, y, match_names, match_fun, mode = mode)

  ## restore original column names that don't match
  names(join)[col_match[names_diff, 1]] <- names_x[col_match[names_diff, 1]]

  ## restore y columns if available
  if(!mode %in% c("semi", "anti")) {
    names(join)[ncol(x) + col_match[names_diff, 2]] <- names_y[col_match[names_diff, 2]]
  }

  ## format output and return
  if(output == 'index') {
    out <- stats::na.omit(join[c("x_index", "y_index")])
  } else {
    out <- join[-which(names(join) %in% c("x_index", "y_index"))]
  }

  return(out)

}