R/hmatch_composite.R

Defines functions hmatch_composite

Documented in hmatch_composite

#' Implement a variety of hierarchical matching strategies in sequence
#'
#' @description
#' Match a data frame with raw, potentially messy hierarchical data (e.g.
#' province, county, township) against a reference dataset, using a variety of
#' matching strategies implemented in sequence to identify the best-possible
#' match (i.e. highest-resolution) for each row.
#'
#' The sequence of matching strategies is:
#' 1. (optional) manually-specified matching with \code{\link{hmatch_manual}}
#' 2. complete matching with \code{\link{hmatch}(..., allow_gaps = FALSE)}
#' 3. partial matching with \code{\link{hmatch}(..., allow_gaps = TRUE)}
#' 4. fuzzy partial matching with \code{\link{hmatch}(allow_gaps = TRUE, fuzzy = TRUE)}
#' 5. best-possible matching with \code{\link{hmatch_settle}}
#'
#' Each approach is implement only on the rows of data for which a single match
#' has not already been identified using the previous approaches.
#'
#' @inheritParams hmatch_settle
#'
#' @param man (optional) data frame of manually-specified matches, relating a
#'   given set of hierarchical values to the code within `ref` to which those
#'   values correspond
#' @param pattern regex pattern to match the hierarchical columns in `raw`
#'   (and `man` if given) (see also \link{specifying_columns})
#' @param by vector giving the names of the hierarchical columns in `raw` (and
#'   `man` if given)
#' @param code_col name of the code column containing codes for matching `ref`
#'   and `man` (only required if argument `man` is given)
#' @param type type of join ("resolve_left", "resolve_inner", or
#'   "resolve_anti"). Defaults to "left". See \link{join_types}.
#'
#' @return a data frame obtained by matching the hierarchical columns in `raw`
#'   and `ref`, using the join type specified by argument `type` (see
#'   \link{join_types} for more details)
#'
#' @examples
#' data(ne_raw)
#' data(ne_ref)
#'
#' hmatch_composite(ne_raw, ne_ref, fuzzy = TRUE)
#'
#' @importFrom dplyr inner_join left_join
#' @export hmatch_composite
hmatch_composite <- function(raw,
                             ref,
                             man,
                             pattern,
                             pattern_ref = pattern,
                             by,
                             by_ref = by,
                             code_col,
                             type = "resolve_left",
                             allow_gaps = TRUE,
                             fuzzy = FALSE,
                             fuzzy_method = "osa",
                             fuzzy_dist = 1L,
                             dict = NULL,
                             ref_prefix = "ref_",
                             std_fn = string_std,
                             ...) {


  ## match args
  if (!is.null(std_fn)) std_fn <- match.fun(std_fn)
  type <- match.arg(type, c("resolve_left", "resolve_inner", "resolve_anti"))

  ## save original colnames of raw
  names_raw_orig <- names(raw)

  ## temporary columns to aid in matching
  temp_col_id <- "TEMP_ROW_ID_COMPOSITE"
  temp_col_code <- "TEMP_COL_CODE_COMPOSITE"

  raw[[temp_col_id]] <- seq_len(nrow(raw))

  ## if 'raw' is a spatial data frame of class "sf"
  raw_is_sf <- "sf" %in% class(raw)

  if (raw_is_sf) {
    raw_sf <- raw
    raw <- sf::st_drop_geometry(raw)
  }

  ## identify hierarchical columns to match, and rename ref cols if necessary
  prep <- prep_match_columns(
    raw = raw,
    ref = ref,
    pattern = pattern,
    pattern_ref = pattern_ref,
    by = by,
    by_ref = by_ref,
    ref_prefix = ref_prefix,
    code_col = temp_col_code
  )

  ## add standardized columns for joining
  raw_join <- add_join_columns(
    dat = raw,
    by = prep$by_raw,
    join_cols = prep$by_raw_join,
    std_fn = std_fn,
    ...
  )

  ref_join <- add_join_columns(
    dat = prep$ref,
    by = prep$by_ref,
    join_cols = prep$by_ref_join,
    std_fn = std_fn,
    ...
  )

  ## implement dictionary recoding on join columns
  if (!is.null(dict)) {
    raw_join <- apply_dict(
      raw_join,
      dict,
      by_raw = prep$by_raw,
      by_join = prep$by_raw_join,
      std_fn = std_fn
    )
  }

  ## initiate empty match dfs
  m_manual <- m_complete <- m_partial <- m_fuzzy <- m_settle <- NULL

  ## initial df to store remaining unmatched rows of raw
  raw_join_remaining <- raw_join

  ## manual match
  if (!missing(man) && !is.null(man)) {

    ## join ref to man by code_col
    man_ref <- dplyr::inner_join(
      prep$ref,
      man,
      by = code_col
    )

    man_join <- add_join_columns(
      dat = man_ref,
      by = prep$by_raw,
      join_cols = prep$by_raw_join,
      std_fn = std_fn,
      ...
    )

    ## run main matching routines
    m_manual <- hmatch_manual_(
      raw_join = raw_join,
      man_join = man_join,
      by_raw = prep$by_raw,
      by_ref = prep$by_ref,
      by_join = prep$by_raw_join,
      type = "inner",
      class_raw = class(raw)
    )

    m_manual <- m_manual[,c(temp_col_id, temp_col_code)]
    m_manual$match_type <- rep("manual", nrow(m_manual))

    unmatched <- !raw_join_remaining[[temp_col_id]] %in% m_manual[[temp_col_id]]
    raw_join_remaining <- raw_join_remaining[unmatched,]
  }

  ## complete non-fuzzy match
  # (run even if no rows remaining to get column template for later join)
  m_complete <- hmatch_(
    raw_join = raw_join_remaining,
    ref_join = ref_join,
    by_raw = prep$by_raw,
    by_ref = prep$by_ref,
    by_raw_join = prep$by_raw_join,
    by_ref_join = prep$by_ref_join,
    type = "resolve_inner",
    allow_gaps = FALSE,
    fuzzy = FALSE
  )

  m_complete <- m_complete[,c(temp_col_id, temp_col_code)]
  m_complete$match_type <- rep("complete", nrow(m_complete))

  unmatched <- !raw_join_remaining[[temp_col_id]] %in% m_complete[[temp_col_id]]
  raw_join_remaining <- raw_join_remaining[unmatched,]

  ## partial non-fuzzy match
  if (nrow(raw_join_remaining) > 0 & allow_gaps) {

    m_partial <- hmatch_(
      raw_join = raw_join_remaining,
      ref_join = ref_join,
      by_raw = prep$by_raw,
      by_ref = prep$by_ref,
      by_raw_join = prep$by_raw_join,
      by_ref_join = prep$by_ref_join,
      type = "resolve_inner",
      allow_gaps = TRUE,
      fuzzy = FALSE
    )

    m_partial <- m_partial[,c(temp_col_id, temp_col_code)]
    m_partial$match_type <- rep("gaps", nrow(m_partial))

    unmatched <- !raw_join_remaining[[temp_col_id]] %in% m_partial[[temp_col_id]]
    raw_join_remaining <- raw_join_remaining[unmatched,]
  }

  ## partial fuzzy match
  if (nrow(raw_join_remaining) > 0) {

    m_fuzzy <- hmatch_(
      raw_join = raw_join_remaining,
      ref_join = ref_join,
      by_raw = prep$by_raw,
      by_ref = prep$by_ref,
      by_raw_join = prep$by_raw_join,
      by_ref_join = prep$by_ref_join,
      type = "resolve_inner",
      allow_gaps = allow_gaps,
      fuzzy = TRUE,
      fuzzy_method = fuzzy_method,
      fuzzy_dist = fuzzy_dist
    )

    m_fuzzy <- m_fuzzy[,c(temp_col_id, temp_col_code)]
    m_fuzzy$match_type <- rep("fuzzy", nrow(m_fuzzy))

    unmatched <- !raw_join_remaining[[temp_col_id]] %in% m_fuzzy[[temp_col_id]]
    raw_join_remaining <- raw_join_remaining[unmatched,]
  }

  ## settle join
  if (nrow(raw_join_remaining) > 0) {

    m_settle <- hmatch_settle_(
      raw_join = raw_join_remaining,
      ref_join = ref_join,
      by_raw = prep$by_raw,
      by_ref = prep$by_ref,
      by_raw_join = prep$by_raw_join,
      by_ref_join = prep$by_ref_join,
      allow_gaps = allow_gaps,
      type = "resolve_inner",
      fuzzy = fuzzy,
      fuzzy_method = fuzzy_method,
      fuzzy_dist = fuzzy_dist
    )

    # note that with resolve join m_settle returns correct by_ref cols but
    # not necessarily correct temp code col
    m_settle <- m_settle[,c(temp_col_id, prep$by_ref)]
    m_settle <- dplyr::left_join(m_settle, prep$ref, by = prep$by_ref)
    m_settle <- m_settle[,c(temp_col_id, temp_col_code)]
    m_settle$match_type <- rep("settle", nrow(m_settle))
  }

  ## combine results from all match types
  m_full <- rbind.data.frame(
    m_manual,
    m_complete,
    m_partial,
    m_fuzzy,
    m_settle
  )

  ## merge to ref
  m_bind_ref <- dplyr::left_join(m_full, prep$ref, by = temp_col_code)
  m_bind_ref <- m_bind_ref[,c(temp_col_id, names(prep$ref), "match_type")]

  ## merge to raw
  if (raw_is_sf) raw <- raw_sf
  out <- dplyr::left_join(raw, m_bind_ref, by = temp_col_id)

  ## execute match type and remove temporary columns
  prep_output(
    x = out,
    type = gsub("^resolve_", "", type),
    temp_col_id = temp_col_id,
    temp_col_match = temp_col_code,
    cols_raw_orig = names_raw_orig,
    class_raw = class(raw),
    by_raw = prep$by_raw,
    by_ref = prep$by_ref
  )
}
epicentre-msf/hmatch documentation built on Nov. 15, 2023, 1:47 a.m.