R/split_block.R

Defines functions split_block

# Generated by fusen: do not edit by hand

#' Split to Blocks
#'
#' Partitions the source table into blocks based on the number of characters
#' in the first matching column. Source lengths are capped at the longest
#' (non-missing) target length before the blocks are formed, so that all
#' source values longer than any target value end up in the last block.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.\cr
#' Only the first element is used to compute the blocks.
#' @param .char_block
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size
#'   (not read by this function; only the first element is used here).
#'
#' @return
#' A list with two elements:\cr
#' - \code{ls}: a named list of source dataframes, one per block. Names have
#'   the form \code{"min-max"}, i.e. the smallest and largest (capped)
#'   character count in the block, left-padded with zeros to width 3.\cr
#' - \code{tt}: the target table as returned by \code{prep_tables()}, with an
#'   additional column \code{n__} holding the character count of the first
#'   matching column.
#'
#' @noRd
#' @examples
#' tab_source <- table_source[1:100, ]
#' tab_target <- table_target[1:999, ]
#' cols_match <- c("name", "iso3", "city", "address")
#' char_block <- c(25, 5)
#'
#' split_block(
#'   .source = tab_source,
#'   .target = tab_target,
#'   .cols_match = cols_match,
#'   .char_block = char_block
#' )
split_block <- function(.source, .target, .cols_match, .char_block) {
  # Bind the names used through non-standard evaluation so that R CMD check
  # does not report them as undefined global variables.
  n__ <- b__ <- NULL
  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  # Only the first matching column drives the blocking.
  col_ <- dplyr::sym(.cols_match[1])
  size_ <- .char_block[1]

  t_ <- dplyr::mutate(target_, n__ = nchar(!!col_))
  s_ <- dplyr::mutate(source_, n__ = nchar(!!col_))

  # Cap source lengths at the longest target length. Missing target lengths
  # are ignored: a single NA would otherwise turn the cap into NA and
  # collapse every source row into one "NA-NA" block. With no usable target
  # length at all (e.g. an empty target) the cap is skipped.
  len_t_ <- t_$n__[!is.na(t_$n__)]
  if (length(len_t_) > 0) {
    max_t_ <- max(len_t_)
    s_ <- dplyr::mutate(s_, n__ = pmin(n__, !!max_t_))
  }

  s_ <- s_ %>%
    dplyr::arrange(n__) %>%
    dplyr::mutate(b__ = floor(n__ / !!size_)) %>%
    dplyr::group_by(b__) %>%
    # Rows are sorted by n__, so first/last are the block's min/max length.
    dplyr::mutate(b__ = paste0(
      stringi::stri_pad_left(dplyr::first(n__), width = 3, pad = "0"),
      "-",
      stringi::stri_pad_left(dplyr::last(n__), width = 3, pad = "0")
    )) %>%
    dplyr::ungroup()

  list(
    ls = split(dplyr::select(s_, -c(n__, b__)), s_$b__),
    tt = t_
  )
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.