Setup

library(testthat); library(tidyverse); library(stringi); library(fst)
pkgload::load_all(path = here::here(), export_all = TRUE)

Functions

HELP: standardize_str

.str <- "International Business Corporation"
.op = c("space", "punct", "case", "ascii")

Function

#' Standardize Strings
#' 
#' Description
#' 
#' @param .str 
#' A string
#' @param .op 
#' One of c("space", "punct", "case", "ascii")
#'
#' @return A string
#' 
#' @export
standardize_str <- function(.str, .op = c("space", "punct", "case", "ascii")) {
  str_ <- .str

  if ("ascii" %in% .op) {
    str_ <- stringi::stri_trans_general(str_, "Latin-ASCII")
  }

  if ("punct" %in% .op) {
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "\\W", " "))
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "[[:punct:]]", " "))

    if (!"space" %in% .op) {
      str_ <- trimws(stringi::stri_replace_all_regex(str_, "([[:blank:]]|[[:space:]])+", " "))
    }
  }

  if ("space" %in% .op) {
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "([[:blank:]]|[[:space:]])+", " "))
  }

  if ("case" %in% .op) {
    str_ <- toupper(str_)
  }

  return(str_)
}

Example/Test

standardize_str(c("jkldsa   jkdhas   äää  §$ ## #'''"))
test_that("standardize_str works", {
  expect_true(inherits(standardize_str, "function")) 
})

MAIN: standardize_data

.tab <- table_source
.cols <- c("name", "iso3", "city", "address")
.op = c("space", "punct", "case", "ascii")

Function

#' Standardize Data
#' 
#' Description
#' 
#' @param .tab 
#' A dataframe (either the source or target dataframe)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching. 
#' @param .fun Function for standardization, if NULL standardize_str() is used
#'
#' @return A dataframe
#' 
#' @export
standardize_data <- function(.tab, .cols_match, .fun = NULL) {
  tab_ <- tibble::as_tibble(.tab)
  if (is.null(.fun)) {
    f_ <- standardize_str
  } else {
    f_ <- .fun
  }

  for (i in .cols_match) {
    tab_[[i]] <- f_(tab_[[i]])
  }
  return(tab_)
}

Example/Test

standardize_data(table_source, c("name", "iso3", "city", "address"))
test_that("standardize_data works", {
  expect_true(inherits(standardize_data, "function")) 
})

MAIN: extract_legal_form

.tab <- table_source
.col_name <- "name"
.col_country <- NULL
.legal_forms <- tibble::tibble(.rows = 0)
.workers = future::availableCores()

Function

#' Extract Legal Forms
#' 
#' Description
#' 
#' @param .tab 
#' A dataframe (either the source or target dataframe)
#' @param .col_name 
#' The column with firm names
#' @param .col_country 
#' Optionally, a column with iso3 country codes
#' @param .legal_forms 
#' A dataframe with legal forms
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A dataframe
#' 
#' @importFrom rlang :=
#' 
#' @export
extract_legal_form <- function(
  .tab, .col_name, .col_country = NULL, .legal_forms = data.frame(), 
  .workers = future::availableCores()
  ) {
  tmp <- legal_form_orig <- legal_form_stand <- legal_form <- name <- lf_stand <-
    lf_orig <- NULL

  .tab <- tibble::as_tibble(.tab)

  if (nrow(.legal_forms) == 0) {
    tab_lf_ <- get("legal_form_all")
  } else {
    tab_lf_ <- .legal_forms
  }

  if (is.null(.col_country)) {
    tab_lf_ <- tab_lf_ %>%
      dplyr::distinct(legal_form_orig, legal_form_stand) %>%
      dplyr::distinct(legal_form_orig, .keep_all = TRUE)
    join_by_ <- "legal_form_orig"
  } else {
    colnames(tab_lf_) <- c(.col_country, "legal_form_orig", "legal_form_stand")
    join_by_ <- c(.col_country, "legal_form_orig")
  }

  tab_ <- standardize_data(.tab, .col_name)
  lf_ <- unique(tab_lf_[["legal_form_orig"]])
  nm_ <- tab_[[.col_name]]

  f_ <- carrier::crate(function(.lf, .nm) which(endsWith(.nm, paste0(" ", .lf))))
  future::plan("multisession", workers = .workers)
  lf_ext_ <- furrr::future_map(
    .x = purrr::set_names(lf_, lf_),
    .f = ~ f_(.x, nm_),
    .options = furrr::furrr_options(seed = TRUE)
  )
  future::plan("default")
  lf_ext_ <- lf_ext_ %>%
    purrr::compact() %>%
    tibble::enframe(name = "legal_form_orig", value = "tmp") %>%
    tidyr::unnest(tmp) %>%
    dplyr::arrange(dplyr::desc(nchar(legal_form_orig))) %>%
    dplyr::distinct(tmp, .keep_all = TRUE)

  tab_ %>%
    dplyr::mutate(tmp = dplyr::row_number()) %>%
    dplyr::left_join(lf_ext_, by = "tmp") %>%
    dplyr::left_join(tab_lf_, by = join_by_) %>%
    dplyr::rename(
      lf_stand = legal_form_stand, 
      lf_orig = legal_form_orig
      ) %>%
    dplyr::relocate(lf_stand, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::relocate(lf_orig, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := trimws(
        stringi::stri_replace_last_fixed(!!dplyr::sym(.col_name), lf_orig, "")
      ),
      .after = !!dplyr::sym(.col_name)
    ) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := dplyr::if_else(
        condition = is.na(!!dplyr::sym(paste0(.col_name, "_adj"))),
        true = !!dplyr::sym(.col_name), 
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
        )) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_std")) := dplyr::if_else(
        condition = !is.na(lf_stand),
        true = paste(!!dplyr::sym(paste0(.col_name, "_adj")), lf_stand),
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
      ),
      .after = !!dplyr::sym(paste0(.col_name, "_adj"))
    ) %>%
    dplyr::select(-tmp) %>%
    dplyr::mutate(!!dplyr::sym(.col_name) := .tab[[.col_name]])


}

Example/Test

extract_legal_form(
  .tab = table_source[1:100, ], 
  .col_name = "name", 
  .col_country = "iso3",
  .workers = 1
  )
test_that("extract_legal_form works", {
  expect_true(inherits(extract_legal_form, "function")) 
})

HELP: prep_tables

Function

.tab <- table_source
.cols_match <- c("name", "iso3", "city", "address")
#' Prepare Table
#' 
#' Description
#' 
#' @param .tab 
#' A dataframe (either the source or target dataframe)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching. 
#'
#' @return
#' 
#' @noRd
prep_tables <- function(.tab, .cols_match) {
  tmp <- id <- `_id_` <- NULL
  if (!"_id_" %in% colnames(.tab)) {
    tidyr::unite(
      data = tibble::as_tibble(.tab[, c("id", .cols_match)]),
      col = tmp,
      !dplyr::matches("^id$")
    ) %>%
      dplyr::group_by(tmp) %>%
      dplyr::summarise(
        `_id_` = list(id),
        id = dplyr::first(id),
        .groups = "drop"
      ) %>%
      dplyr::select(id, `_id_`) %>%
      dplyr::left_join(.tab, by = "id")
  } else {
    .tab
  }
}

Example/Test

prep_tables(table_source, c("name", "iso3", "city", "address"))
test_that("prep_tables works", {
  expect_true(inherits(prep_tables, "function")) 
})

HELP: match_col

Function

.source <- bind_rows(table_source[1:100, ], dplyr::mutate(table_source[1:100, ], id = paste0(id, "-1")))
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.max_match <- 10
.method <- "osa"
.workers <- future::availableCores()
#' Match a on a single column
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#' @param .max_match 
#' Maximum number of matches to return (Default = 10)
#' @param .method 
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A Dataframe
match_col <- function(
  .source, .target, .cols_match, .max_match = 10, .method = "osa", 
  .workers = future::availableCores()
  ) {

  V1 <- value <- id <- name <- id_t <- sim <- NULL

  check_id(.source, .target)

  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  method_ <- match.arg(
    arg = .method,
    choices = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
  )

  tab_ <- stringdist::stringsimmatrix(
    a = source_[[.cols_match[1]]],
    b = target_[[.cols_match[1]]],
    method = method_,
    nthread = .workers
  ) %>%
    tibble::as_tibble() %>%
    dplyr::mutate(id = dplyr::row_number(), .before = V1) %>%
    tidyr::pivot_longer(!dplyr::matches("id")) %>%
    dplyr::group_by(id) %>%
    dplyr::slice_max(order_by = value, n = .max_match) %>%
    dplyr::ungroup() %>%
    dplyr::rename(id_s = id, id_t = name) %>%
    dplyr::mutate(id_t = as.integer(gsub("V", "", id_t, fixed = TRUE))) %>%
    suppressWarnings()

  tab_[["id_s"]] <- source_[["id"]][tab_[["id_s"]]]
  tab_[["id_t"]] <- target_[["id"]][tab_[["id_t"]]]
  colnames(tab_) <- c("id_s", "id_t", paste0("sim_", .cols_match[1]))
  return(tab_)
}

Example/Test

match_col(
  .source = table_source[1:100, ],
  .target = table_target[1:999, ],
  .cols_match = c("name", "iso3", "city", "address")
)
test_that("match_col works", {
  expect_true(inherits(match_col, "function")) 
})

HELP: help_match_data

.source <- bind_rows(table_source[1:100, ], dplyr::mutate(table_source[1:100, ], id = paste0(id, "-1")))
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.cols_exact = NULL
.max_match = 10
.method = "osa"
.verbose = TRUE
.workers = future::availableCores()

Function

#' Match Data
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#' @param .cols_exact 
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match 
#' Maximum number of matches to return (Default = 10)
#' @param .method 
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose 
#' Print additional information?
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#' 
#' @return A Dataframe
#' 
help_match_data <- function(
  .source, .target, .cols_match, .cols_exact = NULL, .max_match = 10,
  .method = "osa", .verbose = TRUE, .workers = future::availableCores()
  ) {

  sim <- NULL

  check_id(.source, .target)

  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)


  if (!is.null(.cols_exact)) {
    vs_ <- tidyr::unite(source_[, .cols_exact], "tmp", dplyr::everything())[["tmp"]]
    ls_ <- split(source_, vs_)

    vt_ <- tidyr::unite(target_[, .cols_exact], "tmp", dplyr::everything())[["tmp"]]
    lt_ <- split(target_, vt_)

    lt_ <- lt_[names(lt_) %in% names(ls_)]
    ls_ <- ls_[names(lt_)]
  } else {
    ls_ <- list(source_)
    lt_ <- list(target_)
  }

  if (.verbose) {
    pb <- progress::progress_bar$new(
      total = length(ls_),
      clear = FALSE, show_after = 1
    )
  }
  purrr::map2_dfr(
    .x = ls_,
    .y = lt_,
    .f = ~ {
      if (.verbose) pb$tick()
      tab_ <- match_col(
        .source = .x,
        .target = .y,
        .cols_match = .cols_match,
        .max_match = .max_match,
        .method = .method,
        .workers = .workers
      )

      if (length(.cols_match) > 1) {
        s_ <- dplyr::left_join(tab_, .x, by = c("id_s" = "id"))
        t_ <- dplyr::left_join(tab_, .y, by = c("id_t" = "id"))

        for (i in 2:length(.cols_match)) {
          cols_sim_ <- paste0("sim_", .cols_match[i])
          tab_[[cols_sim_]] <- stringdist::stringsim(
            a = s_[[.cols_match[i]]], 
            b = t_[[.cols_match[i]]], 
            method = .method
            )
        }
      }
      return(tab_)
    }
  )
}

Example/Test

help_match_data(
  .source = table_source[1:100, ], 
  .target = table_target[1:999, ], 
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_exact = NULL,
  .max_match = 10,
  .method = "osa",
  .verbose = TRUE,
  .workers = 4
  )
test_that("match_data works", {
  expect_true(inherits(help_match_data, "function")) 
})

MAIN: join_data

Function

.source = table_source[1:100, ]
.target = table_target[1:999, ]
.cols_match = c("name", "iso3", "city", "address")
.cols_join <- c("name", "iso3")
.method = "osa"
#' Perform LeftJoin on Data
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#' @param .cols_join
#' Columns to perfrom an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .method 
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @return A Dataframe
#' 
#' @export
join_data <- function(.source, .target, .cols_match, .cols_join, .method = "osa") {
  id_s <- id_t <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  s_ <- source_[, c("id", .cols_join)]
  t_ <- target_[, c("id", .cols_join)]
  non_ <- .cols_match[!.cols_match %in% .cols_join]

  tab_ <- dplyr::inner_join(s_, t_, by = .cols_join, suffix = c("_s", "_t")) %>%
    dplyr::mutate(
      dplyr::across(!dplyr::matches("^id_s$|^id_t$"), ~1)
    ) %>%
    dplyr::select(id_s, id_t, dplyr::everything()) %>%
    `colnames<-`(c("id_s", "id_t", paste0("sim_", .cols_join)))

  s_ <- dplyr::left_join(tab_, .source[, c("id", non_)], by = c("id_s" = "id"))
  t_ <- dplyr::left_join(tab_, .target[, c("id", non_)], by = c("id_t" = "id"))

  for (i in seq_len(length(non_))) {
    tab_[[paste0("sim_", non_[i])]] <- stringdist::stringsim(s_[[non_[i]]], t_[[non_[i]]], .method)
  }

  return(tab_)
}

Example/Test

join_data(
  .source = table_source,
  .target = table_target,
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_join = c("name", "iso3"),
  .method = "osa"
)
test_that("join_data works", {
  expect_true(inherits(join_data, "function")) 
})

HELP: split_block

Function

.source = table_source[1:100, ]
.target = table_target[1:999, ]
.cols_match = c("name", "iso3", "city", "address")
.char_block = c(2, 5)
#' Split to Blocks
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching. 
#' @param .char_block 
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#'
#' @return
#' A List
#' 
#' @noRd
split_block <- function(.source, .target, .cols_match, .char_block) {
  n__ <- b__ <- NULL
  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  t_ <- dplyr::mutate(target_, n__ = nchar(!!dplyr::sym(.cols_match[1])))
  max_t_ <- max(t_$n__)
  s_ <- source_ %>%
    dplyr::mutate(
      n__ = nchar(!!dplyr::sym(.cols_match[1])),
      n__ = dplyr::if_else(n__ > max_t_, max_t_, n__)
    ) %>%
    dplyr::arrange(n__) %>%
    dplyr::mutate(b__ = floor(n__ / .char_block[1])) %>%
    dplyr::group_by(b__) %>%
    dplyr::mutate(b__ = paste0(
      stringi::stri_pad_left(dplyr::first(n__), 3, 0),
      "-",
      stringi::stri_pad_left(dplyr::last(n__), 3, 0)
    )) %>%
    dplyr::ungroup()
  return(
    list(
      ls = split(dplyr::select(s_, -c(n__, b__)), s_$b__),
      tt = t_
    )
  )
}

Example/Test

tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
char_block = c(25, 5)

split_block(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .char_block = char_block
)
test_that("split_block works", {
  expect_true(inherits(split_block, "function")) 
})

HELP: filter_block

Function

#' Filter Target Dataframe for Block Sizes
#' 
#' Description
#' 
#' @param .block 
#' A block as character string (names of ls element from split_block())
#' @param .tab 
#' Target table (tt element from split_block())
#' @param .size 
#' Second element of .char_block (3. Argument of split_block())
#' @return 
#' A Dataframe
#' 
#' @noRd
filter_block <- function(.block, .tab, .size) {
  n__ <- NULL
  int_ <- as.integer(unlist(stringi::stri_split_fixed(.block, "-")))
  min_ <- int_[1] - .size
  max_ <- int_[2] + .size
  if (is.infinite(.size)) {
    return(.tab)
  } else {
    return(dplyr::filter(.tab, n__ %in% min_:max_))
  }
}

Example/Test

tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
char_block = c(25, 5)

lblock <- split_block(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .char_block = char_block
)

filter_block(
  .block = names(lblock$ls)[1],
  .tab = lblock$tt,
  .size = char_block[2]
)
test_that("filter_block works", {
  expect_true(inherits(filter_block, "function")) 
})

MAIN: match_data

Function

.source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
.target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)
.cols_match = c("name", "iso3", "city", "address")
.cols_join = c("name", "iso3")
.cols_exact = "iso3"
.max_match = 10
.method = "osa"
.verbose = TRUE
.workers = future::availableCores()
.char_block = c(2, 5)
#' Match Data
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching. 
#' @param .cols_join
#' Columns to perfrom an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .cols_exact 
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match 
#' Maximum number of matches to return (Default = 10)
#' @param .method 
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose 
#' Print additional information?
#' @param .char_block 
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#' 
#' @return A dataframe
#' 
#' @export
match_data <- function(
  .source, .target, .cols_match, .cols_join = NULL, .cols_exact = NULL, 
  .max_match = 10, .method = "osa", .verbose = TRUE, 
  .workers = future::availableCores(), .char_block = c(Inf, Inf)
  ) {
  id <- id_s <- id_t <- add_s <- `_id_` <- all_s <- all_t <- NULL


  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  if (!is.null(.cols_join)) {
    tab0_ <- join_data(source_, target_, .cols_match, .cols_join)
    s_ <- dplyr::filter(source_, !id %in% tab0_$id_s)
    t_ <- dplyr::filter(target_, !id %in% tab0_$id_t)
  } else {
    tab0_ <- tibble::tibble(id_s = "", .rows = 0)
    s_ <- source_
    t_ <- target_
  }

  tmp_ <- split_block(s_, t_, .cols_match, .char_block)

  tab1_ <- purrr::imap_dfr(
    .x = tmp_$ls,
    .f = ~ {
      if (.verbose) cat("\rCalculating Block:", .y, "     ")
      help_match_data(
        .source = .x,
        .target = filter_block(.block = .y, .tab = tmp_$tt, .size = .char_block[2]),
        .cols_match = .cols_match,
        .cols_exact = .cols_exact,
        .max_match = .max_match,
        .method = .method,
        .verbose = .verbose,
        .workers = .workers
      )
    }
  )

  out_ <- dplyr::bind_rows(tab0_, tab1_)
  out_ <- out_[, c("id_s", "id_t", paste0("sim_", .cols_match))]
  out_ %>%
    dplyr::left_join(dplyr::select(source_, id_s = id, all_s = `_id_`), by = "id_s") %>%
    dplyr::left_join(dplyr::select(target_, id_t = id, all_t = `_id_`), by = "id_t") %>%
    dplyr::select(id_s, id_t, all_s, all_t, dplyr::everything())

}

Example/Test

tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_join  <- c("name", "iso3")
cols_exact <- "iso3"

match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_join = cols_join,
  .cols_exact = cols_exact
)
test_that("match_data works", {
  expect_true(inherits(match_data, "function")) 
})

HELP: uniqueness_vec

.vec <- standardize_str(table_source[["name"]])

Function

#' Uniquness/Rarity of a vector 
#' 
#' Description
#' 
#' @param .vec A character vector
#' @param .normalize Normalize between 0 and 1
#'
#' @return A numeric vector
#' 
#' @noRd
uniqueness_vec <- function(.vec, .normalize = FALSE) {
  value <- name <- n <- NULL

  l1_ <- stringi::stri_split_fixed(.vec, " ")
  v1_ <- unlist(l1_)
  v1_ <- as.integer(stats::ave(v1_, v1_, FUN = length))
  l1_ <- utils::relist(v1_, l1_)
  v1_ <- purrr::map_dbl(l1_, ~ mean(.x, na.rm = TRUE))

  v2_ <- as.integer(stats::ave(.vec, .vec, FUN = length))

  1 / ((v1_ + v2_) / 2)
}

Example/Test

mean(uniqueness_vec(table_source[["name"]], TRUE), na.rm = TRUE) 
mean(uniqueness_vec(table_source[["iso3"]], TRUE), na.rm = TRUE)
mean(uniqueness_vec(table_source[["city"]], TRUE), na.rm = TRUE)
mean(uniqueness_vec(table_source[["address"]], TRUE), na.rm = TRUE)
test_that("uniqueness_vec works", {
  expect_true(inherits(uniqueness_vec, "function")) 
})

MAIN: get_weights

Function

.tab <- table_source
.cols_match <- c("name", "city", "address")
#' Get Weights
#' 
#' Description
#' 
#' @param .tab 
#' A dataframe (either the source or target dataframe)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#'
#' @return A numeric vector
#' 
#' @export
get_weights <- function(.tab, .cols_match) {
  . <- NULL
  purrr::map_dbl(
    .x = stats::setNames(.cols_match, .cols_match), 
    .f = ~ mean(uniqueness_vec(.tab[[.x]]), na.rm = TRUE)
    ) %>% `/`(sum(., na.rm = TRUE))
}

Example Test

get_weights(table_source, c("name", "city", "address"))
test_that("get_weights works", {
  expect_true(inherits(get_weights, "function")) 
})

HELP: help_check_weights

Function

#' Check Weights
#'
#' @param .weights weights 
#' @param .cols columns
#'
#' @return Error
help_check_weights <- function(.weights = NULL, .cols = NULL) {
  if (!is.null(.weights)) {
    nw_ <- sort(names(.weights))
    nc_ <- sort(.cols)
    lw_ <- length(nw_)
    lc_ <- length(nc_)

    if (lw_ == 0) stop(".weights must be a named vector", call. = FALSE)
    if (!lw_ == lc_) stop(".weights and .cols must have the same length", call. = FALSE)
    if (!all(nw_ == nc_)) stop(".weights and .cols must have the same names", call. = FALSE)
  }
}

Example/Test

# help_check_weights()
test_that("help_check_weights works", {
  expect_true(inherits(help_check_weights, "function")) 
})

MAIN: scores_data

tab_source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
tab_target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)

.matches <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_exact = "iso3",
  .verbose = FALSE
)
.source <- tab_source
.target <- tab_target
.cols_match = c("name", "iso3", "city", "address")
.w_unique <- NULL
.w_custom <- c(name = .6, city = .1, address = .3)
.cols_exact = "iso3"

Function

#' Score Data
#' 
#' Description
#' 
#' @param .matches 
#' Dataframe produced by match_data()
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#' @param .cols_exact 
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .w_unique 
#' Weights calculated by get_weights()
#' @param .w_custom 
#' A named numeric vector that matches the columns of .cols_match w/o the columns of .cols_exact
#'
#' @return A dataframe
#' 
#' @export
scores_data <- function(.matches, .source, .target, .cols_match, .cols_exact = NULL, 
                        .w_unique = NULL, .w_custom = NULL) {
  id_s <- id_t <- . <- n_s <- add_t <- NULL

  check_id(.source, .target)
  source_  <- prep_tables(.source, .cols_match)
  target_  <- prep_tables(.target, .cols_match)
  matches_ <- tibble::as_tibble(.matches)

  # cols_ <- colnames(matches_)
  # cols_ <- gsub("sim_", "", cols_[grepl("^sim_", cols_)])
  cols_ <- .cols_match[!.cols_match %in% .cols_exact]

  if (!is.null(.w_unique)) {
    help_check_weights(.w_unique, cols_)
    wu_ <- .w_unique
  } else {
    wu_ <- (get_weights(source_, cols_) + get_weights(target_, cols_)) / 2
  }

  if (!is.null(.w_custom)) {
    help_check_weights(.w_custom, cols_)
    wc_ <- .w_custom[order(match(names(.w_custom), cols_))]
    wc_ <- wc_ / sum(wc_)
  } else {
    wc_ <- rep(NA_real_, length(cols_))
  }

  mat_ <- as.matrix(matches_[, paste0("sim_", cols_)])

  matches_ %>%
    dplyr::mutate(
      sms = rowMeans(mat_, na.rm = TRUE),
      smw = rowSums(mat_ * wu_, na.rm = TRUE),
      smc = rowSums(mat_ * wc_, na.rm = TRUE),

      sss = rowMeans(mat_ ^ 2, na.rm = TRUE),
      ssw = rowSums(mat_  ^ 2 * wu_, na.rm = TRUE),
      ssc = rowSums(mat_  ^ 2 * wc_, na.rm = TRUE),
    )
}

Example/Test

tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join  <- c("name", "iso3")
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex",
)
scores_data(
  .matches = tab_match, 
  .source = tab_source, 
  .target = tab_target, 
  .cols_match = cols_match,
  .cols_exact = cols_exact
  )
test_that("scores_data works", {
  expect_true(inherits(scores_data, "function")) 
})

MAIN: dedup_data

tab_source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
tab_target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join  <- c("name", "iso3")

tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex",
)
.score <- scores_data(
  .matches = tab_match, 
  .source = tab_source, 
  .target = tab_target, 
  .cols_match = cols_match,
  .cols_exact = cols_exact
  )
.source <- tab_source
.target <- tab_target
.cols_match <- c("name", "iso3", "city", "address")
.col_score <- c("sms", "smw", "smc", "sss", "ssw", "ssc")

.min_sim = c(name = .25, address = .25)

Function

#' Deduplicate Data
#' 
#' Description
#' 
#' @param .score 
#' Dataframe generated by scores_data()
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score 
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted custom weights)
#'
#' @return A dataframe
#' 
#' @importFrom rlang :=
#' 
#' @export
dedup_data <- function(
  .score, .source, .target, .cols_match, .min_sim = NULL, 
  .col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
  ) {
  id_s <- id_t <- name_s <- name_t <- all_s <- all_t <- score <- 
    `_id_` <- len_s <- len_t <- n_s <- n_t <- sms <- smw <- smc <- sss <- 
    ssw <- ssc <- NULL
  check_id(.source, .target)

  cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))

  source_  <- prep_tables(.source, .cols_match)
  target_  <- prep_tables(.target, .cols_match)
  score_  <- tibble::as_tibble(.score)


  col_s_ <- colnames(source_)[!colnames(source_) == "_id_"]
  col_t_ <- colnames(target_)[!colnames(target_) == "_id_"]


  col_e_ <- col_s_[col_s_ %in% col_t_]
  col_e_ <- col_e_[!col_e_ == "id"]
  col_e_ <- unlist(purrr::map2(paste0(col_e_, "_s"), paste0(col_e_, "_t"), c))

  tab_ <- dplyr::filter(score_, !!dplyr::sym(cols_score_) > 0)

  if (!is.null(.min_sim)) {
    for (i in seq_len(length(.min_sim))) {
      n_ <- paste0("sim_", names(.min_sim)[i])
      v_ <- .min_sim[i]
      tab_ <- dplyr::filter(tab_, !!dplyr::sym(n_) >= v_)
    }
  }
  tab_ <- tab_ %>%
    dplyr::group_by(id_t) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_t = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(id_s) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_s = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::left_join(
      y = dplyr::select(source_, -c(`_id_`)),
      by = c("id_s" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::left_join(
      y = dplyr::select(target_, -c(`_id_`)),
      by = c("id_t" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::mutate(
      len_s = lengths(all_s),
      len_t = lengths(all_t)
    )

  cols_use_ <- colnames(tab_)
  cols_use_ <- cols_use_[cols_use_ %in% c(
    "id_s", "id_t", "n_s", "n_t", "all_s", "all_t", "len_s", "len_t",
    paste0("sim_", .cols_match), "sms", "smw", "smc", "sss", "ssw", "ssc"
  )]

  tab_[, c(cols_use_, col_e_)]
}

Example/Test

tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join  <- c("name", "iso3")
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex"
)
tab_score <- scores_data(
  .matches = tab_match, 
  .source = tab_source, 
  .target = tab_target, 
  .cols_match = cols_match,
  .cols_exact = cols_exact
  )

dedup_data(
  .score = tab_score, 
  .source = tab_source, 
  .target = tab_target,
  .cols_match = cols_match,
  .col_score = "sms"
  )
test_that("dedup_data works", {
  expect_true(inherits(dedup_data, "function")) 
})

MAIN: match_complete

Function

.source = table_source[1:100, ]
.target = table_target[1:999, ]
.standardize = TRUE
.cols_match = c("name", "iso3", "city", "address")
.cols_join = c("name", "iso3")
.cols_exact = "iso3"
.max_match = 10
.method = "osa"
.verbose = TRUE
.workers = future::availableCores()
.char_block = c(2, 5)
.w_unique = NULL
.w_custom = NULL
.col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
#' Complete Match
#' 
#' Description
#' 
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching. 
#' @param .cols_join
#' Columns to perfrom an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .cols_exact 
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match 
#' Maximum number of matches to return (Default = 10)
#' @param .method 
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose 
#' Print additional information?
#' @param .workers 
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#' @param .char_block 
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#' @param .standardize 
#' Perform String Standardization using standardize_data()?
#' @param .w_unique 
#' Weights calculated by get_weights()
#' @param .w_custom 
#' A named numeric vector that matches the columns of .cols_match w/o the columns of .cols_exact
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score 
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted custom weights)
#'
#' @return
#' A dataframe
#' 
#' @export
match_complete <- function(
  .source, .target, .cols_match, .cols_join = NULL, .cols_exact = NULL, 
  .max_match = 10, .method = "osa", .verbose = TRUE, 
  .workers = future::availableCores(), .char_block = c(Inf, Inf), 
  .standardize = TRUE, .w_unique = NULL, .w_custom = NULL, 
  .min_sim = NULL, .col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
  ) {
  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))

  if (.standardize) {
    source_ <- standardize_data(source_, .cols_match)
    target_ <- standardize_data(target_, .cols_match)
  }

  match_ <- match_data(
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .cols_join = .cols_join,
    .cols_exact = .cols_exact,
    .max_match = .max_match,
    .method = .method,
    .verbose = .verbose,
    .workers = .workers,
    .char_block = .char_block
  )

  score_ <- scores_data(
    .matches = match_,
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .cols_exact = .cols_exact,
    .w_unique = .w_unique,
    .w_custom = .w_custom
  )

  dedup_data(
    .score = score_,
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .col_score = cols_score_,
    .min_sim = .min_sim
  )

}

Example/Test

match_complete(
  .source = table_source[1:100, ],
  .target = table_target[1:999, ],
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_join = c("name", "iso3"),
  .cols_exact = "iso3",
  .max_match = 25,
  .method = "soundex",
  .verbose = TRUE,
  .workers = 4,
  .char_block = c(5, 5),
  .standardize = TRUE,
  .w_unique = NULL,
  .w_custom = c(name = .7, city = .2, address = .1),
  .col_score = "sms"
)
test_that("match_complete works", {
  expect_true(inherits(match_complete, "function")) 
})

Inflate your package

You're one inflate from paper to box. Build your package from this very Rmd using fusen::inflate()

fusen::inflate(
  flat_file = "dev/flat_full.Rmd", 
  vignette_name = NA, 
  overwrite = TRUE,
  check = FALSE
  )
devtools::check(vignettes = FALSE)


MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.