library(testthat); library(tidyverse); library(stringi); library(fst)
pkgload::load_all(path = here::here(), export_all = TRUE)
.str <- "International Business Corporation" .op = c("space", "punct", "case", "ascii")
#' Standardize Strings
#'
#' Standardize a character vector by transliterating to ASCII, removing
#' punctuation, collapsing whitespace, and upper-casing, depending on .op.
#'
#' @param .str
#' A string
#' @param .op
#' One of c("space", "punct", "case", "ascii")
#'
#' @return A string
#'
#' @export
standardize_str <- function(.str, .op = c("space", "punct", "case", "ascii")) {
  str_ <- .str
  if ("ascii" %in% .op) {
    str_ <- stringi::stri_trans_general(str_, "Latin-ASCII")
  }
  if ("punct" %in% .op) {
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "\\W", " "))
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "[[:punct:]]", " "))
    if (!"space" %in% .op) {
      str_ <- trimws(stringi::stri_replace_all_regex(str_, "([[:blank:]]|[[:space:]])+", " "))
    }
  }
  if ("space" %in% .op) {
    str_ <- trimws(stringi::stri_replace_all_regex(str_, "([[:blank:]]|[[:space:]])+", " "))
  }
  if ("case" %in% .op) {
    str_ <- toupper(str_)
  }
  return(str_)
}
standardize_str(c("jkldsa jkdhas äää §$ ## #'''"))
test_that("standardize_str works", { expect_true(inherits(standardize_str, "function")) })
.tab <- table_source
.cols <- c("name", "iso3", "city", "address")
.op <- c("space", "punct", "case", "ascii")
#' Standardize Data
#'
#' Apply a string standardization function to the matching columns of a dataframe.
#'
#' @param .tab
#' A dataframe (either the source or target dataframe)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .fun Function for standardization, if NULL standardize_str() is used
#'
#' @return A dataframe
#'
#' @export
standardize_data <- function(.tab, .cols_match, .fun = NULL) {
  tab_ <- tibble::as_tibble(.tab)
  if (is.null(.fun)) {
    f_ <- standardize_str
  } else {
    f_ <- .fun
  }
  for (i in .cols_match) {
    tab_[[i]] <- f_(tab_[[i]])
  }
  return(tab_)
}
standardize_data(table_source, c("name", "iso3", "city", "address"))
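A minimal sketch of the .fun argument, swapping in a custom standardizer instead of the default standardize_str() (the anonymous function below is illustrative only):

# Apply a custom standardizer that only trims and upper-cases the selected columns
standardize_data(
  table_source,
  c("name", "city"),
  .fun = function(x) toupper(trimws(x))
)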
test_that("standardize_data works", { expect_true(inherits(standardize_data, "function")) })
.tab <- table_source
.col_name <- "name"
.col_country <- NULL
.legal_forms <- tibble::tibble(.rows = 0)
.workers <- future::availableCores()
#' Extract Legal Forms
#'
#' Detect legal forms at the end of firm names and add columns with the original
#' legal form, its standardized counterpart, and adjusted/standardized name variants.
#'
#' @param .tab
#' A dataframe (either the source or target dataframe)
#' @param .col_name
#' The column with firm names
#' @param .col_country
#' Optionally, a column with iso3 country codes
#' @param .legal_forms
#' A dataframe with legal forms
#' @param .workers
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A dataframe
#'
#' @importFrom rlang :=
#'
#' @export
extract_legal_form <- function(
  .tab,
  .col_name,
  .col_country = NULL,
  .legal_forms = data.frame(),
  .workers = future::availableCores()
) {
  tmp <- legal_form_orig <- legal_form_stand <- legal_form <- name <- lf_stand <- lf_orig <- NULL

  .tab <- tibble::as_tibble(.tab)

  if (nrow(.legal_forms) == 0) {
    tab_lf_ <- get("legal_form_all")
  } else {
    tab_lf_ <- .legal_forms
  }

  if (is.null(.col_country)) {
    tab_lf_ <- tab_lf_ %>%
      dplyr::distinct(legal_form_orig, legal_form_stand) %>%
      dplyr::distinct(legal_form_orig, .keep_all = TRUE)
    join_by_ <- "legal_form_orig"
  } else {
    colnames(tab_lf_) <- c(.col_country, "legal_form_orig", "legal_form_stand")
    join_by_ <- c(.col_country, "legal_form_orig")
  }

  tab_ <- standardize_data(.tab, .col_name)
  lf_ <- unique(tab_lf_[["legal_form_orig"]])
  nm_ <- tab_[[.col_name]]

  f_ <- carrier::crate(function(.lf, .nm) which(endsWith(.nm, paste0(" ", .lf))))

  future::plan("multisession", workers = .workers)
  lf_ext_ <- furrr::future_map(
    .x = purrr::set_names(lf_, lf_),
    .f = ~ f_(.x, nm_),
    .options = furrr::furrr_options(seed = TRUE)
  )
  future::plan("default")

  lf_ext_ <- lf_ext_ %>%
    purrr::compact() %>%
    tibble::enframe(name = "legal_form_orig", value = "tmp") %>%
    tidyr::unnest(tmp) %>%
    dplyr::arrange(dplyr::desc(nchar(legal_form_orig))) %>%
    dplyr::distinct(tmp, .keep_all = TRUE)

  tab_ %>%
    dplyr::mutate(tmp = dplyr::row_number()) %>%
    dplyr::left_join(lf_ext_, by = "tmp") %>%
    dplyr::left_join(tab_lf_, by = join_by_) %>%
    dplyr::rename(
      lf_stand = legal_form_stand,
      lf_orig = legal_form_orig
    ) %>%
    dplyr::relocate(lf_stand, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::relocate(lf_orig, .after = !!dplyr::sym(.col_name)) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := trimws(
        stringi::stri_replace_last_fixed(!!dplyr::sym(.col_name), lf_orig, "")
      ),
      .after = !!dplyr::sym(.col_name)
    ) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_adj")) := dplyr::if_else(
        condition = is.na(!!dplyr::sym(paste0(.col_name, "_adj"))),
        true = !!dplyr::sym(.col_name),
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
      )
    ) %>%
    dplyr::mutate(
      !!dplyr::sym(paste0(.col_name, "_std")) := dplyr::if_else(
        condition = !is.na(lf_stand),
        true = paste(!!dplyr::sym(paste0(.col_name, "_adj")), lf_stand),
        false = !!dplyr::sym(paste0(.col_name, "_adj"))
      ),
      .after = !!dplyr::sym(paste0(.col_name, "_adj"))
    ) %>%
    dplyr::select(-tmp) %>%
    dplyr::mutate(!!dplyr::sym(.col_name) := .tab[[.col_name]])
}
extract_legal_form( .tab = table_source[1:100, ], .col_name = "name", .col_country = "iso3", .workers = 1 )
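A hedged sketch of supplying a custom .legal_forms table: judging from the function body, with .col_country set the table is expected to hold the country code, the original legal form, and its standardized counterpart, in that column order (the rows below are invented):

# Hypothetical custom legal-form lookup table
my_legal_forms <- tibble::tibble(
  iso3             = c("USA", "DEU"),
  legal_form_orig  = c("INC", "GMBH"),
  legal_form_stand = c("CORPORATION", "LIMITED")
)
extract_legal_form(
  .tab = table_source[1:100, ],
  .col_name = "name",
  .col_country = "iso3",
  .legal_forms = my_legal_forms,
  .workers = 1
)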
test_that("extract_legal_form works", { expect_true(inherits(extract_legal_form, "function")) })
.tab <- table_source
.cols_match <- c("name", "iso3", "city", "address")
#' Prepare Table
#'
#' Collapse rows that are duplicated on the matching columns into one row,
#' keeping all original ids in the list column _id_.
#'
#' @param .tab
#' A dataframe (either the source or target dataframe)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#'
#' @return
#' A dataframe
#'
#' @noRd
prep_tables <- function(.tab, .cols_match) {
  tmp <- id <- `_id_` <- NULL
  if (!"_id_" %in% colnames(.tab)) {
    tidyr::unite(
      data = tibble::as_tibble(.tab[, c("id", .cols_match)]),
      col = tmp,
      !dplyr::matches("^id$")
    ) %>%
      dplyr::group_by(tmp) %>%
      dplyr::summarise(
        `_id_` = list(id),
        id = dplyr::first(id),
        .groups = "drop"
      ) %>%
      dplyr::select(id, `_id_`) %>%
      dplyr::left_join(.tab, by = "id")
  } else {
    .tab
  }
}
prep_tables(table_source, c("name", "iso3", "city", "address"))
test_that("prep_tables works", { expect_true(inherits(prep_tables, "function")) })
.source <- bind_rows(
  table_source[1:100, ],
  dplyr::mutate(table_source[1:100, ], id = paste0(id, "-1"))
)
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.max_match <- 10
.method <- "osa"
.workers <- future::availableCores()
#' Match on a single column
#'
#' Compute string similarities between source and target on the first matching
#' column and keep the closest candidates per source row.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .max_match
#' Maximum number of matches to return (Default = 10)
#' @param .method
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .workers
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A Dataframe
match_col <- function(
  .source,
  .target,
  .cols_match,
  .max_match = 10,
  .method = "osa",
  .workers = future::availableCores()
) {
  V1 <- value <- id <- name <- id_t <- sim <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  method_ <- match.arg(
    arg = .method,
    choices = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
  )

  tab_ <- stringdist::stringsimmatrix(
    a = source_[[.cols_match[1]]],
    b = target_[[.cols_match[1]]],
    method = method_,
    nthread = .workers
  ) %>%
    tibble::as_tibble() %>%
    dplyr::mutate(id = dplyr::row_number(), .before = V1) %>%
    tidyr::pivot_longer(!dplyr::matches("id")) %>%
    dplyr::group_by(id) %>%
    dplyr::slice_max(order_by = value, n = .max_match) %>%
    dplyr::ungroup() %>%
    dplyr::rename(id_s = id, id_t = name) %>%
    dplyr::mutate(id_t = as.integer(gsub("V", "", id_t, fixed = TRUE))) %>%
    suppressWarnings()

  tab_[["id_s"]] <- source_[["id"]][tab_[["id_s"]]]
  tab_[["id_t"]] <- target_[["id"]][tab_[["id_t"]]]
  colnames(tab_) <- c("id_s", "id_t", paste0("sim_", .cols_match[1]))
  return(tab_)
}
match_col( .source = table_source[1:100, ], .target = table_target[1:999, ], .cols_match = c("name", "iso3", "city", "address") )
test_that("match_col works", { expect_true(inherits(match_col, "function")) })
.source <- bind_rows(
  table_source[1:100, ],
  dplyr::mutate(table_source[1:100, ], id = paste0(id, "-1"))
)
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.cols_exact <- NULL
.max_match <- 10
.method <- "osa"
.verbose <- TRUE
.workers <- future::availableCores()
#' Match Data
#'
#' Helper that fuzzy-matches source against target, optionally partitioned by the exact-match columns.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .cols_exact
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match
#' Maximum number of matches to return (Default = 10)
#' @param .method
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose
#' Print additional information?
#' @param .workers
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A Dataframe
#'
help_match_data <- function(
  .source,
  .target,
  .cols_match,
  .cols_exact = NULL,
  .max_match = 10,
  .method = "osa",
  .verbose = TRUE,
  .workers = future::availableCores()
) {
  sim <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  if (!is.null(.cols_exact)) {
    vs_ <- tidyr::unite(source_[, .cols_exact], "tmp", dplyr::everything())[["tmp"]]
    ls_ <- split(source_, vs_)
    vt_ <- tidyr::unite(target_[, .cols_exact], "tmp", dplyr::everything())[["tmp"]]
    lt_ <- split(target_, vt_)
    lt_ <- lt_[names(lt_) %in% names(ls_)]
    ls_ <- ls_[names(lt_)]
  } else {
    ls_ <- list(source_)
    lt_ <- list(target_)
  }

  if (.verbose) {
    pb <- progress::progress_bar$new(
      total = length(ls_),
      clear = FALSE,
      show_after = 1
    )
  }

  purrr::map2_dfr(
    .x = ls_,
    .y = lt_,
    .f = ~ {
      if (.verbose) pb$tick()
      tab_ <- match_col(
        .source = .x,
        .target = .y,
        .cols_match = .cols_match,
        .max_match = .max_match,
        .method = .method,
        .workers = .workers
      )
      if (length(.cols_match) > 1) {
        s_ <- dplyr::left_join(tab_, .x, by = c("id_s" = "id"))
        t_ <- dplyr::left_join(tab_, .y, by = c("id_t" = "id"))
        for (i in 2:length(.cols_match)) {
          cols_sim_ <- paste0("sim_", .cols_match[i])
          tab_[[cols_sim_]] <- stringdist::stringsim(
            a = s_[[.cols_match[i]]],
            b = t_[[.cols_match[i]]],
            method = .method
          )
        }
      }
      return(tab_)
    }
  )
}
help_match_data( .source = table_source[1:100, ], .target = table_target[1:999, ], .cols_match = c("name", "iso3", "city", "address"), .cols_exact = NULL, .max_match = 10, .method = "osa", .verbose = TRUE, .workers = 4 )
test_that("match_data works", { expect_true(inherits(help_match_data, "function")) })
.source <- table_source[1:100, ]
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.cols_join <- c("name", "iso3")
.method <- "osa"
#' Perform Left Join on Data
#'
#' Match source and target exactly on the join columns and add similarity scores
#' for the remaining matching columns.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .cols_join
#' Columns to perform an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .method
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#'
#' @return A Dataframe
#'
#' @export
join_data <- function(.source, .target, .cols_match, .cols_join, .method = "osa") {
  id_s <- id_t <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  s_ <- source_[, c("id", .cols_join)]
  t_ <- target_[, c("id", .cols_join)]
  non_ <- .cols_match[!.cols_match %in% .cols_join]

  tab_ <- dplyr::inner_join(s_, t_, by = .cols_join, suffix = c("_s", "_t")) %>%
    dplyr::mutate(
      dplyr::across(!dplyr::matches("^id_s$|^id_t$"), ~1)
    ) %>%
    dplyr::select(id_s, id_t, dplyr::everything()) %>%
    `colnames<-`(c("id_s", "id_t", paste0("sim_", .cols_join)))

  s_ <- dplyr::left_join(tab_, .source[, c("id", non_)], by = c("id_s" = "id"))
  t_ <- dplyr::left_join(tab_, .target[, c("id", non_)], by = c("id_t" = "id"))

  for (i in seq_len(length(non_))) {
    tab_[[paste0("sim_", non_[i])]] <- stringdist::stringsim(s_[[non_[i]]], t_[[non_[i]]], .method)
  }
  return(tab_)
}
join_data( .source = table_source, .target = table_target, .cols_match = c("name", "iso3", "city", "address"), .cols_join = c("name", "iso3"), .method = "osa" )
test_that("join_data works", { expect_true(inherits(join_data, "function")) })
.source <- table_source[1:100, ]
.target <- table_target[1:999, ]
.cols_match <- c("name", "iso3", "city", "address")
.char_block <- c(2, 5)
#' Split to Blocks
#'
#' Split the source into blocks by the character length of the first matching
#' column and return the blocks together with the length-annotated target.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .char_block
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#'
#' @return
#' A List
#'
#' @noRd
split_block <- function(.source, .target, .cols_match, .char_block) {
  n__ <- b__ <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  t_ <- dplyr::mutate(target_, n__ = nchar(!!dplyr::sym(.cols_match[1])))
  max_t_ <- max(t_$n__)

  s_ <- source_ %>%
    dplyr::mutate(
      n__ = nchar(!!dplyr::sym(.cols_match[1])),
      n__ = dplyr::if_else(n__ > max_t_, max_t_, n__)
    ) %>%
    dplyr::arrange(n__) %>%
    dplyr::mutate(b__ = floor(n__ / .char_block[1])) %>%
    dplyr::group_by(b__) %>%
    dplyr::mutate(b__ = paste0(
      stringi::stri_pad_left(dplyr::first(n__), 3, 0),
      "-",
      stringi::stri_pad_left(dplyr::last(n__), 3, 0)
    )) %>%
    dplyr::ungroup()

  return(
    list(
      ls = split(dplyr::select(s_, -c(n__, b__)), s_$b__),
      tt = t_
    )
  )
}
tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
char_block <- c(25, 5)
split_block(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .char_block = char_block
)
test_that("split_block works", { expect_true(inherits(split_block, "function")) })
#' Filter Target Dataframe for Block Sizes
#'
#' Keep only target rows whose character length falls within the block range, widened by .size.
#'
#' @param .block
#' A block as character string (names of ls element from split_block())
#' @param .tab
#' Target table (tt element from split_block())
#' @param .size
#' Second element of the .char_block argument of split_block()
#'
#' @return
#' A Dataframe
#'
#' @noRd
filter_block <- function(.block, .tab, .size) {
  n__ <- NULL
  int_ <- as.integer(unlist(stringi::stri_split_fixed(.block, "-")))
  min_ <- int_[1] - .size
  max_ <- int_[2] + .size
  if (is.infinite(.size)) {
    return(.tab)
  } else {
    return(dplyr::filter(.tab, n__ %in% min_:max_))
  }
}
tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
char_block <- c(25, 5)
lblock <- split_block(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .char_block = char_block
)
filter_block(
  .block = names(lblock$ls)[1],
  .tab = lblock$tt,
  .size = char_block[2]
)
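For intuition, the block label encodes a character-length range. With the toy block name below (not necessarily present in lblock$ls), rows of lblock$tt whose first matching column has between 5 and 19 characters are kept; .size = Inf would return the target unchanged.

# "010-014" parses to min = 10 and max = 14; .size = 5 widens the window to 5:19
filter_block(.block = "010-014", .tab = lblock$tt, .size = 5)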
test_that("filter_block works", { expect_true(inherits(filter_block, "function")) })
.source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
.target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)
.cols_match <- c("name", "iso3", "city", "address")
.cols_join <- c("name", "iso3")
.cols_exact <- "iso3"
.max_match <- 10
.method <- "osa"
.verbose <- TRUE
.workers <- future::availableCores()
.char_block <- c(2, 5)
#' Match Data
#'
#' Fuzzy-match source against target, optionally preceded by an exact join on
#' .cols_join and partitioned into character-length blocks.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .cols_join
#' Columns to perform an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .cols_exact
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match
#' Maximum number of matches to return (Default = 10)
#' @param .method
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose
#' Print additional information?
#' @param .char_block
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#' @param .workers
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#'
#' @return A dataframe
#'
#' @export
match_data <- function(
  .source,
  .target,
  .cols_match,
  .cols_join = NULL,
  .cols_exact = NULL,
  .max_match = 10,
  .method = "osa",
  .verbose = TRUE,
  .workers = future::availableCores(),
  .char_block = c(Inf, Inf)
) {
  id <- id_s <- id_t <- add_s <- `_id_` <- all_s <- all_t <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)

  if (!is.null(.cols_join)) {
    tab0_ <- join_data(source_, target_, .cols_match, .cols_join)
    s_ <- dplyr::filter(source_, !id %in% tab0_$id_s)
    t_ <- dplyr::filter(target_, !id %in% tab0_$id_t)
  } else {
    tab0_ <- tibble::tibble(id_s = "", .rows = 0)
    s_ <- source_
    t_ <- target_
  }

  tmp_ <- split_block(s_, t_, .cols_match, .char_block)

  tab1_ <- purrr::imap_dfr(
    .x = tmp_$ls,
    .f = ~ {
      if (.verbose) cat("\rCalculating Block:", .y, " ")
      help_match_data(
        .source = .x,
        .target = filter_block(.block = .y, .tab = tmp_$tt, .size = .char_block[2]),
        .cols_match = .cols_match,
        .cols_exact = .cols_exact,
        .max_match = .max_match,
        .method = .method,
        .verbose = .verbose,
        .workers = .workers
      )
    }
  )

  out_ <- dplyr::bind_rows(tab0_, tab1_)
  out_ <- out_[, c("id_s", "id_t", paste0("sim_", .cols_match))]

  out_ %>%
    dplyr::left_join(dplyr::select(source_, id_s = id, all_s = `_id_`), by = "id_s") %>%
    dplyr::left_join(dplyr::select(target_, id_t = id, all_t = `_id_`), by = "id_t") %>%
    dplyr::select(id_s, id_t, all_s, all_t, dplyr::everything())
}
tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_join <- c("name", "iso3")
cols_exact <- "iso3"
match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_join = cols_join,
  .cols_exact = cols_exact
)
test_that("match_data works", { expect_true(inherits(match_data, "function")) })
.vec <- standardize_str(table_source[["name"]])
#' Uniqueness/Rarity of a vector
#'
#' Score how unique each string is, based on how often its words and the full
#' string occur in the vector.
#'
#' @param .vec A character vector
#' @param .normalize Normalize between 0 and 1
#'
#' @return A numeric vector
#'
#' @noRd
uniqueness_vec <- function(.vec, .normalize = FALSE) {
  value <- name <- n <- NULL
  # Note: .normalize is currently not used in the computation below
  l1_ <- stringi::stri_split_fixed(.vec, " ")
  v1_ <- unlist(l1_)
  v1_ <- as.integer(stats::ave(v1_, v1_, FUN = length))
  l1_ <- utils::relist(v1_, l1_)
  v1_ <- purrr::map_dbl(l1_, ~ mean(.x, na.rm = TRUE))
  v2_ <- as.integer(stats::ave(.vec, .vec, FUN = length))
  1 / ((v1_ + v2_) / 2)
}
mean(uniqueness_vec(table_source[["name"]], TRUE), na.rm = TRUE)
mean(uniqueness_vec(table_source[["iso3"]], TRUE), na.rm = TRUE)
mean(uniqueness_vec(table_source[["city"]], TRUE), na.rm = TRUE)
mean(uniqueness_vec(table_source[["address"]], TRUE), na.rm = TRUE)
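A toy illustration of the scoring (values invented): strings sharing frequent tokens, or repeated in full, receive lower uniqueness scores.

v <- c("ALPHA GMBH", "BETA GMBH", "GAMMA LLC")
# "GMBH" occurs twice, so the first two entries score 0.8 while "GAMMA LLC" scores 1
uniqueness_vec(v)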
test_that("uniqueness_vec works", { expect_true(inherits(uniqueness_vec, "function")) })
.tab <- table_source
.cols_match <- c("name", "city", "address")
#' Get Weights
#'
#' Derive normalized column weights from the average uniqueness of each matching column.
#'
#' @param .tab
#' A dataframe (either the source or target dataframe)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#'
#' @return A numeric vector
#'
#' @export
get_weights <- function(.tab, .cols_match) {
  . <- NULL
  purrr::map_dbl(
    .x = stats::setNames(.cols_match, .cols_match),
    .f = ~ mean(uniqueness_vec(.tab[[.x]]), na.rm = TRUE)
  ) %>%
    `/`(sum(., na.rm = TRUE))
}
get_weights(table_source, c("name", "city", "address"))
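A quick sanity check, nothing package-specific: the weights are normalized to sum to one, and the most distinctive columns (typically the firm name) receive the largest share.

w <- get_weights(table_source, c("name", "city", "address"))
sum(w)                      # 1, up to floating point error
sort(w, decreasing = TRUE)  # largest weight = most distinctive column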
test_that("get_weights works", { expect_true(inherits(get_weights, "function")) })
#' Check Weights
#'
#' @param .weights weights
#' @param .cols columns
#'
#' @return Error
help_check_weights <- function(.weights = NULL, .cols = NULL) {
  if (!is.null(.weights)) {
    nw_ <- sort(names(.weights))
    nc_ <- sort(.cols)
    lw_ <- length(nw_)
    lc_ <- length(nc_)
    if (lw_ == 0) stop(".weights must be a named vector", call. = FALSE)
    if (!lw_ == lc_) stop(".weights and .cols must have the same length", call. = FALSE)
    if (!all(nw_ == nc_)) stop(".weights and .cols must have the same names", call. = FALSE)
  }
}
# help_check_weights()
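A small sketch of the checks (weights invented): unnamed weights, a length mismatch, and mismatched names each raise an error; a correctly named vector passes silently.

cols <- c("name", "city", "address")
help_check_weights(c(name = .6, city = .1, address = .3), cols)  # passes, returns NULL
# help_check_weights(c(.6, .1, .3), cols)              # error: must be a named vector
# help_check_weights(c(name = .6, city = .4), cols)    # error: lengths differ
# help_check_weights(c(a = .6, b = .1, c = .3), cols)  # error: names differ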
test_that("help_check_weights works", { expect_true(inherits(help_check_weights, "function")) })
tab_source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
tab_target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)
.matches <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_exact = "iso3",
  .verbose = FALSE
)
.source <- tab_source
.target <- tab_target
.cols_match <- c("name", "iso3", "city", "address")
.w_unique <- NULL
.w_custom <- c(name = .6, city = .1, address = .3)
.cols_exact <- "iso3"
#' Score Data
#'
#' Aggregate the column similarities produced by match_data() into simple,
#' uniqueness-weighted, and custom-weighted scores.
#'
#' @param .matches
#' Dataframe produced by match_data()
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .cols_exact
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .w_unique
#' Weights calculated by get_weights()
#' @param .w_custom
#' A named numeric vector that matches the columns of .cols_match w/o the columns of .cols_exact
#'
#' @return A dataframe
#'
#' @export
scores_data <- function(
  .matches,
  .source,
  .target,
  .cols_match,
  .cols_exact = NULL,
  .w_unique = NULL,
  .w_custom = NULL
) {
  id_s <- id_t <- . <- n_s <- add_t <- NULL

  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)
  matches_ <- tibble::as_tibble(.matches)

  # cols_ <- colnames(matches_)
  # cols_ <- gsub("sim_", "", cols_[grepl("^sim_", cols_)])
  cols_ <- .cols_match[!.cols_match %in% .cols_exact]

  if (!is.null(.w_unique)) {
    help_check_weights(.w_unique, cols_)
    wu_ <- .w_unique
  } else {
    wu_ <- (get_weights(source_, cols_) + get_weights(target_, cols_)) / 2
  }

  if (!is.null(.w_custom)) {
    help_check_weights(.w_custom, cols_)
    wc_ <- .w_custom[order(match(names(.w_custom), cols_))]
    wc_ <- wc_ / sum(wc_)
  } else {
    wc_ <- rep(NA_real_, length(cols_))
  }

  mat_ <- as.matrix(matches_[, paste0("sim_", cols_)])

  # sweep() applies the weights column-wise before the row sums are taken
  matches_ %>%
    dplyr::mutate(
      sms = rowMeans(mat_, na.rm = TRUE),
      smw = rowSums(sweep(mat_, 2, wu_, "*"), na.rm = TRUE),
      smc = rowSums(sweep(mat_, 2, wc_, "*"), na.rm = TRUE),
      sss = rowMeans(mat_^2, na.rm = TRUE),
      ssw = rowSums(sweep(mat_^2, 2, wu_, "*"), na.rm = TRUE),
      ssc = rowSums(sweep(mat_^2, 2, wc_, "*"), na.rm = TRUE)
    )
}
tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join <- c("name", "iso3")
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex"
)
scores_data(
  .matches = tab_match,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact
)
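A hedged sketch of .w_custom: its names must cover the fuzzy columns (everything in cols_match except the exact column iso3); the weights are re-normalized internally, so they do not need to sum to one.

scores_data(
  .matches = tab_match,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .w_custom = c(name = .6, city = .1, address = .3)
)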
test_that("scores_data works", { expect_true(inherits(scores_data, "function")) })
tab_source <- bind_rows(
  table_source[1:100, ],
  mutate(table_source[1:10, ], id = paste0(id, "1"))
)
tab_target <- bind_rows(
  table_target[1:999, ],
  mutate(table_target[1:10, ], id = paste0(id, "2"))
)
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join <- c("name", "iso3")
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex"
)
.score <- scores_data(
  .matches = tab_match,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact
)
.source <- tab_source
.target <- tab_target
.cols_match <- c("name", "iso3", "city", "address")
.col_score <- c("sms", "smw", "smc", "sss", "ssw", "ssc")
.min_sim <- c(name = .25, address = .25)
#' Deduplicate Data
#'
#' Reduce the scored candidate matches to the best match per source and target id,
#' optionally enforcing minimum similarities.
#'
#' @param .score
#' Dataframe generated by scores_data()
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted by custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted by custom weights)
#'
#' @return A dataframe
#'
#' @importFrom rlang :=
#'
#' @export
dedup_data <- function(
  .score,
  .source,
  .target,
  .cols_match,
  .min_sim = NULL,
  .col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
) {
  id_s <- id_t <- name_s <- name_t <- all_s <- all_t <- score <- `_id_` <- len_s <- len_t <-
    n_s <- n_t <- sms <- smw <- smc <- sss <- ssw <- ssc <- NULL

  check_id(.source, .target)
  cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)
  score_ <- tibble::as_tibble(.score)

  col_s_ <- colnames(source_)[!colnames(source_) == "_id_"]
  col_t_ <- colnames(target_)[!colnames(target_) == "_id_"]
  col_e_ <- col_s_[col_s_ %in% col_t_]
  col_e_ <- col_e_[!col_e_ == "id"]
  col_e_ <- unlist(purrr::map2(paste0(col_e_, "_s"), paste0(col_e_, "_t"), c))

  tab_ <- dplyr::filter(score_, !!dplyr::sym(cols_score_) > 0)

  if (!is.null(.min_sim)) {
    for (i in seq_len(length(.min_sim))) {
      n_ <- paste0("sim_", names(.min_sim)[i])
      v_ <- .min_sim[i]
      tab_ <- dplyr::filter(tab_, !!dplyr::sym(n_) >= v_)
    }
  }

  tab_ <- tab_ %>%
    dplyr::group_by(id_t) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_t = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(id_s) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_s = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::left_join(
      y = dplyr::select(source_, -c(`_id_`)),
      by = c("id_s" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::left_join(
      y = dplyr::select(target_, -c(`_id_`)),
      by = c("id_t" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::mutate(
      len_s = lengths(all_s),
      len_t = lengths(all_t)
    )

  cols_use_ <- colnames(tab_)
  cols_use_ <- cols_use_[cols_use_ %in% c(
    "id_s", "id_t", "n_s", "n_t", "all_s", "all_t", "len_s", "len_t",
    paste0("sim_", .cols_match), "sms", "smw", "smc", "sss", "ssw", "ssc"
  )]

  tab_[, c(cols_use_, col_e_)]
}
tab_source <- table_source[1:100, ]
tab_target <- table_target[1:999, ]
cols_match <- c("name", "iso3", "city", "address")
cols_exact <- "iso3"
cols_join <- c("name", "iso3")
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact,
  .cols_join = cols_join,
  .method = "soundex"
)
tab_score <- scores_data(
  .matches = tab_match,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact
)
dedup_data(
  .score = tab_score,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .col_score = "sms"
)
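Mirroring the .min_sim value sketched in the development chunk above, weak candidates can be dropped before deduplication, here requiring at least 0.25 similarity on name and address:

dedup_data(
  .score = tab_score,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .col_score = "sms",
  .min_sim = c(name = .25, address = .25)
)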
test_that("dedup_data works", { expect_true(inherits(dedup_data, "function")) })
.source <- table_source[1:100, ]
.target <- table_target[1:999, ]
.standardize <- TRUE
.cols_match <- c("name", "iso3", "city", "address")
.cols_join <- c("name", "iso3")
.cols_exact <- "iso3"
.max_match <- 10
.method <- "osa"
.verbose <- TRUE
.workers <- future::availableCores()
.char_block <- c(2, 5)
.w_unique <- NULL
.w_custom <- NULL
.col_score <- c("sms", "smw", "smc", "sss", "ssw", "ssc")
#' Complete Match
#'
#' Run the full matching pipeline: standardize, match, score, and deduplicate
#' the source against the target.
#'
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .cols_join
#' Columns to perform an exact match on, before fuzzy-matching.\cr
#' (Matched IDs will be excluded from fuzzy-match)
#' @param .cols_exact
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .max_match
#' Maximum number of matches to return (Default = 10)
#' @param .method
#' One of "osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex".\cr
#' See: stringdist-metrics {stringdist}
#' @param .verbose
#' Print additional information?
#' @param .workers
#' Number of cores to utilize (Default all cores determined by future::availableCores())
#' @param .char_block
#' Character Block Size. Used to partition data.\cr
#' - First element chunks the source data in ngram-blocks.\cr
#' - Second element allows for characters in target below/above block size.
#' @param .standardize
#' Perform String Standardization using standardize_data()?
#' @param .w_unique
#' Weights calculated by get_weights()
#' @param .w_custom
#' A named numeric vector that matches the columns of .cols_match w/o the columns of .cols_exact
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted by custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted by custom weights)
#'
#' @return
#' A dataframe
#'
#' @export
match_complete <- function(
  .source,
  .target,
  .cols_match,
  .cols_join = NULL,
  .cols_exact = NULL,
  .max_match = 10,
  .method = "osa",
  .verbose = TRUE,
  .workers = future::availableCores(),
  .char_block = c(Inf, Inf),
  .standardize = TRUE,
  .w_unique = NULL,
  .w_custom = NULL,
  .min_sim = NULL,
  .col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
) {
  check_id(.source, .target)
  source_ <- prep_tables(.source, .cols_match)
  target_ <- prep_tables(.target, .cols_match)
  cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))

  if (.standardize) {
    source_ <- standardize_data(source_, .cols_match)
    target_ <- standardize_data(target_, .cols_match)
  }

  match_ <- match_data(
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .cols_join = .cols_join,
    .cols_exact = .cols_exact,
    .max_match = .max_match,
    .method = .method,
    .verbose = .verbose,
    .workers = .workers,
    .char_block = .char_block
  )

  score_ <- scores_data(
    .matches = match_,
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .cols_exact = .cols_exact,
    .w_unique = .w_unique,
    .w_custom = .w_custom
  )

  dedup_data(
    .score = score_,
    .source = source_,
    .target = target_,
    .cols_match = .cols_match,
    .col_score = cols_score_,
    .min_sim = .min_sim
  )
}
match_complete(
  .source = table_source[1:100, ],
  .target = table_target[1:999, ],
  .cols_match = c("name", "iso3", "city", "address"),
  .cols_join = c("name", "iso3"),
  .cols_exact = "iso3",
  .max_match = 25,
  .method = "soundex",
  .verbose = TRUE,
  .workers = 4,
  .char_block = c(5, 5),
  .standardize = TRUE,
  .w_unique = NULL,
  .w_custom = c(name = .7, city = .2, address = .1),
  .col_score = "sms"
)
test_that("match_complete works", { expect_true(inherits(match_complete, "function")) })
You're one inflate from paper to box. Build your package from this very Rmd using fusen::inflate()
"DESCRIPTION"
file has been updated"R/"
directory"tests/testthat/"
directory"vignettes/"
directoryfusen::inflate( flat_file = "dev/flat_full.Rmd", vignette_name = NA, overwrite = TRUE, check = FALSE ) devtools::check(vignettes = FALSE)