README.md

Rmatch

Rmatch is a package that aims to assist you in any matching task that involves two datasets and (approximate) string matching

Installation

You can install the development version of Rmatch like so:

# devtools::install_github("https://github.com/MatthiasUckert/Rmatch")

Example (Step-by-Step)

Basic usage using build-in datasets. Check out the vignette for full specification

library(Rmatch); library(kableExtra); library(tidyverse)
#> -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
#> v ggplot2 3.3.5     v purrr   0.3.4
#> v tibble  3.1.6     v dplyr   1.0.7
#> v tidyr   1.1.4     v stringr 1.4.0
#> v readr   2.1.1     v forcats 0.5.1
#> -- Conflicts ------------------------------------------ tidyverse_conflicts() --
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::group_rows() masks kableExtra::group_rows()
#> x dplyr::lag()        masks stats::lag()

Match Data: You need 2 datasets, a source table (data you want to match) and a target table (data you want to match against).

tab_source <- table_source
tab_target <- table_target

First, make sure your datasets are correcly formatted.

check_data(tab_source, tab_target)
Check Source Target Matrix Column: ‘id’ exists TRUE TRUE Column: ‘id’ is unique TRUE TRUE Check NAs for column: ‘id’ 0 0 Check NAs for column: ‘name’ 0 0 Check NAs for column: ‘iso3’ 0 0 Check NAs for column: ‘city’ 1 11 Check NAs for column: ‘address’ 3 27 Check (cumulative) duplicates for column(s): ‘name’ 0 (0) 0 (0) Check (cumulative) duplicates for column(s): ‘iso3’ 2,406 (0) 4,657 (0) Check (cumulative) duplicates for column(s): ‘city’ 1,520 (0) 3,282 (0) Check (cumulative) duplicates for column(s): ‘address’ 73 (0) 688 (0) Elements in Dataframe/Matrix 2,426 4,671 11,331,846 Estimated memory allocation 1,813 MB

Standardizing columns won’t harm

cols_match <- c("name", "iso3", "city", "address")
tab_source <- standardize_data(tab_source, cols_match)
tab_target <- standardize_data(tab_target, cols_match)

Check again

check_data(tab_source, tab_target)
Check Source Target Matrix Column: ‘id’ exists TRUE TRUE Column: ‘id’ is unique TRUE TRUE Check NAs for column: ‘id’ 0 0 Check NAs for column: ‘name’ 0 0 Check NAs for column: ‘iso3’ 0 0 Check NAs for column: ‘city’ 1 11 Check NAs for column: ‘address’ 3 27 Check (cumulative) duplicates for column(s): ‘name’ 0 (0) 0 (0) Check (cumulative) duplicates for column(s): ‘iso3’ 2,406 (0) 4,657 (0) Check (cumulative) duplicates for column(s): ‘city’ 1,522 (0) 3,293 (0) Check (cumulative) duplicates for column(s): ‘address’ 85 (0) 723 (0) Elements in Dataframe/Matrix 2,426 4,671 11,331,846 Estimated memory allocation 1,813 MB

Let’s match the two dataframes

cols_join <- c("name", "iso3")
cols_exact <- "iso3"
method <- "soundex"
tab_match <- match_data(
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_join = c("name", "iso3"),
  .cols_exact = cols_exact,
  .method = method,
)
#> Calculating Block: 003-028
id_s id_t all_s all_t sim_name sim_iso3 sim_city sim_address 52BE07D5 F671078F 52BE07D5 F671078F 1 1 1 0.9642857 975AF234 67CC2286 975AF234 67CC2286 1 1 1 1.0000000 6E20553A E8E307AE 6E20553A E8E307AE 1 1 1 0.0000000 C07A46B5 775B36DB C07A46B5 775B36DB 1 1 1 1.0000000 FB8EC365 C7F93EC4 FB8EC365 C7F93EC4 1 1 1 1.0000000 9E715D89 820A46E0 9E715D89 820A46E0 1 1 1 1.0000000

Let’s calculate the best matches

tab_score <- scores_data(
  .matches = tab_match,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .cols_exact = cols_exact
  )
id_s id_t all_s all_t sim_name sim_iso3 sim_city sim_address sms smw smc sss ssw ssc 52BE07D5 F671078F 52BE07D5 F671078F 1 1 1 0.9642857 0.9880952 0.6816429 0 0.9766156 0.6737237 0 975AF234 67CC2286 975AF234 67CC2286 1 1 1 1.0000000 1.0000000 1.6256599 0 1.0000000 1.6256599 0 6E20553A E8E307AE 6E20553A E8E307AE 1 1 1 0.0000000 0.6666667 0.4563230 0 0.6666667 0.4563230 0 C07A46B5 775B36DB C07A46B5 775B36DB 1 1 1 1.0000000 1.0000000 0.6898555 0 1.0000000 0.6898555 0 FB8EC365 C7F93EC4 FB8EC365 C7F93EC4 1 1 1 1.0000000 1.0000000 1.6256599 0 1.0000000 1.6256599 0 9E715D89 820A46E0 9E715D89 820A46E0 1 1 1 1.0000000 1.0000000 0.6844846 0 1.0000000 0.6844846 0

Finally Deduplicate Matches, DONE(ish)

col_score <- "sms"
tab_final1 <- dedup_data(
  .score = tab_score,
  .source = tab_source,
  .target = tab_target,
  .cols_match = cols_match,
  .col_score = col_score
)
id_s id_t all_s all_t score name_s name_t iso3_s iso3_t city_s city_t address_s address_t 975AF234 67CC2286 975AF234 67CC2286 1 1000MERCIS 1000MERCIS FRA FRA PARIS PARIS 28 RUE DE CHATEAUDUN 28 RUE DE CHATEAUDUN C07A46B5 775B36DB C07A46B5 775B36DB 1 2G ENERGY AG 2G ENERGY AG DEU DEU HEEK HEEK BENZSTRASSE 3 BENZSTRASSE 3 FB8EC365 C7F93EC4 FB8EC365 C7F93EC4 1 2INVEST AG 2INVEST AG DEU DEU HEIDELBERG HEIDELBERG ZIEGELHAUSER LANDSTR 3 ZIEGELHAUSER LANDSTR 3 9E715D89 820A46E0 9E715D89 820A46E0 1 3U HOLDING AG 3U HOLDING AG DEU DEU MARBURG MARBURG FRAUENBERGSTRASSE 31 33 FRAUENBERGSTRASSE 31 33 8953E4CF 566D4EAF 8953E4CF 566D4EAF 1 5TH PLANET GAMES A S 5TH PLANET GAMES A S DNK DNK COPENHAGEN COPENHAGEN GOTHERSGADE 11 GOTHERSGADE 11 5F1D6968 3269F9E1 5F1D6968 3269F9E1 1 7C SOLARPARKEN AG 7C SOLARPARKEN AG DEU DEU BAYREUTH BAYREUTH AN DER FEUERWACHE 15 AN DER FEUERWACHE 15

Example (All-in-One)

tab_final2 <- match_complete(
  .source = table_source,
  .target = table_target,
  .cols_match = cols_match,
  .cols_join = cols_join,
  .cols_exact = cols_exact,
  .method = method,
  .standardize = TRUE,
  .col_score = col_score
)
#> Calculating Block: 003-028
all.equal(tab_final1, tab_final2)
#> [1] TRUE
id_s id_t all_s all_t score name_s name_t iso3_s iso3_t city_s city_t address_s address_t 975AF234 67CC2286 975AF234 67CC2286 1 1000MERCIS 1000MERCIS FRA FRA PARIS PARIS 28 RUE DE CHATEAUDUN 28 RUE DE CHATEAUDUN C07A46B5 775B36DB C07A46B5 775B36DB 1 2G ENERGY AG 2G ENERGY AG DEU DEU HEEK HEEK BENZSTRASSE 3 BENZSTRASSE 3 FB8EC365 C7F93EC4 FB8EC365 C7F93EC4 1 2INVEST AG 2INVEST AG DEU DEU HEIDELBERG HEIDELBERG ZIEGELHAUSER LANDSTR 3 ZIEGELHAUSER LANDSTR 3 9E715D89 820A46E0 9E715D89 820A46E0 1 3U HOLDING AG 3U HOLDING AG DEU DEU MARBURG MARBURG FRAUENBERGSTRASSE 31 33 FRAUENBERGSTRASSE 31 33 8953E4CF 566D4EAF 8953E4CF 566D4EAF 1 5TH PLANET GAMES A S 5TH PLANET GAMES A S DNK DNK COPENHAGEN COPENHAGEN GOTHERSGADE 11 GOTHERSGADE 11 5F1D6968 3269F9E1 5F1D6968 3269F9E1 1 7C SOLARPARKEN AG 7C SOLARPARKEN AG DEU DEU BAYREUTH BAYREUTH AN DER FEUERWACHE 15 AN DER FEUERWACHE 15

Code of Conduct

Please note that the Rmatch project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.



MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.