Rmatch is a package that aims to assist you in any matching task that involves two datasets and (approximate) string matching
You can install the development version of Rmatch like so:
# devtools::install_github("https://github.com/MatthiasUckert/Rmatch")
Basic usage using build-in datasets. Check out the vignette for full specification
library(Rmatch); library(kableExtra); library(tidyverse)
#> -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
#> v ggplot2 3.3.5 v purrr 0.3.4
#> v tibble 3.1.6 v dplyr 1.0.7
#> v tidyr 1.1.4 v stringr 1.4.0
#> v readr 2.1.1 v forcats 0.5.1
#> -- Conflicts ------------------------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::group_rows() masks kableExtra::group_rows()
#> x dplyr::lag() masks stats::lag()
Match Data: You need 2 datasets, a source table (data you want to match) and a target table (data you want to match against).
tab_source <- table_source
tab_target <- table_target
First, make sure your datasets are correcly formatted.
check_data(tab_source, tab_target)
Check
Source
Target
Matrix
Column: ‘id’ exists
TRUE
TRUE
Column: ‘id’ is unique
TRUE
TRUE
Check NAs for column: ‘id’
0
0
Check NAs for column: ‘name’
0
0
Check NAs for column: ‘iso3’
0
0
Check NAs for column: ‘city’
1
11
Check NAs for column: ‘address’
3
27
Check (cumulative) duplicates for column(s): ‘name’
0
(0)
0
(0)
Check (cumulative) duplicates for column(s): ‘iso3’
2,406
(0)
4,657
(0)
Check (cumulative) duplicates for column(s): ‘city’
1,520
(0)
3,282
(0)
Check (cumulative) duplicates for column(s): ‘address’
73
(0)
688
(0)
Elements in Dataframe/Matrix
2,426
4,671
11,331,846
Estimated memory allocation
1,813 MB
Standardizing columns won’t harm
cols_match <- c("name", "iso3", "city", "address")
tab_source <- standardize_data(tab_source, cols_match)
tab_target <- standardize_data(tab_target, cols_match)
Check again
check_data(tab_source, tab_target)
Check
Source
Target
Matrix
Column: ‘id’ exists
TRUE
TRUE
Column: ‘id’ is unique
TRUE
TRUE
Check NAs for column: ‘id’
0
0
Check NAs for column: ‘name’
0
0
Check NAs for column: ‘iso3’
0
0
Check NAs for column: ‘city’
1
11
Check NAs for column: ‘address’
3
27
Check (cumulative) duplicates for column(s): ‘name’
0
(0)
0
(0)
Check (cumulative) duplicates for column(s): ‘iso3’
2,406
(0)
4,657
(0)
Check (cumulative) duplicates for column(s): ‘city’
1,522
(0)
3,293
(0)
Check (cumulative) duplicates for column(s): ‘address’
85
(0)
723
(0)
Elements in Dataframe/Matrix
2,426
4,671
11,331,846
Estimated memory allocation
1,813 MB
Let’s match the two dataframes
cols_join <- c("name", "iso3")
cols_exact <- "iso3"
method <- "soundex"
tab_match <- match_data(
.source = tab_source,
.target = tab_target,
.cols_match = cols_match,
.cols_join = c("name", "iso3"),
.cols_exact = cols_exact,
.method = method,
)
#> Calculating Block: 003-028
id_s
id_t
all_s
all_t
sim_name
sim_iso3
sim_city
sim_address
52BE07D5
F671078F
52BE07D5
F671078F
1
1
1
0.9642857
975AF234
67CC2286
975AF234
67CC2286
1
1
1
1.0000000
6E20553A
E8E307AE
6E20553A
E8E307AE
1
1
1
0.0000000
C07A46B5
775B36DB
C07A46B5
775B36DB
1
1
1
1.0000000
FB8EC365
C7F93EC4
FB8EC365
C7F93EC4
1
1
1
1.0000000
9E715D89
820A46E0
9E715D89
820A46E0
1
1
1
1.0000000
Let’s calculate the best matches
tab_score <- scores_data(
.matches = tab_match,
.source = tab_source,
.target = tab_target,
.cols_match = cols_match,
.cols_exact = cols_exact
)
id_s
id_t
all_s
all_t
sim_name
sim_iso3
sim_city
sim_address
sms
smw
smc
sss
ssw
ssc
52BE07D5
F671078F
52BE07D5
F671078F
1
1
1
0.9642857
0.9880952
0.6816429
0
0.9766156
0.6737237
0
975AF234
67CC2286
975AF234
67CC2286
1
1
1
1.0000000
1.0000000
1.6256599
0
1.0000000
1.6256599
0
6E20553A
E8E307AE
6E20553A
E8E307AE
1
1
1
0.0000000
0.6666667
0.4563230
0
0.6666667
0.4563230
0
C07A46B5
775B36DB
C07A46B5
775B36DB
1
1
1
1.0000000
1.0000000
0.6898555
0
1.0000000
0.6898555
0
FB8EC365
C7F93EC4
FB8EC365
C7F93EC4
1
1
1
1.0000000
1.0000000
1.6256599
0
1.0000000
1.6256599
0
9E715D89
820A46E0
9E715D89
820A46E0
1
1
1
1.0000000
1.0000000
0.6844846
0
1.0000000
0.6844846
0
Finally Deduplicate Matches, DONE(ish)
col_score <- "sms"
tab_final1 <- dedup_data(
.score = tab_score,
.source = tab_source,
.target = tab_target,
.cols_match = cols_match,
.col_score = col_score
)
id_s
id_t
all_s
all_t
score
name_s
name_t
iso3_s
iso3_t
city_s
city_t
address_s
address_t
975AF234
67CC2286
975AF234
67CC2286
1
1000MERCIS
1000MERCIS
FRA
FRA
PARIS
PARIS
28 RUE DE CHATEAUDUN
28 RUE DE CHATEAUDUN
C07A46B5
775B36DB
C07A46B5
775B36DB
1
2G ENERGY AG
2G ENERGY AG
DEU
DEU
HEEK
HEEK
BENZSTRASSE 3
BENZSTRASSE 3
FB8EC365
C7F93EC4
FB8EC365
C7F93EC4
1
2INVEST AG
2INVEST AG
DEU
DEU
HEIDELBERG
HEIDELBERG
ZIEGELHAUSER LANDSTR 3
ZIEGELHAUSER LANDSTR 3
9E715D89
820A46E0
9E715D89
820A46E0
1
3U HOLDING AG
3U HOLDING AG
DEU
DEU
MARBURG
MARBURG
FRAUENBERGSTRASSE 31 33
FRAUENBERGSTRASSE 31 33
8953E4CF
566D4EAF
8953E4CF
566D4EAF
1
5TH PLANET GAMES A S
5TH PLANET GAMES A S
DNK
DNK
COPENHAGEN
COPENHAGEN
GOTHERSGADE 11
GOTHERSGADE 11
5F1D6968
3269F9E1
5F1D6968
3269F9E1
1
7C SOLARPARKEN AG
7C SOLARPARKEN AG
DEU
DEU
BAYREUTH
BAYREUTH
AN DER FEUERWACHE 15
AN DER FEUERWACHE 15
tab_final2 <- match_complete(
.source = table_source,
.target = table_target,
.cols_match = cols_match,
.cols_join = cols_join,
.cols_exact = cols_exact,
.method = method,
.standardize = TRUE,
.col_score = col_score
)
#> Calculating Block: 003-028
all.equal(tab_final1, tab_final2)
#> [1] TRUE
id_s
id_t
all_s
all_t
score
name_s
name_t
iso3_s
iso3_t
city_s
city_t
address_s
address_t
975AF234
67CC2286
975AF234
67CC2286
1
1000MERCIS
1000MERCIS
FRA
FRA
PARIS
PARIS
28 RUE DE CHATEAUDUN
28 RUE DE CHATEAUDUN
C07A46B5
775B36DB
C07A46B5
775B36DB
1
2G ENERGY AG
2G ENERGY AG
DEU
DEU
HEEK
HEEK
BENZSTRASSE 3
BENZSTRASSE 3
FB8EC365
C7F93EC4
FB8EC365
C7F93EC4
1
2INVEST AG
2INVEST AG
DEU
DEU
HEIDELBERG
HEIDELBERG
ZIEGELHAUSER LANDSTR 3
ZIEGELHAUSER LANDSTR 3
9E715D89
820A46E0
9E715D89
820A46E0
1
3U HOLDING AG
3U HOLDING AG
DEU
DEU
MARBURG
MARBURG
FRAUENBERGSTRASSE 31 33
FRAUENBERGSTRASSE 31 33
8953E4CF
566D4EAF
8953E4CF
566D4EAF
1
5TH PLANET GAMES A S
5TH PLANET GAMES A S
DNK
DNK
COPENHAGEN
COPENHAGEN
GOTHERSGADE 11
GOTHERSGADE 11
5F1D6968
3269F9E1
5F1D6968
3269F9E1
1
7C SOLARPARKEN AG
7C SOLARPARKEN AG
DEU
DEU
BAYREUTH
BAYREUTH
AN DER FEUERWACHE 15
AN DER FEUERWACHE 15
Please note that the Rmatch project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.