# doc/Intro-to-fedmatch.R

## ---- include = FALSE---------------------------------------------------------
# Vignette-wide knitr chunk options: collapse each chunk's source and output
# into a single block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup, include = F-------------------------------------------------------
library(fedmatch)
library(data.table)

## -----------------------------------------------------------------------------
# Example firm names with inconsistent punctuation, spacing and suffixes
# (note the trailing space in the second entry).
raw_names <- c(
  "Hamlin, Hamlin, McGill",
  "Schweibert & Cokely ",
  "Wexler McGill, LLC",
  "Davis and Main, Inc."
)

# Clean the names with clean_strings() using its default arguments, then
# display the result.
clean_names <- clean_strings(raw_names)
clean_names

## -----------------------------------------------------------------------------
# Show entries 1:5 of the corporate word list shipped with fedmatch, then
# clean the same names again with that list supplied as `common_words`.
fedmatch::corporate_words[1:5]
scrubbed_names <- clean_strings(
  raw_names,
  common_words = fedmatch::corporate_words
)
scrubbed_names

## -----------------------------------------------------------------------------
# Display the two example data sets that ship with fedmatch.
fedmatch::corp_data1
fedmatch::corp_data2


## -----------------------------------------------------------------------------
# Exact-match merge of the two example tables: `Company` in corp_data1 is
# joined against `Name` in corp_data2, and each table supplies its own unique
# key column.
basic_merge <- merge_plus(
  data1 = corp_data1,
  data2 = corp_data2,
  match_type = "exact",
  by.x = "Company",
  by.y = "Name",
  unique_key_1 = "unique_key_1",
  unique_key_2 = "unique_key_2"
)

## -----------------------------------------------------------------------------
# Walk through the components of the list returned by merge_plus(), printing
# each one. `[[` is used for exact-name extraction.
print(basic_merge[["matches"]])

## -----------------------------------------------------------------------------
print(basic_merge[["matches_filter"]])

## -----------------------------------------------------------------------------
# The `*_nomatch` components, one per input table.
print(basic_merge[["data1_nomatch"]])
print(basic_merge[["data2_nomatch"]])

## -----------------------------------------------------------------------------
print(basic_merge[["match_evaluation"]])

## -----------------------------------------------------------------------------
# Fuzzy match of `Company` (corp_data1) against `Name` (corp_data2). No
# `fuzzy_settings` argument is given, so merge_plus() uses its defaults.
fuzzy_result <- merge_plus(
  data1 = corp_data1,
  data2 = corp_data2,
  match_type = "fuzzy",
  by.x = "Company",
  by.y = "Name",
  unique_key_1 = "unique_key_1",
  unique_key_2 = "unique_key_2"
)
print(fuzzy_result$matches)

## -----------------------------------------------------------------------------
# Fuzzy match again, this time with explicit settings: build_fuzzy_settings()
# is called with maxDist = 0.5 and the result passed as `fuzzy_settings`.
# This overwrites the earlier `fuzzy_result`.
fuzzy_result <- merge_plus(
  data1 = corp_data1,
  data2 = corp_data2,
  match_type = "fuzzy",
  fuzzy_settings = build_fuzzy_settings(maxDist = 0.5),
  by.x = "Company",
  by.y = "Name",
  unique_key_1 = "unique_key_1",
  unique_key_2 = "unique_key_2"
)
print(fuzzy_result$matches)

## -----------------------------------------------------------------------------
# For simplicity's sake, give corp_data2 the same column names as corp_data1.
#
# data.table::setnames() renames by reference. Applying it directly to the
# lazy-loaded package object can leave fedmatch::corp_data2 altered for the
# rest of the session, and makes this chunk fail when it is run a second time
# (the old names "Name"/"country" are no longer present). Work on a private
# copy instead; later chunks pick up this renamed copy via `corp_data2`.
corp_data2 <- data.table::copy(fedmatch::corp_data2)
data.table::setnames(
  corp_data2,
  old = c("Name", "country"),
  new = c("Company", "Country")
)

# Multivariate match on Country and Company with a linear score:
# `compare_type` and `wgts` each have one entry per `by` column (presumably
# matched positionally -- confirm against build_multivar_settings()), and
# `top = 1` is passed through to the settings builder.
multivar_linear_result <- merge_plus(
  data1 = corp_data1,
  data2 = corp_data2,
  match_type = "multivar",
  by = c("Country", "Company"),
  suffixes = c("_1", "_2"),
  unique_key_1 = "unique_key_1",
  unique_key_2 = "unique_key_2",
  multivar_settings = build_multivar_settings(
    compare_type = c("indicator", "stringdist"),
    wgts = c(.5, .5),
    top = 1
  )
)
multivar_linear_result$matches

## -----------------------------------------------------------------------------
# Build a synthetic 1e5-row training table and fit a logit model on it.
# The seed is fixed so that the two sample() draws below are reproducible.
set.seed(111)
training_table <- data.table::data.table(
  # First 5e4 rows are labelled 1; the remaining 5e4 labels are drawn at
  # random from {0, 1}.
  match = c(rep(1, 5e4), sample(c(0, 1), 5e4, replace = TRUE)),
  # Decreasing sequence from 1 down to 0.00001 in steps of 0.00001.
  Company_compare = seq(1, 0.00001, by = -0.00001),
  # First 5e4 rows are 1; the rest are drawn at random from {1, 0}.
  Country_compare = c(rep(1, 5e4), sample(c(1, 0), 5e4, replace = TRUE))
)
# training_table
logit_model <- glm(
  match ~ Company_compare + Country_compare,
  family = "binomial",
  data = training_table
)
summary(logit_model)

## -----------------------------------------------------------------------------
# Multivariate match on Country and Company that passes the fitted
# `logit_model` in the settings, with `wgts` left NULL.
# Relies on corp_data2 already carrying the "Company"/"Country" column names.
# NOTE(review): the settings are given as a plain list here rather than via
# build_multivar_settings(); confirm the omitted fields are intentional.
result <- merge_plus(
  data1 = corp_data1,
  data2 = corp_data2,
  match_type = "multivar",
  by = c("Country", "Company"),
  suffixes = c("_1", "_2"),
  unique_key_1 = "unique_key_1",
  unique_key_2 = "unique_key_2",
  multivar_settings = list(
    logit = logit_model,
    compare_type = c("indicator", "stringdist"),
    wgts = NULL
  )
)
result$matches
# seunglee98/fedmatch documentation built on April 26, 2024, 10:24 a.m.