# doc/Multivar_matching.R

## ---- include = FALSE---------------------------------------------------------
# Global knitr chunk options for this vignette: merge each chunk's source and
# output into one block and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(fedmatch)
library(data.table)

## -----------------------------------------------------------------------------

# Load both example data sets from the fedmatch package in a single call.
data(list = c("corp_data1", "corp_data2"), package = "fedmatch")

## -----------------------------------------------------------------------------
# Work on copies: setDT() and `:=` below modify their table by reference, and
# the loaded example data sets should stay untouched.
corp_data1_test <- copy(corp_data1)
corp_data2_test <- copy(corp_data2)
data.table::setDT(corp_data1_test)
data.table::setDT(corp_data2_test)

# Row-number keys for each table. seq_len() yields an empty vector for a
# zero-row table, whereas seq(1, .N) would count down from 1 to 0.
corp_data1_test[, id_1 := seq_len(.N)]
corp_data2_test[, id_2 := seq_len(.N)]

# Add `Country` and `Company` columns to the second table (copied from its
# `country` and `Name` columns) so the `by` columns exist under those names.
corp_data2_test[, `:=`(Country = country, Company = Name)]

# Multivar match on Country and Company with hand-picked, equal weights.
# NOTE(review): compare_type and wgts are presumably paired in order with
# the entries of `by` -- confirm against the fedmatch documentation.
result <- merge_plus(
  data1 = corp_data1_test,
  data2 = corp_data2_test,
  match_type = "multivar",
  by = c("Country", "Company"),
  suffixes = c("_1", "_2"),
  unique_key_1 = "id_1",
  unique_key_2 = "id_2",
  multivar_settings = build_multivar_settings(
    compare_type = c("indicator", "stringdist"),
    wgts = c(.5, .5),
    nthread = 1
  )
)
result

## -----------------------------------------------------------------------------
# For each `by` variable, show the two suffixed source columns next to the
# corresponding `_compare` column of the match table.
print(result[["matches"]][, list(Company_1, Company_2, Company_compare)])
print(result[["matches"]][, list(Country_1, Country_2, Country_compare)])

## -----------------------------------------------------------------------------

# Show both `_compare` columns alongside the resulting `multivar_score`.
print(
  result[["matches"]][, list(Company_compare, Country_compare, multivar_score)]
)

## -----------------------------------------------------------------------------
# Build a small hand-made table of paired records (suffix `_1` / `_2` per
# variable) and let calculate_weights() derive weights from it.
set.seed(111)
fake_result_table <- data.table::data.table(
  Company_1 = c("ABC Corp", "XYZ Corporation", "Apple Corp", "Banana Corp"),
  Company_2 = c(
    "ABC Corporation", "XYZ Inc", "Apple Incorporated", "Banana Stand"
  ),
  Country_1 = c("USA", "USA", "TUR", "USA"),
  Country_2 = c("MEX", "USA", "TUR", "USA")
)

# Company is compared with "stringdist", Country with "indicator".
calculated_weights <- calculate_weights(
  fake_result_table,
  c("Company", "Country"),
  compare_type = c("stringdist", "indicator"),
  suffixes = c("_1", "_2")
)

# Display the `w` element of the returned object.
calculated_weights$w


## -----------------------------------------------------------------------------
# Fresh working copies: setDT() and `:=` below modify their table by
# reference, and the loaded example data sets should stay untouched.
corp_data1_test <- copy(corp_data1)
corp_data2_test <- copy(corp_data2)
data.table::setDT(corp_data1_test)
data.table::setDT(corp_data2_test)

# Row-number keys for each table. seq_len() yields an empty vector for a
# zero-row table, whereas seq(1, .N) would count down from 1 to 0.
corp_data1_test[, id_1 := seq_len(.N)]
corp_data2_test[, id_2 := seq_len(.N)]

# Add `Country` and `Company` columns to the second table (copied from its
# `country` and `Name` columns) so the `by` columns exist under those names.
corp_data2_test[, `:=`(Country = country, Company = Name)]

# Simulated training data for the logit model. The seed is set once, right
# before the first random draw; nothing above consumes random numbers.
# `match` samples from c(1, 0, 1), so 1 is drawn about twice as often as 0.
set.seed(111)
fake_result_table <- data.table::data.table(
  match = sample(c(1, 0, 1), 1e5, replace = TRUE),
  Company_compare = runif(1e5),
  Country_compare = sample(c(1, 0), 1e5, replace = TRUE)
)

# Logistic regression of the match flag on the two comparison columns.
logit_model <- glm(
  match ~ Company_compare + Country_compare,
  family = "binomial",
  data = fake_result_table
)

summary(logit_model)

# Multivar match that passes the fitted logit model instead of fixed weights
# (wgts = NULL).
result <- merge_plus(
  data1 = corp_data1_test,
  data2 = corp_data2_test,
  match_type = "multivar",
  by = c("Country", "Company"),
  suffixes = c("_1", "_2"),
  unique_key_1 = "id_1",
  unique_key_2 = "id_2",
  multivar_settings = build_multivar_settings(
    logit = logit_model,
    compare_type = c("indicator", "stringdist"),
    wgts = NULL,
    nthread = 1
  )
)
result

## -----------------------------------------------------------------------------
# Fresh working copies: setDT() and `:=` below modify their table by
# reference, and the loaded example data sets should stay untouched.
corp_data1_test <- copy(corp_data1)
corp_data2_test <- copy(corp_data2)
data.table::setDT(corp_data1_test)
data.table::setDT(corp_data2_test)

# Row-number keys for each table. seq_len() yields an empty vector for a
# zero-row table, whereas seq(1, .N) would count down from 1 to 0.
corp_data1_test[, id_1 := seq_len(.N)]
corp_data2_test[, id_2 := seq_len(.N)]

# Add `Country` and `Company` columns to the second table (copied from its
# `country` and `Name` columns) so both are available under those names.
corp_data2_test[, `:=`(Country = country, Company = Name)]

# Multivar match on Company only, with Country passed as the `blocks`
# setting rather than as a compared variable.
result <- merge_plus(
  data1 = corp_data1_test,
  data2 = corp_data2_test,
  match_type = "multivar",
  by = c("Company"),
  suffixes = c("_1", "_2"),
  unique_key_1 = "id_1",
  unique_key_2 = "id_2",
  multivar_settings = build_multivar_settings(
    compare_type = c("stringdist"),
    wgts = c(1),
    nthread = 1,
    blocks = "Country"
  )
)
result$matches
# seunglee98/fedmatch documentation built on April 26, 2024, 10:24 a.m.