vignettes/example_integrated.R

library(reclin)
library(tidyverse)
library(purrr)



theme_set(theme_minimal())
data_dir <- file.path('TokenLink','data')
setwd('C:/Users/hswerdfe/projects/')


orig_dat <-
  data_dir |>
  file.path('generated_dataset.csv') |>
  read_csv() |>
  replace_na(list(company_name = '',
                  address = '',
                  first_name = '',
                  last_name = '',
                  age = 0
  ))|>
  mutate_all(as.character)

edited_dat <-
  data_dir |>
  file.path('generated_dataset_random_edits.csv') |>
  read_csv() |>
  replace_na(list(company_name = '',
                  address = '',
                  first_name = '',
                  last_name = '',
                  age = 0
  )) |>
  mutate_all(as.character)




bind_cols(orig_dat, edited_dat |> rename_all(\(x){paste0(x,"_edited")})) |>
  {\(.)select(., order(colnames(.)))}() |>
  sample_n(5) |>
  knitr::kable(caption = 'original and edited data')

blocked_pairs <- reclin_pair_blocking(x = orig_dat,
                                      y = edited_dat,
                                      blocking_var = c('first_name', 'last_name'), #Block on Any of these Columns
                                      token_types =  c('company_name', 'address'), #Block on Any of these tokens
                                      col_nms_x = c('company_name', 'address'),    # Column Names
                                      col_nms_y =  c('company_name', 'address'),   # Column Names
                                      min_token_u_prob = 0.0000784)               # min u_prob to consider blocking on

blocked_pairs |>
  as_tibble()






# Compare pairs in Reclin using First and last name
p <- reclin::compare_pairs(blocked_pairs,
                           by = c('first_name', 'last_name'),
                           default_comparator = jaro_winkler(0.9))

m <- problink_em(p)
p <- score_simsum(p, var = "sim_sum")
p <- score_problink(p, model = m, var = "scores", type  = 'all')

p |>
  sample_n(5) |>
  knitr::kable(caption = 'Show scores cenerated from Reclin')





refined_p <-
  refine_posterior(p = p,
                   x_dat = orig_dat,
                   y_dat = edited_dat,
                   weights_nm = 'scores_weight',
                   args_x = list(col_nms = c('company_name', 'address')),
                   args_y = list(col_nms = c('company_name', 'address')),
                   token_types = c('company_name', 'address')
  )

refined_p |>
  mutate(is_same = (x == y)) |>
  mutate(delta_belief = posterior - priori) |>
  ggplot(aes(y = delta_belief, x = priori,  color = is_same )) +
  geom_jitter(alpha = 0.05, width = 0.05, height = 0.05) +
  geom_hline(yintercept=0, linetype="dashed", color = "black", size = 1.25) +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = 'How much do the token columns change our belief in the Match?',
       y = 'Change in Belief',
       color = 'Actually the same') +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
csps-efpc/TokenLink documentation built on Feb. 10, 2023, 3:30 a.m.