Link two data sets using tokens or words in common between them.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
source('../R/tokenify.R')
source('R/tokenify.R') devtools::install_github("csps-efpc/TokenLink")
library(reclin) library(tidyverse) library(purrr) theme_set(theme_minimal()) data_dir <- file.path('..','data')
orig_dat <- data_dir |> file.path('generated_dataset.csv') |> read_csv() |> replace_na(list(company_name = '', address = '', first_name = '', last_name = '', age = 0 ))|> mutate_all(as.character) edited_dat <- data_dir |> file.path('generated_dataset_random_edits.csv') |> read_csv() |> replace_na(list(company_name = '', address = '', first_name = '', last_name = '', age = 0 )) |> mutate_all(as.character) bind_cols(orig_dat, edited_dat |> rename_all(\(x){paste0(x,"_edited")})) |> {\(.)select(., order(colnames(.)))}() |> sample_n(5) |> knitr::kable(caption = 'original and edited data')
blocked_pairs <- reclin_pair_blocking(x = orig_dat, y = edited_dat, blocking_var = c('first_name', 'last_name'), #Block on Any of these Columns token_types = c('company_name', 'address'), #Block on Any of these tokens col_nms_x = c('company_name', 'address'), # Column Names col_nms_y = c('company_name', 'address'), # Column Names min_token_u_prob = 0.0000784) # min u_prob to consider blocking on blocked_pairs |> as_tibble()
# Compare pairs in Reclin using First and last name p <- reclin::compare_pairs(blocked_pairs, by = c('first_name', 'last_name'), default_comparator = jaro_winkler(0.9)) m <- problink_em(p) p <- score_simsum(p, var = "sim_sum") p <- score_problink(p, model = m, var = "scores", type = 'all') p |> sample_n(5) |> knitr::kable(caption = 'Show scores cenerated from Reclin')
print(getwd()) refined_p <- refine_posterior(p = p, x_dat = orig_dat, y_dat = edited_dat, weights_nm = 'scores_weight', args_x = list(col_nms = c('company_name', 'address')), args_y = list(col_nms = c('company_name', 'address')), token_types = c('company_name', 'address') ) refined_p |> mutate(is_same = (x == y)) |> mutate(delta_belief = posterior - priori) |> ggplot(aes(y = delta_belief, x = priori, color = is_same )) + geom_jitter(alpha = 0.05, width = 0.05, height = 0.05) + geom_hline(yintercept=0, linetype="dashed", color = "black", size = 1.25) + scale_x_continuous(labels = scales::percent) + scale_y_continuous(labels = scales::percent) + labs(title = 'How much do the token columns change our belief in the Match?', y = 'Change in Belief', color = 'Actually the same') + guides(colour = guide_legend(override.aes = list(alpha = 1)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.