Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(blocking)
library(data.table)
## -----------------------------------------------------------------------------
data(census)
data(cis)
## -----------------------------------------------------------------------------
head(census)
## -----------------------------------------------------------------------------
head(cis)
## -----------------------------------------------------------------------------
set.seed(2024)
census <- census[sample(nrow(census), floor(nrow(census) / 2)), ]
cis <- cis[sample(nrow(cis), floor(nrow(cis) / 2)), ]
## -----------------------------------------------------------------------------
census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
## -----------------------------------------------------------------------------
result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1)
## -----------------------------------------------------------------------------
hist(result1$result$dist, main = "Distribution of distances between pairs", xlab = "Distances")
## -----------------------------------------------------------------------------
head(result1$result, n = 10)
## -----------------------------------------------------------------------------
cbind(t(census[1, c(1:7, 9:10)]), t(cis[12088, 1:9]))
## -----------------------------------------------------------------------------
matches <- merge(x = census[, .(x=1:.N, person_id)],
y = cis[, .(y = 1:.N, person_id)],
by = "person_id")
matches[, block:=1:.N]
head(matches)
## -----------------------------------------------------------------------------
result2 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)])
## -----------------------------------------------------------------------------
result2
## -----------------------------------------------------------------------------
ann_control_pars <- controls_ann()
ann_control_pars$nnd$epsilon <- 0.2
result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)],
control_ann = ann_control_pars)
## -----------------------------------------------------------------------------
result3
## -----------------------------------------------------------------------------
result4 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)],
ann = "hnsw")
## -----------------------------------------------------------------------------
result4
## -----------------------------------------------------------------------------
c("no tuning" = mean(result2$result[order(y)]$x == result4$result[order(y)]$x)*100,
"with tuning" = mean(result3$result[order(y)]$x == result4$result[order(y)]$x)*100)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.