inst/doc/v2-reclin.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(blocking)
library(data.table)

## -----------------------------------------------------------------------------
data(census)
data(cis)

## -----------------------------------------------------------------------------
head(census)

## -----------------------------------------------------------------------------
head(cis)

## -----------------------------------------------------------------------------
set.seed(2024)
census <- census[sample(nrow(census), floor(nrow(census) / 2)), ]
cis <- cis[sample(nrow(cis), floor(nrow(cis) / 2)), ]

## -----------------------------------------------------------------------------
census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]

## -----------------------------------------------------------------------------
result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1)

## -----------------------------------------------------------------------------
hist(result1$result$dist, main = "Distribution of distances between pairs", xlab = "Distances")

## -----------------------------------------------------------------------------
head(result1$result, n = 10)

## -----------------------------------------------------------------------------
cbind(t(census[1, c(1:7, 9:10)]), t(cis[12088, 1:9]))

## -----------------------------------------------------------------------------
matches <- merge(x = census[, .(x=1:.N, person_id)],
                 y = cis[, .(y = 1:.N, person_id)],
                 by = "person_id")
matches[, block:=1:.N]
head(matches)

## -----------------------------------------------------------------------------
result2 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
                    true_blocks = matches[, .(x, y, block)])

## -----------------------------------------------------------------------------
result2

## -----------------------------------------------------------------------------
ann_control_pars <- controls_ann()
ann_control_pars$nnd$epsilon <- 0.2

result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1, 
                    true_blocks = matches[, .(x, y, block)], 
                    control_ann = ann_control_pars)

## -----------------------------------------------------------------------------
result3

## -----------------------------------------------------------------------------
result4 <- blocking(x = census$txt, y = cis$txt, verbose = 1, 
                    true_blocks = matches[, .(x, y, block)], 
                    ann = "hnsw")

## -----------------------------------------------------------------------------
result4

## -----------------------------------------------------------------------------
c("no tuning" = mean(result2$result[order(y)]$x == result4$result[order(y)]$x)*100,
  "with tuning" = mean(result3$result[order(y)]$x == result4$result[order(y)]$x)*100)

Try the blocking package in your browser

Any scripts or data that you put into this service are public.

blocking documentation built on June 18, 2025, 9:16 a.m.