inst/doc/v1-deduplication.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(blocking)
library(data.table)

## -----------------------------------------------------------------------------
data(RLdata500)
head(RLdata500)

## -----------------------------------------------------------------------------
RLdata500[, id_count :=.N, ent_id] ## how many times given unit occurs
RLdata500[, bm:=sprintf("%02d", bm)] ## add leading zeros to month
RLdata500[, bd:=sprintf("%02d", bd)] ## add leading zeros to day
RLdata500[, txt:=tolower(paste0(fname_c1,fname_c2,lname_c1,lname_c2,by,bm,bd))]
head(RLdata500)

## -----------------------------------------------------------------------------
df_blocks <- blocking(x = RLdata500$txt, ann = "nnd", verbose = 1, graph = TRUE, seed = 2024)

## -----------------------------------------------------------------------------
df_blocks

## -----------------------------------------------------------------------------
str(df_blocks,1)

## -----------------------------------------------------------------------------
plot(df_blocks$graph, vertex.size=1, vertex.label = NA)

## -----------------------------------------------------------------------------
head(df_blocks$result)

## -----------------------------------------------------------------------------
df_block_melted <- melt(df_blocks$result, id.vars = c("block", "dist"))
df_block_melted_rec_block <- unique(df_block_melted[, .(rec_id=value, block)])
head(df_block_melted_rec_block)

## -----------------------------------------------------------------------------
RLdata500[df_block_melted_rec_block, on = "rec_id", block_id := i.block]
head(RLdata500)

## -----------------------------------------------------------------------------
RLdata500[, .(uniq_blocks = uniqueN(block_id)), .(ent_id)][, .N, uniq_blocks]

## -----------------------------------------------------------------------------
hist(df_blocks$result$dist, xlab = "Distances", ylab = "Frequency", breaks = "fd",
     main = "Distances calculated between units")

## -----------------------------------------------------------------------------
df_for_density <- copy(df_block_melted[block %in% RLdata500$block_id])
df_for_density[, match:= block %in% RLdata500[id_count == 2]$block_id]

plot(density(df_for_density[match==FALSE]$dist), col = "blue", xlim = c(0, 0.8), 
     main = "Distribution of distances between\nclusters type (match=red, non-match=blue)")
lines(density(df_for_density[match==TRUE]$dist), col = "red", xlim = c(0, 0.8))

Try the blocking package in your browser

Any scripts or data that you put into this service are public.

blocking documentation built on June 18, 2025, 9:16 a.m.