# Global knitr chunk options for every code chunk in this vignette.
knitr::opts_chunk$set(
  # Render each chunk's source and its printed output in a single block.
  collapse = TRUE,
  # Prefix printed output lines with "#>" so they read as comments.
  comment = "#>"
)
library(recordlinkR)

# Load sample Iowa data
# NOTE(review): presumably this makes iowa_1915, iowa_1940 and
# iowa_sample_matches (all used below) available -- confirm against the
# package's data documentation.
data(iowa_sample)

Dataset Description

The sample Iowa dataset contains last name, first name, year of birth, and middle initial for a subset of names from the 1915 and the 1940 Iowa Census. Unique identifiers uid1915 and hhid are available in the iowa_1915 and iowa_1940 datasets respectively, but do not correspond across datasets. The rows of iowa_matches denote true links using the row index from iowa_1915 and iowa_1940.

# Preview the first rows of the 1915 sample to see the available columns.
head(iowa_1915)

Create Blocks

# Load sample encoders
# (presumably this is what makes encoder_iowa_first_4 available -- confirm)
loadSampleEncoders()

# Map each dataset ("A" = 1915, "B" = 1940) to the column that is encoded
# for blocking: the first-name column of each census sample.
cols.encoder <- list(A = "fname1915", B = "fname1940")

# Block the two samples on the encoded first names, using the "cluster"
# block method with 10 clusters, on a single core.
blocks <- block(
  iowa_1915,
  iowa_1940,
  cols.encoder = cols.encoder,
  encoder.model.path = encoder_iowa_first_4,
  encoder.block.method = "cluster",
  encoder.nclusters = 10,
  n.cores = 1
)

If a set of true matches is known, you can check how many pairs remain after blocking. In this case we have reduced the total number of pairs to compare to 10.7% of the original number while retaining 92.2% of the sample matches in the remaining set.

# Repeat the blocking step, this time also passing the known true links so
# that blocking can be checked against them. Only rows flagged match == 1 are
# kept, and only their first two columns are passed on.
# NOTE(review): the comma-less row filter `x[cond][, 1:2]` relies on
# data.table semantics -- confirm iowa_sample_matches is a data.table (on a
# plain data.frame a logical index without a comma selects columns).
blocks <- block(
  iowa_1915,
  iowa_1940,
  cols.encoder = cols.encoder,
  encoder.model.path = encoder_iowa_first_4,
  encoder.block.method = "cluster",
  encoder.nclusters = 10,
  known.matches = iowa_sample_matches[iowa_sample_matches$match == 1][, 1:2],
  n.cores = 1
)

AUC plot

# Unpack the blocking result: the two data frames it returns and the first
# 1000 candidate pairs.
# NOTE(review): dfA and dfB are not used by the encode() call below, which
# indexes the original iowa_1915 / iowa_1940 tables instead -- confirm that
# is intended.
dfA <- blocks[["dfA"]]
dfB <- blocks[["dfB"]]
block.pairs <- blocks[["blocks"]][1:1000, ]

# Encode and check AUC
# Column V1 of block.pairs selects the rows of iowa_1915 and column V2 the
# rows of iowa_1940; the first names of the selected rows are encoded with
# the encoder_iowa_first_256 model.
fname.encoded <- encode(
  iowa_1915[block.pairs$V1, ],
  iowa_1940[block.pairs$V2, ],
  cols.encoder = list(A = "fname1915", B = "fname1940"),
  encoder.model.path = encoder_iowa_first_256
)

# Take the first element of the encodings returned for each side.
fname1915.encoded <- fname.encoded[["encoded.A"]][[1]]
fname1940.encoded <- fname.encoded[["encoded.B"]][[1]]
# Cosine similarity between two numeric vectors.
#
# Arguments:
#   A, B  Numeric vectors of equal length. Single-row or single-column
#         matrices are accepted and flattened to plain vectors.
#
# Returns:
#   A single numeric value in [-1, 1]: the inner product of A and B divided
#   by the product of their Euclidean lengths. The result is NaN when either
#   input has zero length (all elements zero), since the angle is undefined.
#
# Errors:
#   Stops if A and B do not have the same number of elements.
cos.similarity <- function(A, B) {
  a <- as.numeric(A)
  b <- as.numeric(B)
  stopifnot(length(a) == length(b))

  # base::norm() only accepts matrices and defaults to the one norm
  # (type = "O"), which is not the vector length cosine similarity needs;
  # compute the Euclidean (L2) lengths explicitly instead.
  len.a <- sqrt(sum(a^2))
  len.b <- sqrt(sum(b^2))

  sum(a * b) / (len.a * len.b)
}

Making comparisons

# encoder.string.cols <- list('A'='fname1915', 'B'='fname1940') 
# comparisons <- compare(dfA = dfA, dfB = dfB, 
#                        blocks = block.pairs, 
#                        compare.string.encoder = encoder.string.cols, 
#                        encoder.model.path = encoder_iowa_first_512)


kailin-lu/recordlinkR documentation built on May 4, 2019, 7:37 a.m.