# inst/doc/streaming-rl-howto.R

## ---- include = FALSE---------------------------------------------------------
# Vignette-wide knitr chunk options: collapse each chunk's source and output
# into one block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## ----setup--------------------------------------------------------------------
library(bstrl)

## ----showdata-----------------------------------------------------------------
# Number of elements in the example data `geco_small` (its elements are the
# files passed to the linkage calls later in this script).
length(geco_small)
# First rows of the first element.
head(geco_small[[1]])

## ----preparecomparisons-------------------------------------------------------
# Names of the columns on which to perform linkage: four named fields followed
# by extra1 ... extra6.
fieldnames <- c(
  "given.name", "surname", "age", "occup",
  paste0("extra", 1:6)
)

# How to compare each field, in the same order as `fieldnames`:
#   "lv" - normalized edit distance (first name and last name)
#   "bi" - binary equal/unequal (all remaining fields)
types <- c(rep("lv", 2), rep("bi", length(fieldnames) - 2))

# Split points that break continuous difference measures into 4 levels.
breaks <- c(0, 0.25, 0.5)



## ----streaming----------------------------------------------------------------
# Link the first two files. The result is reused below, both as the input to
# the first PPRB update and (after thinning) to the first SMCMC update.
res.twofile <- bipartiteRL(
  geco_small[[1]], geco_small[[2]],
  flds = fieldnames,
  types = types,
  breaks = breaks,
  nIter = 600,
  burn = 100,
  seed = 0
)

## ----pprbupdate---------------------------------------------------------------
# Add files 3 and 4 to the existing estimate, one PPRB update per file.
# Only the new file and the sampler settings are supplied here: the comparison
# details are stored with the previous result.
res.pprb3 <- PPRBupdate(
  res.twofile, geco_small[[3]],
  nIter = 600, burn = 100,
  seed = 0,
  refresh = 0.05
)
res.pprb4 <- PPRBupdate(
  res.pprb3, geco_small[[4]],
  nIter = 600, burn = 100,
  seed = 0,
  refresh = 0.05
)

## ----filtersamples------------------------------------------------------------
# Thin the two-file result before the SMCMC updates: the full 500 samples are
# not needed, 50 is enough for this demonstration.
filtered <- thinsamples(res.twofile, 50)

## ----smcmcupdate--------------------------------------------------------------
# Add files 3 and 4 with SMCMC updates, starting from the thinned samples.
# Either proposal type can be "LB" instead, but then increase the
# corresponding number of iterations.
res.smcmc3 <- SMCMCupdate(
  filtered, geco_small[[3]],
  nIter.jumping = 2, nIter.transition = 8,
  proposals.jumping = "component", proposals.transition = "component",
  cores = 2 # Parallel execution
)
res.smcmc4 <- SMCMCupdate(
  res.smcmc3, geco_small[[4]],
  nIter.jumping = 2, nIter.transition = 8,
  proposals.jumping = "component", proposals.transition = "component",
  cores = 2 # Parallel execution
)

## ----resultsobject------------------------------------------------------------
# Names of the components stored in the final PPRB result.
names(res.pprb4)

## ----objectsizes--------------------------------------------------------------
# Dimensions of the Z, m and u components.
# All have 500 columns (600 iterations minus a burn-in of 100, as set in the
# PPRBupdate calls) and different numbers of rows.
dim(res.pprb4$Z)
dim(res.pprb4$m)
dim(res.pprb4$u)

## ----examiningZ---------------------------------------------------------------
# The first post-burn MCMC sample of Z, i.e. the first column of the Z matrix.
# NOTE(review): how to interpret the values of Z is explained in the vignette
# text, not in this script - confirm there before relying on the notes below.
Zexample <- res.pprb4$Z[,1]
Zexample

## ----examplelink--------------------------------------------------------------
# Entry 8 of that sample, shown on its own.
Zexample[8]

## ----examplecluster-----------------------------------------------------------
# Entries 27 and 8 of the same sample, printed one after the other so that
# their values can be compared.
Zexample[27]
Zexample[8]

## ----linkprocessing-----------------------------------------------------------
# Create a list of length 500, where each element is one streaming link object
# for each posterior sample.
samples <- extractlinks(res.pprb4)

# Are record 9 in file 1 and record 7 in file 4 linked in the first posterior
# sample?
islinked(samples[[1]], file1 = 1, record1 = 9, file2 = 4, record2 = 7)

# The proportions below use vapply() rather than sapply(): it requires exactly
# one logical value per sample, so an unexpected return value fails loudly
# instead of silently changing what mean() is computed over.

# In what proportion of posterior samples are record 9 in file 1 and record 7
# in file 4 linked?
mean(vapply(samples, islinked, FUN.VALUE = logical(1),
            file1 = 1, record1 = 9, file2 = 4, record2 = 7))

# In what proportion of posterior samples are record 8 in file 1 and record 1
# in file 2 linked?
mean(vapply(samples, islinked, FUN.VALUE = logical(1),
            file1 = 1, record1 = 8, file2 = 2, record2 = 1))

## ----multifile----------------------------------------------------------------
# Non-streaming alternative: link the first three files jointly from scratch.
# With 600 iterations and a burn-in of 100 this returns 500 posterior samples.
res.threefile <- multifileRL(
  geco_small[1:3],
  flds = fieldnames, types = types, breaks = breaks,
  nIter = 600, burn = 100, # Number of iterations to run
  proposals = "comp", # Change to "LB" for faster iterations, slower convergence
  seed = 0,
  refresh = 0.05 # Print progress every 5% of run.
)

# Try the bstrl package in your browser
#
# Any scripts or data that you put into this service are public.
#
# bstrl documentation built on Nov. 11, 2022, 1:06 a.m.