knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(fastLink)
library(lubridate)
library(RecordLinkUtil)
library(knitr)
library(dplyr)
library(stringdist)

The Entity Matching Problem:

Given two datasets without shared unique identifers, how can we determine which records represent the same real world "entity"?

Example: Matching health insurance claims and death certificates in maine over the last 5 years.

Generating Data:

We can generate claims and death certificate data that reflects this situation:

#Start and end dates for death certificate data
start_date = ymd("2015-1-1")
start_date_num = as.numeric(start_date)
end_date = ymd("2019-1-3")
end_date_num = as.numeric(end_date)

data_size = 1000
data <- tibble(id = 1:data_size,
                   town = sample_towns_by_pop(data_size), 
                   birthyear = sample(data_size, x = 1960:2000, replace = TRUE),
                   sex = sample(data_size, x = c("M", "F"), replace = TRUE),
                   deathdate = as_date(sample(data_size, x = start_date_num:end_date_num, replace = TRUE)))

dth_crt = data[1:500,]

kable(dth_crt[1:10,])


claims <-  mutate(dth_crt,
                          birthyear = corrupt_numeric(birthyear, error_rate = .3, scale = 1),
                          town = corrupt_replace(town,error_rate = .05, pool = dth_crt$town),
                          town = corrupt_character(town),
                          deathdate = as_date(corrupt_numeric(as.numeric(deathdate), error_rate = .9,  scale = 10)))

kable(claims[1:10,])
look = cbind(data, claims)
kable(look[1:10,])

The Naive Aproach

Joining exactly on all fields yeilds very few matches:

perfect_matches = inner_join(dth_crt, claims, by = c("town", "birthyear", "sex", "deathdate"))
print(paste0("Only ", nrow(perfect_matches), " perfect matches out of ", nrow(dth_crt)))

The approach using fastLink

The fastLink solution:

claims <- mutate(claims, deathdate = as.numeric(deathdate), birthyear = as.numeric(birthyear))
dth_crt <- mutate(dth_crt, deathdate = as.numeric(deathdate), birthyear = as.numeric(birthyear))
result <- fastLink(claims, dth_crt, 
                   varnames = c("town", "birthyear", "sex", "deathdate"),
                   numeric.match = c("birthyear", "deathdate"),
                   stringdist.match = c("town"),
                   partial.match = c("town", "deathdate"),
                   cut.a.num = 1, cut.p.num = 14,
                   cut.a = .94, cut.p = .88,
                   threshold.match = .85)

print(paste0("Result returns an object of class: ", class(result)))
print(paste0(" with fields: ", paste( names(result), collapse = ", ")))
c_table <- getConfusion(claims, dth_crt, result$matches, "id")
c_table_info <- getConfusionInfo(c_table)
kable(c_table)
kable(c_table_info)

Fellegi-Sunter Model of Record Linkage:

A formalization and general algorithm for record linkage.

  1. Calculating the comparison vector for each pair of records
  2. Finding the probability of any given comparison vector given a match/nonmatch.
  3. Assigning the weights to each comparison vector & finding matches

1. Calculating Comparison vectors:

-Comparison Vector: a formal respresentation of the differencebetween two records -Usually in terms of the numeric "distances" between each feild being matched on

get_comp_vec <- function(a, b){
  out <- list()
  town_dist <- 1 - stringdist(a$town, b$town, method = "jw")
  yob_dist <- abs(a$birthyear - b$birthyear)
  sex_dist<- as.numeric(!(a$sex == b$sex))
  deathdate_dist <- abs(as.numeric(a$deathdate)- as.numeric(b$deathdate))
  return(tibble("town_dist" = town_dist, "yob_dist" =yob_dist, "sex_dist" =sex_dist, "deathdate_dist" = deathdate_dist))
}

Comparison vectors Example 1:

record_1 <- dth_crt[1,]
print("The first record in the dth_crt data is :")
kable(record_1)
print("The first record in the claims data is :")
record_2 <- claims[1,]
kable(record_2)

print("The comparison vector between the first records of the claims and crt data is:")
kable(get_comp_vec(record_1,record_2))

Comparison vectors Example 2:

record_1 <- dth_crt[1,]
print("The first record in the dth_crt data is :")
kable(record_1)
print("The second record in the claims data is :")
record_3 <- claims[2,]
kable(record_3)

print("The comparison vector between the first record of the claims and the second record of crt data is:")
kable(get_comp_vec(record_1,record_3))

Bucketing Comparison Vectors

FastLink and most other open source packages assign discrete cutoffs for each type of distance, that classifies the range of possible distances given by that metric into three similarity levels: very similar, similar, and completly different.

These scores are encoded as integers 2, 1, and 0 respectivly.

In our example, we can assign cutoffs 2, 10 for numeric distances, .98, .8 for string distances, and do exact matching on gender.

Given these cutoffs, our previous comparison vectors become:

cv1 <-  get_comp_vec(record_1,record_2)
cv2 <- get_comp_vec(record_1,record_3)

cvs <- rbind(cv1, cv2)

cvs <-  mutate(cvs,
               town_dist = ifelse(town_dist >= .98, 2, ifelse(town_dist >= .8 , 1, 0)),
               yob_dist = ifelse(yob_dist <= 2, 2, ifelse(yob_dist <= 10 , 1, 0)),
               deathdate_dist = ifelse(deathdate_dist <= 2, 2, ifelse(deathdate_dist <= 10,1, 0)),
               sex_dist = ifelse(sex_dist == 0, 2, 0))


print("The comparison vector between the first records of the claims and crt data is:")
kable(cvs[1,])

print("The comparison vector between the first record of the claims and the second record of crt data is:")
kable(cvs[2,])

These comparison vectors are identical what fastLink uses under the hood. In the fastLink documentation and in this vignette, we will refer to the comparison vector of records $i$ and $j$ of the first and second data base as $\gamma_{i,j}$.

2. Finding probabilities:

Once we have the comparison vector for every pair, we can start estimating several probabilities:

  1. $P(M_{i,j})$: The probability of any two records $i$, $j$ being a true match(ie represent the same real world entity).

  2. $P(\gamma_{i,j} |M_{i,j})$: The probability that comparison vector $\gamma_{i,j}$ occuring given records $i$ and $j$ are a true match.

  3. $P(\gamma_{i,j} |\neg M_{i,j})$:The probability that comparison vector $\gamma_{i,j}$ occuring given records $i$ and $j$ are not a true match.

We want to estimate $P(M_{i,j})$, $P(\gamma_{i,j} |M_{i,j})$, and $P(\gamma_{i,j} |\neg M_{i,j})$, because using bayes rule we can then calculate the probability of $i$, $j$ being a match given the observed comparison vector $\gamma_{i,j}$.

Bayes Rule

$$ P(A|B) = \frac{P(B|A)P(A)}{P(B|A)P(A) + P(B|\neg A)P(\neg A)} $$

Applying Bayes Rule

$$ P( M_{i,j}|\gamma_{i,j}) = \frac{P( \gamma_{i,j}|M_{i,j})P(M_{i,j})}{P( \gamma_{i,j}|M_{i,j})P(M_{i,j}) + P( \gamma_{i,j}|\neg M_{i,j})P(\neg M_{i,j})} $$ To estimate these probabilities, fastLink uses an unsupervised learning technique called the expectation-maximization algorithm.

EM Algorithm

The EM Algorithm takes observed data(the comparison vectors) and latent data(the true matches) and assumes they are sampled from some probability distribution with parameters $\theta$. We can estimate the latent data by finding the parameters that maximize probability of the observed data, then just using the resulting distribution to find the probabilities of the latent data given the observed data.

The Algorithm finds these optimal parameters by:

  1. Making an educated guess for parameters
  2. Estimate the latent data given parameters
  3. Updating the parameters

EM Parameters

1.$\pi_{kml}$: The probabilities that for any given feild k in the comparison vector, the probability of that feild taking on the lth similarity level, given a match or a nonmatch.

2.$\lambda$ The probability that any two pairs are a match (ie just $P(M_{i,j})$)

Assuming that the parameters are conditionally independent given a match/nonmatch, we can the calculate $P(\gamma_{i,j} |M_{i,j})$ for comparison vector $\gamma_{i,j} = \langle a_1, a_2, ..., a_k \rangle$ as a product of parameters $P(\gamma_{i,j} |M_{i,j}) = \pi_{1,1,a_1}\pi_{2,1,a_2} \dots \pi_{k,1,a_k}$

In our example, we would calculate the probability of this comparison vector given a match $P(\gamma_{1,1} |M_{1,1})$

print("The comparison vector between the first records of the claims and crt data is:")
kable(cvs[1,])
vec <- cvs[1,]
pi_vec <- vector(mode = "character", length = length(vec))

for(i in 1:length(vec)){
  pi_vec[i] = paste0("pi_",i,"1",vec[i])
}
print(paste("P(gamma_{1,1} |M_{1,1})", pi_vec, collapse = " * "))

Using these parameters, we can calulate the probability $i,j$ are a match given the observed comparison vector: $$ P( M_{i,j}|\gamma_{i,j}) = \frac{(\pi_{1,1,a_1}\pi_{2,1,a_2} \dots \pi_{k,1,a_k})\lambda}{(\pi_{1,1,a_1}\pi_{2,1,a_2} \dots \pi_{k,1,a_k})\lambda + (\pi_{1,0,a_1}\pi_{2,0,a_2} \dots *\pi_{k,0,a_k})(1-\lambda)} $$ We can then update $\lambda$ and the $\pi_{kml}$ based on the $P( M_{i,j}|\gamma_{i,j})$.

Visualizing The EM Algorithm

We can get a better inutitive understanding of this algorithm using the folowing animation.

Figure 1: The Probabilities of Comparison Vectors over Iterations

Figure 1 is the probability that a comparison vector represents a match given the comparison vector $P( M_{i,j}|\gamma_{i,j})$ on the y axis, and a index representing every unique observed comparison vector $\gamma_{i,j}$ along the x axis. The probabilities start unstable, but eventually converge.

3. Assigning Weights & Finding Matches

Once the EM algorithm converges, we have our estimates for $P( M_{i,j}|\gamma_{i,j})$. We can then assign a probability threshold, and take as matches all pairs with $P( M_{i,j}|\gamma_{i,j})$ greater than this threshold.

The fastLink function

Lets look back now at the orgininal call to fastLink:

result <- fastLink(claims, dth_crt, 
                   varnames = c("town", "birthyear", "sex", "deathdate"),
                   numeric.match = c("birthyear", "deathdate"),
                   stringdist.match = c("town"),
                   partial.match = c("town", "deathdate"),
                   cut.a.num = 1, cut.p.num = 14,
                   cut.a = .94, cut.p = .88,
                   threshold.match = .85)

Arguments

-varnames: Column names of variables to match on. -numeric.match: Column names of variables to match numerically -stringdist.match: Column names of variables to match using string distance -partial.match: Columns to match with 3 similarity(as opposed to 2, which is the default).

-cut.a, cut.p: cutoffs for string distance matching -cut.a.num, cut.p.num: cutoffs for numeric matching

Cutoffs example

r1 <- tibble(town = "PORTLAND", birthyear = 1988, deathdate = 16500, sex = "M")

print("Record 1:")
kable(r1)
r2 <- tibble(town = "ORTLAND", birthyear = 1989, deathdate = 16501, sex = "M")
print("Record 2:")
kable(r2)
r3 <- tibble(town = "RTLAND", birthyear = 1990, deathdate = 16514, sex = "F")
print("Record 3:")
kable(r3)
apply_cutoffs <- function(x, cut.a, cut.p, cut.num.a, cut.num.p){
  res <- tibble(town_dist = ifelse(x$town_dist >= (cut.a), 2, ifelse(x$town_dist >= (cut.p),1,0)),
                yob_dist = ifelse(x$yob_dist <= (cut.num.a), 2, 0),
                deathdate_dist = ifelse(x$deathdate_dist <= cut.num.a, 2, ifelse(x$deathdate_dist <= cut.num.p,1,0)),
                sex_dist = ifelse( x$sex_dist == 0,2, 0)
                )
  return(res)
}

print("Comparison Vector between Record 1 and Record 2")
comp_vec1 <- get_comp_vec(r1,r2)
comp_vec1 <- apply_cutoffs(comp_vec1, cut.a = .94, cut.p = .88, cut.num.a = 1, cut.num.p = 14)
kable(comp_vec1)

print("Comparison Vector between Record 1 and Record 3")
comp_vec2 <- get_comp_vec(r1,r3)
comp_vec2 <- apply_cutoffs(comp_vec2, cut.a = .94, cut.p = .88, cut.num.a = 1, cut.num.p = 14)
kable(comp_vec2)

Return Values

FastLink returns a object of type "FastLink", with feilds:

1. matches

matches is a 2 column matrix of matched indices from dtA and dfB

matches <- result$matches
kable(matches[1:10,])
print(paste0("Number of detected matches: ",nrow(matches)))
c_table <- getConfusion(claims, dth_crt, matches, "id")
kable(c_table)
c_table_info <- getConfusionInfo(c_table)
kable(c_table_info)

2. posterior and patterns

-posterior: A vector of posterior probabilities associated with each match in matches -patterns: A matrix of comparison vectors associated with each match in matches

posterior <- result$posterior
patterns <- result$patterns
kable(posterior[1:10])
kable(patterns[1:10,])
matched_pairs <- cbind(claims[matches[,1],],dth_crt[matches[,2],],posterior)
matched_pairs <- bindMatches_p(claims[-1], dth_crt[-1], posterior, matches, idA = "claims.", idB = "dth_crt.")
kable(matched_pairs[1:10,])

2. EM object

-EM: A object of class fastLink.EM, information about EM algorithm. Notable contains: -patterns.w: List of unique comparison vectors with their count. -zeta.j: List of posterior probabilities associated with each comparison vector in patterns.w

EM <- result$EM
unique_patterns = cbind(as.data.frame(EM$patterns.w)[,-c(6,7,8)], EM$zeta.j)
unique_patterns <- unique_patterns[rev(order(EM$zeta.)),]
kable(unique_patterns)
 new_cutoff <- unique_patterns[4,6]

new_result <- fastLink(claims, dth_crt, 
                   varnames = c("town", "birthyear", "sex", "deathdate"),
                   numeric.match = c("birthyear", "deathdate"),
                   stringdist.match = c("town"),
                   partial.match = c("town", "deathdate"),
                   cut.a.num = 1, cut.p.num = 14,
                   cut.a = .94, cut.p = .88,
                   threshold.match = new_cutoff)

print(paste0("Number of detected matches: ",nrow(new_result$matches)))
new_c_table <- getConfusion(claims, dth_crt, new_result$matches, "id")
new_c_table_info <- getConfusionInfo(new_c_table)

Optional arguments to fastLink:

-dedupe.matches Enforces that every record in dfA must match with at most one record in dfB. Does this by choosing the best match for every document in A and B, then breaks ties arbitrarily. Is true by default, when false returns every match over matching threshold.

-return.all Sets the threshold for a match to 0. If dedupe.matches is also true, this will return the best matches for every record in the smaller dataframe. If dedupe matches is not true, it will return all pairs of records in dfA and dfB.

-cond.indep The fellegi-sunter model for record linkage assumes conditional independence of feilds in comparison vectors, but this is sometimes not the case. If cond.indep is false, then we use a modified version of the fellegi-sunter model.

Techniques for reducing computation time

1.Blocking:

Seperates data frame into mutliple data frames by values in a given feild

block_sex <- blockData(claims, dth_crt, varnames = "sex")
print(names(block_sex))
kable(claims[block_sex$block.2$dfA.inds[1:10],])
kable(claims[block_sex$block.1$dfA.inds[1:10],])

2.Sampling

Given appriopriate samples of dfA and dfB: sdfA, sdfB, we can run fastLink on those samples, then use the optimal parameters given by the EM algorithm

sample_claims <- claims[sample(1:nrow(claims), 400, replace = TRUE),]
sample_dth_crt <- dth_crt[sample(1:nrow(dth_crt), 400, replace = TRUE),]

sample_EM <- fastLink(sample_claims, sample_dth_crt, 
                   varnames = c("town", "birthyear", "sex", "deathdate"),
                   numeric.match = c("birthyear", "deathdate"),
                   stringdist.match = c("town"),
                   partial.match = c("town", "deathdate"),
                   cut.a.num = 1, cut.p.num = 14,
                   cut.a = .94, cut.p = .88,
                   estimate.only = TRUE)
result = fastLink(claims, dth_crt, 
                   varnames = c("town", "birthyear", "sex", "deathdate"),
                   numeric.match = c("birthyear", "deathdate"),
                   stringdist.match = c("town"),
                   partial.match = c("town", "deathdate"),
                   cut.a.num = 1, cut.p.num = 14,
                   cut.a = .94, cut.p = .88,
                   em.obj = sample_EM)
print(paste0("Number of detected matches: ",nrow(new_result$matches)))
new_c_table <- getConfusion(claims, dth_crt, new_result$matches, "id")
new_c_table_info <- getConfusionInfo(new_c_table)

Applications of FastLink

Here is some of the ways we applied fastLink to

Matching Patient Written Documents

Matching between information on documents filled out by pateints vs on documents filled out by doctors. There was a shared ID between both documents, so we were able to get a confusion matrix:

confusion_matrix = data.frame(True_match = c(15,10), True_Nonmatch = c(2,623))
rownames(confusion_matrix) <- c("Detected Link", "Detected Non-Link")
kable(confusion_matrix)
kable(getConfusionInfo(confusion_matrix))

Matching claims data to death certificate data

Matching between claims data and death certificate data for Maine. Out of 6337 claims records and 39873 death certificates, we were able to find 3775 matches.

Matching EMS data to CARES data

Matching between CARE registry and EMS data. We managed to get 1061 matches out of 2658 EMS records and 1336 CAREs records



Sam-Murray/RecordLinkUtil documentation built on Oct. 30, 2019, 11:48 p.m.