link_wf | R Documentation |
Deterministic and probabilistic record linkage. Assign unique identifiers to records based on partial, nested or calculated probabilities.
links_af_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
repeats_allowed = FALSE,
permutations_allowed = FALSE,
data_source = NULL,
ignore_same_source = TRUE,
display = "none"
)
links_wf_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
id_1 = NULL,
id_2 = NULL,
return_weights = FALSE,
...
)
prob_score_range(attribute, m_probability = 0.95, u_probability = NULL)
attribute |
|
blocking_attribute |
|
cmp_func |
|
attr_threshold |
|
probabilistic |
|
m_probability |
|
u_probability |
|
score_threshold |
|
repeats_allowed |
|
permutations_allowed |
|
data_source |
|
ignore_same_source |
|
display |
|
id_1 |
|
id_2 |
|
return_weights |
If |
... |
Arguments passed to links |
links_wf_probabilistic()
- A wrapper function of links
with a specific sub_criteria
to achieve probabilistic record linkage.
It excludes functionalities for the nested and multi-stage linkage.
links_wf_probabilistic()
requires a score_threshold
in advance.
To help with this, prob_score_range()
can be used to return the range of scores attainable for a given set of attribute
, m
and u
-probabilities.
Additionally, id_1
and id_2
can be used to link specific record pairs, aiding the review of potential scores.
links_af_probabilistic()
- A simpler version of links
.
It excludes functionalities for the batched, nested and multi-stage linkage.
links_af_probabilistic()
requires a score_threshold
in advance.
However, since it exports the match weights, the score_threshold
can be changed after the analysis.
pid
; list
Fellegi, I. P., & Sunter, A. B. (1969). A Theory for Record Linkage. Journal of the American Statistical Association, 64(328), 1183 - 1210. https://doi.org/10.1080/01621459.1969.10501049
Asher, J., Resnick, D., Brite, J., Brackbill, R., & Cone, J. (2020). An Introduction to Probabilistic Record Linkage with a Focus on Linkage Processing for WTC Registries. International Journal of Environmental Research and Public Health, 17(18), 6937. https://doi.org/10.3390/ijerph17186937
See vignette("links")
for more information.
links
# Load the example dataset used by the examples below
data(patient_records)

# Attributes for a weighted (probabilistic) comparison:
# forename, middlename and surname, supplied as a list
criteria_1 <- as.list(
  patient_records[c("forename", "middlename", "surname")]
)

# Range of scores attainable when the m-probability is 0.95
prob_scores <- prob_score_range(
  attribute = criteria_1,
  m_probability = 0.95,
  u_probability = NULL
)
## Not run:
# NOTE(review): `mid_scorce` is the element name used consistently in these
# examples; confirm against the list returned by prob_score_range() before
# renaming it.

# Probabilistic record linkage with 'links_af_probabilistic()'
pids_1a <- links_af_probabilistic(
  attribute = criteria_1,
  cmp_func = exact_match,
  attr_threshold = 1,
  probabilistic = TRUE,
  m_probability = 0.95,
  score_threshold = prob_scores$mid_scorce,
  display = "stats"
)

# Equivalent with 'links_wf_probabilistic()'
# (`display`, `recursive` and `check_duplicates` are supplied through `...`)
pids_1b <- links_wf_probabilistic(
  attribute = criteria_1,
  cmp_func = exact_match,
  attr_threshold = 1,
  probabilistic = TRUE,
  m_probability = 0.95,
  score_threshold = prob_scores$mid_scorce,
  display = "progress",
  recursive = TRUE,
  check_duplicates = TRUE
)

# Less thorough but faster equivalent with `links_wf_probabilistic()`
pids_1c <- links_wf_probabilistic(
  attribute = criteria_1,
  cmp_func = exact_match,
  attr_threshold = 1,
  probabilistic = TRUE,
  m_probability = 0.95,
  score_threshold = prob_scores$mid_scorce,
  display = "progress",
  recursive = FALSE,
  check_duplicates = FALSE
)

# Each implementation can lead to different results
summary(pids_1a$pid)
summary(pids_1b$pid)
summary(pids_1c$pid)
## End(Not run)
# Weighted (non-probabilistic) comparison of forename, middlename and
# age difference (derived from dateofbirth)
criteria_2 <- as.list(
  patient_records[c("forename", "middlename", "dateofbirth")]
)
# Comparison function: flags pairs whose values differ by at most 365 * 10.
#
# x, y: vectors compared element-wise after conversion with as.numeric()
#       (presumably dates, making the gap a number of days - confirm
#       against the caller).
# Returns a logical vector: TRUE when the absolute gap is a whole number
# between 0 and 3650 inclusive, FALSE otherwise (including missing gaps).
age_diff <- function(x, y) {
  gap <- abs(as.numeric(x) - as.numeric(y))
  !is.na(gap) & gap %in% 0:(365 * 10)
}
# Link within surname blocks; forename and middlename are compared with
# exact_match and dateofbirth with age_diff. `probabilistic = FALSE`
# selects the non-probabilistic comparison described above.
pids_2a <- links_af_probabilistic(
  attribute = criteria_2,
  blocking_attribute = patient_records$surname,
  cmp_func = c(exact_match, exact_match, age_diff),
  score_threshold = number_line(3, 5),
  probabilistic = FALSE,
  display = "stats"
)
# Larger weights can be assigned to particular attributes through `cmp_func`
# For example, a smaller age difference can contribute a higher score (e.g. 0 to 3)
# Comparison function returning a graded score instead of TRUE/FALSE.
#
# x, y: vectors whose element-wise difference `x - y` can be converted with
#       as.numeric() (presumably dates, making the gap a number of days -
#       confirm against the caller).
# Returns an integer vector the same length as the gap:
#   0      - gap is missing or not a whole number between 0 and 3650
#   1 to 3 - gap is in range; the in-range gaps are split into three
#            equal-width bins and the smallest gaps score 3, the largest 1.
# Note that the bins are derived from the gaps actually observed, so the
# score of a given gap depends on the other pairs being compared.
age_diff_2 <- function(x, y) {
  gap <- as.numeric(abs(x - y))
  in_range <- gap %in% 0:(365 * 10) & !is.na(gap)
  wgt <- integer(length(gap))
  # cut() fails on an empty vector, so only bin when at least one pair
  # falls inside the window; otherwise every score stays 0.
  if (any(in_range)) {
    width_bin <- as.numeric(cut(gap[in_range], 3))
    # Reverse the bin order so that bin 1 (smallest gaps) scores 3.
    wgt[in_range] <- match(width_bin, 3:1)
  }
  wgt
}
# Same linkage as pids_2a, but dateofbirth is scored with age_diff_2
pids_2b <- links_af_probabilistic(
  attribute = criteria_2,
  blocking_attribute = patient_records$surname,
  cmp_func = c(exact_match, exact_match, age_diff_2),
  score_threshold = number_line(3, 5),
  probabilistic = FALSE,
  display = "stats"
)

# Show the first 10 entries of the exported weights for each run
head(pids_2a$pid_weights, 10)
head(pids_2b$pid_weights, 10)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.