data_raw/pretrained.R

rm(list = ls())

library("randomForest")
library("xgboost")
library("data.table")
library("stringdist")

setwd("~/repos/capelinker")

# remotes::install_github("rijpma/capelinker", dependencies = FALSE)
library("capelinker")

## opgaafrollen
# -------------

# graaf reinet
# stellenbosch
# combined
# full and sparse

rein = readRDS("data_raw/candidates_miss.rds.gz")
stel = readRDS("data_raw/stel_candidates_miss.rds.gz")
both = rbindlist(list(rein, stel), fill = TRUE, idcol = "region")
regions = unique(both$region)
# one-hot encode the region id into region1, region2, ... dummy columns
both[, (paste0("region", regions)) := lapply(regions, function(x) as.numeric(region == x))]
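# Quick cross-tab to confirm the dummies line up with the region id
# (illustrative check, not part of the original run):
both[, .N, keyby = list(region, region1)]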

f = formula(correct ~ 
    mlastdist + mfirstdist + minitialsdist_osa + 
    mlastsdx + 
    mfirstsdx + 
    wlastdist + wfirstdist + winitialsdist_osa + 
    wlastsdx + 
    wfirstsdx + 
    namefreq_from + 
    spousenamedist_from + 
    namefreq_to + 
    spousenamedist_to + 
    wifepresent_from + 
    wifepresent_to + 
    wifeinboth + 
    settlerchildrengauss + 
    nextmfirst + 
    mfirst_uniqueness_to +
    mfirst_uniqueness_from +
    matches + 
    husb_wife_surnamedist + 
    region1)

f_sparse = update(f, . ~ . 
    - wfirstsdx - mfirstsdx - mlastsdx - wlastsdx 
    - wifeinboth - wifepresent_to - wifepresent_from
    - namefreq_to - namefreq_from
    - spousenamedist_to - spousenamedist_from)

set.seed(123871)
share_train = 0.7
both[, train := persid_from %in% sample(unique(persid_from), ceiling(length(unique(persid_from)) * share_train))]
trn_both = both[train == 1]
vld_both = both[train == 0]
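# Since the split is on persid_from, no person should appear in both sets;
# a quick assertion (illustrative, not part of the original run):
stopifnot(length(intersect(trn_both$persid_from, vld_both$persid_from)) == 0)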

m_boost_stel_rein = xgboost::xgb.train(
    data = capelinker::xgbm_ff(trn_both, f),
    nrounds = 500,
    watchlist = list(train = xgbm_ff(trn_both, f), eval = xgbm_ff(vld_both, f)),
    params = list(
        max_depth = 6,        # default 6
        min_child_weight = 1, # default 1
        gamma = 1,            # default 0
        eta = 0.3,            # default 0.3
        subsample = 0.8,        # default 1
        colsample_bytree = 0.5, # default 1
        objective = "binary:logistic"
))
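# With the watchlist above, one could also let xgboost pick the number of
# rounds via early stopping instead of a fixed nrounds; a sketch, with the
# patience of 25 rounds being an assumption:
# m_boost_stel_rein = xgboost::xgb.train(
#     data = capelinker::xgbm_ff(trn_both, f),
#     nrounds = 500,
#     watchlist = list(train = xgbm_ff(trn_both, f), eval = xgbm_ff(vld_both, f)),
#     early_stopping_rounds = 25,
#     params = list(objective = "binary:logistic"))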

m_boost_stel_rein_sparse = xgboost::xgb.train(
    data = capelinker::xgbm_ff(trn_both, f_sparse),
    nrounds = 1000,
    params = list(
        max_depth = 6,        # default 6
        min_child_weight = 1, # default 1
        gamma = 1,            # default 0
        eta = 0.3,            # default 0.3
        subsample = 0.8,        # default 1
        colsample_bytree = 0.5, # default 1
        objective = "binary:logistic"
))

predictions = data.table(
    wife = vld_both$wife * 2, # undo the earlier normalisation
    correct = vld_both$correct,
    pred_bos = predict(m_boost_stel_rein, newdata = xgbm_ff(vld_both, f)))
Metrics::precision(predictions$correct, predictions$pred_bos > 0.5)
Metrics::recall(predictions$correct, predictions$pred_bos > 0.5)

table(predictions$correct, predictions$pred_bos > 0.5)
table(trn_both$correct, predict(m_boost_stel_rein, newdata = xgbm_ff(trn_both, f)) > 0.5)

predictions[, Metrics::precision(correct, pred_bos > 0.5), by = wife]
predictions[, Metrics::recall(correct, pred_bos > 0.5), by = wife]
predictions[, Metrics::fbeta_score(correct, pred_bos > 0.5), by = wife]

## baptism and marriage records
# -----------------------------
manual_baptisms = readRDS("data_raw/baptisms.rds.gz")
manual_marriages = readRDS("data_raw/marriages.rds.gz")

cnd = capelinker::candidates(
    dat_from = manual_baptisms,
    dat_to =  manual_marriages,
    idvariable_from = "baptid",
    idvariable_to = "marid",
    linktype = "many:one",
    blocktype = "string distance",
    blockvariable_from = "mlast",
    blockvariable_to = "mlast",
    maxdist = 0.15)
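# Rough feel for the maxdist = 0.15 blocking threshold, assuming the same
# Jaro-Winkler metric used for the distance features below (illustrative):
stringdist::stringdist("cloete", "cloeten", method = "jw") # ~0.03, blocked together
stringdist::stringdist("cloete", "botha", method = "jw")   # ~0.42, excluded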

# cnd[is.na(marid)]
# NAs present because merge(..., all = TRUE), meaning NA when no plausible
# matches were found

# A problem that comes up here is that because we are doing a many-to-
# one match, we get more manual matches in the candidates dataset than
# we'd expect:

cnd[linkid_from == linkid_to][order(linkid_from), list(mlast_from, mlast_to, linkid_from, linkid_to, baptid, marid)]
# note the number of rows when subsetting to matches should be:
cnd[linkid_from == linkid_to, max(baptid)]

# Some sort of self-join used to happen here, so we use the `baptid` made
# earlier to filter out duplicates. Apparently it is no longer happening;
# presumably that's good?

cnd[, 
    duplicate := (linkid_from == linkid_to) & duplicated(baptid), 
    by = linkid_to]
cnd = cnd[duplicate == FALSE]
nrow(cnd[linkid_from == linkid_to])
# ? why is the self-join no longer happening?

# Next we create a variable indicating whether a candidate is a match in the
# manual data. We also get rid of the cases where either `linkid_from` or
# `linkid_to` is missing (need to check why that is).

cnd[, correct := linkid_from == linkid_to]
cnd[, .N, by = correct]
# merge in make_candidates has all = TRUE, giving NAs in marid and baptid
# fix that
cnd = cnd[!is.na(correct)]


# train
cnd = distcalc(cnd, 
    character_variables = c("mlast", "mfirst", "wfirst", "minitials", "winitials", "mprof"),
    numeric_variables = "year")

# We are now finally ready to train a model on the manual links. We split the
# data into a training set and a test set, then use the random forest
# classifier that proved effective on the opgaafrollen. It might be worth
# testing a few more models.
# make this per-marid (see the sketch after the split)
set.seed(225) # for reproducibility
cnd[, train := runif(.N) > 0.5]
train = cnd[train == TRUE]
test = cnd[train == FALSE]
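# A per-marid split, as the note above suggests, could look like this
# (sketch only; the models below were trained on the per-row split):
# marids = cnd[, unique(marid)]
# trn_marids = sample(marids, ceiling(0.5 * length(marids)))
# cnd[, train := marid %in% trn_marids]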

# check for incomplete cases among the candidate model variables
x = train[, 
        grep("correct|dist|sdx|yeardist", names(train)), 
        with = FALSE]
x[!complete.cases(x)]
m_rf_baptisms_full = randomForest(
    as.factor(correct) ~ ., 
    data = train[, 
        grep("correct|dist|sdx|yeardist", names(train)), 
        with = FALSE],
    na.action = "na.exclude")

# no soundex, no professions
m_rf_baptisms_sparse = randomForest(
    as.factor(correct) ~ .,
    data = train[,
        grep("correct|mlastdist|mfirstdist|wfirstdist|yeardist", names(train)),
        with = FALSE],
    na.action = "na.exclude")
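# Optional look at what drives the forests (standard randomForest API):
randomForest::importance(m_rf_baptisms_full)
# randomForest::varImpPlot(m_rf_baptisms_full)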

## couples in saf
# ---------------
jlabels = readxl::read_xlsx("~/data/cape/saf/Jeanne matches_full.xlsx")
setDT(jlabels)
alabels = readxl::read_xlsx("~/data/cape/saf/saf_spousallinks_labels_a_done.xlsx")
setDT(alabels)
alabels = alabels[match > 0, list(couple_id_from, couple_id_to)]

labelled = rbindlist(list(alabels, jlabels[, -"match"]))
labelled = unique(labelled)

saf_cnd = fread("~/data/cape/saf/saf_spousallinks_labels.csv", na.strings = "")
saf_cnd = saf_cnd[!is.na(couple_id_to)]

saf_cnd[paste(couple_id_from, couple_id_to) %in% paste(labelled$couple_id_from, labelled$couple_id_to), match := 1]
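# A keyed data.table update-join would be equivalent and less fragile than
# pasted keys (sketch, not used here):
# saf_cnd[labelled, on = c("couple_id_from", "couple_id_to"), match := 1]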

# remove a duplicate candidate pair
saf_cnd = saf_cnd[!(couple_id_from == "snymana1b7c1d2e7_mar_1" & couple_id_to == "snymana1b7c1d7e4_mar_1")]
# NB: labelled now contains more matches than saf_cnd, but that is OK

setnames(saf_cnd, "match", "correct")

saf_cnd[, mfirstdist := stringdist::stringdist(firstnames_ego_husb, firstnames_spouse_husb, method = "jw")]
saf_cnd[, mlastdist := stringdist::stringdist(surname_ego_husb, surname_spouse_husb, method = "jw")]

saf_cnd[, wfirstdist := stringdist::stringdist(firstnames_ego_wife, firstnames_spouse_wife, method = "jw")]
saf_cnd[, wlastdist := stringdist::stringdist(surname_ego_wife, surname_spouse_wife, method = "jw")]

saf_cnd[, minitials_ego_husb := initials(firstnames_ego_husb)]
saf_cnd[, minitials_spouse_husb := initials(firstnames_spouse_husb)]
saf_cnd[, minitialsdist_osa := 1 - stringdist::stringsim(minitials_ego_husb, minitials_spouse_husb, method = "osa")]

saf_cnd[, winitials_ego_wife := initials(firstnames_ego_wife)]
saf_cnd[, winitials_spouse_wife := initials(firstnames_spouse_wife)]
saf_cnd[, winitialsdist_osa := 1 - stringdist::stringsim(winitials_ego_wife, winitials_spouse_wife, method = "osa")]

saf_cnd[, implied_marriage_age_wife := as.numeric(married_year_ego_wife) - as.numeric(startyear_ego_wife)]
saf_cnd[, implied_marriage_age_husb := as.numeric(married_year_ego_husb) - as.numeric(startyear_ego_husb)]

saf_cnd[, spousal_age_gap := startyear_ego_husb - startyear_ego_wife]
saf_cnd[, maryear_initialsdist_osa := 1 - stringdist::stringsim(as.character(married_year_ego_husb), as.character(married_year_ego_wife), method = "osa")]

f_saf = formula(correct ~ 
    # maryear_initialsdist_osa +  
    mlastdist + mfirstdist + minitialsdist_osa + 
    # mlastsdx + 
    # mfirstsdx + 
    wlastdist + wfirstdist + winitialsdist_osa +
    # wlastsdx + 
    # wfirstsdx + 
    # namefreq_from + 
    # spousenamedist_from + 
    # namefreq_to + 
    # spousenamedist_to + 
    # wifepresent_from + 
    # wifepresent_to + 
    # wifeinboth + 
    # settlerchildrengauss + 
    # nextmfirst + 
    # mfirst_uniqueness_to +
    # mfirst_uniqueness_from +
    # matches + 
    # husb_wife_surnamedist + 
    # region1
    implied_marriage_age_wife + 
    implied_marriage_age_husb
    # spousal_age_gap +
    # myeardiff
)

set.seed(987)
share_train = 0.7
saf_cnd[, train := couple_id_from %in% sample(unique(couple_id_from), ceiling(length(unique(couple_id_from)) * share_train))]
trn_saf = saf_cnd[train == 1]
vld_saf = saf_cnd[train == 0]

trn_saf[, uniqueN(couple_id_from)]
vld_saf[, uniqueN(couple_id_from)]
trn_saf[, uniqueN(couple_id_from)] + vld_saf[, uniqueN(couple_id_from)]
trn_saf[, sum(correct)] + vld_saf[, sum(correct)]

# some overfitting going on here
m_boost_saf = xgboost::xgb.train(
    data = capelinker::xgbm_ff(trn_saf, f_saf),
    nrounds = 1000,
    # watchlist = list(train = xgbm_ff(trn_saf, f_saf), eval = xgbm_ff(vld_saf, f_saf)),
    params = list(
        max_depth = 6,        # default 6
        min_child_weight = 1, # default 1, larger is more conservative
        gamma = 1,            # default 0, larger is more conservative
        eta = 0.3,            # default 0.3, lower for less overfitting
        max_delta_step = 0,   # default 0, useful for unbalanced data, higher is more conservative
        subsample = 0.8,        # default 1 lower is less overfitting
        colsample_bytree = 0.5, # default 1 
        objective = "binary:logistic"
))
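# One way to gauge the overfitting noted above: cross-validate the number of
# boosting rounds (sketch with the same params; nfold and patience are
# assumptions):
# cv = xgboost::xgb.cv(
#     data = capelinker::xgbm_ff(trn_saf, f_saf),
#     nrounds = 1000, nfold = 5,
#     early_stopping_rounds = 25,
#     params = list(max_depth = 6, eta = 0.3, gamma = 1,
#         subsample = 0.8, colsample_bytree = 0.5,
#         objective = "binary:logistic"))
# cv$best_iteration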

impmat = xgboost::xgb.importance(model = m_boost_saf)

predictions = data.table(
    correct = as.logical(vld_saf$correct),
    pred_bos = predict(m_boost_saf, newdata = xgbm_ff(vld_saf, f_saf)))
conf = table(actual = predictions$correct, predicted = predictions$pred_bos > 0.5)
writeLines(conf2tex(conf), con = "~/repos/saf/out/within_saf_confmat.tex")

saf_validation_metrics = list(
    confusion = predictions[, .N, by = list(actual = correct, predicted = pred_bos > 0.5)],
    precision = Metrics::precision(predictions$correct, predictions$pred_bos > 0.5),
    recall = Metrics::recall(predictions$correct, predictions$pred_bos > 0.5),
    fbeta_score = Metrics::fbeta_score(predictions$correct, predictions$pred_bos > 0.5),
    date = Sys.time(),
    features = m_boost_saf$feature_names
)
out = capture.output(saf_validation_metrics)
writeLines(out, 
    paste0("~/repos/saf/saf_validation_metrics-", format(Sys.time(), "%Y%m%d-%H%M%S"), ".txt"))

steps = seq(0.01, 1, 0.01)
toplot = predictions[, list(
        prec = sapply(steps, function(x) Metrics::precision(correct, predicted = pred_bos > x)),
        rec = sapply(steps, function(x) Metrics::recall(correct, predicted = pred_bos > x)),
        fbeta = sapply(steps, function(x) Metrics::fbeta_score(correct, predicted = pred_bos > x)),
        tr = steps)]
pdf("~/repos/saf/precrec.pdf")
plot(rec ~ prec, data = toplot, type = 'b', pch = 19,
    xlab = "precision", ylab = "recall")
points(rec ~ prec, data = toplot[tr == 0.5], type = 'b', col = 2, pch = 19)
dev.off()
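# Threshold that maximises the F-score on this validation grid (illustrative):
toplot[which.max(fbeta)]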

library(ggplot2)
ggplot(toplot, aes(prec, rec)) + 
    geom_line() + 
    geom_point(data = toplot[tr == 0.5], col = "black")


## saf to opgaafrol
# -----------------

# training data
alabels = readxl::read_xlsx("~/data/cape/saf/saf2opg_candidates_batch2_auke.xlsx")
jlabels = readxl::read_xlsx("~/data/cape/saf/saf2opg_candidates_batch1_jeanne.xlsx")

setDT(alabels)
setDT(jlabels)

# empty is no link
alabels[is.na(link), link := 0]
jlabels[is.na(link), link := 0]
jlabels[link == 2, link := 1] # typo

# check overlap
first50a = alabels[, unique(couple_id)[1:50]]
first50j = jlabels[, unique(couple_id)[1:50]]
all.equal(first50j, first50a)

# check intercoder reliability
intercoder = merge(
    alabels[couple_id %in% first50j], 
    jlabels[couple_id %in% first50j, list(couple_id, persid, link_j = link, comment_j = comment)], 
    by = c("couple_id", "persid"))
# w/o "based on opg" because j never linked those
writexl::write_xlsx(
    intercoder[link != link_j & (comment != "based on opg" | is.na(comment))],
    "~/data/cape/saf/saf2opg_intercoder.xlsx")

alabels[comment == "based on opg", link := 0] # for now don't use these (singles inferred from opg series)

saf2opg = rbindlist(
    list(alabels,
        jlabels[!couple_id %in% first50j]), # avoid doubles
    fill = TRUE)

# linking 248 in labelled, ~50%
saf2opg[, uniqueN(couple_id[link == 1])]
saf2opg[, uniqueN(couple_id[link == 1]) / uniqueN(couple_id)]

# link about 2000 opg observations
saf2opg[, uniqueN(persid[link == 1])]
saf2opg[, uniqueN(idx[link == 1])]
# can't say yet if that's a lot

# for compatibility with other pretrained models
setnames(saf2opg, "link", "correct")

# labelling blocks
saf2opg[, uniqueN(couple_id)]

saf2opg[, mfirstdist := stringdist::stringdist(firstnames, mfirst, method = "jw")]
saf2opg[, mlastdist := stringdist::stringdist(surname, mlast, method = "jw")]

saf2opg[, wfirstdist := stringdist::stringdist(spouse_firstnames_clean, wfirst, method = "jw")]
saf2opg[, wlastdist := stringdist::stringdist(spouse_surname_clean, wlast, method = "jw")]

# maybe we had this at an earlier stage?
# later merge back into original cleaned files
saf2opg[, minitials_saf := initials(firstnames)]
saf2opg[, minitials_opg := initials(mfirst)] # def. here
saf2opg[, minitialsdist_osa := 1 - stringdist::stringsim(minitials_saf, minitials_opg, method = "osa")]

saf2opg[, winitials_saf := initials(spouse_firstnames_clean)]
saf2opg[, winitials_opg := initials(wfirst)]
saf2opg[, winitialsdist_osa := 1 - stringdist::stringsim(winitials_saf, winitials_opg, method = "osa")]

# add next year dist
# ...

# add surnamedist
saf2opg[, cross_surnamedist := stringdist::stringdist(firstnames, wlast, method = "jw")]

# add firstnames count

# add "name is initials"
saf2opg[, wfirst_is_initials := as.numeric(len_longest_word(wfirst) == 1)]
saf2opg[, mfirst_is_initials := as.numeric(len_longest_word(mfirst) == 1)]

saf2opg[, years_married := year - married_year]
# saf_cnd[, maryear_initialsdist_osa := 1 - stringdist::stringsim(as.character(married_year_ego_husb), as.character(married_year_ego_wife), method = "osa")]
# x[, implied_age_gk := gk(year, sy)]

f_saf2opg = formula(correct ~ 
    # maryear_initialsdist_osa +  
    mlastdist + mfirstdist + minitialsdist_osa + 
    # mlastsdx + 
    # mfirstsdx + 
    wlastdist + wfirstdist + winitialsdist_osa + 
    cross_surnamedist + 
    wfirst_is_initials + 
    # mfirst_is_initials + 
    years_married + 
    implied_age
    # wlastsdx + 
    # wfirstsdx + 
    # namefreq_from + 
    # spousenamedist_from + 
    # namefreq_to + 
    # spousenamedist_to + 
    # wifepresent_from + 
    # wifepresent_to + 
    # wifeinboth + 
    # settlerchildrengauss + 
    # nextmfirst + 
    # mfirst_uniqueness_to +
    # mfirst_uniqueness_from +
    # matches + 
    # husb_wife_surnamedist + 
    # region1
    # implied_marriage_age_wife + 
    # implied_marriage_age_husb +
    # spousal_age_gap +
    # myeardiff
)

set.seed(123654)
share_train = 0.7
saf2opg[, train := couple_id %in% sample(unique(couple_id), ceiling(length(unique(couple_id)) * share_train))]
trn_saf2opg = saf2opg[train == 1]
vld_saf2opg = saf2opg[train == 0]

trn_saf2opg[, uniqueN(couple_id)]
vld_saf2opg[, uniqueN(couple_id)]
trn_saf2opg[, uniqueN(couple_id)] + vld_saf2opg[, uniqueN(couple_id)]
trn_saf2opg[, sum(correct)] + vld_saf2opg[, sum(correct)]

# no link at all
trn_saf2opg[, all(correct == 0), by = couple_id][, sum(V1)] + 
    vld_saf2opg[, all(correct == 0), by = couple_id][, sum(V1)]


# slight overfitting going on here
m_saf2opg = xgboost::xgb.train(
    data = capelinker::xgbm_ff(trn_saf2opg, f_saf2opg),
    nrounds = 500,
    watchlist = list(train = xgbm_ff(trn_saf2opg, f_saf2opg), eval = xgbm_ff(vld_saf2opg, f_saf2opg)),
    params = list(
        max_depth = 6,        # default 6
        min_child_weight = 1, # default 1, larger is more conservative
        gamma = 1,            # default 0, larger is more conservative
        eta = 0.3,            # default 0.3, lower for less overfitting
        max_delta_step = 0,   # default 0, useful for unbalanced data, higher is more conservative
        subsample = 0.8,        # default 1 lower is less overfitting
        colsample_bytree = 0.5, # default 1 
        objective = "binary:logistic"
))
predictions = data.table(
    correct = as.logical(vld_saf2opg$correct),
    predicted = predict(m_saf2opg, newdata = xgbm_ff(vld_saf2opg, f_saf2opg)))
conf = table(actual = predictions$correct, predicted = predictions$predicted > 0.5)
predictions[, Metrics::precision(correct, predicted > 0.5)]
predictions[, Metrics::recall(correct, predicted > 0.5)]
predictions[, Metrics::fbeta_score(correct, predicted > 0.5)]

writeLines(conf2tex(conf), con = "~/repos/saf2opg/out/between_saf2opg_confmat.tex")

saf2opg_validation_metrics = list(
    confusion = predictions[, .N, by = list(actual = correct, predicted = predicted > 0.5)],
    precision = Metrics::precision(predictions$correct, predictions$predicted > 0.5),
    recall = Metrics::recall(predictions$correct, predictions$predicted > 0.5),
    fbeta_score = Metrics::fbeta_score(predictions$correct, predictions$predicted > 0.5),
    date = Sys.time(),
    features = m_saf2opg$feature_names
)
writeLines(capture.output(saf2opg_validation_metrics), 
    paste0("~/repos/saf2opg/saf2opg_validation_metrics-", format(Sys.time(), "%Y%m%d-%H%M%S"), ".txt"))
knitr::kable(cbind(saf2opg_validation_metrics$features[-1], ""))

pretrained_models = list(
    m_boost_stel_rein = list(
        model = m_boost_stel_rein,
        variables = all.vars(f)[-1]),
    m_boost_stel_rein_sparse = list(
        model = m_boost_stel_rein_sparse,
        variables = all.vars(f_sparse)[-1]),
    m_rf_baptisms_sparse = list(
        model = m_rf_baptisms_sparse,
        variables = all.vars(formula(m_rf_baptisms_sparse))[-1]),
    m_rf_baptisms_full = list(
        model = m_rf_baptisms_full,
        variables = all.vars(formula(m_rf_baptisms_full))[-1]),
    m_boost_saf = list(
        model = m_boost_saf,
        variables = all.vars(f_saf)[-1]),
    m_boost_saf2opg = list(
        model = m_saf2opg,
        variables = all.vars(f_saf2opg)[-1])
)
lapply(pretrained_models, `[[`, "variables")
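# Downstream usage sketch (`new_candidates` is a hypothetical candidates
# table with the listed variables):
# load("data/pretrained_models.rda")
# m = pretrained_models$m_boost_saf
# ff = reformulate(m$variables, response = "correct")
# predict(m$model, newdata = xgbm_ff(new_candidates, ff))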

save(pretrained_models, 
    file = "data/pretrained_models.rda", 
    version = 2)