R/candidates.R

Defines functions candidates

Documented in candidates

#' Create candidate links from two datasets.
#' 
#' \code{candidates} merges two datasets based on a distance criterion. The resulting dataset can be used to predict links.
#' 
#' Blocking on multiple variables is currently not supported, but can be approximated by calling \code{candidates()} repeatedly and merging the results.
#' 
#' Because historical records often provide limited information, it is possible to block on string distances. Note that this can become quite slow when there is a large number of records in each dataset (say, tens of thousands). 
#' 
#' String distance blocking is done using Jaro-Winkler string distances, which de-emphasise differences at the end of the string. The distance ranges from 0 (perfect match) to 1 (complete mismatch). Set the maxdist (see Arguments) accordingly.
#' 
#' For \code{blocktype = "string distance"} a pair is a candidate when its distance is below \code{maxdist}. For the "numeric", "bigram distance" and "idf bigram distance" types a pair is a candidate when its similarity is above \code{1 - maxdist}. For "soundex" a pair is a candidate when both blocking values have the same (non-missing) soundex code; \code{maxdist} is not used.
#' 
#' Records in \code{dat_from} for which no candidate is found are kept in the result, with missing values in all \code{dat_to} columns. While these records can never be matched, they are left in to make comparisons of the dataset easier.
#' 
#' The input datasets are not modified.
#' 
#' @param dat_from The "from" dataset, should be a data.table
#' @param dat_to The "to" dataset, should be a data.table
#' @param blockvariable_from String giving the name of the blocking variable in the "from" data. Distance between this variable in both datasets determines whether a pair of records is a candidate. Defaults to "mlast", the male surname in the opgaafrollen data.
#' @param blockvariable_to String giving the name of the blocking variable in the "to" data. Distance between this variable in both datasets determines whether a pair of records is a candidate. Defaults to "mlast", the male surname in the opgaafrollen data.
#' @param idvariable_from String giving the identifier variable in dat_from.
#' @param idvariable_to String giving the identifier variable in dat_to.
#' @param blocktype Type of blocking: "bigram distance" (default), "string distance", "numeric", "idf bigram distance" or "soundex".
#' @param linktype Should there be no more than one record in each dataset that can be linked (one:one), or is it possible for multiple records in \code{dat_from} to be linked to \code{dat_to} (many:one)? Defaults to "one:one".
#' @param maxdist Maximum distance (0-1) to consider a record a candidate. Defaults to 0.15 for male surname string distance. If using numeric distance (for instance year of birth), very different values could be needed.
#' 
#' @return A dataset containing all candidate pairs, and all columns in dat_from and dat_to. Columns with the same name will get a suffix "_from" or "_to". Two columns are added: \code{linked_from}, the merge key holding the value of \code{idvariable_from}, and \code{score}, the distance ("string distance") or similarity (other types) of the pair. For "soundex" the score is always 1.
#' 
#' @examples
#' d1 = data.table::data.table(mlast = c("jong", "smid", "nauda"), persid = c(1:3))
#' d2 = data.table::data.table(mlast = c("jongh", "jong", "smit", "veld"), persid = c(1:4))
#' candidates(d1, d2)
#'  
#' @export
candidates = function(dat_from, dat_to, 
    blockvariable_from = "mlast", 
    blockvariable_to = "mlast", 
    idvariable_from = "persid", 
    idvariable_to = "persid",
    blocktype = c("bigram distance", 
                 "string distance", 
                 "numeric", 
                 "idf bigram distance", 
                 "soundex"),
    linktype = c("one:one", "many:one"),
    maxdist = 0.15){

    linktype = match.arg(linktype)
    blocktype = match.arg(blocktype)

    stopifnot(data.table::is.data.table(dat_from))
    stopifnot(data.table::is.data.table(dat_to))

    stopifnot(nrow(dat_from) > 0)
    stopifnot(nrow(dat_to) > 0)

    stopifnot(is.numeric(maxdist), length(maxdist) == 1, !is.na(maxdist))

    # columns are extracted with [[ below, which silently returns NULL for
    # a missing column, so check for their presence explicitly
    stopifnot(all(c(blockvariable_from, idvariable_from) %in% names(dat_from)))
    stopifnot(all(c(blockvariable_to, idvariable_to) %in% names(dat_to)))

    # [[ rather than dat[, get(var)]: inside a data.table, get(var) would
    # pick up a column that happens to be called `var` before the argument
    ids_from = dat_from[[idvariable_from]]
    block_from = dat_from[[blockvariable_from]]
    block_to = dat_to[[blockvariable_to]]

    if (linktype == "one:one" && (
            anyDuplicated(ids_from) > 0
         || anyDuplicated(dat_to[[idvariable_to]]) > 0)){
        warning("One to one matching, but idvariables not unique")
    }

    if (blocktype == "string distance" && (maxdist < 0 || maxdist > 1)){
        warning("Maxdist should be between 0 and 1.")
    }
    maxsim = 1 - maxdist # for similarity measures

    # For each row of a similarity matrix (one row per "from" record):
    # the column positions with similarity above maxsim, and those
    # similarities themselves.
    above_maxsim = function(simmat){
        hits = apply(simmat, 1, function(x){
            idx = which(x > maxsim)
            list(idx = idx, score = x[idx])
        }, simplify = FALSE)
        list(candidates = lapply(hits, `[[`, "idx"),
             scores = lapply(hits, `[[`, "score"))
    }

    if (blocktype == "string distance"){
        distmat = stringdist::stringdistmatrix(
            a = block_from,
            b = block_to,
            method = 'jw', p = 0.1)
        candidate_list = apply(distmat, 1, function(x) which(x < maxdist), simplify = FALSE)
        # pick the distances of the selected (row, column) pairs in one go
        pair_index = cbind(
            rep(seq_along(candidate_list), lengths(candidate_list)),
            unlist(candidate_list, use.names = FALSE))
        score_list = distmat[pair_index]
    }
    if (blocktype == "numeric"){
        simmat = 1 - outer(
            X = block_from,
            Y = block_to,
            FUN = capelinker::gk)
        selected = above_maxsim(simmat)
        candidate_list = selected$candidates
        score_list = selected$scores
    }
    if (blocktype == "bigram distance"){
        simmat = qlcMatrix::sim.strings(
            strings1 = block_from,
            strings2 = block_to,
            boundary = TRUE,
            left.boundary = "#", right.boundary = "#") # maybe no right boundary?

        # simmat gets simplified to vector if only one candidate, but breaks code later on
        if (nrow(dat_from) == 1){
            simmat = Matrix::Matrix(simmat, nrow = 1, sparse = TRUE)
        }

        selected = above_maxsim(simmat)
        candidate_list = selected$candidates
        score_list = selected$scores
    }
    if (blocktype == "idf bigram distance"){
        s1 = qlcMatrix::splitStrings(
            strings = block_from, 
            simplify = TRUE,
            boundary = TRUE,
            left.boundary = "#", right.boundary = "#")
        s2 = qlcMatrix::splitStrings(
            strings = block_to,
            simplify = TRUE,
            boundary = TRUE,
            left.boundary = "#", right.boundary = "#")
        m = qlcMatrix::jMatrix(rownames(s1), rownames(s2))
        simmat = qlcMatrix::cosSparse((m$M1 * 1) %*% s1, (m$M2 * 1) %*% s2, weight = qlcMatrix::idf)
        selected = above_maxsim(simmat)
        candidate_list = selected$candidates
        score_list = selected$scores
    }
    if (blocktype == "soundex"){
        # encode both sides once, rather than re-encoding dat_to for every
        # record in dat_from
        codes_from = stringdist::phonetic(block_from)
        codes_to = stringdist::phonetic(block_to)
        # a missing code never matches: which() drops NA comparisons on the
        # "to" side, and NA on the "from" side gets no candidates
        candidate_list = lapply(codes_from, function(code){
            if (is.na(code)) integer(0) else which(codes_to == code)
        })
        # soundex is an exact match on the code, so there is no graded
        # score; use 1 so that the score column exists for every blocktype
        score_list = lapply(candidate_list, function(idx) rep(1, length(idx)))
    }

    tomerge = dat_to[unlist(candidate_list, use.names = FALSE), ]
    data.table::set(tomerge, j = "score", 
        value = unlist(score_list, use.names = FALSE))

    # creating merge keys
    data.table::set(tomerge, j = "linked_to", 
        value = rep(ids_from, times = lengths(candidate_list)))
    # work on a copy: adding the key column by reference would otherwise
    # also add it to the caller's dat_from
    from_keyed = data.table::copy(dat_from)
    data.table::set(from_keyed, j = "linked_from", value = ids_from)

    # all = TRUE means some of idvariable_from idvariable_to can come back as
    # NA. Undesirable, surely? Can never be linked, can only cause mistakes...
    combined = merge(from_keyed, tomerge, 
        all = TRUE, 
        # is this correct? Why not cartesian for many-one?
        allow.cartesian = linktype == "one:one",
        by.x = 'linked_from', by.y = 'linked_to', 
        suffixes = c('_from', '_to'))

    return(combined) # maybe w/o linked_to, only confuses...
}

# make candidates
# requires named list of variables of interest for use down
# the road in calc scores, also to make sure there are no name
# clashes
# maybe this list of names is stupid, because we'll need to 
# enter the pairs in scoring function anyway

# add blocktype == exact?
# add multiple blocks?
# faster way than outer? can be used identically to stringdistmatrix though
# handle maxdist according to type

# more thoughts
# should accommodate one to many and many to one match
# so maybe function(dat_from, dat_to, ..., many = c("none", "to", "from", "both"))
# and then make persids and 
# warn for duplicates if none
# kill only to if from
# kill only from if to
# kill neither if both (would we ever want that)
rijpma/capelinker documentation built on Nov. 7, 2024, 3:06 a.m.