R/links_wf.R

Defines functions prob_score_range links_wf_probabilistic links_af_probabilistic

Documented in links_af_probabilistic links_wf_probabilistic prob_score_range

#' @name link_wf
#' @title Record linkage
#'
#' @description Deterministic and probabilistic record linkage
#' Assign unique identifiers to records based on partial, nested or calculated probabilities.
#'
#' @param attribute \code{[atomic|list|data.frame|matrix|d_attribute]}. Attributes to compare.
#' @param cmp_func \code{[list|function]}. String comparators for each \code{attribute}. See \code{Details}.
#' @param attr_threshold \code{[list|numeric|\link{number_line}]}. Weight-thresholds for each \code{cmp_func}. See \code{Details}.
#' @param probabilistic \code{[logical]}. If \code{TRUE}, scores are assigned base on Fellegi-Sunter model for probabilistic record linkage. See \code{Details}.
#' @param m_probability \code{[list|numeric]}. The probability that a matching records are the same entity.
#' @param u_probability \code{[list|numeric]}. The probability that a matching records are not the same entity.
#' @param score_threshold \code{[numeric|\link{number_line}]}. Score-threshold for linked records. See \code{Details}.
#' @param id_1 \code{[list|numeric]}. Record id or index of one half of a record-pair.
#' @param id_2 \code{[list|numeric]}. Record id or index of one half of a record-pair.
#' @param ... Arguments passed to \bold{\code{links}}
#' @param blocking_attribute \code{[atomic]}. Passed to \bold{\code{criteria}} in \bold{\code{\link{links}}}.
#' @param repeats_allowed \code{[logical]} Passed to \bold{\code{repeats_allowed}} in \bold{\code{\link{links}}}.
#' @param permutations_allowed \code{[logical]} Passed to \bold{\code{permutations_allowed}} in \bold{\code{\link{links}}}.
#' @param ignore_same_source \code{[logical]} Passed to \bold{\code{ignore_same_source}} in \bold{\code{\link{links}}}.
#' @param data_source \code{[character]}. Passed to \bold{\code{data_source}} in \bold{\code{\link{links}}}.
#' @param display \code{[character]}. Passed to \bold{\code{display}} in \bold{\code{\link{links}}}.
#' @param return_weights If \code{TRUE}, returns the match-weights and score-thresholds for record pairs.
#' @return \code{\link[=pid-class]{pid}}; \code{list}
#'
#' @seealso \code{\link{links}}
#'
#' @details
#' \bold{\code{links_wf_probabilistic()}} - A wrapper function of \code{\link{links}} with a
#' with a specific \code{\link{sub_criteria}} and to achieve to achieve probabilistic record linkage
#' It excludes functionalities for the nested and multi-stage linkage.
#'  \code{links_wf_probabilistic()} requires a \code{score_threshold} in advance.
#' To help with this, \code{prob_score_range()} can be used to return the range of scores attainable for a given set of \code{attribute}, \code{m} and \code{u}-probabilities.
#' Additionally, \code{id_1} and \code{id_2} can be used to link specific records pairs, aiding the review of potential scores.
#'
#'
#' \bold{\code{links_af_probabilistic()}} - A simpler version of \code{\link{links}}.
#' It excludes functionalities for the batched, nested and multi-stage linkage.
#' \code{links_af_probabilistic()} requires a \code{score_threshold} in advance,
#' however, since it exports the match weights, the \code{score_threshold}
#' can be changed after the analysis.
#'
#' @references
#' Fellegi, I. P., & Sunter, A. B. (1969). A Theory for Record Linkage. \emph{Journal of the Statistical Association}, 64(328), 1183 - 1210. https://doi.org/10.1080/01621459.1969.10501049
#'
#' Asher, J., Resnick, D., Brite, J., Brackbill, R., & Cone, J. (2020). An Introduction to Probabilistic Record Linkage with a Focus on Linkage Processing for WTC Registries. \emph{International journal of environmental research and public health}, 17(18), 6937. https://doi.org/10.3390/ijerph17186937.
#' See \code{vignette("links")} for more information.

#' @aliases links_wf
#'
#' @examples
#' data(patient_records)
#' # Weighted (probabilistic) comparison of forename, middlename and surname
#' criteria_1 <- as.list(patient_records[c("forename", "middlename", "surname")])
#'
#' # Possible scores when m-probability is 0.95
#' prob_scores <- prob_score_range(attribute = criteria_1,
#'                                 m_probability = 0.95,
#'                                 u_probability = NULL)
#' \dontrun{
#' # Probabilistic record linkage with 'links_af_probabilistic()'
#' pids_1a <- links_af_probabilistic(attribute = criteria_1,
#'                         cmp_func = exact_match,
#'                         attr_threshold = 1,
#'                         probabilistic = TRUE,
#'                         m_probability = 0.95,
#'                         score_threshold = prob_scores$mid_scorce,
#'                         display = "stats")
#'
#' # Equivalent with 'links_wf_probabilistic()'
#' pids_1b <- links_wf_probabilistic(attribute = criteria_1,
#'                                   cmp_func = exact_match,
#'                                   attr_threshold = 1,
#'                                   probabilistic = TRUE,
#'                                   m_probability = 0.95,
#'                                   score_threshold = prob_scores$mid_scorce,
#'                                   display = "progress",
#'                                   recursive = TRUE,
#'                                   check_duplicates = TRUE)
#'
#' # Less thorough but faster equivalent with `links_wf_probabilistic()`
#' pids_1c <- links_wf_probabilistic(attribute = criteria_1,
#'                                   cmp_func = exact_match,
#'                                   attr_threshold = 1,
#'                                   probabilistic = TRUE,
#'                                   m_probability = 0.95,
#'                                   score_threshold = prob_scores$mid_scorce,
#'                                   display = "progress",
#'                                   recursive = FALSE,
#'                                   check_duplicates = FALSE)
#'
#' # Each implementation can lead to different results
#' summary(pids_1a$pid)
#' summary(pids_1b$pid)
#' summary(pids_1c$pid)
#' }
#'
#' # Weighted (non-probabilistic) comparison of forename, middlename and age difference
#' criteria_2 <- as.list(patient_records[c("forename", "middlename", "dateofbirth")])
#' age_diff <- function(x, y){
#'   diff <- abs(as.numeric(x) - as.numeric(y))
#'   wgt <-  diff %in% 0:(365 * 10) & !is.na(diff)
#'   wgt
#' }
#'
#' pids_2a <- links_af_probabilistic(attribute = criteria_2,
#'                         blocking_attribute = patient_records$surname,
#'                         cmp_func = c(exact_match, exact_match, age_diff),
#'                         score_threshold = number_line(3, 5),
#'                         probabilistic = FALSE,
#'                         display = "stats")
#'
#' # Larger weights can be assigned to particular attributes through `cmp_func`
#' # For example, a smaller age difference can contribute a higher score (e.g 0 to 3)
#' age_diff_2 <- function(x, y){
#'   diff <- as.numeric(abs(x - y))
#'   wgt <-  diff %in% 0:(365 * 10) & !is.na(diff)
#'   wgt[wgt] <- match(as.numeric(cut(diff[wgt], 3)), 3:1)
#'   wgt
#' }
#' pids_2b <- links_af_probabilistic(attribute = criteria_2,
#'                         blocking_attribute = patient_records$surname,
#'                         cmp_func = c(exact_match, exact_match, age_diff_2),
#'                         score_threshold = number_line(3, 5),
#'                         probabilistic = FALSE,
#'                         display = "stats")
#'
#' head(pids_2a$pid_weights, 10)
#' head(pids_2b$pid_weights, 10)
#'
#' @rdname links_wf
#' @export
links_af_probabilistic <- function(attribute,
                                   blocking_attribute = NULL,
                                   cmp_func = diyar::exact_match,
                                   attr_threshold = 1,
                                   probabilistic = TRUE,
                                   m_probability = .95,
                                   u_probability = NULL,
                                   score_threshold = 1,
                                   repeats_allowed = FALSE,
                                   permutations_allowed = FALSE,
                                   data_source = NULL,
                                   ignore_same_source = TRUE,
                                   display = "none"){
  tm_a <- Sys.time()
  err <- err_links_wf_probablistic_0(attribute = attribute,
                                     blocking_attribute = blocking_attribute,
                                     cmp_func = cmp_func,
                                     attr_threshold = attr_threshold,
                                     probabilistic = probabilistic,
                                     m_probability = m_probability,
                                     score_threshold = score_threshold,
                                     id_1 = NULL, id_2 = NULL,
                                     u_probability = u_probability)
  if(!isFALSE(err)) stop(err, call. = FALSE)

  if(!display %in% c("none")){
    rp_data <- di_report(duration = tm_a, iteration = "Data validation",
                         current_tot = length(attrs(attribute)[[1]]))
    tm_ia <- Sys.time()
    report <- list(rp_data)
    if(display %in% c("stats_with_report", "stats")){
      cat(paste0(rp_data[[1]], ": ", fmt(rp_data[[2]], "difftime"), "\n"))
    }
  }

  if(inherits(attribute, c("list", "data.frame"))){
    attribute <- attrs(.obj = attribute)
  }else if(inherits(attribute, c("matrix"))){
    attribute <- attrs(.obj = as.data.frame(attribute))
  }else if(inherits(attribute, c("d_attribute"))){
  }else{
    attribute <- attrs(attribute)
  }

  if(is.null(names(attribute))){
    names(attribute) <- paste0("var_", seq_len(length(attribute)))
  }

  # Attribute names
  attr_nm <- names(attribute)
  rd_n <- length(attribute[[1]])

  lgk <- unlist(lapply(attribute, function(x){
    if(is.number_line(x)){
      length(unique(x)) == 1
    }else{
      length(x[!duplicated(x)]) == 1
    }
  }), use.names = FALSE)
  if(any(lgk)){
    warning(paste0("Attributes with identicial values in every record are ignored:\n",
                   paste0("i - `", attr_nm[lgk], "` was ignored!", collapse = "\n")), call. = FALSE)
  }
  if(all(lgk)){
    stop("Linkage stopped because all attributes were ignored.", call. = FALSE)
  }
  if(!is.null(blocking_attribute)){
    if(all(is.na(blocking_attribute))){
      stop("Linkage stopped because all records have a missing (`NA`) `strata`.", call. = FALSE)
    }
  }
  attribute <- attribute[!lgk]
  attr_nm <- names(attribute)
  probs_repo <- prep_prob_link_args(attribute = attribute,
                                    m_probability = m_probability,
                                    u_probability = u_probability)
  thresh_repo <- prep_cmps_thresh(attr_nm = attr_nm,
                                  cmp_func = cmp_func,
                                  attr_threshold = attr_threshold,
                                  score_threshold = score_threshold)

  probs_repo$m_probability$x <- lapply(probs_repo$m_probability$x, mk_lazy_opt)
  probs_repo$u_probability$x <- lapply(probs_repo$u_probability$x, mk_lazy_opt)

  if(!is.null(blocking_attribute)){
    blocking_attribute <- as.vector(blocking_attribute)
    strata <- match(blocking_attribute, blocking_attribute[!duplicated(blocking_attribute)])
    strata[is.na(blocking_attribute)] <- ((seq_len(rd_n) + max(strata))[is.na(blocking_attribute)])
  }else{
    strata <- blocking_attribute
  }

  # Create record-pairs
  if(isTRUE(ignore_same_source) & !is.null(data_source)){
    r_pairs <- make_pairs_wf_source(seq_len(rd_n),
                                    strata = strata,
                                    repeats_allowed = repeats_allowed,
                                    permutations_allowed = permutations_allowed,
                                    data_source = data_source)
  }else{
    r_pairs <- make_pairs(seq_len(rd_n),
                          strata = strata,
                          repeats_allowed = repeats_allowed,
                          permutations_allowed = permutations_allowed)
  }

  if(length(r_pairs$x_pos) == 0){
    pid_weights <- data.frame(sn_x = integer(0),
                              sn_y = integer(0))
    if(!is.null(data_source)){
      pid_weights$source_x <-
        pid_weights$source_y <- integer(0)
    }
    wts <- lapply(c(attr_nm, "weight"), function(x) numeric(0))
    names(wts) <- paste0("cmp.", c(attr_nm, "weight"))
    pid_weights <- c(pid_weights, wts)
    if(isTRUE(probabilistic)){
      wts <- lapply(c(attr_nm, "weight"), function(x) numeric(0))
      names(wts) <- paste0("prb.", c(attr_nm, "weight"))
      pid_weights <- c(pid_weights, wts)
    }
    pid_weights$record.match <- logical(0)
    pid_weights <- as.data.frame(pid_weights)
    pids <- list(pid = as.pid(seq_len(rd_n)),
                 pid_weights = pid_weights)
    rm(list = ls()[ls() != "pids"])
    return(pids)
  }
  x <- lapply(attribute, function(k) k[r_pairs$x_pos])
  y <- lapply(attribute, function(k) k[r_pairs$y_pos])
  rp_n <- length(x[[1]])

  if(!display %in% c("none")){
    rp_data <- di_report(duration = tm_ia, iteration = "Pairs created",
                         current_tot = length(x[[1]]))
    tm_ia <- Sys.time()
    report <- c(report, list(rp_data))
    if(display %in% c("stats_with_report", "stats")){
      cat(paste0(rp_data[[1]], ": ", fmt(rp_data[[2]], "difftime"), "\n"))
    }
  }

  pid_weights <- prob_link(x = attrs(.obj = list(attribute = x,
                                                 m_probability = lapply(probs_repo$m_probability$x, function(k) k[r_pairs$x_pos]),
                                                 u_probability = lapply(probs_repo$u_probability$x, function(k) k[r_pairs$x_pos]),
                                                 sn = list(seq_len(length(x[[1]]))[r_pairs$x_pos]))),
                           y = attrs(.obj = list(attribute = y,
                                                 m_probability = lapply(probs_repo$m_probability$x, function(k) k[r_pairs$y_pos]),
                                                 u_probability = lapply(probs_repo$u_probability$x, function(k) k[r_pairs$y_pos]),
                                                 sn = list(seq_len(length(x[[1]]))[r_pairs$y_pos]))),
                           attr_threshold = thresh_repo$attr_threshold,
                           score_threshold = thresh_repo$score_threshold,
                           return_weights = TRUE,
                           cmp_func = thresh_repo$cmp_func,
                           probabilistic = probabilistic)
  if(!display %in% c("none")){
    rp_data <- di_report(duration = tm_ia, iteration =  "Weights calculated",
                         current_tot = length(x[[1]]))
    tm_ia <- Sys.time()
    report <- c(report, list(rp_data))
    if(display %in% c("stats_with_report", "stats")){
      cat(paste0(rp_data[[1]], ": ", fmt(rp_data[[2]], "difftime"), "\n"))
    }
  }
  # Output
  if(!is.null(data_source)){
    pid_weights <- cbind(data.frame(r_pairs$x_pos, r_pairs$y_pos,
                                    data_source[r_pairs$x_pos], data_source[r_pairs$y_pos], stringsAsFactors = FALSE),
                         pid_weights)
    colnames(pid_weights)[1:4] <- c("sn_x","sn_y", "source_x", "source_y")
  }else{
    pid_weights <- cbind(data.frame(r_pairs$x_pos, r_pairs$y_pos, stringsAsFactors = FALSE), pid_weights)
    colnames(pid_weights)[1:2] <- c("sn_x","sn_y")
  }

  pids <- pid_weights[c("sn_x", "sn_y", "record.match")]
  pids <- pids[as.logical(pids$record.match),]

  link_id <- make_refs_V2(x_val = pids$sn_x, y_val = pids$sn_y, rd_n)
  group_id <- make_ids(pids$sn_x, pids$sn_y, rd_n)

  link_id <- link_id[,-1]
  tgt_pos <- which(is.na(link_id))
  link_id[tgt_pos] <- tgt_pos %% rd_n

  pids <- make_pids(
    y_pos = group_id$group_id,
    x_pos = group_id$sn,
    pid_cri = as.integer(group_id$linked),
    iteration = rep(1L, length(group_id$sn)),
    link_id = link_id,
    data_source = data_source)

  if(!display %in% c("none")){
    rp_data <- di_report(duration = tm_ia, iteration = "`pid` created",
                         current_tot = length(x[[1]]),
                         current_tagged = nrow(pid_weights[pid_weights$record.match,]))
    tm_ia <- Sys.time()
    report <- c(report, list(rp_data))
    if(display %in% c("stats_with_report", "stats")){
      cat(paste0(rp_data[[1]], ": ", fmt(rp_data[[2]], "difftime"), "\n"))
    }
  }

  pids <- list(pid = pids,
               pid_weights = pid_weights)

  tm_z <- Sys.time()
  tms <- difftime(tm_z, tm_a)
  tms <- fmt(tms, "difftime")

  if(display %in% c("none_with_report", "progress_with_report", "stats_with_report")){
    pids$report <- as.list(do.call("rbind", lapply(report, as.data.frame)))
    class(pids$report) <- "d_report"
  }

  if(!display %in% c("none", "none_with_report")) cat("Records linked in ", tms, "!\n", sep = "")
  rm(list = ls()[ls() != "pids"])
  return(pids)
}

#' @rdname links_wf
#' @export
links_wf_probabilistic <- function(attribute,
                                   blocking_attribute = NULL,
                                   cmp_func = diyar::exact_match,
                                   attr_threshold = 1,
                                   probabilistic = TRUE,
                                   m_probability = .95,
                                   u_probability = NULL,
                                   score_threshold = 1,
                                   id_1 = NULL, id_2 = NULL,
                                   return_weights = FALSE,
                                   ...){

  args <- list(...)
  err <- err_links_wf_probablistic_0(attribute = attribute,
                                     blocking_attribute = blocking_attribute,
                                     cmp_func = cmp_func,
                                     attr_threshold = attr_threshold,
                                     probabilistic = probabilistic,
                                     m_probability = m_probability,
                                     u_probability = u_probability,
                                     score_threshold = score_threshold,
                                     id_1 = id_1, id_2 = id_2)
  if(!isFALSE(err)) stop(err, call. = FALSE)

  if(inherits(attribute, c("list", "data.frame"))){
    attribute <- attrs(.obj = attribute)
  }else if(inherits(attribute, c("matrix"))){
    attribute <- attrs(.obj = as.data.frame(attribute))
  }else if(inherits(attribute,  c("d_attribute"))){
  }else{
    attribute <- attrs(attribute)
  }

  if(is.null(names(attribute))){
    names(attribute) <- paste0("var_", seq_len(length(attribute)))
  }

  # Attribute names
  attr_nm <- names(attribute)
  rd_n <- length(attribute[[1]])

  lgk <- unlist(lapply(attribute, function(x){
    if(is.number_line(x)){
      length(unique(x)) == 1
    }else{
      length(x[!duplicated(x)]) == 1
    }
  }), use.names = FALSE)
  if(any(lgk)){
    warning(paste0("Attributes with identicial values in every record are ignored:\n",
                   paste0("i - `", attr_nm[lgk], "` was ignored!", collapse = "\n")), call. = FALSE)
  }
  if(all(lgk)){
    stop("Linkage stopped since all attributes were ignored.", call. = FALSE)
  }
  attribute <- attribute[!lgk]
  attr_nm <- names(attribute)

  probs_repo <- prep_prob_link_args(attribute = attribute,
                                    m_probability = m_probability,
                                    u_probability = u_probability)
  thresh_repo <- prep_cmps_thresh(attr_nm = attr_nm,
                                  cmp_func = cmp_func,
                                  attr_threshold = attr_threshold,
                                  score_threshold = score_threshold)

  probs_repo$m_probability$x <- lapply(probs_repo$m_probability$x, function(x) if(length(x) == 1) rep(x, rd_n) else x)
  probs_repo$u_probability$x <- lapply(probs_repo$u_probability$x, function(x) if(length(x) == 1) rep(x, rd_n) else x)

  probs_repo$m_probability$x <- lapply(probs_repo$m_probability$x, mk_lazy_opt)
  probs_repo$u_probability$x <- lapply(probs_repo$u_probability$x, mk_lazy_opt)

  # Weight or probabilistic matching
  prob_link_wf <- function(x, y){
    wgt <- prob_link(x, y,
                     attr_threshold = thresh_repo$attr_threshold,
                     score_threshold = thresh_repo$score_threshold,
                     return_weights = return_weights,
                     probabilistic = probabilistic,
                     cmp_func = thresh_repo$cmp_func)
    if(isFALSE(return_weights)){
      return(as.logical(wgt))
    }else{
      return(list(as.logical(wgt[,which(colnames(wgt) == "record.match")]),
                  wgt))
    }

  }

  # Identify identical records to skip repeat checks
  same_rec_func <- function(x, y){
    attr_n <- length(x$attribute)
    lgk <- sapply(seq_len(attr_n), function(i){
      lgk <- x$attribute[[i]] == y$attribute[[i]] | (is.na(x$attribute[[i]]) & is.na(y$attribute[[i]]))
      lgk[is.na(lgk)] <- FALSE
      lgk
    })
    if(is.null(nrow(lgk))){
      sum(lgk) == attr_n
    }else{
      rowSums(lgk) == attr_n
    }
  }

  # Re-calculate weights for linked records
  if(any(names(args) == "sn")){
    sn <- args$sn
  }else{
    sn <- seq_len(rd_n)
  }
  x <- list(attribute = attribute,
            m_probability = probs_repo$m_probability$x,
            u_probability = probs_repo$u_probability$x,
            sn = list(sn))
  links_wf_sub_cri <- sub_criteria(attrs(.obj = x),
                                   match_funcs = prob_link_wf,
                                   equal_funcs = false)
  pids <- links(criteria = "place_holder",
                strata = blocking_attribute,
                sub_criteria = list("cr1" = links_wf_sub_cri),
                ...)

  # Output
  rm(list = ls()[ls() != "pids"])
  return(pids)
}


#' @rdname links_wf
#' @export
prob_score_range <- function(attribute,
                             m_probability = .95,
                             u_probability = NULL){
  if(inherits(attribute, c("list", "data.frame"))){
    attribute <- attrs(.obj = attribute)
  }else if(inherits(attribute, c("matrix"))){
    attribute <- attrs(.obj = as.data.frame(attribute))
  }else if(inherits(attribute, c("d_attribute"))){
  }else{
    attribute <- attrs(attribute)
  }
  if(is.null(names(attribute))){
    names(attribute) <- paste0("var_", seq_len(length(attribute)))
  }

  if(is.null(u_probability)){
    u_probability <- lapply(attribute, function(x){
      x_cd <- match(x, x[!duplicated(x)])
      x_cd[is.na(x)] <- NA_real_
      r <- rle(x_cd[order(x_cd)])
      n <- r$lengths[match(x_cd, r$values)]
      p <- n/length(x_cd)
      p[is.na(x_cd)] <- 0
      p
    })
  }

  lgk <- unlist(lapply(attribute, function(x) length(x[!duplicated(x)]) == 1), use.names = FALSE)
  if(any(lgk)){
    warning(paste0("Attributes with identicial values in every record are ignored:\n",
                   paste0("i - `", names(attribute)[lgk], "` was ignored!", collapse = "\n")), call. = FALSE)

    attribute <- attribute[!lgk]
    u_probability <- u_probability[!lgk]
  }

  if(!inherits(m_probability, "list")){
    m_probability <- list(m_probability)
  }
  if(length(m_probability) != 1 & any(lgk[lgk])){
    m_probability <- m_probability[!lgk]
  }
  if(length(m_probability) == 1 & length(attribute) > 1){
    m_probability <- rep(m_probability, length(attribute))
  }

  max_thresh <- sapply(seq_len(length(u_probability)), function(i){
    curr_uprob <- u_probability[[i]]
    # Exclude u-probability of '0'
    # These are from missing data and will never be an agreement
    curr_uprob[curr_uprob == 0] <- 1
    curr_mprob <- m_probability[[i]]
    log2((curr_mprob ^ 2) / (curr_uprob ^ 2))
  })
  if(is.null(nrow(max_thresh))){
    max_thresh <- max(sum(max_thresh))
  }else{
    max_thresh <- max(rowSums(max_thresh))
  }
  min_thresh <- sapply(seq_len(length(u_probability)), function(i){
    curr_uprob <- u_probability[[i]]
    curr_mprob <- m_probability[[i]]
    log2((1 - (curr_mprob ^ 2))/(1 - (curr_uprob ^ 2)))
  })
  if(is.null(nrow(min_thresh))){
    min_thresh <- min(sum(min_thresh))
  }else{
    min_thresh <- min(rowSums(min_thresh))
  }
  list(minimum_score = min_thresh,
       mid_scorce = (min_thresh + max_thresh)/2,
       maximum_score = max_thresh)
}
OlisaNsonwu/diyar documentation built on April 22, 2024, 6:27 p.m.