# RecordLinkUtil: Tools for Testing Record Linkage Software
#
# Functions documented in this file: getConfusion, getConfusionInfo

#' Confusion matrix for a record linkage result
#'
#' Compares the record pairs selected by a linkage rule against the ground
#' truth implied by a shared identifier column. Two records are called a
#' \emph{match} when they carry the same id (they represent the same entity),
#' and \emph{linked} when their row indices appear together in \code{inds}.
#' Every one of the \code{nrow(dfA) * nrow(dfB)} possible pairs is counted in
#' exactly one cell of the returned table.
#'
#' Ids are compared the way \code{match()} compares them, so a missing id in
#' \code{dfA} is treated as equal to a missing id in \code{dfB}, both when
#' counting all true matches and when checking the linked pairs.
#'
#' @usage getConfusion(dfA, dfB, inds, idColA, idColB = NULL)
#'
#' @param dfA First data frame being linked
#' @param dfB Second data frame being linked
#' @param inds A nx2 index pair matrix indicating the detected matches: column
#'   1 holds row indices into dfA, column 2 holds row indices into dfB. A
#'   single pair given as a length-2 vector is accepted as well.
#' @param idColA The column name in dfA of the shared id across dfA and dfB
#' @param idColB The column name in dfB of the shared id across dfA and dfB.
#'   If NULL, it assumes the column in B has the same name as the column in A
#'
#' @return \code{getConfusion} returns a 2x2 data frame of pair counts. Rows
#'   are \code{"Detected Link"} and \code{"Detected Non-Link"}; columns are
#'   \code{True_match} and \code{True_Nonmatch}. Counts are stored as doubles
#'   so that large inputs do not overflow the integer range.
#'
#' @author Sam Murray<slmurray@andrew.cmu.edu>
#' @export
#' @import tidyverse
getConfusion <- function(dfA, dfB, inds, idColA, idColB = NULL) {
  if (is.null(idColB)) {
    idColB <- idColA
  }
  if (!idColA %in% names(dfA)) {
    stop("Column '", idColA, "' not found in dfA", call. = FALSE)
  }
  if (!idColB %in% names(dfB)) {
    stop("Column '", idColB, "' not found in dfB", call. = FALSE)
  }

  # Subsetting one row out of a pair matrix (inds[k, ]) collapses it to a
  # length-2 vector; restore the 1x2 shape so the column indexing below works.
  if (is.null(dim(inds)) && length(inds) == 2L) {
    inds <- matrix(inds, nrow = 1L)
  }

  ids_a <- dfA[[idColA]]
  ids_b <- dfB[[idColB]]

  # Number of matches: for every record in dfA, add up how many records in dfB
  # share its id. This equals the row count of an inner join on the id columns
  # without building the joined table. match() pairs NA with NA.
  distinct_b <- unique(ids_b)
  b_counts <- tabulate(match(ids_b, distinct_b), nbins = length(distinct_b))
  partners_per_a <- b_counts[match(ids_a, distinct_b)]
  N_true_matches <- sum(as.numeric(partners_per_a), na.rm = TRUE)

  # Number of nonmatch pairs. The product is taken in double precision because
  # integer multiplication overflows to NA above .Machine$integer.max pairs.
  N_true_nonmatches <- as.numeric(nrow(dfA)) * nrow(dfB) - N_true_matches

  # True positives: linked pairs whose ids agree. A plain `==` would yield NA
  # for missing ids and poison the sum, so missing ids are handled explicitly,
  # consistently with the match() based count above.
  linked_a <- ids_a[inds[, 1]]
  linked_b <- ids_b[inds[, 2]]
  both_missing <- is.na(linked_a) & is.na(linked_b)
  same_id <- !is.na(linked_a) & !is.na(linked_b) & linked_a == linked_b
  N_linked_matches <- sum(same_id | both_missing)

  # False negatives
  N_unlinked_matches <- N_true_matches - N_linked_matches
  # False positives
  N_linked_nonmatches <- nrow(inds) - N_linked_matches
  # True negatives
  N_unlinked_nonmatches <- N_true_nonmatches - N_linked_nonmatches

  confusion_matrix <- data.frame(
    True_match = c(N_linked_matches, N_unlinked_matches),
    True_Nonmatch = c(N_linked_nonmatches, N_unlinked_nonmatches)
  )
  rownames(confusion_matrix) <- c("Detected Link", "Detected Non-Link")

  return(confusion_matrix)
}

#' Summary statistics for a linkage confusion matrix
#'
#' Derives the usual binary-classification rates from a 2x2 confusion table
#' laid out as produced by \code{getConfusion}: row 1 holds the detected links,
#' row 2 the detected non-links, column 1 the true matches and column 2 the
#' true non-matches.
#'
#' Ratios whose denominator is zero are not special-cased; they come out as
#' \code{NaN} under ordinary R arithmetic.
#'
#' @usage getConfusionInfo(conmat)
#'
#' @param conmat The confusion matrix generated by getConfusion
#'
#' @return \code{getConfusionInfo} Returns a dataframe with a single column
#'   \code{info} and one named row per statistic: sensitivity, specificity,
#'   positive and negative predictive value, false negative rate, false
#'   positive rate, false discovery rate, false omission rate, threat score,
#'   accuracy, F1 score, informedness, markedness, and the total number of
#'   true matches and true non-matches.
#'
#' @author Sam Murray<slmurray@andrew.cmu.edu>
#' @export
#' @import tidyverse
getConfusionInfo <- function(conmat) {

  # We call 2 records matches if they represent the same entity
  # We call 2 records linked if they are detected as a match by the linking rule

  TruePositive <- conmat[1, 1]
  FalseNegative <- conmat[2, 1]
  FalsePositive <- conmat[1, 2]
  TrueNegative <- conmat[2, 2]

  # Column totals: all true matches and all true non-matches
  TotalPositive <- TruePositive + FalseNegative
  TotalNegative <- TrueNegative + FalsePositive

  sensitivity <- TruePositive / TotalPositive
  specificity <- TrueNegative / TotalNegative

  # Positive Predictive Value
  PPV <- TruePositive / (TruePositive + FalsePositive)
  # Negative Predictive Value
  NPV <- TrueNegative / (TrueNegative + FalseNegative)
  # False Negative Rate
  FNR <- 1 - sensitivity
  # False Positive Rate
  FPR <- 1 - specificity
  # False Discovery Rate
  FDR <- 1 - PPV
  # False Omission Rate
  FOR <- 1 - NPV
  # Threat Score: TP / (TP + FN + FP). TotalPositive already contains TP + FN,
  # so the remaining term to add is the false positives.
  TS <- TruePositive / (TotalPositive + FalsePositive)
  # Accuracy
  ACC <- (TruePositive + TrueNegative) / (TotalNegative + TotalPositive)
  # F1 Score
  F1 <- 2 * (sensitivity * PPV) / (sensitivity + PPV)
  # Informedness
  BM <- sensitivity + specificity - 1
  # Markedness
  MK <- PPV + NPV - 1

  # The row label "Total Postive" is misspelt, but it is kept unchanged because
  # callers may look the row up by name.
  out <- data.frame(
    info = c(
      sensitivity, specificity, PPV, NPV, FNR, FPR, FDR, FOR, TS, ACC, F1,
      BM, MK, TotalPositive, TotalNegative
    ),
    row.names = c(
      "sensitivity", "specificity", "Positive Predictive Value",
      "Negative Predictive Value", "False Negative Rate",
      "False Positive Rate", "False Discovery Rate", "False Omission Rate",
      "Threat Score", "Accuracy", "F1 Score",
      "Informedness", "Markedness", "Total Postive", "Total Negative"
    )
  )
  return(out)
}