Nothing
#' Creation of Comparison Data
#'
#' Create comparison vectors for all pairs of records coming from
#' two datafiles to be linked.
#'
#' @param df1,df2 two datasets to be linked, of class \code{data.frame}, with rows representing records and columns
#' representing fields. Without loss of generality,
#' \code{df1} is assumed to have no less records than \code{df2}.
#' @param flds a vector indicating the fields to be used in the linkage. Either a \code{character} vector, in which case
#' all entries need to be names of columns of \code{df1} and \code{df2}, or a \code{numeric} vector
#' indicating the columns in \code{df1} and \code{df2} to be used in the linkage. If provided as a
#' \code{numeric} vector it is assumed that the columns of \code{df1} and \code{df2} are organized such that
#' it makes sense to compare the columns
#' \code{df1[,flds]} and \code{df2[,flds]} in that order.
#' @param flds1,flds2 vectors indicating the fields of \code{df1} and \code{df2} to be used in the linkage.
#' Either \code{character} vectors, in which case
#' all entries need to be names of columns of \code{df1} and \code{df2}, respectively, or \code{numeric} vectors
#' indicating the columns in \code{df1} and \code{df2} to be used in the linkage. It is assumed that
#' it makes sense to compare the columns
#' \code{df1[,flds1]} and \code{df2[,flds2]} in that order. These arguments are ignored if \code{flds} is specified.
#' If none of \code{flds,flds1,flds2} are specified, the columns with the same names in \code{df1} and \code{df2}
#' are compared, if any.
#' @param types a vector of characters indicating the comparison type per comparison field. The options
#' are: \code{"lv"} for comparisons based on the Levenshtein edit distance normalized to \eqn{[0,1]}, with \eqn{0}
#' indicating no disagreement and \eqn{1} indicating maximum disagreement;
#' \code{"bi"} for binary comparisons (agreement/disagreement); \code{"nu"} for numeric comparisons computed as
#' the absolute difference.
#' The default is \code{"lv"}. Fields compared with the \code{"lv"} option are first transformed to \code{character}
#' class. Factors with different levels compared using the \code{"bi"} option are transformed to factors with the union
#' of the levels. Fields compared with the \code{"nu"} option need to be of class \code{numeric}.
#' @param breaks break points for the comparisons to obtain levels of disagreement.
#' It can be a list of length equal to the number of comparison fields, containing one numeric vector with the break
#' points for each comparison field, where entries corresponding to comparison type \code{"bi"} are ignored.
#' It can also be a named list of length two with elements 'lv' and 'nu'
#' containing numeric vectors with the break
#' points for all Levenshtein-based and numeric comparisons, respectively.
#' Finally, it can be a numeric vector with the break points for all comparison fields of type \code{"lv"} and \code{"nu"},
#' which might be meaningful only if all the non-binary comparisons are of a single type, either \code{"lv"} or \code{"nu"}.
#' For comparisons based on the normalized Levenshtein distance, a vector of length \eqn{L} of break
#' points for the interval \eqn{[0,1]} leads to \eqn{L+1} levels of disagreement. Similarly, for comparisons based on the absolute
#' difference, the break points are for the interval \eqn{[0,\infty)}.
#' The default is \code{breaks=c(0,.25,.5)}, which might be meaningful only for comparisons of type \code{"lv"}.
#' @return a list containing:
#' \describe{
#' \item{\code{comparisons}}{
#' matrix with \code{n1*n2} rows, where the comparison pattern for record pair \eqn{(i,j)}
#' appears in row \code{(j-1)*n1+i}, for \eqn{i} in \eqn{{1,\dots,n1}}, and \eqn{j} in \eqn{{1,\dots,n2}}.
#' A comparison field with \eqn{L+1} levels of disagreement,
#' is represented by \eqn{L+1} columns of TRUE/FALSE indicators. Missing comparisons are coded as FALSE,
#' which is justified under an assumption of ignorability of the missing comparisons, see Sadinle (2017).
#' }
#' \item{\code{n1,n2}}{the datafile sizes, \code{n1 = nrow(df1)} and \code{n2 = nrow(df2)}.}
#' \item{\code{nDisagLevs}}{a vector containing the number of levels of
#' disagreement per comparison field.}
#' \item{\code{compFields}}{a data frame containing the names of the fields in the datafiles used in the comparisons
#' and the types of comparison.}
#' }
#'
#' @references Mauricio Sadinle (2017). Bayesian Estimation of Bipartite Matchings for Record Linkage. \emph{Journal of the
#' American Statistical Association} 112(518), 600-612. [\href{https://doi.org/10.1080/01621459.2016.1148612}{Published}] [\href{https://arxiv.org/abs/1601.06630}{arXiv}]
#'
#' @examples
#' data(twoFiles)
#'
#' myCompData <- compareRecords(df1, df2,
#' flds=c("gname", "fname", "age", "occup"),
#' types=c("lv","lv","bi","bi"),
#' breaks=c(0,.25,.5))
#'
#' ## same as
#' myCompData <- compareRecords(df1, df2, types=c("lv","lv","bi","bi"))
#'
#'
#' ## let's transform 'occup' to numeric to illustrate how to obtain numeric comparisons
#' df1$occup <- as.numeric(df1$occup)
#' df2$occup <- as.numeric(df2$occup)
#'
#' ## using different break points for 'lv' and 'nu' comparisons
#' myCompData1 <- compareRecords(df1, df2,
#' flds=c("gname", "fname", "age", "occup"),
#' types=c("lv","lv","bi","nu"),
#' breaks=list(lv=c(0,.25,.5), nu=0:3))
#'
#' ## using different break points for each comparison field
#' myCompData2 <- compareRecords(df1, df2,
#' flds=c("gname", "fname", "age", "occup"),
#' types=c("lv","lv","bi","nu"),
#' breaks=list(c(0,.25,.5), c(0,.2,.4,.6), NULL, 0:3))
compareRecords <- function(df1, df2, flds=NULL, flds1=NULL, flds2=NULL, types=NULL, breaks=c(0,.25,.5)){
warn <- FALSE
# control the input
if(!is.data.frame(df1)) stop("'df1' is not a data frame")
if(!is.data.frame(df2)) stop("'df2' is not a data frame")
if(!is.null(flds)){ # user provides 'flds' fields in 'df1' and 'df2' for linkage
if( !(class(flds) %in% c("character", "numeric")) )
stop("'flds' should be of class character or numeric")
if(!is.null(flds1)) warning("Argument 'flds' was provided, 'flds1' is ignored")
if(!is.null(flds2)) warning("Argument 'flds' was provided, 'flds2' is ignored")
flds1 <- flds2 <- flds
if(is.numeric(flds)){
fldsCheck1 <- flds %in% seq_len(ncol(df1))
if(!all(fldsCheck1)) stop("Some numbers in 'flds' are out of the column range for 'df1'")
fldsCheck2 <- flds %in% seq_len(ncol(df2))
if(!all(fldsCheck2)) stop("Some numbers in 'flds' are out of the column range for 'df2'")
warn <- TRUE
}
if(is.character(flds)){
fldsCheck1 <- flds %in% colnames(df1)
if(!all(fldsCheck1)) stop("Some names in 'flds' are not columns of 'df1'")
fldsCheck2 <- flds %in% colnames(df2)
if(!all(fldsCheck2)) stop("Some names in 'flds' are not columns of 'df2'")
flds1 <- match(flds1, colnames(df1))
flds2 <- match(flds2, colnames(df2))
}
}else{
if(is.null(flds1) & !is.null(flds2)) stop("Argument 'flds1' also needs to be specified")
if(is.null(flds2) & !is.null(flds1)) stop("Argument 'flds2' also needs to be specified")
if(is.null(flds2) & is.null(flds2)){
flds1 <- flds2 <- intersect(colnames(df1), colnames(df2))
if(length(flds1)==0)
stop("'df1' and 'df2' do not have fields with the same names")
flds1 <- match(flds1, colnames(df1))
flds2 <- match(flds2, colnames(df2))
}else{ # user provides 'flds1' and 'flds2' fields in 'df1' and 'df2' for linkage
if( !(class(flds1) %in% c("character", "numeric")) )
stop("'flds1' should be of class character or numeric")
if( !(class(flds2) %in% c("character", "numeric")) )
stop("'flds2' should be of class character or numeric")
if(is.numeric(flds1)){
flds1Check <- flds1 %in% seq_len(ncol(df1))
if(!all(flds1Check)) stop("Some numbers in 'flds1' are out of the column range for 'df1'")
}
if(is.numeric(flds2)){
flds2Check <- flds2 %in% seq_len(ncol(df2))
if(!all(flds2Check)) stop("Some numbers in 'flds2' are out of the column range for 'df2'")
}
if(is.character(flds1)){
flds1Check <- flds1 %in% colnames(df1)
if(!all(flds1Check)) stop("Some names in 'flds1' are not columns of 'df1'")
flds1 <- match(flds1, colnames(df1))
}
if(is.character(flds2)){
flds2Check <- flds2 %in% colnames(df2)
if(!all(flds2Check)) stop("Some names in 'flds2' are not columns of 'df2'")
flds2 <- match(flds2, colnames(df2))
}
if(length(flds1)!=length(flds2)) stop("'flds1' and 'flds2' should have the same length")
}
}
n1 <- nrow(df1)
n2 <- nrow(df2)
if(n2 > n1) stop("'df2' has more records than 'df1'")
F <- length(flds1)
if(is.null(types)) types <- rep("lv",F)
if(F != length(types))
stop("Length of 'types' does not equal number of fields used for comparison")
if( (!is.character(types)) | any( !(types %in% c("lv","bi","nu")) ) )
stop("'types' should be a character vector of 'lv', 'bi', and/or 'nu' indicating Levenshtein-based,
binary, or numeric comparisons")
c1 <- sapply(df1[,flds1], class)
c2 <- sapply(df2[,flds2], class)
if( any(types=="nu") & any(c1[types=="nu", drop=FALSE] != "numeric") )
stop(paste("Numeric comparison requested for columns '", paste(colnames(df1)[flds1[types=="nu"]],collapse="' '"),
"' in 'df1', but at least one of these is not numeric", sep=""))
if( any(types=="nu") & any(c2[types=="nu", drop=FALSE] != "numeric") )
stop(paste("Numeric comparison requested for columns '", paste(colnames(df2)[flds2[types=="nu"]],collapse="' '"),
"' in 'df2', but at least one of these is not numeric", sep=""))
if( !(class(breaks) %in% c("list", "numeric")) )
stop("'breaks' should be of class list or numeric")
if( class(breaks) == "list" ){ # if user specifies list with breaks
if(length(breaks) == F){ # breaks for each comparison field
for(fld in 1:F){ # check the breaks provided for each field
if(types[fld]=="lv"){
if( (!is.numeric(breaks[[fld]])) | any(breaks[[fld]] < 0 | breaks[[fld]] > 1) )
stop(paste("'types[",fld,"]' specified as 'lv', so 'breaks[[",fld,"]]' should be a vector of numbers in [0,1]",
sep=""))
breaks[[fld]] <- unique(c(-Inf,breaks[[fld]],Inf))
}
if(types[fld]=="nu"){
if( (!is.numeric(breaks[[fld]])) | any(breaks[[fld]] < 0) )
stop(paste("'types[",fld,"]' specified as 'nu', so 'breaks[[",fld,"]]' should be a vector of non-negative numbers",
sep=""))
breaks[[fld]] <- unique(c(-Inf,breaks[[fld]],Inf))
}
}
}else{ # if length of list is not F, then list needs to be named with elements 'lv' and 'nu'
if( (length(breaks) != 2) | (!all(names(breaks) %in% c("lv","nu"))) )
stop("When 'breaks' is specified as a list, it should either have length equal to the number of comparison fields, or be a named list with elements 'lv' and 'nu'")
if( (!is.numeric(breaks$lv)) | any(breaks$lv < 0 | breaks$lv > 1) )
stop("'breaks$lv should be a vector of numbers in [0,1]")
breaks$lv <- unique(c(-Inf,breaks$lv,Inf))
if( (!is.numeric(breaks$nu)) | any(breaks$nu < 0) )
stop("'breaks$nu should be a vector of non-negative numbers")
breaks$nu <- unique(c(-Inf,breaks$nu,Inf))
breaks1 <- list()
breaks1[1:F] <- list(NULL)
breaks1[types=="nu"] <- list(breaks$nu)
breaks1[types=="lv"] <- list(breaks$lv)
breaks <- breaks1
}
}
if( class(breaks) == "numeric" ){ # if user specifies only one vector with breaks for all non-binary comparison fields
if( any(breaks < 0) )
stop("When 'breaks' is specified as a numeric vector, it should contain non-negative numbers")
if( all(types %in% c("lv","bi")) & any(breaks > 1) )
stop("When 'breaks' is specified as a numeric vector, it should contain numbers in [0,1] if all the non-binary comparisons in 'types' are 'lv'")
breaks <- unique(c(-Inf,breaks,Inf))
breaks1 <- list()
breaks1[1:F] <- list(breaks)
breaks <- breaks1
}
pairInds1 <- rep(1:n1, n2)
pairInds2 <- rep(1:n2, each=n1)
comparisons <- list()
for(fld in 1:F){
if(types[fld]=="bi"){
# computing agreement levels for binary comparisons
if( c1[fld] != c2[fld] )
warning(paste("Columns '", colnames(df1)[flds1[fld]], "' in 'df1' and '" ,
colnames(df2)[flds2[fld]], "' in 'df2' are of different classes", sep=""))
if( (c1[fld] == "factor")&(c2[fld] == "factor") ){
levs1 <- sort(levels(df1[,flds1[fld]]))
levs2 <- sort(levels(df2[,flds2[fld]]))
if(!identical(levs1,levs2)){
warning(paste("Columns '", colnames(df1)[flds1[fld]], "' in 'df1' and '" ,
colnames(df2)[flds2[fld]], "' in 'df2' are factors with different levels", sep=""))
unilevs <- union(levs1,levs2)
df1[,flds1[fld]] <- factor(df1[,flds1[fld]], levels = unilevs)
df2[,flds2[fld]] <- factor(df2[,flds2[fld]], levels = unilevs)
}
}
same <- df1[pairInds1,flds1[fld]] == df2[pairInds2,flds2[fld]]
AgrLev <- 1*same
AgrLev[!same] <- 2
comparisons[[fld]] <- as.factor(AgrLev)
}
if(types[fld]=="nu"){
absDiff <- abs(df1[pairInds1,flds1[fld]] - df2[pairInds2,flds2[fld]])
AgrLev <- cut(absDiff, breaks=breaks[[fld]], labels=seq_len(length(breaks[[fld]])-1))
comparisons[[fld]] <- as.factor(AgrLev)
}
if(types[fld]=="lv"){
# computing agreement levels for Levenshtein-based comparisons
df1[,flds1[fld]] <- as.character(df1[,flds1[fld]])
df2[,flds2[fld]] <- as.character(df2[,flds2[fld]])
# adist in the utils package returns the matrix of Levenshtein distances
lvd <- as.numeric(utils::adist(df1[,flds1[fld]], df2[,flds2[fld]]))/
pmax(nchar(df1[,flds1[fld]])[pairInds1], nchar(df2[,flds2[fld]])[pairInds2])
AgrLev <- cut(lvd, breaks=breaks[[fld]], labels=seq_len(length(breaks[[fld]])-1))
comparisons[[fld]] <- as.factor(AgrLev)
}
}
nDisagLevs <- sapply(comparisons, FUN=nlevels)
# next four lines compute the binary indicators from the agreement levels
xfun <- function(f) paste("(comparisons[[",f,"]]==",seq_len(nDisagLevs[f]),")")
expr1 <- paste(sapply(sapply(seq_len(F),xfun),FUN=paste,collapse=","),collapse=",")
expr2 <- paste("cbind(",expr1,")")
comparisons <- eval(parse(text=expr2))
# replacing NAs by FALSE or zeroes is justified by ignorability and CI assumptions (see Sadinle 2017)
comparisons[is.na(comparisons)] <- FALSE
df1Fields <- colnames(df1)[flds1]
df2Fields <- colnames(df2)[flds2]
compFields <- data.frame(file1=df1Fields, file2=df2Fields, types=types)
if(any(df1Fields != df2Fields) | warn){
warning(paste("The fields '", paste(df1Fields,collapse="' '"),
"' in 'df1' are being compared with the fields '",
paste(df2Fields,collapse="' '"), "' in 'df2'",
sep=""))
}
out <- list(comparisons=comparisons, n1=n1, n2=n2, nDisagLevs=nDisagLevs,
compFields=compFields)
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.