R/internals.r

Defines functions resample deleteNULLs isFALSE .lib_path .allows_extensions init_sqlite_extensions getSQLStatement blockfldfun

Documented in .allows_extensions blockfldfun deleteNULLs getSQLStatement init_sqlite_extensions isFALSE .lib_path resample

# internals.r: Various utility functions which are (usually) not called from
# outside the package


### Functions to create SQL statements and to retreive record pairs from
### the database

# construct part of where-clause which represents blocking restrictions
# make something like 'list(1:2, 3)' to something like
# '(t1.field1=t2.field1 and t1.field2=t2.field2) or (t1.field3=t2.field3)'
blockfldfun <- function(blockfld, phoneticFld, phoneticFun, coln)
{
  blockElemFun <- function(fldIndex)
  {
    if (fldIndex %in% phoneticFld)
      return(sprintf("%1$s(t1.'%2$s')=%1$s(t2.'%2$s')", phoneticFun, coln[fldIndex]))
    else
      return(sprintf("t1.'%1$s'=t2.'%1$s'", coln[fldIndex]))
  }

 paste("(", paste(sapply(blockfld, function(blockvec)
                  paste(sapply(blockvec, blockElemFun),
                        collapse=" and ")),
                  collapse=") or ("), ")", sep="")
}

#' Create SQL statement
#'
#' Creates SQL statememt to retreive comparison patterns, respecting
#' parameters such as blocking definition and exclusion of fields.
#'
#' @value A list with components "select_list", "from_clause", "where_clause"
#' representing the corresponding parts of the query without the keywords
#' 'SELECT', 'FROM' and 'WHERE'.
getSQLStatement <- function(data1, data2 = data1, con, type, blockFld,
  excludeFld, strcmpFld, strcmpFun, phoneticFld, phoneticFun)
{
  # constructs select for a single column, to be used by lapply 
  # (see below)
  selectListElem <- function(fldIndex, coln, excludeFld, strcmpFld, strcmpFun,
                            phoneticFld, phoneticFun)
  {
    # nothing if field is excluded
    if (fldIndex %in% excludeFld)
      return(character(0))
      
    # enclose fields in phonetic function if desired
    if (fldIndex %in% phoneticFld)
    {
      fld1 <- sprintf("%s(t1.'%s')", phoneticFun, coln[fldIndex])
      fld2 <- sprintf("%s(t2.'%s')", phoneticFun, coln[fldIndex])
    } else
    {
      fld1 <- sprintf("t1.'%s'", coln[fldIndex])
      fld2 <- sprintf("t2.'%s'", coln[fldIndex])
    }


    # something like 'jarowinkler(t1.fname, t2.fname) as fname'
    if (fldIndex %in% strcmpFld)
      return(sprintf("%s(%s, %s) as '%s'", strcmpFun, fld1, fld2, coln[fldIndex]))


    # direct comparison: something like 't1.fname=t2.fname as fname'      
    return(sprintf("%s=%s as '%s'", fld1, fld2, coln[fldIndex]))
  }
  coln <- colnames(data1) # make.db.names(con, colnames(data1), keywords = SQLKeywords(con))

  selectlist_id <- "t1.row_names as id1, t2.row_names as id2"
  # use unlist to delete NULLs from list
  selectlist <- paste(unlist(lapply(1:length(coln), selectListElem,
    coln, excludeFld, strcmpFld, strcmpFun,
    phoneticFld, phoneticFun)), collapse = ", ")
  selectlist <- paste(selectlist, "t1.identity=t2.identity as is_match", sep=",")
  fromclause <- switch(type, deduplication = "data t1, data t2",
                                      linkage = "data1 t1, data2 t2")
  whereclause <- switch(type, deduplication = "t1.row_names < t2.row_names",
                                      linkage = "1")
  if (length(blockFld)>0)
  {
   whereclause <- sprintf("%s and (%s)", whereclause, blockfldfun(blockFld,
    phoneticFld, phoneticFun, coln))
  }
  return(list(select_list = paste(selectlist_id, selectlist, sep=", "),
                from_clause = fromclause, where_clause = whereclause))
}


#' Begin generation of data pairs
#'
#' An SQL statement representing the generation of data pairs, including
#' the configuration of blocking fields, phonetics etc. is constructed and
#' send to SQLite.
setGeneric(
  name = "begin",
  def = function(x, ...) standardGeneric("begin")
)
      
setMethod(
  f = "begin",
  signature = "RLBigData",
  definition = function(x, ...)
  {
    sql <- getSQLStatement(x)  
    query <- sprintf("select %s from %s where %s", sql$select_list, 
      sql$from_clause, sql$where_clause)
    dbSendQuery(x@con, query) # can be retreived via dbListResults(x@con)[[1]]
    invisible(x)
  }
)

# retreive next n pairs
setGeneric(
  name = "nextPairs",
  def = function(x, n=10000, ...) standardGeneric("nextPairs")
)

setMethod(
  f = "nextPairs",
  signature = "RLBigData",
  definition = function(x, n=10000, ...)
  {
    res <- dbListResults(x@con)[[1]]
    result <- fetch(res, n)
    # Spalten, die nur NA enthalten, werden als character ausgegeben, deshalb
    # Umwandlung nicht-numerischer Spalten in numeric
    if (nrow(result) > 0) # wichtig, weil sonst Fehler bei Zugriff auf Spalten auftritt
    {
      for (i in 1:ncol(result))
      {
        if (!is.numeric(result[,i]))
          result[,i] <- as.numeric(result[,i])
      }
    }
    # column names may have changed due to SQL conform conversion, reset them
    colnames(result)[-c(1,2,ncol(result))] <- getColumnNames(x)
    result
  }
)

# Clean up after retreiving data pairs (close result set in database)
setGeneric(
  name = "clear",
  def = function(x, ...) standardGeneric("clear")
)

setMethod(
  f = "clear",
  signature = "RLBigData",
  definition = function(x, ...) dbClearResult(dbListResults(x@con)[[1]])
)


### Functions neccessary to load extensions into SQLite database

# Function body taken from init_extension, package RSQLite.extfuns
init_sqlite_extensions <- function(db)
{
    ans <- FALSE
    if (.allows_extensions(db)) {
        res <- dbGetQuery(db, sprintf("SELECT load_extension('%s')",
                                      .lib_path()))
        ans <- all(dim(res) == c(1, 1))
    } else {
        stop("loadable extensions are not enabled for this db connection")
    }
    ans
}

# taken from RSQLite.extfuns
.allows_extensions <- function(db)
{
    v <- db@loadable.extensions
    isTRUE(v)
}


# taken from RSQLite.extfuns
.lib_path <- function()
{
    ## this is a bit of a trick, but the NAMESPACE code
    ## puts .packageName in the package environment and this
    ## seems slightly better than hard-coding.
    ##
    ## This also relies on the DLL being loaded even though only SQLite
    ## actually needs to load the library.  It does not appear that
    ## loading it causes any harm and it makes finding the path easy
    ## (don't have to worry about arch issues).
    getLoadedDLLs()[[.packageName]][["path"]]
}



# get count of each distinct comparison pattern
# Fuzzy values above cutoff are converted to 1, below cutoff to 0
# NAs are converted to 0
setGeneric(
  name = "getPatternCounts",
  def = function(x, n=10000, cutoff=1, withProgressBar = (sink.number()==0))
    standardGeneric("getPatternCounts")
)

setMethod(
  f = "getPatternCounts",
  signature = "RLBigData",
  definition = function(x, n=10000, cutoff=1, withProgressBar = (sink.number()==0))
  {
    if (withProgressBar)
    {
      pgb <- txtProgressBar(max=nrow(x@pairs))
    }

    patternCounts <- 0L
    ffrowapply(
      {
        slice <- as.ram(x@pairs[i1:i2, 3:(ncol(x@pairs)-1), drop=FALSE])
        slice[is.na(slice)] <- 0
        slice[slice < cutoff] <- 0
        slice[slice >= cutoff] <- 1
        patternCounts <- patternCounts + countpattern(slice)
        if (withProgressBar) setTxtProgressBar(pgb, i2)
      }, X = x@pairs)
      patternCounts
  }
)

# get number of matches
setGeneric(
  name = "getMatchCount",
  def = function(object) standardGeneric("getMatchCount")
)

setMethod(
  f = "getMatchCount",
  signature = "RLBigData",
  definition = function(object)
  {
    sum(as.ram(object@pairs$is_match), na.rm=TRUE)
  }
)

# get number of non-matches
setGeneric(
  name = "getNonMatchCount",
  def = function(object) standardGeneric("getNonMatchCount")
)

setMethod(
  f = "getNonMatchCount",
  signature = "RLBigData",
  definition = function(object)
  {
#    sum(chunkify(function(x) x==0)(object@pairs$is_match), na.rm=TRUE)
    sum(as.ram(object@pairs$is_match)==0, na.rm=TRUE)
  }
)

# Get the number of pairs with unknown matching status
setGeneric(
  name = "getNACount",
  def = function(object) standardGeneric("getNACount")
)

setMethod(
  f = "getNACount",
  signature = "RLBigData",
  definition = function(object)
  {
#    sum(chunkify(is.na)(object@pairs$is_match))
    sum(is.na(as.ram(object@pairs$is_match)))
  }
)


setGeneric(
  name = "getColumnNames",
  def = function(object, withExcluded = FALSE) standardGeneric("getColumnNames")
)

setMethod(
  f = "getColumnNames",
  signature = "RLBigDataDedup",
  definition = function(object, withExcluded = FALSE)
  {
    if (withExcluded || length(object@excludeFld)==0) colnames(object@data)
    else colnames(object@data)[-object@excludeFld]
  }
)

setMethod(
  f = "getColumnNames",
  signature = "RLBigDataLinkage",
  definition = function(object, withExcluded = FALSE)
  {
    if (withExcluded || length(object@excludeFld)==0) colnames(object@data1)
    else colnames(object@data1)[-object@excludeFld]
  }
)

### Various other utility functions

# utility function to generate all unordered pairs of x[1]..x[n]
# if x is a vector, or 1..x
unorderedPairs <- function (x)
{
    if (length(x)==1)
    {
      if (!is.numeric(x) || x < 2)
        stop("x must be a vector or a number >= 2")
        return (array(unlist(lapply(1:(x-1),
          function (k) rbind(k,(k+1):x))),dim=c(2,x*(x-1)/2)))
    }
    if (!is.vector(x))
      stop ("x must be a vector or a number >= 2")
    n=length(x)
    return (array(unlist(lapply(1:(n-1),
    function (k) rbind(x[k],x[(k+1):n]))),dim=c(2,n*(n-1)/2)))
}

isFALSE <- function(x) identical(x,FALSE)

deleteNULLs  <-  function(x)
    x[unlist(lapply(x, length) != 0)]

# interprets a scalar x as a set with 1 element (see also man page for sample)
resample <- function(x, size, ...)
     if(length(x) <= 1) { if(!missing(size) && size == 0) x[FALSE] else x
     } else sample(x, size, ...)


# modified function from package e1071, works also for data with only one column
countpattern <- function (x, matching = FALSE)
{
    nvar <- dim(x)[2]
    n <- dim(x)[1]
    b <- matrix(0, 2^nvar, nvar)
    for (i in 1:nvar) b[, nvar + 1 - i] <- rep(rep(c(0, 1), c(2^(i -
        1), 2^(i - 1))), 2^(nvar - i))
    namespat <- character(nrow(b))
    for (i in 1:nvar) namespat <- paste(namespat, b[, i], sep = "")
    xpat <- numeric(nrow(x))
    for (i in 1:nvar) xpat <- 2 * xpat + x[, i]
    xpat <- xpat + 1
    pat <- tabulate(xpat, nbins = 2^nvar)
    names(pat) <- namespat
    if (matching)
        return(list(pat = pat, matching = xpat))
    else return(pat)
}

Try the RecordLinkage package in your browser

Any scripts or data that you put into this service are public.

RecordLinkage documentation built on Nov. 10, 2022, 5:42 p.m.