R/dupes.R

#' Identify duplicate entries
#'
#' For a tabular set of publication records, identifies potential sets of
#' duplicate entries and labels them with a unique identifier.
#'
#' @param x The dataset in which duplicate entries will be identified
#' @param match_cols Column(s) that will be used to search for duplicate
#' records
#' @param approx_match Whether to perform a duplicate search using
#' string distances or exact values
#' @param string_dist When using approximate matching, the string
#' distance cutoff at which records will be assumed duplicated
#' @param min_length The minimum length for the combined matching string
#' produced by \code{match_cols} at which a record will be considered for
#' matching
#' @param simplify_match Whether to perform duplicate searches after removing
#' all non alpha-numeric characters from the reference string generated from
#' \code{match_cols}
#' @return An updated version of \code{x}, with one column specifying the
#' final string used to search for duplicates (\code{matching_col})
#' and another column containing unique identifiers for each set of
#' duplicates (\code{match_ID}).
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_find <- function(x,
                            match_cols,
                            approx_match = FALSE,
                            string_dist = 5,
                            min_length = 10,
                            simplify_match = TRUE) {
    UseMethod("dupes_find")
}

#' @export
dupes_find.default <- function(x, ...) {
    stop("x must be of class data.frame")
}

#' @export
dupes_find.data.frame <- function(x,
                                       match_cols,
                                       approx_match = FALSE,
                                       string_dist = 5,
                                       min_length = 10,
                                       simplify_match = TRUE) {

    # Remove the unique ID
    x[, "UNIQUE_ID"] <- NULL

    # Matching column ---------------------------------------------------
    x <- add_matching_col(x, match_cols, simplify_match = simplify_match)

    # Flag rows based on string length ----------------------------------
    if (!is.null(min_length)) {
        x <- protect_min_length(x, min_length)
    }

    # Add matching IDs based on exact or fuzzy matching -----------------
    if (approx_match) {
        x <- fuzzy_matching(x, string_dist)
    } else {
        x <- strict_matching(x)
    }

    return(x)
}

#' Return duplicated entries
#'
#' Takes output generated by \code{dupes_find}, and returns a
#' \code{data.frame} containing only potentially duplicated records.
#'
#' @param x The ouput of \code{dupes_find}
#' @return A \code{data.frame} containing only entries of the original dataset
#' identified as duplicate entries. This is meant to facilitate manual
#' inspection of potential duplicates. Aberrantly identified duplicates
#' may subsequently be spared when running \code{dupes_rm}
#' by indicating their match IDs in \code{dupes_rm}'s
#' \code{ignore_IDs} argument.
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_return <- function(x) {
    UseMethod("dupes_return")
}

#' @export
dupes_return.default <- function(x, ...) {
    stop("x must be of class data.frame")
}

#' @export
dupes_return.data.frame <- function(x) {
    matched_match_IDs <- get_dupe_m_id(x)
    x <- x[x[,"match_ID"] %in% matched_match_IDs, ]
    idx <- order(x[, "match_ID"])
    return(x[idx,])
}

#' Remove duplicate entries
#'
#' Takes output of \code{dupes_find}, and returns a set of records with
#' duplicate entries removed.
#'
#' After running \code{dupes_find}, duplicate sets of records will contain
#' identical match IDs specified in the \code{match_ID} column. This
#' function will remove all but one entry from each set of duplicates. The
#' preserved entry is selected based on the user's preferred database
#' (\code{db_pref}) and the entry within the duplicate set with the
#' least number of blank cells.
#'
#' @param x The output of \code{dupes_find}
#' @param db_pref The preferred database for a reference.
#' @param ignore_IDs A set of match IDs that will be ignored when removing
#' duplicate records. These should be identified by manually inspecting
#' the output of \code{dupes_return}.
#' @return An updated version of \code{x}, with duplicate records removed.
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_rm <- function(x, db_pref = NULL, ignore_IDs = NULL) {
    UseMethod("dupes_rm")
}

#' @export
dupes_rm.default <- function(x, ...) {
    stop("x must be of class data.frame")
}

#' @export
dupes_rm.data.frame <- function(x, db_pref = NULL, ignore_IDs = NULL) {

    # Remove matching column here
    x$matching_col <- NULL

    # Add internal ID to x
    x$internal_ID <- 1:nrow(x)

    # Create a search variable if database precedence is specified
    if(!is.null(db_pref)) {
        db_search <- paste(db_pref, collapse = "|")
    }

    # Get the match IDs of duplicates
    duplicated_match_IDs <- get_dupe_m_id(x)

    # Remove those IDs that have been manually specified
    if (!is.null(ignore_IDs)) {
        duplicated_match_IDs <- setdiff(duplicated_match_IDs, ignore_IDs)
    }

    for (id in duplicated_match_IDs) {

        # Create duplicates df here
        duplicate.df <- x[x[, "match_ID"] == id, ]

        # Skip if there aren't any duplicates
        if (nrow(duplicate.df) == 1) next

        # Give precedence to certain databases if they are contained
        # within the duplicates dataframe
        if (!is.null(db_pref) &&
            sum(grepl(db_search, duplicate.df[, "DATABASE"])) > 0) {

            # Find which IDs to remove
            removal_ids <- duplicate.df[!(grepl(db_search,
                                                duplicate.df[,"DATABASE"])),
                                        "internal_ID"]

            # Remove IDs from duplicate df and original df
            duplicate.df <-
                duplicate.df[!(duplicate.df[, "internal_ID"] %in% removal_ids), ]
            x <- x[!(x[, "internal_ID"] %in% removal_ids), ]

            if (nrow(duplicate.df) == 1) next
        }
        # Choose entry with the most available data
        idx <- which.min(rowSums(duplicate.df == "", na.rm = TRUE))
        removal_ids <- duplicate.df[-idx, "internal_ID"]
        x <- x[!(x[, "internal_ID"] %in% removal_ids), ]
    }
    # Remove internal ID
    x$internal_ID <- NULL
    # Remove match_ID
    x$match_ID <- NULL
    # Add a unique identifier that will allow the output to be used as
    # a reference for future tables
    x <- data.frame(UNIQUE_ID = 1:nrow(x),
                    x,
                    stringsAsFactors = FALSE)

    return(x)
}

# Helper functions --------------------------------------------------------

# Remove all non alpha-numeric characters (including
# punctuation) and to convert all characters to lower-case.
simplify_string <- function(x) {
    x <- gsub("[^a-zA-z0-9]", "", x)
    x <- gsub("[[:punct:]]", "", x)
    x <- tolower(x)
    return(x)
}

# Takes a data.frame and a list of columns and creates a new column composed of
# the pasted contents of the specified columns. Additionally, allows
# string simplification. The purpose is to create a single column on which
# to eventually compute string distances for the purpose of fuzzy matching.
add_matching_col <- function(x, match_cols, simplify_match = TRUE) {

    # Add an empty match_ID to identify matching rows
    x$match_ID <- NA

    if(length(match_cols) > 1) {
        x$matching_col <- apply(x[, match_cols], 1, paste0, collapse = "")
    } else {
        x$matching_col <- x[, match_cols]
    }
    if (simplify_match) {
        x$matching_col <- sapply(x$matching_col, simplify_string)
    }
    return(x)
}

# Returns logical index indicating if row should be flagged based on a minimum
# string length
protect_min_length <- function(x, min_length) {

    idx <- which(nchar(x$matching_col) < min_length)
    if (length(idx) > 0) {
        x[idx, "match_ID"] <- paste0("m", 1:nrow(x))[idx]
    }
    return(x)
}

fuzzy_matching <- function(x, string_dist) {

    # Add an empty match_ID to identify matching rows
    # x$match_ID <- NA
    # Add an interal ID to uniquely identify rows for this function
    x$internal_ID <- 1:nrow(x)

    for (i in 1:nrow(x)) {
        # Skip row if it has already been assigned a match ID
        if (!is.na(x$match_ID[i])) next
        # Index entries still missing a match_ID
        na.idx <- is.na(x$match_ID)
        # Among those entries still that have not been used for a pair-wise
        # search and which have not already been matched
        # (match_ID still set to NA), make an index to to pull entries which
        # match based upon a certain string distance
        m.idx <-
            as.vector(utils::adist(x$matching_col[i],
                                   x$matching_col[na.idx]) <=
                          string_dist)
        # Get the IDs of those that matched
        internal_ids <- x[na.idx, "internal_ID"][m.idx]
        # Set the match ID of those entries to a value based upon the current
        # entry used for the search.
        x[x$internal_ID %in% internal_ids, "match_ID"] <-
            paste0("m", i)
    }

    # Remove the internal ID
    x$internal_ID <- NULL

    return(x)
}

strict_matching <- function(x) {
    # Add an empty match_ID to identify matching rows
    # x$match_ID <- NA
    # Add an interal ID to uniquely identify rows for this function
    x$internal_ID <- 1:nrow(x)

    for (i in 1:nrow(x)) {
        # Skip row if it has already been assigned a match ID
        if (!is.na(x$match_ID[i])) next
        # This will still look at things upstream and downstream that have
        # already been paired, but will not use these elements for pairwise
        # matching if they have already been assigned
        x[x$matching_col == x$matching_col[i] &
              is.na(x$match_ID), "match_ID"] <-
            paste0("m", i)
    }

    # Remove the internal ID
    x$internal_ID <- NULL

    return(x)
}

get_dupe_m_id <- function(x) {
    return(unique(x[duplicated(x[, "match_ID"]), "match_ID"]))
}
graggsd/sysreviewR documentation built on May 16, 2019, 2:52 a.m.