#' Identify duplicate entries
#'
#' For a tabular set of publication records, identifies potential sets of
#' duplicate entries and labels them with a unique identifier.
#'
#' @param x The dataset in which duplicate entries will be identified
#' @param match_cols Column(s) that will be used to search for duplicate
#' records
#' @param approx_match Whether to perform a duplicate search using
#' string distances or exact values
#' @param string_dist When using approximate matching, the string
#' distance cutoff at which records will be assumed duplicated
#' @param min_length The minimum length for the combined matching string
#' produced by \code{match_cols} at which a record will be considered for
#' matching
#' @param simplify_match Whether to perform duplicate searches after removing
#' all non alpha-numeric characters from the reference string generated from
#' \code{match_cols}
#' @return An updated version of \code{x}, with one column specifying the
#' final string used to search for duplicates (\code{matching_col})
#' and another column containing unique identifiers for each set of
#' duplicates (\code{match_ID}).
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_find <- function(x,
match_cols,
approx_match = FALSE,
string_dist = 5,
min_length = 10,
simplify_match = TRUE) {
UseMethod("dupes_find")
}
#' @export
dupes_find.default <- function(x, ...) {
stop("x must be of class data.frame")
}
#' @export
dupes_find.data.frame <- function(x,
match_cols,
approx_match = FALSE,
string_dist = 5,
min_length = 10,
simplify_match = TRUE) {
# Remove the unique ID
x[, "UNIQUE_ID"] <- NULL
# Matching column ---------------------------------------------------
x <- add_matching_col(x, match_cols, simplify_match = simplify_match)
# Flag rows based on string length ----------------------------------
if (!is.null(min_length)) {
x <- protect_min_length(x, min_length)
}
# Add matching IDs based on exact or fuzzy matching -----------------
if (approx_match) {
x <- fuzzy_matching(x, string_dist)
} else {
x <- strict_matching(x)
}
return(x)
}
#' Return duplicated entries
#'
#' Takes output generated by \code{dupes_find}, and returns a
#' \code{data.frame} containing only potentially duplicated records.
#'
#' @param x The ouput of \code{dupes_find}
#' @return A \code{data.frame} containing only entries of the original dataset
#' identified as duplicate entries. This is meant to facilitate manual
#' inspection of potential duplicates. Aberrantly identified duplicates
#' may subsequently be spared when running \code{dupes_rm}
#' by indicating their match IDs in \code{dupes_rm}'s
#' \code{ignore_IDs} argument.
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_return <- function(x) {
UseMethod("dupes_return")
}
#' @export
dupes_return.default <- function(x, ...) {
stop("x must be of class data.frame")
}
#' @export
dupes_return.data.frame <- function(x) {
matched_match_IDs <- get_dupe_m_id(x)
x <- x[x[,"match_ID"] %in% matched_match_IDs, ]
idx <- order(x[, "match_ID"])
return(x[idx,])
}
#' Remove duplicate entries
#'
#' Takes output of \code{dupes_find}, and returns a set of records with
#' duplicate entries removed.
#'
#' After running \code{dupes_find}, duplicate sets of records will contain
#' identical match IDs specified in the \code{match_ID} column. This
#' function will remove all but one entry from each set of duplicates. The
#' preserved entry is selected based on the user's preferred database
#' (\code{db_pref}) and the entry within the duplicate set with the
#' least number of blank cells.
#'
#' @param x The output of \code{dupes_find}
#' @param db_pref The preferred database for a reference.
#' @param ignore_IDs A set of match IDs that will be ignored when removing
#' duplicate records. These should be identified by manually inspecting
#' the output of \code{dupes_return}.
#' @return An updated version of \code{x}, with duplicate records removed.
#' @examples
#' \dontrun{
#' test <- rbind(form_mm_recs, form_mm_recs)
#' test <- dupes_find(test, c(1, 3))
#' dupes <- dupes_return(test)
#' out <- dupes_rm(test)
#' }
#' @export
dupes_rm <- function(x, db_pref = NULL, ignore_IDs = NULL) {
UseMethod("dupes_rm")
}
#' @export
dupes_rm.default <- function(x, ...) {
stop("x must be of class data.frame")
}
#' @export
dupes_rm.data.frame <- function(x, db_pref = NULL, ignore_IDs = NULL) {
# Remove matching column here
x$matching_col <- NULL
# Add internal ID to x
x$internal_ID <- 1:nrow(x)
# Create a search variable if database precedence is specified
if(!is.null(db_pref)) {
db_search <- paste(db_pref, collapse = "|")
}
# Get the match IDs of duplicates
duplicated_match_IDs <- get_dupe_m_id(x)
# Remove those IDs that have been manually specified
if (!is.null(ignore_IDs)) {
duplicated_match_IDs <- setdiff(duplicated_match_IDs, ignore_IDs)
}
for (id in duplicated_match_IDs) {
# Create duplicates df here
duplicate.df <- x[x[, "match_ID"] == id, ]
# Skip if there aren't any duplicates
if (nrow(duplicate.df) == 1) next
# Give precedence to certain databases if they are contained
# within the duplicates dataframe
if (!is.null(db_pref) &&
sum(grepl(db_search, duplicate.df[, "DATABASE"])) > 0) {
# Find which IDs to remove
removal_ids <- duplicate.df[!(grepl(db_search,
duplicate.df[,"DATABASE"])),
"internal_ID"]
# Remove IDs from duplicate df and original df
duplicate.df <-
duplicate.df[!(duplicate.df[, "internal_ID"] %in% removal_ids), ]
x <- x[!(x[, "internal_ID"] %in% removal_ids), ]
if (nrow(duplicate.df) == 1) next
}
# Choose entry with the most available data
idx <- which.min(rowSums(duplicate.df == "", na.rm = TRUE))
removal_ids <- duplicate.df[-idx, "internal_ID"]
x <- x[!(x[, "internal_ID"] %in% removal_ids), ]
}
# Remove internal ID
x$internal_ID <- NULL
# Remove match_ID
x$match_ID <- NULL
# Add a unique identifier that will allow the output to be used as
# a reference for future tables
x <- data.frame(UNIQUE_ID = 1:nrow(x),
x,
stringsAsFactors = FALSE)
return(x)
}
# Helper functions --------------------------------------------------------
# Remove all non alpha-numeric characters (including
# punctuation) and to convert all characters to lower-case.
simplify_string <- function(x) {
x <- gsub("[^a-zA-z0-9]", "", x)
x <- gsub("[[:punct:]]", "", x)
x <- tolower(x)
return(x)
}
# Takes a data.frame and a list of columns and creates a new column composed of
# the pasted contents of the specified columns. Additionally, allows
# string simplification. The purpose is to create a single column on which
# to eventually compute string distances for the purpose of fuzzy matching.
add_matching_col <- function(x, match_cols, simplify_match = TRUE) {
# Add an empty match_ID to identify matching rows
x$match_ID <- NA
if(length(match_cols) > 1) {
x$matching_col <- apply(x[, match_cols], 1, paste0, collapse = "")
} else {
x$matching_col <- x[, match_cols]
}
if (simplify_match) {
x$matching_col <- sapply(x$matching_col, simplify_string)
}
return(x)
}
# Returns logical index indicating if row should be flagged based on a minimum
# string length
protect_min_length <- function(x, min_length) {
idx <- which(nchar(x$matching_col) < min_length)
if (length(idx) > 0) {
x[idx, "match_ID"] <- paste0("m", 1:nrow(x))[idx]
}
return(x)
}
fuzzy_matching <- function(x, string_dist) {
# Add an empty match_ID to identify matching rows
# x$match_ID <- NA
# Add an interal ID to uniquely identify rows for this function
x$internal_ID <- 1:nrow(x)
for (i in 1:nrow(x)) {
# Skip row if it has already been assigned a match ID
if (!is.na(x$match_ID[i])) next
# Index entries still missing a match_ID
na.idx <- is.na(x$match_ID)
# Among those entries still that have not been used for a pair-wise
# search and which have not already been matched
# (match_ID still set to NA), make an index to to pull entries which
# match based upon a certain string distance
m.idx <-
as.vector(utils::adist(x$matching_col[i],
x$matching_col[na.idx]) <=
string_dist)
# Get the IDs of those that matched
internal_ids <- x[na.idx, "internal_ID"][m.idx]
# Set the match ID of those entries to a value based upon the current
# entry used for the search.
x[x$internal_ID %in% internal_ids, "match_ID"] <-
paste0("m", i)
}
# Remove the internal ID
x$internal_ID <- NULL
return(x)
}
strict_matching <- function(x) {
# Add an empty match_ID to identify matching rows
# x$match_ID <- NA
# Add an interal ID to uniquely identify rows for this function
x$internal_ID <- 1:nrow(x)
for (i in 1:nrow(x)) {
# Skip row if it has already been assigned a match ID
if (!is.na(x$match_ID[i])) next
# This will still look at things upstream and downstream that have
# already been paired, but will not use these elements for pairwise
# matching if they have already been assigned
x[x$matching_col == x$matching_col[i] &
is.na(x$match_ID), "match_ID"] <-
paste0("m", i)
}
# Remove the internal ID
x$internal_ID <- NULL
return(x)
}
get_dupe_m_id <- function(x) {
return(unique(x[duplicated(x[, "match_ID"]), "match_ID"]))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.