#' @title Search For Duplicate Specimens Among Collections
#'
#' @description This function searches for duplicate specimens among
#'   collections, based on duplicate search strings.
#'
#' @param df a data frame with the unique record identifier (in the first
#'   column) and the strings to be used for the duplicate search (second and
#'   other columns). Any object inheriting from 'data.frame' is accepted and
#'   is converted to a plain data frame before the search. At least one search
#'   string column is required. See Examples.
#' @param flag.ind logical. Should duplicates based on indirect matches be
#'   flagged with brackets? Default to TRUE.
#'
#' @author Renato A. F. de Lima
#'
#' @return The input data frame with new columns containing the number and
#'   proportion of duplicated search strings found (i.e. 'dup.numb' and
#'   'dup.prop') and the duplicated ID string ('dup.ID'). If no duplicate was
#'   found dup.ID = NA. Records for which all search strings are missing
#'   cannot be checked and receive the value 'cc' in both 'dup.numb' and
#'   'dup.prop'.
#'
#' @details The function searches for duplicated specimens within and across
#'   collections based on one or more duplicate search strings, typically the
#'   output of the __plantR__ function `prepDup()`. These strings combine
#'   different types of information (i.e. taxonomy, collection and locality).
#'   For instance, a string combining information on taxonomy, collector last
#'   name and number, and collection locality would look like:
#'   'Myrtaceae_Silva_110_Curitiba'.
#'
#'   Strings too flexible (e.g. 'Silva_110') return more duplicated records but
#'   many may be false duplicates. Strings too strict, on the other hand, may
#'   miss true duplicates if some of the collections have not entered all search
#'   fields or if they were entered using different notation standards. Finding
#'   all existing duplicates requires that all collections are available and
#'   that all search fields are complete, filled in without typos and using the
#'   same notation standards. This is rarely the case, so the list of duplicates
#'   returned should be considered incomplete in many cases. Moreover, to make
#'   sure missing information does not prevent the retrieval of duplicates,
#'   using more than two combinations of fields is advised. See Lima et al.
#'   (2020) for an example of a conservative usage of different combinations
#'   of strings to find duplicated specimens.
#'
#'   If a single search string is provided, the function returns direct
#'   matches only. If two or more search strings are provided, the search of
#'   duplicates uses tools from network analysis to find both direct and
#'   indirect matches of strings between records. If records are grouped under
#'   the same duplicated ID string ('dup.ID') but only based on indirect
#'   matches with other records, 'dup.ID' is returned between brackets (the
#'   default of argument `flag.ind`). These IDs may need to be inspected more
#'   closely to detect possible spurious matches.
#'
#'   Note that duplicate records can be physical (i.e. samples of the same
#'   biological specimen incorporated in two or more collections) or virtual
#'   (i.e. duplicated entries of the same record in different data
#'   repositories). Although the function aims to detect the physical
#'   duplicates, it also flags the virtual duplicates by adding 'virtual' inside
#'   brackets to the end of the duplicated ID string. Currently, this flag is
#'   only added if the duplicate is purely virtual, i.e., if the virtual
#'   duplicate is not also part of a physical duplicate.
#'
#'   Besides the duplicated ID, the function returns the number and proportion
#'   of duplicated search strings found for each record within its group of
#'   duplicates (i.e., 'dup.numb' and 'dup.prop'). These values can be used to
#'   assess the confidence level that records are indeed true duplicates within
#'   its group. The higher the 'dup.prop', the greater the chances that the
#'   record is indeed a duplicate. To calculate the proportion of duplicates
#'   found within the number of available search strings, mismatches due to
#'   different or to missing strings are treated the same.
#'
#'
#' @examples
#'
#' df <- data.frame(id=c("a_1","b_3","c_7","d_5","e_3",
#'                       "f_4","g_2","h_8","i_9","j_6","k_7","l_1"),
#'                  str1=c("a","b","c","l","l","p","p","p",NA,NA,"x","y"),
#'                  str2=c("d","d","e","k","k","o","o","o",NA,NA,"v","w"),
#'                  str3=c("f","g","f","n","n","s","r","s","t","t","z","u"),
#'                  str4=c("h","i","j","m","m","q","q","q",NA,NA,"ab","ac"))
#'
#' getDup(df)
#'
#' @references
#' Lima, R.A.F. et al. 2020. Defining endemism levels for biodiversity
#' conservation: Tree species in the Atlantic Forest hotspot. Biological
#' Conservation, 252: 108825.
#'
#' @seealso
#'  \link[plantR]{prepDup}.
#'
#' @importFrom stringr str_trim str_count
#' @import data.table
#' @importFrom igraph graph_from_data_frame components
#'
#' @export getDup
#'
getDup <- function(df = NULL, flag.ind = TRUE) {

  # check input
  # inherits() also accepts data-frame subclasses (tibble, data.table, ...);
  # comparing class() with '==' yields a length > 1 condition for those
  if (!inherits(df, "data.frame"))
    stop("Input object needs to be a data frame!")

  # dropping any subclass so that the base data.frame indexing used below
  # behaves the same for every accepted input
  df <- as.data.frame(df)

  if (nrow(df) == 0)
    stop("Input data frame is empty!")

  #Escaping R CMD check notes from using data.table syntax
  tmp.ordem <- numTombo <- value <- i.memb <- new_id <- dup.ind.test <- . <- NULL

  #Getting the search string names
  rec.ID <- names(df)[1]
  str_names <- names(df)[-1]

  # without search strings there is nothing to compare
  if (length(str_names) == 0L)
    stop("Input data frame needs at least one search string column!")

  #Making sure the data frame does not contain factors
  # is.factor() is used per column because class() can return more than one
  # value (e.g. ordered factors)
  fct.cols <- vapply(df, is.factor, logical(1))
  if (any(fct.cols))
    df[fct.cols] <- lapply(df[fct.cols], as.character)

  #Creating the new columns used in the search
  df$dup.check <- NA
  df$dup.numb <- NA
  df$dup.prop <- NA
  df$dup.ID <- NA

  ##Checking for the duplicates (top down and bottom up)
  # one logical column 'dup.check<i>' per search string; NA strings are never
  # considered duplicates of each other (incomparables = NA)
  for (i in seq_along(str_names)) {
    col_name <- paste0("dup.check", i)
    forward <- duplicated(df[, str_names[i]], incomparables = NA)
    reverse <- duplicated(df[, str_names[i]], incomparables = NA,
                          fromLast = TRUE)
    df[, col_name] <- forward | reverse
  }

  #### Number of duplicated strings found ####
  # records with all search strings missing cannot be checked ('cc')
  df$dup.numb[apply(df[, names(df) %in% str_names, drop = FALSE], 1,
                    function(x) all(is.na(x)))] <- "cc"
  df$dup.numb[apply(df[, grepl("dup.check[0-9]", names(df)), drop = FALSE], 1,
                    function(x) all(x))] <- length(str_names)
  df$dup.numb[is.na(df$dup.numb)] <-
    apply(df[is.na(df$dup.numb), grepl("dup.check[0-9]", names(df)),
             drop = FALSE], 1, sum, na.rm = TRUE)

  #### Proportion of duplicated strings found among the possible non-NA strings (max. 1) ####
  tmp <- df[, grepl("dup.check[0-9]", names(df)), drop = FALSE]
  tmp1 <- apply(tmp, 1, function(x)
    sum(!is.na(x)))
  tmp1[!df$dup.numb %in% "cc"] <-
    as.double(df$dup.numb[!df$dup.numb %in% "cc"]) /
    tmp1[!df$dup.numb %in% "cc"]
  tmp1[df$dup.numb %in% "cc"] <- "cc"
  df$dup.prop <- tmp1

  #### Assigning a unique column with occurrences with any indication of duplicates ####
  df$dup.check[!df$dup.numb %in% "cc"] <-
    df$dup.numb[!df$dup.numb %in% "cc"] > 0
  # this assignment turns the column into character, hence the comparisons
  # against the string "TRUE" below
  df$dup.check[df$dup.numb %in% "cc"] <- "cannot_check"

  #### Defining the unique duplicated code ####
  ## Filtering data with any evidence of duplicates
  df1 <- df[df$dup.check %in% "TRUE", ] # only data with any indication of duplicates

  if (dim(df1)[1] > 0) {
    ## Filtering the dataset and converting to the data.table format
    dt <- data.table::data.table(df1) # only data with any indication of duplicates
    # original row order, used to re-align the results with 'df' at the end
    dt[, tmp.ordem := .I,]

    #Dealing with duplicated values of 'numTombo'
    # repeated identifiers get a temporary '_dup<k>' suffix so that each
    # record becomes a distinct vertex in the graph
    data.table::setnames(dt, rec.ID, "numTombo")
    dt[duplicated(numTombo),
       numTombo := paste(numTombo, 1:.N, sep="_dup"), by = numTombo]

    ## Finding indirect duplicated search strings
    # melt data to long format (need to remove duplicated 'numTombo')
    cols <- c(rec.ID, str_names)
    d <- data.table::melt.data.table(dt[, 1:(length(cols)),],
                                     id.vars = "numTombo", na.rm = TRUE)
    # convert to graph
    g <- igraph::graph_from_data_frame(d[ , .(numTombo, value)])
    # get components
    memb <- igraph::components(g)$membership
    memb <- data.table::data.table(numTombo = names(memb),
                                   memb = memb)
    # add component id to original data
    data.table::setDT(dt)[memb, memb := i.memb, on = c(numTombo = "numTombo")]

    # putting duplicated numTombo back on their original format
    `%like.ic%` <- function (x, pattern) {
      grepl(pattern, x, perl = TRUE, ignore.case = TRUE)
    }
    # the whole numeric suffix is removed (and only at the end of the string),
    # so identifiers repeated ten or more times are also restored correctly
    dt[numTombo %like.ic% "_dup[0-9]+$",
       numTombo := gsub("_dup[0-9]+$", "", numTombo, perl = TRUE)]

    # concatenate 'id' by 'memb' column
    data.table::setkeyv(dt, c("memb"))
    dt[!is.na(memb),
       new_id := as.character(paste0(sort(unique(numTombo)), collapse = "|")),
       by = memb]

    ## Flagging groups of duplicates based only on indirect duplicates
    if (flag.ind) {
      dup.cols <- data.table::copy(names(dt))
      dup.cols <- dup.cols[grepl("dup.check[1-9]", dup.cols, perl = TRUE)]
      # setting the duplicate check to NA where the search string is missing
      for(i in seq_along(str_names)) {
        data.table::setkeyv(dt, c(str_names[i]))
        dt[NA_character_, (dup.cols[i]) := NA]
      }
      data.table::setkey(dt, new_id)
      dt[, dup.ind.test := lapply(.SD, function(x) any(sum(x) == 0, na.rm = TRUE)),
         by = new_id, .SDcols = c(dup.cols)]
      dt[!is.na(new_id),
         new_id := ifelse(dup.ind.test, paste0("[",new_id,"]",collapse=""), new_id),
         by = new_id]
    }

    # for groups of length one, set 'memb' to NA
    dt[dt[, .I[.N == 1], by = memb]$V1, new_id := NA]

    # flagging virtual duplicates
    # groups whose ID string has no '|' contain a single unique identifier
    data.table::setkeyv(dt, c("new_id"))
    dt[!is.na(new_id) & !new_id %like.ic% "\\|",
       new_id := paste0(new_id,"[virtual]",collapse=""), by = new_id]

    #Saving the duplicated IDs in the main data frame
    data.table::setkeyv(dt, c("tmp.ordem")) #re-ordering the data.table
    df$dup.ID[df$dup.check %in% "TRUE"] <- as.character(dt$new_id)
  }

  #Filtering out unnecessary columns and returning
  drop.cols <- names(df)[grepl("dup.check", names(df))]
  df1 <- df[, !names(df) %in% drop.cols, drop = FALSE]
  return(df1)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.