#' @title Prepare, Search and Merge Duplicate Specimens
#'
#' @description This function searches for duplicated specimens within and
#'   across collections. It can be used to homogenize the information of
#'   different groups of fields and to remove duplicates, leaving only one
#'   occurrence for each group of duplicates.
#'
#' @return The data frame returned by `rmDup()`: the input data frame with the
#'   record identifier (column 'numTombo') and the columns from the duplicate
#'   search ('dup.ID', 'dup.numb' and 'dup.prop') added, after the optional
#'   merge of the duplicate information and the removal of duplicates.
#'
#' @param occ.df a data frame, containing typical fields from occurrence records
#'   from herbarium specimens
#' @param cat.code character. The name of the column containing the code of the
#'   collection. Defaults to the __plantR__ output column "collectionCode.new".
#' @param cat.numb character. The name of the column containing the catalog
#'   number (a.k.a. accession number) of the record. Defaults to
#'   "catalogNumber".
#' @param merge logical. Should duplicates be merged? Defaults to TRUE. If
#'   FALSE, the merge step is skipped and the removal of duplicates is applied
#'   to the unmerged records.
#' @param remove logical. Should all duplicates be removed or only the
#'   duplicated entries from the same collection? Defaults to FALSE.
#'
#' @inheritParams prepDup
#' @inheritParams mergeDup
#' @inheritParams rmDup
#'
#' @details The function works similarly to a wrapper function, where the
#'   individual steps of the proposed __plantR__ workflow for preparing,
#'   searching, merging and removing duplicates are performed altogether (see
#'   the __plantR__ tutorial for details).
#'
#'   The input must be a plain data frame and it cannot be empty; the columns
#'   named in `cat.code` and `cat.numb` must be present in it.
#'
#' @seealso
#'  \link[plantR]{prepDup}, \link[plantR]{getDup}, \link[plantR]{mergeDup},
#'  \link[plantR]{rmDup}
#'
#' @author Renato A. F. de Lima
#'
#' @export validateDup
#'
validateDup <- function(occ.df,
                        cat.code = "collectionCode.new",
                        cat.numb = "catalogNumber",
                        merge = TRUE,
                        remove = FALSE,
                        noYear = "s.d.",
                        noName = "s.n.",
                        noNumb = "s.n.",
                        comb.fields = list(c("family","col.last.name","col.number","col.loc"),
                                           c("family","col.year","col.number","col.loc"),
                                           c("species","col.last.name","col.number","col.year"),
                                           c("col.year","col.last.name","col.number","col.loc")),
                        ignore.miss = TRUE,
                        dup.name = "dup.ID", prop.name = "dup.prop",
                        prop = 0.75, rec.ID = "numTombo",
                        info2merge = c("tax", "geo", "loc"),
                        tax.names = c(family = "family.new",
                                      species = "scientificName.new",
                                      det.name = "identifiedBy.new",
                                      det.year = "yearIdentified.new",
                                      tax.check = "tax.check",
                                      status = "scientificNameStatus"),
                        geo.names = c(lat = "decimalLatitude.new",
                                      lon = "decimalLongitude.new",
                                      org.coord = "origin.coord",
                                      prec.coord = "precision.coord",
                                      geo.check = "geo.check"),
                        loc.names = c(loc.str = "loc.correct",
                                      res.gazet = "resolution.gazetteer",
                                      res.orig = "resol.orig",
                                      loc.check = "loc.check"),
                        tax.level = "high", overwrite = FALSE,
                        print.rm = TRUE) {

  # check input:
  # The check is deliberately strict (first class must be "data.frame"):
  # subclasses such as tibble or data.table change what `occ.df[, col]`
  # returns, which the identifier step below relies on.
  if (!class(occ.df)[1] == "data.frame")
    stop("input object needs to be a data frame!")

  if (dim(occ.df)[1] == 0)
    stop("Input data frame is empty!")

  # Fail early, with an informative message, if the identifier columns are
  # missing (only checked when they are given by name)
  miss.cols <- setdiff(unlist(Filter(is.character, list(cat.code, cat.numb))),
                       names(occ.df))
  if (length(miss.cols) > 0)
    stop("Column(s) not found in the input data frame: ",
         paste(miss.cols, collapse = ", "))

  # getTombo: build the record identifier from collection code + catalog number
  # NOTE(review): the identifier is always stored in 'numTombo', while
  # mergeDup()/rmDup() receive `rec.ID`; a non-default `rec.ID` presumably
  # needs to exist in the input already -- confirm.
  occ.df$numTombo <- getTombo(occ.df[, cat.code],
                              occ.df[, cat.numb])

  # prepDup: prepare the fields used in the search for duplicates
  dups <- prepDup(occ.df, noYear = noYear, noName = noName, noNumb = noNumb,
                  comb.fields = comb.fields, ignore.miss = ignore.miss)

  # getDup: search for duplicates and bind the results to the input
  # NOTE(review): the column names are fixed here, so values of `dup.name` and
  # `prop.name` other than the defaults presumably will not be found by
  # mergeDup() -- confirm.
  dups <- getDup(dups)
  occ.df <- cbind.data.frame(occ.df,
                             dups[, c("dup.ID", "dup.numb", "dup.prop")],
                             stringsAsFactors = FALSE)

  # mergeDup (optional): when the merge is skipped, the unmerged records are
  # passed on to the removal step
  if (merge) {
    occ.df1 <- mergeDup(occ.df, dup.name = dup.name, prop.name = prop.name,
                        rec.ID = rec.ID, prop = prop, info2merge = info2merge,
                        tax.names = tax.names, geo.names = geo.names,
                        loc.names = loc.names, tax.level = tax.level,
                        overwrite = overwrite)
  } else {
    occ.df1 <- occ.df
  }

  # rmDup: remove the duplicated records
  occ.df2 <- rmDup(occ.df1, rm.all = remove, rec.ID = rec.ID,
                   print.rm = print.rm)

  return(occ.df2)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.