Nothing
#' Taxonomic spell check
#'
#' A function to check for and count potential spelling variations of the same
#' taxon. Spelling variations are checked within alphabetical groups (default),
#' or within higher taxonomic groups if provided.
#'
#' @param taxdf \code{data.frame}. A dataframe with named columns containing
#' taxon names (e.g. "species", "genus"). An optional column
#' containing the groups (e.g. "family", "order") which taxon names
#' belong to may also be provided (see `group` for details).
#' NA values or empty strings in the name and group columns (i.e. "" and " ")
#' are ignored.
#' @param name \code{character}. The column name of the taxon names you wish
#' to check (e.g. "genus").
#' @param group \code{character}. The column name of the higher taxonomic
#' assignments in `taxdf` you wish to group by. If `NULL` (default), name
#' comparison will be conducted within alphabetical groups.
#' @param dis \code{numeric}. The dissimilarity threshold: a value greater than
#' 0 (completely dissimilar), and less than 1 (completely similar).
#' Potential synonyms above this threshold are not returned.
#' This value is set to 0.05 by default, but the user might wish to experiment
#' with this value for their specific data.
#' @param start \code{numeric}. The number of shared characters at the
#' beginning of potential synonyms that should match. Potential synonyms below
#' this value will not be returned. By default this value is set to 1 (i.e.
#' the first letter of synonyms must match).
#' @param verbose \code{logical}. Should the results of the non-letter
#' character check be reported to the user? If `TRUE`, the result will only be
#' reported if such characters are detected in the taxon names.
#'
#' @return If verbose = `TRUE` (default), a \code{list} with three elements. The
#' first element in the list (synonyms) is a \code{data.frame} with each row
#' reporting a pair of potential synonyms. The first column "group" contains the
#' higher group in which they occur (alphabetical groupings if `group` is
#' not provided). The second column "greater" contains the most common synonym
#' in each pair. The third column "lesser" contains the least common synonym in
#' each pair. The third and fourth column (`count_greater`, `count_lesser`)
#' contain the respective counts of each synonym in a pair. If no matches were
#' found for the filtering arguments, this element is `NULL` instead. The second
#' element (`non_letter_name`) is a vector of taxon names which contain
#' non-letter characters, or `NULL` if none were detected. The third element
#' (non_letter_group) is a vector of taxon groups which contain non-letter
#' characters, or `NULL` if none were detected. If verbose = `FALSE`, a
#' \code{data.frame} as described above is returned, or `NULL` if no matches
#' were found.
#'
#' @details When higher taxonomy is provided, but some entries are missing,
#' comparisons will still be made within alphabetical groups of taxa which lack
#' higher taxonomic affiliations. The function also performs a check for
#' non-letter characters which are not expected to be present in
#' correctly-formatted taxon names. This detection may be made available to the
#' user via the `verbose` argument. Comparisons are performed using the
#' Jaro dissimilarity metric via
#' \code{\link[stringdist:stringdistmatrix]{stringdist::stringdistmatrix()}}.
#'
#' As all string distance metrics rely on approximate string matching,
#' different metrics can produce different results. This function uses Jaro
#' distance as it was designed with short, typed strings in mind, but good
#' practice should include comparisons using multiple metrics, and ultimately
#' specific taxonomic vetting where possible. A more complete implementation
#' and workflow for cleaning taxonomic occurrence data is available in the
#' `fossilbrush` R package on CRAN.
#'
#' @section Reference:
#' van der Loo, M. P. J. (2014). The stringdist package for approximate string
#' matching. The R Journal 6, 111-122.
#'
#' @section Developer(s):
#' Joseph T. Flannery-Sutherland & Lewis A. Jones
#' @section Reviewer(s):
#' Lewis A. Jones, Kilian Eichenseer & Christopher D. Dean
#' @importFrom stats na.omit
#' @importFrom stringdist stringdistmatrix
#' @examples
#' \dontrun{
#' # load occurrence data
#' data("tetrapods")
#' # Check taxon names alphabetically
#' ex1 <- tax_check(taxdf = tetrapods, name = "genus", dis = 0.1)
#' # Check taxon names by group
#' ex2 <- tax_check(taxdf = tetrapods, name = "genus",
#' group = "family", dis = 0.1)
#' }
#' @export
tax_check <- function(taxdf, name = "genus", group = NULL, dis = 0.05,
start = 1, verbose = TRUE) {
# ARGUMENT CHECKS --------------------------------------------------------- #
# taxdf: a data.frame with column names and at least one row
if (!exists("taxdf")) {
taxdf <- NULL
}
if (any(c(!is.data.frame(taxdf),
nrow(taxdf) == 0,
is.null(colnames(taxdf))))) {
stop("Please supply `taxdf` as a data.frame with named columns, containing
taxon names, and optionally their higher classification")
}
# names: a 1L character vector denoting a character column in taxdf
if (any(c(!is.atomic(name),
length(name) != 1,
!name %in% colnames(taxdf)))) {
stop("Please specify `name` as a single column name in `taxdf`")
}
# Replace missing values with NA
taxdf[grep("^$|^\\s+$", taxdf[, name, drop = TRUE]), name] <- NA
if (!is.character(taxdf[, name, drop = TRUE]) ||
all(is.na(taxdf[, name, drop = TRUE]))) {
stop("The `name` column in `taxdf` must contain data of class character and
at least one entry that is not NA or empty")
}
# groups: If not NULL, a 1L character vector denoting a character column
# in taxdf
if (!is.null(group)) {
if (any(c(!is.atomic(group), length(group) != 1,
!group %in% colnames(taxdf)))) {
stop("Please specify `group` as a single column name in `taxdf`")
}
if (!is.character(taxdf[, group, drop = TRUE])) {
stop("The `group` column in `taxdf` must contain data of class character")
}
group <- gsub("^$|^\\s+$", NA, taxdf[, group, drop = TRUE])
} else {
group <- substring(taxdf[, name, drop = TRUE], 1, 1)
}
# dis: a 1L numeric > 0 and < 1
if (any(c(!is.numeric(dis), length(dis) != 1, !is.atomic(dis)))) {
stop("`dis` must be a single numeric, greater than 0 and less than 1")
}
if (dis >= 1 || dis <= 0) {
stop("`dis` must be a single numeric, greater than 0 and less than 1")
}
# start: a 1L integer >= 0
if (!is.null(start)) {
if (any(c(!is.numeric(start),
length(start) != 1,
!is.atomic(start)))) {
stop("`start` must be a single positive integer, or zero")
}
if (any(c(start < 0, start %% 1 != 0,
is.nan(start),
is.infinite(start)))) {
stop("`start` must be a single positive integer, or zero")
}
}
# verbose: a 1L logical vector
if (any(c(!is.atomic(c(verbose)),
!is.logical(verbose),
length(verbose) != 1))) {
stop("`verbose` must be a single logical value")
}
# check for non-letter characters, returning NULL if none
gp <- unique(grep("[^[:alpha:] ]", group, value = TRUE))
if (length(gp) != 0) {
warning("Non-letter characters present in the group names")
} else {
gp <- NULL
}
nm <- unique(grep("[^[:alpha:] ]", taxdf[, name, drop = TRUE],
value = TRUE))
if (length(nm) != 0) {
warning("Non-letter characters present in the taxon names")
} else {
nm <- NULL
}
# FORMAT INPUT DATA ------------------------------------------------------- #
# names data.frame, drop missing names, fill missing groups alphabetically
taxdf <- taxdf2 <- data.frame(group = group,
name = taxdf[, name, drop = TRUE])
taxdf <- taxdf[!duplicated(taxdf), , drop = FALSE]
taxdf <- taxdf[!is.na(taxdf[, "name", drop = TRUE]), , drop = FALSE]
no_group <- which(is.na(taxdf[, "group", drop = TRUE]))
taxdf[no_group, "group"] <- substring(taxdf[no_group, "name"], 1, 1)
# RUN GROUPWISE COMPARISONS ----------------------------------------------- #
# apply the comparison procedure group wise
sp <- lapply(unique(taxdf[, "group", drop = TRUE]), function(y) {
# all taxon names which belong to group y
ob <- taxdf[taxdf[, "group", drop = TRUE] == y, "name"]
# if there is are not multiple names in the group, skip
if (length(ob) < 2) {
flag <- NULL
# otherwise perform group wise comparisons
} else {
# else get the Jaro distance matrix for the elements in the group
test <- stringdist::stringdistmatrix(a = ob, b = ob, method = "jw")
colnames(test) <- rownames(test) <- ob
# set self matches to max dissimilarity for removal in the next step
diag(test) <- 1
# subset to those which fall below the dissimilarity threshold
flag <- which(test < dis, arr.ind = TRUE)
# if there are no remaining flagged names, return NULL
if (length(flag) == 0) {
flag <- NULL
# otherwise additionally filter using shared starting/ending letters
} else {
# retrieve names
flag <- cbind(ob[flag[, 1]], ob[flag[, 2]], y)
# drop equivalent rows (xy, yx pairs)
eq <- duplicated(t(apply(flag, 1, function(z) {
paste0(z[order(z)])
})))
flag <- flag[!eq, , drop = FALSE]
# cull by first y letter non-matches
if (!is.null(start)) {
c1 <- substr(flag[, 1], start = 1, stop = start)
c2 <- substr(flag[, 2], start = 1, stop = start)
flag <- flag[which(c1 == c2), , drop = FALSE]
}
# if there are no remaining flagged names, return NULL
if (length(flag) == 0) {
flag <- NULL
}
}
}
flag
})
# FORMAT OUTPUT ----------------------------------------------------------- #
# format initial results data.frame from list
err <- sp[!unlist(lapply(sp, is.null))]
err <- as.data.frame(do.call(rbind, err))
err$f1 <- as.vector(table(taxdf2[, "name"])[match(err$V1,
names(table(
taxdf2[, "name"])))])
err$f2 <- as.vector(table(taxdf2[, "name"])[match(err$V2,
names(table(
taxdf2[, "name"])))])
# NULL if no matches present
if (nrow(err) == 0) {
err <- NULL
# else reorder rows so the more frequent synonym is in the first column
} else {
mins <- apply(err[, 4:5], 1, which.min) - 1
maxs <- abs(mins - 1)
fq1 <- unlist(err[, 4:5])[seq_along(maxs) + (maxs * length(maxs))]
fq2 <- unlist(err[, 4:5])[seq_along(mins) + (mins * length(mins))]
mins <- unlist(err[, 1:2])[seq_along(mins) + (mins * length(mins))]
maxs <- unlist(err[, 1:2])[seq_along(maxs) + (maxs * length(maxs))]
err <- data.frame(group = err$y, greater = as.vector(maxs),
lesser = as.vector(mins), count_greater = fq1,
count_lesser = fq2)
err <- err[order(err[, "group"], err[, "greater"], method = "radix"), ]
row.names(err) <- NULL
}
# return
if (verbose) {
return(list(synonyms = err, non_letter_name = nm, non_letter_group = gp))
} else {
return(err)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.