#' @title Create Species (Check-)List
#'
#' @description This function creates a list of the species contained
#' in the occurrence data, including a list of voucher specimens. If the
#' occurrence data is associated to an specific geographical area, this list
#' can be used as an inventory or check-list for the species occurring in that
#' area.
#'
#' @param x a data frame with the occurrence data, generally as the output of the
#' __plantR__ validation functions.
#' @param fam.order logical. Should taxa be organized in alphabetical order
#' of families? Defaults to TRUE.
#' @param n.vouch numerical. Maximum number of vouchers to be listed per taxa.
#' Defaults to 30.
#' @param type character. The type of voucher list desired. Options are: 'short',
#' 'selected' and 'list' (see details below).
#' @param rm.dup logical. Should duplicated specimens be removed prior to the
#' calculation of species summaries? Defaults to TRUE.
#' @param rank.type numerical. Value of the ranking for type specimens in order
#' to organize order and filter the voucher list. Defaults to 5.
#' @param date.format The desired format for the dates. Defaults to "%d %b %Y"
#'
#' @details The list of species can be organized in alphabetic order by taxa or
#' in alphabetic order by family and then by taxa within families (the
#' default).
#'
#' By default, the output list provides the number of records found and the
#' overall taxonomic and geographic confidence level of the records (columns
#' 'tax.CL' and 'geo.CL'), if available. The taxonomic confidence level is the
#' percentage of records with the identification flagged as 'high', while the
#' geographic confidence level is the percentage of records with coordinates
#' flagged as being validated at municipality or locality levels.
#'
#' The function also provides a list of vouchers, giving priority to type
#' specimens and records with higher level of confidence in their
#' identification. By default, the function provides up to 30 vouchers
#' per taxa, but this number can be controlled using the argument `n.vouch`.
#'
#' The voucher list can be provided in the following output formats (the
#' option 'list' is not implemented yet):
#'
#' + 'short': Collector name, Collector number (collections of deposit)
#' + 'selected': COUNTRY, stateProvince: municipality, Date, Collector name,
#' Collector number (collections of deposit)
#' + 'list': Collector name, Collector number(s) (species code)
#'
#' Note 1: although we provide a `date.format` argument, checks of other date
#' formats other than the default are pending and so they may not work
#' properly.
#'
#' Note 2: The columns names of the input data are expected to be in the
#' DarwinCore format or in the standard output names of the __plantR__ workflow.
#' Currently, there is no argument to make the equivalency to different column
#' names, so users need to convert their data into one of these two options. See
#' function `formatDwc()` for more details.
#'
#' @examples
#' (df <- data.frame(collectionCode = c("CRI","CRI","CRI","CRI"),
#' catalogNumber = c("3565","713","3073","15331"),
#' recordedBy = c("Rebelo, M.C.","Citadini-Zanette, V.",
#' "Santos, R.","Zapelini, I."),
#' recordNumber = c("s.n.","1063","11","s.n."),
#' year = c("1994","1990","1994","2020"),
#' family = c("Salicaceae","Salicaceae","Cannabaceae","Cannabaceae"),
#' scientificName = c("Casearia sylvestris","Casearia sylvestris",
#' "Trema micrantha","Trema micrantha"),
#' country = c("brazil","brazil","brazil","brazil"),
#' stateProvince = c("santa catarina","santa catarina",
#' "santa catarina","santa catarina"),
#' municipality = c("jaguaruna","orleans","icara",NA)))
#'
#' checkList(df, rm.dup = FALSE)
#' checkList(df, rm.dup = FALSE, type = "selected")
#'
#'
#' @import data.table
#' @importFrom stringr str_replace str_trim
#' @importFrom utils head
#'
#' @export checkList
#'
checkList <- function(x, fam.order = TRUE, n.vouch = 30, type = "short",
rm.dup = TRUE, rank.type = 5, date.format = "%d %b %Y") {
# check input
if (!class(x) == "data.frame")
stop("Input object needs to be a data frame!")
if (dim(x)[1] == 0)
stop("Input data frame is empty!")
#Escaping R CMD check notes from using data.table syntax
dup.ID <- N <- N.all <- temp.geo.check <- priority <- NULL
typeStatus <- temp.rec.numb <- ordem <- temp.pais <- NULL
coletores <- temp.accession <- numTombo <- vchrs <- NULL
datas <- datas.tipo <- lista.vouchs <- . <- NULL
## PREPARING THE TABLE ##
# Select which co-variables will be used in the summary (priority to the edited columns)
covs <- list(collections = c("collectionCode.new", "collectionCode"),
catalog = c("catalogNumber.new", "catalogNumber"),
collectors = c("recordedBy.new", "recordedBy"),
recordNumber = c("recordNumber.new", "recordNumber"),
colYears = c("year.new", "year"),
families = c("family.new", "family"),
species = c("scientificName.new", "scientificName"),
countries = c("country.new", "country"),
state = c("stateProvince.new", "stateProvince"),
county = c("municipality.new", "municipality"),
locality = c("loc.correct1","loc.correct","loc"),
coordinates = c("geo.check1", "geo.check"),
taxonomy = c("tax.check1", "tax.check"))
#Get only the columns of interest
covs.present <- lapply(covs, function(z) z[which(z %in% names(x))][1])
covs.present[sapply(covs.present, identical, character(0))] <- NA
if (all(sapply(covs.present, nchar)==0))
stop("The input data frame does not contain at least one of the required columns")
# Getting the input columns to be used for filtering
other.cols <- c("dup.ID", "dup.prop","typeStatus", "scientific.name",
"numTombo", "temp.accession", "month", "day")
covs.final <- c(unlist(covs.present), other.cols)
# Should the duplicates be removed?
if (rm.dup) {
if ("numTombo" %in% names(x)) {
dt <- data.table::data.table(suppressMessages(
rmDup(x[, names(x) %in% covs.final],
rm.all = FALSE, print.rm = FALSE)))
} else {
dt <- data.table::data.table(x[, names(x) %in% covs.final])
warning("Duplicated specimens cannot be removed; using the all occurrences instead")
}
} else {
dt <- data.table::data.table(x[, names(x) %in% covs.final])
}
#Making sure the data.table does not contains factors
changeCols <- colnames(dt)[which(as.vector(dt[,lapply(.SD, class)]) == "factor")]
if (length(changeCols) > 0)
dt[,(changeCols):= lapply(.SD, as.character), .SDcols = changeCols]
# getting the list of taxa and the selected columns
data.table::setindexv(dt, covs.present[["species"]])
checklist <- data.frame(unique(dt, by= covs.present[["species"]]))
cols <- c(unlist(covs.present[names(covs.present) %in% c("families", "species")]),
"scientific.name")
checklist <- checklist[, names(checklist) %in% cols]
checklist$records <- NA
checklist$tax.CL <- NA
checklist$geo.CL <- NA
checklist$vouchers <- NA
## NUMBER OF RECORDS PER SPECIES ##
records <- dt[, .N , by = c(covs.present[["species"]])]
if ("dup.ID" %in% names(dt)) {
unicatas <- dt[is.na(dup.ID), .N , by = c(covs.present[["species"]])]
unicatas <- merge(records, unicatas, by = c(covs.present[["species"]]),
all.x = TRUE, suffixes = c("", ".unis"))
unicatas[, N := NULL]
duplicatas <- dt[!is.na(dup.ID), .N, by = c(covs.present[["species"]], "dup.ID")]
duplicatas <- duplicatas[, .N , by = c(covs.present[["species"]])]
duplicatas <- data.table::merge.data.table(records, duplicatas, by = c(covs.present[["species"]]),
all.x = TRUE, suffixes = c("", ".dups"))
duplicatas[, N:= NULL]
records <- records[unicatas, on = c(covs.present[["species"]])]
records <- records[duplicatas, on = c(covs.present[["species"]])]
checklist$records <-
records$N[match(checklist[, covs.present[["species"]]],
data.frame(records)[,1])]
} else {
checklist$records <-
records$N[match(checklist[, covs.present[["species"]]],
data.frame(records)[,1])]
}
## TAXONOMIC CONFIDENCE LEVEL ##
if (!is.na(covs.present[["taxonomy"]])) {
# Proportion of validate identifications per species
colunas <- c(covs.present[["taxonomy"]], covs.present[["species"]])
data.table::setkeyv(dt, cols = colunas)
taxs <- dt[, .N,
by = c(covs.present[["species"]], covs.present[["taxonomy"]])]
#dt[data.table::CJ(tax.check1, scientificName.new, unique = TRUE), .N, by = .EACHI]
vals <- c("unknown", "low", "medium", "high")
all.taxs <- data.table::CJ(unique(data.frame(taxs)[,1]),
vals, 0, unique = TRUE)
names(all.taxs) <- names(taxs)
all.taxs <- merge(all.taxs, taxs, by = c(covs.present[["species"]], covs.present[["taxonomy"]]),
all.x = TRUE, suffixes = c(".all", ""))
all.taxs <- all.taxs[, N.all := NULL]
all.taxs <- all.taxs[data.frame(all.taxs)[,2] %in% "high", ]
all.taxs[is.na(N), N := 0]
# Saving the result
checklist$tax.CL <- round(100 * all.taxs$N[match(checklist[,1], data.frame(all.taxs)[,1])]/
checklist$records, 2)
}
## GEOGRAPHIC CONFIDENCE LEVEL ##
if (!is.na(covs.present[["coordinates"]])) {
# Proportion of valid coordinates (at county or locality levels)
dt[, temp.geo.check := .SD, .SDcols = c(covs.present[["coordinates"]])]
dt[, temp.geo.check := gsub("_gazet", "", temp.geo.check)]
dt[temp.geo.check %in% c("ok_county", "ok_locality"), temp.geo.check := "1"]
dt[!temp.geo.check %in% "1", temp.geo.check := "0"]
colunas <- c("temp.geo.check", covs.present[["species"]])
data.table::setkeyv(dt, cols = colunas)
coords <- dt[, .N, by = colunas]
coords[is.na(N), N := 0]
coords <- coords[temp.geo.check %in% "1", ]
coords <- coords[, temp.geo.check := NULL]
dt[ , temp.geo.check := NULL]
checklist$geo.CL <- round(100 * coords$N[match(checklist[,1], data.frame(coords)[,1])]/
checklist$records, 2)
}
## SETTING PRIORITIES WITHIN RECORDS OF EACH SPECIES ##
# Ranking records according to the completeness of the label information
dt[ , priority := 0]
if ("typeStatus" %in% names(dt)) {
dt[ , typeStatus := tolower(typeStatus), ]
dt[(!is.na(typeStatus) & !typeStatus %in% "") &
!grepl("not a type|notatype|probable type|tipo provavel|tipo provavel", typeStatus),
priority := rank.type, ]
}
if (!is.na(covs.present[["collectors"]])) {
data.table::setkeyv(dt, c(covs.present[["collectors"]]))
if (dim(dt["s.n.", nomatch = NULL])[1] > 0) {
suppressWarnings(
dt[c("s.n."), priority := priority - 3, nomatch = NULL])
}
temp <- data.frame(dt[, lapply(.SD, nchar),
by = c(covs.present[["collectors"]]), .SDcols = c(covs.present[["collectors"]])])
dt[ temp[,2] < 4, priority := priority - 3]
}
if (!is.na(covs.present[["recordNumber"]])) {
data.table::setkeyv(dt, c(covs.present[["recordNumber"]]))
data.table::setnames(dt, covs.present[["recordNumber"]], "temp.rec.numb")
if (dim(dt[grepl("\\d", temp.rec.numb, perl = TRUE), ])[1] > 0) {
dt[!grepl("\\d", temp.rec.numb, perl = TRUE), priority := priority - 2]
}
data.table::setnames(dt, "temp.rec.numb", covs.present[["recordNumber"]])
}
# ranking vouchers in respect to their locality precision
if (!is.na(covs.present[["county"]])) {
data.table::setkeyv(dt, c(covs.present[["county"]]))
dt[.(NA_character_), priority := priority - 1]
}
if (!is.na(covs.present[["state"]])) {
data.table::setkeyv(dt, c(covs.present[["state"]]))
dt[.(NA_character_), priority := priority - 1]
}
if (!is.na(covs.present[["countries"]])) {
data.table::setkeyv(dt, c(covs.present[["countries"]]))
dt[.(NA_character_), priority := priority - 2]
# ranking vouchers in respect to duplicated countries
dt[, ordem := 1:dim(dt)[1],]
data.table::setnames(dt, covs.present[["countries"]], "temp.pais")
temp <- dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_)),
by = c(covs.present[["species"]])]
temp$V3 <- !unlist(temp$V3)
# temp$V4 <- !unlist(dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_, fromLast = TRUE)),
# by = c(covs.present[["species"]])]$V3)
# temp$check <- temp$V3 | temp$V4
data.table::setorder(temp, "ordem")
dt[, ordem := NULL,]
data.table::setnames(dt, "temp.pais", covs.present[["countries"]])
dt[temp$V3, priority := priority - 2]
}
# ranking vouchers in respect to their taxonomic confidence
if (!is.na(covs.present[["taxonomy"]])) {
data.table::setkeyv(dt, c(covs.present[["taxonomy"]]))
if (dim(dt[.(c("unknown", "low")), ])[1] > 0) {
dt[ .(c("unknown", "low")), priority := priority - 3]
}
if (dim(dt[.(c("medium")), ])[1] > 0) {
dt[ .(c("medium")), priority := priority - 2]
}
}
# ranking vouchers in respect to the existence of an accession number
if (!is.na(covs.present[["catalog"]])) {
data.table::setkeyv(dt, c(covs.present[["catalog"]]))
dt[.(NA_character_), priority := priority - 3]
}
# ranking vouchers in respect to the existence of a collection year
if (!is.na(covs.present[["colYears"]])) {
data.table::setkeyv(dt, c(covs.present[["colYears"]]))
dt[.(c(NA_character_, "n.d.")), priority := priority - 2]
}
# Still too many vouchers per species?
# Add extra steps to downgrad vouchers from the same author,
#or with bad coordinates for species with too many vouchers of high priority
#from the same collector or same county?
# Organizing and filtering records based on the ranks by species
data.table::setorderv(dt, c(covs.present[["species"]], "priority"), c(1,-1))
dt1 <- dt[dt[, .I[1:n.vouch],
by = c(covs.present[["species"]])]$V1]
dt1 <- dt1[rowSums(is.na(dt1)) < dim(dt1)[2],]
## GENERATING THE LIST OF VOUCHERS ##
# Collector name and number
dt1[ , coletores := do.call(paste, c(.SD, sep=", ")),
.SDcols = c(covs.present[["collectors"]], covs.present[["recordNumber"]])]
if (type == "short") { # Inspired in the Flora do Brail format
## Getting more up-to-date accession numbers, if available
if ("numTombo" %in% names(x))
dt1[, temp.accession := toupper(gsub("_", " ", numTombo, perl = TRUE))]
## Accession numbers
if (!"temp.accession" %in% names(dt1))
dt1[ , temp.accession := do.call(paste, c(.SD, sep=", ")),
.SDcols = c(covs.present[["collections"]], covs.present[["catalog"]])]
#correcting accessions numbers for duplicates across herbaria
if (dim(dt1[!is.na(dup.ID)])[1] > 0) {
getDupIDs <- function(id) {
id <- toupper(gsub("_", " ", id, perl = TRUE))
id <- sapply(lapply(strsplit(id, "\\|"), unique), paste, collapse = "|")
id <- gsub("\\|", ", ", id, perl = TRUE)
return(id)
}
dt1[!is.na(dup.ID) ,
temp.accession := as.character(lapply(dup.ID, getDupIDs))]
}
dt1[ , temp.accession := paste("(", temp.accession, ")", sep="")]
dt1[ , temp.accession := gsub("\\[VIRTUAL\\]", "", temp.accession, perl = TRUE)]
#getting the voucher vector itslef
dt1[ , vchrs := do.call(paste, c(.SD, sep=" ")),
.SDcols = c("coletores", "temp.accession")]
if ("typeStatus" %in% names(dt)) {
dt1[!is.na(typeStatus) & !grepl("not a type|notatype|probable type|tipo provavel|tipo provavel", typeStatus),
vchrs := paste(vchrs, " [",tolower(typeStatus),"]", sep="")]
dt1[, vchrs := gsub(" \\[\\]$", "", vchrs, perl = TRUE)]
}
#combining all voucher into a single string
data.table::setorderv(dt1, c(covs.present[["species"]], "vchrs"), c(1,1))
dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
by = c(covs.present[["species"]]),
.SDcols = "vchrs"]
checklist$vouchers <-
as.character(dt2$V1[match(checklist[,covs.present[["species"]]],
data.frame(dt2)[,1])])
}
if (type == "selected") { # From 'species examined' in Flora Neotropica
# VOUCHERS
#getting more up-to-date collectionCode, if available
if ("numTombo" %in% names(x)) {
dt1[, temp.accession := toupper(gsub("_", " ", numTombo, perl = TRUE))]
dt1[, temp.accession := lapply(strsplit(temp.accession, " "), utils::head, n = 1)]
# dt1[, temp.accession := lapply(strsplit(temp.accession, " "), my.head)]
}
#collectionCode
if (!"temp.accession" %in% names(dt1))
dt1[ , temp.accession := .SD,
.SDcols = c(covs.present[["collections"]])]
#correcting accessions numbers for duplicates across herbaria
if ("dup.ID" %in% names(dt1)) {
getDupIDs1 <- function(id) {
id <- toupper(gsub("_", " ", id, perl = TRUE))
id <- sapply(lapply(strsplit(id, "\\|"), strsplit, " ")[[1]], utils::head, n = 1)
# id <- sapply(lapply(strsplit(id, "\\|"), strsplit, " ")[[1]], my.head)
id <- paste0(unique(id), collapse = ", ")
return(id)
}
dt1[!is.na(dup.ID) , temp.accession := lapply(dup.ID, getDupIDs1)]
}
dt1[ , temp.accession := paste("(", temp.accession, ")", sep="")]
dt1[ , temp.accession := gsub("\\[VIRTUAL\\]", "", temp.accession, perl = TRUE)]
#getting the voucher vector itself
dt1[ , vchrs := do.call(paste, c(.SD, sep=" ")),
.SDcols = c("coletores", "temp.accession")]
# if ("typeStatus" %in% names(dt)) {
# dt1[!is.na(typeStatus) & !grepl("not a type|notatype|probable type|tipo provavel|tipo provavel", typeStatus),
# vchrs := paste(vchrs, " [",tolower(typeStatus),"]", sep="")]
# dt1[, vchrs := gsub(" \\[\\]$", "", vchrs, perl = TRUE)]
# }
# LOCALITY
locais <- NULL
if (is.na(covs.present[["locality"]]) &
any(!sapply(covs.present[c("countries","state","county")], is.na))) {
loc.df <- dt[, .SD, .SDcols = c(unlist(covs.present[c("countries","state","county")]))]
loc.df <- fixLoc(data.frame(loc.df), scrap = FALSE,
loc.levels = c("country", "stateProvince", "municipality"))
loc.df <- strLoc(loc.df)
loc.df$loc.string <- prepLoc(loc.df$loc.string)
loc.df <- getLoc(loc.df)
locais <- getAdmin(loc.df)
locais$string <- paste(toupper(locais$NAME_0),", ",
locais$NAME_1, ": ",
locais$NAME_2, sep="")
locais$string[locais$string == "NA, NA: NA"] <- "[Locality unknown]"
locais$string <- gsub(", NA: NA$", "", locais$string, perl = TRUE)
locais$string <- gsub(": NA$", "", locais$string, perl = TRUE)
}
if (!is.na(covs.present[["locality"]])) {
df <- data.frame(dt1[, .SD, .SDcols = c(covs.present[["locality"]])])
locais <- getAdmin(df)
miss.ids <- is.na(locais$NAME_0) & !is.na(locais$loc.correct) & !locais$loc.correct %in% ""
if (any(miss.ids)) locais[miss.ids,] <- getAdmin(locais[miss.ids,])
locais$string <- paste(toupper(locais$NAME_0),", ",
locais$NAME_1, ": ",
locais$NAME_2, sep="")
locais$string[locais$string == "NA, NA: NA"] <- "[Locality unknown]"
locais$string <- gsub(", NA: NA$", "", locais$string, perl = TRUE)
locais$string <- gsub(": NA$", "", locais$string, perl = TRUE)
}
if (is.null(locais)) {
locais <- data.frame(string = rep("[Locality unknown]", dim(dt1)[1]))
}
# DATES
if (all(c("month", "day") %in% names(dt1))) {
dt1[, datas := do.call(paste, c(.SD, sep = "-")),
.SDcols = c("day", "month", covs.present[["colYears"]])]
dt1[, datas.tipo := "full",]
`%like.ic%` <- function (x, pattern) {
grepl(pattern, x, perl = TRUE, ignore.case = TRUE)
}
dt1[!datas %like.ic% "\\d", c("datas", "datas.tipo") := list("n.d.", "no_date"),]
dt1[datas %like.ic% "^NA-NA-", datas.tipo := "year_only",]
dt1[datas %like.ic% "^NA-[0-9]", datas.tipo := "month_year",]
data.table::setDT(dt1)[!datas.tipo %in% c("no_date", "full"),
datas := stringr::str_replace(datas, "^NA-NA-", "01-01-")]
data.table::setDT(dt1)[!datas.tipo %in% c("no_date", "full"),
datas := stringr::str_replace(datas, "^NA-", "01-")]
dt1[!datas.tipo %in% c("no_date"), datas := format(as.Date(datas, "%d-%m-%Y"), date.format)]
### MAKE THE 4 AND 3 FROM THE SUBSTR ADAPTED TO DIFFERENT 'date.format'
data.table::setDT(dt1)[datas.tipo %in% "year_only",
datas := substr(datas, nchar(datas) - 4, nchar(datas))]
data.table::setDT(dt1)[datas.tipo %in% "month_year",
datas := substr(datas, 3, nchar(datas))]
# data.table::setDT(dt1)[!datas.tipo %in% c("no_date", "full"),
# datas := stringr::str_trim(datas)]
data.table::setDT(dt1)[!datas.tipo %in% c("no_date", "full"),
datas := gsub("^ | $", "", datas, perl = TRUE)]
} else {
dt1[, datas := .SD,
.SDcols = c(covs.present[["colYears"]])]
}
#COMBINING LOCALITIES, DATES AND VOUCHERS
dt1[ , locais := locais$string , ]
dt1[ , lista.vouchs := do.call(paste, c(.SD, sep=", ")), ,
.SDcols = c("locais", "datas", "vchrs")]
#combining all voucher into a single string
data.table::setorderv(dt1, c(covs.present[["species"]], "lista.vouchs"), c(1,1))
dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
by = c(covs.present[["species"]]),
.SDcols = "lista.vouchs"]
#Saving
checklist$vouchers <-
as.character(dt2$V1[match(checklist[, covs.present[["species"]]],
data.frame(dt2)[,1])])
}
# Organizing and ordering the output
cols <- as.character(
c(unlist(covs.present[names(covs.present) %in% c("families", "species")]),
"scientific.name", "records", "tax.CL", "geo.CL","vouchers"))
if (all(c(covs.present[["species"]], "scientific.name") %in% names(checklist)))
cols <- cols[!cols %in% covs.present[["species"]]]
cols <- cols[cols %in% names(checklist)]
checklist <- checklist[, cols]
if (fam.order) {
checklist <- checklist[order(checklist[,1], checklist[,2], na.last = TRUE),]
checklist <- checklist[!is.na(checklist[,2]),]
} else {
checklist <- checklist[order(checklist[,2], na.last = NA),]
}
if (any(apply(checklist, 2, function(x) all(is.na(x)))))
checklist <- checklist[, !apply(checklist, 2, function(x) all(is.na(x)))]
return(checklist)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.