#' @title Formatting and binding databases using DarwinCore fields
#'
#' @description Formats fields in the occurrence data frame - either downloaded
#' from a database or provided by the user - according to the DarwinCore
#' standard. Optionally, drops fields not used in data cleaning performed by
#' plantR and do some text re-encoding. In addition, this functions can bind
#' different sources of data after formatting.
#'
#' @param splink_data A data frame as in the output from `rspeciesLink()`
#' @param gbif_data A data frame as in the output from `rgbif()` or `rgbif2()`
#' @param bien_data A data frame as in the output from the
#' `BIEN_occurrence`-function family
#' @param user_data A data frame provided by the user. Minimum fields are:
#' `c("collectionCode", "catalogNumber", "recordNumber", "recordedBy", "year",
#' "country", "stateProvince", "county", "municipality", "decimalLatitude",
#' "decimalLongitude", "identifiedBy", "dateIdentified", "typeStatus",
#' "scientificName", "scientificNameAuthorship", "institutionCode")`. Fields can
#' be placed at any order and any given name. If using `user_data` argument,
#' the user must indicate the name of the column for each field if not already
#' in DwC standard
#' @param bind_data Logical. Either to bind data from different sources after
#' formatting
#' @param drop Logical. Either to drop non-essential fields to the data cleaning
#' routine performed by plantR
#' @param drop.opt Logical. Either to drop optional fields in the data cleaning
#' routine performed by plantR
#' @param drop.empty Logical. Either to drop fields where all values are NA
#' @param fix.encoding Character. The name of the input data frame (i.e.
#' 'splink_data', 'gbif_data', 'bien_data', 'user_data') that common encoding
#' problems from text in 'latin1' should be re-encoded to UTF-8. Default to
#' NULL (no re-encoding is performed)
#' @param institutionCode The name of the column containing The name (or
#' acronym) in use by the institution having custody of the object(s) or
#' information referred to in the record
#' @param collectionCode The name of the column containing the name, acronym,
#' coden, or initialism identifying the collection or data set from which the
#' record was derived
#' @param catalogNumber The name of the column containing an identifier for the
#' record within the data set or collection
#' @param recordNumber The name of the column containing an identifier given to
#' the Occurrence at the time it was recorded. Often serves as a link between
#' field notes and an Occurrence record, such as a specimen collector's number
#' @param recordedBy The name of the column containing a person, group, or
#' organization responsible for recording the original Occurrence
#' @param year The name of the column containing the four-digit year in which
#' the Event occurred, according to the Common Era Calendar
#' @param country The name of the column containing the name of the country or
#' major administrative unit in which the Location occurs
#' @param stateProvince The name of the column containing the name of the next
#' smaller administrative region than country (state, province, canton,
#' department, region, etc.) in which the Location occurs
#' @param county The name of the column containing the full, unabbreviated name
#' of the next smaller administrative region than stateProvince (county,
#' shire, department, etc.) in which the Location occurs
#' @param municipality The name of the column containing the full, unabbreviated
#' name of the next smaller administrative region than county (city,
#' municipality, etc.) in which the Location occurs. Do not use this term for
#' a nearby named place that does not contain the actual location
#' @param locality The name of the column containing the specific description of
#' the place in which the Location occurs.
#' @param decimalLatitude The name of the column containing the geographic
#' latitude (in decimal degrees, using the spatial reference system given in
#' geodeticDatum) of the geographic center of a Location. Positive values are
#' north of the Equator, negative values are south of it. Legal values lie
#' between -90 and 90, inclusive
#' @param decimalLongitude The name of the column containing the geographic
#' longitude (in decimal degrees, using the spatial reference system given in
#' geodeticDatum) of the geographic center of a Location. Positive values are
#' east of the Greenwich Meridian, negative values are west of it. Legal
#' values lie between -180 and 180, inclusive
#' @param identifiedBy The name of the column containing a list (concatenated
#' and separated) of names of people, groups, or organizations who assigned
#' the Taxon to the subject
#' @param dateIdentified The name of the column containing the date on which the
#' subject was identified as representing the Taxon
#' @param typeStatus The name of the column containing a nomenclatural type
#' (type status, typified scientific name, publication) applied to the subject
#' @param family The name of the column containing the family in which the taxon
#' is classified.
#' @param scientificName The name of the column containing the full scientific
#' name, with authorship and date information if known. When forming part of
#' an Identification, this should be the name in lowest level taxonomic rank
#' that can be determined. This term should not contain identification
#' qualifications, which should instead be supplied in the
#' IdentificationQualifier term
#' @param scientificNameAuthorship The name of the column containing the
#' authorship information for the scientificName formatted according to the
#' conventions of the applicable nomenclaturalCode
#'
#' @return Either a data.frame or list with the database fields formatted
#' following DarwinCore standards
#'
#' @import data.table
#' @importFrom flora remove.authors
#' @importFrom dplyr bind_rows
#' @importFrom stats na.omit
#' @importFrom utils tail
#'
#' @author Sara R. Mortara, Andrea Sánchez-Tapia & Renato A. F. de Lima
#'
#' @encoding UTF-8
#'
#' @export formatDwc
#'
formatDwc <- function(splink_data = NULL,
gbif_data = NULL,
bien_data = NULL,
user_data = NULL,
bind_data = TRUE,
drop = FALSE,
drop.opt = FALSE,
drop.empty = FALSE,
fix.encoding = NULL,
institutionCode = "institutionCode",
collectionCode = "collectionCode",
catalogNumber = "catalogNumber",
recordNumber = "recordNumber",
recordedBy = "recordedBy",
year = "year",
country = "country",
stateProvince = "stateProvince",
county = "county",
municipality = "municipality",
locality = "locality",
decimalLatitude = "decimalLatitude",
decimalLongitude = "decimalLongitude",
identifiedBy = "identifiedBy",
dateIdentified = "dateIdentified",
typeStatus = "typeStatus",
family = "family",
scientificName = "scientificName",
scientificNameAuthorship = "scientificNameAuthorship") {
# Required fields by plantR
must <-
sort(unique(stats::na.omit(fieldNames$plantr[fieldNames$type %in%
"required"])))
opt <-
sort(unique(stats::na.omit(fieldNames$plantr[fieldNames$type %in%
"optional"])))
# formating user data --------------------------------------------------------
if (!is.null(user_data)) {
user_colnames <- c(institutionCode, collectionCode,
catalogNumber,
recordNumber, recordedBy,
year, country, stateProvince, county,
municipality, locality,
decimalLatitude, decimalLongitude,
identifiedBy, dateIdentified,
typeStatus, family, scientificName,
scientificNameAuthorship)
names(user_colnames) <- c("institutionCode", "collectionCode",
"catalogNumber",
"recordNumber", "recordedBy",
"year", "country", "stateProvince", "county",
"municipality", "locality",
"decimalLatitude", "decimalLongitude",
"identifiedBy", "dateIdentified",
"typeStatus", "family", "scientificName",
"scientificNameAuthorship")
if (any(!user_colnames %in% names(user_data)))
stop("user_data does not have the minimum fields required for data cleaning!")
# Applying DwC names
names(user_data)[match(user_colnames, names(user_data))] <-
names(user_colnames)
if (!is.null(fix.encoding)) {
fix.cols <- c("recordedBy", "country", "stateProvince", "county",
"municipality", "locality", "identifiedBy",
"fieldNotes", "occurrenceRemarks", "habitat")
user.cols <- colnames(user_data)[colnames(user_data) %in% fix.cols]
if (any(c('user_data', 'user') %in% fix.encoding)) {
for (i in seq_along(user.cols)) {
bad_enc <- badEncoding
replace_these <- grepl(paste0(bad_enc, collapse = "|"),
user_data[, user.cols[i]], perl = TRUE)
if (any(replace_these))
user_data[replace_these, user.cols[i]] <-
fixEncoding(user_data[replace_these, user.cols[i]])
}
}
}
}
# formating speciesLink data -------------------------------------------------
if (!is.null(splink_data)) {
# required absent fields in speciesLink: municipality and dateIdentified
miss.cols <- must[!must %in% names(splink_data)]
# Creating field municipality
splink_data$municipality <- NA_character_
# Creating field dateIdentified
if (!"monthIdentified" %in% names(splink_data))
splink_data$monthIdentified <- NA_character_
if (!"dayIdentified" %in% names(splink_data))
splink_data$dayIdentified <- NA_character_
splink_date <- apply(X = splink_data[, c("yearIdentified",
"monthIdentified",
"dayIdentified")],
MARGIN = 1,
FUN = function(x) paste(x, collapse = "-"))
splink_data$dateIdentified <-
ifelse(grepl("NA-NA-NA", splink_date), NA_character_, splink_date)
# optional absent fields
if (!drop.opt) {
miss.cols.opt <- opt[!opt %in% names(splink_data)]
for (i in seq_along(miss.cols.opt))
splink_data[, miss.cols.opt[i]] <- NA_character_
}
# checking name standards (only for mandatory and optional fields)
splink_cols <-
sort(unique(stats::na.omit(fieldNames$speciesLink[!is.na(fieldNames$type)])))
if (any(!splink_cols %in% names(splink_data))) {
# stop("splink_data does not follow the speciesLink pattern")
colunas <- splink_cols[!splink_cols %in% names(splink_data)]
aviso <- paste("Important columns were not found in the splink_data: \n",
paste(colunas, collapse = ", "))
warning(aviso, call. = FALSE)
}
# Encoding issues
if (!is.null(fix.encoding)) {
fix.cols <- c("recordedBy", "country", "stateProvince",
"locality", "identifiedBy", "county", "verbatimLocality",
"fieldNotes", "occurrenceRemarks", "habitat", "datasetName")
splink.cols <- colnames(splink_data)[colnames(splink_data) %in% fix.cols]
if (any(c('splink_data', 'splink', 'speciesLink') %in% fix.encoding)) {
for (i in seq_along(splink.cols)) {
bad_enc <- badEncoding
replace_these <- grepl(paste0(bad_enc, collapse = "|"),
splink_data[, splink.cols[i]], perl = TRUE)
if (any(replace_these))
splink_data[replace_these, splink.cols[i]] <-
fixEncoding(splink_data[replace_these, splink.cols[i]])
}
}
}
}
# formating gbif data --------------------------------------------------------
if (!is.null(gbif_data)) {
# fixing problematic GBIF names
tmp <- names(gbif_data)[grepl("\\.\\.", names(gbif_data), perl = TRUE)]
tmp1 <- sapply(tmp,
function(x)
utils::tail(unlist(strsplit(x, "([a-z])\\.(?=[a-zA-Z])",
perl = TRUE)), n = 1))
tmp[!duplicated(tmp1) & !tmp1 %in% names(gbif_data)] <-
tmp1[!duplicated(tmp1) & !tmp1 %in% names(gbif_data)]
names(gbif_data)[grepl("\\.\\.", names(gbif_data), perl = TRUE)] <- tmp
# required absent fields in gbif: scientificNameAuthorship
miss.cols <- must[!must %in% names(gbif_data)]
# Creating field scientificNameAuthorship
species <- as.character(unique(gbif_data$scientificName))
# authors <- stringr::str_trim(sapply(species,
# function(x) gsub(flora::remove.authors(x),
# "", x, perl = TRUE)))
authors <- sapply(species,
function(x) gsub(flora::remove.authors(x), "", x, perl = TRUE))
authors <- gsub("\\s+", " ", authors, perl = TRUE)
authors <- gsub("^ | $", "", authors, perl = TRUE)
df <- data.frame(scientificName = species,
scientificNameAuthorship = authors)
gbif_data <- suppressMessages(dplyr::left_join(gbif_data, df,
by = 'scientificName'))
# optional absent fields
if (!drop.opt) {
miss.cols.opt <- opt[!opt %in% names(gbif_data)]
for (i in seq_along(miss.cols.opt))
gbif_data[, miss.cols.opt[i]] <- NA_character_
}
# checking name standards (only mandatory and optional fields)
gbif_cols <-
sort(unique(stats::na.omit(fieldNames$gbif[!is.na(fieldNames$type)])))
if (any(!gbif_cols %in% names(gbif_data))) {
#stop("gbif_data does not follow the gbif pattern!")
colunas <- gbif_cols[!gbif_cols %in% names(gbif_data)]
aviso <- paste("Important columns were not found in the gbif_data: \n",
paste(colunas, collapse = ", "))
warning(aviso, call. = FALSE)
}
# Encoding issues
if (!is.null(fix.encoding)) {
fix.cols <- c("recordedBy", "country", "stateProvince", "municipality",
"locality", "identifiedBy", "county", "verbatimLocality",
"fieldNotes", "occurrenceRemarks", "habitat", "datasetName")
gbif.cols <- colnames(gbif_data)[colnames(gbif_data) %in% fix.cols]
if (any(c('gbif_data', 'gbif', 'GBIF') %in% fix.encoding)) {
for (i in seq_along(gbif.cols)) {
bad_enc <- badEncoding
replace_these <- grepl(paste0(bad_enc, collapse = "|"),
gbif_data[, gbif.cols[i]], perl = TRUE)
if (any(replace_these))
gbif_data[replace_these, gbif.cols[i]] <-
fixEncoding(gbif_data[replace_these, gbif.cols[i]])
}
}
}
}
# formating BIEN data --------------------------------------------------------
if (!is.null(bien_data)) {
# Applying DwC names
bien_equiv <- fieldNames[!is.na(fieldNames$bien) &
!is.na(fieldNames$type), ]
bien_equiv <- bien_equiv[bien_equiv$bien %in% names(bien_data), ]
names(bien_data)[match(bien_equiv$bien, names(bien_data))] <-
bien_equiv$plantr
# required absent fields in BIEN:
miss.cols <- must[!must %in% names(bien_data)]
# Creating field municipality
bien_data$municipality <- NA_character_
# Creating field typeStatus
bien_data$typeStatus <- NA_character_
# Creating field year
bien_data$year <- gsub("-.*", "", bien_data$date_collected, perl = TRUE)
bien_data$day <- gsub(".*-", "", bien_data$date_collected, perl = TRUE)
bien_data$month <-
sub("-.*", "",
sub("....-*", "", bien_data$date_collected, perl = TRUE),
perl = TRUE)
if (!"genus" %in% names(bien_data))
bien_data$genus <- NA_character_
if (!"family" %in% names(bien_data))
bien_data$family <- NA_character_
# optional absent fields
if (!drop.opt) {
miss.cols.opt <- opt[!opt %in% names(bien_data)]
for (i in seq_along(miss.cols.opt))
bien_data[, miss.cols.opt[i]] <- NA_character_
}
# checking name standards (only for mandatory and optional fields)
bien_cols <-
sort(unique(stats::na.omit(fieldNames$plantr[!is.na(fieldNames$type)])))
if (any(!bien_cols %in% names(bien_data))) {
# stop("splink_data does not follow the speciesLink pattern")
colunas <- bien_cols[!bien_cols %in% names(bien_data)]
aviso <- paste("Important columns were not found in the bien_data: \n",
paste(colunas, collapse = ", "))
warning(aviso, call. = FALSE)
for (i in seq_along(colunas))
bien_data[, colunas[i]] <- NA_character_
}
# Encoding issues
if (!is.null(fix.encoding)) {
fix.cols <- c("recordedBy", "country", "stateProvince",
"locality", "identifiedBy", "county",
"occurrenceRemarks", "datasetName")
bien.cols <- colnames(bien_data)[colnames(bien_data) %in% fix.cols]
if (any(c('bien_data', 'bien', 'BIEN') %in% fix.encoding)) {
for (i in seq_along(bien.cols)) {
bad_enc <- badEncoding
replace_these <- grepl(paste0(bad_enc, collapse = "|"),
bien_data[, bien.cols[i]], perl = TRUE)
if (any(replace_these))
bien_data[replace_these, bien.cols[i]] <-
fixEncoding(bien_data[replace_these, bien.cols[i]])
}
}
}
}
# removing fields if drop = TRUE ---------------------------------------------
if (drop) {
if (drop.opt) {
gbif_data <- gbif_data[, names(gbif_data) %in% must]
splink_data <- splink_data[, names(splink_data) %in% must]
bien_data <- bien_data[, names(bien_data) %in% must]
user_data <- user_data[, names(user_data) %in% must]
message("Dropping fields not essential to the data cleaning!")
} else {
gbif_data <- gbif_data[, names(gbif_data) %in% sort(unique(c(must, opt)))]
splink_data <- splink_data[, names(splink_data) %in% sort(unique(c(must, opt)))]
bien_data <- bien_data[, names(bien_data) %in% sort(unique(c(must, opt)))]
user_data <- user_data[, names(user_data) %in% sort(unique(c(must, opt)))]
message("Dropping fields not essential or recommended to the data cleaning!")
}
}
# binding data ---------------------------------------------------------------
if (bind_data) {
# Forcing numeric/date columns to be characters to allow binding
if (!is.null(splink_data)) {
ids <- which(sapply(splink_data, function(x) class(x)[1]) %in%
c("numeric", "integer", "Date", "POSIXct", "POSIXt"))
if (length(ids) > 0)
for (i in ids) splink_data[, i] <- as.character(splink_data[, i])
}
if (!is.null(gbif_data)) {
ids <- which(sapply(gbif_data, function(x) class(x)[1]) %in%
c("numeric", "integer", "Date", "POSIXct", "POSIXt"))
if (length(ids) > 0)
for (i in ids) gbif_data[, i] <- as.character(gbif_data[, i])
}
if (!is.null(bien_data)) {
ids <- which(sapply(bien_data, function(x) class(x)[1]) %in%
c("numeric", "integer", "Date", "POSIXct", "POSIXt"))
if (length(ids) > 0)
for (i in ids) bien_data[, i] <- as.character(bien_data[, i])
}
if (!is.null(user_data)) {
ids <- which(sapply(user_data, function(x) class(x)[1]) %in%
c("numeric", "integer", "Date", "POSIXct", "POSIXt"))
if (length(ids) > 0)
for (i in ids) user_data[, i] <- as.character(user_data[, i])
}
res_list <- list(gbif = gbif_data,
speciesLink = splink_data,
bien = bien_data,
user = user_data)
res_list <- res_list[sapply(res_list, function(x) !is.null(x))]
res_list <- dplyr::bind_rows(res_list, .id = "data_source")
if (drop.empty) {
DT <- data.table::as.data.table(res_list)
res_list <- data.frame(
DT[, which(unlist(lapply(DT, function(x) !all(is.na(x))))),
with = FALSE],
stringsAsFactors = FALSE)
}
} else {
res_list <- list(gbif = gbif_data,
speciesLink = splink_data,
bien = bien_data,
user = user_data)
if (drop.empty) {
for(i in 1:length(list)) {
DT <- data.table::as.data.table(res_list[[i]])
res_list[[i]] <- data.frame(
DT[, which(unlist(lapply(DT, function(x)!all(is.na(x))))),
with = FALSE],
stringsAsFactors = FALSE)
}
}
}
return(res_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.