#' Verify taxa that the GBIF Backbone Taxonomy does not recognize or will lump
#'
#' Verify taxa that the \href{https://doi.org/10.15468/39omei}{GBIF Backbone
#' Taxonomy} does not recognize (no backbone match) or will lump under another
#' name (synonyms). This is done by adding a \code{verificationKey} to the input
#' dataframe, populated with: \itemize{ \item{For \code{ACCEPTED} and
#' \code{DOUBTFUL} taxa: the backbone taxon key for that taxon (taxon is its own
#' unit and won't be lumped).} \item{For other taxa: a manually chosen and thus
#' verified backbone taxon key. This could either be the taxon key of: \itemize{
#' \item{accepted taxon suggested by GBIF: backbone synonymy is accepted and
#' taxon will be lumped.} \item{another accepted taxon: backbone synonymy is
#' rejected, but taxon will be lumped under another name.} \item{taxon itself:
#' backbone synonymy is rejected, taxon will be considered as separate taxon.}
#' \item{other taxon/taxa: automatic backbone match failed, but taxon can be
#' considered/lumped with manually found taxon/taxa (e.g. hybrid formula
#' considered equal to its hybrid parents).} }} } The manually chosen
#' \code{verificationKey} should be provided in \code{verification}: a dataframe
#' (probably read from a file) listing all checklist taxon/backbone
#' taxon/accepted taxon combinations that require verification. The function
#' will update a provided verification based on the input taxa or create a new
#' one if none is provided. Any changes to the verification are also provided as
#' ancillary information.
#'
#' @param taxa df. Dataframe with at least the following (default) columns for
#' each taxon: \itemize{ \item{\code{taxonKey}: numeric. Non-backbone
#' checklist taxon key assigned by GBIF.} \item{\code{scientificName}:
#' character. Scientific name as interpreted by GBIF.}
#' \item{\code{datasetKey}: character. Dataset key (UUID) assigned by GBIF of
#' originating checklist.} \item{\code{bb_key}: numeric. Taxon key of matching
#' backbone taxon (if any).} \item{\code{bb_scientificName}: character.
#' Scientific name of matching backbone taxon.} \item{\code{bb_kingdom}:
#' character. Kingdom of matching backbone taxon.} \item{\code{bb_rank}:
#' character. Rank of matching backbone taxon.}
#' \item{\code{bb_taxonomicStatus}: character. Taxonomic status of matching
#' backbone taxon.} \item{\code{bb_acceptedKey}: numeric. Accepted key of
#' taxon for which matching backbone taxon is considered a synonym.}
#' \item{\code{bb_acceptedName}: character. Accepted name of taxon for which
#' matching backbone taxon is considered a synonym.} }
#' @param verification df. Dataframe with at least the following columns for
#' each checklist taxon/backbone taxon/accepted taxon combination: \itemize{
#' \item{\code{taxonKey}: numeric. Non-backbone checklist taxon key assigned
#' by GBIF.} \item{\code{scientificName}: character. Scientific name as
#' interpreted by GBIF.} \item{\code{datasetKey}: character. Dataset key
#' (UUID) assigned by GBIF of originating checklist.} \item{\code{bb_key}:
#' numeric. Taxon key of matching backbone taxon (if any).}
#' \item{\code{bb_scientificName}: character. Scientific name of matching
#' backbone taxon.} \item{\code{bb_kingdom}: character. Kingdom of matching
#' backbone taxon.} \item{\code{bb_rank}: character. Rank of matching backbone
#' taxon.} \item{\code{bb_taxonomicStatus}: character. Taxonomic status of
#' matching backbone taxon.} \item{\code{bb_acceptedKey}: numeric. Taxon key
#' of accepted backbone taxon in case matching backbone taxon is considered a
#' synonym.} \item{\code{bb_acceptedName}: character. Scientific name of
#' accepted backbone taxon in case matching backbone taxon is considered a
#' synonym.} \item{\code{bb_acceptedKingdom}: character. Kingdom of accepted
#' taxon. Expected to be equal to \code{bb_kingdom}.}
#' \item{\code{bb_acceptedRank}: character. Rank of accepted taxon.}
#' \item{\code{bb_acceptedTaxonomicStatus}: character. Taxonomic status of
#' accepted taxon. Expected to be \code{ACCEPTED}.}
#' \item{\code{verificationKey}: character. Taxon key(s) of backbone taxon
#' manually set by expert.} \item{\code{remarks}: character. Remarks provided
#' by the expert.} \item{\code{verifiedBy}: character. Name of the person who
#' assigned \code{verificationKey}.} \item{\code{dateAdded}: date. Date on
#' which new combinations were added.} \item{\code{outdated}: logical.
#' \code{TRUE} when combination was not used for input taxa.} }
#' @param taxonKey,scientificName,datasetKey,bb_key,bb_scientificName,bb_kingdom,bb_rank,bb_taxonomicStatus,bb_acceptedKey,bb_acceptedName
#' Column names of required columns of \code{taxa}. They have to be passed as
#' strings, e.g. \code{"taxon_keys"}. Default: column names as specified above
#' in \code{taxa}.
#' @param verification_taxonKey,verification_scientificName,verification_datasetKey,verification_bb_key,verification_bb_scientificName,verification_bb_kingdom,verification_bb_rank,verification_bb_taxonomicStatus,verification_bb_acceptedKey,verification_bb_acceptedName,verification_bb_acceptedKingdom,verification_bb_acceptedRank,verification_bb_acceptedTaxonomicStatus,verification_verificationKey,verification_remarks,verification_verifiedBy,verification_dateAdded,verification_outdated
#' Column names of required columns of \code{verification}. They have to be
#' passed as strings, e.g. \code{"verification_taxon_keys"}. Default: column
#' names as specified above in \code{verification}.
#'
#' @return list. List with three objects: \itemize{ \item{\code{taxa}: df.
#' Provided dataframe with additional column \code{verificationKey}.}
#' \item{\code{verification}: df. New or updated dataframe with verification
#' information.} \item{\code{info}: list. Dataframes with ancillary
#' information regarding changes to the verification. \itemize{
#' \item{\code{new_synonyms}: df. Subset of \code{verification} with synonym
#' taxa found in \code{taxa} but not in provided \code{verification}).}
#' \item{\code{new_unmatched_taxa}: df. Subset of \code{verification} with
#' unmatched taxa found in \code{taxa} but not in provided
#' \code{verification}).} \item{\code{outdated_synonyms}: df. Subset of
#' \code{verification} with synonyms found in provided \code{verification} but
#' not in \code{taxa}.} \item{\code{outdated_unmatched_taxa}: df. Subset of
#' \code{verification} with unmatched taxa found in provided
#' \code{verification} but not in \code{taxa}.}
#' \item{\code{updated_bb_scientificName}: df. \code{bb_scientificName}s in
#' provided \code{verification} that were updated
#' \code{updated_bb_scientificName} in the backbone since.}
#' \item{\code{updated_bb_acceptedName}: df. \code{bb_acceptedName}s in
#' provided \code{verification} that were updated
#' \code{updated_bb_acceptedName} in the backbone since.}
#' \item{\code{duplicates}: df. Taxa present in more than one checklist.}
#' \item{\code{check_verificationKey}: df. Check if provided
#' \code{verificationKey}s can be found in backbone.} }} }
#'
#' @export
#' @importFrom dplyr %>% .data
#'
#' @examples
#' \dontrun{
#' my_taxa <- data.frame(
#' taxonKey = c(
#' 141117238,
#' 113794952,
#' 141264857,
#' 100480872,
#' 141264614,
#' 100220432,
#' 141264835,
#' 140563014,
#' 140562956,
#' 145953989,
#' 148437916,
#' 114445583,
#' 141264849,
#' 101790530
#' ),
#' scientificName = c(
#' "Aspius aspius",
#' "Rana catesbeiana",
#' "Polystichum tsus-simense J.Smith",
#' "Apus apus (Linnaeus, 1758)",
#' "Begonia x semperflorens hort.",
#' "Rana catesbeiana",
#' "Spiranthes cernua (L.) Richard x S. odorata (Nuttall) Lindley",
#' "Atyaephyra desmaresti",
#' "Ferrissia fragilis",
#' "Ferrissia fragilis",
#' "Ferrissia fragilis",
#' "Rana blanfordii Boulenger",
#' "Pterocarya x rhederiana C.K. Schneider",
#' "Stenelmis williami Schmude"
#' ),
#' datasetKey = c(
#' "98940a79-2bf1-46e6-afd6-ba2e85a26f9f",
#' "e4746398-f7c4-47a1-a474-ae80a4f18e92",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "39653f3e-8d6b-4a94-a202-859359c164c5",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "b351a324-77c4-41c9-a909-f30f77268bc4",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "289244ee-e1c1-49aa-b2d7-d379391ce265",
#' "289244ee-e1c1-49aa-b2d7-d379391ce265",
#' "3f5e930b-52a5-461d-87ec-26ecd66f14a3",
#' "1f3505cd-5d98-4e23-bd3b-ffe59d05d7c2",
#' "3772da2f-daa1-4f07-a438-15a881a2142c",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "9ca92552-f23a-41a8-a140-01abaa31c931"
#' ),
#' bb_key = c(
#' 2360181,
#' 2427092,
#' 2651108,
#' 5228676,
#' NA,
#' 2427092,
#' NA,
#' 4309705,
#' 2291152,
#' 2291152,
#' 2291152,
#' 2430304,
#' NA,
#' 1033588
#' ),
#' bb_scientificName = c(
#' "Aspius aspius (Linnaeus, 1758)",
#' "Rana catesbeiana Shaw, 1802",
#' "Polystichum tsus-simense (Hook.) J.Sm.",
#' "Apus apus (Linnaeus, 1758)",
#' NA,
#' "Rana catesbeiana Shaw, 1802",
#' NA,
#' "Atyaephyra desmarestii (Millet, 1831)",
#' "Ferrissia fragilis (Tryon, 1863)",
#' "Ferrissia fragilis (Tryon, 1863)",
#' "Ferrissia fragilis (Tryon, 1863)",
#' "Rana blanfordii Boulenger, 1882",
#' NA,
#' "Stenelmis williami Schmude"
#' ),
#' bb_kingdom = c(
#' "Animalia",
#' "Animalia",
#' "Plantae",
#' "Animalia",
#' NA,
#' "Animalia",
#' NA,
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' NA,
#' "Animalia"
#' ),
#' bb_rank = c(
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' NA,
#' "SPECIES",
#' NA,
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' NA,
#' "SPECIES"
#' ),
#' bb_taxonomicStatus = c(
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' "ACCEPTED",
#' NA,
#' "SYNONYM",
#' NA,
#' "HOMOTYPIC_SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' NA,
#' "SYNONYM"
#' ),
#' bb_acceptedKey = c(
#' 5851603,
#' 2427091,
#' 4046493,
#' NA,
#' NA,
#' 2427091,
#' NA,
#' 6454754,
#' 9520065,
#' 9520065,
#' 9520065,
#' 2430301,
#' NA,
#' 1033553
#' ),
#' bb_acceptedName = c(
#' "Leuciscus aspius (Linnaeus, 1758)",
#' "Lithobates catesbeianus (Shaw, 1802)",
#' "Polystichum luctuosum (Kunze) Moore.",
#' NA,
#' NA,
#' "Lithobates catesbeianus (Shaw, 1802)",
#' NA,
#' "Hippolyte desmarestii Millet, 1831",
#' "Ferrissia californica (Rowell, 1863)",
#' "Ferrissia californica (Rowell, 1863)",
#' "Ferrissia californica (Rowell, 1863)",
#' "Nanorana blanfordii (Boulenger, 1882)",
#' NA,
#' "Stenelmis Dufour, 1835"
#' ),
#' taxonID = c(
#' "alien-fishes-checklist:taxon:c937610f85ea8a74f105724c8f198049",
#' "88",
#' "alien-plants-belgium:taxon:57c1d111f14fd5f3271b0da53c05c745",
#' "4512",
#' "alien-plants-belgium:taxon:9a6c5ed8907ff169433fe44fcbff0705",
#' "80-syn",
#' "alien-plants-belgium:taxon:29409d1e1adc88d6357dd0be13350d6c",
#' "alien-macroinvertebrates-checklist:taxon:54cca150e1e0b7c0b3f5b152ae64d62b",
#' "alien-macroinvertebrates-checklist:taxon:73f271d93128a4e566e841ea6e3abff0",
#' "rinse-checklist:taxon:7afe7b1fbdd06cbdfe97272567825c09",
#' "ad-hoc-checklist:taxon:32dc2e18733fffa92ba4e1b35d03c4e2",
#' "a80caa33-da9d-48ed-80e3-f76b0b3810f9",
#' "alien-plants-belgium:taxon:56d6564f59d9092401c454849213366f",
#' "193729"
#' ),
#' stringsAsFactors = FALSE
#' )
#'
#' my_verification <- data.frame(
#' taxonKey = c(
#' 113794952,
#' 141264857,
#' 143920280,
#' 141264835,
#' 141264614,
#' 140562956,
#' 145953989,
#' 114445583,
#' 128897752,
#' 101790530,
#' 141265523
#' ),
#' scientificName = c(
#' "Rana catesbeiana",
#' "Polystichum tsus-simense J.Smith",
#' "Lemnaceae",
#' "Spiranthes cernua (L.) Richard x S. odorata (Nuttall) Lindley",
#' "Begonia x semperflorens hort.",
#' "Ferrissia fragilis",
#' "Ferrissia fragilis",
#' "Rana blanfordii Boulenger",
#' "Python reticulatus Fitzinger, 1826",
#' "Stenelmis williami Schmude",
#' "Veronica austriaca Jacq."
#' ),
#' datasetKey = c(
#' "e4746398-f7c4-47a1-a474-ae80a4f18e92",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "e4746398-f7c4-47a1-a474-ae80a4f18e92",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42",
#' "289244ee-e1c1-49aa-b2d7-d379391ce265",
#' "3f5e930b-52a5-461d-87ec-26ecd66f14a3",
#' "3772da2f-daa1-4f07-a438-15a881a2142c",
#' "7ddf754f-d193-4cc9-b351-99906754a03b",
#' "9ca92552-f23a-41a8-a140-01abaa31c931",
#' "9ff7d317-609b-4c08-bd86-3bc404b77c42"
#' ),
#' bb_key = c(
#' 2427092,
#' 2651108,
#' 6723,
#' NA,
#' NA,
#' 2291152,
#' 2291152,
#' 2430304,
#' 7587934,
#' 1033588,
#' NA
#' ),
#' bb_scientificName = c(
#' "Rana catesbeiana Shaw, 1802",
#' "Polystichum tsus-tsus-tsus (Hook.) Captain",
#' "Lemnaceae",
#' NA,
#' NA,
#' "Ferrissia fragilis (Tryon, 1863)",
#' "Ferrissia fragilis (Tryon, 1863)",
#' "Rana blanfordii Boulenger, 1882",
#' "Python reticulatus Fitzinger, 1826",
#' "Stenelmis williami Schmude",
#' NA
#' ),
#' bb_kingdom = c(
#' "Animalia",
#' "Plantae",
#' "Plantae",
#' NA,
#' NA,
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' NA
#' ),
#' bb_rank = c(
#' "SPECIES",
#' "SPECIES",
#' "FAMILY",
#' NA,
#' NA,
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' NA
#' ),
#' bb_taxonomicStatus = c(
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' NA,
#' NA,
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' "SYNONYM",
#' NA
#' ),
#' bb_acceptedKey = c(
#' 2427091,
#' 4046493,
#' 6979,
#' NA,
#' NA,
#' 9520065,
#' 9520065,
#' 2427008,
#' 9260388,
#' 1033553,
#' NA
#' ),
#' bb_acceptedName = c(
#' "Lithobates dummyus (Batman, 2018)",
#' "Polystichum luctuosum (Kunze) Moore.",
#' "Araceae",
#' NA,
#' NA,
#' "Ferrissia californica (Rowell, 1863)",
#' "Ferrissia californica (Rowell, 1863)",
#' "Hylarana chalconota (Schlegel, 1837)",
#' "Malayopython reticulatus (Schneider, 1801)",
#' "Stenelmis Dufour, 1835",
#' NA
#' ),
#' bb_acceptedKingdom = c(
#' "Animalia",
#' "Plantae",
#' "Plantae",
#' NA,
#' NA,
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' "Animalia",
#' NA
#' ),
#' bb_acceptedRank = c(
#' "SPECIES",
#' "SPECIES",
#' "FAMILY",
#' NA,
#' NA,
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "SPECIES",
#' "GENUS",
#' NA
#' ),
#' bb_acceptedTaxonomicStatus = c(
#' "ACCEPTED",
#' "ACCEPTED",
#' "ACCEPTED",
#' NA,
#' NA,
#' "ACCEPTED",
#' "ACCEPTED",
#' "ACCEPTED",
#' "ACCEPTED",
#' "ACCEPTED",
#' NA
#' ),
#' verificationKey = c(
#' 2427091,
#' 4046493,
#' 6979,
#' "2805420,2805363",
#' NA,
#' NA,
#' NA,
#' NA,
#' 9260388,
#' NA,
#' 3172099
#' ),
#' remarks = c(
#' "dummy example 1: bb_acceptedName should be updated.",
#' "dummy example 2: bb_scientificName should be updated.",
#' "dummy example 3: not used anymore. Set outdated = TRUE.",
#' "dummy example 4: multiple keys in verificationKey are allowed.",
#' "dummy example 5: nothing should happen.",
#' "dummy example 6: datasetKey should not be modified. If new taxa come in
#' with same name from other checklsits, they should be added as new rows.
#' Report them as duplicates in duplicates_taxa",
#' "dummy example 7: datasetKey should not be modified. If new taxa come in
#' with same name from other checklsits, they should be added as new rows.
#' Report them as duplicates in duplicates_taxa",
#' "dummy example 8: outdated synonym. Set outdated = TRUE.",
#' "dummy example 9: outdated synonym. outdated is already TRUE. No actions.",
#' "dummy example 10: outdated synonym. Not outdated anymore. Change outdated
#' back to FALSE.",
#' "dummy example 11: outdated unmatched taxa. Set outdated = TRUE."
#' ),
#' verifiedBy = c(
#' "Damiano Oldoni",
#' "Peter Desmet",
#' "Stijn Van Hoey",
#' "Tanja Milotic",
#' NA,
#' NA,
#' NA,
#' NA,
#' "Lien Reyserhove",
#' NA,
#' "Dimitri Brosens"
#' ),
#' dateAdded = as.Date(
#' c(
#' "2018-07-01",
#' "2018-07-01",
#' "2018-07-01",
#' "2018-07-16",
#' "2018-07-16",
#' "2018-07-01",
#' "2018-11-20",
#' "2018-11-29",
#' "2018-12-01",
#' "2018-12-02",
#' "2018-12-03"
#' )
#' ),
#' outdated = c(
#' FALSE,
#' FALSE,
#' FALSE,
#' FALSE,
#' FALSE,
#' FALSE,
#' FALSE,
#' FALSE,
#' TRUE,
#' TRUE,
#' FALSE
#' ),
#' stringsAsFactors = FALSE
#' )
#'
#' # output
#' verify_taxa(taxa = my_taxa, verification = my_verification)
#' verify_taxa(taxa = my_taxa)
#'
#' # you can also provide your own column names for one or more required columns:
#' library(dplyr)
#' my_taxa_other_colnames <-
#' rename(
#' my_taxa,
#' checklist = datasetKey,
#' scientific_names = scientificName
#' )
#'
#' my_verification_other_colnames <-
#' rename(
#' my_verification,
#' backbone_scientific_names = bb_scientificName,
#' backbone_accepted_names = bb_acceptedName,
#' is_outdated = outdated,
#' author_verification = verifiedBy
#' )
#'
#' # output
#' verify_taxa(
#' taxa = my_taxa_other_colnames,
#' verification = my_verification_other_colnames
#' )
#' }
verify_taxa <- function(taxa,
verification = NULL,
taxonKey = "taxonKey",
scientificName = "scientificName",
datasetKey = "datasetKey",
bb_key = "bb_key",
bb_scientificName = "bb_scientificName",
bb_kingdom = "bb_kingdom",
bb_rank = "bb_rank",
bb_taxonomicStatus = "bb_taxonomicStatus",
bb_acceptedKey = "bb_acceptedKey",
bb_acceptedName = "bb_acceptedName",
verification_taxonKey = "taxonKey",
verification_scientificName = "scientificName",
verification_datasetKey = "datasetKey",
verification_bb_key = "bb_key",
verification_bb_scientificName = "bb_scientificName",
verification_bb_kingdom = "bb_kingdom",
verification_bb_rank = "bb_rank",
verification_bb_taxonomicStatus = "bb_taxonomicStatus",
verification_bb_acceptedKey = "bb_acceptedKey",
verification_bb_acceptedName = "bb_acceptedName",
verification_bb_acceptedKingdom = "bb_acceptedKingdom",
verification_bb_acceptedRank = "bb_acceptedRank",
verification_bb_acceptedTaxonomicStatus = "bb_acceptedTaxonomicStatus",
verification_verificationKey = "verificationKey",
verification_remarks = "remarks",
verification_verifiedBy = "verifiedBy",
verification_dateAdded = "dateAdded",
verification_outdated = "outdated") {
# Start tests input
message("Check input dataframes...", appendLF = FALSE)
# Test taxa
# Retrieve names of needed columns of taxa
name_col_taxa_original <- c(
taxonKey, scientificName, datasetKey, bb_key, bb_scientificName,
bb_kingdom, bb_rank, bb_taxonomicStatus, bb_acceptedKey, bb_acceptedName
)
# Define vector of column names of taxa we will use later
name_col_taxa <- c(
"taxonKey", "scientificName", "datasetKey", "bb_key",
"bb_scientificName", "bb_kingdom", "bb_rank", "bb_taxonomicStatus",
"bb_acceptedKey", "bb_acceptedName"
)
# Check taxa is a dataframe
assertthat::assert_that(is.data.frame(taxa))
# Check presence needed columns
col_not_present <-
name_col_taxa_original[which(!name_col_taxa_original %in% names(taxa))]
assertthat::assert_that(
all(name_col_taxa_original %in% names(taxa)),
msg = paste(
"The following columns of taxa are not present:",
paste0(paste(col_not_present, collapse = ", "), "."),
"Did you maybe forget to provide the mapping of",
"columns named differently than the default names?"
)
)
# Check that taxon keys are all set up, no NAs present in input taxa
assertthat::assert_that(all(!is.na(taxa[[taxonKey]])),
msg = sprintf(
paste0(
"Missing values found in taxon keys of input ",
"taxa. Check values in column %s."
),
taxonKey
)
)
# Check that taxon keys are unique in taxa
assertthat::assert_that(nrow(taxa) == length(unique(taxa[[taxonKey]])),
msg = sprintf(
paste0(
"Taxon keys of input taxa must be unique. ",
"Check values in column %s."
),
taxonKey
)
)
# Convert to default column names
taxa <-
taxa %>%
dplyr::rename_at(dplyr::vars(name_col_taxa_original), ~name_col_taxa)
# Check class columns
taxa$scientificName <- as.character(taxa$scientificName)
taxa$datasetKey <- as.character(taxa$datasetKey)
taxa$bb_scientificName <- as.character(taxa$bb_scientificName)
taxa$bb_kingdom <- as.character(taxa$bb_kingdom)
taxa$bb_rank <- as.character(taxa$bb_rank)
taxa$bb_taxonomicStatus <- as.character(taxa$bb_taxonomicStatus)
taxa$bb_acceptedName <- as.character(taxa$bb_acceptedName)
taxa$taxonKey <- as.numeric(taxa$taxonKey)
taxa$bb_key <- as.numeric(taxa$bb_key)
taxa$bb_acceptedKey <- as.numeric(taxa$bb_acceptedKey)
# Check that accepted or doubtful taxa have a backbone key
assertthat::assert_that(
taxa %>%
dplyr::filter(.data$bb_taxonomicStatus %in% c("ACCEPTED", "DOUBTFUL") &
is.na(.data$bb_key)) %>%
nrow() == 0,
msg = "Taxa which don't need verification must have a backbone key."
)
# Unmatched taxa should have no GBIF Backbone information at all
assertthat::assert_that(
taxa %>%
dplyr::filter(is.na(.data$bb_key)) %>%
dplyr::filter_at(dplyr::vars(dplyr::starts_with("bb_")),
dplyr::all_vars(is.na(.))) %>%
nrow() ==
taxa %>%
dplyr::filter(is.na(.data$bb_key)) %>%
dplyr::filter_at(dplyr::vars(dplyr::starts_with("bb_")),
dplyr::any_vars(is.na(.))) %>%
nrow(),
msg = "Columns with GBIF Backbone info should be empty for unmatched taxa."
)
# Throw a message if a column called verificationKey already exists
message_existence_verificationKey <- NULL
if ("verificationKey" %in% names(taxa)) {
message_existence_verificationKey <-
"Column verificationKey already exists. It will be overwritten."
taxa <-
taxa %>%
dplyr::select(-"verificationKey")
}
# Test verification
# Retrieve names of needed columns of verification
name_col_verification_original <- c(
verification_taxonKey, verification_scientificName,
verification_datasetKey, verification_bb_key,
verification_bb_scientificName, verification_bb_kingdom,
verification_bb_rank, verification_bb_taxonomicStatus,
verification_bb_acceptedKey, verification_bb_acceptedName,
verification_bb_acceptedKingdom, verification_bb_acceptedRank,
verification_bb_acceptedTaxonomicStatus, verification_verificationKey,
verification_remarks, verification_verifiedBy,
verification_dateAdded, verification_outdated
)
# Define vector of names of required columns of verification we will use later
name_col_verification <- c(
"taxonKey", "scientificName", "datasetKey",
"bb_key", "bb_scientificName",
"bb_kingdom", "bb_rank", "bb_taxonomicStatus",
"bb_acceptedKey", "bb_acceptedName",
"bb_acceptedKingdom", "bb_acceptedRank",
"bb_acceptedTaxonomicStatus",
"verificationKey", "remarks",
"verifiedBy", "dateAdded", "outdated"
)
name_col_verification_extra <-
names(verification)[!names(verification) %in%
name_col_verification_original]
# Make empty tibble df if not exists
if (is.null(verification)) {
verification <- dplyr::tibble(
taxonKey = double(),
scientificName = character(),
datasetKey = character(),
bb_key = double(),
bb_scientificName = character(),
bb_kingdom = character(),
bb_rank = character(),
bb_taxonomicStatus = character(),
bb_acceptedKey = double(),
bb_acceptedName = character(),
bb_acceptedKingdom = character(),
bb_acceptedRank = character(),
bb_acceptedTaxonomicStatus = character(),
verificationKey = character(),
remarks = character(),
verifiedBy = character(),
dateAdded = numeric(),
outdated = logical()
)
class(verification$dateAdded) <- "Date"
} else {
# Check verification is a dataframe
assertthat::assert_that(is.data.frame(verification))
# Check presence needed columns
col_not_present <-
name_col_verification_original[
which(!name_col_verification_original %in% names(verification))
]
assertthat::assert_that(
all(name_col_verification_original %in% names(verification)),
msg = paste(
"The following columns of verification are not present:",
paste0(paste(col_not_present, collapse = ", "), "."),
"Did you maybe forget to provide the mapping of",
"columns named differently than the default names?"
)
)
# Check that taxon keys are all set up, no NAs present in verification df
assertthat::assert_that(all(!is.na(verification[[verification_taxonKey]])),
msg = sprintf(
paste0(
"Missing values found in taxon keys of input ",
"taxa. Check values in column %s."
),
verification_taxonKey
)
)
# Check that taxon keys are unique in verification df
assertthat::assert_that(
nrow(verification) == length(unique(verification[[verification_taxonKey]])),
msg = sprintf(
paste0(
"Taxon keys of input taxa must be unique. ",
"Check values in column %s."
),
taxonKey
)
)
# Convert to standard column names
verification <-
verification %>%
dplyr::rename_at(dplyr::vars(name_col_verification_original), ~name_col_verification)
}
# Check class columns
verification$scientificName <- as.character(verification$scientificName)
verification$datasetKey <- as.character(verification$datasetKey)
verification$bb_scientificName <- as.character(verification$bb_scientificName)
verification$bb_kingdom <- as.character(verification$bb_kingdom)
verification$bb_rank <- as.character(verification$bb_rank)
verification$bb_taxonomicStatus <-
as.character(verification$bb_taxonomicStatus)
verification$bb_acceptedName <- as.character(verification$bb_acceptedName)
verification$bb_acceptedKingdom <-
as.character(verification$bb_acceptedKingdom)
verification$bb_acceptedRank <- as.character(verification$bb_acceptedRank)
verification$bb_acceptedTaxonomicStatus <-
as.character(verification$bb_acceptedTaxonomicStatus)
verification$verificationKey <- as.character(verification$verificationKey)
verification$remarks <- as.character(verification$remarks)
verification$verifiedBy <- as.character(verification$verifiedBy)
verification$taxonKey <- as.numeric(verification$taxonKey)
verification$bb_key <- as.numeric(verification$bb_key)
verification$bb_acceptedKey <- as.numeric(verification$bb_acceptedKey)
verification$dateAdded <- as.Date(verification$dateAdded)
verification$outdated <- as.logical(verification$outdated)
assertthat::assert_that(
all(nchar(verification$datasetKey) == 36) &
isFALSE(any(grepl(pattern = ",", x = verification$datasetKey))),
msg = paste(
"Incorrect datesetKey:", verification$datasetKey,
"Is expected to be 36-character UUID."
)
)
assertthat::assert_that(verification %>%
dplyr::filter(is.na(.data$outdated)) %>%
nrow() == 0,
msg = "Only logicals (TRUE/FALSE) allowed in 'outdated' of verification."
)
# Allow multiple comma separated verification keys (character)
class(verification$verificationKey) <- "character"
# Allow remarks (remarks col empty means for R a column logicals)
class(verification$remarks) <- "character"
# Check for integrity synonym relations
assertthat::assert_that(
verification %>%
dplyr::filter((is.na(.data$bb_acceptedName) &
!is.na(.data$bb_acceptedKey)
) |
(!is.na(.data$bb_acceptedName) &
is.na(.data$bb_acceptedKey)
)) %>%
nrow() == 0,
msg = paste(
"bb_acceptedName and bb_acceptedKey",
"should be both NA or both present."
)
)
# Check that only synonyms and unmatched taxa are present in verification
taxonomic_status <-
verification %>%
dplyr::distinct(.data$bb_taxonomicStatus) %>%
dplyr::filter(!is.na(.data$bb_taxonomicStatus)) %>%
dplyr::pull()
not_allowed_taxonomicStatus <- c("ACCEPTED", "DOUBTFUL")
assertthat::assert_that(all(!taxonomic_status %in% not_allowed_taxonomicStatus),
msg = "Only synonyms and unmatched taxa allowed in verification."
)
message("DONE.", appendLF = TRUE)
if (!is.null(message_existence_verificationKey)) {
message(message_existence_verificationKey, appendLF = TRUE)
}
# Get order taxon keys
ordered_taxon_keys <-
taxa %>%
dplyr::select("taxonKey")
# Find taxa which don't need any verification and assign verificationKey
message("Assign verificationKey to taxa which don't need verification...",
appendLF = FALSE
)
not_to_verify_taxa <-
taxa %>%
dplyr::filter(!is.na(.data$bb_key) &
bb_taxonomicStatus %in% c("ACCEPTED", "DOUBTFUL")) %>%
dplyr::mutate(
verificationKey = as.character(bb_key)
)
message("DONE.", appendLF = TRUE)
# Go further with taxa which need verification
taxa_input <- taxa
taxa <-
taxa %>%
dplyr::anti_join(not_to_verify_taxa,
by = colnames(taxa)
)
message("Find new synonyms...", appendLF = FALSE)
# Find new synonyms (= new triplets (taxonKey, bb_key, bb_acceptedKey))
new_synonyms <-
taxa %>%
# Remove not synonyms
dplyr::filter(!is.na(.data$bb_taxonomicStatus) &
!.data$bb_taxonomicStatus %in% c("ACCEPTED", "DOUBTFUL")) %>%
dplyr::anti_join(verification,
by = c("taxonKey", "bb_key", "bb_acceptedKey")
) %>%
dplyr::mutate(
bb_acceptedKingdom = NA_character_,
bb_acceptedRank = NA_character_,
bb_acceptedTaxonomicStatus = NA_character_,
verificationKey = NA_character_,
remarks = NA_character_,
verifiedBy = NA_character_,
dateAdded = Sys.Date(),
outdated = FALSE
) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
message("DONE.", appendLF = TRUE)
# Find new taxa not matched to GBIF backbone
message("Find new unmatched taxa...", appendLF = FALSE)
unmatched_taxa <-
verification %>%
dplyr::filter(is.na(.data$bb_key)) %>%
dplyr::distinct(.data$taxonKey) %>%
dplyr::pull()
new_unmatched_taxa <-
taxa %>%
dplyr::filter(is.na(.data$bb_key)) %>%
dplyr::filter(!.data$taxonKey %in% unmatched_taxa) %>%
dplyr::mutate(
bb_acceptedKingdom = NA_character_,
bb_acceptedRank = NA_character_,
bb_acceptedTaxonomicStatus = NA_character_,
verificationKey = NA_character_,
remarks = NA_character_,
verifiedBy = NA_character_,
dateAdded = Sys.Date(),
outdated = FALSE
) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
message("DONE.", appendLF = TRUE)
# Create df of updated bb_scientificName
message("Update backbone scientific names...", appendLF = FALSE)
if (nrow(verification) > 0) {
updated_bb_scientificName <-
verification %>%
dplyr::filter(!is.na(.data$bb_scientificName)) %>%
dplyr::left_join(taxa,
by = c("taxonKey", "bb_key", "bb_acceptedKey")
) %>%
dplyr::rename(
"bb_scientificName" = "bb_scientificName.x",
"updated_bb_scientificName" = "bb_scientificName.y"
) %>%
dplyr::filter(.data$bb_scientificName != .data$updated_bb_scientificName) %>%
dplyr::select(
which(colnames(verification) %in% colnames(.)),
"updated_bb_scientificName",
dplyr::ends_with(".x")
) %>%
dplyr::rename_at(
dplyr::vars(dplyr::ends_with(".x")),
list(~ stringr::str_remove(., "\\.x"))
)
# Update bb_scientificName of verification
verification <-
verification %>%
dplyr::anti_join(updated_bb_scientificName,
by = colnames(verification)
) %>%
dplyr::bind_rows(
updated_bb_scientificName %>%
dplyr::mutate(bb_scientificName = updated_bb_scientificName) %>%
dplyr::select(-"updated_bb_scientificName")) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
# Version for info
updated_bb_scientificName_short <-
updated_bb_scientificName %>%
dplyr::select(
"taxonKey", "bb_key", "bb_acceptedKey",
"bb_scientificName", "updated_bb_scientificName"
)
}
else {
updated_bb_scientificName_short <- dplyr::tibble(
taxonKey = double(),
bb_key = double(),
bb_acceptedKey = double(),
bb_scientificName = character(),
updated_bb_scientificName = character()
)
}
# Retrieve present column names and original ones for later renaming
name_col_updated_bb_scientificName_short <-
names(updated_bb_scientificName_short)
name_col_updated_bb_scientificName_short_original <-
c(
verification_taxonKey,
verification_bb_key,
verification_bb_acceptedKey,
verification_bb_scientificName,
paste0("updated_", verification_bb_scientificName)
)
message("DONE.", appendLF = TRUE)
# Create df of updated acceptedName
message("Update backbone accepted names...", appendLF = FALSE)
if (nrow(verification) > 0) {
updated_bb_acceptedName <-
verification %>%
dplyr::filter(!is.na(.data$bb_acceptedName)) %>%
dplyr::left_join(taxa,
by = c("taxonKey", "bb_key", "bb_acceptedKey")
) %>%
dplyr::rename(
"bb_acceptedName" = "bb_acceptedName.x",
"updated_bb_acceptedName" = "bb_acceptedName.y"
) %>%
dplyr::filter(bb_acceptedName != updated_bb_acceptedName) %>%
dplyr::select(
which(colnames(verification) %in% colnames(.)),
"updated_bb_acceptedName",
dplyr::ends_with(".x")
) %>%
dplyr::rename_at(dplyr::vars(dplyr::ends_with(".x")),
list(~ stringr::str_remove(., "\\.x"))
)
# Update bb_acceptedName of verification
verification <-
verification %>%
dplyr::anti_join(updated_bb_acceptedName,
by = colnames(verification)
) %>%
dplyr::bind_rows(updated_bb_acceptedName %>%
dplyr::mutate(bb_acceptedName = updated_bb_acceptedName) %>%
dplyr::select(-"updated_bb_acceptedName")) %>%
dplyr::select(dplyr::one_of(name_col_verification),
name_col_verification_extra
)
# Version for info
updated_bb_acceptedName_short <-
updated_bb_acceptedName %>%
dplyr::select(
"taxonKey", "bb_key", "bb_acceptedKey",
"bb_acceptedName", "updated_bb_acceptedName"
)
}
else {
updated_bb_acceptedName_short <- dplyr::tibble(
taxonKey = double(),
bb_key = double(),
bb_acceptedKey = double(),
bb_acceptedName = character(),
updated_bb_acceptedName = character()
)
}
# Retrieve present column names and original ones for later renaming
name_col_updated_bb_acceptedName_short <-
names(updated_bb_acceptedName_short)
name_col_updated_bb_acceptedName_short_original <-
c(
verification_taxonKey,
verification_bb_key,
verification_bb_acceptedKey,
verification_bb_acceptedName,
paste0("updated_", verification_bb_acceptedName)
)
message("DONE.", appendLF = TRUE)
# Add new synonyms to verification
verification <-
verification %>%
dplyr::bind_rows(new_synonyms) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
# Add new unmatches to verification
verification <-
verification %>%
dplyr::bind_rows(new_unmatched_taxa) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
# Retrieve backbone information about taxa the synonyms point to
message("Retrieve backbone info about accepted taxa for synonyms...",
appendLF = FALSE
)
if (nrow(verification) > 0) {
accepted_keys <-
verification %>%
dplyr::distinct(bb_acceptedKey) %>%
dplyr::filter(!is.na(.data$bb_acceptedKey))
accepted_info <- purrr::pmap_dfr(
accepted_keys,
function(bb_acceptedKey) {
rgbif::name_usage(
key = bb_acceptedKey
)$data
}
) %>%
dplyr::select("key", "kingdom", "rank", "taxonomicStatus") %>%
dplyr::rename(
bb_acceptedKey = .data$key,
bb_acceptedKingdom = .data$kingdom,
bb_acceptedRank = .data$rank,
bb_acceptedTaxonomicStatus = .data$taxonomicStatus
)
# Update backbone info about accepted taxa in verification
verification <-
verification %>%
dplyr::select(-c(
"bb_acceptedKingdom",
"bb_acceptedRank",
"bb_acceptedTaxonomicStatus"
)) %>%
dplyr::left_join(accepted_info, by = "bb_acceptedKey") %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
# Add backbone info to new_synonys too
new_synonyms <-
new_synonyms %>%
dplyr::select(-c(
"bb_acceptedKingdom",
"bb_acceptedRank",
"bb_acceptedTaxonomicStatus"
)) %>%
dplyr::left_join(verification %>%
dplyr::select(
"taxonKey", "bb_key", "bb_acceptedKey",
"bb_acceptedKingdom", "bb_acceptedRank",
"bb_acceptedTaxonomicStatus"
),
by = c("taxonKey", "bb_key", "bb_acceptedKey")
) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
} else {
verification <-
verification %>%
dplyr::mutate(
bb_acceptedKey = double(),
bb_acceptedKingdom = character(),
bb_acceptedRank = character(),
bb_acceptedTaxonomicStatus = character()
) %>%
dplyr::select(dplyr::one_of(name_col_verification), name_col_verification_extra)
}
message("DONE.", appendLF = TRUE)
# Handle outdated taxa
message("Detect outdated data...", appendLF = FALSE)
# Set outdated = FALSE for taxa which are in use:
# some outdated taxa could be back in use
if (nrow(verification) > 0) {
not_outdated_taxa <-
verification %>%
dplyr::inner_join(
taxa %>%
dplyr::select("taxonKey", "bb_key", "bb_acceptedKey"),
by = c("taxonKey", "bb_key", "bb_acceptedKey")
) %>%
dplyr::mutate(outdated = FALSE) %>%
dplyr::mutate(remarks = stringr::str_remove(.data$remarks, "Outdated taxa."))
# Define the outdated taxa subset
outdated_taxa <-
verification %>%
dplyr::anti_join(taxa, by = c("taxonKey", "bb_key", "bb_acceptedKey"))
# Set outdated = TRUE to all outdated taxa
outdated_taxa <-
outdated_taxa %>%
dplyr::mutate(outdated = TRUE)
# Compose verification back together
verification <-
not_outdated_taxa %>%
dplyr::bind_rows(outdated_taxa) %>%
dplyr::select(dplyr::one_of(name_col_verification),
name_col_verification_extra
)
}
message("DONE.", appendLF = TRUE)
# Check verificationKey values against GBIF and GBIF Backbone
message("Check verification keys...", appendLF = FALSE)
verification_keys <- verification %>%
dplyr::filter(!is.na(.data$verificationKey)) %>%
dplyr::filter(nchar(.data$verificationKey) > 0) %>%
dplyr::pull(.data$verificationKey)
verification_keys <- paste(verification_keys, collapse = ",")
verification_keys <- unlist(stringr::str_split(verification_keys, ","))
check_verificationKey <- gbif_verify_keys(verification_keys)
if (is.null(check_verificationKey)) {
check_verificationKey <- dplyr::tibble(
key = double(),
is_taxonKey = logical(),
is_from_gbif_backbone = logical(),
is_synonym = logical()
)
}
message("DONE.", appendLF = TRUE)
# Find taxa duplicates
message("Find scientific names used in multiple taxa...", appendLF = FALSE)
if (nrow(verification > 0)) {
duplicates <-
verification %>%
dplyr::filter(!is.na(.data$bb_key) & !is.na(.data$bb_acceptedKey)) %>%
dplyr::group_by(bb_key, bb_acceptedKey) %>%
dplyr::count() %>%
dplyr::filter(.data$n > 1) %>%
dplyr::left_join((verification %>%
dplyr::select("bb_key",
"bb_acceptedKey",
"bb_scientificName")),
by = c("bb_key", "bb_acceptedKey")
) %>%
dplyr::select(
.data$bb_key,
.data$bb_acceptedKey,
.data$bb_scientificName,
.data$n
) %>%
dplyr::arrange(dplyr::desc(.data$n)) %>%
dplyr::ungroup()
} else {
duplicates <- dplyr::tibble(
bb_key = double(),
bb_acceptedKey = double(),
bb_scientificName = character(),
n = double()
)
}
# Retrieve present column names and original ones for later renaming
name_col_duplicates <-
names(duplicates)
name_col_duplicates_original <-
c(
verification_bb_key,
verification_bb_acceptedKey,
verification_bb_scientificName,
"n"
)
message("DONE.", appendLF = TRUE)
# Order verification by outdated and dateAdded
verification <-
verification %>%
dplyr::arrange(.data$outdated, dplyr::desc(.data$dateAdded))
# Add not outdated taxa from verification to not_to_verify_taxa
taxa <-
verification %>%
dplyr::filter(.data$outdated == FALSE) %>%
dplyr::select(name_col_taxa, "verificationKey") %>%
dplyr::left_join(taxa_input,
by = name_col_taxa
) %>%
dplyr::bind_rows(not_to_verify_taxa)
# set same order as in input df taxa
taxa <-
ordered_taxon_keys %>%
dplyr::left_join(taxa, by = "taxonKey")
# Split outdated_taxa in outdated_unmatched_taxa and outdated_synonyms
outdated_unmatched_taxa <-
outdated_taxa %>%
dplyr::filter(is.na(.data$bb_key)) %>%
dplyr::select(dplyr::one_of(name_col_verification),
name_col_verification_extra
)
outdated_synonyms <-
outdated_taxa %>%
dplyr::filter(!is.na(.data$bb_acceptedKey)) %>%
dplyr::select(dplyr::one_of(name_col_verification),
name_col_verification_extra
)
# Convert to original column names
taxa <-
taxa %>%
dplyr::rename_at(dplyr::vars(name_col_taxa), ~name_col_taxa_original)
verification <-
verification %>%
dplyr::rename_at(
dplyr::vars(name_col_verification), ~name_col_verification_original
)
new_synonyms <-
new_synonyms %>%
dplyr::rename_at(
dplyr::vars(name_col_verification), ~name_col_verification_original
)
new_unmatched_taxa <-
new_unmatched_taxa %>%
dplyr::rename_at(
dplyr::vars(name_col_verification), ~name_col_verification_original
)
outdated_unmatched_taxa <-
outdated_unmatched_taxa %>%
dplyr::rename_at(
dplyr::vars(name_col_verification), ~name_col_verification_original
)
outdated_synonyms <-
outdated_synonyms %>%
dplyr::rename_at(
dplyr::vars(name_col_verification), ~name_col_verification_original
)
updated_bb_scientificName_short <-
updated_bb_scientificName_short %>%
dplyr::rename_at(
dplyr::vars(name_col_updated_bb_scientificName_short),
~name_col_updated_bb_scientificName_short_original
)
updated_bb_acceptedName_short <-
updated_bb_acceptedName_short %>%
dplyr::rename_at(
dplyr::vars(name_col_updated_bb_acceptedName_short),
~name_col_updated_bb_acceptedName_short_original
)
duplicates <-
duplicates %>%
dplyr::rename_at(
dplyr::vars(name_col_duplicates), ~name_col_duplicates_original
)
return(list(
taxa = taxa,
verification = verification,
info = list(
new_synonyms = dplyr::as_tibble(new_synonyms),
new_unmatched_taxa = dplyr::as_tibble(new_unmatched_taxa),
outdated_unmatched_taxa = dplyr::as_tibble(outdated_unmatched_taxa),
outdated_synonyms = dplyr::as_tibble(outdated_synonyms),
updated_bb_scientificName = dplyr::as_tibble(updated_bb_scientificName_short),
updated_bb_acceptedName = dplyr::as_tibble(updated_bb_acceptedName_short),
duplicates = duplicates,
check_verificationKey = check_verificationKey
)
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.