Nothing
# This function was written by James Dorey to harmonise the names of bees using the Ascher-Orr-Chesshire
# bee taxonomies.
# The function first merges names based on scientificName, then merging the bdc cleaned_name and
# scientificNameAuthorship and matching those, followed by matching to canoncial with flags, then canonical.
# In all of these cases, names that are ambiguous at that level are removed so that only confident
# matches are maintaned.
# This function was written between the 18th and 20th of May 2022. For questions, please email James
# at jbdorey[at]me.com
#' Harmonise taxonomy of bee occurrence data
#'
#' Uses the Discover Life taxonomy to harmonise bee occurrences and flag those that do not match
#' the checklist. [BeeBDC::harmoniseR()] prefers to use the names_clean columns that is generated
#' by [bdc::bdc_clean_names()]. While this is not required, you may find better results by running
#' that function on your dataset first.
#' This function could be hijacked to service other taxa if a user matched the format of the
#' [BeeBDC::beesTaxonomy()] file.
#'
#' @param path A directory as character. The path to a folder that the output can be saved.
#' @param taxonomy A data frame or tibble. The bee taxonomy to use.
#' Default = [BeeBDC::beesTaxonomy()].
#' @param data A data frame or tibble. Occurrence records as input.
#' @param speciesColumn Character. The name of the column containing species names. Default = "scientificName".
#' @param rm_names_clean Logical. If TRUE then the names_clean column will be removed at the end of
#' this function to help reduce confusion about this column later. Default = TRUE
#' @param checkVerbatim Logical. If TRUE then the verbatimScientificName will be checked as well
#' for species matches. This matching will ONLY be done after harmoniseR has failed for the other
#' name columns. NOTE: this column is *not* first run through `bdc::bdc_clean_names`. Default = FALSE
#' @param stepSize Numeric. The number of occurrences to process in each chunk. Default = 1000000.
#' @param mc.cores Numeric. If > 1, the function will run in parallel
#' using mclapply using the number of cores specified. If = 1 then it will be run using a serial
#' loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#' Default = 1.
#'
#' @return The occurrences are returned with update taxonomy columns, including: scientificName,
#' species, family, subfamily, genus, subgenus, specificEpithet, infraspecificEpithet, and
#' scientificNameAuthorship. A new column, .invalidName, is also added and is FALSE when the occurrence's
#' name did not match the supplied taxonomy.
#'
#' @importFrom dplyr %>%
#'
#' @seealso [BeeBDC::taxadbToBeeBDC()] to download any taxonomy (of any taxa or of bees) and
#' [BeeBDC::beesTaxonomy()] for the bee taxonomy download.
#'
#' @export
#'
#' @examples
#' # load in the test dataset
#' system.file("extdata", "testTaxonomy.rda", package="BeeBDC") |> load()
#'
#' beesRaw_out <- BeeBDC::harmoniseR(
#' #The path to a folder that the output can be saved
#' path = tempdir(),
#' # The formatted taxonomy file
#' taxonomy = testTaxonomy,
#' data = BeeBDC::beesFlagged,
#' speciesColumn = "scientificName")
#' table(beesRaw_out$.invalidName, useNA = "always")
harmoniseR <- function(
data = NULL,
path = NULL, #The path to a folder that the output can be saved
taxonomy = BeeBDC::beesTaxonomy(), # The formatted taxonomy file
speciesColumn = "scientificName",
rm_names_clean = TRUE,
checkVerbatim = FALSE,
stepSize = 1000000,
mc.cores = 1
) {
# locally bind variables to the function
. <- id <- validName<-canonical<-canonical_withFlags<-family<-subfamily<-genus<-subgenus<-
species<-infraspecies<-authorship<-taxonomic_status<-flags<-accid<-validName_valid<-
family_valid<-subfamily_valid<-canonical_withFlags_valid<-genus_valid<-subgenus_valid<-
species_valid<-infraspecies_valid<-authorship_valid<-database_id<-names_clean<-
scientificNameAuthorship<-taxonRank<-authorFound<-SciNameAuthorSimple<-
authorSimple<-united_SciName<-verbatimScientificName <- scientificName <- BeeBDC_order <- NULL
# Load required packages
requireNamespace("rlang")
requireNamespace("dplyr")
# Record start time
startTime <- Sys.time()
#### 0.0 Prep ####
##### 0.1 Errors ####
###### a. FATAL errors ####
if(is.null(data)){
stop(" - Please provide an argument for data. I'm a program not a magician.")
}
if(is.null(taxonomy)){
stop(" - Please provide an argument for taxonomy I'm a program not a magician.")
}
if(is.null(path)){
stop(" - Please provide an argument for path I'm a program not a magician.")
}
if(!"verbatimScientificName" %in% colnames(data) & checkVerbatim == TRUE){
stop(paste0(" - If 'checkVerbatim = TRUE', then the verbatimScientificName column must be
present in the data."))
}
#### 1.0 _match columns ####
# Make a synonym index list
writeLines(paste(" - Formatting taxonomy for matching..."))
# save the original column names
OG_colnames <- unique(c("database_id", colnames(data)))
# Save the original number of rows
OG_rowNum <- nrow(data)
##### 1.1 Prepare columns ####
# To make the function more general, allow some column changing internally.
###### a. rename to scientificName ####
# Temporarily rename the speciesColumn to "scientificName" within the function
data <- data %>%
dplyr::rename("scientificName" = tidyselect::any_of(speciesColumn))
###### b. temp names_clean ####
# IF the names_clean column does not exist, temporarily add it to the dataset using the
# scientificName column's data.
if(!"names_clean" %in% colnames(data)){
data <- data %>%
dplyr::mutate(names_clean = scientificName)
message(paste0("The names_clean column was not found and will be temporarily copied from",
" scientificName"))
}
###### c. database_id ####
# If the database_id column isn't in the dataset, then add it for internal use
if(!"database_id" %in% colnames(data)){
data <- data %>%
dplyr::mutate(database_id = paste0("BeeBDC_TempCode_", dplyr::row_number()), .before = 1)
message("The database_idcolumn was not found, making this column with 'BeeBDC_TempCode_'...")
}
###### d. scientificNameAuthorship ####
# If there is no scientificNameAuthorship, make all NA
if(!"scientificNameAuthorship" %in% colnames(data)){
data <- data %>%
dplyr::mutate(scientificNameAuthorship = NA_character_)
message("The scientificNameAuthorship column was not found, making this column full of NAs.")
}
###### e. taxonRank ####
# If there is no taxonRank, make all NA
if(!"taxonRank" %in% colnames(data)){
data <- data %>%
dplyr::mutate(taxonRank = NA_character_)
message("The taxonRank column was not found, making this column full of NAs.")
}
###### f. species ####
# If there is no species, make all NA
if(!"species" %in% colnames(data)){
data <- data %>%
dplyr::mutate(species = scientificName)
message("The species column was not found, filling this column with scientificName.")
}
# Remove non-ambiguous tags
taxonomy <- taxonomy %>%
dplyr::mutate(flags = flags %>%
stringr::str_remove_all("non-ambiguous canonical| non-ambiguous can_wFlags"))
# Add a new column which has the canonical names matched to the synonyms
taxonomy <- taxonomy %>%
dplyr::left_join(x = .,
# left join ONLY the validName, canonical, and canonical_withFlags
y = dplyr::select(taxonomy,
tidyselect::any_of(
c("id", "validName", "canonical", "canonical_withFlags",
"family", "subfamily", "genus", "subgenus", "species",
"infraspecies", "authorship"))),
by = c("accid" = "id"), suffix = c("", "_valid"),
multiple = "all")
# Now, also duplicate the accepted names into the ._matched columns
AccMatched <- taxonomy %>%
# select only the ACCEPTED NAMES
dplyr::filter(taxonomic_status == "accepted") %>%
# duplicate the valid columns into the matched column locations
dplyr::mutate(validName_valid = validName,
canonical_valid = canonical,
canonical_withFlags_valid = canonical_withFlags,
family_valid = family,
subfamily_valid = subfamily,
genus_valid = genus,
subgenus_valid = subgenus,
species_valid = species,
infraspecies_valid = infraspecies,
authorship_valid = authorship)
# Merge these datasets
taxonomy <- taxonomy %>%
# First filter for the reverse of above - SYNONYM NAMES
dplyr::filter(taxonomic_status == "synonym") %>%
# combine
dplyr::bind_rows(AccMatched)
rm(AccMatched)
#### 2.0 Harmonise data ####
writeLines(paste("\n",
" - Harmonise the occurrence data with unambiguous names...", sep = ""))
# Create the parallel-able function
unAmbiguousFunction <- function(data){
##### 2.1 Valid Name ####
###### a. prep synonyms ####
# Filter out the AMBIGUOUS validNames prior to matching
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous validNames
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"ambiguous validName"))
###### b. assign names ####
# Clean up some illegal characters
data$scientificName <- data$scientificName %>%
stringr::str_replace(pattern = "^\"", replacement = "") %>%
stringr::str_replace(pattern = "\"$", replacement = "")
# Match names first with the validName column
occs_21 <- data %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)),
# Match scientific name with the valid synonym name
by = c("scientificName" = "validName"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
# # Add a column to express the name-match quality - "high" IF there is a match at this point
# dplyr::mutate(nameQuality = dplyr::if_else(stats::complete.cases(validName_valid),
# "high", "NA")) 3,703
###### c. return Occs ####
# Return the matched data
occs_21 <- occs_21 %>%
dplyr::filter(stats::complete.cases(validName_valid)) # 1,927
##### 2.2 validName_comb ####
# Now we will try and match the valid name by combining the names_clean and scientificNameAuthorship columns
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
## SAME as 2.1 ##
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous validNames
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"ambiguous validName"))
###### b. assign names ####
# Match names first with the validName column
occs_22 <- data %>%
# remove already-matched names
dplyr::filter(!database_id %in% occs_21$database_id) %>%
# Make a new column by combining names_clean and scientificNameAuthorship
tidyr::unite(col = "united_SciName", names_clean, scientificNameAuthorship, sep = " ",
na.rm = TRUE)
# Match names first with the validName column
occs_22 <- occs_22 %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)),
# Match scientific name with the valid synonym name
by = c("united_SciName" = "validName"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_22 %>%
dplyr::filter(stats::complete.cases(validName_valid) & validName_valid != "NA") %>%
# Bind the previous rows
dplyr::bind_rows(occs_21) # 2,678
# Remove this spent files
rm(occs_21, occs_22)
##### 2.3 canonical_wFlags ####
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous validNames and can_wFlags
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
paste("ambiguous validName",
"ambiguous can_wFlags",
sep = "|"))) %>%
# remove the rows where the canonical and canonical_withFlags match
# ONLY matches those with added canonicals flags
dplyr::filter(!canonical == canonical_withFlags)
###### b. assign names ####
# Match names first with the validName column
occs_23 <- data %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)),
# Match scientific name with the valid synonym name
by = c("species" = "canonical_withFlags"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_23 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningOccurrences)
# Remove this spent file
rm(occs_23)
##### 2.4 canonical ####
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous names
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
paste("ambiguous validName",
"ambiguous can_wFlags",
"ambiguous canonical",
sep = "|")))
###### b. assign names ####
# Match names first with the validName column
occs_24 <- data %>%
# Keep the unmatched names
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical,
validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)),
# Match scientific name with the valid synonym name
by = c("names_clean" = "canonical"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_24 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningOccurrences) %>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
# Remove spent file
rm(occs_24)
##### 2.5 sciName_comb ####
# Now we will try and match the valid name by combining the scientificName and scientificNameAuthorship columns
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
## SAME as 2.1 ##
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous validNames
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"ambiguous validName"))
###### b. assign names ####
# Match names first with the validName column
occs_25 <- data %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
# Make a new column by combining names_clean and scientificNameAuthorship
tidyr::unite(col = "united_SciName", names_clean, scientificNameAuthorship, sep = " ",
na.rm = TRUE)
# Match names first with the validName column
occs_25 <- occs_25 %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)),
# Match scientific name with the valid synonym name
by = c("united_SciName" = "validName"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_25 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningOccurrences) %>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
# Remove spent file
rm(occs_25)
##### 2.6 No subgenus validName ####
# Match scientificName with validName; remove subgenus from both
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous names
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
paste("ambiguous validName",
"ambiguous canonical",
sep = "|")))
###### b. assign names ####
# Match names first with the validName column
occs_26 <- data %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
dplyr::mutate(scientificNameMatch = scientificName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish())
# Match names first with the validName column
occs_26 <- occs_26 %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)) %>%
dplyr::mutate(validNameMatch = validName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish()),
# Match scientific name with the valid synonym name
by = c("scientificNameMatch" = "validNameMatch"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_26 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningOccurrences) %>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
# Remove spent file
rm(occs_26, currenttaxonomy)
##### 2.7 No subgenus canonical ####
# Match scientificName with canonical; remove subgenus from both
###### a. prep synonyms ####
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
# For those that did not match, attempt to match them with the Canonical with flags column...
# Filter out the AMBIGUOUS validNames prior to matching
currenttaxonomy <- taxonomy %>%
# REMOVE ambiguous names
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
paste("ambiguous validName",
"ambiguous canonical",
sep = "|")))
###### b. assign names ####
# Match names first with the canonical column
occs_27 <- data %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
dplyr::mutate(scientificNameMatch = scientificName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish())
# Match names first with the canonical column
occs_27 <- occs_27 %>%
dplyr::left_join(currenttaxonomy %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)) %>%
dplyr::mutate(canonicalMatch = canonical %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish()),
# Match scientific name with the canonical synonym name
by = c("scientificNameMatch" = "canonicalMatch"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data
runningOccurrences <- occs_27 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningOccurrences) %>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
# Remove spent file
rm(occs_27, currenttaxonomy)
return(runningOccurrences)
} # END unAmbiguousFunction
# Run the function
runningOccurrences <- data %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., unAmbiguousFunction,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
#### 3.0 Ambiguous names ####
writeLines(paste("\n",
" - Attempting to harmonise the occurrence data with ambiguous names...", sep = ""))
ambiguousFunction <- function(data){
##### 3.1 Prepare datasets ####
###### a. prep synonyms ####
# Synonym list of ambiguous names
# Filter TO the AMBIGUOUS validNames prior to matching
ambiguousSynonyms <- taxonomy %>%
# Keep only ambiguous validNames
dplyr::filter(stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"ambiguous")) %>%
# Remove non-ambiguous matches
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"non-")) %>%
# Remove ambiguous validName matches because these are ambiguous even with authorities.
# Perhaps in future these can be matched by geography.
dplyr::filter(!stringr::str_detect(
# Replace NA in flags with "" to allow matching
tidyr::replace_na(flags, ""),
"ambiguous validName")) %>%
# Remove those without authorship
dplyr::filter(!is.na(authorship)) %>%
# # Change the authorship to be easier to match by removing capitals and punctuation
dplyr::mutate(authorSimple = stringr::str_remove_all(authorship,
pattern = "[:punct:]") %>% tolower())
###### b. author in sciName ####
if(!all(is.na(data$scientificNameAuthorship))){
# Create a list of scientificNameAuthorships that can be found in scientificName, where the former is empty
ambiguousAuthorFound <- data %>%
# check only the data without authorship
dplyr::filter(is.na(scientificNameAuthorship)) %>%
# Select only UNDER genus-level IDs
dplyr::filter(taxonRank %in% c("Especie", "forma", "Infrasubspecies", "Race",
"species", "Species", "SPECIES", "subsp.", "subspecies",
"Subspecies", "SUBSPECIES", "syn", "var.", "variety",
"Variety", "VARIETY", NA, "NA")) %>%
# Make a new column with the authorship extracted from scientificName
dplyr::mutate(authorFound = stringr::str_extract(
# Use a simplified scientificName string
string = stringr::str_remove_all(scientificName,
pattern = "[:punct:]") %>% tolower(),
pattern = paste(ambiguousSynonyms$authorSimple, collapse = "|"))) %>%
# Keep only matched names
dplyr::filter(stats::complete.cases(authorFound)) %>%
# Return only the database_id and authorFound for merging...
dplyr::select(tidyselect::all_of(c("database_id", "authorFound")))
# Add the author to those data that were lacking
data <- data %>%
# Add authorFound to original dataset
dplyr::left_join(ambiguousAuthorFound, by = "database_id",multiple = "all") %>%
# If scientificNameAuthorship is empty, use authorFound from ambiguousAuthorFound
dplyr::mutate(scientificNameAuthorship =
dplyr::if_else(is.na(scientificNameAuthorship),
# If missing replace the na with the authorFound
authorFound,
# IF already complete, keep the original
scientificNameAuthorship))
# Remove used data
rm(ambiguousAuthorFound)}
###### c. ambiguous data ####
# Filter occurrence dataset to those with ambiguous names AND authorship values
data_amb <- data %>%
# Keep those with authorship recorded
dplyr::filter(stats::complete.cases(scientificNameAuthorship)) %>%
# Keep those that are in the ambiguous names list
dplyr::filter(scientificName %in% ambiguousSynonyms$validName |
scientificName %in% ambiguousSynonyms$canonical_withFlags |
scientificName %in% ambiguousSynonyms$canonical) %>%
# Simplify scientificNameAuthorship to make easier matches
dplyr::mutate(SciNameAuthorSimple = stringr::str_remove_all(scientificNameAuthorship,
pattern = "[:punct:]") %>% tolower())
##### 3.2 Valid Name ####
###### a. assign names ####
# Match names first with the validName column
runningAmb_occs <- data_amb %>%
# Select only rows with both scientificName and SciNameAuthorSimple
dplyr::filter(stats::complete.cases(scientificName) & stats::complete.cases(SciNameAuthorSimple)) %>%
# Add taxonomy information to the occurrence data.
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid, authorSimple)),
# Match scientific name with the valid synonym name
by = c("scientificName" = "validName",
"SciNameAuthorSimple" = "authorSimple"),
suffix = c("", "_harmon"),
multiple = "all",relationship = "many-to-many")
###### b. return Occs ####
# Return the matched data_amb
runningAmb_occs <- runningAmb_occs %>%
dplyr::filter(stats::complete.cases(validName_valid)) # 1,927
##### 3.3 validName_comb ####
# Now we will try and match the valid name by combining the names_clean and scientificNameAuthorship columns
###### a. assign names ####
# Match names first with the validName column
occs_33 <- data_amb %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
# Make a new column by combining names_clean and scientificNameAuthorship
tidyr::unite(col = "united_SciName", names_clean, scientificNameAuthorship, sep = " ",
na.rm = TRUE)
# Match names first with the validName column
occs_33 <- occs_33 %>%
# Select only rows with both united_SciName and SciNameAuthorSimple
dplyr::filter(stats::complete.cases(united_SciName) & stats::complete.cases(SciNameAuthorSimple)) %>%
# Add taxonomy information to the occurrence data.
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid, authorSimple)),
# Match scientific name with the valid synonym name
by = c("united_SciName" = "validName",
"SciNameAuthorSimple" = "authorSimple"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### b. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_33 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs) # 2,678
# Remove this spent files
rm(occs_33)
##### 3.4 canonical_wFlags ####
###### a. prep datasets ####
# Synonym list of ambiguous names
# Filter TO the AMBIGUOUS validNames prior to matching
syns_34 <- ambiguousSynonyms %>%
# remove the rows where the canonical and canonical_withFlags match
dplyr::filter(!canonical == canonical_withFlags)
###### b. assign names ####
# Match names first with the validName column
occs_34 <- data_amb %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
# Select only rows with both species and SciNameAuthorSimple
dplyr::filter(stats::complete.cases(species) & stats::complete.cases(SciNameAuthorSimple)) %>%
# Add taxonomy information to the occurrence data.
dplyr::left_join(syns_34 %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid, authorSimple)),
# Match scientific name with the valid synonym name
by = c("species" = "canonical_withFlags",
"SciNameAuthorSimple" = "authorSimple"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_34 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs)
# Remove this spent file
rm(occs_34, syns_34)
##### 3.5 canonical ####
###### b. assign names ####
# Match names first with the validName column
occs_35 <- data_amb %>%
# Keep the unmatched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
# Select only rows with both names_clean and SciNameAuthorSimple
dplyr::filter(stats::complete.cases(names_clean) & stats::complete.cases(SciNameAuthorSimple)) %>%
# Add taxonomy information to the occurrence data.
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid, authorSimple)),
# Match scientific name with the valid synonym name
by = c("names_clean" = "canonical",
"SciNameAuthorSimple" = "authorSimple"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### c. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_35 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs)%>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
# Remove spent file
rm(occs_35)
##### 3.6 validName_comb ####
# Now we will try and match the valid name by combining the names_clean and scientificNameAuthorship columns
###### a. assign names ####
# Match names first with the validName column
occs_36 <- data_amb %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
# Make a new column by combining names_clean and scientificNameAuthorship
tidyr::unite(col = "united_SciName", names_clean, scientificNameAuthorship, sep = " ",
na.rm = TRUE)
# Match names first with the validName column
occs_36 <- occs_36 %>%
# Select only rows with both united_SciName and SciNameAuthorSimple
dplyr::filter(stats::complete.cases(united_SciName) & stats::complete.cases(SciNameAuthorSimple)) %>%
# Add taxonomy information to the occurrence data.
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical, validName_valid,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid, authorSimple)),
# Match scientific name with the valid synonym name
by = c("united_SciName" = "validName",
"SciNameAuthorSimple" = "authorSimple"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### b. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_36 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs) # 2,678
# Remove this spent files
rm(occs_36)
##### 3.7 No subgenus validName ####
# Match scientificName with validName; remove subgenus from both
###### a. assign names ####
# Match names first with the validName column
occs_37 <- data_amb %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
dplyr::mutate(scientificNameMatch = scientificName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish())
occs_37 <- occs_37 %>%
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical,
validName_valid, family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)) %>%
dplyr::mutate(validNameMatch = validName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish()),
# Match scientific name with the valid synonym name
by = c("scientificNameMatch" = "validNameMatch"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### b. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_37 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs)
# Remove this spent files
rm(occs_37)
##### 3.8 No subgenus canonical ####
# Match scientificName with canonical; remove subgenus from both
###### a. assign names ####
# Match names first with the canonical column
occs_38 <- data_amb %>%
# remove already-matched names
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
dplyr::mutate(scientificNameMatch = scientificName %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish())
occs_38 <- occs_38 %>%
dplyr::left_join(ambiguousSynonyms %>%
dplyr::select(c(id, accid, validName, canonical_withFlags, canonical,
validName_valid, family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)) %>%
dplyr::mutate(canonicalMatch = canonical %>%
# Replace subgenus with nothing
stringr::str_replace("\\([A-Za-z]+\\)", "") %>%
stringr::str_squish()),
# Match scientific name with the valid synonym name
by = c("scientificNameMatch" = "canonicalMatch"),
suffix = c("", "_harmon"),
multiple = "all", relationship = "many-to-many")
###### b. return Occs ####
# Return the matched data_amb
runningAmb_occs <- occs_38 %>%
dplyr::filter(stats::complete.cases(validName_valid)) %>%
# Bind the previous rows
dplyr::bind_rows(runningAmb_occs)
# Remove this spent files
rm(occs_38)
return(runningAmb_occs)
} # END ambiguousFunction
# Run the function
runningAmb_occs <- data %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., ambiguousFunction,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
##### 3.6 Combine 2.x & 3.x ####
# Merge the results from 2.x and 3.x
runningOccurrences <- runningOccurrences %>%
# Remove the ambiguous data from the prior-matched data
dplyr::filter(!database_id %in% runningAmb_occs$database_id) %>%
# Add in the ambiguous data again.
dplyr::bind_rows(runningAmb_occs)
# Remove the spent runningAmb_occs data
rm(runningAmb_occs)
#### 4.0 verbatimScientificName ####
if(checkVerbatim == TRUE){
writeLines(paste0("checkVerbatim = TRUE. Checking the verbatimScientificName column..."))
###### 4.1 failedMatches ####
# Find the data that did not match
failedMatches <- data %>%
# Remove the matched names from the OG dataset
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
# Move the verbatimScientificName to scientificName
dplyr::mutate(scientificName = verbatimScientificName)
#### 4.2 Run unAmbiguous names ####
# Run the function
runningOccurrences_verb <- failedMatches %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., unAmbiguousFunction,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
#### 4.3 Run ambiguous names ####
# Run the function
runningAmb_occs_verb <- failedMatches %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., ambiguousFunction,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
#### 4.4 Combine 4.2-.3 ####
runningOccurrences_verb <- runningOccurrences_verb %>%
# Remove the ambiguous data from the prior-matched data
dplyr::filter(!database_id %in% runningAmb_occs_verb$database_id) %>%
# Add in the ambiguous data again.
dplyr::bind_rows(runningAmb_occs_verb)
#### 4.5 Combine 3.6 & 4.4 ####
# Merge the results from 2.x and 3.x
runningOccurrences <- runningOccurrences %>%
# Remove the ambiguous data from the prior-matched data
dplyr::filter(!database_id %in% runningOccurrences_verb$database_id) %>%
# Add in the ambiguous data again.
dplyr::bind_rows(runningOccurrences_verb)
# Remove the spent runningAmb_occs data
rm(runningOccurrences_verb, runningAmb_occs_verb)
}
#### 5.0 Merge ####
writeLines(" - Formatting merged datasets...")
# merge datasets
runningOccurrences <- runningOccurrences %>%
# Put the scientific name into a new column called verbatimScientificName
dplyr::mutate(verbatimScientificName = scientificName) %>%
# select the columns we want to keep
dplyr::select( c(tidyselect::any_of(OG_colnames), validName_valid,
verbatimScientificName,
family_valid, subfamily_valid,
canonical_withFlags_valid, genus_valid, subgenus_valid,
species_valid, infraspecies_valid, authorship_valid)) %>%
# rename validName_valid to scientificName and place it where it used to sit.
dplyr::mutate(scientificName = validName_valid, .after = database_id) %>%
# Add in the other taxonomic data
dplyr::mutate(species = canonical_withFlags_valid,
family = family_valid,
subfamily = subfamily_valid,
genus = genus_valid,
subgenus = subgenus_valid,
specificEpithet = species_valid,
infraspecificEpithet = infraspecies_valid,
scientificNameAuthorship = authorship_valid,
.after = scientificName) %>%
# Remove extra columns
dplyr::select(!c(canonical_withFlags_valid, family_valid, subfamily_valid, genus_valid,
subgenus_valid, species_valid, infraspecies_valid, authorship_valid,
validName_valid)) %>%
# Add the .invalidName columns as TRUE (not flagged)
dplyr::mutate(.invalidName = TRUE)
##### 5.1 User output ####
nMatchedRows <- nrow(runningOccurrences)
nUnmatchedRows <- nrow(data) - nrow(runningOccurrences)
###### a. failedMatches ####
# Find the data that did not match
failedMatches <- data %>%
# Remove the matched names from the OG dataset
dplyr::filter(!database_id %in% runningOccurrences$database_id) %>%
# Add the .invalidName columns as TRUE (not flagged)
dplyr::mutate(.invalidName = FALSE)
# Remove this spent file
rm(data)
###### b. Add column ####
runningOccurrences <- runningOccurrences %>%
# Bind the failed matches
dplyr::bind_rows(failedMatches) %>%
# Make sure no duplicates have snuck in
dplyr::distinct(database_id, .keep_all = TRUE)
###### c. return columns states ####
# Return the speciesColumn name to it's original state
names(runningOccurrences)[names(runningOccurrences) == "scientificName"] <- speciesColumn
if(rm_names_clean == TRUE){
message("Removing the names_clean column...")
runningOccurrences <- runningOccurrences %>%
dplyr::select(!tidyselect::any_of("names_clean"))
}
# Cut down the failed list...
# failedMatches <- failedMatches %>%
# dplyr::select(tidyselect::any_of("taxonRank")) %>%
# dplyr::filter(!taxonRank %in% c("Especie", "forma", "Infrasubspecies", "Race",
# "species", "Species", "SPECIES", "subsp.", "subspecies",
# "Subspecies", "SUBSPECIES", "syn", "var.", "variety",
# "Variety", "VARIETY"))
###### d. output ####
writeLines(paste(
" - We matched valid names to ",
format(sum(runningOccurrences$.invalidName == TRUE), big.mark = ","), " of ",
format(OG_rowNum, big.mark = ","), " occurrence records. This leaves a total of ",
format(sum(runningOccurrences$.invalidName == FALSE), big.mark = ","), " unmatched occurrence records.",
# " Of the unmatched records, approximately ",
# format( nrow(failedMatches), big.mark = ","),
# " are only identified to genus rank or higher.",
sep = ""))
writeLines(paste("\nharmoniseR:"))
message(paste(format(sum(runningOccurrences$.invalidName == FALSE), big.mark = ",")))
writeLines(paste(
"records were flagged.\nThe column, '.invalidName' was added to the database.\n"))
message(paste0(
" - We updated the following columns: ", speciesColumn,", species, family, subfamily, genus, subgenus, ",
"specificEpithet, infraspecificEpithet, and scientificNameAuthorship. ",
"The previous ",speciesColumn," column was converted to verbatimScientificName"
))
# End time message
endTime <- Sys.time()
message(paste(
" - Completed in ",
round(difftime(endTime, startTime), digits = 2 ),
" ",
units(round(endTime - startTime, digits = 2)),
sep = ""))
# Return this file
return(runningOccurrences)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.