R/cleanSpecies.R

Defines functions cleanSpecies

Documented in cleanSpecies

#' @name cleanSpecies
#'
#' @title Helper function to clean taxonomy.
#'
#' @description Corrects misspellings, replaces synonyms with their accepted
#' names and flags species that are not recognized.
#'
#' @details The cleaning is done in three steps, each on the unique non-`NA`
#' names produced by the previous one:
#' \enumerate{
#'   \item misspellings are resolved with `taxize::gnr_resolve()`;
#'   \item every matched name for which `taxize::synonyms()` reports an
#'     `acc_name` is replaced by that accepted name;
#'   \item names that `taxize::tax_name()` does not return at species rank
#'     are set to `NA`.
#' }
#' The remote look-ups are skipped when there is nothing left to look up.
#'
#' @param species A vector of species to be checked. It is coerced to
#'   character; `NA` entries are not sent to the taxonomic services.
#' @param verbose logical. Print stuff? Default set to `TRUE`.
#' @param db Which db should be used 'itis' is the default
#'
#' @return data.frame of original and corrected species, with one row per
#'   element of `species` (rows are ordered by `merge()`, not by input order)
#'   and the columns:
#'   \describe{
#'     \item{species}{the name as supplied;}
#'     \item{accepted_synonyms}{the matched name, replaced by its accepted
#'       name when it was identified as a synonym;}
#'     \item{matched_names}{the name matched by the misspelling resolver;}
#'     \item{filtred_names}{`accepted_synonyms`, or `NA` when the name was not
#'       recognized as a species.}
#'   }
#'
#' @examples
#' \donttest{
#' # not run
#' species <- c('Osmia rufa', 'Osmia bicornis', 'Osmia ruffa',
#'            'Osmia wikifluqie', 'watermelon pie', 'Osmia sp.')
#' cleanSpecies(species)
#' }
#' @export
cleanSpecies <- function(species, verbose = TRUE, db = "itis") {
    species <- as.character(species)

    # Nothing to resolve (empty or all-NA input): return the documented shape
    # without calling any remote service.
    species2 <- unique(species[!is.na(species)])
    if (length(species2) == 0L) {
        na_col <- rep(NA_character_, length(species))
        return(data.frame(species = species, accepted_synonyms = na_col,
            matched_names = na_col, filtred_names = na_col,
            stringsAsFactors = FALSE))
    }

    # --- 1. misspellings -----------------------------------------------------
    resolved_cols <- c("user_supplied_name", "matched_name2")
    resolved <- taxize::gnr_resolve(species2, best_match_only = TRUE,
        canonical = TRUE)
    # NOTE(review): the resolver may return a table without these columns when
    # nothing matches; fall back to an empty match table instead of failing on
    # the column subset below.
    if (!all(resolved_cols %in% names(resolved))) {
        resolved <- data.frame(user_supplied_name = character(0),
            matched_name2 = character(0), stringsAsFactors = FALSE)
    }
    dat <- merge(data.frame(species2, stringsAsFactors = FALSE),
        as.data.frame(resolved[, resolved_cols], stringsAsFactors = FALSE),
        by.x = "species2", by.y = "user_supplied_name", all.x = TRUE)

    # --- 2. synonyms ---------------------------------------------------------
    # Work on unique, non-NA matched names only to save look-ups.
    species3 <- unique(dat$matched_name2)
    species3 <- species3[!is.na(species3)]
    synonym_names <- species3
    if (length(species3) > 0L) {
        syn <- taxize::synonyms(species3, db = db)
        # Accepted name reported for ONE taxon, or NA when its result carries
        # no `acc_name` column (not a data.frame, or no accepted name given).
        first_accepted <- function(x) {
            if (!is.data.frame(x) || !"acc_name" %in% names(x)) {
                return(NA_character_)
            }
            acc <- as.character(x[["acc_name"]])
            acc <- acc[!is.na(acc) & nzchar(acc)]
            if (length(acc) > 0L) acc[1L] else NA_character_
        }
        # Results are used positionally: element i is taken to belong to
        # species3[i]. Each synonym gets its OWN accepted name (previously the
        # first accepted name found was written to every synonym).
        accepted <- vapply(syn, first_accepted, character(1),
            USE.NAMES = FALSE)
        is_synonym <- !is.na(accepted)
        synonym_names[is_synonym] <- accepted[is_synonym]
    }
    key <- data.frame(species3, synonym_names, stringsAsFactors = FALSE)
    dat <- merge(dat, key, by.x = "matched_name2", by.y = "species3",
        all.x = TRUE)

    # --- 3. clean non accepted species ---------------------------------------
    species4 <- unique(dat$synonym_names)
    species4 <- species4[!is.na(species4)]
    final_names <- species4
    if (length(species4) > 0L) {
        out2 <- taxize::tax_name(species4, get = "species", db = db,
            pref = "itis", verbose = verbose, messages = verbose)
        # Keep only names that come back as a species; the rest are flagged NA.
        final_names[!species4 %in% unique(out2$species)] <- NA
    }
    key2 <- data.frame(species4, final_names, stringsAsFactors = FALSE)
    dat <- merge(dat, key2, by.x = "synonym_names", by.y = "species4",
        all.x = TRUE)

    # --- output --------------------------------------------------------------
    # Re-expand to one row per supplied name (duplicates and NAs included).
    dat <- merge(data.frame(species, stringsAsFactors = FALSE), dat,
        by.x = "species", by.y = "species2", all.x = TRUE)
    # Select by name so the renaming below cannot silently mislabel columns.
    dat <- dat[, c("species", "synonym_names", "matched_name2", "final_names")]
    colnames(dat) <- c("species", "accepted_synonyms", "matched_names",
        "filtred_names")
    dat
}
metadevpro/traitbaser documentation built on April 20, 2020, 10:52 p.m.