# Load packages needed for this script library(tidyverse) ; library(readxl) ; library(taxize) ; library(rgbif) ; library(purrr) # List all the raw data files in your local directory file_list <- dir(path="path/to/your/data", pattern='*.xlsx') # makes list of the files file_listp <- paste0("path/to/your/data/", file_list) # adds path to file names # if you would like to use the example data in this package, use the following code # file_listp <- system.file("extdata", "Japan_taxa.xlsx", package = "insectcleanr")
# apply the separate_taxonomy function over the list of dataframes tax_list <- lapply(file_listp, separate_taxonomy) # put all taxonomy dataframes into one large dataframe tax_df <- tax_list %>% purrr::reduce(full_join) %>% mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>% dplyr::rename(taxonomic_authority = authority) %>% dplyr::arrange(genus_species) %>% dplyr::filter(!(genus_species == "Baridinae gen")) # define what taxonomic columns might be named tax_class <- c("kingdom", "phylum", "class", "order", "family", "super_family", "genus", "species", "genus_species", "taxonomic_authority", "taxonomy_system")
# also correct mis-spellings of certain species based on expert review by A. Liebhold # misspell <- read_csv("./data/raw_data/taxonomic_reference/misspelling_SAL_resolved.csv", trim_ws = TRUE) tax_df1 <- tax_df %>% mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>% mutate_at(vars(genus_species), str_squish) %>% mutate(user_supplied_name = genus_species) %>% # full_join(misspell, by = "user_supplied_name") %>% # transmute(phylum, class, order, family, super_family, user_supplied_name, # genus_species = ifelse(!is.na(genus_species.y), genus_species.y, genus_species.x ), # genus = word(genus_species, 1), # species = word(genus_species, 2), # taxonomy_system, taxonomic_authority) %>% distinct(genus_species) %>% # remove species duplicates dplyr::arrange(genus_species) # arrange alphabetically
# make character vector of names only to genus # g_sp <- grep('\\<sp\\>', tax_df1$genus_species, value=TRUE) # g_spp <- grep('\\<sp.\\>', tax_df1$genus_species, value=TRUE) g_sp <- filter(tax_df1, (str_count(genus_species, " ") + 1) == 1) # bard <- grep('\\<gen\\>', tax_df1$genus_species, value=TRUE) # include sub-family here tax_vec_gn <- unlist(g_sp, use.names = FALSE) %>% # gsub(" [a-zA-Z0-9]*", "", .) %>% magrittr::extract(!(. == "Tasconotus")) # remove this species # makes character vector of names only to species tax_vec_sp <- tax_df1 %>% filter(!(genus_species %in% g_sp$genus_species)) %>% # magrittr::extract(!(. %in% g_sp)) %>% # magrittr::extract(!(. %in% g_spp)) %>% unlist(., use.names = FALSE) %>% magrittr::extract(!(. == "Baridinae")) # this family put with genus above
xtra_cols <- c("kingdomkey", "phylumkey", "classkey", "orderkey", "specieskey", "note", "familykey", "genuskey", "scientificname", "canonicalname", "confidence") ###################### # apply the get_accepted_taxonomy function over the vector of species names tax_acc_l <- lapply(tax_vec_sp, get_accepted_taxonomy) # make dataframe of all results suppressMessages( tax_acc <- tax_acc_l %>% purrr::reduce(full_join) %>% mutate(genus_species = str_squish(genus_species)) %>% select(-one_of(xtra_cols)) )
###################### # apply the get_accepted_taxonomy function over the vector of genus names gn_acc_l <- lapply(tax_vec_gn, get_accepted_taxonomy) # make dataframe of all results suppressMessages( gen_acc <- gn_acc_l %>% purrr::reduce(full_join) %>% mutate(genus_species = str_squish(genus_species)) %>% select(-one_of(xtra_cols)) )
###################### # resolve species without accepted species names ######## # genus level matches from get_accepted_taxonomy results genus_only <- tax_acc %>% dplyr::filter(rank == "genus") #%>% # filter out those where user_supplied_name was only genus to begin with #dplyr::filter(!word(user_supplied_name,-1) == "sp") go_vec <- unlist(genus_only$user_supplied_name, use.names = FALSE) # apply the function over the vector of species names tax_go_l <- lapply(go_vec, get_more_info) # make dataframe of all species rank matches that were originally genus only suppressMessages( tax_go <- tax_go_l %>% purrr::reduce(full_join) %>% # join all data frames from list dplyr::filter(!(matched_name2 == "species not found")) %>% # remove taxa that didn't provide a species-level match (no new info) dplyr::filter((str_count(matched_name2, '\\s+')+1) %in% c(2,3)) %>% mutate(genus = ifelse((str_count(matched_name2, '\\s+')+1) == 1, matched_name2, stringr::word(matched_name2, 1)), species = ifelse((str_count(matched_name2, '\\s+')+1) %in% c(2,3), matched_name2, NA_character_), genus_species = ifelse(is.na(species), paste(genus, "sp"), species)) %>% select(-matched_name2) )
# How many did not return lower rank? suppressMessages( no_lower <- tax_go_l %>% purrr::reduce(full_join) %>% # join all data frames from list # filter to taxa that only returned genus (no new info) dplyr::filter((str_count(matched_name2, '\\s+')+1) == 1| matched_name2 == "species not found") ) # from no_lower, the not found no_lower_not_found <- no_lower %>% filter(matched_name2 == "species not found") # from no_lower, the genus_only matches no_lower_genus <- no_lower %>% filter(!(matched_name2 == "species not found"))
######## # species not found at all from get_accepted_taxonomy results not_found <- tax_acc %>% dplyr::filter(genus_species == "species not found", !(is.na(user_supplied_name))) %>% bind_rows(no_lower_not_found) not_found_vec <- unlist(not_found$user_supplied_name, use.names = FALSE) # apply the function over the vector of species names tax_nf_l <- lapply(not_found_vec, get_more_info) # make dataframe of matches at species rank suppressMessages( tax_nf <- tax_nf_l %>% purrr::reduce(full_join) %>% dplyr::filter(!(matched_name2 == "species not found")) %>% dplyr::filter((str_count(matched_name2, '\\s+')+1) == 2) %>% mutate(genus = ifelse((str_count(matched_name2, '\\s+')+1) == 1, matched_name2, NA_character_), species = ifelse((str_count(matched_name2, '\\s+')+1) %in% c(2,3), matched_name2, NA_character_), genus_species = ifelse(is.na(species), genus, species)) %>% mutate(genus = ifelse(is.na(genus), stringr::word(species, 1), genus)) %>% select(-matched_name2) )
# How many were not found? suppressMessages( no_match <- tax_nf_l %>% purrr::reduce(full_join) %>% # join all data frames from list dplyr::filter(matched_name2 == "species not found") ) # How many returned at genus level rank? suppressMessages( nf_go <- tax_nf_l %>% purrr::reduce(full_join) %>% # join all data frames from list dplyr::filter((str_count(matched_name2, '\\s+')+1) == 1) ) ######## # put together genus-level only matches genus_matches <- no_lower_genus %>% bind_rows(nf_go) %>% mutate(genus = matched_name2) %>% dplyr::rename(genus_species = matched_name2)
# load file with manual corrections, for example, to family names, synonyms, etc. sal_taxa <- read_csv("data/taxonomic_reference/genus_only_resolution_FIXED.csv", trim_ws = TRUE, col_types = cols(up_to_date_name = col_character())) # add manual corrections to correct genus-level only matches genus_match_SAL <- genus_matches %>% left_join(sal_taxa, by = "user_supplied_name") %>% transmute(user_supplied_name, taxonomy_system = ifelse(!(is.na(genus_species.y)), taxonomy_system.y, taxonomy_system.x), #kingdom, phylum, class, order, family, genus = ifelse(!(is.na(genus_species.y)), word(genus_species.y, 1), genus), species = ifelse(!(is.na(genus_species.y)) & rank == "species", genus_species.y, NA_character_), genus_species = ifelse(!(is.na(genus_species.y)), genus_species.y, genus_species.x), rank, synonym) %>% mutate(synonym = as.character(synonym)) manually_matched <- subset(genus_match_SAL, (user_supplied_name %in% sal_taxa$user_supplied_name))
######## # dataframes of remaining unmatched taxa and # remaining manual corrections (will be implemented by row replacement below) # taxa still missing a genus match still_no_match <- subset(genus_match_SAL, !(user_supplied_name %in% sal_taxa$user_supplied_name)) # taxa included in sal_taxa but not matched in genus_matches (could be from interception data) man_correct_remain <- subset(sal_taxa, !(user_supplied_name %in% manually_matched$user_supplied_name))
######## # put together dataframes with new info new_sp_info <- tax_nf %>% full_join(tax_go) %>% dplyr::left_join(select(genus_only, user_supplied_name, kingdom, # this and the transmute adds back in the higher rank info phylum, class, order, family), by = "user_supplied_name") %>% # full_join(synon_retest) %>% full_join(manually_matched) %>% # df of manual corrections mutate(genus = ifelse(is.na(genus), word(genus_species, 1), genus), rank = ifelse(is.na(rank) & str_count(genus_species, '\\w+')%in% c(2,3), "species", rank), kingdom = ifelse(is.na(kingdom), "Animalia", kingdom), phylum = ifelse(is.na(phylum), "Arthropoda", phylum), class = ifelse(is.na(class), "Insecta", class))
tax_combo <- dplyr::filter(tax_acc, rank %in% c("species", "subspecies")) %>% # GBIF matches to species rank full_join(gen_acc) %>% # df of taxa where user supplied name was genus only to start with # full_join(new_npf_aus) %>% # df of new non-plant-feeding Australian taxa from Helen full_join(new_sp_info, by = "user_supplied_name") %>% # bind in the new info from auto and manual resolution transmute(user_supplied_name, rank = ifelse(is.na(rank.y), rank.x, rank.y), status, # = ifelse(is.na(status.y), status.x, status.y), matchtype, # = ifelse(is.na(matchtype.y), matchtype.x, matchtype.y), usagekey, # = ifelse(is.na(usagekey.y), usagekey.x, usagekey.y), synonym = ifelse(is.na(synonym.y), synonym.x, synonym.y), acceptedusagekey, # = ifelse(is.na(acceptedusagekey.y), acceptedusagekey.x, acceptedusagekey.y), kingdom = ifelse(is.na(kingdom.y), kingdom.x, kingdom.y), phylum = ifelse(is.na(phylum.y), phylum.x, phylum.y), class = ifelse(is.na(class.y), class.x, class.y), order = ifelse(is.na(order.y), order.x, order.y), family = ifelse(is.na(family.y), family.x, family.y), genus = ifelse(is.na(genus.y), genus.x, genus.y), species = ifelse(is.na(species.y), species.x, species.y), genus_species = ifelse(is.na(genus_species.y), genus_species.x, genus_species.y), taxonomy_system = ifelse(is.na(taxonomy_system.y), taxonomy_system.x, taxonomy_system.y), taxonomic_authority) %>% # = ifelse(is.na(taxonomic_authority.y), taxonomic_authority.x, taxonomic_authority.y)) %>% # a bit more cleaning from Rebecca Turner mutate(family = ifelse(family %in% c("Rutelidae","Melolonthidae", "Dynastidae"), "Scarabaeidae", family), family = ifelse(genus == "Dermestes", "Dermestidae", family)) %>% dplyr::filter(!(is.na(user_supplied_name))) # remove blank rows # subset remaining manual fixes for those user supplied names that are in tax_combo to get rows that need to be replaced # rows_2_replace <- subset(man_correct_remain, (user_supplied_name %in% tax_combo$user_supplied_name))
# replace rows with new info, and add rows from interception data tax_final <- tax_combo %>% full_join(man_correct_remain, by = "user_supplied_name") %>% transmute(user_supplied_name, status , #= ifelse(!is.na(rank.y), NA_character_, status), matchtype = ifelse(!is.na(rank.y), NA_character_, matchtype), usagekey = ifelse(!is.na(rank.y), NA_character_, usagekey), rank = ifelse(!is.na(rank.y), rank.y, rank.x), synonym = ifelse(!is.na(rank.y), synonym.y, synonym.x), acceptedusagekey = ifelse(!is.na(rank.y), NA_character_, acceptedusagekey), kingdom, phylum, class, order = ifelse(!is.na(order.y), order.y, order.x), family = ifelse(!is.na(family.y), family.y, family.x), genus = ifelse(!is.na(family.y), word(genus_species.y, 1), genus), species = ifelse(!is.na(family.y), word(genus_species.y, 2), species), genus_species = ifelse(!is.na(genus_species.y), genus_species.y, genus_species.x), taxonomy_system = ifelse(!is.na(taxonomy_system.y), taxonomy_system.y, taxonomy_system.x), taxonomic_authority = ifelse(!is.na(taxonomic_authority.y), taxonomic_authority.y, taxonomic_authority.x)) %>% mutate(kingdom = ifelse(is.na(kingdom), "Animalia", kingdom), phylum = ifelse(is.na(phylum), "Arthropoda", phylum), class = ifelse(is.na(class), "Insecta", class), genus_species = ifelse(genus_species == "species not found", NA_character_, genus_species)) %>% mutate(genus = ifelse(is.na(genus), word(genus_species, 1), genus), species = ifelse(is.na(species), word(genus_species, 2), species)) %>% arrange(user_supplied_name) %>% # add the unique ID column after all unique species are in one dataframe tibble::rowid_to_column("taxon_id")
# check for duplicates # dups <- tax_final %>% group_by(user_supplied_name) %>% filter(n()>1) # Bostrichidae # bos <- tax_combo %>% filter(family == "Bostrichidae") # Rutelidae, Melolonthidae, and Dynastidae # RMD <- tax_combo %>% filter(family %in% c("Rutelidae", "Melolonthidae", "Dynastidae"))
# write the clean taxonomy table to a CSV file readr::write_csv(tax_final, "path/to/your/data/taxonomy_table.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.