---
title: "make_taxonomy_table"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{my-vignette}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
####################################################################
##### Insect Invasions Pursuit @ SESYNC #####
##### Example script to create clean taxonomy table #####
##### created by Rachael Blake 11/15/2018 #####
####################################################################
# Load packages needed for this script
library(tidyverse) ; library(readxl) ; library(taxize) ; library(rgbif) ; library(purrr)
# source the custom functions
# source("./custom_taxonomy_funcs.R")
# checks to see if clean flat files exist, otherwise creates them from multi-worksheet files
# if(!file.exists("./data/raw_data/seebens_clean.csv")|
# !file.exists("./data/raw_data/raw_by_country/New Zealand_Edney_Browne_2018_clean.xlsx")) {
# source("./scripts/clean_seebens.R")
# source("./scripts/clean_new_zealand.R")
# }
# List all the raw data files
file_list <- dir(path="extdata/", pattern='*.xlsx') # makes list of the files
file_listp <- paste0("extdata/", file_list) # adds path to file names
####################################
### Making the taxonomic table ###
####################################
# apply the separate_taxonomy function over the list of dataframes
tax_list <- lapply(file_listp, separate_taxonomy)
# put all taxonomy dataframes into one large dataframe
tax_df <- tax_list %>%
purrr::reduce(full_join) %>%
mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>%
dplyr::rename(taxonomic_authority = authority) %>%
dplyr::arrange(genus_species) %>%
dplyr::filter(!(genus_species == "Baridinae gen"))
# define what taxonomic columns might be named
tax_class <- c("kingdom", "phylum", "class", "order", "family", "super_family",
"genus", "species", "genus_species", "taxonomic_authority", "taxonomy_system")
#####################################
### Make large table with all info
# also correct mis-spellings of certain species based on expert review by A. Liebhold
# misspell <- read_csv("./data/raw_data/taxonomic_reference/misspelling_SAL_resolved.csv", trim_ws = TRUE)
tax_df1 <- tax_df %>%
mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>%
mutate_at(vars(genus_species), str_squish) %>%
mutate(user_supplied_name = genus_species) %>%
# full_join(misspell, by = "user_supplied_name") %>%
# transmute(phylum, class, order, family, super_family, user_supplied_name,
# genus_species = ifelse(!is.na(genus_species.y), genus_species.y, genus_species.x ),
# genus = word(genus_species, 1),
# species = word(genus_species, 2),
# taxonomy_system, taxonomic_authority) %>%
distinct(genus_species) %>% # remove species duplicates
dplyr::arrange(genus_species) # arrange alphabetically
#####################################
### Make vectors of genus names (no species info) and species names
# make character vector of names only to genus
# g_sp <- grep('\\<sp\\>', tax_df1$genus_species, value=TRUE)
# g_spp <- grep('\\<sp.\\>', tax_df1$genus_species, value=TRUE)
g_sp <- filter(tax_df1, (str_count(genus_species, " ") + 1) == 1)
# bard <- grep('\\<gen\\>', tax_df1$genus_species, value=TRUE) # include sub-family here
tax_vec_gn <- unlist(g_sp, use.names = FALSE) %>% # gsub(" [a-zA-Z0-9]*", "", .) %>%
magrittr::extract(!(. == "Tasconotus")) # remove this species
# makes character vector of names only to species
tax_vec_sp <- tax_df1 %>%
filter(!(genus_species %in% g_sp$genus_species)) %>%
# magrittr::extract(!(. %in% g_sp)) %>%
# magrittr::extract(!(. %in% g_spp)) %>%
unlist(., use.names = FALSE) %>%
magrittr::extract(!(. == "Baridinae")) # this family put with genus above
#####################################
### Get taxonomy info from GBIF ###
#####################################
xtra_cols <- c("kingdomkey", "phylumkey", "classkey", "orderkey", "specieskey",
"note", "familykey", "genuskey", "scientificname", "canonicalname", "confidence")
######################
# apply the get_accepted_taxonomy function over the vector of species names
tax_acc_l <- lapply(tax_vec_sp, get_accepted_taxonomy)
# make dataframe of all results
suppressMessages(
tax_acc <- tax_acc_l %>%
purrr::reduce(full_join) %>%
mutate(genus_species = str_squish(genus_species)) %>%
select(-one_of(xtra_cols))
)
######################
# apply the get_accepted_taxonomy function over the vector of genus names
gn_acc_l <- lapply(tax_vec_gn, get_accepted_taxonomy)
# make dataframe of all results
suppressMessages(
gen_acc <- gn_acc_l %>%
purrr::reduce(full_join) %>%
mutate(genus_species = str_squish(genus_species)) %>%
select(-one_of(xtra_cols))
)
######################
# resolve species without accepted species names
########
# genus level matches from get_accepted_taxonomy results
genus_only <- tax_acc %>%
dplyr::filter(rank == "genus") #%>%
# filter out those where user_supplied_name was only genus to begin with
#dplyr::filter(!word(user_supplied_name,-1) == "sp")
go_vec <- unlist(genus_only$user_supplied_name, use.names = FALSE)
# apply the function over the vector of species names
tax_go_l <- lapply(go_vec, get_more_info)
# make dataframe of all species rank matches that were originally genus only
suppressMessages(
tax_go <- tax_go_l %>%
purrr::reduce(full_join) %>% # join all data frames from list
dplyr::filter(!(matched_name2 == "species not found")) %>% # remove taxa that didn't provide a species-level match (no new info)
dplyr::filter((str_count(matched_name2, '\\s+')+1) %in% c(2,3)) %>%
mutate(genus = ifelse((str_count(matched_name2, '\\s+')+1) == 1, matched_name2, stringr::word(matched_name2, 1)),
species = ifelse((str_count(matched_name2, '\\s+')+1) %in% c(2,3), matched_name2, NA_character_),
genus_species = ifelse(is.na(species), paste(genus, "sp"), species)) %>%
select(-matched_name2)
)
# # send the species rank back through GBIF to identify synonyms
# synon_l <- lapply(tax_go$genus_species, get_accepted_taxonomy)
#
# # How many synonyms were found on retesting in GBIF?
# suppressMessages(
# synon_retest <- synon_l %>%
# purrr::reduce(full_join) %>%
# mutate(genus_species = str_squish(genus_species)) %>%
# select(-one_of(xtra_cols)) %>%
# filter(synonym == TRUE & rank == "species")
# )
#
# # tax_go[tax_go$genus_species %in% c("Exochomus quadripustulatus", "Opius makii"),]
#
# # tax_go results minus the two retested synonyms
# syn <- synon_retest$user_supplied_name
#
# tax_go2 <- tax_go %>%
# filter(!(genus_species %in% syn))
# How many did not return lower rank?
suppressMessages(
no_lower <- tax_go_l %>%
purrr::reduce(full_join) %>% # join all data frames from list
# filter to taxa that only returned genus (no new info)
dplyr::filter((str_count(matched_name2, '\\s+')+1) == 1|
matched_name2 == "species not found")
)
# from no_lower, the not found
no_lower_not_found <- no_lower %>% filter(matched_name2 == "species not found")
# from no_lower, the genus_only matches
no_lower_genus <- no_lower %>% filter(!(matched_name2 == "species not found"))
########
# species not found at all from get_accepted_taxonomy results
not_found <- tax_acc %>%
dplyr::filter(genus_species == "species not found",
!(is.na(user_supplied_name))) %>%
bind_rows(no_lower_not_found)
not_found_vec <- unlist(not_found$user_supplied_name, use.names = FALSE)
# apply the function over the vector of species names
tax_nf_l <- lapply(not_found_vec, get_more_info)
# make dataframe of matches at species rank
suppressMessages(
tax_nf <- tax_nf_l %>%
purrr::reduce(full_join) %>%
dplyr::filter(!(matched_name2 == "species not found")) %>%
dplyr::filter((str_count(matched_name2, '\\s+')+1) == 2) %>%
mutate(genus = ifelse((str_count(matched_name2, '\\s+')+1) == 1, matched_name2, NA_character_),
species = ifelse((str_count(matched_name2, '\\s+')+1) %in% c(2,3), matched_name2, NA_character_),
genus_species = ifelse(is.na(species), genus, species)) %>%
mutate(genus = ifelse(is.na(genus), stringr::word(species, 1), genus)) %>%
select(-matched_name2)
)
# How many were not found?
suppressMessages(
no_match <- tax_nf_l %>%
purrr::reduce(full_join) %>% # join all data frames from list
dplyr::filter(matched_name2 == "species not found")
)
# How many returned at genus level rank?
suppressMessages(
nf_go <- tax_nf_l %>%
purrr::reduce(full_join) %>% # join all data frames from list
dplyr::filter((str_count(matched_name2, '\\s+')+1) == 1)
)
########
# put together genus-level only matches
genus_matches <- no_lower_genus %>%
bind_rows(nf_go) %>%
mutate(genus = matched_name2) %>%
dplyr::rename(genus_species = matched_name2)
# bring in manual corrections
sal_taxa <- read_csv("nfs_data/data/raw_data/taxonomic_reference/genus_only_resolution_FIXED.csv", trim_ws = TRUE,
col_types = cols(up_to_date_name = col_character()))
# add manual corrections to correct genus-level only matches
genus_match_SAL <- genus_matches %>%
left_join(sal_taxa, by = "user_supplied_name") %>%
transmute(user_supplied_name,
taxonomy_system = ifelse(!(is.na(genus_species.y)), taxonomy_system.y, taxonomy_system.x),
#kingdom, phylum, class,
order, family,
genus = ifelse(!(is.na(genus_species.y)), word(genus_species.y, 1), genus),
species = ifelse(!(is.na(genus_species.y)) & rank == "species",
genus_species.y, NA_character_),
genus_species = ifelse(!(is.na(genus_species.y)), genus_species.y, genus_species.x),
rank, synonym) %>%
mutate(synonym = as.character(synonym))
manually_matched <- subset(genus_match_SAL, (user_supplied_name %in% sal_taxa$user_supplied_name))
########
# dataframes of remaining unmatched taxa and
# remaining manual corrections (will be implemented by row replacement below)
# taxa still missing a genus match
still_no_match <- subset(genus_match_SAL, !(user_supplied_name %in% sal_taxa$user_supplied_name))
# taxa included in sal_taxa but not matched in genus_matches (could be from interception data)
man_correct_remain <- subset(sal_taxa, !(user_supplied_name %in% manually_matched$user_supplied_name))
########
# put together dataframes with new info
new_sp_info <- tax_nf %>%
full_join(tax_go) %>%
dplyr::left_join(select(genus_only, user_supplied_name, kingdom, # this and the transmute adds back in the higher rank info
phylum, class, order, family), by = "user_supplied_name") %>%
# full_join(synon_retest) %>%
full_join(manually_matched) %>% # df of manual corrections
mutate(genus = ifelse(is.na(genus), word(genus_species, 1), genus),
rank = ifelse(is.na(rank) & str_count(genus_species, '\\w+')%in% c(2,3),
"species", rank),
kingdom = ifelse(is.na(kingdom), "Animalia", kingdom),
phylum = ifelse(is.na(phylum), "Arthropoda", phylum),
class = ifelse(is.na(class), "Insecta", class))
########
# bring in new non-plant-feeding Australian taxa from Helen
# Sept 2020, Rebecca said that all these npf taxa have been incorporated into the raw Australian file
# new_npf_aus <- read_csv("nfs_data/data/clean_data/new_Aussie_npf_taxa.csv", trim_ws = TRUE, col_types = "cnccccccccccccccn")
#######################################################################
### Combine species list and GBIF accepted names ###
#######################################################################
tax_combo <- dplyr::filter(tax_acc, rank %in% c("species", "subspecies")) %>% # GBIF matches to species rank
full_join(gen_acc) %>% # df of taxa where user supplied name was genus only to start with
# full_join(new_npf_aus) %>% # df of new non-plant-feeding Australian taxa from Helen
full_join(new_sp_info, by = "user_supplied_name") %>% # bind in the new info from auto and manual resolution
transmute(user_supplied_name,
rank = ifelse(is.na(rank.y), rank.x, rank.y),
status, # = ifelse(is.na(status.y), status.x, status.y),
matchtype, # = ifelse(is.na(matchtype.y), matchtype.x, matchtype.y),
usagekey, # = ifelse(is.na(usagekey.y), usagekey.x, usagekey.y),
synonym = ifelse(is.na(synonym.y), synonym.x, synonym.y),
acceptedusagekey, # = ifelse(is.na(acceptedusagekey.y), acceptedusagekey.x, acceptedusagekey.y),
kingdom = ifelse(is.na(kingdom.y), kingdom.x, kingdom.y),
phylum = ifelse(is.na(phylum.y), phylum.x, phylum.y),
class = ifelse(is.na(class.y), class.x, class.y),
order = ifelse(is.na(order.y), order.x, order.y),
family = ifelse(is.na(family.y), family.x, family.y),
genus = ifelse(is.na(genus.y), genus.x, genus.y),
species = ifelse(is.na(species.y), species.x, species.y),
genus_species = ifelse(is.na(genus_species.y), genus_species.x, genus_species.y),
taxonomy_system = ifelse(is.na(taxonomy_system.y), taxonomy_system.x, taxonomy_system.y),
taxonomic_authority) %>% # = ifelse(is.na(taxonomic_authority.y), taxonomic_authority.x, taxonomic_authority.y)) %>%
# a bit more cleaning from Rebecca Turner
mutate(family = ifelse(family %in% c("Rutelidae","Melolonthidae", "Dynastidae"), "Scarabaeidae", family),
family = ifelse(genus == "Dermestes", "Dermestidae", family)) %>%
dplyr::filter(!(is.na(user_supplied_name))) # remove blank rows
# subset remaining manual fixes for those user supplied names that are in tax_combo to get rows that need to be replaced
# rows_2_replace <- subset(man_correct_remain, (user_supplied_name %in% tax_combo$user_supplied_name))
# replace rows with new info, and add rows from interception data
tax_final <- tax_combo %>%
full_join(man_correct_remain, by = "user_supplied_name") %>%
transmute(user_supplied_name,
status , #= ifelse(!is.na(rank.y), NA_character_, status),
matchtype = ifelse(!is.na(rank.y), NA_character_, matchtype),
usagekey = ifelse(!is.na(rank.y), NA_character_, usagekey),
rank = ifelse(!is.na(rank.y), rank.y, rank.x),
synonym = ifelse(!is.na(rank.y), synonym.y, synonym.x),
acceptedusagekey = ifelse(!is.na(rank.y), NA_character_, acceptedusagekey),
kingdom, phylum, class,
order = ifelse(!is.na(order.y), order.y, order.x),
family = ifelse(!is.na(family.y), family.y, family.x),
genus = ifelse(!is.na(family.y), word(genus_species.y, 1), genus),
species = ifelse(!is.na(family.y), word(genus_species.y, 2), species),
genus_species = ifelse(!is.na(genus_species.y), genus_species.y, genus_species.x),
taxonomy_system = ifelse(!is.na(taxonomy_system.y), taxonomy_system.y, taxonomy_system.x),
taxonomic_authority = ifelse(!is.na(taxonomic_authority.y), taxonomic_authority.y, taxonomic_authority.x)) %>%
mutate(kingdom = ifelse(is.na(kingdom), "Animalia", kingdom),
phylum = ifelse(is.na(phylum), "Arthropoda", phylum),
class = ifelse(is.na(class), "Insecta", class),
genus_species = ifelse(genus_species == "species not found", NA_character_, genus_species)) %>%
mutate(genus = ifelse(is.na(genus), word(genus_species, 1), genus),
species = ifelse(is.na(species), word(genus_species, 2), species)) %>%
arrange(user_supplied_name) %>%
# add the unique ID column after all unique species are in one dataframe
tibble::rowid_to_column("taxon_id")
# duplicates
# dups <- tax_final %>% group_by(user_supplied_name) %>% filter(n()>1)
# Bostrichidae
# bos <- tax_combo %>% filter(family == "Bostrichidae")
# Rutelidae, Melolonthidae, and Dynastidae
# RMD <- tax_combo %>% filter(family %in% c("Rutelidae", "Melolonthidae", "Dynastidae"))
#####################################
### Write file ###
#####################################
# write the clean taxonomy table to a CSV file
readr::write_csv(tax_final, "nfs_data/data/clean_data/taxonomy_table.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.