# Load packages needed for this script library(tidyverse) ; library(readxl) ; library(purrr) ; library(DescTools) # List all the raw data files in your local directory file_list <- dir(path="path/to/your/data", pattern='*.xlsx') # makes list of the files file_listp <- paste0("path/to/your/data/", file_list) # adds path to file names # if you would like to use the example data in this package, use the following code # file_listp <- system.file("extdata", "Japan_taxa.xlsx", package = "insectcleanr")
# apply that function over the list of dataframes attrib_list <- lapply(file_listp, separate_attributes) df_attrib <- attrib_list %>% purrr::reduce(full_join) %>% # clean up origin column mutate(origin = gsub("&", "", origin), origin = gsub("Indomaraya|indomalaya", "Indomalaya", origin), origin = gsub("IndomalayaOceania", "Indomalaya, Oceania", origin), origin = gsub("Middle East", "Middle_East", origin), origin = gsub("cosmopolitan|Cosmoploitan", "Cosmopolitan", origin), origin = gsub("S.\\sAfrica|Sth\\sAfrica", "South_Africa", origin), origin = gsub("\\(Taiwan", "Taiwan", origin), origin = gsub("\\(Okinawa|\\(Okinawa\\)", "Okinawa", origin), origin = gsub("\\(Ogasawara", "Ogasawara", origin), origin = gsub("\\(Java", "Java", origin), origin = gsub("N.\\sAmerica", "North_America", origin), origin = gsub("S.\\sAmerica", "South_America", origin), origin = gsub("C.\\sAmerica", "Central_America", origin), origin = gsub("Palearctic\\(Asia\\)|Plearctic\\(Asia\\)", "Palearctic_Asia", origin), origin = gsub("Palearctic\\s\\(Asia\\)|Paleartic\\(Asia\\)", "Palearctic_Asia", origin), origin = gsub("Ppalearctic\\(Asia\\)|Palearctic\\(Asia", "Palearctic_Asia", origin), origin = gsub("Palearctic\\s\\(Asia|Paleartic\\(Asia", "Palearctic_Asia", origin), origin = gsub("Palearctic\\s\\(E.\\sAsia|Palearctic\\s\\(Central\\sAsia", "Palearctic_Asia", origin), origin = gsub("Palearctic\\(Europe\\)|Palearctic\\s\\(Europe\\)", "Palearctic_Europe", origin), origin = gsub("alearctic\\(Europe\\)|Palearctic\\(Europe", "Palearctic_Europe", origin), origin = gsub("Palearctic\\s\\(Europe|Paleartic\\(Europe", "Palearctic_Europe", origin), origin = gsub("Parearctic\\(Europe", "Palearctic_Europe", origin), origin = gsub("Palearctic\\s\\(Eurasia", "Palearctic_Europe, Palearctic_Asia", origin), origin = gsub("Palearctic\\(Europe\\)Nearctic", "Palearctic_Europe, Nearctic", origin), origin = gsub("Nearctic Nearctic", "Nearctic", origin), origin = gsub("Nearctic\\(Europe\\)", "Palearctic_Europe", origin), origin = gsub("\\\"Old World\\\"\\/Europe", "Old_World_Europe", origin), origin = gsub("Sri\\sLanka\\sor\\sAustralasia\\?\\s\\(Dugdale,\\s1988", "Sri Lanka Australasia", origin), origin = gsub("C\\/C.\\sAmerica\\?\\sOld\\sworld\\stropics\\s\\(Mound\\s&\\sWalker,\\s1982", "Cosmopolitan, Central_America, Old_World_tropics", origin), origin = gsub(" ", ", ", origin), origin = gsub(", , ", ", ", origin), origin = gsub(",, ", ", ", origin) ) %>% dplyr::rename("user_supplied_name" = "genus_species") # NOTE: even though changed name to user_supplied_name, # these data have not been run through GBIF! You still # must join the attribute table with the taxonomy table!!!
# bring in taxonomic table for order, family, and genus columns tax_table <- read_csv("path/to/your/data/taxonomy_table.csv") tax_cols <- tax_table %>% select(taxon_id, user_supplied_name, order, family, genus, genus_species)
# origin_correspondence_table.xlsx for the 8 biogeographic regions o_corr_file <- read_excel("path/to/your/data/origin_correspondence_table.xlsx", trim_ws = TRUE, col_types = "text") o_corr_table <- o_corr_file %>% mutate_at(vars(starts_with("origin_")), list(~ replace(., . %in% c("NA"), NA_character_)))
# plant feeding attribute column from the non-plant-feeding_taxa file npf_file <- "path/to/your/data/non-plant-feeding_taxa_updatedOct07.xlsx" npf_ord <- read_excel(npf_file, sheet = 2, trim_ws = TRUE, col_types = "text") npf_fams <- read_excel(npf_file, sheet = 3, trim_ws = TRUE, col_types = "text") npf_gen <- read_excel(npf_file, sheet = 4, trim_ws = TRUE, col_types = "text") pf_gen <- read_excel(npf_file, sheet = 6, trim_ws = TRUE, col_types = "text") pf_sp <- read_excel(npf_file, sheet = 7, trim_ws = TRUE, col_types = "text") # make plant feeding taxa names title case; make vectors using dplyr::pull() npf_ord <- npf_ord %>% mutate(npf_orders = str_to_title(npf_orders)) %>% pull() npf_fams <- npf_fams %>% mutate(npf_families = str_to_title(npf_families)) %>% pull() npf_gen <- npf_gen %>% mutate(npf_genus = str_to_title(npf_genus)) %>% pull() pf_gen <- pf_gen %>% pull() pf_sp <- pf_sp %>% pull()
df_attrib_o <- df_attrib %>% left_join(tax_cols, by = "user_supplied_name") %>% # merge in taxonomic info left_join(o_corr_table) %>% # merge in origin correspondence table # add plant feeding attribute column mutate(plant_feeding = "Y", plant_feeding = ifelse(order %in% npf_ord, "N", plant_feeding), plant_feeding = ifelse((order == "Blattodea" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Coleoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Hemiptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Hymenoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Lepidoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family == "Phlaeothripidae" & genus %in% npf_gen), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family == "Thripidae" & genus %in% npf_gen), "N", plant_feeding), plant_feeding = ifelse((order == "Coleoptera" & family == "Coccinellidae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Muscidae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Phoridae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Drosophilidae" & user_supplied_name %in% pf_sp), "Y", plant_feeding) ) %>% # clean up intentional release column mutate(intentional_release = ifelse(intentional_release %in% c("N"), "No", ifelse(intentional_release %in% c("1", "I"), "Yes", intentional_release))) %>% # add column for whether species every introduced anywhere in world group_by(user_supplied_name) %>% mutate(ever_introduced_anywhere = ifelse(intentional_release %in% c("Yes", "Eradicated"), "Yes", ifelse(intentional_release %in% c("No"), "No", NA_character_))) %>% ungroup() %>% select(-origin, -country_nm, -nz_region, -order, -family, -genus) %>% # coalesce rows to one per species group_split(user_supplied_name) %>% map(~coalesce_manual(.x)) %>% bind_rows() %>% select(-genus_species) %>% # arrange rows and columns arrange(user_supplied_name) %>% select(taxon_id, user_supplied_name, plant_feeding, origin_Nearctic, origin_Neotropic, origin_European_Palearctic, origin_Asian_Palearctic, origin_Indomalaya, origin_Afrotropic, origin_Australasia, origin_Oceania, everything())
# write out the attribute table readr::write_csv(df_attrib_o, "path/to/your/data/attribute_table.csv")
df_attrib_gbif <- df_attrib %>% left_join(tax_cols, by = "user_supplied_name") %>% # merge in taxonomic info left_join(o_corr_table) %>% # merge in origin correspondence table # add plant feeding attribute column mutate(plant_feeding = "Y", plant_feeding = ifelse(order %in% npf_ord, "N", plant_feeding), plant_feeding = ifelse((order == "Blattodea" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Coleoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Hemiptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Hymenoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Lepidoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family %in% npf_fams), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family == "Phlaeothripidae" & genus %in% npf_gen), "N", plant_feeding), plant_feeding = ifelse((order == "Thysanoptera" & family == "Thripidae" & genus %in% npf_gen), "N", plant_feeding), plant_feeding = ifelse((order == "Coleoptera" & family == "Coccinellidae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Muscidae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Phoridae" & genus %in% pf_gen), "Y", plant_feeding), plant_feeding = ifelse((order == "Diptera" & family == "Drosophilidae" & user_supplied_name %in% pf_sp), "Y", plant_feeding) ) %>% # clean up intentional release column mutate(intentional_release = ifelse(intentional_release %in% c("N"), "No", ifelse(intentional_release %in% c("1", "I"), "Yes", intentional_release))) %>% # add column for whether species every introduced anywhere in world group_by(user_supplied_name) %>% mutate(ever_introduced_anywhere = ifelse(intentional_release %in% c("Yes", "Eradicated"), "Yes", ifelse(intentional_release %in% c("No"), "No", NA_character_))) %>% ungroup() %>% select(-origin, -country_nm, -nz_region, -user_supplied_name, -order, -family, -genus, -notes) %>% # set column types mutate_at(vars(origin_Nearctic, origin_Neotropic, origin_European_Palearctic, origin_Asian_Palearctic, origin_Indomalaya, origin_Afrotropic, origin_Australasia, origin_Oceania), list(as.numeric)) %>% # coalesce rows to one per GBIF genus_species # group_by(genus_species) %>% # summarize_all(coalesce_manual) %>% # ungroup() %>% group_split(genus_species) %>% map(~coalesce_manual(.x)) %>% bind_rows() %>% # arrange rows and columns arrange(genus_species) %>% select(taxon_id, genus_species, origin_Nearctic, origin_Neotropic, origin_European_Palearctic, origin_Asian_Palearctic, origin_Indomalaya, origin_Afrotropic, origin_Australasia, origin_Oceania, plant_feeding, intentional_release, ever_introduced_anywhere, everything())
# write out the attribute table readr::write_csv(df_attrib_gbif, "path/to/your/data/attribute_table_gbif.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.