Insect Invasions Pursuit @ SESYNC

Example script to create clean attribute table

created by Rachael Blake 09/23/2019

# Load packages needed for this script
library(tidyverse) ; library(readxl) ; library(purrr) ; library(DescTools)

# List all the raw data files in your local directory
file_list <- dir(path="path/to/your/data", pattern='*.xlsx')  # makes list of the files
file_listp <- paste0("path/to/your/data/", file_list)         # adds path to file names

# if you would like to use the example data in this package, use the following code
# file_listp <- system.file("extdata", "Japan_taxa.xlsx", package = "insectcleanr")

Making the attribute table

# apply that function over the list of dataframes
attrib_list <- lapply(file_listp, separate_attributes)

df_attrib <- attrib_list %>%
             purrr::reduce(full_join) %>%
             # clean up origin column
             mutate(origin = gsub("&", "", origin),
                    origin = gsub("Indomaraya|indomalaya", "Indomalaya", origin),
                    origin = gsub("IndomalayaOceania", "Indomalaya, Oceania", origin),
                    origin = gsub("Middle East", "Middle_East", origin),
                    origin = gsub("cosmopolitan|Cosmoploitan", "Cosmopolitan", origin),
                    origin = gsub("S.\\sAfrica|Sth\\sAfrica", "South_Africa", origin),
                    origin = gsub("\\(Taiwan", "Taiwan", origin),
                    origin = gsub("\\(Okinawa|\\(Okinawa\\)", "Okinawa", origin),
                    origin = gsub("\\(Ogasawara", "Ogasawara", origin),
                    origin = gsub("\\(Java", "Java", origin),
                    origin = gsub("N.\\sAmerica", "North_America", origin),
                    origin = gsub("S.\\sAmerica", "South_America", origin),
                    origin = gsub("C.\\sAmerica", "Central_America", origin),
                    origin = gsub("Palearctic\\(Asia\\)|Plearctic\\(Asia\\)", "Palearctic_Asia", origin),
                    origin = gsub("Palearctic\\s\\(Asia\\)|Paleartic\\(Asia\\)", "Palearctic_Asia", origin),
                    origin = gsub("Ppalearctic\\(Asia\\)|Palearctic\\(Asia", "Palearctic_Asia", origin),
                    origin = gsub("Palearctic\\s\\(Asia|Paleartic\\(Asia", "Palearctic_Asia", origin),
                    origin = gsub("Palearctic\\s\\(E.\\sAsia|Palearctic\\s\\(Central\\sAsia", "Palearctic_Asia", origin),
                    origin = gsub("Palearctic\\(Europe\\)|Palearctic\\s\\(Europe\\)", "Palearctic_Europe", origin),
                    origin = gsub("alearctic\\(Europe\\)|Palearctic\\(Europe", "Palearctic_Europe", origin),
                    origin = gsub("Palearctic\\s\\(Europe|Paleartic\\(Europe", "Palearctic_Europe", origin),
                    origin = gsub("Parearctic\\(Europe", "Palearctic_Europe", origin),
                    origin = gsub("Palearctic\\s\\(Eurasia", "Palearctic_Europe, Palearctic_Asia", origin),
                    origin = gsub("Palearctic\\(Europe\\)Nearctic", "Palearctic_Europe, Nearctic", origin),
                    origin = gsub("Nearctic Nearctic", "Nearctic", origin),
                    origin = gsub("Nearctic\\(Europe\\)", "Palearctic_Europe", origin),
                    origin = gsub("\\\"Old World\\\"\\/Europe", "Old_World_Europe", origin),
                    origin = gsub("Sri\\sLanka\\sor\\sAustralasia\\?\\s\\(Dugdale,\\s1988", "Sri Lanka Australasia", origin),
                    origin = gsub("C\\/C.\\sAmerica\\?\\sOld\\sworld\\stropics\\s\\(Mound\\s&\\sWalker,\\s1982",
                                  "Cosmopolitan, Central_America, Old_World_tropics", origin),
                    origin = gsub(" ", ", ", origin),
                    origin = gsub(", , ", ", ", origin),
                    origin = gsub(",, ", ", ", origin)
                    ) %>%
             dplyr::rename("user_supplied_name" = "genus_species")  # NOTE: even though changed name to user_supplied_name,
                                                                    # these data have not been run through GBIF!  You still
                                                                    # must join the attribute table with the taxonomy table!!!

bring in taxonomic information

# bring in taxonomic table for order, family, and genus columns
tax_table <- read_csv("path/to/your/data/taxonomy_table.csv")
tax_cols <- tax_table %>% select(taxon_id, user_supplied_name, order, family, genus, genus_species)

bring in origin information

# origin_correspondence_table.xlsx for the 8 biogeographic regions
o_corr_file <- read_excel("path/to/your/data/origin_correspondence_table.xlsx",
                           trim_ws = TRUE, col_types = "text")
o_corr_table <- o_corr_file %>%
                mutate_at(vars(starts_with("origin_")), list(~ replace(., . %in% c("NA"), NA_character_)))

bring in plant feeding information

# plant feeding attribute column from the non-plant-feeding_taxa file
npf_file <- "path/to/your/data/non-plant-feeding_taxa_updatedOct07.xlsx"
npf_ord <- read_excel(npf_file, sheet = 2, trim_ws = TRUE, col_types = "text")
npf_fams <- read_excel(npf_file, sheet = 3, trim_ws = TRUE, col_types = "text")
npf_gen <- read_excel(npf_file, sheet = 4, trim_ws = TRUE, col_types = "text")
pf_gen <- read_excel(npf_file, sheet = 6, trim_ws = TRUE, col_types = "text")
pf_sp <- read_excel(npf_file, sheet = 7, trim_ws = TRUE, col_types = "text")

# make plant feeding taxa names title case; make vectors using dplyr::pull()
npf_ord <- npf_ord %>% mutate(npf_orders = str_to_title(npf_orders)) %>% pull()
npf_fams <- npf_fams %>% mutate(npf_families = str_to_title(npf_families)) %>% pull()
npf_gen <- npf_gen %>% mutate(npf_genus = str_to_title(npf_genus)) %>% pull()
pf_gen <- pf_gen %>% pull()
pf_sp <- pf_sp %>% pull()

make attribute table

df_attrib_o <- df_attrib %>%
               left_join(tax_cols, by = "user_supplied_name") %>% # merge in taxonomic info
               left_join(o_corr_table) %>%  # merge in origin correspondence table
               # add plant feeding attribute column
               mutate(plant_feeding = "Y",
                      plant_feeding = ifelse(order %in% npf_ord, "N", plant_feeding),
                      plant_feeding = ifelse((order == "Blattodea" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Coleoptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Diptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Hemiptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Hymenoptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Lepidoptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Thysanoptera" & family %in% npf_fams), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Thysanoptera" & family == "Phlaeothripidae" & genus %in% npf_gen), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Thysanoptera" & family == "Thripidae" & genus %in% npf_gen), "N", plant_feeding),
                      plant_feeding = ifelse((order == "Coleoptera" & family == "Coccinellidae" & genus %in% pf_gen), "Y", plant_feeding),
                      plant_feeding = ifelse((order == "Diptera" & family == "Muscidae" & genus %in% pf_gen), "Y", plant_feeding),
                      plant_feeding = ifelse((order == "Diptera" & family == "Phoridae" & genus %in% pf_gen), "Y", plant_feeding),
                      plant_feeding = ifelse((order == "Diptera" & family == "Drosophilidae" & user_supplied_name %in% pf_sp), "Y", plant_feeding)
                      ) %>%
               # clean up intentional release column
               mutate(intentional_release = ifelse(intentional_release %in% c("N"), "No",
                                            ifelse(intentional_release %in% c("1", "I"), "Yes", intentional_release))) %>%
               # add column for whether species every introduced anywhere in world
               group_by(user_supplied_name) %>%
               mutate(ever_introduced_anywhere = ifelse(intentional_release %in% c("Yes", "Eradicated"), "Yes",
                                                  ifelse(intentional_release %in% c("No"), "No", NA_character_))) %>%
               ungroup() %>%
               select(-origin, -country_nm, -nz_region, -order, -family, -genus) %>%
               # coalesce rows to one per species
               group_split(user_supplied_name) %>%
               map(~coalesce_manual(.x)) %>%
               bind_rows() %>%
               select(-genus_species) %>%
               # arrange rows and columns
               arrange(user_supplied_name) %>%
               select(taxon_id, user_supplied_name, plant_feeding,
                      origin_Nearctic, origin_Neotropic, origin_European_Palearctic, origin_Asian_Palearctic, origin_Indomalaya,
                      origin_Afrotropic, origin_Australasia, origin_Oceania, everything())

Write file

# write out the attribute table
readr::write_csv(df_attrib_o, "path/to/your/data/attribute_table.csv")

Code to filter to unique GBIF genus_species, and do manual coalesce

df_attrib_gbif <- df_attrib %>%
                  left_join(tax_cols, by = "user_supplied_name") %>% # merge in taxonomic info
                  left_join(o_corr_table) %>%  # merge in origin correspondence table
                  # add plant feeding attribute column
                  mutate(plant_feeding = "Y",
                         plant_feeding = ifelse(order %in% npf_ord, "N", plant_feeding),
                         plant_feeding = ifelse((order == "Blattodea" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Coleoptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Diptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Hemiptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Hymenoptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Lepidoptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Thysanoptera" & family %in% npf_fams), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Thysanoptera" & family == "Phlaeothripidae" & genus %in% npf_gen), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Thysanoptera" & family == "Thripidae" & genus %in% npf_gen), "N", plant_feeding),
                         plant_feeding = ifelse((order == "Coleoptera" & family == "Coccinellidae" & genus %in% pf_gen), "Y", plant_feeding),
                         plant_feeding = ifelse((order == "Diptera" & family == "Muscidae" & genus %in% pf_gen), "Y", plant_feeding),
                         plant_feeding = ifelse((order == "Diptera" & family == "Phoridae" & genus %in% pf_gen), "Y", plant_feeding),
                         plant_feeding = ifelse((order == "Diptera" & family == "Drosophilidae" & user_supplied_name %in% pf_sp), "Y", plant_feeding)
                         ) %>%
                  # clean up intentional release column
                  mutate(intentional_release = ifelse(intentional_release %in% c("N"), "No",
                                               ifelse(intentional_release %in% c("1", "I"), "Yes", intentional_release))) %>%
                  # add column for whether species every introduced anywhere in world
                  group_by(user_supplied_name) %>%
                  mutate(ever_introduced_anywhere = ifelse(intentional_release %in% c("Yes", "Eradicated"), "Yes",
                                                    ifelse(intentional_release %in% c("No"), "No", NA_character_))) %>%
                  ungroup() %>%
                  select(-origin, -country_nm, -nz_region, -user_supplied_name, -order, -family, -genus, -notes) %>%
                  # set column types
                  mutate_at(vars(origin_Nearctic, origin_Neotropic, origin_European_Palearctic,
                                 origin_Asian_Palearctic, origin_Indomalaya, origin_Afrotropic,
                                 origin_Australasia, origin_Oceania),
                            list(as.numeric)) %>%
                  # coalesce rows to one per GBIF genus_species
                  # group_by(genus_species) %>%
                  # summarize_all(coalesce_manual) %>%
                  # ungroup() %>%


                  group_split(genus_species) %>%
                  map(~coalesce_manual(.x)) %>%
                  bind_rows() %>%


                  # arrange rows and columns
                  arrange(genus_species) %>%
                  select(taxon_id, genus_species,
                         origin_Nearctic, origin_Neotropic, origin_European_Palearctic, origin_Asian_Palearctic,
                         origin_Indomalaya, origin_Afrotropic, origin_Australasia, origin_Oceania, plant_feeding,
                         intentional_release, ever_introduced_anywhere, everything())

Write another file

# write out the attribute table
readr::write_csv(df_attrib_gbif, "path/to/your/data/attribute_table_gbif.csv")


reblake/insectcleanr documentation built on March 22, 2022, 6:44 p.m.