Insect Invasions Pursuit @ SESYNC

Example script to create clean occurrence table

created by Rachael Blake 11/19/2018

# Load packages needed for this script
library(tidyverse) ; library(readxl) ; library(purrr) ; library(countrycode)

# List all the raw data files in your local directory
file_list <- dir(path="path/to/your/data", pattern='*.xlsx')  # makes list of the files
file_listp <- paste0("path/to/your/data/", file_list)         # adds path to file names

# if you would like to use the example data in this package, use the following code
# file_listp <- system.file("extdata", "Japan_taxa.xlsx", package = "insectcleanr")

Making the occurrence table

# apply that function over the list of dataframes
occurr_list <- lapply(file_listp, separate_occurrence)

# put all occurrence dataframes into one large dataframe
df_occurr <- occurr_list %>%
             purrr::reduce(full_join) %>%
             mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>% # remove rogue white spaces
             # remove Arachnid
             filter(!(genus_species == "Trixacarus caviae")) %>%
             # add blank country and present_status columns because they were removed in edits of the raw data files (Aug 14, 2020)
             mutate(country = '',
                    present_status = '') %>%
             # fill in country column with canada_or_us info
             mutate(country = ifelse(is.na(country) & canada_or_us %in% c("Canada", "Us", "Us, may not actually be adventive"),
                                     canada_or_us, country),
                    present_status = ifelse(present_status == "Na", NA, present_status),
                    notes = ifelse(country == "Us, may not actually be adventive", "may not actually be adventive", ""),
                    country = ifelse(country == "Us, may not actually be adventive", "Us", country),
                    notes = ifelse(origin == "New insect record for 1960  purposeful introduction",
                                   "New insect record for 1960  purposeful introduction", ""),
                    origin = ifelse(origin == "New insect record for 1960  purposeful introduction",
                                    "", origin),
                    notes = ifelse(origin == "New insect record for 1963, chance immigrant",
                                   "New insect record for 1963, chance immigrant", ""),
                    origin = ifelse(origin == "New insect record for 1963, chance immigrant",
                                    "", origin)
                    ) %>%
             # clean up/fill in country column
             mutate(year = ifelse(year == -999, NA, year),
                    country = ifelse(region %in% c("Okinawa", "Ogasawara", "Japan"), "Japan", country),
                    country = ifelse(region == "Hawaii", "Us", country),
                    country = ifelse(region == "Korea", "Korea", country),
                    country = ifelse(region == "New Zealand", "New Zealand", country),
                    notes = ifelse(grepl("Proceedings of the", .$origin), origin, notes),
                    origin = ifelse(grepl("Proceedings of the", .$origin), "", origin)) %>%
             # clean up some species names
             mutate(genus_species = gsub("Mycetophila\xa0propria", "Mycetophila propria", genus_species),
                    genus_species = gsub("Mycetophila\xa0vulgaris", "Mycetophila vulgaris", genus_species),
                    genus_species = gsub("Mycetophila\xa0marginepunctata", "Mycetophila marginepunctata", genus_species),
                    ) %>%
             # clean up year column
             mutate(year = ifelse(year == "N/A", NA_character_, year),
                    year = gsub("\\s", "", year, perl=TRUE)) %>%
             # clean up intentional release column
             mutate(intentional_release = ifelse(intentional_release %in% c("N"), "No",
                                          ifelse(intentional_release %in% c("1", "I"), "Yes", intentional_release))) %>%
             # add country codes for country and origin columns
             mutate(country_code = countrycode(country, "country.name", "iso3n", warn = TRUE),
                    origin_code = countrycode(origin, "country.name", "iso3n", warn = TRUE)) %>%
             mutate(genus_species = gsub("\xa0", " ", genus_species , perl=TRUE)) %>% # trying to get rid of weird characters
             dplyr::select(-canada_or_us, -nz_region) %>%
             dplyr::arrange(genus_species)
# add the unique ID column and delete genus species column(s)
tax_table <- read.csv("path/to/your/data/taxonomy_table.csv", stringsAsFactors=F)  # read in the taxonomy table

# make final occurrence dataframe
occurr_df <- df_occurr %>%
             mutate_all(~gsub("(*UCP)\\s\\+|\\W+$", "", . , perl=TRUE)) %>%  # remove rogue white spaces
             dplyr::rename(user_supplied_name = genus_species) %>% # have to rename genus_species to user_supplied_name so matches are correct
             dplyr::left_join(y = select(tax_table, c(user_supplied_name, taxon_id, genus_species)),
                              by = "user_supplied_name") %>% # join in the taxonomy info
             mutate(genus_species = gsub("<a0>", " ", genus_species, perl=TRUE)) %>%
             select(taxon_id, everything()) %>% # make taxon_id column the first column
             dplyr::arrange(taxon_id) # order by taxon_id

Write file

# write the clean occurrence table to a CSV file
readr::write_csv(occurr_df, "path/to/your/data/occurrence_table.csv")


reblake/insectcleanr documentation built on March 22, 2022, 6:44 p.m.