Nothing
#### 4. USGS formatter ####
#' Find, import, and format USGS data to Darwin Core
#'
#' The function finds, imports, formats, and creates metadata for the USGS dataset.
#'
#' @param path A character path to a directory that contains the USGS data, which will be found using
#' [BeeBDC::fileFinder()]. The function will look for "USGS_DRO_flat".
#' @param pubDate Character. The publication date of the dataset to update the metadata and citation.
#'
#' @return Returns a list with the occurrence data, "USGS_data", and the EML data, "EML_attributes".
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' \dontrun{
#' USGS_data <- USGS_formatter(path = DataPath, pubDate = "19-11-2022")
#' }
#'
USGS_formatter <- function(
path,
pubDate){
# locally bind variables to the function
. <-readr <- lubridate <- problems <- how0 <- how1 <- how2 <- how3 <- how4 <- days <-
field_note <- note <- SpeciesNotes <- SpeciesNotes<-DateEntered<-DateScanned<-ip<-position<-
time1<-time2<- occurrenceID <- NULL
#### require and checks ####
# Load required packages
requireNamespace("lubridate")
requireNamespace("dplyr")
# File name to search for
USGS_fileName <- "USGS_DRO_flat"
# Find the USGS data from the HomePath
USGS_loc <- fileFinder(path, USGS_fileName)
# Define ColsToKeep
ColsToKeep <- ColTypeR()[[1]] %>% names()
# Check if the data are present
if(is.null(USGS_loc) == TRUE){ # If there are no data matching the name...
stop(" - Oh dear, it looks like there are no USGS data in the HomePath. If you want to include such data, make sure that they exist.")
}
# Check if the ddate has been enetered
if(exists("pubDate") == FALSE){ # If there are no data matching the name...
stop(paste(" - Oh no! It looks like you have not provided pubDate = dd-mm-yyy. ",
"Please include this as the date that Sam Droege passed on his data in the above format."))
}
#### Find and import ####
if(grepl(pattern = ".txt.gz", USGS_loc) == TRUE){ # If the zipped file is present
writeLines("Only one zip file detected. Unzipping file to be read in.")
# unzip the file
R.utils::gunzip(
# File to unzip
fileName = USGS_loc,
# Where to put the extracted file
destname = stringr::str_remove(USGS_loc, ".gz"),
overwrite = FALSE,
remove = FALSE)
USGS_loc <- stringr::str_remove(USGS_loc, ".gz")
# User output
writeLines(paste(" - Unzipped file to: ", USGS_loc))
}
# If already extracted, use the extracted file
if(stringr::str_detect(pattern = USGS_fileName,
# Select the string that matches only (avoids a warning but works without)
string = USGS_loc)){
writeLines(paste(" - Reading in data file. This should not take too long.","\n",
"There may be some errors upon reading in depending on the state of the data.",
"\n", "One might consider reporting errors to Sam Droege to improve the dataset."))
# Read in the data file using "$" as the delimiter
USGS_data <- readr::read_delim(USGS_loc[[1]],
delim = "$")
# Make a copy of the problems, if they exist
USGS_problems <- readr::problems(USGS_data)
}
#### Metadata building ####
Attributes_USGS <- dplyr::tibble(
dataSource = "USGS_data",
alternateIdentifier = "Not provided",
title = "USGS_DRO database",
pubDate = lubridate::dmy(pubDate),
dateStamp = pubDate,
doi = "Not provided",
downloadLink = "Not provided, contact Sam Droege at sdroege@usgs.gov",
abstract = dplyr::lst(
"Note: If you are getting data from Sam Droege, you are getting one of two file types. Either an Excel spreadsheet with data already extracted for you, or the entire data set which is compressed into a file called USGS_DRO_flat.gz. The file USGS_DRO_flat.gz is how this file is usually shipped when we send the entire database. If you get this file it is compressed and needs to be uncompressed with 7-zip, gzip, or something similar ...after unzipping it becomes a very large txt file which can be imported into a database program for manipulation and searching. Data can also be extracted from BISON and GBIF and we assume the file structure below is reflected in their data also.
Be sure to check for spelling errors...we do global checks only every year or two.
Column Headings for USGS Bee Flat File (or an Excel Sheet if we send a subset)
ID. - Unique 6 digit specimen number with the database identifier (USGS_DRO) in front
name - Scientific name; ('Destroyed' = label destroyed without being used; 'NONBEE' = An insect that is not a bee, but was given a label; Blank = not yet identified, can indicate that specimen is waiting for identification, is lost, or was missed during data entry)
Sex - 'f' = female; 's' = male; 'u' = unknown
DeterminedBy - Who did the identification of the specimen
DeterminedWhen - Roughly when the identification was done
WhoScanned - Who scanned or entered the data
DateScanned - Date data were scanned
SpeciesNotes - Any notes about the species such as changes in identification
DateEntered - Date entered into the new database (done automatically)
COLLECTION.db - Collection event number. Up to 5 digit code preceded by database identifier (USGS_DRO) of the date, time, place of the collection event
gmt - Greenwich Mean Time of I think the time in which the Collection Event Number was created
Latitude - Latitude in decimal degrees
Longitude - Longitude in decimal degrees
accuracy - Degree of association of specimens to the Latitude/Longitude (1 = Specimens from the level of a country/state; 2 = specimens found in area of a county; 3 = specimens found in area of a park or refuge; 4 and 5 = Specimens located very close to coordinants
elevation - Elevation of location (rarely used)
country - Country where collection occurred
state - State where collection occurred
county - County where collection occurred
city - Nearest city or the geographic unit (i.e. Park or refuge) where collected
site - A site name, number, or designation within a city/park
position - similar to \"site\" but used less often
time1 - Date/time collection started or traps put out (format is yyyymmddmmss)
time2 - Date/time collection ended or traps picked up
days - Number of trapping days (used inconsistently)
who - The collector
email - Almost always Sam Droege's OLD email address
how0 - Technique used to capture bees
how1 - Number of traps that remained full of trap fluid when traps picked up
how2 - Bowl/ trap size
how3 - Trap color (rarely used since mostly we use several colors)
how4 - Trap liquid/soap
habitat - Rarely used
field_note - 'field_note' and 'note' used interchangeably to take notes about collection event
note - 'field_note' and 'note' used interchangeably to take notes about collection event"
),
citations = dplyr::lst("Citations not provided"),
downloadCitation = paste("Sam Droege. (",
lubridate::as_date(pubDate, format = "%m-%d-%y"),
"). United States Geological Survey bee data.",
sep = ""),
rights = dplyr::lst("Rights are not provided. Please seek permission for data use from Same Droege.")
) # END metadata
# combine the input eml and the attributes tibble into a list for output from the function
EML_attributes <- dplyr::lst("No_eml_from_USGS", Attributes_USGS)
names(EML_attributes) <- c("source_eml","Source_tibble")
#### Format the data ####
writeLines(" - Formatting the USGS dataset...")
writeLines(" - Formatting the dateTime...")
# Convert time1 and time2 to dateTime format
# Convert time1
USGS_data$time1 <- USGS_data$time1 %>%
lubridate::ymd_hms(., truncated = 5)
# Convert time2
USGS_data$time2 <- USGS_data$time2 %>%
lubridate::ymd_hms(., truncated = 5)
writeLines(" - Creating samplingProtocol and samplingEffort columns...")
# Create new columns with extra information that doesn't fit the established columns well
# Merge all of the extra collection info
USGS_data <- USGS_data %>% dplyr::mutate(
samplingProtocol = stringr::str_c(
dplyr::if_else(!is.na(how0),
paste0("Technique used: ", how0), ""),
dplyr::if_else(!is.na(how1),
paste0("Bowls full upon collection: ", how1), ""),
dplyr::if_else(!is.na(how2),
paste0("Sampling Bowl/trap size: ", how2), ""),
dplyr::if_else(!is.na(how3),
paste0("Trap colour: ", how3), ""),
dplyr::if_else(!is.na(how4),
paste0("Trap liquid: ", how4), ""),
sep = "|") %>%
# Remove extra bars.
# Remove extra bars.
stringr::str_replace_all(pattern = "(\\|){2,5}",
replacement = "\\|") %>%
stringr::str_replace_all(pattern = "(\\|$)+|(^\\|)+",
replacement = "")) %>%
# Add in samplingEffort
dplyr::mutate(
samplingEffort = stringr::str_c(
dplyr::if_else(!is.na(days),
paste0("sampling days: ", days), ""),
sep = "")) %>%
# Add in dataset information
dplyr::mutate(
datasetName = "USGS_DRO database",
datasetID = "USGS_DRO",
institutionCode = "USGS"
)
writeLines(" - Creating the fieldNotes and dataSource columns...")
# Enter these extra data into a new column.
USGS_data <- USGS_data %>% dplyr::mutate(
fieldNotes = stringr::str_c(
dplyr::if_else(!is.na(field_note),
paste0("field_note: ", field_note), ""),
dplyr::if_else(!is.na(note),
paste0("note: ", note), ""),
dplyr::if_else(!is.na(SpeciesNotes),
paste0("SpeciesNotes: ", SpeciesNotes), ""),
dplyr::if_else(!is.na(DateEntered),
paste0("DateEntered: ", DateEntered), ""),
dplyr::if_else(!is.na(DateScanned),
paste0("DateScanned: ", DateScanned), ""),
dplyr::if_else(!is.na(ip),
paste0("ipAddress: ", ip), ""),
dplyr::if_else(!is.na(position),
paste0("position: ", position), ""),
dplyr::if_else(!is.na(time1),
paste0("time1: ", time1), ""),
dplyr::if_else(!is.na(time2),
paste0("time2: ", time2), ""),
sep = "|") %>%
# Remove extra bars.
stringr::str_replace_all(pattern = "(\\|){2,9}",
replacement = "\\|") %>%
stringr::str_replace_all(pattern = "(\\|$)+|(^\\|)+",
replacement = ""))
# Trim white spaces
# stringr::str_trim(side = "both")
# Set the dataSource
USGS_data$dataSource <- "USGS_data"
writeLines(" - Renaming and selecting columns...")
# These data must be formatted to match the other data sets.
# that we created at the top of the R-script
USGS_data <- USGS_data %>% # The data frame to match with
dplyr::rename("occurrenceID" = "ID.",
"scientificName" = "name",
"sex" = "sex",
"identifiedBy" = "DeterminedBy",
"dateIdentified" = "DeterminedWhen",
"eventID" = "COLLECTION.db",
"eventTime" = "gmt",
"decimalLatitude" = "latitude",
"decimalLongitude" = "longitude",
"coordinateUncertaintyInMeters" = "accuracy",
"stateProvince" = "state",
"municipality" = "city",
"Location" = "site",
"recordedBy" = "who",
"eventDate" = "time1") %>%
dplyr::mutate(
id = occurrenceID,
recordId = occurrenceID,
) %>%
# select columns that match the following string
dplyr::select( dplyr::matches(
# Use the carrot "^" to signify start of string and dollar sign "$" to signify end of
# string. Effectively, this will only return an exact match.
paste("^",ColsToKeep,"$",sep="") ))
#### Save data ####
# Check for and create outpath if needed
outPath <- outFile_maker(path = path)
# Notfiy user that occurrence file is being written
writeLines( paste(" - Writing occurrence data file...", "\n",
"Number of rows (records): ", format(nrow(USGS_data), big.mark=",",scientific=FALSE), "\n",
"Written to file called ", paste("USGS_formatted_", Sys.Date(), ".csv", sep = ""),
" at location ", outPath,
sep = ""))
# Write the occurence file
write_excel_csv(USGS_data, paste(outPath, "/USGS_formatted_", Sys.Date(), ".csv", sep = ""))
# Notify user that the .eml file is being written
writeLines( paste(" - Writing attributes file...", "\n",
"Written to file called ", paste("USGS_attribute_files", Sys.Date(),".xml", sep="" ),
" at location ", outPath,
sep = ""))
# Format the attributes for exporting lists
EML_attributes$Source_tibble$abstract <- EML_attributes$Source_tibble$abstract %>% unlist()
EML_attributes$Source_tibble$citations <- EML_attributes$Source_tibble$citations %>% unlist()
EML_attributes$Source_tibble$rights <- EML_attributes$Source_tibble$rights %>% unlist()
# Write the attribute file
write_excel_csv(EML_attributes$Source_tibble, file = paste(outPath,
"/USGS_attribute_files",
Sys.Date(),".csv", sep="" ))
# IF there were problems detected, write these to a .csv file and notify the user
if(nrow(USGS_problems) > 0){
writeLines(" - Problems detected with the tibble. Saving to a .csv file...")
write_excel_csv(USGS_problems, file = paste(outPath, "/USGS_problems",
Sys.Date(),".csv", sep="" ))
}
# Return end product and print completion note
writeLines(paste(" - Fin.", sep = "\n"))
return( dplyr::lst(USGS_data, EML_attributes) )
} # END USGS_import
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.