#' Read in data
#'
#' Compile data from state-exported text files by providing a path to the download directory.
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom dplyr pull
#' @importFrom dplyr group_by
#' @importFrom dplyr n
#' @importFrom dplyr ungroup
#' @importFrom dplyr select
#' @importFrom dplyr distinct
#' @importFrom dplyr rename
#' @importFrom dplyr cur_group_id
#' @importFrom dplyr row_number
#' @importFrom purrr map
#' @importFrom purrr list_rbind
#' @importFrom readr read_fwf
#' @importFrom readr fwf_widths
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove
#'
#' @param path File path to the folder containing HIP .txt files
#' @param unique Return a distinct frame? Defaults to TRUE
#' @param state When specified, reads in download data from a specified state. Must match one of the following two-letter abbreviations:
#' \itemize{
#' \item AL, AK, AZ, AR, CA, CO, CT, DE, DC, FL, GA, ID, IL, IN, IA, KS, KY, LA, ME, MD, MA, MI, MN, MS, MO, MT, NE, NV, NH, NJ, NM, NY, NC, ND, OH, OK, OR, PA, RI, SC, SD, TN, TX, UT, VT, VA, WA, WV, WI, WY}
#' @param season If set as TRUE, selects only folders starting with "DL" in a a season's upper-level directory
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
#'
#' @export
read_hip <-
function(path, unique = TRUE, state = NA, season = FALSE) {
# Add a final "/" if not included already
if(!str_detect(path, "\\/$")) {
path <- paste0(path, "/")
}
# Error for possibly wrong path
if(str_detect(path, "DL") & season == TRUE) {
message("Are you sure you supplied a season path?")
}
# Fail if incorrect unique supplied
stopifnot(
"Please supply TRUE or FALSE for `unique`." =
unique %in% c(TRUE, FALSE, T, F))
# Fail if incorrect state supplied
stopifnot(
"Use a 2-letter abbreviation for `state`, e.g. 'DE'." =
state %in% c(NA, REF_ABBR_49_STATES))
# Fail if incorrect season supplied
stopifnot(
"Please supply TRUE or FALSE for `season`." =
season %in% c(TRUE, FALSE, T, F))
# Create a tibble of the HIP .txt files to be read from the provided
# directory
file_list <-
listFiles(path, season) |>
# Don't process permit files
ignorePermits() |>
# Don't process hold files
ignoreHolds() |>
# Don't process lifetime files
ignoreLifetime() |>
# Identify blank files
idBlankFiles() |>
# Drop blank files
dropBlankFiles()
# Filter files to include only specified state (state param NA by default)
if (!is.na(state)) {
file_list <- filter(file_list, str_detect(filepath, state))
}
# Create a vector of file paths
file_list_vector <- file_list$filepath
# Stop if there are no files to read in
stopifnot("No file(s) to read in." = length(file_list_vector) != 0)
# Stop if any file name date is formatted as MMDDYYYY or DDMMYYYY
date_test <- checkFileNameDateFormat(file_list_vector)
stopifnot("Incorrect date format in file name." = is.null(date_test))
# Stop if any file name contains a state abbreviation not found in the list
# of 49 continental US states
state_test <- checkFileNameStateAbbr(file_list_vector)
stopifnot("Bad state abbreviation in file name." = is.null(state_test))
# Read in HIP data
raw_data <-
map(
1:length(file_list_vector),
function(i) {
# Compile each state's file into one table
read_fwf(
file_list_vector[i],
fwf_widths(c(1, 15, 1, 20, 3, 60, 20, 2, 10, 10, 10,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, NA)),
col_types = "cccccccccccccccccccccccc",
na = c("N/A", "")) |>
mutate(
# Add the download state as a column
dl_state =
str_extract(
file_list_vector[i], "[A-Z]{2}(?=[0-9]{8}\\.txt)"),
# Add the download date as a column
dl_date =
str_extract(
file_list_vector[i], "(?<=[A-Z]{2})[0-9]{8}(?=\\.txt)"),
# Add the source file as a column
source_file =
str_remove(file_list_vector[i], path),
# Add the download cycle as a column
dl_cycle =
str_extract(file_list_vector[i], "(?<=DL).+(?=\\/)"))
}) |>
# Row bind data from each file into one tibble
list_rbind() |>
# Rename columns
rename(
title = 1,
firstname = 2,
middle = 3,
lastname = 4,
suffix = 5,
address = 6,
city = 7,
state = 8,
zip = 9,
birth_date = 10,
issue_date = 11,
hunt_mig_birds = 12,
ducks_bag = 13,
geese_bag = 14,
dove_bag = 15,
woodcock_bag = 16,
coots_snipe = 17,
rails_gallinules = 18,
cranes = 19,
band_tailed_pigeon = 20,
brant = 21,
seaducks = 22,
registration_yr = 23,
email = 24) |>
# Add a download key
group_by(dl_date, dl_state) |>
mutate(dl_key = paste0("dl_", cur_group_id())) |>
ungroup()
# Remove exact duplicates
if (unique == TRUE) {
raw_data <-
raw_data |>
distinct() |>
# Add a record key
mutate(record_key = paste0("record_", row_number()))
}
# Return messages to console for important issues
readMessages(raw_data)
return(raw_data)
}
#' List files
#'
#' The internal \code{listFiles} function is used inside of \code{\link{read_hip}} and creates a tibble of the HIP .txt files to be read in from the provided directory.
#'
#' @importFrom dplyr tibble
#'
#' @param path File path to the folder containing HIP .txt files
#' @param season If set as TRUE, selects only folders starting with "DL" in a a season's upper-level directory
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
listFiles <-
function(path, season) {
# Create a tibble of the HIP .txt files to be read from the provided
# directory
tibble(
filepath =
list.files(
path,
recursive = {{season}},
pattern = "*\\.txt$",
ignore.case = TRUE,
full.names = TRUE)
)
}
#' Ignore permit files
#'
#' The internal \code{ignorePermits} function is used inside of \code{\link{read_hip}} to filter out permit files from the file list.
#'
#' @importFrom dplyr filter
#' @importFrom stringr str_detect
#'
#' @param filelist The file list tibble created by \code{\link{listFiles}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
ignorePermits <-
function(filelist) {
# Don't process permit files
filelist |>
filter(!str_detect(filepath, "permit"))
}
#' Ignore hold files
#'
#' The internal \code{ignoreHolds} function is used inside of \code{\link{read_hip}} to filter out hold files from the file list.
#'
#' @importFrom dplyr filter
#' @importFrom stringr str_detect
#'
#' @param filelist The file list tibble created by \code{\link{listFiles}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
ignoreHolds <-
function(filelist) {
# Don't process hold files
filelist |>
filter(!str_detect(filepath, "hold"))
}
#' Ignore lifetime files
#'
#' The internal \code{ignoreLifetime} function is used inside of \code{\link{read_hip}} to filter out lifetime files from the file list.
#'
#' @importFrom dplyr filter
#' @importFrom stringr str_detect
#'
#' @param filelist The file list tibble created by \code{\link{listFiles}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
ignoreLifetime <-
function(filelist) {
# Don't process lifetime files
filelist |>
filter(!str_detect(filepath, "lifetime"))
}
#' Identify blank files
#'
#' The internal \code{idBlankFiles} function is used inside of \code{\link{read_hip}} to identify files in the list that contain no data.
#'
#' @importFrom dplyr mutate
#' @importFrom stringr str_replace
#'
#' @param filelist The file list tibble created by \code{\link{listFiles}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
idBlankFiles <-
function(filelist) {
# Identify blank files
filelist |>
mutate(
filepath = str_replace(filepath, "TXT", "txt"),
check = ifelse(file.size(filepath) == 0, "blank", ""))
}
#' Drop blank files
#'
#' The internal \code{dropBlankFiles} function is used inside of \code{\link{read_hip}} to return an error message if blank files exist in the directory, and remove them from the file list so they are not read in.
#'
#' @importFrom dplyr filter
#' @importFrom dplyr pull
#'
#' @param filelist The file list tibble created by \code{\link{listFiles}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
dropBlankFiles <-
function(filelist) {
# Error for blank files
if("blank" %in% filelist$check) {
message("Error: One or more files are blank in the directory.")
print(filter(filelist, check == "blank"))
}
# Filter out blank files from the paths list
filelist_without_blanks <-
filelist |>
filter(check != "blank")
return(filelist_without_blanks)
}
#' Check HIP file name date formatting
#'
#' The internal \code{checkFileNameDateFormat} function is used inside of \code{\link{read_hip}} to return an error message if any file does not have a date formatted as YYYYMMDD.
#'
#' @importFrom stringr str_extract
#' @importFrom stringr str_detect
#'
#' @param file_list_vector A file list vector
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
checkFileNameDateFormat <-
function(file_list_vector) {
# Extract all dates from file names in the file_list_vector
dl_date_test <-
str_extract(file_list_vector, "(?<=[A-Z]{2})[0-9]{8}(?=\\.txt)")
if (FALSE %in% str_detect(dl_date_test, "^202") &
TRUE %in% str_detect(dl_date_test, "^[0-9]{4}202")) {
message(
paste0(
"Error: MMDDYYYY or DDMMYYYY format suspected in dl_date.",
" Please fix the source file name(s)."
)
)
bad_dates <- dl_date_test[str_detect(dl_date_test, "^[0-9]{4}202") &
!str_detect(dl_date_test, "^202")]
for (i in 1:length(bad_dates)) {
print(
file_list_vector[str_detect(file_list_vector, bad_dates[i])]
)
}
return("error")
} else {
return(NULL)
}
}
#' Check HIP file name state abbreviations
#'
#' The internal \code{checkFileNameStateAbbr} function is used inside of \code{\link{read_hip}} to return an error message if any file does not have an state abbreviation from the expected 49 continental states.
#'
#' @importFrom stringr str_extract
#'
#' @param file_list_vector A file list vector
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
checkFileNameStateAbbr <-
function(file_list_vector) {
# Extract all state abbreviations from file names in the file_list_vector
state_test <-
unique(str_extract(file_list_vector, "[A-Z]{2}(?=[0-9]{8}\\.txt$)"))
# Return a message if there is a dl_state not found in the list of 49
# continental US states
if (FALSE %in%
(state_test %in% REF_ABBR_49_STATES)
) {
message(
paste(
"Error: One or more files contains a state abbreviation not in the",
"list of expected 49 continental US states."
)
)
print(state_test[!state_test %in% REF_ABBR_49_STATES])
return("error")
} else {
return(NULL)
}
}
#' Return messages to console for common or catastrophic read_hip issues
#'
#' The internal \code{readMessages} function is used inside of \code{\link{read_hip}} to return messages for missing PII, missing email addresses, all-zero bag records, non-numeric bag values, NAs in dl_state, and NAs in dl_date.
#'
#' @param raw_data The product of \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
readMessages <-
function(raw_data) {
# Return a message for records with blank or NA values in firstname,
# lastname, state, or birth date
missingPIIMessage(raw_data)
# Return a message if all emails are missing from a file
missingEmailsMessage(raw_data)
# Return a message if "TEST" is found in firstname or lastname field
testRecordMessage(raw_data)
# Return a message if any record contains all-zero bag values
zeroBagsMessage(raw_data)
# Return a message if any record contains all-NA bag values
naBagsMessage(raw_data)
# Return a message if any record contains a bag value that is not a
# 1-digit number
nonDigitBagsMessage(raw_data)
# Return a message if there is an NA in dl_state
dlStateNAMessage(raw_data)
# Return a message if there is an NA in dl_date
dlDateNAMessage(raw_data)
# Return a message if in-line permit does not have hunt_mig_birds == 2
inLinePermitDNHMessage(raw_data)
}
#' Return message for records with blank or NA values in firstname, lastname, state, or birth date
#'
#' The internal \code{missingPIIMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr group_by
#' @importFrom dplyr mutate
#' @importFrom dplyr n
#' @importFrom dplyr ungroup
#' @importFrom dplyr filter
#' @importFrom dplyr reframe
#' @importFrom dplyr distinct
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
missingPIIMessage <-
function(raw_data) {
# Return a message for records with blank or NA values in firstname,
# lastname, state, or birth date
raw_nas <-
raw_data |>
group_by(dl_state) |>
mutate(n_total = n()) |>
ungroup() |>
filter(
!!LOGIC_MISSING_PII |
!!LOGIC_MISSING_ADDRESSES |
!!LOGIC_MISSING_CITY_ZIP_EMAIL) |>
group_by(dl_state) |>
reframe(n = n(), proportion = round(n/n_total, 2)) |>
distinct() |>
filter(n >= 100 | proportion >= 0.1)
if (nrow(raw_nas) > 0) {
message(
paste(
"Error: NA values detected in one or more ID fields",
"(firstname, lastname, state, birth date) for >10% of a file",
"and/or >100 records."
)
)
print(raw_nas)
}
}
#' Return message if all emails are missing from a file
#'
#' The internal \code{missingEmailsMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr group_by
#' @importFrom dplyr summarize
#' @importFrom dplyr ungroup
#' @importFrom dplyr filter
#' @importFrom dplyr select
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
missingEmailsMessage <-
function(raw_data) {
# Return a message if all emails are missing from a file
missing <-
raw_data |>
group_by(source_file) |>
summarize(n_emails = length(unique(email))) |>
ungroup() |>
filter(n_emails == 1)
if (nrow(missing) > 0) {
message("Error: One or more files are missing 100% of emails.")
print(missing |> select(source_file))
}
}
#' Return message if test record is found
#'
#' The internal \code{testRecordMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr filter
#' @importFrom dplyr select
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
testRecordMessage <-
function(raw_data) {
# Return a message if test records are found
bad_test_records <-
raw_data |>
# Convert firstname, lastname, and suffix to upper case
namesToUppercase() |>
# Identify test record through searching first name and last name
filter(!!LOGIC_TEST_RECORD)
if (nrow(bad_test_records) > 0) {
message(
paste(
"Error: One or more records contain 'TEST' in first name and last",
"name fields."
)
)
print(
bad_test_records |>
select(source_file, record_key, firstname, lastname))
}
}
#' Return message if any record has a "0" in every bag field
#'
#' The internal \code{zeroBagsMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr filter
#' @importFrom dplyr if_all
#' @importFrom dplyr all_of
#' @importFrom dplyr select
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
zeroBagsMessage <-
function(raw_data) {
# Return a message if any record has a "0" in every bag field
zero_bags <-
raw_data |>
# Find any records that have a "0" in every bag field
filter(!!LOGIC_ZERO_BAGS)
if (nrow(zero_bags) > 0) {
message(
paste(
"Error: One or more records has a '0' in every bag field; these",
"records will be filtered out in clean()."
)
)
print(zero_bags |> select(source_file, record_key))
}
}
#' Return message if any record has an NA in every bag field
#'
#' The internal \code{naBagsMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr filter
#' @importFrom dplyr if_all
#' @importFrom dplyr all_of
#' @importFrom dplyr select
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
naBagsMessage <-
function(raw_data) {
# Return a message if any record has an NA in every bag field
NA_bags <-
raw_data |>
# Find any records that have an NA in every bag field
filter(if_all(all_of(REF_BAG_FIELDS), \(x) is.na(x)))
if (nrow(NA_bags) > 0) {
message("Error: One or more records has an NA in every bag field.")
print(NA_bags |> select(source_file, record_key))
}
}
#' Return message if any record contains a bag value that is not a 1-digit number
#'
#' The internal \code{nonDigitBagsMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr filter
#' @importFrom dplyr if_any
#' @importFrom dplyr all_of
#' @importFrom stringr str_detect
#' @importFrom tidyr unite
#' @importFrom dplyr matches
#' @importFrom dplyr select
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
nonDigitBagsMessage <-
function(raw_data) {
# Return a message if any record contains a bag value that is not a 1-digit
# number
nondigit_bags <-
raw_data |>
filter(!!LOGIC_NONDIGIT_BAGS)
if (nrow(nondigit_bags) > 0) {
message(
paste(
"Error: One or more records detected with a value other than a",
"single digit; these records will be filtered out in clean()."
)
)
print(
nondigit_bags |>
unite(bags, matches(REF_BAG_FIELDS), sep = " ") |>
select(source_file, record_key, bags)
)
}
}
#' Return message if there is an NA in dl_state
#'
#' The internal \code{dlStateNAMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr distinct
#' @importFrom dplyr filter
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
dlStateNAMessage <-
function(raw_data) {
# Return a message if there is an NA in dl_state
if (TRUE %in% is.na(raw_data$dl_state)) {
message("Error: One or more more NA values detected in dl_state.")
print(raw_data |>
distinct(dl_state, source_file) |>
filter(is.na(dl_state)))
}
}
#' Return message if there is an NA in dl_date
#'
#' The internal \code{dlDateNAMessage} function is used inside of \code{\link{readMessages}}
#'
#' @importFrom dplyr select
#' @importFrom dplyr distinct
#' @importFrom dplyr filter
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
dlDateNAMessage <-
function(raw_data) {
# Return a message if there is an NA in dl_date
if (TRUE %in% is.na(raw_data$dl_date)) {
message("Error: One or more more NA values detected in dl_date.")
print(raw_data |>
select(dl_date, source_file) |>
filter(is.na(dl_date)) |>
distinct())
}
}
#' In-line permit did-not-hunt message
#'
#' The internal \code{inLinePermitDNHMessage} function returns a message for in-line permit records from OR or WA that indicate they did not hunt.
#'
#' @importFrom dplyr filter
#' @importFrom dplyr count
#'
#' @inheritParams readMessages
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
inLinePermitDNHMessage <-
function(raw_data) {
# If any OR or WA hunt_mig_birds != "2" for presumed solo permit, return a
# message
inline_pmt_dnh <-
raw_data |>
filter(!is.na(band_tailed_pigeon) &
!is.na(brant) &
!is.na(seaducks)) |>
filter(!!LOGIC_INLINE_PMT_DNH) |>
count(source_file, hunt_mig_birds, band_tailed_pigeon, brant, seaducks)
if (nrow(inline_pmt_dnh) > 0) {
message(
paste(
"Error:", sum(inline_pmt_dnh$n), "in-line permit records",
"from OR and/or WA do not contain hunt_mig_birds == 2; they will be",
"edited in clean()."
)
)
print(inline_pmt_dnh)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.