#' Flag errors
#'
#' After cleaning the data with \code{\link{clean}}, compare each field to an expected range of values and flag non-conforming values in a new "errors" column.
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr select
#' @importFrom tibble as_tibble
#' @importFrom dplyr row_number
#' @importFrom dplyr bind_rows
#' @importFrom dplyr distinct
#' @importFrom dplyr filter
#' @importFrom dplyr case_when
#' @importFrom dplyr left_join
#' @importFrom dplyr reframe
#' @importFrom stringr str_detect
#' @importFrom dplyr group_by
#' @importFrom dplyr ungroup
#' @importFrom dplyr n
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove_all
#' @importFrom stringr str_length
#'
#' @param x The object created after cleaning data with \code{\link{clean}}
#' @param year The year in which the Harvest Information Program data were collected
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
#'
#' @export
proof <-
function(x, year){
# Combine US State, District and Territory abbreviations with Canada
# abbreviations
states_provinces_and_canada <-
paste(c(datasets::state.abb, abbr_usa, abbr_canada), collapse = "|")
# Create a record key so that the errors can be joined in later
keyed_x <-
x |>
mutate(temp_key = paste0("row_", row_number()))
markup <-
bind_rows(
# Title should be 1 or 2, no other values
keyed_x |>
filter(!str_detect(title, "0|1|2")) |>
mutate(error = "title"),
# First name
keyed_x |>
mutate(
error =
case_when(
# First name should be >1 letter
str_detect(firstname, "^[A-Z]{1}$") ~ "firstname",
# First name should not contain 1 letter followed by a name
str_detect(firstname, "^[A-Z]{1}\\s[A-Z]+$") ~ "firstname",
# First name should not contain a name followed by 1 letter
str_detect(firstname, "^[A-Z]+\\s[A-Z]{1}$") ~ "firstname",
# Only non-alpha characters allowed are spaces, apostrophes, and
# hyphens. No numbers, commas, parentheses, etc allowed
str_detect(firstname, "[^A-Z\\s\\-\\']") ~ "firstname",
# No full names (detect using 2+ spaces)
str_detect(
firstname, "^[A-Z]+\\s[A-Z]+\\s[A-Z]+$") ~ "firstname",
# Should not be "blank" or "inaudible"
str_detect(
firstname, "^[INAUDIBLE|BLANK|TEST|USER|RESIDENT]$") ~
"firstname",
TRUE ~ NA_character_)),
# Middle name should only be 1 letter of the alphabet
keyed_x |>
filter(!str_detect(middle, "^[A-Z]{1}$")) |>
mutate(error = "middle"),
# Last name
keyed_x |>
mutate(
error =
case_when(
# Last name should be >1 letter
str_detect(lastname, "^[A-Z]{1}$") ~ "lastname",
# Only non-alpha characters allowed are spaces, periods,
# hyphens, and apostrophes. No numbers, commas, parentheses, etc
str_detect(lastname, "[^A-Z\\s\\-\\.\\']") ~ "lastname",
# No full names (detect using 2+ spaces)
str_detect(
lastname, "^[A-Z]+\\s[A-Z]+\\s[A-Z]+$") ~ "lastname",
# Should not be "inaudible"
str_detect(lastname, "^INAUDIBLE$") ~ "lastname",
TRUE ~ NA_character_)),
# Suffix
# Allows 1-20 in Roman numerals and numeric, excluding XVIII since the
# limit is 4 characters)
keyed_x |>
filter(
str_detect(
suffix,
"[^JR|SR|I{1,3}|IV|VI{0,3}|I{0,1}X|XI{1,3}|XI{0,1}V|XVI{1,2}|XI{0,1}X|1ST|2ND|3RD|[4-9]TH|1[0-9]TH|20TH]")) |>
mutate(error = "suffix"),
# Address does not contain |, tab or non-UTF8 characters
# Any further address verification isn't really possible
keyed_x |>
filter(str_detect(address, "\\||\\t|[^\\x00-\\x7F]+")) |>
mutate(error = "address"),
# City should only contain letters, spaces (e.g. New York City), hyphens
# (e.g. Winston-Salem, NC), or apostrophes (e.g. HI residents and
# O'Fallon, MO)
keyed_x |>
filter(str_detect(city, "[^A-Za-z\\s\\-\\']")) |>
mutate(error = "city"),
# City MUST contain at least 3 letters
keyed_x |>
filter(str_detect(city, "^[A-Za-z]{1,2}$")) |>
mutate(error = "city"),
# State should only contain a specific list of states/provinces/etc
keyed_x |>
filter(!str_detect(state, states_provinces_and_canada)) |>
mutate(error = "state"),
# Zip code should be in the reference table
keyed_x |>
filter(!str_extract(zip, "^[0-9]{5}") %in% zip_code_ref$zipcode) |>
mutate(error = "zip"),
# Birth date should only ever be between 100 and 16 years go, since
# hunters do not have to register if less than 16 years of age & years
# from earlier than 100 years ago are unlikely
keyed_x |>
filter(
as.numeric(
str_extract(birth_date, "(?<=\\/)[0-9]{4}")) < year - 100 |
as.numeric(
str_extract(birth_date, "(?<=\\/)[0-9]{4}")) > year - 16) |>
mutate(error = "birth_date"),
# Hunting migratory birds should only be = 1 or 2
keyed_x |>
filter(!hunt_mig_birds %in% c("1", "2")) |>
mutate(error = "hunt_mig_birds"),
# Registration year should = survey year
keyed_x |>
filter(registration_yr != year) |>
mutate(error = "registration_yr"),
# Email
keyed_x |>
mutate(email = tolower(email)) |>
filter(
# If email doesn't fit a loose validation regular expression, mark
# as error. Local part may contain Latin lower and uppercase
# letters, numbers, underscores, dots, hyphens, and a plus sign
# (consecutive dots, leading dots, etc all handled in correct
# function even if not marked as error); must contain @; domain may
# contain Latin lower and uppercase letters, numbers, and hyphens;
# subdomains acceptable when separated by a dot.
!str_detect(
email,
"^[a-zA-Z0-9\\_\\.\\+\\-]+\\@[a-zA-Z0-9\\-]+\\.[a-zA-Z0-9\\-\\.]+$"
) |
# If email is obfuscative
str_detect(
email,
"^(none|no|na|not|non|www\\.none|nomail|noemail|noreply|customer|unknown|notprovided)\\@"
) |
str_detect(email, "\\@none") |
str_detect(email, "\\@(no\\.com|na\\.org)$") |
# Obfuscative Texas emails from @tpw or @tpwd
str_detect(email, "\\@(tpw|twp)") |
# If domain is invalid
str_detect(email, "\\@example.com$") |
# If longer than 100 characters (max length of valid address is
# 254 but this would be very rare)
str_length(email) > 100 |
# If there are multiple .
str_detect(email, "\\.\\.+") |
# If there is a dot in the place of the first character
str_detect(email, "^\\.") |
# If there is a dot in the place of last character in local part
str_detect(email, "\\.(?=\\@)") |
# If dot is last character
str_detect(email, "\\.$") |
# Hyphen in first place of domain
str_detect(email, "(?<=\\@)\\-") |
# Bad top level domain
str_detect(email, "(?<=\\@)gmail\\.(co|net|edu|org)$") |
str_detect(email, "(?<=\\@)att\\.(com|org)$") |
str_detect(email, "(?<=\\@)comcast\\.(com|org)$") |
str_detect(email, "(?<=\\@)icloud\\.(net|org)$") |
str_detect(
email,
"(?<=\\.)(com(\\.com)+|com(com)+|con|ccom|coom|comm|c0m|ocm|cm|om|cim|common)$"
)
) |>
mutate(error = "email")
) |>
select(temp_key, error)
graded_x <-
keyed_x |>
# Join in the error report
left_join(markup, by = "temp_key") |>
group_by(temp_key) |>
# Paste errors together (some records might have more than one!)
mutate(errors = paste(error, collapse = "-")) |>
ungroup() |>
select(-c(temp_key, error)) |>
distinct() |>
# Make the NAs "real NAs" not just strings
mutate(
errors =
ifelse(
str_detect(errors, "NA\\-|\\-NA"),
str_remove_all(errors, "NA\\-|\\-NA"),
errors)) |>
# Add a second mutate here because we cannot pipe '.'
mutate(errors = ifelse(str_detect(errors, "^NA$"), NA, errors)) |>
as_tibble()
# Proof the zip codes. Are they associated with the correct states?
graded_x <-
graded_x |>
left_join(
zip_code_ref |>
distinct(zip = zipcode, zipState = state),
by = "zip") |>
# Add an error if the state doesn't match zipState
mutate(
errors =
case_when(
state != zipState & is.na(errors) ~ "zip",
state != zipState & !is.na(errors) & !str_detect(errors, "zip") ~
paste0(errors, "-zip"),
TRUE ~ errors)
) |>
select(-zipState)
return(graded_x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.