R/clean.R

Defines functions strataFix zipCheck special_OregonHuntYCheck formatZip fixMiddleInitials moveSuffixes missingPIIFilter bagsFilter namesToUppercase clean

Documented in bagsFilter clean fixMiddleInitials formatZip missingPIIFilter moveSuffixes namesToUppercase special_OregonHuntYCheck strataFix zipCheck

#' Clean data
#'
#' After reading the data with \code{\link{read_hip}}, reformat and clean the HIP registrations.
#'
#' @importFrom dplyr mutate_all
#' @importFrom stringr str_trim
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
#'
#' @export

clean <-
  function(raw_df) {

    # Convert firstname, lastname, and suffix to upper case
    names_uppercased <- namesToUppercase(raw_df)

    # Filter out any record if any bag value is not a 1-digit number
    bags_checked <- bagsFilter(names_uppercased)

    # Filter out records if firstname, lastname, city of residence, state of
    # residence, or date of birth are missing -- records discarded because these
    # are needed to identify individuals. Filter out any other additional
    # records if they are missing a value for email AND elements of a physical
    # address that are required to determine where to mail a letter.
    PII_checked <- missingPIIFilter(bags_checked)

    # Delete suffixes from the lastname field and/or firstname field and move
    # them to the suffix field. Catches values from 1-20 in Roman numerals and
    # numeric, excluding XVIII since the db limit is 4 characters. Delete
    # periods and commas from suffixes.
    suffixes_moved <- moveSuffixes(PII_checked)

    # Change any character that's not a letter to NA in the from middle
    # initial field
    middle_initials_fixed <- fixMiddleInitials(suffixes_moved)

    # Remove ending hyphen from zip codes with 5 digits
    # Remove final 0 from zip codes with length of 10 digits
    # Insert a hyphen in continuous 9 digit zip codes
    # Insert a hyphen in 9 digit zip codes with a middle space
    # Remove trailing -0000
    # Remove trailing -___
    zips_formatted <- formatZip(middle_initials_fixed)

    # Check that the zip code for each address is associated with the correct
    # state
    zipCheck(zips_formatted)

    # If any OR HuntY = 0 for solo permit, change HuntY to 2
    oregon_hunty_checked <- special_OregonHuntYCheck(zips_formatted)

    # Delete white space around strings
    white_space_deleted <- oregon_hunty_checked |> mutate_all(str_trim)

    # If any permit file states submitted a 2 for crane and/or
    # band_tailed_pigeon, change the 2 to a 0
    permit_state_strata_fixed <- strataFix(white_space_deleted)

    return(permit_state_strata_fixed)
  }

#' Names to uppercase
#'
#' The internal \code{namesToUppercase} function converts name elements to uppercase for easier string cleaning.
#'
#' @importFrom dplyr mutate
#' @importFrom stringr str_to_upper
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

namesToUppercase <-
  function(raw_df) {

    names_to_uppercase <-
      raw_df |>
      # Convert name elements to uppercase for easier string cleaning
      mutate(
        firstname = str_to_upper(firstname),
        lastname = str_to_upper(lastname),
        suffix = str_to_upper(suffix))

    return(names_to_uppercase)
  }

#' Filter out bad bags
#'
#' The internal \code{bagsFilter} function filters out any record if any bag value is not a 1-digit number.
#'
#' @importFrom dplyr filter
#' @importFrom dplyr if_any
#' @importFrom tidyr all_of
#' @importFrom stringr str_detect
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

bagsFilter <-
  function(raw_df) {

    bags_checked <-
      # Filter out any record if any bag value is not a 1-digit number
      raw_df |>
      filter(!if_any(all_of(ref_bagfields), ~!str_detect(.x, "^[0-9]{1}$")))

    return(bags_checked)
  }

#' Missing PII filter
#'
#' The internal \code{missingPIIFilter} function filters out HIP registrations that are missing critical pieces of contact information.
#'
#' @importFrom dplyr filter
#' @importFrom dplyr if_all
#' @importFrom dplyr if_any
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

missingPIIFilter <-
  function(raw_df) {
    PII_checked <-
      raw_df |>
      # Filter out records if firstname, lastname, city of residence, state of
      # residence, or date of birth are missing -- records discarded because
      # these are needed to identify individuals
      filter(
        !if_any(
          c("firstname", "lastname", "state", "birth_date"), ~is.na(.x))) |>
      # Filter out any additional records if they are missing a value for email
      # AND elements of a physical address that are required to determine where
      # to mail a letter
      filter(!if_all(c("address", "email"), ~is.na(.x))) |>
      filter(!if_all(c("city", "zip", "email"), ~is.na(.x)))

    return(PII_checked)
  }

#' Move suffixes
#'
#' The internal \code{moveSuffixes} function moves suffixes from first name or last name columns into the suffix column and performs other cleaning steps. This function catches values from 1 to 20 in Roman numerals and numeric, excluding
# XVIII since the database limit is 4 characters.
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr case_when
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove_all
#' @importFrom stringr str_remove
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

moveSuffixes <-
  function(raw_df) {

    suffix_regex <-
      paste0(
        "(?<=\\s)(JR|SR|I{1,3}|IV|VI{0,3}|I{0,1}X|XI{1,3}|XI{0,1}V|XVI{1,2}|XI",
        "{0,1}X|1ST|2ND|3RD|[4-9]TH|1[0-9]TH|20TH)\\.?$")

    suffixes_moved <-
      raw_df |>
      mutate(
        # Extract suffixes from lastname and firstname cols to suffix col
        # Catches values from 1-20 in Roman numerals and numeric, excluding
        # XVIII since the db limit is 4 characters
        suffix =
          case_when(
            # Lastname
            str_detect(lastname, suffix_regex) ~
              str_extract(lastname, suffix_regex),
            # Firstname
            str_detect(firstname, suffix_regex) ~
              str_extract(firstname, suffix_regex),
            TRUE ~ suffix),
        # Delete periods and commas from suffixes
        suffix = str_remove_all(suffix, "\\.|\\,"),
        # Delete suffixes from lastname col (includes 1-20 in Roman numerals and
        # numeric, excluding XVIII since the db limit is 4 characters)
        lastname =
          ifelse(
            str_detect(lastname, suffix_regex),
            str_remove(lastname, suffix_regex),
            lastname),
        # Delete suffixes from firstname col (includes 1-20 in Roman numerals
        # and numeric, excluding XVIII since the db limit is 4 characters)
        firstname =
          ifelse(
            str_detect(firstname, suffix_regex),
            str_remove(firstname, suffix_regex),
            firstname))

    return(suffixes_moved)
  }

#' Fix middle initials
#'
#' The internal \code{fixMiddleInitials} function changes non-alphabetic characters in the middle initial column to NA.
#'
#' @importFrom stringr str_detect
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

fixMiddleInitials <-
  function(raw_df) {
    middle_initials_fixed <-
      raw_df |>
      # Change any character that's not a letter to NA in the from middle
      # initial field
      mutate(middle = ifelse(str_detect(middle, "[^A-Z]"), NA, middle))

    return(middle_initials_fixed)
  }

#' Format zip codes
#'
#' The internal \code{formatZip} function formats zip codes.
#'
#' @importFrom dplyr mutate
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove
#' @importFrom stringr str_replace
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

formatZip <-
  function(raw_df) {
    zips_formatted <-
      raw_df |>
      # Zip code format corrections
      mutate(
        zip =
          # Remove ending hyphen from zip codes with 5 digits
          ifelse(
            str_detect(zip, "^[0-9]{5}\\-$"),
            str_remove(zip, "\\-$"),
            zip),
        zip =
          # Remove final 0 from zip codes with length of 10 digits
          ifelse(
            str_detect(zip, "^[0-9]{10}$") &
              str_extract(zip, "[0-9]{1}(?=$)") == "0",
            str_remove(zip, "[0-9]{1}(?=$)"),
            zip),
        zip =
          # Insert a hyphen in continuous 9 digit zip codes
          ifelse(
            str_detect(zip, "^[0-9]{9}$"),
            paste0(
              str_extract(zip, "^[0-9]{5}"),
              "-",
              str_extract(zip,"[0-9]{4}$")),
            zip),
        zip =
          # Insert a hyphen in 9 digit zip codes with a middle space
          ifelse(
            str_detect(zip, "^[0-9]{5}\\s[0-9]{4}$"),
            str_replace(zip, "\\s", "\\-"),
            zip),
        zip =
          # Remove trailing -0000
          ifelse(
            str_detect(zip, "\\-0000"),
            str_remove(zip, "\\-0000"),
            zip),
        zip =
          # Remove trailing -___
          ifelse(
            str_detect(zip, "\\-\\_+"),
            str_remove(zip, "\\-\\_+"),
            zip)
      )

    return(zips_formatted)
  }

#' Oregon HuntY check
#'
#' The internal \code{special_OregonHuntYCheck} function changes any registration from Oregon with HuntY == "0" to "2" if one or more of the band_tailed_pigeon, brant, or seaducks fields indicate hunting.
#'
#' @importFrom dplyr mutate
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

special_OregonHuntYCheck <-
  function(raw_df) {
    oregon_hunty_checked <-
      raw_df |>
      # If any OR HuntY = 0 for solo permit, change HuntY to 2
      mutate(
        hunt_mig_birds =
          ifelse(
            dl_state == "OR" &
              hunt_mig_birds == "0" &
              sum(as.numeric(band_tailed_pigeon),
                  as.numeric(brant),
                  as.numeric(seaducks)) > 2,
            "2",
            hunt_mig_birds)
      )
  }

#' Check if zip codes are associated with the correct state
#'
#' The internal \code{zipCheck} function checks to see if zip codes in hunter addresses match the address state.
#'
#' @importFrom dplyr group_by
#' @importFrom dplyr mutate
#' @importFrom dplyr ungroup
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @importFrom dplyr left_join
#' @importFrom dplyr reframe
#' @importFrom dplyr n
#' @importFrom dplyr arrange
#' @importFrom dplyr desc
#' @importFrom dplyr distinct
#'
#' @param raw_df The object created after reading in data with \code{\link{read_hip}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

zipCheck <-
  function(raw_df) {
    # Proof the zip codes -- are they associated with the correct states?
    zipcheck <-
      raw_df |>
      left_join(
        zip_code_ref |>
          distinct(zip = zipcode, zipState = state),
        by = "zip") |>
      select(source_file, state, zip, zipState) |>
      group_by(source_file) |>
      mutate(total_records = n()) |>
      ungroup() |>
      filter(state != zipState) |>
      group_by(source_file) |>
      reframe(
        n = n(),
        prop = round(n/total_records, 2)) |>
      distinct() |>
      arrange(desc(n)) |>
      filter(n >= 100 | prop >= 0.1)

    # Error check: are any zip codes wrong?
    if(nrow(zipcheck) > 0){
      message(
        paste0("Warning: Zip codes detected that do not correspond to provided",
               " state of residence for >10% of a file and/or >100 records."))

      print(zipcheck)
    }
  }

#' Fix permit strata
#'
#' The internal \code{strataFix} function is used inside of \code{\link{clean}} to edit strata for states that submit permit files separately from HIP. If records from these states submit a "2" for the band_tailed_pigeon or crane field, they will be mistakenly identified as permit records. The \code{strataFix} function changes band_tailed_pigeon and/or crane "2" values to "0" so that they are classified as HIP records until permit files are received later in the hunting season.
#'
#' @importFrom dplyr filter
#' @importFrom dplyr count
#' @importFrom dplyr mutate
#' @importFrom dplyr bind_rows
#'
#' @param raw_df An intermediate object created inside of \code{\link{clean}}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

strataFix <-
  function(raw_df) {
    bad_bt_2s <-
      raw_df |>
      filter(
        dl_state %in%
          pmt_files$dl_state[pmt_files$spp == "band_tailed_pigeon"] &
          band_tailed_pigeon == "2") |>
      count(dl_state)

    bad_cr_2s <-
      raw_df |>
      filter(
        dl_state %in%
          pmt_files$dl_state[pmt_files$spp == "cranes"] &
          cranes == "2") |>
      count(dl_state)

    if(nrow(bad_bt_2s) > 0 | nrow(bad_cr_2s) > 0) {

      corrected_pmt_strata <-
        raw_df |>
        mutate(
          band_tailed_pigeon =
            ifelse(
              dl_state %in%
                pmt_files$dl_state[pmt_files$spp == "band_tailed_pigeon"] &
                band_tailed_pigeon == "2",
              "0",
              band_tailed_pigeon
            ),
          cranes =
            ifelse(
              dl_state %in%
                pmt_files$dl_state[pmt_files$spp == "cranes"] &
                cranes == "2",
              "0",
              cranes
            )
        )

      message("2s converted to 0s for permit file states:")
      print(
        bind_rows(
          bad_bt_2s |> mutate(spp = "band_tailed_pigeon"),
          bad_cr_2s |> mutate(spp = "cranes")
        )
      )

      return(corrected_pmt_strata)

    } else {
      message(
        paste0(
          "No 2s received for band_tailed_pigeon or crane from permit file",
          " states."))
      return(raw_df)
    }
  }
USFWS/migbirdHarvestData documentation built on April 3, 2025, 4:09 p.m.