R/read_hip.R

Defines functions sumLines read_hip

Documented in read_hip sumLines

#' Read in data
#'
#' Compile data from state-exported text files by providing a path to the download directory.
#'
#' @importFrom tibble as_tibble_col
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom dplyr pull
#' @importFrom dplyr group_by
#' @importFrom dplyr n
#' @importFrom dplyr ungroup
#' @importFrom dplyr select
#' @importFrom dplyr distinct
#' @importFrom dplyr rename
#' @importFrom dplyr cur_group_id
#' @importFrom dplyr row_number
#' @importFrom purrr map
#' @importFrom purrr list_rbind
#' @importFrom readr read_fwf
#' @importFrom readr fwf_widths
#' @importFrom stringr str_detect
#' @importFrom stringr str_replace
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove
#'
#' @param path File path to the folder containing HIP .txt files
#' @param unique Return a distinct frame? Defaults to TRUE
#' @param state When specified, reads in download data from a specified state. Must match one of the following two-letter abbreviations:
#' \itemize{
#' \item AL, AK, AZ, AR, CA, CO, CT, DE, DC, FL, GA, ID, IL, IN, IA, KS, KY, LA, ME, MD, MA, MI, MN, MS, MO, MT, NE, NV, NH, NJ, NM, NY, NC, ND, OH, OK, OR, PA, RI, SC, SD, TN, TX, UT, VT, VA, WA, WV, WI, WY}
#' @param season If set as TRUE, selects only folders starting with "DL" in a a season's upper-level directory
#' @param sumlines If set as TRUE, runs \code{\link{sumLines}} and returns a list of any files that were not fully read in
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
#'
#' @export

read_hip <-
  function(path, unique = TRUE, state = NA, season = FALSE, sumlines = FALSE) {

    # Add a final "/" if not included already
    if(!str_detect(path, "\\/$")) {
      path <- paste0(path, "/")
    }
    # Error for possibly wrong path
    if(str_detect(path, "DL") & season == TRUE) {
      message("Are you sure you supplied a season path?")
    }
    # Error for bad state
    if(!state %in% c(NA, datasets::state.abb[datasets::state.abb != "HI"])) {
      message(
        paste0(
          "Error: Please supply a valid 2-letter state abbreviation for ",
          "`state` parameter."))
    }
    # Error for bad season
    if(!season %in% c(TRUE, FALSE, T, F)) {
      message("Error: Please supply TRUE or FALSE for `season` parameter.")
    }
    # Error for bad sumlines
    if(!sumlines %in% c(TRUE, FALSE, T, F)) {
      message("Error: Please supply TRUE or FALSE for `sumlines` parameter.")
    }

    # Create a tibble of the HIP .txt files to be read from the provided
    # directory
    files <-
      list.files(
        path, recursive = {{season}}, pattern = "*\\.txt$", ignore.case = TRUE,
        full.names = TRUE) |>
      as_tibble_col(column_name = "filepath") |>
      # Don't process permit files
      filter(!str_detect(filepath, "permit")) |>
      # Identify blank files
      mutate(
        filepath = str_replace(filepath, "TXT", "txt"),
        check =
          ifelse(
            file.size(filepath) == 0,
            "blank",
            ""))

    # Filter files to include only specified state
    if(!is.na(state)) {
      files <- filter(files, str_detect(filepath, state))
    }
    # Error for blank files
    if("blank" %in% files$check) {
      message("Error: One or more files are blank in the directory.")
      print(filter(files, check == "blank"))
    }

    # Filter out blank files from the paths list
    files <- filter(files, check != "blank") |> pull(filepath)

    # Message if there are no files to read in
    if(length(files) == 0) {
      message(
        paste0(
          "Error: No file(s) to read in. Did you specify a state that did not ",
          "submit data?"))
    } else {

      # Read data from filepaths
      pulled_data <-
        map(
          1:length(files),
          function(i) {
            # Compile each state's file into one table
            read_fwf(
              files[i],
              fwf_widths(c(1, 15, 1, 20, 3, 60, 20, 2, 10, 10, 10,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, NA)),
              col_types = "cccccccccccccccccccccccc",
              na = c("N/A", "")) |>
              mutate(
                # Add the download state as a column
                dl_state =
                  str_extract(
                    files[i], "[A-Z]{2}(?=[0-9]{8}\\.txt)"),
                # Add the download date as a column
                dl_date =
                  str_extract(
                    files[i], "(?<=[A-Z]{2})[0-9]{8}(?=\\.txt)"),
                # Add the source file as a column
                source_file =
                  str_remove(files[i], path),
                # Add the download cycle as a column
                dl_cycle =
                  str_extract(files[i], "(?<=DL).+(?=\\/)"))
          }) |>
        list_rbind() |>
        rename(
          title = 1,
          firstname = 2,
          middle = 3,
          lastname = 4,
          suffix = 5,
          address = 6,
          city = 7,
          state = 8,
          zip = 9,
          birth_date = 10,
          issue_date = 11,
          hunt_mig_birds = 12,
          ducks_bag = 13,
          geese_bag = 14,
          dove_bag = 15,
          woodcock_bag = 16,
          coots_snipe = 17,
          rails_gallinules = 18,
          cranes = 19,
          band_tailed_pigeon = 20,
          brant = 21,
          seaducks = 22,
          registration_yr = 23,
          email = 24) |>
        # Add a download key
        group_by(dl_date, dl_state) |>
        mutate(dl_key = paste0("dl_", cur_group_id())) |>
        ungroup() |>
        # Add a record key
        mutate(record_key = paste0("record_", row_number()))

      # Remove exact duplicates
      if(unique == TRUE){
        pulled_data <- distinct(pulled_data)
      }

      # Check for missing lines. Were all lines of data read in?
      if(sumlines == TRUE) {

        # Suppress incomplete final line warnings
        suppressWarnings(
          missing_lines <-
            pulled_data |>
            count(source_file) |>
            left_join(sumLines(path), by = "source_file") |>
            filter(n != num_lines)
        )

        if(nrow(missing_lines) > 0) {
          message("Error: One or more files was not fully read.")
          print(missing_lines)
        }
      }

      # Return a message for records with blank or NA values in firstname,
      # lastname, state, or birth date
      raw_nas <-
        pulled_data |>
        group_by(dl_state) |>
        mutate(n_total = n()) |>
        ungroup() |>
        filter(
          is.na(firstname)|is.na(lastname)|is.na(state)|is.na(birth_date)) |>
        group_by(dl_state) |>
        reframe(n = n(), prop = round(n/n_total, 2)) |>
        distinct() |>
        filter(n >= 100 | prop >= 0.1)

      if(nrow(raw_nas) > 0) {
        message(
          paste0("Error: NA values detected in one or more ID fields ",
                 "(firstname, lastname, state, birth date) for >10% of a file ",
                 "and/or >100 records."))

        print(raw_nas)
      }

      # Return a message if any records contain a bag value that is not a
      # 1-digit number
      nondigit_bags <-
        pulled_data |>
        filter(if_any(all_of(ref_bagfields), ~!str_detect(.x, "^[0-9]{1}$")))

      if(nrow(nondigit_bags) > 0) {
        message(
          paste0(
            "Error: One or more records detected with a value other than a ",
            "single digit; these records will be filtered out in clean()."))
        print(
          nondigit_bags |>
            unite(bags, matches(ref_bagfields), sep = " ") |>
            select(source_file, record_key, bags)
          )
      }

      # Return a message if there is an NA in dl_state
      if(TRUE %in% is.na(pulled_data$dl_state)) {
        message("Error: One or more more NA values detected in dl_state.")

        print(
          pulled_data |>
            distinct(dl_state, source_file) |>
            filter(is.na(dl_state))
        )
      }

      # Return a message if there is an NA in dl_date
      if(TRUE %in% is.na(pulled_data$dl_date)) {
        message("Error: One or more more NA values detected in dl_date.")

        print(
          pulled_data |>
            select(dl_date, source_file) |>
            filter(is.na(dl_date)) |>
            distinct())
      }

      # Return a message if all emails are missing from a file
      if(nrow(
        pulled_data |>
        group_by(source_file) |>
        summarize(n_emails = length(unique(email))) |>
        ungroup() |>
        filter(n_emails == 1)) > 0) {
        message("Error: One or more files are missing 100% of emails.")

        print(
          pulled_data |>
            group_by(source_file) |>
            summarize(n_emails = length(unique(email))) |>
            ungroup() |>
            filter(n_emails == 1) |>
            select(source_file))
      }

      # Check if all dl_states are acceptable
      # States in the data
      dl_states_in_data <- distinct(pulled_data, dl_state)

      # Return a message if there is a dl_state not found in the list of 49
      # continental US states
      if(FALSE %in%
         (dl_states_in_data |> pull(dl_state) %in%
          datasets::state.abb[datasets::state.abb != "HI"])) {
        message(
          paste0("Error: One or more dl_state values do not belong in the ",
                 "list of expected 49 continental US states."))

        print(
          dl_states_in_data |>
            filter(
              !dl_state %in% datasets::state.abb[datasets::state.abb != "HI"]
            ) |>
            pull()
        )
      }

      return(pulled_data)
    }
  }

#' Sum lines of new data
#'
#' The internal \code{sumLines} function returns a data frame containing the sums of the number of lines in new download files. It is used inside of \code{\link{read_hip}} to return a message for any file that did not have all its lines read in.
#'
#' @importFrom stringr str_detect
#'
#' @param path File path to the download folder containing HIP .txt files
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}

sumLines <-
  function(path) {

    # Add a final "/" if not included already
    if(!str_detect(path, "\\/$")){
      path <- paste0(path, "/")
    }

    # Create a vector of the HIP .txt files to be read from the provided
    # directory
    # For reading data from a download cycle for ALL states available
    dl_files <- list.files(path, pattern = "\\.txt|.TXT$")

    sum_lines <- c()

    for (i in seq_along(dl_files)){
      con <- file(paste0(path, dl_files[i]))
      sum_lines[i] <- length(readLines(con))
      close(con)
    }

    return(data.frame(source_file = dl_files, num_lines = sum_lines))
  }
USFWS/migbirdHarvestData documentation built on Feb. 12, 2024, 4:38 p.m.