R/read_demographics_csv.R

Defines functions read_demographics_csv

Documented in read_demographics_csv

#' Read CSV file containing COVID19 Sample Collection and
#' deidentified Demographics Data from REDDI
#'
#' The file is read once, checked for the required columns, renamed to
#' snake_case column names, stripped of the optional `Time Zone` column,
#' and its collection/result dates are parsed. Finally the birth year is
#' cleaned with `tidy_up_birth_year()`, using the latest non-missing
#' collection year as the upper bound.
#'
#' @param filepath Path to the CSV file containing Sample Collection and
#' deidentified Demographics Data
#' @param date_fmt Formats passed as `orders` to
#' `lubridate::parse_date_time()` when parsing the `Collection Date` and
#' `Result Date` columns (Default: "%m/%d/%y", i.e. month/day/year;
#' lubridate documents that `y` also matches four-digit years in
#' `parse_date_time()`)
#' @param time_zone Time zone for collection time (Default: "America/New_York")
#'
#' @return A tibble with the Sample Collection and
#' the deidentified Demographics Data. The original `Collection Date`
#' column is kept (parsed as a date-time) next to the new `collection_date`
#' column; `result_date` is a date. An error listing the absent columns is
#' raised when a required column is missing from the file.
#'
#' @importFrom magrittr "%>%"
read_demographics_csv <- function(filepath,
                                  date_fmt = c("%m/%d/%y"),
                                  time_zone = "America/New_York") {
  # Strings that the export uses to denote a missing value.
  na_strings <- c(
    "", "NA", "N/A", "<NA>", "null",
    "Null", "Missing", "Error 404"
  )
  required_cols <- c(
    "Testing Group Name",
    "Patient City",
    "Patient Zip Code",
    "Patient State",
    "Year of Birth",
    "Patient Gender",
    "Pregnant",
    "Patient Ethnic Group",
    "Patient Race",
    "Patient ID",
    "TestKit ID",
    "Result description",
    "Result Date",
    "Collection Date",
    "SKU",
    "Order Priority"
  )

  # Read the file a single time; the header check below works on the same
  # tibble instead of on a separate one-row probe read.
  raw_tbl <- readr::read_csv(filepath,
    na = na_strings,
    show_col_types = FALSE
  )

  missing_cols <- setdiff(required_cols, colnames(raw_tbl))
  if (length(missing_cols) > 0) {
    stop(
      "Required column(s) missing from the demographics CSV: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  output_tbl <- raw_tbl %>%
    dplyr::rename(
      "test_group" = "Testing Group Name",
      "city" = "Patient City",
      "zip_code" = "Patient Zip Code",
      "state" = "Patient State",
      "birth_year" = "Year of Birth",
      "gender" = "Patient Gender",
      "pregnancy_status" = "Pregnant",
      "ethnicity" = "Patient Ethnic Group",
      "race" = "Patient Race",
      "patient_id" = "Patient ID",
      "testkit_id" = "TestKit ID",
      "rymedi_result" = "Result description",
      "result_date" = "Result Date",
      # NOTE(review): "teskit_sku" (sic) is kept unchanged because it is part
      # of the returned column names that callers may already depend on.
      "teskit_sku" = "SKU",
      "order_priority" = "Order Priority"
    ) %>%
    # "Time Zone" is optional in the export; drop it only when present.
    dplyr::select(-dplyr::any_of("Time Zone")) %>%
    dplyr::mutate(
      `Collection Date` = lubridate::parse_date_time(`Collection Date`,
        orders = date_fmt,
        tz = time_zone
      ),
      collection_date = lubridate::as_datetime(`Collection Date`,
        tz = time_zone
      ),
      result_date = lubridate::as_date(
        lubridate::parse_date_time(result_date,
          orders = date_fmt,
          tz = time_zone
        ),
        tz = time_zone
      )
    )

  # Latest collection year, ignoring rows whose collection date is missing
  # or could not be parsed. `na.rm` is only switched on when at least one
  # year is available, so an empty or all-missing column behaves exactly as
  # before instead of a single NA poisoning the bound for every row.
  collection_years <- lubridate::year(output_tbl$collection_date)
  max_year <- max(collection_years, na.rm = !all(is.na(collection_years)))

  # `!!` injects the computed value so a CSV column that happens to be named
  # `max_year` cannot shadow it inside the data mask.
  output_tbl %>%
    dplyr::mutate(birth_year = tidy_up_birth_year(birth_year,
      max_year = !!max_year
    ))
}
CUGBF/deidentifiedDB documentation built on Sept. 13, 2023, 6:28 a.m.