gsedread: Tools for reading the GSED validation data

Documented in read_lf

#' Read the LF validation data
#'
#' Entry point for reading the LF files. Depending on `adm`, the work is
#' handed to the reader for the fixed or for the adaptive administration.
#'
#' @param adm Character. Administration type, either `"fixed"` (the default)
#' or `"adaptive"`.
#' @param onedrive Character. Location of the local OneDrive sync. Defaults to
#' the value of the environmental variable `ONEDRIVE_GSED`.
#' @param path Character. Path name within the OneDrive.
#' @param verbose Logical. Print diagnostic info? Defaults to `FALSE`.
#' @param progress Logical. Show progress in an interactive session? Defaults
#' to `FALSE`.
#' @param warnings Logical. Show warnings generated by `readr::read_csv()`?
#' Defaults to `FALSE`.
#' @return A tibble with the original data plus two added columns: `file`
#' (identifying the original file) and `adm` (`"fixed"` or `"adaptive"`).
#' If `adm == "fixed"`, a tibble with 159 columns and one test administration
#' per row. If `adm == "adaptive"`, a tibble with 14 columns and one item
#' administration per row.
#' @details An error is raised when `onedrive` is an empty string, which is
#' what happens when `ONEDRIVE_GSED` has not been set.
#' @examples
#' \dontrun{
#' # assuming environmental variable ONEDRIVE_GSED has been set
#' d <- read_lf()
#' dim(d)
#' }
#' @export
read_lf <- function(adm = c("fixed", "adaptive"),
                    onedrive = Sys.getenv("ONEDRIVE_GSED"),
                    path = "GSED Final Collated Phase 1 Data Files 18_05_22",
                    verbose = FALSE,
                    progress = FALSE,
                    warnings = FALSE) {
  # Guard clause: without a OneDrive root there is nothing to read.
  if (nchar(onedrive) == 0L) {
    stop("Environmental variable ONEDRIVE_GSED not set.", call. = FALSE)
  }

  # Resolve the administration type, then dispatch to the matching reader.
  adm <- match.arg(adm)
  switch(adm,
    fixed = read_lf_fixed(onedrive, path, verbose, progress, warnings),
    adaptive = read_lf_adaptive(onedrive, path, verbose, progress, warnings)
  )
}

# Read the fixed-administration LF files and stack them into one table.
#
# `onedrive` and `path` locate the data directory; `verbose`, `progress` and
# `warnings` are handed on unchanged to read_files().
#
# Returns the row-bound data of all files with a `file` identifier column and
# `adm = "fixed"`, after dropping records that have no GSED_ID.
read_lf_fixed <- function(onedrive, path, verbose, progress, warnings) {
  # Hard-coded fixed LF files: name = path relative to `path`,
  # value = date format passed to read_files() for that file.
  date_format_by_file <- c(
    "tan/tza-lf-2021-11-01.csv" = "%Y-%m-%d",
    "tan/tza_lf_predictive_10_05_2022.csv" = "%d-%m-%Y",
    "tan/tza_lf_new_enrollment_10_05_2022.csv" = "%d-%m-%Y",
    "ban/ban-lf-2021-11-03.csv" = "%d/%m/%Y",
    "ban/ban_lf_predictive_17_05_2022.csv" = "%d/%m/%Y",
    "ban/ban_lf_new_enrollment_17_05_2022.csv" = "%d/%m/%Y",
    "pak/pak_lf_2022_05_17.csv" = "%m/%d/%Y",
    "pak/pak_lf_predictive_2022_05_17.csv" = "%m/%d/%Y",
    "pak/pak_lf_new_enrollment_2022_05_17.csv" = "%m/%d/%Y"
  )
  files <- file.path(onedrive, path, names(date_format_by_file))
  date_formats <- unname(date_format_by_file)

  # Read every file; no datetime formats are supplied for the fixed files.
  per_file <- read_files("lf", "fixed", files, seq_along(files),
                         date_formats, NULL,
                         verbose, progress, warnings)

  # Stack the per-file tables, drop orphan records without a GSED_ID and
  # tag the administration type.
  stacked <- bind_rows(per_file, .id = "file")
  with_id <- filter(stacked, !is.na(.data$GSED_ID))
  mutate(with_id, adm = "fixed")
}

# Read the adaptive-administration LF files and stack them into one table.
#
# `onedrive` and `path` locate the data directory; `verbose`, `progress` and
# `warnings` are handed on unchanged to read_files().
#
# Returns the row-bound data of all files with a `file` identifier column and
# `adm = "adaptive"`, after dropping records that have no GSED_ID.
read_lf_adaptive <- function(onedrive, path, verbose, progress, warnings) {

  # hardcode adaptive lf files names
  files_adaptive <- c(
    "tan/tza_lf_adaptive_10_05_2022.csv",
    "tan/tza_lf_new_adaptive_10_05_2022.csv",
    "ban/ban_lf_adaptive_17_05_2022.csv",
    "ban/ban_lf_new_adaptive_17_05_2022.csv",
    "pak/pak_lf_adaptive_2022_05_17.csv",
    "pak/pak_lf_new_adaptive_2022_05_17.csv"
  )
  files <- file.path(onedrive, path, files_adaptive)

  # per-file parse formats, in the same order as `files_adaptive`
  date_formats <- c(
    "%d-%m-%Y", "%d-%m-%Y",
    "%d-%m-%Y", "%d/%m/%Y",
    "%m/%d/%Y", "%m/%d/%Y"
  )
  datetime_formats <- c(
    "%d-%m-%Y %H:%M:%S", "%d-%m-%Y %H:%M:%S",
    "%d-%m-%Y %H:%M:%S", "%d/%m/%Y %H:%M",
    "%d-%m-%Y %H:%M:%S", "%d-%m-%Y %H:%M:%S"
  )
  data <- read_files("lf", "adaptive", files, seq_along(files),
                     date_formats, datetime_formats,
                     verbose, progress, warnings)

  # repair mixed time stamp format in pak_lf_adaptive_2022_05_17 and
  # pak_lf_new_adaptive_2022_05_17 (about 15% of the values): read both files
  # a second time with the alternative format and use those values wherever
  # the first read left the time stamp NA.
  idx <- 5:6
  alt_datetime_formats <- rep("%m/%d/%Y %H:%M", length(files))
  tmp <- read_files("lf", "adaptive", files, idx,
                    date_formats, alt_datetime_formats,
                    verbose, progress, warnings)
  # NOTE(review): as in the original code, this assumes read_files() returns
  # the result for file `i` at list position `i` even when only `idx` is
  # read - confirm against read_files().
  for (i in idx) {
    z <- data[[i]]$Ma_LF_timestamp
    alt <- tmp[[i]]$Ma_LF_timestamp
    # Fill position by position. Assigning the compacted non-NA values of
    # `alt` into the NA slots of `z` is only correct when both sets of
    # positions coincide exactly; a value that is NA (or non-NA) under both
    # formats would otherwise shift every later time stamp to a wrong row.
    fill <- is.na(z) & !is.na(alt)
    z[fill] <- alt[fill]
    data[[i]]$Ma_LF_timestamp <- z
  }

  # bind
  # remove orphan records without a GSED_ID
  data %>%
    bind_rows(.id = "file") %>%
    filter(!is.na(.data$GSED_ID)) %>%
    mutate(adm = "adaptive")
}