R/hfr_dqa_utilities.R

Defines functions revert_validations confirm_validations update_operatingunits validate_hfr_data validate_orgunit validate_date report_submissions_errors validate_submissions validate_submission parse_submission validate_mechanism extract_mechcode extract_iso3code guess_operatingunit get_operatingunit

Documented in confirm_validations extract_iso3code extract_mechcode get_operatingunit guess_operatingunit parse_submission report_submissions_errors revert_validations update_operatingunits validate_date validate_hfr_data validate_mechanism validate_orgunit validate_submission validate_submissions

#' Get operating unit name
#'
#' @param iso_code iso3 code
#' @param orglevels df org levels
#' @return operating unit
#' @export
#' @examples
#' \dontrun{
#'   get_operatingunit(org_levels, 'XWA')
#' }
#'
get_operatingunit <- function(iso_code, orglevels)  {

  iso = iso_code %>% toupper()

  if (iso == 'MULTIPLE') {
    return(iso)
  }

  orglevels <- orglevels %>%
    dplyr::filter(iso3 == iso | iso4 == iso)

  if (nrow(orglevels) > 0) {
    return( orglevels$name3[1] )
  }
  else {
    return(NA)
  }
}


#' Get operating unit name
#'
#' @param pfile processed file
#' @param levels org levels
#' @param ims mechanisms df
#' @return operating unit name
#' @export
#' @examples
#' \dontrun{
#'   get_mech_ou(ims, 'HFR_2020.99_XAR_100000_processed_20200528.csv')
#' }
#'
guess_operatingunit <- function(pfile, levels, ims) {

  fcomponents <- validate_submission(pfile, levels, ims)

  #c(
  #   "fy",
  #   "hfr_pd",
  #   "iso3",
  #   "operatingunit",
  #   "mech_code",
  #   "mech_valid",
  #   "mech_name",
  #   "pdate",
  #   "name"
  #)

  ou <- ifelse(fcomponents[6] == 1, fcomponents[4], NA)

  return(ou)
}

#' Extract OU ISO3 code
#'
#' @param pfile processed file
#' @return ISO3 3 character iso code
#' @export
#' @examples
#' \dontrun{
#'   extract_iso3code('HFR_2020.99_XAR_100000_processed_20200528.csv')
#' }
#'
extract_iso3code <- function(pfile) {

  fcomponents <- parse_submission(pfile)

  fsections <- c("hfr_pd", "iso3", "mech_code", "pdate")

  iso <- ifelse(length(fcomponents) == length(fsections), toupper(fcomponents[2]), NA)

  if ( !is.na(iso) & nchar(iso) != 3 ) {
    return(NA)
  }

  return(iso)
}


#' Extract mechanism code
#'
#' @param pfile processed file
#' @return mech_code mechanism code
#' @export
#' @examples
#' \dontrun{
#'   extract_mechcode('HFR_2020.99_XAR_100000_processed_20200528.csv')
#' }
#'
extract_mechcode <- function(pfile) {

  fcomponents <- parse_submission(pfile)

  fsections <- c("hfr_pd","iso3","mech_code","pdate")

  mech_code <- ifelse(
    length(fcomponents) == length(fsections),
    as.integer(fcomponents[5]),
    NA
  )

  return(mech_code)
}


#' Validate mechanism code
#'
#' @param mechanisms df of mechs
#' @param mech_code mech code
#' @return vector c(valid_im,  mech_name)
#' @export
#' @examples
#' \dontrun{
#'   validate_mechanism(ims, 'Angola', 16172)
#' }
#'
validate_mechanism <- function(mechanisms, ou, mcode) {

  uniq_ims <- mechanisms %>%
    dplyr::distinct(mech_code) %>%
    dplyr::pull()

  valid_im <- ifelse(mcode %in% uniq_ims, 1, 0)

  mechanisms <- mechanisms %>%
    dplyr::filter(operatingunit == ou, mech_code == mcode)

  if ( valid_im == 1 & nrow(mechanisms) > 0 ) {
    return( c(valid_im, mechanisms$mech_name[mechanisms$mech_code == mcode]) )
  }
  else{
    return(c(valid_im, NA))
  }
}


#' Parse out submitted file components
#'
#' @param sfile processed file
#' @return component As vector c("hfr_pd", "iso3", "mech_code", "pdate")
#' @export
#' @examples
#' \dontrun{
#'   parse_submission("HFR_2020.99_XWH_100000_processed_20200101.csv")
#' }
#'
parse_submission <- function(pfile) {

  parts <- pfile %>% stringr::str_split("_") %>% unlist()

  parts <- parts[2:length(parts)]
  parts <- parts[-(length(parts)-1)]

  parts[length(parts)] <- stringr::str_remove(parts[length(parts)], ".csv")

  return(parts)
}



#' Validate submitted files
#'
#' @param sfile processed file
#' @param levels org levels
#' @param ims mechanisms df
#' @return vector c("fy", "hfr_pd", "iso3", "operatingunit", "mech_code", "mech_valid", "mech_name", "pdate", "name")
#' @export
#' @examples
#' \dontrun{
#'   validate_submissions("HFR_2020.99_XWH_100000_processed_20200101.csv")
#' }
#'
validate_submission <- function(pfile, levels, ims) {

  # c("hfr_pd", "iso3", "mech_code", "pdate")
  parts <- parse_submission(pfile)

  pd <- parts[1] %>% stringr::str_split("\\D") %>% unlist()

  fy = pd[1]
  pd = pd[2]

  iso <- parts[2]

  ou <- get_operatingunit(iso_code=iso, orglevels=levels)

  mech_code <- parts[3]

  im <- validate_mechanism(mechanisms=ims, ou=ou, mcode=mech_code)

  mech_valid <- im[1]
  mech_name <- im[2]

  pdate <- parts[length(parts)] %>% stringr::str_split(".csv") %>% unlist()
  pdate <- pdate[1]

  parts <- c(fy, pd, iso, ou, mech_code, mech_valid, mech_name, pdate[1], pfile)

  return(parts)
}


#' Validate files
#'
#' @param folder pfolder
#' @param levels org levels
#' @param ims mechanisms df
#' @param pattern filename pattern
#' @return dataframe
#' @export
#' @examples
#' \dontrun{
#'   validate_submissions(dir_hfr_pd205, pattern = "HFR_2020.05")
#' }
#'
validate_submissions <- function(pfolder, levels, ims, pattern=NULL) {

  fcomponents <- c("fy", "hfr_pd", "iso3", "operatingunit", "mech_code", "mech_valid", "mech_name", "pdate", "name")

  if ( is.null(pattern) ) {
    files <- list.files(path = pfolder, full.names = F)
  } else {
    files <- list.files(path = pfolder, pattern = pattern, full.names = F)
  }

  processed <- files %>%
    purrr::map(validate_submission, levels, ims) %>%
    unlist() %>%
    matrix(ncol=9, byrow=T) %>%
    as.data.frame(stringsAsFactors=F) %>%
    purrr::set_names(fcomponents)

  return(processed)
}


#' Report files validation
#'
#' @param df_files df of filename validation
#' @param mechanisms mechs
#' @return void
#' @export
#' @examples
#' \dontrun{
#'   report_file_errors(files, ims, processed)
#' }
#'
report_submissions_errors <- function(df_files, mechanisms, export = FALSE) {

  cat(crayon::blue("Reporting errors from file names ...\n"))

  # All errors
  file_errors <- df_files %>%
    dplyr::filter(is.na(operatingunit) | mech_valid == 0 | is.na(mech_name))

  #file_errors %>% glimpse()

  # ISO Errors
  iso_errors <- file_errors %>%
    dplyr::filter(is.na(operatingunit))

  iso_errors_n <- nrow(iso_errors)

  cat(crayon::red(paste0("Invalid ISO Codes: ", iso_errors_n, "\n")))

  if ( iso_errors_n > 0 ) {
    cat(crayon::blue(unique(iso_errors %>% dplyr::pull(iso3)), "\n"))
  }

  # IM errors
  im_errors <- df_files %>%
    dplyr::filter(mech_valid == 0)

  im_errors_n <- nrow(im_errors)

  cat(crayon::red("Invalid IMs: ", im_errors_n, "\n"))

  if ( im_errors_n > 0 ) {

    im_msgs <- im_errors %>%
      dplyr::select(iso3, mech_code) %>%
      mutate(error = paste0(as.character(mech_code), " [", iso3, "]")) %>%
      dplyr::pull(error)

    cat(crayon::blue(unique(im_msgs), "\n"))
  }

  # Unmatched IMs
  im_unmatched <- df_files %>%
    dplyr::filter(is.na(mech_name))

  im_unmatched_n <- nrow(im_unmatched)

  cat(crayon::red("Unmatched IMs: ", im_unmatched_n, "\n"))

  if ( im_unmatched_n > 0 ) {

    im_u_msgs <- im_unmatched %>%
      dplyr::select(iso3, mech_code) %>%
      mutate(error = paste0(as.character(mech_code), " [", iso3, "]")) %>%
      dplyr::pull(error)

    cat(crayon::blue(unique(im_u_msgs), "\n"))
  }

  # Duplicate files

  file_dup <- df_files %>%
    dplyr::group_by(fy, hfr_pd, iso3, mech_code) %>%
    dplyr::summarise(n=n()) %>%
    dplyr::ungroup() %>%
    dplyr::filter(n > 1)

  file_dup_n <- nrow(file_dup)

  cat(crayon::red("Duplucate file: ", file_dup_n, "\n"))

  if ( file_dup_n > 0) {
    file_dup %>%
      dplyr::arrange(iso3, mech_code, desc(n)) %>%
      print()
  }

  # Export
  if (export == TRUE) {
    file_errors %>%
      readr::write_csv(path = here::here(paste0("HFR_", file_errors$fy[1], "_", file_errors$hfr_pd[1], "_filename_errors.csv")))
  }
}


#' Validate HFR PD Date
#'
#' @param df_pds hfr period dates
#' @param pdate period date
#' @param pd reporting period
#' @return valid: True / False
#' @export
#' @examples
#' \dontrun{
#'   validate_date(df_pds = valid_dates, '2020-01-27', 5)
#' }
#'
validate_date <- function(df_pds, pdate, pd) {

  valid <- df_pds %>%
    dplyr::filter(date == pdate, hfr_pd == pd) %>%
    dplyr::count() %>%
    dplyr::pull()

  valid <- ifelse(length(valid) > 0, 1, 0)

  return(valid)
}


#' Validate org unit uid
#'
#' @param df_orgs org hierarchy
#' @param ou operating unit
#' @param uid orgunituid
#' @return valid as a vector c(valid_uid, valid_uid_ou)
#' @export
#' @examples
#' \dontrun{
#'   validate_orgunit(df_orgs, 'Eswatini', 'g48XD8px8NN')
#' }

validate_orgunit <- function(df_orgs, ou, uid) {

  # Is orgunituid valid?
  valid_uid <- ifelse(
    uid %in% (df_orgs %>% dplyr::distinct(orgunituid) %>% dplyr::pull()),
    1,
    0
  )

  # Does uid belong to OU
  valid_uid_ou <- df_orgs %>%
    dplyr::distinct(operatingunit, orgunituid) %>%
    dplyr::filter(
      operatingunit == ou,
      orgunituid == uid
    ) %>%
    dplyr::count() %>%
    dplyr::pull()

  valid_uid_ou <- ifelse(length(valid_uid_ou) > 0, 1, 0)

  # Return results
  valid <- c(valid_uid, valid_uid_ou)

  return(valid)
}


#' Validate processed hfr data
#'
#' @param hfr_data processed hfr data
#' @param orgs datim org hierarchy
#' @param ims datim mechanisms
#' @param dates hfr valid dates
#' @param keep_values Keep values along the error flags
#' @return errors data frame
#' @export
#' @examples
#' \dontrun{
#'   validate_hfr_data(df_hfr_data, orgs=org_hierarchy, ims=df_mechanisms, dates=df_hfr_dates)
#'   validate_hfr_data(df_hfr_data, orgs=org_hierarchy, ims=df_mechanisms, dates=df_hfr_dates, keep_values = TRUE)
#' }

validate_hfr_data <- function(hfr_data, orgs, ims, dates, keep_values = FALSE){

  # Valid age and sex values
  valid_age <- c("<15", "15+", NA)
  valid_sex <- c("Female", "Male", NA)

  # Flag all errors
  errors <- hfr_data %>%
    group_by(source) %>%
    mutate(row_id = row_number()) %>%
    ungroup() %>%
    is_date_valid(df_dates = valid_dates) %>%
    is_ou_valid(df_orgs = orgs) %>%
    is_ou_valid(df_orgs = orgs) %>%
    is_orgunituid_valid(df_orgs = orgs) %>%
    is_orgunituid4ou(df_orgs = orgs) %>%
    is_mech_valid(df_mechs = ims) %>%
    is_mech4ou(df_mechs = ims) %>%
    mutate(
      valid_age = ifelse(is.na(agecoarse) | agecoarse %in% valid_age, TRUE, FALSE),
      valid_sex = ifelse(is.na(sex) | sex %in% valid_sex, TRUE, FALSE),
      valid_value = ifelse(is.na(val) | val >= 0, TRUE, FALSE)
    )

  # Exclude data
  if (keep_values != TRUE) {
    errors <- errors %>%
      select(source:valid_value) %>%
      distinct()
  }

  # Filter rows with errors
  errors <- errors %>%
    mutate(errors = rowSums(select(., valid_date:valid_value) == FALSE)) %>%
    filter(errors > 0)

  return(errors)
}


#' Update invalid operating units
#'
#' @param hfr_data processed hfr data
#' @param levels datim org levels
#' @param orgs datim org hierarchy
#' @param ims datim mechanisms
#' @return hfr_data df
#' @export
#' @examples
#' \dontrun{
#'   update_operatingunits(hfr_df, levels=org_levels, orgs=org_hierarchy, ims=mechanisms)
#' }

update_operatingunits <- function(hfr_data, levels, orgs, ims=NULL){

  if( is.null(ims) ){

    hfr_data <- hfr_data %>%
      is_ou_valid(df_orgs = orgs) %>%
      dplyr::rowwise() %>%
      dplyr::mutate(operatingunit = ifelse(valid_ou == FALSE, get_operatingunit(iso_code=extract_iso3code(source), orglevels=levels), operatingunit)) %>%
      dplyr::ungroup() %>%
      dplyr::select(-valid_ou)

  }else{

    hfr_data <- hfr_data %>%
      is_ou_valid(df_orgs = orgs) %>%
      dplyr::rowwise() %>%
      dplyr::mutate(operatingunit = ifelse(valid_ou == FALSE, guess_operatingunit(pfile=source, levels=levels, ims=ims), operatingunit)) %>%
      dplyr::ungroup() %>%
      dplyr::select(-valid_ou)
  }

  return(hfr_data)
}

#' Confirm validations of processed files
#'
#' @param hfr_data content of processed files
#' @param hfr_errors errors detected from files content
#' @param dir_files location of processed files
#' @export
#' @examples
#' \dontrun{
#' confirm_validations(hfr_data, hfr_errors, dir_files)
#' }
#'
confirm_validations <- function(hfr_data, hfr_errors, dir_files){

  hfr_data %>%
    dplyr::left_join(errors, by = "source") %>%
    dplyr::filter(is.na(row_id)) %>%
    dplyr::distinct(source, row_id) %>%
    dplyr::pull(source) %>%
    purrr::map(function(source) {

      valid_file <- stringr::str_replace(source, "processed", "validated")

      file.rename(from = here::here(dir_files, source),
                  to = here::here(dir_files, valid_file))
    })
}


#' Revert validated file to processed files
#'
#' @param dir_files location of processed files
#' @export
#' @examples
#' \dontrun{
#' revert_validations(dir_files)
#' }
#'
revert_validations <- function(dir_files) {

  file_pattern = "HFR_*.*.validated.*.csv$"

  files <- list.files(path=dir_files, pattern = file_pattern, full.names = FALSE)

  if (length(files) == 0) {
    stop(paste0(dir_files, " contains no file that matches this pattern: ", file_pattern))
  }

  files %>%
    purrr::map(function(file) {

      revert_file <- file %>% stringr::str_replace("validated", "processed")

      file.rename(from = here::here(dir_files, file),
                  to = here::here(dir_files, revert_file))
    })
}
USAID-OHA-SI/Wavelength documentation built on March 24, 2023, 10:07 a.m.