R/validate_output.R

Defines functions check_content check_disaggs check_inds check_mechs check_orgunituids check_dates check_output_cols validate_output

Documented in check_content check_dates check_disaggs check_inds check_mechs check_orgunituids check_output_cols validate_output

#' Validation Checks
#'
#' Runs a number of validations after the tidying has occured. Additional,
#' optional validations against DATIM data can be run if data are available,
#' from `pull_hierarchy`,`pull_mech`, `pull_mer`
#'
#' @param df HFR data framed created by `hfr_process_template()`
#' @param datim_path path to look up files from `pull_hierarchy`,`pull_mech`,
#' `pull_mer`
#'
#' @export
#' @family validation

validate_output <- function(df, output_path, datim_path=NULL){

    check_output_cols(df)
    check_dates(df)
    check_orgunituids(df)
    check_mechs(df)
    check_inds(df)
    check_disaggs(df)

    #optional check
    if (!is.null(datim_path)) {
        df <- check_content(df, output_path, datim_path)
    }

    return(df)
}


#' Validate columns for export
#'
#' Ensure all expected columns exist before exporting
#'
#' @param df HFR data framed created by `hfr_process_template()`
#' @family validation

check_output_cols <- function(df){

  #check headers
    req_cols <- c("date", "fy", "hfr_pd", "hfr_freq","orgunit",
                  "orgunituid", "mech_code", "partner",
                  "operatingunit", "psnu", "indicator",
                  "agecoarse", "sex", "otherdisaggregate", "val")

    submitted <- names(df)

  #missing columns
   missing <- flag_missing(req_cols, submitted)

  #extra columns
  extra <- flag_extra(req_cols, submitted)

  #print validation
    cat("\nAre there any missing columns for export?", missing,
        "\nAre there any extra columns for export?", extra)
}


#' Validate dates
#'
#' Check whether there are any rows/records with missing dates. Provides readout
#' on this as well as on whether the submission covers multiple periods and the
#' dates covered in the file.
#'
#' @param df HFR data framed created by `hfr_process_template()`
#' @family validation

check_dates <-function(df){

  #missing dates?
    missing_dates <- count_missing(df, date)

  #date range
    pds <- length(unique(df$hfr_pd))

    date_range <- df %>%
      dplyr::distinct(date, fy, hfr_pd) %>%
      dplyr::mutate(hfr_pd = hfr_pd/100,
                    pd = paste0("[", fy + hfr_pd, "]")) %>%
      tidyr::unite(date_pd, c("date", "pd"), sep = " ") %>%
      dplyr::pull(date_pd) %>%
      paste(collapse = ", ")
    date_range <- ifelse(pds > 1, crayon::red(date_range), crayon::green(date_range))
    pds <- ifelse(pds > 1, crayon::red("Yes"), crayon::green("No"))

  #print validation
  cat("\nAre there any missing dates?", missing_dates,
      "\nDoes the submission cover multiple period?", pds,
      "\nWhat dates does the submission cover?", date_range)
}



#' Validate orgunituids for export
#'
#' Check whether there are any rows/records with missing orgunits and provides
#' readout
#'
#' @param df HFR data framed created by `hfr_process_template()`
#' @family validation

check_orgunituids <-function(df){

  #missing orgunituid?
    missing_orgunituid <- count_missing(df, orgunituid)

  #print validation
    cat("\nAre there any missing orgunituids?", missing_orgunituid)
}


#' Validate mechanisms for export
#'
#' Check whether there are any rows/records with missing mechanisms and provides
#' readout of the mechanisms included
#'
#' @param df HFR data framed created by `hfr_process_template()`

check_mechs <-function(df){

  #missing mechanisms?
    missing_mechs<- count_missing(df, mech_code)

  #mechanisms
    mech_list <- unique(df$mech_code) %>% sort() %>% paste(collapse = ", ") %>% crayon::blue()

  #print validation
  cat("\nAre there any missing mech_codes?", missing_mechs,
      "\nWhat mechanism are included?", mech_list)
}


#' Validate indicators for export
#'
#' Check whether there are any rows/records with missing indicators and provides
#' readout
#'
#' @param df HFR data framed created by `hfr_process_template()`

check_inds <-function(df){

    missing_ind <- count_missing(df, indicator)

  #indicators
    req <- c("HTS_TST", "HTS_TST_POS", "PrEP_NEW",
                 "TX_CURR", "TX_MMD", "TX_NEW", "VMMC_CIRC")
    sumbitted <- unique(df$indicator)

    missing <- flag_missing(req, sumbitted)
    extra <- flag_extra(req, sumbitted)


  #print validation
  cat("\nAre there any unspecified indicators?", missing_ind,
      "\nAre there any missing indicators?", missing,
      "\nAre there any extra indicators?", extra)
}


#' Validate disaggs for export
#'
#' Check whether there are any rows/records with missing disaggs and provides
#' readout
#' @param df HFR data framed created by `hfr_process_template()`
#' @family validation

check_disaggs <- function(df){

  #age/sex
    req <- c("Female <15", "Female 15+", "Male <15", "Male 15+")
    submitted <- df %>%
      dplyr::distinct(agecoarse, sex) %>%
      tidyr::unite(agesex, c("sex", "agecoarse"), sep = " ") %>%
      dplyr::pull(agesex)

    missing <- flag_missing(req, submitted)
    extra <- flag_extra(req, submitted)

  #MMD months
    req_otherdisagg <- c("<3 months", "3-5 months", "6 months or more")
    sumbitted_otherdisagg <- unique(df$otherdisaggregate) %>% setdiff(NA)

  #extra otherdisaggs
    extra_otherdisagg <- flag_extra(req_otherdisagg, sumbitted_otherdisagg)

  #print validation
  cat( "\nAre there any missing age/sex disaggs?", missing,
       "\nAre there any extra age/sex disaggs?", extra,
       "\nAre there any extra other disaggs?", extra_otherdisagg, "\n")
}


#' Validate output content
#'
#' Additional/optional validation against DATIM tables.
#'
#' @param df HFR data framed created by `hfr_process_template()`
#' @param datim_path path to look up files from `pull_hierarchy`,`pull_mech`,
#' `pull_mer`
#' @export
#' @return df updated HFR dataframe
#' @family validation
#'
check_content <- function(df, output_path, datim_path) {

  cat("\nLoading lookup tables ...\n")

  # Load lookup tables: load only once
  if ( !exists("orgs") | !exists("ims") ) {
    load_lookups(datim_path)
  }

  cat("\nChecking operatingunits values ...")

  # Check and update operatingunits
    err_ou <- df %>%
      is_ou_valid(df_orgs = orgs) %>%
      dplyr::filter(!valid_ou) %>%
      dplyr::select(-valid_ou) %>%
      dplyr::distinct(operatingunit) %>%
      dplyr::pull()

    if ( length(err_ou) > 0 ) {

      cat("\nAre there any invalid operatingunits?", ifelse(length(err_ou) > 0, paint_red("Yes"), paint_green("No")),
          "\nList of invalid operatingunits: ", paint_red(paste(err_ou, collapse = ", ")))

      cat("\nUpdating operatingunit from mech codes ...")

      # transform & extract unique mech codes
      ims_ou <- ims %>%
        dplyr::mutate(mech_code = mech_code) %>%
        dplyr::select(mech_code , ou = operatingunit) %>%
        dplyr::distinct(mech_code, ou)

      df <- df %>%
        is_ou_valid(df_orgs = orgs) %>%
        dplyr::mutate(mech_code = mech_code) %>%
        dplyr::left_join(ims_ou, by = c("mech_code" = "mech_code")) %>%
        dplyr::mutate(operatingunit = ifelse(valid_ou == FALSE, ou, operatingunit)) %>%
        dplyr::select(-c(valid_ou, ou))

      #Check again after update
      err_ou <- df %>%
        is_ou_valid(df_orgs = orgs) %>%
        dplyr::filter(!valid_ou) %>%
        dplyr::select(-valid_ou) %>%
        dplyr::distinct(operatingunit) %>%
        dplyr::pull()

      cat("\nAre there still any invalid operatingunit?", ifelse(length(err_ou) > 0, paint_red("Yes"), paint_green("No")),
          "\nList of invalid operatingunit: ", ifelse(length(err_ou) > 0, paint_red(paste(err_ou, collapse = ", ")), paint_green("None")))
    }

  # Check the rest of the data
    cat("\nChecking the entire dataset ...")

    df <- df %>%
      is_ou_valid(df_orgs = orgs) %>%
      is_mech_valid(df_mechs = ims) %>%
      is_mech4ou(df_mechs = ims) %>%
      is_orgunituid_valid(df_orgs = orgs) %>%
      is_orgunituid4ou(df_orgs = orgs) %>%
      dplyr::mutate(
        valid_age = ifelse(is.na(agecoarse) | agecoarse %in% valid_age, TRUE, FALSE),
        valid_sex = ifelse(is.na(sex) | sex %in% valid_sex, TRUE, FALSE),
        valid_value = ifelse(is.na(val) | is.integer(val) | val >= 0, TRUE, FALSE)
      )

    # Sum up invalid columns
    grp <- df %>%
      dplyr::select(-c(date:val)) %>%
      names()

    df <- df %>%
      dplyr::rowwise() %>%
      dplyr::mutate(errors = sum(tidyselect::all_of(grp) == FALSE)) %>%
      dplyr::ungroup()

    # Errors count
    n_errors <- df %>%
      dplyr::filter(errors > 0) %>%
      dplyr::distinct(mech_code) %>%
      dplyr::pull() %>%
      length()

    if ( n_errors > 0 ) {
      msg_errors <- paint_red('Yes')
    } else {
      msg_errors <- paint_green('No')
    }

    cat("\nAre there any mechanism with invalid data?", msg_errors)

    if (n_errors > 0) {

      cat("\nList of mechanisms with errros: ",
          paint_red(paste(errors, collapse = ", ")))

      df %>%
        dplyr::group_by(mech_code) %>%
        dplyr::mutate(row_id = dplyr::row_number()) %>%
        dplyr::ungroup() %>%
        dplyr::filter(errors > 0) %>%
        readr::write_csv(., paste0(output_path,
                                   "/HFR_ERRORS_", curr_fy, ".",
                                   stringr::str_pad(dplyr::first(df$hfr_pd), 2, pad = "0"), "_",
                                   paste(errors, collapse = "_"), "_",
                                   format(Sys.Date(),"%Y%m%d"), ".csv"), na = "")

      cat("\nThe errors file is located here: ", paint_blue(datim_path))
    }

    df <- df %>%
      dplyr::select(date:val, errors) %>%
      dplyr::mutate(errors = ifelse(errors > 0, TRUE, FALSE))

    return(df)
}
USAID-OHA-SI/Wavelength documentation built on March 24, 2023, 10:07 a.m.