R/validate_output.R

Defines functions check_content check_disaggs check_numdenom check_inds check_mechs check_orgunituids check_operatingunit check_output_cols validate_output

Documented in check_content check_disaggs check_inds check_mechs check_numdenom check_operatingunit check_orgunituids check_output_cols validate_output

#' Validation Checks
#'
#' @param df HFR data framed created by `cir_process_template()`
#' @param refs Reference datasets to include orgs, mechs, and data elements
#' @param content check full dataset
#'
#' @return list containing validated data along with results of the checks
#' @export

validate_output <- function(df, refs, content=FALSE){

  # Extract Meta Info
  subm_file <- df %>%
    dplyr::filter(!is.na(filename)) %>%
    dplyr::distinct(filename) %>%
    dplyr::pull() %>%
    dplyr::first()

  subm_sheet <- df %>%
    dplyr::filter(!is.na(sheet)) %>%
    dplyr::distinct(sheet) %>%
    dplyr::pull() %>%
    dplyr::first()

  # Notifications
  if (interactive()) {
    usethis::ui_info("---- OUTPUT VALIDATIONS ----")
    usethis::ui_info("---- Missing Values ----")
  }

  ou_miss <- df %>% get_missing("operatingunit")
  pd_miss <- df %>% get_missing("reportingperiod")

  pd_mism <- df %>%
    dplyr::rowwise() %>%
    dplyr::filter(!is.na(reportingperiod)) %>%
    dplyr::mutate(pd_mism = reportingperiod != stringr::str_remove_all(refs$pd, " ")) %>%
    dplyr::ungroup() %>%
    dplyr::filter(pd_mism) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)

  pd_fmt <- df %>% match_value("reportingperiod")

  org_miss <- df %>% get_missing("orgunituid")
  mech_miss <- df %>% get_missing("mech_code")
  ind_miss <- df %>% get_missing("indicator")
  nd_miss <- df %>% get_missing("numeratordenom")

  # Validations
  vout <- tibble::tibble(
    filename = subm_file,
    sheet = subm_sheet,
    pd_missing = paste0(pd_miss, collapse = ", "),
    pd_format = paste0(pd_fmt, collapse = ", "),
    pd_mismatch = paste0(pd_mism, collapse = ", "),
    ou_missing = paste0(ou_miss, collapse = ", "),
    ou_valid = NA,
    org_missing = paste0(org_miss, collapse = ", "),
    org_valid = NA,
    mech_missing = paste0(mech_miss, collapse = ", "),
    mech_valid = NA,
    ind_missing = paste0(ind_miss, collapse = ", "),
    ind_valid = NA,
    nd_missing = paste0(nd_miss, collapse = ", "),
    nd_valid = NA,
    disagg_valid = NA
  )

  if (interactive()) {
    cat("\nOperaringunit: ", empty_to_chr(vout$ou_missing, type="string"),
        "\nReporting Period: ", empty_to_chr(vout$pd_missing, type="string"),
        "\nOrgunit: ", empty_to_chr(vout$org_missing, type="string"),
        "\nMechanisms: ", empty_to_chr(vout$mech_missing, type="string"),
        "\nIndicator: ", empty_to_chr(vout$ind_missing, type="string"),
        "\nNum/Denominator: ", empty_to_chr(vout$nd_missing, type="string"),
        "\n")
  }

  # Skip content validation

  if (content == FALSE) {

    return(
      list(
        status = "success",
        message = "Content validation was skipped as instructed",
        checks = dplyr::select(vout, filename, sheet, dplyr::ends_with("missing")),
        data = df
      )
    )
  }

  if (content == TRUE & (is.null(refs$orgs) | is.null(refs$mechs) | is.null(refs$de))) {
    return(
      list(
        status = "failed",
        message = "Missing some (if not all) of the reference datasets needed for this validation",
        checks = dplyr::select(vout, filename, sheet, dplyr::ends_with("missing")),
        data = df
      )
    )
  }

  # Skip content validation => parameter needs to be set to true + ref datasets

  if (interactive()) {
    usethis::ui_info("---- Invalid Values ----")
  }

  # Check operatingunit
  ou_valid <- check_operatingunit(df, refs$ou)

  vout$ou_valid <- paste0(ou_valid, collapse = ", ")

  # Check orgunits
  org_valid <- check_orgunituids(df, ref_orgs = refs$orgs)

  vout$org_valid <- paste0(org_valid, collapse = ", ")

  # Check mechanisms
  mech_valid <- check_mechs(df, ref_mechs = refs$mechs)

  vout$mech_valid <- paste0(mech_valid, collapse = ", ")

  # Check Indicators
  ind_valid <- check_inds(df, ref_de = refs$de)

  vout$ind_valid <- paste0(ind_valid, collapse = ", ")

  # Check Disaggs
  disagg_valid <- check_disaggs(df, ref_de = refs$de)

  vout$disagg_valid <- paste0(disagg_valid, collapse = ", ")

  # Check numerator / denom
  nd_valid <- check_numdenom(df)

  vout$na_valid <- paste0(nd_valid, collapse = ", ")

  # Notification

  if (interactive()) {
    cat("\nOperaringunit: ", empty_to_chr(vout$ou_valid, type="string"),
        "\nReporting Period (format): ", empty_to_chr(vout$pd_format, type="string"),
        "\nReporting Period (out of bound): ", empty_to_chr(vout$pd_mismatch, type="string"),
        "\nOrgunit: ", empty_to_chr(vout$org_valid, type="string"),
        "\nMechanisms: ", empty_to_chr(vout$mech_valid, type="string"),
        "\nIndicator: ", empty_to_chr(vout$ind_valid, type="string"),
        "\nNum/Denominator: ", empty_to_chr(vout$nd_valid, type="string"),
        "\nDisaggregation: ", empty_to_chr(vout$disagg_valid, type="string"),
        "\n\n")
  }

  # Return data, message and checks
  return(list(
    status = "success",
    message = "All validations successfully completed",
    checks = vout,
    data = df
  ))
}


#' Validate columns for export
#'
#' @param df HFR data framed created by `cir_process_template()`

check_output_cols <- function(df){

  #check headers
  req_cols <- c("reportingperiod", "orgunit", "orgunituid", "mech_code","partner",
                "operatingunit", "psnu", "indicator",
                "sex", "ageasentered", "otherdisaggregate_sub",
                "otherdisaggregate", "numeratordenom", "val",
                "temp_type", "period_meta", "version_meta", "type_meta","filepaths",
                "file_size", "google_id")

  submitted <- names(df)

  #missing columns
  missing <- flag_missing(req_cols, submitted)

  #extra columns
  extra <- flag_extra(req_cols, submitted)

  #print validation
  cat("\nAre there any missing columns for export?", missing,
      "\nAre there any extra columns for export?", extra)
}


#' @title Check OU
#'
#' @param df Data frame from transformed submission
#' @param ou Name of OU/Country submitting data
#'
#' @return list of row ids with invalid operatingunit
#' @export

check_operatingunit <- function(df, ou) {

  # Countries Reference List
  cntries <- pepfar_countries %>%
    dplyr::filter(ou_country == ou | operatingunit == ou) %>%
    dplyr::pull(country)

  # Check if Operating Unit listed is valid
  df %>%
    dplyr::filter(!is.na(operatingunit)) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(valid_ou = operatingunit %in% cntries) %>%
    dplyr::ungroup() %>%
    dplyr::filter(!valid_ou) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)
}


#' Validate orgunituids
#'
#' @param df       HFR data frame containing reshaped submission
#' @param ref_orgs Datim OU Orgunits Reference Data as data frame
#'
#' @export
#' @return list of row ids with invalid orgunituid

check_orgunituids <- function(df, ref_orgs){

  df %>%
    dplyr::filter(!is.na(orgunituid)) %>%
    dplyr::left_join(ref_orgs, by = "orgunituid") %>%
    dplyr::filter(is.na(orgunit_level)) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)

}


#' Validate Implementing Mechanisms for export
#'
#' @param df        HFR data frame containing reshaped submission
#' @param ref_mechs Datim OU Mechanisms Reference Data as data frame
#'
#' @export
#' @return list of row ids with invalid mechanism code

check_mechs <- function(df, ref_mechs){

  df %>%
    dplyr::filter(!is.na(mech_code)) %>%
    dplyr::left_join(ref_mechs, by = "mech_code") %>%
    dplyr::filter(is.na(mech_name)) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)
}


#' Validate indicators for export
#'
#' @param df     HFR data frame containing reshaped submission
#' @param ref_de CIR Data Element Reference Data as data frame
#'
#' @export
#' @return list of row ids with invalid indicators

check_inds <- function(df, ref_de){

  df %>%
    dplyr::filter(!is.na(indicator)) %>%
    dplyr::left_join(
      ref_de[, c("indicator", "field_marking")], by = "indicator") %>%
    dplyr::filter(is.na(field_marking)) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)
}

#' Validate Numerator/Denominator for export
#'
#' @param df     HFR data frame containing reshaped submission
#'
#' @export
#' @return list of row ids with invalid numerator/denominator values

check_numdenom <- function(df){

  df %>%
    dplyr::filter(!is.na(numeratordenom)) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      nd_valid = numeratordenom %in% c("N", "Numerator", "D", "Denominator")
    ) %>%
    dplyr::ungroup() %>%
    dplyr::filter(!nd_valid) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)

}


#' Validate disaggs for export
#'
#' @param df     HFR data frame containing reshaped submission
#' @param ref_de CIR Data Element Reference Data as data frame
#'
#' @export
#' @return list of row ids with invalid indicators' disaggregations

check_disaggs <- function(df, ref_de){

  df %>%
    dplyr::filter(!is.na(indicator)) %>%
    dplyr::left_join(
      dplyr::select(ref_de, !c("tech_area", "disaggregate_group")),
      by = c("indicator", "ageasentered" = "age",
             "sex", "otherdisaggregate",
             "otherdisaggregate_sub", "numeratordenom")
    ) %>%
    dplyr::filter(is.na(field_marking)) %>%
    dplyr::distinct(row_id) %>%
    dplyr::pull(row_id)

}


#' Validate output content
#'
#' @param df HFR DataFrame
#' @param datim_path path of datim lookup files
#' @export
#' @return df updated HFR dataframe
#'
check_content <- function(df, output_path, datim_path) {

  # cat("\nLoading lookup tables ...\n")
  #
  # # Load lookup tables: load only once
  # if ( !exists("df_orgs") | !exists("df_") ) {
  #   load_lookups(datim_path)
  # }


  # LOAD SOMEWHERE

  cntry <- df %>%
    dplyr::distinct(operatingunit) %>%
    dplyr::pull()

  uid <- grabr::get_ouuid(cntry)

  df_orgs <- Wavelength::pull_hierarchy(uid, username = glamr::datim_user(), password = glamr::datim_pwd())
  df_mechs <- Wavelength::pull_mech(usaid_only = TRUE, ou_sel = cntry, folderpath_output = NULL)

  cat("\nChecking operatingunits values ...")

  # Check and update operatingunits
  err_ou <- df %>%
    is_ou_valid(df_orgs = df_orgs) %>%
    dplyr::filter(!valid_ou) %>%
    dplyr::select(-valid_ou) %>%
    dplyr::distinct(operatingunit) %>%
    dplyr::pull()

  if ( length(err_ou) > 1 ) {

    cat("\nAre there any invalid operatingunits?", ifelse(length(err_ou) > 0, paint_red("Yes"), paint_green("No")),
        "\nList of invalid operatingunits: ", paint_red(paste(err_ou, collapse = ", ")))

    cat("\nUpdating operatingunit from mech codes ...")

    # transform & extract unique mech codes
    ims_ou <- df_mechs %>%
      dplyr::mutate(mech_code = mech_code) %>%
      dplyr::select(mech_code , ou = operatingunit) %>%
      dplyr::distinct(mech_code, ou)

    df <- df %>%
      is_ou_valid(df_orgs = df_orgs) %>%
      dplyr::mutate(mech_code = mech_code) %>%
      dplyr::left_join(ims_ou, by = c("mech_code" = "mech_code")) %>%
      dplyr::mutate(operatingunit = ifelse(valid_ou == FALSE, ou, operatingunit)) %>%
      dplyr::select(-c(valid_ou, ou))

    #Check again after update
    err_ou <- df %>%
      is_ou_valid(df_orgs = df_orgs) %>%
      dplyr::filter(!valid_ou) %>%
      dplyr::select(-valid_ou) %>%
      dplyr::distinct(operatingunit) %>%
      dplyr::pull()

    cat("\nAre there still any invalid operatingunit?", ifelse(length(err_ou) > 0, paint_red("Yes"), paint_green("No")),
        "\nList of invalid operatingunit: ", ifelse(length(err_ou) > 0, paint_red(paste(err_ou, collapse = ", ")), paint_green("None")))
  }

  # Check the rest of the data
  cat("\nChecking the entire dataset ...")

  df <- df %>%
    is_ou_valid(df_orgs = df_orgs) %>%
    is_mech_valid(df_mechs = df_mechs) %>%
    is_mech4ou(df_mechs = df_mechs) %>%
    is_orgunituid_valid(df_orgs = df_orgs) %>%
    is_orgunituid4ou(df_orgs = df_orgs)


  valid_ou <- df %>% distinct(valid_ou) %>% pull()
  valid_mech <- df %>% distinct(valid_mech) %>% pull()
  mech_to_ou <- df %>% distinct(mech_to_ou) %>% pull()
  valid_uid <- df %>% distinct(valid_uid) %>% pull()
  uid_to_ou <- df %>% distinct(uid_to_ou) %>% pull()


  cat(
      "\nIs OU valid?", ifelse(length(valid_ou) > 1, paint_red("No"), paint_green("Yes")),
      "\nAre mechs valid?", ifelse(length(valid_mech) > 1, paint_red("No"), paint_green("Yes")),
      "\nIs orgunituid valid?", ifelse(length(valid_uid) > 1, paint_red("No"), paint_green("Yes")),
      "\nDo all org units exist in OU?", ifelse(length(uid_to_ou) > 1, paint_red("No"), paint_green("Yes")),
      "\nAre all mechs valid for the OU?", ifelse(length(mech_to_ou) > 1, paint_red("No"), paint_green("Yes")),
      "\n--------------------------------------------")



#
#   # Sum up invalid columns
#   grp <- df %>%
#     dplyr::select(-c(date:val)) %>%
#     names()
#
#   df <- df %>%
#     dplyr::rowwise() %>%
#     dplyr::mutate(errors = sum(tidyselect::all_of(grp) == FALSE)) %>%
#     dplyr::ungroup()
#
#   # Errors count
#   n_errors <- df %>%
#     dplyr::filter(errors > 0) %>%
#     dplyr::distinct(mech_code) %>%
#     dplyr::pull() %>%
#     length()
#
#   if ( n_errors > 0 ) {
#     msg_errors <- paint_red('Yes')
#   } else {
#     msg_errors <- paint_green('No')
#   }
#
#   cat("\nAre there any mechanism with invalid data?", msg_errors)
#
#   if (n_errors > 0) {
#
#     cat("\nList of mechanisms with errros: ",
#         paint_red(paste(errors, collapse = ", ")))
#
#     df %>%
#       dplyr::group_by(mech_code) %>%
#       dplyr::mutate(row_id = dplyr::row_number()) %>%
#       dplyr::ungroup() %>%
#       dplyr::filter(errors > 0) %>%
#       readr::write_csv(., paste0(output_path,
#                                  "/HFR_ERRORS_", curr_fy, ".",
#                                  stringr::str_pad(dplyr::first(df$hfr_pd), 2, pad = "0"), "_",
#                                  paste(errors, collapse = "_"), "_",
#                                  format(Sys.Date(),"%Y%m%d"), ".csv"), na = "")
#
#     cat("\nThe errors file is located here: ", paint_blue(datim_path))
#   }
#
#   df <- df %>%
#     dplyr::select(date:val, errors) %>%
#     dplyr::mutate(errors = ifelse(errors > 0, TRUE, FALSE))

  return(df)
}
USAID-OHA-SI/Interesting documentation built on Jan. 26, 2023, 5:57 p.m.