R/ddc_prevalidation.R

Defines functions ddcpv_provide_status ddcpv_split_singlerow ddcpv_resolve_singlerow ddcpv_resolve_order is_singlerow is_misaligned ddcpv_get_tabinfo ddcpv_check

Documented in ddcpv_check ddcpv_get_tabinfo ddcpv_provide_status ddcpv_resolve_order ddcpv_resolve_singlerow ddcpv_split_singlerow is_misaligned is_singlerow

#' DDC Pre-Validation Check
#'
#' This code should be run on HFR submission prior to loading into DDC/s3.
#' Currently, DDC cannot handle two issues - (1) tabs with only one row of data
#' and (2) tabs not ordered from least to greatest This code resolves the
#' first by creating a second row of data with the first value and the second
#' by reordering tabs using openxlsx.
#'
#' @param filepath path to HFR submission
#'
#' @return print out of checks and
#' @export
#'
#' @examples
#' \dontrun{
#' files <- list.files("ou_submissions/", "xlsx", full.names = TRUE)
#' ddcpv_check(files[1])
#' purrr::walk(files, ddcpv_check) }

ddcpv_check <- function(filepath){

  package_check("openxlsx")

  cat("checking", basename(filepath),"... ")

  #extract info relevant for pre-validation checks: tab order and single rows
  tab_info <- ddcpv_get_tabinfo(filepath)

  #resolve tab order if there is an issue
  ddcpv_resolve_order(filepath, tab_info)

  #resolve single rows if there is an issue
  ddcpv_resolve_singlerow(filepath, tab_info)

  #status
  ddcpv_provide_status(filepath, tab_info)

}



#' Extract Necessary Info on Tabs
#'
#' @param filepath path to HFR submission
#'
#' @keywords internal
ddcpv_get_tabinfo <- function(filepath){

  #id type
  type <- hfr_extract_meta(filepath)
  import_cols <- switch(type,
                        "Long" = template_cols_long,
                        "Wide" = template_cols_wide,
                        "Wide - Limited" = template_cols_long)

  #identify all the data tabs in file
  tabs <- filepath %>%
    readxl::excel_sheets() %>%
    stringr::str_subset("HFR") %>%
    purrr::set_names()

  #count the number of rows per tab to reorder on (Trifacta needed)
  tab_info <- purrr::map_dfr(tabs,
                             purrr::possibly(
                               ~ readxl::read_excel(filepath,
                                                    sheet = .x, skip = 1,
                                                    col_names = import_cols,
                                                    col_types = "text") %>%
                                 dplyr::count(name = "rows")
                             ),
                             .id = "tab")

  #Stop if there are tabs not read in (extra columns)
  if(nrow(tab_info) == 0)
    return(usethis::ui_stop("Review the following tabs for extra columns:
                            {paste0(tabs, collapse = ', ')}"))
  if(length(setdiff(tabs, tab_info$tab) > 0))
    return(usethis::ui_stop("Review the following tabs for extra columns:
                            {paste0(setdiff(tabs, tab_info$tab), collapse = ', ')}"))

  #identify if the row rank aligns with the tab order
  tab_info <- tab_info %>%
    dplyr::mutate(tab_order = dplyr::row_number(),
                  rank = dplyr::row_number(rows),
                  # rank = rank(-rows, ties.method= "first"),
                  misaligned = tab_order != rank)

  #identify if tabs are only 1 line
  tab_info <- tab_info %>%
    dplyr::mutate(singlerow = rows == 1)

  return(tab_info)
}


#' Flag if any tabs are not ordered greatest to least
#'
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return boolean
#' @keywords internal
is_misaligned <- function(tab_info){

  sum(tab_info$misaligned) > 0

}

#' Flag if any tabs have only one row
#'
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return boolean
#' @keywords internal
is_singlerow <- function(tab_info){

  sum(tab_info$singlerow) > 0

}


#' Resolve Tab Order
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_resolve_order <- function(filepath, tab_info){

  if(is_misaligned(tab_info)){

    #read in all tab and assign group order (meta, hfr, other)
    tabs_all <- filepath %>%
      readxl::excel_sheets() %>%
      tibble::enframe(name = "order", value = "tab") %>%
      dplyr::mutate(grp_order = dplyr::case_when(tab == "meta" ~ 1,
                                                 stringr::str_detect(tab, "HFR") ~ 2,
                                                 TRUE ~ 3))

    #join with tab info and pull new order of tabs
    tab_order <- tabs_all %>%
      dplyr::left_join(tab_info, by = "tab") %>%
      dplyr::arrange(grp_order, rows) %>%
      dplyr::pull(order)

    #load submission file
    wb <- openxlsx::loadWorkbook(filepath)

    #reorder
    openxlsx::worksheetOrder(wb) <- tab_order

    #overwrite original download
    openxlsx::saveWorkbook(wb, filepath, overwrite = TRUE)

  }

}


#' Resolve Single Row Tabs
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_resolve_singlerow <- function(filepath, tab_info){

  if(is_singlerow(tab_info)){

    tabs <- tab_info %>%
      dplyr::filter(singlerow == TRUE) %>%
      dplyr::pull(tab)

     purrr::walk(tabs, ~ ddcpv_split_singlerow(filepath, .x))

  }
}

#' Fixes Single Row for one tab
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_split_singlerow <- function(filepath, tab){

    #read in sheet with one row of data
    df_orig <- readxl::read_excel(filepath, sheet = tab,
                                  skip = 1,
                                  col_types = "text")

    #identify meta data columns with no data
    meta_cols <- template_cols_long[1:11]

    #identify first column of reported data to then move to a send row
    df_replace <- df_orig %>%
      dplyr::select(!dplyr::matches(meta_cols)) %>%
      tidyr::pivot_longer(!dplyr::matches(meta_cols),
                          values_drop_na = TRUE) %>%
      dplyr::slice_head(n = 1)

    col_rep <- df_replace$name
    col_val <- df_replace$value

    #create the second row of the submission
    df_ln2 <- df_orig %>%
      dplyr::select(dplyr::matches(meta_cols)) %>%
      dplyr::mutate(!!col_rep := col_val)

    #remove data from the specified column, append, and clean date
    df_split <- df_orig %>%
      dplyr::mutate(!!col_rep := NA) %>%
      dplyr::bind_rows(df_ln2)

    #clean date
    df_split <- df_split %>%
      hfr_fix_date() %>%
      dplyr::mutate(date = as.character(date))

    #conver value rows to numeric
    suppressWarnings(
      df_split <- df_split %>%
        dplyr::mutate(dplyr::across(!dplyr::matches(meta_cols), as.numeric))
    )

    #load workbook
    wb <- openxlsx::loadWorkbook(filepath)

    #overwrite data in tab
    openxlsx::writeData(wb, tab, df_split,
                        startRow = 3,
                        colNames = FALSE)
    #save
    openxlsx::saveWorkbook(wb, filepath, overwrite = TRUE)
}


#' Status Output
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return print out and status df to global envir
#' @keywords internal
ddcpv_provide_status <- function(filepath, tab_info){

  df_file_status <- tab_info %>%
    dplyr::mutate(singlerow_tab = dplyr::case_when(singlerow == TRUE ~ tab),
                  file = basename(filepath)) %>%
    dplyr::group_by(file) %>%
    dplyr::summarise(dplyr::across(c(misaligned, singlerow), max, na.rm = TRUE),
                     singlerow_tab = paste(singlerow_tab, collapse = ","),
                                   .groups = "drop") %>%
    dplyr::mutate(singlerow_tab = stringr::str_remove_all(singlerow_tab, "NA,|NA"),
                  across(c(misaligned, singlerow), as.logical),
                  status = dplyr::case_when(misaligned == TRUE & singlerow == TRUE ~ "resolved tab order and single row",
                                            misaligned == TRUE ~ "resolved tab order",
                                            singlerow == TRUE ~ "resolved single row",
                                            TRUE ~ "good"
                                            ))

  if(exists("df_stat_rep")){
    df_stat_rep <<- dplyr::bind_rows(df_stat_rep, df_file_status)
  } else {
    df_stat_rep <<- df_file_status
  }


  status <- if(df_file_status$status == "good"){
    crayon::blue("good")
  } else {
    crayon::yellow(df_file_status$status)
  }

  cat(status, "\n")

}
USAID-OHA-SI/Wavelength documentation built on March 24, 2023, 10:07 a.m.