#' DDC Pre-Validation Check
#'
#' This code should be run on HFR submission prior to loading into DDC/s3.
#' Currently, DDC cannot handle two issues - (1) tabs with only one row of data
#' and (2) tabs not ordered from least to greatest This code resolves the
#' first by creating a second row of data with the first value and the second
#' by reordering tabs using openxlsx.
#'
#' @param filepath path to HFR submission
#'
#' @return print out of checks and
#' @export
#'
#' @examples
#' \dontrun{
#' files <- list.files("ou_submissions/", "xlsx", full.names = TRUE)
#' ddcpv_check(files[1])
#' purrr::walk(files, ddcpv_check) }
ddcpv_check <- function(filepath){
package_check("openxlsx")
cat("checking", basename(filepath),"... ")
#extract info relevant for pre-validation checks: tab order and single rows
tab_info <- ddcpv_get_tabinfo(filepath)
#resolve tab order if there is an issue
ddcpv_resolve_order(filepath, tab_info)
#resolve single rows if there is an issue
ddcpv_resolve_singlerow(filepath, tab_info)
#status
ddcpv_provide_status(filepath, tab_info)
}
#' Extract Necessary Info on Tabs
#'
#' @param filepath path to HFR submission
#'
#' @keywords internal
ddcpv_get_tabinfo <- function(filepath){
#id type
type <- hfr_extract_meta(filepath)
import_cols <- switch(type,
"Long" = template_cols_long,
"Wide" = template_cols_wide,
"Wide - Limited" = template_cols_long)
#identify all the data tabs in file
tabs <- filepath %>%
readxl::excel_sheets() %>%
stringr::str_subset("HFR") %>%
purrr::set_names()
#count the number of rows per tab to reorder on (Trifacta needed)
tab_info <- purrr::map_dfr(tabs,
purrr::possibly(
~ readxl::read_excel(filepath,
sheet = .x, skip = 1,
col_names = import_cols,
col_types = "text") %>%
dplyr::count(name = "rows")
),
.id = "tab")
#Stop if there are tabs not read in (extra columns)
if(nrow(tab_info) == 0)
return(usethis::ui_stop("Review the following tabs for extra columns:
{paste0(tabs, collapse = ', ')}"))
if(length(setdiff(tabs, tab_info$tab) > 0))
return(usethis::ui_stop("Review the following tabs for extra columns:
{paste0(setdiff(tabs, tab_info$tab), collapse = ', ')}"))
#identify if the row rank aligns with the tab order
tab_info <- tab_info %>%
dplyr::mutate(tab_order = dplyr::row_number(),
rank = dplyr::row_number(rows),
# rank = rank(-rows, ties.method= "first"),
misaligned = tab_order != rank)
#identify if tabs are only 1 line
tab_info <- tab_info %>%
dplyr::mutate(singlerow = rows == 1)
return(tab_info)
}
#' Flag if any tabs are not ordered greatest to least
#'
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return boolean
#' @keywords internal
is_misaligned <- function(tab_info){
sum(tab_info$misaligned) > 0
}
#' Flag if any tabs have only one row
#'
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return boolean
#' @keywords internal
is_singlerow <- function(tab_info){
sum(tab_info$singlerow) > 0
}
#' Resolve Tab Order
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_resolve_order <- function(filepath, tab_info){
if(is_misaligned(tab_info)){
#read in all tab and assign group order (meta, hfr, other)
tabs_all <- filepath %>%
readxl::excel_sheets() %>%
tibble::enframe(name = "order", value = "tab") %>%
dplyr::mutate(grp_order = dplyr::case_when(tab == "meta" ~ 1,
stringr::str_detect(tab, "HFR") ~ 2,
TRUE ~ 3))
#join with tab info and pull new order of tabs
tab_order <- tabs_all %>%
dplyr::left_join(tab_info, by = "tab") %>%
dplyr::arrange(grp_order, rows) %>%
dplyr::pull(order)
#load submission file
wb <- openxlsx::loadWorkbook(filepath)
#reorder
openxlsx::worksheetOrder(wb) <- tab_order
#overwrite original download
openxlsx::saveWorkbook(wb, filepath, overwrite = TRUE)
}
}
#' Resolve Single Row Tabs
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_resolve_singlerow <- function(filepath, tab_info){
if(is_singlerow(tab_info)){
tabs <- tab_info %>%
dplyr::filter(singlerow == TRUE) %>%
dplyr::pull(tab)
purrr::walk(tabs, ~ ddcpv_split_singlerow(filepath, .x))
}
}
#' Fixes Single Row for one tab
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @keywords internal
ddcpv_split_singlerow <- function(filepath, tab){
#read in sheet with one row of data
df_orig <- readxl::read_excel(filepath, sheet = tab,
skip = 1,
col_types = "text")
#identify meta data columns with no data
meta_cols <- template_cols_long[1:11]
#identify first column of reported data to then move to a send row
df_replace <- df_orig %>%
dplyr::select(!dplyr::matches(meta_cols)) %>%
tidyr::pivot_longer(!dplyr::matches(meta_cols),
values_drop_na = TRUE) %>%
dplyr::slice_head(n = 1)
col_rep <- df_replace$name
col_val <- df_replace$value
#create the second row of the submission
df_ln2 <- df_orig %>%
dplyr::select(dplyr::matches(meta_cols)) %>%
dplyr::mutate(!!col_rep := col_val)
#remove data from the specified column, append, and clean date
df_split <- df_orig %>%
dplyr::mutate(!!col_rep := NA) %>%
dplyr::bind_rows(df_ln2)
#clean date
df_split <- df_split %>%
hfr_fix_date() %>%
dplyr::mutate(date = as.character(date))
#conver value rows to numeric
suppressWarnings(
df_split <- df_split %>%
dplyr::mutate(dplyr::across(!dplyr::matches(meta_cols), as.numeric))
)
#load workbook
wb <- openxlsx::loadWorkbook(filepath)
#overwrite data in tab
openxlsx::writeData(wb, tab, df_split,
startRow = 3,
colNames = FALSE)
#save
openxlsx::saveWorkbook(wb, filepath, overwrite = TRUE)
}
#' Status Output
#'
#' @param filepath path to HFR submission
#' @param tab_info data frame from ddcpv_get_tabinfo
#'
#' @return print out and status df to global envir
#' @keywords internal
ddcpv_provide_status <- function(filepath, tab_info){
df_file_status <- tab_info %>%
dplyr::mutate(singlerow_tab = dplyr::case_when(singlerow == TRUE ~ tab),
file = basename(filepath)) %>%
dplyr::group_by(file) %>%
dplyr::summarise(dplyr::across(c(misaligned, singlerow), max, na.rm = TRUE),
singlerow_tab = paste(singlerow_tab, collapse = ","),
.groups = "drop") %>%
dplyr::mutate(singlerow_tab = stringr::str_remove_all(singlerow_tab, "NA,|NA"),
across(c(misaligned, singlerow), as.logical),
status = dplyr::case_when(misaligned == TRUE & singlerow == TRUE ~ "resolved tab order and single row",
misaligned == TRUE ~ "resolved tab order",
singlerow == TRUE ~ "resolved single row",
TRUE ~ "good"
))
if(exists("df_stat_rep")){
df_stat_rep <<- dplyr::bind_rows(df_stat_rep, df_file_status)
} else {
df_stat_rep <<- df_file_status
}
status <- if(df_file_status$status == "good"){
crayon::blue("good")
} else {
crayon::yellow(df_file_status$status)
}
cat(status, "\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.