data-raw/resolve_blank-rows.R

# PROJECT:  Wavelength
# AUTHOR:   A.Chafetz | USAID
# PURPOSE:  resolve blank row known issue
# LICENSE:  MIT
# DATE:     2022-06-23
# UPDATED:

# DEPENDENCIES ------------------------------------------------------------

  library(tidyverse)
  library(glamr)
  library(readxl)
  library(openxlsx)
  library(Wavelength)
  library(lubridate)
  library(glue)
  library(googledrive)
  library(googlesheets4)
  library(janitor)
  library(crayon)

# GLOBAL VARIABLES --------------------------------------------------------

  load_secrets()

  temp_folder()

  gdrive_submissions <- as_sheets_id("1gQvY1KnjreRO3jl2wzuVCKmKjUUgZDwByVK1c-bzpYI")


# RESOLVE BLANK ROW ISSUE -------------------------------------------------

  #load table of submissions
  df_subm <- read_sheet(gdrive_submissions, .name_repair = make_clean_names)

  #limit submissions to w
  df_subm_sel <- df_subm %>%
    filter(operating_unit_country %in% c("Zambia", "Western Hemisphere Region/Guatemala"),
           str_detect(hfr_fy_and_period, "FY22")) %>%
    select(id = upload_your_hfr_file_s_here) %>%
    separate_rows(id, sep = ", ") %>%
    mutate(id = str_extract(id, "(?<=id=).*"),
           name = map(id, ~ drive_get(as_id(.))$name)) %>%
    unnest(name)

  df_subm_sel <- df_subm_sel %>%
    filter(name %in% c("HFR_FY22_08_WesternHemisphere_Guatemala_IntraHealth_20220615 - Daniel Muralles 20220616 160640.xlsx",
                       "HFR_FY22_May_Zambia_Partners8Data_20220615 - Mwila Kangwa 20220620 222824.xlsx"))

  #download issue submissions
  df_subm_sel %>%
    pwalk(~drive_download(as_id(..1),
                          file.path(folderpath_tmp, ..2)))

  #store
  files_local <- list.files(folderpath_tmp, full.names = TRUE)


## FUNCTION TO resolve blank row issue

  clean_subm <- function(path){

    #identify country for pasting into meta tab of template
    cntry <- read_excel(path, range = "C2", col_names = "country") %>% pull()

    #identify template type to open a new template of that type
    type <- read_excel(path, range = "C5", col_names = "type") %>% pull()

    #print out status report to console
    cat(paste("  ", green(basename(path)), "\n",
              "   OU:     ", cntry, "\n",
              "   Type:   ", type, "\n",
              "   Sheets: ", str_subset(excel_sheets(path), "meta", TRUE) %>% paste(collapse = ", "), "\n"
    ))

    #determine which columns need to be kept based on template
    col_keep <- switch(type,
                       "Long" = template_cols_long,
                       "Wide" =  template_cols_wide,
                       "Wide - Limited" = template_cols_wide_lim)

    #import problematic submission
    #df_hfr <- hfr_import(path)
    #df_hfr <- suppressMessages(hfr_import(path))

    print("Importing templates ...")

    # Alternative to original import
    df_hfr <- hfr_import2(path, cols = col_keep)

    print(glue("{nrow(df_hfr)} rows"))

    #limit columns (remove blank cols) & resolve date issue from MS import
    print("Fixing dates ...")

    df_hfr_fix <- df_hfr %>%
      hfr_fix_date() %>%
      select(all_of(col_keep)) %>%
      mutate(date = as.character(date))

    #identify which columns are numeric to resolve decimal import issue
    col_num <- setdiff(col_keep,
                       c("date", "orgunit", "orgunituid", "mech_code",  "partner",
                         "operatingunit", "psnu", "indicator", "sex", "agecoarse",
                         "otherdisaggregate"))

    #clean up decimal issue (caused by import)
    print("fixing numeric values ...")

    df_hfr_fix <- df_hfr_fix %>%
      mutate(across(all_of(col_num), ~ as.integer(.)),
             mech_code = str_remove(mech_code, "\\.0$"))

    #print(glimpse(df_hfr_fix))

    #use date to get submission period for meta tab
    d <- unique(df_hfr_fix$date) %>% ymd() %>% median()

    #create submission period to paste into meta tab
    pd <- glue("FY{quarter(d, with_year = TRUE, fiscal_start = 10) %>% str_sub(3, 4)} {month(d, label = TRUE)}") %>% as.character()

    #print out status report to console
    cat(paste("   Date:   ", unique(df_hfr_fix$date) %>% paste(collapse = ", "), "\n",
              "   Period: ", pd, "\n"
    ))

    # cat(paste("  ", green(basename(path)), "\n",
    #           "   OU:     ", cntry, "\n",
    #           "   Date:   ", unique(df_hfr_fix$date) %>% paste(collapse = ", "), "\n",
    #           "   Period: ", pd, "\n",
    #           "   Type:   ", type, "\n",
    #           "   Sheets: ", str_subset(excel_sheets(path), "meta", TRUE) %>% paste(collapse = ", "), "\n"
    # ))

    #identify which template type to open and paste into
    path_template <- case_when(type == "Long" ~ "templates/HFR_Submission_Template_Long.xlsx",
                               type == "Wide" ~ "templates/HFR_Submission_Template_Wide.xlsx",
                               type == "Wide - Limited" ~ "templates/HFR_Submission_Template_Wide_LIMITED.xlsx")


    print(glue("Template: {basename(path_template)}"))

    #open template
    print("Loading template ....")

    wb <- loadWorkbook(path_template)

    #enter required data onto meta tab - country and period
    print(glue("Writing metadata: OU = {cntry}, period = {pd}"))

    writeData(wb, sheet = "meta", xy = c("C", 2), x = cntry)
    writeData(wb, sheet = "meta", xy = c("C", 3), x = pd)

    #paste values into HFR tab
    print(glue("Writing data: rows = {nrow(df_hfr_fix)}"))

    writeData(wb, sheet = "HFR", startRow = 3, colNames = FALSE, x = df_hfr_fix)

    #create and apply same style to cells to match template format
    # print("Creating Styles ...")
    #
    # cell_style <- createStyle(fontName = "Gill Sans MT",
    #                           border = c("top", "bottom", "left", "right"))
    #
    # print("Adding Styles ...")
    #
    # addStyle(wb, sheet = "HFR", cell_style,
    #          cols = 1:ncol(df_hfr_fix),
    #          rows = 3:(nrow(df_hfr_fix)+2),
    #          gridExpand = TRUE)

    #rename the file for exporting
    folder_out <- file.path(dirname(path), "clean")

    if(!dir.exists(folder_out))
      dir.create(folder_out)

    path_out <- file.path(folder_out,
                          basename(path) %>% str_remove(" -.*") %>% paste0("adj_no_styles.xlsx"))

    #save file
    print(glue("Saving workbook to: {path_out}"))

    saveWorkbook(wb, path_out, overwrite = TRUE)
  }

  #' @title Read HFR Template
  #' @note Making sure empty columns or columns with notes are excluded
  #'
  hfr_import2 <- function(filepath, cols){

    df <- filepath %>%
      readxl::excel_sheets() %>%
      stringr::str_subset("HFR") %>%
      purrr::map_dfr(function(.x) {

        # Get columns list - less likely to have notes & empty columns
        curr_cols <- names(read_excel(filepath, sheet = .x, skip=1, n_max=0))

        # TODO - Compare curr_cols to col_keep and retain valid columns only
        com_cols <- curr_cols %in% cols
        idx_cols <- which(com_cols == TRUE)

        # Read column based range and use second row as header
        df <- readxl::read_excel(filepath,
                                 sheet = .x,
                                 range = cellranger::cell_cols(1:length(curr_cols)),
                                 col_types = "text") %>%
          janitor::row_to_names(., 1)

        return(df)
      })

    if("mechanismid" %in% names(df))
      df <- dplyr::rename(df, mech_code = mechanismid)

    return(df)
  }


  hfr_read_wb <- function(filepath) {
    print(filepath)

    openxlsx::getSheetNames(filepath) %>%
      stringr::str_subset("HFR") %>%
      map_dfr(~openxlsx::readWorkbook(filepath, sheet = .x, startRow = 2))
  }

##

  #run function clean files
  walk(files_local, clean_subm)

  folderpath_tmp %>%
    file.path("clean") %>%
    list.files(pattern = ".xlsx$", full.names = T) %>%
    map_dfr(hfr_read_wb)

  edit_file <- "C:/Users/BKAGNI~1/AppData/Local/Temp/1/RtmpyImXmc/file4bc2e2c4f42/clean/HFR_FY22_08_WesternHemisphere_Guatemala_IntraHealth_20220615adj.xlsx"

  read_excel(edit_file, sheet = "HFR", skip = 1)

  wbb <- loadWorkbook(edit_file)

  getSheetNames(edit_file)

  #open temp folder
  shell.exec(folderpath_tmp)

  #check submission status
  s3_objects(bucket = 'gov-usaid',
             prefix = "ddc/uat/raw/hfr/incoming/",
             n = Inf,
             unpack_keys = TRUE) %>%
    filter(nchar(sys_data_object) > 1) %>%
    pull(sys_data_object)



# RESOLVE LIBERIA MMD ISSUE -----------------------------------------------

# df_subm <- read_sheet(gdrive_submissions, .name_repair = make_clean_names)
#
# df_subm_lbr <- df_subm %>%
#   filter(operating_unit_country == "West Africa Region/Liberia",
#          str_detect(hfr_fy_and_period, "FY22")) %>%
#   select(id = upload_your_hfr_file_s_here) %>%
#   mutate(id = str_extract(id, "(?<=id=).*"),
#          name = map(id, ~ drive_get(as_id(.))$name)) %>%
#   unnest(name)
#
# df_subm_lbr %>%
#   pwalk(~drive_download(as_id(..1),
#                         file.path(folderpath_tmp, ..2)))
#
#
# files_local <- list.files(folderpath_tmp, full.names = TRUE)
#
#
# clean_sub <- function(path){
#
#   df_lbr <- read_excel(path, sheet = 2, skip = 1, col_types = "text")
#
#   df_lbr_fix <- df_lbr %>%
#     hfr_fix_date() %>%
#     select(-hfr_freq) %>%
#     mutate(otherdisaggregate = str_extract(indicator, "(?<=MMD ).*"),
#            indicator = str_replace(indicator, "TX_MMD.*", "TX_MMD"),
#            otherdisaggregate = recode(otherdisaggregate,
#                                       "< 3 months" = "<3 months",
#                                       " 3-5 months" = "3-5 months",
#                                       "6+ months" = "6 months or more"),
#            operatingunit = "West Africa Region",
#            psnu = "Liberia",
#            date = as.character(date)) %>%
#     relocate(otherdisaggregate, .after = agecoarse)
#
#   cat(paste(basename(path), "\n",
#             "   Date: ", unique(df_lbr_fix$date), "\n",
#             "   Disaggs: ", unique(df_lbr_fix$otherdisaggregate) %>% paste(collapse = ', '),
#             "\n"
#   ))
#
#   d <- unique(df_lbr_fix$date) %>% ymd()
#
#   pd <- glue("FY{quarter(d, with_year = TRUE, fiscal_start = 10) %>% str_sub(3, 4)} {month(d, label = TRUE)}") %>% as.character()
#
#   cntry <- "West Africa Region/Liberia"
#
#   wb <- loadWorkbook("templates/HFR_Submission_Template_Long.xlsx")
#
#   writeData(wb, sheet = "meta", xy = c("C", 2), x = cntry)
#
#   writeData(wb, sheet = "meta", xy = c("C", 3), x = pd)
#
#   writeData(wb, sheet = "HFR", startRow = 3, colNames = FALSE, x = df_lbr_fix)
#
#   cell_style <- createStyle(fontName = "Gill Sans MT",
#                             border = c("top", "bottom", "left", "right"))
#
#   addStyle(wb, sheet = "HFR", cell_style,
#            cols = 1:ncol(df_lbr_fix),
#            rows = 3:(nrow(df_lbr_fix)+2),
#            gridExpand = TRUE)
#
#   path_out <- str_remove(path, " -.*") %>% paste0("adj.xlsx")
#
#   saveWorkbook(wb, path_out, overwrite = TRUE)
#
# }
#
# walk(files_local, clean_sub)
#
# unlink(files_local)
#
# shell.exec(folderpath_tmp)
#
USAID-OHA-SI/Wavelength documentation built on March 24, 2023, 10:07 a.m.