data-raw/reprocessing_newest_file.R

## PROJECT: WAVELENGTH
## AUTHOR:  A.Chafetz, B.Kagniniwa, T.Essam | USAID
## PURPOSE: identify files that need to be reprocessed
## LICENSE: MIT
## UPDATED: 2021-05-18

# DEPENDENCIES ------------------------------------------------------------

library(tidyverse)
library(Wavelength)
library(glamr)
library(janitor)
library(lubridate)
library(googledrive)
library(googlesheets4)
library(glue)

# GLOBAL VARIABLES --------------------------------------------------------

  #load secrets
  glamr::load_secrets()

  #google ID
  tracker <- as_sheets_id("1gQvY1KnjreRO3jl2wzuVCKmKjUUgZDwByVK1c-bzpYI")


# IMPORT ------------------------------------------------------------------

  #Google Drive log
  df_tracker <- read_sheet(tracker) %>%
    clean_names()

  #HFR status report
  df_stat <- s3_objects(bucket = "gov-usaid",
                        prefix = "ddc/uat/processed/hfr/outgoing/HFR_Submission") %>%
    filter(last_modified == max(last_modified)) %>%
    pull(key) %>%
    s3_read_object(bucket = "gov-usaid", .)


# MUNGE TRACKER -----------------------------------------------------------

  #clean up namesa nd separate for join with status report
  df_tracker <- df_tracker %>%
    select(timestamp,
           email = email_address,
           country = operating_unit_country,
           hfr_pd = hfr_fy_and_period,
           submission_type = what_type_of_submission_is_this,
           id = upload_your_hfr_file_s_here) %>%
    separate_rows(id, sep = ", ") %>%
    mutate(id = str_extract(id, "(?<=id=).*")) #%>% view()


  #pull in file names to join with tracker
  df_info <- df_tracker %>%
    pull(id) %>%
    purrr::map_dfr(possibly(~drive_get(as_id(.x)), otherwise = NULL)) %>%
    compact() %>% #view()
    select(-drive_resource)

  #join names to tracker for join with status report
  df_tracker <- df_tracker %>%
    left_join(df_info)

  #clean for join/ease
  df_tracker <- df_tracker %>%
    rename(file_name = name,
           submission_date = timestamp)  %>%
    separate(hfr_pd, c("fy", "hfr_pd")) %>%
    mutate(fy = str_replace(fy, "FY", "20"),
           hfr_pd = glue("{hfr_pd} 1, 2020") %>%
             mdy() %m-% months(9) %>%
             month() %>%
             str_pad(2, pad = 0))

# MUNGE STATUS REPORT -----------------------------------------------------

  #filter for the run in question that occured 5/10-11
  df_stat_sel <- df_stat %>%
    filter(processed_date %in% c("2021-05-10", "2021-05-11"))


# JOIN --------------------------------------------------------------------

  #join log and status report to combine file dates and process dates
  df_combo <- df_tracker %>%
    select(-email) %>%
    tidylog::full_join(df_stat_sel %>%
                         select(-c(fy, hfr_pd, operatingunit, countryname, starts_with("records"),
                                   ind, mech, org, pd, tmp, val))) %>%
    select(country, fy, hfr_pd, submission_date, submission_type, processed_date, file_name, id, status,
           everything())

  #flag if there are multiple files from a period and if they need to be rerun
  df_combo <- df_combo %>%
    arrange(country, fy, hfr_pd, processed_date) %>%
    group_by(country, fy, hfr_pd) %>% #BK => Should we add filename in this group? Especially for countries that separate mechanisms?
    mutate(process_in_group = !is.na(processed_date), #BL => Processed dates is very narrow. Should df_combo include only submissions btw the last 2 processes?
           process_in_group = max(process_in_group, na.rm = TRUE)) %>%
    filter(process_in_group == 1, n() > 1) %>%
    mutate(flag = ifelse(submission_date == max(submission_date),
                         is.na(processed_date), FALSE),
           flag = max(flag),
           across(c(flag, process_in_group), as.logical),
           file_to_reprocess = flag == TRUE & max(submission_date) == submission_date & is.na(processed_date)) %>%
    relocate(flag, file_to_reprocess, .before = 1) %>%
    ungroup()

  #review
  df_combo %>%
    filter(flag == TRUE) %>%
    count(file_to_reprocess)

  #export
  df_combo %>%
    filter(flag == TRUE) %>%
    write_csv("out/HFR_files-to-rerun.csv", na = "")
USAID-OHA-SI/Wavelength documentation built on March 24, 2023, 10:07 a.m.