## PROJECT: WAVELENGTH
## AUTHOR: A.Chafetz, B.Kagniniwa, T.Essam | USAID
## PURPOSE: identify files that need to be reprocessed
## LICENSE: MIT
## UPDATED: 2021-05-18
# DEPENDENCIES ------------------------------------------------------------
library(tidyverse)
library(Wavelength)
library(glamr)
library(janitor)
library(lubridate)
library(googledrive)
library(googlesheets4)
library(glue)
# GLOBAL VARIABLES --------------------------------------------------------
#load secrets
glamr::load_secrets()
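#note: load_secrets() is assumed here to handle the OAuth for the
#googledrive/googlesheets4 calls and the S3 credentials used in the IMPORT
#step below; store credentials via the glamr setup routines first if none
#are saved locally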
#google ID
tracker <- as_sheets_id("1gQvY1KnjreRO3jl2wzuVCKmKjUUgZDwByVK1c-bzpYI")
# IMPORT ------------------------------------------------------------------
#Google Drive log
df_tracker <- read_sheet(tracker) %>%
  clean_names()
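#worked example of the renaming clean_names() performs on the form headers:
# janitor::make_clean_names("What type of submission is this?")
#> "what_type_of_submission_is_this"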
#HFR status report
df_stat <- s3_objects(bucket = "gov-usaid",
                      prefix = "ddc/uat/processed/hfr/outgoing/HFR_Submission") %>%
  filter(last_modified == max(last_modified)) %>%
  pull(key) %>%
  s3_read_object(bucket = "gov-usaid", .)
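#(optional sanity check) the status report should carry the columns used
#below (processed_date, status, file_name, fy, hfr_pd, ...); uncomment to inspect
# glimpse(df_stat)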
# MUNGE TRACKER -----------------------------------------------------------
#clean up names and separate file ids for join with status report
df_tracker <- df_tracker %>%
  select(timestamp,
         email = email_address,
         country = operating_unit_country,
         hfr_pd = hfr_fy_and_period,
         submission_type = what_type_of_submission_is_this,
         id = upload_your_hfr_file_s_here) %>%
  separate_rows(id, sep = ", ") %>%
  mutate(id = str_extract(id, "(?<=id=).*"))
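#worked example (hypothetical link) of the id extraction above:
# str_extract("https://drive.google.com/open?id=1AbCdEfG", "(?<=id=).*")
#> "1AbCdEfG"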
#pull in file names to join with tracker
df_info <- df_tracker %>%
  pull(id) %>%
  purrr::map_dfr(possibly(~drive_get(as_id(.x)), otherwise = NULL)) %>%
  compact() %>%
  select(-drive_resource)
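#note: drive_get() returns a dribble (name, id, drive_resource); possibly()
#skips any id the API cannot resolve, so df_info may have fewer rows than
#df_tracker has ids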
#join file names onto the tracker ahead of the status report join
df_tracker <- df_tracker %>%
  left_join(df_info, by = "id")
#clean up names and derive fiscal year/period for the join
df_tracker <- df_tracker %>%
  rename(file_name = name,
         submission_date = timestamp) %>%
  separate(hfr_pd, c("fy", "hfr_pd")) %>%
  mutate(fy = str_replace(fy, "FY", "20"),
         hfr_pd = glue("{hfr_pd} 1, 2020") %>%
           mdy() %m-% months(9) %>%
           month() %>%
           str_pad(2, pad = "0"))
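#worked example of the period math: "FY21 Oct" separates into fy = "FY21",
#hfr_pd = "Oct"; fy then becomes "2021" and
# mdy("Oct 1, 2020") %m-% months(9) #> 2020-01-01
#so Oct maps to hfr_pd = "01", the first month of the fiscal year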
# MUNGE STATUS REPORT -----------------------------------------------------
#filter for the run in question, which occurred 5/10-11
df_stat_sel <- df_stat %>%
  filter(processed_date %in% c("2021-05-10", "2021-05-11"))
# JOIN --------------------------------------------------------------------
#join log and status report to combine file dates and process dates
df_combo <- df_tracker %>%
  select(-email) %>%
  tidylog::full_join(df_stat_sel %>%
                       select(-c(fy, hfr_pd, operatingunit, countryname, starts_with("records"),
                                 ind, mech, org, pd, tmp, val))) %>%
  select(country, fy, hfr_pd, submission_date, submission_type, processed_date, file_name, id, status,
         everything())
#flag if there are multiple files from a period and whether they need to be rerun
df_combo <- df_combo %>%
  arrange(country, fy, hfr_pd, processed_date) %>%
  group_by(country, fy, hfr_pd) %>% #BK => Should we add file_name to this grouping? Especially for countries that submit separate mechanism files?
  mutate(process_in_group = !is.na(processed_date), #BK => The processed date window is very narrow. Should df_combo include only submissions between the last 2 runs?
         process_in_group = max(process_in_group, na.rm = TRUE)) %>%
  filter(process_in_group == 1, n() > 1) %>%
  mutate(flag = ifelse(submission_date == max(submission_date),
                       is.na(processed_date), FALSE),
         flag = max(flag),
         across(c(flag, process_in_group), as.logical),
         file_to_reprocess = flag == TRUE & max(submission_date) == submission_date & is.na(processed_date)) %>%
  relocate(flag, file_to_reprocess, .before = 1) %>%
  ungroup()
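#logic recap: within each country/fy/period group that has at least one
#processed file and more than one submission, flag the group when its latest
#submission lacks a processed_date; file_to_reprocess marks that specific
#newest, unprocessed file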
#review
df_combo %>%
  filter(flag == TRUE) %>%
  count(file_to_reprocess)
#export
df_combo %>%
  filter(flag == TRUE) %>%
  write_csv("out/HFR_files-to-rerun.csv", na = "")