data-raw/fy23_full_sitelist.R

# PROJECT:  Wavelength
# AUTHOR:   A.Chafetz | USAID
# PURPOSE:  create a FY23 official site list
# LICENSE:  MIT
# DATE:     2022-11-07
# UPDATED:


# DEPENDENCIES ------------------------------------------------------------

library(tidyverse)
library(glamr)
library(googledrive)
library(googlesheets4)
library(janitor)
library(glue)
library(Wavelength)


# GLOBAL ------------------------------------------------------------------

  #add credentials to access drive
  load_secrets()

  #site submission form (sheet)
  gs_id <- as_sheets_id("1aW37FmWOc3TWQqUjoWagcZ53bj9cbaW2W8NFVIUlua4")

  #create temp folder for downloading FY23 site validation files
  temp_folder()
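
  #note: temp_folder() (glamr) also stores the new directory's path as
  #folderpath_tmp in the global environment; that path is reused for the
  #Drive downloads below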


  #return latest FY21 + FY22 site list (prior FYs)
  fy_prior_sitelist <- return_latest("out",
                                     glue("HFR_{str_replace((curr_fy - 1), '20', 'FY')}_GLOBAL_sitelist"))

# DOWNLOAD FY23 SITE VALIDATION FILES -------------------------------------

  #read in googlesheet with all submissions
  df_sub <- read_sheet(gs_id, .name_repair = janitor::make_clean_names)

  #pull each country's latest submission & attach the Drive file name
  df_sub_name <- df_sub %>%
    group_by(operating_unit_country) %>%
    filter(timestamp == max(timestamp)) %>%
    ungroup() %>%
    mutate(id = str_extract(upload_your_hfr_site_validation_file_here, "(?<=id=).*")) %>%
    select(timestamp, operating_unit_country, id, prefill_template) %>%
    mutate(name = map(id, ~ drive_get(as_id(.))$name)) %>%
    unnest(name) %>%
    mutate(name = str_remove_all(name, "'"))
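
  #for reference, the lookbehind keeps everything after "id=" in the stored
  #Drive link, e.g. (illustrative link, not a real submission)
  # str_extract("https://drive.google.com/open?id=1AbCdEfG", "(?<=id=).*")
  #> "1AbCdEfG"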

  #check files to be dropped
    # tidylog::filter(df_sub_name, str_detect(name, str_replace(curr_fy, "20", "FY"), negate = TRUE))

  #remove old files
  df_sub_name <- df_sub_name %>%
    filter(str_detect(name, str_replace(curr_fy, "20", "FY")))
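
  #for reference, curr_fy (set by the loaded OHA packages) is a numeric FY,
  #so the pattern resolves to the current FY label, e.g.
  # str_replace(2023, "20", "FY")
  #> "FY23"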

  #download all
  walk2(.x = df_sub_name$id,
        .y = df_sub_name$name,
        ~ drive_download(as_id(.x),
                         path = file.path(folderpath_tmp, .y), overwrite = TRUE))
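
  #optional sanity check: one downloaded file per retained submission
  # length(list.files(folderpath_tmp)) == nrow(df_sub_name)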

# COMBINE INTO ONE GLOBAL SITE LIST ---------------------------------------

  #file names of submissions to read in
  files_submissions <- list.files(folderpath_tmp, full.names = TRUE, recursive = TRUE)

  #test import files
  # purrr::map(files_submissions,
  #            purrr::possibly(~tidy_sitelist(.x),  otherwise = "Error"))

  #import files
  df_sitelist <- purrr::map_dfr(files_submissions, tidy_sitelist)
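
  #note: tidy_sitelist() (Wavelength) reads and standardizes each country
  #submission; the commented map() + possibly() above is handy for isolating
  #any file that fails to parse before running map_dfr()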

  #flag whether the submission is the original prefilled template
  df_flag <- df_sub_name %>%
    select(operatingunit = operating_unit_country, is_original = prefill_template) %>%
    mutate(is_original = ifelse(is.na(is_original), FALSE, is_original))

  df_sitelist <- left_join(df_sitelist, df_flag, by = "operatingunit")

  #parse operatingunit for ou and country
  df_sitelist <- df_sitelist %>%
    tidyr::separate(operatingunit, c("operatingunit", "countryname"), sep = "/", fill = "right") %>%
    dplyr::mutate(countryname = ifelse(is.na(countryname), operatingunit, countryname))

  #remove any rows where the org unit UID or mech_code is missing
  df_sitelist <- df_sitelist %>%
    tidylog::filter(!is.na(orgunituid)) %>%
    tidylog::filter(!is.na(mech_code))

  #adjust for inclusion of some ending mechanisms, which arrive prefixed with "(!) "
  df_sitelist <- df_sitelist %>%
    mutate(mech_code = str_remove(mech_code, "\\(!\\) "))
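
  #e.g. a flagged ending mechanism "(!) 81234" (illustrative code) becomes "81234"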

  #add source
  df_sitelist <- df_sitelist %>%
    mutate(source = case_when(is_original == FALSE ~ "Country Team Submitted",
                              TRUE ~ "Original DATIM - FY22Q3"))


# VALIDITY CHECK ----------------------------------------------------------

  # df_msd <- si_path() %>%
  #   return_latest("OU_IM") %>%
  #   read_rds()

  df_datim_org <- return_latest("out/DATIM", "GLOBAL_org") %>%
    read_csv()

  # df_msd_mech <- df_msd %>%
  #   filter(fundingagency == "USAID",
  #          fiscal_year == curr_fy,
  #          indicator %in% c("HTS_TST", "TX_NEW", "TX_CURR",
  #                           "PrEP_NEW", "VMMC_CIRC")) %>%
  #   count(operatingunit, countryname, mech_code, indicator, wt = targets)
  #
  #
  # df_sitelist %>%
  #   anti_join(df_msd_mech %>%
  #               select(operatingunit, mech_code)) %>%
  #   distinct(operatingunit, countryname,
  #            mech_code, mech_name, primepartner)
  #
  # df_sitelist %>%
  #   anti_join(df_msd_mech %>%
  #               select(operatingunit, mech_code, indicator)) %>%
  #   distinct(operatingunit, countryname,
  #            mech_code, mech_name, primepartner, indicator)


  #flag submitted sites that do not match the DATIM org hierarchy (printed for review)
  (df_err_orgunit <- df_sitelist %>%
    anti_join(df_datim_org %>%
                select(orgunituid, operatingunit, countryname)) %>%
    distinct(operatingunit, countryname,
             orgunit, orgunituid))

  df_sitelist <- df_sitelist %>%
    tidylog::filter(orgunituid %ni% df_err_orgunit$orgunituid)
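
  #note: %ni% is glamr's negated %in%, so this drops the sites flagged above
  #as missing from the DATIM hierarchy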

# COMBINE CURRENT AND PRIOR SITE LIST -------------------------------------

  #import
  df_sitelist_prior <- read_csv(fy_prior_sitelist,
                                col_types = cols(.default = "c",
                                                 expect_reporting = "l",
                                                 is_original = "l"))

  #resolve submissions spanning two FYs by aligning end's leading digits with start's
  df_sitelist_prior <- df_sitelist_prior %>%
    mutate(end = start %>%
             str_extract("[:digit:]{2}") %>%
             str_replace(end, "[:digit:]{2}", .))
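
  #mechanically (illustrative values only): the first two digits of start
  #replace the first two digits of end
  # str_replace("2299", "[:digit:]{2}", str_extract("2108", "[:digit:]{2}"))
  #> "2199"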

  #combine
  df_sitelist <- bind_rows(df_sitelist, df_sitelist_prior)
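
  #optional check: any duplicate site/mechanism/indicator/period rows
  #introduced by stacking the current and prior lists
  # df_sitelist %>%
  #   count(orgunituid, mech_code, indicator, start) %>%
  #   filter(n > 1)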

  #arrange
  df_sitelist <- df_sitelist %>%
    arrange(start, operatingunit, countryname, orgunit)

  #limit output
  df_sitelist <- df_sitelist %>%
    select(orgunit, orgunituid, type, operatingunit, countryname,
           snu1, psnu, mech_code, mech_name, primepartner,
           start, end, indicator, expect_reporting, is_original, source)

# EXPORT ------------------------------------------------------------------

  date <- format(Sys.Date(), "%Y%m%d")

  fy <- str_sub(curr_fy, -2)

  file <- glue("HFR_FY{fy}_GLOBAL_sitelist_{date}.csv")

  readr::write_csv(df_sitelist,
                   file.path("out", file), na = "")

  grabr::s3_upload(
    filepath = file.path("out", file),
    bucket = "gov-usaid",
    prefix = "ddc/uat/raw/hfr/receiving"
  )