R/identify_newfiles.R

#' Identify New Submissions on Google Drive
#'
#' @param print_files print the list of new files to the console
#' @param id_modified remove the "Modified_" prefix from archived S3 filenames before comparing
#'
#' @return data frame of new files; also assigns the Google Drive submissions
#'  (df_submissions) and the S3 archive files (df_archive) to the global environment
#' @export
#'
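#' @examples
#' \dontrun{
#'   #minimal sketch; assumes Google Drive and S3 (gov-usaid) access is already authorized
#'   df_new <- identify_newfiles()
#'   #df_submissions and df_archive are also assigned to the global environment
#' }
#'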
identify_newfiles <- function(print_files = TRUE, id_modified = TRUE){

  #identify files to ignore
  gdrive_ignore <- googlesheets4::as_sheets_id("1gQvY1KnjreRO3jl2wzuVCKmKjUUgZDwByVK1c-bzpYI")

  suppressMessages(
    lst_ignore <- googlesheets4::read_sheet(gdrive_ignore, "ignore_files") %>%
      dplyr::pull(name_googledrive)
  )

  #identify all files in Google Drive directory to compare
  gdrive_submissions <- googledrive::as_id("0B9c20El0HKU1fldGdnQ3enBfQlFhSW9rQ21XUzdKT2tLdkNRSFlWNnladllLOGY3em5OdWs")

  suppressMessages(
    df_submissions <<- googledrive::drive_ls(gdrive_submissions) %>%
      # glamr::gdrive_metadata() %>%
      dplyr::mutate(created_time = purrr::map_chr(drive_resource, "createdTime") %>%
                      lubridate::ymd_hms(tz = "EST"),
                    modified_time = purrr::map_chr(drive_resource, "modifiedTime") %>%
                      lubridate::ymd_hms(tz = "EST")) %>%
      dplyr::select(name, id, created_time, modified_time) %>%
      dplyr::mutate(exists_gdrive = TRUE) %>%
      dplyr::filter(!name %in% lst_ignore)
  )

  #identify all files in s3 bucket raw/archive to compare [new folder structure]
  df_archive <- grabr::s3_objects(bucket = 'gov-usaid',
                                   prefix = "ddc/uat/raw/hfr/archive/",
                                   n = Inf,
                                   unpack_keys = TRUE) %>%
    dplyr::filter(nchar(sys_data_object) > 1)

  if(id_modified == TRUE)
    df_archive <- dplyr::mutate(df_archive, sys_data_object = stringr::str_remove(sys_data_object, "^Modified_"))

  df_archive <<- df_archive %>%
    dplyr::select(sys_data_object, last_modified) %>%
    dplyr::group_by(sys_data_object) %>%
    dplyr::filter(last_modified == max(last_modified)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(exists_s3 = TRUE)

  #filter for files needed to add to s3 bucket
  df_new <- df_submissions %>%
    dplyr::anti_join(df_archive, by = c("name" = "sys_data_object"))

  #rename files
  if(any(stringr::str_detect(df_new$name, "'"))){
    print("changing file name with apostrophe")
    df_chng <- df_new %>%
      dplyr::filter(stringr::str_detect(name, "\\'")) %>%
      dplyr::mutate(name = stringr::str_remove_all(name, "\\'"))

    purrr::map2(.x = df_chng$id,
                .y = df_chng$name,
                .f = ~googledrive::drive_rename(googledrive::as_id(.x),
                                                name = .y))
    df_new <- df_new %>%
      dplyr::mutate(name = stringr::str_remove_all(name, "\\'"))
  }


  if(print_files == TRUE)
    print(df_new$name)

  return(df_new)
}


#' Apply Time Stamp to File Names
#'
#' @param df data frame from identify_newfiles()
#' @param gdrive_rename rename the files on Google Drive? default = TRUE
#'
#' @export
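#'
#' @return input data frame with a `created_clean` timestamp and a timestamped `name_new` column
#'
#' @examples
#' \dontrun{
#'   #minimal sketch; gdrive_rename = FALSE previews the timestamped names
#'   #without renaming anything on Google Drive
#'   df_new <- identify_newfiles()
#'   df_new <- apply_filetimestamp(df_new, gdrive_rename = FALSE)
#' }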

apply_filetimestamp <- function(df, gdrive_rename = TRUE){
  #add time stamp to file name (to ensure no dup filenames)
  df <- df %>%
    dplyr::mutate(name = ifelse(stringr::str_detect(name, ".xlsx$"), name, glue::glue("{name}.xlsx") %>% as.character),
                  created_clean = created_time %>%
                    stringr::str_remove(":[:digit:]+\\.[:digit:]+Z") %>%
                    stringr::str_remove_all("-|:") %>%
                    stringr::str_replace("T", " "),
                  name_new = ifelse(stringr::str_detect(name, glue::glue("{created_clean}\\.xlsx")),
                                    stringr::str_replace(name, "[:digit:]{8} [:digit:]{6}.xlsx$", glue::glue("{created_clean}\\.xlsx")),
                                    stringr::str_replace(name, ".xlsx$", glue::glue(" {created_clean}\\.xlsx"))))


  if(gdrive_rename == TRUE){
    #rename all new files on googledrive with date and time stamp
    purrr::map2(.x = df$id,
                .y = df$name_new,
                .f = ~ googledrive::drive_rename(googledrive::as_id(.x), .y))
  }

  return(df)
}


#' Download New Submissions for Upload
#'
#' @param df data frame from apply_filetimestamp()
#'
#' @export
#'
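#' @return character vector of paths to the downloaded files in the temp folder
#'
#' @examples
#' \dontrun{
#'   #minimal sketch of the download step; assumes Google Drive auth is set up
#'   df_new <- identify_newfiles()
#'   df_new <- apply_filetimestamp(df_new)
#'   local_files <- download_new(df_new)
#' }
#'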
download_new <- function(df){

  #create a temp folder for storing downloads (glamr::temp_folder() assigns folderpath_tmp)
  glamr::temp_folder()

  #download all files missing from s3 locally
  purrr::walk2(.x = df$id,
               .y = df$name_new,
               .f = ~googledrive::drive_download(googledrive::as_id(.x),
                                                 file.path(folderpath_tmp, .y),
                                                 overwrite = TRUE))

  files <- list.files(folderpath_tmp, full.names = TRUE)

  #launch tmp folder (shell.exec is Windows-only)
  shell.exec(folderpath_tmp)

  return(files)
}


#' @title Identify S3 file names matching Ignored files
#'
#' @param file_name  Filename to be used as the lookup
#' @param df_ref     Reference data frame; recommended to use files present in S3 but not on Google Drive, e.g. `df_ghosts`
#' @param ref_name   Column name in `df_ref` to be used for the lookup
#'
#' @export
#'
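#' @return character vector the same length as `file_name`: the matching name(s)
#'   in `df_ref` (comma separated when more than one), or `NA` when no match is found
#'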
#' @examples
#' \dontrun{
#'
#'   library(tidyverse)
#'   library(Wavelength)
#'   library(googlesheets4)
#'
#'   ss_sbm <- as_sheets_id("<xyz>")
#'
#'   df_ignore_files <- read_sheet(ss = ss_sbm, "ignore_files")
#'
#'   df_new <- identify_newfiles() # this will also generate df_submissions & df_archive
#'
#'   # Ghost files
#'   df_ghosts <- df_submissions %>%
#'      full_join(df_archive, by = c("name" = "sys_data_object")) %>%
#'      filter(is.na(exists_gdrive))
#'
#'   # Match ignored files
#'   df_ignore_files <- df_ignore_files %>%
#'     mutate(names_s3 = match_ignoredfiles(name_googledrive, df_ghosts, "name"))
#'
#'   # Update ignore files table
#'   range_write(ss = ss_sbm,
#'               data = df_ignore_files,
#'               sheet = "ignore_files",
#'               col_names = TRUE)
#' }
#'
match_ignoredfiles <- function(file_name, df_ref,
                               ref_name = "name") {

  new_names <- file_name %>%
    purrr::map(function(.x) {

      #candidate patterns built from the full file name
      a <- paste0("^", .x)
      b <- paste0(.x, "$")
      c <- paste0(".*", .x, "$")

      #candidate patterns built from the file name with its date/time stamp stripped
      d <- .x %>%
        stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
        paste0(., ".*")

      e <- .x %>%
        stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
        paste0("^", ., ".*")

      f <- .x %>%
        stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
        paste0(".*", ., ".*")

      #file name with only the date/time stamp removed, for an exact match
      z <- .x %>% stringr::str_remove("\\s\\d{8}\\s\\d{6}")

      #combine the candidate patterns into a single regular expression
      s <- c(a, b, c, d, e, f) %>%
        paste(collapse = "|")

      #filter the lookup column of the reference data frame for matches
      nm <- df_ref %>%
        dplyr::filter(.data[[ref_name]] == z | stringr::str_detect(.data[[ref_name]], s)) %>%
        .[[ref_name]]

      if(purrr::is_empty(nm))
        return(NA_character_)

      if(length(nm) > 1)
        return(nm %>% paste(collapse = ", "))

      return(nm)

    }) %>% unlist()

  return(new_names)
}