#' Identify New Submissions on Google Drive
#'
#' @param print_files print out list of new files
#' @param id_modified removed Modified from filename
#'
#' @return new files, along with submission (df_submissions) and s3 files
#' from the archive folder (df_archive)
#' @export
#'
identify_newfiles <- function(print_files = TRUE, id_modified = TRUE){
#identify files to ignore
gdrive_ignore <- googlesheets4::as_sheets_id("1gQvY1KnjreRO3jl2wzuVCKmKjUUgZDwByVK1c-bzpYI")
suppressMessages(
lst_ignore <- googlesheets4::read_sheet(gdrive_ignore, "ignore_files") %>%
dplyr::pull(name_googledrive)
)
#identify all files in Google Drive directory to compare
gdrive_submissions <- googledrive::as_id("0B9c20El0HKU1fldGdnQ3enBfQlFhSW9rQ21XUzdKT2tLdkNRSFlWNnladllLOGY3em5OdWs")
suppressMessages(
df_submissions <<- googledrive::drive_ls(gdrive_submissions) %>%
# glamr::gdrive_metadata() %>%
dplyr::mutate(created_time = purrr::map_chr(drive_resource, "createdTime") %>%
lubridate::ymd_hms(tz = "EST"),
modified_time = purrr::map_chr(drive_resource, "modifiedTime") %>%
lubridate::ymd_hms(tz = "EST")) %>%
dplyr::select(name, id, created_time, modified_time) %>%
dplyr::mutate(exists_gdrive = TRUE) %>%
dplyr::filter(!name %in% lst_ignore)
)
#identify all files in s3 bucket raw/archive to compare [new folder structure]
df_archive <- grabr::s3_objects(bucket = 'gov-usaid',
prefix = "ddc/uat/raw/hfr/archive/",
n = Inf,
unpack_keys = TRUE) %>%
dplyr::filter(nchar(sys_data_object) > 1)
if(id_modified == TRUE)
df_archive <- dplyr::mutate(df_archive, sys_data_object = stringr::str_remove(sys_data_object, "^Modified_"))
df_archive <<- df_archive %>%
dplyr::select(sys_data_object, last_modified) %>%
dplyr::group_by(sys_data_object) %>%
dplyr::filter(last_modified == max(last_modified)) %>%
dplyr::ungroup() %>%
dplyr::mutate(exists_s3 = TRUE)
#filter for files needed to add to s3 bucket
df_new <- df_submissions %>%
dplyr::anti_join(df_archive, by = c("name" = "sys_data_object"))
#rename files
if(any(stringr::str_detect(df_new$name, "'"))){
print("changing file name with apostrophe")
df_chng <- df_new %>%
dplyr::filter(stringr::str_detect(name, "\\'")) %>%
dplyr::mutate(name = stringr::str_remove_all(name, "\\'"))
purrr::map2(.x = df_chng$id,
.y = df_chng$name,
.f = ~googledrive::drive_rename(googledrive::as_id(.x),
name = .y))
df_new <- df_new %>%
dplyr::mutate(name = stringr::str_remove_all(name, "\\'"))
}
if(print_files == TRUE)
print(df_new$name)
return(df_new)
}
#' Apply Time Stamp to file and
#'
#' @param df dataframe from identify_newfiles()
#' @param gdrive_rename rename on Google drive? defaule = TRUE
#'
#' @export
apply_filetimestamp <- function(df, gdrive_rename = TRUE){
#add time stamp to file name (to ensure no dup filenames)
df <- df %>%
dplyr::mutate(name = ifelse(stringr::str_detect(name, ".xlsx$"), name, glue::glue("{name}.xlsx") %>% as.character),
created_clean = created_time %>%
stringr::str_remove(":[:digit:]+\\.[:digit:]+Z") %>%
stringr::str_remove_all("-|:") %>%
stringr::str_replace("T", " "),
name_new = ifelse(stringr::str_detect(name, glue::glue("{created_clean}\\.xlsx")),
stringr::str_replace(name, "[:digit:]{8} [:digit:]{6}.xlsx$", glue::glue("{created_clean}\\.xlsx")),
stringr::str_replace(name, ".xlsx$", glue::glue(" {created_clean}\\.xlsx"))))
if(gdrive_rename == TRUE){
#rename all new files on googledrive with date and time stamp
purrr::map2(.x = df$id,
.y = df$name_new,
.f = ~ googledrive::drive_rename(googledrive::as_id(.x), .y))
}
return(df)
}
#' Download New Submission to upload
#'
#' @param df dataframe from apply_filetimestamp
#'
#' @export
#'
download_new <- function(df){
#create a temp folder for storing downloads
glamr::temp_folder()
#download all files missing from s3 locally
purrr::walk2(.x = df$id,
.y = df$name_new,
.f = ~googledrive::drive_download(googledrive::as_id(.x),
file.path(folderpath_tmp, .y),
overwrite = TRUE))
files <- list.files(folderpath_tmp, full.names = TRUE)
#launch tmp folder
shell.exec(folderpath_tmp)
return(files)
}
#' @title Identify S3 file names matching Ignored files
#'
#' @param file_name Filename to be used as look up
#' @param df_ref Reference data frame, Recommend using files present in S3 but not in gdrive, Eg: `df_ghosts`
#' @param ref_name Column name to be used for lookup
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' library(tidyverse)
#' library(Wavelength)
#' library(googlesheets4)
#'
#' ss_sbm <- as_sheets_id("<xyz>")
#'
#' df_ignore_files <- read_sheet(ss = ss_sbm, "ignore_files")
#'
#' df_new <- identify_newfiles() # this will also generate df_submissions & df_archive
#'
#' # Ghost files
#' df_ghosts <- df_submissions %>%
#' full_join(df_archive, by = c("name" = "sys_data_object")) %>%
#' filter(is.na(exists_gdrive))
#'
#' # Match ignored files
#' df_ignore_files <- df_ignore_files %>%
#' mutate(names_s3 = match_ignoredfiles(name_googledrive, df_ghosts, "name")
#'
#' # Update ignore files table
#' range_write(ss = sbm_form,
#' data = df_ignore_files,
#' sheet = "ignore_files",
#' col_names = TRUE)
#' }
#'
match_ignoredfiles <- function(file_name, df_ref,
ref_name = "name") {
name <- {{ref_name}}
new_names <- file_name %>%
purrr::map(function(.x) {
a <- paste0("^", .x)
b <- paste0(.x, "$")
c <- paste0(".*", .x, "$")
d <- .x %>%
stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
paste0(., ".*")
e <- .x %>%
stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
paste0("^", ., ".*")
f <- .x %>%
stringr::str_remove("\\s\\d{8}\\s\\d{6}.xlsx$") %>%
paste0(".*", ., ".*")
z <- .x %>% stringr::str_remove("\\s\\d{8}\\s\\d{6}")
s <- c(a, b, c, d, e, f) %>%
paste(collapse = "|")
nm <- df_ref %>%
dplyr::filter({name} == z | stringr::str_detect(name, s)) %>%
dplyr::pull(name)
if(purrr::is_empty(nm))
return(NA_character_)
if(length(nm) > 1)
return(nm %>% paste(collapse = ", "))
return(nm)
}) %>% unlist()
return(new_names)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.