## PROJECT: HFR
## AUTHOR: A.Chafetz | USAID
## PURPOSE: identify duplicate mechanism files in a period (pd) folder
## DATE: 2020-04-23
## UPDATED: 2020-05-23
#dependencies
library(googledrive)
library(tidyverse)
library(lubridate)
#authenticate GDrive access
drive_auth()
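#optional: drive_auth() opens a browser prompt; for non-interactive runs the
#account can be supplied directly, e.g. drive_auth(email = "user@usaid.gov")
#(placeholder address)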
#find the id for the HFR data folder
datafldr_id <- drive_find(pattern = "2. Output") %>%
  pull(id)
#pull the id for each period folder so that each can be listed individually
pd_fldrs <- drive_ls(as_id(datafldr_id), pattern = "2020") %>%
  pull(id)
#pull all files by period folder
df_files <- map_dfr(.x = pd_fldrs,
                    .f = ~ drive_ls(as_id(.x)) %>%
                      drive_reveal(what = "path") %>%
                      distinct(name, path, id) %>%
                      select(path, id))
#extract the folder, mech code, and processed date from each filepath
df_extract <- df_files %>%
  mutate(path = str_extract(path, "2020\\..*")) %>%
  separate(path, c("pd_folder", "filename"), sep = "/") %>%
  mutate(mech_code = str_extract(filename, "(?<=_)[:digit:]+(?=_)"),
         date_processed = str_extract(filename, "[:digit:]+(?=\\.csv)") %>% ymd()) %>%
  select(id, filename, everything())
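#optional sanity check: surface any files whose names did not yield a mech code
#or processed date (i.e., that do not follow the assumed naming pattern)
df_extract %>%
  filter(is.na(mech_code) | is.na(date_processed))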
#flag where there are multiple files for a mechanism within a period folder
df_extract <- df_extract %>%
  group_by(pd_folder, mech_code) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  arrange(pd_folder, mech_code, date_processed)
#mech file count by period
df_extract %>%
  count(pd_folder, count) %>%
  spread(count, n)
#flag which duplicate file to keep (the most recently processed) and which to drop
df_dups <- df_extract %>%
  filter(count > 1,
         !is.na(mech_code)) %>%
  group_by(pd_folder, mech_code) %>%
  mutate(status = ifelse(row_number() == max(row_number()), "keep", "drop")) %>%
  ungroup()
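#ensure the local out/ folder exists before exporting (assumes a relative out/
#directory, matching the path used below; safe to rerun)
dir.create("out", showWarnings = FALSE)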
#export list for sharing/identification
df_dups %>%
  write_csv("out/HFR_duplicate_processed_files.csv", na = "")
#identify the folder where the duplicates flagged to keep should go
new_fldr <- drive_ls(as_id(datafldr_id), pattern = "DUPLICATE KEEPS") %>%
  pull(id)
#filter the duplicates list to only the files flagged to keep
df_mv <- df_dups %>%
  filter(status == "keep")
#copy the keep files into the new folder (drive_cp leaves the originals in place)
walk2(.x = df_mv$id,
      .y = df_mv$filename,
      .f = ~ drive_cp(as_id(.x), path = as_id(new_fldr), name = .y))
#check that all files copied over
drive_ls(as_id(new_fldr))
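#optional check (assumes the DUPLICATE KEEPS folder held no other files): the
#folder should now contain one file per keep row
stopifnot(nrow(drive_ls(as_id(new_fldr))) == nrow(df_mv))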
# MOVE DUPLICATES OUT OF FOLDERS ------------------------------------------
#id the duplicate files to pull out of the period folders (handle count = 3 cases manually)
df_dups_drop <- df_dups %>%
  filter(count == 2) %>%
  select(filename)
#semi join on filename to get the file ids
df_move <- semi_join(df_extract, df_dups_drop, by = "filename")
#locate the existing _Deleted folder and create a Duplicates subfolder inside it
del_fldr <- drive_ls(as_id(datafldr_id), pattern = "_Deleted") %>%
  pull(id)
dupdel_fldr <- drive_mkdir("Duplicates", as_id(del_fldr)) %>%
  pull(id)
#move dup files into deleted folder
walk(df_move$id,
     ~ drive_mv(as_id(.x), path = as_id(dupdel_fldr)))
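#optional check (assumes the Duplicates folder was empty when created): every
#relocated file should now be listed there
stopifnot(nrow(drive_ls(as_id(dupdel_fldr))) == nrow(df_move))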