#' clean a file name
#'
#' given a file to copy, generates a new file name stripped of special
#' characters
#'
#' @param in_file the file to copy
#' @param replace_special what to use to replace anything besides dot
#'
#' @details This function creates a new file name that is stripped of special
#' characters. The following characters are replaced by a period: " ", ":", "~",
#' "'". A leading "~" will be replaced by space.
#'
#' @export
#'
#' @return character
#'
#' @examples
#' clean_filename("badly:formatted~file name(sample1)")
#'
clean_filename <- function(in_file, replace_special = "-"){
use_file <- basename(in_file)
file_name <- basename(use_file)
file_name <- gsub(" ", replace_special, file_name, fixed = TRUE)
file_name <- gsub(":", replace_special, file_name, fixed = TRUE)
file_name <- gsub("^~", "", file_name)
file_name <- gsub("~", replace_special, file_name, fixed = TRUE)
file_name <- gsub("'", replace_special, file_name, fixed = TRUE)
file_name <- gsub("(", replace_special, file_name, fixed = TRUE)
file_name <- gsub(")", replace_special, file_name, fixed = TRUE)
file_name
}
#' copy file
#'
#' Copy a single file from one location to another
#'
#' @param from_file the location of the original file
#' @param to_dir the location to copy the file to
#' @param json_data the json meta information data
#' @param tmp_loc temp location if you want to specify it
#' @param clean_file_fun function used to rename the file
#'
#' @details We want to keep track of information about the copied files, so this
#' function does some stuff to help us out. It strips special characters from
#' the base file name, copies the file to a temp location, calculates the SHA-1 hash
#' of the file, checks the JSON meta file for matches to the SHA-1, and if there
#' are none, copies the renamed file to the copy location.
#'
#' If a matching instance of SHA-1 hashes are found, then the file path is added
#' to the entry for that file location.
#'
#' If a matching file name is found but with a different SHA-1 hash, then the first
#' 8 characters of the SHA-1 hash are appended to the file, and it is added to the
#' database.
#'
#' @import digest
#' @importFrom tools file_ext
#' @importFrom purrr map_lgl
#'
#' @export
#' @return list
#'
copy_file <- function(from_file = NULL, to_dir = ".", json_data = NULL, tmp_loc = "/tmp",
clean_file_fun = clean_filename){
stopifnot(!is.null(from_file))
to_dir <- normalizePath(to_dir)
base_file <- basename(from_file)
if (!is.null(clean_file_fun)) {
base_out <- clean_file_fun(base_file)
}
tmp_file <- file.path(tmp_loc, base_out)
did_copy <- file.copy(from_file, tmp_file)
if (did_copy) {
add_file <- TRUE
sha1 <- digest(tmp_file, algo = "sha1", file = TRUE)
if (!is.null(json_data)) {
match_sha1 <- map_lgl(json_data, function(x){sha1 %in% x$sha1})
match_file <- map_lgl(json_data, function(x){base_file %in% basename(unlist(x$original_path))})
} else {
match_sha1 <- FALSE
match_file <- FALSE
}
if (any(match_sha1)) {
#browser(expr = TRUE)
tmp_json <- json_data[[which(match_sha1)]]
# figure out where the individual json file should be saved
raw_path <- tmp_json$saved_path
json_path <- replace_file_extension(raw_path, ".json")
tmp_json[["original_path"]] <- c(tmp_json[["original_path"]], from_file)
save_json(tmp_json, json_path)
json_data[[which(match_sha1)]] <- tmp_json
add_file <- FALSE
} else if (any(match_file)) {
fileext <- tools::file_ext(base_out)
fileext_regex <- paste0(".", fileext, "$")
base2 <- gsub(fileext_regex, "", base_out)
base_out <- paste0(base2, "-", substr(sha1, 1, 8), ".", fileext)
}
if (add_file) {
file_loc <- file.path(to_dir, base_out)
did_copy2 <- file.copy(tmp_file, file_loc)
if (did_copy2) {
file_data <- vector("list", 1)
file_data[[1]] <- list(
file = base_out,
saved_path = file_loc,
original_path = from_file,
sha1 = sha1
)
json_path <- replace_file_extension(file_loc, ".json")
save_json(file_data, json_path)
if (is.null(json_data)) {
json_data <- file_data
} else {
json_data <- c(json_data, file_data)
}
}
}
unlink(tmp_file)
}
json_data
}
#' convert to another format
#'
#' given a file path, replace a files extension with a new one
#'
#' @param in_file the file name to work with
#' @param out_extension the new file extension
#'
#' @noRd
replace_file_extension <- function(in_file, out_extension){
fileext <- tools::file_ext(in_file)
fileext_regex <- paste0(".", fileext, "$")
new_file <- gsub(fileext_regex, out_extension, in_file)
new_file
}
#' save json
#'
#' given a list, saves it to json in a nice way
#'
#' @param list_data the data in list format
#' @param save_loc the location to save it to
#'
#' @noRd
save_json <- function(list_data, save_loc){
cat(jsonlite::toJSON(list_data, pretty = TRUE, auto_unbox = TRUE), file = save_loc, append = FALSE)
}
#' wait copy
#'
#' Copy files from one location to another, during set hours if desired. This
#' is very useful for copying from networked drives that get a lot of activity
#' during the day.
#'
#' @param file_list a character vector of files to copy from
#' @param to_dir where to copy the files to
#' @param json_meta the json meta flat file
#' @param tmp_loc a temp file location
#' @param clean_file_fun function to use for cleaning up the file name?
#' @param time_limit only copy during a certain time?
#' @param start_time when to start copying
#' @param stop_time when to stop copying
#' @param time_zone what time zone are we in
#' @param wait_check how long to wait before checking again
#' @param n_check how many times to try before giving up
#' @param wait_files how many files before pausing
#' @param pause_wait how long to pause when the wait limit is reached
#' @param pause_file how long to pause between every file
#'
#' @details
#' 1. **Limiting by time of day**: if `time_limit = TRUE`, the `start_time` and
#' `stop_time` are assumed to be on a
#' per day basis, so they should be encoded as the number of hours from midnight.
#' The function actually does a periodic check as to whether the
#' `start_time` is ahead of it, and if it is not, then it will create
#' a new time interval for the copying to be allowed. Default is from 8pm (20:00)
#' to 6am (30:00). The `wait_check` parameter sets how often to wait before checking
#' the time again (default is 1800 seconds / 30 minutes), and `n_check` parameter
#' defines how many times to check if the copying can be done (defaults to infinite).
#' 1. **Time Zone**: Provide your time zone so that the time functionality works
#' properly!
#' 1. **Waiting Between Copies**: In addition to only copying between certain hours,
#' it is possible to set how long to pause between each file using `pause_file`,
#' default is 2 seconds, and also a longer interval after copying several files
#' using `wait_files` (10 files) and `pause_wait` (10 seconds).
#' 1. **Checking for files**: It is recommended before running `wait_copy` to
#' first run `check_files_exist` on the file list to copy to make sure that
#' you are passing valid file paths.
#'
#'
#' @import lubridate
#' @importFrom jsonlite fromJSON
#' @importFrom purrr map_chr map
#'
#' @return logical
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # assume files are in /home/tmp/
#' # assume current working directory is where to copy to
#' file_list <- dir(".", full.names = TRUE)
#' wait_copy(file_list) # copy between 8pm and 6am
#'
#' # no time limit for copying
#' wait_copy(file_list, time_limit = FALSE)
#'
#' # copy from 10am to 1pm (13:00)
#' wait_copy(file_list, start_time = hours(10), stop_time = hours(13))
#'
#' # stop checking if can copy after particular number of checks (3)
#' wait_copy(file_list, n_check = 3)
#'
#' # check every 30 seconds instead of 30 minutes
#' wait_copy(file_list, wait_check = 30)
#'
#' # pause 4 seconds between each file
#' wait_copy(file_list, pause_file = 4)
#'
#' # pause 30 seconds after every 20 files
#' wait_copy(file_list, wait_files = 20, pause_wait = 30)
#'
#' # don't rename the files
#' wait_copy(file_list, clean_file_fun = NULL)
#'
#' # use make.names instead
#' wait_copy(file_list, clean_file_fun = make.names)
#'
#' }
wait_copy <- function(file_list, to_dir = ".",
json_meta = "all_meta_data.json",
tmp_loc = "/tmp",
clean_file_fun = clean_filename,
time_limit = TRUE,
start_time = hours(20), stop_time = hours(30),
time_zone = NULL, wait_check = 1800, n_check = Inf,
wait_files = 10, pause_wait = 10,
pause_file = 2){
if (!dir.exists(to_dir)) {
dir.create(to_dir)
}
if (file.exists(json_meta)) {
backup_name <- gsub(".json", paste0("-", gsub(" ", "-", as.character(Sys.time())), ".json"), json_meta)
file.copy(json_meta, backup_name)
json_data <- jsonlite::fromJSON(json_meta, simplifyVector = FALSE)
} else {
json_data <- NULL
}
json_sha1 <- digest::digest(json_data, "sha1")
# check if we've copied some before, and if so we want to remove them so
# we don't waste time copying them again.
if (!(length(json_data) == 0)) {
#browser()
previous_files <- unlist(purrr::map(json_data, function(x){x$original_path}))
file_list <- file_list[!(file_list %in% previous_files)]
}
if (time_limit) {
t_start <- get_today_in_local() + start_time
t_stop <- get_today_in_local() + stop_time
} else {
t_start <- get_today_in_local() - days(10)
t_stop <- get_today_in_local() + days(10)
}
if (is.null(time_zone)) {
time_zone <- get_tz(now())
}
allowed_copy_time <- interval(t_start, t_stop, tz(t_stop))
# check that the top level directory we are copying from and to
# actually exists so that we don't try to copy from an invalid location
to_copy <- length(file_list)
check_wait_counter <- 0
i_check <- 0
did_copy <- 1
while ((to_copy > 0) && (did_copy <= to_copy)) {
now1 <- get_now_in_local()
now2 <- get_now_in_local()
tmp_int <- interval(now1, now2, tz(t_stop))
can_copy <- int_overlaps(tmp_int, allowed_copy_time)
if (can_copy) {
# copy if we're below the limit for waiting for a time,
# otherwise, wait for a time. Used to keep us in check on the server
# and wait some time after copying a reasonable number of files.
# Note that we also pause a little bit of time after each copy.
if (check_wait_counter < wait_files) {
json_data <- copy_file(file_list[did_copy], to_dir, json_data, tmp_loc, clean_file_fun = clean_filename)
Sys.sleep(pause_file)
check_wait_counter <- check_wait_counter + 1
did_copy <- did_copy + 1
} else {
Sys.sleep(pause_wait)
message("Waiting between sets of files! ....")
check_wait_counter <- 0
}
} else {
# before waiting, we want to check if the meta-data has changed
# so we can write it before we sit for a long time again
if (digest::digest(json_data, "sha1") != json_sha1) {
save_json(json_data, json_meta)
json_sha1 <- digest::digest(json_data, "sha1")
}
# if we're not allowed, wait some time before trying again.
message(paste0("Not allowed to copy yet, waiting! .... ", Sys.time()))
i_check <- i_check + 1
if (i_check <= n_check) {
Sys.sleep(wait_check)
} else {
message("Reached maximum number of wait periods, exiting ....")
break()
}
}
}
save_json(json_data, json_meta)
}
get_tz <- function(in_time){
new_time <- as.POSIXlt(in_time)
new_time$zone
}
get_today_in_local <- function(){
use_tz <- get_tz(now())
curr_day <- today() + seconds(1)
tz(curr_day) <- use_tz
curr_day
}
get_now_in_local <- function(){
use_tz <- get_tz(now())
right_now <- now()
tz(right_now) <- use_tz
right_now
}
#' check files
#'
#' check that requested files to copy actually exist, and warn the user
#' if any of them fail.
#'
#' @param file_list the list of files
#' @param n_check how many to check that they exist?
#'
#' @export
check_files_exist <- function(file_list, n_check = 10){
n_file <- length(file_list)
if (n_check >= n_file) {
n_check <- n_file
}
files_to_check <- sample(file_list, n_check)
does_exist <- file.exists(files_to_check)
if (sum(does_exist) != n_check){
warning(paste0(sum(!does_exist), " of ", n_check, " files in your file list do not exist!"))
}
}
#' create meta-data
#'
#' Given a set of directories to search for JSON meta-data files, remake
#' a complete meta-data file and save it to the location specified.
#'
#' @param file_dirs the directories to search
#' @param json_meta where to save the new file
#' @param recursive should they be searched recursively?
#'
#' @export
#'
#' @return list of meta-data, invisibly
create_master_meta_data <- function(file_dirs = ".", json_meta = NULL,
recursive = TRUE){
all_json <- dir(normalizePath(file_dirs), recursive = recursive, full.names = TRUE, pattern = "json$")
json_data <- lapply(all_json, function(x){
#print(x)
import_json(x)
})
#json_lengths <- vapply(json_data, length, numeric(1))
if (file.exists(json_meta)) {
backup_name <- gsub(".json", paste0("-", gsub(" ", "-", as.character(Sys.time())), ".json"), json_meta)
file.copy(json_meta, backup_name)
}
if (!is.null(json_meta)) {
save_json(json_data, json_meta)
message(paste0("Meta-data saved to ", json_meta))
}
invisible(json_data)
}
#' import json
#'
#' import json from a file correctly given some things where things get written
#' differently
#'
#' @param json_file the json file to read
#'
#' @export
import_json <- function(json_file){
json_data <- jsonlite::fromJSON(json_file, simplifyVector = FALSE)
if (length(json_data) == 1) {
out_list <- json_data[[1]]
} else {
out_list <- json_data
}
out_list
}
#' update hashes
#'
#' If you used a previous version of this package that used MD5 hashes and now
#' need to fix it by changing to SHA-1 hashes, this is your function.
#'
#' @param json_file the json file that needs to be modified
#'
#' @export
#'
#' @return NULL
#'
update_hashes <- function(json_file) {
file.copy(json_file, paste0(json_file, ".bak"))
json_metadata <- import_json(json_file)
json_metadata$sha1 <- digest::digest(json_metadata$saved_path, algo = "sha1", file = TRUE)
if (!is.null(json_metadata$md5)) {
json_metadata$md5 <- NULL
}
save_json(json_metadata, json_file)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.