library(dplyr)
library(purrr)
library(readr)
#' Split a repetitive task into smaller batches.
#' @description This can be useful if the task risks failure and you don't want to restart
#' from scratch, or if you want to be able to stop a long task midway through.
#' @details Given a function and a list of argument vectors, split the argument vectors
#' into batches of fixed size, then call the function with its arguments,
#' saving the results to disk at the end of each batch.
#' @import readr
#' @import purrr
#' @import dplyr
#' @importFrom stringr str_replace
#' @export
#' @param f The function you want to call.
#' @param args List of arguments passed to f. Each argument can be a list or vector.
#' @param batch_size Size of the batches.
#' @param out_folder Folder where to store the intermediate results (string).
#' @param file_prefix Prefix for the intermediate result files (string).
#' @param .combine Function used to combine the results.
#' @param verbose Print progress (boolean).
#' @return The final aggregate result of the operation.
#'
do_batch <- function(f,
args = NULL,
batch_size = NULL,
out_folder = '.',
file_prefix = 'batch_',
.combine = bind_rows,
verbose = TRUE,
...){
if(!dir.exists(out_folder)) dir.create(out_folder)
# split argument lists into batches
batch_list <-
map(args, ~ split(., rep(1:ceiling(length(.)/batch_size), each = batch_size)[1:length(.)])) %>%
transpose
n <- length(batch_list)
existing_file_numbers <- parse_number(list.files(out_folder))
max_file_number <- if(length(existing_file_numbers) > 0) max(existing_file_numbers) else 0
new_file_numbers <- max_file_number + (1:n)
# format file numbers
new_file_numbers <-
str_replace(
format(new_file_numbers),
' ',
'0'
)
pmap(
list(
batch_list,
1:n,
new_file_numbers
),
function(l, i, new_file_number){
if(verbose) print(paste("Batch", i, "/", n))
res_df <- pmap(l, f, ...) %>% map(~ mutate_if(., is.factor, as.character)) %>% .combine
saveRDS(res_df, paste0(out_folder, '/', file_prefix, new_file_number, '.RDS'))
res_df
}
) %>%
.combine
}
#' Read all files in a folder into a data.frame.
#' @import purrr
#' @import dplyr
#' @export
#' @param outfolder What folder to read from (string).
#' @param pattern Pattern to use to filter the files (default: .RDS).
#' @param .combine Function to use to combine the files (default: bind_rows)
#' @return The combined result.
read_batch <- function(outfolder = '.', pattern = "\\.RDS", .combine = bind_rows){
list.files(outfolder, pattern, full.names = TRUE) %>%
map(readRDS) %>%
bind_rows
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.