R/do_batch.R

Defines functions do_batch

Documented in do_batch

library(dplyr)
library(purrr)
library(readr)

#' Split a repetitive task into smaller batches.
#' @description This can be useful if the task risks failure and you don't want to restart
#' from scratch, or if you want to be able to stop a long task midway through.
#' @details Given a function and a list of argument vectors, split the argument vectors
#' into batches of fixed size, then call the function with its arguments,
#' saving the results to disk at the end of each batch.
#' @import readr
#' @import purrr
#' @import dplyr
#' @importFrom stringr str_replace
#' @export
#' @param f The function you want to call.
#' @param args List of arguments passed to f. Each argument can be a list or vector.
#' @param batch_size Size of the batches.
#' @param out_folder Folder where to store the intermediate results (string).
#' @param file_prefix Prefix for the intermediate result files (string).
#' @param .combine Function used to combine the results.
#' @param verbose Print progress (boolean).
#' @return The final aggregate result of the operation.
#'
do_batch <- function(f,
                     args = NULL,
                     batch_size = NULL,
                     out_folder = '.',
                     file_prefix = 'batch_',
                     .combine = bind_rows,
                     verbose = TRUE,
                     ...){

  if(!dir.exists(out_folder)) dir.create(out_folder)

  # split argument lists into batches

  batch_list <-
    map(args, ~ split(., rep(1:ceiling(length(.)/batch_size), each = batch_size)[1:length(.)])) %>%
    transpose

  n <- length(batch_list)

  existing_file_numbers <- parse_number(list.files(out_folder))
  max_file_number <- if(length(existing_file_numbers) > 0) max(existing_file_numbers) else 0
  new_file_numbers <-  max_file_number + (1:n)

  # format file numbers
  new_file_numbers <-
    str_replace(
      format(new_file_numbers),
      ' ',
      '0'
    )

  pmap(
    list(
      batch_list,
      1:n,
      new_file_numbers
    ),
    function(l, i, new_file_number){
      if(verbose) print(paste("Batch", i, "/", n))
      res_df <- pmap(l, f, ...) %>% map(~ mutate_if(., is.factor, as.character)) %>% .combine
      saveRDS(res_df, paste0(out_folder, '/', file_prefix, new_file_number, '.RDS'))
      res_df
    }
  ) %>%
    .combine
}

#' Read all files in a folder into a data.frame.
#' @import purrr
#' @import dplyr
#' @export
#' @param outfolder What folder to read from (string).
#' @param pattern Pattern to use to filter the files (default: .RDS).
#' @param .combine Function to use to combine the files (default: bind_rows)
#' @return The combined result.
read_batch <- function(outfolder = '.', pattern = "\\.RDS", .combine = bind_rows){

  list.files(outfolder, pattern, full.names = TRUE) %>%
    map(readRDS) %>%
    bind_rows

}
artichaud1/dobatch documentation built on May 21, 2019, 9:34 a.m.