#' Utilities for Looping to Read In Documents
#'
#' `loop_counter` - A simple loop counter for tracking the progress of reading in
#' a batch of files.
#'
#' @param i Iteration of the loop.
#' @param total Total number of iterations.
#' @param file The file name of that iteration to print out.
#' @param ... ignored
#' @return `loop_counter` - Prints loop information.
#' @export
#' @rdname loop_utilities
#' @examples
#' \dontrun{
#' files <- dir(
#' system.file("docs", package = "textreadr"),
#' full.names = TRUE,
#' recursive = TRUE,
#' pattern = '\\.(R?md|Rd?$|txt|sql|html|pdf|doc|ppt|tex)'
#' )
#'
#' max_wait <- 30
#' total <- length(files)
#' content <- vector(mode = "list", total)
#'
#' for (i in seq_along(files)){
#'
#' loop_counter(i, total, base_name(files[i]))
#'
#' content[[i]] <- try_limit(
#' textreadr::read_document(files[i]),
#' max.time = max_wait,
#' zero.length.return = NA
#' )
#' }
#'
#'
#' sapply(content, is.null)
#' sapply(content, function(x) length(x) == 1 && is.na(x))
#' content
#' }
loop_counter <- function(i, total, file, ...){

    # Whole-number percentage of the loop completed at iteration `i`.
    pct_done <- round(100*i/total, 0)

    # Left-pad the "(NN%)" label so 1-, 2- and 3-digit percentages line up.
    pad <- strrep(' ', 3 - nchar(pct_done))
    pct_label <- paste0(pad, '(', pct_done, '%)')

    # Zero-pad the iteration number to as many characters as `total` has.
    counter_fmt <- paste0("%0", nchar(total), "d")
    counter <- sprintf(counter_fmt, i)

    # Emit one progress line: "<i> of <total> (<pct>%) '<file>'".
    progress_line <- sprintf(
        '%s of %s %s \'%s\'\n',
        counter,
        total,
        pct_label,
        file
    )
    cat(progress_line)

    # Push the line to the console right away rather than leaving it buffered.
    utils::flush.console()
}
#' Utilities for Looping to Read In Documents
#'
#' `base_name` - Like `base::basename` but doesn't choke on long paths.
#'
#' @param path A character vector, containing path names.
#' @export
#' @return `base_name` - Returns just the basename of the path.
#' @rdname loop_utilities
base_name <- function(path) {
    # Drop everything up to and including the last "/". The pattern uses `.*`
    # rather than `.+` so that a root-level path such as "/file.txt" (nothing
    # before the slash) is also reduced to "file.txt". The pattern is anchored
    # at the start, so it can match at most once per element and `sub` suffices.
    sub('^.*/', '', path)
}
#' Utilities for Looping to Read In Documents
#'
#' `try_limit` - Limits the amount of time that an expression can run for. This
#' works to limit how long an attempted read-in of a document may take. Most
#' useful in a loop with a few very long running document read-ins (e.g., .pdf
#' files that require [**tesseract** package](https://CRAN.R-project.org/package=tesseract)).
#' Note that `max.time` cannot stop a `system` call (which many read-in
#' functions essentially rely on), but it can limit how many `system` calls are made.
#' This means a .pdf with multiple
#' [**tesseract**](https://CRAN.R-project.org/package=tesseract) pages will only
#' allow the first page to read-in before returning an error result. Note that
#' this approach does not distinguish between errors running the `expr` and
#' time-out errors.
#'
#' @param expr An expression to run.
#' @param max.time Max allotted elapsed run time in seconds.
#' @param timeout.return Value to return for timeouts.
#' @param zero.length.return Value to return for length zero expression evaluations.
#' @param silent logical. If `TRUE`, the report of error messages is suppressed.
#' @export
#' @rdname loop_utilities
try_limit <- function(expr, max.time = Inf, timeout.return = NULL,
    zero.length.return = "", silent = TRUE, ...){

    # Cap both CPU and elapsed time while `expr` is evaluated, and make sure the
    # limits are lifted again however this function exits.
    setTimeLimit(cpu = max.time, elapsed = max.time, transient = TRUE)
    on.exit(
        setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE),
        add = TRUE
    )

    # `expr` is evaluated lazily, i.e. here inside `try()` with the limits in
    # force. An error raised by `expr` and a time-limit error both come back as
    # a "try-error" object, so the two cases are not distinguished.
    out <- try(expr, silent = silent)

    # A failed/timed-out evaluation (or a `NULL` result) yields `timeout.return`.
    # Scalar conditions in `if` use the short-circuiting `||`, not element-wise `|`.
    if (is.null(out) || inherits(out, "try-error")) return(timeout.return)

    # Any other zero-length result is replaced by `zero.length.return`.
    if (length(out) == 0) zero.length.return else out
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.