# R/download.R

# Defines functions JDownload

# Documented in JDownload

library(httr)
library(tools)

#' Downloads one or more URLs and saves them to local files.
#'
#' The intent of this function is to save time downloading files by only
#' downloading them if they haven't previously been downloaded. Files will be
#' downloaded if they have been modified since the previous download (as
#' determined by HTTP If-Modified-Since header). On some platforms, caching does
#' not seem to work correctly. On those platforms, the files are always downloaded.
#'
#' A file (\code{.JDindex.rds}) is created within \code{cacheDir} which contains
#' the URL, file name and modification date of each downloaded file in the
#' directory. It is used to determine whether a URL needs to be downloaded
#' again. The file is in R serialised object format (\code{readRDS},
#' \code{saveRDS}).  The URL is always fetched (using \code{httr::GET}), however
#' the http header \code{If-Modified-Since} is specified, when a usable index
#' entry exists, so that it will not be downloaded again if it is unmodified.
#'
#' An index entry is only considered usable if it matches the URL exactly, has
#' a recorded modification date, and its cached file still exists. Otherwise
#' the URL is downloaded unconditionally. The index is written when the
#' function exits, even if a later URL fails, so completed downloads are not
#' forgotten.
#'
#' @param url Character vector specifying the URLs to be downloaded.
#' @param tempfileFn Function to create names of downloaded files. (defaults to
#'   tempfile). Must accept the same arguments as the base R function
#'   \code{tempfile}.
#' @param cacheDir Passed to \code{tempfileFn} as the parameter \code{tmpdir}.
#'   By default, files will be created in this directory. The directory is
#'   created if it does not exist.
#' @param filePattern Passed to \code{tempfileFn} as the parameter \code{pattern}.
#'   By default, this is used as the initial part of the file name.
#' @param verbose If TRUE, prints status messages.
#' @param debug If TRUE, prints debugging messages.
#'
#' @return Character vector of names of the downloaded (or cached) files, one
#'   per element of \code{url}. An empty \code{url} yields \code{character(0)}.
#'   An HTTP error status stops with an error (via
#'   \code{httr::stop_for_status}).
#'
#' @seealso \code{\link[utils]{download.file}}
#'
#' @export
JDownload <- function(url, tempfileFn = NULL, cacheDir = tempdir(), filePattern = "jdcache", verbose = FALSE, debug = FALSE) {
  # Nothing to do for empty input. Without this guard, 1:length(url) would
  # iterate over c(1, 0) and a spurious file name would be generated.
  if (length(url) == 0)
    return(character(0))

  if (!dir.exists(cacheDir))
    dir.create(cacheDir, recursive = TRUE)

  # Prepare index. Index is a data frame with one row per URL (the URL is the
  # row name) and columns (modified, file)
  indexPath <- file.path(cacheDir, ".JDindex.rds")
  index <- NULL
  if (file.exists(indexPath)) {
    # A corrupt index must not permanently block downloads; fall back to an
    # empty index and re-download
    index <- tryCatch(
      readRDS(indexPath),
      error = function(e) {
        warning("Ignoring unreadable cache index ", indexPath, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
  }
  if (!is.data.frame(index)) {
    index <- data.frame(modified = character(0), file = character(0),
                        stringsAsFactors = FALSE)
  }
  # Save potentially modified index on exit, so that downloads completed
  # before a failure are still recorded
  on.exit(saveRDS(index, indexPath), add = TRUE)

  # Construct file names
  if (is.null(tempfileFn))
    tempfileFn <- tempfile
  f <- tempfileFn(pattern = filePattern, tmpdir = cacheDir, fileext = paste0('.', tools::file_ext(url)))

  # For each file
  for (i in seq_along(url)) {
    u <- url[i]

    # Get index entry. Use match() for an exact lookup: extracting data frame
    # rows by name (index[u, ]) partially matches row names, so a URL which is
    # a prefix of a cached URL would pick up the wrong entry.
    cachedModified <- NULL
    cachedFile <- NULL
    row <- match(u, rownames(index))
    if (!is.na(row)) {
      m <- as.character(index[row, "modified"])
      cf <- as.character(index[row, "file"])
      # Only trust the entry if it has a date and the file is still there
      if (!is.na(m) && !is.na(cf) && file.exists(cf)) {
        cachedModified <- m
        cachedFile <- cf
      }
    }
    if (debug)
      cat(sprintf("%d: %s: %s\n", i, u, if (is.null(cachedModified)) "NULL" else cachedModified))

    # Only make the request conditional when there is a usable cached copy
    if (is.null(cachedModified)) {
      resp <- httr::GET(u)
    } else {
      resp <- httr::GET(u, httr::add_headers('If-Modified-Since' = cachedModified))
    }
    if (debug)
      cat(sprintf("GET status %s\n", resp$status_code))

    if (resp$status_code == 304 && !is.null(cachedFile)) {
      # Use cached file
      f[i] <- cachedFile
      if (verbose)
        cat(sprintf("Using cached %s\n", u))
    } else {
      # Raises an error for any status >= 300 (including an unsolicited 304)
      httr::stop_for_status(resp, paste("download", u))
      if (verbose)
        cat(sprintf("Downloading %s\n", u))
      if (resp$status_code == 200) {
        # Save content to the file
        bin <- httr::content(resp, "raw")
        writeBin(bin, f[i])
        # Update the index. Servers are not obliged to send Last-Modified;
        # record NA in that case so the URL is simply re-downloaded next time.
        lastModified <- httr::headers(resp)$`Last-Modified`
        if (is.null(lastModified) || length(lastModified) == 0)
          lastModified <- NA_character_
        index[u, "modified"] <- lastModified[1]
        index[u, "file"] <- f[i]
      } else {
        # A successful status with no body to save (e.g. 204); make it visible
        # that the returned file name was not written
        warning(sprintf("Unexpected status %s for %s; no file was written",
                        resp$status_code, u), call. = FALSE)
      }
    }
  }

  f
}
# JimMcL/JUtils documentation built on Nov. 7, 2024, 11:25 a.m.