R/downloadData.R

Defines functions remoteFileSize checksums

Documented in checksums remoteFileSize

utils::globalVariables(c("expectedFile", "result"))

#' Extract a url from module metadata
#'
#' This will get the `sourceURL` for the object named.
#'
#' @param objectName A character string of the object name in the metadata.
#' @param sim A `simList` object from which to extract the `sourceURL`
#' @param module An optional character string of the module name whose metadata is
#'               to be used. If omitted, the function will use the `currentModule(sim)`,
#'               if defined.
#'
#' @return The url.
#'
#' @author Eliot McIntire
#' @export
#' @exportMethod extractURL
#' @rdname extractURL
setGeneric("extractURL", function(objectName, sim, module) {
  standardGeneric("extractURL")
})

#' @export
#' @exportMethod extractURL
#' @rdname extractURL
setMethod(
  "extractURL",
  signature = c(objectName = "character", sim = "missing"),
  definition = function(objectName, sim, module) {
    i <- 0
    lenSC <- length(sys.calls())
    # This will get the simList that is closest in the call stack, noting that
    #  in this first one (i.e., this function), sim will be missing
    simEnv <- .whereInStack("sim", parent.frame()) # has to start "before" this function or it will find this function's missing "sim"
    if (!is.null(simEnv)) {
      sim <- simEnv$sim
      # while (missing(sim) && i < lenSC) {
      #   i <- i + 1
      #   simEnv <- .whereInStack("sim", -i)
      #   sim <- simEnv$sim
      # }
      extractURL(objectName = objectName, sim = sim, module = module)
    } else {
      NULL
    }
})

#' @export
#' @rdname extractURL
setMethod(
  "extractURL",
  signature = c(objectName = "character", sim = "simList"),
  definition = function(objectName, sim, module) {
    if (missing(module)) {
      module <- currentModule(sim)
    }

    io <- .parseModulePartial(sim, modules = list(module), defineModuleElement = "inputObjects")
    wh <- io[[module]][["objectName"]] == objectName
    io[[module]][wh, ]$sourceURL
  }
)

#' Calculate checksum for a module's data files
#'
#' Verify (and optionally write) checksums for data files in a module's
#' \file{data/} subdirectory. The file \file{data/CHECKSUMS.txt} contains the
#' expected checksums for each data file.
#' Checksums are computed using `reproducible:::.digest`, which is simply a
#' wrapper around `digest::digest`.
#'
#' Modules may require data that for various reasons cannot be distributed with
#' the module source code. In these cases, the module developer should ensure
#' that the module downloads and extracts the data required. It is useful to not
#' only check that the data files exist locally but that their checksums match
#' those expected.
#'
#' @note In version 1.2.0 and earlier, two checksums per file were required
#' because of differences in the checksum hash values on Windows and Unix-like
#' platforms. Recent versions use a different (faster) algorithm and only require
#' one checksum value per file.
#' To update your \file{CHECKSUMS.txt} files using the new algorithm:
#' \enumerate{
#'   \item specify your module (`moduleName <- "my_module"`);
#'   \item use a temporary location to ensure all modules get fresh copies of the data
#'         (`tmpdir <- file.path(tempdir(), "SpaDES_modules")`);
#'   \item download your module's data to the temp dir (`downloadData(moduleName, tmpdir)`);
#'   \item initialize a dummy simulation to ensure any 'data prep' steps in the
#'         `.inputObjects` section are run (`simInit(modules = moduleName)`);
#'   \item recalculate your checksums and overwrite the file
#'         (`checksums(moduleName, tmpdir, write = TRUE)`);
#'   \item copy the new checksums file to your working module directory
#'         (the one not in the temp dir)
#'         (`file.copy(from = file.path(tmpdir, moduleName, 'data', 'CHECKSUMS.txt'),
#'                to = file.path('path/to/my/moduleDir', moduleName, 'data', 'CHECKSUMS.txt'),
#'                overwrite = TRUE)`).
#' }
#'
#' @param module  Character string giving the name of the module.
#'
#' @param path    Character string giving the path to the module directory.
#' @param ... Passed to [reproducible::Checksums()], notably, `write`,
#'            `quickCheck`,  `checksumFile` and `files`.
#' @importFrom reproducible Checksums
checksums <- function(module, path, ...) {
  path <- if (length(module)) {
    fp <- file.path(path, module, "data")
    checkPath(fp, create = TRUE)
    fp
  } else {
    file.path(path)
  }
  result <- Checksums(path, ...)
}

#' Determine the size of a remotely hosted file
#'
#' Defunct. Will be removed by mid-2023.
#'
#' @param url  The url of the remote file.
#'
#' @return A numeric indicating the size of the remote file in bytes.
#'
#' @author Eliot McIntire and Alex Chubaty
#'
#' @export
remoteFileSize <- function(url) {
  .Defunct()
  # contentLength <- vapply(url, function(u) {
  #   header <- RCurl::url.exists(u, .header = TRUE)
  #   status <- tryCatch(as.numeric(header[["status"]]), error = function(e) 0)
  #   if (status == 200) {
  #     as.numeric(header[["Content-Length"]])
  #   } else {
  #     0
  #   }
  # }, numeric(1))
  #
  # return(contentLength)
}

################################################################################
#' Download module data
#'
#' Download external data for a module if not already present in the module
#' directory, or if there is a checksum mismatch indicating that the file is not
#' the correct one.
#'
#' `downloadData` requires a checksums file to work, as it will only download
#'  the files specified therein. Hence, module developers should make sure they
#'  have manually downloaded all the necessary data and ran `checksums` to
#'  build a checksums file.
#'
#' There is an experimental attempt to use the \pkg{googledrive} package to download
#' data from a shared (publicly or with individual users) file.
#' To try this, put the Google Drive URL in `sourceURL` argument of
#' `expectsInputs` in the module metadata, and put the filename once downloaded
#' in the `objectName` argument.
#' If using RStudio Server, you may need to use "out of band" authentication by
#' setting `options(httr_oob_default = TRUE)`.
#' To avoid caching of Oauth credentials, set `options(httr_oauth_cache = TRUE)`.
#'
#' There is also an experimental option for the user to make a new \file{CHECKSUMS.txt}
#' file if there is a `sourceURL` but no entry for that file.
#' This is experimental and should be used with caution.
#'
#' @param module  Character string giving the name of the module.
#'
#' @param path    Character string giving the path to the module directory.
#'
#' @param quiet   Logical. This is passed to `download.file`. Default is FALSE.
#'
#' @param quickCheck Logical. If `TRUE`, then the check with local data will only
#'                   use `file.size` instead of `digest::digest`.
#'                   This is faster, but potentially much less robust.
#'
#' @param overwrite Logical. Should local data files be overwritten in case they exist?
#'                  Default is `FALSE`.
#'
#' @param files A character vector of length 1 or more if only a subset of files should be
#'              checked in the \file{CHECKSUMS.txt} file.
#'
#' @param checked The result of a previous `checksums` call. This should only be used when
#'         there is no possibility that the file has changed, i.e., if `downloadData` is
#'         called from inside another function.
#'
#' @param urls Character vector of urls from which to get the data. This is automatically
#'             found from module metadata when this function invoked with
#'            `SpaDES.core::downloadModule(..., data = TRUE)`. See also
#'            [reproducible::prepInputs()].
#'
#' @param children The character vector of child modules (without path) to also
#'                 run `downloadData` on
#'
#' @param ... Passed to [reproducible::preProcess()], e.g., `purge`
#'
#' @return Invisibly, a list of downloaded files.
#'
#' @seealso [reproducible::prepInputs()], [checksums()], and [downloadModule()]
#' for downloading modules and building a checksums file.
#'
#' @author Alex Chubaty & Eliot McIntire
#' @export
#' @importFrom reproducible checkPath compareNA
#' @importFrom utils download.file
#' @rdname downloadData
#' @examples
#' ## In metadata, each expectsInput has a sourceURL;
#' ## downloadData will look for that and download if it defined;
#' ## however this sample module has all NAs for sourceURL, so nothing to download.
#' if (FALSE) {
#'   modulePath <- getSampleModules(tempdir())
#'   downloadData("caribouMovement", path = modulePath)
#' }
#'
setGeneric(
  "downloadData",
  function(
    module,
    path,
    quiet,
    quickCheck = FALSE,
    overwrite = FALSE,
    files = NULL,
    checked = NULL,
    urls = NULL,
    children = NULL,
    ...
  ) {
    standardGeneric("downloadData")
  }
)

#' @rdname downloadData
setMethod(
  "downloadData",
  signature = c(
    module = "character",
    path = "character",
    quiet = "logical",
    quickCheck = "ANY",
    overwrite = "ANY",
    files = "ANY",
    checked = "ANY",
    urls = "ANY",
    children = "ANY"
  ),
  definition = function(
    module,
    path,
    quiet,
    quickCheck,
    overwrite,
    files,
    checked,
    urls,
    children,
    ...
  ) {
    cwd <- getwd()
    path <- checkPath(path, create = FALSE)

    if (is.null(urls)) {
      inputs <- .parseModulePartial(
        filename = file.path(path, module, paste0(module, ".R")),
        defineModuleElement = "inputObjects"
      )
      urls <- inputs$sourceURL
    }

    # parsedModule <- parse(file = file.path(path, module, paste0(module, '.R')))
    # urls <- .getSourceURL(pattern = fileToDownload, x = parsedModule)

    if (is.call(urls)) {
      # This is the case where it can't evaluate the .parseModulePartial because of a reference
      #  to the sim object that isn't available. Because sourceURL is unlikely to use
      #  a call to sim object, then try to evaluate again here, just the one column
      urls <- eval(urls)
      #urls <- moduleMetadata(module, path)$inputObjects$sourceURL
    }

    targetFiles <- if (is.null(files)) {
      lapply(urls, function(x) NULL)
    } else {
      files
    }
    notNAs <- !unlist(lapply(urls, is.na))
    dPath <- file.path(path, module, "data")
    if (any(notNAs)) {
      # This requires googledrive in reproducible 1.2.16; even if not a googledrive url
      res <- Map(
        reproducible::preProcess,
        targetFile = targetFiles[notNAs],
        url = urls[notNAs],
        MoreArgs = append(
          list(
            quick = quickCheck,
            overwrite = overwrite,
            destinationPath = dPath,
            verbose = !quiet
          ),
          list(...)
        )
      )
      chksums <- rbindlist(lapply(res, function(x) x$checkSums))
      chksums <- chksums[order(-result)]
      chksums <- unique(chksums, by = "expectedFile")
    } else {
      unlinkAfter <- !dir.exists(dPath) # next line will make the folder and put nothing in it
      chksums <- Checksums(dPath, write = TRUE)
      if (isTRUE(unlinkAfter)) {
        unlink(dPath, recursive = TRUE)
      }
    }

    # after download, check for childModules that also require downloading
    #children <- moduleMetadata(module, path)$childModules
    if (!is.null(children)) {
      if (length(children)) {
        if (all(nzchar(children) & !is.na(children))) {
          chksums2 <- bindrows(lapply(
            children,
            downloadData,
            path = path,
            quiet = quiet,
            quickCheck = quickCheck
          ))
          chksums <- bindrows(chksums, chksums2)
        }
      }
    }

    return(chksums)
  }
)

#' @rdname downloadData
setMethod(
  "downloadData",
  signature = c(
    module = "character",
    path = "missing",
    quiet = "missing",
    quickCheck = "ANY",
    overwrite = "ANY",
    files = "ANY",
    checked = "ANY",
    urls = "ANY",
    children = "ANY"
  ),
  definition = function(module, quickCheck, overwrite, files, checked, urls, children) {
    downloadData(
      module = module,
      path = getOption("spades.modulePath"),
      quiet = FALSE,
      quickCheck = quickCheck,
      overwrite = overwrite,
      files = files,
      checked = checked,
      urls = urls,
      children = children
    )
  }
)

#' @rdname downloadData
setMethod(
  "downloadData",
  signature = c(
    module = "character",
    path = "missing",
    quiet = "logical",
    quickCheck = "ANY",
    overwrite = "ANY",
    files = "ANY",
    checked = "ANY",
    urls = "ANY",
    children = "ANY"
  ),
  definition = function(module, quiet, quickCheck, overwrite, files, checked, urls, children) {
    downloadData(
      module = module,
      path = getOption("spades.modulePath"),
      quiet = quiet,
      quickCheck = quickCheck,
      overwrite = overwrite,
      files = files,
      checked = checked,
      urls = urls,
      children = children
    )
  }
)

#' @rdname downloadData
setMethod(
  "downloadData",
  signature = c(
    module = "character",
    path = "character",
    quiet = "missing",
    quickCheck = "ANY",
    overwrite = "ANY",
    files = "ANY",
    checked = "ANY",
    urls = "ANY",
    children = "ANY"
  ),
  definition = function(module, path, quickCheck, overwrite, files, checked, urls, children) {
    downloadData(
      module = module,
      path = path,
      quiet = FALSE,
      quickCheck = quickCheck,
      overwrite = overwrite,
      files = files,
      checked = checked,
      urls = urls,
      children = children
    )
  }
)

Try the SpaDES.core package in your browser

Any scripts or data that you put into this service are public.

SpaDES.core documentation built on Jan. 11, 2026, 9:06 a.m.