R/s3_get_files.R

#' Download several S3 files
#' 
#' Progress messages for downloading several S3 objects at once cannot be silenced.
#' Like s3_get(), S3 objects that have already been downloaded will not be re-downloaded.
#' @param s3_uri vector of S3 object URIs
#' @param region AWS region for bucket containing the file (defaults to "us-east-2", but only required for private files)
#' @param progress show download progress for each individual file? (currently only for public objects)
#' @param force force download, overwriting files that already exist locally?
#' @param confirm ask user to interactively confirm downloads? (only possible when session is interactive)
#' @param public defaults to FALSE; if TRUE, ignore any environment
#'                    variables specifying AWS credentials and
#'                    attempt to download the files as publicly available
#' @param data_dir root directory for downloaded files (defaults to `tools::R_user_dir("s3", "data")`)
#' @return (invisibly) a data.frame (tibble) with columns `uri` and `file_path` for the downloaded files
#' @examples
#' \donttest{
#' Sys.setenv("R_USER_DATA_DIR" = tempdir())
#' the_files <- s3_get_files(c(
#'     "s3://geomarker/testing_downloads/mtcars.rds",
#'     "s3://geomarker/testing_downloads/mtcars.fst"
#' ))
#' unlink(the_files$file_path)
#' }
#' @export
s3_get_files <-
  function(s3_uri,
           region = "us-east-2",
           progress = FALSE,
           force = FALSE,
           confirm = TRUE,
           public = FALSE,
           data_dir = tools::R_user_dir("s3", "data")) {

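  # parse each S3 URI and flag objects that already exist locally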
  out <-
    purrr::map(s3_uri, s3_parse_uri) |>
    dplyr::bind_rows() |>
    dplyr::mutate(exists_already = purrr::map_lgl(uri,
      s3_check_for_file_local,
      quiet = TRUE
    ))

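  # construct each object's local destination path under data_dir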
  out <-
    out |>
    dplyr::rowwise() |>
    dplyr::mutate(
      file_path =
        fs::path_join(c(
          data_dir,
          bucket,
          folder,
          file_name
        ))
    )

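    # count files that already exist locally and files still to be downloaded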
    n_exist <- sum(out$exists_already)
    n_to_dl <- sum(!out$exists_already)

    if (n_to_dl == 0 && !force) {
      cli::cli_alert_info("all files already exist")
      return(invisible(dplyr::select(out, uri, file_path)))
    }

    if (n_exist > 0 && !force) {
      cli::cli_alert_info("{n_exist} file{?s} already exis{?ts/t}")
    }

    # with force, treat every file as missing so that all of them are downloaded again
    if (force) {
      out$exists_already <- FALSE
      n_to_dl <- nrow(out)
    }

    need_to_download <- dplyr::filter(out, !exists_already)

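    # total size of all pending downloads, reported in the confirmation message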
    files_size <- Reduce(f = `+`, x = lapply(need_to_download$uri, s3_file_size, region = region, public = public))

    cli::cli_alert_info("{n_to_dl} file{?s} totaling {prettyunits::pretty_bytes(files_size)} will be downloaded to {data_dir}")
    if (interactive() && confirm) ui_confirm()

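    # helper: download each remaining file while updating a cli status line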
    download_files_with_progress <- function(...) {
        sb <- cli::cli_status("{cli::symbol$arrow_right} Downloading {n_to_dl} file{?s}.")

        file_paths <- vector("list", length = n_to_dl)

        for (i in n_to_dl:1) {
            cli::cli_status_update(
                id = sb,
                "{cli::symbol$arrow_right} Got {n_to_dl - i} file{?s}, downloading {i}"
            )
            file_paths[[i]] <- s3_get(need_to_download$uri[i], data_dir = data_dir, ...)
        }

        cli::cli_status_clear(id = sb)
        return(file_paths)
    }

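    # time the batch download so the success message can report its duration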
    download_time <- system.time({
      download_files_with_progress(
        region = region,
        quiet = TRUE,
        force = TRUE,
        public = public,
        progress = progress
      )
    })["elapsed"]

    cli::cli_alert_success("Downloaded {n_to_dl} file{?s} in {prettyunits::pretty_sec(download_time)}.")

    # return all file paths (for files that already existed and those that were downloaded)
    return(invisible(dplyr::select(out, uri, file_path)))
}
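
A minimal usage sketch, assuming the public geomarker test bucket from the
examples above is reachable; public = TRUE, confirm = FALSE, and the temporary
data_dir are illustrative choices, not requirements:

# download both test objects into a temporary directory without the
# interactive confirmation prompt, then read one back and clean up
the_files <- s3_get_files(
  c(
    "s3://geomarker/testing_downloads/mtcars.rds",
    "s3://geomarker/testing_downloads/mtcars.fst"
  ),
  public = TRUE,
  confirm = FALSE,
  data_dir = tempdir()
)
mtcars_copy <- readRDS(the_files$file_path[1])
unlink(the_files$file_path)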
