R/cloud-storage.R

Defines functions download_cloud_file cloud_object_name upload_cloud_file cloud_storage_authenticate

Documented in cloud_object_name cloud_storage_authenticate download_cloud_file upload_cloud_file

#' Authenticate to a storage cloud provider
#'
#' Usually used internally by other functions
#'
#' @param provider cloud provider to use, either "gcs" or "aws"
#' @param options named list with cloud provider options, see details
#'
#' @details
#'
#' ### Google Cloud Services
#'
#' For Google Cloud Services ("gcs") options must be a list with the field
#' `service_account_key` with the contents of the authentication json file you
#' have downloaded from your Google Project.
#'
#' This function uses [googleCloudStorageR::gcs_auth] under the hood to
#' authenticate.
#'
#' @export
#'
#' @examples
#'
#' # Google Cloud Services
#' \dontrun{
#' authentication_details <- readLines("location_of_json_file.json")
#' cloud_storage_authenticate(
#'   provider = "gcs",
#'   options = list(
#'     service_account_key = authentication_details,
#'     bucket = "my-bucket"
#'   )
#' )
#' }
cloud_storage_authenticate <- function(provider, options) {
  if ("gcs" %in% provider) {
    # Only need to authenticate if there is no token for downstream requests
    if (isFALSE(googleAuthR::gar_has_token())) {
      service_account_key <- options$service_account_key
      temp_auth_file <- tempfile(fileext = "json")
      writeLines(service_account_key, temp_auth_file)
      googleCloudStorageR::gcs_auth(json_file = temp_auth_file)
    }
  }
}

#' Upload a local file to a cloud storage bucket
#'
#' @param file a file-path (character) to upload. A vector with multiple files
#'   is also supported.
#' @param provider
#' @param options
#' @param name What to call the file once uploaded. Default is the filepath
#' @inheritParams cloud_storage_authenticate
#' @details
#'
#' ### Google Cloud Services
#'
#' For Google Cloud Services ("gcs") options must be a list with two fields:
#' `bucket` with the bucketname (character) you are uploading to, and
#' `service_account_key` with the contents of the authentication json file you
#' have downloaded from your Google Project (if [cloud_storage_authenticate] has
#' not been called before).
#'
#' This function uses [googleCloudStorageR::gcs_upload] under the hood to upload
#' the file.
#'
#' @return If `provider` is "gcs" and if successful a list of medatada objects
#' @export
#'
#' @examples
#'
#' # Google Cloud Services
#' \dontrun{
#' authentication_details <- readLines("location_of_json_file.json")
#' upload_cloud_file(
#'   file = "table_to_upload.csv",
#'   provider = "gcs",
#'   options = list(
#'     service_account_key = authentication_details,
#'     bucket = "my-bucket"
#'   )
#' )
#' }
#'
upload_cloud_file <- function(file, provider, options, name = file) {
  cloud_storage_authenticate(provider, options)

  out <- list()
  if ("gcs" %in% provider) {
    # Iterate over multiple files (and names)
    google_output <- purrr::map2(
      file, name,
      ~ googleCloudStorageR::gcs_upload(
        file = .x,
        bucket = options$bucket,
        name = .y,
        predefinedAcl = "bucketLevel"
      )
    )

    out <- c(out, google_output)
  }

  out
}

#' Get the full name of a versioned cloud object
#'
#' Obtain the full name (e.g.
#' `timor-landings-v2_metadata__20210326084600_54617b3__.json`) of a cloud
#' storage object. If there are more than one object matching the prefix,
#' version, and extension, a vector with all the names is returned.
#'
#' @param prefix string indicating the prefix of the object
#' @param version either "latest" or the specific version string generated by
#'   [add_version] when the file was uploaded to the cloud provider
#' @param extension extension of the desired file. Use an empty string "" to
#'   return all extensions founds
#' @param provider
#' @param exact_match logical indicating whether the prefix should be matched
#'   exactly
#' @param options
#' @inheritParams upload_cloud_file
#'
#'
#' @details
#'
#' ### Google Cloud Services
#'
#' For Google Cloud Services ("gcs") options must be a list with two fields:
#' `bucket` with the bucketname (character) you are uploading to, and
#' `service_account_key` with the contents of the authentication json file you
#' have downloaded from your Google Project (if [cloud_storage_authenticate] has
#' not been called before).
#'
#' This function uses [googleCloudStorageR::gcs_upload] under the hood to upload
#' the file.
#'
#' @return A string vector with the object names in the cloud storage that match
#'   the prefix, the version, and the extension indicated in the parameters
#' @export
#'
#' @examples
#'
#' #' # Google Cloud Services
#' \dontrun{
#' authentication_details <- readLines("location_of_json_file.json")
#' # obtain the latest version of all files corresponding to timor-landings-v2
#' cloud_object_name(
#'   prefix = "timor-landings-v2",
#'   version = "latest",
#'   provider = "gcs",
#'   options = list(
#'     service_account_key = authentication_details,
#'     bucket = "my-bucket"
#'   )
#' )
#'
#' # obtain a specific version of the structured data from timor-landings-v2
#' cloud_object_name(
#'   prefix = "timor-landings-v2_raw",
#'   version = "20210326084600_54617b",
#'   extension = "csv",
#'   provider = "gcs",
#'   options = list(
#'     service_account_key = authentication_details,
#'     bucket = "my-bucket"
#'   )
#' )
#' }
#'
cloud_object_name <- function(prefix, version = "latest", extension = "",
                              provider, exact_match = FALSE, options) {
  cloud_storage_authenticate(provider, options)

  if ("gcs" %in% provider) {
    gcs_files <- googleCloudStorageR::gcs_list_objects(
      bucket = options$bucket,
      prefix = prefix
    )

    if (nrow(gcs_files) == 0) {
      return(character(0))
    }

    gcs_files_formatted <- gcs_files %>%
      tidyr::separate(
        col = .data$name,
        into = c("base_name", "version", "ext"),
        # Version is separated with the "__" string
        sep = "__",
        remove = FALSE
      ) %>%
      dplyr::filter(stringr::str_detect(.data$ext, paste0(extension, "$"))) %>%
      dplyr::group_by(.data$base_name, .data$ext) %>%
      na.omit()

    if (isTRUE(exact_match)) {
      selected_rows <- gcs_files_formatted %>%
        dplyr::filter(.data$base_name == prefix)
    } else {
      selected_rows <- gcs_files_formatted
    }

    if (version == "latest") {
      selected_rows <- selected_rows %>%
        dplyr::filter(max(.data$updated) == .data$updated)
    } else {
      this_version <- version
      selected_rows <- selected_rows %>%
        dplyr::filter(.data$version == this_version)
    }

    selected_rows$name
  }
}


#' Download an object from a cloud storage bucket to a local file
#'
#' Download object from the cloud storage to a local file
#'
#' @param name the name of the object in the storage bucket.
#' @param provider
#' @param options
#' @param file a file-path (character) where the object will be saved. Default
#'   is the object name.
#' @inheritParams upload_cloud_file
#'
#'
#' @return the file path
#' @export
#'
#' @examples
#'
#' # Google Cloud Services
#' \dontrun{
#' authentication_details <- readLines("location_of_json_file.json")
#' download_cloud_file(
#'   name = "timor-landings-v2_metadata__20210326084600_54617b3__.json",
#'   provider = "gcs",
#'   options = list(
#'     service_account_key = authentication_details,
#'     bucket = "my-bucket"
#'   )
#' )
#' }
download_cloud_file <- function(name, provider, options, file = name) {
  cloud_storage_authenticate(provider, options)

  if ("gcs" %in% provider) {
    purrr::map2(
      name, file,
      ~ googleCloudStorageR::gcs_get_object(
        object_name = .x,
        bucket = options$bucket,
        saveToDisk = .y,
        overwrite = ifelse(is.null(options$overwrite), TRUE, options$overwrite)
      )
    )
  }

  file
}
WorldFishCenter/peskas.timor.data.pipeline documentation built on April 14, 2025, 1:47 p.m.