R/read.R

Defines functions download_file_from_s3 s3_path_to_preview_df s3_path_to_full_df read_using

Documented in download_file_from_s3 read_using s3_path_to_full_df s3_path_to_preview_df

#' Read from S3 using a particular function
#'
#' Downloads the object at `s3_path` to a temporary file and calls `FUN` on
#' that file. The temporary file's extension is whatever `dot_file_ext()`
#' returns for the object's key. The temporary file is removed when this
#' function exits, so `FUN` must finish reading it before returning.
#'
#' @param FUN a function to parse the data with; it is called with the path of
#'   the temporary file as its first argument
#' @param s3_path path to the s3 file bucket/folder/file.txt
#' @param ... further arguments passed on to `FUN`
#'
#' @return whatever `FUN` returns
#' @export
#'
#' @examples
#' \dontrun{
#' Rs3tools::read_using(FUN = readxl::read_excel, s3_path = "alpha-test-team/mpg.xlsx")
#' }
read_using <- function(FUN, s3_path, ...) {
  p <- parse_path(s3_path)

  # Fetch the object; on failure tell the user which path was involved and
  # re-signal the original condition unchanged.
  obj <- tryCatch(
    s3_svc()$get_object(Bucket = p$bucket, Key = p$key),
    error = function(e) {
      message("Could not read ", s3_path)
      stop(e)
    }
  )

  tmp_location <- tempfile(fileext = dot_file_ext(p$key))
  # Register the cleanup before writing so a failed or partial write does not
  # leave a stray temporary file; add = TRUE keeps any other exit handlers.
  on.exit(unlink(tmp_location), add = TRUE)
  writeBin(obj$Body, con = tmp_location)

  FUN(tmp_location, ...)
}


#' Read a full file from S3, using the full path to the file including the
#' bucketname.
#'
#' The file is downloaded to a temporary location via [read_using()] and then
#' parsed with a reader chosen from the file extension (case-insensitive):
#' CSV, TSV, XLS, XLSX, SAS7BDAT, SAV and DTA are supported.
#' For any other extension the file is not parsed; instead it is kept in a
#' temporary location and the path to it is returned.
#' Extra arguments are forwarded to the chosen reader, e.g. options that are
#' compatible with readxl::read_excel() and read.csv(). See their help files
#' for more info.
#'
#' @param s3_path a string - the full path to the file including the bucketname
#' @param ... further arguments passed on to the reader selected for the file
#'   type (`read.csv`, `read.delim`, `readxl::read_excel`, `haven::read_sas`,
#'   `haven::read_spss` or `haven::read_stata`)
#'
#' @return a dataframe (.csv, .tsv), tibble (Excel, SAS, SPSS, Stata), or file location
#' @export
#'
#' @examples
#' \dontrun{
#' df <- s3_path_to_full_df("alpha-moj-analytics-scratch/folder/file.csv")
#' df <- s3_path_to_full_df("alpha-moj-analytics-scratch/folder/file.tsv")
#' df <- s3_path_to_full_df("alpha-moj-analytics-scratch/folder/file.xls")
#' df <- s3_path_to_full_df("alpha-moj-analytics-scratch/folder/file.xls", sheet = 1)
#' filelocation <- s3_path_to_full_df("alpha-moj-analytics-scratch/folder/file.png")
#' }
s3_path_to_full_df <- function(s3_path, ...) {
  # Fallback "reader" for unsupported file types: hand back a usable file path.
  handle_default <- function(tmp_location) {
    # read_using() deletes its temporary download when it exits, so copy the
    # file to a location that outlives that call before returning the path.
    ext <- tools::file_ext(tmp_location)
    kept_location <- tempfile(fileext = if (nzchar(ext)) paste0(".", ext) else "")
    if (!file.copy(tmp_location, kept_location, overwrite = TRUE)) {
      stop("Could not copy the downloaded file to ", kept_location, call. = FALSE)
    }

    message("Rs3tools cannot parse this file automatically")
    message("If you want to specify your own reading function see Rs3tools::read_using()")
    message("or use the file path provided by this function.")
    message("Your file is available at: ", kept_location)
    kept_location
  }

  fext <- tolower(tools::file_ext(s3_path))
  if (fext == "csv") {
    f <- utils::read.csv
  } else if (fext == "tsv") {
    # read.delim takes the same arguments as read.csv but defaults to a tab
    # separator, which is what a .tsv file needs.
    f <- utils::read.delim
  } else if (fext %in% c("xls", "xlsx")) {
    f <- readxl::read_excel
  } else if (fext == "sas7bdat") {
    f <- haven::read_sas
  } else if (fext == "sav") {
    f <- haven::read_spss
  } else if (fext == "dta") {
    f <- haven::read_stata
  } else {
    f <- handle_default
  }

  read_using(f, s3_path, ...)
}


#' Preview the first 5 rows of a CSV or TSV file from S3, using the full path
#' to the file including the bucketname
#'
#' Only the byte range 0-12000 of the object is requested, and the preview is
#' parsed from that text. For any extension other than csv or tsv a message is
#' printed and `NULL` is returned.
#'
#' @param s3_path a string - the full path to the file including the bucketname
#' @param ... currently unused
#'
#' @return a data.frame with at most 5 rows, or `NULL` when the file type is
#'   not supported
#' @export
#'
#' @examples
#' \dontrun{
#' df <- s3_path_to_preview_df("alpha-moj-analytics-scratch/a/b/c/robins_temp.csv")
#' }
s3_path_to_preview_df <- function(s3_path, ...) {
  p <- parse_path(s3_path)
  fext <- tolower(tools::file_ext(p$key))

  if (!(fext %in% c("csv", "tsv"))) {
    message("Preview not supported for ", fext, " files")
    return(NULL)
  }

  # .tsv files are tab separated, so they need read.delim rather than read.csv.
  reader <- if (fext == "tsv") utils::read.delim else utils::read.csv

  tryCatch(
    {
      # Fetch only the start of the object rather than the whole file.
      # NOTE(review): if a row straddles the end of this range it will be
      # truncated in the preview - presumably acceptable for a 5-row peek.
      obj <- s3_svc()$get_object(
        Bucket = p$bucket,
        Key = p$key,
        Range = "bytes=0-12000"
      )
      preview <- reader(text = rawToChar(obj$Body), stringsAsFactors = FALSE)
      utils::head(preview, n = 5)
    },
    error = function(e) {
      message("Could not read ", s3_path)
      stop(e)
    }
  )
}


#' Download a file from S3 to a local path
#'
#' Fetches the object at `s3_path` and writes its contents to `local_path`.
#' If `local_path` already exists the function stops with an error unless
#' `overwrite` is `TRUE`.
#'
#' @param s3_path character - the full path to the file in s3 e.g. alpha-everyone/iris.csv
#' @param local_path character - the location you want to store the file locally e.g. iris.csv
#' @param overwrite boolean - if the file exists locally, overwrite it?
#'
#' @return NULL
#' @export
#'
#' @examples
#' \dontrun{
#' Rs3tools::download_file_from_s3("alpha-everyone/iris.csv", "iris.csv", overwrite = TRUE)
#' }
download_file_from_s3 <- function(s3_path, local_path, overwrite = FALSE) {
  p <- parse_path(s3_path)

  # Refuse to clobber an existing local file unless explicitly asked to.
  if (file.exists(local_path) && !overwrite) {
    stop("The file already exists locally and you didn't specify overwrite=TRUE")
  }

  # Fetch the object, naming the S3 path in a message before re-signalling
  # any error.
  s3_object <- tryCatch(
    s3_svc()$get_object(Bucket = p$bucket, Key = p$key),
    error = function(err) {
      message("Could not read ", s3_path)
      stop(err)
    }
  )

  # Write the raw body to disk, naming the local path if the write fails.
  tryCatch(
    writeBin(s3_object$Body, con = local_path),
    error = function(err) {
      message("Could not write to ", local_path)
      stop(err)
    }
  )
}
moj-analytical-services/Rs3tools documentation built on Aug. 9, 2024, 1:27 a.m.