R/read_adl_delim.R

#' Read a delim file from Azure Data Lake
#'
#' Read a delimited file from Azure Data Lake given a path to the file. `set_adl_token()`
#' must be run before any file can be read. The remaining parameters are the same as
#' those of `readr::read_delim()`, so the documentation that follows mirrors the readr
#' documentation.
#'
#' @param adl_file_path A string representing the ADL file path. _Required parameter._
#' @param delim A string representing the field separator, such as pipe `"|"`. _Required parameter._
#' @param col_names Either `TRUE`, `FALSE` or a character vector
#'   of column names.
#'
#'   If `TRUE`, the first row of the input will be used as the column
#'   names, and will not be included in the data frame. If `FALSE`, column
#'   names will be generated automatically: X1, X2, X3 etc.
#'
#'   If `col_names` is a character vector, the values will be used as the
#'   names of the columns, and the first row of the input will be read into
#'   the first row of the output data frame.
#'
#'   Missing (`NA`) column names will generate a warning, and be filled
#'   in with dummy names `X1`, `X2` etc. Duplicate column names
#'   will generate a warning and be made unique with a numeric prefix.
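#'
#'   For example, `col_names = c("id", "date", "value")` would name the three
#'   columns and treat the first row of the file as data.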
#' @param col_types One of `NULL`, a [cols()] specification, or
#'   a string. See `vignette("readr")` for more details.
#'
#'   If `NULL`, all column types will be imputed from the first 1000 rows
#'   on the input. This is convenient (and fast), but not robust. If the
#'   imputation fails, you'll need to supply the correct types yourself.
#'
#'   If a column specification created by [cols()], it must contain
#'   one column specification for each column. If you only want to read a
#'   subset of the columns, use [cols_only()].
#'
#'   Alternatively, you can use a compact string representation where each
#'   character represents one column:
#' - c = character
#' - i = integer
#' - n = number
#' - d = double
#' - l = logical
#' - f = factor
#' - D = date
#' - T = date time
#' - t = time
#' - ? = guess
#' - _ or - = skip
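#'
#'   For example, `col_types = "cid_"` would read the first column as
#'   character, the second as integer, the third as double, and skip the
#'   fourth column.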
#' @param locale The locale controls defaults that vary from place to place.
#'   The default locale is US-centric (like R), but you can use
#'   [locale()] to create your own locale that controls things like
#'   the default time zone, encoding, decimal mark, big mark, and day/month
#'   names.
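#'
#'   For example, `locale = locale(decimal_mark = ",")` would read numbers
#'   written with a comma as the decimal mark.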
#' @param n_max Maximum number of records to read.
#' @param guess_max Maximum number of records to use for guessing column types.
#' @param progress Display a progress bar? By default it will only display
#'   in an interactive session and not while knitting a document. The display
#'   is updated every 50,000 values and will only display if estimated reading
#'   time is 5 seconds or more. The automatic progress bar can be disabled by
#'   setting option `readr.show_progress` to `FALSE`.
#' @param quote Single character used to quote strings.
#' @param escape_backslash Does the file use backslashes to escape special
#'   characters? This is more general than `escape_double` as backslashes
#'   can be used to escape the delimiter character, the quote character, or
#'   to add special characters like `\\n`.
#' @param escape_double Does the file escape quotes by doubling them? i.e. If
#'   this option is `TRUE`, the value `""""` represents a single quote, `\"`.
#' @param na Character vector of strings to interpret as missing values.
#'   Set this option to `character()` to indicate no missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
#'   values (the default) or strings?
#' @param comment A string used to identify comments. Any text after the
#'   comment characters will be silently ignored.
#' @param trim_ws Should leading and trailing whitespace be trimmed from each
#'   field before parsing it?
#' @param skip Number of lines to skip before reading data.
#' @param skip_empty_rows Should blank rows be ignored altogether? i.e. If this
#'   option is `TRUE` then blank rows will not be represented at all. If it is
#'   `FALSE` then they will be represented by `NA` values in all the columns.
#'
#' @return A tibble.
#' @importFrom readr read_delim default_locale show_progress
#' @examples
#' \dontrun{
#'
#'  set_adl_token(tenant = "abc123", client_id = "abc123", client_secret = "abc123")
#'  df <- read_adl_delim(
#'        adl_file_path =  "adl://<storename>.azuredatalakestore.net/path/to/file.csv",
#'        delim = "|")
#'
#' }
#'
#' @export
read_adl_delim <- function(adl_file_path, delim, quote = "\"", escape_backslash = FALSE,
                           escape_double = TRUE, col_names = TRUE, col_types = NULL,
                           locale = default_locale(), na = c("", "NA"), quoted_na = TRUE,
                           comment = "", trim_ws = FALSE, skip = 0, n_max = Inf,
                           guess_max = min(1000, n_max), progress = show_progress(),
                           skip_empty_rows = TRUE) {

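  # Split the ADL URL into the store host and the file path, e.g.
  # "adl://<storename>.azuredatalakestore.net/path/to/file.csv" yields
  # adl_fs_name = "<storename>.azuredatalakestore" and
  # file_name = "path/to/file.csv"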
  a <- strsplit(adl_file_path, split = "//", fixed = TRUE)
  b <- strsplit(a[[1]][2], split = ".net/", fixed = TRUE)
  adl_fs_name <- b[[1]][1]
  file_name <- b[[1]][2]

  if (identical(Sys.getenv("ADL_TOKEN"), "")) {
    rlang::abort(message = "ADL_TOKEN must be set. See `?set_adl_token`.")
  }

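  # Request the file through the Data Lake Store WebHDFS-compatible REST API
  # (OPEN operation), authenticating with the bearer token stored in ADL_TOKEN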
  r <- httr::GET(paste0("https://", adl_fs_name, ".net", "/webhdfs/v1/",
                        file_name, "?op=OPEN&read=true"),
                 httr::add_headers(Authorization = paste0("Bearer ", Sys.getenv("ADL_TOKEN"))))

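  # Helper that signals a classed condition ("error_bad_response") carrying the
  # argument, the expectation, and the offending HTTP status code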
  abort_bad_response <- function(arg, must, not = NULL) {
    msg <- glue::glue("{arg} must {must}")
    if (!is.null(not)) {
      not <- httr::status_code(not)
      msg <- glue::glue("{msg}; not {not}.
                        Make sure `adl_file_path` is correct.")
    }

    rlang::abort("error_bad_response",
          message = msg,
          arg = arg,
          must = must,
          not = not
    )
  }

  if (httr::http_error(r)) {
    abort_bad_response("Azure Data Lake API request failed: http response",
                       must = "have status 200", not = r)
  } else {
    # Write the raw response body to a temporary file and parse it with readr
    tmp <- tempfile(fileext = ".csv")
    writeBin(httr::content(r, as = "raw"), tmp, useBytes = TRUE)
    df <- readr::read_delim(
      file = tmp, delim = delim, quote = quote,
      escape_backslash = escape_backslash, escape_double = escape_double,
      col_names = col_names, col_types = col_types,
      locale = locale, na = na, quoted_na = quoted_na,
      comment = comment, trim_ws = trim_ws, skip = skip, n_max = n_max,
      guess_max = guess_max, progress = progress,
      skip_empty_rows = skip_empty_rows
    )
    return(df)
  }
}