# R/adl_dir_ls.R
#
# Defines functions: adl_dir_ls
#
# Documented in: adl_dir_ls

#' Azure Data Lake Directory List
#'
#' Lists the entries of an Azure Data Lake directory by sending a WebHDFS
#' `LISTSTATUS` request to the store. `set_adl_token()` must be run before
#' any directory may be listed, so that the `ADL_TOKEN` environment variable
#' holds a bearer token.
#'
#' @param adl_file_path A string representing the adl directory path, of the
#'   form `"adl://<store>.net/<path/to/dir>"`. A trailing `/` on the directory
#'   is optional. _Required parameter._
#' @return A character vector of full `adl://` paths, one per entry reported
#'   for the directory; `character(0)` when the directory has no entries.
#' @importFrom dplyr pull
#' @import httr
#' @examples
#' \dontrun{
#' paths <- adl_dir_ls(
#'   adl_file_path = "adl://mystore.azuredatalakestore.net/path/to/dir/"
#' )
#' }
#' @export
adl_dir_ls <- function(adl_file_path) {
  if (!is.character(adl_file_path) || length(adl_file_path) != 1L ||
      is.na(adl_file_path)) {
    rlang::abort("`adl_file_path` must be a single, non-missing string.")
  }

  # Split on the FIRST "//" and the FIRST ".net/" only. Splitting on every
  # occurrence (as `strsplit()` does) would silently truncate directory names
  # that themselves contain "//" or ".net/".
  scheme_end <- as.integer(regexpr("//", adl_file_path, fixed = TRUE))
  if (scheme_end < 1L) {
    rlang::abort(
      "`adl_file_path` must look like \"adl://<store>.net/<path/to/dir>\"."
    )
  }
  remainder <- substring(adl_file_path, scheme_end + 2L)

  # Searching in `remainder` plus a "/" lets a bare "adl://<store>.net" (no
  # trailing slash) resolve to the root directory instead of failing.
  host_end <- as.integer(regexpr(".net/", paste0(remainder, "/"), fixed = TRUE))
  if (host_end < 2L) {
    rlang::abort(
      "`adl_file_path` must look like \"adl://<store>.net/<path/to/dir>\"."
    )
  }
  adl_fs_name <- substring(remainder, 1L, host_end - 1L)
  # Empty string when the path points at the root of the store.
  dir_name <- substring(remainder, host_end + 5L)

  if (identical(Sys.getenv("ADL_TOKEN"), "")) {
    rlang::abort(message = "ADL_TOKEN must be set. See `?set_adl_token`.")
  }

  # Raises a classed condition ("error_bad_response") describing the failed
  # request; `not` is the httr response whose status code is reported.
  abort_bad_response <- function(arg, must,  not = NULL) {
    msg <- glue::glue("{arg} must {must}")
    if (!is.null(not)) {
      not <- httr::status_code(not)
      msg <- glue::glue("{msg}; not {not}.
                        make sure `adl_file_path` is correct")
    }

    # `message` is supplied by name, so the unnamed string binds to the next
    # formal of `rlang::abort()`, i.e. the condition class.
    rlang::abort("error_bad_response",
          message = msg,
          arg = arg,
          must = must,
          not = not
    )
  }

  r <- httr::GET(
    paste0("https://", adl_fs_name, ".net", "/webhdfs/v1/",
           dir_name, "?op=LISTSTATUS"),
    httr::add_headers(
      Authorization = paste0("Bearer ", Sys.getenv("ADL_TOKEN"))
    )
  )

  if (httr::http_error(r)) {
    abort_bad_response("Azure Data Lake API request failed http respose",
                       must = "have status 200", not = r)
  }

  # The parsed body is indexed positionally, as before; presumably this is the
  # WebHDFS `FileStatuses$FileStatus` array -- confirm against the service.
  all_dir_data <- httr::content(r)
  entries <- all_dir_data[[1]][[1]]

  suffixes <- unlist(
    lapply(entries, function(entry) entry[["pathSuffix"]]),
    use.names = FALSE
  )

  # An empty directory must yield no paths. Without this guard `paste0()`
  # treats the missing suffixes as "" and returns the directory path itself.
  if (length(suffixes) == 0L) {
    return(character(0))
  }

  # Insert exactly one "/" between directory and entry name when the caller
  # omitted the trailing slash. No separator is added for an empty directory
  # name (store root) or an empty suffix (which leaves the input path as is).
  needs_sep <- nzchar(suffixes) & nzchar(dir_name) & !grepl("/$", dir_name)
  separator <- ifelse(needs_sep, "/", "")

  paste0("adl://", adl_fs_name, ".net", "/", dir_name, separator, suffixes)
}
# alexhallam/flyrod documentation built on Nov. 20, 2019, 7:33 a.m.