#' Create an Azure storage endpoint object from a host URL and SAS token.
#' @param host Azure storage account endpoint URL.
#' @param securityToken SAS token used to authenticate against the endpoint.
#' @export
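#' @examples
#' \dontrun{
#' # A minimal usage sketch; the account URL and SAS token below are
#' # hypothetical placeholders, not working credentials.
#' endp <- getAzureEndPoint(
#'   host = "https://myaccount.blob.core.windows.net",
#'   securityToken = "sv=2021-06-08&ss=b&sig=...")
#' }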
getAzureEndPoint <- function(host = "", securityToken = ""){
  # To prevent the "The AzureR packages can save your authentication credentials in the directory:"
  # prompt from blocking the loading of the AzureR packages, point R_AZURE_DATA_DIR at the current
  # working directory as a workaround. Since we always authenticate, nothing is cached in that directory.
  Sys.setenv("R_AZURE_DATA_DIR" = getwd())
  AzureStor::storage_endpoint(endpoint = host, sas = securityToken)
}
#' Get a handle to a storage container on an Azure endpoint.
#' @param host Azure storage account endpoint URL.
#' @param securityToken SAS token used to authenticate against the endpoint.
#' @param container Name of the storage container.
#' @export
getAzureContainer <- function(host = "", securityToken = "", container = "") {
  endpoint <- getAzureEndPoint(host = host, securityToken = securityToken)
  AzureStor::storage_container(endpoint, container)
}
#' List the storage containers available on an Azure endpoint.
#' @param host Azure storage account endpoint URL.
#' @param securityToken SAS token used to authenticate against the endpoint.
#' @export
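#' @examples
#' \dontrun{
#' # A minimal usage sketch with hypothetical credentials. The result is a
#' # data frame with one row per container and the columns name, endpoint,
#' # sas, and version.
#' containers <- listAzureContainers(
#'   host = "https://myaccount.blob.core.windows.net",
#'   securityToken = "sv=2021-06-08&ss=b&sig=...")
#' containers$name
#' }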
listAzureContainers <- function(host = "", securityToken = ""){
  endpoint <- getAzureEndPoint(host = host, securityToken = securityToken)
  containers <- AzureStor::list_storage_containers(endpoint)
  df <- data.frame(matrix(unlist(containers), nrow = length(containers), byrow = TRUE))
  colnames(df) <- c("name", "endpoint", "sas", "version")
  df
}
#' List files under a folder in an Azure storage container.
#' @param host Azure storage account endpoint URL.
#' @param securityToken SAS token used to authenticate against the endpoint.
#' @param container Name of the storage container.
#' @param folder Folder (directory) to list. Use "" for the container root.
#' @export
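#' @examples
#' \dontrun{
#' # A minimal usage sketch; all connection values are hypothetical.
#' # Since info = "all" is used internally, the result includes metadata
#' # columns such as name, size, and isdir.
#' items <- listItemsInAzure(
#'   host = "https://myaccount.blob.core.windows.net",
#'   securityToken = "sv=2021-06-08&ss=b&sig=...",
#'   container = "mycontainer",
#'   folder = "data")
#' }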
listItemsInAzure <- function(host = "", securityToken = "", container = "", folder = ""){
  container <- getAzureContainer(host = host, securityToken = securityToken, container = container)
  AzureStor::list_storage_files(container, dir = folder, info = "all")
}
#' Download a data file from an Azure storage container to a local temp file.
#' @param host Azure storage account endpoint URL.
#' @param securityToken SAS token used to authenticate against the endpoint.
#' @param container Name of the storage container.
#' @param fileName Path of the file to download within the container.
#' @export
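#' @examples
#' \dontrun{
#' # A minimal usage sketch; all connection values are hypothetical.
#' # With options(tam.should.cache.datafile = TRUE), a repeated call for the
#' # same host/container/file returns the already-downloaded temp file.
#' path <- downloadDataFileFromAzure(
#'   host = "https://myaccount.blob.core.windows.net",
#'   securityToken = "sv=2021-06-08&ss=b&sig=...",
#'   container = "mycontainer",
#'   fileName = "data/sales.csv")
#' }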
downloadDataFileFromAzure <- function(host = "", securityToken = "", container = "", fileName = ""){
  shouldCacheFile <- getOption("tam.should.cache.datafile")
  hash <- digest::digest(stringr::str_c(host, container, fileName, sep = ":"), "md5", serialize = FALSE)
  filepath <- tryCatch({
    getDownloadedFilePath(hash)
  }, error = function(e){
    # If the file path for this hash is not set as a global variable yet, an
    # "object not found" error is raised, which can safely be ignored.
    NULL
  })
  # Check if a cached data file (e.g. Excel/CSV/Parquet) already exists for the filepath.
  if (!is.null(shouldCacheFile) && isTRUE(shouldCacheFile) && !is.null(filepath)) {
    filepath
  } else {
    ext <- stringr::str_to_lower(tools::file_ext(fileName))
    tmp <- tempfile(fileext = stringr::str_c(".", ext))

    # When using Rserve on Linux, the temporary directory specified by
    # tempdir(), which is used as part of the temp file path generated by
    # tempfile(), somehow does not get created. So if you try to use that temp
    # file path, for example to dump some data into it, it fails because no
    # such path is found. This function fails for the same reason at
    # storage_download below.
    #
    # It works fine from the R command line on Linux, and it always works fine
    # on Mac and Windows, regardless of whether Rserve is used.
    #
    # The following command is harmless even if the directory already exists.
    # http://stackoverflow.com/questions/4216753/check-existence-of-directory-and-create-if-doesnt-exist
    dir.create(tempdir(), showWarnings = FALSE)

    # download file to temporary location
    container <- exploratory::getAzureContainer(host = host, securityToken = securityToken, container = container)
    AzureStor::storage_download(container, src = fileName, dest = tmp, overwrite = TRUE)
    # cache file
    if(!is.null(shouldCacheFile) && isTRUE(shouldCacheFile)){
      setDownloadedFilePath(hash, tmp)
    }
    tmp
  }
}

#'API that imports a CSV file from Azure.
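#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical.
#'df <- getCSVFileFromAzure(
#'  fileName = "data/sales.csv",
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer",
#'  delim = ",")
#'}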
#'@export
getCSVFileFromAzure <- function(fileName, host, securityToken, container, delim, quote = '"',
                             escape_backslash = FALSE, escape_double = TRUE,
                             col_names = TRUE, col_types = NULL,
                             locale = readr::default_locale(),
                             na = c("", "NA"), quoted_na = TRUE,
                             comment = "", trim_ws = FALSE,
                             skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                             progress = interactive()) {
  tryCatch({
    filePath <- downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName)
  }, error = function(e) {
    if (stringr::str_detect(e$message, "(Not Found|Moved Permanently)")) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete Storage Services operation. Message:\n.".
      # This seems to be returned when the container itself does not exist.
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
    else {
      stop(e)
    }
  })
  exploratory::read_delim_file(filePath, delim = delim, quote = quote,
                               escape_backslash = escape_backslash, escape_double = escape_double,
                               col_names = col_names, col_types = col_types,
                               locale = locale,
                               na = na, quoted_na = quoted_na,
                               comment = comment, trim_ws = trim_ws,
                               skip = skip, n_max = n_max, guess_max = guess_max,
                               progress = progress)
}

#'API that imports multiple CSV files with the same structure from Azure and merges them into a single data frame.
#'
#'For the col_types parameter, it forces character by default to make sure that merging the CSV-based data frames does not error out due to column data type mismatches.
#'Once the data frames are merged, readr::type_convert is called from Exploratory Desktop to restore the column data types.
#'@export
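#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical. The merged
#'# result gets an "id" column holding each row's source file name.
#'df <- getCSVFilesFromAzure(
#'  files = c("data/sales_2023.csv", "data/sales_2024.csv"),
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer",
#'  delim = ",",
#'  col_types = readr::cols(.default = readr::col_character()))
#'# Restore the column data types after the merge.
#'df <- readr::type_convert(df)
#'}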
getCSVFilesFromAzure <- function(files, host, securityToken, container, folder = "", forPreview = FALSE, delim, quote = '"',
                                 escape_backslash = FALSE, escape_double = TRUE,
                                 col_names = TRUE, col_types = NULL,
                                 locale = readr::default_locale(),
                                 na = c("", "NA"), quoted_na = TRUE,
                                 comment = "", trim_ws = FALSE,
                                 skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                                 progress = interactive()) {
  # for preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  # set name to the files so that it can be used for the "id" column created by purrr::map_dfr.
  files <- setNames(as.list(files), files)
  df <- purrr::map_dfr(files, exploratory::getCSVFileFromAzure, host = host, securityToken = securityToken, container = container, delim = delim, quote = quote,
                       escape_backslash = escape_backslash, escape_double = escape_double,
                       col_names = col_names, col_types = col_types,
                       locale = locale,
                       na = na, quoted_na = quoted_na,
                       comment = comment, trim_ws = trim_ws,
                       skip = skip, n_max = n_max, guess_max = guess_max,
                       progress = progress, .id = "exp.file.id") %>% mutate(exp.file.id = basename(exp.file.id))  # extract file name from full path with basename and create file.id column.
  id_col <- avoid_conflict(colnames(df), "id")
  # copy internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # drop internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}

#'API that searches for CSV files in Azure by keyword and then imports them.
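#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical. The keyword
#'# match is case insensitive, so "sales" also matches "Sales_2024.csv".
#'df <- searchAndGetCSVFilesFromAzure(
#'  searchKeyword = "sales",
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer",
#'  folder = "data",
#'  delim = ",")
#'}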
#'@export
searchAndGetCSVFilesFromAzure <- function(searchKeyword, host, securityToken, container, folder, forPreview = FALSE, delim, quote = '"',
                                          escape_backslash = FALSE, escape_double = TRUE,
                                          col_names = TRUE, col_types = readr::cols(.default = readr::col_character()),
                                          locale = readr::default_locale(),
                                          na = c("", "NA"), quoted_na = TRUE,
                                          comment = "", trim_ws = FALSE,
                                          skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                                          progress = interactive()) {
  # search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  tryCatch({
    files <- exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>% dplyr::select(name)
  }, error = function(e) {
    # if container does not exist, below error is raised:
    # Error in list_adls_files(container, ...) :
    #  Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #  The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    }
    else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches the specified condition.')) # TODO: escape container name.
  }
  getCSVFilesFromAzure(files = files$name, host = host, securityToken = securityToken, container = container, forPreview = forPreview, delim = delim, quote = quote,
                       escape_backslash = escape_backslash, escape_double = escape_double,
                       col_names = col_names, col_types = col_types, locale = locale, na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws,
                       skip = skip, n_max = n_max, guess_max = guess_max, progress = progress)
}

#'API that imports a Parquet file from Azure.
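#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical.
#'# col_select limits the columns read from the Parquet file.
#'df <- getParquetFileFromAzure(
#'  fileName = "data/sales.parquet",
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer",
#'  col_select = c("customer_id", "amount"))
#'}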
#'@export
getParquetFileFromAzure <- function(fileName = "", host = "", securityToken = "", container = "",  col_select = NULL) {
  tryCatch({
    filePath <- downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName)
  }, error = function(e) {
    if (stringr::str_detect(e$message, "(Not Found|Moved Permanently)")) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete Storage Services operation. Message:\n.".
      # This seems to be returned when the container itself does not exist.
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
    else {
      stop(e)
    }
  })
  exploratory::read_parquet_file(filePath, col_select = col_select)
}

#'API that imports multiple Parquet files with the same structure from Azure and merges them into a single data frame.
#'@export
getParquetFilesFromAzure <- function(files = "", host = "", securityToken = "", container = "", forPreview = FALSE, col_select = NULL) {
  # for preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  files <- setNames(as.list(files), files)
  df <- purrr::map_dfr(files, exploratory::getParquetFileFromAzure, host = host, securityToken = securityToken, container = container, col_select = col_select, .id = "exp.file.id") %>% mutate(exp.file.id = basename(exp.file.id))  # extract file name from full path with basename and create file.id column.
  id_col <- avoid_conflict(colnames(df), "id")
  # copy internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # drop internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}

#'API that searches for Parquet files in Azure by keyword and then imports them.
#'@export
searchAndGetParquetFilesFromAzure <- function(searchKeyword = "", host = "", securityToken = "", container = "", folder = "", forPreview = FALSE, col_select = NULL) {

  # search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  tryCatch({
    files <- exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>% dplyr::select(name)
  }, error = function(e) {
    # if container does not exist, below error is raised:
    # Error in list_adls_files(container, ...) :
    #  Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #  The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    }
    else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches the specified condition.')) # TODO: escape container name.
  }
  getParquetFilesFromAzure(files = files$name, host = host, securityToken = securityToken, container = container, forPreview = forPreview, col_select = col_select)
}

#'API that imports an Excel file from Azure.
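#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical.
#'df <- getExcelFileFromAzure(
#'  fileName = "data/sales.xlsx",
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer",
#'  sheet = 1)
#'}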
#'@export
getExcelFileFromAzure <- function(fileName, host, securityToken, container, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = FALSE, ...) {
  tryCatch({
    filePath <- downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName)
  }, error = function(e) {
    if (stringr::str_detect(e$message, "(Not Found|Moved Permanently)")) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete Storage Services operation. Message:\n.".
      # This seems to be returned when the container itself does not exist.
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
    else {
      stop(e)
    }
  })
  exploratory::read_excel_file(path = filePath, sheet = sheet, col_names = col_names, col_types = col_types, na = na, skip = skip, trim_ws = trim_ws, n_max = n_max, use_readxl = use_readxl, detectDates = detectDates, skipEmptyRows =  skipEmptyRows, skipEmptyCols = skipEmptyCols, check.names = check.names, tzone = tzone, convertDataTypeToChar = convertDataTypeToChar, ...)
}

#'API that searches files by keyword, then imports multiple Excel files with the same structure from Azure and merges them into a single data frame.
#'
#'For the col_types parameter, it forces character by default to make sure that merging the Excel-based data frames does not error out due to column data type mismatches.
#'Once the data frames are merged, readr::type_convert is called from Exploratory Desktop to restore the column data types.
#'@export
searchAndGetExcelFilesFromAzure <- function(searchKeyword, host, securityToken, container, folder, forPreview = FALSE, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = TRUE, ...){

  # search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  tryCatch({
    files <- exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>% dplyr::select(name)
  }, error = function(e) {
    # if container does not exist, below error is raised:
    # Error in list_adls_files(container, ...) :
    #  Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #  The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    }
    else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches the specified condition.')) # TODO: escape container name.
  }
  exploratory::getExcelFilesFromAzure(files = files$name, host = host, securityToken = securityToken, container = container, forPreview = forPreview, sheet = sheet,
                                   col_names = col_names, col_types = col_types, na = na, skip = skip, trim_ws = trim_ws, n_max = n_max,
                                   use_readxl = use_readxl, detectDates = detectDates, skipEmptyRows = skipEmptyRows, skipEmptyCols = skipEmptyCols,
                                   check.names = check.names, tzone = tzone, convertDataTypeToChar = convertDataTypeToChar, ...)
}

#'API that imports multiple Excel files from Azure
#'@export
getExcelFilesFromAzure <- function(files, host, securityToken, container, forPreview = FALSE, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = TRUE, ...) {
  # for preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  # set name to the files so that it can be used for the "id" column created by purrr::map_dfr.
  files <- setNames(as.list(files), files)
  df <- purrr::map_dfr(files, exploratory::getExcelFileFromAzure, host = host, securityToken = securityToken, container = container, sheet = sheet,
                       col_names = col_names, col_types = col_types, na = na, skip = skip, trim_ws = trim_ws, n_max = n_max, use_readxl = use_readxl,
                       detectDates = detectDates, skipEmptyRows =  skipEmptyRows, skipEmptyCols = skipEmptyCols, check.names = check.names,
                       tzone = tzone, convertDataTypeToChar = convertDataTypeToChar, .id = "exp.file.id") %>% mutate(exp.file.id = basename(exp.file.id))  # extract file name from full path with basename and create file.id column.
  id_col <- avoid_conflict(colnames(df), "id")
  # copy internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # drop internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}

#'Wrapper for readxl::excel_sheets to support Azure Excel file
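#'@examples
#'\dontrun{
#'# A minimal usage sketch; all connection values are hypothetical. List the
#'# sheet names first, then read a specific sheet with getExcelFileFromAzure.
#'sheets <- getExcelSheetsFromAzureExcelFile(
#'  fileName = "data/sales.xlsx",
#'  host = "https://myaccount.blob.core.windows.net",
#'  securityToken = "sv=2021-06-08&ss=b&sig=...",
#'  container = "mycontainer")
#'}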
#'@export
getExcelSheetsFromAzureExcelFile <- function(fileName, host, securityToken, container){
  filePath <- downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName)
  readxl::excel_sheets(filePath)
}