#' Create an Azure storage endpoint object authenticated with a SAS token.
#'
#' Side effect: sets the `R_AZURE_DATA_DIR` environment variable to the
#' current working directory on every call.
#'
#' @param host Storage endpoint, passed to `AzureStor::storage_endpoint()`
#'   as `endpoint`.
#' @param securityToken Shared access signature, passed to
#'   `AzureStor::storage_endpoint()` as `sas`.
#' @return The object returned by `AzureStor::storage_endpoint()`.
#' @export
getAzureEndPoint <- function(host = "", securityToken = "") {
  # The AzureR packages prompt with "The AzureR packages can save your
  # authentication credentials in the directory:", which blocks loading them.
  # Pointing the data directory at the current working directory works around
  # that. Since we always authenticate, nothing is cached in that directory.
  Sys.setenv(R_AZURE_DATA_DIR = getwd())
  AzureStor::storage_endpoint(endpoint = host, sas = securityToken)
}
#' Get a handle to a single container on an Azure storage endpoint.
#'
#' @param host Storage endpoint, forwarded to `getAzureEndPoint()`.
#' @param securityToken Shared access signature, forwarded to
#'   `getAzureEndPoint()`.
#' @param container Name of the container, passed to
#'   `AzureStor::storage_container()`.
#' @return The object returned by `AzureStor::storage_container()`.
#' @export
getAzureContainer <- function(host = "", securityToken = "", container = "") {
  endpoint <- getAzureEndPoint(host = host, securityToken = securityToken)
  AzureStor::storage_container(endpoint, container)
}
#' List the containers available on an Azure storage endpoint.
#'
#' @param host Storage endpoint, forwarded to `getAzureEndPoint()`.
#' @param securityToken Shared access signature, forwarded to
#'   `getAzureEndPoint()`.
#' @return A data frame with one row per container and the columns `name`,
#'   `endpoint`, `sas` and `version`. A zero-row data frame with the same
#'   columns is returned when the endpoint has no containers.
#' @export
listAzureContainers <- function(host = "", securityToken = "") {
  endpoint <- getAzureEndPoint(host = host, securityToken = securityToken)
  containers <- AzureStor::list_storage_containers(endpoint)
  column_names <- c("name", "endpoint", "sas", "version")

  if (length(containers) == 0) {
    # unlist() of an empty list is NULL and matrix(NULL, ...) is an error, so
    # build the empty result explicitly with the same construction as below.
    empty <- data.frame(matrix(character(0), nrow = 0, ncol = length(column_names)))
    colnames(empty) <- column_names
    return(empty)
  }

  # NOTE(review): assumes every container object flattens to exactly four
  # values in the order name, endpoint, sas, version -- confirm against the
  # AzureStor version in use.
  df <- data.frame(matrix(unlist(containers), nrow = length(containers), byrow = TRUE))
  colnames(df) <- column_names
  df
}
#' List the items under a folder of an Azure container.
#'
#' @param host Storage endpoint, forwarded to `getAzureContainer()`.
#' @param securityToken Shared access signature, forwarded to
#'   `getAzureContainer()`.
#' @param container Name of the container to list.
#' @param folder Directory inside the container, passed to
#'   `AzureStor::list_storage_files()` as `dir`.
#' @return The result of `AzureStor::list_storage_files()` called with
#'   `info = "all"`.
#' @export
listItemsInAzure <- function(host = "", securityToken = "", container = "", folder = "") {
  # Keep the container handle in its own variable instead of overwriting the
  # `container` argument (a name) with an object of a different kind.
  storage <- getAzureContainer(host = host, securityToken = securityToken, container = container)
  AzureStor::list_storage_files(storage, dir = folder, info = "all")
}
#' Download a file from an Azure container to a temporary file.
#'
#' When the option `tam.should.cache.datafile` is `TRUE`, the local path is
#' remembered under a hash of host, container and file name, and later calls
#' reuse that path as long as the file still exists on disk.
#'
#' @param host Storage endpoint, forwarded to `getAzureContainer()`.
#' @param securityToken Shared access signature, forwarded to
#'   `getAzureContainer()`.
#' @param container Name of the container that holds the file.
#' @param fileName Path of the file inside the container. Its extension
#'   (lower-cased) is reused for the temporary file.
#' @return Path of the local copy of the file.
#' @export
downloadDataFileFromAzure <- function(host = "", securityToken = "", container = "", fileName = "") {
  shouldCacheFile <- isTRUE(getOption("tam.should.cache.datafile"))
  hash <- digest::digest(stringr::str_c(host, container, fileName, sep = ":"), "md5", serialize = FALSE)

  # If the path for this hash has not been registered yet, the lookup raises
  # an "object not found" error, which can be ignored.
  filepath <- tryCatch(getDownloadedFilePath(hash), error = function(e) NULL)

  # Reuse the cached copy only if it is a usable path that still exists;
  # otherwise fall through and download again.
  cacheUsable <- shouldCacheFile &&
    is.character(filepath) && length(filepath) == 1 &&
    isTRUE(file.exists(filepath))
  if (cacheUsable) {
    return(filepath)
  }

  ext <- stringr::str_to_lower(tools::file_ext(fileName))
  tmp <- tempfile(fileext = stringr::str_c(".", ext))
  # In case of using Rserve on linux, somehow it doesn't create the temporary
  # directory returned by tempdir(), which is part of the path generated by
  # tempfile(). Writing to that path then fails because the directory is
  # missing. It works fine from the R command line on linux, and on Mac and
  # Windows regardless of Rserve.
  #
  # The following command is harmless even if the directory already exists.
  # http://stackoverflow.com/questions/4216753/check-existence-of-directory-and-create-if-doesnt-exist
  dir.create(tempdir(), showWarnings = FALSE)

  # Download the file to the temporary location.
  storage <- exploratory::getAzureContainer(host = host, securityToken = securityToken, container = container)
  AzureStor::storage_download(storage, src = fileName, dest = tmp, overwrite = TRUE)

  # Remember the local path for later calls.
  if (shouldCacheFile) {
    setDownloadedFilePath(hash, tmp)
  }
  tmp
}
#' Import a single delimited text (CSV) file from an Azure container.
#'
#' The file is downloaded with `downloadDataFileFromAzure()` and then parsed
#' with `exploratory::read_delim_file()`; all parsing arguments are passed
#' through unchanged.
#'
#' @param fileName Path of the file inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the file.
#' @param delim,quote,escape_backslash,escape_double,col_names,col_types,locale,na,quoted_na,comment,trim_ws,skip,n_max,guess_max,progress
#'   Forwarded to `exploratory::read_delim_file()`.
#' @return The result of `exploratory::read_delim_file()`.
#' @export
getCSVFileFromAzure <- function(fileName, host, securityToken, container, delim, quote = '"',
                                escape_backslash = FALSE, escape_double = TRUE,
                                col_names = TRUE, col_types = NULL,
                                locale = readr::default_locale(),
                                na = c("", "NA"), quoted_na = TRUE,
                                comment = "", trim_ws = FALSE,
                                skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                                progress = interactive()) {
  filePath <- tryCatch(
    downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName),
    error = function(e) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete
      # Storage Services operation. Message:\n.". This seems to be returned
      # when the bucket itself does not exist.
      missingFile <- stringr::str_detect(e$message, "(Not Found|Moved Permanently)")
      if (!missingFile) {
        # Anything else is re-raised untouched.
        stop(e)
      }
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
  )
  exploratory::read_delim_file(
    filePath,
    delim = delim, quote = quote,
    escape_backslash = escape_backslash, escape_double = escape_double,
    col_names = col_names, col_types = col_types,
    locale = locale,
    na = na, quoted_na = quoted_na,
    comment = comment, trim_ws = trim_ws,
    skip = skip, n_max = n_max, guess_max = guess_max,
    progress = progress
  )
}
#' Import multiple CSV files with the same structure from Azure and merge
#' them into a single data frame.
#'
#' Each file is read with `getCSVFileFromAzure()` and the results are
#' row-bound. A leading id column holds the base name of the file each row
#' came from.
#'
#' `col_types` defaults to `NULL` here. Forcing every column to character
#' (e.g. `readr::cols(.default = readr::col_character())`) keeps the merge from
#' failing on column type mismatches; once the merge is done,
#' `readr::type_convert` is called from Exploratory Desktop to restore the
#' column data types.
#'
#' @param files Character vector of file paths inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the files.
#' @param folder Not used by this function; accepted so callers may pass it.
#' @param forPreview If `TRUE`, only the first file is imported.
#' @param delim,quote,escape_backslash,escape_double,col_names,col_types,locale,na,quoted_na,comment,trim_ws,skip,n_max,guess_max,progress
#'   Forwarded to `getCSVFileFromAzure()` for every file.
#' @return A data frame of all rows, with the id column first.
#' @export
getCSVFilesFromAzure <- function(files, host, securityToken, container, folder = "", forPreview = FALSE, delim, quote = '"',
                                 escape_backslash = FALSE, escape_double = TRUE,
                                 col_names = TRUE, col_types = NULL,
                                 locale = readr::default_locale(),
                                 na = c("", "NA"), quoted_na = TRUE,
                                 comment = "", trim_ws = FALSE,
                                 skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                                 progress = interactive()) {
  # For preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  # Name the elements after the files so purrr::map_dfr can use the names for
  # the column requested through `.id`.
  files <- setNames(as.list(files), files)
  df <- purrr::map_dfr(
    files, exploratory::getCSVFileFromAzure,
    host = host, securityToken = securityToken, container = container,
    delim = delim, quote = quote,
    escape_backslash = escape_backslash, escape_double = escape_double,
    col_names = col_names, col_types = col_types,
    locale = locale,
    na = na, quoted_na = quoted_na,
    comment = comment, trim_ws = trim_ws,
    skip = skip, n_max = n_max, guess_max = guess_max,
    progress = progress, .id = "exp.file.id"
  ) %>%
    # Keep only the file name part of the full path.
    dplyr::mutate(exp.file.id = basename(exp.file.id))
  # NOTE(review): avoid_conflict() is defined elsewhere; presumably it returns
  # a column name based on "id" that does not clash with existing columns.
  id_col <- avoid_conflict(colnames(df), "id")
  # Copy the internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # Drop the internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}
#' Search an Azure container folder for files matching a keyword, then import
#' the matches as CSV and merge them into one data frame.
#'
#' @param searchKeyword Regular expression matched case-insensitively against
#'   the listed item names.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container to search.
#' @param folder Folder inside the container whose items are listed.
#' @param forPreview If `TRUE`, only the first matching file is imported.
#' @param delim,quote,escape_backslash,escape_double,col_names,col_types,locale,na,quoted_na,comment,trim_ws,skip,n_max,guess_max,progress
#'   Forwarded to `getCSVFilesFromAzure()`. `col_types` defaults to
#'   all-character columns.
#' @return The merged data frame returned by `getCSVFilesFromAzure()`.
#' @export
searchAndGetCSVFilesFromAzure <- function(searchKeyword, host, securityToken, container, folder, forPreview = FALSE, delim, quote = '"',
                                          escape_backslash = FALSE, escape_double = TRUE,
                                          col_names = TRUE, col_types = readr::cols(.default = readr::col_character()),
                                          locale = readr::default_locale(),
                                          na = c("", "NA"), quoted_na = TRUE,
                                          comment = "", trim_ws = FALSE,
                                          skip = 0, n_max = Inf, guess_max = min(1000, n_max),
                                          progress = interactive()) {
  # Search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  files <- tryCatch({
    exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>%
      dplyr::select(name)
  }, error = function(e) {
    # If the container does not exist, the error below is raised:
    #   Error in list_adls_files(container, ...) :
    #   Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #   The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    } else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches with the specified condition.')) # TODO: escape bucket name.
  }
  # escape_backslash / escape_double are forwarded too; previously they were
  # accepted by this function but silently dropped.
  getCSVFilesFromAzure(
    files = files$name, host = host, securityToken = securityToken, container = container,
    forPreview = forPreview, delim = delim, quote = quote,
    escape_backslash = escape_backslash, escape_double = escape_double,
    col_names = col_names, col_types = col_types, locale = locale,
    na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws,
    skip = skip, n_max = n_max, guess_max = guess_max, progress = progress
  )
}
#' Import a single Parquet file from an Azure container.
#'
#' The file is downloaded with `downloadDataFileFromAzure()` and read with
#' `exploratory::read_parquet_file()`.
#'
#' @param fileName Path of the file inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the file.
#' @param col_select Forwarded to `exploratory::read_parquet_file()`.
#' @return The result of `exploratory::read_parquet_file()`.
#' @export
getParquetFileFromAzure <- function(fileName = "", host = "", securityToken = "", container = "", col_select = NULL) {
  filePath <- tryCatch(
    downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName),
    error = function(e) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete
      # Storage Services operation. Message:\n.". This seems to be returned
      # when the bucket itself does not exist.
      missingFile <- stringr::str_detect(e$message, "(Not Found|Moved Permanently)")
      if (!missingFile) {
        # Anything else is re-raised untouched.
        stop(e)
      }
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
  )
  exploratory::read_parquet_file(filePath, col_select = col_select)
}
#' Import multiple Parquet files from Azure and merge them into a single data
#' frame.
#'
#' Each file is read with `getParquetFileFromAzure()` and the results are
#' row-bound. A leading id column holds the base name of the file each row
#' came from.
#'
#' @param files Character vector of file paths inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the files.
#' @param forPreview If `TRUE`, only the first file is imported.
#' @param col_select Forwarded to `getParquetFileFromAzure()` for every file.
#' @return A data frame of all rows, with the id column first.
#' @export
getParquetFilesFromAzure <- function(files = "", host = "", securityToken = "", container = "", forPreview = FALSE, col_select = NULL) {
  # For preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  # Name the elements after the files so purrr::map_dfr can use the names for
  # the column requested through `.id`.
  files <- setNames(as.list(files), files)
  df <- purrr::map_dfr(
    files, exploratory::getParquetFileFromAzure,
    host = host, securityToken = securityToken, container = container,
    col_select = col_select, .id = "exp.file.id"
  ) %>%
    # Keep only the file name part of the full path.
    dplyr::mutate(exp.file.id = basename(exp.file.id))
  # NOTE(review): avoid_conflict() is defined elsewhere; presumably it returns
  # a column name based on "id" that does not clash with existing columns.
  id_col <- avoid_conflict(colnames(df), "id")
  # Copy the internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # Drop the internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}
#' Search an Azure container folder for files matching a keyword, then import
#' the matches as Parquet and merge them into one data frame.
#'
#' @param searchKeyword Regular expression matched case-insensitively against
#'   the listed item names.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container to search.
#' @param folder Folder inside the container whose items are listed.
#' @param forPreview If `TRUE`, only the first matching file is imported.
#' @param col_select Forwarded to `getParquetFilesFromAzure()`.
#' @return The merged data frame returned by `getParquetFilesFromAzure()`.
#' @export
searchAndGetParquetFilesFromAzure <- function(searchKeyword = "", host = "", securityToken = "", container = "", folder = "", forPreview = FALSE, col_select = NULL) {
  # Search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  files <- tryCatch({
    exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>%
      dplyr::select(name)
  }, error = function(e) {
    # If the container does not exist, the error below is raised:
    #   Error in list_adls_files(container, ...) :
    #   Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #   The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    } else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches with the specified condition.')) # TODO: escape bucket name.
  }
  getParquetFilesFromAzure(files = files$name, host = host, securityToken = securityToken, container = container, forPreview = forPreview, col_select = col_select)
}
#' Import a single Excel file from an Azure container.
#'
#' The file is downloaded with `downloadDataFileFromAzure()` and read with
#' `exploratory::read_excel_file()`; all reading arguments are passed through
#' unchanged.
#'
#' @param fileName Path of the file inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the file.
#' @param sheet,col_names,col_types,na,skip,trim_ws,n_max,use_readxl,detectDates,skipEmptyRows,skipEmptyCols,check.names,tzone,convertDataTypeToChar
#'   Forwarded to `exploratory::read_excel_file()`.
#' @param ... Additional arguments forwarded to
#'   `exploratory::read_excel_file()`.
#' @return The result of `exploratory::read_excel_file()`.
#' @export
getExcelFileFromAzure <- function(fileName, host, securityToken, container, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = FALSE, ...) {
  filePath <- tryCatch(
    downloadDataFileFromAzure(host = host, securityToken = securityToken, container = container, fileName = fileName),
    error = function(e) {
      # Looking for an error like "Not Found (HTTP 404). Failed to complete
      # Storage Services operation. Message:\n.". This seems to be returned
      # when the bucket itself does not exist.
      missingFile <- stringr::str_detect(e$message, "(Not Found|Moved Permanently)")
      if (!missingFile) {
        # Anything else is re-raised untouched.
        stop(e)
      }
      stop(paste0('EXP-DATASRC-12 :: ', jsonlite::toJSON(c(container, fileName)), ' :: There is no such file in the Azure Container.'))
    }
  )
  exploratory::read_excel_file(
    path = filePath, sheet = sheet,
    col_names = col_names, col_types = col_types,
    na = na, skip = skip, trim_ws = trim_ws, n_max = n_max,
    use_readxl = use_readxl, detectDates = detectDates,
    skipEmptyRows = skipEmptyRows, skipEmptyCols = skipEmptyCols,
    check.names = check.names, tzone = tzone,
    convertDataTypeToChar = convertDataTypeToChar, ...
  )
}
#' Search an Azure container folder for files matching a keyword, then import
#' the matching same-structure Excel files and merge them into a single data
#' frame.
#'
#' `convertDataTypeToChar` defaults to `TRUE` so that merging the Excel-based
#' data frames doesn't error out due to column data type mismatches. Once the
#' merge is done, `readr::type_convert` is called from Exploratory Desktop to
#' restore the column data types.
#'
#' @param searchKeyword Regular expression matched case-insensitively against
#'   the listed item names.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container to search.
#' @param folder Folder inside the container whose items are listed.
#' @param forPreview If `TRUE`, only the first matching file is imported.
#' @param sheet,col_names,col_types,na,skip,trim_ws,n_max,use_readxl,detectDates,skipEmptyRows,skipEmptyCols,check.names,tzone,convertDataTypeToChar
#'   Forwarded to `exploratory::getExcelFilesFromAzure()`.
#' @param ... Additional arguments forwarded to
#'   `exploratory::getExcelFilesFromAzure()`.
#' @return The merged data frame returned by
#'   `exploratory::getExcelFilesFromAzure()`.
#' @export
searchAndGetExcelFilesFromAzure <- function(searchKeyword, host, securityToken, container, folder, forPreview = FALSE, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = TRUE, ...) {
  # Search condition is case insensitive. (ref: https://www.regular-expressions.info/modifiers.html, https://stackoverflow.com/questions/5671719/case-insensitive-search-of-a-list-in-r)
  files <- tryCatch({
    exploratory::listItemsInAzure(host = host, securityToken = securityToken, container = container, folder = folder) %>%
      dplyr::filter(!isdir & stringr::str_detect(name, stringr::str_c("(?i)", searchKeyword))) %>%
      dplyr::select(name)
  }, error = function(e) {
    # If the container does not exist, the error below is raised:
    #   Error in list_adls_files(container, ...) :
    #   Not Found (HTTP 404). Failed to complete Storage Services operation. Message:
    #   The specified filesystem does not exist.
    if (stringr::str_detect(e$message, "The specified filesystem does not exist.")) {
      stop(paste0('EXP-DATASRC-11 :: ', jsonlite::toJSON(container), ' :: The specified Azure container does not exist.'))
    } else {
      stop(e)
    }
  })
  if (nrow(files) == 0) {
    stop(paste0('EXP-DATASRC-10 :: ', jsonlite::toJSON(container), ' :: There is no file in the Azure container that matches with the specified condition.')) # TODO: escape bucket name.
  }
  exploratory::getExcelFilesFromAzure(
    files = files$name, host = host, securityToken = securityToken, container = container,
    forPreview = forPreview, sheet = sheet,
    col_names = col_names, col_types = col_types, na = na, skip = skip, trim_ws = trim_ws, n_max = n_max,
    use_readxl = use_readxl, detectDates = detectDates, skipEmptyRows = skipEmptyRows, skipEmptyCols = skipEmptyCols,
    check.names = check.names, tzone = tzone, convertDataTypeToChar = convertDataTypeToChar, ...
  )
}
#' Import multiple Excel files from Azure and merge them into a single data
#' frame.
#'
#' Each file is read with `getExcelFileFromAzure()` and the results are
#' row-bound. A leading id column holds the base name of the file each row
#' came from.
#'
#' @param files Character vector of file paths inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the files.
#' @param forPreview If `TRUE`, only the first file is imported.
#' @param sheet,col_names,col_types,na,skip,trim_ws,n_max,use_readxl,detectDates,skipEmptyRows,skipEmptyCols,check.names,tzone,convertDataTypeToChar
#'   Forwarded to `getExcelFileFromAzure()` for every file.
#' @param ... Additional arguments forwarded to `getExcelFileFromAzure()` for
#'   every file.
#' @return A data frame of all rows, with the id column first.
#' @export
getExcelFilesFromAzure <- function(files, host, securityToken, container, forPreview = FALSE, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0, trim_ws = TRUE, n_max = Inf, use_readxl = NULL, detectDates = FALSE, skipEmptyRows = FALSE, skipEmptyCols = FALSE, check.names = FALSE, tzone = NULL, convertDataTypeToChar = TRUE, ...) {
  # For preview mode, just use the first file.
  if (forPreview && length(files) > 0) {
    files <- files[1]
  }
  # Name the elements after the files so purrr::map_dfr can use the names for
  # the column requested through `.id`.
  files <- setNames(as.list(files), files)
  # `...` is passed on to the per-file reader; previously it was accepted by
  # this function but silently dropped.
  df <- purrr::map_dfr(
    files, exploratory::getExcelFileFromAzure,
    host = host, securityToken = securityToken, container = container, sheet = sheet,
    col_names = col_names, col_types = col_types, na = na, skip = skip, trim_ws = trim_ws, n_max = n_max,
    use_readxl = use_readxl, detectDates = detectDates,
    skipEmptyRows = skipEmptyRows, skipEmptyCols = skipEmptyCols, check.names = check.names,
    tzone = tzone, convertDataTypeToChar = convertDataTypeToChar,
    ..., .id = "exp.file.id"
  ) %>%
    # Keep only the file name part of the full path.
    dplyr::mutate(exp.file.id = basename(exp.file.id))
  # NOTE(review): avoid_conflict() is defined elsewhere; presumably it returns
  # a column name based on "id" that does not clash with existing columns.
  id_col <- avoid_conflict(colnames(df), "id")
  # Copy the internal exp.file.id to the id column.
  df[[id_col]] <- df[["exp.file.id"]]
  # Drop the internal column and move the id column to the very beginning.
  df %>% dplyr::select(!!rlang::sym(id_col), dplyr::everything(), -exp.file.id)
}
#' List the sheet names of an Excel file stored in an Azure container.
#'
#' Wrapper around `readxl::excel_sheets()`: the file is first downloaded with
#' `downloadDataFileFromAzure()` and the local copy is inspected.
#'
#' @param fileName Path of the Excel file inside the container.
#' @param host Storage endpoint.
#' @param securityToken Shared access signature.
#' @param container Name of the container that holds the file.
#' @return The result of `readxl::excel_sheets()` for the downloaded file.
#' @export
getExcelSheetsFromAzureExcelFile <- function(fileName, host, securityToken, container) {
  readxl::excel_sheets(
    downloadDataFileFromAzure(
      host = host,
      securityToken = securityToken,
      container = container,
      fileName = fileName
    )
  )
}
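# NOTE: the two lines below are web-page residue from the site this file was
# copied from; they are not R code and are kept only as comments.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.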