#------------------------------------------------------------------------------#
#
# _ _ _ _ _
# (_) | | | | | | | |
# _ __ ___ _ _ __ | |_ | |__ | | __ _ _ __ | | __
# | '_ \ / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
# | |_) || (_) || || | | || |_ | |_) || || (_| || | | || <
# | .__/ \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
# | |
# |_|
#
# This file is part of the 'rstudio/pointblank' project.
#
# Copyright (c) 2017-2024 pointblank authors
#
# For full copyright and license information, please look at
# https://rstudio.github.io/pointblank/LICENSE.html
#
#------------------------------------------------------------------------------#
# nocov start
#' Get a table from a local or remote file
#'
#' @description
#'
#' If your target table is in a file, stored either locally or remotely, the
#' `file_tbl()` function can make it possible to access it in a single function
#' call. Compatible file types for this function are: CSV (`.csv`), TSV
#' (`.tsv`), RDA (`.rda`), and RDS (`.rds`) files. This function generates an
#' in-memory `tbl_df` object, which can be used as a target table for
#' [create_agent()] and [create_informant()]. Another great option is supplying
#' a table-prep formula involving `file_tbl()` to [tbl_store()] so that you have
#' access to tables based on flat files though single names via a table store.
#'
#' In the remote data use case, we can specify a URL starting with `http://`,
#' `https://`, etc., and ending with the file containing the data table. If data
#' files are available in a GitHub repository then we can use the
#' [from_github()] function to specify the name and location of the table data
#' in a repository.
#'
#' @param file The complete file path leading to a compatible data table either
#' in the user system or at a `http://`, `https://`, `ftp://`, or `ftps://`
#' URL. For a file hosted in a GitHub repository, a call to the
#' [from_github()] function can be used here.
#'
#' @param type The file type. This is normally inferred by file extension and is
#' by default `NULL` to indicate that the extension will dictate the type of
#' file reading that is performed internally. However, if there is no
#' extension (and valid extensions are `.csv`, `.tsv`, `.rda`, and `.rds`), we
#' can provide the type as either of `csv`, `tsv`, `rda`, or `rds`.
#'
#' @param ... Options passed to **readr**'s `read_csv()` or `read_tsv()`
#' function. Both functions have the same arguments and one or the other will
#' be used internally based on the file extension or an explicit value given
#' to `type`.
#'
#' @param keep In the case of a downloaded file, should it be stored in the
#' working directory (`keep = TRUE`) or should it be downloaded to a temporary
#' directory? By default, this is `FALSE`.
#'
#' @param verify If `TRUE` (the default) then a verification of the data object
#' having the `data.frame` class will be carried out.
#'
#' @return A `tbl_df` object.
#'
#' @section Examples:
#'
#' ## Producing tables from CSV files
#'
#' A local CSV file can be obtained as a tbl object by supplying a path to the
#' file and some CSV reading options (the ones used by `readr::read_csv()`) to
#' the `file_tbl()` function. For this example we could obtain a path to a CSV
#' file in the **pointblank** package with `system.file()`.
#'
#' ```r
#' csv_path <-
#' system.file(
#' "data_files", "small_table.csv",
#' package = "pointblank"
#' )
#' ```
#'
#' Then use that path in `file_tbl()` with the option to specify the column
#' types in that CSV.
#'
#' ```r
#' tbl <-
#' file_tbl(
#' file = csv_path,
#' col_types = "TDdcddlc"
#' )
#'
#' tbl
#' ```
#'
#' \preformatted{## # A tibble: 13 × 8
#' ## date_time date a b c d e f
#' ## <dttm> <date> <dbl> <chr> <dbl> <dbl> <lgl> <chr>
#' ## 1 2016-01-04 11:00:00 2016-01-04 2 1-bcd-… 3 3423. TRUE high
#' ## 2 2016-01-04 00:32:00 2016-01-04 3 5-egh-… 8 10000. TRUE low
#' ## 3 2016-01-05 13:32:00 2016-01-05 6 8-kdg-… 3 2343. TRUE high
#' ## 4 2016-01-06 17:23:00 2016-01-06 2 5-jdo-… NA 3892. FALSE mid
#' ## 5 2016-01-09 12:36:00 2016-01-09 8 3-ldm-… 7 284. TRUE low
#' ## 6 2016-01-11 06:15:00 2016-01-11 4 2-dhe-… 4 3291. TRUE mid
#' ## 7 2016-01-15 18:46:00 2016-01-15 7 1-knw-… 3 843. TRUE high
#' ## 8 2016-01-17 11:27:00 2016-01-17 4 5-boe-… 2 1036. FALSE low
#' ## 9 2016-01-20 04:30:00 2016-01-20 3 5-bce-… 9 838. FALSE high
#' ## 10 2016-01-20 04:30:00 2016-01-20 3 5-bce-… 9 838. FALSE high
#' ## 11 2016-01-26 20:07:00 2016-01-26 4 2-dmx-… 7 834. TRUE low
#' ## 12 2016-01-28 02:51:00 2016-01-28 2 7-dmx-… 8 108. FALSE low
#' ## 13 2016-01-30 11:23:00 2016-01-30 1 3-dka-… NA 2230. TRUE high}
#'
#'
#'
#' Now that we have a `tbl` object that is a tibble it could be introduced to
#' [create_agent()] for validation.
#'
#' ```r
#' agent <- create_agent(tbl = tbl)
#' ```
#'
#' A different strategy is to provide the data-reading function call directly to
#' [create_agent()]:
#'
#' ```r
#' agent <-
#' create_agent(
#' tbl = ~ file_tbl(
#' file = system.file(
#' "data_files", "small_table.csv",
#' package = "pointblank"
#' ),
#' col_types = "TDdcddlc"
#' )
#' ) %>%
#' col_vals_gt(columns = a, value = 0)
#' ```
#'
#' All of the file-reading instructions are encapsulated in the `tbl` expression
#' (with the leading `~`) so the agent will always obtain the most recent
#' version of the table (and the logic can be translated to YAML, for later
#' use).
#'
#' ## Producing tables from files on GitHub
#'
#' A CSV can be obtained from a public GitHub repo by using the [from_github()]
#' helper function. Let's create an agent a supply a table-prep formula that
#' gets the same CSV file from the GitHub repository for the pointblank package.
#'
#' ```r
#' agent <-
#' create_agent(
#' tbl = ~ file_tbl(
#' file = from_github(
#' file = "inst/data_files/small_table.csv",
#' repo = "rstudio/pointblank"
#' ),
#' col_types = "TDdcddlc"
#' ),
#' tbl_name = "small_table",
#' label = "`file_tbl()` example.",
#' ) %>%
#' col_vals_gt(columns = a, value = 0) %>%
#' interrogate()
#' ```
#'
#' ```r
#' agent
#' ```
#'
#' \if{html}{
#'
#' \out{
#' `r pb_get_image_tag(file = "man_file_tbl_1.png")`
#' }
#' }
#'
#' This interrogated the data that was obtained from the remote source file,
#' and, there's nothing to clean up (by default, the downloaded file goes into a
#' system temp directory).
#'
#' ## File access, table creation, and prep via the table store
#'
#' Using table-prep formulas in a centralized table store can make it easier to
#' work with tables from disparate sources. Here's how to generate a table store
#' with two named entries for table preparations involving the [tbl_store()] and
#' `file_tbl()` functions.
#'
#' ```r
#' store <-
#' tbl_store(
#' small_table_file ~ file_tbl(
#' file = system.file(
#' "data_files", "small_table.csv",
#' package = "pointblank"
#' ),
#' col_types = "TDdcddlc"
#' ),
#' small_high_file ~ {{ small_table_file }} %>%
#' dplyr::filter(f == "high")
#' )
#' ```
#'
#' Now it's easy to access either of these tables via [tbl_get()]. We can
#' reference the table in the store by its name (given to the left of the `~`).
#'
#' ```r
#' tbl_get(tbl = "small_table_file", store = store)
#' ```
#'
#' \preformatted{## # A tibble: 13 × 8
#' ## date_time date a b c d e f
#' ## <dttm> <date> <dbl> <chr> <dbl> <dbl> <lgl> <chr>
#' ## 1 2016-01-04 11:00:00 2016-01-04 2 1-bcd-… 3 3423. TRUE high
#' ## 2 2016-01-04 00:32:00 2016-01-04 3 5-egh-… 8 10000. TRUE low
#' ## 3 2016-01-05 13:32:00 2016-01-05 6 8-kdg-… 3 2343. TRUE high
#' ## 4 2016-01-06 17:23:00 2016-01-06 2 5-jdo-… NA 3892. FALSE mid
#' ## 5 2016-01-09 12:36:00 2016-01-09 8 3-ldm-… 7 284. TRUE low
#' ## 6 2016-01-11 06:15:00 2016-01-11 4 2-dhe-… 4 3291. TRUE mid
#' ## 7 2016-01-15 18:46:00 2016-01-15 7 1-knw-… 3 843. TRUE high
#' ## 8 2016-01-17 11:27:00 2016-01-17 4 5-boe-… 2 1036. FALSE low
#' ## 9 2016-01-20 04:30:00 2016-01-20 3 5-bce-… 9 838. FALSE high
#' ## 10 2016-01-20 04:30:00 2016-01-20 3 5-bce-… 9 838. FALSE high
#' ## 11 2016-01-26 20:07:00 2016-01-26 4 2-dmx-… 7 834. TRUE low
#' ## 12 2016-01-28 02:51:00 2016-01-28 2 7-dmx-… 8 108. FALSE low
#' ## 13 2016-01-30 11:23:00 2016-01-30 1 3-dka-… NA 2230. TRUE high}
#'
#'
#'
#' The second table in the table store is a mutated version of the first. It's
#' just as easily obtainable via [tbl_get()]:
#'
#' ```r
#' tbl_get(tbl = "small_high_file", store = store)
#' ```
#'
#' \preformatted{## # A tibble: 6 × 8
#' ## date_time date a b c d e f
#' ## <dttm> <date> <dbl> <chr> <dbl> <dbl> <lgl> <chr>
#' ## 1 2016-01-04 11:00:00 2016-01-04 2 1-bcd-345 3 3423. TRUE high
#' ## 2 2016-01-05 13:32:00 2016-01-05 6 8-kdg-938 3 2343. TRUE high
#' ## 3 2016-01-15 18:46:00 2016-01-15 7 1-knw-093 3 843. TRUE high
#' ## 4 2016-01-20 04:30:00 2016-01-20 3 5-bce-642 9 838. FALSE high
#' ## 5 2016-01-20 04:30:00 2016-01-20 3 5-bce-642 9 838. FALSE high
#' ## 6 2016-01-30 11:23:00 2016-01-30 1 3-dka-303 NA 2230. TRUE high}
#'
#'
#'
#' The table-prep formulas in the `store` object could also be used in functions
#' with a `tbl` argument (like [create_agent()] and [create_informant()]). This
#' is accomplished most easily with the [tbl_source()] function.
#'
#' ```r
#' agent <-
#' create_agent(
#' tbl = ~ tbl_source(
#' tbl = "small_table_file",
#' store = store
#' )
#' )
#' ```
#'
#' ```r
#' informant <-
#' create_informant(
#' tbl = ~ tbl_source(
#' tbl = "small_high_file",
#' store = store
#' )
#' )
#' ```
#'
#' @family Planning and Prep
#' @section Function ID:
#' 1-7
#'
#' @export
file_tbl <- function(
file,
type = NULL,
...,
keep = FALSE,
verify = TRUE
) {
if (!requireNamespace("readr", quietly = TRUE)) {
stop(
"Reading a table from a file requires the readr package:\n",
" * It can be installed with `install.packages(\"readr\")`.",
call. = FALSE
)
}
file_extension <- tolower(tools::file_ext(file))
file_name <- basename(file)
if (grepl("^(ht|f)tps?://", file)) {
remote_file <- TRUE
} else {
remote_file <- FALSE
}
# If the file is a remote file then download that file
if (remote_file) {
# If not keeping a downloaded file (option set through the `keep`
# argument), then make the destination path one that's in a temp dir
if (!keep) {
temp_dir <- tempdir()
file_path <- file.path(temp_dir, file_name)
} else {
file_path <- file_name
}
download_remote_file(url = file, destfile = file_path)
} else {
file_path <- file
}
# Unless specified by `type`, the `file_type` will be determined
# by the file extension (or inner extension if the file is compressed)
if (!is.null(type)) {
# Check that the `type` provided is valid for this function
if (!(tolower(type) %in% c("rda", "rdata", "rds", "csv", "tsv"))) {
stop(
"If specifying the file `type`, it must be one of the following:\n",
" * `rda`, `rds`, `csv`, or `tsv`.",
call. = FALSE
)
}
file_type <- tolower(type)
} else {
if (file_extension %in% c("gz", "bz2", "xz", "zip")) {
file_basename_no_ext <-
gsub(
paste0("\\.(", paste("gz", "bz2", "xz", "zip", sep = "|"), ")"),
"", basename(file)
)
secondary_file_ext <- tolower(tools::file_ext(file_basename_no_ext))
if (secondary_file_ext != "") {
file_type <- secondary_file_ext
} else {
stop(
"File has no secondary extension to indicate the file type:\n",
" * Use the `type` argument to explicitly state the file type",
call. = FALSE
)
}
} else {
file_type <- file_extension
}
}
access_time <- Sys.time()
if (file_type %in% c("rda", "rdata")) {
x <- load_rda_object(file = file_path)
} else if (file_type == "rds") {
x <- readr::read_rds(file = file_path)
} else if (file_type %in% c("csv", "tsv")) {
x <- readr::read_csv(file = file_path, ...)
} else {
stop(
"The file type is incompatible with `file_tbl()`, the following work:\n",
" * Comma or tab separated values (`.csv` or `.tsv`)\n",
" * RDA or RDS files (`.rda`/`.rdata` or `.rds`)",
call. = FALSE
)
}
# If `verify = TRUE` then ensure that the data object inherits
# from `data.frame`; this can either be a data frame proper or
# a tibble (`tbl_df`)
if (verify && !inherits(x, "data.frame")) {
stop(
"The data object is not a data table:\n",
" * It is an object of class `", class(x)[1], "`.",
call. = FALSE
)
}
attr(x, "pb_tbl_name") <- file_name
attr(x, "pb_full_path") <- file
attr(x, "pb_dir_name") <- dirname(file)
attr(x, "pb_file_type") <- file_type
attr(x, "pb_access_time") <- access_time
x
}
#' Specify a file for download from GitHub
#'
#' @description
#'
#' The `from_github()` function is helpful for generating a valid URL that
#' points to a data file in a public GitHub repository. This function can be
#' used in the `file` argument of the [file_tbl()] function or anywhere else
#' where GitHub URLs for raw user content are needed.
#'
#' @param file The name of the file to target in a GitHub repository. This can
#' be a path leading to and including the file. This is combined with any path
#' given in `subdir`.
#'
#' @param repo The GitHub repository address in the format
#' `username/repo[/subdir][@ref|#pull|@*release]`.
#'
#' @param subdir A path string representing a subdirectory in the GitHub
#' repository. This is combined with any path components included in `file`.
#'
#' @param default_branch The name of the default branch for the repo. This is
#' usually `"main"` (the default used here).
#'
#' @return A character vector of length 1 that contains a URL.
#'
#' @examples
#' # A valid URL to a data file in GitHub can be
#' # obtained from the HEAD of the default branch
#' # from_github(
#' # file = "inst/data_files/small_table.csv",
#' # repo = "rstudio/pointblank"
#' # )
#'
#' # The path to the file location can be supplied
#' # fully or partially to `subdir`
#' # from_github(
#' # file = "small_table.csv",
#' # repo = "rstudio/pointblank",
#' # subdir = "inst/data_files"
#' # )
#'
#' # We can use the first call in combination with
#' # `file_tbl()` and `create_agent()`; this
#' # supplies a table-prep formula that gets
#' # a CSV file from the GitHub repository for the
#' # pointblank package
#' # agent <-
#' # create_agent(
#' # tbl = ~ file_tbl(
#' # file = from_github(
#' # file = "inst/data_files/small_table.csv",
#' # repo = "rstudio/pointblank"
#' # ),
#' # col_types = "TDdcddlc"
#' # )
#' # ) %>%
#' # col_vals_gt(a, 0) %>%
#' # interrogate()
#'
#' # The `from_github()` helper function is
#' # pretty powerful and can get at lots of
#' # different files in a repository
#'
#' # A data file from GitHub can be obtained from
#' # a commit at release time
#' # from_github(
#' # file = "inst/extdata/small_table.csv",
#' # repo = "rstudio/pointblank@v0.2.1"
#' # )
#'
#' # A file may also be obtained from a repo at the
#' # point in time of a specific commit (partial or
#' # full SHA-1 hash for the commit can be used)
#' # from_github(
#' # file = "data-raw/small_table.csv",
#' # repo = "rstudio/pointblank@e04a71"
#' # )
#'
#' # A file may also be obtained from an
#' # *open* pull request
#' # from_github(
#' # file = "data-raw/small_table.csv",
#' # repo = "rstudio/pointblank#248"
#' # )
#'
#' @family Utility and Helper Functions
#' @section Function ID:
#' 13-6
#'
#' @export
from_github <- function(
file,
repo,
subdir = NULL,
default_branch = "main"
) {
# get the username, repo, subdir component
u_r_s <- gsub("(@|#).*", "", repo)
u_r_s <- unlist(strsplit(u_r_s, "/"))
# Get the username and repo
username <- u_r_s[1]
repository <- u_r_s[2]
# Get the package subdir if length of `u_r_s` is 3
if (length(u_r_s) == 3) {
subdir_file <- u_r_s[3]
} else {
subdir_file <- NULL
}
if (grepl("@*", repo, fixed = TRUE)) {
ref_res <- unlist(strsplit(repo, "@\\*"))[2]
} else if (grepl("@", repo, fixed = TRUE)) {
ref_res <- unlist(strsplit(repo, "@"))[2]
} else if (grepl("#", repo, fixed = TRUE)) {
if (!requireNamespace("jsonlite", quietly = TRUE)) {
stop(
"Getting a table from a file in a PR requires the jsonlite package:\n",
" * It can be installed with `install.packages(\"jsonlite\")`.",
call. = FALSE
)
}
pr_number <- unlist(strsplit(repo, "#"))[2]
pulls_doc_tempfile <- tempfile(pattern = "pulls", fileext = ".json")
pulls_doc_url <-
as.character(
glue::glue(
"https://api.github.com/repos/{username}/{repository}/pulls"
)
)
# Download the `pulls` JSON document for the repo from GitHub
download_remote_file(
url = pulls_doc_url,
destfile = pulls_doc_tempfile,
quiet = TRUE
)
# Resolve the PR number to a merge commit SHA
ref_res <-
(jsonlite::fromJSON(pulls_doc_tempfile, flatten = TRUE) %>%
dplyr::select(number, merge_commit_sha, head.ref) %>%
dplyr::filter(number == as.integer(pr_number)) %>%
dplyr::pull(merge_commit_sha))[1]
} else {
ref_res <- default_branch
}
if (!is.null(subdir_file)) {
file_path <- as.character(glue::glue("{subdir_file}/{file}"))
} else if (!is.null(subdir)) {
file_path <- as.character(glue::glue("{subdir}/{file}"))
} else {
file_path <- file
}
url <-
as.character(
glue::glue(
"https://github.com/{username}/{repository}/raw/{ref_res}/{file_path}"
)
)
url
}
download_remote_file <- function(url, ...) {
if (grepl("^https?://", url)) {
is_r32 <- getRversion() >= "3.2"
if (.Platform$OS.type == "windows") {
if (is_r32) {
method <- "wininet"
} else {
seti2 <- utils::"setInternet2"
internet2_start <- seti2(NA)
if (!internet2_start) {
on.exit(suppressWarnings(seti2(internet2_start)))
suppressWarnings(seti2(TRUE))
}
method <- "internal"
}
suppressWarnings(utils::download.file(url, method = method, ...))
} else {
if (is_r32 && capabilities("libcurl")) {
method <- "libcurl"
} else if (nzchar(Sys.which("wget")[1])) {
method <- "wget"
} else if (nzchar(Sys.which("curl")[1])) {
method <- "curl"
orig_extra_options <- getOption("download.file.extra")
on.exit(options(download.file.extra = orig_extra_options))
options(download.file.extra = paste("-L", orig_extra_options))
} else if (nzchar(Sys.which("lynx")[1])) {
method <- "lynx"
} else {
stop("No download method can be found.")
}
utils::download.file(url, method = method, ...)
}
} else {
utils::download.file(url, ...)
}
}
load_rda_object <- function(file) {
env <- new.env()
nm <- load(file, env)[1]
env[[nm]]
}
get_attr_file_tbl <- function(x, attr) {
# Possible `attr` values are:
# * "tbl_name"
# * "full_path"
# * "dir_name"
# * "file_type"
attr(x, which = paste0("pb_", attr), exact = TRUE)
}
# nocov end
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.