Nothing
#
# _ _ _ _ _
# (_) | | | | | | | |
# _ __ ___ _ _ __ | |_ | |__ | | __ _ _ __ | | __
# | '_ \ / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
# | |_) || (_) || || | | || |_ | |_) || || (_| || | | || <
# | .__/ \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
# | |
# |_|
#
# This file is part of the 'rich-iannone/pointblank' package.
#
# (c) Richard Iannone <riannone@me.com>
#
# For full copyright and license information, please look at
# https://rich-iannone.github.io/pointblank/LICENSE.html
#
#' Draft a starter **pointblank** validation .R/.Rmd file with a data table
#'
#' @description
#' Generate a draft validation plan in a new .R or .Rmd file using an input data
#' table. Using this workflow, the data table will be scanned to learn about its
#' column data and a set of starter validation steps (constituting a validation
#' plan) will be written. It's best to use a data extract that contains at least
#' 1000 rows and is relatively free of spurious data.
#'
#' Once in the file, it's possible to tweak the validation steps to better fit
#' the expectations to the particular domain. While column inference is used to
#' generate reasonable validation plans, it is difficult to infer the acceptable
#' values without domain expertise. However, using `draft_validation()` could
#' get you started on floor 10 of tackling data quality issues and is in any
#' case better than starting with an empty code editor view.
#'
#' @section Supported Input Tables:
#' The types of data tables that are officially supported are:
#'
#' - data frames (`data.frame`) and tibbles (`tbl_df`)
#' - Spark DataFrames (`tbl_spark`)
#' - the following database tables (`tbl_dbi`):
#' - *PostgreSQL* tables (using the `RPostgres::Postgres()` as driver)
#' - *MySQL* tables (with `RMySQL::MySQL()`)
#' - *Microsoft SQL Server* tables (via **odbc**)
#' - *BigQuery* tables (using `bigrquery::bigquery()`)
#' - *DuckDB* tables (through `duckdb::duckdb()`)
#' - *SQLite* (with `RSQLite::SQLite()`)
#'
#' Other database tables may work to varying degrees but they haven't been
#' formally tested (so be mindful of this when using unsupported backends with
#' **pointblank**).
#'
#' @param tbl The input table. This can be a data frame, tibble, a `tbl_dbi`
#' object, or a `tbl_spark` object.
#' @param tbl_name A optional name to assign to the input table object. If no
#' value is provided, a name will be generated based on whatever information
#' is available. This table name will be displayed in the header area of the
#' agent report generated by printing the *agent* or calling
#' [get_agent_report()].
#' @param file_name An optional name for the .R or .Rmd file. This should be a
#' name without an extension. By default, this is taken from the `tbl_name`
#' but if nothing is supplied for that, the name will contain the text
#' `"draft_validation_"` followed by the current date and time.
#' @param path A path can be specified here if there shouldn't be an attempt to
#' place the generated file in the working directory.
#' @param lang The language to use when creating comments for the automatically-
#' generated validation steps. By default, `NULL` will create English (`"en"`)
#' text. Other options include French (`"fr"`), German (`"de"`), Italian
#' (`"it"`), Spanish (`"es"`), Portuguese (`"pt"`), Turkish (`"tr"`), Chinese
#' (`"zh"`), Russian (`"ru"`), Polish (`"pl"`), Danish (`"da"`), Swedish
#' (`"sv"`), and Dutch (`"nl"`).
#' @param output_type An option for choosing what type of output should be
#' generated. By default, this is an .R script (`"R"`) but this could
#' alternatively be an R Markdown document (`"Rmd"`).
#' @param add_comments Should there be comments that explain the features of the
#' validation plan in the generated document? By default, this is `TRUE`.
#' @param overwrite Should a file of the same name be overwritten? By default,
#' this is `FALSE`.
#' @param quiet Should the function *not* inform when the file is written? By
#' default this is `FALSE`.
#'
#' @return Invisibly returns `TRUE` if the file has been written.
#'
#' @section Examples:
#'
#' Let's draft a validation plan for the `dplyr::storms` dataset.
#'
#' ```{r}
#' dplyr::storms
#' ```
#'
#' The `draft_validation()` function creates an .R file by default. Using just
#' the defaults with `dplyr::storms` will yield the `"dplyr__storms.R"` file
#' in the working directory. Here are the contents of the file:
#'
#' ```r
#' library(pointblank)
#'
#' agent <-
#' create_agent(
#' tbl = ~ dplyr::storms,
#' actions = action_levels(
#' warn_at = 0.05,
#' stop_at = 0.10
#' ),
#' tbl_name = "dplyr::storms",
#' label = "Validation plan generated by `draft_validation()`."
#' ) %>%
#' # Expect that column `name` is of type: character
#' col_is_character(
#' columns = vars(name)
#' ) %>%
#' # Expect that column `year` is of type: numeric
#' col_is_numeric(
#' columns = vars(year)
#' ) %>%
#' # Expect that values in `year` should be between `1975` and `2020`
#' col_vals_between(
#' columns = vars(year),
#' left = 1975,
#' right = 2020
#' ) %>%
#' # Expect that column `month` is of type: numeric
#' col_is_numeric(
#' columns = vars(month)
#' ) %>%
#' # Expect that values in `month` should be between `1` and `12`
#' col_vals_between(
#' columns = vars(month),
#' left = 1,
#' right = 12
#' ) %>%
#' # Expect that column `day` is of type: integer
#' col_is_integer(
#' columns = vars(day)
#' ) %>%
#' # Expect that values in `day` should be between `1` and `31`
#' col_vals_between(
#' columns = vars(day),
#' left = 1,
#' right = 31
#' ) %>%
#' # Expect that column `hour` is of type: numeric
#' col_is_numeric(
#' columns = vars(hour)
#' ) %>%
#' # Expect that values in `hour` should be between `0` and `23`
#' col_vals_between(
#' columns = vars(hour),
#' left = 0,
#' right = 23
#' ) %>%
#' # Expect that column `lat` is of type: numeric
#' col_is_numeric(
#' columns = vars(lat)
#' ) %>%
#' # Expect that values in `lat` should be between `-90` and `90`
#' col_vals_between(
#' columns = vars(lat),
#' left = -90,
#' right = 90
#' ) %>%
#' # Expect that column `long` is of type: numeric
#' col_is_numeric(
#' columns = vars(long)
#' ) %>%
#' # Expect that values in `long` should be between `-180` and `180`
#' col_vals_between(
#' columns = vars(long),
#' left = -180,
#' right = 180
#' ) %>%
#' # Expect that column `status` is of type: character
#' col_is_character(
#' columns = vars(status)
#' ) %>%
#' # Expect that column `category` is of type: factor
#' col_is_factor(
#' columns = vars(category)
#' ) %>%
#' # Expect that column `wind` is of type: integer
#' col_is_integer(
#' columns = vars(wind)
#' ) %>%
#' # Expect that values in `wind` should be between `10` and `160`
#' col_vals_between(
#' columns = vars(wind),
#' left = 10,
#' right = 160
#' ) %>%
#' # Expect that column `pressure` is of type: integer
#' col_is_integer(
#' columns = vars(pressure)
#' ) %>%
#' # Expect that values in `pressure` should be between `882` and `1022`
#' col_vals_between(
#' columns = vars(pressure),
#' left = 882,
#' right = 1022
#' ) %>%
#' # Expect that column `tropicalstorm_force_diameter` is of type: integer
#' col_is_integer(
#' columns = vars(tropicalstorm_force_diameter)
#' ) %>%
#' # Expect that values in `tropicalstorm_force_diameter` should be between
#' # `0` and `870`
#' col_vals_between(
#' columns = vars(tropicalstorm_force_diameter),
#' left = 0,
#' right = 870,
#' na_pass = TRUE
#' ) %>%
#' # Expect that column `hurricane_force_diameter` is of type: integer
#' col_is_integer(
#' columns = vars(hurricane_force_diameter)
#' ) %>%
#' # Expect that values in `hurricane_force_diameter` should be between
#' # `0` and `300`
#' col_vals_between(
#' columns = vars(hurricane_force_diameter),
#' left = 0,
#' right = 300,
#' na_pass = TRUE
#' ) %>%
#' # Expect entirely distinct rows across all columns
#' rows_distinct() %>%
#' # Expect that column schemas match
#' col_schema_match(
#' schema = col_schema(
#' name = "character",
#' year = "numeric",
#' month = "numeric",
#' day = "integer",
#' hour = "numeric",
#' lat = "numeric",
#' long = "numeric",
#' status = "character",
#' category = c("ordered", "factor"),
#' wind = "integer",
#' pressure = "integer",
#' tropicalstorm_force_diameter = "integer",
#' hurricane_force_diameter = "integer"
#' )
#' ) %>%
#' interrogate()
#'
#' agent
#' ```
#'
#' This is runnable as is, and the promise is that the interrogation should
#' produce no failing test units. After execution, we get the following
#' validation report:
#'
#' \if{html}{
#' \out{
#' `r pb_get_image_tag(file = "man_draft_validation_1.png")`
#' }
#' }
#'
#' All of the expressions in the resulting file constitute just a rough
#' approximation of what a validation plan should be for a dataset. Certainly,
#' the value ranges in the emitted [col_vals_between()] may not be realistic for
#' the `wind` column and may require some modification (the provided `left` and
#' `right` values are just the limits of the provided data). However, note that
#' the `lat` and `long` (latitude and longitude) columns have acceptable ranges
#' (providing the limits of valid lat/lon values). This is thanks to
#' **pointblank**'s column inference routines, which is able to understand what
#' certain columns contain.
#'
#' For an evolving dataset that will experience changes (either in the form of
#' revised data and addition/deletion of rows or columns), the emitted
#' validation will serve as a good first step and changes can more easily be
#' made since there is a foundation to build from.
#'
#'
#' @family Planning and Prep
#' @section Function ID:
#' 1-11
#'
#' @export
draft_validation <- function(
tbl,
tbl_name = NULL,
file_name = tbl_name,
path = NULL,
lang = NULL,
output_type = c("R", "Rmd"),
add_comments = TRUE,
overwrite = FALSE,
quiet = FALSE
) {
output_type <- match.arg(output_type)
tbl_material <- materialize_table(tbl = tbl)
column_roles <- get_column_roles(data = tbl_material)
column_names <- colnames(tbl_material)
agent <-
create_agent(
tbl = tbl,
tbl_name = tbl_name,
label = "Validation plan generated by `draft_validation()`.",
actions = action_levels(warn_at = 0.05, stop_at = 0.10),
lang = lang
)
agent$tbl <- tbl_material
# Add column-based validation steps to the agent on
# the basis of column roles
for (i in seq_along(column_roles)) {
agent <-
add_valdn_steps_with_role(
agent = agent,
column = column_names[i],
column_role = column_roles[i]
)
}
# Add the `rows_distinct()` validation step if all rows in the
# table are distinct
total_rows <- get_table_total_rows(data = tbl_material)
distinct_rows <- get_table_total_distinct_rows(data = tbl_material)
if (distinct_rows == total_rows) {
agent <- rows_distinct(agent)
}
# Add the `col_schema_match()` validation step
agent <- col_schema_match(agent, schema = col_schema(.tbl = tbl_material))
# Get the `read_fn` text from `tbl`
read_fn_name <- deparse(match.call()$tbl)
read_fn_name <- gsub("^\\s+", "", read_fn_name)
read_fn_name <- paste(read_fn_name, collapse = "")
if (is.null(tbl_name) && !grepl("\\s", read_fn_name)) {
tbl_name <- read_fn_name
}
if (read_fn_name == ".") {
read_fn_name <- NULL
}
if (file_name == ".") {
file_name <- NULL
}
# Create the filename for the pointblank file
file_name <-
resolve_file_filename(
agent = agent,
name = file_name,
output_type = output_type
)
if (is.null(path)) {
file_path <- "."
} else {
if (!fs::dir_exists(path)) {
# Stop function if the path doesn't exist and inform user
# that this function won't create a path
stop(
"The provided `path` does not exist:\n",
"* Please create the path",
call. = FALSE
)
}
file_path <- path
}
# Create path that contains the file
path <- as.character(fs::path_norm(fs::path_wd(file_path, file_name)))
# Check if the file to write already exists; if it does, don't
# write the new file if `overwrite` is FALSE
if (fs::file_exists(path) && !overwrite) {
stop(
"A file of the same name already exists:\n",
"* set `overwrite` to `TRUE`, or\n",
"* choose a different `file_name`, or\n",
"* define another `path` for the file",
call. = FALSE
)
}
# Set a temporary `read_fn` value if one doesn't exist in the agent
if (is.null(agent$read_fn)) {
agent$read_fn <- ""
}
# Extract all briefs from the validation steps
briefs <- agent$validation_set$brief
# Extract all R expressions for the file
agent_exprs <- agent_get_exprs(agent = agent, expanded = TRUE)
agent_exprs <-
gsub(
"tbl = ,\n",
paste0(
"tbl = ~ ",
ifelse(
is.null(read_fn_name),
"CODE_TO_ACCESS_TABLE, # <- Add R code that obtains the data table",
read_fn_name),
",\n"
),
agent_exprs
)
agent_exprs <-
gsub(
"stop_at = 0.1",
"stop_at = 0.10",
agent_exprs
)
agent_exprs <-
gsub(
" tbl_name = \".*?\",",
paste0(
" tbl_name = ",
ifelse(
is.null(tbl_name) || tbl_name == ".",
"NULL, # <- Optionally add in the table name",
paste0("\"", tbl_name, "\",")
)
),
agent_exprs
)
agent_expr_vec <- unlist(strsplit(agent_exprs, " %>%\n", fixed = TRUE))
if (add_comments) {
agent_expr_vec_2 <-
paste(
paste0("%>%\n # ", gsub("\\. $", "", briefs), "\n"),
paste0(" ", gsub("\n", "\n ", agent_expr_vec[-1])),
collapse = " "
)
} else {
agent_expr_vec_2 <-
paste(
paste0("%>%\n"),
paste0(" ", gsub("\n", "\n ", agent_expr_vec[-1])),
collapse = " "
)
}
agent_lines <-
paste(
paste0("agent <-\n ", gsub("\n", "\n ", agent_expr_vec[1])),
agent_expr_vec_2,
collapse = ""
)
if (output_type == "R") {
file_content <-
paste0(
"library(pointblank)\n\n",
agent_lines,
"%>%\n interrogate()\n\nagent",
collapse = ""
) %>%
gsub(" %>%", " %>%", .)
} else {
file_content <-
paste0(
"---\n",
"title: \"",
ifelse(is.null(tbl_name) || tbl_name == ".", "Untitled", tbl_name),
"\"\n",
"output: html_document\n",
"---\n",
"\n",
"```{r setup, include=FALSE}\n",
"knitr::opts_chunk$set(echo = TRUE)\n",
"library(pointblank)\n",
"```\n",
"\n\n",
"```{r create_agent, echo=TRUE}\n",
agent_lines,
"%>%\n interrogate()\n",
"```\n",
"\n\n",
"```{r print_agent, echo=FALSE}\n",
"agent\n",
"```\n",
collapse = ""
) %>%
gsub(" %>%", " %>%", .)
}
# Write the file to the resulting `path`
pb_write_file(
path = path,
lines = file_content,
append = FALSE
)
# Generate cli message
if (!quiet) {
cli_bullet_msg(
msg = paste0(
"The pointblank .", output_type, " file has been written to `{path}`"
),
bullet = cli::symbol$tick,
color = "green"
)
}
invisible(TRUE)
}
add_valdn_steps_with_role <- function(agent, column, column_role) {
if (grepl("string", column_role)) {
if (inherits(agent$tbl, "data.frame") &&
is.factor(agent$tbl[[column]])) {
agent <- col_is_factor(agent, columns = {{ column }})
} else {
agent <- col_is_character(agent, columns = {{ column }})
}
}
if (column_role == "integer.discrete") {
agent <- col_is_integer(agent, columns = {{ column }})
}
if (column_role == "boolean.logical.categorical") {
agent <- col_is_logical(agent, columns = {{ column }})
}
if (column_role == "country:iso3166-1-esn.string.categorical") {
country_names <-
dplyr::pull(
get_non_null_col_sample(
data_column = dplyr::select(agent$tbl, {{ column }}),
sample_n = 2E8,
make_distinct = TRUE
)
)
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
if (missing_values_column > 0) {
country_names <- c(NA_character_, country_names)
}
agent <-
col_vals_in_set(
agent,
columns = {{ column }},
set = country_names
)
}
if (column_role == "country:iso3166-1-a-2.string.categorical") {
alpha_2 <- countries$alpha_2
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
if (missing_values_column > 0) {
alpha_2 <- c(NA_character_, alpha_2)
}
agent <-
col_vals_in_set(
agent,
columns = {{ column }},
set = alpha_2
)
}
if (column_role == "country:iso3166-1-a-3.string.categorical") {
alpha_3 <- countries$alpha_3
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
if (missing_values_column > 0) {
alpha_3 <- c(NA_character_, alpha_3)
}
agent <-
col_vals_in_set(
agent,
columns = {{ column }},
set = alpha_3
)
}
if (grepl("country_subd:iso3166-2\\[...\\].string", column_role)) {
country <- gsub("(^.*\\[|\\].*$)", "", column_role)
subd_2 <- subd_list_main[[country]]
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
if (missing_values_column > 0) {
subd_2 <- c(NA_character_, subd_2)
}
agent <-
col_vals_in_set(
agent,
columns = {{ column }},
set = subd_2
)
}
if (grepl("numeric", column_role)) {
agent <-
col_is_numeric(
agent,
columns = {{ column }}
)
}
if (column_role %in% c(
"numeric.continuous", "numeric.discrete", "numeric", "integer.discrete"
)) {
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
summary_list <-
get_table_column_summary(
data_column = dplyr::select(agent$tbl, {{ column }}),
round = Inf
)
agent <-
col_vals_between(
agent, columns = {{ column }},
left = summary_list$min, right = summary_list$max,
na_pass = missing_values_column > 0
)
}
if (grepl("geo:latitude.numeric", column_role)) {
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
agent <-
col_vals_between(
agent, columns = {{ column }},
left = -90, right = 90,
na_pass = missing_values_column > 0
)
}
if (grepl("geo:longitude.numeric", column_role)) {
missing_values_column <-
get_table_total_missing_values(
data = dplyr::select(agent$tbl, {{ column }})
)
agent <-
col_vals_between(
agent, columns = {{ column }},
left = -180, right = 180,
na_pass = missing_values_column > 0
)
}
agent
}
resolve_file_filename <- function(agent,
name,
output_type) {
if (is.null(name)) {
sys_time <- format(Sys.time(), format = "%Y_%m_%d_%I_%M_%p")
file_name <-
paste0("draft_validation_", sys_time, ".", output_type)
} else {
if (!is.character(name)) {
stop(
"The value supplied to `name` must be of class 'character'.",
call. = FALSE
)
}
# Handle special case of `pkg::dataset` before sanitization
if (!grepl("\\s", name) && grepl("::", name)) {
name <- gsub("::", "__", name, fixed = TRUE)
}
file_name <-
name[1] %>%
fs::path_sanitize() %>%
gsub("(\\.| |'|\\:)", "_", .) %>%
paste0(., ".", output_type)
}
file_name
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.