R/draft_validation.R

Defines functions draft_validation

Documented in draft_validation

#
#                _         _    _      _                _    
#               (_)       | |  | |    | |              | |   
#  _ __    ___   _  _ __  | |_ | |__  | |  __ _  _ __  | | __
# | '_ \  / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
# | |_) || (_) || || | | || |_ | |_) || || (_| || | | ||   < 
# | .__/  \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
# | |                                                        
# |_|                                                        
# 
# This file is part of the 'rich-iannone/pointblank' package.
# 
# (c) Richard Iannone <riannone@me.com>
# 
# For full copyright and license information, please look at
# https://rich-iannone.github.io/pointblank/LICENSE.html
#


#' Draft a starter **pointblank** validation .R/.Rmd file with a data table
#' 
#' @description
#' Generate a draft validation plan in a new .R or .Rmd file using an input data
#' table. Using this workflow, the data table will be scanned to learn about its
#' column data and a set of starter validation steps (constituting a validation
#' plan) will be written. It's best to use a data extract that contains at least
#' 1000 rows and is relatively free of spurious data.
#'
#' Once in the file, it's possible to tweak the validation steps to better fit
#' the expectations to the particular domain. While column inference is used to
#' generate reasonable validation plans, it is difficult to infer the acceptable
#' values without domain expertise. However, using `draft_validation()` could
#' get you started on floor 10 of tackling data quality issues and is in any
#' case better than starting with an empty code editor view.
#' 
#' @section Supported Input Tables:
#' The types of data tables that are officially supported are:
#' 
#'  - data frames (`data.frame`) and tibbles (`tbl_df`)
#'  - Spark DataFrames (`tbl_spark`)
#'  - the following database tables (`tbl_dbi`):
#'    - *PostgreSQL* tables (using the `RPostgres::Postgres()` as driver)
#'    - *MySQL* tables (with `RMySQL::MySQL()`)
#'    - *Microsoft SQL Server* tables (via **odbc**)
#'    - *BigQuery* tables (using `bigrquery::bigquery()`)
#'    - *DuckDB* tables (through `duckdb::duckdb()`)
#'    - *SQLite* (with `RSQLite::SQLite()`)
#'    
#' Other database tables may work to varying degrees but they haven't been
#' formally tested (so be mindful of this when using unsupported backends with
#' **pointblank**).
#' 
#' @param tbl The input table. This can be a data frame, tibble, a `tbl_dbi`
#'   object, or a `tbl_spark` object.
#' @param tbl_name A optional name to assign to the input table object. If no
#'   value is provided, a name will be generated based on whatever information
#'   is available. This table name will be displayed in the header area of the
#'   agent report generated by printing the *agent* or calling
#'   [get_agent_report()].
#' @param file_name An optional name for the .R or .Rmd file. This should be a
#'   name without an extension. By default, this is taken from the `tbl_name`
#'   but if nothing is supplied for that, the name will contain the text
#'   `"draft_validation_"` followed by the current date and time.
#' @param path A path can be specified here if there shouldn't be an attempt to
#'   place the generated file in the working directory.
#' @param lang The language to use when creating comments for the automatically-
#'   generated validation steps. By default, `NULL` will create English (`"en"`)
#'   text. Other options include French (`"fr"`), German (`"de"`), Italian
#'   (`"it"`), Spanish (`"es"`), Portuguese (`"pt"`), Turkish (`"tr"`), Chinese
#'   (`"zh"`), Russian (`"ru"`), Polish (`"pl"`), Danish (`"da"`), Swedish
#'   (`"sv"`), and Dutch (`"nl"`).
#' @param output_type An option for choosing what type of output should be
#'   generated. By default, this is an .R script (`"R"`) but this could
#'   alternatively be an R Markdown document (`"Rmd"`).
#' @param add_comments Should there be comments that explain the features of the
#'   validation plan in the generated document? By default, this is `TRUE`.
#' @param overwrite Should a file of the same name be overwritten? By default,
#'   this is `FALSE`.
#' @param quiet Should the function *not* inform when the file is written? By
#'   default this is `FALSE`.
#'   
#' @return Invisibly returns `TRUE` if the file has been written. 
#' 
#' @section Examples:
#' 
#' Let's draft a validation plan for the `dplyr::storms` dataset.
#' 
#' ```{r}
#' dplyr::storms
#' ```
#' 
#' The `draft_validation()` function creates an .R file by default. Using just
#' the defaults with `dplyr::storms` will yield the `"dplyr__storms.R"` file
#' in the working directory. Here are the contents of the file:
#' 
#' ```r
#' library(pointblank)
#' 
#' agent <-
#'   create_agent(
#'     tbl = ~ dplyr::storms,
#'     actions = action_levels(
#'       warn_at = 0.05,
#'       stop_at = 0.10
#'     ),
#'     tbl_name = "dplyr::storms",
#'     label = "Validation plan generated by `draft_validation()`."
#'   ) %>%
#'   # Expect that column `name` is of type: character
#'   col_is_character(
#'     columns = vars(name)
#'   ) %>%
#'   # Expect that column `year` is of type: numeric
#'   col_is_numeric(
#'     columns = vars(year)
#'   ) %>%
#'   # Expect that values in `year` should be between `1975` and `2020`
#'   col_vals_between(
#'     columns = vars(year),
#'     left = 1975,
#'     right = 2020
#'   ) %>%
#'   # Expect that column `month` is of type: numeric
#'   col_is_numeric(
#'     columns = vars(month)
#'   ) %>%
#'   # Expect that values in `month` should be between `1` and `12`
#'   col_vals_between(
#'     columns = vars(month),
#'     left = 1,
#'     right = 12
#'   ) %>%
#'   # Expect that column `day` is of type: integer
#'   col_is_integer(
#'     columns = vars(day)
#'   ) %>%
#'   # Expect that values in `day` should be between `1` and `31`
#'   col_vals_between(
#'     columns = vars(day),
#'     left = 1,
#'     right = 31
#'   ) %>%
#'   # Expect that column `hour` is of type: numeric
#'   col_is_numeric(
#'     columns = vars(hour)
#'   ) %>%
#'   # Expect that values in `hour` should be between `0` and `23`
#'   col_vals_between(
#'     columns = vars(hour),
#'     left = 0,
#'     right = 23
#'   ) %>%
#'   # Expect that column `lat` is of type: numeric
#'   col_is_numeric(
#'     columns = vars(lat)
#'   ) %>%
#'   # Expect that values in `lat` should be between `-90` and `90`
#'   col_vals_between(
#'     columns = vars(lat),
#'     left = -90,
#'     right = 90
#'   ) %>%
#'   # Expect that column `long` is of type: numeric
#'   col_is_numeric(
#'     columns = vars(long)
#'   ) %>%
#'   # Expect that values in `long` should be between `-180` and `180`
#'   col_vals_between(
#'     columns = vars(long),
#'     left = -180,
#'     right = 180
#'   ) %>%
#'   # Expect that column `status` is of type: character
#'   col_is_character(
#'     columns = vars(status)
#'   ) %>%
#'   # Expect that column `category` is of type: factor
#'   col_is_factor(
#'     columns = vars(category)
#'   ) %>%
#'   # Expect that column `wind` is of type: integer
#'   col_is_integer(
#'     columns = vars(wind)
#'   ) %>%
#'   # Expect that values in `wind` should be between `10` and `160`
#'   col_vals_between(
#'     columns = vars(wind),
#'     left = 10,
#'     right = 160
#'   ) %>%
#'   # Expect that column `pressure` is of type: integer
#'   col_is_integer(
#'     columns = vars(pressure)
#'   ) %>%
#'   # Expect that values in `pressure` should be between `882` and `1022`
#'   col_vals_between(
#'     columns = vars(pressure),
#'     left = 882,
#'     right = 1022
#'   ) %>%
#'   # Expect that column `tropicalstorm_force_diameter` is of type: integer
#'   col_is_integer(
#'     columns = vars(tropicalstorm_force_diameter)
#'   ) %>%
#'   # Expect that values in `tropicalstorm_force_diameter` should be between
#'   # `0` and `870`
#'   col_vals_between(
#'     columns = vars(tropicalstorm_force_diameter),
#'     left = 0,
#'     right = 870,
#'     na_pass = TRUE
#'   ) %>%
#'   # Expect that column `hurricane_force_diameter` is of type: integer
#'   col_is_integer(
#'     columns = vars(hurricane_force_diameter)
#'   ) %>%
#'   # Expect that values in `hurricane_force_diameter` should be between
#'   # `0` and `300`
#'   col_vals_between(
#'     columns = vars(hurricane_force_diameter),
#'     left = 0,
#'     right = 300,
#'     na_pass = TRUE
#'   ) %>%
#'   # Expect entirely distinct rows across all columns
#'   rows_distinct() %>%
#'   # Expect that column schemas match
#'   col_schema_match(
#'     schema = col_schema(
#'       name = "character",
#'       year = "numeric",
#'       month = "numeric",
#'       day = "integer",
#'       hour = "numeric",
#'       lat = "numeric",
#'       long = "numeric",
#'       status = "character",
#'       category = c("ordered", "factor"),
#'       wind = "integer",
#'       pressure = "integer",
#'       tropicalstorm_force_diameter = "integer",
#'       hurricane_force_diameter = "integer"
#'     )
#'   ) %>%
#'   interrogate()
#' 
#' agent
#' ```
#' 
#' This is runnable as is, and the promise is that the interrogation should
#' produce no failing test units. After execution, we get the following
#' validation report:
#' 
#' \if{html}{
#' \out{
#' `r pb_get_image_tag(file = "man_draft_validation_1.png")`
#' }
#' }
#' 
#' All of the expressions in the resulting file constitute just a rough
#' approximation of what a validation plan should be for a dataset. Certainly,
#' the value ranges in the emitted [col_vals_between()] may not be realistic for
#' the `wind` column and may require some modification (the provided `left` and
#' `right` values are just the limits of the provided data). However, note that
#' the `lat` and `long` (latitude and longitude) columns have acceptable ranges
#' (providing the limits of valid lat/lon values). This is thanks to
#' **pointblank**'s column inference routines, which is able to understand what
#' certain columns contain.
#'
#' For an evolving dataset that will experience changes (either in the form of
#' revised data and addition/deletion of rows or columns), the emitted
#' validation will serve as a good first step and changes can more easily be
#' made since there is a foundation to build from.
#' 
#' 
#' @family Planning and Prep
#' @section Function ID:
#' 1-11
#' 
#' @export
draft_validation <- function(
    tbl,
    tbl_name = NULL,
    file_name = tbl_name,
    path = NULL,
    lang = NULL,
    output_type = c("R", "Rmd"),
    add_comments = TRUE,
    overwrite = FALSE,
    quiet = FALSE
) {
  
  output_type <- match.arg(output_type)
  
  tbl_material <- materialize_table(tbl = tbl)
  
  column_roles <- get_column_roles(data = tbl_material)
  column_names <- colnames(tbl_material)
  
  agent <- 
    create_agent(
      tbl = tbl,
      tbl_name = tbl_name,
      label = "Validation plan generated by `draft_validation()`.",
      actions = action_levels(warn_at = 0.05, stop_at = 0.10),
      lang = lang
    )
  
  agent$tbl <- tbl_material
  
  # Add column-based validation steps to the agent on
  # the basis of column roles
  for (i in seq_along(column_roles)) {
    
    agent <- 
      add_valdn_steps_with_role(
        agent = agent,
        column = column_names[i],
        column_role = column_roles[i]
      )
  }
  
  # Add the `rows_distinct()` validation step if all rows in the
  # table are distinct
  total_rows <- get_table_total_rows(data = tbl_material)
  distinct_rows <- get_table_total_distinct_rows(data = tbl_material)
  
  if (distinct_rows == total_rows) {
    agent <- rows_distinct(agent)
  }
  
  # Add the `col_schema_match()` validation step
  agent <- col_schema_match(agent, schema = col_schema(.tbl = tbl_material))
  
  # Get the `read_fn` text from `tbl`
  read_fn_name <- deparse(match.call()$tbl)
  read_fn_name <- gsub("^\\s+", "", read_fn_name)
  read_fn_name <- paste(read_fn_name, collapse = "")
  
  if (is.null(tbl_name) && !grepl("\\s", read_fn_name)) {
    tbl_name <- read_fn_name
  }
  
  if (read_fn_name == ".") {
    read_fn_name <- NULL
  }
  
  if (file_name == ".") {
    file_name <- NULL
  }
  
  # Create the filename for the pointblank file
  file_name <- 
    resolve_file_filename(
      agent = agent,
      name = file_name,
      output_type = output_type
    )
  
  if (is.null(path)) {
    file_path <- "."
    
  } else {
    
    if (!fs::dir_exists(path)) {
      # Stop function if the path doesn't exist and inform user
      # that this function won't create a path
      stop(
        "The provided `path` does not exist:\n",
        "* Please create the path",
        call. = FALSE
      )
    }
    
    file_path <- path
  }
  
  # Create path that contains the file
  path <- as.character(fs::path_norm(fs::path_wd(file_path, file_name)))
  
  # Check if the file to write already exists; if it does, don't
  # write the new file if `overwrite` is FALSE
  if (fs::file_exists(path) && !overwrite) {
    stop(
      "A file of the same name already exists:\n",
      "* set `overwrite` to `TRUE`, or\n",
      "* choose a different `file_name`, or\n",
      "* define another `path` for the file",
      call. = FALSE
    )
  }
  
  # Set a temporary `read_fn` value if one doesn't exist in the agent
  if (is.null(agent$read_fn)) {
    agent$read_fn <- ""
  }
  
  # Extract all briefs from the validation steps
  briefs <- agent$validation_set$brief
  
  # Extract all R expressions for the file
  agent_exprs <- agent_get_exprs(agent = agent, expanded = TRUE)
  
  agent_exprs <-
    gsub(
      "tbl = ,\n",
      paste0(
        "tbl = ~ ",
        ifelse(
          is.null(read_fn_name),
          "CODE_TO_ACCESS_TABLE, # <- Add R code that obtains the data table",
          read_fn_name),
        ",\n"
      ),
      agent_exprs
    )
  
  agent_exprs <-
    gsub(
      "stop_at = 0.1",
      "stop_at = 0.10",
      agent_exprs
    )
  
  agent_exprs <-
    gsub(
      "  tbl_name = \".*?\",",
      paste0(
        "  tbl_name = ",
        ifelse(
          is.null(tbl_name) || tbl_name == ".",
          "NULL, # <- Optionally add in the table name",
          paste0("\"", tbl_name, "\",")
        )
      ),
      agent_exprs
    )
  
  agent_expr_vec <- unlist(strsplit(agent_exprs, " %>%\n", fixed = TRUE))
  
  if (add_comments) {
    
    agent_expr_vec_2 <- 
      paste(
        paste0("%>%\n  # ", gsub("\\. $", "", briefs), "\n"),
        paste0(" ", gsub("\n", "\n  ", agent_expr_vec[-1])),
        collapse = "  "
      )
    
  } else {

    agent_expr_vec_2 <- 
      paste(
        paste0("%>%\n"),
        paste0(" ", gsub("\n", "\n  ", agent_expr_vec[-1])),
        collapse = "  "
      )
  }
  
  agent_lines <-
    paste(
      paste0("agent <-\n  ", gsub("\n", "\n  ", agent_expr_vec[1])),
      agent_expr_vec_2,
      collapse = ""
    )
  
  if (output_type == "R") {
    
    file_content <-
      paste0(
        "library(pointblank)\n\n",
        agent_lines,
        "%>%\n  interrogate()\n\nagent",
        collapse = ""
      ) %>%
      gsub("  %>%", " %>%", .)
    
  } else {
    
    file_content <-
      paste0(
        "---\n",
        "title: \"",
        ifelse(is.null(tbl_name) || tbl_name == ".", "Untitled", tbl_name),
        "\"\n",
        "output: html_document\n",
        "---\n",
        "\n",
        "```{r setup, include=FALSE}\n",
        "knitr::opts_chunk$set(echo = TRUE)\n",
        "library(pointblank)\n",
        "```\n",
        "\n\n",
        "```{r create_agent, echo=TRUE}\n",
        agent_lines,
        "%>%\n  interrogate()\n",
        "```\n",
        "\n\n",
        "```{r print_agent, echo=FALSE}\n",
        "agent\n",
        "```\n",
        collapse = ""
      ) %>%
      gsub("  %>%", " %>%", .)
  }
  
  # Write the file to the resulting `path`
  pb_write_file(
    path = path,
    lines = file_content,
    append = FALSE
  )
  
  # Generate cli message
  if (!quiet) {
    cli_bullet_msg(
      msg = paste0(
        "The pointblank .", output_type, " file has been written to `{path}`"
        ),
      bullet = cli::symbol$tick,
      color = "green"
    )
  }
  
  invisible(TRUE)
}

add_valdn_steps_with_role <- function(agent, column, column_role) {
  
  if (grepl("string", column_role)) {
    
    if (inherits(agent$tbl, "data.frame") && 
        is.factor(agent$tbl[[column]])) {
      
      agent <- col_is_factor(agent, columns = {{ column }})
    } else {
      agent <- col_is_character(agent, columns = {{ column }})
    }
  }
  
  if (column_role == "integer.discrete") {
    agent <- col_is_integer(agent, columns = {{ column }})
  }
  
  if (column_role == "boolean.logical.categorical") {
    agent <- col_is_logical(agent, columns = {{ column }})
  }
  
  if (column_role == "country:iso3166-1-esn.string.categorical") {
    
    country_names <-
      dplyr::pull(
        get_non_null_col_sample(
          data_column = dplyr::select(agent$tbl, {{ column }}),
          sample_n = 2E8,
          make_distinct = TRUE
        )
      )
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    if (missing_values_column > 0) {
      country_names <- c(NA_character_, country_names)
    }
    
    agent <- 
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = country_names
      )
  }
  
  if (column_role == "country:iso3166-1-a-2.string.categorical") {
    
    alpha_2 <- countries$alpha_2
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    if (missing_values_column > 0) {
      alpha_2 <- c(NA_character_, alpha_2)
    }
    
    agent <- 
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = alpha_2
      )
  }
  
  if (column_role == "country:iso3166-1-a-3.string.categorical") {
    
    alpha_3 <- countries$alpha_3
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    if (missing_values_column > 0) {
      alpha_3 <- c(NA_character_, alpha_3)
    }
    
    agent <- 
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = alpha_3
      )
  }
  
  if (grepl("country_subd:iso3166-2\\[...\\].string", column_role)) {
    
    country <- gsub("(^.*\\[|\\].*$)", "", column_role)
    
    subd_2 <- subd_list_main[[country]]
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    if (missing_values_column > 0) {
      subd_2 <- c(NA_character_, subd_2)
    }
    
    agent <- 
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = subd_2
      )
  }
  
  if (grepl("numeric", column_role)) {
    
    agent <- 
      col_is_numeric(
        agent,
        columns = {{ column }}
      )
  }
  
  if (column_role %in% c(
    "numeric.continuous", "numeric.discrete", "numeric", "integer.discrete"
  )) {
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    summary_list <- 
      get_table_column_summary(
        data_column = dplyr::select(agent$tbl, {{ column }}),
        round = Inf
      )
    
    agent <- 
      col_vals_between(
        agent, columns = {{ column }},
        left = summary_list$min, right = summary_list$max,
        na_pass = missing_values_column > 0
      )
  }
  
  if (grepl("geo:latitude.numeric", column_role)) {
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    agent <- 
      col_vals_between(
        agent, columns = {{ column }},
        left = -90, right = 90,
        na_pass = missing_values_column > 0
      )
  }
  
  if (grepl("geo:longitude.numeric", column_role)) {
    
    missing_values_column <- 
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )
    
    agent <- 
      col_vals_between(
        agent, columns = {{ column }},
        left = -180, right = 180,
        na_pass = missing_values_column > 0
      )
  }
  
  agent
}


resolve_file_filename <- function(agent,
                                  name,
                                  output_type) {
  
  if (is.null(name)) {
    
    sys_time <- format(Sys.time(), format = "%Y_%m_%d_%I_%M_%p")
    file_name <- 
      paste0("draft_validation_", sys_time, ".", output_type)
    
  } else {
    
    if (!is.character(name)) {
      stop(
        "The value supplied to `name` must be of class 'character'.",
        call. = FALSE
      )
    }
    
    # Handle special case of `pkg::dataset` before sanitization
    if (!grepl("\\s", name) && grepl("::", name)) {
      name <- gsub("::", "__", name, fixed = TRUE)
    }
    
    file_name <- 
      name[1] %>%
      fs::path_sanitize() %>%
      gsub("(\\.| |'|\\:)", "_", .) %>%
      paste0(., ".", output_type)
  }
  
  file_name
}

Try the pointblank package in your browser

Any scripts or data that you put into this service are public.

pointblank documentation built on April 25, 2023, 5:06 p.m.