Nothing
#' Check for missing values
#'
#' `check_missing()` creates a *specification* of a recipe operation that will
#' check if variables contain missing values.
#'
#' @inheritParams step_pca
#' @param recipe A recipe object. The check will be added to the sequence of
#' operations for this recipe.
#' @param ... One or more selector functions to choose variables for this check.
#' See [selections()] for more details.
#' @param role Not used by this check since no new variables are created.
#' @param trained A logical for whether the selectors in `...` have been
#' resolved by [prep()].
#' @param id A character string that is unique to this check to identify it.
#' @param skip A logical. Should the check be skipped when the recipe is baked
#' by [bake()]? While all operations are baked when [prep()] is run, some
#' operations may not be able to be conducted on new data (e.g. processing the
#' outcome variable(s)). Care should be taken when using `skip = TRUE` as it
#' may affect the computations for subsequent operations.
#' @template check-return
#' @family checks
#' @export
#' @details
#'
#' This check will break the [bake()] function if any of the checked columns
#' contain `NA` values. If the check passes, the data are returned
#' unchanged.
#'
#' # tidy() results
#'
#' When you [`tidy()`][tidy.recipe()] this check, a tibble with columns `terms`
#' (the selectors or variables selected) and `id` (the identifier of this
#' check) is returned.
#'
#' @examplesIf rlang::is_installed("modeldata")
#' data(credit_data, package = "modeldata")
#' is.na(credit_data) |> colSums()
#'
#' # If the test passes, `new_data` is returned unaltered
#' recipe(credit_data) |>
#' check_missing(Age, Expenses) |>
#' prep() |>
#' bake(credit_data)
#'
#' # If your training set doesn't pass, prep() will stop with an error
#' \dontrun{
#' recipe(credit_data) |>
#' check_missing(Income) |>
#' prep()
#' }
#'
#' # If `new_data` contains missing values, the check will stop `bake()`
#'
#' train_data <- credit_data |> dplyr::filter(Income > 150)
#' test_data <- credit_data |> dplyr::filter(Income <= 150 | is.na(Income))
#'
#' rp <- recipe(train_data) |>
#' check_missing(Income) |>
#' prep()
#'
#' bake(rp, train_data)
#' \dontrun{
#' bake(rp, test_data)
#' }
check_missing <- function(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  columns = NULL,
  skip = FALSE,
  id = rand_id("missing")
) {
  # Capture the selectors in `...` as quosures and bundle them, together with
  # the remaining settings, into a "missing" check object.
  missing_check <- check_missing_new(
    terms = enquos(...),
    role = role,
    trained = trained,
    columns = columns,
    skip = skip,
    id = id
  )

  # Hand the new check to `add_check()` along with the recipe and return
  # whatever it returns.
  add_check(recipe, missing_check)
}
check_missing_new <- function(terms, role, trained, columns, skip, id) {
  # Internal constructor: forwards every field unchanged to `check()`
  # (defined elsewhere), tagged with subclass "missing" and prefix "check_".
  # The argument order is kept as-is because it may determine the order of
  # the fields in the resulting object.
  check(
    subclass = "missing",
    prefix = "check_",
    terms = terms,
    role = role,
    trained = trained,
    columns = columns,
    skip = skip,
    id = id
  )
}
#' @export
prep.check_missing <- function(x, training, info = NULL, ...) {
  # Resolve the stored selectors against the training data.
  selected <- recipes_eval_select(x$terms, training, info)

  # Rebuild the check with the resolved columns and flag it as trained; all
  # other fields are carried over from `x` untouched.
  check_missing_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    columns = selected,
    skip = x$skip,
    id = x$id
  )
}
#' @export
bake.check_missing <- function(object, new_data, ...) {
  # Apply the trained check to `new_data`.
  #
  # object:   a trained check; `object$columns` holds the columns to inspect.
  # new_data: the data to inspect.
  # Returns `new_data` unchanged when none of the inspected columns contain
  # `NA`; otherwise aborts with an error naming the offending columns.
  col_names <- object$columns

  # Defined elsewhere; presumably verifies that the required columns are
  # present in `new_data` before they are subset below.
  check_new_data(col_names, object, new_data)

  # Number of missing values per inspected column.
  na_counts <- colSums(is.na(new_data[col_names]))
  with_na <- names(na_counts)[na_counts > 0]

  if (length(with_na) > 0) {
    # `cli::qty()` sets the quantity for the pluralization markers so the
    # message reads correctly for one column as well as for several.
    cli::cli_abort(
      paste0(
        "The following {cli::qty(with_na)}column{?s} ",
        "{?contains/contain} missing values: {with_na}."
      )
    )
  }

  new_data
}
#' @export
print.check_missing <-
  function(x, width = max(20, options()$width - 30), ...) {
    # Delegate the formatting to `print_step()`, passing the resolved columns,
    # the original selectors, the trained flag, a fixed title and the width.
    print_step(
      x$columns,
      x$terms,
      x$trained,
      "Check missing values for ",
      width
    )

    # Return the check invisibly, as is conventional for print methods.
    invisible(x)
  }
#' @rdname tidy.recipe
#' @export
tidy.check_missing <- function(x, ...) {
  # Trained checks report the resolved column names (with names stripped);
  # untrained checks report the selectors converted to character.
  term_values <- if (is_trained(x)) {
    unname(x$columns)
  } else {
    sel2char(x$terms)
  }

  out <- tibble(terms = term_values)
  # Attach the check's identifier as an additional column.
  out$id <- x$id
  out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.