#' Check the dataset contains enough data points.
#'
#' `check_enough_data` creates a *specification* of a recipe
#' operation that will check if variables contain enough data.
#'
#' @param recipe A recipe object. The check will be added to the
#' sequence of operations for this recipe.
#' @param ... One or more selector functions to choose variables for this check.
#' See [selections()] for more details. You will usually want to use
#' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
#' @param min_observations The minimum number of data points required for
#' training. If this is NULL, the total number of predictors will be used.
#' @param epi_keys A character vector of column names on which to group the data
#' and check threshold within each group. Useful if your forecaster trains
#' per group (for example, per geo_value).
#' @param drop_na A logical for whether to count NA values as valid rows.
#' @param role Not used by this check since no new variables are
#' created.
#' @param trained A logical for whether the selectors in `...`
#' have been resolved by [prep()].
#' @param id A character string that is unique to this check to identify it.
#' @param skip A logical. If `TRUE`, only training data is checked, while if
#' `FALSE`, both training and predicting data is checked. Technically, this
#' answers the question "should the check be skipped when the recipe is baked
#' by [bake()]?" While all operations are baked when [prep()] is run, some
#' operations may not be able to be conducted on new data (e.g. processing the
#' outcome variable(s)). Care should be taken when using `skip = TRUE` as it
#' may affect the computations for subsequent operations.
#' @family checks
#' @export
#' @details This check will break the `prep` and/or bake function if any of the
#' checked columns have not enough non-NA values. If the check passes, nothing
#' is changed in the data. It is best used after every other step.
#'
#' For checking training data, it is best to set `...` to be
#' `all_predictors(), all_outcomes()`, while for checking prediction data, it
#' is best to set `...` to be `all_predictors()` only, with `n = 1`.
#'
#' # tidy() results
#'
#' When you [`tidy()`][tidy.recipe()] this check, a tibble with column
#' `terms` (the selectors or variables selected) is returned.
#'
check_enough_data <-
function(recipe,
...,
min_observations = NULL,
epi_keys = NULL,
drop_na = TRUE,
role = NA,
trained = FALSE,
skip = TRUE,
id = rand_id("enough_data")) {
recipes::add_check(
recipe,
check_enough_data_new(
min_observations = min_observations,
epi_keys = epi_keys,
drop_na = drop_na,
terms = enquos(...),
role = role,
trained = trained,
columns = NULL,
skip = skip,
id = id
)
)
}
check_enough_data_new <-
function(min_observations, epi_keys, drop_na, terms,
role, trained, columns, skip, id) {
recipes::check(
subclass = "enough_data",
prefix = "check_",
min_observations = min_observations,
epi_keys = epi_keys,
drop_na = drop_na,
terms = terms,
role = role,
trained = trained,
columns = columns,
skip = skip,
id = id
)
}
#' @export
prep.check_enough_data <- function(x, training, info = NULL, ...) {
col_names <- recipes::recipes_eval_select(x$terms, training, info)
if (is.null(x$min_observations)) {
x$min_observations <- length(col_names)
}
check_enough_data_core(training, x, col_names, "train")
check_enough_data_new(
min_observations = x$min_observations,
epi_keys = x$epi_keys,
drop_na = x$drop_na,
terms = x$terms,
role = x$role,
trained = TRUE,
columns = col_names,
skip = x$skip,
id = x$id
)
}
#' @export
bake.check_enough_data <- function(object, new_data, ...) {
col_names <- object$columns
check_enough_data_core(new_data, object, col_names, "predict")
new_data
}
#' @export
print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
title <- paste0("Check enough data (n = ", x$min_observations, ") for ")
recipes::print_step(x$columns, x$terms, x$trained, title, width)
invisible(x)
}
#' @export
tidy.check_enough_data <- function(x, ...) {
if (recipes::is_trained(x)) {
res <- tibble(terms = unname(x$columns))
} else {
res <- tibble(terms = recipes::sel2char(x$terms))
}
res$id <- x$id
res$min_observations <- x$min_observations
res$epi_keys <- x$epi_keys
res$drop_na <- x$drop_na
res
}
check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) {
epi_df <- epi_df %>%
group_by(across(all_of(.env$step_obj$epi_keys)))
if (step_obj$drop_na) {
any_missing_data <- epi_df %>%
mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
# count the number of rows where they're all not na
summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop")
any_missing_data <- any_missing_data %>%
summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
any()
# figuring out which individual columns (if any) are to blame for this dearth
# of data
cols_not_enough_data <- epi_df %>%
summarise(
across(
all_of(.env$col_names),
~ sum(!is.na(.x)) < .env$step_obj$min_observations
),
.groups = "drop"
) %>%
# Aggregate across keys (if present)
summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
unlist() %>%
# Select the names of the columns that are TRUE
names(.)[.]
# Either all columns have enough data, in which case this message won't be
# sent later or none of the single columns have enough data, that means its
# the combination of all of them.
if (length(cols_not_enough_data) == 0) {
cols_not_enough_data <-
glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}")
}
} else {
# if we're not dropping na values, just count
cols_not_enough_data <- epi_df %>%
summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations))
any_missing_data <- cols_not_enough_data %>%
summarize(across(all_of(.env$col_names), all)) %>%
all()
cols_not_enough_data <- cols_not_enough_data %>%
summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
unlist() %>%
# Select the names of the columns that are TRUE
names(.)[.]
}
if (any_missing_data) {
cli_abort(
"The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.",
class = "epipredict__not_enough_data"
)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.