# Nothing
#' @title Examples of preprocess_fn functions
#' @description
#'  \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
#'
#'  Examples of preprocess functions that can be used in
#'  \code{\link[cvms:cross_validate_fn]{cross_validate_fn()}} and
#'  \code{\link[cvms:validate_fn]{validate_fn()}}.
#'  They can either be used directly or be starting points.
#'
#'  The examples use \code{\link[recipes:recipe]{recipes}},
#'  but you can also use \code{caret::preProcess()} or
#'  similar functions.
#'
#'  In these examples, the preprocessing will only affect the numeric predictors.
#'
#'  You may prefer to hardcode a formula like \code{"y ~ ."} (where
#'  \code{y} is your dependent variable) as that will allow you to set
#'  \strong{`preprocess_once`} to \code{TRUE} in \code{\link[cvms:cross_validate_fn]{cross_validate_fn()}}
#'  and \code{\link[cvms:validate_fn]{validate_fn()}} and save time.
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @family example functions
#' @return A function with the following form:
#'
#'  \code{function(train_data, test_data, formula, hyperparameters) \{}
#'
#'  \verb{ }\code{# Preprocess train_data and test_data}
#'
#'  \verb{ }\code{# Return a list with the preprocessed datasets}
#'
#'  \verb{ }\code{# and optionally a data frame with preprocessing parameters}
#'
#'  \verb{ }\code{list(}
#'
#'  \verb{ }\code{"train" = train_data,}
#'
#'  \verb{ }\code{"test" = test_data,}
#'
#'  \verb{ }\code{"parameters" = tidy_parameters}
#'
#'  \verb{ }\code{)}
#'
#'  \code{\}}
#' @param name Name of preprocessing function
#'  as it appears in the following list. Must be a single,
#'  non-missing value; any other name raises an error.
#'
#'  \tabular{rr}{
#'   \strong{Name} \tab \strong{Description} \cr
#'   "standardize" \tab Centers and scales the numeric predictors\cr
#'   "range" \tab Normalizes the numeric predictors to the 0-1 range\cr
#'   "scale" \tab Scales the numeric predictors to have a standard deviation of one\cr
#'   "center" \tab Centers the numeric predictors to have a mean of zero\cr
#'   "warn" \tab Identity function that throws a warning and a message\cr
#'  }
preprocess_functions <- function(name) {

  # Fail early with a readable message. Without this guard, NULL, NA,
  # zero-length or multi-element input would reach `if (name == ...)` and
  # produce a cryptic base-R condition error (or, in older R versions,
  # silently use only the first element).
  if (length(name) != 1L || is.na(name)) {
    stop("`name` must be a single, non-missing value.")
  }

  # NOTE: Each returned function is intentionally self-contained (no shared
  # helper for the repeated recipe code), so that it can be printed and
  # copied as a starting point for a custom preprocess_fn.

  if (name == "standardize") {
    preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {

      # Create simplified version of the formula
      # as recipe() does not like inline functions
      # like log() or random effect structures like (1|z)
      # Example:
      # "y ~ log(x) + (1 | z)"  becomes  "y ~ x + z"
      formula <- simplify_formula(formula, train_data)

      # Create recipes object
      recipe_object <- recipes::recipe(

        # Note: If we hardcoded the formula instead of using the formula argument
        # we could preprocess the train/test splits once
        # instead of for every formula
        # Tip: Use `y ~ .` to include all predictors
        # (where `y` is your dependent variable)
        formula = formula,
        data = train_data
      ) %>%

        # Add preprocessing steps
        # Note: We could add specific variables to each step
        # instead of just selecting all numeric predictors
        recipes::step_center(recipes::all_numeric(), -recipes::all_outcomes()) %>%
        recipes::step_scale(recipes::all_numeric(), -recipes::all_outcomes()) %>%

        # Find parameters from the training set
        recipes::prep(training = train_data)

      # Apply preprocessing to the partitions
      train_data <- recipes::bake(recipe_object, train_data)
      test_data <- recipes::bake(recipe_object, test_data)

      # Extract the preprocessing parameters
      # (step 1 is the centering step, step 2 is the scaling step)
      means <- recipe_object$steps[[1]]$means
      sds <- recipe_object$steps[[2]]$sds

      # Add preprocessing parameters to a tibble
      tidy_parameters <- tibble::tibble("Measure" = c("Mean", "SD")) %>%
        dplyr::bind_cols(dplyr::bind_rows(means, sds))

      list(
        "train" = train_data,
        "test" = test_data,
        "parameters" = tidy_parameters
      )
    }
  } else if (name == "scale") {
    preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {

      # Create simplified version of the formula
      # as recipe() does not like inline functions
      # like log() or random effect structures like (1|z)
      # Example:
      # "y ~ log(x) + (1 | z)"  becomes  "y ~ x + z"
      formula <- simplify_formula(formula, train_data)

      # Create recipes object
      recipe_object <- recipes::recipe(

        # Note: If we hardcoded the formula instead of using the formula argument
        # we could preprocess the train/test splits once
        # instead of for every formula
        # Tip: Use `y ~ .` to include all predictors
        # (where `y` is your dependent variable)
        formula = formula,
        data = train_data
      ) %>%

        # Add preprocessing steps
        # Note: We could add specific variables to each step
        # instead of just selecting all numeric predictors
        recipes::step_scale(recipes::all_numeric(), -recipes::all_outcomes()) %>%

        # Find parameters from the training set
        recipes::prep(training = train_data)

      # Apply preprocessing to the partitions
      train_data <- recipes::bake(recipe_object, train_data)
      test_data <- recipes::bake(recipe_object, test_data)

      # Extract the preprocessing parameters
      sds <- recipe_object$steps[[1]]$sds

      # Add preprocessing parameters to a tibble
      tidy_parameters <- tibble::tibble("Measure" = c("SD")) %>%
        dplyr::bind_cols(dplyr::bind_rows(sds))

      list(
        "train" = train_data,
        "test" = test_data,
        "parameters" = tidy_parameters
      )
    }
  } else if (name == "center") {
    preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {

      # Create simplified version of the formula
      # as recipe() does not like inline functions
      # like log() or random effect structures like (1|z)
      # Example:
      # "y ~ log(x) + (1 | z)"  becomes  "y ~ x + z"
      formula <- simplify_formula(formula, train_data)

      # Create recipes object
      recipe_object <- recipes::recipe(

        # Note: If we hardcoded the formula instead of using the formula argument
        # we could preprocess the train/test splits once
        # instead of for every formula
        # Tip: Use `y ~ .` to include all predictors
        # (where `y` is your dependent variable)
        formula = formula,
        data = train_data
      ) %>%

        # Add preprocessing steps
        # Note: We could add specific variables to each step
        # instead of just selecting all numeric predictors
        recipes::step_center(recipes::all_numeric(), -recipes::all_outcomes()) %>%

        # Find parameters from the training set
        recipes::prep(training = train_data)

      # Apply preprocessing to the partitions
      train_data <- recipes::bake(recipe_object, train_data)
      test_data <- recipes::bake(recipe_object, test_data)

      # Extract the preprocessing parameters
      means <- recipe_object$steps[[1]]$means

      # Add preprocessing parameters to a tibble
      tidy_parameters <- tibble::tibble("Measure" = c("Mean")) %>%
        dplyr::bind_cols(dplyr::bind_rows(means))

      list(
        "train" = train_data,
        "test" = test_data,
        "parameters" = tidy_parameters
      )
    }
  } else if (name == "range") {
    preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {

      # Create simplified version of the formula
      # as recipe() does not like inline functions
      # like log() or random effect structures like (1|z)
      # Example:
      # "y ~ log(x) + (1 | z)"  becomes  "y ~ x + z"
      formula <- simplify_formula(formula, train_data)

      # Create recipes object
      recipe_object <- recipes::recipe(

        # Note: If we hardcoded the formula instead of using the formula argument
        # we could preprocess the train/test splits once
        # instead of for every formula
        # Tip: Use `y ~ .` to include all predictors
        # (where `y` is your dependent variable)
        formula = formula,
        data = train_data
      ) %>%

        # Add preprocessing steps
        # Note: We could add specific variables to each step
        # instead of just selecting all numeric predictors
        recipes::step_range(recipes::all_numeric(), -recipes::all_outcomes(),
          min = 0.0, max = 1.0
        ) %>%

        # Find parameters from the training set
        recipes::prep(training = train_data)

      # Apply preprocessing to the partitions
      train_data <- recipes::bake(recipe_object, train_data)
      test_data <- recipes::bake(recipe_object, test_data)

      # Extract the preprocessing parameters
      ranges <- dplyr::as_tibble(recipe_object$steps[[1]]$ranges)

      # Add preprocessing parameters to a tibble
      tidy_parameters <- tibble::tibble("Measure" = c("Min", "Max")) %>%
        dplyr::bind_cols(ranges)

      list(
        "train" = train_data,
        "test" = test_data,
        "parameters" = tidy_parameters
      )
    }
  } else if (name == "warn") {
    preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {

      # For testing that warnings and messages are caught
      # Throw a warning and a message
      warning("This is a preprocess_fn warning")
      message("This is a preprocess_fn message")

      # Identity: the partitions are returned untouched
      # and no "parameters" element is included
      list(
        "train" = train_data,
        "test" = test_data
      )
    }
  } else {
    stop(paste0("Could not find '", name, "'."))
  }

  preprocess_fn
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.