test_that("simple forge works", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
matrix_bp <- default_recipe_blueprint(composition = "matrix")
rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
rec <- recipes::step_normalize(rec, Sepal.Length)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
x3 <- mold(rec, iris, blueprint = matrix_bp)
xx1 <- forge(iris, x1$blueprint)
xx2 <- forge(iris, x2$blueprint)
xx3 <- forge(iris, x3$blueprint)
expect_s3_class(
xx1$predictors,
"tbl_df"
)
expect_s4_class(
xx2$predictors,
"dgCMatrix"
)
expect_matrix(
xx3$predictors
)
expect_equal(
colnames(xx1$predictors),
"Sepal.Length"
)
expect_equal(
colnames(xx2$predictors),
"Sepal.Length"
)
expect_equal(
colnames(xx3$predictors),
"Sepal.Length"
)
expect_equal(
xx1$outcomes,
NULL
)
expect_equal(
xx2$outcomes,
NULL
)
expect_equal(
xx3$outcomes,
NULL
)
})
test_that("asking for the outcome works", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
matrix_bp <- default_recipe_blueprint(composition = "matrix")
rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
rec <- recipes::step_normalize(rec, Sepal.Length)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
x3 <- mold(rec, iris, blueprint = matrix_bp)
xx1 <- forge(iris, x1$blueprint, outcomes = TRUE)
xx2 <- forge(iris, x2$blueprint, outcomes = TRUE)
xx3 <- forge(iris, x2$blueprint, outcomes = TRUE)
expect_equal(
xx1$outcomes,
tibble::tibble(Species = iris$Species)
)
expect_equal(
xx2$outcomes,
tibble::tibble(Species = iris$Species)
)
expect_equal(
xx3$outcomes,
tibble::tibble(Species = iris$Species)
)
})
test_that("asking for the outcome when it isn't there fails", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
iris2 <- iris
iris2$Species <- NULL
expect_error(
forge(iris2, x1$blueprint, outcomes = TRUE),
"The following required columns"
)
expect_error(
forge(iris2, x2$blueprint, outcomes = TRUE),
"The following required columns"
)
})
test_that("outcomes steps get processed", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(Sepal.Width ~ Sepal.Length, data = iris)
rec <- recipes::step_log(rec, Sepal.Width)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
xx1 <- forge(iris, x1$blueprint, outcomes = TRUE)
xx2 <- forge(iris, x2$blueprint, outcomes = TRUE)
expect_equal(
xx1$outcomes$Sepal.Width,
log(iris$Sepal.Width)
)
expect_equal(
xx2$outcomes$Sepal.Width,
log(iris$Sepal.Width)
)
})
test_that("missing predictor columns fail appropriately", {
x <- mold(
recipes::recipe(Species ~ Sepal.Length + Sepal.Width, data = iris),
iris
)
expect_error(
forge(iris[, 1, drop = FALSE], x$blueprint),
"Sepal.Width"
)
expect_error(
forge(iris[, 3, drop = FALSE], x$blueprint),
"'Sepal.Length', 'Sepal.Width'"
)
})
test_that("novel predictor levels are caught", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
dat <- data.frame(
y = 1:4,
f = factor(letters[1:4])
)
new <- data.frame(
y = 1:5,
f = factor(letters[1:5])
)
x1 <- mold(recipes::recipe(y ~ f, dat), dat)
x2 <- mold(recipes::step_dummy(recipes::recipe(y ~ f, dat), f), dat, blueprint = sparse_bp)
expect_snapshot({
xx1 <- forge(new, x1$blueprint)
})
expect_snapshot({
xx2 <- forge(new, x2$blueprint)
})
expect_equal(
xx1$predictors[[5, 1]],
factor(NA_real_, levels = c("a", "b", "c", "d"))
)
expect_equal(
unname(xx2$predictors[5, ]),
rep(NA_real_, 3)
)
})
test_that("novel predictor levels can be ignored and handled by recipes", {
dat <- data.frame(
y = 1:4,
f = factor(letters[1:4])
)
new <- data.frame(
y = 1:5,
f = factor(letters[1:5])
)
bp1 <- default_recipe_blueprint(allow_novel_levels = TRUE)
bp2 <- default_recipe_blueprint(allow_novel_levels = TRUE, composition = "dgCMatrix")
rec1 <- recipes::recipe(y ~ f, dat)
rec2 <- recipes::step_novel(rec1, f)
rec3 <- recipes::step_dummy(rec2, f)
x1 <- mold(rec1, dat, blueprint = bp1)
x2 <- mold(rec2, dat, blueprint = bp1)
x3 <- mold(rec3, dat, blueprint = bp2)
# Recipes will silently handle the novel level
expect_snapshot({
xx1 <- forge(new, x1$blueprint)
})
expect_equal(
xx1$predictors$f[[5]],
factor(NA_real_, levels = c("a", "b", "c", "d"))
)
# `step_novel()` let's us handle the novel level differently
expect_snapshot({
xx2 <- forge(new, x2$blueprint)
})
expect_equal(
xx2$predictors$f[[5]],
factor("new", levels = c("a", "b", "c", "d", "new"))
)
# Silent
expect_snapshot({
xx3 <- forge(new, x3$blueprint)
})
expect_equal(
colnames(xx3$predictors),
c("f_b", "f_c", "f_d", "f_new")
)
})
test_that("novel predictor levels without any data are silently removed", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
dat <- data.frame(
y = 1:4,
f = factor(letters[1:4])
)
new <- data.frame(
y = 1:5,
f = factor(letters[1:5])
)
# The 'e' level exists, but there is no data for it!
new <- new[1:4, ]
rec1 <- recipes::recipe(y ~ f, dat)
rec2 <- recipes::step_dummy(rec1, f)
x1 <- mold(rec1, dat)
x2 <- mold(rec2, dat, blueprint = sparse_bp)
expect_silent(
xx1 <- forge(new, x1$blueprint)
)
expect_equal(
colnames(xx1$predictors),
colnames(x1$predictors)
)
expect_silent(
xx2 <- forge(new, x2$blueprint)
)
expect_equal(
colnames(xx2$predictors),
c("f_b", "f_c", "f_d")
)
})
test_that("novel outcome levels are caught", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
dat <- data.frame(
y = 1:4,
f = factor(letters[1:4])
)
new <- data.frame(
y = 1:5,
f = factor(letters[1:5])
)
rec1 <- recipes::recipe(f ~ y, dat)
x1 <- mold(rec1, dat)
x2 <- mold(rec1, dat, blueprint = sparse_bp)
expect_snapshot({
xx1 <- forge(new, x1$blueprint, outcomes = TRUE)
})
expect_equal(
xx1$outcomes[[5, 1]],
factor(NA_real_, levels = c("a", "b", "c", "d"))
)
expect_snapshot({
xx2 <- forge(new, x2$blueprint, outcomes = TRUE)
})
expect_equal(
xx2$outcomes[[5, 1]],
factor(NA_real_, levels = c("a", "b", "c", "d"))
)
})
test_that("original predictor and outcome classes / names are recorded", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(Sepal.Length ~ Species, data = iris)
rec <- recipes::step_dummy(rec, Species)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
expect_equal(
colnames(x1$blueprint$ptypes$predictors),
"Species"
)
expect_equal(
colnames(x2$blueprint$ptypes$predictors),
"Species"
)
expect_equal(
colnames(x1$blueprint$ptypes$outcomes),
"Sepal.Length"
)
expect_equal(
colnames(x2$blueprint$ptypes$outcomes),
"Sepal.Length"
)
expect_equal(
get_data_classes(x1$blueprint$ptypes$predictors),
list(Species = "factor")
)
expect_equal(
get_data_classes(x2$blueprint$ptypes$predictors),
list(Species = "factor")
)
expect_equal(
get_data_classes(x1$blueprint$ptypes$outcomes),
list(Sepal.Length = "numeric")
)
expect_equal(
get_data_classes(x2$blueprint$ptypes$outcomes),
list(Sepal.Length = "numeric")
)
})
test_that("new data classes are caught", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
iris2 <- iris
iris2$Species <- as.character(iris2$Species)
rec <- recipes::recipe(Sepal.Length ~ Species, iris)
x1 <- mold(rec, iris)
x2 <- mold(recipes::step_dummy(rec, Species), iris, blueprint = sparse_bp)
# Silent recovery
expect_error(
xx1 <- forge(iris2, x1$blueprint),
NA
)
expect_error(
xx2 <- forge(iris2, x2$blueprint),
NA
)
expect_s3_class(
xx1$predictors$Species,
"factor"
)
expect_s4_class(
xx2$predictors,
"dgCMatrix"
)
x3 <- mold(recipes::recipe(Species ~ Sepal.Length, iris), iris)
x4 <- mold(recipes::recipe(Species ~ Sepal.Length, iris), iris, blueprint = sparse_bp)
expect_error(
xx3 <- forge(iris2, x3$blueprint, outcomes = TRUE),
NA
)
expect_error(
xx4 <- forge(iris2, x4$blueprint, outcomes = TRUE),
NA
)
expect_s3_class(
xx3$outcomes$Species,
"factor"
)
expect_s3_class(
xx4$outcomes$Species,
"factor"
)
})
test_that("new data classes can interchange integer/numeric", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
iris2 <- iris
iris2$Sepal.Length <- as.integer(iris2$Sepal.Length)
x1 <- mold(recipes::recipe(Species ~ Sepal.Length, iris), iris)
x2 <- mold(recipes::recipe(Species ~ Sepal.Length, iris), iris, blueprint = sparse_bp)
expect_error(
forge(iris2, x1$blueprint),
NA
)
expect_error(
forge(iris2, x2$blueprint),
NA
)
rec <- recipes::recipe(Sepal.Length ~ Species, iris)
x3 <- mold(rec, iris)
x4 <- mold(recipes::step_dummy(rec, Species), iris, blueprint = sparse_bp)
expect_error(
forge(iris2, x3$blueprint, outcomes = TRUE),
NA
)
expect_error(
forge(iris2, x4$blueprint, outcomes = TRUE),
NA
)
})
test_that("an `extras` slot exists for `roles`", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
xx1 <- forge(iris, x1$blueprint)
xx2 <- forge(iris, x2$blueprint)
expect_equal(
xx1$extras,
list(roles = NULL)
)
expect_equal(
xx2$extras,
list(roles = NULL)
)
})
test_that("non-standard roles generated in the recipe are returned by both `mold()` and `forge()`", {
# This is a convention we follow to have consistency between `mold()` and `forge()`.
# Both should have the same `.$extras$roles$<name>` slot names.
rec <- recipes::recipe(Species ~ ., iris)
rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3)
x <- mold(rec, iris)
xx <- forge(iris, x$blueprint)
expect_identical(x$blueprint$extra_role_ptypes, NULL)
# Same slots in both `mold()` and `forge()` results
expect_identical(
colnames(x$extras$roles$dummy),
paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_")
)
expect_identical(
colnames(xx$extras$roles$dummy),
paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_")
)
})
test_that("`forge()` returns a slot for non standard roles that aren't required at `bake()` time", {
# This is a convention we follow to have consistency between `mold()` and `forge()`.
# Both should have the same `.$extras$roles$<name>` slot names, even if there
# wasn't any data there at `forge()` time (because it wasn't a required role).
rec <- recipes::recipe(Species ~ ., iris)
rec <- recipes::update_role(rec, Sepal.Width, new_role = "id")
rec <- recipes::update_role_requirements(rec, "id", bake = FALSE)
rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3)
x <- mold(rec, iris)
xx <- forge(iris, x$blueprint)
expect_identical(x$blueprint$extra_role_ptypes, NULL)
# Same slots in both `mold()` and `forge()` results
expect_identical(
colnames(x$extras$roles$dummy),
paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_")
)
expect_identical(
colnames(xx$extras$roles$dummy),
paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_")
)
expect_identical(
x$extras$roles$id,
tibble(Sepal.Width = iris$Sepal.Width)
)
# There is a slot for this, but no data is returned here, because `"id"`
# wasn't a required role, so `Sepal.Width` was removed before being passed
# to `bake()`
expect_identical(
xx$extras$roles$id,
tibble::new_tibble(x = list(), nrow = nrow(iris))
)
})
test_that("required non standard roles can be dropped during the baking process", {
rec <- recipes::recipe(Species ~ ., iris)
rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1")
rec <- recipes::update_role(rec, Sepal.Length, new_role = "dummy2")
rec <- recipes::step_rm(rec, Sepal.Length)
x <- mold(rec, iris)
xx <- forge(iris, x$blueprint)
# `extra_role_ptypes` holds ptypes for both
expect_identical(
x$blueprint$extra_role_ptypes$dummy1,
tibble::tibble(Sepal.Width = double())
)
expect_identical(
x$blueprint$extra_role_ptypes$dummy2,
tibble::tibble(Sepal.Length = double())
)
# But `Sepal.Length` got dropped along the way and isn't in the `term_info`
# of the recipe, so it doesn't appear in the final result
expect_identical(
x$extras$roles,
list(dummy1 = tibble::tibble(Sepal.Width = iris$Sepal.Width))
)
expect_identical(
xx$extras$roles,
list(dummy1 = tibble::tibble(Sepal.Width = iris$Sepal.Width))
)
})
test_that("`forge()` will error if required non standard roles are missing", {
rec <- recipes::recipe(Species ~ ., iris)
rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1")
x <- mold(rec, iris)
iris$Sepal.Width <- NULL
expect_snapshot(error = TRUE, {
forge(iris, x$blueprint)
})
})
test_that("recipes will error if the role is declared as not required, but really was", {
rec <- recipes::recipe(Species ~ ., iris)
rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1")
rec <- recipes::update_role_requirements(rec, "dummy1", bake = FALSE)
rec <- recipes::step_log(rec, Sepal.Width)
x <- mold(rec, iris)
# Error is specific to however `step_log()` handles this
expect_error(forge(iris, x$blueprint))
})
test_that("`NA` roles are treated as extra roles that are required at `forge()` time", {
# `Petal.Length`, `Petal.Width` have `NA` roles
rec <- recipes::recipe(iris)
rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor")
rec <- recipes::update_role(rec, Species, new_role = "outcome")
rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom")
x <- mold(rec, iris)
xx <- forge(iris, x$blueprint)
# They are returned in `forge()` as extras
expect_identical(
xx$extras$roles$`NA`,
tibble(Petal.Length = iris$Petal.Length, Petal.Width = iris$Petal.Width)
)
# They are required at `forge()` time
iris$Petal.Length <- NULL
expect_snapshot(error = TRUE, {
forge(iris, x$blueprint)
})
})
test_that("`NA` roles can be declared as not required at `forge()` time", {
# Petal.Length, Petal.Width have `NA` roles
rec <- recipes::recipe(iris)
rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor")
rec <- recipes::update_role(rec, Species, new_role = "outcome")
rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom")
rec <- recipes::update_role_requirements(rec, "NA", bake = FALSE)
x <- mold(rec, iris)
# No longer required at `forge()` time
iris$Petal.Length <- NULL
xx <- forge(iris, x$blueprint)
# And `Petal.Width` (which is still in `iris`) won't be in the output
expect_identical(
xx$extras$roles$`NA`,
tibble::new_tibble(x = list(), nrow = nrow(iris))
)
})
test_that("`extras` only hold roles that actually exist in the data", {
df <- tibble(y = 1, x = 1, z = 2)
rec <- recipes::recipe(y ~ ., df)
rec <- recipes::update_role(rec, x, new_role = "dummy")
# For example `default_bake_role_requirements()` specifies that `NA` is
# a required role, but there aren't any columns with that role in `df`
x <- mold(rec, df)
xx <- forge(df, x$blueprint)
expect_named(xx$extras$roles, "dummy")
})
test_that("Missing y value still returns `NULL` if no outcomes are asked for", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(~Sepal.Width, data = iris)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
expect_equal(forge(iris, x1$blueprint)$outcomes, NULL)
expect_equal(forge(iris, x2$blueprint)$outcomes, NULL)
})
test_that("Missing y value returns 0 column tibble if outcomes are asked for", {
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
rec <- recipes::recipe(~Sepal.Width, data = iris)
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
xx1 <- forge(iris, x1$blueprint, outcomes = TRUE)
xx2 <- forge(iris, x2$blueprint, outcomes = TRUE)
expect_equal(nrow(xx1$outcomes), 150)
expect_equal(ncol(xx1$outcomes), 0)
expect_equal(nrow(xx2$outcomes), 150)
expect_equal(ncol(xx2$outcomes), 0)
})
test_that("Predictors with multiple roles are only included once before baking (#120)", {
rec <- recipes::recipe(Species ~ ., iris) # Implicit "predictor" role too
rec <- recipes::add_role(rec, Sepal.Length, new_role = "test1")
rec <- recipes::add_role(rec, Sepal.Length, new_role = "test2")
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
x1 <- mold(rec, iris)
x2 <- mold(rec, iris, blueprint = sparse_bp)
xx1 <- forge(iris, x1$blueprint)
xx2 <- forge(iris, x2$blueprint)
expect_true("Sepal.Length" %in% colnames(xx1$predictors))
expect_true("Sepal.Length" %in% colnames(xx1$extras$roles$test1))
expect_true("Sepal.Length" %in% colnames(xx1$extras$roles$test2))
expect_true("Sepal.Length" %in% colnames(xx2$predictors))
expect_true("Sepal.Length" %in% colnames(xx2$extras$roles$test1))
expect_true("Sepal.Length" %in% colnames(xx2$extras$roles$test2))
})
test_that("`strings_as_factors` is used at `bake()` time", {
df <- tibble(y = "a", x = "b")
rec <- recipes::recipe(y ~ x, data = df)
# Default is `TRUE`
x <- mold(rec, df)
out <- forge(df, x$blueprint, outcomes = TRUE)
expect_identical(out$outcomes$y, factor("a"))
expect_identical(out$predictors$x, factor("b"))
x <- mold(rec, df, blueprint = default_recipe_blueprint(strings_as_factors = FALSE))
out <- forge(df, x$blueprint, outcomes = TRUE)
expect_identical(out$outcomes$y, "a")
expect_identical(out$predictors$x, "b")
})
test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints with a basic recipe", {
path <- test_path("data", "hardhat-0.2.0-post-mold-recipe.rds")
object <- readRDS(path)
new_data <- object$new_data
blueprint <- object$blueprint
out <- forge(new_data, blueprint)
expect <- recipes::bake(blueprint$recipe, new_data, recipes::all_predictors())
expect_identical(out$predictors, expect)
expect_identical(out$extras, list(roles = NULL))
new_data$x <- NULL
expect_snapshot(error = TRUE, {
forge(new_data, blueprint)
})
})
test_that("`forge()` is compatible with hardhat 0.2.0 molded blueprints with a recipe with a nonstandard role", {
path <- test_path("data", "hardhat-0.2.0-post-mold-recipe-nonstandard-role.rds")
object <- readRDS(path)
new_data <- object$new_data
blueprint <- object$blueprint
out <- forge(new_data, blueprint)
expect <- recipes::bake(blueprint$recipe, new_data, recipes::all_predictors())
expect_identical(out$predictors, expect)
expect <- recipes::bake(blueprint$recipe, new_data, id)
expect_identical(out$extras, list(roles = list(id = expect)))
new_data$id <- NULL
expect_snapshot(error = TRUE, {
forge(new_data, blueprint)
})
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.