recipes: Preprocessing and Feature Engineering Steps for Modeling

library(testthat)
library(recipes)

skip_if_not_installed("modeldata")
data(biomass, package = "modeldata")
biomass_tr <- biomass[biomass$dataset == "Training", ]
biomass_te <- biomass[biomass$dataset == "Testing", ]

test_that("Recipe correctly identifies output variable", {
  raw_recipe <- recipe(HHV ~ ., data = biomass)
  var_info <- raw_recipe$var_info
  expect_true(is_tibble(var_info))
  outcome_ind <- which(var_info$variable == "HHV")
  expect_true(var_info$role[outcome_ind] == "outcome")
  expect_true(all(var_info$role[-outcome_ind] == rep("predictor", ncol(biomass) - 1)))
})

test_that("Recipe fails on in-line functions", {
  expect_snapshot(error = TRUE,
    recipe(HHV ~ log(nitrogen), data = biomass)
  )
  expect_snapshot(error = TRUE,
    recipe(HHV ~ (.)^2, data = biomass)
  )
  expect_snapshot(error = TRUE,
    recipe(HHV ~ nitrogen + sulfur + nitrogen:sulfur, data = biomass)
  )
  expect_snapshot(error = TRUE,
    recipe(HHV ~ nitrogen^2, data = biomass)
  )
})

test_that("return character or factor values", {
  raw_recipe <- recipe(HHV ~ ., data = biomass)
  centered <- raw_recipe %>%
    step_center(carbon, hydrogen, oxygen, nitrogen, sulfur)

  centered_char <- prep(centered, training = biomass, strings_as_factors = FALSE)
  char_var <- bake(centered_char, new_data = head(biomass))
  expect_equal(class(char_var$sample), "character")

  centered_fac <- prep(centered, training = biomass, strings_as_factors = TRUE)
  fac_var <- bake(centered_fac, new_data = head(biomass))
  expect_equal(class(fac_var$sample), "factor")
  expect_equal(levels(fac_var$sample), sort(unique(biomass$sample)))
})


test_that("Using prepare", {
  expect_snapshot(error = TRUE,
    prepare(recipe(HHV ~ ., data = biomass),
      training = biomass
    )
  )
})

test_that("Multiple variables on lhs of formula", {
  # from issue #96
  expect_silent(multi_1 <- recipe(Petal.Width + Species ~ ., data = iris))
  expect_equal(
    multi_1$var_info$variable[multi_1$var_info$role == "outcome"],
    names(iris)[4:5]
  )
  expect_equal(
    multi_1$var_info$variable[multi_1$var_info$role == "predictor"],
    names(iris)[1:3]
  )

  iris$Species <- as.character(iris$Species)
  expect_silent(multi_2 <- recipe(Petal.Width + Species ~ ., data = iris))
  expect_equal(
    multi_2$var_info$variable[multi_2$var_info$role == "outcome"],
    names(iris)[4:5]
  )
  expect_equal(
    multi_2$var_info$variable[multi_2$var_info$role == "predictor"],
    names(iris)[1:3]
  )
})

test_that("detect_step function works", {
  rec <- recipe(Species ~ ., data = iris) %>%
    step_center(all_predictors()) %>%
    step_scale(Sepal.Width) %>%
    step_relu(Sepal.Length) %>%
    step_intercept()

  prepped_rec <- prep(rec, iris)

  # detect untrained steps
  expect_true(detect_step(rec, "center"))
  expect_true(detect_step(rec, "scale"))
  expect_true(detect_step(rec, "relu"))
  expect_true(detect_step(rec, "intercept"))

  # detect trained steps
  expect_true(detect_step(prepped_rec, "center"))
  expect_true(detect_step(prepped_rec, "scale"))
  expect_true(detect_step(prepped_rec, "relu"))
  expect_true(detect_step(prepped_rec, "intercept"))

  # don't detect untrained steps not in use
  expect_false(detect_step(rec, "pca"))
  expect_false(detect_step(rec, "meanimpute"))

  # don't detect trained steps not in use
  expect_false(detect_step(prepped_rec, "pca"))
  expect_false(detect_step(prepped_rec, "meanimpute"))
})

test_that("bake without prep", {
  sp_signed <- recipe(HHV ~ ., data = biomass) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors()) %>%
    step_spatialsign(all_predictors())
  expect_snapshot(error = TRUE,
    bake(sp_signed, new_data = biomass_te)
  )
  expect_snapshot(error = TRUE,
    juice(sp_signed)
  )
})

test_that("prep with fresh = TRUE", {
  test_data <- data.frame(x = factor(c(1, 2)), y = c(1, 2))

  rec <-
    recipe(y ~ ., data = test_data) %>%
    step_dummy(x, id = "") %>%
    prep()

  new_rec <- prep(rec, training = test_data, fresh = TRUE)

  rec$fit_times$elapsed <- 0
  new_rec$fit_times$elapsed <- 0

  expect_identical(rec, new_rec)

  expect_equal(
    tidy(new_rec, 1),
    tibble(terms = "x", columns = "2", id = "")
  )

  test_data2 <- data.frame(x = factor(c(1, 3)), y = c(1, 2))

  new_rec2 <- prep(rec, training = test_data2, fresh = TRUE)

  expect_equal(
    tidy(new_rec2, 1),
    tibble(terms = "x", columns = "3", id = "")
  )
})

test_that("bake without newdata", {
  rec <- recipe(HHV ~ ., data = biomass) %>%
    step_center(all_numeric()) %>%
    step_scale(all_numeric()) %>%
    prep(training = biomass)

  expect_snapshot(error = TRUE,
    bake(rec, newdata = biomass)
  )
})

test_that("`juice()` returns a 0 column / N row tibble when a selection returns no columns", {
  rec <- recipe(~., data = iris)
  rec <- prep(rec, iris)

  expect_equal(
    juice(rec, all_outcomes()),
    tibble(.rows = nrow(iris))
  )
})

test_that("`bake()` returns a 0 column / N row tibble when a selection returns no columns", {
  rec <- recipe(~., data = iris)
  rec <- prep(rec, iris)

  expect_equal(
    bake(rec, iris, all_outcomes()),
    tibble(.rows = nrow(iris))
  )
})

test_that("tunable arguments at prep-time", {
  .tune <- function() rlang::call2("tune")

  expect_snapshot(error = TRUE,
    recipe(Species ~ ., data = iris) %>%
      step_ns(all_predictors(), deg_free = .tune()) %>%
      prep()
  )

  expect_snapshot(error = TRUE,
    recipe(~., data = mtcars) %>%
      step_pca(all_predictors(), threshold = .tune()) %>%
      step_kpca(all_predictors(), num_comp = .tune()) %>%
      step_bs(all_predictors(), deg_free = .tune()) %>%
      prep()
  )
})

test_that("logging", {
  expect_snapshot(
    recipe(mpg ~ ., data = mtcars) %>%
      step_ns(disp, deg_free = 2, id = "splines!") %>%
      prep(log_changes = TRUE)
  )
})

test_that("`bake(new_data = NULL)` same as `juice()`", {
  rec <-
    recipe(mpg ~ ., data = mtcars) %>%
    step_filter(gear == 4) %>%
    step_center(all_predictors()) %>%
    prep()

  juiced <- juice(rec)
  baked <- bake(rec, new_data = NULL)
  expect_equal(juiced, baked)

  # make sure that filter is skipped on training data this way
  roasted <- bake(rec, new_data = mtcars)
  expect_equal(nrow(roasted), 32)
})

test_that("`retain flag in prep should return data when TRUE and zero rows when FALSE", {
  rec <- recipe(mpg ~ ., data = mtcars %>% head(20))

  # flag TRUE but no training data
  prec_1 <- prep(rec, retain = TRUE)
  expect_equal(nrow(prec_1$template), 20)

  # flag FALSE and no training data
  prec_2 <- prep(rec, retain = FALSE)
  expect_equal(nrow(prec_2$template), 0)

  # flag TRUE with training data
  prec_3 <- prep(rec, training = mtcars %>% tail(12), retain = TRUE)
  expect_equal(nrow(prec_3$template), 12)

  # flag FALSE with training data
  prec_4 <- prep(rec, training = mtcars %>% tail(12), retain = FALSE)
  expect_equal(nrow(prec_4$template), 0)
})

test_that("case weights are being infered correctly for formula interface", {
  mtcars1 <- mtcars
  mtcars1$disp <- importance_weights(mtcars1$disp)

  rec <- recipe(mpg ~ cyl + disp, data = mtcars1)

  ref_summary <- tibble(
    variable = c("cyl", "disp", "mpg"),
    type = list(c("double", "numeric"), "case_weights", c("double", "numeric")),
    role = c("predictor", "case_weights", "outcome"),
    source = "original"
  )
  expect_equal(
    summary(rec),
    ref_summary
  )

  mtcars2 <- mtcars
  mtcars2$disp <- importance_weights(mtcars2$disp)
  mtcars2$cyl <- importance_weights(mtcars2$cyl)

  expect_snapshot(error = TRUE,
    recipe(mpg ~ cyl + disp, data = mtcars2)
  )
})

test_that("case weights are being infered correctly for x interface", {
  mtcars1 <- mtcars[c(2, 3, 1)]
  mtcars1$disp <- importance_weights(mtcars1$disp)

  rec <- recipe(mtcars1)

  ref_summary <- tibble(
    variable = c("cyl", "disp", "mpg"),
    type = list(c("double", "numeric"), "case_weights", c("double", "numeric")),
    role = c(NA, "case_weights", NA),
    source = "original"
  )
  expect_equal(
    summary(rec),
    ref_summary
  )

  mtcars2 <- mtcars
  mtcars2$disp <- importance_weights(mtcars2$disp)
  mtcars2$cyl <- importance_weights(mtcars2$cyl)

  expect_snapshot(error = TRUE,
    recipe(mtcars2)
  )

})


test_that("verbose when printing", {
  standardized <- recipe(~., mtcars) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors()) %>%
    step_normalize(all_predictors())
  expect_snapshot(tmp <- prep(standardized, verbose = TRUE))

})

test_that("`internal data is kept as tibbles when prepping", {
  rec_spec <- recipe(mpg ~ ., data = mtcars) %>%
    step_testthat_helper()

  expect_true(
    tibble::is_tibble(prep(rec_spec)$template)
  )

  expect_true(
    tibble::is_tibble(rec_spec %>% prep() %>% bake(new_data = NULL))
  )

  expect_true(
    tibble::is_tibble(rec_spec %>% prep() %>% bake(new_data = mtcars))
  )

  rec_prepped <- prep(rec_spec)

  # Pretending that the outcome will be a data.frame
  rec_prepped$steps[[1]]$output <- mtcars

  expect_true(
    tibble::is_tibble(bake(rec_prepped, new_data = NULL))
  )

  # Will ignore new_data and return `output`
  expect_snapshot(error = TRUE,
    bake(rec_prepped, new_data = as_tibble(mtcars))
  )

  rec_spec <- recipe(mpg ~ ., data = mtcars) %>%
    step_testthat_helper(output = mtcars)

  expect_snapshot(error = TRUE, prep(rec_spec))
})

test_that("recipe() errors if `data` is missing", {
  expect_snapshot(error = TRUE, recipe(mpg ~ .))
})

topepo/recipes documentation built on April 10, 2024, 10:30 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

topepo/recipes
Preprocessing and Feature Engineering Steps for Modeling

tests/testthat/test-basics.R
In topepo/recipes: Preprocessing and Feature Engineering Steps for Modeling

R Package Documentation

Browse R Packages

We want your feedback!

topepo/recipes Preprocessing and Feature Engineering Steps for Modeling

tests/testthat/test-basics.R In topepo/recipes: Preprocessing and Feature Engineering Steps for Modeling

R Package Documentation

Browse R Packages

We want your feedback!

topepo/recipes
Preprocessing and Feature Engineering Steps for Modeling

tests/testthat/test-basics.R
In topepo/recipes: Preprocessing and Feature Engineering Steps for Modeling