tests/testthat/test-mold-recipe.R

test_that("can mold recipes", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
  matrix_bp <- default_recipe_blueprint(composition = "matrix")

  rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)
  x3 <- mold(rec, iris, blueprint = matrix_bp)

  expect_s3_class(x1$predictors, "data.frame")
  expect_s4_class(x2$predictors, "dgCMatrix")
  expect_matrix(x3$predictors)

  expect_s3_class(x1$outcomes, "data.frame")
  expect_s3_class(x2$outcomes, "data.frame")
  expect_s3_class(x3$outcomes, "data.frame")

  expect_equal(colnames(x1$predictors), "Sepal.Length")
  expect_equal(colnames(x2$predictors), "Sepal.Length")
  expect_equal(colnames(x3$predictors), "Sepal.Length")
  expect_s3_class(x1$outcomes[[1]], "factor")
  expect_s3_class(x1$blueprint, "default_recipe_blueprint")

  # Training data should _not_ be in the recipe
  expect_error(recipes::juice(x1$blueprint))
})

test_that("can mold recipes with intercepts", {
  rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)

  x1 <- mold(
    rec, iris,
    blueprint = default_recipe_blueprint(intercept = TRUE)
  )
  x2 <- mold(
    rec, iris,
    blueprint = default_recipe_blueprint(
      intercept = TRUE,
      composition = "dgCMatrix"
    )
  )
  x3 <- mold(
    rec, iris,
    blueprint = default_recipe_blueprint(
      intercept = TRUE,
      composition = "matrix"
    )
  )

  expect_true("(Intercept)" %in% colnames(x1$predictors))
  expect_true("(Intercept)" %in% colnames(x2$predictors))
  expect_true("(Intercept)" %in% colnames(x3$predictors))
})

test_that("can pass `fresh` through to `prep()`", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

  iris1 <- iris[1:50, ]
  iris2 <- iris[51:100, ]

  rec <- recipes::recipe(Species ~ Sepal.Length, data = iris1)
  rec <- recipes::step_center(rec, Sepal.Length)

  prepped_rec <- recipes::prep(rec, iris1)
  non_fresh_mean <- prepped_rec$steps[[1]]$means

  x1 <- mold(prepped_rec, iris2, blueprint = default_recipe_blueprint(fresh = FALSE))
  mold_non_fresh_mean1 <- x1$blueprint$recipe$steps[[1]]$means
  x2 <- mold(prepped_rec, iris2, blueprint = default_recipe_blueprint(fresh = FALSE, composition = "dgCMatrix"))
  mold_non_fresh_mean2 <- x2$blueprint$recipe$steps[[1]]$means

  y1 <- mold(prepped_rec, iris2, blueprint = default_recipe_blueprint(fresh = TRUE))
  mold_fresh_mean1 <- y1$blueprint$recipe$steps[[1]]$means
  y2 <- mold(prepped_rec, iris2, blueprint = default_recipe_blueprint(fresh = TRUE, composition = "dgCMatrix"))
  mold_fresh_mean2 <- y2$blueprint$recipe$steps[[1]]$means

  fresh_mean <- c(Sepal.Length = mean(iris2$Sepal.Length))

  expect_equal(non_fresh_mean, mold_non_fresh_mean1)
  expect_false(identical(non_fresh_mean, mold_fresh_mean1))
  expect_equal(non_fresh_mean, mold_non_fresh_mean2)
  expect_false(identical(non_fresh_mean, mold_fresh_mean2))
  expect_equal(mold_fresh_mean1, fresh_mean)
  expect_equal(mold_fresh_mean2, fresh_mean)
})

test_that("`fresh` defaults to `TRUE`", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

  iris1 <- iris[1:50, ]
  iris2 <- iris[51:100, ]

  rec <- recipes::recipe(Species ~ Sepal.Length, data = iris1)
  rec <- recipes::step_center(rec, Sepal.Length)

  prepped_rec <- recipes::prep(rec, iris1)
  non_fresh_mean <- prepped_rec$steps[[1]]$means
  fresh_mean <- c(Sepal.Length = mean(iris2$Sepal.Length))

  x1 <- mold(prepped_rec, iris2, blueprint = default_recipe_blueprint())
  mold_fresh_mean1 <- x1$blueprint$recipe$steps[[1]]$means
  x2 <- mold(prepped_rec, iris2, blueprint = sparse_bp)
  mold_fresh_mean2 <- x2$blueprint$recipe$steps[[1]]$means

  expect_false(identical(non_fresh_mean, mold_fresh_mean1))
  expect_identical(mold_fresh_mean1, fresh_mean)
  expect_false(identical(non_fresh_mean, mold_fresh_mean2))
  expect_identical(mold_fresh_mean2, fresh_mean)
})

test_that("can pass `strings_as_factors` through to `prep()`", {
  df <- tibble(y = 1, x = "a")
  rec <- recipes::recipe(y ~ x, data = df)

  # Defaults to `TRUE`
  x <- mold(rec, df)
  expect_identical(x$predictors$x, factor("a"))

  x <- mold(rec, df, blueprint = default_recipe_blueprint(strings_as_factors = FALSE))
  expect_identical(x$predictors$x, "a")
})

test_that("`data` is validated", {
  expect_error(
    mold(recipes::recipe(Species ~ Sepal.Length, data = iris), 1),
    "`data` must be a data.frame or a matrix"
  )
})

test_that("`extras` holds a slot for `roles`", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
  matrix_bp <- default_recipe_blueprint(composition = "matrix")

  rec <- recipes::recipe(Species ~ Sepal.Length, data = iris)
  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)
  x3 <- mold(rec, iris, blueprint = matrix_bp)

  expect_equal(x1$extras, list(roles = NULL))
  expect_equal(x2$extras, list(roles = NULL))
  expect_equal(x3$extras, list(roles = NULL))

  expect_equal(x1$blueprint$extra_role_ptypes, NULL)
  expect_equal(x2$blueprint$extra_role_ptypes, NULL)
  expect_equal(x3$blueprint$extra_role_ptypes, NULL)
})

test_that("non-standard roles ptypes are stored by default", {
  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy")

  x <- mold(rec, iris)

  expect_identical(
    x$blueprint$extra_role_ptypes,
    list(dummy = tibble::tibble(Sepal.Width = double()))
  )

  expect_identical(
    x$extras$roles,
    list(dummy = tibble::tibble(Sepal.Width = iris$Sepal.Width))
  )
})

test_that("case weights is not considered a required extra role by default", {
  iris$weight <- frequency_weights(seq_len(nrow(iris)))

  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy")

  x <- mold(rec, iris)

  expect_identical(
    x$blueprint$extra_role_ptypes,
    list(dummy = tibble::tibble(Sepal.Width = double()))
  )

  expect_identical(
    x$extras$roles$dummy,
    tibble::tibble(Sepal.Width = iris$Sepal.Width)
  )
  expect_identical(
    x$extras$roles$case_weights,
    tibble::tibble(weight = iris$weight)
  )
})

test_that("case weights can be updated to be a required extra role", {
  iris$weight <- frequency_weights(seq_len(nrow(iris)))

  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy")
  rec <- recipes::update_role_requirements(rec, "case_weights", bake = TRUE)

  x <- mold(rec, iris)

  expect_identical(
    x$blueprint$extra_role_ptypes$dummy,
    tibble::tibble(Sepal.Width = double())
  )
  expect_identical(
    x$blueprint$extra_role_ptypes$case_weights,
    tibble::tibble(weight = frequency_weights(integer()))
  )

  expect_identical(
    x$extras$roles$dummy,
    tibble::tibble(Sepal.Width = iris$Sepal.Width)
  )
  expect_identical(
    x$extras$roles$case_weights,
    tibble::tibble(weight = iris$weight)
  )
})

test_that("only original non-standard columns are in the extra roles ptype", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

  # same custom role, but note step_bs() columns aren't original columns
  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy")
  rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3)

  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)

  # extra roles ptype only has original columns
  expect_identical(
    colnames(x1$blueprint$extra_role_ptypes$dummy),
    "Sepal.Width"
  )
  expect_identical(
    colnames(x2$blueprint$extra_role_ptypes$dummy),
    "Sepal.Width"
  )

  # all `dummy` columns are returned in the `extras` slot
  expect_identical(
    colnames(x1$extras$roles$dummy),
    c("Sepal.Width", paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_"))
  )
  expect_identical(
    colnames(x2$extras$roles$dummy),
    c("Sepal.Width", paste("Sepal.Length", c("1", "2", "3"), sep = "_bs_"))
  )
})

test_that("`extra_role_ptypes` and `extras` only hold roles that actually exist in the data", {
  df <- tibble(y = 1, x = 1, z = 2)

  rec <- recipes::recipe(y ~ ., df)
  rec <- recipes::update_role(rec, x, new_role = "dummy")

  # For example `default_bake_role_requirements()` specifies that `NA` is
  # a required role, but there aren't any columns with that role in `df`
  x <- mold(rec, df)

  expect_named(x$blueprint$extra_role_ptypes, "dummy")
  expect_named(x$extras$roles, "dummy")
})

test_that("multiple extra roles types can be stored", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::step_bs(rec, Sepal.Length, role = "dummy", deg_free = 3)
  rec <- recipes::step_bs(rec, Sepal.Width, role = "dummy2", deg_free = 3)

  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)

  # these are not original columns
  expect_equal(
    x1$blueprint$extra_role_ptypes,
    NULL
  )
  expect_equal(
    x2$blueprint$extra_role_ptypes,
    NULL
  )

  # They are automatically are returned in the `mold()` result
  expect_equal(
    names(x1$extras$roles),
    c("dummy", "dummy2")
  )
  expect_equal(
    names(x2$extras$roles),
    c("dummy", "dummy2")
  )
})

test_that("`NA` roles are treated as extra roles", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

  rec <- recipes::recipe(iris)
  rec <- recipes::update_role(rec, Sepal.Length, new_role = "predictor")
  rec <- recipes::update_role(rec, Species, new_role = "outcome")
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "custom")

  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)

  expect_equal(
    colnames(x1$predictors),
    "Sepal.Length"
  )
  expect_equal(
    colnames(x2$predictors),
    "Sepal.Length"
  )

  expect_equal(
    colnames(x1$blueprint$ptypes$predictors),
    "Sepal.Length"
  )
  expect_equal(
    colnames(x2$blueprint$ptypes$predictors),
    "Sepal.Length"
  )

  expect_equal(
    colnames(x1$outcomes),
    "Species"
  )
  expect_equal(
    colnames(x2$outcomes),
    "Species"
  )

  expect_equal(
    colnames(x1$blueprint$ptypes$outcomes),
    "Species"
  )
  expect_equal(
    colnames(x2$blueprint$ptypes$outcomes),
    "Species"
  )

  expect_equal(
    colnames(x1$extras$roles$custom),
    "Sepal.Width"
  )
  expect_equal(
    colnames(x2$extras$roles$custom),
    "Sepal.Width"
  )

  expect_equal(
    colnames(x1$extras$roles$`NA`),
    c("Petal.Length", "Petal.Width")
  )
  expect_equal(
    colnames(x2$extras$roles$`NA`),
    c("Petal.Length", "Petal.Width")
  )

  expect_equal(
    colnames(x1$blueprint$extra_role_ptypes$custom),
    "Sepal.Width"
  )
  expect_equal(
    colnames(x2$blueprint$extra_role_ptypes$custom),
    "Sepal.Width"
  )

  expect_equal(
    colnames(x1$blueprint$extra_role_ptypes$`NA`),
    c("Petal.Length", "Petal.Width")
  )
  expect_equal(
    colnames(x2$blueprint$extra_role_ptypes$`NA`),
    c("Petal.Length", "Petal.Width")
  )
})

test_that("roles that aren't required are not retained as `extra_role_ptypes`, but are in the mold result", {
  rec <- recipes::recipe(Species ~ ., iris)
  rec <- recipes::update_role(rec, Sepal.Width, new_role = "dummy1")
  rec <- recipes::update_role(rec, Sepal.Length, new_role = "dummy2")
  rec <- recipes::update_role_requirements(rec, "dummy1", bake = FALSE)

  x <- mold(rec, iris)

  expect_equal(
    x$blueprint$extra_role_ptypes,
    list(dummy2 = tibble::tibble(Sepal.Length = double()))
  )

  expect_equal(
    x$extras$roles$dummy1,
    tibble::tibble(Sepal.Width = iris$Sepal.Width)
  )
  expect_equal(
    x$extras$roles$dummy2,
    tibble::tibble(Sepal.Length = iris$Sepal.Length)
  )
})

test_that("Missing y value returns a 0 column tibble for `outcomes`", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
  rec <- recipes::recipe(~Sepal.Width, data = iris)
  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)

  expect_equal(nrow(x1$outcomes), 150)
  expect_equal(ncol(x1$outcomes), 0)
  expect_equal(x1$outcomes, x2$outcomes)
})

test_that("Missing y value returns a 0 column / 0 row tibble for `ptype`", {
  sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")
  rec <- recipes::recipe(~Sepal.Width, data = iris)
  x1 <- mold(rec, iris)
  x2 <- mold(rec, iris, blueprint = sparse_bp)
  expect_equal(x1$blueprint$ptypes$outcomes, tibble())
  expect_equal(x2$blueprint$ptypes$outcomes, tibble())
})

test_that("`mold()` is compatible with hardhat 0.2.0 blueprints", {
  path <- test_path("data", "hardhat-0.2.0-pre-mold-recipe.rds")
  object <- readRDS(path)

  data <- object$data
  blueprint <- object$blueprint

  rec <- recipes::recipe(y ~ ., data = data)
  rec <- recipes::step_mutate(rec, z = 1)

  out <- mold(rec, data = data, blueprint = blueprint)

  expect <- tibble::tibble(x = data$x, z = 1)
  expect_identical(out$predictors, expect)
})

Try the hardhat package in your browser

Any scripts or data that you put into this service are public.

hardhat documentation built on March 31, 2023, 10:21 p.m.