tests/testthat/test_preproc.R

context("preproc")

test_that("preproc - basic sanity checks", {
  skip_if_not_installed("smotefamily")
  task = tsk("iris")
  op = po("scale")
  graph = as_graph(op)


  # Test indata
  expect_no_error(preproc(task, op))
  expect_no_error(preproc(task, op, predict = TRUE))
  op$state = NULL
  expect_data_table(preproc(task$data(), op))
  expect_data_table(preproc(task$data(), op, predict = TRUE))
  op$state = NULL
  expect_data_frame(preproc(as.data.frame(task$data()), op))
  expect_data_frame(preproc(as.data.frame(task$data()), op, predict = TRUE))
  op$state = NULL
  expect_error(preproc(NULL, op), "type 'data.frame'.*inherit from class 'Task'")
  expect_error(preproc(NULL, op, predict = TRUE), "type 'data.frame'.*inherit from class 'Task'")


  # Test processor
  # PipeOp processor already tested above
  expect_no_error(preproc(task, graph))
  expect_no_error(preproc(task, graph, predict = TRUE))
  expect_error(preproc(task, lrn("regr.rpart")), "no applicable method")

  # Test handling of multiple input channels
  gr = gunion(list(
    po("select_1", selector = selector_type("factor")) %>>% po("colapply", applicator = as.numeric),
    po("select_2", selector = selector_type("numeric")) %>>% po("pca")
  )) %>>% po("featureunion")
  expect_no_error(preproc(tsk("boston_housing"), gr))
  expect_no_error(preproc(tsk("boston_housing"), gr))

  # Test error for processor with more than one output channel
  expect_error(preproc(task, PipeOpDebugMulti$new(1, 2)), "must have exactly one output channel")
  expect_error(preproc(task, gunion(list(NULL, NULL))), "must have exactly one output channel")

  # Test error if processor cannot handle Tasks as input
  # Need a PipeOp that cannot handle Tasks as input to train
  PipeOpDebugNonTaskInput = R6Class("PipeOpDebugNonTaskInput",
    inherit = PipeOp,
    public = list(
      initialize = function(id = "non_task_input", param_set = ps()) {
        super$initialize(id = id, param_set = param_set,
          input = data.table(name = "input", train = "numeric", predict = "numeric"),
          output = data.table(name = "output", train = "character", predict = "character")
        )
      }),
    private = list(
      .train = function(inputs) {
        self$state = inputs
        list(as.character(inputs[[1L]]))
      },
      .predict = function(inputs) list(as.character(inputs[[1L]]))
    )
  )
  expect_error(preproc(task, PipeOpDebugNonTaskInput$new()), "Must inherit from class '.*', but has classes.*'Task'")
  expect_error(preproc(task, po("regravg"), state = list(), predict = TRUE), "Must inherit from class '.*', but has classes.*'Task'")

  # Test error for processors incapable of handling targetless tasks when indata is a data.frame
  expect_error(preproc(task$data(), po("smote")), "Must inherit from class 'TaskClassif'")
  # Need to have a trained PipeOp to test this for predict as well
  op2 = po("smote")
  op2$train(list(task))
  expect_error(preproc(task$data(), op2, predict = TRUE), "Must inherit from class 'TaskClassif'")

  # Test error for non-Task output of processor when indata is a data.frame
  # Construct a PipeOp that takes a Task as input but returns something else
  PipeOpDebugTaskToVec = R6Class("PipeOpDebugTaskToVec",
    inherit = PipeOp,
    public = list(
      initialize = function(id = "task_to_vec", param_set = ps()) {
        super$initialize(id = id, param_set = param_set,
          input = data.table(name = "input", train = "Task", predict = "Task"),
          output = data.table(name = "output", train = "character", predict = "character")
        )
      }),
    private = list(
      .train = function(inputs) {
        self$state = inputs
        list(inputs[[1L]]$feature_names)
      },
      .predict = function(inputs) list(inputs[[1L]]$feature_names)
    )
  )
  expect_error(preproc(task$data(), PipeOpDebugTaskToVec$new()), "Output channel of 'processor' does not return a Task")
  expect_error(preproc(task$data(), PipeOpDebugTaskToVec$new(), state = named_list("task_to_vec", task$data())), "Output channel of 'processor' does not return a Task")


  # Test state
  # Only basic tests, most checks are done by the graph
  expect_error(preproc(task, op, state = TRUE), "type 'list'.*or 'NULL'")

  # Test that there is no error if the state is correct
  op$train(list(task))
  expect_no_error(preproc(task, op, state = op$state))  # if processor is a PipeOp, i.e. internal conversion works
  expect_no_error(preproc(task, graph, state = list(scale = op$state)))  # if processor is a Graph

  # Test that there is an error if we pass a state for the Graph that does not have the correct names
  expect_error(preproc(task, graph, state = list(a = 1)), "Must be a subset of.*scale")
})

test_that("preproc - PipeOp processor", {
  task = tsk("iris")
  dt = task$data(cols = task$feature_names)
  processor = PipeOpScale$new()
  expected_train_out_task = processor$train(list(task))[[1L]]
  expected_predict_out_task = processor$predict(list(task))[[1L]]
  expected_train_out_dt = expected_train_out_task$data(cols = task$feature_names)
  expected_predict_out_dt = expected_train_out_task$data(cols = task$feature_names)
  # Extract state to pass to preproc in some of the tests
  state = processor$state
  processor$state = NULL


  # Untrained PipeOp, state NULL, predict FALSE -> train
  train_out = expect_no_error(preproc(task, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_task)
  expect_true(processor$is_trained)  # sufficient to test modification-in-place?
  processor$state = NULL

  # Untrained PipeOp, state NULL, predict TRUE -> error: Can't predict untrained PipeOp
  expect_error(preproc(task, processor, predict = TRUE), "Cannot predict.*not been trained yet")

  # Untrained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(task, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Untrained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)

  # Trained PipeOp, state NULL, predict FALSE -> re-train
  train_out = expect_no_error(preproc(task, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_task)
  expect_true(processor$is_trained)

  # Trained PipeOp, state NULL, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)

  # Trained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(task, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Trained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)


  # Same tests for data.frame indata:
  processor$state = NULL

  # Untrained PipeOp, state NULL, predict FALSE -> train
  train_out = expect_no_error(preproc(dt, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_dt)
  expect_true(processor$is_trained)
  processor$state = NULL

  # Untrained PipeOp, state NULL, predict TRUE -> error: Can't predict untrained PipeOp
  expect_error(preproc(dt, processor, predict = TRUE), "Cannot predict.*not been trained yet")

  # Untrained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(dt, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Untrained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)

  # Trained PipeOp, state NULL, predict FALSE -> re-train
  train_out = expect_no_error(preproc(dt, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_dt)
  expect_true(processor$is_trained)

  # Trained PipeOp, state NULL, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)

  # Trained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(dt, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Trained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)
})

test_that("preproc - Graph processor", {
  task = tsk("iris")
  dt = task$data(cols = task$feature_names)
  op = po("scale")
  processor = as_graph(op)
  expected_train_out_task = processor$train(task)[[1L]]
  expected_predict_out_task = processor$predict(task)[[1L]]
  expected_train_out_dt = expected_train_out_task$data(cols = task$feature_names)
  expected_predict_out_dt = expected_train_out_task$data(cols = task$feature_names)
  # Extract state to pass to preproc in some of the tests
  state = processor$state
  processor$state = NULL


  # Untrained Graph, state NULL, predict FALSE -> train
  train_out = expect_no_error(preproc(task, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_task)
  expect_true(processor$is_trained)  # sufficient to test modification-in-place?
  processor$state = NULL  # Reset PipeOp

  # Untrained Graph, state NULL, predict TRUE -> error: Can't predict untrained PipeOp
  expect_error(preproc(task, processor, predict = TRUE), "Cannot predict.*not been trained yet")

  # Untrained Graph, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(task, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Untrained Graph, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)

  # Trained Graph, state NULL, predict FALSE -> re-train
  train_out = expect_no_error(preproc(task, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_task)
  expect_true(processor$is_trained)

  # Trained Graph, state NULL, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)

  # Trained Graph, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(task, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Trained Graph, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(task, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_task)


  # Same tests for data.frame indata:
  processor$state = NULL

  # Untrained PipeOp, state NULL, predict FALSE -> train
  train_out = expect_no_error(preproc(dt, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_dt)
  expect_true(processor$is_trained)
  processor$state = NULL

  # Untrained PipeOp, state NULL, predict TRUE -> error: Can't predict untrained PipeOp
  expect_error(preproc(dt, processor, predict = TRUE), "Cannot predict.*not been trained yet")

  # Untrained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(dt, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Untrained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)

  # Trained PipeOp, state NULL, predict FALSE -> re-train
  train_out = expect_no_error(preproc(dt, processor, predict = FALSE))
  expect_equal(train_out, expected_train_out_dt)
  expect_true(processor$is_trained)

  # Trained PipeOp, state NULL, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)

  # Trained PipeOp, state given, predict FALSE -> error: contradictory input
  expect_error(preproc(dt, processor, state = state, predict = FALSE), "Inconsistent function arguments")

  # Trained PipeOp, state given, predict TRUE -> predict
  predict_out = expect_no_error(preproc(dt, processor, state = state, predict = TRUE))
  expect_equal(predict_out, expected_predict_out_dt)
})

Try the mlr3pipelines package in your browser

Any scripts or data that you put into this service are public.

mlr3pipelines documentation built on June 17, 2025, 9:08 a.m.