tests/testthat/test-data_prepare.R

context("Test preparation functions")

test_task <- get_test_task(stage = "split")

create_prepared_task <- function(
                                 test_task,
                                 processing_pipeline = mlr3pipelines::PipeOpNOP$new(),
                                 target = "target") {
  prepared_task <- prepare(
    test_task,
    data_names = c("train_data", "test_data"),
    outcome_field = target,
    training_fields = c("feature"),
    processing_pipeline = processing_pipeline
  )
  return(prepared_task)
}

test_that("no error is thrown with a valid 'processing_pipeline'", {
  expect_error(
    get_test_task(
      stage = "prepare",
      processing_pipeline = mlr3pipelines::PipeOpNOP$new()
    ),
    NA
  )
})

test_that("no error is thrown with a valid 'processing_pipeline' with cv", {
  expect_error(
    get_test_task(
      stage = "prepare",
      resampling_strategy = "cv",
      processing_pipeline = mlr3pipelines::PipeOpNOP$new()
    ),
    NA
  )
})

fake_pipe <- mlr3pipelines::PipeOpScale$new(
  param_vals = list(affect_columns = mlr3pipelines::selector_name("feature"))
)
fake_pipe <- mlr3pipelines::as_graph(fake_pipe)

test_that("'processing_pipeline' is correctly applied", {
  prep_task <- get_test_task(
    stage = "prepare",
    processing_pipeline = fake_pipe
  )
  test_mean_sd <- function(mean_or_sd, expected, test_or_train) {
    if (test_or_train == "train") {
      ids <- prep_task$mlr3rsmp$train_set(1)
    } else {
      ids <- prep_task$mlr3rsmp$test_set(1)
    }

    data <- prep_task$mlr3task$data(ids)
    expect_equal(
      mean_or_sd(
        get_prepared_data(prep_task, data)$feature
      ),
      expected,
      tolerance = 10e-3
    )
  }

  # train_data
  test_mean_sd(mean, 0, "train")
  test_mean_sd(sd, 1, "train")
  # test_data
  test_mean_sd(mean, 0.346, "test")
  test_mean_sd(sd, 0.46, "test")
})


test_that("processing pipeline is stored in 'mlr3pipeline' property", {
  test_mlr3pipeline_prop <- function(pipe) {
    prep_task <- get_test_task(stage = "prepare", processing_pipeline = pipe)
    expect_true("mlr3pipeline" %in% names(prep_task))
    expect_true(inherits(prep_task[["mlr3pipeline"]], "PipeOp") ||
      inherits(prep_task[["mlr3pipeline"]], "Graph"))
  }
  test_mlr3pipeline_prop(mlr3pipelines::PipeOpNOP$new())
  test_mlr3pipeline_prop(
    mlr3pipelines::as_graph(mlr3pipelines::PipeOpNOP$new())
  )
})

test_that("processing pipeline is stored in 'mlr3pipeline' property with cv", {
  test_mlr3pipeline_prop <- function(pipe) {
    prep_task <- get_test_task(
      stage = "prepare",
      resampling_strategy = "cv",
      processing_pipeline = pipe
    )
    expect_true("mlr3pipeline" %in% names(prep_task))
    expect_true(inherits(prep_task[["mlr3pipeline"]], "PipeOp") ||
      inherits(prep_task[["mlr3pipeline"]], "Graph"))
  }
  test_mlr3pipeline_prop(mlr3pipelines::PipeOpNOP$new())
  test_mlr3pipeline_prop(
    mlr3pipelines::as_graph(mlr3pipelines::PipeOpNOP$new())
  )
})

test_that("prepare changes the requested features", {
  test_features <- function(pipe, features, mlr_features = features) {
    prepared_task <- get_test_task(
      stage = "prepare",
      processing_pipeline = pipe,
      training_fields = features
    )
    expect_equal(prepared_task[["training_fields"]], features)
    expect_equal(prepared_task[["mlr3task"]]$col_roles$feature, mlr_features)
    # new features are added when training
  }
  test_features(mlr3pipelines::PipeOpNOP$new(), "feature")
  mutate_po <- mlr3pipelines::PipeOpMutate$new()
  mutate_po$param_set$values$mutation <- list(new_feature = ~ feature^2)
  test_features(
    mutate_po,
    features = "feature"
  )
  test_features(
    mutate_po,
    features = "feature" # new_feature is not computed before training
  )
})

test_that("prepare changes the outcome field if requested", {
  prepared_task <- create_prepared_task(test_task)
  testthat::expect_equal(
    prepared_task[["outcome_field"]],
    "target"
  )
  testthat::expect_equal(
    prepared_task[["mlr3task"]]$col_roles$target,
    "target"
  )
  prepared_task2 <- create_prepared_task(
    test_task,
    target =  "periode"
  )
  testthat::expect_equal(
    prepared_task2[["outcome_field"]],
    "periode"
  )
  testthat::expect_equal(
    prepared_task2[["mlr3task"]]$col_roles$target,
    "periode"
  )
})

create_fte_test_task <- function(processing_pipeline = NULL) {
  test_task <- get_test_task(stage = "load")
  new_data <- test_task[["hist_data"]] %>%
    cbind(
      ab = c("a", "b", "a", "a", "a", "b", "b", "a", "b", "a"),
      cd = c("d", "d", "c", "c", "d", "d", "c", "c", "c", "d"),
      outcome = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE) # nolint
    )

  prepared_task <- get_test_task(
    fake_data = new_data,
    fake_target = "outcome",
    stage = "prepare",
    processing_pipeline = processing_pipeline,
    training_fields = c("feature", "ab", "cd")
  )
  return(prepared_task)
}

test_that("fte state works as expected", {
  testthat::skip_on_ci()
  prep_task <- create_fte_test_task(create_fte_pipeline(c("ab", "cd")))

  ids <- prep_task$mlr3rsmp$test_set(1)

  expect_known_hash(
    get_prepared_data(prep_task, prep_task$mlr3task$data(ids)),
    "40734c2555"
  )
})

test_that("preparing twice with different training fields takes effect", {
  prep_task <- create_fte_test_task(mlr3pipelines::PipeOpNOP$new())

  ids <- prep_task$mlr3rsmp$test_set(1)

  check_names <- function(names) {
    expect_setequal(
      names(get_prepared_data(prep_task, prep_task$mlr3task$data(ids))),
      names
    )
  }
  check_names(c("outcome", "feature", "ab", "cd"))
  prep_task <- prepare(
    prep_task,
    processing_pipeline = mlr3pipelines::PipeOpNOP$new(),
    training_fields = "feature"
  )
  check_names(c("outcome", "feature"))
  prep_task <- prepare(
    prep_task,
    processing_pipeline = mlr3pipelines::PipeOpNOP$new(),
    training_fields = c("feature", "ab", "cd")
  )
  check_names(c("outcome", "feature", "ab", "cd"))
})

test_that("get_prepared_data works with and without resampling", {
  prep_task <- get_test_task(stage = "prepare")
  # Remove resampling plan
  train_ids <- prep_task$mlr3rsmp$train_set(1)
  train_data <- prep_task$mlr3task$data(train_ids)
  test_ids <- prep_task$mlr3rsmp$test_set(1)
  test_data <- prep_task$mlr3task$data(test_ids)
  expect_equal(dim(get_prepared_data(prep_task, train_data)), c(7, 2))
  expect_equal(dim(get_prepared_data(prep_task, test_data)), c(3, 2))
  prep_task[["mlr3rsmp"]] <- NULL
  expect_equal(dim(get_prepared_data(prep_task, prep_task$mlr3task$data())), c(10, 2))
})
signaux-faibles/rsignauxfaibles documentation built on Dec. 2, 2020, 3:24 a.m.