tests/testthat/test-impute.r

context("Testing impute")

# Setup ------------------------------------------------------------------------
# set seed for reproducibility
set.seed(7)
# build data set to predict whether or not animal_id is a kitty
n <- 300
df <- tibble(animal_id = 1:n,
             length = rnorm(n, mean = 7, sd = 2),
             width = rnorm(n, mean = 2, sd = 0.5),
             fur = sample(c("Long", "Short"), size = n, replace = T),
             color = sample(c("Orange", "Black", "White", "Mixed"),
                            size = n, replace = T)
)

colors <- c("Orange", "Black", "White", "Mixed")

# give kitty likeliness score
df["kitty"] <- df["length"] - 2 * df["width"] - 1
df$kitty[df["fur"] == "Long"]  <-
  df$kitty[df["fur"] == "Long"] + 1
df$kitty[df["fur"] == "Short"]  <-
  df$kitty[df["fur"] == "Short"] - 1
df$kitty[df["color"] == "Mixed"] <-
  df$kitty[df["color"] == "Mixed"] + 1
df$kitty[df["color"] == "Orange"] <-
  df$kitty[df["color"] == "Orange"] + 2
df$kitty[df["color"] == "Black"] <-
  df$kitty[df["color"] == "Black"] - 1
df$kitty[df["color"] == "White"] <-
  df$kitty[df["color"] == "White"] - 4


# Add noise
df$kitty <- df$kitty + rnorm(n, mean = 0, sd = 1.25)
df$kitty <- ifelse(df$kitty > 0, "Y", "N")

# Add missing data
df$color[sample(1:n, 32, replace = FALSE)] <- NA
df$length[sample(1:n, 51, replace = FALSE)] <- NA
df$fur[sample(1:n, 125, replace = FALSE)] <- NA
df$width[sample(1:n, 9, replace = FALSE)] <- NA

df <- df %>% mutate(across(.cols = 4:6, .fns = as.factor))

train_index <- caret::createDataPartition(
  df$kitty,
  p = 0.8,
  times = 1,
  list = TRUE)

d_train <- df[train_index$Resample1, ]
d_test <- df[-train_index$Resample1, ]

d_train$length[1] <- d_test$length[1] <- NA
d_train$color[2] <- d_test$color[2] <- NA
d_train$width[3] <-  d_test$width[3] <- NA
d_train$fur[3] <- d_test$fur[3] <- NA


# Tests ------------------------------------------------------------------------
test_that("Bad data throws an error", {
  expect_error(impute(),
               regexp = "\"d\" must be a tibble")
  expect_error(impute(d = "yeah hi!"),
               regexp = "\"d\" must be a tibble")
  expect_error(impute(d = df, recipe = "fried_fish"),
               regexp = "\"recipe\" must be a valid recipe object.")
})

test_that("Bad ignore_colums are parsed correctly.", {
  capture_output(expect_error(impute(d = d_train, chippies),
                              regexp = "chippies not found in d"))
  capture_output(expect_error(impute(d = d_train, chippies, fishes),
                              regexp = "chippies and fishes not found in d"))
})

test_that("No recipe with defaults trains and predicts.", {
  capture_output(res <- impute(d = d_train,
                               animal_id, kitty))
  expect_equal(res$length[1], 7.1, tolerance = 1)
  expect_equal(as.character(res$color[2]), "missing")
  expect_equal(as.character(res$fur[3]), "missing")
  expect_equal(res$width[3], 1.99, tolerance = 1)

  capture_output(res <- impute(d = d_test,
                               animal_id, kitty,
                               recipe = attr(res, "recipe")))
  expect_equal(res$length[1], 7.1, tolerance = 1)
  expect_equal(as.character(res$color[2]), "missing")
  expect_equal(as.character(res$fur[3]), "missing")
  expect_equal(res$width[3], 1.99, tolerance = 1)
})

test_that("No recipe with methods trains and predicts.", {
  capture_output(res <- impute(d = d_train,
                               animal_id, kitty,
                               nominal_method = "bagimpute",
                               numeric_method = "knnimpute"))
  expect_equal(res$length[1], 6.27, tolerance = 1)
  expect_true(as.character(res$color[2]) %in% c("Orange", "Black", "White", "Mixed"))
  expect_equal(as.character(res$fur[3]), "Short")
  expect_equal(res$width[3], 1.57, tolerance = 1)

  capture_output(res <- impute(d = d_test,
                               animal_id, kitty,
                               recipe = attr(res, "recipe")))
  expect_equal(res$length[1], 6.66, tolerance = 1)
  expect_true(as.character(res$color[2]) %in% c("Orange", "Black", "White", "Mixed"))
  expect_equal(as.character(res$fur[3]), "Long")
  expect_equal(res$width[3], 1.95, tolerance = 1)
})

test_that("No recipe with methods and params trains and predicts.", {
  capture_output(res <- impute(d = d_train,
                               animal_id, kitty,
                               nominal_method = "bagimpute",
                               numeric_method = "knnimpute",
                               nominal_params = list(bag_trees = 20),
                               numeric_params = list(knn_K = 3)))
  expect_equal(res$length[1], 5.25, tolerance = 1)
  expect_true(as.character(res$color[2]) %in% c("Orange", "Black", "White", "Mixed"))
  expect_equal(as.character(res$fur[3]), "Short")
  expect_equal(res$width[3], 1.83, tolerance = 1)

  capture_output(res <- impute(d = d_test,
                               animal_id, kitty,
                               recipe = attr(res, "recipe")))
  expect_equal(res$length[1], 6.6, tolerance = 1)
  expect_true(as.character(res$color[2]) %in% c("Orange", "Black", "White", "Mixed"))
  expect_equal(as.character(res$fur[3]), "Short")
  expect_equal(res$width[3], 2.22, tolerance = 1)
})

test_that("Ignored columns are not imputed but are returned.", {
  d_train$animal_id[1:5] <- NA
  d_train$kitty[1:5] <- NA
  expect_warning(capture_output(res <- impute(d = d_train, animal_id, kitty)))
  expect_true(is.na(res$animal_id[2]))
  expect_true(is.na(res$kitty[4]))

  d_test$animal_id[1:5] <- NA
  d_test$kitty[1:5] <- NA
  expect_warning(capture_output(
    res <- impute(d = d_test, animal_id, kitty, recipe = attr(res, "recipe"))
  ))
  expect_true(is.na(res$animal_id[2]))
  expect_true(is.na(res$kitty[4]))
})

test_that("Columns have the same order after", {
  capture_output(res <- impute(d = d_train, animal_id, kitty))
  expect_equal(names(d_train), names(res))
})

test_that("Missingness in ignored columns throws warning, elsewhere doesn't", {
  d_train$animal_id[1:10] <- NA
  expect_warning(
    capture_output(tmp <- impute(d_train, animal_id)))
  expect_warning(capture_output(
    tmp <- impute(d_train, kitty)), regexp = NA)
})

test_that("Output of impute is a data frame with our custom child class", {
  capture_output(imped <- impute(d_train))
  expect_true(is.data.frame(imped))
  expect_s3_class(imped, "hcai_imputed_df")
})

test_that("Output of impute is same for tibble vs data frame", {
  expect_equal(
    capture_output(impute(d_train)),
    capture_output(impute(tibble::as_tibble(d_train)))
  )
})

test_that("recipe attr is a recipe class object", {
  capture_output(imp_train <- impute(d_train))
  expect_true("recipe" %in% names(attributes(imp_train)))
  expect_s3_class(attr(imp_train, "recipe"), "recipe")
})

test_that("imp_summary attr is contained within d_imputed", {
  capture_output(imp_train <- impute(d_train))
  expect_true("imp_summary" %in% names(attributes(imp_train)))
})

test_that("print method works as expected", {
  d_train$animal_id[1:5] <- NA
  d_train$kitty[1:5] <- NA
  expect_warning(
    msg <- capture_output(
      capture_messages(
        res <- impute(d = d_train, animal_id, kitty, verbose = TRUE))))
  expect_true(grepl("ignored", msg))
  expect_true(grepl("new_category", msg))
})

test_that("a data.frame with a recipe in recipe slot works", {
  imp_train <- impute(d_train)
  expect_equal(impute(d_test, recipe = imp_train),
               impute(d_test, recipe = attr(imp_train, "recipe")))
})

test_that("an attr that doesn't exist passed to recipe errors", {
  imp_train <- impute(d_train)
  expect_error(impute(d_test, recipe = attr(imp_train, "nonsense")),
               regexp = "nonsense")
})

Try the healthcareai package in your browser

Any scripts or data that you put into this service are public.

healthcareai documentation built on Sept. 5, 2022, 5:12 p.m.