tests/testthat/test-features.R

context("test-features.R")

test_that("Test feature_combinations", {

  # Example 1 -----------
  m <- 3
  exact <- TRUE
  w <- 10^6
  x1 <- feature_combinations(m = m, exact = exact, weight_zero_m = w)
  x2 <- feature_exact(m, w)

  # Example 2 -----------
  m <- 10
  exact <- FALSE
  n_combinations <- 50
  w <- 10^6
  set.seed(1)
  y1 <- feature_combinations(
    m = m,
    exact = exact,
    n_combinations = n_combinations,
    weight_zero_m = w
  )

  set.seed(1)
  y2 <- feature_not_exact(
    m = m,
    n_combinations = n_combinations,
    weight_zero_m = w
  )
  y2[, p := NULL]

  # Example 3 -----------
  m <- 3
  exact <- FALSE
  n_combinations <- 1e4
  w <- 10^6
  set.seed(1)
  y3 <- feature_combinations(
    m = m,
    exact = exact,
    n_combinations = n_combinations,
    weight_zero_m = w
  )

  # Test results -----------
  expect_equal(x1, x2)
  expect_equal(y1, y2)
  expect_equal(nrow(y3), 2^3)
  expect_error(feature_combinations(100))
  expect_error(feature_combinations(100, n_combinations = NULL))
})

test_that("Test feature_exact", {

  # Example -----------
  m <- 3
  weight_zero_m <- 10^6
  x <- feature_exact(m, weight_zero_m)

  # Define results -----------
  cnms <- c("id_combination", "features", "n_features", "N", "shapley_weight")
  classes <- c("integer", "list", "integer", "integer", "double")
  lfeatures <- list(
    integer(0),
    1L,
    2L,
    3L,
    c(1L, 2L),
    c(1L, 3L),
    c(2L, 3L),
    c(1L, 2L, 3L)
  )
  n_features <- c(0, rep(1, 3), rep(2, 3), 3)
  n <- c(1, rep(3, 6), 1)

  # Tests -----------
  expect_true(data.table::is.data.table(x))
  expect_equal(names(x), cnms)
  expect_equal(unname(sapply(x, typeof)), classes)
  expect_equal(x[["id_combination"]], seq(nrow(x)))
  expect_equal(x[["features"]], lfeatures)
  expect_equal(x[["n_features"]], n_features)
  expect_equal(x[["N"]], n)
})

test_that("Test feature_not_exact", {

  # Example -----------
  m <- 10
  exact <- FALSE
  n_combinations <- 50
  w <- 10^6
  set.seed(1)
  x <- feature_not_exact(
    m = m,
    n_combinations = n_combinations,
    weight_zero_m = w
  )
  set.seed(1)

  cnms <- c("id_combination", "features", "n_features", "N", "shapley_weight", "p")
  classes <- c("integer", "list", "integer", "integer", "integer", "double")
  n <- sapply(seq(m - 1), choose, n = m)
  w_all <- shapley_weights(m = m, N = n, n_features = seq(m - 1)) * n
  w_default <- w_all / sum(w_all)

  # Test results -----------
  expect_true(data.table::is.data.table(x))
  expect_equal(names(x), cnms)
  expect_equal(unname(sapply(x, typeof)), classes)
  expect_true(nrow(x) <= n_combinations + 2)
  expect_equal(x[["id_combination"]], seq(nrow(x)))
  for (i in x[, .I]) {
    f <- x[["features"]][[i]]
    if (length(f) == 0) {
      expect_equal(x[["n_features"]][[i]], 0)
      expect_equal(x[["N"]][[i]], 1)
      expect_equal(x[["shapley_weight"]][[i]], w)
      expect_equal(x[["p"]][[i]], NA_real_)
    } else if (length(f) == m) {
      expect_equal(f, seq(m))
      expect_equal(x[["n_features"]][[i]], m)
      expect_equal(x[["N"]][[i]], 1)
      expect_equal(x[["shapley_weight"]][[i]], w)
      expect_equal(x[["p"]][[i]], NA_real_)
    } else {
      k <- length(f)
      expect_equal(f, sort(f))
      expect_equal(x[["n_features"]][[i]], k)
      expect_equal(x[["N"]][[i]], choose(m, k))
      expect_equal(x[["p"]][[i]], w_default[x[["n_features"]][[i]]])
      expect_equal(between(x[["shapley_weight"]][[i]], 1L, n_combinations), TRUE)
    }
  }
})

test_that("Test helper_feature", {

  # Example -----------
  m <- 5
  feature_sample <- list(
    integer(0),
    1:2,
    3:5,
    1:2,
    1:5
  )
  x <- helper_feature(m, feature_sample)

  # Define results -----------
  x2 <- c(1, 2, 1, 2, 1)
  x3 <- c(FALSE, FALSE, FALSE, TRUE, FALSE)

  # Test results -----------
  cnms <- c("sample_frequence", "is_duplicate")
  classes <- c("integer", "logical")
  expect_true(data.table::is.data.table(x))
  expect_equal(names(x), cnms)
  expect_equal(nrow(x), length(feature_sample))
  expect_equal(classes, unname(sapply(x, typeof)))
  expect_equal(x[["sample_frequence"]], x2)
  expect_equal(x[["is_duplicate"]], x3)
})


test_that("Test make_dummies", {
  if (requireNamespace("MASS", quietly = TRUE)) {
    data("Boston", package = "MASS")
    x_var <- c("lstat", "rm", "dis", "indus")
    y_var <- "medv"

    x_train <- as.data.frame(Boston[401:411, x_var])
    y_train <- Boston[401:408, y_var]
    x_test <- as.data.frame(Boston[1:4, x_var])

    # convert to factors for illustrational purpose
    x_train$rm <- factor(round(x_train$rm))
    x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm))

    factor_feat <- sapply(x_train, is.factor)
    nb_factor_feat <- sum(factor_feat)

    dummylist <- make_dummies(traindata = x_train, testdata = x_train)

    # Tests
    expect_type(dummylist, "list")

    expect_equal(length(dummylist$feature_list$contrasts_list), nb_factor_feat)

    expect_equal(length(dummylist$feature_list$labels), ncol(x_train))

    expect_equal(sum(dummylist$feature_list$classes == "factor"), nb_factor_feat)

    expect_equal(ncol(dummylist$feature_list$contrasts_list$rm), length(levels(x_train$rm)))

    # 1) What if train has more features than test but features in test are contained in train
    x_train1 <- cbind(x_train, 1)
    x_test1 <- x_test
    expect_error(make_dummies(traindata = x_train1, testdata = x_test1))

    # 2) What if train has different feature types than test
    x_train2 <- x_train
    x_test2 <- x_test
    x_test2$rm <- as.numeric(x_test2$rm)
    expect_error(make_dummies(traindata = x_train2, testdata = x_test2))

    # 3) What if test has more features than train but features in train are contained in test
    x_train3 <- x_train
    x_test3 <- cbind(x_test, 1)
    expect_error(make_dummies(traindata = x_train3, testdata = x_test3))

    # 4) What if train and test only have numerical features
    x_train4 <- x_train
    x_train4$rm <- as.numeric(x_train4$rm)
    x_test4 <- x_test
    x_test4$rm <- as.numeric(x_test4$rm)
    expect_type(make_dummies(traindata = x_train4, testdata = x_test4), "list")

    # 5) What if train and test only have categorical features
    x_train5 <- x_train
    x_train5 <- x_train5[, "rm", drop = FALSE]
    x_test5 <- x_test
    x_test5 <- x_test5[, "rm", drop = FALSE]
    expect_type(make_dummies(traindata = x_train5, testdata = x_test5), "list")

    # 6) What if test has the same levels as train but random ordering of levels
    x_train6 <- x_train
    x_train6$rm <- factor(x_train6$rm, levels = 4:9)
    x_test6 <- x_test
    x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6))
    expect_type(make_dummies(traindata = x_train6, testdata = x_test6), "list")

    # 7) What if test has different levels than train
    x_train7 <- x_train
    x_train7$rm <- factor(x_train7$rm, levels = 4:9)
    x_test7 <- x_test
    x_test7$rm <- factor(x_test7$rm, levels = 6:8)
    expect_error(make_dummies(traindata = x_train7, testdata = x_test7))

    # 8) What if train and test have different feature names
    x_train8 <- x_train
    x_test8 <- x_test
    names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2")
    expect_error(make_dummies(traindata = x_train8, testdata = x_test8))

    # 9) What if one variables has an empty name
    x_train9 <- x_train
    colnames(x_train9) <- c("", "rm", "dis", "indus")
    x_test9 <- x_test
    colnames(x_test9) <- c("", "rm", "dis", "indus")
    expect_error(make_dummies(traindata = x_train9, testdata = x_test9))

    # 10) What if traindata has a column that repeats
    x_train10 <- cbind(x_train, lstat = x_train$lstat)
    x_test10 <- cbind(x_test, lstat = x_test$lstat)
    expect_error(make_dummies(traindata = x_train10, testdata = x_test10))

    # 11) What if traindata has no column names
    x_train11 <- x_train
    colnames(x_train11) <- NULL
    x_test11 <- x_test
    colnames(x_test11) <- NULL
    expect_error(make_dummies(traindata = x_train11, testdata = x_test11))

    # 12 Test that traindata_new and testdata_new will be the same as the original
    # x_train and x_test. The only time this is different is if the levels of train
    # and test are different. See below.
    dummylist12 <- make_dummies(traindata = x_train, testdata = x_test)
    #
    expect_true(all(data.frame(dummylist12$traindata_new) == x_train))
    expect_true(all(levels(dummylist12$traindata_new$rm) == levels(x_train$rm)))
    expect_true(all(data.frame(dummylist12$testdata_new) == x_test))
    expect_true(all(levels(dummylist12$testdata_new$rm) == levels(x_test$rm)))


    # 13 Different levels same as check # 12
    #
    x_train13 <- x_train
    x_train13$rm <- factor(x_train13$rm, levels = 4:9)
    x_test13 <- x_test
    x_test13$rm <- factor(x_test13$rm, levels = c(8, 9, 7, 4, 5, 6))
    dummylist13 <- make_dummies(traindata = x_train13, testdata = x_test13)
    #
    expect_true(all(data.frame(dummylist13$traindata_new) == x_train13))
    expect_true(all(levels(dummylist13$traindata_new$rm) == levels(x_train13$rm)))
    expect_true(all(data.frame(dummylist13$testdata_new) == x_test13))
    # Important !!!!
    expect_false(all(levels(dummylist13$testdata_new$rm) == levels(x_test13$rm)))
  }
})

test_that("Test apply_dummies", {
  if (requireNamespace("MASS", quietly = TRUE)) {
    data("Boston", package = "MASS")
    x_var <- c("lstat", "rm", "dis", "indus")
    y_var <- "medv"

    x_train <- as.data.frame(Boston[401:411, x_var])
    y_train <- Boston[401:408, y_var]
    x_test <- as.data.frame(Boston[1:4, x_var])

    # convert to factors for illustrational purpose
    x_train$rm <- factor(round(x_train$rm))
    x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm))

    numeric_feat <- !sapply(x_train, is.factor)
    nb_numeric_feat <- sum(numeric_feat)

    dummylist <- make_dummies(traindata = x_train, testdata = x_test)

    x_test_dummies <- apply_dummies(feature_list = dummylist$feature_list, testdata = x_test)

    # Tests
    expect_type(x_test_dummies, "double")

    expect_equal(
      ncol(x_test_dummies),
      nb_numeric_feat + ncol(dummylist$feature_list$contrasts_list$rm)
    )

    # Test that make_dummies() and apply_dummies() gives the same output
    # for a given traindata and testdata
    expect_true(all(dummylist$test_dummies == x_test_dummies))

    # 1) What if you re-arrange the columns in x_train
    x_test1 <- x_test[, c(2, 3, 1, 4)]
    diff_column_placements <- apply_dummies(dummylist$feature_list, testdata = x_test1)
    expect_equal(colnames(diff_column_placements), colnames(x_test_dummies))

    # 2) What if you put in less features then the original traindata
    x_test2 <- x_test[, c(2, 1)]
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test2))

    # 3) What if you change the feature types of testdata
    x_test3 <- sapply(x_test, as.numeric)
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test3))

    # 4) What if you add a feature
    x_test4 <- cbind(x_train[, c(1, 2)], new_var = x_train[, 2], x_train[, c(3, 4)])
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test4))

    # 6) What if test has the same levels as train but random ordering of levels
    x_test6 <- x_test
    x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6))
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test6))

    # 7) What if test has different levels than train
    x_test7 <- x_test
    x_test7$rm <- factor(x_test7$rm, levels = 6:8)
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test7))

    # 8) What if train and test have different feature names
    x_test8 <- x_test
    names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2")
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test8))

    # 9) What if one variables has an empty name
    x_test9 <- x_test
    colnames(x_test9) <- c("", "rm", "dis", "indus")
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test9))

    # 10) What if traindata has a column that repeats
    x_test10 <- cbind(x_test, lstat = x_test$lstat)
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test10))

    # 11) What if testdata has no column names
    x_test11 <- x_test
    colnames(x_test11) <- NULL
    expect_error(apply_dummies(dummylist$feature_list, testdata = x_test11))
  }
})

Try the shapr package in your browser

Any scripts or data that you put into this service are public.

shapr documentation built on May 4, 2023, 5:10 p.m.