test_gpe.R
In pre: Prediction Rule Ensembles

context("Tests the gpe function, gpe base learner functions and gpe sampling function")

test_that("gpe works with default settings and gives previous results", {
  #####
  # Continous outcome
  set.seed(8782650)
  fit <- gpe(
    Ozone ~ ., 
    data = airquality,
    base_learners = list(
      gpe_trees(ntrees = 10)))
  
  fit <- fit$glmnet.fit$glmnet.fit
  fit <- fit[c("a0", "beta")]
  fit$beta <- fit$beta[1:10, 1:10]
  
  # save_to_test(fit, "gpe_fit1")
  expect_equal(fit, read_to_test("gpe_fit1"), tolerance = 1.49e-08)
  
  #####
  # Binary outcome
  fit <- gpe(diabetes ~ ., PimaIndiansDiabetes, 
              base_learners = list(gpe_linear(), gpe_trees(ntrees = 10)))

  fit <- fit$glmnet.fit$glmnet.fit
  fit <- fit[c("a0", "beta")]
  fit$beta <- fit$beta[, 1:10]
  
  # save_to_test(fit, "gpe_fit1_binary")
  expect_equal(fit, read_to_test("gpe_fit1_binary"))
})

test_that("gpe works with factor predictor and gpe_linear", {
  dat <- data.frame(
    y = 1:30, 
    x = factor(rep(1:3, 10)), 
    z = factor(rep(1:2, 15)))
  
  expect_silent(fit <- gpe(y ~ ., dat, base_learners = list(gpe_linear())))
  expect_equal(
    row.names(fit$glmnet.fit$glmnet.fit$beta),
    c("(Intercept)", "lTerm(x == \"2\", scale = 0.48)", 
      "lTerm(x == \"3\", scale = 0.48)", "lTerm(z == \"2\", scale = 0.51)"))
})

test_that("Sampling and subsampling works and is used in gpe", {
  #####
  # Bootstrap sample
  set.seed(seed <- 9495688)
  func <- gpe_sample(1)
  out <- func(100, rep(1, 100))
  
  expect_true(any(!1:100 %in% out))
  expect_length(out, 100)
  
  #####
  # Subsampling with lower fraction
  func <- gpe_sample(.5)
  out <- func(100, rep(1, 100))
  
  expect_true(any(!1:100 %in% out))
  expect_true(!any(table(out) > 1))
  expect_length(out, 50)
  
  #####
  # Bootstrap with weights
  func <- gpe_sample(1)
  ws <- rep(1, 100)
  ws[1] <- 100
  
  out <- func(100, ws)
  expect_true(names(sort(table(out),decreasing=TRUE))[1] == "1")
  expect_length(out, 100)
  
  #####
  # Bootstrap is used with a lower fraction and unequal weights
  func <- gpe_sample(.5)
  expect_message(
    out <- func(100, ws),
    "Some weights do not match. Bootstrap will be used instead of subsampling to reflect weights")
  
  # should only show the message once
  expect_silent(out <- func(100, ws))
                 
  expect_true(names(sort(table(out),decreasing=TRUE))[1] == "1")
  expect_length(out, 50)
})

test_that("gpe_trees gives previous results for continous outcomes", {
  # partykit-1.2.0 cannot handle transformation in formulas in the development
  # version. Thus, we "cut" Wind prior to get a factor
  airquality$Wind_cut <- cut(airquality$Wind, breaks = 3)
  on.exit(airquality$Wind_cut <- NULL)
  
  #####
  # Default settings with fewer trees
  func <- gpe_trees(ntrees = 10)
  
  args <- list(
    formula = Ozone ~ Solar.R + Wind + Wind_cut,
    data = airquality,
    weights = rep(1, nrow(airquality)),
    sample_func = gpe_sample(.5), 
    family = "gaussian")
  
  set.seed(seed <- 6772769)
  out <- do.call(func, c(args))
  
  expect_known_value(out, "previous_results/gpe_tree_1.RDS", update = FALSE)

  #####  
  # Only use stumps
  func <- gpe_trees(ntrees = 10, maxdepth = 1)
  
  set.seed(seed)
  out2 <- do.call(func, args)
  
  expect_lt(length(out2), length(out))
  expect_true(!any(grepl("\\$", out2)))
  # save_to_test(out2, "gpe_tree_2")
  expect_equal(out2, read_to_test("gpe_tree_2"))
  
  #####
  # Without learning rate
  func <- gpe_trees(ntrees = 10, learnrate = 0)
  
  set.seed(seed)
  out3 <- do.call(func, args)
  
  expect_known_value(out3, "previous_results/gpe_tree_3.RDS", update = FALSE)
  
  #####
  # Without removal of duplicates and complements
  func <- gpe_trees(ntrees = 10, remove_duplicates_complements = FALSE)
  
  set.seed(seed)
  out4 <- do.call(func, args)
  
  expect_gt(length(out4), length(out))
  expect_true(all(out %in% out4))
  
  # save_to_test(out4, "gpe_tree_4")
  expect_equal(out4, read_to_test("gpe_tree_4"))
})

test_that("gpe_trees gives previous results for binary outcomes", {
  #####
  # Works with binomial outcome
  func <- gpe_trees(ntrees = 10)
  
  args <- list(
    formula = diabetes ~ ., 
    data = PimaIndiansDiabetes,
    weights = rep(1, nrow(PimaIndiansDiabetes)),
    sample_func = gpe_sample(), 
    family = "binomial")
  
  set.seed(seed <- 8779606)
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_tree_binary_1")
  expect_equal(out, read_to_test("gpe_tree_binary_1"))
  
  func <- gpe_trees(ntrees = 10, use_grad = FALSE)
  
  set.seed(seed)
  out2 <- do.call(func, args)
  
  expect_true(any(!out2 %in% out))
  
  # save_to_test(out2, "gpe_tree_binary_1_w_glm")
  expect_equal(out2, read_to_test("gpe_tree_binary_1_w_glm"))
  
  #####
  # Binary without learning rate
  func <- gpe_trees(ntrees = 10, learnrate = 0)
  
  set.seed(seed)
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_tree_binary_2")
  expect_equal(out, read_to_test("gpe_tree_binary_2"))
})

test_that("gpe_earth gives expected_result for continous outcomes", {
  ##### 
  # With defaults settings
  func <- gpe_earth()
  
  args <- list(
    formula = Ozone ~ Solar.R + Wind + 
      cut(Wind, breaks = 5), # Notice that we include a factor. There where  
                             # some issues with factor at one point 
    data = airquality,
    weights = rep(1, sum(complete.cases(airquality))),
    sample_func = gpe_sample(.5), 
    family = "gaussian")
  
  set.seed(seed <- 6817505)
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_earth_1")
  expect_equal(out, read_to_test("gpe_earth_1"))
  
  ##### 
  # Additive model
  func <- gpe_earth(degree = 1)
  
  set.seed(seed)
  out2 <- do.call(func, args)
  
  # Match any string not containing *
  expect_match(out2, "^[^\\*]+$", perl = TRUE)
  # save_to_test(out2, "gpe_earth_2")
  expect_equal(out2, read_to_test("gpe_earth_2"))
  
  ##### 
  # Without high learning rate
  func <- gpe_earth(learnrate = 1)
  
  set.seed(seed)
  out3 <- do.call(func, args)
  
  expect_true(length(setdiff(out3, out)) > 0)
  # save_to_test(out, "gpe_earth_3")
  expect_equal(out, read_to_test("gpe_earth_3"))
  
  #####
  # With different number of end nodes
  func <- gpe_earth(nk = 5, ntrain = 1)
  
  set.seed(seed)
  out4 <- do.call(func, args)
  expect_length(out4, 4) # 5 - 1 for intercept
  
  func <- gpe_earth(nk = 50, ntrain = 1)
  set.seed(seed)
  out5 <- do.call(func, args)
  expect_gt(length(out5), length(out4))
  
  #####
  # Without standardizing
  func <- gpe_earth(normalize = FALSE)
  
  set.seed(seed)
  out6 <- do.call(func, args)
  
  expect_true(!any(grepl("scale\\ ?=", out6, perl = TRUE)))
  # save_to_test(out6, "gpe_earth_6")
  expect_equal(out6, read_to_test("gpe_earth_6"))
  
  #####
  # No using threshold gives different results
  
  func <- gpe_earth(cor_thresh = 1.01)
  
  set.seed(seed)
  out7 <- do.call(func, args)
  
  expect_gt(length(out7), length(out))
  
  # Should give the same
  func <- gpe_earth(cor_thresh = NULL)
  
  set.seed(seed)
  out8 <- do.call(func, args)
  
  expect_equal(out8, out7)
  
  # save_to_test(out8[1:25], "gpe_earth_6.1")
  expect_equal(out8[1:25], read_to_test("gpe_earth_6.1"))
  
  ######
  # Continous outcome with defaults with two factors
  func <- gpe_earth(ntrain = 10)
  
  args <- list(
    formula = Ozone ~ Solar.R + Wind + 
      cut(Wind, breaks = 5) + # Notice that we include a factor. There where  
      cut(Temp, breaks = 5),  # some issues with factor at one point 
                                
    data = airquality,
    weights = rep(1, sum(complete.cases(airquality))),
    sample_func = gpe_sample(.5), 
    family = "gaussian")
  
  set.seed(seed)
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_earth_7")
  expect_equal(out, read_to_test("gpe_earth_7"))
})

test_that("gpe_earth gives previous results for binary outcomes", {
  #####
  # With learning rate
  func <- gpe_earth(ntrain = 20)
  
  args <- list(
    formula = diabetes ~ ., 
    data = PimaIndiansDiabetes,
    weights = rep(1, nrow(PimaIndiansDiabetes)),
    sample_func = gpe_sample(), 
    family = "binomial")
  
  set.seed(seed <- 1067229)
  expect_message(out <- do.call(func, args), 
                 "Beware that gpe_earth will use gradient boosting")
  
  # save_to_test(out, "gpe_earth_binary")
  expect_equal(out, read_to_test("gpe_earth_binary"))
  
  #####
  # Without learning rate
  func <- gpe_earth(ntrain = 20, learnrate = 0)

  set.seed(seed)
  expect_message(out1 <- do.call(func, args), 
                 "Beware that gpe_earth will use L2 loss to train")
  
  expect_true(any(!out1 %in% out))
  
  # save_to_test(out1, "gpe_earth_binary_no_learn")
  expect_equal(out1, read_to_test("gpe_earth_binary_no_learn"))
})

test_that("gpe_earth works with ordered factors and does not alter contrast options", {
  set.seed(1660180)
  n <- 100
  dat_frame <- data.frame(
    y = rnorm(n),
    # Ordered factor
    x1 = as.ordered(sample.int(15, n, replace = TRUE)))
  
  old <- c('contr.treatment', 'contr.poly')
  options (contrasts = old)
  tmp <- gpe_earth(ntrain = 5)
  
  fit <- tmp(
    formula = y ~ .,
    data = dat_frame,
    weights = rep(1, n),
    sample_func = gpe_sample(.5), 
    family = "gaussian")
  
  expect_equal(getOption("contrasts"), old)
})

test_that("gpe_linear gives expected_result", {
  func <- gpe_linear()
  
  args <- list(
    formula = Ozone ~ Solar.R + Wind + cut(Wind, breaks = 3), 
    data = airquality,
    weights = rep(1, sum(complete.cases(airquality))))
  
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_linear_1")
  expect_equal(out, read_to_test("gpe_linear_1"), tolerance = 1.490116e-08)
  
  # No winsorization
  func <- gpe_linear(winsfrac = 0)
  
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_linear_2")
  expect_equal(out, read_to_test("gpe_linear_2"), tolerance = 1.490116e-08)
  
  # No scaling
  func <- gpe_linear(normalize = F)
  
  out <- do.call(func, args)
  
  # save_to_test(out, "gpe_linear_3")
  expect_equal(out, read_to_test("gpe_linear_3"), tolerance = 1.490116e-08)
  
  # do not do either
  func <- gpe_linear(normalize = F, winsfrac = 0)
  
  out <- do.call(func, args)
  
  expect_equal(out[1:2], c("lTerm(Solar.R)", "lTerm(Wind)"))
})

test_that("gpe_linear returns terms for factor levels", {
  set.seed(1660180)
  n <- 20
  dat_frame <- data.frame(
    y = rnorm(n),
    x1 = factor(sample.int(4, n, replace = TRUE)))
  
  tmp <- gpe_linear()
  out <- tmp(y ~ x1, dat_frame)
  
  expect_length(out, 3)
  expect_equal(out, c(
    "lTerm(x1 == '2', scale = 0.41)", "lTerm(x1 == '3', scale = 0.5)",
    "lTerm(x1 == '4', scale = 0.41)"))
  
  # Also works w/ ordered factors
  dat_frame$x1 <- ordered(dat_frame$x1)
  
  out <- tmp(y ~ x1, dat_frame)
  
  expect_length(out, 3)
})

test_that("eTerm works for logicals", expect_length(eTerm(c(F, T, T, T)), 4))

test_that("get_cv.glmnet_args works", {
  #####
  # Not changing defaults works
  get_cv.glmnet_args <- with(environment(gpe), get_cv.glmnet_args)
  
  no_args <- get_cv.glmnet_args(list(), c(), c(), c(), "boh")
  w_args <- get_cv.glmnet_args(list(xyz = 1), c(), c(), c(), "boh")
  
  expect_equal(no_args, w_args[names(w_args) != "xyz"])
  
  #####
  # Changing defaults works
  
  change_def <- get_cv.glmnet_args(list(parallel = T), c(), c(), c(), "boh")
  expect_equal(no_args[names(no_args) != "parallel"], 
               change_def[names(change_def) != "parallel"])
  expect_false(no_args$parallel == change_def$parallel)
})

test_that("gpe_cv.glmnet gives same results as cv.glmnet", {
  # Create dummy data
  set.seed(3782347)
  X <- rnorm(90)
  dim(X) <- c(30, 3)
  y <- rpois(30, 1)
  ws <- runif(30)
  
  get_cv.glmnet_args <- with(environment(gpe), get_cv.glmnet_args)
  
  #####
  # fit with defaults
  def <- get_cv.glmnet_args(
    args = list(), x = X, y = y, weights = ws, family = "poisson")
  set.seed(seed <- 9629006)
  f1 <- do.call(glmnet::cv.glmnet, def)
  set.seed(seed)
  f2 <- gpe_cv.glmnet()(x = X, y = y, weights = ws, family = "poisson")
  
  expect_equal(f1, f2)
  
  ######
  # change lambda argument
  def <- get_cv.glmnet_args(
    args = list(lambda = c(.1, .01)), 
    x = X, y = y, weights = ws, family = "poisson")
  set.seed(seed)
  f1 <- do.call(glmnet::cv.glmnet, def)
  set.seed(seed)
  f2 <- gpe_cv.glmnet(lambda =  c(.1, .01))(
    x = X, y = y, weights = ws, family = "poisson")
  
  expect_equal(f1, f2)
})