tests/testthat/test-interact.r

####################
# Author: James Hickey
#
# Series of tests checking the calculation of variable
# interaction strengths
#
####################
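
####################
# Background: interact() estimates Friedman and Popescu's H-statistic
# for the strength of variable interactions (Friedman & Popescu 2008,
# "Predictive learning via rule ensembles"). For a pair of variables
# j and k it is, roughly,
#
#   H_jk^2 = sum_i [F_jk(x_i) - F_j(x_i) - F_k(x_i)]^2 / sum_i F_jk(x_i)^2
#
# where F_jk, F_j and F_k are centered partial-dependence functions.
# H is near 0 when the variables do not interact and approaches 1 when
# their joint effect is dominated by the interaction, which is why the
# final test below expects a value below 1.
####################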

context("Testing help functions - Interact")
test_that("check_and_set_num_trees throws a warning if given a vector", {
  # Given a vector for num_trees and a fake fit object
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- c(200, 500)
  
  # When checking and setting the number of trees
  # Then a warning is thrown
  expect_warning(check_and_set_num_trees(gbm_fake_fit, num_trees), 
                 "length num_trees > 1: using first element")
})
test_that("check_and_set_num_trees throws an error if not given natural number for num_trees", {
  # Given a fake fit object and a num_trees which isn't a positive integer
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- "wrong"
  
  # When checking and setting the number of trees
  # Then an error is thrown
  expect_error(check_and_set_num_trees(gbm_fake_fit, num_trees))
})
test_that("check_and_set_num_trees throws a warning if number of trees exceeds those in original fit", {
  # Given a fake fit object and num_trees > those fitted
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- 5000
  
  # When checking and setting the number of trees
  # Then a warning is thrown
  expect_warning(check_and_set_num_trees(gbm_fake_fit, num_trees), 
                 paste("num_trees exceeds the number of trees in the model, ",
                       gbm_fake_fit$params$num_trees,". Using ", gbm_fake_fit$params$num_trees, " trees.", 
                       sep = ""))
  
})
test_that("check_and_set_num_trees returns 1st element of vector is < those in original fit", {
  # Given a vector for num_trees and a fake fit object
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- c(200, 500)
  
  # When checking and setting the number of trees
  # Then returns first element of vector
  expect_equal(check_and_set_num_trees(gbm_fake_fit, num_trees), 
                num_trees[1])
})
test_that("check_and_set_num_trees returns those in original fit if number specified exceeds total in fit", {
  # Given a fake fit object and num_trees > those fitted
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- 5000
  
  # When checking and setting the number of trees
  # Then returns number of trees in fit
  expect_equal(check_and_set_num_trees(gbm_fake_fit, num_trees), gbm_fake_fit$params$num_trees)
})
test_that("check_and_set_num_trees returns argument num_trees if it is a positive integer <= the total num_trees in original fit", {
  # Given a fake fit object and num_trees < those fitted
  gbm_fake_fit <- list()
  gbm_fake_fit$params$num_trees <- 2000
  num_trees <- 100
  
  # When checking and setting the number of trees
  # Then returns the num_trees passed in
  expect_equal(check_and_set_num_trees(gbm_fake_fit, num_trees), num_trees)
})
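# Taken together, the tests above pin down the contract of
# check_and_set_num_trees. A minimal sketch consistent with these tests
# (a hypothetical stand-in for illustration only, NOT the package
# implementation; the *_sketch name is invented):
check_and_set_num_trees_sketch <- function(gbm_fit_obj, num_trees) {
  if (length(num_trees) > 1) {
    warning("length num_trees > 1: using first element")
    num_trees <- num_trees[1]
  }
  # reject anything that is not a single natural number
  if (!is.numeric(num_trees) || num_trees != as.integer(num_trees) ||
      num_trees < 1) {
    stop("num_trees must be a natural number")
  }
  # cap at the number of trees actually fitted
  if (num_trees > gbm_fit_obj$params$num_trees) {
    warning(paste0("num_trees exceeds the number of trees in the model, ",
                   gbm_fit_obj$params$num_trees, ". Using ",
                   gbm_fit_obj$params$num_trees, " trees."))
    num_trees <- gbm_fit_obj$params$num_trees
  }
  num_trees
}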
test_that("check_and_set_variables_indices throws an error if all strings but not all in fit's variable names", {
  # Given a fake fit object and a character vector with unrecognised variables
  gbm_fake_fit <- list()
  gbm_fake_fit$variables$var_names <- c("Var1", "Var2", "Var3")
  var_indices <- "Var4"
  
  # When checking and setting variable indices
  # Then an error is thrown
  expect_error(check_and_set_variables_indices(gbm_fake_fit, var_indices))
})
test_that("check_and_set_variables_indices throws a warning if indices not between 1 and the number of variables in fit", {
  # Given a fake fit object and index outside range
  gbm_fake_fit <- list()
  gbm_fake_fit$variables$var_names <- c("Var1", "Var2", "Var3")
  var_indices <- 4
  
  # When checking and setting variable indices
  # Then a warning will be thrown
  expect_warning(check_and_set_variables_indices(gbm_fake_fit, var_indices),
                 paste0("variables_indices must be between 1 and ", 
                        length(gbm_fake_fit$variables$var_names)))
})
test_that("check_and_set_variables_indices converts character vector input to integers", {
  # Given a fake fit object and variable indices that are strings
  gbm_fake_fit <- list()
  gbm_fake_fit$variables$var_names <- c("Var1", "Var2", "Var3")
  var_indices <- c("Var1", "Var2")
  
  # When checking and setting variable indices
  # Then the indices are converted to appropriate integers
  expect_equal(check_and_set_variables_indices(gbm_fake_fit, var_indices),
               c(1, 2))
})
test_that("check_and_set_variables_indices returns original vector if positive integers", {
  # Given a fake fit object and correct indices
  gbm_fake_fit <- list()
  gbm_fake_fit$variables$var_names <- c("Var1", "Var2", "Var3")
  var_indices <- c(2, 3)
  
  # When checking and setting variable indices
  # Then the original indices are returned
  expect_equal(check_and_set_variables_indices(gbm_fake_fit, var_indices),
               var_indices)
})
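# A similar hypothetical sketch for check_and_set_variables_indices,
# again derived only from the tests above (invented name, not the
# package code):
check_and_set_variables_indices_sketch <- function(gbm_fit_obj, var_indices) {
  var_names <- gbm_fit_obj$variables$var_names
  if (is.character(var_indices)) {
    # map variable names to their positions, erroring on unknown names
    if (!all(var_indices %in% var_names)) {
      stop("var_indices contains variables not in the fit")
    }
    var_indices <- match(var_indices, var_names)
  } else if (any(var_indices < 1 | var_indices > length(var_names))) {
    warning(paste0("variables_indices must be between 1 and ",
                   length(var_names)))
  }
  var_indices
}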
test_that("table_of_unique_values works correctly", {
  # Given data and correct variable indices
  N <- 100
  data <- data.frame("Var1"=rnorm(N), "Var2"=sample(1:5, N, replace=TRUE),
                     "Var3"=rnorm(N))
  var_indices <- c(1, 2)
  
  # When table_of_unique_values is called
  output <- table_of_unique_values(data, var_indices)
  
  # Then correct output produced
  corr_out <- unique(data[, var_indices, drop=FALSE])
  corr_out$num_levels_factors <-
    table(factor(apply(data[, var_indices, drop=FALSE], 1, paste, collapse="\r"),
                 levels=apply(corr_out, 1, paste, collapse="\r")))
  expect_equal(output, corr_out)
})
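# A note on the "\r" trick above: collapsing each row into a single
# string with an unlikely separator gives one key per unique value
# combination, so table() can count how often each combination occurs.
# A minimal illustration (the tiny data frame is invented for this
# sketch):
test_that("row-collapse keys count duplicate rows", {
  df <- data.frame(a = c(1, 1, 2), b = c("x", "x", "y"))
  keys <- apply(df, 1, paste, collapse = "\r")
  expect_equal(as.vector(table(keys)[unique(keys)]), c(2L, 1L))
})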
test_that("output of compute_preds_for_all_var_combinations is a list with the correct fields", {
  # Given data, a fit object, variable indices and all combinations of variables
  var_indices <- c(1, 2, 3)
  all_combinations_vars <- apply(expand.grid(rep(list(c(FALSE,TRUE)), length(var_indices)))[-1,], 1,
                                 function(x) as.numeric(which(x)))
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating predictions for all combinations of variables
  # Then has correct fields
  expect_equal(names(compute_preds_for_all_var_combinations(data, gbm_fit_obj,
                                                            all_combinations_vars,
                                                            var_indices, 1000)[[1]]),
               c("data", "num_levels_factors", "preds", "sign"))
})
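# (A guess at the field semantics, based on the H-statistic formula in
# the header rather than on the implementation: "data" holds the unique
# value combinations, "num_levels_factors" their occurrence counts,
# "preds" the partial-dependence predictions for that variable subset,
# and "sign" the subset's inclusion-exclusion sign.)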
test_that("output of compute_pred_for_all_var_combinations is of the correct size", {
  # Given data, a fit object, variable indices and all combinations of variables
  var_indices <- c(1, 2, 3)
  all_combinations_vars <- apply(expand.grid(rep(list(c(FALSE,TRUE)), length(var_indices)))[-1,], 1,
                                 function(x) as.numeric(which(x)))
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating predictions for all combinations of variables
  # Then has correct size
  expect_equal(length(compute_preds_for_all_var_combinations(data, 
                                                             gbm_fit_obj,
                                                             all_combinations_vars, 
                                                             var_indices, 
                                                             num_trees = 1000)),
               length(all_combinations_vars))
})
test_that("sign field of output of compute_pred_for_all_var_combinations consists of +/-1s", {
  # Given data, a fit object, variable indices and all combinations of variables
  var_indices <- c(1, 2, 3)
  all_combinations_vars <- apply(expand.grid(rep(list(c(FALSE,TRUE)), length(var_indices)))[-1,], 1,
                                 function(x) as.numeric(which(x)))
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating predictions for all combinations of variables
  preds_for_combs <- compute_preds_for_all_var_combinations(data, 
                                                            gbm_fit_obj,
                                                            all_combinations_vars, 
                                                            var_indices, 
                                                            num_trees = 1000)
  # Then the sign field consists of +/-1s
  expect_true(all(preds_for_combs[[1]]$sign %in% c(1, -1)))
})
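# The +/-1 signs are consistent with the inclusion-exclusion expansion
# of the H-statistic numerator: subsets whose size has the same parity
# as the full variable set would enter with +1 and the rest with -1.
# That reading is an assumption from the Friedman & Popescu formula;
# the test itself only checks that each sign is +1 or -1.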

context("testing interact.GBMFit")
test_that("if data not a data.frame or matrix then an error is thrown", {
  # Given data and a fit object 
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating the model interactions with data
  # not in data frame or matrix
  # Then an error is thrown
  expect_error(interact(gbm_fit_obj, as.list(data)), 
               "data argument should be a data.frame or matrix")
})
test_that("if var_indices not a vector of characters or integers then an error is thrown", {
  # Given data and a fit object 
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating interactions using var_indices that aren't a
  # vector of integers or characters
  var_ind1 <- Inf
  var_ind2 <- NA
  var_ind3 <- c(1, "Hello")
  
  # Then error is thrown
  expect_error(interact(gbm_fit_obj, data, var_ind1), 
               "Variables indices must be a vector of integers or characters")
  expect_error(interact(gbm_fit_obj, data, var_ind2), 
               "Variables indices must be a vector of integers or characters")
  expect_error(interact(gbm_fit_obj, data, var_ind3), 
               "Variables indices must be a vector of integers or characters")
})
test_that("Error thrown if length var_indices exceeds interaction depth of fit", {
  # Given data and a fit object 
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                 # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When evaluating the interactions for more variables than the fit's interaction depth
  # Then an error is thrown
  expect_error(interact(gbm_fit_obj, data, 1:4))
})
test_that("Given correct inputs will return H-statistic < 1", {
  # Given data and a fit object 
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm_fit_obj <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
                     data=data,                   # dataset
                     var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                     distribution="Gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                     # list(name="quantile",alpha=0.05) for quantile regression
                     n.trees=2000,                # number of trees
                     shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                     interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc
                     bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                     train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                     mFeatures = 3,               # Number of features to consider at each node.
                     n.minobsinnode = 10,         # minimum number of obs needed in each node
                     keep.data=TRUE,
                     cv.folds=10,                 # do 10-fold cross-validation
                     verbose = FALSE)             # don't print progress
  
  # When calculating the interactions 
  int_stats <- interact(gbm_fit_obj, data=data, var_indices=1:2)
  
  # Then the H-statistic is a double and < 1
  expect_true(is.double(int_stats))
  expect_true(int_stats < 1)
})