tests/testthat/test-summary-and-print.r

####################
# Author: James Hickey
#
# Series of tests to check print and summary methods
#
####################

context("Testing print.GBMFit helper functions")
test_that("print_perf_measures defaults to total number of iterations if train_fraction = 1 and fit is not cross-validated", {
  # Given a "correct" GBMFit object - train_fraction=1
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=1, is_verbose = FALSE)
  
  # When calculating print_perf_measures
  best_iter <- print_perf_measures(fit)
  
  # Then the result equals num_trees
  expect_equal(best_iter, params$num_trees)
  
})

test_that("print_perf_measures calculates the performance using test if train_fraction < 1", {
  # Given a "correct" GBMFit object - train_fraction < 1
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating print_perf_measures
  best_iter <- print_perf_measures(fit)
  
  # Then the result equals gbmt_performance with method="test"
  expect_equal(best_iter, gbmt_performance(fit, method="test"))
})
test_that("print_perf_measures returns the performance using cv if fit is cross-validated and train_fraction=1", {
  # Given a "correct" GBMFit object - train_fraction=1 and cv_folds > 1
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating print_perf_measures
  best_iter <- print_perf_measures(fit)
  
  # Then the result equals gbmt_performance with method="cv"
  expect_equal(best_iter, gbmt_performance(fit, method="cv"))
})
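
# Taken together, the three tests above pin down how print_perf_measures
# picks its performance method. A minimal sketch of that dispatch, written
# as a standalone illustration of the tested behaviour (an assumption about
# the shape of the logic, not gbm3's actual source): a fit trained on a
# strict subset of the data reports "test" performance, an otherwise
# cross-validated fit reports "cv", and a fit with neither falls back to
# the total number of trees.
perf_method_sketch <- function(train_fraction, is_cross_validated) {
  if (train_fraction < 1) {
    "test"
  } else if (is_cross_validated) {
    "cv"
  } else {
    NA_character_  # no held-out estimate: default to all num_trees iterations
  }
}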
test_that("print_iters_and_dist does not throw error when passed a GBMFit object", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating print_iters_and_dist 
  # Then no error is thrown
  expect_error(print_iters_and_dist(fit), NA)
})
test_that("print_confusion_matrix does not throw error when passed a GBMFit object", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating print_confusion_matrix
  # Then no error is thrown
  expect_error(print_confusion_matrix(fit), NA)
})
test_that("binary_response_conf_matrix does not throw error when called correctly", {
  # Given a "correct" GBMFit object - Bernoulli distribution
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating binary_response_conf_matrix
  # Then no error is thrown
  expect_error(binary_response_conf_matrix(fit$gbm_data_obj$y, fit$cv_fitted), NA)
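  
  # For orientation, a hand-rolled 2x2 confusion matrix from the same inputs
  # (a hedged sketch: thresholding at 0 assumes cv_fitted is on the link
  # scale; the cutoff binary_response_conf_matrix actually uses is not
  # pinned down by this test).
  conf_sketch <- table(observed = fit$gbm_data_obj$y,
                       predicted = as.integer(fit$cv_fitted > 0))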
})
test_that("pseudo_r_squared does not throw error when passed correct inputs", {
  # Given a "correct" GBMFit object - using Gauss
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating pseudo_r_squared 
  # Then no error is thrown
  expect_error(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, fit$distribution$name), NA)
})
test_that("pseudo_r_squared is same for all dists except Gaussian", {
  # Given a "correct" GBMFit object - NOT Gaussian
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When calculating pseudo_r_squared for Bernoulli
  r_sq <- pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, fit$distribution$name)
  
  # Then this value is the same for all dists except Gaussian
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "AdaBoost"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Bernoulli"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "CoxPH"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Gamma"), r_sq)
  expect_true(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Gaussian") != r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Huberized"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Laplace"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Pairwise"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Poisson"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Quantile"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "TDist"), r_sq)
  expect_equal(pseudo_r_squared(fit$gbm_data_obj$y, fit$cv_fitted, "Tweedie"), r_sq)
})
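
# The block of expectations above only holds if pseudo_r_squared branches on
# the distribution name in exactly one place: Gaussian versus everything
# else. A hedged sketch of that shape follows; both formulas are placeholders
# chosen purely to illustrate the branch structure, not gbm3's definitions.
pseudo_r_sq_shape <- function(y, fitted, dist_name) {
  if (dist_name == "Gaussian") {
    1 - sum((y - fitted)^2) / sum((y - mean(y))^2)  # placeholder Gaussian form
  } else {
    cor(y, fitted)^2  # a single shared form for every non-Gaussian name
  }
}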


context("Test Input Error checking - printing")
test_that("print_perf_measures throws error if passed an object other than GBMFit class", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When print_perf_measures is called on a fit missing the GBMFit class
  class(fit) <- ""
  
  # Then an error is thrown
  expect_error(print_perf_measures(fit))
})

test_that("print_iters_and_dist throws error if passed an object other than GBMFit class", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When print_iters_and_dist is called on a fit missing the GBMFit class
  class(fit) <- ""
  
  # Then an error is thrown
  expect_error(print_iters_and_dist(fit))
})

test_that("print_confusion_matrix throws error if passed an object other than GBMFit class", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When print_confusion_matrix is called on a fit missing the GBMFit class
  class(fit) <- ""
  
  # Then an error is thrown
  expect_error(print_confusion_matrix(fit))
})

test_that("summary.GBMFit throws error if cBars is not a whole number > 1", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When summary method is called with cBars not a whole number > 1
  # Then an error is thrown
  expect_error(summary(fit, cBars=c(TRUE, FALSE)))
  expect_error(summary(fit, cBars=1.3))
  expect_error(summary(fit, cBars="What"))
  expect_error(summary(fit, cBars=Inf))
  expect_error(summary(fit, cBars=NULL))
})

test_that("summary.GBMFit throws error if num_trees is not a whole number > 1", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When summary method is called with num_trees not a whole number > 1
  # Then an error is thrown
  expect_error(summary(fit, num_trees=c(TRUE, FALSE)))
  expect_error(summary(fit, num_trees=1.3))
  expect_error(summary(fit, num_trees="What"))
  expect_error(summary(fit, num_trees=Inf))
  expect_error(summary(fit, num_trees=NULL))
})
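
# The cBars and num_trees tests above exercise the same guard. A sketch of a
# validator that would satisfy those expectations (illustrative only; gbm3's
# actual check may differ, and the "> 1" boundary named in the test titles is
# never itself exercised here):
is_whole_count_sketch <- function(x) {
  is.numeric(x) && length(x) == 1 && is.finite(x) && x == floor(x) && x > 1
}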

test_that("summary.GBMFit throws error if plot_it is not a logical", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  
  # When summary method is called with plot_it not a logical
  # Then an error is thrown
  expect_error(summary(fit, plot_it=c(TRUE, FALSE)))
  expect_error(summary(fit, plot_it=1))
  expect_error(summary(fit, plot_it="What"))
  expect_error(summary(fit, plot_it=Inf))
  expect_error(summary(fit, plot_it=NULL))
})

test_that("summary.GBMFit throws error if normalize is not a logical", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
 
  
  # When summary method is called with normalize not a logical
  # Then an error is thrown
  expect_error(summary(fit, normalize=c(TRUE, FALSE)))
  expect_error(summary(fit, normalize=1))
  expect_error(summary(fit, normalize="What"))
  expect_error(summary(fit, normalize=Inf))
  expect_error(summary(fit, normalize=NULL))
})
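
# Likewise, the plot_it and normalize tests imply each argument must be a
# single non-missing logical. A sketch of that guard (again an illustration,
# not gbm3's source):
is_single_flag_sketch <- function(x) {
  is.logical(x) && length(x) == 1 && !is.na(x)
}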

context("Test print.GBMFit")
test_that("Print method on GBMFit object runs without error", {
  # Given some "correct" GBMFit objects
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  fit2 <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
               train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=1, is_verbose = FALSE)
  
  # When print method is called
  # Then runs without error
  expect_error(print(fit), NA)
  expect_error(print(fit2), NA)
})
context("Test summary.GBMFit")
test_that("Summary method on GBMFit object runs without error", {
  # Given some "correct" GBMFit objects
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  fit2 <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=1, is_verbose = FALSE)

  # When summary method is called
  # Then runs without error
  expect_error(summary(fit), NA)
  expect_error(summary(fit2), NA)
})

test_that("Summary method returns data.frame of variables and relative influences", {
  # Given some "correct" GBMFit objects
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  fit2 <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
               train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=1, is_verbose = FALSE)
  
  # When summary method is called
  summary_fit_1 <- summary(fit)
  summary_fit_2 <- summary(fit2)
  
  # Then summary method returns a data.frame of variables and relative
  # influences, with columns "var" and "rel_inf"
  expect_true(is.data.frame(summary_fit_1))
  expect_true(is.data.frame(summary_fit_2))
  expect_equal(names(summary_fit_1), c("var", "rel_inf"))
  expect_equal(names(summary_fit_2), c("var", "rel_inf"))
})

test_that("Summary method returns variables and relative influence ordered by relative influence in descending order", {
  # Given a "correct" GBMFit object
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  offset <- rep(0, N)
  
  # Set up for new API
  params <- training_params(num_trees=30, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3)
  dist <- gbm_dist("Bernoulli")
  fit <- gbmt(Y~X1+X2+X3, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0), keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE)
  
  # When summary method is called
  summary_fit_1 <- summary(fit)
  
  # Then the variables are ordered in terms of descending relative influence
  rel_inf <- relative_influence(fit, gbmt_performance(fit, method="cv"))
  expect_equal(fit$variables$var_names[order(-rel_inf)], summary_fit_1$var)
})