# tests/testthat/test-plotting.r

####################
# Author: James Hickey
#
# Series of tests to check if gbm plotting functions run correctly
#
####################
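
# Each test below regenerates the same Gaussian toy dataset inline.
# A helper along these lines could factor that setup out -- a sketch only,
# with a hypothetical name, deliberately not wired into the tests below:
make_gaussian_test_data <- function(N = 100, SNR = 10, seed = 1) {
  set.seed(seed)
  X1 <- runif(N)
  X2 <- 2 * runif(N)
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
  X5 <- factor(sample(letters[1:3], N, replace = TRUE))
  X6 <- 3 * runif(N)
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
  
  # response, with noise added at the given signal-to-noise ratio
  Y <- X1^1.5 + 2 * (X2^0.5) + mu
  Y <- Y + rnorm(N, 0, sqrt(var(Y) / SNR))
  
  # knock out a quarter of X1 and X3 to exercise missing-value handling
  X1[sample(1:N, size = floor(N/4))] <- NA
  X3[sample(1:N, size = floor(N/4))] <- NA
  
  data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
}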

context("Testing performance plotting")
test_that("perf_plot runs with all perf methods", {
  # Given a fit object and perf evaluated with all methods
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  best_iter_t <- best_iter_test(fit)
  best_iter_c <- best_iter_cv(fit)
  best_iter_oo <- best_iter_out_of_bag(fit)
  
  # When calling perf_plot
  # using all 3 performance methods ("test", "cv", "OOB")
  # Then no error is thrown
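  # (The positional arguments to perf_plot here are taken, in order, to be:
  # gbm_fit_obj, best_iter, out_of_bag_curve, overlay, method and main --
  # inferred from the named-argument calls in the tests further down.)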
  expect_error(perf_plot(fit, best_iter_t, FALSE, FALSE, "test", "main"), NA)
  expect_error(perf_plot(fit, best_iter_t, FALSE, TRUE, "test", "main"), NA)
  expect_error(perf_plot(fit, best_iter_t, TRUE, FALSE, "test", "main"), NA)
  expect_error(perf_plot(fit, best_iter_t, TRUE, TRUE, "test", "main"), NA)
  
  expect_error(perf_plot(fit, best_iter_c, FALSE, FALSE, "cv", "main"), NA)
  expect_error(perf_plot(fit, best_iter_c, FALSE, TRUE, "cv", "main"), NA)
  expect_error(perf_plot(fit, best_iter_c, TRUE, FALSE, "cv", "main"), NA)
  expect_error(perf_plot(fit, best_iter_c, TRUE, TRUE, "cv", "main"), NA)
    
  expect_error(perf_plot(fit, best_iter_oo, FALSE, FALSE, "OOB", "main"), NA)
  expect_error(perf_plot(fit, best_iter_oo, FALSE, TRUE, "OOB", "main"), NA)
  expect_error(perf_plot(fit, best_iter_oo, TRUE, FALSE, "OOB", "main"), NA)
  expect_error(perf_plot(fit, best_iter_oo, TRUE, TRUE, "OOB", "main"), NA)
})
test_that("perf_plot throws error if gbm_fit_obj is not of class GBMFit", {
  # Given a fit object and perf evaluated with all methods
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  best_iter_t <- best_iter_test(fit)

  # When calling perf_plot on a fit whose class is no longer GBMFit
  class(fit) <- "Wrong"
  
  # Then an error is thrown
  expect_error(perf_plot(fit, best_iter_t, FALSE, FALSE, "test", "main"))
})
test_that("perf_plot throws error if out_of_bag_curve is not logical or is na", {
  # Given a fit object and perf evaluated with cv method
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  best_iter_c <- best_iter_cv(fit)
  
  # When calling perf_plot with out_of_bag_curve NA or not logical
  # Then an error is thrown
  expect_error(perf_plot(fit, best_iter_c, out_of_bag_curve=NA, FALSE, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, out_of_bag_curve=c(TRUE, FALSE), FALSE, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, out_of_bag_curve=1.5, FALSE, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, out_of_bag_curve="", FALSE, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, out_of_bag_curve=NaN, FALSE, "test", "main"))
  
})
test_that("perf_plot throws error if overlay is not logical or is na", {
  # Given a fit object and perf evaluated with cv method
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  best_iter_c <- best_iter_cv(fit)
  
  # When calling perf_plot with overlay NA or not logical
  # Then an error is thrown
  expect_error(perf_plot(fit, best_iter_c, FALSE, overlay=NA, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, FALSE, overlay=c(TRUE, FALSE), "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, FALSE, overlay=1.5, "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, FALSE, overlay="", "test", "main"))
  expect_error(perf_plot(fit, best_iter_c, FALSE, overlay=NaN, "test", "main"))
})
test_that("get_ylabel returns correct string for distribution objects", {
  # Given distribution objects
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("Bernoulli")
  dist_3 <- gbm_dist("CoxPH")
  dist_4 <- gbm_dist("Gamma")
  dist_5 <- gbm_dist("Gaussian")
  dist_6 <- gbm_dist("Huberized")
  dist_7 <- gbm_dist("Laplace")
  dist_8_a <- gbm_dist("Pairwise", metric="conc")
  dist_8_b <- gbm_dist("Pairwise", metric="map")
  dist_8_c <- gbm_dist("Pairwise", metric="mrr")
  dist_8_d <- gbm_dist("Pairwise", metric="ndcg")
  dist_9 <- gbm_dist("Poisson")
  dist_10 <- gbm_dist("Quantile")
  dist_11 <- gbm_dist("TDist")
  dist_12 <- gbm_dist("Tweedie")
  
  # When we get the appropriate y_labels for perf_plot
  label_1 <- get_ylabel(dist_1)
  label_2 <- get_ylabel(dist_2)
  label_3 <- get_ylabel(dist_3)
  label_4 <- get_ylabel(dist_4)
  label_5 <- get_ylabel(dist_5)
  label_6 <- get_ylabel(dist_6)
  label_7 <- get_ylabel(dist_7)
  label_8_a <- get_ylabel(dist_8_a)
  label_8_b <- get_ylabel(dist_8_b)
  label_8_c <- get_ylabel(dist_8_c)
  label_8_d <- get_ylabel(dist_8_d)
  label_9 <- get_ylabel(dist_9)
  label_10 <- get_ylabel(dist_10)
  label_11 <- get_ylabel(dist_11)
  label_12 <- get_ylabel(dist_12)
  
  # Then get correct labels
  expect_equal(label_1, "AdaBoost exponential bound")
  expect_equal(label_2, "Bernoulli deviance")
  expect_equal(label_3,"Cox partial deviance")
  expect_equal(label_4, "Gamma deviance")
  expect_equal(label_5, "Squared error loss")
  expect_equal(label_6, "Hinged loss")
  expect_equal(label_7, "Absolute loss")
  expect_equal(label_8_a, "Fraction of concordant pairs")
  expect_equal(label_8_b, "Mean average precision")
  expect_equal(label_8_c, "Mean reciprocal rank")
  expect_equal(label_8_d, "Normalized discounted cumulative gain")
  expect_equal(label_9, "Poisson deviance")
  expect_equal(label_10, "Quantile loss")
  expect_equal(label_11, "t-distribution deviance")
  expect_equal(label_12, "Tweedie deviance")
})
test_that("get_ylabel throws error if passed unrecognised GBMDist class", {
  # Given a GBM dist
  dist <- gbm_dist()
  
  # When GBMDist not recognised
  class(dist) <- c("What is this?", "GBMDist")
  
  # Then error thrown with get_ylabel
  expect_error(get_ylabel(dist))
})
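
# get_ylabel presumably dispatches on the subclass of the GBMDist object via
# S3. A minimal sketch under that assumption -- the "sketch_" names are
# hypothetical so nothing from the package is masked, and the
# "GaussianGBMDist" class name is assumed from gbm_dist("Gaussian"):
sketch_get_ylabel <- function(distribution_obj) {
  UseMethod("sketch_get_ylabel", distribution_obj)
}
sketch_get_ylabel.GaussianGBMDist <- function(distribution_obj) {
  "Squared error loss"
}
sketch_get_ylabel.default <- function(distribution_obj) {
  stop("GBMDist class not recognised")
}
# Usage: sketch_get_ylabel(structure(list(), class = c("GaussianGBMDist", "GBMDist")))
# returns "Squared error loss"; an unrecognised class falls through to the error.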

context("Testing plot method for GBMFit")
test_that("Error thrown if type of plot not 'link' or 'response' ", {
  # Given a GBM fit and type not 'link' or 'response'
  type_plot <- "stuff"
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with unknown type
  # Then error thrown
  expect_error(plot(fit, type=type_plot))
})
test_that("Error thrown if var_index has variable outside range not used in fit", {
  # Given a GBM fit and var_index outside range of predictors
  # Used in fit
  var_ind <- 7
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with out of range var index
  # Then error thrown
  expect_error(plot(fit, var_index = var_ind))
  
})
test_that("Response throws warning if not Bernoulli, Poisson, Gamma, Pairwise or Tweedie", {
  # Given distribution and response
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("CoxPH")
  dist_3 <- gbm_dist("Gaussian")
  dist_4 <- gbm_dist("Huberized")
  dist_5 <- gbm_dist("Laplace")
  dist_6 <- gbm_dist("Quantile")
  dist_7 <- gbm_dist("TDist")
  resp <- rep(1, 10)
  
  # When response S3 method called
  # Then warning thrown
  expect_warning(response(resp, dist_1))
  expect_warning(response(resp, dist_2))
  expect_warning(response(resp, dist_3))
  expect_warning(response(resp, dist_4))
  expect_warning(response(resp, dist_5))
  expect_warning(response(resp, dist_6))
  expect_warning(response(resp, dist_7))
  
})
test_that("Response returns NULL if not Bernoulli, Poisson, Gamma, Pairwise or Tweedie", {
  # Given distribution and response
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("CoxPH")
  dist_3 <- gbm_dist("Gaussian")
  dist_4 <- gbm_dist("Huberized")
  dist_5 <- gbm_dist("Laplace")
  dist_6 <- gbm_dist("Quantile")
  dist_7 <- gbm_dist("TDist")
  resp <- rep(1, 10)
  
  # When response S3 method called
  # Then returns NULL
  expect_null(response(resp, dist_1))
  expect_null(response(resp, dist_2))
  expect_null(response(resp, dist_3))
  expect_null(response(resp, dist_4))
  expect_null(response(resp, dist_5))
  expect_null(response(resp, dist_6))
  expect_null(response(resp, dist_7))
})
test_that("Response returns correct answer for Bernoulli, Pairwise, Poisson, Gamma and Tweedie", {
  # Given distributions which don't return NULL and response
  dist_1 <- gbm_dist("Bernoulli")
  dist_2 <- gbm_dist("Pairwise")
  dist_3 <- gbm_dist("Poisson")
  dist_4 <- gbm_dist("Gamma")
  dist_5 <- gbm_dist("Tweedie")
  
  resp <- rep(1, 10)
  
  # When response S3 method called
  # Then returns correct answer
  expect_equal(response(resp, dist_1), 1/(1+exp(-resp)))
  expect_equal(response(resp, dist_2), 1/(1+exp(-resp)))
  expect_equal(response(resp, dist_3), exp(resp))
  expect_equal(response(resp, dist_4), exp(resp))
  expect_equal(response(resp, dist_5), exp(resp))
})
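
# (The expectations above are the inverse-link functions: Bernoulli and
# Pairwise use the logit link, so the response is 1/(1 + exp(-f)), while
# Poisson, Gamma and Tweedie use the log link, so the response is exp(f).)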
test_that("get_default_grid_levels returns answer of correct type and size", {
  # Given a fitted gbm object, var_index and continuous resolution
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=200, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  var_index <- 1
  continuous_resolution <- 100
  
  # When default grids generated
  # Then output is of correct type and size
  expect_true(is.list(get_default_grid_levels(fit, var_index, continuous_resolution)))
  expect_equal(length(get_default_grid_levels(fit, var_index, continuous_resolution)), var_index)
})
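
# (From the checks above, get_default_grid_levels returns a list with one
# element per entry of var_index; presumably each element holds the grid of
# predictor values -- e.g. continuous_resolution points for a continuous
# variable -- over which the marginal effect is evaluated.)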
test_that("generate_grid_levels throws an error if length of grid_levels not same length as var_index", {
  # Given a fitted gbm object, var_index, grid_levels and continuous resolution
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=200, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  var_index <- 1
  continuous_resolution <- 100
  grid_levels <- get_default_grid_levels(fit, c(1, 2), continuous_resolution)
  
  # When generate_grid_levels called
  # Then error is thrown
  expect_error(generate_grid_levels(grid_levels, fit, var_index))
})
test_that("warning thrown if num_trees exceeds those in fit and then uses number of trees in fit to plot", {
  # Given a GBM fit and num_trees exceeding those in fit
  num_trees_plot <- 100000 
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=200, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with too many trees
  # Then a warning is thrown and the number of trees in the fit is used
  expect_warning(plot(fit, num_trees=num_trees_plot))
})
test_that("warning thrown if number var indices > 3", {
  # Given a GBM fit and more than 3 var indices
  var_ind <- c(1, 2, 3, 4)
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with too many indices
  # Then a warning is thrown
  expect_warning(plot(fit, var_index = var_ind))
})
test_that("return_grid=TRUE returns the grid", {
  # Given a GBM fit and return_grid=TRUE
  grid_return <- TRUE
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with return_grid=TRUE
  # Then a grid (list) is returned
  expect_true(is.list(plot(fit, return_grid=grid_return)))
})
test_that("number of var indices >3 sets return_grid to TRUE", {
  # Given a GBM fit and more than 3 var indices
  var_ind <- c(1, 2, 3, 4)
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with too many indices and return_grid=FALSE
  # Then a warning is thrown and the grid is returned as if return_grid=TRUE
  expect_warning(plot(fit, var_index=var_ind, return_grid=FALSE))
  expect_equal(suppressWarnings(plot(fit, var_index = var_ind, return_grid=FALSE)),
               plot(fit, var_index = var_ind, return_grid=TRUE))
})
test_that("can plot with one variable selected", {
  # Given a GBM fit 
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with 1 variable
  # Then no error is thrown
  expect_error(plot(fit), NA)
})
test_that("can correctly get one variable y-label", {
  # Given an object for each distribution
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("Bernoulli")
  dist_3 <- gbm_dist("CoxPH")
  dist_4 <- gbm_dist("Gamma")
  dist_5 <- gbm_dist("Gaussian")
  dist_6 <- gbm_dist("Huberized")
  dist_7 <- gbm_dist("Laplace")
  dist_8 <- gbm_dist("Pairwise")
  dist_9 <- gbm_dist("Poisson")
  dist_10 <- gbm_dist("Quantile")
  dist_11 <- gbm_dist("TDist")
  dist_12 <- gbm_dist("Tweedie")
  
  # When y-label retrieved
  # Then correct labels returned
  expect_equal(get_ylabel_one_var(dist_1), "")
  expect_equal(get_ylabel_one_var(dist_2), "Predicted Probability")
  expect_equal(get_ylabel_one_var(dist_3), "")
  expect_equal(get_ylabel_one_var(dist_4), "")
  expect_equal(get_ylabel_one_var(dist_5), "")
  expect_equal(get_ylabel_one_var(dist_6), "")
  expect_equal(get_ylabel_one_var(dist_7), "")
  expect_equal(get_ylabel_one_var(dist_8), "Predicted Probability")
  expect_equal(get_ylabel_one_var(dist_9), "Predicted Count")
  expect_equal(get_ylabel_one_var(dist_10), "")
  expect_equal(get_ylabel_one_var(dist_11), "")
  expect_equal(get_ylabel_one_var(dist_12), "")
})
test_that("can plot with two variables", {
  # Given a GBM fit and two variables selected - all combinations
  var_1 <- c(1, 2)
  var_2 <- c(1, 3)
  var_3 <- c(3, 5)
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with 2 variables
  # Then no error is thrown
  expect_error(plot(fit, var_index = var_1), NA)
  expect_error(plot(fit, var_index = var_2), NA)
  expect_error(plot(fit, var_index = var_3), NA)
})
test_that("can plot with 3 variables selected", {
  # Given a GBM fit and 3 variables selected - all combinations
  var_1 <- c(1, 2, 6)
  var_2 <- c(1, 3, 2)
  var_3 <- c(4, 3, 1)
  var_4 <- c(3, 4, 5)
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 100
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=25)] <- NA
  X3[sample(1:N,size=25)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  
  # Set up for new API
  params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10, 
                            shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
  dist <- gbm_dist("Gaussian")
  
  fit <- gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist, weights=w, offset=offset,
              train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0), keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE)
  
  # When plotting fit with 3 variables
  # Then no error is thrown
  expect_error(plot(fit, var_index = var_1), NA)
  expect_error(plot(fit, var_index = var_2), NA)
  expect_error(plot(fit, var_index = var_3), NA)
  expect_error(plot(fit, var_index = var_4), NA)
})