tests/testthat/test-cv-groups.r

####################
# Author: James Hickey
#
# Series of tests to check the creation of cv groups
#
####################

context("Testing input checking - create_cv_groups")
test_that("create_cv_groups throws an error if not passed a GBMData object", {
  # Given inputs
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- NULL
  
  # When data is stripped of GBMData class
  class(data) <- "wrong"
  
  # Then error thrown when trying to calculate CV groups
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not passed a GBMDist object", {
  # Given inputs
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- NULL
  
  # When dist is stripped of GBMDist class
  class(dist) <- ""
  
  # Then error thrown when trying to calculate CV groups
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not passed a GBMTrainParams object", {
  # Given inputs
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- NULL
  
  # When params is stripped of GBMTrainParams class
  class(params) <- ""
  
  # Then error thrown when trying to calculate CV groups
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not cv_folds is not a whole number >= 1", {
  # Given inputs - cv_folds not a whole number >= 1
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  fold_id <- NULL
  cv_folds_1 <- 2.1
  cv_folds_2 <- -1
  cv_folds_3 <- "What?"
  cv_folds_4 <- Inf
  cv_folds_5 <- NaN
  cv_folds_6 <- 0
  
  # When creating cv groups
  # Then error thrown when trying to calculate CV groups
  expect_error(create_cv_groups(data, dist, params, cv_folds_1, cv_class_stratify, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds_2, cv_class_stratify, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds_3, cv_class_stratify, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds_4, cv_class_stratify, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds_5, cv_class_stratify, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds_6, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not cv_class_stratify is not a logical", {
  # Given inputs - cv_class_stratify is not a logical!
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  fold_id <- NULL
  cv_folds <- 2
  cv_class_stratify_1 <- -1
  cv_class_stratify_2 <- "What?"
  cv_class_stratify_3 <- Inf
  cv_class_stratify_4 <- NaN
  cv_class_stratify_5 <- 0
  cv_class_stratify_6 <- c(TRUE, FALSE)
  
  # When creating cv groups
  # Then error thrown when trying to calculate CV groups
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_1, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_2, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_3, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_4, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_5, fold_id))
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_6, fold_id))
})

context("Testing cv groupings")
test_that("cv groups is an atomic vector of integers whose length is equal to the number of training observations", {
  # Given inputs
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- NULL
  
  # When creating cv_groups
  cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
  
  # Then cv_groups is an atomic vector of integers with length equal to N
  expect_true(any(cv_groups == as.integer(cv_groups)))
  expect_true(is.atomic(cv_groups))
  expect_true(length(cv_groups)== params$num_train)  
})
test_that("max integer in cv groups produced is = cv_folds parameter", {
  # Given inputs
  # Dist_Obj
  dist <- gbm_dist()
  
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- NULL
  
  # When creating cv_groups
  cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
  
  # Then cv_groups max fold number == cv_folds
  expect_true(max(cv_groups) == cv_folds)
})
test_that("cv groups is equal to fold_id if not NULL - distributions other than Bernoulli or Pairwise", {
  # Given inputs - default distributions other than Bernoulli
  # Dist_Objs
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("CoxPH")
  dist_3 <- gbm_dist("Gamma")
  dist_4 <- gbm_dist("Gaussian")
  dist_5 <- gbm_dist("Huberized")
  dist_6 <- gbm_dist("Laplace")
  dist_7 <- gbm_dist("Poisson")
  dist_8 <- gbm_dist("Quantile")
  dist_9 <- gbm_dist("TDist")
  dist_10 <- gbm_dist("Tweedie")
  
  # create some data - DOESN'T NEED TO REFLECT DIST
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters - NB: fold_id not NULL
  cv_class_stratify <- FALSE
  cv_folds <- 2
  fold_id <- seq_len(params$num_train)
  
  # When creating cv_groups for distributions other than Bernoulli
  cv_groups_1 <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_2 <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_3 <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_4 <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_5 <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_6 <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_7 <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_8 <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_9 <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify, fold_id)
  cv_groups_10 <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify, fold_id)

    
  # Then cv_groups == fold_id
  expect_equal(cv_groups_1, fold_id)
  expect_equal(cv_groups_2, fold_id)
  expect_equal(cv_groups_3, fold_id)
  expect_equal(cv_groups_4, fold_id)
  expect_equal(cv_groups_5, fold_id)
  expect_equal(cv_groups_6, fold_id)
  expect_equal(cv_groups_7, fold_id)
  expect_equal(cv_groups_8, fold_id)
  expect_equal(cv_groups_9, fold_id)
  expect_equal(cv_groups_10, fold_id)
})
test_that("cv groups is equal to fold_id if not NULL - Bernoulli with cv_class_stratify=FALSE", {
  # Given inputs - default distributions for Bernoulli
  # Dist_Objs
  dist <- gbm_dist("Bernoulli")
  
  # create some data - DOESN'T NEED TO REFLECT DIST
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_folds <- 2
  fold_id <- seq_len(params$num_train)
  
  # When creating cv_groups with cv_class_stratify=FALSE and fold_id != NULL
  cv_class_stratify <- FALSE
  cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
  
  # Then cv_groups == fold_id
  expect_equal(cv_groups, fold_id)
})
test_that("cv_class_stratify does nothing for distributions other than Bernoulli - fold_id != NULL", {
  # Given inputs - default distributions other than Bernoulli
  # Dist_Objs
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("CoxPH")
  dist_3 <- gbm_dist("Gamma")
  dist_4 <- gbm_dist("Gaussian")
  dist_5 <- gbm_dist("Huberized")
  dist_6 <- gbm_dist("Laplace")
  dist_7 <- gbm_dist("Pairwise")
  dist_8 <- gbm_dist("Poisson")
  dist_9 <- gbm_dist("Quantile")
  dist_10 <- gbm_dist("TDist")
  dist_11 <- gbm_dist("Tweedie")
  
  # create some data - DOESN'T NEED TO REFLECT DIST
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters - NB: fold_id not NULL
  cv_folds <- 2
  fold_id <- seq_len(params$num_train)
  
  # Set up groups for Pairwise
  dist_7$group <- sample(1:5, N, replace=TRUE)
  
  # When creating cv_groups for both cv_class_stratify==TRUE and FALSE for distributions other than Bernoulli
  cv_groups_1_T <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_2_T <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_3_T <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_4_T <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_5_T <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_6_T <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_7_T <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_8_T <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_9_T <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_10_T <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_11_T <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  
  cv_groups_1_F <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_2_F <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_3_F <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_4_F <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_5_F <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_6_F <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_7_F <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_8_F <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_9_F <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_10_F <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_11_F <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  
  # These groups don't depend on cv_class_stratify
  expect_equal(cv_groups_1_T, cv_groups_1_F)
  expect_equal(cv_groups_2_T, cv_groups_2_F)
  expect_equal(cv_groups_3_T, cv_groups_3_F)
  expect_equal(cv_groups_4_T, cv_groups_4_F)
  expect_equal(cv_groups_5_T, cv_groups_5_F)
  expect_equal(cv_groups_6_T, cv_groups_6_F)
  expect_equal(cv_groups_7_T, cv_groups_7_F)
  expect_equal(cv_groups_8_T, cv_groups_8_F)
  expect_equal(cv_groups_9_T, cv_groups_9_F)
  expect_equal(cv_groups_10_T, cv_groups_10_F)
  expect_equal(cv_groups_11_T, cv_groups_11_F)
})
test_that("cv_class_stratify does nothing for distributions other than Bernoulli - fold_id == NULL", {
  # Given inputs - default distributions other than Bernoulli
  # Dist_Objs
  dist_1 <- gbm_dist("AdaBoost")
  dist_2 <- gbm_dist("CoxPH")
  dist_3 <- gbm_dist("Gamma")
  dist_4 <- gbm_dist("Gaussian")
  dist_5 <- gbm_dist("Huberized")
  dist_6 <- gbm_dist("Laplace")
  dist_7 <- gbm_dist("Pairwise")
  dist_8 <- gbm_dist("Poisson")
  dist_9 <- gbm_dist("Quantile")
  dist_10 <- gbm_dist("TDist")
  dist_11 <- gbm_dist("Tweedie")
  
  # create some data - DOESN'T NEED TO REFLECT DIST
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters - NB: fold_id is NULL
  cv_folds <- 2
  fold_id <- NULL
  
  # Set up groups for Pairwise
  dist_7$group <- sample(1:5, N, replace=TRUE)
  
  # When creating cv_groups for both cv_class_stratify==TRUE and FALSE for distributions other than Bernoulli
  set.seed(1)
  cv_groups_1_T <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_2_T <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_3_T <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_4_T <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_5_T <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_6_T <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_7_T <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_8_T <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_9_T <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_10_T <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  cv_groups_11_T <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=TRUE, fold_id)
  
  set.seed(1)
  cv_groups_1_F <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_2_F <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_3_F <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_4_F <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_5_F <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_6_F <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_7_F <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_8_F <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_9_F <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_10_F <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  cv_groups_11_F <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=FALSE, fold_id)
  
  
  # These groups don't depend on cv_class_stratify
  expect_equal(cv_groups_1_T, cv_groups_1_F)
  expect_equal(cv_groups_2_T, cv_groups_2_F)
  expect_equal(cv_groups_3_T, cv_groups_3_F)
  expect_equal(cv_groups_4_T, cv_groups_4_F)
  expect_equal(cv_groups_5_T, cv_groups_5_F)
  expect_equal(cv_groups_6_T, cv_groups_6_F)
  expect_equal(cv_groups_7_T, cv_groups_7_F)
  expect_equal(cv_groups_8_T, cv_groups_8_F)
  expect_equal(cv_groups_9_T, cv_groups_9_F)
  expect_equal(cv_groups_10_T, cv_groups_10_F)
  expect_equal(cv_groups_11_T, cv_groups_11_F)
  
})
test_that("cv_class_stratify changes grouping for Bernoulli distribution", {
  # Given inputs - default distributions for Bernoulli, data
  # now representative of distribution
  # Dist_Objs
  dist <- gbm_dist("Bernoulli")
  
  # create some data - DOES NEED TO REFLECT DIST
  set.seed(1)
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  offset <- rep(0, N)
  
  X <- data.frame(X1=X1,X2=X2,X3=X3)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  cv_folds <- 2
  fold_id <- seq_len(params$num_train)
  
  # When creating cv_groups with cv_class_stratify=TRUE and FALSE
  cv_class_stratify_T <- TRUE
  cv_class_stratify_F <- FALSE
  
  set.seed(1)
  cv_groups_T <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_T, fold_id)
  
  set.seed(1)
  cv_groups_F <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_F, fold_id)
  
  # Then cv_groups are not identical
  expect_true(any(cv_groups_T != cv_groups_F))
})
test_that("Error thrown when number in a certain class is < cv_folds on stratification - Bernoulli", {
  # Given inputs - default distributions for Bernoulli, data
  # now representative of distribution
  # Dist_Objs
  dist <- gbm_dist("Bernoulli")
  
  # create some data - DOES NEED TO REFLECT DIST
  set.seed(1)
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  offset <- rep(0, N)
  
  X <- data.frame(X1=X1,X2=X2,X3=X3)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters
  fold_id <- seq_len(params$num_train)
  
  # When creating cv_groups with cv_class_stratify=TRUE and cv_folds > min number of a certain class
  cv_class_stratify <- TRUE
  Ones <- tabulate(data$y[seq_len(params$num_train)])
  Zeros <- length(data$y[seq_len(params$num_train)])-Ones
  cv_folds <- min(c(Ones, Zeros)) + 1
  
  # Then error will be thrown
  expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("cv groups are such that folds are split at group boundaries - Pairwise", {
  # Given a Pairwise distribution object and appropriate
  # groupings etc.
  dist <- gbm_dist("Pairwise")

  
  # create some data - DOESN'T NEED TO REFLECT DIST
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  offset <- rep(0, N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  data <- gbm_data(X, Y, w, offset)
  
  # Params Obj
  params <- training_params(num_train = N)
  
  # Other CV Parameters - NB: fold_id not NULL
  cv_folds <- 2
  fold_id <- seq_len(params$num_train)
  
  # DEFINE GROUPINGS
  dist$group <- as.factor(sample(seq_len(5), N, replace=TRUE))
  
  # When cv_groups is called and compared to 
  set.seed(1)
  cv_groups <- create_cv_groups(data, dist, params, cv_folds, FALSE, fold_id)
  
  # Then the splits are at group boundaries
  set.seed(1)
  samp <- sample(rep(seq_len(cv_folds), length=nlevels(dist$group)))
  expect_equal(cv_groups, samp[as.integer(dist$group[seq_len(params$num_train)])])
})
gbm-developers/gbm3 documentation built on April 28, 2024, 10:04 p.m.