####################
# Author: James Hickey
#
# Series of tests to check the creation of cv groups
#
####################
context("Testing input checking - create_cv_groups")
test_that("create_cv_groups throws an error if not passed a GBMData object", {
# Given inputs
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- NULL
# When data is stripped of GBMData class
class(data) <- "wrong"
# Then error thrown when trying to calculate CV groups
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not passed a GBMDist object", {
# Given inputs
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- NULL
# When dist is stripped of GBMDist class
class(dist) <- ""
# Then error thrown when trying to calculate CV groups
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not passed a GBMTrainParams object", {
# Given inputs
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- NULL
# When params is stripped of GBMTrainParams class
class(params) <- ""
# Then error thrown when trying to calculate CV groups
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not cv_folds is not a whole number >= 1", {
# Given inputs - cv_folds not a whole number >= 1
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
fold_id <- NULL
cv_folds_1 <- 2.1
cv_folds_2 <- -1
cv_folds_3 <- "What?"
cv_folds_4 <- Inf
cv_folds_5 <- NaN
cv_folds_6 <- 0
# When creating cv groups
# Then error thrown when trying to calculate CV groups
expect_error(create_cv_groups(data, dist, params, cv_folds_1, cv_class_stratify, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds_2, cv_class_stratify, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds_3, cv_class_stratify, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds_4, cv_class_stratify, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds_5, cv_class_stratify, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds_6, cv_class_stratify, fold_id))
})
test_that("create_cv_groups throws an error if not cv_class_stratify is not a logical", {
# Given inputs - cv_class_stratify is not a logical!
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
fold_id <- NULL
cv_folds <- 2
cv_class_stratify_1 <- -1
cv_class_stratify_2 <- "What?"
cv_class_stratify_3 <- Inf
cv_class_stratify_4 <- NaN
cv_class_stratify_5 <- 0
cv_class_stratify_6 <- c(TRUE, FALSE)
# When creating cv groups
# Then error thrown when trying to calculate CV groups
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_1, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_2, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_3, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_4, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_5, fold_id))
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_6, fold_id))
})
context("Testing cv groupings")
test_that("cv groups is an atomic vector of integers whose length is equal to the number of training observations", {
# Given inputs
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- NULL
# When creating cv_groups
cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
# Then cv_groups is an atomic vector of integers with length equal to N
expect_true(any(cv_groups == as.integer(cv_groups)))
expect_true(is.atomic(cv_groups))
expect_true(length(cv_groups)== params$num_train)
})
test_that("max integer in cv groups produced is = cv_folds parameter", {
# Given inputs
# Dist_Obj
dist <- gbm_dist()
# create some data
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- NULL
# When creating cv_groups
cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
# Then cv_groups max fold number == cv_folds
expect_true(max(cv_groups) == cv_folds)
})
test_that("cv groups is equal to fold_id if not NULL - distributions other than Bernoulli or Pairwise", {
# Given inputs - default distributions other than Bernoulli
# Dist_Objs
dist_1 <- gbm_dist("AdaBoost")
dist_2 <- gbm_dist("CoxPH")
dist_3 <- gbm_dist("Gamma")
dist_4 <- gbm_dist("Gaussian")
dist_5 <- gbm_dist("Huberized")
dist_6 <- gbm_dist("Laplace")
dist_7 <- gbm_dist("Poisson")
dist_8 <- gbm_dist("Quantile")
dist_9 <- gbm_dist("TDist")
dist_10 <- gbm_dist("Tweedie")
# create some data - DOESN'T NEED TO REFLECT DIST
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters - NB: fold_id not NULL
cv_class_stratify <- FALSE
cv_folds <- 2
fold_id <- seq_len(params$num_train)
# When creating cv_groups for distributions other than Bernoulli
cv_groups_1 <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_2 <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_3 <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_4 <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_5 <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_6 <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_7 <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_8 <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_9 <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify, fold_id)
cv_groups_10 <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify, fold_id)
# Then cv_groups == fold_id
expect_equal(cv_groups_1, fold_id)
expect_equal(cv_groups_2, fold_id)
expect_equal(cv_groups_3, fold_id)
expect_equal(cv_groups_4, fold_id)
expect_equal(cv_groups_5, fold_id)
expect_equal(cv_groups_6, fold_id)
expect_equal(cv_groups_7, fold_id)
expect_equal(cv_groups_8, fold_id)
expect_equal(cv_groups_9, fold_id)
expect_equal(cv_groups_10, fold_id)
})
test_that("cv groups is equal to fold_id if not NULL - Bernoulli with cv_class_stratify=FALSE", {
# Given inputs - default distributions for Bernoulli
# Dist_Objs
dist <- gbm_dist("Bernoulli")
# create some data - DOESN'T NEED TO REFLECT DIST
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_folds <- 2
fold_id <- seq_len(params$num_train)
# When creating cv_groups with cv_class_stratify=FALSE and fold_id != NULL
cv_class_stratify <- FALSE
cv_groups <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id)
# Then cv_groups == fold_id
expect_equal(cv_groups, fold_id)
})
test_that("cv_class_stratify does nothing for distributions other than Bernoulli - fold_id != NULL", {
# Given inputs - default distributions other than Bernoulli
# Dist_Objs
dist_1 <- gbm_dist("AdaBoost")
dist_2 <- gbm_dist("CoxPH")
dist_3 <- gbm_dist("Gamma")
dist_4 <- gbm_dist("Gaussian")
dist_5 <- gbm_dist("Huberized")
dist_6 <- gbm_dist("Laplace")
dist_7 <- gbm_dist("Pairwise")
dist_8 <- gbm_dist("Poisson")
dist_9 <- gbm_dist("Quantile")
dist_10 <- gbm_dist("TDist")
dist_11 <- gbm_dist("Tweedie")
# create some data - DOESN'T NEED TO REFLECT DIST
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters - NB: fold_id not NULL
cv_folds <- 2
fold_id <- seq_len(params$num_train)
# Set up groups for Pairwise
dist_7$group <- sample(1:5, N, replace=TRUE)
# When creating cv_groups for both cv_class_stratify==TRUE and FALSE for distributions other than Bernoulli
cv_groups_1_T <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_2_T <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_3_T <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_4_T <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_5_T <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_6_T <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_7_T <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_8_T <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_9_T <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_10_T <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_11_T <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_1_F <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_2_F <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_3_F <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_4_F <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_5_F <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_6_F <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_7_F <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_8_F <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_9_F <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_10_F <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_11_F <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=FALSE, fold_id)
# These groups don't depend on cv_class_stratify
expect_equal(cv_groups_1_T, cv_groups_1_F)
expect_equal(cv_groups_2_T, cv_groups_2_F)
expect_equal(cv_groups_3_T, cv_groups_3_F)
expect_equal(cv_groups_4_T, cv_groups_4_F)
expect_equal(cv_groups_5_T, cv_groups_5_F)
expect_equal(cv_groups_6_T, cv_groups_6_F)
expect_equal(cv_groups_7_T, cv_groups_7_F)
expect_equal(cv_groups_8_T, cv_groups_8_F)
expect_equal(cv_groups_9_T, cv_groups_9_F)
expect_equal(cv_groups_10_T, cv_groups_10_F)
expect_equal(cv_groups_11_T, cv_groups_11_F)
})
test_that("cv_class_stratify does nothing for distributions other than Bernoulli - fold_id == NULL", {
# Given inputs - default distributions other than Bernoulli
# Dist_Objs
dist_1 <- gbm_dist("AdaBoost")
dist_2 <- gbm_dist("CoxPH")
dist_3 <- gbm_dist("Gamma")
dist_4 <- gbm_dist("Gaussian")
dist_5 <- gbm_dist("Huberized")
dist_6 <- gbm_dist("Laplace")
dist_7 <- gbm_dist("Pairwise")
dist_8 <- gbm_dist("Poisson")
dist_9 <- gbm_dist("Quantile")
dist_10 <- gbm_dist("TDist")
dist_11 <- gbm_dist("Tweedie")
# create some data - DOESN'T NEED TO REFLECT DIST
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters - NB: fold_id is NULL
cv_folds <- 2
fold_id <- NULL
# Set up groups for Pairwise
dist_7$group <- sample(1:5, N, replace=TRUE)
# When creating cv_groups for both cv_class_stratify==TRUE and FALSE for distributions other than Bernoulli
set.seed(1)
cv_groups_1_T <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_2_T <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_3_T <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_4_T <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_5_T <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_6_T <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_7_T <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_8_T <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_9_T <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_10_T <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=TRUE, fold_id)
cv_groups_11_T <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=TRUE, fold_id)
set.seed(1)
cv_groups_1_F <- create_cv_groups(data, dist_1, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_2_F <- create_cv_groups(data, dist_2, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_3_F <- create_cv_groups(data, dist_3, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_4_F <- create_cv_groups(data, dist_4, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_5_F <- create_cv_groups(data, dist_5, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_6_F <- create_cv_groups(data, dist_6, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_7_F <- create_cv_groups(data, dist_7, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_8_F <- create_cv_groups(data, dist_8, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_9_F <- create_cv_groups(data, dist_9, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_10_F <- create_cv_groups(data, dist_10, params, cv_folds, cv_class_stratify=FALSE, fold_id)
cv_groups_11_F <- create_cv_groups(data, dist_11, params, cv_folds, cv_class_stratify=FALSE, fold_id)
# These groups don't depend on cv_class_stratify
expect_equal(cv_groups_1_T, cv_groups_1_F)
expect_equal(cv_groups_2_T, cv_groups_2_F)
expect_equal(cv_groups_3_T, cv_groups_3_F)
expect_equal(cv_groups_4_T, cv_groups_4_F)
expect_equal(cv_groups_5_T, cv_groups_5_F)
expect_equal(cv_groups_6_T, cv_groups_6_F)
expect_equal(cv_groups_7_T, cv_groups_7_F)
expect_equal(cv_groups_8_T, cv_groups_8_F)
expect_equal(cv_groups_9_T, cv_groups_9_F)
expect_equal(cv_groups_10_T, cv_groups_10_F)
expect_equal(cv_groups_11_T, cv_groups_11_F)
})
test_that("cv_class_stratify changes grouping for Bernoulli distribution", {
# Given inputs - default distributions for Bernoulli, data
# now representative of distribution
# Dist_Objs
dist <- gbm_dist("Bernoulli")
# create some data - DOES NEED TO REFLECT DIST
set.seed(1)
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p)
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
cv_folds <- 2
fold_id <- seq_len(params$num_train)
# When creating cv_groups with cv_class_stratify=TRUE and FALSE
cv_class_stratify_T <- TRUE
cv_class_stratify_F <- FALSE
set.seed(1)
cv_groups_T <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_T, fold_id)
set.seed(1)
cv_groups_F <- create_cv_groups(data, dist, params, cv_folds, cv_class_stratify_F, fold_id)
# Then cv_groups are not identical
expect_true(any(cv_groups_T != cv_groups_F))
})
test_that("Error thrown when number in a certain class is < cv_folds on stratification - Bernoulli", {
# Given inputs - default distributions for Bernoulli, data
# now representative of distribution
# Dist_Objs
dist <- gbm_dist("Bernoulli")
# create some data - DOES NEED TO REFLECT DIST
set.seed(1)
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p)
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters
fold_id <- seq_len(params$num_train)
# When creating cv_groups with cv_class_stratify=TRUE and cv_folds > min number of a certain class
cv_class_stratify <- TRUE
Ones <- tabulate(data$y[seq_len(params$num_train)])
Zeros <- length(data$y[seq_len(params$num_train)])-Ones
cv_folds <- min(c(Ones, Zeros)) + 1
# Then error will be thrown
expect_error(create_cv_groups(data, dist, params, cv_folds, cv_class_stratify, fold_id))
})
test_that("cv groups are such that folds are split at group boundaries - Pairwise", {
# Given a Pairwise distribution object and appropriate
# groupings etc.
dist <- gbm_dist("Pairwise")
# create some data - DOESN'T NEED TO REFLECT DIST
set.seed(1)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
data <- gbm_data(X, Y, w, offset)
# Params Obj
params <- training_params(num_train = N)
# Other CV Parameters - NB: fold_id not NULL
cv_folds <- 2
fold_id <- seq_len(params$num_train)
# DEFINE GROUPINGS
dist$group <- as.factor(sample(seq_len(5), N, replace=TRUE))
# When cv_groups is called and compared to
set.seed(1)
cv_groups <- create_cv_groups(data, dist, params, cv_folds, FALSE, fold_id)
# Then the splits are at group boundaries
set.seed(1)
samp <- sample(rep(seq_len(cv_folds), length=nlevels(dist$group)))
expect_equal(cv_groups, samp[as.integer(dist$group[seq_len(params$num_train)])])
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.