####################
# Author: James Hickey
#
# Series of tests to check other, less used, distributions can be run to
# completion
#
####################
context("Testing runs of other distributions")
test_that("Can fit GBM with - AdaBoost", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p)
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
offset <- rep(0, N)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.001, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=3)
# When GBM fit with AdaBoost
dist <- gbm_dist("AdaBoost")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3, data=data,
distribution=dist, weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0),
keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE),
NA)
})
test_that("Can fit GBM with - Gamma", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- abs(Y + rnorm(N,0,sigma))
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.005, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=6)
# When GBM fit with Gamma
dist <- gbm_dist("Gamma")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3+X4+X5+X6, data=data,
distribution=dist, weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0),
keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE),
NA)
})
test_that("Can fit GBM with - Huberized", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p)
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
offset <- rep(0, N)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.001, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=3)
# When GBM fit with Huberized
dist <- gbm_dist("Huberized")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0),
keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE),
NA)
})
test_that("Can fit GBM with - Laplace", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.005, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=6)
# When GBM fit with Laplace
dist <- gbm_dist("Laplace")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0),
keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE),
NA)
})
test_that("Can fit GBM with - Pairwise", {
skip("Skipping pairwise")
# Given data and a fitted pairwise
# create query groups, with an average size of 25 items each
N <- 1000
num.queries <- floor(N/25)
query <- sample(1:num.queries, N, replace=TRUE)
# X1 is a variable determined by query group only
query.level <- runif(num.queries)
X1 <- query.level[query]
# X2 varies with each item
X2 <- runif(N)
# X3 is uncorrelated with target
X3 <- runif(N)
# The target
Y <- X1 + X2
# Add some random noise to X2 that is correlated with
# queries, but uncorrelated with items
X2 <- X2 + scale(runif(num.queries))[query]
# Add some random noise to target
SNR <- 5 # signal-to-noise ratio
sigma <- sqrt(var(Y)/SNR)
Y <- Y + runif(N, 0, sigma)
Y_M <- Y
Y_M[Y_M >= 1] <- 1
Y_M[Y_M < 1] <- 0
data <- data.frame(Y, query=query, X1, X2, X3)
data_M <- data.frame(Y_M, query=query, X1, X2, X3)
params <- training_params(num_trees = 20, num_train = nrow(data),
id=seq_len(nrow(data)),
interaction_depth = 3)
# When fitting all Pairwise distributions
dist <- gbm_dist("Pairwise", metric="ndcg", group="query")
dist_2 <- gbm_dist("Pairwise", metric="conc", group="query")
dist_3 <- gbm_dist("Pairwise", metric="mrr", group="query")
dist_4 <- gbm_dist("Pairwise", metric="map", group="query")
# Then runs without error
expect_error(gbmt(Y~X1+X2+X3,
data=data,
distribution=dist,
train_params=params,
keep_gbm_data=TRUE,
cv_folds=5,
is_verbose = FALSE ,
par_details=gbmParallel()), NA)
expect_error(gbmt(Y~X1+X2+X3,
data=data,
distribution=dist_2,
train_params=params,
keep_gbm_data=TRUE,
cv_folds=5,
is_verbose = FALSE ,
par_details=gbmParallel()), NA)
expect_error(gbmt(Y_M~X1+X2+X3,
data=data_M,
distribution=dist_3,
train_params=params,
keep_gbm_data=TRUE,
cv_folds=5,
is_verbose = FALSE ,
par_details=gbmParallel()), NA)
expect_error(gbmt(Y_M~X1+X2+X3,
data=data_M,
distribution=dist_4,
train_params=params,
keep_gbm_data=TRUE,
cv_folds=5,
is_verbose = FALSE ,
par_details=gbmParallel()), NA)
})
test_that("Can fit GBM with - Poisson", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p) + 1
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
offset <- rep(0, N)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.001, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2, num_features=3)
# When GBM fit with Poisson
dist <- gbm_dist("Poisson")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0),
keep_gbm_data=TRUE, cv_folds=5, is_verbose = FALSE),
NA)
})
test_that("Can fit GBM with - Quantile", {
## Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3, min_num_obs_in_node=10,
shrinkage=0.005, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=6)
# When GBM fit with Quantile
dist <- gbm_dist("Quantile")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0),
keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE),
NA)
})
test_that("Can fit GBM with - TDist", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.005, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=6)
# When GBM fit with TDist
dist <- gbm_dist("TDist")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0),
keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE),
NA)
})
test_that("Can fit GBM with - Tweedie", {
# Given appropriate data
set.seed(1)
# create some data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- factor(sample(letters[1:4],N,replace=T))
X4 <- ordered(sample(letters[1:6],N,replace=T))
X5 <- factor(sample(letters[1:3],N,replace=T))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- abs(Y + rnorm(N,0,sigma))
# create a bunch of missing values
X1[sample(1:N,size=100)] <- NA
X3[sample(1:N,size=300)] <- NA
w <- rep(1,N)
offset <- rep(0, N)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# Set up for new API
params <- training_params(num_trees=20, interaction_depth=3,
min_num_obs_in_node=10,
shrinkage=0.005, bag_fraction=0.5,
id=seq(nrow(data)), num_train=N/2,
num_features=6)
# When GBM fit with Gamma
dist <- gbm_dist("Tweedie")
# Then no error is thrown
expect_error(gbmt(Y~X1+X2+X3+X4+X5+X6, data=data, distribution=dist,
weights=w, offset=offset,
train_params=params, var_monotone=c(0, 0, 0, 0, 0, 0),
keep_gbm_data=TRUE, cv_folds=10, is_verbose=FALSE),
NA)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.