tests/testthat/test-offset.r

##############
# Author: James Hickey
# 
# Series of tests on the effect of the offset on the output.
# Help identify if the refactoring is changing the higher level behaviour.
#
##############
context("Testing 0 offset - old API")
test_that("Setting the offset to 0 does not alter the initial value - Adaboost",{
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # Offset 
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3,
                  data=data,
                  weights=w,
                  var.monotone=c(0,0,0),
                  distribution="adaboost",
                  n.trees=100,
                  shrinkage=0.001,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  cv.folds=1,
                  n.minobsinnode = 10,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3,
                       data=data,
                       weights=w,
                       offset = offset,
                       var.monotone=c(0,0,0),
                       distribution="adaboost",
                       n.trees=100,
                       shrinkage=0.001,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       cv.folds=1,
                       n.minobsinnode = 10,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Bernoulli",{
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3,
                  data=data,
                  weights=w,
                  var.monotone=c(0,0,0),
                  distribution="bernoulli",
                  n.trees=100,
                  shrinkage=0.001,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  cv.folds=1,
                  n.minobsinnode = 10,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3,
                       data=data,
                       weights=w,
                       offset=offset,
                       var.monotone=c(0,0,0),
                       distribution="bernoulli",
                       n.trees=100,
                       shrinkage=0.001,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       cv.folds=1,
                       n.minobsinnode = 10,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - CoxPH",{
  require(survival)
  set.seed(1)
  ## GIVEN
  # create some data
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(y=Surv(tt,delta),X1=X1,X2=X2,X3=X3)
  
  #Offset 
  offset <- rep(0, N)
  
  ## WHEN
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(y~X1+X2+X3,
                  data=data,
                  weights=w,
                  var.monotone=c(0,0,0),
                  distribution="coxph",
                  n.trees=100,
                  shrinkage=0.001,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  cv.folds = 1,
                  n.minobsinnode = 10,
                  keep.data = TRUE,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(y~X1+X2+X3,
                       data=data,
                       weights=w,
                       var.monotone=c(0,0,0),
                       distribution="CoxPH",
                       offset=offset,
                       n.trees=100,
                       shrinkage=0.001,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       cv.folds = 1,
                       n.minobsinnode = 10,
                       keep.data = TRUE,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Gamma",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(0,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + abs(rnorm(N,0,sigma))
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                  data=data,
                  var.monotone=c(0,0,0,0,0,0),
                  distribution="gamma",
                  n.trees=100,
                  shrinkage=0.005,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  mFeatures = 3,
                  n.minobsinnode = 10,
                  keep.data=TRUE,
                  cv.folds=1,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                       data=data,
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="Gamma",
                       n.trees=100,
                       offset=offset,
                       shrinkage=0.005,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       mFeatures = 3,
                       n.minobsinnode = 10,
                       keep.data=TRUE,
                       cv.folds=1,
                       verbose = FALSE)

    expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Gaussian",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                  data=data,
                  var.monotone=c(0,0,0,0,0,0),
                  distribution="gaussian",
                  n.trees=100,
                  shrinkage=0.005,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  mFeatures = 3,
                  n.minobsinnode = 10,
                  keep.data=TRUE,
                  cv.folds=1,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                       data=data,
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="gaussian",
                       n.trees=100,
                       offset=offset,
                       shrinkage=0.005,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       mFeatures = 3,
                       n.minobsinnode = 10,
                       keep.data=TRUE,
                       cv.folds=1,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Laplace",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                  data=data,
                  var.monotone=c(0,0,0,0,0,0),
                  distribution="laplace",
                  n.trees=100,
                  shrinkage=0.005,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  mFeatures = 3,
                  n.minobsinnode = 10,
                  keep.data=TRUE,
                  cv.folds=1,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                       data=data,
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="laplace",
                       n.trees=100,
                       shrinkage=0.005,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       mFeatures = 3,
                       n.minobsinnode = 10,
                       keep.data=TRUE,
                       offset=offset,
                       cv.folds=1,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Huberized Hinge Loss",{
  # create some data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3,
                  data=data,
                  weights=w,
                  var.monotone=c(0,0,0),
                  distribution="huberized",
                  n.trees=100,
                  shrinkage=0.001,
                  interaction.depth=3,
                  bag.fraction = 0.5,
                  train.fraction = 0.5,
                  cv.folds=1,
                  n.minobsinnode = 10,
                  verbose = FALSE)
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3,
                       data=data,
                       weights=w,
                       offset=offset,
                       var.monotone=c(0,0,0),
                       distribution="huberized",
                       n.trees=100,
                       shrinkage=0.001,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       cv.folds=1,
                       n.minobsinnode = 10,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})

test_that("Setting the offset to 0 does not alter the initial value - Pairwise",{
  skip("Skipping pairwise")
  # create query groups, with an average size of 25 items each
  set.seed(1)
  N <- 1000
  num.queries <- floor(N/25)
  query <- sample(1:num.queries, N, replace=TRUE)
  
  # X1 is a variable determined by query group only
  query.level <- runif(num.queries)
  X1 <- query.level[query]
  
  # X2 varies with each item
  X2 <- runif(N)
  
  # X3 is uncorrelated with target
  X3 <- runif(N)
  
  # The target
  Y <- X1 + X2
  
  # Add some random noise to X2 that is correlated with
  # queries, but uncorrelated with items
  
  X2 <- X2 + scale(runif(num.queries))[query]
  
  # Add some random noise to target
  SNR <- 5 # signal-to-noise ratio
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + runif(N, 0, sigma)
  Y.norm <- round(Y/(max(Y) + 0.001))
  
  data.train <- data.frame(Y, query=query, X1, X2, X3)
  data.train.mrr <- data.frame(Y.norm, query=query, X1, X2, X3)
  
  
  # Generate new gbm objects
  set.seed(15)
  gbm.no.offset.ndcg <- gbm(Y~X1+X2+X3,
                  data=data.train,     
                  distribution=list(   
                    name='pairwise',   
                    metric="ndcg",     
                    group="query"),    
                  n.trees=100,        
                  shrinkage=0.005,     
                  interaction.depth=3, 
                  bag.fraction = 0.5,  
                  train.fraction = 1,  
                  n.minobsinnode = 10, 
                  keep.data=TRUE,      
                  cv.folds=1,          
                  verbose = FALSE     
                  )         
  
  gbm.no.offset.mrr <- gbm(Y.norm~X1+X2+X3, 
                 data=data.train.mrr,     
                 distribution=list(  
                   name='pairwise',  
                   metric="mrr",
                   group='query'),   
                 n.trees=100,        
                 shrinkage=0.005,    
                 interaction.depth=3,
                 bag.fraction = 0.5, 
                 train.fraction = 1, 
                 n.minobsinnode = 10,
                 keep.data=TRUE,     
                 cv.folds=1,         
                 verbose = FALSE     
                 )         
  
  gbm.no.offset.map <- gbm(Y.norm~X1+X2+X3,
                 data=data.train.mrr,
                 distribution=list(  
                   name='pairwise',  
                   metric="map",     
                   group='query'),   
                 n.trees=100,        
                 shrinkage=0.005,    
                 interaction.depth=3,
                 bag.fraction = 0.5, 
                 train.fraction = 1, 
                 n.minobsinnode = 10,
                 keep.data=TRUE,     
                 cv.folds=1,         
                 verbose = FALSE     
                 )         
  gbm.no.offset.conc <- gbm(Y~X1+X2+X3,  
                  data=data.train,    
                  distribution=list(  
                    name='pairwise',  
                    metric="conc",    
                    group='query'),   
                  n.trees=100,        
                  shrinkage=0.005,    
                  interaction.depth=3,
                  bag.fraction = 0.5, 
                  train.fraction = 1, 
                  n.minobsinnode = 10,
                  keep.data=TRUE,     
                  cv.folds=1,         
                  verbose = FALSE     
                  )
  set.seed(15)
  gbm.zero.offset.ndcg <- gbm(Y~X1+X2+X3,       
                            data=data.train,    
                            distribution=list(  
                              name='pairwise',  
                              metric="ndcg",     
                              group='query'),   
                            n.trees=100,        
                            shrinkage=0.005,    
                            interaction.depth=3,
                            bag.fraction = 0.5, 
                            train.fraction = 1, 
                            n.minobsinnode = 10,
                            keep.data=TRUE,     
                            cv.folds=1,         
                            verbose = FALSE,    
                            offset=rep(0, N)
                            )
  
  gbm.zero.offset.mrr <- gbm(Y.norm~X1+X2+X3,  
                           data=data.train.mrr,
                           distribution=list(  
                             name='pairwise',  
                             metric="mrr",     
                             group='query'),   
                           n.trees=100,        
                           shrinkage=0.005,    
                           interaction.depth=3,
                           bag.fraction = 0.5, 
                           train.fraction = 1, 
                           n.minobsinnode = 10,
                           keep.data=TRUE,     
                           cv.folds=1,         
                           verbose = FALSE,    
                           offset=rep(0,N)
                           )         
  
  gbm.zero.offset.map <- gbm(Y.norm~X1+X2+X3,  
                           data=data.train.mrr,
                           distribution=list(  
                             name='pairwise',  
                             metric="map",     
                             group='query'),   
                           n.trees=100,        
                           shrinkage=0.005,    
                           interaction.depth=3,
                           bag.fraction = 0.5, 
                           train.fraction = 1, 
                           n.minobsinnode = 10,
                           keep.data=TRUE,     
                           cv.folds=1,         
                           verbose = FALSE,    
                           offset=rep(0,N)
                           )         
  gbm.zero.offset.conc <- gbm(Y~X1+X2+X3,       
                            data=data.train,    
                            distribution=list(  
                              name='pairwise',  
                              metric="conc",    
                              group='query'),   
                            n.trees=100,        
                            shrinkage=0.005,    
                            interaction.depth=3,
                            bag.fraction = 0.5, 
                            train.fraction = 1, 
                            n.minobsinnode = 10,
                            keep.data=TRUE,     
                            cv.folds=1,         
                            verbose = FALSE,    
                            offset = rep(0,N))
  
  expect_equal(gbm.no.offset.conc$initF, gbm.zero.offset.conc$initF)
  expect_equal(gbm.no.offset.mrr$initF, gbm.zero.offset.mrr$initF)
  expect_equal(gbm.no.offset.map$initF, gbm.zero.offset.map$initF)
  expect_equal(gbm.no.offset.ndcg$initF, gbm.zero.offset.ndcg$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Poisson",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(0,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- round(Y + abs(rnorm(N,0,sigma))) # Ensure it is normal
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,   
                  data=data,                  
                  var.monotone=c(0,0,0,0,0,0),
                  distribution="poisson",     
                  n.trees=100,                
                  shrinkage=0.005,            
                  interaction.depth=3,        
                  bag.fraction = 0.5,         
                  train.fraction = 0.5,       
                  mFeatures = 3,              
                  n.minobsinnode = 10,        
                  keep.data=TRUE,
                  cv.folds=1,                 
                  verbose = FALSE)            
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,      
                       data=data,                  
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="poisson",     
                       n.trees=100,                
                       offset=offset,
                       shrinkage=0.005,            
                       interaction.depth=3,        
                       bag.fraction = 0.5,         
                       train.fraction = 0.5,       
                       mFeatures = 3,              
                       n.minobsinnode = 10,        
                       keep.data=TRUE,
                       cv.folds=1,                 
                       verbose = FALSE)            
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Quantile Reg",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  #Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                  data=data,               
                  var.monotone=c(0,0,0,0,0,0),
                  distribution= list(name="quantile",alpha=0.95),
                  n.trees=100,                 
                  shrinkage=0.005,             
                  interaction.depth=3,         
                  bag.fraction = 0.5,          
                  train.fraction = 0.5,        
                  mFeatures = 3,               
                  n.minobsinnode = 10,         
                  keep.data=TRUE,
                  cv.folds=1,                 
                  verbose = FALSE)            
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6, 
                       data=data,             
                       var.monotone=c(0,0,0,0,0,0),
                       distribution= list(name="quantile",alpha=0.95), 
                       n.trees=100,                 
                       shrinkage=0.005,
                       interaction.depth=3,
                       bag.fraction = 0.5,
                       train.fraction = 0.5,
                       mFeatures = 3, 
                       n.minobsinnode = 10,
                       keep.data=TRUE,
                       offset=offset,
                       cv.folds=1,
                       verbose = FALSE)
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - T dist",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,    
                  data=data,                   
                  var.monotone=c(0,0,0,0,0,0), 
                  distribution="tdist",     
                  n.trees=100,              
                  shrinkage=0.005,          
                  interaction.depth=3,      
                  bag.fraction = 0.5,       
                  train.fraction = 0.5,     
                  mFeatures = 3,            
                  n.minobsinnode = 10,      
                  keep.data=TRUE,
                  cv.folds=1,               
                  verbose = FALSE)          
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                       data=data,            
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="tdist",     
                       n.trees=100,              
                       shrinkage=0.005,          
                       interaction.depth=3,      
                       bag.fraction = 0.5,       
                       train.fraction = 0.5,     
                       mFeatures = 3,            
                       n.minobsinnode = 10,      
                       keep.data=TRUE,
                       offset=offset,
                       cv.folds=1,               
                       verbose = FALSE)          
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Setting the offset to 0 does not alter the initial value - Tweedie",{
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(0,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + abs(rnorm(N,0,sigma))
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset
  offset <- rep(0, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.no.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                  data=data,               
                  var.monotone=c(0,0,0,0,0,0),
                  distribution="tweedie",
                  n.trees=100,           
                  shrinkage=0.005,       
                  interaction.depth=3,   
                  bag.fraction = 0.5,    
                  train.fraction = 0.5,  
                  mFeatures = 3,         
                  n.minobsinnode = 10,   
                  keep.data=TRUE,
                  cv.folds=1,            
                  verbose = FALSE)       
  set.seed(15)
  gbm.zero.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                       data=data,            
                       var.monotone=c(0,0,0,0,0,0),
                       distribution="tweedie",    
                       n.trees=100,               
                       shrinkage=0.005,           
                       interaction.depth=3,       
                       bag.fraction = 0.5,        
                       train.fraction = 0.5,      
                       mFeatures = 3,             
                       n.minobsinnode = 10,       
                       keep.data=TRUE,
                       offset=offset,
                       cv.folds=1,                
                       verbose = FALSE)           
  
  expect_equal(gbm.no.offset$initF, gbm.zero.offset$initF)
})
test_that("Increasing the offset reduces the initial value - Adaboost", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # Offset 
  offset.smaller <- rep(1, N)
  offset.larger <- rep(20, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.smaller.offset <- gbm(Y~X1+X2+X3,        
                       data=data,              
                       weights=w,
                       offset=offset.smaller,
                       var.monotone=c(0,0,0),  
                       distribution="adaboost",
                       n.trees=100,            
                       shrinkage=0.001,        
                       interaction.depth=3,    
                       bag.fraction = 0.5,     
                       train.fraction = 0.5,   
                       cv.folds=1,             
                       n.minobsinnode = 10,    
                       verbose = FALSE)        
  set.seed(15)  
  gbm.larger.offset <- gbm(Y~X1+X2+X3,
                         data=data,                 
                         weights=w,
                         offset = offset.larger,
                         var.monotone=c(0,0,0),
                         distribution="adaboost",
                         n.trees=100, 
                         shrinkage=0.001,
                         interaction.depth=3,
                         bag.fraction = 0.5,        
                         train.fraction = 0.5,
                         cv.folds=1,                
                         n.minobsinnode = 10,       
                         verbose = FALSE)           
  
  expect_true(gbm.smaller.offset$initF - gbm.larger.offset$initF > 0)
})
test_that("Increasing the offset reduces the initial value - Poisson", {
  # Create data
  set.seed(1)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(0,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- round(Y + abs(rnorm(N,0,sigma))) # Ensure it is normal
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # Offset 
  offset.smaller <- rep(1, N)
  offset.larger <- rep(20, N)
  
  # Generate new gbm object
  set.seed(15)
  gbm.smaller.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                            data=data,          
                            var.monotone=c(0,0,0,0,0,0),
                            distribution="poisson",
                            n.trees=100,        
                            offset=offset.smaller,
                            shrinkage=0.005,
                            interaction.depth=3,
                            bag.fraction = 0.5,
                            train.fraction = 0.5,
                            mFeatures = 3,
                            n.minobsinnode = 10,
                            keep.data=TRUE,
                            cv.folds=1,         
                            verbose = FALSE)    
  set.seed(15)
  gbm.larger.offset <- gbm(Y~X1+X2+X3+X4+X5+X6,
                         data=data,            
                         var.monotone=c(0,0,0,0,0,0),
                         distribution="Poisson",
                         n.trees=100,           
                         offset=offset.larger,
                         shrinkage=0.005,       
                         interaction.depth=3,   
                         bag.fraction = 0.5,    
                         train.fraction = 0.5,  
                         mFeatures = 3,         
                         n.minobsinnode = 10,   
                         keep.data=TRUE,
                         cv.folds=1,            
                         verbose = FALSE)       
    
  
  expect_true(gbm.smaller.offset$initF - gbm.larger.offset$initF > 0)
})
gbm-developers/gbm3 documentation built on April 28, 2024, 10:04 p.m.