tests/testthat/test-old-api.r

####################
# Author: James Hickey
#
# Series of tests to check that the old gbm
# API still works on basic examples
#
####################

#### GBM ####
context("Test old API works on basic examples - gbm")

test_that("Gaussian works - gbm", {
  skip_on_cran()
  ## Based on example in R package
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm1 <- gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
              data=data,                   # dataset
              var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
              distribution="gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
              # list(name="quantile",alpha=0.05) for quantile regression
              n.trees=2000,                 # number of trees
              shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
              interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc.
              bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
              train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
              n.minobsinnode = 10,         # minimum number of obs needed in each node
              keep.data=TRUE,
              cv.folds=10 # do 10-fold cross-validation
              )                 
  
  # Get best model
  best.iter <- gbm.perf(gbm1,method="cv")   # returns cv estimate of best number of trees
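  # Illustrative only (commented out, not asserted): alternative stopping-rule
  # estimates, assuming gbm.perf also supports the "OOB" and "test" methods as
  # in the classic gbm API (the "test" method is exercised elsewhere in this file).
  # best.iter.oob  <- gbm.perf(gbm1, method="OOB")   # out-of-bag estimate
  # best.iter.test <- gbm.perf(gbm1, method="test")  # held-out test estimate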
  
  set.seed(2)
  # make some new data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  # Actual underlying signal
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  
  # Want to see how close predictions are to the underlying signal; noise would just interfere with this
  # Y <- Y + rnorm(N,0,sigma)
  data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # predict on the new data using "best" number of trees
  f.predict <- predict(gbm1, data2, best.iter) # f.predict is on the canonical scale (identity for Gaussian; logit, log, etc. for other distributions)
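  # Illustrative only (commented out, not asserted): assuming predict() accepts
  # a vector of tree counts, as in the classic gbm API, the fit at several
  # iterations could be compared in a single call.
  # f.predict.grid <- predict(gbm1, data2, n.trees=c(100, 500, best.iter))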
  
  # Base the validation tests on observed discrepancies
  expect_true(cor(data2$Y, f.predict) > 0.990)
  expect_true(sd(data2$Y-f.predict) < sigma)
})

test_that("CoxPH works - efron - gbm", {
  skip_on_cran()
  ## Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(tt=tt,delta=delta,X1=X1,X2=X2,X3=X3)
  
  # fit initial model
  gbm1 <- gbm(Surv(tt,delta)~X1+X2+X3,       # formula
              data=data,                 # dataset
              weights=w,
              var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
              distribution="coxph",
              n.trees=3000,              # number of trees
              shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
              interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
              bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
              train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
              cv.folds = 5,              # do 5-fold cross-validation
              n.minobsinnode = 10,       # minimum total weight needed in each node
              keep.data = TRUE, tied.times.method = "efron", prior.node.coeff.var = 10)
  
  best.iter <- gbm.perf(gbm1, method="test") # returns test set estimate of best number of trees
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)  # -0.5 <= f <= 0.5 via sin fn.
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  
  data2 <- data.frame(tt=apply(cbind(tt.surv,tt.cens),1,min),
                      delta=as.numeric(tt.surv <= tt.cens),
                      f=f,
                      X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.predict <- predict(gbm1,data2,best.iter)
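  # The Cox predictions are on the log relative-hazard scale (see the comment
  # above); an illustrative transform to hazard ratios, up to a multiplicative
  # constant, not used in the assertion below:
  hazard.ratio <- exp(f.predict)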
  
  #plot(data2$f,f.predict)
  # Use observed sd
  expect_true(sd(data2$f - f.predict) < 0.4)
})

test_that("CoxPH works - breslow - gbm", {
  skip_on_cran()
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  
  data <- data.frame(tt=tt,delta=delta,X1=X1,X2=X2,X3=X3)
  
  # fit initial model
  gbm1 <- gbm(Surv(tt,delta)~X1+X2+X3,       # formula
              data=data,                 # dataset
              weights=w,
              var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
              distribution="coxph",
              n.trees=3000,              # number of trees
              shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
              interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
              bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
              train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
              cv.folds = 5,              # do 5-fold cross-validation
              n.minobsinnode = 10,       # minimum total weight needed in each node
              keep.data = TRUE, tied.times.method="breslow", prior.node.coeff.var = 10)
  
  best.iter <- gbm.perf(gbm1,method="test") # returns test set estimate of best number of trees
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)  # -0.5 <= f <= 0.5 via sin fn.
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  
  data2 <- data.frame(tt=apply(cbind(tt.surv,tt.cens),1,min),
                      delta=as.numeric(tt.surv <= tt.cens),
                      f=f,
                      X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.predict <- predict(gbm1,data2, best.iter)
  
  #plot(data2$f,f.predict)
  # Use observed sd
  expect_true(sd(data2$f - f.predict) < 0.4)
})

test_that("coxph - runs to completion with train.fraction of 1.0", {
  ## Needed packages
  require(survival)
  
  # Given data from the survival data
  # keep only certain baseline variables, subjects with longitudinal data
  temp <- subset(pbc, id%in%pbcseq$id, select=c(id:sex, stage))
  pbc1 <- tmerge(data1=temp, data2=temp, id=id, death = event(time, status))
  
  pbc2 <- tmerge(pbc1, pbcseq, id=id, ascites = tdc(day, ascites),
                 bili = tdc(day, bili), albumin = tdc(day, albumin),
                 protime = tdc(day, protime), alk.phos = tdc(day, alk.phos))
  
  # Then expect no errors when performing gbm fit with train fraction = 1.0
  # GBM fit baseline data
  expect_error(gbm(Surv(time, status==2) ~ bili + protime + albumin + alk.phos, data=pbc,
                   train.fraction=1.0, distribution="coxph", n.trees=500, shrinkage=.01,  interaction.depth=3), NA)
  
  # GBM fit using start/stop times to get time-dependent covariates
  expect_error(gbm(Surv(tstart, tstop, death==2) ~ bili + protime + albumin + alk.phos, 
                   data=pbc2, distribution="coxph", train.fraction=1.0, n.trees=500, shrinkage=.01, interaction.depth=3), NA)
})

test_that("coxph - runs to completion with train.fraction < 1.0 and cv.folds > 1", {
  ## Needed packages
  require(survival)
  
  # Given data from the survival data
  # keep only certain baseline variables, subjects with longitudinal data
  temp <- subset(pbc, id%in%pbcseq$id, select=c(id:sex, stage))
  pbc1 <- tmerge(data1=temp, data2=temp, id=id, death = event(time, status))
  
  pbc2 <- tmerge(pbc1, pbcseq, id=id, ascites = tdc(day, ascites),
                 bili = tdc(day, bili), albumin = tdc(day, albumin),
                 protime = tdc(day, protime), alk.phos = tdc(day, alk.phos))
  
  # Then expect no errors when performing gbm fit with train fraction < 1.0 and cv.folds > 1
  # GBM fit baseline data
  expect_error(gbm(Surv(time, status==2) ~ bili + protime + albumin + alk.phos, data=pbc,
                   train.fraction=0.8, distribution="coxph", n.trees=500, shrinkage=.01,  interaction.depth=3), NA)
  expect_error(gbm(Surv(time, status==2) ~ bili + protime + albumin + alk.phos, data=pbc,
                   train.fraction=0.8, distribution="coxph", n.trees=500, shrinkage=.01,  cv.folds=5, interaction.depth=3), NA)
  
  # GBM fit using start/stop times to get time-dependent covariates
  expect_error(gbm(Surv(tstart, tstop, death==2) ~ bili + protime + albumin + alk.phos, 
                   data=pbc2, distribution="coxph", train.fraction=0.8, n.trees=500, shrinkage=.01, interaction.depth=3), NA)
  expect_error(gbm(Surv(tstart, tstop, death==2) ~ bili + protime + albumin + alk.phos, 
                   data=pbc2, distribution="coxph", train.fraction=0.8, n.trees=500, shrinkage=.01, cv.folds=5, interaction.depth=3), NA)
})

test_that("coxph cv.folds - runs to completion with start-stop, id'ed and stratified dataset", {
  ## Needed packages
  require(survival)
  
  # Given data from the survival package
  cgd2 <- cgd[cgd$enum==1,]
  
  # Then fitting a gbm model should throw no errors - with cv.folds > 1
  expect_error(gbm(Surv(tstop, status) ~ age + sex + inherit +
                     steroids + propylac + hos.cat, data=cgd2, 
                   n.trees=500, shrinkage=.01, distribution = "coxph", interaction.depth=1, train.fraction=1.0, cv.folds=10), NA)
  expect_error(gbm(Surv(tstop, status) ~ age + sex + inherit +
                     steroids + propylac + hos.cat, data=cgd2, 
                   n.trees=500, shrinkage=.01, distribution = "coxph", interaction.depth=1, train.fraction=0.8, cv.folds=5), NA)
  
  expect_error(gbm(Surv(tstart, tstop, status) ~ age + sex + inherit +
                     steroids + propylac, data=cgd, obs.id=cgd$id,
                   train.fraction=1.0, n.trees=500, strata= cgd$hos.cat, distribution = "coxph", shrinkage=.01, interaction.depth=3, cv.folds=10), NA)
  expect_error(gbm(Surv(tstart, tstop, status) ~ age + sex + inherit +
                     steroids + propylac, data=cgd, obs.id=cgd$id,
                   train.fraction=0.8, n.trees=500, strata= cgd$hos.cat, distribution = "coxph", shrinkage=.01, interaction.depth=3, cv.folds=10), NA)
})

test_that("Bernoulli works - gbm", {
  skip_on_cran()  
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  
  data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # fit initial model
  gbm1 <- gbm(Y~X1+X2+X3,                # formula
              data=data,                 # dataset
              weights=w,
              var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
              distribution="bernoulli",
              n.trees=3000,              # number of trees
              shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
              interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
              bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
              train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
              cv.folds=5,                # do 5-fold cross-validation
              n.minobsinnode = 10      # minimum total weight needed in each node
              )
  
  best.iter.test <- gbm.perf(gbm1,method="test") # returns test set estimate of best number of trees
  
  best.iter <- best.iter.test
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.1.predict <- predict(gbm1, data2, n.trees=best.iter.test)
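  # The Bernoulli predictions are on the logit scale (see the comment above);
  # an illustrative conversion back to probabilities, not used in the
  # assertion below:
  p.1.predict <- 1/(1 + exp(-f.1.predict))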
  
  # compute quantity prior to transformation
  f.new <- sin(3*X1) - 4*X2 + mu
  
  # Base the validation tests on observed discrepancies
  expect_true(sd(f.new - f.1.predict) < 1.0)
})

test_that("relative influence picks out true predictors", {
  set.seed(1234)
  X1 <- matrix(nrow=1000, ncol=50)
  X1 <- apply(X1, 2, function(x) rnorm(1000)) # Random noise
  X2 <- matrix(nrow=1000, ncol=5)
  X2 <- apply(X2, 2, function(x) c(rnorm(500), rnorm(500, 3))) # Real predictors
  cls <- rep(c(0, 1), each=500) # Class labels
  X <- data.frame(cbind(X1, X2, cls))
  mod <- gbm(cls ~ ., data=X, n.trees=1000, cv.folds=5,
             shrinkage=.01, interaction.depth=2,
             distribution='bernoulli')
  ri <- relative_influence(mod, sort_it=TRUE, rescale=TRUE)
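  # Illustrative variant (commented out, not asserted): the influence could
  # also be evaluated at the CV-selected number of trees, reusing arguments
  # already used elsewhere in this file.
  # ri_cv <- relative_influence(mod, num_trees=gbm.perf(mod, method="cv"),
  #                             sort_it=TRUE, rescale=TRUE)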
  
  wh <- names(ri)[1:5]
  res <- sum(wh %in% paste("V", 51:55, sep = ""))
  expect_equal(res, 5)
})
test_that("Conversion of 2 factor Y is successful", {
  
  NumY <- sample(c(0, 1), size=1000, replace=TRUE)
  FactY <- factor(ifelse(NumY==1, "Yes", "No"), levels=c("No", "Yes"))
  
  PredX <- data.frame(x1 = runif(1000),
                      x2 = runif(1000))
  
  set.seed(32479)
  g1 <- gbm(y ~ ., data = data.frame(y = NumY, PredX),
            distribution = 'bernoulli', verbose = FALSE,
            n.trees = 50)
  rig1 <- relative_influence(g1, num_trees=50)
  
  set.seed(32479)
  g2 <- gbm(y ~ ., data = data.frame(y = FactY, PredX),
            distribution = 'bernoulli', verbose = FALSE,
            n.trees = 50)
  rig2 <- relative_influence(g2, num_trees=50)
  
  expect_equal(rig1, rig2)
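  # A further, hypothetical check one might add (kept commented out so the
  # test's scope is unchanged): the two fits should agree on predictions as
  # well, not just on relative influence.
  # expect_equal(predict(g1, PredX, n.trees=50), predict(g2, PredX, n.trees=50))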
})

#### GBM.FIT ####
context("Test old API works on basic examples - gbm.fit")
test_that("Gaussian works - gbm.fit", {
  skip_on_cran()
  ## Based on example in R package
  
  ## test Gaussian distribution gbm model
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  X4 <- ordered(sample(letters[1:6],N,replace=T))
  X5 <- factor(sample(letters[1:3],N,replace=T))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  SNR <- 10 # signal-to-noise ratio
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  sigma <- sqrt(var(Y)/SNR)
  Y <- Y + rnorm(N,0,sigma)
  
  # create a bunch of missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  w <- rep(1,N)
  X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # fit initial model
  gbm1 <- gbm.fit(X,                           # predictor data frame
                  Y,                           # response vector
                  var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                  distribution="gaussian",     # bernoulli, adaboost, gaussian, poisson, coxph, or
                  # list(name="quantile",alpha=0.05) for quantile regression
                  n.trees=2000,                 # number of trees
                  shrinkage=0.005,             # shrinkage or learning rate, 0.001 to 0.1 usually work
                  interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc.
                  bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
                  train.fraction = 0.5,        # fraction of data for training, first train.fraction*N used for training
                  n.minobsinnode = 10,         # minimum number of obs needed in each node
                  keep.data=TRUE)                 
  
  # Get best model
  best.iter <- gbm.perf(gbm1, method="test")   # returns test set estimate of best number of trees
  
  set.seed(2)
  # make some new data
  N <- 1000
  X1 <- runif(N)
  X2 <- 2*runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=TRUE))
  X4 <- ordered(sample(letters[1:6],N,replace=TRUE))
  X5 <- factor(sample(letters[1:3],N,replace=TRUE))
  X6 <- 3*runif(N)
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  # Actual underlying signal
  Y <- X1**1.5 + 2 * (X2**.5) + mu
  
  # Want to see how close predictions are to the underlying signal; noise would just interfere with this
  # Y <- Y + rnorm(N,0,sigma)
  data2 <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
  
  # predict on the new data using "best" number of trees
  f.predict <- predict(gbm1, data2, best.iter) # f.predict is on the canonical scale (identity for Gaussian; logit, log, etc. for other distributions)
  
  # Base the validation tests on observed discrepancies
  expect_true(cor(Y, f.predict) > 0.990)
  expect_true(sd(Y-f.predict) < sigma)
})

test_that("CoxPH works - efron - gbm", {
  skip_on_cran()
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  X <- data.frame(X1=X1,X2=X2,X3=X3)

  # fit initial model
  gbm1 <- gbm.fit(X,                         # predictor data frame
                  Surv(tt, delta),           # survival response
                  w=w,
                  var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                  distribution="coxph",
                  n.trees=3000,              # number of trees
                  shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
                  interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
                  bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
                  train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
                  n.minobsinnode = 10,       # minimum total weight needed in each node
                  keep.data = TRUE, tied.times.method = "efron", prior.node.coeff.var = 10)
  
  best.iter <- gbm.perf(gbm1, method="test") # returns test set estimate of best number of trees
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)  # -0.5 <= f <= 0.5 via sin fn.
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  
  data2 <- data.frame(X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.predict <- predict(gbm1, data2, best.iter)
  
  #plot(data2$f,f.predict)
  # Use observed sd
  expect_true(sd(f - f.predict) < 0.4)
})

test_that("CoxPH works - breslow - gbm", {
  skip_on_cran()
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  X <- data.frame(X1=X1,X2=X2,X3=X3)
  
  # fit initial model
  gbm1 <- gbm.fit(X,                         # predictor data frame
                  Surv(tt, delta),           # survival response
                  w=w,
                  var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                  distribution="coxph",
                  n.trees=3000,              # number of trees
                  shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
                  interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
                  bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
                  train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
                  n.minobsinnode = 10,       # minimum total weight needed in each node
                  keep.data = TRUE, tied.times.method = "breslow", prior.node.coeff.var = 10)
  
  best.iter <- gbm.perf(gbm1, method="test") # returns test set estimate of best number of trees
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)  # -0.5 <= f <= 0.5 via sin fn.
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  
  data2 <- data.frame(X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.predict <- predict(gbm1, data2, best.iter)
  
  #plot(data2$f,f.predict)
  # Use observed sd
  expect_true(sd(f - f.predict) < 0.4)
})

test_that("Bernoulli works - gbm", {
  skip_on_cran()
  
  set.seed(1)
  
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N*w/sum(w)
  X <- data.frame(X1=X1,X2=X2,X3=X3)
  
  # fit initial model
  gbm1 <- gbm.fit(X,                         # predictor data frame
                  Y,                         # response vector
                  w=w,
                  var.monotone=c(0,0,0),     # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
                  distribution="bernoulli",
                  n.trees=3000,              # number of trees
                  shrinkage=0.001,           # shrinkage or learning rate, 0.001 to 0.1 usually work
                  interaction.depth=3,       # 1: additive model, 2: two-way interactions, etc.
                  bag.fraction = 0.5,        # subsampling fraction, 0.5 is probably best
                  train.fraction = 0.5,      # fraction of data for training, first train.fraction*N used for training
                  n.minobsinnode = 10)
  
  best.iter.test <- gbm.perf(gbm1, method="test") # returns test set estimate of best number of trees
  best.iter <- best.iter.test
  
  # make some new data
  set.seed(2)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
  Y <- rbinom(N,1,p)
  data2 <- data.frame(X1=X1,X2=X2,X3=X3)
  
  # predict on the new data using "best" number of trees
  # f.predict will be on the canonical scale (logit,log,etc.)
  f.1.predict <- predict(gbm1, data2, n.trees=best.iter.test)
  
  # compute quantity prior to transformation
  f.new <- sin(3*X1) - 4*X2 + mu
  
  # Base the validation tests on observed discrepancies
  expect_true(sd(f.new - f.1.predict) < 1.0)
})