tests/testthat/test-create-strata.r

####################
# Author: James Hickey
#
# Series of tests to check the functionality that creates strata
#
####################

context("Testing Strata creation:")
test_that("Strata creation function requires GBMData and GBMDist objects", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, 
                             num_features = 3, bag_fraction = 1, min_num_obs_in_node = 1)
  dist <- gbm_dist("CoxPH")
  
  # When not a GBMData object or GBMDist
  copy_data <- data
  copy_dist <- dist
  attr(data, "class") <- "NOTData"
  attr(dist, "class") <- "NOTDist"
  
  # Then an error is thrown
  expect_error(create_strata(data, train_p, copy_dist))
  expect_error(create_strata(copy_data, train_p, dist))
})
test_that("Strata are NA if distribution is not CoxPH", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, 
                             num_features = 3, bag_fraction = 1, min_num_obs_in_node = 1)
  
  # GIVEN Dist Not COXPH
  dist <- gbm_dist("AdaBoost")
  
  # When strata created
  dist <- create_strata(data, train_p, dist)
  
  # Then dist object is unchanged
  expect_true(is.na(dist$strata))
  expect_true(is.na(dist$sorted))
})
test_that("Creating strata fills strata, time_order and sorted fields - CoxPH", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N/2)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, num_features = 3,
                             bag_fraction = 1, min_num_obs_in_node = 1)
  
  # GIVEN Dist - COXPH
  dist <- gbm_dist("CoxPH")
  
  # When strata created
  dist <- create_strata(data, train_p, dist)
  
  # Then dist object is changed
  expect_equal(length(dist$strata), N)
  expect_equal(length(dist$time_order), N)
  expect_equal(nrow(dist$sorted), N)
})
test_that("If response is a matrix with more than 3 columns strata cannot be created - CoxPH", {
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N)
  
  
  # GIVEN Dist - COXPH
  dist <- gbm_dist("CoxPH")
  
  # When response has too many columns - set to 4
  Resp <- data.frame(tt, delta)
  Resp <- cbind(cbind(Resp, rnorm(N)), rnorm(N))
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, num_features = 3,
                             bag_fraction = 1, min_num_obs_in_node = 1)
  
  # Then error thrown when creating strata
  expect_error(create_strata(data, train_p, dist))
})
test_that("If strata field in distribution object is NULL, all data are put in same strata - CoxPH", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, num_features = 3, 
                             bag_fraction = 1, min_num_obs_in_node = 1)
  
  # GIVEN Dist - COXPH
  dist <- gbm_dist("CoxPH")
  
  # When creating strata with strata initialized to NULL
  dist$strata <- NULL
  dist <- create_strata(data, train_p, dist)
  
  # Then all examples put in same strata
  expect_equal(dist$strata[1], N)
})
test_that("The training responses are sorted according to strata and this order is stored in time_order", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/5), rep(2, N/5), rep(3, N/5), rep(4, N/5)), num_train = 4, num_features = 3, 
                             bag_fraction = 1, min_num_obs_in_node = 1)
  
  # GIVEN Dist - COXPH
  dist <- gbm_dist("CoxPH")
  
  # When strata are created 
  dist <- create_strata(data, train_p, dist)
    
  # Then the training responses are sorted according to strata
  expect_equal(order(-data$y[seq_len(4*N/5), 1]), dist$time_order[seq_len(4*N/5)])
  expect_equal(order(-data$y[(4*N/5 + 1):N, 1])+(4*N/5), dist$time_order[(4*N/5 + 1):N])
})
test_that("Strata not NULL then observations put in different strata - CoxPH", {
  # Require Surv to be available
  require(survival)
  
  # create some data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4],N,replace=T))
  mu <- c(-1,0,1,2)[as.numeric(X3)]
  
  f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10)
  tt.surv <- rexp(N,exp(f))
  tt.cens <- rexp(N,0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv,tt.cens),1,min)
  
  # throw in some missing values
  X1[sample(1:N,size=100)] <- NA
  X3[sample(1:N,size=300)] <- NA
  
  # random weights if you want to experiment with them
  w <- rep(1,N)
  offset <- rep(0, N/2)
  
  Resp <- Surv(tt, delta)
  data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset)
  train_p <- training_params(id=c(rep(1, N/4), rep(2, N/4), rep(3, N/4), rep(4, N/4)), num_train = 4, num_features = 3,
                             bag_fraction = 1, min_num_obs_in_node = 1)
  
  # GIVEN Dist - COXPH
  dist <- gbm_dist("CoxPH")
  Num_Strata <- 5
  strata <- sample(seq_len(Num_Strata), N, replace=TRUE)
  dist$original_strata_id <- strata
  
  # When strata are created
  dist <- create_strata(data, train_p, dist)
  
  # Then ordered by id
  expect_equal(dist$strata[seq_len(Num_Strata)], as.vector(cumsum(table(strata))))
})
gbm-developers/gbm3 documentation built on April 28, 2024, 10:04 p.m.