# tests/testthat/test-conversion-gbm-data.r

####################
# Author: James Hickey
#
# Series of tests to check that the conversion of factors 
# and that data is data is appropriate for distribution
#
####################

#### Factor Conversion #### 
context("Testing factor conversion")
test_that("Test factor conversion requires GBMData object", {
  # Given a correctly constructed gbm_data object
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)
  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)

  # When its class attribute is overwritten with something bogus
  attr(gbm_data_obj, "class") <- "FAKE"

  # Then conversion refuses anything that is not GBMData
  expect_error(convert_factors(gbm_data_obj))
})

#### Validate data given distribution ####
context("Testing data validation and conversion")
test_that("Validation fails if not passed gbm_data object", {
  # Given data whose class has been stripped and a default distribution
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  attr(gbm_data_obj, "class") <- "FAKE"
  dist_obj <- gbm_dist()

  # Then validation rejects the non-GBMData object
  expect_error(validate_gbm_data(gbm_data_obj, dist_obj))
})
test_that("Validation fails if not given distribution obj", {
  # Given valid data but a distribution whose class has been stripped
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  dist_obj <- gbm_dist()
  attr(dist_obj, "class") <- "FAKE"

  # Then validation rejects the non-GBMDist object
  expect_error(validate_gbm_data(gbm_data_obj, dist_obj))
})
test_that("Weights will be normalized to N if not Pairwise distribution", {
  # Given data and a non-pairwise (default) distribution
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  dist_obj <- gbm_dist()

  # When the data is validated
  gbm_data_obj <- validate_gbm_data(gbm_data_obj, dist_obj)

  # Then the stored weights are rescaled so they sum to num_obs
  expect_equal(weights * num_obs / sum(weights), gbm_data_obj$weights)
})
test_that("Offset vector must contain same number of points as response - if not CoxPH", {
  # Given data and a distribution - not CoxPH
  N <- 1000
  x <- runif(N)
  p <- 0.5
  y <- rbinom(N, 1, p)
  w <- rexp(N)
  offset <- rexp(N)

  data <- gbm_data(matrix(x), y, w, offset)
  dist <- gbm_dist()

  # When offset does not have the same number of points as the response.
  # NOTE: the original index was 1:N-1, which parses as (1:N) - 1 == 0:(N-1);
  # it only produced N-1 elements because R silently drops index 0.
  # seq_len(N - 1) states the intent directly.
  data$offset <- data$offset[seq_len(N - 1)]

  # Then error expected on validation
  expect_error(validate_gbm_data(data, dist))
})
test_that("Offset must contain 1/2 number of points as response - if CoxPH", {
  # Given data (not valid but irrelevant here) and a CoxPH distribution,
  # where the offset is NOT half the length of the response
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs / 2 + 1)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  dist_obj <- gbm_dist("CoxPH")

  # Then validation throws
  expect_error(validate_gbm_data(gbm_data_obj, dist_obj))
})
test_that("Responses check requires GBMDist object", {
  # Given data and a default distribution
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  dist_obj <- gbm_dist()

  # When the distribution's class is clobbered
  attr(dist_obj, "class") <- "FAKE"

  # Then the response check refuses the non-GBMDist object
  expect_error(check_response_values(dist_obj, gbm_data_obj$y))
})
test_that("Responses must be either a data-frame, matrix or vector", {
  # Given data and a default distribution
  num_obs <- 1000
  covariate <- runif(num_obs)
  response <- rbinom(num_obs, 1, 0.5)
  weights <- rexp(num_obs)
  offsets <- rexp(num_obs)

  gbm_data_obj <- gbm_data(matrix(covariate), response, weights, offsets)
  dist_obj <- gbm_dist()

  # When the responses are replaced by a plain list
  gbm_data_obj$y <- list()

  # Then the response check throws
  expect_error(check_response_values(dist_obj, gbm_data_obj$y))
})
test_that("AdaBoost responses must be in {0, 1}", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  dist <- gbm_dist("AdaBoost")

  # When responses not in {0, 1}
  Y[1] <- -1

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
test_that("Bernoulli responses must be in {0, 1}", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  dist <- gbm_dist("Bernoulli")

  # When responses not in {0, 1}
  Y[1] <- -1

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
test_that("CoxPH responses must be a survival object", {
  # Skip cleanly if survival is unavailable; require() would return FALSE
  # and let the test continue without Surv(), failing confusingly.
  skip_if_not_installed("survival")
  library(survival)

  # create some survival data
  set.seed(1)
  N <- 3000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  f <- 0.5 * sin(3 * X1 + 5 * X2^2 + mu / 10)
  tt.surv <- rexp(N, exp(f))
  tt.cens <- rexp(N, 0.5)
  delta <- as.numeric(tt.surv <= tt.cens)
  tt <- apply(cbind(tt.surv, tt.cens), 1, min)

  Resp <- Surv(tt, delta)
  dist <- gbm_dist("CoxPH")

  # When response is stripped of its Surv class
  attr(Resp, "class") <- "NOTSurv"

  # Then error thrown on checking response
  expect_error(check_response_values(dist, Resp))
})
test_that("Gamma responses must be positive", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  dist <- gbm_dist("Gamma")

  # When responses not positive
  Y[1] <- -1

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
test_that("Huberized hinge loss requires responses in {0, 1}", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  dist <- gbm_dist("Huberized")

  # When responses not in {0, 1}
  Y[1] <- -1

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
test_that("Pairwise requires non-negative response - all metrics", {
  # Build query groups averaging 25 items apiece
  set.seed(1)
  num_obs <- 1000
  num.queries <- floor(num_obs / 25)
  query <- sample(1:num.queries, num_obs, replace = TRUE)

  # X1 depends only on the query group
  query.level <- runif(num.queries)
  X1 <- query.level[query]

  # X2 varies per item
  X2 <- runif(num_obs)

  # X3 carries no signal
  X3 <- runif(num_obs)

  # The underlying target
  Y <- X1 + X2

  # Query-correlated (item-uncorrelated) noise on X2
  X2 <- X2 + scale(runif(num.queries))[query]

  # Item-level noise on the target at a fixed signal-to-noise ratio
  SNR <- 5
  sigma <- sqrt(var(Y) / SNR)
  Y <- Y + runif(num_obs, 0, sigma)
  Y.norm <- round(Y / (max(Y) + 0.001))

  # One distribution per supported ranking metric
  dist <- gbm_dist("Pairwise", metric = "ndcg")
  dist_2 <- gbm_dist("Pairwise", metric = "conc")
  dist_3 <- gbm_dist("Pairwise", metric = "mrr")
  dist_4 <- gbm_dist("Pairwise", metric = "map")

  # When a single response goes negative
  Y.norm[1] <- -0.01

  # Then every metric's response check throws
  expect_error(check_response_values(dist, Y.norm))
  expect_error(check_response_values(dist_2, Y.norm))
  expect_error(check_response_values(dist_3, Y.norm))
  expect_error(check_response_values(dist_4, Y.norm))
})
test_that("Pairwise map and mrr metrics require response in {0, 1}", {
  # Build query groups averaging 25 items apiece
  set.seed(1)
  num_obs <- 1000
  num.queries <- floor(num_obs / 25)
  query <- sample(1:num.queries, num_obs, replace = TRUE)

  # X1 depends only on the query group
  query.level <- runif(num.queries)
  X1 <- query.level[query]

  # X2 varies per item
  X2 <- runif(num_obs)

  # X3 carries no signal
  X3 <- runif(num_obs)

  # The underlying target
  Y <- X1 + X2

  # Query-correlated (item-uncorrelated) noise on X2
  X2 <- X2 + scale(runif(num.queries))[query]

  # Item-level noise on the target at a fixed signal-to-noise ratio
  SNR <- 5
  sigma <- sqrt(var(Y) / SNR)
  Y <- Y + runif(num_obs, 0, sigma)
  Y.norm <- round(Y / (max(Y) + 0.001))

  dist <- gbm_dist("Pairwise", metric = "mrr")
  dist_2 <- gbm_dist("Pairwise", metric = "map")

  # When a response escapes {0, 1}
  Y.norm[1] <- 2

  # Then both binary-only metrics throw on the response check
  expect_error(check_response_values(dist, Y.norm))
  expect_error(check_response_values(dist_2, Y.norm))
})
test_that("Poisson requires positive integer response", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rpois(N, p)
  dist <- gbm_dist("Poisson")

  # When a response is not a non-negative integer
  Y[1] <- 0.2

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
test_that("Tweedie requires response to be positive", {
  # create some data
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  # Use TRUE, not the reassignable shorthand T
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  dist <- gbm_dist("Tweedie")

  # When responses not positive
  Y[1] <- -1

  # Then error thrown on validation
  expect_error(check_response_values(dist, Y))
})
# Source: gbm-developers/gbm3 (documentation site footer; kept as a comment so the file parses)