tests/testthat/test_poissonsplit.R

library(ranger)
context("ranger_poisson")

# Generate poisson distributed outcome
set.seed(42)
n <- 1000
p <- 4
beta <- c(0, 0.1, -0.2, 0.3)
x <- replicate(p, runif(n))
# Use exp(..) to keep it positive (and adds interactions).
# Add -1 to make it small as Poisson should be better for small frequencies.
lambda <- exp(-1 + as.vector(x %*% beta))
y <- rpois(n, lambda)
df <- data.frame(y = y, x)

# And a simple dataset with zero outcomes
df2 <- data.frame(y = c(0, 0, 0, 0, 0, 1, 2, 3, 4, 5),
                  x1 = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"),
                  x2 = c(0, 0, 0, 0, 0, 1, 1, 1, 2, 2))

poisson_deviance <- function(y_true, y_pred) {
  if (any(y_true == 0 & y_pred == 0)) {
    stop("Error: Poisson deviance does not exist for y_pred == y_true == 0.")
  }
  pos <- y_true > 0
  dev <- y_pred
  dev[pos] <- y_true[pos] * log(y_true[pos] / y_pred[pos]) - y_true[pos] + y_pred[pos]
  return(2 * mean(dev))
}


test_that("poisson splitting works on poisson distributed data", {
  n_train = 1:(4*n %/% 5)
  n_test = (max(n_train)+1):n
  df_train = df[n_train, ]
  df_test = df[n_test, ]
  rf_poi <- ranger(y ~ ., df_train, splitrule = "poisson", num.trees = 50, min.node.size = 50, poisson.tau = 1, seed = 123)
  rf_mse <- ranger(y ~ ., df_train, splitrule = "variance", num.trees = 50, min.node.size = 50, seed = 123)
  
  expect_is(rf_poi, "ranger")
  # deviance on test set
  expect_lt(poisson_deviance(df_test$y, predict(rf_poi, df_test)$predictions),
            poisson_deviance(df_test$y, predict(rf_mse, df_test)$predictions))
})

test_that("poisson splitting not working for negative outcome", {
  expect_error(ranger(y ~ ., data.frame(y = c(-1.5, 2), x = c(1, 2)), splitrule = "poisson"))
  expect_error(ranger(y ~ ., data.frame(y = c(0, 0), x = c(1, 2)), splitrule = "poisson"))
})

test_that("poisson.tau <= 0 throws error", {
  expect_error(ranger(y ~ ., df2, poisson.tau = 0))
})

test_that("poisson splitting predicts positive even on nodes with all values equal 0", {
  rf <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2,
               min.node.size = 1, seed = 123)
  expect_true(all(c(predict(rf, df2, predict.all = TRUE)$predictions) > 0))
})

test_that("poisson splitting gives larger predictions for larger values of poisson.tau on pure nodes with y = 0", {
  rf1 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 0.1, mtry = 2, num.trees = 2,
               min.node.size = 1, seed = 123)
  rf2 <- ranger(y ~ ., df2, splitrule = "poisson", poisson.tau = 10, mtry = 2, num.trees = 2,
               min.node.size = 1, seed = 123)
  expect_true(all(predict(rf2, df2)$predictions[df2$y == 0] > predict(rf1, df2)$predictions[df2$y == 0]))
})

Try the ranger package in your browser

Any scripts or data that you put into this service are public.

ranger documentation built on April 4, 2025, 6:12 a.m.