test_ranger.R
In ranger: A Fast Implementation of Random Forests

library(ranger)
library(survival)
context("ranger")

test_that("Matrix interface works for Probability estimation", {
  rf <- ranger(dependent.variable.name = "Species", data = data.matrix(iris), write.forest = TRUE, probability = TRUE)
  expect_equal(rf$treetype, "Probability estimation")
  expect_equal(rf$forest$independent.variable.names, colnames(iris)[1:4])
})

test_that("Matrix interface prediction works for Probability estimation", {
  dat <- data.matrix(iris)
  rf <- ranger(dependent.variable.name = "Species", data = dat, write.forest = TRUE, probability = TRUE)
  expect_silent(predict(rf, dat))
})

test_that("no warning if data.frame has two classes", {
  dat <- iris
  class(dat) <- c("data.frame", "data.table")
  expect_silent(ranger(Species ~ ., data = dat))
})

test_that("Error if sample fraction is 0 or >1", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = 0))
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = 1.1))
})

test_that("Error if sample fraction is vector for regression", {
  expect_error(ranger(Sepal.Length ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2)), 
               "Error: Invalid value for sample\\.fraction\\. Vector values only valid for classification forests\\.")
})

test_that("Error if sample fraction is vector of wrong size", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2)), 
               "Error: Invalid value for sample\\.fraction\\. Expecting 3 values, provided 2\\.")
})

test_that("Error if element of sample fraction vector is <0 or >1", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 1.1, 0.3)), 
               "Error: Invalid value for sample\\.fraction. Please give a value in \\(0,1\\] or a vector of values in \\[0,1\\]\\.")
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(-3, 0.5, 0.3)), 
               "Error: Invalid value for sample.fraction. Please give a value in \\(0,1] or a vector of values in \\[0,1\\]\\.")
})

test_that("Error if sum of sample fraction vector is 0", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0, 0, 0)), 
               "Error: Invalid value for sample\\.fraction. Sum of values must be >0\\.")
})

test_that("Error if replace=FALSE and not enough samples", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
                      replace = FALSE, keep.inbag = TRUE), 
               "Error: Not enough samples in class virginica; available: 50, requested: 60.")
  expect_silent(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
                       replace = TRUE, keep.inbag = TRUE))
})

test_that("Error if sample.fraction and case.weights", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
                      case.weights = rbinom(nrow(iris), 1, 0.5)), 
               "Error: Combination of case\\.weights and class-wise sampling not supported\\.")
})

test_that("Inbag counts match sample fraction, classification", {
  ## With replacement
  rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
               replace = TRUE, keep.inbag = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 30)
  expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 45)
  expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 60)
  
  ## Without replacement
  rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), 
               replace = FALSE, keep.inbag = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 15)
  expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 30)
  expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 45)
  
  ## Different order, without replacement
  dat <- iris[c(51:100, 101:150, 1:50), ]
  rf <- ranger(Species ~ ., dat, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), 
               replace = FALSE, keep.inbag = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[dat$Species == "setosa", ])), 15)
  expect_equal(unique(colSums(inbag[dat$Species == "versicolor", ])), 30)
  expect_equal(unique(colSums(inbag[dat$Species == "virginica", ])), 45)
  
  ## No factor outcome
  rf <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
               replace = TRUE, keep.inbag = TRUE, classification = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[iris$Species == "setosa", ])), 30)
  expect_equal(unique(colSums(inbag[iris$Species == "versicolor", ])), 45)
  expect_equal(unique(colSums(inbag[iris$Species == "virginica", ])), 60)
})

test_that("Inbag counts match sample fraction, probability", {
  ## With replacement
  rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
               replace = TRUE, keep.inbag = TRUE, probability = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[1:50, ])), 30)
  expect_equal(unique(colSums(inbag[51:100, ])), 45)
  expect_equal(unique(colSums(inbag[101:150, ])), 60)
  
  ## Without replacement
  rf <- ranger(Species ~ ., iris, num.trees = 5, sample.fraction = c(0.1, 0.2, 0.3), 
               replace = FALSE, keep.inbag = TRUE, probability = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[1:50, ])), 15)
  expect_equal(unique(colSums(inbag[51:100, ])), 30)
  expect_equal(unique(colSums(inbag[101:150, ])), 45)
  
  ## No factor outcome
  rf <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, sample.fraction = c(0.2, 0.3, 0.4), 
               replace = TRUE, keep.inbag = TRUE, probability = TRUE)
  inbag <- do.call(cbind, rf$inbag.counts)
  expect_equal(unique(colSums(inbag[1:50, ])), 30)
  expect_equal(unique(colSums(inbag[51:100, ])), 45)
  expect_equal(unique(colSums(inbag[101:150, ])), 60)
})

test_that("as.factor() in formula works", {
  n <- 20
  dt <- data.frame(x = runif(n), y = rbinom(n, 1, 0.5))
  expect_silent(ranger(as.factor(y) ~ ., data = dt, num.trees = 5, write.forest = TRUE))
})

test_that("holdout mode holding out data with 0 weight", {
  weights <- rbinom(nrow(iris), 1, 0.5)
  rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation",  
               case.weights = weights, replace = FALSE, sample.fraction = 0.632*mean(weights), 
               holdout = TRUE, keep.inbag = TRUE)
  inbag <- data.frame(rf$inbag.counts)
  expect_true(all(inbag[weights == 0, ] == 0))
})

test_that("holdout mode uses holdout OOB data", {
  weights <- rbinom(nrow(iris), 1, 0.5)
  rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation",  
               case.weights = weights, replace = FALSE, sample.fraction = 0.632*mean(weights), 
               holdout = TRUE, keep.inbag = TRUE)
  expect_false(any(is.na(rf$predictions[weights == 0])))
  expect_true(all(is.na(rf$predictions[weights == 1])))
})

test_that("holdout mode not working if no weights", {
  expect_error(ranger(Species ~ ., iris, num.trees = 5, importance = "permutation", holdout = TRUE))
})

test_that("holdout mode: no OOB prediction if no 0 weights", {
  weights <- runif(nrow(iris))
  rf <- ranger(Species ~ ., iris, num.trees = 5, importance = "permutation",  
               case.weights = weights, replace = FALSE, 
               holdout = TRUE, keep.inbag = TRUE)
  expect_true(all(is.na(rf$predictions)))
})

test_that("OOB error is correct for 1 tree, classification", {
  n <- 50
  dat <- data.frame(y = factor(rbinom(n, 1, .5)), x = rnorm(n))
  rf <- ranger(y ~ ., dat, num.trees = 1)
  expect_equal(rf$prediction.error, mean(rf$predictions != dat$y, na.rm = TRUE))
})

test_that("OOB error is correct for 1 tree, probability prediction", {
  n <- 50
  dat <- data.frame(y = factor(rbinom(n, 1, .5)), x = rnorm(n))
  rf <- ranger(y ~ ., dat, num.trees = 1, probability = TRUE)
  prob <- c(rf$predictions[dat$y == "0", 1], rf$predictions[dat$y == "1", 2])
  expect_equal(rf$prediction.error, mean((1 - prob)^2, na.rm = TRUE))
})

test_that("OOB error is correct for 1 tree, regression", {
  n <- 50
  dat <- data.frame(y = rbinom(n, 1, .5), x = rnorm(n))
  rf <- ranger(y ~ ., dat, num.trees = 1)
  expect_equal(rf$prediction.error, mean((dat$y - rf$predictions)^2, na.rm = TRUE))
})

test_that("Split points are at (A+B)/2 for numeric features, regression variance splitting", {
  dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5))
  rf <- ranger(y ~ x, dat, num.trees = 10)
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, regression maxstat splitting", {
  dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5))
  rf <- ranger(y ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1)
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, classification", {
  dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5))
  rf <- ranger(y ~ x, dat, num.trees = 10)
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, probability", {
  dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5))
  rf <- ranger(y ~ x, dat, num.trees = 10, probability = TRUE)
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, survival logrank splitting", {
  dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5))
  rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "logrank")
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, survival C-index splitting", {
  dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5))
  rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "C")
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("Split points are at (A+B)/2 for numeric features, survival maxstat splitting", {
  dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5))
  rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1)
  split_points <- sapply(1:rf$num.trees, function(i) {
    res <- treeInfo(rf, i)$splitval
    res[!is.na(res)]
  })
  expect_equal(split_points, rep(0.5, rf$num.trees))
})

test_that("No error if variable named forest", {
  dat <- iris
  dat$forest <- rnorm(150)
  rf <- ranger(Species ~ ., dat, num.trees = 5)
  expect_silent(predict(rf, dat))
})

test_that("Prediction error not NA if oob.error=TRUE", {
  rf <- ranger(Species ~ ., iris, num.trees = 5)
  expect_false(is.na(rf$prediction.error))
  
  rf <- ranger(Surv(time,status) ~ ., veteran, num.trees = 5)
  expect_false(is.na(rf$prediction.error))
})

test_that("Prediction error is NA if oob.error=FALSE", {
  rf <- ranger(Species ~ ., iris, num.trees = 5, oob.error = FALSE)
  expect_true(is.na(rf$prediction.error))
  
  rf <- ranger(Surv(time,status) ~ ., veteran, num.trees = 5, oob.error = FALSE)
  expect_true(is.na(rf$prediction.error))
})

test_that("Tree depth creates trees of correct size", {
  # Recursive function to get tree depth
  depth <- function(rf, tree, i) {
    left <- rf$forest$child.nodeIDs[[tree]][[1]][i] + 1
    right <- rf$forest$child.nodeIDs[[tree]][[2]][i] + 1
    if (left <= 1) {
      0
    } else {
      1 + max(c(depth(rf, tree, left), depth(rf, tree, right)))
    }
  }
  forest_depth <- function(rf) {
    sapply(1:rf$num.trees, depth, rf = rf, i = 1)
  }
  
  # Depth 1
  rf <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 1)
  expect_true(all(forest_depth(rf) <= 1))
  
  # Depth 4
  rf <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 4)
  expect_true(all(forest_depth(rf) <= 4))
  
  # Random depth (deeper trees)
  max.depth <- round(runif(1, 1, 20))
  dat <- data.frame(y = runif(100, 0, 1), x = runif(100, 0, 1))
  rf <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 1, max.depth = max.depth)
  expect_true(all(forest_depth(rf) <= max.depth))
})

test_that("Tree depth 0 equivalent to unlimited", {
  set.seed(200)
  rf1 <- ranger(Species ~ ., iris, num.trees = 5, max.depth = 0)
  
  set.seed(200)
  rf2 <- ranger(Species ~ ., iris, num.trees = 5)
  
  expect_equal(sapply(rf1$forest$split.varIDs, length), 
               sapply(rf2$forest$split.varIDs, length))
})

test_that("Meaningful predictions with max.depth = 1", {
  rf <- ranger(Sepal.Length ~ ., iris, max.depth = 1, num.trees = 5)
  pred <- predict(rf, iris)$predictions
  expect_gte(min(pred), min(iris$Sepal.Length))
  expect_lte(max(pred), max(iris$Sepal.Length))
})

test_that("Does not crash when variable named 'none'", {
  dat <- data.frame(y = rbinom(100, 1, .5), 
                    x = rbinom(100, 1, .5), 
                    none = rbinom(100, 1, .5))
  rf <- ranger(data = dat, dependent.variable.name = "y")
  expect_equal(rf$forest$independent.variable.names, c("x", "none"))
  expect_silent(predict(rf, dat))
})

test_that("mtry function input works as expected", {
  rf <- ranger(Species ~ ., data = iris, mtry = function(n) n - 1)
  expect_equal(3, rf$mtry)
})


test_that("mtry function error halts the ranger function", {
  expect_error(
    ranger(Species ~ ., data = iris, mtry = function(n) stop("this is some error")), 
    "mtry function evaluation resulted in an error.")
})

test_that("min.bucket creates nodes of correct size", {
  
  # Size 2
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = 2, keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  smallest_node <- min(sapply(1:ncol(pred), function(i) {
    min(table(pred[inbag[, i], i]))
  }))
  expect_gte(smallest_node, 2)
  
  # Size 10
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = 10, keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  smallest_node <- min(sapply(1:ncol(pred), function(i) {
    min(table(pred[inbag[, i], i]))
  }))
  expect_gte(smallest_node, 10)
  
  # Random size
  min.bucket <- round(runif(1, 1, 40))
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = min.bucket, keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  smallest_node <- min(sapply(1:ncol(pred), function(i) {
    min(table(pred[inbag[, i], i]))
  }))
  expect_gte(smallest_node, min.bucket)
})

test_that("Vector min.bucket creates nodes of correct size", {
  
  # Size 2,3,4
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = c(2, 3, 4), keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  
  smallest_nodes <- sapply(1:ncol(pred), function(i) {
    pred1 <- pred[which(inbag[, i][1:50]), i]
    pred2 <- pred[which(inbag[, i][51:100]) + 50, i]
    pred3 <- pred[which(inbag[, i][101:150]) + 100, i]
    
    pred <- rbind(data.frame(class = 1, node = pred1),
                  data.frame(class = 2, node = pred2), 
                  data.frame(class = 3, node = pred3))
    apply(table(pred), 1, min)
  })
  
  expect_true(all(smallest_nodes >= matrix(c(2, 3, 4), ncol = 5, nrow = 3)))
  
  # Size 4,3,2
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = c(4, 3, 2), keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  
  smallest_nodes <- sapply(1:ncol(pred), function(i) {
    pred1 <- pred[which(inbag[, i][1:50]), i]
    pred2 <- pred[which(inbag[, i][51:100]) + 50, i]
    pred3 <- pred[which(inbag[, i][101:150]) + 100, i]
    
    pred <- rbind(data.frame(class = 1, node = pred1),
                  data.frame(class = 2, node = pred2), 
                  data.frame(class = 3, node = pred3))
    apply(table(pred), 1, min)
  })
  
  expect_true(all(smallest_nodes >= matrix(c(4, 3, 2), ncol = 5, nrow = 3)))
  
  # Random size
  min.bucket <- round(runif(3, 1, 10))
  rf <- ranger(Species ~ ., iris, num.trees = 5, replace = FALSE, 
               min.bucket = min.bucket, keep.inbag = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  
  smallest_nodes <- sapply(1:ncol(pred), function(i) {
    pred1 <- pred[which(inbag[, i][1:50]), i]
    pred2 <- pred[which(inbag[, i][51:100]) + 50, i]
    pred3 <- pred[which(inbag[, i][101:150]) + 100, i]
    
    pred <- rbind(data.frame(class = 1, node = pred1),
                  data.frame(class = 2, node = pred2), 
                  data.frame(class = 3, node = pred3))
    apply(table(pred), 1, min)
  })
  
  expect_true(all(smallest_nodes >= matrix(min.bucket, ncol = 5, nrow = 3)))
  
  # No factor outcome
  rf <- ranger(Species ~ ., data.matrix(iris), num.trees = 5, replace = FALSE, 
               min.bucket = c(2, 3, 4), keep.inbag = TRUE, classification = TRUE)
  pred <- predict(rf, iris, type = "terminalNodes")$prediction
  inbag <- sapply(rf$inbag.counts, function(x) x == 1)
  
  smallest_nodes <- sapply(1:ncol(pred), function(i) {
    pred1 <- pred[which(inbag[, i][1:50]), i]
    pred2 <- pred[which(inbag[, i][51:100]) + 50, i]
    pred3 <- pred[which(inbag[, i][101:150]) + 100, i]
    
    pred <- rbind(data.frame(class = 1, node = pred1),
                  data.frame(class = 2, node = pred2), 
                  data.frame(class = 3, node = pred3))
    apply(table(pred), 1, min)
  })
  
  expect_true(all(smallest_nodes >= matrix(c(2, 3, 4), ncol = 5, nrow = 3)))
})