Several other R packages can grow oblique forests for classification, including:

- RLT (https://cran.r-project.org/package=RLT)
- ODRF (https://liuyu-star.github.io/ODRF/)
- obliqueRF (no longer on CRAN, so not included)

```r
library(ODRF)
library(RLT)
library(microbenchmark)

# also used below: aorsf for orsf(), plus tidyverse-style helpers
library(aorsf)
library(dplyr)
library(tidyr)
library(yardstick)
```
We'll use the attrition data from the modeldata R package.
```r
# keep only numeric predictors (easier to convert to matrix)
data_all <- as_tibble(modeldata::attrition) %>% 
  select(outcome = Attrition, where(is.numeric))

# x and y matrices for ODRF and RLT
x <- as.matrix(select(data_all, -outcome))
y <- data_all$outcome
```
Compare the time taken to grow a forest of 100 trees with the RLT, aorsf, and ODRF packages:
```r
options(microbenchmark.unit = "relative")

# This benchmark is run using a 24-core processor
bm <- microbenchmark::microbenchmark(
  fit_orsf = orsf(data_all, outcome ~ ., n_tree = 100, n_thread = 0),
  fit_odrf = ODRF(X = x, y = y, parallel = TRUE, ntrees = 100),
  fit_rlt = RLT(x = x, y = y, model = 'classification', ntrees = 100,
                combsplit = 3, reinforcement = TRUE),
  times = 10
)
```
```r
# pre-computed benchmark results (times are in nanoseconds)
options(microbenchmark.unit = "relative")

bm <- structure(
  list(
    expr = structure(
      c(1L, 1L, 2L, 3L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 3L, 1L, 2L, 2L,
        3L, 3L, 1L, 1L, 2L, 1L, 2L, 1L, 3L, 1L, 3L, 2L, 3L, 2L, 3L),
      levels = c("fit_orsf", "fit_odrf", "fit_rlt"),
      class = "factor"
    ),
    time = c(48109600, 49112600, 10993356800, 48030845700, 11083532300,
             50480400, 11266350100, 49670644800, 48571226500, 51156800,
             11005391000, 49984885500, 50464800, 11087423500, 11290844100,
             50331647300, 51760008300, 41431800, 40718000, 11054960000,
             48484100, 11253251900, 48914400, 47786809400, 52255200,
             44269840300, 11077901500, 44499918900, 11067692900, 43761864500)
  ),
  class = c("microbenchmark", "data.frame"),
  row.names = c(NA, -30L)
)

print(bm)
```
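Because `microbenchmark.unit` is set to `"relative"`, the printed summary expresses each statistic relative to the smallest value in its column, so the fastest method reads as 1.0. As an optional aside (not part of the original code), microbenchmark also registers an `autoplot()` method, so the spread of raw timings can be visualized with ggplot2:

```r
# optional: visualize the distribution of raw timings per method
# (requires ggplot2 to be installed)
ggplot2::autoplot(bm)
```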
Based on a benchmark on the attrition data (nrow = `r nrow(data_all)`, ncol = `r ncol(data_all)`), aorsf runs faster than ODRF and RLT.
Using the same attrition data, compare the prediction accuracy of the three oblique random forest R packages. First, split into train/test sets:
```r
set.seed(730)

train_rows <- sample(nrow(data_all), nrow(data_all) / 2)

data_train <- data_all[train_rows, ]
data_test <- data_all[-train_rows, ]

x_train <- as.matrix(select(data_train, -outcome))
y_train <- data_train$outcome

x_test <- as.matrix(select(data_test, -outcome))
y_test <- data_test$outcome
```
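As an aside not in the original analysis: the attrition outcome is fairly imbalanced, so a stratified split would keep the event rate similar across the train and test sets. A minimal sketch using rsample's `initial_split()` (the `_strat` object names are hypothetical, chosen so the objects used below are not overwritten):

```r
# hedged alternative: a stratified 50/50 split keeps the proportion of
# outcome levels similar in the train and test sets
split_strat <- rsample::initial_split(data_all, prop = 0.5, strata = outcome)
data_train_strat <- rsample::training(split_strat)
data_test_strat <- rsample::testing(split_strat)
```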
Next, fit the models to the training data:
```r
fit_orsf <- orsf(data_train, outcome ~ ., n_tree = 100)

fit_odrf <- ODRF(X = x_train, y = y_train, ntrees = 100)

fit_rlt <- RLT(x = x_train, y = y_train, model = 'classification',
               combsplit = 3, reinforcement = TRUE, ntrees = 100)
```
Last, evaluate prediction accuracy in the testing data:
```r
eval <- tibble(
  orsf = predict(fit_orsf, new_data = data_test)[, 1],
  odrf = predict(fit_odrf, Xnew = x_test, type = 'prob')[, 1],
  rlt = predict(fit_rlt, testx = x_test)$ProbPrediction[, 1],
  truth = data_test$outcome
) %>% 
  pivot_longer(-truth) %>% 
  group_by(name) %>% 
  roc_auc(truth = truth, value) %>% 
  arrange(desc(.estimate))

eval
```
Not surprisingly, discrimination is very similar when all the models being compared are oblique random forests.
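One detail worth checking in comparisons like this (an aside, not part of the original analysis): yardstick's `roc_auc()` treats the first factor level as the event by default, so the probability column passed to it should correspond to that level. A minimal sketch of the check, assuming the first predicted column lines up with the first level of `outcome`:

```r
# confirm which factor level yardstick treats as the event
# (the first level, by default)
levels(data_test$outcome)

# if the event of interest is the second level instead, pass
# event_level = "second"
tibble(orsf = predict(fit_orsf, new_data = data_test)[, 1],
       truth = data_test$outcome) %>% 
  roc_auc(truth = truth, orsf, event_level = "second")
```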
Several other R packages can grow oblique forests for regression, including:

- RLT (https://cran.r-project.org/package=RLT)
- ODRF (https://liuyu-star.github.io/ODRF/)
- obliqueRF (no longer on CRAN, so not included)

```r
library(ODRF)
library(RLT)
library(microbenchmark)

# also used below: aorsf for orsf(), plus tidyverse-style helpers
library(aorsf)
library(dplyr)
library(tidyr)
library(yardstick)
```
We'll use the ames housing data from the modeldata R package.
```r
# keep only numeric predictors (easier to convert to matrix)
data_all <- as_tibble(modeldata::ames) %>% 
  mutate(outcome = log(Sale_Price)) %>% 
  select(outcome, where(is.numeric), -Sale_Price)

# x and y matrices for ODRF and RLT
x <- as.matrix(select(data_all, -outcome))
y <- data_all$outcome
```
Compare the time taken to grow a forest of 100 trees with the RLT, aorsf, and ODRF packages:
```r
options(microbenchmark.unit = "relative")

bm <- microbenchmark::microbenchmark(
  fit_orsf = orsf(data_all, outcome ~ ., n_tree = 100, n_thread = 0),
  fit_odrf = ODRF(X = x, y = y, parallel = TRUE, ntrees = 100),
  fit_rlt = RLT(x = x, y = y, model = 'regression', ntrees = 100,
                combsplit = 3, reinforcement = TRUE),
  times = 1
)
```
```r
print(bm)
```
Based on a benchmark on the ames data (nrow = `r nrow(data_all)`, ncol = `r ncol(data_all)`), aorsf runs faster than ODRF and RLT.
Using the same ames data, compare the prediction accuracy of the three oblique random forest R packages. First, split into train/test sets:
```r
set.seed(730)

train_rows <- sample(nrow(data_all), nrow(data_all) / 2)

data_train <- data_all[train_rows, ]
data_test <- data_all[-train_rows, ]

x_train <- as.matrix(select(data_train, -outcome))
y_train <- data_train$outcome

x_test <- as.matrix(select(data_test, -outcome))
y_test <- data_test$outcome
```
Next, fit the models to the training data:

```r
fit_orsf <- orsf(data_train, outcome ~ ., n_tree = 100)

fit_odrf <- ODRF(X = x_train, y = y_train, ntrees = 100)

fit_rlt <- RLT(x = x_train, y = y_train, model = 'regression',
               combsplit = 3, reinforcement = TRUE, ntrees = 100)
```
Last, evaluate prediction accuracy in the testing data:
```r
eval <- tibble(
  orsf = predict(fit_orsf, new_data = data_test)[, 1],
  odrf = predict(fit_odrf, Xnew = x_test)[, 1],
  rlt = predict(fit_rlt, testx = x_test)$Prediction,
  truth = data_test$outcome
) %>% 
  pivot_longer(-truth) %>% 
  group_by(name) %>% 
  rsq(truth = truth, value) %>% 
  arrange(desc(.estimate))

eval
```
Prediction accuracy is similar across the three packages.
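R-squared alone can hide differences in the scale of prediction error, so it may be worth scoring the same predictions with root mean squared error as well. A minimal sketch using yardstick's `rmse()` (an aside, not in the original analysis):

```r
# score the same long-format predictions with RMSE instead of R-squared
tibble(
  orsf = predict(fit_orsf, new_data = data_test)[, 1],
  odrf = predict(fit_odrf, Xnew = x_test)[, 1],
  rlt = predict(fit_rlt, testx = x_test)$Prediction,
  truth = data_test$outcome
) %>% 
  pivot_longer(-truth) %>% 
  group_by(name) %>% 
  rmse(truth = truth, estimate = value) %>% 
  arrange(.estimate)
```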
A much more thorough comparison of aorsf and other learners for survival data has been published (see DOI: 10.1080/10618600.2023.2231048). Below is an example comparing aorsf to the two most widely used R packages for axis-based random forests: randomForestSRC and ranger. Since I used mostly base R to run the classification example, I'll run this one using tidy tools.
```r
library(ranger)
library(randomForestSRC)
library(riskRegression)

# also used below: aorsf, survival for Surv(), and tidymodels for
# recipes, rsample, and purrr
library(aorsf)
library(survival)
library(tidymodels)
```
Start with a recipe to pre-process the data:
```r
imputer <- recipe(pbc_orsf, formula = time + status ~ .) %>% 
  step_rm(id) %>% 
  step_impute_mean(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors())
```
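To see what the recipe will do before using it inside the resampling loop, it can be prepped on the full data and baked (an optional aside, not in the original):

```r
# hedged aside: prep the recipe on the full data and inspect the
# imputed, pre-processed output
imputer %>% 
  prep(training = pbc_orsf) %>% 
  bake(new_data = NULL) %>% 
  glimpse()
```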
Next, create a 10-fold cross-validation object and pre-process the data:
```r
# 10-fold cross validation; make a container for the pre-processed data
analyses <- vfold_cv(data = pbc_orsf, v = 10) %>% 
  mutate(recipe = map(splits, ~ prep(imputer, training = training(.x))),
         train = map(recipe, juice),
         test = map2(splits, recipe, ~ bake(.y, new_data = testing(.x))))

analyses
```
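As an aside on the design here (not part of the original code): in current versions of recipes, `juice()` is superseded by `bake()` with `new_data = NULL`, so the `train` column could equivalently be built as:

```r
# equivalent to map(recipe, juice) in current versions of recipes
analyses <- analyses %>% 
  mutate(train = map(recipe, ~ bake(.x, new_data = NULL)))
```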
Define functions for a 'workflow' with randomForestSRC, ranger, and aorsf.
```r
rfsrc_wf <- function(train, test, pred_horizon){
  # rfsrc does not like tibbles, so cast input data into data.frames
  train <- as.data.frame(train)
  test <- as.data.frame(test)
  rfsrc(formula = Surv(time, status) ~ ., data = train) %>% 
    predictRisk(newdata = test, times = pred_horizon) %>% 
    as.numeric()
}

ranger_wf <- function(train, test, pred_horizon){
  ranger(Surv(time, status) ~ ., data = train) %>% 
    predictRisk(newdata = test, times = pred_horizon) %>% 
    as.numeric()
}

aorsf_wf <- function(train, test, pred_horizon){
  train %>% 
    orsf(Surv(time, status) ~ .) %>% 
    predict(new_data = test, pred_type = 'risk', pred_horizon = pred_horizon) %>% 
    as.numeric()
}
```
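Before mapping over all ten folds, it can help to confirm a 'workflow' behaves as expected on a single fold (a hedged usage sketch, not in the original):

```r
# run one workflow on the first fold; the result should be a numeric
# vector of predicted 5-year risk, one value per row of the test data
risk_1 <- aorsf_wf(train = analyses$train[[1]],
                   test = analyses$test[[1]],
                   pred_horizon = 365.25 * 5)

head(risk_1)
```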
Run the 'workflows' on each fold:
```r
# 5 year risk prediction
ph <- 365.25 * 5

results <- analyses %>% 
  transmute(test,
            pred_aorsf = map2(train, test, aorsf_wf, pred_horizon = ph),
            pred_rfsrc = map2(train, test, rfsrc_wf, pred_horizon = ph),
            pred_ranger = map2(train, test, ranger_wf, pred_horizon = ph))
```
Next, unnest each column to get back a tibble with all of the testing data and predictions.
```r
results <- results %>% 
  unnest(everything())

glimpse(results)
```
And finish by aggregating the predictions and computing performance in the testing data. Note that I am computing one statistic for all predictions instead of computing one statistic for each fold. This approach is fine when you have smaller testing sets and/or small event counts.
```r
Score(
  object = list(aorsf = results$pred_aorsf,
                rfsrc = results$pred_rfsrc,
                ranger = results$pred_ranger),
  formula = Surv(time, status) ~ 1,
  data = results,
  summary = 'IPA',
  times = ph
)
```
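If per-fold statistics are preferred instead, one hedged sketch (assuming the `id` column that `vfold_cv()` adds is carried through, and showing aorsf only for brevity) is to keep the fold identifier before unnesting and score each fold separately:

```r
# keep the fold id, unnest, then score each fold on its own
results_by_fold <- analyses %>% 
  transmute(id, test,
            pred_aorsf = map2(train, test, aorsf_wf, pred_horizon = ph)) %>% 
  unnest(c(test, pred_aorsf)) %>% 
  group_by(id) %>% 
  group_map(~ Score(object = list(aorsf = .x$pred_aorsf),
                    formula = Surv(time, status) ~ 1,
                    data = .x,
                    summary = 'IPA',
                    times = ph))
```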