# knitr chunk options for this vignette.
# eval = FALSE because the examples depend on optional packages
# (xgboost, gbm, lightgbm); set eval = TRUE locally to run them.
# NOTE: the options must each sit on their own line (or the comment must
# come last) so the closing `)` is not swallowed by the trailing comment.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE # examples require optional packages; set eval=TRUE locally
)
Starting from version 1.1.0, e2tree supports the following tree ensemble
backends in addition to randomForest and ranger:
| Package | Model class | Task |
|---------|-------------|------|
| xgboost | xgb.Booster | classification, regression |
| gbm | gbm | classification, regression |
| lightgbm | lgb.Booster | classification, regression |
| catboost | catboost.CatBoost / catboost.Model | classification, regression |
The workflow is identical regardless of the backend: train a model, build the
dissimilarity matrix with createDisMatrix(), then call e2tree().
# Explain an XGBoost multi-class model on iris with e2tree.
library(e2tree)
if (!require("xgboost")) install.packages("xgboost", repos="https://cran.r-project.org")
library(xgboost)

data(iris)
set.seed(42)

# 75% training split. (The original draft also built a validation set
# `va`, but it was never used below, so it has been dropped.)
n <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]

# XGBoost requires a numeric matrix and 0-indexed integer labels.
X_tr <- as.matrix(tr[, 1:4])
y_tr <- as.integer(tr$Species) - 1L
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

ensemble <- xgb.train(
  params = list(objective = "multi:softmax", num_class = 3,
                max_depth = 4, eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# Attach the response back to the data.frame so the formula in e2tree()
# can find it; createDisMatrix() will use it to annotate the dissimilarity
# matrix (in classification, `label` is optional but recommended).
tr_xgb <- tr[, 1:4]
tr_xgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_xgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

setting <- list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5)
tree_xgb <- e2tree(Species ~ ., data = tr_xgb, D = D,
                   ensemble = ensemble, setting = setting)
print(tree_xgb)
For regression backends, createDisMatrix() needs the response column to
compute the dissimilarity scale. Pass the full data frame (predictors plus
response) and the name of the response column via the label argument.
# XGBoost regression on mtcars, explained by e2tree.
library(xgboost)
data(mtcars)
set.seed(42)

# 75% training split.
tr <- mtcars[sample(nrow(mtcars), floor(0.75 * nrow(mtcars))), ]

# Numeric predictor matrix (mpg is column 1) and numeric response.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

ensemble <- xgb.train(
  params = list(objective = "reg:squarederror", max_depth = 4, eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# `data = tr` carries the response column too; the XGBoost adapter
# automatically trims the matrix to the features used at training time.
D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5))
print(tree)
gbm expects a 0/1 numeric response for the bernoulli distribution, while
e2tree expects a factor response for classification. We therefore train
gbm on the integer column and pass a factor copy of the same column to
e2tree.
# Binary classification with gbm, explained by e2tree.
if (!require("gbm")) install.packages("gbm", repos="https://cran.r-project.org")
library(gbm)
data(iris)
set.seed(42)

# gbm's bernoulli distribution needs a 0/1 numeric response, while e2tree
# needs a factor response: keep both encodings of the same column.
dat <- iris
dat$is_setosa <- as.integer(dat$Species == "setosa")
dat$is_setosa_fct <- factor(dat$is_setosa, levels = c(0L, 1L))

# 75% training split.
tr <- dat[sample(nrow(dat), floor(0.75 * nrow(dat))), ]

# Train gbm on the integer 0/1 column.
ensemble <- gbm(is_setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
                data = tr, distribution = "bernoulli",
                n.trees = 200, interaction.depth = 4, verbose = FALSE)

# Hand the factor copy of the response to createDisMatrix() and e2tree().
feat_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
D <- createDisMatrix(ensemble,
                     data = tr[, c(feat_cols, "is_setosa_fct")],
                     label = "is_setosa_fct",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(is_setosa_fct ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
               data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)
gbm requires nTrain * bag.fraction > 2 * n.minobsinnode + 1, which fails
on small training sets such as 24-row mtcars with the default settings.
Lower n.minobsinnode and raise bag.fraction to keep the example
self-contained.
# gbm regression on mtcars, explained by e2tree.
library(gbm)
data(mtcars)
set.seed(42)

# 75% training split (24 rows).
tr <- mtcars[sample(nrow(mtcars), floor(0.75 * nrow(mtcars))), ]

# With this small training set the gbm constraint
# nTrain * bag.fraction > 2 * n.minobsinnode + 1 would fail under the
# defaults, so lower n.minobsinnode and raise bag.fraction.
ensemble <- gbm(mpg ~ ., data = tr, distribution = "gaussian",
                n.trees = 200, interaction.depth = 4,
                n.minobsinnode = 2, bag.fraction = 0.8, verbose = FALSE)

D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5))
print(tree)
# LightGBM multi-class model on iris, explained by e2tree.
if (!require("lightgbm")) install.packages("lightgbm", repos="https://cran.r-project.org")
library(lightgbm)
data(iris)
set.seed(42)

# 75% training split.
tr <- iris[sample(nrow(iris), floor(0.75 * nrow(iris))), ]

# LightGBM takes a numeric matrix plus 0-indexed integer class labels.
X_tr <- as.matrix(tr[, 1:4])
y_tr <- as.integer(tr$Species) - 1L
ds <- lgb.Dataset(X_tr, label = y_tr)

ensemble <- lgb.train(
  params = list(objective = "multiclass", num_class = 3,
                num_leaves = 15, verbose = -1),
  data = ds,
  nrounds = 100
)

# Rebuild a data.frame carrying the factor response so the e2tree()
# formula can find it.
tr_lgb <- tr[, 1:4]
tr_lgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_lgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(Species ~ ., data = tr_lgb, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)
# LightGBM regression on mtcars, explained by e2tree.
library(lightgbm)
data(mtcars)
set.seed(42)

# 75% training split.
tr <- mtcars[sample(nrow(mtcars), floor(0.75 * nrow(mtcars))), ]

# Numeric predictor matrix (mpg is column 1) and numeric response.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
ds <- lgb.Dataset(X_tr, label = y_tr)

ensemble <- lgb.train(
  params = list(objective = "regression", num_leaves = 8,
                min_data_in_leaf = 2, learning_rate = 0.1, verbose = -1),
  data = ds,
  nrounds = 200
)

# Pass the response column to createDisMatrix() via `label`. The
# LightGBM adapter selects the columns it needs through the booster's
# stored feature names, so any extra columns in `data` are ignored.
D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5))
print(tree)
To support a further model class MyEnsemble, implement three S3 methods and
register them in NAMESPACE:
# In R/adapters.R (or a separate R/adapter_mymodel.R)

# Report the task type for this ensemble class.
get_ensemble_type.MyEnsemble <- function(ensemble) {
  # return "classification" or "regression"
}

# Map each observation in `data` to its terminal node in every tree.
extract_terminal_nodes.MyEnsemble <- function(ensemble, data) {
  # return data.frame of (n_obs x n_trees) terminal node IDs
}

# Ensemble-level predictions for `data`.
get_ensemble_predictions.MyEnsemble <- function(ensemble, data, type) {
  # return numeric vector of length n_obs
}
No changes to createDisMatrix(), e2tree(), or any other core function are
required.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.