# Update bib file
# source("rjournal/gen_pkg_bib.R")
# Set global knitr chunk options
knitr::opts_chunk$set(
cache = TRUE,
fig.width = 6,
fig.asp = 0.618,
out.width = "70%",
fig.align = "center",
fig.pos = "!htb",
message = FALSE,
warning = FALSE
)
# Load required packages
library(dplyr)
library(vip)
trn <- vip::gen_friedman(500, sigma = 1, seed = 101) # simulate training data
tibble::as_tibble(trn) # inspect output
# Load required packages
library(rpart) # for fitting CART-like decision trees
library(randomForest) # for fitting RFs
library(xgboost) # for fitting GBMs
# Fit a single regression tree
tree <- rpart(y ~ ., data = trn)
# Fit an RF
set.seed(101) # for reproducibility
rfo <- randomForest(y ~ ., data = trn, importance = TRUE)
# Fit a GBM
set.seed(102) # for reproducibility
bst <- xgboost(
data = data.matrix(subset(trn, select = -y)),
label = trn$y,
objective = "reg:squarederror",
nrounds = 100,
max_depth = 5,
eta = 0.3,
verbose = 0 # suppress printing
)
# Extract VI scores from each model
vi_tree <- tree$variable.importance
vi_rfo <- rfo$variable.importance # or use `randomForest::importance(rfo)`
vi_bst <- xgb.importance(model = bst)
# VI plot for single regression tree
vi_tree <- tree$variable.importance %>%
data.frame("Importance" = .) %>%
tibble::rownames_to_column("Feature")
# VI plot for RF
vi_rfo <- rfo$importance %>%
data.frame("Importance" = .) %>%
tibble::rownames_to_column("Feature")
# VI plot for GMB
vi_bst <- bst %>% #
xgb.importance(model = .) %>%
as.data.frame() %>%
select(Feature, Importance = Gain)
# Plot results
library(ggplot2)
p1 <- vip(tree) + ggtitle("Single tree")
p2 <- vip(rfo) + ggtitle("Random forest")
p3 <- vip(bst) + ggtitle("Gradient boosting")
grid.arrange(p1, p2, p3, nrow = 1) # display plots in a grid
# Load required packages
library(vip)
# Compute model-specific VI scores
vi(tree) # CART-like decision tree
vi(rfo) # RF
vi(bst) # GBM
## p1 <- vip(tree) + ggtitle("Single tree")
## p2 <- vip(rfo) + ggtitle("Random forest")
## p3 <- vip(bst) + ggtitle("Gradient boosting")
##
## # Display plots in a grid (Figure 1)
## grid.arrange(p1, p2, p3, nrow = 1)
# Construct VIP (Figure 2)
library(ggplot2) # for theme_light() function
vip(bst, num_features = 5, geom = "point", horizontal = FALSE,
aesthetics = list(color = "red", shape = 17, size = 5)) +
theme_light()
# Fit a LM
linmod <- lm(y ~ .^2, data = trn)
backward <- step(linmod, direction = "backward", trace = 0)
# Extract VI scores
(vi_backward <- vi(backward))
# Plot VI scores; by default, `vip()` displays the top ten features
vip(vi_backward, num_features = length(coef(backward)), # Figure 3
geom = "point", horizontal = FALSE, mapping = aes(color = Sign)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Load required packages
library(earth)
# Fit a MARS model
mars <- earth(y ~ ., data = trn, degree = 2, pmethod = "exhaustive")
# Extract VI scores
vi(mars, type = "gcv")
# Plot VI scores (Figure 4)
vip(mars)
# Load required packages
library(nnet)
# Fit a neural network
set.seed(0803) # for reproducibility
nn <- nnet(y ~ ., data = trn, size = 7, decay = 0.1,
linout = TRUE, trace = FALSE)
# Construct VIPs
p1 <- vip(nn, type = "garson")
p2 <- vip(nn, type = "olden")
# Display plots in a grid (Figure 5)
grid.arrange(p1, p2, nrow = 1)
# Load required packages
library(pdp)
# Fit a PPR model (nterms was chosen using the caret package with 5 repeats of
# 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)
# PDPs for all 10 features
features <- paste0("x", 1:10)
pdps <- lapply(features, FUN = function(feature) {
pd <- partial(pp, pred.var = feature)
autoplot(pd) +
ylim(range(trn$y)) +
theme_light()
})
# Display plots in a grid
grid.arrange(grobs = pdps, ncol = 5)
# Fit a PPR model (nterms was chosen using the caret package with 5 repeats of
# 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)
# Construct VIPs
p1 <- vip(pp, method = "firm") + ggtitle("PPR")
p2 <- vip(nn, method = "firm") + ggtitle("NN")
# Display plots in a grid (Figure 7)
grid.arrange(p1, p2, ncol = 2)
# ICE curves for all 10 features
ice_curves <- lapply(features, FUN = function(feature) {
ice <- partial(pp, pred.var = feature, ice = TRUE)
autoplot(ice, alpha = 0.1) +
ylim(range(trn$y)) +
theme_light()
})
# Display plots in a grid (Figure 8)
grid.arrange(grobs = ice_curves, ncol = 5)
# Construct VIPs
p1 <- vip(pp, method = "firm", ice = TRUE) + ggtitle("PPR")
p2 <- vip(nn, method = "firm", ice = TRUE) + ggtitle("NN")
# Display plots in a grid (Figure 9)
grid.arrange(p1, p2, ncol = 2)
# Construct PDP-based VI scores
(vis <- vi(pp, method = "firm"))
# Reconstruct PDPs for all 10 features (Figure 10)
par(mfrow = c(2, 5))
for (name in paste0("x", 1:10)) {
plot(attr(vis, which = "effects")[[name]], type = "l", ylim = c(9, 19), las = 1)
}
# Plot VI scores
set.seed(2021) # for reproducibility
p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict) + ggtitle("PPR")
p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict) + ggtitle("NN")
# Display plots in a grid (Figure 11)
grid.arrange(p1, p2, ncol = 2)
# Use 10 Monte Carlo reps
set.seed(403) # for reproducibility
vis <- vi(pp, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict, nsim = 15)
vip(vis, geom = "boxplot") # Figure 12
list_metrics()
mae <- function(actual, predicted) {
mean(abs(actual - predicted))
}
# Construct VIP (Figure 13)
set.seed(2321) # for reproducibility
pfun <- function(object, newdata) predict(object, newdata = newdata)
vip(nn, method = "permute", target = "y", metric = mae,
smaller_is_better = TRUE, pred_wrapper = pfun) +
ggtitle("Custom loss function: MAE")
# Construct VIP (Figure 14)
set.seed(2327) # for reproducibility
vip(nn, method = "permute", pred_wrapper = pfun, target = "y", metric = "rmse",
train = trn[sample(nrow(trn), size = 400), ]) + # sample 400 observations
ggtitle("Using a random subset of training data")
# Construct VIP (Figure 15)
set.seed(8264) # for reproducibility
vip(nn, method = "permute", pred_wrapper = pfun, target = "y", metric = "mae",
nsim = 10, geom = "point", all_permutations = TRUE, jitter = TRUE) +
ggtitle("Plotting all permutation scores")
# Load required packages
library(microbenchmark)
mb <- readRDS("benchmark.rds")
autoplot(mb) # Figure 16
# Load required packages
library(xgboost)
# Feature matrix
X <- data.matrix(subset(trn, select = -y)) # matrix of feature values
# Fit an XGBoost model; hyperparameters were tuned using 5-fold CV
set.seed(859) # for reproducibility
bst <- xgboost(X, label = trn$y, nrounds = 338, max_depth = 3, eta = 0.1,
verbose = 0)
# Construct VIP (Figure 17)
vip(bst, method = "shap", train = X, exact = TRUE, include_type = TRUE)
# Load required packages
library(mlr3)
library(mlr3learners)
# Fit a ranger-based random forest using the mlr3 package
set.seed(101)
task <- TaskRegr$new("friedman", backend = trn, target = "y")
lrnr <- lrn("regr.ranger", importance = "impurity")
lrnr$train(task)
# First, compute a tibble of VI scores using any method
var_imp <- vi(lrnr)
# Next, convert to an HTML-based data table with sparklines
add_sparklines(var_imp, fit = lrnr$model, train = trn) # Figure 18
# Load the Ames housing data
ames <- AmesHousing::make_ames()
X <- subset(ames, select = -Sale_Price)
y <- ames$Sale_Price
# Load required packages
library(SuperLearner)
# List of base learners
learners <- c("SL.xgboost", "SL.ranger", "SL.earth", "SL.glmnet", "SL.ksvm")
# Stack models
set.seed(840) # for reproducibility
ctrl <- SuperLearner.CV.control(V = 5L, shuffle = TRUE)
sl <- SuperLearner(Y = y, X = X, SL.library = learners, verbose = TRUE,
cvControl = ctrl)
sl
# Prediction wrapper functions
imp_fun <- function(object, newdata) { # for permutation-based VI scores
predict(object, newdata = newdata)$pred
}
par_fun <- function(object, newdata) { # for PDPs
mean(predict(object, newdata = newdata)$pred)
}
# Setup parallel backend
library(doParallel) # load the parallel backend
cl <- makeCluster(5) # use 5 workers
registerDoParallel(cl) # register the parallel backend
# Permutation-based feature importance
set.seed(278) # for reproducibility
var_imp <- vi(sl, method = "permute", train = X, target = y, metric = "rmse",
pred_wrapper = imp_fun, nsim = 5, parallel = TRUE)
# Add sparkline representation of feature effects (# Figure 19)
add_sparklines(var_imp[1L:10L, ], fit = sl, pred.fun = par_fun, train = X,
digits = 2, verbose = TRUE, trim.outliers = TRUE,
grid.resolution = 20, parallel = TRUE)
# Shut down cluster
stopCluster(cl)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.