# R code for the manuscript
#
# partial: An R Package for Creating Partial Dependence Plots
#
# Author: Brandon M. Greenwell
# Last Modified: January 8, 2017
################################################################################
# Setup
################################################################################
# Set working directory (ignore for review)
# setwd("manuscript")
# List of packages required to run all the examples in this script
# pkgs <- c("caret",
# "doParallel",
# "dplyr",
# "e1071",
# "earth",
# "ggplot2",
# "party",
# "pdp",
# "randomForest",
# "xgboost")
# install.packages(pkgs)
# Load required packages
library(caret)
library(doParallel)
# library(dplyr) # use dplyr:: instead to avoid conflicts with plyr!
library(e1071)
library(earth)
library(ggplot2)
library(gridExtra)
library(magrittr)
library(party)
library(pdp)
library(randomForest)
library(xgboost)
################################################################################
# Constructing PDPs in R
################################################################################
# Load the (corrected) Boston housing data
data(boston)
# Fit a random forest using default settings
set.seed(101) # for reproducibility
boston.rf <- randomForest(cmedv ~ ., data = boston, importance = TRUE)
print(boston.rf) # check model results
# Call:
# randomForest(formula = cmedv ~ ., data = boston, importance = TRUE)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 5
#
# Mean of squared residuals: 9.114248
# % Var explained: 89.17
# Figure 1
varImpPlot(boston.rf) # dotchart of variable importance scores
################################################################################
# Single predictor PDPs
################################################################################
# Figure 2
pd.lstat <- partial(boston.rf, pred.var = "lstat")
pdp1 <- plotPartial(pd.lstat)
pdp2 <- plotPartial(pd.lstat, lwd = 2, smooth = TRUE,
ylab = expression(f(lstat)))
grid.arrange(pdp1, pdp2, ncol = 2)
################################################################################
# Multi-predictor PDPs
################################################################################
# Figure 3
rwb <- colorRampPalette(c("red", "white", "blue"))
pd.lstat.rm <- partial(boston.rf, pred.var = c("lstat", "rm"))
p1 <- plotPartial(pd.lstat.rm)
p2 <- plotPartial(pd.lstat.rm, contour = TRUE, col.regions = rwb)
p3 <- plotPartial(pd.lstat.rm, levelplot = FALSE, zlab = "cmedv", drape = TRUE,
colorkey = FALSE, screen = list(z = -20, x = -60))
grid.arrange(p1, p2, p3, ncol = 3)
################################################################################
# Avoiding extrapolation
################################################################################
# Figure 4
p1 <- partial(boston.rf, pred.var = "lstat", plot = TRUE, rug = TRUE)
p2 <- partial(boston.rf, pred.var = c("lstat", "rm"), plot = TRUE, chull = TRUE)
grid.arrange(p1, p2, ncol = 2)
# Figure 5
p1 <- partial(boston.rf, "rm", plot = TRUE)
p2 <- partial(boston.rf, "rm", grid.resolution = 30, plot = TRUE)
p3 <- partial(boston.rf, "rm", pred.grid = data.frame(rm = 3:9), plot = TRUE,
check.class = FALSE)
p4 <- partial(boston.rf, "rm", quantiles = TRUE, probs = 0:10/10, plot = TRUE)
grid.arrange(p1, p2, p3, p4, ncol = 2)
################################################################################
# Addressing computational concerns
################################################################################
# Load the LA ozone data
ozone <- read.csv(paste0("http://statweb.stanford.edu/~tibs/ElemStatLearn/",
"datasets/LAozone.data"), header = TRUE)
# Fit a MARS model
ozone.mars <- earth(ozone ~ ., data = ozone, degree = 3)
# Note: the following example will not work on Windows!
# # Figure 6 (Unix/Linux)
# pdf("partial_par.pdf", width = 7, height = 5)
# registerDoParallel(cores = 4) # use 4 cores
# partial(ozone.mars, pred.var = c("wind", "temp", "dpg"), plot = TRUE,
# chull = TRUE, parallel = TRUE) # Figure 6
# dev.off()
# Figure 6
cl <- makeCluster(4) # use 4 cores
registerDoParallel(cl)
partial(ozone.mars, pred.var = c("wind", "temp", "dpg"), plot = TRUE,
chull = TRUE, parallel = TRUE, paropts = list(.packages = "earth")) # Figure 6
lattice::trellis.focus("legend", side = "right", clipp.off = TRUE,
highlight = FALSE)
grid::grid.text("ozone", 0.2, 1.05, hjust = 0.5, vjust = 1)
lattice::trellis.unfocus()
stopCluster(cl)
################################################################################
# Classification problems
################################################################################
# # Train an SVM to Edgar Anderson's iris data using 5-fold classification
# library(caret)
# set.seed(101)
# ctrl <- trainControl(method = "cv", number = 5, verboseIter = TRUE)
# iris.svm.tune <- train(x = subset(iris, select = -Species),
# y = iris$Species,
# method = "svmRadial",
# trControl = ctrl,
# tuneLength = 100)
# Fit an SVM to Edgar Anderson's iris data
iris.svm <- svm(Species ~ ., data = iris, kernel = "radial", gamma = 0.75,
cost = 0.25, probability = TRUE)
# Figure 7
pd <- NULL
for (i in 1:3) {
tmp <- partial(iris.svm, pred.var = c("Petal.Width", "Petal.Length"),
which.class = i, progress = TRUE,
grid.resolution = 101)
pd <- rbind(pd, cbind(tmp, Species = levels(iris$Species)[i]))
}
ggplot(pd, aes(x = Petal.Width, y = Petal.Length, z = yhat, fill = yhat)) +
geom_tile() +
geom_contour(color = "white", alpha = 0.5) +
scale_fill_distiller(name = "Centered\nlogit", palette = "Spectral") +
theme_bw() +
facet_grid(~ Species)
################################################################################
# User-defined prediction functions
################################################################################
# Figure 8
probFun <- function(object, newdata) {
pred <- predict(object, newdata, probability = TRUE)
prob.setosa <- attr(pred, which = "probabilities")[, "setosa"]
mean(prob.setosa)
}
grid.arrange(
partial(iris.svm, pred.var = "Petal.Width", pred.fun = probFun, plot = TRUE),
partial(iris.svm, pred.var = "Petal.Length", pred.fun = probFun, plot = TRUE),
partial(iris.svm, pred.var = c("Petal.Width", "Petal.Length"),
pred.fun = probFun, plot = TRUE),
ncol = 3
)
# Update: using the new prob argument
grid.arrange(
partial(iris.svm, pred.var = "Petal.Width", prob = TRUE, plot = TRUE),
partial(iris.svm, pred.var = "Petal.Length", prob = TRUE, plot = TRUE),
partial(iris.svm, pred.var = c("Petal.Width", "Petal.Length"), prob = TRUE,
plot = TRUE),
ncol = 3
)
# pred.ice <- function(object, newdata) {
# predict(object, newdata)
# }
# pred.ice.quan <- function(object, newdata) {
# quantile(predict(object, newdata), probs = 1:9/10)
# }
# lstat.ice <- partial(boston.rf, pred.var = "lstat",
# pred.fun = pred.ice)
# lstat.ice.quan <- partial(boston.rf, pred.var = "lstat",
# pred.fun = pred.ice.quan)
# lstat.pdp <- partial(boston.rf, pred.var = "lstat")
# ylim <- c(3.196822, 52.149681)
# grid.arrange(
# plotPartial(lstat.ice, alpha = 0.1, ylim = ylim),
# plotPartial(lstat.ice.quan, ylim = ylim),
# plotPartial(lstat.pdp, ylim = ylim),
# ncol = 3
# )
# Use partial to obtain ICE curves for rm
pred.ice <- function(object, newdata) {
predict(object, newdata)
}
rm.ice <- partial(boston.rf, pred.var = "rm", pred.fun = pred.ice)
# Figure 9
plotPartial(rm.ice, rug = TRUE, train = boston, alpha = 0.2)
# Post-process rm.ice to obtain c-ICE curves
rm.ice <- rm.ice %>%
dplyr::group_by(yhat.id) %>%
dplyr::mutate(yhat.centered = yhat - dplyr::first(yhat))
# ICE curves with PDP
p1 <- ggplot(rm.ice, aes(rm, yhat)) +
geom_line(aes(group = yhat.id), alpha = 0.2) +
stat_summary(fun = mean, geom = "line", col = "red", size = 1)
# c-ICE curves with centered PDP
p2 <- ggplot(rm.ice, aes(rm, yhat.centered)) +
geom_line(aes(group = yhat.id), alpha = 0.2) +
stat_summary(fun = mean, geom = "line", col = "red", size = 1)
# Figure 10
grid.arrange(p1, p2, ncol = 2)
# Update: using the new options ice and center
# pd.ice <- partial(boston.rf, pred.var = "rm", ice = TRUE)
# pd.cice <- partial(boston.rf, pred.var = "rm", ice = TRUE, center = TRUE)
# p1 <- plotPartial(pd.ice, alpha = 0.3) # ICE curves: lattice version
# p2 <- plotPartial(pd.cice, alpha = 0.3) # c-ICE curves: lattice version
# p3 <- autoplot(pd.ice, alpha = 0.3) # ICE curves: ggplot2 version
# p4 <- autoplot(pd.cice, alpha = 0.3) # c-ICE curves: ggplot2 version
# grid.arrange(p1, p2, p3, p4, ncol = 2)
#
# # Try some other things
# grid.arrange(
# autoplot(pd.ice, center = TRUE, alpha = 0.3),
# autoplot(pd.cice, center = TRUE, alpha = 0.3),
# autoplot(pd.ice, center = TRUE, alpha = 0.3, smooth = TRUE),
# ncol = 3
# )
################################################################################
# Using partial with the XGBoost library
################################################################################
# Tune an XGBoost model using 10-fold cross-validation
set.seed(202)
boston.xgb <- train(x = data.matrix(subset(boston, select = -cmedv)),
y = boston$cmedv,
method = "xgbTree",
metric = "Rsquared",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 10)
plot(boston.xgb)
print(boston.xgb)
# Tuning parameter 'gamma' was held constant at a value of 0
# Tuning parameter 'min_child_weight' was
# held constant at a value of 1
# Rsquared was used to select the optimal model using the largest value.
# The final values used for the model were:
# nrounds = 100
# max_depth = 5
# eta = 0.3
# gamma = 0
# colsample_bytree = 0.8
# min_child_weight = 1
# subsample = 0.9444444
# Print the cross-validated resuts
# filter(boston.xgb$results, nrounds == 100 & max_depth == 5 & eta == 0.3 &
# gamma == 0 & colsample_bytree == 0.8 & min_child_weight == 1 &
# subsample == 0.94444444444444444444444)
# PDPs for lstat and rm
pdp.lstat <- partial(boston.xgb, pred.var = "lstat", plot = TRUE, rug = TRUE)
pdp.rm <- partial(boston.xgb, pred.var = "rm", plot = TRUE, rug = TRUE)
pdp.lstat.rm <- partial(boston.xgb, pred.var = c("lstat", "rm"),
plot = TRUE, chull = TRUE)
# Figure 11
grid.arrange(pdp.lstat, pdp.rm, pdp.lstat.rm, ncol = 3)
# Use use xgboost function directly
set.seed(203)
boston.xgb <- xgboost(data = data.matrix(subset(boston, select = -cmedv)),
label = boston$cmedv, objective = "reg:squarederror",
nrounds = 100, max_depth = 5, eta = 0.3, gamma = 0,
colsample_bytree = 0.8, min_child_weight = 1,
subsample = 0.9444444, save_period = NULL)
# pdf("boston_xgb.pdf", width = 12, height = 4)
X <- subset(boston, select = -cmedv)
grid.arrange(
partial(boston.xgb, pred.var = "lstat", plot = TRUE, rug = TRUE, train = X),
partial(boston.xgb, pred.var = "rm", plot = TRUE, rug = TRUE, train = X),
partial(boston.xgb, pred.var = c("lstat", "rm"), plot = TRUE, chull = TRUE,
train = X),
ncol = 3
)
# dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.