#' @importFrom colorRamps blue2red
#' @importFrom GISTools add.alpha
#' @importFrom graphics axis
#' @importFrom graphics layout
#' @importFrom graphics lines
#' @importFrom graphics par
#' @importFrom graphics plot
#' @importFrom graphics text
#' @importFrom graphics legend
#' @importFrom stats aggregate
#' @importFrom stats predict
#' @importFrom dplyr sample_n
#' @importFrom dplyr group_by
#' @importFrom dplyr group_indices
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach
#' @importFrom foreach %dopar%
#' @title Explainability and matchplot for PDP
#'
#' @description Computes explainability and matchplots for a partial dependence function.
#'
#' @param model A model with corresponding predict function that returns numeric values.
#' @param x Data frame.
#' @param vnames Character vector of the variable set for which the patial dependence function is to be computed.
#' @param viz Logical specifying whether a matchplot should be created.
#' @param parallel Logical specifying whether computation should be parallel.
#' @param sample fraction-size for sampling of x.
#' @param pfunction User generated predict function.
#' @param ... Further arguments to be passed to the \code{predict()} method of the \code{model}.
#'
#' @return Numeric value with the explainability of the partial dependence function for the variables specified in \code{vnames}.
#'
#' @export
#'
#' @examples
#' library(pdp)
#' library(randomForest)
#' data(boston)
#' set.seed(42)
#' boston.rf <- randomForest(cmedv ~ ., data = boston)
#' xpy(boston.rf, boston, c("lstat", "rm"))
#'
#' @author \email{gero.szepannek@@web.de}
#'
#' @references \itemize{
#' \item Szepannek, G. (2019): How Much Can We See? A Note on Quantifying Explainability of Machine Learning Models,
#' \href{https://arxiv.org/abs/1910.13376}{\emph{arXiv:1910.13376 [stat.ML]}}.
#' }
#'
#' @rdname xpy
xpy <- function(model, x, vnames, viz = TRUE, parallel = TRUE, sample = 1, pfunction = NULL, ...){
# number of unique observations to give to processors each step
ssize <- 20
# Sampling
sample_n(x, round(nrow(x)*sample))
# required packages to be loaded in parallel R-threads
# this should be extracted from variable model
pckgs = c("randomForest")
# checking for predict_function parameter
if (is.null(pfunction)) {
# predict_function not specified
pfunction <- predict
}
# adding indices for unique vnames-values
x <- x %>% group_by(x[,names(x) %in% vnames])
x$ind <- x %>% group_indices()
# ignore duplicates in xs
xs <- x[!duplicated(x[,"ind"]), names(x) %in% append("ind", vnames)]
xs <- data.frame(ID = 1:nrow(xs), xs)
# keep column ind for join
xrest <- x[, !names(x) %in% append("ind", vnames), drop = FALSE]
# checking if parallel computation should be used
if (parallel) {
# calculate number of steps of size ssize
steps <- ceiling(nrow(xs)/ssize)
# create cluster of all available cores on host
cl <- makeCluster(detectCores()) # uses parallel::detectCores() + ::makeCluster
# register cluster for foreach
registerDoParallel(cl)
# parallel loop 1 to steps, results are concatenated ('c'),
# load pckgs in every thread
pdps <- foreach(i = 1:steps, .combine = 'c',.packages = pckgs) %dopar% { # uses foreach::foreach
# beginning of subset
ifrom <- (i-1)*ssize +1
# end of subset
ito <- min(i*ssize, nrow(xs))
# cartesian product
xx <- merge(xs[ifrom:ito,], xrest, by = NULL)
# predict
xx$yhat <- pfunction(model, xx, ...)
# compute average prediction
return(pred = tapply(xx$yhat, xx$ind, mean))
}
stopCluster(cl)
# convert results to dataframe
pdps <- stack(pdps)
x$pred <- pfunction(model, x, ...)
avpred <- mean(x$pred)
# merge results
preds <- merge(pdps, x[, c("pred","ind"), drop = FALSE])
}
else {
# parallel = FALSE
pdps <- NULL
from <- seq(1, nrow(xs), by = ssize)
for ( i in from){
xx <- merge(xs[i:min((i+ssize-1),nrow(xs)),], xrest , by.x = NULL, by.y = NULL)
ID <- xx[,"ind"]
xx$yhat <- pfunction(model, xx)
pdps <- rbind(pdps, aggregate(xx$yhat, list(ID), mean))
}
x$pred <- pfunction(model, x)
avpred <- mean(x$pred)
preds <- merge(pdps, x[, c("pred","ind"), drop = FALSE], by.x = "Group.1", by.y = "ind")
colnames(preds)[2] <- "values"
}
# plotting
if(viz){
rnge <- range(c(range(preds$pred), range(preds$values)))
plot(preds$values, preds$pred, xlim = rnge, ylim = rnge, xlab = "PDP", ylab = "Prediction", pch = 4, main = "PDP vs. Predictions")
lines(rnge, rnge, lty = "dotted")
}
ASE <- mean((preds$pred - preds$values)^2)
ASE0 <- mean((preds$pred - avpred)^2)
xty <- 1 - ASE / ASE0
xty
}
#' @title Forward variable selection for PDP explanation
#'
#' @description Computes forward variable selection for partial dependence function based on explainability.
#'
#' @param model A model with corresponding predict function that returns numeric values.
#' @param x Data frame.
#' @param target Character specifying the name of the (numeric) target variable (must be contained in data).
#' @param ... Further arguments to be passed to the \code{predict()} method of the \code{model}.
#'
#' @return Object of class \code{vsexp} containing the following elements:
#' @return \item{selection.order}{Vector of variable names in order of their entrance to the PD function during the variable selection process.}
#' @return \item{explainability}{Explainabilities after the different iterations of the variable selection.}
#' @return \item{details}{A data frame containing the explainabilities for all variables (rows) if they were entered in the model at each step (columns).}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' library(pdp)
#' library(randomForest)
#' data(boston)
#' set.seed(42)
#' boston.rf <- randomForest(cmedv ~ ., data = boston)
#' vs <- fw.xpy(boston.rf, boston, "cmedv")
#' vs
#'
#' plot(vs)
#'}
#'
#' @author \email{gero.szepannek@@web.de}
#'
#' @references \itemize{
#' \item Szepannek, G. (2019): How Much Can We See? A Note on Quantifying Explainability of Machine Learning Models,
#' \href{https://arxiv.org/abs/1910.13376}{\emph{arXiv:1910.13376 [stat.ML]}}.
#' }
#'
#' @rdname fw.xpy
fw.xpy <- function(model, x, target, ...){
# Initialization and selection of first variable
n <- 1
cat("Step", n, "\n")
sel <- NULL
trace <- NULL
nms <- nms.full <- names(x)[-which(names(x) == target)]
xpys <- rep(NA, length(nms))
names(xpys) <- nms
for(v in nms) xpys[which(names(xpys) == v)] <- xpy(model, x, v, viz = F, ...)
sel <- c(sel, which.max(xpys))
trace <- c(trace, max(xpys, na.rm = T))
print(xpys)
cat("\n", nms.full[sel], max(xpys, na.rm = T), "\n\n")
# forward selection variables such that explainability is maximized
while(length(nms) > 1){
n <- n + 1
cat("Step", n, "\n")
nms <- nms.full[-sel]
xpys <- cbind(xpys, NA)
for(v in nms) xpys[which(rownames(xpys) == v), ncol(xpys)] <- xpy(model, x, c(names(sel), v), viz = F, ...)
sel <- c(sel, which.max(xpys[,ncol(xpys)]))
colnames(xpys) <- c(paste("Step", 1:n))
trace <- c(trace, max(xpys, na.rm = T))
print(xpys)
cat("\n", nms.full[sel], max(xpys[,ncol(xpys)], na.rm=T), "\n\n")
}
res <- list(selection.order = sel, explainability = trace, details = xpys)
class(res) <- "vsexp"
return(res)
}
#' @export
plot.vsexp <- function(x, ...){
plot(0:length(x$explainability), c(0,x$explainability), type = "l", xaxt = "n", xlab = "", ylim = c(0, 1), ylab = "explainability")
axis(1, at = 1:length(x$selection.order), labels = names(x$selection.order), las = 2)
}
#' @export
print.vsexp <- function(x, ...) print(cbind(x$selection.order, x$explainability))
#' @title Explanation gap visualization
#'
#' @description Visualization of 2D PDP vs. unexplained residual predictions.
#'
#' @param model A model with corresponding predict function that returns numeric values.
#' @param x Data frame.
#' @param vnames Character vector of the variable set for which the patial dependence function is to be computed.
#' @param type Character, either \code{"pdp"}, \code{"gap"} or \code{"both"} specifying the meaning of the colours
#' of scatter: either the value of the PD function or the differences between PD and the model's predcitions.
#' In case of \code{"both"} two plots are created.
#' @param depth Integer specifiying the number colours in the heat map.
#' @param alpha Numeric value for alpha blending of the points in the scatter plot.
#' @param right Position where to place the legend relative to the range of the x axis.
#' @param top Position where to place the legend relative to the range of the y axis.
#' @param digits Nuber of digits for rounding in the legend.
#' @param ... Further arguments to be passed to the \code{predict()} method of the \code{model}.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' library(pdp)
#' library(randomForest)
#' data(boston)
#' set.seed(42)
#' boston.rf <- randomForest(cmedv ~ ., data = boston)
#' pdp2d(boston.rf, boston, c("lstat", "rm"), type = "both")
#' }
#'
#' @author \email{gero.szepannek@@web.de}
#'
#' @references \itemize{
#' \item Szepannek, G. (2019): How Much Can We See? A Note on Quantifying Explainability of Machine Learning Models,
#' \href{https://arxiv.org/abs/1910.13376}{\emph{arXiv:1910.13376 [stat.ML]}}.
#' }
#'
#' @rdname pdp2d
pdp2d <- function(model, x, vnames, type = "pdp", depth = 21, alpha = 2/3, right = 0.8, top = 0.95, digits = 1, ...){
if(sum(names(x) %in% vnames) != 2) stop("You should use 2 Variables in order to compute scatterplots!")
xs <- x[, names(x) %in% vnames, drop = FALSE]
xs <- data.frame(ID = 1:nrow(xs), xs)
xrest <- x[, !names(x) %in% vnames, drop = FALSE]
xx <- merge(xs, xrest, by.x = NULL, by.y = NULL)
ID <- xx[,1]
xx$yhat <- predict(model, xx[,-1], ...)
pdps <- aggregate(xx$yhat, list(ID), mean)
pred <- predict(model, x, ...)
if(type == "both") par(mfrow = c(1,2))
if(type != "gap"){
minmax <- range(pdps$x)
cols <- seq(minmax[1], minmax[2], length.out = depth)
cols <- sapply(pdps$x, function(z) which.min(abs(z-cols)))
cols <- colorRamps::blue2red(depth)[cols]
cols <- GISTools::add.alpha(cols, alpha)
plot(x[[vnames[1]]], x[[vnames[2]]], pch = 16, col = cols, xlab = vnames[1], ylab = vnames[2], main = "PD(x)")
# legend
vals <- seq(minmax[1], minmax[2], length.out = depth)
cols <- colorRamps::blue2red(depth)
steps <- seq(1,depth, length.out = 5)
xpos <- range(xs[,3]) %*% c(1-right, right)
ypos <- range(xs[,2]) %*% c(1-top, top)
legend(xpos, ypos, round(vals[steps],1), fill = cols[steps], bty = "n", cex = 0.7)
}
if(type != "pdp"){
diffs <- pdps$x - pred
cols <- seq(-max(abs(range(diffs))), max(abs(range(diffs))), length.out = depth)
cols <- sapply(diffs, function(z) which.min(abs(z-cols)))
cols <- colorRamps::blue2red(depth)[cols]
cols <- GISTools::add.alpha(cols, alpha)
plot(x[[vnames[1]]], x[[vnames[2]]], pch = 16, col = cols, xlab = vnames[1], ylab = vnames[2], main = "PD(x) - f(x)")
# legend
vals <- seq(-max(abs(range(diffs))), max(abs(range(diffs))), length.out = depth)
cols <- colorRamps::blue2red(depth)
steps <- seq(1,depth, length.out = 5)
xpos <- range(xs[,3]) %*% c(1-right, right)
ypos <- range(xs[,2]) %*% c(1-top, top)
legend(xpos, ypos, round(vals[steps],1), fill = cols[steps], bty = "n", cex = 0.7)
}
}
#' @title Scatterplot matrix of 2D partial dependence plots
#'
#' @description Creates a scatterplot matrix of 2D partial dependence plots.
#'
#' @param model A model with corresponding predict function that returns numeric values.
#' @param x Data frame.
#' @param vnames Character vector of the variable set for which the patial dependence function is to be computed.
#' @param depth Integer specifiying the number colours in the heat map.
#' @param alpha Numeric value for alpha blending of the points in the scatter plot.
#' @param ... Further arguments to be passed to the \code{predict()} method of the \code{model}.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' library(pdp)
#' library(randomForest)
#' data(boston)
#' set.seed(42)
#' boston.rf <- randomForest(cmedv ~ ., data = boston)
#' PDPmatrix(boston.rf, boston, vnames = c("lstat", "rm", "lon", "nox"))
#' }
#'
#' @author \email{gero.szepannek@@web.de}
#'
#' @references \itemize{
#' \item Szepannek, G. (2019): How Much Can We See? A Note on Quantifying Explainability of Machine Learning Models,
#' \href{https://arxiv.org/abs/1910.13376}{\emph{arXiv:1910.13376 [stat.ML]}}.
#' }
#'
#' @rdname PDPmatrix
PDPmatrix <- function(model, x, vnames, depth = 21, alpha = 2/3, ...){
n <- length(vnames)
mat <- matrix(0,n,n)
k <- 1
for(i in 1:n){
for(j in i:n){
mat[i,j] <- k
k <- k+1
}
}
layout(mat, rep(2,n), rep(2,n))
pred <- predict(model, x, ...)
minmax <- range(pred)
colrefs <- seq(minmax[1], minmax[2], length.out = depth)
for(i in 1:(n-1)){
plot(5, 5, type="n", axes=FALSE, ann=FALSE, xlim=c(0, 10), ylim = c(0,10))
text(5,5, vnames[i], cex = 2)
for(j in (i+1):n){
xs <- x[, names(x) %in% vnames[c(i,j)], drop = FALSE]
xs <- data.frame(ID = 1:nrow(xs), xs)
xrest <- x[, !names(x) %in% vnames[c(i,j)], drop = FALSE]
xx <- merge(xs, xrest, by.x = NULL, by.y = NULL)
ID <- xx[,1]
xx$yhat <- predict(model, xx[,-1], ...)
pdps <- aggregate(xx$yhat, list(ID), mean)
cols <- sapply(pdps$x, function(z) which.min(abs(z-colrefs)))
cols <- colorRamps::blue2red(depth)[cols]
cols <- GISTools::add.alpha(cols, alpha)
par(mar=c(1,1,1,1))
plot(x[[vnames[j]]], x[[vnames[i]]], pch = 16, col = cols, xlab = vnames[1], ylab = vnames[2], main = "")
}
}
plot(5, 5, type="n", axes=FALSE, ann=FALSE, xlim=c(0, 10), ylim = c(0,10))
text(5,5, vnames[n], cex = 2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.