knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) require(tornado) require(ggplot2)
A tornado plot is a visualization of the range of outputs expected from a variety of inputs, or alternatively, the sensitivity of the output to the range of inputs. Tornado plots have a number of features:
An importance plot attempts to display the relative impact of each variable on the model fit. Traditionally, for linear models, the concept of importance was expressed as the percentage of total response variable variance that is explained by each variable, either alone, or in the presence of the other variables in the model. Each model type can have different measures of importance. In this package, the following methods are used:
caret
package models: The definition of variable importance for each
model in the caret
package is taken from the varImp
method. See the caret
package for specifics.A few standard data sets are used in these examples:
mtcars
: Motor Trend Car Road Tests - predict Miles Per Gallon (mpg) based on
other car factorsmtcars_w_factors
: the mtcars
dataset with a few numeric variables replaced
with categorical or factor variables where appropriate. e.g. Automatic vs Manual
transmissionsurvival::ovarian
: Ovarian Cancer Survival Data - predict survival timesurvival::bladder
: Bladder Cancer recurrences - risk of bladder cancer recurrenceiris
: Edgar Anderson's Iris Data - predict the type of iris from measurementsmtcars_w_factors <- mtcars # Automatic vs Manual mtcars_w_factors$am <- factor(mtcars$am) # V or Straight cylinder arrangement mtcars_w_factors$vs <- factor(mtcars$vs) # number of cylinders mtcars_w_factors$cyl <- factor(mtcars$cyl) # number of forward gears mtcars_w_factors$gear <- factor(mtcars$gear) # number of carburetors mtcars_w_factors$carb <- factor(mtcars$carb)
lm1 <- lm(mpg ~ cyl*wt*hp, data = mtcars) torn1 <- tornado::tornado(lm1, type = "PercentChange", alpha = 0.10) plot(torn1, xlabel = "MPG", geom_bar_control = list(width = 0.4))
torn2 <- tornado::tornado(lm1, type = "ranges") plot(torn2, xlabel = "MPG", geom_bar_control = list(width = 0.4))
torn3 <- tornado::tornado(lm1, type = "percentiles", alpha = 0.05) plot(torn3, xlabel = "MPG", geom_bar_control = list(width = 0.4))
lm4 <- lm(mpg ~ cyl + wt + hp + vs, data = mtcars_w_factors) torn4 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05) plot(torn4, xlabel = "MPG", geom_bar_control = list(width = 0.4))
dict <- list(old = c("cyl", "wt", "hp", "vs"), new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight")) torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05, dict = dict) plot(torn5, xlabel = "MPG", geom_bar_control = list(width = 0.4))
dict <- list(old = c("cyl", "wt", "hp", "vs"), new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight")) torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05) plot(torn5, xlabel = "MPG", geom_bar_control = list(width = 0.4), sensitivity_colors = c("#FC8D62", "#66C2A5"), geom_point_control = list(size = 3, fill = "purple", col = "purple"))
Notes:
g <- plot(torn5, plot = FALSE, xlabel = "MPG", geom_bar_control = list(width = 0.4), sensitivity_colors = c("#FC8D62", "#66C2A5"), geom_point_control = list(size = 3, fill = "purple", col = "purple")) g <- g + ggtitle("Test Plot") g <- g + geom_hline(yintercept = 0, col = "black", lwd = 2) plot(g)
Predict if an engine is a V or Straight given covariates
glm1 <- glm(vs ~ wt + disp + cyl, data = mtcars, family = binomial(link = "logit")) torn1 <- tornado::tornado(glm1, type = "ranges", alpha = 0.10) plot(torn1, xlabel = "V or Straight Engine", geom_bar_control = list(width = 0.4))
survreg1 <- survival::survreg(survival::Surv(futime, fustat) ~ ecog.ps + rx + age + resid.ds, survival::ovarian, dist = 'weibull', scale = 1) torn1 <- tornado::tornado(survreg1, modeldata = survival::ovarian, type = "PercentChange", alpha = 0.10) plot(torn1, xlabel = "Survival Time", geom_bar_control = list(width = 0.4))
coxph1 <- survival::coxph(survival::Surv(stop, event) ~ rx + size + number, survival::bladder) torn1 <- tornado::tornado(coxph1, modeldata = survival::bladder, type = "PercentChange", alpha = 0.10) plot(torn1, xlabel = "Risk", geom_bar_control = list(width = 0.4))
if (requireNamespace("glmnet", quietly = TRUE)) { form <- formula(mpg ~ cyl*wt*hp) mf <- model.frame(form, data=mtcars) mm <- model.matrix(form, mf) gtest <- glmnet::cv.glmnet(x = mm, y = mtcars$mpg, family = "gaussian") torn <- tornado::tornado(gtest, modeldata = mtcars, form = formula(mpg ~ cyl*wt*hp), s = "lambda.1se", type = "PercentChange", alpha = 0.10) plot(torn, xlabel = "MPG", geom_bar_control = list(width = 0.4)) } else { print("glmnet is not available for vignette rendering") }
if (requireNamespace("caret", quietly = TRUE) & requireNamespace("randomForest", quietly = TRUE)) { gtest <- caret::train(x = subset(mtcars_w_factors, select = -mpg), y = mtcars_w_factors$mpg, method = "rf") torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10) plot(torn, xlabel = "MPG") } else { print("caret or randomForest is not available for vignette rendering") }
The plot
method can also return a ggplot object
if (requireNamespace("caret", quietly = TRUE) & requireNamespace("randomForest", quietly = TRUE)) { gtest <- caret::train(x = subset(iris, select = -Species), y = iris$Species, method = "rf") torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10, class_number = 1) g <- plot(torn, plot = FALSE, xlabel = "Probability of the Setosa Species") g <- g + ggtitle("Classifier caret::train 'rf', 10th to 90th perc. of each var.") plot(g) torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10, class_number = 2) g <- plot(torn, plot = FALSE, xlabel = "Probability of the versicolor Species") plot(g) } else { print("caret or randomForest is not available for vignette rendering") }
gtest <- lm(mpg ~ cyl*wt*hp + gear + carb, data = mtcars) gtestreduced <- lm(mpg ~ 1, data = mtcars) imp <- tornado::importance(gtest, gtestreduced) plot(imp)
dict <- list(old = c("cyl", "wt", "hp", "vs", "gear", "carb"), new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight", "Num Gears", "Num Carbs")) imp <- tornado::importance(gtest, gtestreduced, dict = dict) plot(imp, col_importance_alone = "#8DD3C7", col_importance_cumulative = "#FFFFB3")
gtest <- glm(vs ~ wt + disp + gear, data = mtcars, family = binomial(link = "logit")) gtestreduced <- glm(vs ~ 1, data = mtcars, family = binomial(link = "logit")) imp <- tornado::importance(gtest, gtestreduced) plot(imp)
model_final <- survival::survreg(survival::Surv(futime, fustat) ~ ecog.ps*rx + age, data = survival::ovarian, dist = "weibull") imp <- tornado::importance(model_final, survival::ovarian, nperm = 100) plot(imp, geom_bar_control = list(width = 0.4, fill = "blue"))
The number of variables plotted can also be controlled
if (requireNamespace("caret", quietly = TRUE) & requireNamespace("randomForest", quietly = TRUE)) { gtest <- caret::train(x = subset(mtcars_w_factors, select = -mpg), y = mtcars_w_factors$mpg, method = "rf") imp <- tornado::importance(gtest) plot(imp, nvar = 7) } else { print("caret or randomForest is not available for vignette rendering") }
The plot
method can also return a ggplot object
if (requireNamespace("caret", quietly = TRUE) & requireNamespace("randomForest", quietly = TRUE)) { gtest <- caret::train(x = subset(iris, select = -Species), y = iris$Species, method = "rf") imp <- tornado::importance(gtest) g <- plot(imp, plot = FALSE) g <- g + ggtitle("Classifier caret::train randomforest: variable importance") plot(g) } else { print("caret or randomForest is not available for vignette rendering") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.