Iris Flower Species

The following illustrates the use of the MachineShop package to predict the species of flowers in Edgar Anderson's iris data set.
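
The iris data set ships with base R and records sepal and petal measurements for 150 flowers, 50 from each of three species; its structure can be inspected directly before any modeling.

## Structure and first rows of the iris data set
str(iris)
head(iris)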

## Summary statistics to tabulate for the iris variables
stats <- list(
  list("Number of flowers" = ~ length(Species)),
  "Species" = list("setosa" = ~ n_perc(Species == "setosa"),
                   "versicolor" = ~ n_perc(Species == "versicolor"),
                   "virginica" = ~ n_perc(Species == "virginica")),
  "Sepal.Length" = list("Median (Range)" = ~ median_range(Sepal.Length)),
  "Sepal.Width" = list("Median (Range)" = ~ median_range(Sepal.Width)),
  "Petal.Length" = list("Median (Range)" = ~ median_range(Petal.Length)),
  "Petal.Width" = list("Median (Range)" = ~ median_range(Petal.Width))
)

## Tabulated data summary
summary_kbl(stats, iris)

Training and Test Set Analysis

## Analysis libraries
library(MachineShop)
library(magrittr)

## Training and test sets
set.seed(123)
train_indices <- sample(nrow(iris), nrow(iris) * 2 / 3)
trainset <- iris[train_indices, ]
testset <- iris[-train_indices, ]
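
Because the split above is a simple random sample, the species counts in the two sets are not guaranteed to be balanced; a quick tabulation with base R checks this.

## Species counts in the training and test sets
table(trainset$Species)
table(testset$Species)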

## Model formula
fo <- Species ~ .

## Models available for factor responses
modelinfo(factor(0)) %>% names

## Model-specific information
modelinfo(GBMModel)

## Generalized boosted model fit to training set
iris_fit <- fit(fo, data = trainset, model = GBMModel)

## Variable importance
(vi <- varimp(iris_fit))

plot(vi)

## Test set predicted probabilities
predict(iris_fit, newdata = testset, type = "prob") %>% head

## Test set predicted classifications
predict(iris_fit, newdata = testset) %>% head

## Test set performance
obs <- response(iris_fit, newdata = testset)
pred <- predict(iris_fit, newdata = testset, type = "prob")
performance(obs, pred)
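
A cross-tabulation of observed and predicted test set classes complements the performance metrics. The sketch below assumes confusion() accepts observed and predicted responses directly, as it does resampled output later on.

## Test set confusion matrix (sketch)
confusion(obs, predict(iris_fit, newdata = testset))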

Resampling

## Resample estimation of model performance
(res <- resample(fo, data = iris, model = GBMModel, control = CVControl))

summary(res)

plot(res)
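
CVControl performs cross-validation; other resampling controls can be swapped in, for example bootstrap resampling. This is a sketch assuming a BootControl constructor with a samples argument.

## Bootstrap resample estimation of model performance (sketch)
res_boot <- resample(fo, data = iris, model = GBMModel,
                     control = BootControl(samples = 25))
summary(res_boot)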

Performance Metrics

## Default performance metrics
performance(res) %>% summary

## Metrics available for the resample output
metricinfo(res) %>% names

## User-specified metrics
performance(res, c(accuracy, kappa2)) %>% summary

Model Tuning

## Tune over a grid of model parameters
iris_fit <- TunedModel(
  GBMModel,
  grid = expand_params(n.trees = c(25, 50, 100),
                       interaction.depth = 1:3,
                       n.minobsinnode = c(5, 10))
) %>% fit(fo, data = iris)

## Variable importance
varimp(iris_fit)

## Plot performance over the grid points
tuned_model <- as.MLModel(iris_fit)
plot(tuned_model, type = "line")

Model Comparisons

## Model comparisons
control <- CVControl(folds = 10, repeats = 5)

res1 <- resample(fo, data = iris, model = GBMModel(n.trees = 50), control = control)
res2 <- resample(fo, data = iris, model = RandomForestModel(ntree = 50), control = control)
res3 <- resample(fo, data = iris, model = NNetModel(size = 5), control = control)

res <- c(GBM = res1, RF = res2, NNet = res3)
summary(res)

plot(res)

## Pairwise model differences and t-tests
perfdiff <- diff(res)
summary(perfdiff)

t.test(perfdiff)

plot(perfdiff)
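
Instead of comparing resampled models by hand, selection among candidates can be automated; a sketch with SelectedModel, which resamples the candidates internally and fits the best-performing one.

## Automatic selection among candidate models (sketch)
sel_fit <- fit(fo, data = iris,
               model = SelectedModel(GBMModel, RandomForestModel, NNetModel))
as.MLModel(sel_fit)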

Ensemble Models

## Stacked regression
stackedmodel <- StackedModel(GBMModel, RandomForestModel, NNetModel)
res_stacked <- resample(fo, data = iris, model = stackedmodel)
summary(res_stacked)

## Super learner
supermodel <- SuperModel(GBMModel, RandomForestModel, NNetModel)
res_super <- resample(fo, data = iris, model = supermodel)
summary(res_super)

Calibration Curves

cal <- calibration(res1)
plot(cal, se = TRUE)
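
The default calibration curve bins the predicted probabilities; setting breaks = NULL (if supported by the installed version) produces smoothed curves instead.

## Smoothed calibration curves (sketch)
cal_smooth <- calibration(res1, breaks = NULL)
plot(cal_smooth)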

Confusion Matrices

(conf <- confusion(res1, cutoff = NULL))

summary(conf)
plot(conf)

Partial Dependence Plots

pd <- dependence(iris_fit, select = c(Petal.Length, Petal.Width))
plot(pd)

Preprocessing Recipe

library(recipes)

rec <- recipe(fo, data = iris) %>%
  role_case(stratum = Species)

iris_fit <- fit(rec, model = GBMModel)
varimp(iris_fit)

res <- resample(rec, model = GBMModel, control = CVControl)
summary(res)
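
Preprocessing steps from the recipes package can be chained onto the same recipe; a sketch that centers and scales the numeric predictors before fitting (the step functions come from recipes and are not part of the original example).

## Recipe with centered and scaled predictors (sketch)
rec_std <- recipe(fo, data = iris) %>%
  role_case(stratum = Species) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())

res_std <- resample(rec_std, model = GBMModel, control = CVControl)
summary(res_std)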

