################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 4: Over-Fitting and Model Tuning
###
### Required packages: caret, doMC (optional), kernlab
###
### Data used:
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 4.6 Choosing Final Tuning Parameters
library(caret)
data(GermanCredit)
## First, remove near-zero variance predictors then get rid of a few predictors
## that duplicate values. For example, there are two possible values for the
## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear
## dependencies, we get rid of one of the levels (e.g. "ForFree")
GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)]
GermanCredit$CheckingAccountStatus.lt.0 <- NULL
GermanCredit$SavingsAccountBonds.lt.100 <- NULL
GermanCredit$EmploymentDuration.lt.1 <- NULL
GermanCredit$EmploymentDuration.Unemployed <- NULL
GermanCredit$Personal.Male.Married.Widowed <- NULL
GermanCredit$Property.Unknown <- NULL
GermanCredit$Housing.ForFree <- NULL
## Split the data into training (80%) and test sets (20%)
set.seed(100)
inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]]
GermanCreditTrain <- GermanCredit[ inTrain, ]
GermanCreditTest <- GermanCredit[-inTrain, ]
## The model fitting code shown in the computing section is fairly
## simplistic. For the text we estimate the tuning parameter grid
## up-front and pass it in explicitly. This generally is not needed,
## but was used here so that we could trim the cost values to a
## presentable range and to re-use later with different resampling
## methods.
library(kernlab)
set.seed(231)
sigDist <- sigest(Class ~ ., data = GermanCreditTrain, frac = 1)
svmTuneGrid <- data.frame(sigma = as.vector(sigDist)[1], C = 2^(-2:7))
### Optional: parallel processing can be used via the 'do' packages,
### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
### up the computations.
### WARNING: Be aware of how much memory is needed to parallel
### process. It can very quickly overwhelm the available hardware. We
### estimate the memory usage (VSIZE = total memory size) to be
### 2566M/core.
library(doMC)
registerDoMC(4)
set.seed(1056)
svmFit <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "repeatedcv",
repeats = 5,
classProbs = TRUE))
## classProbs = TRUE was added since the text was written
## Print the results
svmFit
## A line plot of the average performance. The 'scales' argument is actually an
## argument to xyplot that converts the x-axis to log-2 units.
plot(svmFit, scales = list(x = list(log = 2)))
## Test set predictions
predictedClasses <- predict(svmFit, GermanCreditTest)
str(predictedClasses)
## Use the "type" option to get class probabilities
predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob")
head(predictedProbs)
## Fit the same model using different resampling methods. The main syntax change
## is the control object.
set.seed(1056)
svmFit10CV <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "cv", number = 10))
svmFit10CV
set.seed(1056)
svmFitLOO <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "LOOCV"))
svmFitLOO
set.seed(1056)
svmFitLGO <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "LGOCV",
number = 50,
p = .8))
svmFitLGO
set.seed(1056)
svmFitBoot <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "boot", number = 50))
svmFitBoot
set.seed(1056)
svmFitBoot632 <- train(Class ~ .,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(method = "boot632",
number = 50))
svmFitBoot632
################################################################################
### Section 4.8 Choosing Between Models
set.seed(1056)
glmProfile <- train(Class ~ .,
data = GermanCreditTrain,
method = "glm",
trControl = trainControl(method = "repeatedcv",
repeats = 5))
glmProfile
resamp <- resamples(list(SVM = svmFit, Logistic = glmProfile))
summary(resamp)
## These results are slightly different from those shown in the text.
## There are some differences in the train() function since the
## original results were produced. This is due to a difference in
## predictions from the ksvm() function when class probs are requested
## and when they are not. See, for example,
## https://stat.ethz.ch/pipermail/r-help/2013-November/363188.html
modelDifferences <- diff(resamp)
summary(modelDifferences)
## The actual paired t-test:
modelDifferences$statistics$Accuracy
################################################################################
### Session Information
sessionInfo()
q("no")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.