Nothing
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 16: Remedies for Severe Class Imbalance
###
### Required packages: AppliedPredictiveModeling, caret, C50, earth, DMwR,
### DVD, kernlab, mda, pROC, randomForest, rpart
###
### Data used: The insurance data from the DWD package.
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 16.1 Case Study: Predicting Caravan Policy Ownership
library(DWD)
data(ticdata)
### Some of the predictor names and levels have characters that would results in
### illegal variable names. We convert then to more generic names and treat the
### ordered factors as nominal (i.e. unordered) factors.
isOrdered <- unlist(lapply(ticdata, function(x) any(class(x) == "ordered")))
recodeLevels <- function(x)
{
x <- gsub("f ", "", as.character(x))
x <- gsub(" - ", "_to_", x)
x <- gsub("-", "_to_", x)
x <- gsub("%", "", x)
x <- gsub("?", "Unk", x, fixed = TRUE)
x <- gsub("[,'\\(\\)]", "", x)
x <- gsub(" ", "_", x)
factor(paste("_", x, sep = ""))
}
convertCols <- c("STYPE", "MGEMLEEF", "MOSHOOFD",
names(isOrdered)[isOrdered])
for(i in convertCols) ticdata[,i] <- factor(gsub(" ", "0",format(as.numeric(ticdata[,i]))))
ticdata$CARAVAN <- factor(as.character(ticdata$CARAVAN),
levels = rev(levels(ticdata$CARAVAN)))
### Split the data into three sets: training, test and evaluation.
library(caret)
set.seed(156)
split1 <- createDataPartition(ticdata$CARAVAN, p = .7)[[1]]
other <- ticdata[-split1,]
training <- ticdata[ split1,]
set.seed(934)
split2 <- createDataPartition(other$CARAVAN, p = 1/3)[[1]]
evaluation <- other[ split2,]
testing <- other[-split2,]
predictors <- names(training)[names(training) != "CARAVAN"]
testResults <- data.frame(CARAVAN = testing$CARAVAN)
evalResults <- data.frame(CARAVAN = evaluation$CARAVAN)
trainingInd <- data.frame(model.matrix(CARAVAN ~ ., data = training))[,-1]
evaluationInd <- data.frame(model.matrix(CARAVAN ~ ., data = evaluation))[,-1]
testingInd <- data.frame(model.matrix(CARAVAN ~ ., data = testing))[,-1]
trainingInd$CARAVAN <- training$CARAVAN
evaluationInd$CARAVAN <- evaluation$CARAVAN
testingInd$CARAVAN <- testing$CARAVAN
isNZV <- nearZeroVar(trainingInd)
noNZVSet <- names(trainingInd)[-isNZV]
testResults <- data.frame(CARAVAN = testing$CARAVAN)
evalResults <- data.frame(CARAVAN = evaluation$CARAVAN)
################################################################################
### Section 16.2 The Effect of Class Imbalance
### These functions are used to measure performance
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))
fourStats <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sens", "Spec")
out
}
ctrl <- trainControl(method = "cv",
classProbs = TRUE,
summaryFunction = fiveStats)
ctrlNoProb <- ctrl
ctrlNoProb$summaryFunction <- fourStats
ctrlNoProb$classProbs <- FALSE
set.seed(1410)
rfFit <- train(CARAVAN ~ ., data = trainingInd,
method = "rf",
trControl = ctrl,
ntree = 1500,
tuneLength = 5,
metric = "ROC")
rfFit
evalResults$RF <- predict(rfFit, evaluationInd, type = "prob")[,1]
testResults$RF <- predict(rfFit, testingInd, type = "prob")[,1]
rfROC <- roc(evalResults$CARAVAN, evalResults$RF,
levels = rev(levels(evalResults$CARAVAN)))
rfROC
rfEvalCM <- confusionMatrix(predict(rfFit, evaluationInd), evalResults$CARAVAN)
rfEvalCM
set.seed(1410)
lrFit <- train(CARAVAN ~ .,
data = trainingInd[, noNZVSet],
method = "glm",
trControl = ctrl,
metric = "ROC")
lrFit
evalResults$LogReg <- predict(lrFit, evaluationInd[, noNZVSet], type = "prob")[,1]
testResults$LogReg <- predict(lrFit, testingInd[, noNZVSet], type = "prob")[,1]
lrROC <- roc(evalResults$CARAVAN, evalResults$LogReg,
levels = rev(levels(evalResults$CARAVAN)))
lrROC
lrEvalCM <- confusionMatrix(predict(lrFit, evaluationInd), evalResults$CARAVAN)
lrEvalCM
set.seed(1401)
fdaFit <- train(CARAVAN ~ ., data = training,
method = "fda",
tuneGrid = data.frame(degree = 1, nprune = 1:25),
metric = "ROC",
trControl = ctrl)
fdaFit
evalResults$FDA <- predict(fdaFit, evaluation[, predictors], type = "prob")[,1]
testResults$FDA <- predict(fdaFit, testing[, predictors], type = "prob")[,1]
fdaROC <- roc(evalResults$CARAVAN, evalResults$FDA,
levels = rev(levels(evalResults$CARAVAN)))
fdaROC
fdaEvalCM <- confusionMatrix(predict(fdaFit, evaluation[, predictors]), evalResults$CARAVAN)
fdaEvalCM
labs <- c(RF = "Random Forest", LogReg = "Logistic Regression",
FDA = "FDA (MARS)")
lift1 <- lift(CARAVAN ~ RF + LogReg + FDA, data = evalResults,
labels = labs)
plotTheme <- caretTheme()
plot(fdaROC, type = "S", col = plotTheme$superpose.line$col[3], legacy.axes = TRUE)
plot(rfROC, type = "S", col = plotTheme$superpose.line$col[1], add = TRUE, legacy.axes = TRUE)
plot(lrROC, type = "S", col = plotTheme$superpose.line$col[2], add = TRUE, legacy.axes = TRUE)
legend(.7, .25,
c("Random Forest", "Logistic Regression", "FDA (MARS)"),
cex = .85,
col = plotTheme$superpose.line$col[1:3],
lwd = rep(2, 3),
lty = rep(1, 3))
xyplot(lift1,
ylab = "%Events Found",
xlab = "%Customers Evaluated",
lwd = 2,
type = "l")
################################################################################
### Section 16.4 Alternate Cutoffs
rfThresh <- coords(rfROC, x = "best", ret="threshold",
best.method="closest.topleft")
rfThreshY <- coords(rfROC, x = "best", ret="threshold",
best.method="youden")
cutText <- ifelse(rfThresh == rfThreshY,
"is the same as",
"is similar to")
evalResults$rfAlt <- factor(ifelse(evalResults$RF > rfThresh,
"insurance", "noinsurance"),
levels = levels(evalResults$CARAVAN))
testResults$rfAlt <- factor(ifelse(testResults$RF > rfThresh,
"insurance", "noinsurance"),
levels = levels(testResults$CARAVAN))
rfAltEvalCM <- confusionMatrix(evalResults$rfAlt, evalResults$CARAVAN)
rfAltEvalCM
rfAltTestCM <- confusionMatrix(testResults$rfAlt, testResults$CARAVAN)
rfAltTestCM
rfTestCM <- confusionMatrix(predict(rfFit, testingInd), testResults$CARAVAN)
plot(rfROC, print.thres = c(.5, .3, .10, rfThresh), type = "S",
print.thres.pattern = "%.3f (Spec = %.2f, Sens = %.2f)",
print.thres.cex = .8, legacy.axes = TRUE)
################################################################################
### Section 16.5 Adjusting Prior Probabilities
priors <- table(ticdata$CARAVAN)/nrow(ticdata)*100
fdaPriors <- fdaFit
fdaPriors$finalModel$prior <- c(insurance = .6, noinsurance = .4)
fdaPriorPred <- predict(fdaPriors, evaluation[,predictors])
evalResults$FDAprior <- predict(fdaPriors, evaluation[,predictors], type = "prob")[,1]
testResults$FDAprior <- predict(fdaPriors, testing[,predictors], type = "prob")[,1]
fdaPriorCM <- confusionMatrix(fdaPriorPred, evaluation$CARAVAN)
fdaPriorCM
fdaPriorROC <- roc(testResults$CARAVAN, testResults$FDAprior,
levels = rev(levels(testResults$CARAVAN)))
fdaPriorROC
################################################################################
### Section 16.7 Sampling Methods
set.seed(1237)
downSampled <- downSample(trainingInd[, -ncol(trainingInd)], training$CARAVAN)
set.seed(1237)
upSampled <- upSample(trainingInd[, -ncol(trainingInd)], training$CARAVAN)
library(DMwR)
set.seed(1237)
smoted <- SMOTE(CARAVAN ~ ., data = trainingInd)
set.seed(1410)
rfDown <- train(Class ~ ., data = downSampled,
"rf",
trControl = ctrl,
ntree = 1500,
tuneLength = 5,
metric = "ROC")
rfDown
evalResults$RFdown <- predict(rfDown, evaluationInd, type = "prob")[,1]
testResults$RFdown <- predict(rfDown, testingInd, type = "prob")[,1]
rfDownROC <- roc(evalResults$CARAVAN, evalResults$RFdown,
levels = rev(levels(evalResults$CARAVAN)))
rfDownROC
set.seed(1401)
rfDownInt <- train(CARAVAN ~ ., data = trainingInd,
"rf",
ntree = 1500,
tuneLength = 5,
strata = training$CARAVAN,
sampsize = rep(sum(training$CARAVAN == "insurance"), 2),
metric = "ROC",
trControl = ctrl)
rfDownInt
evalResults$RFdownInt <- predict(rfDownInt, evaluationInd, type = "prob")[,1]
testResults$RFdownInt <- predict(rfDownInt, testingInd, type = "prob")[,1]
rfDownIntRoc <- roc(evalResults$CARAVAN,
evalResults$RFdownInt,
levels = rev(levels(training$CARAVAN)))
rfDownIntRoc
set.seed(1410)
rfUp <- train(Class ~ ., data = upSampled,
"rf",
trControl = ctrl,
ntree = 1500,
tuneLength = 5,
metric = "ROC")
rfUp
evalResults$RFup <- predict(rfUp, evaluationInd, type = "prob")[,1]
testResults$RFup <- predict(rfUp, testingInd, type = "prob")[,1]
rfUpROC <- roc(evalResults$CARAVAN, evalResults$RFup,
levels = rev(levels(evalResults$CARAVAN)))
rfUpROC
set.seed(1410)
rfSmote <- train(CARAVAN ~ ., data = smoted,
"rf",
trControl = ctrl,
ntree = 1500,
tuneLength = 5,
metric = "ROC")
rfSmote
evalResults$RFsmote <- predict(rfSmote, evaluationInd, type = "prob")[,1]
testResults$RFsmote <- predict(rfSmote, testingInd, type = "prob")[,1]
rfSmoteROC <- roc(evalResults$CARAVAN, evalResults$RFsmote,
levels = rev(levels(evalResults$CARAVAN)))
rfSmoteROC
rfSmoteCM <- confusionMatrix(predict(rfSmote, evaluationInd), evalResults$CARAVAN)
rfSmoteCM
samplingSummary <- function(x, evl, tst)
{
lvl <- rev(levels(tst$CARAVAN))
evlROC <- roc(evl$CARAVAN,
predict(x, evl, type = "prob")[,1],
levels = lvl)
rocs <- c(auc(evlROC),
auc(roc(tst$CARAVAN,
predict(x, tst, type = "prob")[,1],
levels = lvl)))
cut <- coords(evlROC, x = "best", ret="threshold",
best.method="closest.topleft")
bestVals <- coords(evlROC, cut, ret=c("sensitivity", "specificity"))
out <- c(rocs, bestVals*100)
names(out) <- c("evROC", "tsROC", "tsSens", "tsSpec")
out
}
rfResults <- rbind(samplingSummary(rfFit, evaluationInd, testingInd),
samplingSummary(rfDown, evaluationInd, testingInd),
samplingSummary(rfDownInt, evaluationInd, testingInd),
samplingSummary(rfUp, evaluationInd, testingInd),
samplingSummary(rfSmote, evaluationInd, testingInd))
rownames(rfResults) <- c("Original", "Down--Sampling", "Down--Sampling (Internal)",
"Up--Sampling", "SMOTE")
rfResults
rocCols <- c("black", rgb(1, 0, 0, .5), rgb(0, 0, 1, .5))
plot(roc(testResults$CARAVAN, testResults$RF, levels = rev(levels(testResults$CARAVAN))),
type = "S", col = rocCols[1], legacy.axes = TRUE)
plot(roc(testResults$CARAVAN, testResults$RFdownInt, levels = rev(levels(testResults$CARAVAN))),
type = "S", col = rocCols[2],add = TRUE, legacy.axes = TRUE)
plot(roc(testResults$CARAVAN, testResults$RFsmote, levels = rev(levels(testResults$CARAVAN))),
type = "S", col = rocCols[3], add = TRUE, legacy.axes = TRUE)
legend(.6, .4,
c("Normal", "Down-Sampling (Internal)", "SMOTE"),
lty = rep(1, 3),
lwd = rep(2, 3),
cex = .8,
col = rocCols)
xyplot(lift(CARAVAN ~ RF + RFdownInt + RFsmote,
data = testResults),
type = "l",
ylab = "%Events Found",
xlab = "%Customers Evaluated")
################################################################################
### Section 16.8 Cost–Sensitive Training
library(kernlab)
set.seed(1157)
sigma <- sigest(CARAVAN ~ ., data = trainingInd[, noNZVSet], frac = .75)
names(sigma) <- NULL
svmGrid1 <- data.frame(sigma = sigma[2],
C = 2^c(2:10))
set.seed(1401)
svmFit <- train(CARAVAN ~ .,
data = trainingInd[, noNZVSet],
method = "svmRadial",
tuneGrid = svmGrid1,
preProc = c("center", "scale"),
metric = "Kappa",
trControl = ctrl)
svmFit
evalResults$SVM <- predict(svmFit, evaluationInd[, noNZVSet], type = "prob")[,1]
testResults$SVM <- predict(svmFit, testingInd[, noNZVSet], type = "prob")[,1]
svmROC <- roc(evalResults$CARAVAN, evalResults$SVM,
levels = rev(levels(evalResults$CARAVAN)))
svmROC
svmTestROC <- roc(testResults$CARAVAN, testResults$SVM,
levels = rev(levels(testResults$CARAVAN)))
svmTestROC
confusionMatrix(predict(svmFit, evaluationInd[, noNZVSet]), evalResults$CARAVAN)
confusionMatrix(predict(svmFit, testingInd[, noNZVSet]), testingInd$CARAVAN)
set.seed(1401)
svmWtFit <- train(CARAVAN ~ .,
data = trainingInd[, noNZVSet],
method = "svmRadial",
tuneGrid = svmGrid1,
preProc = c("center", "scale"),
metric = "Kappa",
class.weights = c(insurance = 18, noinsurance = 1),
trControl = ctrlNoProb)
svmWtFit
svmWtEvalCM <- confusionMatrix(predict(svmWtFit, evaluationInd[, noNZVSet]), evalResults$CARAVAN)
svmWtEvalCM
svmWtTestCM <- confusionMatrix(predict(svmWtFit, testingInd[, noNZVSet]), testingInd$CARAVAN)
svmWtTestCM
initialRpart <- rpart(CARAVAN ~ ., data = training,
control = rpart.control(cp = 0.0001))
rpartGrid <- data.frame(cp = initialRpart$cptable[, "CP"])
cmat <- list(loss = matrix(c(0, 1, 20, 0), ncol = 2))
set.seed(1401)
cartWMod <- train(x = training[,predictors],
y = training$CARAVAN,
method = "rpart",
trControl = ctrlNoProb,
tuneGrid = rpartGrid,
metric = "Kappa",
parms = cmat)
cartWMod
library(C50)
c5Grid <- expand.grid(model = c("tree", "rules"),
trials = c(1, (1:10)*10),
winnow = FALSE)
finalCost <- matrix(c(0, 20, 1, 0), ncol = 2)
rownames(finalCost) <- colnames(finalCost) <- levels(training$CARAVAN)
set.seed(1401)
C5CostFit <- train(training[, predictors],
training$CARAVAN,
method = "C5.0",
metric = "Kappa",
tuneGrid = c5Grid,
cost = finalCost,
control = C5.0Control(earlyStopping = FALSE),
trControl = ctrlNoProb)
C5CostCM <- confusionMatrix(predict(C5CostFit, testing), testing$CARAVAN)
C5CostCM
################################################################################
### Session Information
sessionInfo()
q("no")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.