16_Class_Imbalance.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com) 
###
### Chapter 16: Remedies for Severe Class Imbalance
###
### Required packages: AppliedPredictiveModeling, caret, C50, earth, DMwR, 
###                    DVD, kernlab,  mda, pROC, randomForest, rpart
###
### Data used: The insurance data from the DWD package. 
###
### Notes: 
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be 
### syntax differences that occur over time as packages evolve. These files 
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in 
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

################################################################################
### Section 16.1 Case Study: Predicting Caravan Policy Ownership

library(DWD)
data(ticdata)

### Some of the predictor names and levels have characters that would results in
### illegal variable names. We convert then to more generic names and treat the
### ordered factors as nominal (i.e. unordered) factors. 

isOrdered <- unlist(lapply(ticdata, function(x) any(class(x) == "ordered")))

recodeLevels <- function(x)
  {
    x <- gsub("f ", "", as.character(x))
    x <- gsub(" - ", "_to_", x)
    x <- gsub("-", "_to_", x)
    x <- gsub("%", "", x)
    x <- gsub("?", "Unk", x, fixed = TRUE)
    x <- gsub("[,'\\(\\)]", "", x)
    x <- gsub(" ", "_", x)
    factor(paste("_", x, sep = ""))
  }

convertCols <- c("STYPE", "MGEMLEEF", "MOSHOOFD",
                 names(isOrdered)[isOrdered])

for(i in convertCols) ticdata[,i] <- factor(gsub(" ", "0",format(as.numeric(ticdata[,i]))))

ticdata$CARAVAN <- factor(as.character(ticdata$CARAVAN),
                          levels = rev(levels(ticdata$CARAVAN)))

### Split the data into three sets: training, test and evaluation. 
library(caret)

set.seed(156)

split1 <- createDataPartition(ticdata$CARAVAN, p = .7)[[1]]

other     <- ticdata[-split1,]
training  <- ticdata[ split1,]

set.seed(934)

split2 <- createDataPartition(other$CARAVAN, p = 1/3)[[1]]

evaluation  <- other[ split2,]
testing     <- other[-split2,]

predictors <- names(training)[names(training) != "CARAVAN"]

testResults <- data.frame(CARAVAN = testing$CARAVAN)
evalResults <- data.frame(CARAVAN = evaluation$CARAVAN)

trainingInd <- data.frame(model.matrix(CARAVAN ~ ., data = training))[,-1]
evaluationInd <- data.frame(model.matrix(CARAVAN ~ ., data = evaluation))[,-1]
testingInd <- data.frame(model.matrix(CARAVAN ~ ., data = testing))[,-1]

trainingInd$CARAVAN <- training$CARAVAN
evaluationInd$CARAVAN <- evaluation$CARAVAN
testingInd$CARAVAN <- testing$CARAVAN

isNZV <- nearZeroVar(trainingInd)
noNZVSet <- names(trainingInd)[-isNZV]

testResults <- data.frame(CARAVAN = testing$CARAVAN)
evalResults <- data.frame(CARAVAN = evaluation$CARAVAN)

################################################################################
### Section 16.2 The Effect of Class Imbalance

### These functions are used to measure performance

fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))
fourStats <- function (data, lev = levels(data$obs), model = NULL)
{

  accKapp <- postResample(data[, "pred"], data[, "obs"])
  out <- c(accKapp,
           sensitivity(data[, "pred"], data[, "obs"], lev[1]),
           specificity(data[, "pred"], data[, "obs"], lev[2]))
  names(out)[3:4] <- c("Sens", "Spec")
  out
}

ctrl <- trainControl(method = "cv",
                     classProbs = TRUE,
                     summaryFunction = fiveStats)

ctrlNoProb <- ctrl
ctrlNoProb$summaryFunction <- fourStats
ctrlNoProb$classProbs <- FALSE


set.seed(1410)
rfFit <- train(CARAVAN ~ ., data = trainingInd,
               method = "rf",
               trControl = ctrl,
               ntree = 1500,
               tuneLength = 5,
               metric = "ROC")
rfFit

evalResults$RF <- predict(rfFit, evaluationInd, type = "prob")[,1]
testResults$RF <- predict(rfFit, testingInd, type = "prob")[,1]
rfROC <- roc(evalResults$CARAVAN, evalResults$RF,
             levels = rev(levels(evalResults$CARAVAN)))
rfROC

rfEvalCM <- confusionMatrix(predict(rfFit, evaluationInd), evalResults$CARAVAN)
rfEvalCM

set.seed(1410)
lrFit <- train(CARAVAN ~ .,
               data = trainingInd[, noNZVSet],
               method = "glm",
               trControl = ctrl,
               metric = "ROC")
lrFit

evalResults$LogReg <- predict(lrFit, evaluationInd[, noNZVSet], type = "prob")[,1]
testResults$LogReg <- predict(lrFit, testingInd[, noNZVSet], type = "prob")[,1]
lrROC <- roc(evalResults$CARAVAN, evalResults$LogReg,
             levels = rev(levels(evalResults$CARAVAN)))
lrROC

lrEvalCM <- confusionMatrix(predict(lrFit, evaluationInd), evalResults$CARAVAN)
lrEvalCM

set.seed(1401)
fdaFit <- train(CARAVAN ~ ., data = training,
                method = "fda",
                tuneGrid = data.frame(degree = 1, nprune = 1:25),
                metric = "ROC",
                trControl = ctrl)
fdaFit

evalResults$FDA <- predict(fdaFit, evaluation[, predictors], type = "prob")[,1]
testResults$FDA <- predict(fdaFit, testing[, predictors], type = "prob")[,1]
fdaROC <- roc(evalResults$CARAVAN, evalResults$FDA,
              levels = rev(levels(evalResults$CARAVAN)))
fdaROC

fdaEvalCM <- confusionMatrix(predict(fdaFit, evaluation[, predictors]), evalResults$CARAVAN)
fdaEvalCM


labs <- c(RF = "Random Forest", LogReg = "Logistic Regression",
          FDA = "FDA (MARS)")
lift1 <- lift(CARAVAN ~ RF + LogReg + FDA, data = evalResults,
              labels = labs)

plotTheme <- caretTheme()

plot(fdaROC, type = "S", col = plotTheme$superpose.line$col[3], legacy.axes = TRUE)
plot(rfROC, type = "S", col = plotTheme$superpose.line$col[1], add = TRUE, legacy.axes = TRUE)
plot(lrROC, type = "S", col = plotTheme$superpose.line$col[2], add = TRUE, legacy.axes = TRUE)
legend(.7, .25,
       c("Random Forest", "Logistic Regression", "FDA (MARS)"),
       cex = .85,
       col = plotTheme$superpose.line$col[1:3],
       lwd = rep(2, 3),
       lty = rep(1, 3))

xyplot(lift1,
       ylab = "%Events Found",
       xlab =  "%Customers Evaluated",
       lwd = 2,
       type = "l")


################################################################################
### Section 16.4 Alternate Cutoffs

rfThresh <- coords(rfROC, x = "best", ret="threshold",
                   best.method="closest.topleft")
rfThreshY <- coords(rfROC, x = "best", ret="threshold",
                    best.method="youden")

cutText <- ifelse(rfThresh == rfThreshY,
                  "is the same as",
                  "is similar to")

evalResults$rfAlt <- factor(ifelse(evalResults$RF > rfThresh,
                                   "insurance", "noinsurance"),
                            levels = levels(evalResults$CARAVAN))
testResults$rfAlt <- factor(ifelse(testResults$RF > rfThresh,
                                   "insurance", "noinsurance"),
                            levels = levels(testResults$CARAVAN))
rfAltEvalCM <- confusionMatrix(evalResults$rfAlt, evalResults$CARAVAN)
rfAltEvalCM

rfAltTestCM <- confusionMatrix(testResults$rfAlt, testResults$CARAVAN)
rfAltTestCM

rfTestCM <- confusionMatrix(predict(rfFit, testingInd), testResults$CARAVAN)


plot(rfROC, print.thres = c(.5, .3, .10, rfThresh), type = "S",
     print.thres.pattern = "%.3f (Spec = %.2f, Sens = %.2f)",
     print.thres.cex = .8, legacy.axes = TRUE)

################################################################################
### Section 16.5 Adjusting Prior Probabilities

priors <- table(ticdata$CARAVAN)/nrow(ticdata)*100
fdaPriors <- fdaFit
fdaPriors$finalModel$prior <- c(insurance = .6, noinsurance =  .4)
fdaPriorPred <- predict(fdaPriors, evaluation[,predictors])
evalResults$FDAprior <-  predict(fdaPriors, evaluation[,predictors], type = "prob")[,1]
testResults$FDAprior <-  predict(fdaPriors, testing[,predictors], type = "prob")[,1]
fdaPriorCM <- confusionMatrix(fdaPriorPred, evaluation$CARAVAN)
fdaPriorCM

fdaPriorROC <- roc(testResults$CARAVAN, testResults$FDAprior,
                   levels = rev(levels(testResults$CARAVAN)))
fdaPriorROC

################################################################################
### Section 16.7 Sampling Methods

set.seed(1237)
downSampled <- downSample(trainingInd[, -ncol(trainingInd)], training$CARAVAN)

set.seed(1237)
upSampled <- upSample(trainingInd[, -ncol(trainingInd)], training$CARAVAN)

library(DMwR)
set.seed(1237)
smoted <- SMOTE(CARAVAN ~ ., data = trainingInd)

set.seed(1410)
rfDown <- train(Class ~ ., data = downSampled,
                "rf",
                trControl = ctrl,
                ntree = 1500,
                tuneLength = 5,
                metric = "ROC")
rfDown

evalResults$RFdown <- predict(rfDown, evaluationInd, type = "prob")[,1]
testResults$RFdown <- predict(rfDown, testingInd, type = "prob")[,1]
rfDownROC <- roc(evalResults$CARAVAN, evalResults$RFdown,
                 levels = rev(levels(evalResults$CARAVAN)))
rfDownROC

set.seed(1401)
rfDownInt <- train(CARAVAN ~ ., data = trainingInd,
                   "rf",
                   ntree = 1500,
                   tuneLength = 5,
                   strata = training$CARAVAN,
                   sampsize = rep(sum(training$CARAVAN == "insurance"), 2),
                   metric = "ROC",
                   trControl = ctrl)
rfDownInt

evalResults$RFdownInt <- predict(rfDownInt, evaluationInd, type = "prob")[,1]
testResults$RFdownInt <- predict(rfDownInt, testingInd, type = "prob")[,1]
rfDownIntRoc <- roc(evalResults$CARAVAN,
                    evalResults$RFdownInt,
                    levels = rev(levels(training$CARAVAN)))
rfDownIntRoc

set.seed(1410)
rfUp <- train(Class ~ ., data = upSampled,
              "rf",
              trControl = ctrl,
              ntree = 1500,
              tuneLength = 5,
              metric = "ROC")
rfUp

evalResults$RFup <- predict(rfUp, evaluationInd, type = "prob")[,1]
testResults$RFup <- predict(rfUp, testingInd, type = "prob")[,1]
rfUpROC <- roc(evalResults$CARAVAN, evalResults$RFup,
               levels = rev(levels(evalResults$CARAVAN)))
rfUpROC

set.seed(1410)
rfSmote <- train(CARAVAN ~ ., data = smoted,
                 "rf",
                 trControl = ctrl,
                 ntree = 1500,
                 tuneLength = 5,
                 metric = "ROC")
rfSmote

evalResults$RFsmote <- predict(rfSmote, evaluationInd, type = "prob")[,1]
testResults$RFsmote <- predict(rfSmote, testingInd, type = "prob")[,1]
rfSmoteROC <- roc(evalResults$CARAVAN, evalResults$RFsmote,
                  levels = rev(levels(evalResults$CARAVAN)))
rfSmoteROC

rfSmoteCM <- confusionMatrix(predict(rfSmote, evaluationInd), evalResults$CARAVAN)
rfSmoteCM

samplingSummary <- function(x, evl, tst)
  {
    lvl <- rev(levels(tst$CARAVAN))
    evlROC <- roc(evl$CARAVAN,
                  predict(x, evl, type = "prob")[,1],
                  levels = lvl)
    rocs <- c(auc(evlROC),
              auc(roc(tst$CARAVAN,
                      predict(x, tst, type = "prob")[,1],
                      levels = lvl)))
    cut <- coords(evlROC, x = "best", ret="threshold",
                  best.method="closest.topleft")
    bestVals <- coords(evlROC, cut, ret=c("sensitivity", "specificity"))
    out <- c(rocs, bestVals*100)
    names(out) <- c("evROC", "tsROC", "tsSens", "tsSpec")
    out

  }

rfResults <- rbind(samplingSummary(rfFit, evaluationInd, testingInd),
                   samplingSummary(rfDown, evaluationInd, testingInd),
                   samplingSummary(rfDownInt, evaluationInd, testingInd),
                   samplingSummary(rfUp, evaluationInd, testingInd),
                   samplingSummary(rfSmote, evaluationInd, testingInd))
rownames(rfResults) <- c("Original", "Down--Sampling",  "Down--Sampling (Internal)",
                         "Up--Sampling", "SMOTE")

rfResults

rocCols <- c("black", rgb(1, 0, 0, .5), rgb(0, 0, 1, .5))

plot(roc(testResults$CARAVAN, testResults$RF, levels = rev(levels(testResults$CARAVAN))),
     type = "S", col = rocCols[1], legacy.axes = TRUE)
plot(roc(testResults$CARAVAN, testResults$RFdownInt, levels = rev(levels(testResults$CARAVAN))),
     type = "S", col = rocCols[2],add = TRUE, legacy.axes = TRUE)
plot(roc(testResults$CARAVAN, testResults$RFsmote, levels = rev(levels(testResults$CARAVAN))),
     type = "S", col = rocCols[3], add = TRUE, legacy.axes = TRUE)
legend(.6, .4,
       c("Normal", "Down-Sampling (Internal)", "SMOTE"),
       lty = rep(1, 3),
       lwd = rep(2, 3),
       cex = .8,
       col = rocCols)

xyplot(lift(CARAVAN ~ RF + RFdownInt + RFsmote,
            data = testResults),
       type = "l",
       ylab = "%Events Found",
       xlab =  "%Customers Evaluated")


################################################################################
### Section 16.8 Cost–Sensitive Training

library(kernlab)

set.seed(1157)
sigma <- sigest(CARAVAN ~ ., data = trainingInd[, noNZVSet], frac = .75)
names(sigma) <- NULL

svmGrid1 <- data.frame(sigma = sigma[2],
                       C = 2^c(2:10))

set.seed(1401)
svmFit <- train(CARAVAN ~ .,
                data = trainingInd[, noNZVSet],
                method = "svmRadial",
                tuneGrid = svmGrid1,
                preProc = c("center", "scale"),
                metric = "Kappa",
                trControl = ctrl)
svmFit

evalResults$SVM <- predict(svmFit, evaluationInd[, noNZVSet], type = "prob")[,1]
testResults$SVM <- predict(svmFit, testingInd[, noNZVSet], type = "prob")[,1]
svmROC <- roc(evalResults$CARAVAN, evalResults$SVM,
              levels = rev(levels(evalResults$CARAVAN)))
svmROC

svmTestROC <- roc(testResults$CARAVAN, testResults$SVM,
                  levels = rev(levels(testResults$CARAVAN)))
svmTestROC

confusionMatrix(predict(svmFit, evaluationInd[, noNZVSet]), evalResults$CARAVAN)

confusionMatrix(predict(svmFit, testingInd[, noNZVSet]), testingInd$CARAVAN)


set.seed(1401)
svmWtFit <- train(CARAVAN ~ .,
                  data = trainingInd[, noNZVSet],
                  method = "svmRadial",
                  tuneGrid = svmGrid1,
                  preProc = c("center", "scale"),
                  metric = "Kappa",
                  class.weights = c(insurance = 18, noinsurance = 1),
                  trControl = ctrlNoProb)
svmWtFit

svmWtEvalCM <- confusionMatrix(predict(svmWtFit, evaluationInd[, noNZVSet]), evalResults$CARAVAN)
svmWtEvalCM

svmWtTestCM <- confusionMatrix(predict(svmWtFit, testingInd[, noNZVSet]), testingInd$CARAVAN)
svmWtTestCM


initialRpart <- rpart(CARAVAN ~ ., data = training,
                      control = rpart.control(cp = 0.0001))
rpartGrid <- data.frame(cp = initialRpart$cptable[, "CP"])

cmat <- list(loss = matrix(c(0, 1, 20, 0), ncol = 2))
set.seed(1401)
cartWMod <- train(x = training[,predictors],
                  y = training$CARAVAN,
                  method = "rpart",
                  trControl = ctrlNoProb,
                  tuneGrid = rpartGrid,
                  metric = "Kappa",
                  parms = cmat)
cartWMod


library(C50)
c5Grid <- expand.grid(model = c("tree", "rules"),
                      trials = c(1, (1:10)*10),
                      winnow = FALSE)

finalCost <- matrix(c(0, 20, 1, 0), ncol = 2)
rownames(finalCost) <- colnames(finalCost) <- levels(training$CARAVAN)
set.seed(1401)
C5CostFit <- train(training[, predictors],
                   training$CARAVAN,
                   method = "C5.0",
                   metric = "Kappa",
                   tuneGrid = c5Grid,
                   cost = finalCost,
                   control = C5.0Control(earlyStopping = FALSE),
                   trControl = ctrlNoProb)

C5CostCM <- confusionMatrix(predict(C5CostFit, testing), testing$CARAVAN)
C5CostCM


################################################################################
### Session Information

sessionInfo()

q("no")
Any scripts or data that you put into this service are public.
AppliedPredictiveModeling documentation built on May 2, 2019, 6:13 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
AppliedPredictiveModeling
Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/16_Class_Imbalance.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

Try the AppliedPredictiveModeling package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AppliedPredictiveModeling Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/16_Class_Imbalance.R In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

Try the AppliedPredictiveModeling package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AppliedPredictiveModeling
Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/16_Class_Imbalance.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'