14_Class_Trees.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com) 
###
### Chapter 14 Classification Trees and Rule Based Models
###
### Required packages: AppliedPredictiveModeling, C50, caret, doMC (optional),
###                    gbm, lattice, partykit, pROC, randomForest, reshape2,
###                    rpart, RWeka
###
### Data used: The grant application data. See the file 'CreateGrantData.R'
###
### Notes: 
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be 
### syntax differences that occur over time as packages evolve. These files 
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in 
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

### NOTE: Many of the models here are computationally expensive. If
### this script is run as-is, the memory requirements will accumulate
### until it exceeds 32gb. 

################################################################################
### Section 14.1 Basic Classification Trees

library(caret)

load("grantData.RData")

ctrl <- trainControl(method = "LGOCV",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     index = list(TrainSet = pre2008),
                     savePredictions = TRUE)

set.seed(476)
rpartFit <- train(x = training[,fullSet], 
                  y = training$Class,
                  method = "rpart",
                  tuneLength = 30,
                  metric = "ROC",
                  trControl = ctrl)
rpartFit

library(partykit)
plot(as.party(rpartFit$finalModel))

rpart2008 <- merge(rpartFit$pred,  rpartFit$bestTune)
rpartCM <- confusionMatrix(rpartFit, norm = "none")
rpartCM
rpartRoc <- roc(response = rpartFit$pred$obs,
                predictor = rpartFit$pred$successful,
                levels = rev(levels(rpartFit$pred$obs)))

set.seed(476)
rpartFactorFit <- train(x = training[,factorPredictors], 
                        y = training$Class,
                        method = "rpart",
                        tuneLength = 30,
                        metric = "ROC",
                        trControl = ctrl)
rpartFactorFit 
plot(as.party(rpartFactorFit$finalModel))

rpartFactor2008 <- merge(rpartFactorFit$pred,  rpartFactorFit$bestTune)
rpartFactorCM <- confusionMatrix(rpartFactorFit, norm = "none")
rpartFactorCM

rpartFactorRoc <- roc(response = rpartFactorFit$pred$obs,
                      predictor = rpartFactorFit$pred$successful,
                      levels = rev(levels(rpartFactorFit$pred$obs)))

plot(rpartRoc, type = "s", print.thres = c(.5),
     print.thres.pch = 3,
     print.thres.pattern = "",
     print.thres.cex = 1.2,
     col = "red", legacy.axes = TRUE,
     print.thres.col = "red")
plot(rpartFactorRoc,
     type = "s",
     add = TRUE,
     print.thres = c(.5),
     print.thres.pch = 16, legacy.axes = TRUE,
     print.thres.pattern = "",
     print.thres.cex = 1.2)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))

set.seed(476)
j48FactorFit <- train(x = training[,factorPredictors], 
                      y = training$Class,
                      method = "J48",
                      metric = "ROC",
                      trControl = ctrl)
j48FactorFit

j48Factor2008 <- merge(j48FactorFit$pred,  j48FactorFit$bestTune)
j48FactorCM <- confusionMatrix(j48FactorFit, norm = "none")
j48FactorCM

j48FactorRoc <- roc(response = j48FactorFit$pred$obs,
                    predictor = j48FactorFit$pred$successful,
                    levels = rev(levels(j48FactorFit$pred$obs)))

set.seed(476)
j48Fit <- train(x = training[,fullSet], 
                y = training$Class,
                method = "J48",
                metric = "ROC",
                trControl = ctrl)

j482008 <- merge(j48Fit$pred,  j48Fit$bestTune)
j48CM <- confusionMatrix(j48Fit, norm = "none")
j48CM

j48Roc <- roc(response = j48Fit$pred$obs,
              predictor = j48Fit$pred$successful,
              levels = rev(levels(j48Fit$pred$obs)))


plot(j48FactorRoc, type = "s", print.thres = c(.5), 
     print.thres.pch = 16, print.thres.pattern = "", 
     print.thres.cex = 1.2, legacy.axes = TRUE)
plot(j48Roc, type = "s", print.thres = c(.5), 
     print.thres.pch = 3, print.thres.pattern = "", 
     print.thres.cex = 1.2, legacy.axes = TRUE,
     add = TRUE, col = "red", print.thres.col = "red")
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))

plot(rpartFactorRoc, type = "s", add = TRUE, 
     col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)

################################################################################
### Section 14.2 Rule-Based Models

set.seed(476)
partFit <- train(x = training[,fullSet], 
                 y = training$Class,
                 method = "PART",
                 metric = "ROC",
                 trControl = ctrl)
partFit

part2008 <- merge(partFit$pred,  partFit$bestTune)
partCM <- confusionMatrix(partFit, norm = "none")
partCM

partRoc <- roc(response = partFit$pred$obs,
               predictor = partFit$pred$successful,
               levels = rev(levels(partFit$pred$obs)))
partRoc

set.seed(476)
partFactorFit <- train(training[,factorPredictors], training$Class,
                       method = "PART",
                       metric = "ROC",
                       trControl = ctrl)
partFactorFit

partFactor2008 <- merge(partFactorFit$pred,  partFactorFit$bestTune)
partFactorCM <- confusionMatrix(partFactorFit, norm = "none")
partFactorCM

partFactorRoc <- roc(response = partFactorFit$pred$obs,
                     predictor = partFactorFit$pred$successful,
                     levels = rev(levels(partFactorFit$pred$obs)))
partFactorRoc

################################################################################
### Section 14.3 Bagged Trees

set.seed(476)
treebagFit <- train(x = training[,fullSet], 
                    y = training$Class,
                    method = "treebag",
                    nbagg = 50,
                    metric = "ROC",
                    trControl = ctrl)
treebagFit

treebag2008 <- merge(treebagFit$pred,  treebagFit$bestTune)
treebagCM <- confusionMatrix(treebagFit, norm = "none")
treebagCM

treebagRoc <- roc(response = treebagFit$pred$obs,
                  predictor = treebagFit$pred$successful,
                  levels = rev(levels(treebagFit$pred$obs)))
set.seed(476)
treebagFactorFit <- train(x = training[,factorPredictors], 
                          y = training$Class,
                          method = "treebag",
                          nbagg = 50,
                          metric = "ROC",
                          trControl = ctrl)
treebagFactorFit

treebagFactor2008 <- merge(treebagFactorFit$pred,  treebagFactorFit$bestTune)
treebagFactorCM <- confusionMatrix(treebagFactorFit, norm = "none")
treebagFactorCM
treebagFactorRoc <- roc(response = treebagFactorFit$pred$obs,
                        predictor = treebagFactorFit$pred$successful,
                        levels = rev(levels(treebagFactorFit$pred$obs)))


plot(rpartRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(j48FactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), 
     legacy.axes = TRUE)
plot(treebagRoc, type = "s", add = TRUE, print.thres = c(.5), 
     print.thres.pch = 3, legacy.axes = TRUE, print.thres.pattern = "", 
     print.thres.cex = 1.2,
     col = "red", print.thres.col = "red")
plot(treebagFactorRoc, type = "s", add = TRUE, print.thres = c(.5), 
     print.thres.pch = 16, print.thres.pattern = "", legacy.axes = TRUE, 
     print.thres.cex = 1.2)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))

################################################################################
### Section 14.4 Random Forests

### For the book, this model was run with only 500 trees (by
### accident). More than 1000 trees usually required to get consistent
### results.

mtryValues <- c(5, 10, 20, 32, 50, 100, 250, 500, 1000)
set.seed(476)
rfFit <- train(x = training[,fullSet], 
               y = training$Class,
               method = "rf",
               ntree = 500,
               tuneGrid = data.frame(mtry = mtryValues),
               importance = TRUE,
               metric = "ROC",
               trControl = ctrl)
rfFit

rf2008 <- merge(rfFit$pred,  rfFit$bestTune)
rfCM <- confusionMatrix(rfFit, norm = "none")
rfCM

rfRoc <- roc(response = rfFit$pred$obs,
             predictor = rfFit$pred$successful,
             levels = rev(levels(rfFit$pred$obs)))

gc()

## The randomForest package cannot handle factors with more than 32
## levels, so we make a new set of predictors where the sponsor code
## factor is entered as dummy variables instead of a single factor. 

sponsorVars <- grep("Sponsor", names(training), value = TRUE)
sponsorVars <- sponsorVars[sponsorVars != "SponsorCode"]

rfPredictors <- factorPredictors
rfPredictors <- rfPredictors[rfPredictors != "SponsorCode"]
rfPredictors <- c(rfPredictors, sponsorVars)

set.seed(476)
rfFactorFit <- train(x = training[,rfPredictors], 
                     y = training$Class,
                     method = "rf",
                     ntree = 1500,
                     tuneGrid = data.frame(mtry = mtryValues),
                     importance = TRUE,
                     metric = "ROC",
                     trControl = ctrl)
rfFactorFit

rfFactor2008 <- merge(rfFactorFit$pred,  rfFactorFit$bestTune)
rfFactorCM <- confusionMatrix(rfFactorFit, norm = "none")
rfFactorCM

rfFactorRoc <- roc(response = rfFactorFit$pred$obs,
                   predictor = rfFactorFit$pred$successful,
                   levels = rev(levels(rfFactorFit$pred$obs)))

plot(treebagRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(rpartRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(j48FactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), 
     legacy.axes = TRUE)
plot(rfRoc, type = "s", add = TRUE, print.thres = c(.5), 
     print.thres.pch = 3, legacy.axes = TRUE, print.thres.pattern = "", 
     print.thres.cex = 1.2,
     col = "red", print.thres.col = "red")
plot(rfFactorRoc, type = "s", add = TRUE, print.thres = c(.5), 
     print.thres.pch = 16, print.thres.pattern = "", legacy.axes = TRUE, 
     print.thres.cex = 1.2)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))


################################################################################
### Section 14.5 Boosting

gbmGrid <- expand.grid(interaction.depth = c(1, 3, 5, 7, 9),
                       n.trees = (1:20)*100,
                       shrinkage = c(.01, .1))

set.seed(476)
gbmFit <- train(x = training[,fullSet], 
                y = training$Class,
                method = "gbm",
                tuneGrid = gbmGrid,
                metric = "ROC",
                verbose = FALSE,
                trControl = ctrl)
gbmFit

gbmFit$pred <- merge(gbmFit$pred,  gbmFit$bestTune)
gbmCM <- confusionMatrix(gbmFit, norm = "none")
gbmCM

gbmRoc <- roc(response = gbmFit$pred$obs,
              predictor = gbmFit$pred$successful,
              levels = rev(levels(gbmFit$pred$obs)))

set.seed(476)
gbmFactorFit <- train(x = training[,factorPredictors], 
                      y = training$Class,
                      method = "gbm",
                      tuneGrid = gbmGrid,
                      verbose = FALSE,
                      metric = "ROC",
                      trControl = ctrl)
gbmFactorFit

gbmFactorFit$pred <- merge(gbmFactorFit$pred,  gbmFactorFit$bestTune)
gbmFactorCM <- confusionMatrix(gbmFactorFit, norm = "none")
gbmFactorCM

gbmFactorRoc <- roc(response = gbmFactorFit$pred$obs,
                    predictor = gbmFactorFit$pred$successful,
                    levels = rev(levels(gbmFactorFit$pred$obs)))

gbmROCRange <- extendrange(cbind(gbmFactorFit$results$ROC,gbmFit$results$ROC))

plot(gbmFactorFit, ylim = gbmROCRange, 
     auto.key = list(columns = 4, lines = TRUE))


plot(gbmFit, ylim = gbmROCRange, 
     auto.key = list(columns = 4, lines = TRUE))


plot(treebagRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(rpartRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(j48FactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(rfFactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(gbmRoc, type = "s", print.thres = c(.5), print.thres.pch = 3, 
     print.thres.pattern = "", print.thres.cex = 1.2,
     add = TRUE, col = "red", print.thres.col = "red", legacy.axes = TRUE)
plot(gbmFactorRoc, type = "s", print.thres = c(.5), print.thres.pch = 16, 
     legacy.axes = TRUE, print.thres.pattern = "", print.thres.cex = 1.2,
     add = TRUE)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))

################################################################################
### Section 14.5 C5.0

c50Grid <- expand.grid(trials = c(1:9, (1:10)*10),
                       model = c("tree", "rules"),
                       winnow = c(TRUE, FALSE))
set.seed(476)
c50FactorFit <- train(training[,factorPredictors], training$Class,
                      method = "C5.0",
                      tuneGrid = c50Grid,
                      verbose = FALSE,
                      metric = "ROC",
                      trControl = ctrl)
c50FactorFit

c50FactorFit$pred <- merge(c50FactorFit$pred,  c50FactorFit$bestTune)
c50FactorCM <- confusionMatrix(c50FactorFit, norm = "none")
c50FactorCM

c50FactorRoc <- roc(response = c50FactorFit$pred$obs,
                    predictor = c50FactorFit$pred$successful,
                    levels = rev(levels(c50FactorFit$pred$obs)))

set.seed(476)
c50Fit <- train(training[,fullSet], training$Class,
                method = "C5.0",
                tuneGrid = c50Grid,
                metric = "ROC",
                verbose = FALSE,
                trControl = ctrl)
c50Fit

c50Fit$pred <- merge(c50Fit$pred,  c50Fit$bestTune)
c50CM <- confusionMatrix(c50Fit, norm = "none")
c50CM

c50Roc <- roc(response = c50Fit$pred$obs,
              predictor = c50Fit$pred$successful,
              levels = rev(levels(c50Fit$pred$obs)))

update(plot(c50FactorFit), ylab = "ROC AUC (2008 Hold-Out Data)")


plot(treebagRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(rpartRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(j48FactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(rfFactorRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(gbmRoc, type = "s",  col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(c50Roc, type = "s", print.thres = c(.5), print.thres.pch = 3, 
     print.thres.pattern = "", print.thres.cex = 1.2,
     add = TRUE, col = "red", print.thres.col = "red", legacy.axes = TRUE)
plot(c50FactorRoc, type = "s", print.thres = c(.5), print.thres.pch = 16, 
     print.thres.pattern = "", print.thres.cex = 1.2,
     add = TRUE, legacy.axes = TRUE)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("black", "red"),
       pch = c(16, 3))

################################################################################
### Section 14.7 Comparing Two Encodings of Categorical Predictors

## Pull the hold-out results from each model and merge

rp1 <- caret:::getTrainPerf(rpartFit)
names(rp1) <- gsub("Train", "Independent", names(rp1))
rp2 <- caret:::getTrainPerf(rpartFactorFit)
rp2$Label <- "CART"
names(rp2) <- gsub("Train", "Grouped", names(rp2))
rp <- cbind(rp1, rp2)

j481 <- caret:::getTrainPerf(j48Fit)
names(j481) <- gsub("Train", "Independent", names(j481))
j482 <- caret:::getTrainPerf(j48FactorFit)
j482$Label <- "J48"
names(j482) <- gsub("Train", "Grouped", names(j482))
j48 <- cbind(j481, j482)

part1 <- caret:::getTrainPerf(partFit)
names(part1) <- gsub("Train", "Independent", names(part1))
part2 <- caret:::getTrainPerf(partFactorFit)
part2$Label <- "PART"
names(part2) <- gsub("Train", "Grouped", names(part2))
part <- cbind(part1, part2)

tb1 <- caret:::getTrainPerf(treebagFit)
names(tb1) <- gsub("Train", "Independent", names(tb1))
tb2 <- caret:::getTrainPerf(treebagFactorFit)
tb2$Label <- "Bagged Tree"
names(tb2) <- gsub("Train", "Grouped", names(tb2))
tb <- cbind(tb1, tb2)

rf1 <- caret:::getTrainPerf(rfFit)
names(rf1) <- gsub("Train", "Independent", names(rf1))
rf2 <- caret:::getTrainPerf(rfFactorFit)
rf2$Label <- "Random Forest"
names(rf2) <- gsub("Train", "Grouped", names(rf2))
rf <- cbind(rf1, rf2)

gbm1 <- caret:::getTrainPerf(gbmFit)
names(gbm1) <- gsub("Train", "Independent", names(gbm1))
gbm2 <- caret:::getTrainPerf(gbmFactorFit)
gbm2$Label <- "Boosted Tree"
names(gbm2) <- gsub("Train", "Grouped", names(gbm2))
bst <- cbind(gbm1, gbm2)


c501 <- caret:::getTrainPerf(c50Fit)
names(c501) <- gsub("Train", "Independent", names(c501))
c502 <- caret:::getTrainPerf(c50FactorFit)
c502$Label <- "C5.0"
names(c502) <- gsub("Train", "Grouped", names(c502))
c5 <- cbind(c501, c502)


trainPerf <- rbind(rp, j48, part, tb, rf, bst, c5)

library(lattice)
library(reshape2)
trainPerf <- melt(trainPerf)
trainPerf$metric <- "ROC"
trainPerf$metric[grepl("Sens", trainPerf$variable)] <- "Sensitivity"
trainPerf$metric[grepl("Spec", trainPerf$variable)] <- "Specificity"
trainPerf$model <- "Grouped"
trainPerf$model[grepl("Independent", trainPerf$variable)] <- "Independent"

trainPerf <- melt(trainPerf)
trainPerf$metric <- "ROC"
trainPerf$metric[grepl("Sens", trainPerf$variable)] <- "Sensitivity"
trainPerf$metric[grepl("Spec", trainPerf$variable)] <- "Specificity"
trainPerf$model <- "Independent"
trainPerf$model[grepl("Grouped", trainPerf$variable)] <- "Grouped"
trainPerf$Label <- factor(trainPerf$Label,
                          levels = rev(c("CART", "Cond. Trees", "J48", "Ripper",
                                         "PART", "Bagged Tree", "Random Forest", 
                                         "Boosted Tree", "C5.0")))

dotplot(Label ~ value|metric,
        data = trainPerf,
        groups = model,
        horizontal = TRUE,
        auto.key = list(columns = 2),
        between = list(x = 1),
        xlab = "")


sessionInfo()

q("no")
Any scripts or data that you put into this service are public.
AppliedPredictiveModeling documentation built on May 2, 2019, 9:22 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
AppliedPredictiveModeling
Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/14_Class_Trees.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

Try the AppliedPredictiveModeling package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AppliedPredictiveModeling Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/14_Class_Trees.R In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'

Try the AppliedPredictiveModeling package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AppliedPredictiveModeling
Functions and Data Sets for 'Applied Predictive Modeling'

inst/chapters/14_Class_Trees.R
In AppliedPredictiveModeling: Functions and Data Sets for 'Applied Predictive Modeling'