inst/chapters/12_Discriminant_Analysis.R

################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com) 
###
### Chapter 12 Discriminant Analysis and Other Linear Classification Models
###
### Required packages: AppliedPredictiveModeling, caret, doMC (optional),
###                    glmnet, lattice, MASS, pamr, plyr, pls, pROC,
###                    reshape2, sparseLDA
###
### Data used: The grant application data. See the file 'CreateGrantData.R'
###
### Notes: 
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing sections. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be 
### syntax differences that occur over time as packages evolve. These files 
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in 
### parallel. The sub-processes may reset the random number seed.
### Your results may vary slightly.
###
################################################################################

################################################################################
### Section 12.1 Case Study: Predicting Successful Grant Applications

load("grantData.RData")

library(caret)
library(doMC)
registerDoMC(12)
library(plyr)
library(reshape2)
library(pROC)   ## roc() and auc() are used below; caret does not attach pROC

## Look at two different ways to split and resample the data. A support vector
## machine is used to illustrate the differences. The full set of predictors
## is used. 

pre2008Data <- training[pre2008,]
year2008Data <- rbind(training[-pre2008,], testing)

set.seed(552)
test2008 <- createDataPartition(year2008Data$Class, p = .25)[[1]]

allData <- rbind(pre2008Data, year2008Data[-test2008,])
holdout2008 <- year2008Data[test2008,]
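
## Quick sanity check (optional; not in the book): look at the class
## frequencies in the combined training data and in the 2008 holdout.
table(allData$Class)
table(holdout2008$Class)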

## Use a common tuning grid for both approaches. 
svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002),
                        C = 2^(-3:8))

## First approach: evaluate the model using 10-fold cross-validation on the
## pre-2008 data
ctrl0 <- trainControl(method = "cv",
                      summaryFunction = twoClassSummary,
                      classProbs = TRUE)
set.seed(477)
svmFit0 <- train(pre2008Data[,fullSet], pre2008Data$Class,
                 method = "svmRadial",
                 tuneGrid = svmrGrid,
                 preProc = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl0)
svmFit0

### Second approach: tune the same model using a single 2008 test set
ctrl00 <- trainControl(method = "LGOCV",
                       summaryFunction = twoClassSummary,
                       classProbs = TRUE,
                       ## 'index' lists the rows used for model *fitting* in
                       ## each resample; this single split trains on the
                       ## pre-2008 rows and evaluates on the 2008 rows
                       index = list(TestSet = 1:nrow(pre2008Data)))


set.seed(476)
svmFit00 <- train(allData[,fullSet], allData$Class,
                  method = "svmRadial",
                  tuneGrid = svmrGrid,
                  preProc = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl00)
svmFit00

## Combine the two sets of results and plot

grid0 <- subset(svmFit0$results,  sigma == svmFit0$bestTune$sigma)
grid0$Model <- "10-Fold Cross-Validation"

grid00 <- subset(svmFit00$results,  sigma == svmFit00$bestTune$sigma)
grid00$Model <- "Single 2008 Test Set"

plotData <- rbind(grid00, grid0)

plotData <- plotData[!is.na(plotData$ROC),]
xyplot(ROC ~ C, data = plotData,
       groups = Model,
       type = c("g", "o"),
       scales = list(x = list(log = 2)),
       auto.key = list(columns = 1))
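
## Optional numeric comparison (not in the book): getTrainPerf() extracts
## each fit's estimated performance at its best tuning parameter values.
getTrainPerf(svmFit0)
getTrainPerf(svmFit00)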

################################################################################
### Section 12.2 Logistic Regression

modelFit <- glm(Class ~ Day, data = training[pre2008,], family = binomial)
dataGrid <- data.frame(Day = seq(0, 365, length = 500))
dataGrid$Linear <- 1 - predict(modelFit, dataGrid, type = "response")
linear2008 <- auc(roc(response = training[-pre2008, "Class"],
                      predictor = 1 - predict(modelFit, 
                                              training[-pre2008,], 
                                              type = "response"),
                      levels = rev(levels(training[-pre2008, "Class"]))))


modelFit2 <- glm(Class ~ Day + I(Day^2), 
                 data = training[pre2008,], 
                 family = binomial)
dataGrid$Quadratic <- 1 - predict(modelFit2, dataGrid, type = "response")
quad2008 <- auc(roc(response = training[-pre2008, "Class"],
                    predictor = 1 - predict(modelFit2, 
                                            training[-pre2008,], 
                                            type = "response"),
                    levels = rev(levels(training[-pre2008, "Class"]))))
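
## Compare the 2008 hold-out AUCs of the linear and quadratic day effects
linear2008
quad2008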

dataGrid <- melt(dataGrid, id.vars = "Day")

byDay <- training[pre2008, c("Day", "Class")]
byDay$Binned <- cut(byDay$Day, seq(0, 360, by = 5))

observedProps <- ddply(byDay, .(Binned),
                       function(x) c(n = nrow(x), mean = mean(x$Class == "successful")))
## Note: this assumes every 5-day bin is non-empty; ddply drops empty bins,
## which would misalign the midpoints below
observedProps$midpoint <- seq(2.5, 357.5, by = 5)

xyplot(value ~ Day|variable, data = dataGrid,
       ylab = "Probability of a Successful Grant",
       ylim = extendrange(0:1),
       between = list(x = 1),
       panel = function(...)
       {
         panel.xyplot(x = observedProps$midpoint, y = observedProps$mean,
                      pch = 16, col = rgb(.2, .2, .2, .5))
         panel.xyplot(..., type = "l", col = "black", lwd = 2)
       })

## For the reduced set of factors, fit the logistic regression model (linear
## and quadratic day terms) and evaluate it on the 2008 hold-out set
training$Day2 <- training$Day^2
testing$Day2 <- testing$Day^2
fullSet <- c(fullSet, "Day2")
reducedSet <- c(reducedSet, "Day2")

## This control object will be used across multiple models so that the
## data splitting is consistent

ctrl <- trainControl(method = "LGOCV",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     index = list(TrainSet = pre2008),
                     savePredictions = TRUE)
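
## Sanity check (optional): with this 'index', the single LGOCV resample
## trains on the pre-2008 rows and evaluates on the remaining 2008 rows.
length(pre2008)                   ## number of training (pre-2008) rows
nrow(training) - length(pre2008)  ## number of 2008 hold-out rows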

set.seed(476)
lrFit <- train(x = training[,reducedSet], 
               y = training$Class,
               method = "glm",
               metric = "ROC",
               trControl = ctrl)
lrFit
set.seed(476)
lrFit2 <- train(x = training[,fullSet],  ## fullSet already includes Day2 (added above)
                y = training$Class,
                method = "glm",
                metric = "ROC",
                trControl = ctrl)
lrFit2

## Keep only the predictions at the final tuning values (a formality for
## glm, which has no tuning parameters)
lrFit$pred <- merge(lrFit$pred,  lrFit$bestTune)

## Get the confusion matrices for the hold-out set
lrCM <- confusionMatrix(lrFit, norm = "none")
lrCM
lrCM2 <- confusionMatrix(lrFit2, norm = "none")
lrCM2

## Get the area under the ROC curve for the hold-out set
lrRoc <- roc(response = lrFit$pred$obs,
             predictor = lrFit$pred$successful,
             levels = rev(levels(lrFit$pred$obs)))
lrRoc2 <- roc(response = lrFit2$pred$obs,
              predictor = lrFit2$pred$successful,
              levels = rev(levels(lrFit2$pred$obs)))
lrImp <- varImp(lrFit, scale = FALSE)
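
## Optional quick look (not a figure from the book): display the top
## logistic regression importance scores
plot(lrImp, top = 10)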

plot(lrRoc, legacy.axes = TRUE)

################################################################################
### Section 12.3 Linear Discriminant Analysis

## Fit the model to the reduced set
set.seed(476)
ldaFit <- train(x = training[,reducedSet], 
                y = training$Class,
                method = "lda",
                preProc = c("center","scale"),
                metric = "ROC",
                trControl = ctrl)
ldaFit

ldaFit$pred <- merge(ldaFit$pred,  ldaFit$bestTune)
ldaCM <- confusionMatrix(ldaFit, norm = "none")
ldaCM
ldaRoc <- roc(response = ldaFit$pred$obs,
              predictor = ldaFit$pred$successful,
              levels = rev(levels(ldaFit$pred$obs)))
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, add = TRUE, type = "s", legacy.axes = TRUE)

################################################################################
### Section 12.4 Partial Least Squares Discriminant Analysis

## This model uses all of the predictors
set.seed(476)
plsFit <- train(x = training[,fullSet], 
                y = training$Class,
                method = "pls",
                tuneGrid = expand.grid(ncomp = 1:10),
                preProc = c("center","scale"),
                metric = "ROC",
                probMethod = "Bayes",
                trControl = ctrl)
plsFit

plsImpGrant <- varImp(plsFit, scale = FALSE)

bestPlsNcomp <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ncomp"]
bestPlsROC <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ROC"]
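
## The chosen number of PLS components and its associated hold-out ROC
bestPlsNcomp
bestPlsROC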

## Only keep the final tuning parameter data
plsFit$pred <- merge(plsFit$pred,  plsFit$bestTune)

plsRoc <- roc(response = plsFit$pred$obs,
              predictor = plsFit$pred$successful,
              levels = rev(levels(plsFit$pred$obs)))

### PLS confusion matrix information
plsCM <- confusionMatrix(plsFit, norm = "none")
plsCM

## Now fit a model that uses a smaller set of predictors chosen by unsupervised 
## filtering. 

set.seed(476)
plsFit2 <- train(x = training[,reducedSet], 
                 y = training$Class,
                 method = "pls",
                 tuneGrid = expand.grid(ncomp = 1:10),
                 preProc = c("center","scale"),
                 metric = "ROC",
                 probMethod = "Bayes",
                 trControl = ctrl)
plsFit2

bestPlsNcomp2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ncomp"]
bestPlsROC2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ROC"]

plsFit2$pred <- merge(plsFit2$pred,  plsFit2$bestTune)

plsRoc2 <- roc(response = plsFit2$pred$obs,
               predictor = plsFit2$pred$successful,
               levels = rev(levels(plsFit2$pred$obs)))
plsCM2 <- confusionMatrix(plsFit2, norm = "none")
plsCM2

pls.ROC <- cbind(plsFit$results, Descriptors = "Full Set")
pls2.ROC <- cbind(plsFit2$results, Descriptors = "Reduced Set")

plsCompareROC <- data.frame(rbind(pls.ROC, pls2.ROC))

xyplot(ROC ~ ncomp,
       data = plsCompareROC,
       xlab = "# Components",
       ylab = "ROC (2008 Hold-Out Data)",
       auto.key = list(columns = 2),
       groups = Descriptors,
       type = c("o", "g"))

## Plot ROC curves and variable importance scores
plot(ldaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(plsRoc2, type = "s", add = TRUE, legacy.axes = TRUE)

plot(plsImpGrant, top = 20, scales = list(y = list(cex = .95)))

################################################################################
### Section 12.5 Penalized Models

## The glmnet model
glmnGrid <- expand.grid(alpha = c(0,  .1,  .2, .4, .6, .8, 1),
                        lambda = seq(.01, .2, length = 40))
set.seed(476)
glmnFit <- train(x = training[,fullSet], 
                 y = training$Class,
                 method = "glmnet",
                 tuneGrid = glmnGrid,
                 preProc = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl)
glmnFit

glmnet2008 <- merge(glmnFit$pred,  glmnFit$bestTune)
glmnetCM <- confusionMatrix(glmnFit, norm = "none")
glmnetCM

glmnetRoc <- roc(response = glmnet2008$obs,
                 predictor = glmnet2008$successful,
                 levels = rev(levels(glmnet2008$obs)))
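
## Optional sketch (not in the book): inspect how many coefficients the
## elastic net retained at the chosen penalty value.
glmnCoef <- coef(glmnFit$finalModel, s = glmnFit$bestTune$lambda)
sum(glmnCoef != 0)   ## non-zero coefficients, including the intercept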

glmnFit0 <- glmnFit
glmnFit0$results$lambda <- format(round(glmnFit0$results$lambda, 3))

glmnPlot <- plot(glmnFit0,
                 plotType = "level",
                 cuts = 15,
                 scales = list(x = list(rot = 90, cex = .65)))

update(glmnPlot,
       ylab = "Mixing Percentage\nRidge <---------> Lasso",
       sub = "",
       main = "Area Under the ROC Curve",
       xlab = "Amount of Regularization")

plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, legacy.axes = TRUE)

## Sparse LDA (via the sparseLDA package)

set.seed(476)
spLDAFit <- train(x = training[,fullSet], 
                  y = training$Class,
                  "sparseLDA",
                  tuneGrid = expand.grid(lambda = c(.1),
                                         NumVars = c(1:20, 50, 75, 100, 250, 500, 750, 1000)),
                  preProc = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl)
spLDAFit

spLDA2008 <- merge(spLDAFit$pred,  spLDAFit$bestTune)
spLDACM <- confusionMatrix(spLDAFit, norm = "none")
spLDACM

spLDARoc <- roc(response = spLDA2008$obs,
                predictor = spLDA2008$successful,
                levels = rev(levels(spLDA2008$obs)))
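
## Optional: list the predictors retained by the sparse LDA model, using
## caret's predictors() generic
predictors(spLDAFit)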

update(plot(spLDAFit, scales = list(x = list(log = 10))),
       ylab = "ROC AUC (2008 Hold-Out Data)")

plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(spLDARoc, type = "s", add = TRUE, legacy.axes = TRUE)

################################################################################
### Section 12.6 Nearest Shrunken Centroids

set.seed(476)
nscFit <- train(x = training[,fullSet], 
                y = training$Class,
                method = "pam",
                preProc = c("center", "scale"),
                tuneGrid = data.frame(threshold = seq(0, 25, length = 30)),
                metric = "ROC",
                trControl = ctrl)
nscFit

nsc2008 <- merge(nscFit$pred,  nscFit$bestTune)
nscCM <- confusionMatrix(nscFit, norm = "none")
nscCM
nscRoc <- roc(response = nsc2008$obs,
              predictor = nsc2008$successful,
              levels = rev(levels(nsc2008$obs)))
update(plot(nscFit), ylab = "ROC AUC (2008 Hold-Out Data)")
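
## Optional: the chosen shrinkage threshold and, if supported for this model
## type, the predictors that survive it (via caret's predictors() generic)
nscFit$bestTune
predictors(nscFit)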


plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(spLDARoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(nscRoc, type = "s", add = TRUE, legacy.axes = TRUE)

sessionInfo()

q("no")
