inst/chapters/19_Feature_Select.R

################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com) 
###
### Chapter 19: An Introduction to Feature Selection
###
### Required packages: AppliedPredictiveModeling, caret, MASS, corrplot,
###                    RColorBrewer, randomForest, kernlab, klaR, pROC,
###                    doMC (optional, for parallel processing)
###
### Data used: The Alzheimer disease data from the AppliedPredictiveModeling 
###            package
###
### Notes: 
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be 
### syntax differences that occur over time as packages evolve. These files 
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in 
### parallel. The sub-processes may reset the random number seed.
### Your results may vary slightly.
###
################################################################################



################################################################################
### Section 19.6 Case Study: Predicting Cognitive Impairment


library(AppliedPredictiveModeling)
data(AlzheimerDisease)

## The baseline set of predictors
bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male")

## The set of new assays
newAssays <- colnames(predictors)
newAssays <- newAssays[!(newAssays %in% c("Class", bl))]

## Decompose the genotype factor into binary dummy variables

predictors$E2 <- predictors$E3 <- predictors$E4 <- 0
predictors$E2[grepl("2", predictors$Genotype)] <- 1
predictors$E3[grepl("3", predictors$Genotype)] <- 1
predictors$E4[grepl("4", predictors$Genotype)] <- 1
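## A quick, optional sanity check that the indicator columns line up with
## the genotype factor:
head(predictors[, c("Genotype", "E2", "E3", "E4")])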
genotype <- predictors$Genotype

## Partition the data
library(caret)
set.seed(730)
split <- createDataPartition(diagnosis, p = .8, list = FALSE)

adData <- predictors
adData$Class <- diagnosis

training <- adData[ split, ]
testing  <- adData[-split, ]

predVars <- names(adData)[!(names(adData) %in% c("Class",  "Genotype"))]

## This summary function is used to evaluate the models.
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))
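
## A small, self-contained illustration of the statistics that fiveStats()
## reports (toy values, not from the study): ROC, Sens and Spec from
## twoClassSummary() plus Accuracy and Kappa from defaultSummary().
toyDat <- data.frame(obs  = factor(c("Impaired", "Impaired", "Control", "Control"),
                                   levels = c("Impaired", "Control")),
                     pred = factor(c("Impaired", "Control", "Control", "Control"),
                                   levels = c("Impaired", "Control")),
                     Impaired = c(.9, .4, .2, .1),
                     Control  = c(.1, .6, .8, .9))
fiveStats(toyDat, lev = levels(toyDat$obs))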

## We create the cross-validation files as a list to use with different 
## functions

set.seed(104)
index <- createMultiFolds(training$Class, times = 5)

## The candidate subset sizes (numbers of retained predictors) to evaluate
varSeq <- seq(1, length(predVars)-1, by = 2)

## We can also use parallel processing to run each resampled RFE
## iteration (or resampled model with train()) using different
## workers.

library(doMC)
registerDoMC(15)
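
## Note: doMC relies on forking and is not available on Windows. A minimal
## cross-platform sketch using the doParallel package instead (an
## alternative, not what was used for the book):
## library(doParallel)
## cl <- makeCluster(15)
## registerDoParallel(cl)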


## The rfe() function in the caret package is used for recursive feature
## elimination. We set up control functions for rfe() and train() that use
## the same cross-validation folds. The 'ctrl' object will be modified
## several times as we try different models.

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv",
                         repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

## The correlation matrix of the new assay predictors
predCor <- cor(training[, newAssays])

library(RColorBrewer)
cols <- c(rev(brewer.pal(7, "Blues")),
          brewer.pal(7, "Reds"))
library(corrplot)
corrplot(predCor,
         order = "hclust",
         tl.pos = "n",addgrid.col = rgb(1,1,1,.01),
         col = colorRampPalette(cols)(51))

## Fit a series of models with the full set of predictors
set.seed(721)
rfFull <- train(training[, predVars],
                training$Class,
                method = "rf",
                metric = "ROC",
                tuneGrid = data.frame(mtry = floor(sqrt(length(predVars)))),
                ntree = 1000,
                trControl = fullCtrl)
rfFull

set.seed(721)
ldaFull <- train(training[, predVars],
                 training$Class,
                 method = "lda",
                 metric = "ROC",
                 ## The 'tol' argument helps lda() decide when a matrix is
                 ## singular. One of the predictors has values very close to
                 ## zero, so we lower the value well below the default of
                 ## 1.0e-4.
                 tol = 1.0e-12,
                 trControl = fullCtrl)
ldaFull

set.seed(721)
svmFull <- train(training[, predVars],
                 training$Class,
                 method = "svmRadial",
                 metric = "ROC",
                 tuneLength = 12,
                 preProc = c("center", "scale"),
                 trControl = fullCtrl)
svmFull

set.seed(721)
nbFull <- train(training[, predVars],
                training$Class,
                method = "nb",
                metric = "ROC",
                trControl = fullCtrl)
nbFull

set.seed(721)
lrFull <- train(training[, predVars],
                training$Class,
                method = "glm",
                metric = "ROC",
                trControl = fullCtrl)
lrFull

set.seed(721)
knnFull <- train(training[, predVars],
                 training$Class,
                 method = "knn",
                 metric = "ROC",
                 tuneLength = 20,
                 preProc = c("center", "scale"),
                 trControl = fullCtrl)
knnFull

## Now fit the RFE versions. To do this, the 'functions' element of the
## control object is set to the appropriate list of helper functions. For
## more details about these functions and their arguments, see
##
##   http://caret.r-forge.r-project.org/featureSelection.html




ctrl$functions <- rfFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
rfRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             ntree = 1000,
             rfeControl = ctrl)
rfRFE
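
## The predictors retained in the final subset can be listed with
## predictors():
predictors(rfRFE)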

ctrl$functions <- ldaFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
ldaRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              tol = 1.0e-12,
              rfeControl = ctrl)
ldaRFE

ctrl$functions <- nbFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
nbRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             rfeControl = ctrl)
nbRFE

## Here, the caretFuncs list allows for a model to be tuned at each iteration
## of feature selection.

ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats

## This option tells train() to run its model tuning
## sequentially. Otherwise, there would be parallel processing at two
## levels, which is possible but requires W^2 workers. On our machine,
## it was more efficient to only run the RFE process in parallel.

cvCtrl <- trainControl(method = "cv",
                       verboseIter = FALSE,
                       classProbs = TRUE,
                       allowParallel = FALSE)

set.seed(721)
svmRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              rfeControl = ctrl,
              metric = "ROC",
              ## Now arguments to train() are used.
              method = "svmRadial",
              tuneLength = 12,
              preProc = c("center", "scale"),
              trControl = cvCtrl)
svmRFE

ctrl$functions <- lrFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
lrRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             rfeControl = ctrl)
lrRFE

ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
knnRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              method = "knn",
              tuneLength = 20,
              preProc = c("center", "scale"),
              trControl = cvCtrl,
              rfeControl = ctrl)
knnRFE

## Each of these models can be evaluated using the plot() function to see
## the performance profile across subset sizes. For example:
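
## (plot() on an rfe object produces a lattice plot of the resampled
## performance profile)
plot(rfRFE,  type = c("g", "o"))
plot(svmRFE, type = c("g", "o"))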

## Test set ROC results (the roc() function is in the pROC package):
library(pROC)
rfROCfull <- roc(testing$Class,
                 predict(rfFull, testing[,predVars], type = "prob")[,1])
rfROCfull
rfROCrfe <- roc(testing$Class,
                predict(rfRFE, testing[,predVars])$Impaired)
rfROCrfe

ldaROCfull <- roc(testing$Class,
                  predict(ldaFull, testing[,predVars], type = "prob")[,1])
ldaROCfull
ldaROCrfe <- roc(testing$Class,
                 predict(ldaRFE, testing[,predVars])$Impaired)
ldaROCrfe

nbROCfull <- roc(testing$Class,
                  predict(nbFull, testing[,predVars], type = "prob")[,1])
nbROCfull
nbROCrfe <- roc(testing$Class,
                 predict(nbRFE, testing[,predVars])$Impaired)
nbROCrfe

svmROCfull <- roc(testing$Class,
                  predict(svmFull, testing[,predVars], type = "prob")[,1])
svmROCfull
svmROCrfe <- roc(testing$Class,
                 predict(svmRFE, testing[,predVars])$Impaired)
svmROCrfe

lrROCfull <- roc(testing$Class,
                  predict(lrFull, testing[,predVars], type = "prob")[,1])
lrROCfull
lrROCrfe <- roc(testing$Class,
                 predict(lrRFE, testing[,predVars])$Impaired)
lrROCrfe

knnROCfull <- roc(testing$Class,
                  predict(knnFull, testing[,predVars], type = "prob")[,1])
knnROCfull
knnROCrfe <- roc(testing$Class,
                 predict(knnRFE, testing[,predVars])$Impaired)
knnROCrfe


## For filter methods, the sbf() function (named for Selection By Filter) is
## used. It has similar arguments to rfe() to control the model fitting and
## filtering methods. 

## P-values are created for filtering. 

## A set of four LDA models are fit based on two factors: p-value adjustment 
## using a Bonferroni adjustment and whether the predictors should be 
## pre-screened for high correlations. 

## Compute a summary (by default, the mean) of the number of predictors
## retained across resamples
sbfResamp <- function(x, fun = mean)
{
  x <- unlist(lapply(x$variables, length))
  fun(x)
}

## Compute the test set area under the ROC curve for an sbf model
sbfROC <- function(mod) auc(roc(testing$Class, predict(mod, testing)$Impaired))

## This function calculates p-values using a t-test when the predictor has
## more than two distinct values and Fisher's exact test otherwise.

pScore <- function(x, y)
  {
    numX <- length(unique(x))
    if(numX > 2)
      {
       out <- t.test(x ~ y)$p.value
      } else {
       out <- fisher.test(factor(x), y)$p.value
      }
    out
  }
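
## For example, tau is continuous (t-test) while male is binary (Fisher's
## exact test):
pScore(training$tau, training$Class)
pScore(training$male, training$Class)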
ldaWithPvalues <- ldaSBF
ldaWithPvalues$score <- pScore
ldaWithPvalues$summary <- fiveStats

## Predictors are retained if their p-value is less than the completely 
## subjective cut-off of 0.05.

ldaWithPvalues$filter <- function (score, x, y)
{
  keepers <- score <= 0.05
  keepers
}
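
## As a quick illustrative check (outside of the resampling workflow), the
## number of predictors passing the unadjusted cut-off on the training set
## can be computed directly:
pValues <- sapply(training[, predVars], pScore, y = training$Class)
sum(pValues <= 0.05)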

sbfCtrl <- sbfControl(method = "repeatedcv",
                      repeats = 5,
                      verbose = TRUE,
                      functions = ldaWithPvalues,
                      index = index)

rawCorr <- sbf(training[, predVars],
               training$Class,
               tol = 1.0e-12,
               sbfControl = sbfCtrl)
rawCorr

ldaWithPvalues$filter <- function (score, x, y)
{
  score <- p.adjust(score,  "bonferroni")
  keepers <- score <= 0.05
  keepers
}
sbfCtrl <- sbfControl(method = "repeatedcv",
                      repeats = 5,
                      verbose = TRUE,
                      functions = ldaWithPvalues,
                      index = index)

adjCorr <- sbf(training[, predVars],
               training$Class,
               tol = 1.0e-12,
               sbfControl = sbfCtrl)
adjCorr

ldaWithPvalues$filter <- function (score, x, y)
{
  keepers <- score <= 0.05
  corrMat <- cor(x[,keepers])
  tooHigh <- findCorrelation(corrMat, .75)
  if(length(tooHigh) > 0) keepers[tooHigh] <- FALSE
  keepers
}
sbfCtrl <- sbfControl(method = "repeatedcv",
                      repeats = 5,
                      verbose = TRUE,
                      functions = ldaWithPvalues,
                      index = index)

rawNoCorr <- sbf(training[, predVars],
                 training$Class,
                 tol = 1.0e-12,
                 sbfControl = sbfCtrl)
rawNoCorr

ldaWithPvalues$filter <- function (score, x, y)
{
  score <- p.adjust(score,  "bonferroni")
  keepers <- score <= 0.05
  corrMat <- cor(x[,keepers])
  tooHigh <- findCorrelation(corrMat, .75)
  if(length(tooHigh) > 0) keepers[tooHigh] <- FALSE
  keepers
}
sbfCtrl <- sbfControl(method = "repeatedcv",
                      repeats = 5,
                      verbose = TRUE,
                      functions = ldaWithPvalues,
                      index = index)

adjNoCorr <- sbf(training[, predVars],
                 training$Class,
                 tol = 1.0e-12,
                 sbfControl = sbfCtrl)
adjNoCorr

## Filter methods test set ROC results:

sbfROC(rawCorr)
sbfROC(rawNoCorr)
sbfROC(adjCorr)
sbfROC(adjNoCorr)
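
## The average number of predictors retained during resampling can be
## obtained with sbfResamp(), defined above (assuming the sbf objects store
## the per-resample variable lists in their 'variables' element):
sbfResamp(rawCorr)
sbfResamp(rawNoCorr)
sbfResamp(adjCorr)
sbfResamp(adjNoCorr)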

## Get the resampling results for all the models

rfeResamples <- resamples(list(RF = rfRFE,
                               "Logistic Reg." = lrRFE,
                               "SVM" = svmRFE,
                               "$K$--NN" = knnRFE,
                               "N. Bayes" = nbRFE,
                               "LDA" = ldaRFE))
summary(rfeResamples)

fullResamples <- resamples(list(RF = rfFull,
                                "Logistic Reg." = lrFull,
                                "SVM" = svmFull,
                                "$K$--NN" = knnFull,
                                "N. Bayes" = nbFull,
                                "LDA" = ldaFull))
summary(fullResamples)

filteredResamples <- resamples(list("No Adjustment, Corr Vars" = rawCorr,
                                    "No Adjustment, No Corr Vars" = rawNoCorr,
                                    "Bonferroni, Corr Vars" = adjCorr,
                                    "Bonferroni, No Corr Vars" = adjNoCorr))
summary(filteredResamples)
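
## The resampling distributions can also be visualized with the lattice
## methods for resamples objects, e.g.:
bwplot(rfeResamples, metric = "ROC")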

sessionInfo()
