################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 18: Measuring Predictor Importance
###
### Required packages: AppliedPredictiveModeling, caret, CORElearn, corrplot,
### pROC, minerva
###
###
### Data used: The solubility data from the AppliedPredictiveModeling
### package, the segmentation data in the caret package and the
### grant data (created using "CreateGrantData.R" in the same
### directory as this file).
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 18.1 Numeric Outcomes
## Load the solubility data
library(AppliedPredictiveModeling)
data(solubility)
trainData <- solTrainXtrans
trainData$y <- solTrainY
## keep the continuous predictors and append the outcome to the data frame
SolContPred <- solTrainXtrans[, !grepl("FP", names(solTrainXtrans))]
numSolPred <- ncol(SolContPred)
SolContPred$Sol <- solTrainY
## Get the LOESS smoother and the summary measure
library(caret)
smoother <- filterVarImp(x = SolContPred[, -ncol(SolContPred)],
y = solTrainY,
nonpara = TRUE)
smoother$Predictor <- rownames(smoother)
names(smoother)[1] <- "Smoother"
## Calculate the correlation matrices and keep the columns with the correlations
## between the predictors and the outcome
correlations <- cor(SolContPred)[-(numSolPred+1),(numSolPred+1)]
rankCorrelations <- cor(SolContPred, method = "spearman")[-(numSolPred+1),(numSolPred+1)]
corrs <- data.frame(Predictor = names(SolContPred)[1:numSolPred],
Correlation = correlations,
RankCorrelation = rankCorrelations)
## The maximal information coefficient (MIC) values can be obtained from the
### minerva package:
library(minerva)
MIC <- mine(x = SolContPred[, 1:numSolPred], y = solTrainY)$MIC
MIC <- data.frame(Predictor = rownames(MIC),
MIC = MIC[,1])
## The Relief values for regression can be computed using the CORElearn
## package:
library(CORElearn)
ReliefF <- attrEval(Sol ~ ., data = SolContPred,
estimator = "RReliefFequalK")
ReliefF <- data.frame(Predictor = names(ReliefF),
Relief = ReliefF)
## Combine them all together for a plot
contDescrScores <- merge(smoother, corrs)
contDescrScores <- merge(contDescrScores, MIC)
contDescrScores <- merge(contDescrScores, ReliefF)
rownames(contDescrScores) <- contDescrScores$Predictor
contDescrScores
contDescrSplomData <- contDescrScores
contDescrSplomData$Correlation <- abs(contDescrSplomData$Correlation)
contDescrSplomData$RankCorrelation <- abs(contDescrSplomData$RankCorrelation)
contDescrSplomData$Group <- "Other"
contDescrSplomData$Group[grepl("Surface", contDescrSplomData$Predictor)] <- "SA"
featurePlot(solTrainXtrans[, c("NumCarbon", "SurfaceArea2")],
solTrainY,
between = list(x = 1),
type = c("g", "p", "smooth"),
df = 3,
aspect = 1,
labels = c("", "Solubility"))
splom(~contDescrSplomData[,c(3, 4, 2, 5)],
groups = contDescrSplomData$Group,
varnames = c("Correlation", "Rank\nCorrelation", "LOESS", "MIC"))
## Now look at the categorical (i.e. binary) predictors
SolCatPred <- solTrainXtrans[, grepl("FP", names(solTrainXtrans))]
SolCatPred$Sol <- solTrainY
numSolCatPred <- ncol(SolCatPred) - 1
tests <- apply(SolCatPred[, 1:numSolCatPred], 2,
function(x, y)
{
tStats <- t.test(y ~ x)[c("statistic", "p.value", "estimate")]
unlist(tStats)
},
y = solTrainY)
## The results are a matrix with predictors in columns. We reverse this
tests <- as.data.frame(t(tests))
names(tests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
tests$difference <- tests$mean1 - tests$mean0
tests
## Create a volcano plot
xyplot(-log10(t.test_p.value) ~ difference,
data = tests,
xlab = "Mean With Structure - Mean Without Structure",
ylab = "-log(p-Value)",
type = "p")
################################################################################
### Section 18.2 Categorical Outcomes
## Load the segmentation data
data(segmentationData)
segTrain <- subset(segmentationData, Case == "Train")
segTrain$Case <- segTrain$Cell <- NULL
segTest <- subset(segmentationData, Case != "Train")
segTest$Case <- segTest$Cell <- NULL
## Compute the areas under the ROC curve
aucVals <- filterVarImp(x = segTrain[, -1], y = segTrain$Class)
aucVals$Predictor <- rownames(aucVals)
## Cacluate the t-tests as before but with x and y switched
segTests <- apply(segTrain[, -1], 2,
function(x, y)
{
tStats <- t.test(x ~ y)[c("statistic", "p.value", "estimate")]
unlist(tStats)
},
y = segTrain$Class)
segTests <- as.data.frame(t(segTests))
names(segTests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
segTests$Predictor <- rownames(segTests)
## Fit a random forest model and get the importance scores
library(randomForest)
set.seed(791)
rfImp <- randomForest(Class ~ ., data = segTrain,
ntree = 2000,
importance = TRUE)
rfValues <- data.frame(RF = importance(rfImp)[, "MeanDecreaseGini"],
Predictor = rownames(importance(rfImp)))
## Now compute the Relief scores
set.seed(791)
ReliefValues <- attrEval(Class ~ ., data = segTrain,
estimator="ReliefFequalK", ReliefIterations = 50)
ReliefValues <- data.frame(Relief = ReliefValues,
Predictor = names(ReliefValues))
## and the MIC statistics
set.seed(791)
segMIC <- mine(x = segTrain[, -1],
## Pass the outcome as 0/1
y = ifelse(segTrain$Class == "PS", 1, 0))$MIC
segMIC <- data.frame(Predictor = rownames(segMIC),
MIC = segMIC[,1])
rankings <- merge(segMIC, ReliefValues)
rankings <- merge(rankings, rfValues)
rankings <- merge(rankings, segTests)
rankings <- merge(rankings, aucVals)
rankings
rankings$channel <- "Channel 1"
rankings$channel[grepl("Ch2$", rankings$Predictor)] <- "Channel 2"
rankings$channel[grepl("Ch3$", rankings$Predictor)] <- "Channel 3"
rankings$channel[grepl("Ch4$", rankings$Predictor)] <- "Channel 4"
rankings$t.Statistic <- abs(rankings$t.Statistic)
splom(~rankings[, c("PS", "t.Statistic", "RF", "Relief", "MIC")],
groups = rankings$channel,
varnames = c("ROC\nAUC", "Abs\nt-Stat", "Random\nForest", "Relief", "MIC"),
auto.key = list(columns = 2))
## Load the grant data. A script to create and save these data is contained
## in the same directory as this file.
load("grantData.RData")
dataSubset <- training[pre2008, c("Sponsor62B", "ContractValueBandUnk", "RFCD240302")]
## This is a simple function to compute several statistics for binary predictors
tableCalcs <- function(x, y)
{
tab <- table(x, y)
fet <- fisher.test(tab)
out <- c(OR = fet$estimate,
P = fet$p.value,
Gain = attrEval(y ~ x, estimator = "GainRatio"))
}
## lapply() is used to execute the function on each column
tableResults <- lapply(dataSubset, tableCalcs, y = training[pre2008, "Class"])
## The results come back as a list of vectors, and "rbind" is used to join
## then together as rows of a table
tableResults <- do.call("rbind", tableResults)
tableResults
## The permuted Relief scores can be computed using a function from the
## AppliedPredictiveModeling package.
permuted <- permuteRelief(x = training[pre2008, c("Sponsor62B", "Day", "NumCI")],
y = training[pre2008, "Class"],
nperm = 500,
### the remaining options are passed to attrEval()
estimator="ReliefFequalK",
ReliefIterations= 50)
## The original Relief scores:
permuted$observed
## The number of standard deviations away from the permuted mean:
permuted$standardized
## The distributions of the scores if there were no relationship between the
## predictors and outcomes
histogram(~value|Predictor,
data = permuted$permutations,
xlim = extendrange(permuted$permutations$value),
xlab = "Relief Score")
################################################################################
### Session Information
sessionInfo()
q("no")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.