Nothing
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 18: Measuring Predictor Importance
###
### Required packages: AppliedPredictiveModeling, caret, CORElearn, corrplot,
### pROC, minerva
###
###
### Data used: The solubility data from the AppliedPredictiveModeling
### package, the segmentation data in the caret package and the
### grant data (created using "CreateGrantData.R" in the same
### directory as this file).
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 18.1 Numeric Outcomes
## Load the solubility data
library(AppliedPredictiveModeling)
data(solubility)
trainData <- solTrainXtrans
trainData$y <- solTrainY
## keep the continuous predictors and append the outcome to the data frame
SolContPred <- solTrainXtrans[, !grepl("FP", names(solTrainXtrans))]
numSolPred <- ncol(SolContPred)
SolContPred$Sol <- solTrainY
## Get the LOESS smoother and the summary measure
library(caret)
smoother <- filterVarImp(x = SolContPred[, -ncol(SolContPred)],
y = solTrainY,
nonpara = TRUE)
smoother$Predictor <- rownames(smoother)
names(smoother)[1] <- "Smoother"
## Calculate the correlation matrices and keep the columns with the correlations
## between the predictors and the outcome
correlations <- cor(SolContPred)[-(numSolPred+1),(numSolPred+1)]
rankCorrelations <- cor(SolContPred, method = "spearman")[-(numSolPred+1),(numSolPred+1)]
corrs <- data.frame(Predictor = names(SolContPred)[1:numSolPred],
Correlation = correlations,
RankCorrelation = rankCorrelations)
## The maximal information coefficient (MIC) values can be obtained from the
### minerva package:
library(minerva)
MIC <- mine(x = SolContPred[, 1:numSolPred], y = solTrainY)$MIC
MIC <- data.frame(Predictor = rownames(MIC),
MIC = MIC[,1])
## The Relief values for regression can be computed using the CORElearn
## package:
library(CORElearn)
ReliefF <- attrEval(Sol ~ ., data = SolContPred,
estimator = "RReliefFequalK")
ReliefF <- data.frame(Predictor = names(ReliefF),
Relief = ReliefF)
## Combine them all together for a plot
contDescrScores <- merge(smoother, corrs)
contDescrScores <- merge(contDescrScores, MIC)
contDescrScores <- merge(contDescrScores, ReliefF)
rownames(contDescrScores) <- contDescrScores$Predictor
contDescrScores
contDescrSplomData <- contDescrScores
contDescrSplomData$Correlation <- abs(contDescrSplomData$Correlation)
contDescrSplomData$RankCorrelation <- abs(contDescrSplomData$RankCorrelation)
contDescrSplomData$Group <- "Other"
contDescrSplomData$Group[grepl("Surface", contDescrSplomData$Predictor)] <- "SA"
featurePlot(solTrainXtrans[, c("NumCarbon", "SurfaceArea2")],
solTrainY,
between = list(x = 1),
type = c("g", "p", "smooth"),
df = 3,
aspect = 1,
labels = c("", "Solubility"))
splom(~contDescrSplomData[,c(3, 4, 2, 5)],
groups = contDescrSplomData$Group,
varnames = c("Correlation", "Rank\nCorrelation", "LOESS", "MIC"))
## Now look at the categorical (i.e. binary) predictors
SolCatPred <- solTrainXtrans[, grepl("FP", names(solTrainXtrans))]
SolCatPred$Sol <- solTrainY
numSolCatPred <- ncol(SolCatPred) - 1
tests <- apply(SolCatPred[, 1:numSolCatPred], 2,
function(x, y)
{
tStats <- t.test(y ~ x)[c("statistic", "p.value", "estimate")]
unlist(tStats)
},
y = solTrainY)
## The results are a matrix with predictors in columns. We reverse this
tests <- as.data.frame(t(tests))
names(tests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
tests$difference <- tests$mean1 - tests$mean0
tests
## Create a volcano plot
xyplot(-log10(t.test_p.value) ~ difference,
data = tests,
xlab = "Mean With Structure - Mean Without Structure",
ylab = "-log(p-Value)",
type = "p")
################################################################################
### Section 18.2 Categorical Outcomes
## Load the segmentation data
data(segmentationData)
segTrain <- subset(segmentationData, Case == "Train")
segTrain$Case <- segTrain$Cell <- NULL
segTest <- subset(segmentationData, Case != "Train")
segTest$Case <- segTest$Cell <- NULL
## Compute the areas under the ROC curve
aucVals <- filterVarImp(x = segTrain[, -1], y = segTrain$Class)
aucVals$Predictor <- rownames(aucVals)
## Cacluate the t-tests as before but with x and y switched
segTests <- apply(segTrain[, -1], 2,
function(x, y)
{
tStats <- t.test(x ~ y)[c("statistic", "p.value", "estimate")]
unlist(tStats)
},
y = segTrain$Class)
segTests <- as.data.frame(t(segTests))
names(segTests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
segTests$Predictor <- rownames(segTests)
## Fit a random forest model and get the importance scores
library(randomForest)
set.seed(791)
rfImp <- randomForest(Class ~ ., data = segTrain,
ntree = 2000,
importance = TRUE)
rfValues <- data.frame(RF = importance(rfImp)[, "MeanDecreaseGini"],
Predictor = rownames(importance(rfImp)))
## Now compute the Relief scores
set.seed(791)
ReliefValues <- attrEval(Class ~ ., data = segTrain,
estimator="ReliefFequalK", ReliefIterations = 50)
ReliefValues <- data.frame(Relief = ReliefValues,
Predictor = names(ReliefValues))
## and the MIC statistics
set.seed(791)
segMIC <- mine(x = segTrain[, -1],
## Pass the outcome as 0/1
y = ifelse(segTrain$Class == "PS", 1, 0))$MIC
segMIC <- data.frame(Predictor = rownames(segMIC),
MIC = segMIC[,1])
rankings <- merge(segMIC, ReliefValues)
rankings <- merge(rankings, rfValues)
rankings <- merge(rankings, segTests)
rankings <- merge(rankings, aucVals)
rankings
rankings$channel <- "Channel 1"
rankings$channel[grepl("Ch2$", rankings$Predictor)] <- "Channel 2"
rankings$channel[grepl("Ch3$", rankings$Predictor)] <- "Channel 3"
rankings$channel[grepl("Ch4$", rankings$Predictor)] <- "Channel 4"
rankings$t.Statistic <- abs(rankings$t.Statistic)
splom(~rankings[, c("PS", "t.Statistic", "RF", "Relief", "MIC")],
groups = rankings$channel,
varnames = c("ROC\nAUC", "Abs\nt-Stat", "Random\nForest", "Relief", "MIC"),
auto.key = list(columns = 2))
## Load the grant data. A script to create and save these data is contained
## in the same directory as this file.
load("grantData.RData")
dataSubset <- training[pre2008, c("Sponsor62B", "ContractValueBandUnk", "RFCD240302")]
## This is a simple function to compute several statistics for binary predictors
tableCalcs <- function(x, y)
{
tab <- table(x, y)
fet <- fisher.test(tab)
out <- c(OR = fet$estimate,
P = fet$p.value,
Gain = attrEval(y ~ x, estimator = "GainRatio"))
}
## lapply() is used to execute the function on each column
tableResults <- lapply(dataSubset, tableCalcs, y = training[pre2008, "Class"])
## The results come back as a list of vectors, and "rbind" is used to join
## then together as rows of a table
tableResults <- do.call("rbind", tableResults)
tableResults
## The permuted Relief scores can be computed using a function from the
## AppliedPredictiveModeling package.
permuted <- permuteRelief(x = training[pre2008, c("Sponsor62B", "Day", "NumCI")],
y = training[pre2008, "Class"],
nperm = 500,
### the remaining options are passed to attrEval()
estimator="ReliefFequalK",
ReliefIterations= 50)
## The original Relief scores:
permuted$observed
## The number of standard deviations away from the permuted mean:
permuted$standardized
## The distributions of the scores if there were no relationship between the
## predictors and outcomes
histogram(~value|Predictor,
data = permuted$permutations,
xlim = extendrange(permuted$permutations$value),
xlab = "Relief Score")
################################################################################
### Session Information
sessionInfo()
q("no")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.