Nothing
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 6: Linear Regression and Its Cousins
###
### Required packages: AppliedPredictiveModeling, lattice, corrplot, pls,
### elasticnet,
###
### Data used: The solubility from the AppliedPredictiveModeling package
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 6.1 Case Study: Quantitative Structure- Activity
### Relationship Modeling
library(AppliedPredictiveModeling)
data(solubility)
library(lattice)
### Some initial plots of the data
xyplot(solTrainY ~ solTrainX$MolWeight, type = c("p", "g"),
ylab = "Solubility (log)",
main = "(a)",
xlab = "Molecular Weight")
xyplot(solTrainY ~ solTrainX$NumRotBonds, type = c("p", "g"),
ylab = "Solubility (log)",
xlab = "Number of Rotatable Bonds")
bwplot(solTrainY ~ ifelse(solTrainX[,100] == 1,
"structure present",
"structure absent"),
ylab = "Solubility (log)",
main = "(b)",
horizontal = FALSE)
### Find the columns that are not fingerprints (i.e. the continuous
### predictors). grep will return a list of integers corresponding to
### column names that contain the pattern "FP".
notFingerprints <- grep("FP", names(solTrainXtrans))
library(caret)
featurePlot(solTrainXtrans[, -notFingerprints],
solTrainY,
between = list(x = 1, y = 1),
type = c("g", "p", "smooth"),
labels = rep("", 2))
library(corrplot)
### We used the full namespace to call this function because the pls
### package (also used in this chapter) has a function with the same
### name.
corrplot::corrplot(cor(solTrainXtrans[, -notFingerprints]),
order = "hclust",
tl.cex = .8)
################################################################################
### Section 6.2 Linear Regression
### Create a control function that will be used across models. We
### create the fold assignments explicitly instead of relying on the
### random number seed being set to identical values.
set.seed(100)
indx <- createFolds(solTrainY, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)
### Linear regression model with all of the predictors. This will
### produce some warnings that a 'rank-deficient fit may be
### misleading'. This is related to the predictors being so highly
### correlated that some of the math has broken down.
set.seed(100)
lmTune0 <- train(x = solTrainXtrans, y = solTrainY,
method = "lm",
trControl = ctrl)
lmTune0
### And another using a set of predictors reduced by unsupervised
### filtering. We apply a filter to reduce extreme between-predictor
### correlations. Note the lack of warnings.
tooHigh <- findCorrelation(cor(solTrainXtrans), .9)
trainXfiltered <- solTrainXtrans[, -tooHigh]
testXfiltered <- solTestXtrans[, -tooHigh]
set.seed(100)
lmTune <- train(x = trainXfiltered, y = solTrainY,
method = "lm",
trControl = ctrl)
lmTune
### Save the test set results in a data frame
testResults <- data.frame(obs = solTestY,
Linear_Regression = predict(lmTune, testXfiltered))
################################################################################
### Section 6.3 Partial Least Squares
## Run PLS and PCR on solubility data and compare results
set.seed(100)
plsTune <- train(x = solTrainXtrans, y = solTrainY,
method = "pls",
tuneGrid = expand.grid(ncomp = 1:20),
trControl = ctrl)
plsTune
testResults$PLS <- predict(plsTune, solTestXtrans)
set.seed(100)
pcrTune <- train(x = solTrainXtrans, y = solTrainY,
method = "pcr",
tuneGrid = expand.grid(ncomp = 1:35),
trControl = ctrl)
pcrTune
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"
plsPlotData <- rbind(plsResamples, pcrResamples)
xyplot(RMSE ~ ncomp,
data = plsPlotData,
#aspect = 1,
xlab = "# Components",
ylab = "RMSE (Cross-Validation)",
auto.key = list(columns = 2),
groups = Model,
type = c("o", "g"))
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 25, scales = list(y = list(cex = .95)))
################################################################################
### Section 6.4 Penalized Models
## The text used the elasticnet to obtain a ridge regression model.
## There is now a simple ridge regression method.
ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 15))
set.seed(100)
ridgeTune <- train(x = solTrainXtrans, y = solTrainY,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
preProc = c("center", "scale"))
ridgeTune
print(update(plot(ridgeTune), xlab = "Penalty"))
enetGrid <- expand.grid(lambda = c(0, 0.01, .1),
fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = solTrainXtrans, y = solTrainY,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl,
preProc = c("center", "scale"))
enetTune
plot(enetTune)
testResults$Enet <- predict(enetTune, solTestXtrans)
################################################################################
### Session Information
sessionInfo()
q("no")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.