################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 3: Data Pre-Processing
###
### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot
###
### Data used: The (unprocessed) cell segmentation data from the
### AppliedPredictiveModeling package.
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is is
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################
################################################################################
### Section 3.1 Case Study: Cell Segmentation in High-Content Screening
library(AppliedPredictiveModeling)
data(segmentationOriginal)
## Retain the original training set
segTrain <- subset(segmentationOriginal, Case == "Train")
## Remove the first three columns (identifier columns)
segTrainX <- segTrain[, -(1:3)]
segTrainClass <- segTrain$Class
################################################################################
### Section 3.2 Data Transformations for Individual Predictors
## The column VarIntenCh3 measures the standard deviation of the intensity
## of the pixels in the actin filaments
max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3)
library(e1071)
skewness(segTrainX$VarIntenCh3)
library(caret)
## Use caret's preProcess function to transform for skewness
segPP <- preProcess(segTrainX, method = "BoxCox")
## Apply the transformations
segTrainTrans <- predict(segPP, segTrainX)
## Results for a single predictor
segPP$bc$VarIntenCh3
histogram(~segTrainX$VarIntenCh3,
xlab = "Natural Units",
type = "count")
histogram(~log(segTrainX$VarIntenCh3),
xlab = "Log Units",
ylab = " ",
type = "count")
segPP$bc$PerimCh1
histogram(~segTrainX$PerimCh1,
xlab = "Natural Units",
type = "count")
histogram(~segTrainTrans$PerimCh1,
xlab = "Transformed Data",
ylab = " ",
type = "count")
################################################################################
### Section 3.3 Data Transformations for Multiple Predictors
## R's prcomp is used to conduct PCA
pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1,
data = segTrainTrans,
scale. = TRUE)
transparentTheme(pchSize = .7, trans = .3)
xyplot(AvgIntenCh1 ~ EntropyIntenCh1,
data = segTrainTrans,
groups = segTrain$Class,
xlab = "Channel 1 Fiber Width",
ylab = "Intensity Entropy Channel 1",
auto.key = list(columns = 2),
type = c("p", "g"),
main = "Original Data",
aspect = 1)
xyplot(PC2 ~ PC1,
data = as.data.frame(pr$x),
groups = segTrain$Class,
xlab = "Principal Component #1",
ylab = "Principal Component #2",
main = "Transformed",
xlim = extendrange(pr$x),
ylim = extendrange(pr$x),
type = c("p", "g"),
aspect = 1)
## Apply PCA to the entire set of predictors.
## There are a few predictors with only a single value, so we remove these first
## (since PCA uses variances, which would be zero)
isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1)
segTrainX <- segTrainX[, !isZV]
segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale"))
segTrainTrans <- predict(segPP, segTrainX)
segPCA <- prcomp(segTrainTrans, center = TRUE, scale. = TRUE)
## Plot a scatterplot matrix of the first three components
transparentTheme(pchSize = .8, trans = .3)
panelRange <- extendrange(segPCA$x[, 1:3])
splom(as.data.frame(segPCA$x[, 1:3]),
groups = segTrainClass,
type = c("p", "g"),
as.table = TRUE,
auto.key = list(columns = 2),
prepanel.limits = function(x) panelRange)
## Format the rotation values for plotting
segRot <- as.data.frame(segPCA$rotation[, 1:3])
## Derive the channel variable
vars <- rownames(segPCA$rotation)
channel <- rep(NA, length(vars))
channel[grepl("Ch1$", vars)] <- "Channel 1"
channel[grepl("Ch2$", vars)] <- "Channel 2"
channel[grepl("Ch3$", vars)] <- "Channel 3"
channel[grepl("Ch4$", vars)] <- "Channel 4"
segRot$Channel <- channel
segRot <- segRot[complete.cases(segRot),]
segRot$Channel <- factor(as.character(segRot$Channel))
## Plot a scatterplot matrix of the first three rotation variables
transparentTheme(pchSize = .8, trans = .7)
panelRange <- extendrange(segRot[, 1:3])
library(ellipse)
upperp <- function(...)
{
args <- list(...)
circ1 <- ellipse(diag(rep(1, 2)), t = .1)
panel.xyplot(circ1[,1], circ1[,2],
type = "l",
lty = trellis.par.get("reference.line")$lty,
col = trellis.par.get("reference.line")$col,
lwd = trellis.par.get("reference.line")$lwd)
circ2 <- ellipse(diag(rep(1, 2)), t = .2)
panel.xyplot(circ2[,1], circ2[,2],
type = "l",
lty = trellis.par.get("reference.line")$lty,
col = trellis.par.get("reference.line")$col,
lwd = trellis.par.get("reference.line")$lwd)
circ3 <- ellipse(diag(rep(1, 2)), t = .3)
panel.xyplot(circ3[,1], circ3[,2],
type = "l",
lty = trellis.par.get("reference.line")$lty,
col = trellis.par.get("reference.line")$col,
lwd = trellis.par.get("reference.line")$lwd)
panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts)
}
splom(~segRot[, 1:3],
groups = segRot$Channel,
lower.panel = function(...){}, upper.panel = upperp,
prepanel.limits = function(x) panelRange,
auto.key = list(columns = 2))
################################################################################
### Section 3.5 Removing Variables
## To filter on correlations, we first get the correlation matrix for the
## predictor set
segCorr <- cor(segTrainTrans)
library(corrplot)
corrplot(segCorr, order = "hclust", tl.cex = .35)
## caret's findCorrelation function is used to identify columns to remove.
highCorr <- findCorrelation(segCorr, .75)
################################################################################
### Section 3.8 Computing (Creating Dummy Variables)
data(cars)
type <- c("convertible", "coupe", "hatchback", "sedan", "wagon")
cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)]))
carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)]
head(carSubset)
levels(carSubset$Type)
simpleMod <- dummyVars(~Mileage + Type,
data = carSubset,
## Remove the variable name from the
## column name
levelsOnly = TRUE)
simpleMod
withInteraction <- dummyVars(~Mileage + Type + Mileage:Type,
data = carSubset,
levelsOnly = TRUE)
withInteraction
predict(withInteraction, head(carSubset))
################################################################################
### Session Information
sessionInfo()
q("no")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.