demo/carAucPenSplSVM.R
In HRW: Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

########## R script: carAucPenSplSVM ##########

# For fitting and evaluating a penalized 
# spline support vector machines for classification 
# based on the car auction from the Kaggle platform
# (www.kaggle.com) ``Don't Get  Kicked!'' competition
# (held during 2011-2012).

# Last changed: 19 JUN 2017

# Set main variables and seed:

indctrInds <- c(3,6,10,13,29,30,32,37,40,41,43)
contnsInds <- c(9,31,39,47)
set.seed(1)
lambda <- 44.7

# Load required packages:

library(LowRankQP) ; library(HRW)

# Load in original training data:

data(carAuction)

# Set long names vector:

longPredNames <- c("purchased in 2010","auctioned by Adesa","auctioned by Manheim",
                   "2003 vehicle","2004 vehicle","2005 vehicle","2006 vehicle",
                   "2007 vehicle","age at sale","Chevrolet","Ford","Dodge","Chrysler",
                   "trim is Bas","trim is LS","trim is SE","sub-model is 4DSEDAN",
                   "sub-model is 4DESANSL","sub-model is 4DSESANSE",
                   "coloured silver","coloured white","coloured blue",
                   "coloured grey","coloured black","coloured red",
                   "coloured gold","coloured orange","manual transmission",
                   "alloy wheels","covered wheels","odometer reading",
                   "made in U.S.A.","made in Asia (not Japan or South Korea)",
                   "truck","medium-sized","sports utility vehicle","compact","van",
                   "acquisition price","purchased in Texas","purchased in Florida",
                   "purchased in California","purchased in North Carolina",
                   "purchased in Arizona","purchased in Colorado",
                   "purchased in South Carolina","acquisition cost",
                   "online sale","warranty cost")

# Create new training data such that the numbers
# of "good buy" cars and "bad buy" cars are equal.

nOrig <- nrow(carAuction)
indsBadBuy <- (1:nOrig)[carAuction[,2] == 1]
indsGoodBuy <- setdiff(1:nOrig,indsBadBuy)
indsGoodBuySub <- sample(indsGoodBuy,length(indsBadBuy),replace = FALSE)
allData <- carAuction[c(indsGoodBuySub,indsBadBuy),]

# Break off a random subset of size 1000 to use for estimating
# test error, with remaining data used for training.

testInds <- sample(1:nrow(allData),1000,replace = FALSE)
trainInds <- setdiff(1:nrow(allData),testInds)
testData <-  allData[testInds,]
trainData <- allData[trainInds,]

# Set response variable (y = 1 if bad buy and y = -1 if good buy) 
# and the the matrix containing all possibly predictors

y <- 2*trainData[,2] - 1
predsAll <-  trainData[,-c(1,2)]
n <- length(y)  
numIndctr <- length(indctrInds)   
numContns <- length(contnsInds)

# Obtain vector of names of chosen predictors:

namesVec <- longPredNames[c(indctrInds,contnsInds)]

# Obtain X matrix in original units:

Xorig <- cbind(rep(1,n),predsAll[,indctrInds],predsAll[,contnsInds])

# Transform all non-intercept columns to the unit interval
# for penalized spline support vector machine fitting:

origMins <- apply(Xorig,2,min) ; origMaxs <- apply(Xorig,2,max) 
X <- matrix(NA,nrow(Xorig),ncol(Xorig))
X[,1] <- 1
for (j in 2:ncol(Xorig))
   X[,j] <- (Xorig[,j] - origMins[j])/(origMaxs[j] - origMins[j])
ncX <- ncol(X)

intKnots <- vector("list",numContns) 
reBlockInds <-  vector("list",numContns) 
currReBlockStt <- 1
numIntKnots <- c(5,rep(15,3))
Z <- NULL
for (ell in 1:numContns)
{
   j <- ell + numIndctr
   xCurr <- X[,j]
   intKnotsCurr <- quantile(unique(xCurr),
                            seq(0,1,length = numIntKnots[ell]+2)
                            [-c(1,numIntKnots[ell]+2)])
   Z <- cbind(Z,ZOSull(xCurr,c(0,1),intKnotsCurr))
   intKnots[[ell]] <- intKnotsCurr
   reBlockInds[[ell]] <- currReBlockStt:ncol(Z)
   currReBlockStt <- ncol(Z) + 1
}

Ztilde <- Z*(1/sqrt(2*lambda))
yX <- as.matrix(y*X) ; yZtilde <- as.matrix(y*Ztilde)

LowRankQPobj <- LowRankQP(yZtilde,rep(-1,n),t(yX),rep(0,ncX),
                          rep(1,n),niter = 500)

# Obtain estimated beta and u vectors:

betaHat <- as.vector(LowRankQPobj$beta) 
uHat <-  as.vector(crossprod(yZtilde,LowRankQPobj$alpha))

# Plot slice of each additive model component with  
# all other variables set at their mean:

ng <- 201
xg <- seq(0,1,length = ng)
meansVec <- apply(X[,-1],2,mean)
Xmeang <- rep(1,ng)
for (j in 1:(numIndctr+numContns))
   Xmeang <- cbind(Xmeang,rep(meansVec[j],ng))

Zmeang <- NULL
for (ell in 1:numContns)
   Zmeang <- cbind(Zmeang,ZOSull(rep(meansVec[numIndctr+ell],ng),
                                     c(0,1),intKnots[[ell]]))

fitg <- vector("list",(numIndctr+numContns))
for (j in 1:(numIndctr+numContns))
{
   Xgcurr <- Xmeang
   Xgcurr[,(j + 1)] <- xg
   Zgcurr <- Zmeang
   if (j>numIndctr) 
   {
      ell <- j - numIndctr      
      Zgcurr[,reBlockInds[[ell]]] <- ZOSull(xg,c(0,1),intKnots[[ell]])
   }
   fitg[[j]] <- Xgcurr%*%betaHat + Zgcurr%*%uHat
}

# Obtain plot showing additive components of the penalized
# spline support vector machine:

par(mfrow = c(5,3),mai = c(0.25,0.25,0.1,0.03))
ylimVal <- c(-0.5,0.5)
legendCexVal <- 1.3
subInds <- sample(1:nrow(X),500,replace = FALSE)
for (j in 1:(numIndctr + numContns))
{
   xgOrigCurr <- origMins[j + 1] + xg*(origMaxs[j + 1] - origMins[j + 1])
   xlimVal <- range(xgOrigCurr)
   if (j ==15) {xlimVal <- c(2500,15000) ; xgOrigCurr[xgOrigCurr < 2500] <- NA}
   plot(0,0,type = "n",xlim = xlimVal,ylim = ylimVal,bty = "l",
        xlab = "",ylab = "")
   lines(xgOrigCurr,fitg[[j]],col = "darkgreen",lwd = 2)
   xCurr <- origMins[j + 1] + X[subInds,(j + 1)]*(origMaxs[j + 1] - origMins[j + 1])
   rug(jitter(xCurr,factor = 0.2),col = "dodgerblue",quiet = TRUE)
   abline(h = 0,col = "slateblue")
   legend("topright",legend = namesVec[j],text.col = "indianred3",cex = legendCexVal)
}

# Obtain confusion matrix and estimate the test error:

yTest <- testData[,2]
predsAllTest <-  testData[,-c(1,2)]

# Obtain X matrix in original units:

XorigTest <- cbind(rep(1,nrow(testData)),predsAllTest[,indctrInds],
                   predsAllTest[,contnsInds])
Xtest <- matrix(NA,nrow(XorigTest),ncol(XorigTest))
Xtest[,1] <- 1
for (j in 2:ncol(XorigTest))
   Xtest[,j] <- (XorigTest[,j] - origMins[j])/(origMaxs[j] - origMins[j])

Ztest <- NULL
for (ell in 1:numContns)
{
   j <- ell + numIndctr
   Ztest <- cbind(Ztest,ZOSull(Xtest[,j],c(0,1),intKnots[[ell]]))
}
ZtestTilde <- Ztest*(1/sqrt(2*lambda))

# Obtain classifications based on test data:

fHatSVM <- Xtest%*%betaHat + ZtestTilde%*%uHat
yHat <- sign(fHatSVM)

# Obtain and print out the confusion matrix and test error:

confMat <- as.matrix(table(yTest,yHat))

names(dimnames(confMat)) <- rep("",2)
dimnames(confMat)[[1]] <- c("actual good buy","actual bad buy")
dimnames(confMat)[[2]] <- c("classified good buy","classified bad buy")

cat("\n")
cat("The confusion matrix is:\n")
print(confMat)

cat("\n")
misClassError <- round(100*(sum(confMat) - sum(diag(confMat)))/sum(confMat),3)

# Obtain and estimate misclassifcation rate:

misClassRate <- (100*(sum(confMat) - sum(diag(confMat)))/sum(confMat))

cat("The estimated misclassification rate is: ",misClassRate,"%\n\n",sep = "")

############ End of carAucPenSplSVM ############

Any scripts or data that you put into this service are public.

HRW documentation built on Nov. 23, 2021, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

HRW
Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

demo/carAucPenSplSVM.R
In HRW: Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

Try the HRW package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HRW Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

demo/carAucPenSplSVM.R In HRW: Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

Try the HRW package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HRW
Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)

demo/carAucPenSplSVM.R
In HRW: Datasets, Functions and Scripts for Semiparametric Regression Supporting Harezlak, Ruppert & Wand (2018)