# Modelling toolkit used throughout (train, trainControl, resamples, ...)
library(caret)
# Load the FuelEconomy data sets shipped with jrPred
data("FuelEconomy", package = "jrPred")
# Fix the random seed so the resampling below is reproducible
set.seed(27)

# Cross-validation

# Baseline: fit a linear model and measure its error on the training data.
data("FuelEconomy", package = "jrPred")
mLM <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm",
  data = cars2010
)
# Residuals of the fitted model on the data it was trained on
res <- resid(mLM)
# Root mean squared error; the outer parentheses print the value as well
(trainRMSE <- sqrt(mean(res^2)))
# Resampling specification: 10-fold cross validation
tcKFOLD <- trainControl(method = "cv", number = 10)
# Refit the same linear model, now evaluated with that specification
mLMKFOLD <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm",
  data = cars2010,
  trControl = tcKFOLD
)
# Print the training-set RMSE followed by the cross-validated performance
trainRMSE
getTrainPerf(mLMKFOLD)
# Timing that train() recorded for the complete fit
mLMKFOLD$times$everything
# Fold counts whose cross-validation estimates are compared
fold_counts <- c(2, 5, 10, 15, 20)

# One resampling specification per fold count
tc2 <- trainControl(method = "cv", number = 2)
tc5 <- trainControl(method = "cv", number = 5)
tc10 <- trainControl(method = "cv", number = 10)
tc15 <- trainControl(method = "cv", number = 15)
tc20 <- trainControl(method = "cv", number = 20)

# Fit the same linear model under each specification.
# The fits are kept in increasing fold order so the random number stream
# is consumed in a fixed sequence.
mLM2 <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm", data = cars2010, trControl = tc2
)
mLM5 <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm", data = cars2010, trControl = tc5
)
mLM10 <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm", data = cars2010, trControl = tc10
)
mLM15 <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm", data = cars2010, trControl = tc15
)
mLM20 <- train(
  FE ~ EngDispl + NumCyl + NumGears,
  method = "lm", data = cars2010, trControl = tc20
)

# Gather fold count, first timing entry and RMSE estimate in one table;
# the outer parentheses print the table as well as storing it
fold_fits <- list(mLM2, mLM5, mLM10, mLM15, mLM20)
(info <- data.frame(
  "Folds" = fold_counts,
  "Time" = vapply(
    fold_fits,
    function(fit) fit$times$everything[[1]],
    numeric(1)
  ),
  "Estimate" = vapply(
    fold_fits,
    function(fit) fit$results$RMSE,
    numeric(1)
  )
))
# More folds means more model fits, so the computation takes longer.
# That is negligible for a model this small, but worth keeping in mind
# for more complicated models.
# The estimates go down as the number of folds increases: each fit is
# trained on a larger proportion of the data, so we expect a better model
# for every held-out fold.
# 10-fold cross validation, reused by the models fitted on all predictors
tc <- trainControl(method = "cv", number = 10)

# Ordinary linear regression on every predictor, centred and scaled first
mreg <- train(
  FE ~ .,
  data = cars2010,
  method = "lm",
  preProcess = c("center", "scale"),
  trControl = tc
)

# Ridge regression with the same predictors, preprocessing and resampling
mridge <- train(
  FE ~ .,
  data = cars2010,
  method = "ridge",
  preProcess = c("center", "scale"),
  trControl = tc
)

# Compare the linear-regression and ridge coefficient estimates in one plot.
creg <- coefficients(mreg$finalModel)
cridge <- predict(mridge$finalModel, mode = "fraction",
                  s = 1, type = "coefficients")

# coefficients() on the lm fit starts with "(Intercept)", whereas the enet
# coefficients cover the predictors only. Drop the intercept before plotting
# so that position i refers to the same predictor in both series and the
# intercept does not stretch the y-axis.
creg_slopes <- creg[names(creg) != "(Intercept)"]

# Size the y-axis for both series so no ridge point falls outside the plot
plot(creg_slopes, pch = 19, ylab = "Coefficient",
     ylim = range(creg_slopes, cridge$coefficients, na.rm = TRUE))
points(cridge$coefficients, col = 2, pch = 19)

# Collect the resampling results of both models under short labels ...
model_resamples <- resamples(list(reg = mreg, ridge = mridge))
# ... and draw their RMSE distributions as a dot plot
dotplot(model_resamples, metric = "RMSE")
# Lasso on every predictor, centred and scaled, using the shared CV setup
mlasso <- train(
  FE ~ .,
  data = cars2010,
  method = "lasso",
  preProcess = c("center", "scale"),
  trControl = tc
)

# Elastic net with the same predictors, preprocessing and resampling
menet <- train(
  FE ~ .,
  data = cars2010,
  method = "enet",
  preProcess = c("center", "scale"),
  trControl = tc
)


# jr-packages/jrPred documentation built on May 6, 2019, 7:17 a.m.