library(caret)
library(mlbench)
library(leaps)
library(lab7)
library(MASS)
theSeed <- 1337
set.seed(theSeed)
trainC <- caret::trainControl("cv")
data("BostonHousing")

Here we instantiate the training and test data with the training data being 80% and test data being 20% of the data set.

trainIndex <-
  caret::createDataPartition(BostonHousing$crim,
                             p = 0.8,
                             times = 1,
                             list = FALSE)
trainDat <- BostonHousing[trainIndex, ]
testDat <- BostonHousing[-trainIndex, ]
form <- tax ~ .

We now fit the data to the models using linear regression and linear regression with forward selection on the covariates.

linMod <- caret::train(form,
                       trainDat,
                       method = "lm",
                       trControl = trainC)
linFMod <- caret::train(form,
                        trainDat,
                        method = "leapForward",
                        trControl = trainC)

We evaluate the models with analyzing the RMSE and $R^2$ values.

linMod$results$RMSE
linFMod$results$RMSE

Analyzing the RMSE and the $R^2$ of the models entails some information. Lower RMSE value would indicate a 'tighter fit' of the data and a higher $R^2$ value indicates a measure of how well observed outcomes are replicated by the model, based on the proportion of total variation of outcomes explained by the model.

# Acknowledgement to Eric Herwin and Albin Vasterlund
ridgeMod  <- list(type = "Regression",
                    library = "lab7",
                    loop = NULL,
                    prob = NULL)


  ridgeMod$parameters <- data.frame(parameter = "lambda",
                                    class = "numeric",
                                    label = "Ridge Regression")


  ridgeMod$grid <- function (x, y, len = NULL, search = "grid"){
    data.frame(lambda = lambda)
  }

  ridgeMod$fit <- function (x, y, wts, param, lev, last, classProbs, ...) {
    dat <- if (is.data.frame(x))
      x
    else as.data.frame(x)
    dat$.outcome <- y
    out <- ridgereg(.outcome ~ ., data=dat ,lambda = param$lambda, ...)

    return(out)
  }

  ridgeMod$predict <- function (modelFit, newdata, submodels = NULL) {
    if (!is.data.frame(newdata))
      newdata <- as.data.frame(newdata)
    newdata <- scale(newdata)
    return(modelFit$pred(newdata))
  }
set.seed(theSeed)
res <- c()
for(lambda in seq(0, 20, by = 1)) {
  temp <- caret::train(form,
                       data = trainDat,
                       ridgeMod)
  res[lambda * 10] <- temp$results$RMSE
}
bestLambda <- which.min(res) / 10
bestRMSE <- res[which.min(res)]
bestLambda
bestRMSE

We see from the code above that the best $\lambda = 0.5$ with the lowest RMSE of 1320.828.

temp <- caret::train(form,
                       data = trainDat,
                       ridgeMod)
# Acknowledgement to Eric Herwin and Albin Vasterlund
set.seed(theSeed)
fold_count <- 10
lambda <- seq(10,20,by=1)
fitControl <- caret::trainControl(method = "repeatedcv",
                                  number = fold_count,
                                  ## repeated ten times
                                  repeats = fold_count)
ridgeModFit <- caret::train(form,
                         data = trainDat,
                         method = ridgeMod,
                         trControl = fitControl)
ridgeModFit

Last line says the best value for $\lambda = 17$. Which is the same as as we concluded above, though with different RMSE. The interval for $\lambda \in 10:20$ is motivated by previous larger interval, but for output niceties we chose to limit lambda.

lin <- list(RMSE = linMod$results$RMSE, RSquared = linMod$results$Rsquared)
linF <- list(RMSE = linFMod$results$RMSE[3], RSquared = linFMod$results$Rsquared[3])
ridgeM <- list(RMSE = ridgeModFit$results$RMSE[7], RSquared = ridgeModFit$results$Rsquared[7])
temp <- cbind(lin,linF,ridgeM)
colnames(temp) <- c("Lin. Reg", "Lin. Reg. F", "Ridge Reg.")
temp
# Acknowledgement to Henrik Karlsson
ridgeModFit2 <- caret::train(form,
                         data = trainDat,
                         method = ridgeMod,
                         trControl = trainC)
allModels <- list(linMod, linFMod, ridgeModFit2)
names(allModels) = c("Linear Regression", "Lin. Reg. Forward", "Ridge Reg.")

lapply(allModels, function(x) {
  postResample(predict(x, newdata=testDat), testDat$tax)
})

Looking at the Rsquared metric we see that the clear preferred model is Linear Regression with Forward Selection. Also it has the lowest RMSE. Looking at only RMSE we see that Linear Regression with Forward Selection is still the preferred model, but the Ridge Regression has surprisingly high RMSE - which we can't conclude any result from.



SimonJonsson/lab7 documentation built on May 28, 2019, 3:16 p.m.