knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(lab4sidjac)
library(caret)
library(mlbench)
library(leaps)

1.2.1

Divide the BostonHousing data (or your own API data) into a test and training dataset using the caret package.

data("BostonHousing")
BostonHousing[,-c(4,14)] <- scale(BostonHousing[,-c(4,14)])
set.seed(1)
index <- createDataPartition(BostonHousing$medv, times = 1, p = 0.2)$Resample1
train_dat <- BostonHousing[-index,]
test_dat <- BostonHousing[index,]

1.2.2

Fit a linear regression model.

# Linear regression (ordinary least squares)
set.seed(201015)
glm_fit <- caret::train(medv ~ ., data = train_dat, method = "lm")

glm_fit$finalModel

Fit a linear regression model with forward selection of covariates on the training data set.

# Linear regression with forward selection (leaps via caret), tuning the number of covariates
set.seed(201015)
glm_fit_forward <- caret::train(medv ~ ., data = train_dat, method = "leapForward",
                                tuneGrid = data.frame(nvmax = 1:ncol(train_dat[, -14])))
# Number of covariates in the best submodel found by the forward search
nvmax_best <- glm_fit_forward$bestTune[[1]]
coef(glm_fit_forward$finalModel, nvmax_best)

1.2.3

Evaluate the performance of these models on the training data set.

# lm
glm_fit$results
# leapForward (row for the selected nvmax)
glm_fit_forward$results[nvmax_best, ]

The linear model has an RMSE of 5.28118 and $R^2 = 0.7$. The linear model with forward selection has an RMSE of 5.279548 and $R^2 = 0.7$. Forward selection, tuned on RMSE, keeps only 11 covariates and still achieves a slightly lower RMSE than the full linear model.
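
Which terms the forward search keeps can be read off the final regsubsets object directly; this is just the coefficient extraction from above reduced to the names (intercept included):

names(coef(glm_fit_forward$finalModel, nvmax_best))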

1.2.4

Fit a ridge regression model using your ridgereg() function to the training data set for different values of $\lambda$.
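
For reference, and assuming lab4sidjac::ridgereg() follows the textbook ridge estimator (the standard definition, not checked against the package source), the coefficients for a given $\lambda$ minimise the penalised residual sum of squares and have the closed form

$$\hat{\beta}^{\text{ridge}} = (X^\top X + \lambda I)^{-1} X^\top y.$$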

## Custom caret model specification for lab4sidjac::ridgereg()
ridgereg_model <- list(type = "Regression",
                       library = "lab4sidjac",
                       loop = NULL,
                       prob = NULL,
                       label = "Ridge Regression")
## sort: keep the tuning grid in the order it was given
ridgereg_model$sort <- function(x) x

## tuning parameter: a single penalty, lambda
ridgereg_model$parameters <- data.frame(parameter = 'lambda',
                                        class = 'numeric',
                                        label = 'lambda')
## default tuning grid: lambda from 0 to 10 in steps of 0.5
ridgereg_model$grid <- function(x, y, len = NULL, search = "grid"){
  data.frame(lambda = seq(from = 0, to = 10, by = 0.5))
}
## fit: rebuild a formula from the columns of x (the response column is the one
## identical to y) and fit ridgereg() with the current value of lambda
ridgereg_model$fit <- function(x, y, wts, param, lev, last, classProbs, ...){
  is_response <- vapply(1:ncol(x), function(i) all(y == x[, i]), logical(1))
  form <- paste0(colnames(x[!is_response]), collapse = " + ")
  form <- paste0(colnames(x[is_response]), " ~ ", form)
  form <- as.formula(form)
  lab4sidjac::ridgereg(formula = form, data = x, lambda = param$lambda)
}
## predict: delegate to the package's prediction function
ridgereg_model$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL){
  lab4sidjac::pred(modelFit, newdata)
}
set.seed(201015)
# Default resampling (bootstrap) over the lambda grid defined in the model's grid function
ridgereg_fit <- caret::train(x = train_dat, y = train_dat[, 14], method = ridgereg_model)
ridgereg_fit

The final value used for the model was $\lambda = 4$, with RMSE = 5.273424; this is a lower RMSE than either of the two previous models achieved.
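
How the resampled RMSE varies over the lambda grid can be inspected with the standard plot method for caret train objects (a quick visual check, not required by the assignment):

plot(ridgereg_fit, metric = "RMSE")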

1.2.5

Find the best hyperparameter value for $\lambda$ using 10-fold cross-validation on the training set.

# Reproducible seeds for the resampling: one vector per fold (10) plus one final
# element; each fold vector needs at least as many seeds as there are candidate
# lambda values (21)
seedlist <- lapply(1:11, function(x){
  set.seed(x)
  sample.int(1e4, 21)
})

fitControl <- trainControl(method = 'cv',
                           number = 10,
                           seeds = seedlist)

set.seed(201015)
ridgereg_fit_10_cross <- caret::train(x = train_dat, y = train_dat[, 14],
                                      method = ridgereg_model, trControl = fitControl)
ridgereg_fit_10_cross

The final value used for the model was $\lambda = 3.5$, with a cross-validated RMSE of 4.952410; this is a lower resampled RMSE than the previous ridge regression model, which was tuned with the default bootstrap resampling.
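
The fold-level estimates behind this summary are stored on the train object itself (standard caret fields, shown only as a quick check):

ridgereg_fit_10_cross$bestTune
ridgereg_fit_10_cross$resample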

1.2.6

Evaluate the performance of all three models on the test data set and write some concluding comments.

# Forward-selection linear model
RMSE(predict(glm_fit_forward, newdata = test_dat), test_dat$medv)
# Ridge regression (bootstrap-tuned, lambda = 4)
RMSE(predict(ridgereg_fit, newdata = test_dat), test_dat$medv)
# Ridge regression (10-fold CV-tuned, lambda = 3.5)
RMSE(predict(ridgereg_fit_10_cross, newdata = test_dat), test_dat$medv)

The best test RMSE of the three models is achieved by the ridge regression with all variables and $\lambda = 4$. The ridge model could possibly be fine-tuned further with forward selection of covariates to find an optimal model. There is only a small difference in test-set performance between the ridge models tuned with and without cross-validation.
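
For a compact side-by-side view, the test-set RMSEs can be collected into a single named vector; this is a minimal sketch that reuses the fitted objects above and also includes the plain linear model, which is not printed in the chunk above:

test_rmse <- c(
  lm         = RMSE(predict(glm_fit, newdata = test_dat), test_dat$medv),
  forward    = RMSE(predict(glm_fit_forward, newdata = test_dat), test_dat$medv),
  ridge      = RMSE(predict(ridgereg_fit, newdata = test_dat), test_dat$medv),
  ridge_10cv = RMSE(predict(ridgereg_fit_10_cross, newdata = test_dat), test_dat$medv)
)
sort(test_rmse)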


