```{r}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
```{r}
library(lab4sidjac)
library(caret)
library(mlbench)
library(leaps)
```
Divide the BostonHousing data (or your own API data) into a test and training dataset using the caret package.
data("BostonHousing") BostonHousing[,-c(4,14)] <- scale(BostonHousing[,-c(4,14)]) set.seed(1) index <- createDataPartition(BostonHousing$medv, times = 1, p = 0.2)$Resample1 train_dat <- BostonHousing[-index,] test_dat <- BostonHousing[index,]
Fit a linear regression model.
```{r}
# Linear regression
set.seed(201015)
glm_fit <- caret::train(medv ~ ., data = train_dat, method = "lm")
glm_fit$finalModel
```
Fit a linear regression model with forward selection of covariates on the training data set.
```{r}
# Linear regression with forward selection
set.seed(201015)
glm_fit_forward <- caret::train(medv ~ ., data = train_dat, method = "leapForward",
                                tuneGrid = data.frame(nvmax = 1:ncol(train_dat[, -14])))
index <- glm_fit_forward$bestTune[[1]]
coef(glm_fit_forward$finalModel, index)
```
Evaluate the performance of these models on the training data set.
```{r}
# lm
glm_fit$results

# leapForward (best nvmax)
glm_fit_forward$results[index, ]
```
The linear model has an RMSE of 5.28118 and $R^2 = 0.7$. The linear model with forward selection has an RMSE of 5.279548 and $R^2 = 0.7$. The forward selection, which optimizes RMSE, keeps only 11 variables in the model and achieves a slightly lower RMSE than the simple linear model with all variables.
Fit a ridge regression model using your ridgereg() function to the training data set for different values of $\lambda$.
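For reference, `ridgereg()` is assumed here to compute the standard ridge regression estimator on the scaled covariates,

$$\hat{\beta}_{\text{ridge}} = (X^\top X + \lambda I)^{-1} X^\top y,$$

so $\lambda = 0$ reduces to ordinary least squares and larger values of $\lambda$ shrink the coefficients towards zero. To let caret tune over $\lambda$, `ridgereg()` is wrapped in a custom model list: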
```{r}
library(lab4sidjac)
library(caret)
library(mlbench)

## model list
ridgereg_model <- list(type = "Regression",
                       library = "lab4sidjac",
                       loop = NULL,
                       prob = NULL,
                       label = "Ridge Regression")

## sort
ridgereg_model$sort <- function(x) x

## parameters
ridgereg_model$parameters <- data.frame(parameter = "lambda",
                                        class = "numeric",
                                        label = "lambda")

## tuning grid: lambda from 0 to 10 in steps of 0.5
ridgereg_model$grid <- function(x, y, len = NULL, search = "grid") {
  data.frame(lambda = seq(from = 0, to = 10, by = 0.5))
}

## fit: build a formula from the column names and call ridgereg()
ridgereg_model$fit <- function(x, y, lambda, param, lev, last, classProbs, ...) {
  index <- vapply(1:ncol(x), function(i) all(y == x[, i]), logical(1))
  form <- paste0(colnames(x[!index]), collapse = " ")
  form <- gsub(" ", " + ", form)
  form <- paste0(colnames(x[index]), " ~ ", form)
  form <- as.formula(form)
  lab4sidjac::ridgereg(formula = form, data = x, lambda = param$lambda)
}

## predict
ridgereg_model$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) {
  lab4sidjac::pred(modelFit, newdata)
}
```
```{r}
set.seed(201015)
ridgereg_fit <- caret::train(x = train_dat, y = train_dat[, 14], method = ridgereg_model)
ridgereg_fit
```
The final value used for the model was $\lambda = 4$, with RMSE = 5.273424. This is a better RMSE than either of the two previous models.
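A quick way to see how stable that choice is (a small sketch using caret's standard `plot` method for `train` objects, not part of the original output) is to plot the resampled RMSE against the $\lambda$ grid:

```{r}
# Sketch: resampled RMSE over the lambda grid (plot.train from caret)
plot(ridgereg_fit, metric = "RMSE")
```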
Find the best hyperparameter value for $\lambda$ using 10-fold cross-validation on the training set.
```{r}
# One seed vector per resample (10 folds + 1 for the final model),
# 21 seeds each (one per candidate value of lambda)
seedlist <- lapply(1:11, function(x) {
  set.seed(x)
  sample.int(1e4, 21)
})

fitControl <- trainControl(method = "cv", number = 10, seeds = seedlist)

set.seed(201015)
ridgereg_fit_10_cross <- caret::train(x = train_dat, y = train_dat[, 14],
                                      method = ridgereg_model, trControl = fitControl)
ridgereg_fit_10_cross
```
The final value used for the model was $\lambda = 3.5$, with RMSE = 4.952410. This is a better RMSE than the previous ridge regression model.
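The selected value and its resampled performance can also be read directly off the `train` object; a minimal sketch, using the standard components of a caret `train` fit:

```{r}
# Sketch: best lambda and the corresponding row of the resampling results
ridgereg_fit_10_cross$bestTune
subset(ridgereg_fit_10_cross$results,
       lambda == ridgereg_fit_10_cross$bestTune$lambda)
```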
Evaluate the performance of all three models on the test data set and write some concluding comments.
```{r}
RMSE(predict(glm_fit_forward,       newdata = test_dat), test_dat$medv)
RMSE(predict(ridgereg_fit,          newdata = test_dat), test_dat$medv)
RMSE(predict(ridgereg_fit_10_cross, newdata = test_dat), test_dat$medv)
```
Of the three models, the ridge regression with all variables and $\lambda = 4$ gives the best test RMSE. The ridge regression model could possibly be fine-tuned further with forward selection to find an optimal model. There is only a small difference in test-data performance between the ridge models with and without cross-validation.
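As a concluding overview (a sketch reusing the predictions above, not part of the original output), the three test RMSE values can be collected in a single table:

```{r}
# Sketch: test-set RMSE for the three fitted models side by side
data.frame(
  model = c("lm + forward selection",
            "ridgereg (default bootstrap)",
            "ridgereg (10-fold CV)"),
  test_RMSE = c(RMSE(predict(glm_fit_forward,       newdata = test_dat), test_dat$medv),
                RMSE(predict(ridgereg_fit,          newdata = test_dat), test_dat$medv),
                RMSE(predict(ridgereg_fit_10_cross, newdata = test_dat), test_dat$medv))
)
```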