```{r}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
The bonusLab package can be installed from GitHub with devtools:
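```{r, eval = FALSE}
# Install bonusLab from GitHub (requires the devtools package)
devtools::install_github("Marbr987/bonus_lab")
```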
Then load bonusLab along with the other packages and data required for this vignette.
```{r}
library(bonusLab)
library(mlbench)
library(caret)
```
## Data preparation
```{r}
data("BostonHousing")
data <- BostonHousing
```
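A quick look at the structure of the data (BostonHousing contains 506 observations of 14 variables, with medv as the response):

```{r}
# Inspect the Boston housing data: 506 observations of 14 variables,
# with the median home value (medv) as the response
str(data)
```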
First of all, the BostonHousing dataset is divided into a training set and a test set.
```{r}
set.seed(222)
# Hold out roughly 20% of the observations as a test set
train_indices <- createDataPartition(data[, 1], p = 0.8)[[1]]
data_train <- data[train_indices, ]
data_test <- data[-train_indices, ]
```
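As an optional sanity check, the sizes of the two sets can be inspected:

```{r}
# Roughly 80% of the 506 rows should land in the training set
nrow(data_train)
nrow(data_test)
```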
Model 1 is an ordinary linear regression of medv on all other variables, fitted with linreg():

```{r}
model_1 <- linreg(medv ~ ., data_train)
model_1
```
For comparison, forward subset selection (caret's "leapForward" method) shows how the resampled RMSE changes with the number of selected variables:

```{r}
set.seed(222)
# Forward subset selection over 1 to 13 variables
forward_selection <- caret::train(
  medv ~ .,
  data = data_train,
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:(ncol(data) - 1))
)
forward_selection$results
```
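The subset size with the best resampled performance can be read off directly (bestTune is standard caret functionality):

```{r}
# Number of variables selected by the resampling procedure
forward_selection$bestTune
```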
Performance of model 1:
```{r}
y_train_pred_1 <- pred(model_1)
y_test_pred_1 <- pred(model_1, data = data_test)
# Root mean squared error on the training and test sets
train_rmse <- sqrt(sum((y_train_pred_1 - data_train$medv)^2) / nrow(data_train))
test_rmse <- sqrt(sum((y_test_pred_1 - data_test$medv)^2) / nrow(data_test))
print(paste("Train RMSE is:", train_rmse, "; Test RMSE is:", test_rmse))
```
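Since the same RMSE computation recurs for every model below, a small helper function could reduce the repetition (a sketch; rmse() is hypothetical and not part of bonusLab):

```{r}
# Hypothetical helper, not part of bonusLab: root mean squared error
rmse <- function(predicted, observed) {
  sqrt(mean((predicted - observed)^2))
}

# Equivalent to the explicit computation above, e.g.:
# rmse(y_train_pred_1, data_train$medv)
```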
For model 2, we fit ridge regressions with hand-picked values of the hyperparameter lambda. Trying values between lambda = 1 and lambda = 2.8, we found that lambda = 2.3 gave the lowest training RMSE; the fits at the two endpoints are shown below.

Model 2(a):
```{r}
model_2_a <- ridgereg(medv ~ ., data_train, lambda = 1)
y_train_pred_2_a <- pred(model_2_a)
y_test_pred_2_a <- pred(model_2_a, data = data_test)
train_rmse_2_a <- sqrt(sum((y_train_pred_2_a - data_train$medv)^2) / nrow(data_train))
test_rmse_2_a <- sqrt(sum((y_test_pred_2_a - data_test$medv)^2) / nrow(data_test))
print(paste("Train RMSE is:", train_rmse_2_a, "; Test RMSE is:", test_rmse_2_a))
```
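For reference, assuming ridgereg() implements the standard ridge estimator, the coefficients have the closed-form solution

$$\hat{\beta}^{\text{ridge}} = (X^{\top}X + \lambda I)^{-1} X^{\top} y,$$

so larger values of lambda shrink the coefficients more strongly toward zero.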
Model 2(b):
```{r}
model_2_b <- ridgereg(medv ~ ., data_train, lambda = 2.8)
y_train_pred_2_b <- pred(model_2_b)
y_test_pred_2_b <- pred(model_2_b, data = data_test)
train_rmse_2_b <- sqrt(sum((y_train_pred_2_b - data_train$medv)^2) / nrow(data_train))
test_rmse_2_b <- sqrt(sum((y_test_pred_2_b - data_test$medv)^2) / nrow(data_test))
print(paste("Train RMSE is:", train_rmse_2_b, "; Test RMSE is:", test_rmse_2_b))
```
For model 3, we tune the hyperparameter lambda with 10-fold cross-validation (repeated 10 times) and then refit the model with the selected value.
```{r}
set.seed(222)

# Custom caret model specification wrapping bonusLab's ridgereg()
model_list <- list(
  library = "bonusLab",
  type = "Regression",
  parameters = data.frame(
    parameter = "lambda",
    class = "numeric",
    label = "lambda"
  ),
  fit = function(x, y, wts, param, lev, last, weights, classProbs, ...) {
    x <- if (is.data.frame(x)) x else as.data.frame(x)
    if (!is.data.frame(y)) {
      y <- as.data.frame(y)
      colnames(y) <- "medv"
    }
    ridgereg(
      as.formula(paste0(colnames(y), "~ .")),
      data = cbind(y, x),
      lambda = param$lambda
    )
  },
  grid = function(x, y, len = NULL, search = "grid") {
    data.frame(lambda = seq(0.1, 5, 0.1))
  },
  predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    pred(modelFit, data = as.data.frame(newdata))
  },
  prob = NULL,
  sort = NULL,
  loop = NULL
)

# Tune lambda over a grid with 10-fold cross-validation, repeated 10 times
cv_lambda_selection <- caret::train(
  medv ~ .,
  data = data_train,
  method = model_list,
  tuneGrid = data.frame(lambda = seq(0.1, 5, 0.1)),
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 10
  )
)
cv_lambda_selection$bestTune
```
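To see how the resampled RMSE varies across the lambda grid, caret's plot method for train objects can be used:

```{r}
# Resampled RMSE as a function of lambda
plot(cv_lambda_selection)
```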
Model 3:
```{r}
# Refit with the value of lambda selected by the cross-validation above
model_3 <- ridgereg(medv ~ ., data_train, lambda = 5)
y_train_pred_3 <- pred(model_3)
y_test_pred_3 <- pred(model_3, data = data_test)
train_rmse_3 <- sqrt(sum((y_train_pred_3 - data_train$medv)^2) / nrow(data_train))
test_rmse_3 <- sqrt(sum((y_test_pred_3 - data_test$medv)^2) / nrow(data_test))
print(paste("Train RMSE is:", train_rmse_3, "; Test RMSE is:", test_rmse_3))
```
Finally, we compare the test RMSE of the three models to evaluate which one performs best.
```{r}
# Test RMSE of each model
print(test_rmse)      # model 1: linear regression
print(test_rmse_2_b)  # model 2(b): ridge regression, lambda = 2.8
print(test_rmse_3)    # model 3: ridge regression, lambda tuned by CV
```
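As a convenience, the test RMSEs of all fitted models, including model 2(a), can also be collected into one data frame for an at-a-glance comparison:

```{r}
# Side-by-side comparison of all fitted models
data.frame(
  model = c("linreg", "ridgereg (lambda = 1)",
            "ridgereg (lambda = 2.8)", "ridgereg (lambda tuned)"),
  test_RMSE = c(test_rmse, test_rmse_2_a, test_rmse_2_b, test_rmse_3)
)
```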