knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE
)

Load the required packages and the data used for the predictions below.

library(ridgereg)
library(caret)
library(leaps)
Boston <- MASS::Boston

Introduction

This vignette, called ridgereg, demonstrates a simple prediction problem solved with our own ridgereg() function.
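For reference, ridgereg() is assumed here to compute the standard ridge regression estimator (the usual textbook definition, not taken from the package documentation):

$$\hat{\beta}_{\mathrm{ridge}} = (X^\top X + \lambda I)^{-1} X^\top y$$

where lambda >= 0 controls the amount of shrinkage and lambda = 0 reduces to ordinary least squares.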

1. Divide the Boston housing data into training and test datasets

The Boston housing data from the MASS package is divided into training and test sets: 75% of randomly selected observations go into the training set and the remaining 25% into the test set. The response variable is crim (per capita crime rate by town).

set.seed(123)
inTrain <- createDataPartition(y = Boston$crim, p = 0.75, list = FALSE)

training <- Boston[inTrain, ]
testing <- Boston[-inTrain, ]
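As a quick sanity check on the partition (base R only, nothing package-specific):

# roughly 75% of the observations should land in the training set
nrow(training) / nrow(Boston)
dim(training)
dim(testing)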

set.seed(123)
# repeated 10-fold cross-validation (the p argument is only used by method = "LGOCV")
train_ctrl <- trainControl(method = "repeatedcv", number = 10)

2. Train models using lm and leapForward

lm_fit <- train(crim ~ ., data = training, method = "lm",
                trControl = train_ctrl)
print(lm_fit)

lf_fit <- train(crim ~ ., data = training, method = "leapForward",
                trControl = train_ctrl)
print(lf_fit)
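To inspect which predictors forward selection actually kept, the final regsubsets model can be queried; this assumes the fit above succeeded and uses the subset size (nvmax) that caret selected during tuning:

# coefficients of the best subset of the tuned size
coef(lf_fit$finalModel, id = lf_fit$bestTune$nvmax)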

3. Evaluate the performance

resamps <- resamples(list(linreg = lm_fit, linregfwd = lf_fit))
summary(resamps)

Comparing the resampled RMSE, Rsquared and MAE, the plain linear regression performs better than the linear regression with forward selection.
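The same comparison can also be visualized with caret's lattice methods for resamples objects, for example:

# box-and-whisker plots of the resampled metrics for both models
bwplot(resamps)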

4. Define a custom caret method for the ridgereg() model

# custom caret model specification that wraps our ridgereg() function
Ridgereg <- list(
  type = "Regression",
  library = "ridgereg",
  loop = NULL,
  prob = NULL,

  # the only tuning parameter is the ridge penalty lambda
  parameters = data.frame(parameter = "lambda", class = "numeric", label = "lambda"),
  grid = function(x, y, len = NULL, search = "grid") {
    data.frame(lambda = seq(0, 2, by = 0.25))
  },

  # fit ridgereg() on the predictors in x with response y
  fit = function(x, y, wts, param, lev, last, classProbs, ...) {
    df <- if (is.data.frame(x)) x else as.data.frame(x)
    df$y <- y
    ridgereg(y ~ ., data = df, lambda = param$lambda, ...)
  },

  predict = function(modelFit, newdata, submodels = NULL) {
    if (!is.data.frame(newdata)) {
      newdata <- as.data.frame(newdata)
    }
    predict(modelFit, newdata)
  },

  # order candidate models from most to least regularized
  sort = function(x) x[order(-x$lambda), ]
)
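Before handing this specification to caret, it can be sanity-checked by calling ridgereg() directly, assuming ridgereg() takes a formula, a data argument and a lambda as used in the fit function above:

# a single ridge fit with a fixed penalty, bypassing caret
ridgereg(crim ~ ., data = training, lambda = 1)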

5. Fit the ridge regression with repeated 10-fold cross-validation on the training set

set.seed(123)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
ridgereg_fit <- caret::train(crim ~ ., data = training, method = Ridgereg,
                             trControl = ctrl)
ridgereg_fit
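The penalty selected by cross-validation and the resampling profile over the lambda grid can be examined with standard caret accessors:

# best lambda on the grid, and RMSE as a function of lambda
ridgereg_fit$bestTune
plot(ridgereg_fit)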

6. Evaluate the performance of the three models on the test set

# "Linear Regression Model"
pred_lm <- predict(lm, testing)
postResample(pred_lm, testing$crim)

# "Linear Regression Model with forward selection"
pred_step <- predict(lf, testing)
postResample(pred_step, testing$crim)

# "Ridgereg Model"
pred_ridge <- predict(ridgereg_fit, testing)
postResample(pred_ridge, testing$crim)
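Collecting the three sets of test-set metrics into a single table makes the comparison easier to read (a small convenience sketch, not part of the original analysis):

# one row of RMSE, Rsquared and MAE per model
rbind(
  lm       = postResample(pred_lm, testing$crim),
  leapFwd  = postResample(pred_step, testing$crim),
  ridgereg = postResample(pred_ridge, testing$crim)
)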

The ridgereg model has the lowest MAE on the test set and is therefore the best-fitting of the three models.


