Here we fit two models on the training dataset: an ordinary linear regression and a linear regression with forward selection of covariates.

library(Lab7)
library(caret)
library(mlbench)
library(leaps)
data(BostonHousing)
# chas is a factor in mlbench's BostonHousing; convert it to a 0/1 numeric
BostonHousing$chas <- as.numeric(BostonHousing$chas) - 1
set.seed(-264619L)
inTrain <- createDataPartition(BostonHousing$crim,
                               p = .75,
                               list = FALSE)

training <- BostonHousing[inTrain, ]
testing <- BostonHousing[-inTrain, ]

# 10-fold cross-validation; this control object is used below when training
# the custom ridge regression model.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10)


set.seed(-264619L)
lmfit <- train(crim ~ .,
               data = training,
               method = "lm")
lmfit
set.seed(-264619L)
lmforwardfit <- train(crim ~ .,
                      data = training,
                      method = "leapForward")
lmforwardfit
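
As a quick check, not shown in the original output, the predictors that forward selection retained at the chosen subset size can be read off the leaps::regsubsets object that caret stores in finalModel; coef() with id equal to the selected nvmax is assumed to return that submodel's coefficients.

# which subset size did resampling pick, and which predictors does it use?
lmforwardfit$bestTune
coef(lmforwardfit$finalModel, id = lmforwardfit$bestTune$nvmax)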

The best RMSE comes from the linear regression with forward selection, where two predictors are retained; it is slightly better than the RMSE of the plain linear regression model. Next we train our own custom-made ridge regression model, registering it with caret as a user-defined method: a list whose components (parameters, grid, fit, predict, prob, sort and so on) are assembled below, using the same seed as before.

ridgemodel <- list(type = "Regression",
                   library = "Lab7")

# The single tuning parameter is the ridge penalty lambda.
ridgemodel$parameters <- data.frame(parameter = "lambda",
                                    class = "numeric",
                                    label = "lambda")

Fit <- function(x, y, wts, param, lev, last, classProbs, ...){

  dat <- as.data.frame(x)

  # x is passed with the response column still included (see the train() call
  # further down), so find the column of x that matches y to recover the
  # response name and position.
  respname <- NULL
  respnum <- NULL

  for(i in 1:ncol(dat)){
    if(identical(y, dat[, i])){
      respname <- names(dat)[i]
      respnum <- i
    }
  }

  # Build the formula "response ~ predictor1 + predictor2 + ..." from the
  # remaining columns.
  formula <- paste(respname, "~", sep = "")

  if(ncol(dat) > 1){
    for(i in 1:ncol(dat)){
      if(i != respnum){
        formula <- paste(formula, "+", names(dat)[i], sep = "")
      }
    }
  }

  formula <- as.formula(formula)
  model <- Lab7::ridgeregr(formula = formula, data = dat, lambda = param$lambda)
  return(model)
}

ridgemodel$fit <- Fit

ridgemodel$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL){
  predict(modelFit, newdata)
}

# Class probabilities are not meaningful for a regression model.
ridgemodel$prob <- list(NULL)

# Order candidate models from the simplest (largest lambda) to the most complex.
ridgemodel$sort <- function(x) x[order(-x$lambda), ]

ridgemodel$label <- "Ridge Regression"

# Tuning grid: lambda values from 0 to 200 in steps of 10
# (a finer grid such as seq(from = 20, to = 45, by = 1) could also be used).
ridgemodel$grid <- function(x, y, len = NULL, search = "grid"){
  data.frame(lambda = seq(from = 0, to = 200, by = 10))
}
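
Before handing the list to caret, the fit and predict components can be exercised once by hand as a sanity check. This is only a sketch, not part of the original output: lambda = 60 is an arbitrary illustrative value, and it assumes that the predict method for a Lab7::ridgeregr model accepts new data positionally, as in the wrapper above.

# standalone smoke test of the custom components, outside of caret
testmodel <- ridgemodel$fit(x = training, y = training$crim,
                            param = data.frame(lambda = 60),
                            lev = NULL, last = TRUE, classProbs = FALSE)
head(ridgemodel$predict(testmodel, newdata = testing))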

set.seed(-264619L)
# Note: x deliberately includes the response column so that Fit() can
# reconstruct the model formula; Fit() drops it from the predictor set.
ridgeFit <- caret::train(y = training$crim,
                         x = training,
                         method = ridgemodel,
                         trControl = ctrl)
ridgeFit

Preprocessing of the data is done inside the ridgeregr() function itself. Judging by the minimal RMSE, the optimal lambda appears to be around 60. We now evaluate all three models on the test dataset; a few outlier residuals fall outside the plotted range and are therefore not shown.
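
To see where that minimum sits, the resampling profile stored in the train object can be inspected with standard caret accessors (not shown in the original output):

# RMSE for every candidate lambda, the selected value, and the profile plot
ridgeFit$results[, c("lambda", "RMSE")]
ridgeFit$bestTune
plot(ridgeFit)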

lm_testeval <- predict(lmfit, testing)

lm_testres <- testing$crim - lm_testeval

plot(lm_testres, ylim = c(-5, 5),
     main = "Residuals for the linear regression predictions over test data",
     ylab = "residual", xlab = "test data observation #")
paste0("RSS = ", sum(lm_testres^2))
lmfor_testeval <- predict(lmforwardfit, testing)

lmfor_testres <- testing$crim - lmfor_testeval

plot(lmfor_testres, ylim = c(-5, 5),
     main = "Residuals for the linear regression with\nforward selection predictions over test data",
     ylab = "residual", xlab = "test data observation #")
paste0("RSS = ", sum(lmfor_testres^2))
ridge_testeval <- predict(ridgeFit, testing)

# ridgeregr() appears to work with a centered response, so the mean is
# subtracted from the observed values before comparing with the predictions.
ridge_testres <- testing$crim - mean(testing$crim) - ridge_testeval

plot(ridge_testres, ylim = c(-5, 5),
     main = "Residuals for the ridge regression predictions over test data",
     ylab = "residual", xlab = "test data observation #")
paste0("RSS = ", sum(ridge_testres^2))
# The final models can be found under <train object>$finalModel

Based on the test-set RSS, the ridge regression fits better than either the plain linear regression or the linear regression with forward selection.
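
As a compact summary, the three residual sums of squares computed above can be gathered into one named vector and sorted; this is just a convenience step, not part of the original output.

rss <- c(lm      = sum(lm_testres^2),
         forward = sum(lmfor_testres^2),
         ridge   = sum(ridge_testres^2))
sort(rss)  # smallest RSS = best test-set fit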


