Here we fit a linear regression model and a linear regression model with forward selection of covariates on the training dataset.
library(Lab7) library(caret) library(mlbench) library(leaps) data(BostonHousing) BostonHousing$chas <- as.numeric(BostonHousing$chas)-1
set.seed(-264619L) inTrain <- createDataPartition(BostonHousing$crim, p=.75, list=FALSE) training <- BostonHousing[inTrain,] testing <- BostonHousing[-inTrain,] ctrl <- trainControl( method = "repeatedcv", number = 10) set.seed(-264619L) lmfit <- train(crim ~ . , data = training, method ="lm" ) lmfit
set.seed(-264619L) lmforwardfit <- train(crim ~., data = training, method ="leapForward" ) lmforwardfit
We see that the best RMSE value comes from the linear regression model with forward selection where there are two predictors. This RMSE value is a little better than the one from the regular linear regression model. We now try to train our own custom made ridge regression model with a particular set.seed().
ridgemodel <- list(type = "Regression", library = "Lab7" ) ridgemodel$parameters<-data.frame(parameter="lambda", class="numeric", label="lambda") Fit<-function(x,y,lambda,param,lev,last,classProbs,...){ dat <- as.data.frame(x) respvector <- NULL respname <- NULL respnum <- NULL for(i in 1:ncol(x)){ if(identical(y,dat[,i])){ respvector <- dat[,i] respname <- names(x)[i] respnum <- i } } formula <- paste(respname,"~", sep="") if(ncol(x) > 1){ for(i in 1:ncol(x)){ if(i != respnum){ formula <- paste(formula, "+", names(dat)[i], sep="") } } } formula <- as.formula(formula) model <- Lab7::ridgeregr( formula = formula, data=dat,lambda= param$lambda) return(model) } ridgemodel$fit<-Fit ridgemodel$predict<-function(modelFit, newdata, preProc = NULL, submodels = NULL){ predict(modelFit,newdata) } ridgemodel$prob<- list(NULL) ridgemodel$sort<-function (x) x[order(-x$lambda), ] ridgemodel$label<-"Ridge Regression" ridgemodel$grid<-function(x,y,len=NULL, search="grid"){ data.frame(#lambda=seq(from=20, to=45, by=1)) lambda=seq(from=0, to=200, by=10)) } set.seed(-264619L) ridgeFit <- caret::train( y = training$crim, x = training, method = ridgemodel, trControl = ctrl )
ridgeFit
Preprocessing of data is actually done inside the ridgeregr() function. It seems that based on mimimal RMSE value, parameter lambda wants to be around 60. Now we evaluate all three models on the test dataset. Outlier residuals not shown in graphs.
lm_testeval <-predict(lmfit,testing) lm_testres <- testing$crim - lm_testeval plot(lm_testres, ylim= c(-5,5), main="Residuals for the linear regression predictions over test data", ylab="residual",xlab="test data observation #") paste0("RSS = ",sum(lm_testres^2)) lmfor_testeval <-predict(lmforwardfit,testing) lmfor_testres <- testing$crim - lmfor_testeval plot(lmfor_testres, ylim= c(-5,5), main=c("Residuals for the linear regression with"," forward selection predictions over test data"), ylab="residual",xlab="test data observation #") paste0("RSS = ",sum(lmfor_testres^2)) ridge_testeval <- predict(ridgeFit,testing) ridge_testres <- testing$crim - mean(testing$crim) - ridge_testeval plot(ridge_testres, ylim= c(-5,5), main="Residuals for the ridge regression predictions over test data", ylab="residual",xlab="test data observation #") paste0("RSS = ",sum(ridge_testres^2)) #The final models can be found under (train obj)$finalModel
It seems that ridge regression is a better fit than linear regression or linear regression with forward selection based on RSS.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.