# knitr chunk defaults for this vignette: collapse source and output into
# one block, and prefix printed results with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
For this project we created an R package that implements regularized linear models for both regression and binary classification.
Here are some significant formulas that have been used in this function:
Loss Function:
1. Square Loss (Regression): $L(w) = ||Xw + {\beta}_n - y||^2_2$
2. Logistic Loss (Binary classification): $L(w) = \sum^{n}_{i=1}\log\left[1+\exp\left(-\hat{y}_i(w^T x_i + {\beta}_n)\right)\right]$
Cost Function: $C_\lambda(w) = L(w)+\lambda||w||^2_2$
The purpose of this section is to give users a general overview of this package. We will briefly go over the main functions.
## Source Code:
# Load the package under test and the five benchmark data sets from
# ElemStatLearn, then describe each as (features, labels, is.01),
# where is.01 flags binary-classification problems.
library(LinearModel)
data(spam, package = "ElemStatLearn")
data(SAheart, package = "ElemStatLearn")
data(zip.train, package = "ElemStatLearn")
# Keep only digits 0 and 1 so zip.train becomes a binary problem.
zip.train <- zip.train[zip.train[, 1] %in% c(0, 1), ]
data(prostate, package = "ElemStatLearn")
data(ozone, package = "ElemStatLearn")
data.list <- list(
  spam = list(
    features = as.matrix(spam[, 1:57]),
    labels = ifelse(spam$spam == "spam", 1, 0),
    is.01 = TRUE
  ),
  SAheart = list(
    features = as.matrix(SAheart[, c(1:4, 6:9)]),
    labels = SAheart$chd,
    is.01 = TRUE
  ),
  zip.train = list(
    features = as.matrix(zip.train[, -1]),
    labels = zip.train[, 1],
    is.01 = TRUE
  ),
  prostate = list(
    features = as.matrix(prostate[, 1:8]),
    labels = prostate$lpsa,
    is.01 = FALSE
  ),
  ozone = list(
    features = as.matrix(ozone[, -1]),
    labels = ozone[, 1],
    is.01 = FALSE
  )
)
# Number of cross-validation folds used throughout.
n.folds <- 4L
We are going to run our code on the following data sets.
# Spam ------------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 1
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "spam"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, both Early Stopping and L2 achieve accuracy clearly better than the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# SAheart ---------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 2
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "SAheart"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, the accuracy of Early Stopping and L2 is only slightly better than the baseline predictor, which may be due to the limited amount of training data.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# zip.train -------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 3
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "zip.train"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, both Early Stopping and L2 are extremely accurate, due to the large number of data points.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# prostate --------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 4
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    # This data set uses 500L max iterations (vs 50L elsewhere).
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 500L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  # BUG FIX: prostate is a regression data set (is.01 = FALSE); the title
  # wrongly said "Binary Classification".
  main = c("Regression: ", "prostate"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, Early Stopping and L2 both predict the response accurately compared with the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# ozone -----------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 5
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  # BUG FIX: ozone is a regression data set (is.01 = FALSE); the title
  # wrongly said "Binary Classification".
  main = c("Regression: ", "ozone"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, Early Stopping and L2 both predict the response accurately compared with the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.