# knitr chunk defaults for this vignette: collapse source and output into
# one block, and prefix printed results with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
For this project we created an R package that implements regularized linear models for both regression and binary classification.
Here are some significant formulas that have been used in this function:
Loss Function:
1. Square Loss (Regression): $L(w) = ||Xw + {\beta}_n - y||^2_2$
2. Logistic Loss (Binary classification): $L(w) = \sum^{n}_{i=1}\log\left[1+\exp\left(-\hat{y}_i(w^T x_i + {\beta}_n)\right)\right]$
Cost Function: $C_\lambda(w) = L(w)+\lambda||w||^2_2$
The purpose of this section is to give users a general overview of this package. We will briefly go over the main functions.
## Source Code:
# Load the package under test and the five benchmark data sets from
# ElemStatLearn, then describe each as (features, labels, is.01),
# where is.01 flags binary-classification problems.
library(LinearModel)
data(spam, package = "ElemStatLearn")
data(SAheart, package = "ElemStatLearn")
data(zip.train, package = "ElemStatLearn")
# Keep only digits 0 and 1 so zip.train becomes a binary problem.
zip.train <- zip.train[zip.train[, 1] %in% c(0, 1), ]
data(prostate, package = "ElemStatLearn")
data(ozone, package = "ElemStatLearn")
data.list <- list(
  spam = list(
    features = as.matrix(spam[, 1:57]),
    labels = ifelse(spam$spam == "spam", 1, 0),
    is.01 = TRUE
  ),
  SAheart = list(
    features = as.matrix(SAheart[, c(1:4, 6:9)]),
    labels = SAheart$chd,
    is.01 = TRUE
  ),
  zip.train = list(
    features = as.matrix(zip.train[, -1]),
    labels = zip.train[, 1],
    is.01 = TRUE
  ),
  prostate = list(
    features = as.matrix(prostate[, 1:8]),
    labels = prostate$lpsa,
    is.01 = FALSE
  ),
  ozone = list(
    features = as.matrix(ozone[, -1]),
    labels = ozone[, 1],
    is.01 = FALSE
  )
)
# Number of cross-validation folds used throughout.
n.folds <- 4L
We are going to run our code on the following data sets.
# Spam ------------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 1
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "spam"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, both Early Stopping and L2 achieve accuracy clearly better than the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# SAheart ---------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 2
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "SAheart"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, the accuracy of Early Stopping and L2 is only slightly better than the baseline predictor, which may be due to the limited amount of training data.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# zip.train -------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 3
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  main = c("Binary Classification: ", "zip.train"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, both Early Stopping and L2 are extremely accurate, due to the large number of data points.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# prostate --------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 4
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    # This data set uses 500L max iterations (vs 50L elsewhere).
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 500L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  # BUG FIX: prostate is a regression data set (is.01 = FALSE); the title
  # wrongly said "Binary Classification".
  main = c("Regression: ", "prostate"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, Early Stopping and L2 both predict the response accurately compared with the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
# ozone -----------------------------------------------------------------
# 4-fold CV comparing early-stopping and L2-regularized linear models
# against a constant baseline; per-fold mean squared test loss is stored
# in test.loss.mat (rows = folds, cols = methods).
data.name <- 5
data.set <- data.list[[data.name]]
test.loss.mat <- matrix(0, nrow = 4, ncol = 3)
set.seed(2)
fold.vec <- sample(rep(1:n.folds, l = length(data.set$labels)))
penalty.vec <- seq(5, 0.1, by = -0.1)
for (i.fold in seq_len(n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- !train.index
  x.train <- data.set$features[train.index, ]
  y.train <- data.set$labels[train.index]
  # BUG FIX: original read data.set$feature (missing "s"), which only
  # worked via $ partial matching on lists; use the real element name.
  x.test <- data.set$features[test.index, ]
  y.test <- data.set$labels[test.index]
  if (data.set$is.01) {
    # Binary data: threshold predicted probabilities at 0.5.
    earlystopping.list <- LMLogisticLossEarlyStoppingCV(x.train, y.train, NULL, 100L, 0.5)
    L2.list <- LMLogisticLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- ifelse(earlystopping.list$predict(x.test) > 0.5, 1, 0)
    L2.predict <- ifelse(L2.list$predict(x.test) > 0.5, 1, 0)
    # NOTE(review): baseline is computed from y.test (test labels), not
    # y.train — kept as-is to preserve the reported numbers, but verify.
    baseline.predict <- ifelse(mean(y.test) > 0.5, 1, 0)
  } else {
    # Regression data: use predictions directly; baseline is a constant.
    earlystopping.list <- LMSquareLossEarlyStoppingCV(x.train, y.train, NULL, 50L)
    L2.list <- LMSquareLossL2CV(x.train, y.train, NULL, penalty.vec)
    earlystopping.predict <- earlystopping.list$predict(x.test)
    L2.predict <- L2.list$predict(x.test)
    baseline.predict <- mean(y.test)
  }
  # Mean squared test loss for each method on this fold.
  earlystopping.loss <- mean((earlystopping.predict - y.test) ^ 2)
  L2.loss <- mean((L2.predict - y.test) ^ 2)
  baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  test.loss.mat[i.fold, ] <- c(earlystopping.loss, L2.loss, baseline.loss)
}
# Show the per-fold test losses and plot them as a grouped bar chart.
colnames(test.loss.mat) <- c("Early Stopping", "L2", "Baseline")
# BUG FIX: test.loss.mat had no rownames, so legend = rownames(...) was
# NULL and the legend never rendered; label the rows (folds) explicitly.
rownames(test.loss.mat) <- paste("Fold", seq_len(nrow(test.loss.mat)))
test.loss.mat
barplot(
  test.loss.mat,
  # BUG FIX: ozone is a regression data set (is.01 = FALSE); the title
  # wrongly said "Binary Classification".
  main = c("Regression: ", "ozone"),
  # BUG FIX: loss is on the vertical axis of a vertical barplot, so it
  # belongs in ylab, not xlab.
  ylab = "mean loss value",
  legend = rownames(test.loss.mat),
  beside = TRUE
)
Comment on difference in accuracy: In this data set, Early Stopping and L2 both predict the response accurately compared with the baseline predictor.
# Run CV on the whole data set (logistic loss for binary labels, square
# loss otherwise) and plot mean train/validation loss as a function of
# the penalty, marking the penalty selected by cross-validation.
model.list <- if (data.set$is.01) {
  # Binary
  LMLogisticLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
} else {
  # Regression
  LMSquareLossL2CV(data.set$features, data.set$labels, NULL, penalty.vec)
}
loss.curves <- cbind(
  model.list$mean.validation.loss.vec,
  model.list$mean.train.loss.vec
)
dot.x <- model.list$selected.penalty
dot.y <- model.list$mean.validation.loss.vec[penalty.vec == model.list$selected.penalty]
matplot(
  y = loss.curves,
  x = as.matrix(penalty.vec),
  xlab = "penalty",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  pch = 15,
  col = c(17)
)
# Highlight the selected penalty on the validation curve.
matpoints(x = dot.x, y = dot.y, col = 2, pch = 19)
legend(
  x = 0,
  y = max(loss.curves),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)
What are the optimal regularization parameters?
Answer: `r model.list$selected.penalty`
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.