knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Introduction

For this project we are writing an R package that implements gradient descent algorithms for neural networks with a single hidden layer.

Here are the key formulas used in this package:

Loss:

1. Regression (square loss), where $S$ is the sigmoid activation applied elementwise and the prediction is $f(x_i) = w^TS(v^Tx_i)$: $L(w,v) = \frac{1}{2}\sum_{i=1}^n[f(x_i)-y_i]^2 = \frac{1}{2}\sum_{i=1}^n[w^TS(v^Tx_i)-y_i]^2$

2. Binary classification (logistic loss), with labels recoded as $\tilde{y}_i \in \{-1,1\}$: $L(w,v) = \frac{1}{n}\sum_{i=1}^{n}\log[1+\exp(-\tilde{y}_if(x_i))]$

Gradient:

1. $\nabla_w\frac{1}{2}[w^TS(v^Tx_i)-y_i]^2 = \frac{\partial}{\partial b_i}\frac{1}{2}[b_i-y_i]^2\,\nabla_wb_i = \delta_i^wz_i$, where $a_i = v^Tx_i$, $z_i = S(a_i)$, and $b_i = w^Tz_i$

2. $\nabla_v\frac{1}{2}[w^TS(v^Tx_i)-y_i]^2 = \frac{\partial}{\partial a_i}\frac{1}{2}[w^TS(a_i)-y_i]^2\,\nabla_va_i = \delta_i^vx_i$
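
Writing the $\delta$ terms out explicitly for the square loss (a small added step; $\odot$ denotes the elementwise product):

$\delta_i^w = b_i - y_i, \qquad \delta_i^v = (b_i - y_i)\, w \odot z_i \odot (1 - z_i)$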

Backpropagation:

1. $w^{t+1}\longleftarrow w^t - \alpha\nabla_wL(w^t,v^t)$

2. $v^{t+1}\longleftarrow v^t - \alpha\nabla_vL(w^t,v^t)$
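
To make these updates concrete, here is a minimal sketch of one full-gradient step for the regression loss, assuming a sigmoid activation for $S$. The helper names (sigmoid, gradient.step) are illustrative only and not part of the package API.

# Illustrative sketch, not the package implementation.
# X: n x p feature matrix, y: n-vector of labels,
# w: u-vector of output weights, v: p x u matrix of hidden weights.
sigmoid <- function(a) 1 / (1 + exp(-a))

gradient.step <- function(X, y, w, v, step.size) {
  A <- X %*% v                 # a_i = v^T x_i, one row per observation
  Z <- sigmoid(A)              # z_i = S(a_i)
  b <- as.numeric(Z %*% w)     # predictions b_i = w^T z_i
  delta.w <- b - y             # d/db_i of (1/2)(b_i - y_i)^2
  # chain rule through the sigmoid: S'(a) = S(a)(1 - S(a))
  delta.v <- delta.w * Z * (1 - Z) *
    matrix(w, nrow(X), length(w), byrow = TRUE)
  list(
    w = w - step.size * as.numeric(t(Z) %*% delta.w),
    v = v - step.size * t(X) %*% delta.v
  )
}

Repeating this step for max.iterations epochs gives the full-gradient training loop; the backpropagation updates above are exactly the two assignments in the returned list.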

Main Function

The purpose of this section is to give users a general overview of this package. We briefly go over the main function and set up the data sets used in the experiments.
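
The main exported function is NNetEarlyStoppingCV. As used in the experiments below, it takes a numeric feature matrix X.mat, a label vector y.vec, and the hyperparameters max.iterations, step.size, and n.hidden.units; it returns a list containing a predict(X) function, per-iteration mean.train.loss.vec and mean.validation.loss.vec, and the early-stopping iteration selected.step. A minimal call (with hypothetical X.mat and y.vec) looks like:

model.list <- NNetEarlyStoppingCV(
  X.mat = X.mat,
  y.vec = y.vec,
  max.iterations = 500L,
  step.size = 0.02,
  n.hidden.units = 100L
)
predictions <- model.list$predict(X.mat)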

library(ElemStatLearn)
library(NeuralNetwork)

data(spam, package = "ElemStatLearn")
data(SAheart, package = "ElemStatLearn")
data(zip.train, package = "ElemStatLearn")
zip.train <- zip.train[zip.train[, 1] %in% c(0, 1),]
data(prostate, package = "ElemStatLearn")
data(ozone, package = "ElemStatLearn")

data.list <- list(
  spam = list(
    features = as.matrix(spam[, 1:57]),
    labels = ifelse(spam$spam == "spam", 1, 0),
    step.size = 0.06,
    is.01 = TRUE
  ),
  SAheart = list(
    features = as.matrix(SAheart[, c(1:4, 6:9)]),
    labels = ifelse(SAheart$chd == 1, 1, 0),
    step.size = 0.02,
    is.01 = TRUE
  ),
  zip.train = list(
    features = as.matrix(zip.train[, -1]),
    labels = ifelse(zip.train[, 1] == 1, 1, 0),
    step.size = 0.05,
    is.01 = TRUE
  ),
  prostate = list(
    features = as.matrix(prostate[, 1:8]),
    labels = prostate$lpsa,
    step.size = 0.02,
    is.01 = FALSE
  ),
  ozone = list(
    features = as.matrix(ozone[, -1]),
    labels = ozone[, 1],
    step.size = 0.02,
    is.01 = FALSE
  )
)

n.folds <- 5L
n.hidden.units <- 100L
max.iterations <- 500L


Experiments/application

We now run 5-fold cross-validation experiments on the following data sets, comparing the neural network with a baseline that predicts a constant computed from the train labels.

Data set 1: spam

data.name <- 1
data.set <- data.list[[data.name]]

test.loss.mat <- matrix(0, nrow = n.folds, ncol = 2)
step.size <- data.set$step.size


set.seed(1)

fold.vec <- sample(rep(1:n.folds, length.out = length(data.set$labels)))
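
This shuffles a balanced assignment of observations to folds. As a quick sanity check (an illustrative addition), each fold should contain roughly length(data.set$labels) / n.folds observations:

table(fold.vec)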

for (i.fold in (1:n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- fold.vec == i.fold

  X.train <- data.set$features[train.index,]
  y.train <- data.set$labels[train.index]
  X.test <- data.set$features[test.index,]
  y.test <- data.set$labels[test.index]

  result.list <- NNetEarlyStoppingCV(
    X.mat = X.train,
    y.vec = y.train,
    max.iterations = max.iterations,
    step.size = step.size,
    n.hidden.units = n.hidden.units
  )

  if (data.set$is.01) {
    # binary data: 0/1 test loss
    NNet.predict <- ifelse(result.list$predict(X.test) > 0.5, 1, 0)
    NNet.loss <- mean(NNet.predict != y.test)
    # baseline: always predict the majority class of the train labels
    # (computed from y.train, not y.test, to avoid test-set leakage)
    baseline.predict <- ifelse(mean(y.train) > 0.5, 1, 0)
    baseline.loss <- mean(baseline.predict != y.test)
  } else {
    # regression data: mean squared test loss
    NNet.predict <- result.list$predict(X.test)
    NNet.loss <- mean((NNet.predict - y.test) ^ 2)
    # baseline: always predict the mean of the train labels
    baseline.predict <- mean(y.train)
    baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  }

  test.loss.mat[i.fold, ] <- c(NNet.loss, baseline.loss)
}

colnames(test.loss.mat) <- c("Neural Network", "Baseline")

Matrix of loss values
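
One way to display the matrix and the per-method mean losses (a small illustrative addition):

test.loss.mat
colMeans(test.loss.mat)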

# plot result: one group of two bars (methods) per fold
plot.title <- if (data.set$is.01) {
  "Binary Classification: spam"
} else {
  "Square Regression: spam"
}
barplot(
  t(test.loss.mat),
  main = plot.title,
  ylab = "mean loss value",
  legend.text = TRUE,
  beside = TRUE
)

# refit on the training split of the last fold to inspect the loss curves
model.list <- NNetEarlyStoppingCV(
  X.mat = X.train,
  y.vec = y.train,
  max.iterations = max.iterations,
  step.size = step.size,
  n.hidden.units = n.hidden.units
)

Comment on difference in accuracy: the neural network's mean 0/1 test loss is substantially lower than the baseline's on this data set.

Train/validation loss plot (the red dot marks the early-stopping iteration, i.e. the minimum of the mean validation loss):

matplot(
  x = seq_len(max.iterations),
  y = cbind(
    model.list$mean.validation.loss.vec,
    model.list$mean.train.loss.vec
  ),
  xlab = "Iteration",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  col = 1
)


# mark the early-stopping iteration on the validation curve
dot.x <- model.list$selected.step
dot.y <- model.list$mean.validation.loss.vec[dot.x]

matpoints(x = dot.x,
          y = dot.y,
          col = 2,
          pch = 19)
legend(
  x = 0,
  y = max(cbind(model.list$mean.validation.loss.vec, model.list$mean.train.loss.vec)),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)

What is the optimal regularization parameter? Answer: iteration = `r which.min(model.list$mean.validation.loss.vec)`

Data set 2: SAheart

data.name <- 2
data.set <- data.list[[data.name]]

test.loss.mat <- matrix(0, nrow = n.folds, ncol = 2)
step.size <- data.set$step.size


set.seed(1)

fold.vec <- sample(rep(1:n.folds, length.out = length(data.set$labels)))

for (i.fold in (1:n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- fold.vec == i.fold

  X.train <- data.set$features[train.index,]
  y.train <- data.set$labels[train.index]
  X.test <- data.set$features[test.index,]
  y.test <- data.set$labels[test.index]

  result.list <- NNetEarlyStoppingCV(
    X.mat = X.train,
    y.vec = y.train,
    max.iterations = max.iterations,
    step.size = step.size,
    n.hidden.units = n.hidden.units
  )

  if (data.set$is.01) {
    # binary data: 0/1 test loss
    NNet.predict <- ifelse(result.list$predict(X.test) > 0.5, 1, 0)
    NNet.loss <- mean(NNet.predict != y.test)
    # baseline: always predict the majority class of the train labels
    # (computed from y.train, not y.test, to avoid test-set leakage)
    baseline.predict <- ifelse(mean(y.train) > 0.5, 1, 0)
    baseline.loss <- mean(baseline.predict != y.test)
  } else {
    # regression data: mean squared test loss
    NNet.predict <- result.list$predict(X.test)
    NNet.loss <- mean((NNet.predict - y.test) ^ 2)
    # baseline: always predict the mean of the train labels
    baseline.predict <- mean(y.train)
    baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  }

  test.loss.mat[i.fold, ] <- c(NNet.loss, baseline.loss)
}

colnames(test.loss.mat) <- c("Neural Network", "Baseline")

Matrix of loss values

# plot result: one group of two bars (methods) per fold
plot.title <- if (data.set$is.01) {
  "Binary Classification: SAheart"
} else {
  "Square Regression: SAheart"
}
barplot(
  t(test.loss.mat),
  main = plot.title,
  ylab = "mean loss value",
  legend.text = TRUE,
  beside = TRUE
)

model.list <- NNetEarlyStoppingCV(
  X.mat = X.train,
  y.vec = y.train,
  max.iterations = max.iterations,
  step.size = step.size,
  n.hidden.units = n.hidden.units
)

Comment on difference in accuracy: the neural network's mean 0/1 test loss is substantially lower than the baseline's on this data set.

Train/validation loss plot

matplot(
  x = seq_len(max.iterations),
  y = cbind(
    model.list$mean.validation.loss.vec,
    model.list$mean.train.loss.vec
  ),
  xlab = "Iteration",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  col = 1
)


dot.x <- model.list$selected.step
dot.y <- model.list$mean.validation.loss.vec[dot.x]

matpoints(x = dot.x,
          y = dot.y,
          col = 2,
          pch = 19)
legend(
  x = 0,
  y = max(cbind(model.list$mean.validation.loss.vec, model.list$mean.train.loss.vec)),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)

What is the optimal regularization parameter? Answer: iteration = `r which.min(model.list$mean.validation.loss.vec)`

Data set 3: zip.train

data.name <- 3
data.set <- data.list[[data.name]]

test.loss.mat <- matrix(0, nrow = n.folds, ncol = 2)
step.size <- data.set$step.size


set.seed(1)

fold.vec <- sample(rep(1:n.folds, length.out = length(data.set$labels)))

for (i.fold in (1:n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- fold.vec == i.fold

  X.train <- data.set$features[train.index,]
  y.train <- data.set$labels[train.index]
  X.test <- data.set$features[test.index,]
  y.test <- data.set$labels[test.index]

  result.list <- NNetEarlyStoppingCV(
    X.mat = X.train,
    y.vec = y.train,
    max.iterations = max.iterations,
    step.size = step.size,
    n.hidden.units = n.hidden.units
  )

  if (data.set$is.01) {
    # binary data: 0/1 test loss
    NNet.predict <- ifelse(result.list$predict(X.test) > 0.5, 1, 0)
    NNet.loss <- mean(NNet.predict != y.test)
    # baseline: always predict the majority class of the train labels
    # (computed from y.train, not y.test, to avoid test-set leakage)
    baseline.predict <- ifelse(mean(y.train) > 0.5, 1, 0)
    baseline.loss <- mean(baseline.predict != y.test)
  } else {
    # regression data: mean squared test loss
    NNet.predict <- result.list$predict(X.test)
    NNet.loss <- mean((NNet.predict - y.test) ^ 2)
    # baseline: always predict the mean of the train labels
    baseline.predict <- mean(y.train)
    baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  }

  test.loss.mat[i.fold, ] <- c(NNet.loss, baseline.loss)
}

colnames(test.loss.mat) <- c("Neural Network", "Baseline")

Matrix of loss values

# plot result: one group of two bars (methods) per fold
plot.title <- if (data.set$is.01) {
  "Binary Classification: zip.train"
} else {
  "Square Regression: zip.train"
}
barplot(
  t(test.loss.mat),
  main = plot.title,
  ylab = "mean loss value",
  legend.text = TRUE,
  beside = TRUE
)

model.list <- NNetEarlyStoppingCV(
  X.mat = X.train,
  y.vec = y.train,
  max.iterations = max.iterations,
  step.size = step.size,
  n.hidden.units = n.hidden.units
)

Comment on difference in accuracy: the neural network's mean 0/1 test loss is substantially lower than the baseline's on this data set.

Train/validation loss plot

matplot(
  x = seq_len(max.iterations),
  y = cbind(
    model.list$mean.validation.loss.vec,
    model.list$mean.train.loss.vec
  ),
  xlab = "Iteration",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  col = 1
)


dot.x <- model.list$selected.step
dot.y <- model.list$mean.validation.loss.vec[dot.x]

matpoints(x = dot.x,
          y = dot.y,
          col = 2,
          pch = 19)
legend(
  x = 0,
  y = max(cbind(model.list$mean.validation.loss.vec, model.list$mean.train.loss.vec)),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)

What is the optimal regularization parameter? Answer: iteration = `r which.min(model.list$mean.validation.loss.vec)`

Data set 4: prostate

data.name <- 4
data.set <- data.list[[data.name]]

test.loss.mat <- matrix(0, nrow = n.folds, ncol = 2)
step.size <- data.set$step.size


set.seed(1)

fold.vec <- sample(rep(1:n.folds, length.out = length(data.set$labels)))

for (i.fold in (1:n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- fold.vec == i.fold

  X.train <- data.set$features[train.index,]
  y.train <- data.set$labels[train.index]
  X.test <- data.set$features[test.index,]
  y.test <- data.set$labels[test.index]

  result.list <- NNetEarlyStoppingCV(
    X.mat = X.train,
    y.vec = y.train,
    max.iterations = max.iterations,
    step.size = step.size,
    n.hidden.units = n.hidden.units
  )

  if (data.set$is.01) {
    # binary data: 0/1 test loss
    NNet.predict <- ifelse(result.list$predict(X.test) > 0.5, 1, 0)
    NNet.loss <- mean(NNet.predict != y.test)
    # baseline: always predict the majority class of the train labels
    # (computed from y.train, not y.test, to avoid test-set leakage)
    baseline.predict <- ifelse(mean(y.train) > 0.5, 1, 0)
    baseline.loss <- mean(baseline.predict != y.test)
  } else {
    # regression data: mean squared test loss
    NNet.predict <- result.list$predict(X.test)
    NNet.loss <- mean((NNet.predict - y.test) ^ 2)
    # baseline: always predict the mean of the train labels
    baseline.predict <- mean(y.train)
    baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  }

  test.loss.mat[i.fold, ] <- c(NNet.loss, baseline.loss)
}

colnames(test.loss.mat) <- c("Neural Network", "Baseline")

Matrix of loss values

# plot result: one group of two bars (methods) per fold
plot.title <- if (data.set$is.01) {
  "Binary Classification: prostate"
} else {
  "Square Regression: prostate"
}
barplot(
  t(test.loss.mat),
  main = plot.title,
  ylab = "mean loss value",
  legend.text = TRUE,
  beside = TRUE
)

model.list <- NNetEarlyStoppingCV(
  X.mat = X.train,
  y.vec = y.train,
  max.iterations = max.iterations,
  step.size = step.size,
  n.hidden.units = n.hidden.units
)

Comment on difference in accuracy: the neural network's mean squared test error is substantially lower than the baseline's on this data set.

Train/validation loss plot

matplot(
  x = seq_len(max.iterations),
  y = cbind(
    model.list$mean.validation.loss.vec,
    model.list$mean.train.loss.vec
  ),
  xlab = "Iteration",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  col = 1
)


dot.x <- model.list$selected.step
dot.y <- model.list$mean.validation.loss.vec[dot.x]

matpoints(x = dot.x,
          y = dot.y,
          col = 2,
          pch = 19)
legend(
  x = 0,
  y = max(cbind(model.list$mean.validation.loss.vec, model.list$mean.train.loss.vec)),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)

What is the optimal regularization parameter? Answer: iteration = `r which.min(model.list$mean.validation.loss.vec)`

Data set 5: ozone

data.name <- 5
data.set <- data.list[[data.name]]

test.loss.mat <- matrix(0, nrow = n.folds, ncol = 2)
step.size <- data.set$step.size


set.seed(1)

fold.vec <- sample(rep(1:n.folds, length.out = length(data.set$labels)))

for (i.fold in (1:n.folds)) {
  train.index <- fold.vec != i.fold
  test.index <- fold.vec == i.fold

  X.train <- data.set$features[train.index,]
  y.train <- data.set$labels[train.index]
  X.test <- data.set$features[test.index,]
  y.test <- data.set$labels[test.index]

  result.list <- NNetEarlyStoppingCV(
    X.mat = X.train,
    y.vec = y.train,
    max.iterations = max.iterations,
    step.size = step.size,
    n.hidden.units = n.hidden.units
  )

  if (data.set$is.01) {
    # binary data: 0/1 test loss
    NNet.predict <- ifelse(result.list$predict(X.test) > 0.5, 1, 0)
    NNet.loss <- mean(NNet.predict != y.test)
    # baseline: always predict the majority class of the train labels
    # (computed from y.train, not y.test, to avoid test-set leakage)
    baseline.predict <- ifelse(mean(y.train) > 0.5, 1, 0)
    baseline.loss <- mean(baseline.predict != y.test)
  } else {
    # regression data: mean squared test loss
    NNet.predict <- result.list$predict(X.test)
    NNet.loss <- mean((NNet.predict - y.test) ^ 2)
    # baseline: always predict the mean of the train labels
    baseline.predict <- mean(y.train)
    baseline.loss <- mean((baseline.predict - y.test) ^ 2)
  }

  test.loss.mat[i.fold, ] <- c(NNet.loss, baseline.loss)
}

colnames(test.loss.mat) <- c("Neural Network", "Baseline")

Matrix of loss values

# plot result: one group of two bars (methods) per fold
plot.title <- if (data.set$is.01) {
  "Binary Classification: ozone"
} else {
  "Square Regression: ozone"
}
barplot(
  t(test.loss.mat),
  main = plot.title,
  ylab = "mean loss value",
  legend.text = TRUE,
  beside = TRUE
)

model.list <- NNetEarlyStoppingCV(
  X.mat = X.train,
  y.vec = y.train,
  max.iterations = max.iterations,
  step.size = step.size,
  n.hidden.units = n.hidden.units
)

Comment on difference in accuracy: the neural network's mean squared test error is substantially lower than the baseline's on this data set.

Train/validation loss plot

matplot(
  x = seq_len(max.iterations),
  y = cbind(
    model.list$mean.validation.loss.vec,
    model.list$mean.train.loss.vec
  ),
  xlab = "Iteration",
  ylab = "mean loss value",
  type = "l",
  lty = 1:2,
  col = 1
)


dot.x <- model.list$selected.step
dot.y <- model.list$mean.validation.loss.vec[dot.x]

matpoints(x = dot.x,
          y = dot.y,
          col = 2,
          pch = 19)
legend(
  x = 0,
  y = max(cbind(model.list$mean.validation.loss.vec, model.list$mean.train.loss.vec)),
  c("Validation loss", "Train loss"),
  lty = 1:2,
  xjust = 0,
  yjust = 1
)

What is the optimal regularization parameter? Answer: iteration = `r which.min(model.list$mean.validation.loss.vec)`

End of the report


