In casxue/bis557: Coursework for BIS 557

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Homework 5

Problem 1

In class we used the LASSO to predict handwritten characters in the MNIST data set. Increase the out-of-sample prediction accuracy by extracting predictive features from the images.

library(bis557)
data(mnist_train)
data(mnist_test)

Xtrain <- as.matrix(mnist_train[1:1000,-1])
Xtest <- as.matrix(mnist_test[1:1000,-1])
ytrain <- mnist_train[1:1000,1]
ytest <- mnist_test[1:1000,1]

Using LASSO as is, the out-of-sample prediction accuracy is

library(glmnet)
outLm <- cv.glmnet(Xtrain, ytrain, alpha = 1, family = "multinomial")
predLm <- apply(predict(outLm, Xtest, s = outLm$lambda.min, type = "response"), 1, which.max) - 1L
sum(predLm == ytest) / length(ytest) # out-of-sample prediction accuracy

Problem 2: CASL Exercises 8.11, question 4

Adjust the kernel size, and any other parameters you think are useful, in the convolutional neural network for EMNIST in Section 8.10.4. Can you improve on the classification rate?

library(keras)
library(magrittr)
library(abind)
data(emnist)

x_train <- emnist$dataset[[1]][[1]] / 255
y_train <- emnist$dataset[[1]][[2]]
x_test <- emnist$dataset[[2]][[1]] / 255
y_test <- emnist$dataset[[2]][[2]]
z_map <- emnist$dataset[[3]] # map labels to asci codes 

n <- 100000
X_train <- array(c(x_train[1:n,]), dim = c(n, 28, 28, 1))
X_test <- array(c(x_test[1:n,]), dim = c(n, 28, 28, 1))
X <- abind(X_train, X_test, along = 1)
y <- c(y_train, y_test)

# model from 8.10.4
model <- keras_model_sequential()
model %>%
    layer_conv_2d(filters = 32, kernel_size = c(2,2), input_shape = c(28, 28, 1), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_conv_2d(filters = 32, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_max_pooling_2d(pool_size = c(2, 2)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_conv_2d(filters = 32, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_conv_2d(filters = 32, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_max_pooling_2d(pool_size = c(2, 2)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_flatten() %>%
    layer_dense(units = 128) %>%
    layer_activation(activation = "relu") %>%
    layer_dense(units = 128) %>%
    layer_activation(activation = "relu") %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = 26) %>%
    layer_activation(activation = "softmax")
model %>% compile(loss = "categorical_crossentropy", optimizer = optimizer_rmsprop(), metrics = c("accuracy"))

# new model 
model2 <- keras_model_sequential()
model2 %>%
    layer_conv_2d(filters = 60, kernel_size = c(2,2), input_shape = c(28, 28, 1), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_conv_2d(filters = 60, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_max_pooling_2d(pool_size = c(2, 2)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_conv_2d(filters = 60, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_conv_2d(filters = 60, kernel_size = c(2,2), padding = "same") %>%
    layer_activation(activation = "relu") %>%
    layer_max_pooling_2d(pool_size = c(2, 2)) %>%
    layer_dropout(rate = 0.5) %>%
    layer_flatten() %>%
    layer_dense(units = 128) %>%
    layer_activation(activation = "relu") %>%
    layer_dense(units = 128) %>%
    layer_activation(activation = "relu") %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = 26) %>%
    layer_activation(activation = "softmax")
model2 %>% compile(loss = "categorical_crossentropy", optimizer = optimizer_rmsprop(), metrics = c("accuracy"))

pred <- predict_classes(model, X)
mean(pred[1:n] == y[1:n]) #training
mean(pred[(n + 1):(2 * n)] == y[(n + 1):(2 * n)]) #testing
pred2 <- predict_classes(model2, X)
mean(pred2[1:n] == y[1:n]) #training
mean(pred2[(n + 1):(2 * n)] == y[(n + 1):(2 * n)]) #testing

Problem 3: CASL Exercises 8.11, question 8

Write a function that uses mean absolute deviation as a loss function, instead of mean squared error. Test the use of this function with a simulation containing several outliers. How well do neural networks and SGD perform when using robust techniques?

The derivative of the new loss function is util_mae_p. Using the neural network functions modified from CASL, we have

# Data
X <- matrix(runif(1000, min = -1, max = 1), ncol = 1)
y <- X[, 1, drop = FALSE]^2 + rnorm(1000, sd = 0.1)
y[sample(1000, 20)] <- rnorm(20, mean = 2, sd = 0.1) # introduce outliers

# Neural network + prediction calculations
w_mse <- nn_sgd(X, y, sizes = c(1, 25, 1), epochs = 25, eta = 0.01)
w_mae <- nn_sgd(X, y, sizes = c(1, 25, 1), epochs = 25, eta = 0.01, f_p = util_mae_p)

y_pred_mse <- nn_predict(w_mse, X)
y_pred_mae <- nn_predict(w_mae, X)

# error calculation
sum((y_pred_mse - y)^2) / 1000
sum((y_pred_mae - y)^2) / 1000