```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
In class we used the LASSO to predict handwritten characters in the MNIST data set. Increase the out-of-sample prediction accuracy by extracting predictive features from the images.
```r
library(bis557)

data(mnist_train)
data(mnist_test)

# Use the first 1,000 observations from each set
Xtrain <- as.matrix(mnist_train[1:1000, -1])
Xtest  <- as.matrix(mnist_test[1:1000, -1])
ytrain <- mnist_train[1:1000, 1]
ytest  <- mnist_test[1:1000, 1]
```
Using the LASSO as-is on the raw pixel values, the out-of-sample prediction accuracy is:
```r
library(glmnet)

# Baseline: multinomial LASSO on the raw pixel values
outLm <- cv.glmnet(Xtrain, ytrain, alpha = 1, family = "multinomial")
predLm <- apply(predict(outLm, Xtest, s = outLm$lambda.min, type = "response"),
                1, which.max) - 1L
sum(predLm == ytest) / length(ytest)  # out-of-sample prediction accuracy
```
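One way to approach the exercise is to augment the raw pixels with image summaries and refit the multinomial LASSO. The sketch below is illustrative: the `extract_features` helper and the per-row and per-column mean intensities are assumptions of this example, not the features used in class.

```r
# Sketch: augment each image with its 28 row-mean and 28 column-mean
# intensities (56 extra features), then refit the multinomial LASSO.
extract_features <- function(X) {
  t(apply(X, 1, function(v) {
    img <- matrix(v, nrow = 28, ncol = 28)
    c(rowMeans(img), colMeans(img))
  }))
}

Xtrain_aug <- cbind(Xtrain, extract_features(Xtrain))
Xtest_aug  <- cbind(Xtest,  extract_features(Xtest))

outAug <- cv.glmnet(Xtrain_aug, ytrain, alpha = 1, family = "multinomial")
predAug <- apply(predict(outAug, Xtest_aug, s = outAug$lambda.min,
                         type = "response"), 1, which.max) - 1L
sum(predAug == ytest) / length(ytest)  # accuracy with augmented features
```

Other aggregates, such as counts of pixels above an intensity threshold or a downsampled copy of the image, could be substituted in the same way.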
Adjust the kernel size, and any other parameters you think are useful, in the convolutional neural network for EMNIST in Section 8.10.4. Can you improve on the classification rate?
```r
library(keras)
library(magrittr)
library(abind)

data(emnist)

x_train <- emnist$dataset[[1]][[1]] / 255
y_train <- emnist$dataset[[1]][[2]]
x_test  <- emnist$dataset[[2]][[1]] / 255
y_test  <- emnist$dataset[[2]][[2]]
z_map   <- emnist$dataset[[3]]  # map labels to ASCII codes

n <- 100000
X_train <- array(c(x_train[1:n, ]), dim = c(n, 28, 28, 1))
X_test  <- array(c(x_test[1:n, ]),  dim = c(n, 28, 28, 1))
X <- abind(X_train, X_test, along = 1)
y <- c(y_train, y_test)

# Model from Section 8.10.4
model <- keras_model_sequential()
model %>%
  layer_conv_2d(filters = 32, kernel_size = c(2, 2),
                input_shape = c(28, 28, 1), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_conv_2d(filters = 32, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_conv_2d(filters = 32, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_conv_2d(filters = 32, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_flatten() %>%
  layer_dense(units = 128) %>%
  layer_activation(activation = "relu") %>%
  layer_dense(units = 128) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 26) %>%
  layer_activation(activation = "softmax")

model %>% compile(loss = "categorical_crossentropy",
                  optimizer = optimizer_rmsprop(),
                  metrics = c("accuracy"))

# New model: more filters per convolutional layer
model2 <- keras_model_sequential()
model2 %>%
  layer_conv_2d(filters = 60, kernel_size = c(2, 2),
                input_shape = c(28, 28, 1), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_conv_2d(filters = 60, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_conv_2d(filters = 60, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_conv_2d(filters = 60, kernel_size = c(2, 2), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_flatten() %>%
  layer_dense(units = 128) %>%
  layer_activation(activation = "relu") %>%
  layer_dense(units = 128) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 26) %>%
  layer_activation(activation = "softmax")

model2 %>% compile(loss = "categorical_crossentropy",
                   optimizer = optimizer_rmsprop(),
                   metrics = c("accuracy"))

# Fit both models before predicting; the labels are assumed to be coded 0-25,
# and the epoch count is a placeholder choice
Y_train <- to_categorical(y_train[1:n], 26)
model  %>% fit(X_train, Y_train, epochs = 10)
model2 %>% fit(X_train, Y_train, epochs = 10)

# In-sample and out-of-sample classification rates
pred <- predict_classes(model, X)
mean(pred[1:n] == y[1:n])                          # training
mean(pred[(n + 1):(2 * n)] == y[(n + 1):(2 * n)])  # testing

pred2 <- predict_classes(model2, X)
mean(pred2[1:n] == y[1:n])                          # training
mean(pred2[(n + 1):(2 * n)] == y[(n + 1):(2 * n)])  # testing
```
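The kernel size itself can be varied through the `kernel_size` argument. The snippet below is only a sketch: it shows the first convolutional block rebuilt with 3 x 3 kernels and then goes straight to the output layer to stay short; the choice of 3 x 3 and the truncated architecture are illustrative assumptions, not settings from the text.

```r
# Sketch: first convolutional block with larger (3 x 3) kernels; the remaining
# blocks from model2 could be appended in the same way before the output layer.
model3 <- keras_model_sequential()
model3 %>%
  layer_conv_2d(filters = 60, kernel_size = c(3, 3),
                input_shape = c(28, 28, 1), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_conv_2d(filters = 60, kernel_size = c(3, 3), padding = "same") %>%
  layer_activation(activation = "relu") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_flatten() %>%
  layer_dense(units = 26) %>%
  layer_activation(activation = "softmax")

model3 %>% compile(loss = "categorical_crossentropy",
                   optimizer = optimizer_rmsprop(),
                   metrics = c("accuracy"))
```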
Write a function that uses mean absolute deviation as a loss function, instead of mean squared error. Test the use of this function with a simulation containing several outliers. How well do neural networks and SGD perform when using robust techniques?
The derivative of the new loss function with respect to the prediction is the sign of the residual, and it is implemented in `util_mae_p`.
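A minimal sketch of such a derivative function is shown below, assuming the CASL convention that the derivative takes the observed values `y` and the predictions `a` and returns the gradient with respect to `a`; the actual `util_mae_p` in the package may differ in details.

```r
# Sketch of a mean-absolute-deviation derivative, following the signature of
# CASL's casl_util_mse_p(y, a): the gradient of |a - y| with respect to a is
# the sign of the residual.
util_mae_p_sketch <- function(y, a) {
  sign(a - y)
}
```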
Using the neural network functions modified from CASL, we have:
```r
# Simulated data: a quadratic signal with a small fraction of outliers
X <- matrix(runif(1000, min = -1, max = 1), ncol = 1)
y <- X[, 1, drop = FALSE]^2 + rnorm(1000, sd = 0.1)
y[sample(1000, 20)] <- rnorm(20, mean = 2, sd = 0.1)  # introduce outliers

# Neural network + prediction calculations
w_mse <- nn_sgd(X, y, sizes = c(1, 25, 1), epochs = 25, eta = 0.01)
w_mae <- nn_sgd(X, y, sizes = c(1, 25, 1), epochs = 25, eta = 0.01,
                f_p = util_mae_p)
y_pred_mse <- nn_predict(w_mse, X)
y_pred_mae <- nn_predict(w_mae, X)

# Error calculation (mean squared error on the full, contaminated data)
sum((y_pred_mse - y)^2) / 1000
sum((y_pred_mae - y)^2) / 1000
```
When evaluated by mean squared error on the contaminated data, the network trained with the MSE loss substantially outperformed the one trained with the MAE loss in this procedure.