knitr::opts_chunk$set(
    echo = FALSE,
    fig.align = "center",
    fig.height = 7,
    fig.width = 10,
    cache = TRUE
)
library(MIOwAD)
library(ggplot2)
library(dplyr)

Loading data and splitting into train and test

I've loaded four datasets and splitted it into train and test datasets.

dsets <- c("regression/steps-large", 
           "regression/multimodal-large", 
           "classification/rings3-regular", 
           "classification/rings5-regular")
d_names <- c("steps", "multimodal", "rings3", "rings5")

# load datasets
dat <- setNames(lapply(dsets, function(dset) {
  list(train = read.csv(paste0("../../data/", dset, "-training.csv")),
       test = read.csv(paste0("../../data/", dset, "-test.csv")))
}), d_names)

# split datasets into X, y, train and test
X <- c(
  setNames(lapply(d_names[1:2], function(dset) {
    list(train = scale(as.matrix(dat[[dset]][["train"]])[, 2, drop = FALSE]),
         test = scale(as.matrix(dat[[dset]][["test"]])[, 2, drop = FALSE]))
  }), d_names[1:2]),
  setNames(lapply(d_names[3:4], function(dset) {
    list(train = scale(as.matrix(dat[[dset]][["train"]])[, 1:2, drop = FALSE]),
         test = scale(as.matrix(dat[[dset]][["test"]])[, 1:2, drop = FALSE]))
  }), d_names[3:4])
)

y <- c(
  setNames(lapply(d_names[1:2], function(dset) {
    list(train = scale(as.matrix(dat[[dset]][["train"]])[, 3, drop = FALSE]),
         test = scale(as.matrix(dat[[dset]][["test"]])[, 3, drop = FALSE]))
  }), d_names[1:2]),
  setNames(lapply(d_names[3:4], function(dset) {
    list(train = as.matrix(dat[[dset]][["train"]])[, 3, drop = FALSE],
         test = as.matrix(dat[[dset]][["test"]])[, 3, drop = FALSE])
  }), d_names[3:4])
)

# one hot encoded values
y_ohe <- list(
  rings3 =  
    list(train = do.call(cbind, 
                         lapply(0:2, function(i) 
                           ifelse(as.matrix(dat[["rings3"]][["train"]])[, 3, drop = FALSE] == i, 1, 0))),
         test = do.call(cbind, 
                        lapply(0:2, function(i) 
                          ifelse(as.matrix(dat[["rings3"]][["test"]])[, 3, drop = FALSE] == i, 1, 0)))),
  rings5 = 
    list(train = do.call(cbind, 
                         lapply(0:2, function(i) 
                           ifelse(as.matrix(dat[["rings5"]][["train"]])[, 3, drop = FALSE] == i, 1, 0))),
         test = do.call(cbind, 
                        lapply(0:2, function(i) 
                          ifelse(as.matrix(dat[["rings5"]][["test"]])[, 3, drop = FALSE] == i, 1, 0))))
)

Prepraring experiment

For each dataset I've chosen: - four possible activation functions (linear, sigmoid, relu, tanh), - three possible numbers of hidden layers (one, two, three), - three possible sizes of layers (3, 5, 10).

Next, I've trained one network for each dataset for each combination of those parameters (totalling $4 \times4 \times 3 \times 3 = 144$ networks). I've used momentum training with some default values (linear activation of last layer for regression, softmax for classification, using momentum optimizer, $\eta = 10^{-5}$ for regression and $\eta = 10^{-7}$ for classification).

# prepare lists of input and output layers suitable for datasets
first_layer <- list( 
  steps = neural_network(1),
  multimodal = neural_network(1),
  rings3 = neural_network(2),
  rings5 = neural_network(2)
)

last_layer <- list(
  steps = output_layer(1, "linear"),
  multimodal = output_layer(1, "linear"),
  rings3 = output_layer(3, "softmax"),
  rings5 = output_layer(3, "softmax")
)

# set possible parameters
activations <- c("linear", "sigmoid", "relu", "tanh")
n_layers <- c(1:3)
s_layers <- c(3, 5, 10)
s_layers_names <- c("small", "medium", "big")

# create networks
networks <- setNames(lapply(d_names, function(dset) 
  setNames(lapply(activations, function(activation)
    setNames(lapply(s_layers, function(s)
      lapply(n_layers, function(n)
          switch (n,
                  "1" = first_layer[[dset]] +       # for each dataset, activation, size and n - create net
                    hidden_layer(s, activation),
                  "2" = first_layer[[dset]] +
                    hidden_layer(s, activation) +
                    hidden_layer(s, activation),
                  "3" = first_layer[[dset]] +
                    hidden_layer(s, activation) +
                    hidden_layer(s, activation) +
                    hidden_layer(s, activation)
          ) +
          last_layer[[dset]]
      )
    ), s_layers_names)), 
    activations)), 
  d_names)
# set seed (for randomizing weights)
set.seed(1998)

trained <- setNames(lapply(d_names, function(dset) 
  setNames(lapply(activations, function(activation)
    setNames(lapply(s_layers_names, function(s)
      lapply(n_layers, function(n){
        cat("dataset:", dset, "activation:", activation, "s_layer:", s, "n_layers:", n)
          networks[[dset]][[activation]][[s]][[n]] %>%
            randomize_weights() %>%
            train_network_momentum(X[[dset]][["train"]],  # training network with suitable parameters
                                   if (dset %in% d_names[3:4]) 
                                     y_ohe[[dset]][["train"]] 
                                   else
                                     y[[dset]][["train"]],
                                   num_epochs = 500, 
                                   eta = if (dset %in% d_names[3:4])
                                     1e-5
                                   else
                                     1e-7)
      })
    ), s_layers_names)), 
    activations)), 
  d_names)

Convergence pace

# creating data frame with information on convergence:
conv_data <- do.call(rbind, lapply(d_names, function(dset) 
  cbind(dataset = dset, do.call(rbind, lapply(activations, function(activation) 
    cbind(activation = activation, do.call(rbind, lapply(s_layers_names, function(s)
      cbind(size = s, do.call(rbind, lapply(n_layers, function(n)
        data.frame(n_layers = n, epoch = 1:500, mse = unlist(trained[[dset]][[activation]][[s]][[n]][["training_history"]])))))))))))))

ggplot(data = conv_data %>% filter(dataset == "steps", mse < 1000), aes(x = epoch, y = mse, color = activation)) +
  geom_line() +
  facet_grid(n_layers ~ size) +
  scale_y_log10() +
  ggtitle("pace of convergence (mse in time) on dataset 'steps'", "by numer of layers, size of layers and activation function") +
  theme_bw() +
  theme(legend.position="bottom")

ggplot(data = conv_data %>% filter(dataset == "multimodal"), aes(x = epoch, y = mse, color = activation)) +
  geom_line() +
  facet_grid(n_layers ~ size, scales = "free_y") +
  scale_y_log10() +
  ggtitle("pace of convergence (mse in time) on dataset 'multimodal'", "by numer of layers, size of layers and activation function") +
  theme_bw() +
  theme(legend.position="bottom")

ggplot(data = conv_data %>% filter(dataset == "rings3"), aes(x = epoch, y = mse, color = activation)) +
  geom_line() +
  facet_grid(n_layers ~ size, scales = "free_y") +
  scale_y_log10() +
  ggtitle("pace of convergence (mse in time) on dataset 'rings3'", "by numer of layers, size of layers and activation function") +
  theme_bw()+
  theme(legend.position="bottom")

ggplot(data = conv_data %>% filter(dataset == "rings5"), aes(x = epoch, y = mse, color = activation)) +
  geom_line() +
  facet_grid(n_layers ~ size, scales = "free_y") +
  scale_y_log10() +
  ggtitle("pace of convergence (mse in time) on dataset 'rings5'", "by numer of layers, size of layers and activation function") +
  theme_bw()+
  theme(legend.position="bottom")

We can draw some conclusions from those plots:

Goodness of fit

# applying network to training and testing data 
fitted <- setNames(lapply(d_names, function(dset) 
  setNames(lapply(activations, function(activation)
    setNames(lapply(s_layers_names, function(s)
      lapply(n_layers, function(n){
        if (dset %in% d_names[1:2])
          list(train = trained[[dset]][[activation]][[s]][[n]] %>%
                 mse(X[[dset]][["train"]], y[[dset]][["train"]]),
               test = trained[[dset]][[activation]][[s]][[n]] %>%
                 mse(X[[dset]][["test"]], y[[dset]][["test"]]))
        else
          list(train = trained[[dset]][[activation]][[s]][[n]] %>%
                 accuracy(X[[dset]][["train"]], y[[dset]][["train"]]),
               test = trained[[dset]][[activation]][[s]][[n]] %>%
                 accuracy(X[[dset]][["test"]], y[[dset]][["test"]]))
      })
    ), s_layers_names)), 
    activations)), 
  d_names)

score_data <- do.call(rbind, lapply(d_names, function(dset) 
  cbind(dataset = dset, do.call(rbind, lapply(activations, function(activation) 
    cbind(activation = activation, do.call(rbind, lapply(s_layers_names, function(s)
      cbind(size = s, do.call(rbind, lapply(n_layers, function(n)
        data.frame(n_layers = c(n, n), set = c("train", "test"), 
                   value = c(fitted[[dset]][[activation]][[s]][[n]][["train"]],
                             fitted[[dset]][[activation]][[s]][[n]][["test"]]
                   )))))))))))))

ggplot(score_data %>% filter(dataset == "steps"), aes(y = value, x = activation, fill = set)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_wrap(n_layers ~ size) + 
  ggtitle("mse of network on dataset 'steps'", "by numer of layers, size of layers and activation function")+
  theme_bw()+
  theme(legend.position="bottom")

ggplot(score_data %>% filter(dataset == "multimodal"), aes(y = value, x = activation, fill = set)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_wrap(n_layers ~ size) + 
  ggtitle("mse of network on dataset 'multimodal'", "by numer of layers, size of layers and activation function")+
  theme_bw()+
  theme(legend.position="bottom")

ggplot(score_data %>% filter(dataset == "rings3"), aes(y = value, x = activation, fill = set)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_wrap(n_layers ~ size) + 
  ggtitle("accuracy of network on dataset 'rings3'", "by numer of layers, size of layers and activation function")+
  theme_bw()+
  theme(legend.position="bottom")

ggplot(score_data %>% filter(dataset == "rings5"), aes(y = value, x = activation, fill = set)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_wrap(n_layers ~ size) + 
  ggtitle("accuracy of network on dataset 'rings5'", "by numer of layers, size of layers and activation function")+
  theme_bw()+
  theme(legend.position="bottom")

Caution! In plots regarding classification, results on the test set are sometimes better than on the training set -- this may be caused by the fact that networks are way undertrained.

The main conclusion is: result of the training heavilly depends on the dataset and parameters.



DominikRafacz/MIOwAD documentation built on May 13, 2020, 9:41 a.m.