knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE
)

# anaconda, anaconda/bin, and anaconda/lib were added to the .Renviron path, so the following line isn't necessary
# reticulate::use_condaenv(conda = "/usr/local/anaconda3/bin/conda") # Point reticulate to anaconda installation

# Tensorflow installed using keras::install_keras() to create an environment named "r-reticulate"
# Uses python 3.6, not 3.7
reticulate::use_condaenv(condaenv = "r-reticulate", conda = "/usr/local/anaconda3/bin/conda", required = TRUE)

packages <- c("aRlegislation", "tidyverse", "furrr", "parallel", "e1071", "scales", "caret", "ggplot2", "ggridges", "keras", "tensorflow")
lapply(packages, require, character.only = TRUE)
rm(packages)

reticulate::use_condaenv(condaenv = "r-reticulate", conda = "/usr/local/anaconda3/bin/conda", required = TRUE)

options( scipen = 10 ) # print full numbers, not scientific notation
options(future.globals.maxSize = 3*1024*1024^2) # 3GB max, 500MB caused errors
# Load topic distributions from storage
dir.files <- list.files("../data-raw/results", "Rdata", full.names = TRUE, ignore.case = TRUE)
dir.file.info <- file.info(dir.files)
dir.file.info$name <- dir.files

newest.file <- dir.file.info %>%
  arrange(desc(mtime)) %>%
  filter(size > 0 & grepl("model_grid_long", name)) %>%
  top_n(1, mtime) %>%
  select(name)

load(newest.file$name)
rm(dir.file.info, newest.file, dir.files)

lawmaker_models <- load_lawmaker_session_models(topics = 75:250)

# We'll load the model grid and lawmaker_models separately to save space, then join them here
model_grid <- model_grid %>%
  left_join(lawmaker_models %>% select(topics, lawmaker_model), by = "topics") %>%
  rename(model = lawmaker_model)

# > names(model_grid)
# [1] "topics"        "model"         "split_group"   "train"         "test"          "class.weights"

if (! all(c("topics", "model", "split_group", "train", "test", "class.weights") %in% names(model_grid))) {
  stop(
    paste(
      "Missing information from the model grid. Please provide a dataframe that has the following columns:",
      paste(c("topics", "model", "split_group", "train", "test", "class.weights"), collapse = ", ")
    )
  )
}
# Keras uses parallelism by default
# However, we do want to run post-training calculations in parallel

# Some Keras settings can be manipulated to improve CPU usage
# We saw only ~1/3 CPU usage on the first run through the many-model calculations
# Perhaps try the approach at the link below to manually set parallelism options
# https://stackoverflow.com/questions/44591138/making-neural-network-training-reproducible-using-rstudios-keras-interface
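# A minimal sketch (not run) of manually capping TensorFlow's thread pools, based on
# the approach in the Stack Overflow link above. This assumes a TF 1.x-style session
# is available through reticulate (tf$ConfigProto / keras::k_set_session); the thread
# counts are placeholders, not tuned values.
# library(tensorflow)
# session_conf <- tf$ConfigProto(
#   intra_op_parallelism_threads = 4L,  # threads used within a single op
#   inter_op_parallelism_threads = 4L   # threads used across independent ops
# )
# keras::k_set_session(tf$Session(config = session_conf))
# Alternatively, keras::use_session_with_seed() sets a seed and disables GPU/CPU
# parallelism in one call, which also makes training runs reproducible.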

# ---- Create parallel cluster ---------------------
## Single System
# my.cores <- parallel::detectCores() - 1
# parallelCluster <- parallel::makeCluster(parallel::detectCores())
# or
plan(multiprocess)
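# furrr::future_pmap() below uses this plan for the parallel post-training score calculations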

## HPC
# my.cores <- as.numeric(Sys.getenv("PBS_NP")) - 1
# snow::setDefaultClusterOptions(outfile = "./output/slave.out")
# parallelCluster <- snow::makeMPIcluster(my.cores, verbose = TRUE)
# Setting eval = T ensures that tensorflow is loaded

model_no <- sample(1:NROW(model_grid), size = 1)

temp_model <- model_grid$model[[model_no]]
train <- model_grid$train[[model_no]]
test <- model_grid$test[[model_no]]
class.weights <- model_grid$class.weights[[model_no]]
batch_size <- round(length(train)^0.25)
learning_rate <- 10^-6

train_x <- as.matrix(temp_model[train, grepl("^V\\d+", names(temp_model))])
test_x <- as.matrix(temp_model[test, grepl("^V\\d+", names(temp_model))])

train_y <- to_categorical(as.numeric(ifelse(temp_model[train, c("corruption")] == "corrupt", 1, 0)))
truth <- as.numeric(ifelse(temp_model[test, c("corruption")] == "corrupt", 1, 0))
test_y <-  to_categorical(truth)
test_weights <- as.numeric(ifelse(temp_model[test, c("corruption")] == "corrupt", class.weights[1], class.weights[2]))
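# test_weights is passed below as the third element of validation_data, where Keras
# treats it as per-sample weights when evaluating on the held-out set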

nn_model <- keras_model_sequential() %>%
      layer_dense(units = 50, activation = "relu", input_shape = NCOL(train_x)) %>%
      layer_dense(units = 67, activation = "relu", kernel_initializer = "he_normal") %>%
      layer_dense(units = 33, activation = "relu", kernel_initializer = "he_normal") %>%
      layer_dense(units = 11, activation = "relu", kernel_initializer = "he_normal") %>%
      layer_dense(units = 2, activation = "sigmoid") %>%
      compile(
        loss = 'binary_crossentropy',
        optimizer = optimizer_rmsprop(lr = learning_rate),
        metrics = c('accuracy')
      )

fit <- nn_model %>%
  fit(
    x = train_x,
    y = train_y,
    validation_data = list(test_x, test_y, test_weights),
    epochs = 50,
    batch_size = batch_size,
    verbose = T
  )

# plot(fit)

pred <- nn_model %>% predict_classes(test_x)

# Make the vectors factors so if no predicted values are 1, returns a contingency table anyway
confusion <- table(pred = factor(pred, levels = c(0,1)), truth = factor(truth, levels = c(0,1)))
TP <- confusion[2,2]
FP <- confusion[2,1]
TN <- confusion[1,1]
FN <- confusion[1,2]

precision <- TP / (TP + FP)
recall <- TP / (TP + FN)

F1 <- 2 * precision * recall / (precision + recall)
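
# Optional sanity check (caret is loaded above): confusionMatrix() with mode = "prec_recall"
# reports precision, recall, and F1 for the positive class, which should agree with the
# manual calculations above (F1 = 2 * P * R / (P + R)).
# caret::confusionMatrix(
#   data = factor(pred, levels = c(0, 1)),
#   reference = factor(truth, levels = c(0, 1)),
#   positive = "1",
#   mode = "prec_recall"
# )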
iter = 0

# Different numbers of hidden layers are handled with a for loop that makes repeated calls to layer_dense() (see compute_nn() below)
grid_size <- NROW(model_grid) * 1

if (grid_size %% NROW(model_grid) != 0) stop(paste0("Grid size must be a multiple of ", NROW(model_grid), ", the number of rows in the model grid."))

layers <- expand_grid(
  hidden_layers = 1:7,
  layer_1_neurons = seq.int(from = 50, to = 170, length.out = 5),
  layer_k_neurons = seq.int(from = 4, to = 16, length.out = 5)
) %>% bind_rows(expand_grid(
  hidden_layers = 1:7,
  layer_1_neurons = seq.int(from = 50, to = 170, length.out = 5),
  layer_k_neurons = -1
)) %>%
  mutate(layer_k_neurons = ifelse(layer_k_neurons == -1, layer_1_neurons, layer_k_neurons))
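# The grid above spans 1-7 hidden layers, interpolating neuron counts from layer_1_neurons
# down to layer_k_neurons (see generate_layers() below); the rows added with the -1 sentinel
# become constant-width architectures (every hidden layer as wide as the first) after the mutate()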

# layer_1_neurons <- 170
# layer_k_neurons <- 4
# hidden_layers <- 1

generate_layers <- function(layer_1_neurons, layer_k_neurons, hidden_layers) {
  round(seq(from = layer_1_neurons, to = layer_k_neurons, length.out = hidden_layers))
}
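# e.g. generate_layers(170, 4, 3) returns c(170, 87, 4): hidden-layer widths spaced
# evenly from the first layer down to the last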

layers <- layers %>%
  mutate(
    neurons = pmap(., .f = generate_layers)
  )


nn_params <- expand.grid(
  layers = layers$neurons,
  loss = c("binary_crossentropy", "categorical_crossentropy"),
  learning_rate = 10^seq(from = -5, to = 0, length.out = 5),
  dropout = 10^seq(from = -1.5, to = -0.31, length.out = 5),
  # regularizer = c("regularizer_l1", "regularizer_l2"),
  # decay = 10^seq(from = -6, to = -1, length.out = 20),
  optimizer = c("optimizer_adam", "optimizer_rmsprop", "optimizer_adagrad", "optimizer_adadelta", "optimizer_sgd"),
  kernel_initializer = c("he_uniform", "he_normal", "glorot_uniform", "glorot_normal"),
  epochs = seq.int(from = 25, to = 500, length.out = 5),
  batch_size = seq.int(from = 1, to = 125, length.out = 5),
  activation = c("sigmoid", "tanh", "relu", "elu")
) %>%
  filter(! (grepl("^he", kernel_initializer) & activation %in% c("sigmoid", "tanh"))) %>%
  filter(! (grepl("^glorot", kernel_initializer) & activation %in% c("relu", "elu"))) %>%
  as_tibble() %>%
  sample_n(grid_size)
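# The filters above drop He initializers paired with sigmoid/tanh and Glorot initializers
# paired with relu/elu, since He initialization is designed for rectified activations and
# Glorot for saturating ones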

temp_grid <- tibble()
for (i in 1:(grid_size / NROW(model_grid))) {
  temp_grid <- rbind(temp_grid, model_grid)
}

search_grid <- cbind(nn_params, temp_grid) %>% as_tibble()

rm(temp_grid, nn_params, layers, i, grid_size)
iter = 1
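# Second search pass: the hyperparameter ranges below are narrowed, presumably around the
# better-performing settings from the broad first pass (iteration 0)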

# Different numbers of hidden layers are handled with a for loop that makes repeated calls to layer_dense() (see compute_nn() below)
grid_size <- NROW(model_grid) * 1

if (grid_size %% NROW(model_grid) != 0) stop(paste0("Grid size must be a multiple of ", NROW(model_grid), ", the number of rows in the model grid."))

layers <- expand_grid(
  hidden_layers = 1:7,
  layer_1_neurons = seq.int(from = 50, to = 170, length.out = 5),
  layer_k_neurons = seq.int(from = 4, to = 16, length.out = 5)
)

# layer_1_neurons <- 170
# layer_k_neurons <- 4
# hidden_layers <- 1

generate_layers <- function(layer_1_neurons, layer_k_neurons, hidden_layers) {
  round(seq(from = layer_1_neurons, to = layer_k_neurons, length.out = hidden_layers))
}

layers <- layers %>%
  mutate(
    neurons = pmap(., .f = generate_layers)
  )


nn_params <- expand.grid(
  layers = layers$neurons,
  loss = c("categorical_crossentropy"),
  learning_rate = 10^seq(from = -3.325, to = -2.8, length.out = 20),
  dropout = 10^seq(from = -1.5, to = -0.9, length.out = 20),
  # regularizer = c("regularizer_l1", "regularizer_l2"),
  # decay = 10^seq(from = -6, to = -1, length.out = 20),
  optimizer = c("optimizer_adam", "optimizer_sgd"),
  kernel_initializer = c("he_normal"),
  epochs = seq.int(from = 200, to = 500, length.out = 10),
  batch_size = seq.int(from = 20, to = 50, length.out = 10),
  activation = c("relu")
) %>%
  # filter(! (grepl("^he", kernel_initializer) & activation %in% c("sigmoid", "tanh"))) %>%
  # filter(! (grepl("^glorot", kernel_initializer) & activation %in% c("relu", "elu"))) %>%
  as_tibble() %>%
  sample_n(grid_size)

temp_grid <- tibble()
for (i in 1:(grid_size / NROW(model_grid))) {
  temp_grid <- rbind(temp_grid, model_grid)
}

compute_class_ratio <- function(x) {
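  # Ratio of the first class weight to the second; used below to restrict the search
  # grid to runs whose class imbalance falls in a plausible range (1,000 to 50,000)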
  names(x) <- NULL
  x[1] / x[2]
}

search_grid <- cbind(nn_params, temp_grid) %>% as_tibble() %>%
  mutate(class.ratio = map(class.weights, compute_class_ratio)) %>%
  unnest(class.ratio) %>%
  filter(class.ratio >= 1000 & class.ratio <= 50000)

rm(temp_grid, nn_params, layers, i, grid_size)
# search_grid now holds the nn hyperparameters (layers, loss, learning_rate, dropout, optimizer,
# kernel_initializer, epochs, batch_size, activation) alongside the model_grid columns
# (topics, model, split_group, train, test, class.weights) plus the derived class.ratio

# model <- search_grid$model[[1]]
# train <- search_grid$train[[1]]
# test <- search_grid$test[[1]]
# class.weights <- search_grid$class.weights[[1]]
# layers <- search_grid$layers[[1]]
# loss <- search_grid$loss[[1]]
# optimizer <- search_grid$optimizer[[1]]
# epochs <- search_grid$epochs[[1]]
# batch_size <- search_grid$batch_size[[1]]
# activation <- search_grid$activation[[1]]
# kernel_initializer <- search_grid$kernel_initializer[[1]]
# learning_rate <- search_grid$learning_rate[[1]]
# dropout <- search_grid$dropout[[1]]


compute_nn <- function(model, train, test, class.weights, layers, loss, 
                       optimizer, epochs, batch_size, activation, 
                       kernel_initializer, learning_rate,  
                       dropout, ...) {

  pb$tick()$print()

  try({

    train_x <- as.matrix(model[train, grepl("^V\\d+", names(model))])
    test_x <- as.matrix(model[test, grepl("^V\\d+", names(model))])

    train_y <- to_categorical(as.numeric(ifelse(model[train, c("corruption")] == "corrupt", 1, 0)))
    truth <- as.numeric(ifelse(model[test, c("corruption")] == "corrupt", 1, 0))
    test_y <-  to_categorical(truth)
    test_weights <- as.numeric(ifelse(model[test, c("corruption")] == "corrupt", class.weights[1], class.weights[2]))

    nn_model <- keras_model_sequential() %>%
      layer_dense(
        units = layers[1], 
        activation = activation, 
        kernel_initializer = kernel_initializer,
        # kernel_regularizer = call(as.character(regularizer), l = decay), 
        input_shape = NCOL(train_x)
      ) %>%
      layer_dropout(rate = dropout) %>%
      layer_batch_normalization() 

    # Dynamically generate hidden layers
    if (length(layers) > 1) { 
      for (i in layers[-1]) { # Drop first element, as it was already added in the preceding step
        nn_model %>%
          layer_dense(
            units = i, 
            activation = activation, 
            kernel_initializer = kernel_initializer
            # kernel_regularizer = call(as.character(regularizer), l = decay)
          ) %>%
          layer_dropout(rate = dropout) %>%
          layer_batch_normalization()
      }
    }

    nn_model %>%
      layer_dense(units = 2, activation = "softmax") %>%
      compile(
        loss = loss,
        optimizer = eval(call(as.character(optimizer), lr = learning_rate)),
        metrics = c('accuracy')
      )

    fit <- nn_model %>%
      keras::fit(
        x = train_x,
        y = train_y,
        validation_data = list(test_x, test_y, test_weights),
        epochs = epochs,
        batch_size = batch_size,
        verbose = F
      )

    serialize_model(nn_model, include_optimizer = TRUE)
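    # serialize_model() converts the fitted Keras model (an external pointer) to a raw
    # vector so it can be stored in a list column and saved; unserialize_model() restores
    # it later for prediction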
  })
}

sample_size <- 50
pb <- dplyr::progress_estimated(sample_size)

corruption_NN <- search_grid[sample(1:NROW(search_grid), sample_size, replace = FALSE),] %>%
  mutate(nn_model = pmap(., compute_nn)) %>%
  as_tibble()
predict_nn <- function(nn_model, model, test, ...) {

  pb$tick()$print()
  try({
    test_x <- as.matrix(model[test, grepl("^V\\d+", names(model))])
    fit <- unserialize_model(nn_model, custom_objects = NULL, compile = TRUE)
    fit %>% predict_classes(test_x)
  })
}

pb <- dplyr::progress_estimated(NROW(corruption_NN))

corruption_NN <- corruption_NN %>%
  mutate(predict = pmap(., predict_nn)) %>%
  as_tibble()
nn_scores <- function(predict, model, test, ...) {


  if (!inherits(predict, "try-error")) {

    truth <- as.numeric(ifelse(model[test, c("corruption")] == "corrupt", 1, 0))

    # Make the vectors factors so if no predicted values are 1, returns a contingency table anyway
    confusion <- table(pred = factor(predict, levels = c(0,1)), truth = factor(truth, levels = c(0,1)))
    TP <- confusion[2,2]
    FP <- confusion[2,1]
    TN <- confusion[1,1]
    FN <- confusion[1,2]

    precision <- TP / (TP + FP)
    recall <- TP / (TP + FN)
    F1 <- 2 * precision * recall / (precision + recall)

    enframe(c(Precision = precision, Recall = recall, F1 = F1)) %>% 
      pivot_wider()

  } else {
    data.frame(list("Precision" = -1, "Recall" = -1, "F1" = -1))
  }
}

corruption_NN <- corruption_NN %>%
  mutate(nn_results = future_pmap(., nn_scores, .progress = TRUE)) %>%
  unnest(nn_results) %>%
  as_tibble()
save(
  corruption_NN,
  file = paste("../data-raw/results-models/corruption_NN_long ",
               gsub(":", "", Sys.time()),
               ".RData",
               sep = ""),
  envir = .GlobalEnv,
  compress = "bzip2"
)
# Saving just the result with metrics so we can compare across a range of hyperparameters

NN_results <- corruption_NN %>%
  select(-nn_model, -predict, -model) %>%
  # select(-model) %>%
  as_tibble() %>%
  mutate(weight.ratio = map(class.weights, function(x) {
    round(x[[1]] / x[[2]], 3)
  })) %>%
  mutate(F1 = as.numeric(ifelse(is.na(F1), 0, F1)))

NN_results$weight.ratio <- unlist(NN_results$weight.ratio)

save(
  NN_results,
  file = paste("../data-raw/results-scores/NN_results_long_", iter, " ",
  # file = paste("../data-raw/results-models/NN_results_long_nn_model_only_", iter, " ",
               gsub(":", "", Sys.time()),
               ".RData",
               sep = ""),
  envir = .GlobalEnv,
  compress = "bzip2"
)
height <- 3
width <- 6

NN_results_filtered <- NN_results %>%
  filter(!is.na(F1) & F1 >=0) %>%
  mutate(layers_num = map(layers, length)) %>%
  unnest(layers_num) %>%
  mutate(layers_max = map(layers, max)) %>%
  unnest(layers_max) %>%
  mutate(layers_min = map(layers, min)) %>%
  unnest(layers_min) %>%
  mutate(layer_size_diff = (layers_max - layers_min) / layers_num) %>%
  as_tibble()
  # filter(F1 > .1)

NN_results_filtered %>%
  ggplot(aes(F1)) +
    theme_ridges() +
    geom_histogram()

NN_results_filtered %>%
  ggplot(aes(topics, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_topics_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(activation, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

NN_results_filtered %>%
  ggplot(aes(learning_rate, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    scale_x_log10() +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

NN_results_filtered %>%
  ggplot(aes(kernel_initializer, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Kernel Initializer",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_kernel_initializer_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(dropout, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Dropout Rate",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_dropout_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(epochs, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

NN_results_filtered %>%
  ggplot(aes(split_group, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Which train/test split?",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_split_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(batch_size, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Batch size",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_batch_size_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(optimizer, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Optimizer",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_optimizer_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(weight.ratio, F1, color = activation, shape = loss)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    scale_x_log10() +
    theme_ridges() +
    labs(
      title = "Imbalanced Class Weight Ratio",
      subtitle = paste0("Iteration ", iter),
      x = "weight ratio (log)", y = "F1 score") +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_weight_ratio_", iter, ".png"), height = height, width = width)

NN_results_filtered %>%
  ggplot(aes(layers_num, F1, color = activation, shape = loss, size = layer_size_diff)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Number of Hidden Layers",
      subtitle = paste0("Iteration ", iter)
    ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/NN_long_layers_", iter, ".png"), height = height, width = width)

