knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE
)

packages <- c("aRlegislation", "tidyverse", "furrr", "parallel", "e1071", "scales", "caret", "ggplot2", "ggridges")
invisible(lapply(packages, require, character.only = TRUE)) # suppress the printed list of TRUEs
rm(packages)

options( scipen = 10 ) # print full numbers, not scientific notation
options(future.globals.maxSize = 3*1024*1024^2) # 3GB max, 500MB caused errors
# Sys.setenv("R_MAX_VSIZE"= 8e9) # has to be set by .Renviron
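# The future_pmap() calls below need a parallel backend registered; this is a
# minimal sketch (assumption: multisession workers, one per core minus one),
# since the original plan() call isn't shown in this script:
future::plan(future::multisession, workers = max(1, parallel::detectCores() - 1))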
lawmaker_models <- load_lawmaker_session_models(datadir = "./inst/extdata/", topics = 100:205)
temp_model <- lawmaker_models$lawmaker_model[[1]]
x <- temp_model[, grepl("^V\\d+|corruption", names(temp_model))] # keep topic columns (V1, V2, ...) plus the corruption target
y <- as.numeric(temp_model$corruption)
svmfit <- e1071::svm(corruption ~ ., data = x, kernel = "linear", cost = 500, scale = FALSE, cross = 5)

pred <- predict(svmfit, x, decision.values = TRUE)
cbind(pred, temp_model[,c(3:4)]) %>% arrange(pred)
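# Quick in-sample check of the baseline fit: cross-tabulate predicted vs.
# actual labels (rows are predictions, columns are actuals):
table(Prediction = pred, Actual = temp_model$corruption)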
# Grid search of SVM tuning parameters (iteration 0: broad search)
iter <- 0

set.seed(77)
grid_size <- 520 # Should be evenly divisible by splits
splits <- 26

if (grid_size %% splits != 0) stop("Grid size must be evenly divisible by number of splits.")

lawmaker_rows <- lawmaker_models$lawmaker_model[[1]] %>%
  select(sponsor_full_name, corruption) %>%
  mutate(train = row_number())

# Create different training splits that have representative sampling of the target variable (corruption)
split <- tibble() 

for (i in 1:splits) {
  train <- lawmaker_rows %>%
    group_by(corruption) %>%
    sample_frac(size = .8) %>%
    ungroup() %>%
    select(train) %>%
    unlist() 

  test <- setdiff(lawmaker_rows$train, train)

  split <- rbind(split, tibble(i = i, train = list(train), test = list(test)))
}
rm(i, train, test, lawmaker_rows)
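# Sanity check (a sketch, using the first split): because sampling is
# stratified by corruption, each training split should roughly preserve the
# class balance of the full data.
prop.table(table(lawmaker_models$lawmaker_model[[1]]$corruption[split$train[[1]]]))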

search_grid <- expand.grid(
  kernel = c("linear", "radial", "sigmoid", "polynomial"), 
  degree = 2:7,
  gamma = 10^seq(from = -5, to = .5, length.out = 50),
  cost = 10^seq(from = -5, to = 3, length.out = 50),
  nu = 10^seq(from = -6, to = -.6, length.out = 50),
  cross = 1:6,
  type = c("C-classification", "nu-classification")
) %>%
  as_tibble() %>%
#  distinct() %>%
  # If these mutate() calls go above the distinct() line, the linear kernel gets
  # undersampled: zeroing out unused parameters collapses many rows into duplicates.
  # In practice there are only a few duplicates, and once these values are joined
  # below, identical rows are unlikely.
  mutate(degree = ifelse(kernel == "polynomial", degree, 0)) %>%
  mutate(cost = ifelse(type == "C-classification", cost, 0)) %>%
  mutate(nu = ifelse(type == "nu-classification", nu, 0)) %>%
  mutate(gamma = ifelse(kernel != "linear", gamma, 1)) %>% 
  sample_n(grid_size)

class.weights <- expand.grid(
  corrupt = 10^seq(from = 0, to = 3, length.out = 20), 
  `not corrupt` = 10^seq(from = -3, to = 0, length.out = 20)
) %>% sample_n(grid_size, replace = TRUE)

# Convert each data frame row into a named numeric vector, stored in a one-column tibble
class.weights <- enframe(unlist(apply(class.weights, 1, list), recursive = FALSE), name = NULL)
names(class.weights) <- c("class.weights")
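# Each element of the class.weights column is now a named numeric vector, e.g.
# c(corrupt = 31.6, `not corrupt` = 0.03), which is the form e1071::svm()
# expects for its class.weights argument:
str(class.weights$class.weights[[1]])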

search_grid <- cbind(search_grid, class.weights)
rm(class.weights)

# Repeat the selected models to fill the grid so that each model is paired
# with several different train/test splits

temp_models <- tibble()
for (i in 1:(ceiling(grid_size / NROW(lawmaker_models)))) {
  temp_models <- rbind(temp_models, lawmaker_models)
}
temp_models <- head(temp_models, grid_size)
search_grid$topics <- temp_models$topics
search_grid$model <- temp_models$lawmaker_model

split_list <- tibble()

for (j in 1:splits) {
  temp <- tibble(
    split_group = rep(j, grid_size / splits), 
    train = rep(split$train[j], grid_size / splits),
    test = rep(split$test[j], grid_size / splits)
  )
  split_list <- rbind(split_list, temp)
}

rm(j, temp, split)

search_grid <- cbind(search_grid, split_list) %>% as_tibble()
rm(split_list, grid_size, splits)
# Remove model to save storage space; model can be reloaded and joined
model_grid <- search_grid %>%
  nest(svm_params = c(kernel, degree, cost, type, nu, cross, gamma)) %>%
  mutate(tmp.class.weights = map_chr(class.weights, ~ paste(.x, collapse = ", "))) %>%
  distinct(topics, tmp.class.weights, split_group, .keep_all = TRUE) %>%
  select(topics, split_group, train, test, class.weights) %>%
  as_tibble()

save(
  model_grid,
  file = paste("../data-raw/search-grids/model_grid_long ",
               gsub(":", "", Sys.time()),
               ".RData",
               sep = ""),
  envir = .GlobalEnv,
  compress = "bzip2"
) 
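# To resume from this grid later, it can be reloaded and the models rejoined by
# topic count (a sketch; the actual file name includes the timestamp above):
# load("../data-raw/search-grids/model_grid_long <timestamp>.RData")
# model_grid <- dplyr::left_join(model_grid, lawmaker_models, by = "topics")
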
# Grid search of SVM tuning parameters (iteration 1: refined search around the best-performing region)

set.seed(77)

iter <- 1

# Splits are dependent on individual runs, as they are randomly sampled.
# This reuses splits from the previous full run that worked well; search_grid_full
# is assumed to be the iteration-0 grid, reloaded from the save above.
# The split_group numbers need to be changed each time the full search grid is regenerated.
split_list <- search_grid_full %>%
  # Candidate groups that performed well: 4, 6, 13, 15, 18, 19, 22, 24
  filter(split_group %in% c(18)) %>%
  select(split_group, train, test) %>%
  distinct(split_group, .keep_all = TRUE)

splits <- NROW(split_list)

grid_size <- ceiling(1040 / splits) * splits  # Should be evenly divisible by splits
if (grid_size %% splits != 0) stop("Grid size must be evenly divisible by number of splits.")


search_grid <- expand.grid(
  kernel = c("linear"), 
  gamma = 10^seq(from = -0.825, to = .5, length.out = 50),
  cost = 10^seq(from = 0, to = 0.5, length.out = 50),
  nu = seq(from = 0.0275, to = 0.0305, length.out = 50),
  cross = 2:6,
  type = c("nu-classification")
) %>%
  as_tibble() %>%
#  distinct() %>%
  # Same ordering caveat as in the first grid: zeroing out unused parameters
  # before distinct() would collapse rows into duplicates.
  mutate(degree = 0) %>% # only the linear kernel is searched here, and degree is not a column of this expand.grid()
  mutate(cost = ifelse(type == "C-classification", cost, 0)) %>%
  mutate(nu = ifelse(type == "nu-classification", nu, 0)) %>%
  mutate(gamma = ifelse(kernel != "linear", gamma, 1)) %>% # every kernel here is linear, so gamma collapses to 1 (unused by linear SVMs)
  # filter(! (type == "nu-classification" & kernel == "sigmoid")) %>%
  # mutate(weight = ifelse(type == "nu-classification", 1, 0.5)) %>% # Only doing half the nu vs. C since removed a kernel, so need weighting for sample
  # sample_n(grid_size, weight = weight)
  sample_n(grid_size)

class.weights <- expand.grid(
  # corrupt = 10^seq(from = 0, to = 3, length.out = 20), 
  corrupt = 10^seq(from = 1.5, to = 3, length.out = 20), 
  # `not corrupt` = 10^seq(from = -3, to = 0, length.out = 20)
  `not corrupt` = 10^seq(from = -3, to = -1.5, length.out = 20)
) %>% sample_n(grid_size, replace = TRUE)

# Convert each data frame row into a named numeric vector, stored in a one-column tibble
class.weights <- enframe(unlist(apply(class.weights, 1, list), recursive = FALSE), name = NULL)
names(class.weights) <- c("class.weights")

search_grid <- cbind(search_grid, class.weights)
rm(class.weights)

# Repeat the selected models to fill the grid so that each model is paired
# with several different train/test splits

temp_models <- tibble()
filtered_models <- lawmaker_models %>%
  filter(topics >= 145 & topics <= 185)
for (i in 1:(ceiling(grid_size / NROW(filtered_models)))) {
  temp_models <- rbind(temp_models, filtered_models)
}
temp_models <- head(temp_models, grid_size)
search_grid$topics <- temp_models$topics
search_grid$model <- temp_models$lawmaker_model

search_grid <- cbind(search_grid, split_list) %>% as_tibble()
rm(split_list, grid_size, splits)
library(qwraps2)
options(qwraps2_markup = "markdown")

grid_summary <- search_grid %>% 
  select(-model, -train, -test) %>% 
  mutate(class.weights = map(class.weights, function(x) {data.frame(as.list(x))})) %>%
  unnest(class.weights) %>%
  rename(weight_corrupt = corrupt) %>%
  rename(weight_not_corrupt = not.corrupt)

grid_summary %>%
  select(.data$kernel, .data$degree, .data$gamma, .data$cost, .data$nu, .data$cross, .data$type, .data$weight_corrupt, .data$weight_not_corrupt, .data$split_group) %>%
  summary_table(.)
compute_svm <- function(model, kernel, gamma, cost, cross, degree, type, nu, train, class.weights, ...) {
  x <- model[, grepl("^V\\d+|corruption", names(model))] # keep topic columns plus the corruption target; drops legislator name and other metadata
  try({
    svmfit <- e1071::svm(
      corruption ~ ., 
      data = x, 
      kernel = kernel,
      gamma = gamma,
      cost = cost, 
      cross = cross, 
      degree = degree, 
      type = type, 
      nu = nu,
      subset = train, # fit on the training rows only so the test rows stay held out
      class.weights = class.weights,
      # na.action = na.pass # We set some values to NA that won't be used
      scale = FALSE
    )
  })
}
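
# Optional single-row smoke test (assumption: row 1 is representative) before
# the full parallel run; pmap() mirrors the future_pmap() call below but runs
# sequentially, so argument errors surface quickly:
smoke_test <- purrr::pmap(head(search_grid, 1), compute_svm)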

corruption_SVM <- search_grid %>%
  mutate(
    svm_model = future_pmap(., .f = compute_svm, .progress = TRUE)
  ) %>%
  mutate(
    topics = map_int(model, NCOL) - 4L # models also contain cycle, session, name, and target columns besides the topic columns
  ) %>%
  select(topics, everything()) %>%  
  as_tibble()
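
# Count failed fits (caught by try() in compute_svm) before evaluating predictions:
sum(map_lgl(corruption_SVM$svm_model, inherits, "try-error"))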
k <- 4 # pick one grid row to inspect by hand
test_set <- corruption_SVM$test[[k]]
train_set <- corruption_SVM$train[[k]]

temp_lawmaker_model <- corruption_SVM$model[[k]]
x <- temp_lawmaker_model[test_set,-1]
y <- x$corruption

temp_svm_model <- corruption_SVM$svm_model[[k]]

pred <- predict(temp_svm_model, x, decision.values = TRUE)

# Table for the train dataset
# table(predict(temp_svm_model, type = "class"), temp_lawmaker_model[train_set,c("corruption")], dnn = c("Prediction", "Actual"))

confusionMatrix(pred, y)

# Peek at highest-probability predictions
# cbind(pred, temp_lawmaker_model[test_set,c(1:2)]) %>% arrange(corruption)
# k <- 7
# test <- corruption_SVM$test[[k]]
# model <- corruption_SVM$model[[k]]
# svm_model <- corruption_SVM$svm_model[[k]]

predict_svm <- function(test, model, svm_model, ...) {
  try({
    x <- model[test, grepl("^V\\d+|corruption", names(model))]
    y <- x$corruption

    pred <- predict(svm_model, x)
    confusionMatrix(pred, y)
  })
}

corruption_SVM <- corruption_SVM %>%
  mutate(
    svm_predict = future_pmap(., predict_svm, .progress = TRUE)
  )
# k <- 7
# confM <- corruption_SVM$svm_predict[[k]]

select_metrics <- function(svm_predict, ...) {
  if (!inherits(svm_predict, "try-error")) {
    try({
      enframe(c(svm_predict$byClass, svm_predict$overall)) %>% 
        filter(name %in% c("Precision", "Recall", "F1")) %>% 
        pivot_wider()
    })
  } else {
    data.frame(list("Precision" = -1, "Recall" = -1, "F1" = -1))
  }
}
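
# Example: metrics for the k-th fit inspected above (k still set from the
# confusion-matrix check):
select_metrics(corruption_SVM$svm_predict[[k]])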

corruption_SVM <- corruption_SVM %>%
  mutate(
    svm_results = future_map(svm_predict, select_metrics, .progress = TRUE)
  ) %>%
  unnest(svm_results)
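
# Quick look at the score distribution across the whole grid:
summary(corruption_SVM$F1)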
# Save just the results and metrics (no model objects) so runs can be compared across the range of hyperparameters

SVM_results <- corruption_SVM %>%
  select(-svm_model, -svm_predict, -model) %>%
  as_tibble() %>%
  mutate(weight.ratio = map_dbl(class.weights, function(x) {
    round(x[[1]] / x[[2]], 3)
  })) %>%
  mutate(F1 = as.numeric(ifelse(is.na(F1), 0, F1)))

save(
  SVM_results,
  file = paste("../data-raw/results-scores/SVM_results_long_", iter, " ",
               gsub(":", "", Sys.time()),
               ".RData",
               sep = ""),
  envir = .GlobalEnv,
  compress = "bzip2"
)
# Models can be rejoined by number of topics, so drop them to save space
corruption_SVM <- corruption_SVM %>%
  select(-model) %>%
  filter(!is.na(F1))

save(
  corruption_SVM,
  file = paste("../data-raw/results-models/corruption_SVM ",
               gsub(":", "", Sys.time()),
               ".RData",
               sep = ""),
  envir = .GlobalEnv,
  compress = "bzip2"
)
height <- 3
width <- 6

SVM_results_filtered <- SVM_results %>%
  filter(!is.na(F1) & F1 >= 0)
  # filter(F1 > .1)

SVM_results_filtered %>%
  ggplot(aes(F1)) +
    theme_ridges() +
    geom_histogram()
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_histogram_", iter, ".png"), height = height, width = width)

SVM_results_filtered %>%
  ggplot(aes(topics, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_topics_", iter, ".png"), height = height, width = width)

SVM_results_filtered %>%
  ggplot(aes(kernel, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

SVM_results_filtered %>%
  ggplot(aes(gamma, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    scale_x_log10() +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

SVM_results_filtered %>%
  ggplot(aes(cost, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    scale_x_log10() +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

SVM_results_filtered %>%
  ggplot(aes(nu, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Nu",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_nu_", iter, ".png"), height = height, width = width)

SVM_results_filtered %>%
  ggplot(aes(cross, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Number of Crosses",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right"
    )
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_cross_", iter, ".png"), height = height, width = width)

SVM_results_filtered %>%
  ggplot(aes(type, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    theme(
      legend.position = "right"
    )

SVM_results_filtered %>%
  ggplot(aes(degree, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Degree",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )

SVM_results_filtered %>%
  ggplot(aes(split_group, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    theme_ridges() +
    labs(
      title = "Which train/test split?",
      subtitle = paste0("Iteration ", iter)
      ) +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_split_", iter, ".png"), height = height, width = width)

SVM_results_filtered %>%
  ggplot(aes(weight.ratio, F1, color = kernel, shape = type)) +
    geom_jitter() +
    scale_y_continuous(limits = c(0,1)) +
    scale_x_log10() +
    theme_ridges() +
    labs(
      title = "Imbalanced Class Weight Ratio",
      subtitle = paste0("Iteration ", iter),
      x = "weight ratio (log)", y = "F1 score") +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45)
    )
ggsave(paste0("../data-raw/results-scores/images/SVM_long_refined_weight_ratio_", iter, ".png"), height = height, width = width)

