knitr::opts_chunk$set( echo = TRUE, cache = TRUE ) packages <- c("aRlegislation", "tidyverse", "furrr", "parallel", "e1071", "scales", "caret", "ggplot2", "ggridges", "RColorBrewer") lapply(packages, require, character.only = TRUE) rm(packages) options( scipen = 10 ) # print full numbers, not scientific notation options(future.globals.maxSize = 3*1024*1024^2) # 3GB max, 500MB caused errors # Sys.setenv("R_MAX_VSIZE"= 8e9) # has to be set by .Renviron
lawmaker_models <- load_lawmaker_models(datadir = "./inst/extdata/", topics = 100:205)
temp_model <- temp_model <- lawmaker_models$lawmaker_model[[4]] x <- temp_model[-1] y <- as.numeric(x$corruption) svmfit <- e1071::svm(corruption ~ ., data = x, kernel = "linear", cost = 500, scale = FALSE, cross = 5) pred <- predict(svmfit, x, decision.values = TRUE) cbind(pred, temp_model[,c(1:2)]) %>% arrange(pred)
# grid search of svm tuning variables set.seed(77) grid_size <- 1040 # Should be evenly divisible by splits splits <- 26 if (grid_size %% splits != 0) stop("Grid size must be evenly divisible by number of splits.") lawmaker_rows <- lawmaker_models$lawmaker_model[[1]] %>% select(sponsor_full_name, corruption) %>% mutate(train = row_number()) # Create different training splits that have representative sampling of the target variable (corruption) split <- tibble() for (i in 1:splits) { train <- lawmaker_rows %>% group_by(corruption) %>% sample_frac(size = .8) %>% ungroup() %>% select(train) %>% unlist() test <- setdiff(lawmaker_rows$train, train) split <- rbind(split, tibble(i = i, train = list(train), test = list(test))) } rm(i, train, test, lawmaker_rows) search_grid <- expand.grid( kernel = c("linear", "radial", "sigmoid", "polynomial"), degree = 2:7, gamma = 10^seq(from = -5, to = .5, length.out = 50), cost = 10^seq(from = -5, to = 3, length.out = 50), nu = seq(from = 0.001, to = 0.05, length.out = 50), cross = 1:6, type = c("C-classification", "nu-classification") ) %>% as_tibble() %>% # distinct() %>% # If these goes above the distinct line, linear doesn't get sampled due to low appearance # In practice there are only a small number of duplicates; when these values get joined # below, it is unlikely there are any identical rows mutate(degree = ifelse(kernel == "polynomial", degree, 0)) %>% mutate(cost = ifelse(type == "C-classification", cost, 0)) %>% mutate(nu = ifelse(type == "nu-classification", nu, 0)) %>% mutate(gamma = ifelse(kernel != "linear", gamma, 1)) %>% sample_n(grid_size) class.weights <- expand.grid( corrupt = 10^seq(from = 0, to = 3, length.out = 20), `not corrupt` = 10^seq(from = -3, to = 0, length.out = 20) ) %>% sample_n(grid_size, replace = TRUE) # Convert data frame to list of named list class.weights <- enframe(unlist(apply(class.weights, 1, list), recursive = FALSE), name = NULL) names(class.weights) <- c("class.weights") search_grid <- cbind(search_grid, class.weights) rm(class.weights) # We're repeating each set of selected models to fill the grid # This is so we can use each model with several different train/test splits temp_models <- tibble() for (i in 1:(ceiling(grid_size / NROW(lawmaker_models)))) { temp_models <- rbind(temp_models, lawmaker_models) } temp_models <- head(temp_models, grid_size) search_grid$topics <- temp_models$topics search_grid$model <- temp_models$lawmaker_model split_list <- tibble() for (j in 1:splits) { temp <- tibble( split_group = rep(j, grid_size / splits), train = rep(split$train[j], grid_size / splits), test = rep(split$test[j], grid_size / splits) ) split_list <- rbind(split_list, temp) } rm(j, temp, split) search_grid <- cbind(search_grid, split_list) %>% as_tibble() rm(split_list, grid_size, splits) search_grid_full <- search_grid
# grid search of svm tuning variables set.seed(77) iter <- 8 # Split is going to be dependent on individual runs, as they are randomly sampled # This uses the previous split that worked well # This needs to be changed each time the full search grid is generated split_list <- search_grid_full %>% filter(split_group %in% c(4,6,13,15,18,19,22,24)) %>% filter(split_group %in% c(18)) %>% select(split_group, train, test) %>% distinct(split_group, .keep_all = TRUE) splits <- NROW(split_list) grid_size <- ceiling(1040 / splits) * splits # Should be evenly divisible by splits if (grid_size %% splits != 0) stop("Grid size must be evenly divisible by number of splits.") search_grid <- expand.grid( kernel = c("linear"), gamma = 10^seq(from = -0.825, to = .5, length.out = 50), cost = 10^seq(from = 0, to = 0.5, length.out = 50), nu = seq(from = 0.0275, to = 0.0305, length.out = 50), cross = 2:6, type = c("nu-classification") ) %>% as_tibble() %>% # distinct() %>% # If these goes above the distinct line, linear doesn't get sampled due to low appearance # In practice there are only a small number of duplicates; when these values get joined # below, it is unlikely there are any identical rows mutate(degree = ifelse(kernel == "polynomial", degree, 0)) %>% mutate(cost = ifelse(type == "C-classification", cost, 0)) %>% mutate(nu = ifelse(type == "nu-classification", nu, 0)) %>% mutate(gamma = ifelse(kernel != "linear", gamma, 1)) %>% # filter(! (type == "nu-classification" & kernel == "sigmoid")) %>% # mutate(weight = ifelse(type == "nu-classification", 1, 0.5)) %>% # Only doing half the nu vs. C since removed a kernel, so need weighting for sample # sample_n(grid_size, weight = weight) sample_n(grid_size) class.weights <- expand.grid( # corrupt = 10^seq(from = 0, to = 3, length.out = 20), corrupt = 10^seq(from = 1.5, to = 3, length.out = 20), # `not corrupt` = 10^seq(from = -3, to = 0, length.out = 20) `not corrupt` = 10^seq(from = -3, to = -1.5, length.out = 20) ) %>% sample_n(grid_size, replace = TRUE) # Convert data frame to list of named list class.weights <- enframe(unlist(apply(class.weights, 1, list), recursive = FALSE), name = NULL) names(class.weights) <- c("class.weights") search_grid <- cbind(search_grid, class.weights) rm(class.weights) # We're repeating each set of selected models to fill the grid # This is so we can use each model with several different train/test splits temp_models <- tibble() filtered_models <- lawmaker_models %>% filter(topics >= 145 & topics <= 185) for (i in 1:(ceiling(grid_size / NROW(filtered_models)))) { temp_models <- rbind(temp_models, filtered_models) } temp_models <- head(temp_models, grid_size) search_grid$topics <- temp_models$topics search_grid$model <- temp_models$lawmaker_model search_grid <- cbind(search_grid, split_list) %>% as_tibble() rm(split_list, grid_size, splits)
library(qwraps2) options(qwraps2_markup = "markdown") grid_summary <- search_grid %>% select(-model, -train, -test) %>% mutate(class.weights = map(class.weights, function(x) {data.frame(as.list(x))})) %>% unnest(class.weights) %>% rename(weight_corrupt = corrupt) %>% rename(weight_not_corrupt = not.corrupt) grid_summary %>% select(.data$kernel, .data$degree, .data$gamma, .data$cost, .data$nu, .data$cross, .data$type, .data$weight_corrupt, .data$weight_not_corrupt, .data$split_group) %>% summary_table(.)
# Remove model to save storage space; model can be reloaded and joined model_grid <- search_grid %>% nest(svm_params = c(kernel, degree, cost, type, nu, cross, gamma)) %>% mutate(tmp.class.weights = paste(unlist(class.weights), collapse = ", ")) %>% distinct(topics, tmp.class.weights, split_group, .keep_all = TRUE) %>% select(topics, split_group, train, test, class.weights) %>% as_tibble() save( model_grid, file = paste("../data-raw/results-scores/model_grid_short ", gsub(":", "", Sys.time()), ".RData", sep = ""), envir = .GlobalEnv, compress = "bzip2" )
compute_svm <- function(model, kernel, gamma, cost, cross, degree, type, nu, train, class.weights, ...) { x <- model[-1] # Remove legislator name try({ svmfit <- e1071::svm( corruption ~ ., data = x, kernel = kernel, gamma = gamma, cost = cost, cross = cross, degree = degree, type = type, nu = nu, subset = train, # We're subsetting the data so we can use the test set class.weights = class.weights, # na.action = na.pass # We set some values to NA tht won't be used scale = FALSE ) }) } corruption_SVM <- search_grid %>% mutate( svm_model = future_pmap(., .f = compute_svm, .progress = TRUE) ) %>% as_tibble()
k <- 4 test_set <- corruption_SVM$test[[k]] train_set <- corruption_SVM$train[[k]] temp_lawmaker_model <- corruption_SVM$model[[k]] x <- temp_lawmaker_model[test_set,-1] y <- x$corruption temp_svm_model <- corruption_SVM$svm_model[[k]] pred <- predict(temp_svm_model, x, decision.values = TRUE) # Table for the train dataset # table(predict(temp_svm_model, type = "class"), temp_lawmaker_model[train_set,c("corruption")], dnn = c("Prediction", "Actual")) confusionMatrix(pred, y) # Peek at highest-probability predictions # cbind(pred, temp_lawmaker_model[test_set,c(1:2)]) %>% arrange(corruption)
# k <- 7 # test <- corruption_SVM$test[[k]] # model <- corruption_SVM$model[[k]] # svm_model <- corruption_SVM$svm_model[[k]] predict_svm <- function(test, model, svm_model, ...) { try({ x <- model[test, -1] y <- x$corruption pred <- predict(svm_model, x) confusionMatrix(pred, y) }) } corruption_SVM <- corruption_SVM %>% mutate( svm_predict = future_pmap(., predict_svm, .progress = TRUE) )
# k <- 7 # confM <- corruption_SVM$svm_predict[[k]] select_metrics <- function(svm_predict, ...) { if (! class(svm_predict) == "try-error") { try({ enframe(c(svm_predict$byClass, svm_predict$overall)) %>% filter(name %in% c("Precision", "Recall", "F1")) %>% pivot_wider() }) } else { data.frame(list("Precision" = -1, "Recall" = -1, "F1" = -1)) } } corruption_SVM <- corruption_SVM %>% mutate( svm_results = future_map(svm_predict, select_metrics, .progress = TRUE) ) %>% unnest(svm_results)
# Saving just the result with metrics so we can compare across a range of hyperparameters SVM_results <- corruption_SVM %>% select(-svm_model, -svm_predict, -model) %>% as_tibble() %>% mutate(weight.ratio = map(class.weights, function(x) { round(x[[1]] / x[[2]], 3) })) %>% mutate(F1 = as.numeric(ifelse(is.na(F1), 0, F1))) SVM_results$weight.ratio <- unlist(SVM_results$weight.ratio) save( SVM_results, file = paste("../data-raw/results-scores/SVM_results_short_refined_", iter, " ", gsub(":", "", Sys.time()), ".RData", sep = ""), envir = .GlobalEnv, compress = "bzip2" )
# Model can be rejoined on # of topics, so dropping to save space corruption_SVM <- corruption_SVM %>% select(-model) %>% filter(!is.na(F1)) save( corruption_SVM, file = paste("../data-raw/results-scores/corruption_SVM_short_", iter, " ", gsub(":", "", Sys.time()), ".RData", sep = ""), envir = .GlobalEnv, compress = "bzip2" )
height <- 3 width <- 6 max.x <- 0.8 SVM_results_filtered <- SVM_results %>% filter(!is.na(F1) & F1 >=0) # filter(F1 > .1) SVM_results_filtered %>% ggplot(aes(F1)) + theme_ridges() + geom_histogram() SVM_results_filtered %>% ggplot(aes(topics, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Number of Topics", subtitle = paste0("Iteration ", iter) ) + theme( legend.position = "right" ) ggsave(paste0("../data-raw/results-scores/images/SVM_short_refined_topics_", iter, ".png"), height = height, width = width) SVM_results_filtered %>% ggplot(aes(kernel, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + theme( legend.position = "right" ) SVM_results_filtered %>% ggplot(aes(gamma, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + scale_x_log10() + theme_ridges() + scale_color_brewer(palette = "PuOr") + theme( legend.position = "right" ) SVM_results_filtered %>% ggplot(aes(cost, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + scale_x_log10() + theme_ridges() + scale_color_brewer(palette = "PuOr") + theme( legend.position = "right" ) SVM_results_filtered %>% ggplot(aes(nu, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Nu", subtitle = paste0("Iteration ", iter) ) + theme( legend.position = "right" ) ggsave(paste0("../data-raw/results-scores/images/SVM_short_refined_nu_", iter, ".png"), height = height, width = width) SVM_results_filtered %>% ggplot(aes(cross, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Number of Crosses", subtitle = paste0("Iteration ", iter) ) + theme( legend.position = "right" ) ggsave(paste0("../data-raw/results-scores/images/SVM_short_refined_cross_", iter, ".png"), height = height, width = width) SVM_results_filtered %>% ggplot(aes(type, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + theme( legend.position = "right" ) SVM_results_filtered %>% ggplot(aes(degree, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Degree", subtitle = paste0("Iteration ", iter) ) + theme( legend.position = "right", axis.text.x = element_text(angle = 45) ) SVM_results_filtered %>% ggplot(aes(split_group, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Which train/test split?", subtitle = paste0("Iteration ", iter) ) + theme( legend.position = "right", axis.text.x = element_text(angle = 45) ) ggsave(paste0("../data-raw/results-scores/images/SVM_short_refined_split_", iter, ".png"), height = height, width = width) SVM_results_filtered %>% ggplot(aes(weight.ratio, F1, color = kernel, shape = type)) + geom_jitter() + scale_y_continuous(limits = c(0,1)) + scale_x_log10() + theme_ridges() + scale_color_brewer(palette = "PuOr") + labs( title = "Imbalanced Class Weight Ratio", subtitle = paste0("Iteration ", iter), x = "weight ratio (log)", y = "F1 score") + theme( legend.position = "right", axis.text.x = element_text(angle = 45) ) ggsave(paste0("../data-raw/results-scores/images/SVM_short_refined_weight_ratio_", iter, ".png"), height = height, width = width)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.