knitr::opts_chunk$set(echo = F, eval = T, fig.align = "center", fig.height = 5, fig.width = 7, warning = F, message = F, error = F) pacman::p_load(devtools, tidyverse, keras, GA, dials, RSQLite, magrittr, crayon) devtools::load_all() ggplot2::theme_set(theme_classic()) # Query runs from database con <- dbConnect(RSQLite::SQLite(), "data/model_dump.db") runs <- con %>% tbl("runs") %>% as_tibble() %>% reshape_runs()
A simple collection of well working NLP models (Keras) in R, tuned and benchmarked on a variety of datasets. This is a work in progress and the first version only supports classification tasks (at the moment).
gg_compare <- runs %>% filter(data %in% c("celeb_dat", "toxic_dat")) %>% group_by(model_name) %>% mutate(direct = ifelse(best, test_acc, NA_real_) %>% as.character) %>% fill(direct, .direction = "down") %>% mutate(direct = as.numeric(direct)) %>% ungroup %>% mutate(model_name = forcats::fct_reorder(model_name, direct, .desc = T)) %>% mutate(label = ifelse(best, test_acc, NA)) %>% ggplot(aes(model_name, test_acc)) + geom_violin(fill = "gray50", alpha = .2, color = NA) + geom_jitter(width = .03, alpha = .5) + ggrepel::geom_label_repel(aes(label = label), segment.color = "red", color = "red", nudge_y = 1) + ggtitle("Test performance compared accross classifiers", subtitle = "Note: The test sets are roughly balanced") + labs(x = "", y = "Test Accuracy") + facet_grid(data~backend, scales = "free_y") + #ggthemes::theme_hc() + #scale_fill_viridis_d() + theme(legend.position = "right") gg_compare #plotly::ggplotly(gg_compare)
Training neural networks can be bothering and time consuming due to the sheer amount of hyper-parameters. Hyperparameters are values that are defined prior and provided as additional model input. Tuning those requires either deeper knowledge about the model behavior itself or computational resources for random searches or optimization on the parameter space. textlearnR
provides a light weight framework to train and compare ML models from Keras, H2O, starspace and text2vec (coming soon). Furthermore, it allows to define parameters for text processing (e.g. maximal number of words and text length), which are also considered to be priors.
Beside language models, textlearnR also integrates third party packages for automatically tuning hyperparameters. The following strategies will be avaiable:
GA
Genetic algorithms for stochastic optimization (only real-values).mlrMBO
Bayesian and model-based optimization.For constructing new parameter objects the tidy way, the package dials
is used. Each model optimized is saved to a SQLite database in data/model_dump.db
. Of course, committed to tidy principals. Contributions are highly welcomed!
keras_model <- list( simple_mlp = textlearnR::keras_simple_mlp, deep_mlp = textlearnR::keras_deep_mlp, simple_lstm = textlearnR::keras_simple_lstm, #deep_lstm = textlearnR::keras_deep_lstm, pooled_gru = textlearnR::keras_pooled_gru, cnn_lstm = textlearnR::keras_cnn_lstm, cnn_gru = textlearnR::keras_cnn_gru, gru_cnn = textlearnR::keras_gru_cnn, multi_cnn = textlearnR::keras_multi_cnn )
textlearnR::keras_simple_mlp( input_dim = 10000, embed_dim = 128, seq_len = 50, output_dim = 1 ) %>% summary
range01 <- function(x){(x-min(x))/(max(x)-min(x))} gg_flow <- runs %>% filter(data == "celeb_dat", model_name == "simple_mlp") %>% mutate(step = paste0(id, "_", step)) %>% select(id, step, best, input_dim, seq_len, embed_dim, dense_dim, dropout, test_acc) %>% gather(param, value, -step, -best, -id) %>% mutate(value = as.numeric(value)) %>% drop_na %>% group_by(param) %>% mutate(value = ifelse(as.numeric(value) < 1, value, range01(value))) %>% ungroup %>% mutate(param = factor(param, level = c("input_dim", "seq_len", "embed_dim", "dropout", "dense_dim", "test_acc"))) %>% mutate(label = ifelse(param == "test_acc" & best, value, NA)) %>% ggplot(aes(param, value)) + geom_line(aes(group = step, colour = best, size = best), show.legend = F) + ggrepel::geom_label_repel(aes(label = label), nudge_x = 1, segment.color = "red", color = "red") + scale_colour_manual(values = c("black", "red")) + scale_size_manual(values = c(.1, 1)) + #facet_wrap(~id, ncol = 1) + labs(x = "", y = "") + ggtitle("Parameters of a simple neural network", subtitle = "Note: integer variables are standardized [0, 1]") + theme_classic() + theme( #axis.text.x = element_text(angle = 30, hjust = 1), legend.position = "bottom" ) gg_reg <- runs %>% filter(data == "celeb_dat", model_name == "simple_mlp") %>% select(input_dim, seq_len, embed_dim, dense_dim, dropout, test_acc) %>% mutate_all(as.numeric) %>% mutate_all(scale) %>% lm(test_acc ~ input_dim + seq_len + embed_dim + dense_dim + dropout, data = .) %>% #sjPlot::plot_model() + broom::tidy() %>% mutate(term = factor(term, level = c("input_dim", "seq_len", "embed_dim", "dropout", "dense_dim", "(Intercept)"))) %>% #filter(term != "(Intercept)") %>% mutate(lower = estimate - 1.96*std.error, upper = estimate + 1.96*std.error, color = ifelse(estimate >= 0, T, F)) %>% ggplot(aes(term, estimate)) + geom_pointrange(aes(ymin = lower, ymax = upper, colour = color)) + geom_hline(yintercept = 0, linetype = "dashed") + ggtitle("Linear Regression with Acc ~ parameters") + theme_classic() + theme(legend.position = "none") + scale_colour_manual(values = c("red", "blue")) gridExtra::grid.arrange(gg_flow, gg_reg, ncol = 1, layout_matrix = matrix( c(1, 1, 2), 3))
gg_flow2 <- runs %>% filter(data == "celeb_dat", model_name == "multi_cnn") %>% mutate(step = paste0(id, "_", step)) %>% #glimpse dplyr::select(id, step, best, test_acc, input_dim, seq_len, embed_dim, num_filters, dropout) %>% gather(param, value, -step, -best, -id) %>% mutate(value = as.numeric(value)) %>% drop_na %>% group_by(param) %>% mutate(value = ifelse(as.numeric(value) < 1, value, range01(value))) %>% ungroup %>% mutate(param = factor(param, level = c("input_dim", "seq_len", "embed_dim", "num_filters", "dropout", "test_acc"))) %>% mutate(label = ifelse(param == "test_acc" & best, value, NA)) %>% ggplot(aes(param, value)) + geom_line(aes(group = step, colour = best, size = best)) + ggrepel::geom_label_repel(aes(label = label), nudge_x = 1, segment.color = "red", color = "red") + scale_colour_manual(values = c("black", "red")) + scale_size_manual(values = c(.1, 1)) + #facet_wrap(~id, ncol = 1) + labs(x = "", y = "") + ggtitle("Parameters of a multi-channel convolutional neural network", subtitle = "Note: integer variables are standardized [0, 1]") + theme(legend.position = "none") gg_reg2 <- runs %>% filter(data == "celeb_dat", model_name == "multi_cnn") %>% select(input_dim, seq_len, embed_dim, num_filters, dropout, test_acc) %>% mutate_all(as.numeric) %>% mutate_all(scale) %>% lm(test_acc ~ input_dim + seq_len + embed_dim + num_filters + dropout, data = .) %>% #sjPlot::plot_model() + broom::tidy() %>% mutate(term = factor(term, level = c("input_dim", "seq_len", "embed_dim", "num_filters", "dropout", "test_acc", "(Intercept)"))) %>% #filter(term != "(Intercept)") %>% mutate(lower = estimate - 1.96*std.error, upper = estimate + 1.96*std.error, color = ifelse(estimate >= 0, T, F)) %>% ggplot(aes(term, estimate)) + geom_pointrange(aes(ymin = lower, ymax = upper, colour = color)) + geom_hline(yintercept = 0, linetype = "dashed") + ggtitle("Linear Regression with Acc ~ parameters") + theme(legend.position = "none") + scale_colour_manual(values = c("red", "blue")) gridExtra::grid.arrange(gg_flow2, gg_reg2, ncol = 1, layout_matrix = matrix( c(1, 1, 2), 3))
library(ruimtehol) data("dekamer", package = "ruimtehol") dekamer$x <- strsplit(dekamer$question, "\\W") dekamer$x <- sapply(dekamer$x, FUN = function(x) paste(setdiff(x, ""), collapse = " ")) dekamer$x <- tolower(dekamer$x) dekamer$y <- strsplit(dekamer$question_theme, split = ",") dekamer$y <- lapply(dekamer$y, FUN=function(x) gsub(" ", "-", x)) set.seed(123456789) model <- embed_tagspace(x = dekamer$x, y = dekamer$y, dim = 50, lr = 0.01, epoch = 40, loss = "softmax", adagrad = TRUE, similarity = "cosine", negSearchLimit = 50, ngrams = 2, minCount = 2) plot(model) text <- c("de nmbs heeft het treinaanbod uitgebreid via onteigening ...", "de migranten komen naar europa de asielcentra ...") predict(model, text, k = 3) predict(model, "koning filip", k = 10, type = "knn") predict(model, "koning filip", k = 10, type = "embedding")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.