# devtools::install_github("systats/tidyMBO", force = T) # devtools::install_github("rstudio/keras") # devtools::install_github("rstudio/reticulate") #keras::install_keras() pacman::p_load(tidyverse, dplyr, purrr, rsample, data.table, magrittr, tidyTX, keras, mlrMBO, tidyMBO, ggthemes, Smisc, randomForest, parallelMap, emoa, magrittr, Metrics) #keras::install_keras() set.seed(2018) ggplot2::theme_set(ggthemes::theme_few())
dt <- get(load("Readme_files/df_clean.Rdata")) %>% #mutate(index = 1:n()) %>% dplyr::select(party_id, text_lemma, text_word) %>% mutate(index = 1:n()) %>% mutate(id = 1:n()) %>% tidyMBO::split_data(p = .7)
explore <- corpus_description(data = dt$train, text = "text_lemma") explore$token$tokens %>% ggplot(aes(n)) + geom_histogram() + xlim(0, 30)
library(h2o) h2o.init(nthreads = 2) h2o.no_progress() # container <- list( # data = final, # params = # c( # list( # arch = "gbm", # target = "altright", # text = "text_lemma" # ), # list( # ntrees = 30, # max_depth = 4, # learn_rate = .3, # sample_rate = .8, # stop_tol = .01, # stop_round = 1, # nbins = 10 # ) # ) # ) %>% # text_to_matrix() # # ncol(container$data$train_input) # sum(container$data$train_input) params_gbm <- makeParamSet( makeDiscreteParam("ngram", values = c("unigram", "bigram")), #makeDiscreteParam("text", values = c("text_lemma", "text_word")), #makeIntegerParam("term_min", lower = 2, upper = 5), makeIntegerParam("max_vocab", lower = 2000, upper = 4000), makeIntegerParam("ntrees", lower = 20, upper = 150), makeIntegerParam("max_depth", lower = 2, upper = 10), makeNumericParam("learn_rate", lower = .1, upper = .9), makeNumericParam("sample_rate", lower = .1, upper = .9), makeNumericParam("stop_tol", lower = .001, upper = .1), makeIntegerParam("stop_round", lower = 1, upper = 3), makeIntegerParam("nbins", lower = 10, upper = 20) ) # load("shiny_mbo/results/gbm_2.Rdata") results_gbm <- run_mbo( data = dt, params = params_gbm, #prior = results_gbm$params, const = list( arch = "gbm", target = "party_id", text = "text_lemma", balance = T ), n_init = 6, n_main = 30, metric = "accuracy" ) save(results_gbm, file = "gbm_1.Rdata") h2o::h2o.shutdown(prompt = F)
library(h2o) h2o.init( nthreads=-1, ## -1: use all available threads max_mem_size = "2G") # h2o.removeAll() results_mixed <- run_mbo( data = dt, params = params, const = list( target = "party_id", balance = T ), n_init = 5, n_main = 2, metric = c("accuracy") # experimental stage )
listLearners("regr") #listLearnerProperties("regr")
| Metric Type | Metric Name | Function Name | Formula |
| ---- | ------------------------ | ---- | ------------------------------- |
| classification | Classification Error | ce | $\frac{1}{n} \sum_{i=1}^n I(x_i \neq y_i)$ |
| classification | Accuracy | accuracy | $\frac{1}{n} \sum_{i=1}^n I(x_i = y_i)$ |
| classification | F1 Score | f1 | $\frac{2 * \text{precision} * \text{recall}}{\text{precision} + \text{recall}}$ |
| binary classification | Area Under ROC Curve | auc | $\int_0^1 [1 - G_1(G^{-1}0(1 - v))] dv$. help(auc)
for details. |
| binary classification | Log Loss | ll | $x_i * \ln(y_i) + (1 - x_i) * \ln(1 - y_i)$ |
| binary classification | Mean Log Loss | logloss | $\frac{1}{n} \sum{i=1}^n x_i * \ln(y_i) + (1 - x_i) * \ln(1 - y_i)$
library(h2o) h2o.init( nthreads=-1, ## -1: use all available threads max_mem_size = "2G") # h2o.removeAll() results <- run_mbo( data = dt, params = params, target = "party_id", text = "text_word", name = "stack_model1", n_init = 5, n_main = 2, metric = c("accuracy"), # experimental stage parallel = F # Only Unix/Mac no Windows support )
perform <- results$df perform mode1 <- list(maxlen = 30)%>% run_mbo_steps(data = dt, target = "party_id", text = "text_lemma", reconstruct = T) caret::confusionMatrix(mode1$perform, dt$test$party_id)
#save(perform, file = "perform.Rdata") load("perform.Rdata") perform %>% as.tibble()
Distribution of accuracy history
perform %>% ggplot(aes(y, fill = is.na(exec.time))) + geom_histogram()
Distribution of accuracy history
#devtools::install_github("tidyverse/ggplot2") perform %>% ggplot(aes(max_features, maxlen, colour = y, size = y)) + geom_point(alpha = .5) + scale_size_continuous(range(1, 10)) + viridis::scale_colour_viridis() perform %>% ggplot(aes(max_features, maxlen, colour = y)) + geom_point() + geom_density_2d() perform %>% ggplot(aes(max_features, maxlen)) + stat_density_2d(geom = "polygon", aes(fill = ..level.., alpha=..level..)) + viridis::scale_fill_viridis() perform %>% ggplot(aes(max_features, maxlen)) + stat_density_2d(geom = "raster", aes(fill = ..density..), contour = F) + viridis::scale_fill_viridis() perform %>% ggplot(aes(max_features, maxlen)) + stat_density_2d(geom = "raster", aes(fill = ..density.., alpha=..density..), contour = F) + viridis::scale_fill_viridis() perform %>% ggplot(aes(max_features, maxlen)) + stat_density_2d(geom = "point", aes(size = ..density..), n = 20, contour = F, alpha = .7)
perform %>% dplyr::select( max_features, maxlen, batch_size,output_dim, output_fun, y ) %>% tidyr::gather("param", "value", -y, -output_fun) %>% ggplot(aes(value, y, colour = y)) + #geom_tile() + #geom_raster() + geom_density_2d(alpha = .8, color = "grey")+ geom_point() + viridis::scale_colour_viridis("Accuracy") + facet_grid(output_fun~param, scales = "free") + theme(legend.position = "bottom")
perform %>% dplyr::select(max_features:output_fun) %>% dplyr::select_if(is.numeric) %>% mutate(id = 1:n()) %>% gather("var", "value", -id) %>% ggplot(aes(value)) + geom_histogram() + ggplot2::facet_wrap(. ~ var)
Experiment Improvement
perform %>% arrange(step) %>% mutate(best = Smisc::cumMax(y)) %>% ggplot(aes(step, best)) + geom_step() + labs(x = "Trials", y = "Best Value") perform %>% arrange(step) %>% group_by(arch) %>% mutate(best = Smisc::cumMax(y)) %>% ungroup() %>% ggplot(aes(step, best, color = output_fun)) + geom_step() + geom_step(aes(step, y, color = output_fun), alpha = .3) + labs(x = "Trials", y = "Best Value") + facet_grid(.~arch) + theme(legend.position = "bottom")
perform %>% ggplot(aes(max_features, maxlen, colour = y, label = step)) + geom_point() + geom_path(alpha = .5) + geom_text() + viridis::scale_colour_viridis()
#devtools::install_github("ggobi/ggally") library(GGally) my_bin <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") { ggplot(data = data, mapping = mapping) + geom_hex(...) + scale_fill_gradient(low = low, high = high) } perform %>% dplyr::select(output_dim, maxlen) %>% ggpairs() #mapping = aes(color = y), # lower = list( # combo = wrap("facethist", binwidth = 1), # continuous = wrap(my_bin, binwidth = c(5, 0.5), high = "red") # )
lm(y ~ ., data = perform) %>% broom::tidy()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.