In systats/tidyMBO: tidyMBO for optimizing ML language models

Packages

# devtools::install_github("systats/tidyMBO", force = T)
# devtools::install_github("rstudio/keras")
# devtools::install_github("rstudio/reticulate")
#keras::install_keras()
pacman::p_load(tidyverse, dplyr, purrr, rsample, data.table, magrittr, tidyTX, keras, mlrMBO, tidyMBO, ggthemes, Smisc, randomForest, parallelMap, emoa, magrittr, Metrics)

#keras::install_keras()
set.seed(2018)
ggplot2::theme_set(ggthemes::theme_few())

Data

dt <- get(load("Readme_files/df_clean.Rdata")) %>% 
  #mutate(index = 1:n()) %>%
  dplyr::select(party_id, text_lemma, text_word) %>%
  mutate(index = 1:n()) %>%
  mutate(id = 1:n()) %>%
  tidyMBO::split_data(p = .7)

explore <- corpus_description(data = dt$train, text = "text_lemma")

explore$token$tokens %>%
  ggplot(aes(n)) +
  geom_histogram() +
  xlim(0, 30)

H2O GBM

library(h2o)
h2o.init(nthreads = 2)
h2o.no_progress()

# container <- list(
#   data = final, 
#   params = 
#     c(
#       list(
#         arch = "gbm", 
#         target = "altright", 
#         text = "text_lemma"
#       ),
#       list(
#         ntrees = 30,
#         max_depth = 4, 
#         learn_rate = .3,
#         sample_rate = .8,
#         stop_tol = .01,
#         stop_round = 1,
#         nbins = 10
#       )
#     )
#   ) %>%
#   text_to_matrix()
# 
# ncol(container$data$train_input)
# sum(container$data$train_input)

params_gbm <- makeParamSet(
    makeDiscreteParam("ngram", values = c("unigram", "bigram")),
    #makeDiscreteParam("text", values = c("text_lemma", "text_word")),
    #makeIntegerParam("term_min", lower = 2, upper = 5),
    makeIntegerParam("max_vocab", lower = 2000, upper = 4000),
    makeIntegerParam("ntrees", lower = 20, upper = 150),
    makeIntegerParam("max_depth", lower = 2, upper = 10),
    makeNumericParam("learn_rate", lower = .1, upper = .9), 
    makeNumericParam("sample_rate", lower = .1, upper = .9), 
    makeNumericParam("stop_tol", lower = .001, upper = .1),
    makeIntegerParam("stop_round", lower = 1, upper = 3), 
    makeIntegerParam("nbins", lower = 10, upper = 20)
  )
# load("shiny_mbo/results/gbm_2.Rdata")

results_gbm <- run_mbo(
    data = dt, 
    params = params_gbm, 
    #prior = results_gbm$params,
    const = list(
      arch = "gbm", 
      target = "party_id", 
      text = "text_lemma",
      balance = T
    ), 
    n_init = 6, 
    n_main = 30, 
    metric = "accuracy"
  )

save(results_gbm, file = "gbm_1.Rdata")
h2o::h2o.shutdown(prompt = F)

library(h2o)

h2o.init(
  nthreads=-1,  ## -1: use all available threads
  max_mem_size = "2G")
# h2o.removeAll()

results_mixed <- run_mbo(
    data = dt, 
    params = params, 
    const = list(
      target = "party_id",
      balance = T
    ),
    n_init = 5, 
    n_main = 2,
    metric = c("accuracy") # experimental stage
  )

Run Main

listLearners("regr")
#listLearnerProperties("regr")

Hold-Out Set
- Training set for prediction
- Validation set performance training phase
- Test set for likely future perfromance
Cross-validation (for limited data size)
- k-fold cross validation, divides data into ko subsets of qual size.
- Build model on k-1 data folds leaving one out -> leave one out (loo) if k equals N.

| Metric Type | Metric Name | Function Name | Formula | | ---- | ------------------------ | ---- | ------------------------------- | | classification | Classification Error | ce | $\frac{1}{n} \sum_{i=1}^n I(x_i \neq y_i)$ | | classification | Accuracy | accuracy | $\frac{1}{n} \sum_{i=1}^n I(x_i = y_i)$ | | classification | F1 Score | f1 | $\frac{2 * \text{precision} * \text{recall}}{\text{precision} + \text{recall}}$ | | binary classification | Area Under ROC Curve | auc | $\int_0^1 [1 - G_1(G^{-1}0(1 - v))] dv$. help(auc) for details. | | binary classification | Log Loss | ll | $x_i * \ln(y_i) + (1 - x_i) * \ln(1 - y_i)$ | | binary classification | Mean Log Loss | logloss | $\frac{1}{n} \sum{i=1}^n x_i * \ln(y_i) + (1 - x_i) * \ln(1 - y_i)$

library(h2o)

h2o.init(
  nthreads=-1,  ## -1: use all available threads
  max_mem_size = "2G")
# h2o.removeAll()


results <- run_mbo(
    data = dt, 
    params = params, 
    target = "party_id", 
    text = "text_word",
    name = "stack_model1", 
    n_init = 5, 
    n_main = 2,
    metric = c("accuracy"), # experimental stage
    parallel = F # Only Unix/Mac no Windows support
  )

perform <- results$df
perform

mode1 <- list(maxlen = 30)%>% 
  run_mbo_steps(data = dt, target = "party_id", text = "text_lemma", reconstruct = T)

caret::confusionMatrix(mode1$perform, dt$test$party_id)

#save(perform, file = "perform.Rdata")
load("perform.Rdata")
perform %>% as.tibble()

Distribution of accuracy history

perform %>%
  ggplot(aes(y, fill = is.na(exec.time))) + 
  geom_histogram()

Distribution of accuracy history

#devtools::install_github("tidyverse/ggplot2")
perform %>%
  ggplot(aes(max_features, maxlen, colour = y, size = y)) + 
  geom_point(alpha = .5) +
  scale_size_continuous(range(1, 10)) +
  viridis::scale_colour_viridis()

perform %>%
  ggplot(aes(max_features, maxlen, colour = y)) + 
  geom_point() +
  geom_density_2d()

perform %>%
  ggplot(aes(max_features, maxlen)) + 
    stat_density_2d(geom = "polygon", aes(fill = ..level.., alpha=..level..)) +
    viridis::scale_fill_viridis()

perform %>%
  ggplot(aes(max_features, maxlen)) + 
  stat_density_2d(geom = "raster", aes(fill = ..density..), contour = F) +
    viridis::scale_fill_viridis()

perform %>%
  ggplot(aes(max_features, maxlen)) + 
  stat_density_2d(geom = "raster", aes(fill = ..density.., alpha=..density..), contour = F) +
    viridis::scale_fill_viridis()

perform %>%
  ggplot(aes(max_features, maxlen)) + 
  stat_density_2d(geom = "point", aes(size = ..density..), n = 20, contour = F, alpha = .7)

perform %>%
  dplyr::select(
    max_features, maxlen, 
    batch_size,output_dim, 
    output_fun, y
  ) %>%
  tidyr::gather("param", "value", -y, -output_fun) %>%
  ggplot(aes(value, y, colour = y)) + 
  #geom_tile() +
  #geom_raster() + 
  geom_density_2d(alpha = .8, color = "grey")+
  geom_point() +
  viridis::scale_colour_viridis("Accuracy") +
  facet_grid(output_fun~param, scales = "free") +
  theme(legend.position = "bottom")

perform %>%
  dplyr::select(max_features:output_fun) %>%
  dplyr::select_if(is.numeric) %>%
  mutate(id = 1:n()) %>%
  gather("var", "value", -id) %>%
  ggplot(aes(value)) +
  geom_histogram() +
  ggplot2::facet_wrap(. ~ var)

Experiment Improvement

perform %>%
  arrange(step) %>%
  mutate(best = Smisc::cumMax(y)) %>%
  ggplot(aes(step, best)) +
  geom_step() +
  labs(x = "Trials", y = "Best Value")

perform %>%
  arrange(step) %>%
  group_by(arch) %>%
  mutate(best = Smisc::cumMax(y)) %>%
  ungroup() %>%
  ggplot(aes(step, best, color = output_fun)) +
  geom_step() +
  geom_step(aes(step, y, color = output_fun), alpha = .3) +
  labs(x = "Trials", y = "Best Value")  +
  facet_grid(.~arch) +
  theme(legend.position = "bottom")

perform %>%
  ggplot(aes(max_features, maxlen, colour = y, label = step)) + 
  geom_point() +
  geom_path(alpha = .5) +
  geom_text() +
  viridis::scale_colour_viridis()

#devtools::install_github("ggobi/ggally")
library(GGally)
my_bin <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") {
  ggplot(data = data, mapping = mapping) +
    geom_hex(...) +
    scale_fill_gradient(low = low, high = high)
}

perform %>%
  dplyr::select(output_dim, maxlen) %>%
  ggpairs()
    #mapping = aes(color = y),
    # lower = list(
    #   combo = wrap("facethist", binwidth = 1),
    #   continuous = wrap(my_bin, binwidth = c(5, 0.5), high = "red")
    # )

Understand Parameters

lm(y ~ ., data = perform) %>%
  broom::tidy()

systats/tidyMBO documentation built on May 24, 2019, 5:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

systats/tidyMBO
tidyMBO for optimizing ML language models

In systats/tidyMBO: tidyMBO for optimizing ML language models

Packages

Data

H2O GBM

Run Main

Understand Parameters

R Package Documentation

Browse R Packages

We want your feedback!

systats/tidyMBO tidyMBO for optimizing ML language models

In systats/tidyMBO: tidyMBO for optimizing ML language models

Packages

Data

H2O GBM

Run Main

Understand Parameters

R Package Documentation

Browse R Packages

We want your feedback!

systats/tidyMBO
tidyMBO for optimizing ML language models