# devtools::install_github("systats/tidyTX") pacman::p_load(dplyr, ggplot2, googlesheets, openxlsx, stringr, rvest, dplyr, ggplot2, keras, mlrMBO, tidyMBO, ggthemes, Smisc, randomForest, parallelMap, emoa, DiceKriging, magrittr, h2o, ParamHelpers, text2vec) #devtools::install_github("systats/tidyMBO") set.seed(2018) ggplot2::theme_set(ggthemes::theme_few())
# googlesheets::gs_auth(token = "shiny_app_token.rds") # with_label <- gs_title("altright_data_final") %>% # gs_read() # save(with_label, file = "data/with_label.Rdata") load("data/with_label.Rdata") clean_metric <- function(x){ x %>% str_replace_all("Not Present", "1") %>% str_replace_all("Strongly Present", "5") %>% str_replace_all("99", "0") %>% as.numeric() } df_metric <- with_label %>% dplyr::select(identity:left, anti_fem:anti_mus) %>% purrr::map_df(.f = ~clean_metric(.x)) %>% purrr::map_df(.f = ~ifelse(.x == 1, 0, 1)) df_order <- with_label %>% dplyr::select(user:nchar, coder, timestamp) clean_category <- function(x){ if(is.logical(x)) return(x) x %>% str_replace_all("99", "0") %>% as.numeric() } df_category <- with_label %>% dplyr::select(lang:irony) %>% purrr::map_df(.f = ~clean_category(.x)) df_all <- bind_cols( df_order, df_metric, df_category ) %>% filter(!duplicated(text)) %>% mutate( platform = case_when( platform == "fb" ~ "Facebook", platform == "tw" ~ "Twitter", platform == "yt" ~ "YouTube" ) ) %>% mutate(altright1 = case_when( identity > 0 ~ 1 , race > 0 ~ 1 , anti_sem > 0 ~ 1 , #moral > 0 ~ 1 , #imm == 3 ~ 1 , #vict == 2 ~ 1 , TRUE ~ 0 )) %>% mutate(altright2 = case_when( identity > 0 ~ 1 , race > 0 ~ 1 , anti_sem > 0 ~ 1 , #moral > 0 ~ 1 , imm == 3 ~ 1 , #vict == 2 ~ 1 , TRUE ~ 0 )) %>% mutate(altright3 = case_when( identity > 0 ~ 1 , race > 0 ~ 1 , anti_sem > 0 ~ 1 , #moral > 0 ~ 1 , imm == 3 ~ 1 , vict == 2 ~ 1 , TRUE ~ 0 )) %>% mutate(altright3 = case_when( identity > 0 ~ 1 , race > 0 ~ 1 , anti_sem > 0 ~ 1 , #moral > 0 ~ 1 , imm == 3 ~ 1 , vict == 2 ~ 1 , TRUE ~ 0 )) %>% mutate(altright4 = case_when( anti_mus > 0 ~ 1 , identity > 0 ~ 1 , race > 0 ~ 1 , anti_sem > 0 ~ 1 , #moral > 0 ~ 1 , imm == 3 ~ 1 , vict == 2 ~ 1 , TRUE ~ 0 )) %>% mutate(altlight1 = case_when( anti_mus > 0 ~ 1 , elite > 0 ~ 1 , anti_fem > 0 ~ 1 , left > 0 ~ 1, #imm == 2 ~ 1 , #vict == 3 ~ 1 , # anti_fem > 1 ~ 1 , TRUE ~ 0 )) %>% mutate(altlight2 = case_when( #anti_mus > 0 ~ 1 , elite > 0 ~ 1 , anti_fem > 0 ~ 1 , left > 0 ~ 1, #imm == 2 ~ 1 , #vict == 3 ~ 1 , # anti_fem > 1 ~ 1 , TRUE ~ 0 )) %>% mutate(altlight3 = case_when( anti_mus > 0 ~ 1 , elite > 0 ~ 1 , anti_fem > 0 ~ 1 , left > 0 ~ 1, imm == 2 ~ 1 , vict == 3 ~ 1, TRUE ~ 0 )) %>% mutate(altlight4 = case_when( #anti_mus > 0 ~ 1 , elite > 0 ~ 1 , anti_fem > 0 ~ 1 , left > 0 ~ 1, imm == 2 ~ 1 , vict == 3 ~ 1, TRUE ~ 0 )) %>% mutate( altright = altright3 == 1 | (altlight3 == 0 & altright3 == 0), altlight = altlight3 == 1 | (altlight3 == 0 & altright3 == 0), mus = altright4 == 1 | (altlight4 == 0 & altright4 == 0) ) %>% # mutate(alt_type2 = case_when( # altright == 1 ~ 2, # altlight == 1 ~ 1, # altright == 0 & altlight == 0 ~ 0 # )) %>% #mutate(alt_dummy = ifelse(alt_type2 != 0, 1, 0)) %>% filter(!irony) #arrange(desc(altright), desc(altlight)) dt_altlight <- bind_rows( df_all %>% filter(altlight) %>% filter(altlight3 == 0) %>% sample_n(size = 2000), df_all %>% filter(altlight) %>% filter(altlight3 ==1) ) # df_sub_0 <- df_all %>% # filter(altlight != 1) %>% # sample_n(size = 2000) # # df_sub <- bind_rows(df_rest, df_sub_0) dt_altright <- bind_rows( df_all %>% filter(altright) %>% filter(altright3 == 0) %>% sample_n(size = 2000), df_all %>% filter(altright) %>% filter(altright3 ==1) ) dt_mus <- bind_rows( df_all %>% filter(mus) %>% filter(altright4 == 0) %>% sample_n(size = 2000), df_all %>% filter(mus) %>% filter(altright4 ==1) ) table(dt_mus$altright4) table(dt_altright$altlight4) # df_sub_0 <- df_all %>% # filter(altlight != 1) %>% # sample_n(size = 2000) # # df_sub <- bind_rows(df_rest, df_sub_0) # # # # # table(df_sub$alt_type2) # df_all <- df_sub # # glimpse(df_all)
library(textfeatures) tf1 <- df_all %>% textfeatures::textfeatures() tf2 <- df_all %>% mutate(alt_type = as.factor(alt_type)) %>% group_by(alt_type) %>% textfeatures::textfeatures() scale_standard <- function(x) (x - 0) / (max(x, na.rm = TRUE) - 0) ## convert to long (tidy) form and plot tf2 %>% mutate_if(is.numeric, scale_standard) %>% tidyr::gather(var, val, -alt_type) %>% ggplot(aes(x = var, y = val, fill = alt_type)) + geom_col(width = .65) + facet_wrap( ~ alt_type, nrow = 1) + coord_flip() # theme(legend.position = "none", # axis.text = element_text(colour = "black"), # plot.title = element_text(face = "bold")) + # labs(y = NULL, x = NULL, # title = "{textfeatures}: Extract Features from Text", # subtitle = "Features extracted from text of the most recent 1,000 tweets posted by each news media account")
# df_all$text %>% # stringr::str_to_lower() %>% # tidyTX::tx_replace_punc() %>% # #tidyTX::tx_replace_twitter(replace_hash = T, replace_hndl = T) %>% # tidyTX::tx_replace_url() %>% # tidyTX::tx_replace_punc() %>% # tidyTX::tx_map_dict(twitter_dict, key1 = 0, key2 = 1)
look categories that are less predictive
corpus dict
twitter_dict <- list( "moslem." = " muslim", "sjw" = " social justice warrior ", "mu.*?lim." = " muslim ", "jew." = " jew ", ".hite" = " white ", ".lack" = " black ", "\\(\\(\\(" = " jew ", "\\)\\)\\)" = " ", "gorillion" = " jew gorillion ", "chosen people" = " jew ", "hebrew" = " jew ", "heeb" = " jew swearword ", "libtard." = " liberal swearword ", "n.gg.." = " race swearword ", "kike" = " jew swearword ", "left" = "liberal", "feminazi." = " woman swearword ", "bitch" = " woman swearword ", "slut" = " woman swearword ", "whore" = " woman swearword ", "cunt" = " woman swearword ", "goy.*? " = "goy", "wetbag." = " race swearword ", "chink." = " race swearword ", "dindu." = " race swearword ", "raghead." = " race swearword " ) prep <- df_all %>% mutate(id = 1:n()) %>% mutate(text = text %>% stringr::str_to_lower() %>% tidyTX::tx_replace_punc() %>% #tidyTX::tx_replace_twitter(replace_hash = T, replace_hndl = T) %>% tidyTX::tx_replace_url() %>% tidyTX::tx_replace_punc() %>% tidyTX::tx_map_dict(twitter_dict, key1 = 0, key2 = 1) ) %>% tidytext::unnest_tokens(word, text, to_lower = F) %>% left_join(tidyTX::hash_lemma_en, by = "word") %>% mutate(lemma = ifelse(is.na(lemma), word, lemma)) %>% dplyr::anti_join(tidyTX::stop_words_en, by = "word") %>% filter(!stringr::str_detect(lemma, "[[:digit:]]|[[:punct:]]")) %>% filter(nchar(word) > 1) %>% group_by(id) %>% summarise( text_word = paste(word, collapse = " "), text_lemma = paste(lemma, collapse = " ")) %>% ungroup() %>% bind_cols(., df_all %>% dplyr::select(-text)) final <- prep %>% mutate(text = df_all[["text"]]) %>% arrange(sample(1:length(id), length(id))) %>% mutate(index = id) %>% tidyMBO::split_data(p = .8) #table(final$test$alt_dummy)
clean_fun <- function(df){ prep <- df %>% mutate(id = 1:n()) %>% mutate(text = text %>% stringr::str_to_lower() %>% tidyTX::tx_replace_punc() %>% #tidyTX::tx_replace_twitter(replace_hash = T, replace_hndl = T) %>% tidyTX::tx_replace_url() %>% tidyTX::tx_replace_punc() %>% tidyTX::tx_map_dict(twitter_dict, key1 = 0, key2 = 1) ) %>% tidytext::unnest_tokens(word, text, to_lower = F) %>% left_join(tidyTX::hash_lemma_en, by = "word") %>% mutate(lemma = ifelse(is.na(lemma), word, lemma)) %>% dplyr::anti_join(tidyTX::stop_words_en, by = "word") %>% filter(!stringr::str_detect(lemma, "[[:digit:]]|[[:punct:]]")) %>% filter(nchar(word) > 1) %>% group_by(id) %>% summarise( text_word = paste(word, collapse = " "), text_lemma = paste(lemma, collapse = " ")) %>% ungroup() %>% bind_cols(., df %>% dplyr::select(-text)) final <- prep %>% mutate(text = df[["text"]]) %>% arrange(sample(1:length(id), length(id))) %>% mutate(index = id) %>% tidyMBO::split_data(p = .8) return(final) } altright_final <- clean_fun(dt_altright) altlight_final <- clean_fun(dt_altlight) mus_final <- clean_fun(dt_mus)
corpus_description <- function(data, text){ dat <- data %>% dplyr::rename_("my_text" = text) %>% dplyr::mutate(nchar = my_text %>% nchar()) %>% dplyr::mutate(ntok = tidyTX::tx_n_tokens(my_text)) tc <- dat %>% dplyr::select(my_text) %>% tidytext::unnest_tokens(word, my_text, token = "words") %>% dplyr::count(word) %>% dplyr::arrange(desc(n)) out <- list( char = list( mean = mean(dat$nchar, na.rm = T) %>% floor(), med = median(dat$nchar, na.rm = T) ), token = list( mean = mean(dat$ntok, na.rm = T) %>% floor(), med = median(dat$ntok, na.rm = T), quant = quantile(dat$ntok), denc = quantile(dat$ntok, probs = seq(.1:1, by = .1)), n_5 = tc %>% filter(n > 5) %>% nrow(), n_3 = tc %>% filter(n > 3) %>% nrow(), n_all = tc %>% nrow(), tokens = tc ) ) return(out) } explore <- corpus_description(data = final$train, text = "text_lemma") explore$token$n_3 #prep$train %>% head() #listLearners("regr", properties = c("factors", "se")) #listLearnerProperties("regr")
params_glove <- makeParamSet( makeIntegerParam("max_vocab", lower = 2000, upper = 3000), makeIntegerParam("maxlen", lower = 10, upper = 30), makeIntegerParam("batch_size", lower = 1, upper = 20), makeIntegerParam("out_dim", lower = 20, upper = 200) #makeDiscreteParam("out_fun", values = c("softmax", "sigmoid")) ) results_glove <- run_mbo( data = final, params = params_glove, const = list( arch = "glove", target = "altright", text = "text_lemma" ), n_init = 2, n_main = 2, metric = "auc" )
# save(results_glove, file = "shiny_mbo/results/results_glove211.Rdata") dnn <- get(load("shiny_mbo/perform/dnn_lemma_0.Rdata")) gbm <- get(load("shiny_mbo/perform/gbm_1_11.Rdata")) glove <- get(load("shiny_mbo/results/gbm_1.Rdata")) results_glove$params %>% arrange(desc(auc)) final_glove <- results_glove$params %>% arrange(desc(auc)) %>% slice(1) %>% as.list() %>% list(params = ., data = results_glove$data) %>% run_mbo_steps(reconstruct = T, metric = "auc") final_glove$params$arch <- "lstm" container <- gbm container$data <- final m <- fit_glove(final_glove) final_glove %>% run_mbo_steps(reconstruct = T, metric = "auc") glimpse(final_glove$data$train) #mat <- table(dnn$perform, dnn$data$test$alt_type) mat <- table(final_glove$perform, final_glove$data$test$altright) df <- mat %>% as_tibble() %>% tidyr::spread(Var2, n) %>% rename(preds = Var1) nn <- tibble( predict = final_glove$perform ) %>% bind_cols(., final_glove$data$test) glimpse(nn) nn %>% dplyr::mutate(error = predict != altright) %>% dplyr::select(identity:altright, error, -irony) %>% tidyr::gather(var, val, -error) %>% dplyr::group_by(error, var, val) %>% dplyr::tally() %>% ungroup() %>% dplyr::group_by(var, val) %>% mutate(perc = n/sum(n)*100) %>% arrange(var, val) %>% ggplot(aes(val, n, fill = error)) + geom_bar(stat = "identity") + facet_wrap(~var) nn %>% filter(predict == 0) %>% dplyr::mutate(error = predict == alt_type) %>% dplyr::select(identity:alt_type, error, -irony) %>% tidyr::gather(var, val, -error) %>% dplyr::group_by(error, var, val) %>% dplyr::tally() %>% ungroup() %>% dplyr::group_by(var, val) %>% mutate(perc = n/sum(n)*100) %>% arrange(var, val) %>% ggplot(aes(val, n, fill = error)) + geom_bar(stat = "identity") + facet_wrap(~var)
brks <- quantile(as.matrix(mat), probs = seq(.05, .95, .05), na.rm = TRUE) clrs <- round(seq(255, 40, length.out = length(brks) + 1), 0) %>% {paste0("rgb(255,", ., ",", ., ")")} library(DT) DT::datatable(df, rownames = F) %>% formatStyle(names(df %>% dplyr::select(-preds)), backgroundColor = styleInterval(brks, clrs)) %>% formatStyle( names(df), backgroundColor = 'black' ) table(final$test$alt_type, c(final_model$perform-1)) getwd() nn <- dir("shiny_mbo/perform") %>% paste0("shiny_mbo/perform/", .) %>% as.list() %>% purrr::map(~{get(load(.x)) %>% .$params}) %>% purrr::map(as_tibble) %>% purrr::reduce(bind_rows)
params_lstm <- makeParamSet( makeIntegerParam("max_vocab", lower = 2000, upper = 3000), makeIntegerParam("maxlen", lower = 20, upper = 40), makeIntegerParam("batch_size", lower = 1, upper = 20), makeIntegerParam("out_dim", lower = 20, upper = 200), makeIntegerParam("epochs", lower = 3, upper = 6), makeIntegerParam("lstm_dim", lower = 20, upper = 200), makeNumericParam("lstm_drop", lower = 0, upper = .5 ), makeNumericParam("rnn_drop", lower = 0, upper = .5 ) #makeDiscreteParam("out_fun", values = c("softmax", "sigmoid")) ) results_lstm <- run_mbo( data = altright_final, params = params_lstm, const = list( arch = "lstm", target = "altright1", text = "text_lemma" ), n_init = 2, n_main = 3, metric = "accuracy" ) names(final$train) #save(results_lstm, file = "../shiny_mbo/results/results_lstm_final.Rdata")
perform <- results_lstm perform %>% arrange(desc(accuracy)) final_model <- perform %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% run_mbo_steps(const = list(arch = "glove"), data = final, target = "alt_type", text = "text_lemma", metric = "accuracy", reconstruct = T) table(final$test$alt_type, c(final_model$perform-1))
library(h2o) h2o.init(nthreads = 2) h2o.no_progress() # container <- list( # data = final, # params = # c( # list( # arch = "gbm", # target = "altright", # text = "text_lemma" # ), # list( # ntrees = 30, # max_depth = 4, # learn_rate = .3, # sample_rate = .8, # stop_tol = .01, # stop_round = 1, # nbins = 10 # ) # ) # ) %>% # text_to_matrix() # # ncol(container$data$train_input) # sum(container$data$train_input) params_gbm <- makeParamSet( makeDiscreteParam("ngram", values = c("unigram", "bigram")), #makeDiscreteParam("text", values = c("text_lemma", "text_word")), #makeIntegerParam("term_min", lower = 2, upper = 5), makeIntegerParam("max_vocab", lower = 2000, upper = 4000), makeIntegerParam("ntrees", lower = 20, upper = 150), makeIntegerParam("max_depth", lower = 2, upper = 10), makeNumericParam("learn_rate", lower = .1, upper = .9), makeNumericParam("sample_rate", lower = .1, upper = .9), makeNumericParam("stop_tol", lower = .001, upper = .1), makeIntegerParam("stop_round", lower = 1, upper = 3), makeIntegerParam("nbins", lower = 10, upper = 20) ) load("shiny_mbo/results/gbm_2.Rdata") table(results_gbm$train$altright1) results_gbm$params results_gbm <- run_mbo( data = altright_final, params = params_gbm, prior = results_gbm$params, const = list( arch = "gbm", target = "altright1", text = "text_lemma", balance = T ), n_init = 6, n_main = 30, metric = "accuracy" ) save(results_gbm, file = "shiny_mbo/results/gbm_20.Rdata") h2o::h2o.shutdown(prompt = F)
perform <- results_gbm perform$params %>% arrange(desc(accuracy)) %>% glimpse()
h2o.init() final_model <- perform$params %>% arrange(desc(auc)) %>% slice(1) %>% as.list() %>% list(params = ., data = perform$data) %>% run_mbo_steps( metric = "auc", reconstruct = T ) container <- perform$params %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% list(params = ., data = perform$data) model <- perform$params %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% list(params = ., data = perform$data) %>% text_to_matrix() %>% fit_gbm() h2o::h2o.saveModel(model, path = "models", force = T) model1 <- h2o::h2o.loadModel("models/GBM_model_R_1525530020939_263") save(container, file = "models/container_gbm1.Rdata") # p <- h2o.performance(model, test_hex) # cm <- h2o.confusionMatrix(p) table(final_model$perform, final_model$data$test$altright) Metrics::accuracy(final_model$perform, final_model$data$test$altright) names(final_model$data$test$race) nn <- cbind(pred = final_model$perform$predict, final$test) glimpse(nn) nn %>% filter(pred == 0 & alt_type == 2) %>% dplyr::select(pred, alt_type, text, text_lemma) nn$text
library(h2o) h2o.init(nthreads = 2) h2o.no_progress() params_dnn <- makeParamSet( makeDiscreteParam("ngram", values = c("unigram", "bigram")), #makeIntegerParam("term_min", lower = 2, upper = 5), makeIntegerParam("max_vocab", lower = 2000, upper = 3000), makeIntegerParam("hidden1", lower = 60, upper = 200), makeIntegerParam("hidden2", lower = 60, upper = 200), # before stopping makeIntegerParam("epochs", lower = 3, upper = 6), # need for splitting # makeNumericParam("rho", lower = .95, upper = .9999999), #makeNumericParam("epsilon", lower = 1e-9, upper = 1e-07), #makeNumericParam("rate", lower = 0.0005, upper = 0.05), #makeNumericParam("rate_annealing", lower = 1e-07, upper = 1e-05), # use a #makeNumericParam("momentum_start", lower = 0, upper = 1), # need for splitting #makeNumericParam("input_dropout_ratio", lower = 0, upper = 0.3), #makeNumericParam("hidden_dropout1", lower = .1, upper = .9), #makeNumericParam("hidden_dropout2", lower = .1, upper = .9), makeNumericParam("l1", lower = 0, upper = 0.1), makeNumericParam("l2", lower = 0, upper = 0.1), ParamHelpers::makeDiscreteParam("activation", values = c("Tanh", "TanhWithDropout", "Rectifier","RectifierWithDropout", "Maxout", "MaxoutWithDropout")) ) results_dnn <- run_mbo( data = altright_final, params = params_dnn, #prior = results_dnn$params, const = list( arch = "dnn", target = "altright1", text = "text_lemma", balance = T ), n_init = 6, n_main = 30, metric = "accuracy" ) h2o.shutdown(prompt = F)
#save(results_dnn, file = "shiny_mbo/results/dnn_1.Rdata") perform <- results_dnn perform$params %>% arrange(desc(accuracy)) %>% glimpse()
h2o.init() dnn <- results_dnn$params %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% list(params = ., data = results_dnn$data) %>% text_to_matrix() %>% #fit_dnn() run_mbo_steps( metric = "accuracy", reconstruct = T ) table(results_dnn$params$activation) table(dnn$perform, dnn$data$test$altright1) perform1 <- get(load("shiny_mbo/results/gbm_1.Rdata")) gbm <- perform1$params %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% list(params = ., data = perform1$data) %>% text_to_matrix() %>% fit_gbm() con <- perform1$params %>% arrange(desc(accuracy)) %>% slice(1) %>% as.list() %>% list(params = ., data = perform1$data) %>% text_to_matrix() train_dtm <- con$data$train_input %>% h2o::as.h2o() x <- colnames(train_dtm) y_col <- con$data$train[[con$params$target]] %>% as.factor() %>% as.h2o() y <- colnames(y_col) h2o_train_dtm <- h2o::h2o.cbind(train_dtm, y_col) h2o::h2o.stackedEnsemble() # Train a stacked ensemble using the GBM and RF above devtools::install_github("h2oai/h2o-3/h2o-r/ensemble/h2oEnsemble-package") ensemble <- h2o.stackedEnsemble( x = x, y = y, training_frame = h2o_train_dtm, #model_id = "fart1", base_models = list(gbm, dnn) ) models <- list(gbm, dnn) metalearner <- "h2o.glm.wrapper" library(h2oEnsemble) stack <- h2oEnsemble::h2o.stack(models = models, response_frame = y_col[y], metalearner = metalearner, seed = 1, keep_levelone_data = TRUE) test_dtm <- con$data$test_input %>% h2o::as.h2o() # Train a stacked ensemble using the GBM and RF above ensemble <- h2o.stackedEnsemble( x = x, y = y, training_frame = h2o_train_dtm, base_models = list(dnn, gbm)) preds <- h2oEnsemble::predict.h2o.ensemble(stack, newdata = as.h2o(con$data$test_input)) %>% tibble::as_tibble() perform <- get_perform(container$data$test[[container$params$target]], preds$predict) # Compute test set performance: perf <- h2o.ensemble_performance(stack, newdata = test_dtm) # p <- h2o.performance(model, test_hex) # cm <- h2o.confusionMatrix(p) table(final_model1$perform, final_model1$data$test$race)
h2o.init() final_model <- run_mbo_steps( params = NULL, data = final, const = list(arch = "dnn"), target = "alt_type", text = "text_lemma", metric = "accuracy", reconstruct = F )
perform <- run_mbo_steps( NULL, data = final, const = list(arch = "dnn"), target = "alt_type", text = "text_lemma", metric = "accuracy" )
constructor <- smoof::makeSingleObjectiveFunction( name = "sdfsdfd", fn = function(x) { perform <- run_mbo_steps(x, data = final, const = list(arch = "dnn"), target = "alt_type", text = "text_lemma", metric = "accuracy") return(perform) }, par.set = params, has.simple.signature = F, # function expects a named list of parameter values minimize = minimize # to increase accuracy )
library(h2o) h2o.init(nthreads = 2) h2o.no_progress() params_xgboost <- makeParamSet( makeIntegerParam("max_features", lower = 2500, upper = 3000), makeIntegerParam("ntrees", lower = 30, upper = 120), makeIntegerParam("max_depth", lower = 2, upper = 20), makeIntegerParam("min_rows", lower = 1, upper = 3), makeNumericParam("learn_rate", lower = .5, upper = .9), makeNumericParam("sample_rate", lower = .5, upper = .9), makeNumericParam("col_sample_rate", lower = .5, upper = .9), makeNumericParam("reg_lambda", lower = 0, upper = 0.005), # L2 makeNumericParam("reg_alpha", lower = 0, upper = 0.005) #L1 ) results_xgboost <- run_mbo( data = final, params = params_xgboost, const = list(arch = "xgboost"), target = "alt_type", text = "text_lemma", name = "stack_model1", n_init = 6, n_main = 30, metric = "accuracy", # experimental stage parallel = F # Only Unix/Mac no Windows support ) h2o.shutdown(prompt = F)
library(h2o) h2o.init(nthreads = 2) h2o.no_progress() params_nb <- makeParamSet( makeIntegerParam("laplace", lower = 0, upper = 5) ) results_nb <- run_mbo( data = final, params = params_nb, const = list(arch = "nb"), target = "alt_type", text = "text_lemma", name = "stack_model1", n_init = 5, n_main = 5, metric = "accuracy", # experimental stage parallel = F # Only Unix/Mac no Windows support ) h2o.shutdown(prompt = F)
df_all %>% group_by(platform) %>% summarise(na = sum(!is.na(likes))) df_all %>% filter(likes > 100) %>% mutate(alt_type = factor(alt_type, levels = c("altright", "altlight", "none"))) %>% ggplot(aes(alt_type, likes, fill = alt_type)) + geom_boxplot() df_all %>% filter(shares > 500) %>% mutate(alt_type = factor(alt_type, levels = c("altright", "altlight", "none"))) %>% ggplot(aes(alt_type, shares, fill = alt_type)) + geom_boxplot() df_all %>% filter(comments > 100) %>% mutate(alt_type = factor(alt_type, levels = c("altright", "altlight", "none"))) %>% ggplot(aes(alt_type, comments, fill = alt_type)) + geom_boxplot() df_all %>% filter(likes > 5) %>% mutate(race = as.factor(race)) %>% mutate(alt_type = factor(alt_type, levels = c("altright", "altlight", "none"))) %>% ggplot(aes(race, likes, fill = race)) + geom_boxplot() + geom_violin() df_all %>% filter(likes > 10) %>% mutate(anti_sem = as.factor(anti_sem)) %>% mutate(alt_type = factor(alt_type, levels = c("altright", "altlight", "none"))) %>% ggplot(aes(anti_sem, likes, fill = anti_sem)) + geom_violin() # table(is.na(df_all$likes)) df_all %>% dplyr::select(likes, race, anti_sem) %>% tidyr::gather("var", "value", -likes) %>% #filter(likes > 0) %>% mutate(value = as.factor(value)) %>% group_by(var, value) %>% summarise(m = median(likes, na.rm = T), s = sd(likes, na.rm = T)) ggplot(aes(var, likes, fill = value)) + # geom_violin() #geom_boxplot(outlier.colour = NA) + ylim(0, 10) #facet_wrap(~var)
max_features <- 2500 # top most common words batch_size <- 10 maxlen <- 15 # Cut texts after this number of words (called earlier) tokenizer <- text_tokenizer(num_words = max_features) fit_text_tokenizer(tokenizer, x = final$train$text_lemma) #keras::save_text_tokenizer(tokenizer, "data/tokenizer") #tokenizer <- keras::load_text_tokenizer("data/tokenizer") final$train_seq <- tokenizer %>% texts_to_sequences(final$train$text_lemma) %>% pad_sequences(maxlen = maxlen, value = 0) final$test_seq <- tokenizer %>% texts_to_sequences(final$test$text_lemma) %>% pad_sequences(maxlen = maxlen, value = 0)
glove_fit <- keras_model_sequential() %>% layer_embedding( input_dim = 2500, output_dim = 128, input_length = 15 ) %>% layer_global_average_pooling_1d() %>% layer_dense(3, activation = "sigmoid") %>% compile( loss = "binary_crossentropy", optimizer = "adam", metrics = "accuracy" ) summary(glove_fit) glove_fit
glove_hist <- glove_fit %>% keras::fit( x = final$train_seq, y = tidyTX::tx_onehot(final$train$alt_type), batch_size = batch_size, epochs = 3, validation_split = .2 )
preds_glove <- glove_fit %>% tidyTX::tx_keras_predict(final$test_seq, 0) %>% as.vector() length(preds_glove) length(final$test$alt_type) table(preds_glove, final$test$alt_type) caret::confusionMatrix(preds_glove, final$test$alt_type)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.