# Global knitr chunk option: echo the source of every chunk in the rendered
# output. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
knitr::opts_chunk$set(echo = TRUE)
The underlying dataset consists of 9,000 tweets, collected on 2019-02-02 via rtweet. It contains 1,500 tweets from each of six accounts.
From each account, 1,200 tweets (80%) are included in the training dataset and 300 (20%) in the verification set.
# Load the tidyverse through pacman.
pacman::p_load(tidyverse)

# Stack the training and test tweets into a single table. `split` records the
# origin of each row: TRUE for rows read from train_tweets.csv, FALSE for rows
# read from test_tweets.csv.
tweets <- read_csv("train_tweets.csv") %>%
  mutate(split = TRUE) %>%
  bind_rows(read_csv("test_tweets.csv")) %>%
  # Rows appended from the test file carry no `split` value yet (NA after
  # binding), so every NA is turned into FALSE here.
  mutate(split = ifelse(is.na(split), FALSE, split)) %>%
  glimpse()

# Sanity check: number of rows on each side of the split.
tweets %>%
  count(split)

# save(tweets, file = "tweets.Rdata")
# One-hot encode the account name: one indicator column per distinct value of
# `tweets$name`.
celeb_mat <- tweets$name %>%
  dummies::dummy()

# Column labels are taken from the factor levels of `name` instead of being
# recovered from the generated column names with a regex. The generated names
# carry a prefix that depends on where the code is run from (e.g. "name", or
# the path of the document being rendered), so stripping a hard-coded path only
# works on one machine.
celeb_names <- levels(factor(tweets$name))

# Guard the assumption that the dummy columns are ordered by factor level:
# every generated column name must end with the label assigned to it.
stopifnot(
  length(celeb_names) == ncol(celeb_mat),
  all(endsWith(colnames(celeb_mat), celeb_names))
)

# Target table: the indicator columns, relabelled with the plain account names.
celeb_target <- celeb_mat %>%
  as_tibble() %>%
  set_names(celeb_names) %>%
  glimpse()
# Fix the RNG seed for this chunk.
set.seed(2019)

# Tweet texts on each side of the train/test flag.
text_train <- pull(filter(tweets, split), text)
text_test <- pull(filter(tweets, !split), text)

# Matching rows of the one-hot target table, converted to plain matrices.
y_train <- as.matrix(celeb_target[tweets$split, ])
y_test <- as.matrix(celeb_target[!tweets$split, ])

# Bundle the four pieces into one named list.
celeb_dat <- list(
  text_train = text_train,
  text_test = text_test,
  y_train = y_train,
  y_test = y_test
)

# Print a compact overview, then persist the bundle to disk.
glimpse(celeb_dat)
save(celeb_dat, file = "celeb_dat.Rdata")
library(keras)

# Length passed to pad_sequences() for every tokenised tweet.
maxlen <- 60
# Passed to text_tokenizer() as `num_words`.
max_features <- 13488

# Word-level tokenizer: splits on single spaces and keeps the original case.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
tokenizer <- text_tokenizer(
  num_words = max_features,
  lower = FALSE,
  split = " ",
  char_level = FALSE
)

# NOTE(review): `train`, `test` and `celeb` are not defined in the visible part
# of this document; they are presumably created or loaded elsewhere - confirm
# they are in scope before this chunk runs.

# The tokenizer is fitted on the training texts only.
fit_text_tokenizer(tokenizer, train$text)

# Turn each text into a sequence of integer indices and pad it to `maxlen`.
celeb$x_train <- tokenizer %>%
  texts_to_sequences(train$text) %>%
  pad_sequences(maxlen = maxlen)

celeb$x_test <- tokenizer %>%
  texts_to_sequences(test$text) %>%
  pad_sequences(maxlen = maxlen)

# Show the dimensions of every element stored in `celeb`.
celeb %>%
  map(dim)

save(celeb, file = "celeb.Rdata")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.