This vignette is based on data collected for the FiveThirtyEight story "The World's Favorite Donald Trump Tweets" by Leah Libresco, published on FiveThirtyEight's website.
Load required packages to reproduce analysis.
# Attach every package used in this vignette.
library(fivethirtyeight)
library(ggplot2)
library(dplyr)
library(readr)
library(tidytext)
library(textdata)
library(stringr)
library(lubridate)
library(knitr)
library(hunspell)

# Turn off scientific notation
options(scipen = 99)
## Check out structure and date range ----------------------------------------
# Earliest and latest tweet dates in the dataset; the outer parentheses make
# the assignments print their value as well.
(minDate <- min(date(trump_twitter$created_at)))
(maxDate <- max(date(trump_twitter$created_at)))
# Stem a single token with hunspell: return the first suggested stem, or the
# token unchanged when hunspell_stem() finds none (it returns an empty vector
# in that case).
my_hunspell_stem <- function(token) {
  stems <- hunspell_stem(token)[[1]]
  if (length(stems) == 0) {
    token
  } else {
    stems[1]
  }
}

# Vectorized wrapper so the stemmer can be applied to a whole column of
# tokens inside mutate().
vec_hunspell_stem <- Vectorize(my_hunspell_stem, "token")
We first remove URLs and then remove stopwords as catalogued in the tidytext
package. Stopwords are very common English words (e.g. "the", "and") that carry little meaning on their own. We also spell-check and stem each token using the hunspell package.
# Tokenize the tweets: strip URLs and digits, split into words, stem each
# word, then drop stopwords.
trump_tokens <- trump_twitter %>%
  # Remove URLs from the tweet text.
  # NOTE(review): this pattern also deletes any literal "www" substring on
  # its own, leaving the rest of a "www.*" URL behind — confirm intent.
  mutate(text = str_replace_all(text,
                                pattern = regex("(www|https?[^\\s]+)"),
                                replacement = "")) %>%
  # Remove digits before tokenizing
  mutate(text = str_replace_all(text,
                                pattern = "[[:digit:]]",
                                replacement = "")) %>%
  # One row per word token
  unnest_tokens(tokens, text) %>%
  # Spell-check / stem each token via hunspell
  mutate(tokens = vec_hunspell_stem(tokens)) %>%
  # Drop common English stopwords (tidytext's stop_words)
  filter(!(tokens %in% stop_words$word))
To measure the sentiment of tweets, we used the AFINN lexicon for each (non-stop) word in a tweet. The score runs between -5 and 5. We then sum the scores for each word across all words in one tweet to get a total tweet sentiment score.
# Load the AFINN lexicon bundled with the fivethirtyeight package.
afinn_sentiment <- system.file("extdata", "afinn.csv",
                               package = "fivethirtyeight") %>%
  read_csv()

# Keep only tokens that appear in the AFINN lexicon; each carries a value
# between -5 and 5.
trump_sentiment <- trump_tokens %>%
  inner_join(afinn_sentiment, by = c("tokens" = "word"))

# One score per tweet: sum the word scores within each tweet id. The right
# join brings back tweets with no scored words (their score is NA), and
# score_factor buckets each tweet by the sign of its score.
trump_full_text_sent <- trump_sentiment %>%
  group_by(id) %>%
  summarise(score = sum(value, na.rm = TRUE)) %>%
  ungroup() %>%
  right_join(trump_twitter, by = "id") %>%
  mutate(score_factor = case_when(
    is.na(score) ~ "Missing score",
    score < 0    ~ "-.Negative",
    score == 0   ~ "0",
    TRUE         ~ "+.Pos"
  ))
# Count tweets in each sentiment bucket and the share of the total each
# bucket represents.
trump_full_text_sent %>%
  count(score_factor) %>%
  mutate(prop = n / sum(n))
46.4% of tweets did not have sentiment scores. 15.4% were net negative and 36.6% were net positive.
# Distribution of per-tweet sentiment scores (ggplot drops the NA scores
# with a warning).
ggplot(data = trump_full_text_sent, aes(x = score)) +
  geom_histogram(bins = 10)
# Sentiment of each scored tweet plotted over time.
sentOverTimeGraph <- ggplot(
  data = filter(trump_full_text_sent, !is.na(score)),
  aes(x = created_at, y = score)
) +
  geom_line() +
  geom_point() +
  xlab("Date") +
  ylab("Sentiment (afinn)") +
  ggtitle(paste0("Trump Tweet Sentiment (", minDate, " to ", maxDate, ")"))
sentOverTimeGraph
# The five tweets with the highest total AFINN score.
most_pos_trump <- trump_full_text_sent %>%
  arrange(desc(score)) %>%
  head(n = 5) %>%
  pull(text)
kable(most_pos_trump, format = "html")
# The five tweets with the lowest total AFINN score.
most_neg_trump <- trump_full_text_sent %>%
  arrange(score) %>%
  head(n = 5) %>%
  pull(text)
kable(most_neg_trump, format = "html")
Total number of tweets and average sentiment (when available) by hour of the day, day of the week, and month
# Derive the time-grouping columns used by plotSentByTime() below.
trump_tweet_times <- trump_full_text_sent %>%
  mutate(weekday = wday(created_at, label=TRUE),
         month = month(created_at, label=TRUE),
         hour = hour(created_at),
         month_over_time = round_date(created_at,"month"))

# Bar chart of tweet count per time bucket, filled by mean sentiment.
#
# timeGroupVar names one of the columns created above. substitute() captures
# the caller's argument expression unevaluated; in this vignette every call
# passes a quoted string (e.g. "hour"), so timeVar holds that character
# constant and the !! unquote renames by name.
# NOTE(review): presumably a bare symbol argument would also work via the
# same unquoting — not exercised here, confirm before relying on it.
plotSentByTime <- function(trump_tweet_times, timeGroupVar) {
  timeVar <- substitute(timeGroupVar)
  # Title-case label for the axis and plot title (e.g. "hour" -> "Hour")
  timeVarLabel <- str_to_title(timeVar)
  # Mean sentiment (NA scores excluded) and tweet count per time bucket
  trump_tweet_time_sent <- trump_tweet_times %>%
    rename(timeGroup = !! timeVar) %>%
    group_by(timeGroup) %>%
    summarise(score = mean(score, na.rm=TRUE),
              Count = n()) %>%
    ungroup()
  ggplot(trump_tweet_time_sent, aes(x=timeGroup, y=Count, fill = score)) +
    geom_bar(stat="identity") +
    xlab(timeVarLabel) +
    ggtitle(paste("Trump Tweet Count & Sentiment by", timeVarLabel))
}
# Tweet count and mean sentiment by hour of the day
plotSentByTime(trump_tweet_times, "hour")
# ... by day of the week
plotSentByTime(trump_tweet_times, "weekday")
# ... by calendar month across the collection period
plotSentByTime(trump_tweet_times, "month_over_time")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.