This vignette is based on data collected for the 538 story entitled "The World's Favorite Donald Trump Tweets" by Leah Libresco available here.

Load required packages to reproduce analysis.

# Attach the packages whose functions the code below calls, so the analysis
# runs from a fresh R session.
library(fivethirtyeight) # ships the afinn.csv lexicon file read further down
library(dplyr)           # %>%, mutate(), group_by(), summarise(), joins
library(ggplot2)         # ggplot(), geom_*()
library(tidytext)        # unnest_tokens(), stop_words
library(stringr)         # str_replace_all(), str_to_title()
library(lubridate)       # date(), wday(), month(), hour(), round_date()
library(hunspell)        # hunspell_stem()
library(knitr)           # kable()

# Turn off scientific notation: a large scipen penalty makes R prefer fixed
# notation when printing numbers.
options(scipen = 99)

Check date range of tweets

## Date range covered by the tweets --------------------------------------------
# date() preserves ordering, so the calendar day of the earliest/latest
# timestamp is the earliest/latest calendar day. The outer parentheses print
# each value as it is assigned.
(minDate <- date(min(trump_twitter$created_at)))
(maxDate <- date(max(trump_twitter$created_at)))

Create vectorised stemming function using hunspell

# Stem a single token with hunspell.
#
# token: a length-one character vector (one word).
# Returns the first stem that hunspell_stem() reports for the token, or the
# token itself, unchanged, when hunspell_stem() reports no stem at all.
my_hunspell_stem <- function(token) {
  # hunspell_stem() returns a list; [[1]] holds the stems for this one token.
  stem_token <- hunspell_stem(token)[[1]]
  if (length(stem_token) == 0) {
    token
  } else {
    stem_token[1]
  }
}

# Element-wise wrapper so the stemmer can be applied to a whole column.
vec_hunspell_stem <- Vectorize(my_hunspell_stem, "token")

Clean text by tokenizing & removing urls/stopwords

We first remove URLs and digits from the tweet text, split it into one-word tokens, and stem each token using hunspell. We then drop stopwords, which are common words in English, as specified in the tidytext library.

# Build one row per token from the tweet text: strip URLs and digits,
# tokenize, stem each token, then drop stopwords.
trump_tokens <- trump_twitter %>%
  # rm urls: anything starting with http://, https:// or www. up to the next
  # whitespace character
  mutate(text = str_replace_all(text,
                                pattern = "(https?://|www\\.)\\S+",
                                replacement = "")) %>%
  # rm digits
  mutate(text = str_replace_all(text,
                                pattern = "[[:digit:]]",
                                replacement = "")) %>%
  unnest_tokens(tokens, text) %>% # tokenize
  mutate(tokens = vec_hunspell_stem(tokens)) %>% # stem
  filter(!(tokens %in% stop_words$word)) # rm stopwords

Sentiment analysis

To measure the sentiment of tweets, we used the AFINN lexicon for each (non-stop) word in a tweet. The score runs between -5 and 5. We then sum the scores for each word across all words in one tweet to get a total tweet sentiment score.

# Locate the AFINN lexicon file inside the installed fivethirtyeight package.
# system.file() returns "" when the file is absent, so fail with a clear
# message instead of a confusing read error.
afinn_path <- system.file("extdata", "afinn.csv", package = "fivethirtyeight")
if (!nzchar(afinn_path)) {
  stop("afinn.csv was not found in the installed fivethirtyeight package",
       call. = FALSE)
}

# The code below relies on the columns `word` (join key) and `value` (score).
afinn_sentiment <- read.csv(afinn_path, stringsAsFactors = FALSE)

# Keep only the tokens that have an AFINN score.
trump_sentiment <- trump_tokens %>%
  inner_join(afinn_sentiment, by = c("tokens" = "word"))

# Total AFINN score per tweet, joined back onto the full tweet table and
# labelled with a sentiment category.
trump_full_text_sent <- trump_sentiment %>%
  group_by(id) %>%
  summarise(score = sum(value, na.rm = TRUE)) %>%
  ungroup() %>%
  # right_join keeps tweets that had no scored token; their score is NA
  right_join(trump_twitter, by = "id") %>%
  mutate(score_factor = case_when(
    is.na(score) ~ "Missing score",
    score < 0 ~ "-.Negative",
    score == 0 ~ "0",
    TRUE ~ "+.Pos"
  ))

Distribution of sentiment scores

# Number of tweets in each sentiment category and each category's share of
# all tweets.
trump_full_text_sent %>%
  count(score_factor) %>%
  mutate(prop = n / sum(n))

46.4% of tweets did not have sentiment scores. 15.4% were net negative and 36.6% were net positive.

# Histogram of the per-tweet sentiment scores, using 10 bins.
ggplot(trump_full_text_sent, mapping = aes(x = score)) +
  geom_histogram(bins = 10)

Plot sentiment over time

# Line-and-point plot of each scored tweet's sentiment against its timestamp.
# Tweets without a score (NA) are dropped before plotting.
sentOverTimeGraph <- ggplot(data = filter(trump_full_text_sent, !is.na(score)),
                            aes(x = created_at, y = score)) +
  geom_line() +
  geom_point() +
  xlab("Date") +
  ylab("Sentiment (afinn)") +
  ggtitle(paste0("Trump Tweet Sentiment (", minDate, " to ", maxDate, ")"))

# Display the plot; the assignment above does not print it.
sentOverTimeGraph

Examine top 5 most positive tweets

# The five tweets with the highest total sentiment score. arrange() sorts
# missing scores last, so unscored tweets do not reach the top five.
most_pos_trump <- trump_full_text_sent %>%
  arrange(desc(score)) %>%
  head(n = 5)

kable(most_pos_trump, format = "html")

Examine top 5 most negative tweets

# The five tweets with the lowest total sentiment score. arrange() sorts
# missing scores last, so unscored tweets do not reach the top five.
most_neg_trump <- trump_full_text_sent %>%
  arrange(score) %>%
  head(n = 5)

kable(most_neg_trump, format = "html")

When is Trump's favorite time to tweet?

Total number of tweets and average sentiment (when available) by hour of the day, day of the week, and month

# Add calendar breakdowns of each tweet's timestamp: labelled weekday,
# labelled month, hour of day, and the timestamp rounded to the nearest month.
trump_tweet_times <- mutate(
  trump_full_text_sent,
  weekday = wday(created_at, label = TRUE),
  month = month(created_at, label = TRUE),
  hour = hour(created_at),
  month_over_time = round_date(created_at, unit = "month")
)

# Bar chart of tweet counts per time bucket, with bars shaded by the mean
# sentiment score of the tweets in that bucket.
#
# trump_tweet_times: data frame containing a `score` column and the column
#   named by timeGroupVar.
# timeGroupVar: the column to group by, given as a string (as in the calls
#   below) or as a bare column name.
# Returns the ggplot object.
plotSentByTime <- function(trump_tweet_times, timeGroupVar) {
  # as.character() leaves a quoted name unchanged and converts a bare column
  # name to the same string form.
  timeVar <- as.character(substitute(timeGroupVar))
  timeVarLabel <- str_to_title(timeVar)

  # Mean score (ignoring unscored tweets) and tweet count per bucket.
  trump_tweet_time_sent <- trump_tweet_times %>%
    rename(timeGroup = !!timeVar) %>%
    group_by(timeGroup) %>%
    summarise(score = mean(score, na.rm = TRUE), Count = n()) %>%
    ungroup()

  ggplot(trump_tweet_time_sent, aes(x = timeGroup, y = Count, fill = score)) +
    geom_bar(stat = "identity") +
    xlab(timeVarLabel) +
    ggtitle(paste("Trump Tweet Count & Sentiment by", timeVarLabel))
}

plotSentByTime(trump_tweet_times, "hour")
plotSentByTime(trump_tweet_times, "weekday")
plotSentByTime(trump_tweet_times, "month_over_time")

