scripts/twitter_rtweet.R

library(ggplot2)
library(dplyr)
library(tidytext)
library(igraph)
library(ggraph)
# devtools::install_github("dgrtwo/widyr")
library(widyr)
library(tidyr)
# json libraries
library(rjson)
library(jsonlite)
# date time
library(lubridate)
library(zoo)

#### rtweet ####
library(rtweet)
## replace the placeholders with your own app credentials -- never commit real
## keys to a public repository (see the environment-variable sketch below)
token <- create_token(
  app = "mt_crypto",
  consumer_key = "YOUR_CONSUMER_KEY",
  consumer_secret = "YOUR_CONSUMER_SECRET",
  access_token = "YOUR_ACCESS_TOKEN",
  access_secret = "YOUR_ACCESS_SECRET")
## check to see if the token is loaded
identical(token, get_token())
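## a minimal sketch of loading the same credentials from environment variables
## (e.g. set in ~/.Renviron) instead of hard-coding them; the variable names
## below are illustrative, not part of rtweet itself
# token <- create_token(
#   app = "mt_crypto",
#   consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY"),
#   consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
#   access_token = Sys.getenv("TWITTER_ACCESS_TOKEN"),
#   access_secret = Sys.getenv("TWITTER_ACCESS_SECRET"))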

searchTerm <- "#rstats"
rt <- search_tweets(
  searchTerm, n = 18000, include_rts = FALSE
)
rtBackup <- rt
kp <- get_timeline(user = "@katyperry", n = 1000)
kk <- get_timeline(user = "@KimKardashian", n = 1000)
ag <- get_timeline(user = "@ArianaGrande", n = 1000)

## ts_plot() returns ggplot2 objects, so par(mfrow = ...) has no effect on them;
## stack the three timelines with gridExtra instead
gridExtra::grid.arrange(
  ts_plot(kp, by = "1 weeks"),
  ts_plot(kk, by = "1 weeks"),
  ts_plot(ag, by = "1 weeks"),
  ncol = 1
)

## plot time series of tweets
ts_plot(rt, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )
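## optionally write the frequency plot to disk; ggsave() saves the last plot
## displayed (the file name here is just an example)
# ggplot2::ggsave("rstats_tweet_frequency.png", width = 8, height = 4)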

## search for 250,000 tweets containing the word data
rt <- search_tweets(
  "data", n = 250000, retryonratelimit = TRUE
)

## search for 10,000 tweets sent from the US
rt <- search_tweets(
  "lang:en", geocode = lookup_coords("usa"), n = 10000
)

## create lat/lng variables using all available tweet and profile geo-location data
rt <- lat_lng(rt)
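## quick sanity check: lat_lng() adds lat/lng columns, but only a minority of
## tweets carry usable geo-location data
sum(!is.na(rt$lat) & !is.na(rt$lng))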

## plot state boundaries
par(mar = c(0, 0, 0, 0))
maps::map("state", lwd = .25)

## plot lat and lng points onto state map
with(rt, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))


#### Ex 2 ####
rstats <- search_tweets(q = "#rstats", n = 500)
head(rstats, n = 3)

rstats_tweets <- search_tweets("#rstats", n = 500,
                               include_rts = FALSE)
head(rstats_tweets$screen_name)
unique(rstats_tweets$screen_name)

users <- search_users("#rstats",
                      n = 500)

length(unique(rt$location))
rt %>%
  ggplot(aes(location)) +
  geom_bar() + coord_flip() +
  labs(x = "Count", y = "location", title = "Twitter users - unique locations")


rt %>%
  count(location, sort = TRUE) %>%
  mutate(location = reorder(location, n)) %>%
  na.omit() %>%
  top_n(20) %>%
  ggplot(aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Count",
       y = "Location",
       title = "Where Twitter users are from - unique locations ")


# strip URLs from the tweet text ("http.*" also covers https links)
rt$stripped <- gsub("http.*", "", rt$text)
# unnest_tokens() below handles punctuation removal and lowercasing
rt_tweets_clean <- rt %>%
  dplyr::select(stripped) %>%
  unnest_tokens(word, stripped)

# plot the top 15 words -- notice any issues?
rt_tweets_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "Count",
       y = "Unique words",
       title = "Count of unique words found in tweets")

# load list of stop words - from the tidytext package
data("stop_words")
head(stop_words)
nrow(rt_tweets_clean)

rt_tweets_words <- rt_tweets_clean %>%
  anti_join(stop_words)

nrow(rt_tweets_words)
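# number of tokens dropped by the stop-word filter
nrow(rt_tweets_clean) - nrow(rt_tweets_words)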

# plot the top 15 words -- notice any issues?
rt_tweets_words %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Count of unique words found in tweets",
       subtitle = "Stop words removed from the list")

rt_tweets_paired_words <- rt %>%
  dplyr::select(stripped) %>%
  unnest_tokens(paired_words, stripped, token = "ngrams", n = 2)

rt_tweets_paired_words %>%
  count(paired_words, sort = TRUE)

rt_tweets_separated_words <- rt_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

rt_tweets_filtered <- rt_tweets_separated_words %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
rt_words_counts <- rt_tweets_filtered %>%
  count(word1, word2, sort = TRUE)

head(rt_words_counts)

rt_words_counts %>%
  filter(n >= 24) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Word Network: Tweets using the hashtag - bitcoin and btc",
       subtitle = "Text mining twitter data ",
       x = "", y = "")

#### Lesson 4 ####
# json support
library(rjson)
library(jsonlite)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
library(tidyr)
# text mining library
library(tidytext)
library(tm)
# coupled words analysis
library(widyr)
# plotting packages
library(igraph)
library(ggraph)

options(stringsAsFactors = FALSE)

# create new df with the tweet text & usernames
tweet_data <- tibble(date_time = rt$created_at,
                     username = rt$screen_name,
                     tweet_text = rt$text)
head(tweet_data)
#format = "%Y-%m-%d %H:%M:%s"
xts::periodicity(tweet_data$date_time)
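## the start/end window below is hard-coded; checking the actual range of the
## collected tweets helps pick sensible bounds
range(tweet_data$date_time)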
start_date <- as.POSIXct('2018-10-09 05:00:00')
end_date <- as.POSIXct('2018-10-10 11:20:00')

# cleanup
rt_tweets <- tweet_data %>%
  mutate(date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y")) %>%
  filter(date_time >= start_date & date_time <= end_date )

rt_tweet_messages <- rt_tweets %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text)

data("stop_words")
nrow(rt_tweet_messages)

rt_tweet_clean <- rt_tweet_messages %>%
  anti_join(stop_words) %>%
  filter(!word == "rt")

# how many words after removing the stop words?
nrow(rt_tweet_clean)
rt_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "Count",
       y = "Unique words",
       title = "Count of unique words found in tweets")

# cleanup
rt_tweet_clean <- tweet_data %>%
  mutate(date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y"),
         tweet_text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
                           "", tweet_text)) %>%
  filter(date_time >= start_date & date_time <= end_date ) %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words) %>%
  filter(!word == "rt") # remove all rows that contain "rt" or retweet

rt_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Count of unique words found in tweets")

rt_tweets_paired <- tweet_data %>%
  dplyr::select(tweet_text) %>%
  mutate(tweet_text = removeWords(tweet_text, stop_words$word)) %>%
  mutate(tweet_text = gsub("\\brt\\b|\\bRT\\b", "", tweet_text)) %>%
  mutate(tweet_text = gsub("http://*", "", tweet_text)) %>%
  unnest_tokens(paired_words, tweet_text, token = "ngrams", n = 2)

rt_tweets_paired %>%
  count(paired_words, sort = TRUE)

rt_tweets_separated <- rt_tweets_paired %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

rt_word_counts <- rt_tweets_separated %>%
  count(word1, word2, sort = TRUE)
head(rt_word_counts)

rt_word_counts %>%
  filter(n >= 50) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Word Network: Tweets during the day Colorado rt Event",
       subtitle = "September day - Text mining twitter data ",
       x = "", y = "") +
  theme_void()
#### Lesson 5 ####
library(leaflet)
library(gganimate)
library(lubridate)
library(maps)
library(ggthemes)

# create new df with the tweet text, usernames, and coordinates
tweet_data <- tibble(date_time = rt$created_at,
                     username = rt$screen_name,
                     tweet_text = rt$text) %>%
  dplyr::bind_cols(as_tibble(do.call(rbind, rt$coords_coords)) %>%
                     mutate(coordinates = paste0(V1, ",", V2)) %>%
                     select(coordinates))

head(tweet_data)
# clean up the coordinate strings and parse the timestamps
rt_tweets <- tweet_data %>%
  mutate(coordinates = gsub("\\)|c\\(", "", coordinates),
         date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y")) %>%
  separate(coordinates, c("long", "lat"), sep = ",") %>%
  mutate_at(c("lat", "long"), as.numeric)


world_basemap <- ggplot() +
  borders("world", colour = "gray85", fill = "gray80") +
  theme_map()

world_basemap

tweet_locations <- rt_tweets %>%
  na.omit()

head(tweet_locations)
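## only a small share of tweets are geotagged, so check how many rows survive
## the na.omit() before reading too much into the maps
nrow(tweet_locations)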
world_basemap +
  geom_point(data = tweet_locations, aes(x = long, y = lat),
             colour = 'purple', alpha = .5) +
  labs(title = "Tweet Locations")
# plot points on top of a leaflet basemap

site_locations <- leaflet(tweet_locations) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~long, lat = ~lat, popup = ~tweet_text,
                   radius = 3, stroke = FALSE)

site_locations

# plot points on top of a leaflet basemap

site_locations_base <- leaflet(tweet_locations) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(lng = ~long, lat = ~lat, popup = ~tweet_text,
                   radius = 3, stroke = FALSE)

site_locations_base

# summarize by day: round lat/long so nearby tweets group into one point
tweet_locations_grp <- tweet_locations %>%
  mutate(day = day(date_time),
         long_round = round(long, 2),
         lat_round = round(lat, 2)) %>%
  group_by(day, long_round, lat_round) %>%
  summarise(total_count = n())

# animate daily tweet activity on the world basemap
grouped_tweet_map <- world_basemap + geom_point(data = tweet_locations_grp,
                                                aes(long_round, lat_round, size = total_count),
                                                color = "purple", alpha = .5) + coord_fixed() +
  labs(title = "Twitter Activity") +
  transition_time(day) +
  ease_aes('linear')
grouped_tweet_map

# render the animation (animated gif by default)
gganimate::animate(grouped_tweet_map)
args(animate)
# save the animation to a new file
# gganimate_save(grouped_tweet_map,
#                filename = "data/week-13/btc_tweets.gif",
#                fps = 1, loop = 0,
#                width = 1280,
#                height = 1024)
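## with gganimate >= 1.0 the saving helper is anim_save(), which writes the
## most recently rendered animation; a sketch assuming that version
# gganimate::animate(grouped_tweet_map)
# gganimate::anim_save("btc_tweets.gif")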

#### Lesson 6 ####
twit_1e5 <- readr::read_csv("C:/Users/Soren Schwartz/Dropbox/Egne dokumenter/Skole/master/Data2018-10-11")
sentiments

tweet_data <- tibble(date_time = twit_1e5$created,
                     username = twit_1e5$screenName,
                     tweet_text = twit_1e5$text,
                     long = twit_1e5$longitude,
                     lat = twit_1e5$latitude)

# cleanup
btc_tweets  <- tweet_data %>%
  mutate(date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y")) %>%
  mutate(tweet_text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
                           "", tweet_text)) %>%
  mutate(tweet_text = stringr::str_replace_all(
    tweet_text, stringr::regex("btc|bitcoin", ignore_case = TRUE), "")) %>%
  mutate(coord = paste0(long, ",", lat))

# gsub("http://*|https://*)", "", tweet_text)
# gsub("http.*","",  rt$text)
# gsub(pattern = "https.*",replacement = "")
# gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
#      "", tweet_text)
# gsub("\\brt\\b|\\bRT\\b", "", tweet_text)
# gsub("http://*", "", tweet_text)
data("stop_words")

# get a list of words
btc_tweet_clean <- btc_tweets %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("rt", "t.co"))
# plot the top 15 words -- notice any issues?
btc_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "Count",
       y = "Unique words",
       title = "Count of unique words found in tweets")

# join sentiment classification to the tweet words
bing_word_counts <- btc_tweet_clean %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts %>%
   group_by(sentiment) %>%
   summarise(sum(n))

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(title = "Sentiment",
       y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

### By day ###
# cleanup
btc_tweets_day <- tweet_data %>%
  mutate(date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y")) %>%
  mutate(tweet_text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
                           "", tweet_text)) %>%
  mutate(tweet_text = stringr::str_replace_all(
    tweet_text, stringr::regex("btc|bitcoin", ignore_case = TRUE), "")) %>%
  mutate(coord = paste0(long, ",", lat),
         day = lubridate::as_date(date_time))

# get a list of words
btc_tweet_clean_day <- btc_tweets_day %>%
  dplyr::select(tweet_text, day) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c("rt", "t.co"))

# plot the top 15 words -- notice any issues?
btc_tweet_clean_day %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "Count",
       y = "Unique words",
       title = "Count of unique words found in a day's worth of tweets")

# join sentiment classification to the tweet words
bing_sentiment_day <- btc_tweet_clean_day %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, day, sort = TRUE) %>%
  mutate(word = reorder(word, n)) %>%
  group_by(day, sentiment) %>%
  top_n(n = 5, wt = n) %>%
  # create a date / sentiment column for sorting
  mutate(sent_date = paste0(day, " - ", sentiment)) %>%
  arrange(day, sentiment, n)

bing_sentiment_day$sent_date <- factor(bing_sentiment_day$sent_date,
                                        levels = unique(bing_sentiment_day$sent_date))

### Counts separated by day and sentiment ###
btc_tweet_clean_day %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, day, sort = TRUE) %>%
  group_by(day, sentiment) %>%
  summarise(sum(n))

# group by day and sentiment, then plot the top 5 words for each day
bing_sentiment_day %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sent_date, scales = "free_y", ncol = 2) +
  labs(title = "Sentiment during the day ",
       y = "Number of Times Word Appeared in Tweets",
       x = NULL) +
  coord_flip()
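
## optional sketch: net Bing sentiment per day (positive minus negative word
## counts) as a compact summary alongside the faceted bars above; uses
## tidyr::spread, which is available since library(tidyr) is already loaded
btc_tweet_clean_day %>%
  inner_join(get_sentiments("bing")) %>%
  count(day, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(day, net)) +
  geom_col() +
  labs(x = "Day", y = "Net sentiment (positive - negative)",
       title = "Net Bing sentiment per day")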