library(ggplot2)
library(dplyr)
library(tidytext)
library(igraph)
library(ggraph)
# devtools::install_github("dgrtwo/widyr")
library(widyr)
library(tidyr)
# json libraries
library(rjson)
library(jsonlite)
# date time
library(lubridate)
library(zoo)
#### rtweet ####
library(rtweet)
# Twitter API credentials are read from environment variables (for example set
# in ~/.Renviron) instead of being hard-coded, so that secrets never live in
# the script or in version control.
# NOTE(review): any keys that were previously committed in this file should be
# treated as compromised and regenerated in the Twitter developer console.
twitter_env_vars <- c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
)
twitter_creds <- Sys.getenv(twitter_env_vars, unset = NA_character_, names = TRUE)
# Fail early, naming exactly which variables are unset or empty.
missing_vars <- twitter_env_vars[is.na(twitter_creds) | !nzchar(twitter_creds)]
if (length(missing_vars) > 0) {
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}
token <- create_token(
  app = "mt_crypto",
  consumer_key = twitter_creds[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_creds[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_creds[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_creds[["TWITTER_ACCESS_SECRET"]]
)
## check to see if the token is loaded
identical(token, get_token())
# Hashtag used for the first search.
searchTerm <- "#rstats"
# Request up to 18,000 matching statuses, leaving retweets out.
rt <- search_tweets(q = searchTerm, n = 18000, include_rts = FALSE)
# Keep an untouched copy; `rt` is overwritten by later searches.
rtBackup <- rt

# Most recent 1,000 statuses from three individual accounts.
kp <- get_timeline(user = "@katyperry", n = 1000)
kk <- get_timeline(user = "@KimKardashian", n = 1000)
ag <- get_timeline(user = "@ArianaGrande", n = 1000)

# Weekly tweet frequency for each account.
# NOTE(review): ts_plot() output is combined with ggplot2 layers further down,
# so it presumably returns a ggplot object; base-graphics par(mfrow) may then
# not stack these three plots -- confirm.
par(mfrow = c(3, 1))
ts_plot(kp, by = "1 weeks")
ts_plot(kk, by = "1 weeks")
ts_plot(ag, by = "1 weeks")
par(mfrow = c(1, 1))

## plot time series of tweets
ts_plot(rt, by = "3 hours") +
  ggplot2::labs(
    title = "Frequency of #rstats Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold"))
## search for 250,000 tweets containing the word data
## (retryonratelimit = TRUE is passed so the request can exceed one rate-limit window)
rt <- search_tweets(q = "data", n = 250000, retryonratelimit = TRUE)

## search for 10,000 tweets sent from the US; this replaces the result above,
## then lat/lng variables are created using all available tweet and profile
## geo-location data
rt <- search_tweets(
  q = "lang:en",
  geocode = lookup_coords("usa"),
  n = 10000
) %>%
  lat_lng()

## plot state boundaries on a margin-free device
par(mar = c(0, 0, 0, 0))
maps::map("state", lwd = .25)

## plot lat and lng points onto state map
points(
  rt$lng, rt$lat,
  pch = 20,
  cex = .75,
  col = rgb(0, .3, .7, .75)
)
#### Ex 2 ####
# Three small example queries. Note that the inspection calls and plots below
# operate on `rt` (the result of the earlier search), not on these objects.
rstats <- search_tweets(q = "#rstats", n = 500)
head(rt, n = 3)
rstats_tweets <- search_tweets("#rstats", n = 500, include_rts = FALSE)
head(rt$screen_name)
unique(rt$screen_name)
users <- search_users("#rstats", n = 500)

# How many distinct user-supplied locations are there?
length(unique(rt$location))

# Bar chart of every location. labs() names aesthetics, not screen positions:
# `x` is `location` even though coord_flip() draws it vertically, so the labels
# are assigned to the aesthetic they describe.
rt %>%
  ggplot(aes(location)) +
  geom_bar() +
  coord_flip() +
  labs(
    x = "location",
    y = "Count",
    title = "Twitter users - unique locations"
  )

# Same idea restricted to the 20 most frequent non-missing locations,
# ordered by count.
rt %>%
  count(location, sort = TRUE) %>%
  mutate(location = reorder(location, n)) %>%
  na.omit() %>%
  top_n(20, wt = n) %>%
  ggplot(aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Location",
    y = "Count",
    title = "Where Twitter users are from - unique locations "
  )
# Strip URLs: drop everything from the first "http" to the end of the text.
# A separate "https.*" pass is unnecessary because "http.*" already matches it.
rt$stripped <- gsub("http.*", "", rt$text)

# remove punctuation, convert to lowercase, one row per word
rt_tweets_clean <- rt %>%
  dplyr::select(stripped) %>%
  unnest_tokens(word, stripped)

# plot the top 15 words -- notice any issues?
# `x` is the word and `y` its count; coord_flip() only changes where they are
# drawn, so the labels follow the aesthetics.
rt_tweets_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets"
  )

# load list of stop words - from the tidytext package
data("stop_words")
head(stop_words)

# Row counts before and after removing stop words.
nrow(rt_tweets_clean)
rt_tweets_words <- rt_tweets_clean %>%
  anti_join(stop_words, by = "word")
nrow(rt_tweets_words)

# plot the top 15 words again, this time without stop words
rt_tweets_words %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    y = "Count",
    x = "Unique words",
    title = "Count of unique words found in tweets",
    subtitle = "Stop words removed from the list"
  )
# Tokenise the URL-stripped text into bigrams (two consecutive words per row).
rt_tweets_paired_words <- unnest_tokens(
  dplyr::select(rt, stripped),
  paired_words,
  stripped,
  token = "ngrams",
  n = 2
)

# Most frequent raw bigrams.
count(rt_tweets_paired_words, paired_words, sort = TRUE)

# Split each bigram into its two words.
rt_tweets_separated_words <- separate(
  rt_tweets_paired_words,
  paired_words,
  c("word1", "word2"),
  sep = " "
)

# Keep only bigrams in which neither word is a stop word.
rt_tweets_filtered <- filter(
  rt_tweets_separated_words,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# new bigram counts:
rt_words_counts <- count(rt_tweets_filtered, word1, word2, sort = TRUE)
head(rt_words_counts)

# Draw word pairs seen at least 24 times as a network; edge opacity and width
# both scale with the pair count.
rt_words_counts %>%
  filter(n >= 24) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network: Tweets using the hashtag - bitcoin and btc",
    subtitle = "Text mining twitter data ",
    x = "",
    y = ""
  )
#### Lesson 4 ####
# json support
library(rjson)
library(jsonlite)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
library(tidyr)
# text mining library
library(tidytext)
library(tm)
# coupled words analysis
library(widyr)
# plotting packages
library(igraph)
library(ggraph)
options(stringsAsFactors = FALSE)

# create new df with the tweet text & usernames
tweet_data <- tibble(
  date_time = rt$created_at,
  username = rt$screen_name,
  tweet_text = rt$text
)
head(tweet_data)
#format = "%Y-%m-%d %H:%M:%s"
xts::periodicity(tweet_data$date_time)

# Time window to analyse.
start_date <- as.POSIXct("2018-10-09 05:00:00")
end_date <- as.POSIXct("2018-10-10 11:20:00")

# cleanup: parse the timestamp and keep only tweets inside the window
# NOTE(review): the format string only matters if `created_at` is character;
# confirm its type and whether a UTC time zone should be passed explicitly.
rt_tweets <- tweet_data %>%
  mutate(date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y")) %>%
  filter(date_time >= start_date & date_time <= end_date)

# One row per word of the tweet text.
rt_tweet_messages <- rt_tweets %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text)

data("stop_words")
nrow(rt_tweet_messages)

# Drop stop words and the retweet marker "rt".
rt_tweet_clean <- rt_tweet_messages %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "rt")
# how many words after removing the stop words?
nrow(rt_tweet_clean)

# Top 15 words. `x` is the word and `y` its count; coord_flip() only changes
# where they are drawn, so the labels follow the aesthetics.
rt_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets"
  )
# cleanup: same word table as before, but URLs are removed from the tweet text
# before tokenising so link fragments do not show up as words
rt_tweet_clean <- tweet_data %>%
  mutate(
    date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y"),
    tweet_text = gsub(
      "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
      "",
      tweet_text
    )
  ) %>%
  filter(date_time >= start_date & date_time <= end_date) %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "rt") # remove all rows whose word is "rt" (retweet marker)

# Top 15 words. `x` is the word and `y` its count; coord_flip() only changes
# where they are drawn, so the labels follow the aesthetics.
rt_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets, "
  )
# Build bigrams from the tweet text after removing stop words, the "rt"/"RT"
# marker and the "http:/" scheme prefix.
# NOTE(review): the pattern "http://*" matches "http:/" plus any further
# slashes only, so the rest of each URL stays in the text -- confirm intent.
rt_tweets_paired <- tweet_data %>%
  dplyr::select(tweet_text) %>%
  mutate(
    tweet_text = removeWords(tweet_text, stop_words$word),
    tweet_text = gsub("\\brt\\b|\\bRT\\b", "", tweet_text),
    tweet_text = gsub("http://*", "", tweet_text)
  ) %>%
  unnest_tokens(paired_words, tweet_text, token = "ngrams", n = 2)

# Most frequent bigrams.
count(rt_tweets_paired, paired_words, sort = TRUE)

# Split each bigram into two columns and count the pairs.
rt_tweets_separated <- separate(
  rt_tweets_paired,
  paired_words,
  c("word1", "word2"),
  sep = " "
)
rt_word_counts <- count(rt_tweets_separated, word1, word2, sort = TRUE)
head(rt_word_counts)

# Network of pairs that occur at least 50 times.
rt_word_counts %>%
  filter(n >= 50) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network: Tweets during the day Colorado rt Event",
    subtitle = "September day - Text mining twitter data ",
    x = "",
    y = ""
  ) +
  theme_void()
#### Lesson 5 ####
library(leaflet)
library(gganimate)
library(lubridate)
library(maps)
library(ggthemes)

# create new df with the tweet texts & usernames, plus a "V1,V2" coordinate
# string built by row-binding the entries of rt$coords_coords
# NOTE(review): presumably each entry is a (longitude, latitude) pair, as the
# string is later split into "long" and "lat" -- confirm.
tweet_data <- dplyr::bind_cols(
  tibble(
    date_time = rt$created_at,
    username = rt$screen_name,
    tweet_text = rt$text
  ),
  as_tibble(do.call(rbind, rt$coords_coords)) %>%
    transmute(coordinates = paste0(V1, ",", V2))
)
head(tweet_data)

# cleanup: parse the timestamp, strip any "c(" / ")" wrapper from the
# coordinate string, then split it into numeric long / lat columns
rt_tweets <- tweet_data %>%
  mutate(
    date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y"),
    coordinates = gsub("\\)|c\\(", "", coordinates)
  ) %>%
  separate(coordinates, c("long", "lat"), sep = ",") %>%
  mutate_at(c("lat", "long"), as.numeric)
# mutate(hej, coordinates = gsub("\\)|c\\(", "", coordinates)) %>%
# separate(coordinates, c("long", "lat"), sep = ",") %>%
# unique()
# Static world basemap reused by the plots below.
world_basemap <- ggplot() +
  borders("world", colour = "gray85", fill = "gray80") +
  theme_map()
world_basemap

# Keep only rows without missing values (in particular, rows with coordinates).
tweet_locations <- na.omit(rt_tweets)
head(tweet_locations)

# Tweet positions drawn on top of the basemap.
world_basemap +
  geom_point(
    data = tweet_locations,
    mapping = aes(x = long, y = lat),
    colour = 'purple',
    alpha = .5
  ) +
  scale_size_continuous(
    range = c(1, 8),
    breaks = c(250, 500, 750, 1000)
  ) +
  labs(title = "Tweet Locations")

# plot points on top of a leaflet basemap (default tiles); the popup shows
# the tweet text
site_locations <- leaflet(tweet_locations) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    popup = ~tweet_text,
    radius = 3,
    stroke = FALSE
  )
site_locations

# same markers on the "CartoDB.Positron" provider tiles
site_locations_base <- leaflet(tweet_locations) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    popup = ~tweet_text,
    radius = 3,
    stroke = FALSE
  )
site_locations_base
# summarize by day?
# perhaps round the lat long and then do it?
# since it's all in sept
# Count tweets per day-of-month and per location rounded to 2 decimals.
# ungroup() is applied because summarise() only drops the last grouping level;
# without it the result would stay grouped by day and long_round.
tweet_locations_grp <- tweet_locations %>%
  mutate(
    day = day(date_time),
    long_round = round(long, 2),
    lat_round = round(lat, 2)
  ) %>%
  group_by(day, long_round, lat_round) %>%
  summarise(total_count = n()) %>%
  ungroup()

# this also works -- plotting across the world here...
# Point size encodes the tweet count; the animation steps through `day`.
grouped_tweet_map <- world_basemap +
  geom_point(
    data = tweet_locations_grp,
    aes(long_round, lat_round, size = total_count),
    color = "purple",
    alpha = .5
  ) +
  coord_fixed() +
  labs(title = "Twitter Activity") +
  transition_time(day) +
  ease_aes('linear')
grouped_tweet_map

# create animated gif file
gganimate::animate(grouped_tweet_map)
args(animate)
# save the animation to a new file
# gganimate_save(grouped_tweet_map,
# filename = "data/week-13/btc_tweets.gif",
# fps = 1, loop = 0,
# width = 1280,
# height = 1024)
#### Lesson 6 ####
# Load previously saved tweets from disk.
# NOTE(review): this is an absolute, machine-specific path; it will need to be
# adjusted on any other machine.
twit_1e5 <- readr::read_csv("C:/Users/Soren Schwartz/Dropbox/Egne dokumenter/Skole/master/Data2018-10-11")
sentiments

tweet_data <- tibble(
  date_time = twit_1e5$created,
  username = twit_1e5$screenName,
  tweet_text = twit_1e5$text,
  long = twit_1e5$longitude,
  lat = twit_1e5$latitude
)

# cleanup: parse the timestamp, strip URLs, remove the search terms themselves
# ("btc" / "bitcoin", case-insensitive) and build a "long,lat" string.
# regex() is namespace-qualified because stringr is never attached in this
# script; an unqualified call fails with "could not find function".
btc_tweets <- tweet_data %>%
  mutate(
    date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y"),
    tweet_text = gsub(
      "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
      "",
      tweet_text
    ),
    tweet_text = stringr::str_replace_all(
      tweet_text,
      stringr::regex("btc|bitcoin", ignore_case = TRUE),
      ""
    ),
    coord = paste0(long, ",", lat)
  )
# gsub("http://*|https://*)", "", tweet_text)
# gsub("http.*","", rt$text)
# gsub(pattern = "https.*",replacement = "")
# gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
# "", tweet_text)
# gsub("\\brt\\b|\\bRT\\b", "", tweet_text)
# gsub("http://*", "", tweet_text)
data("stop_words")

# get a list of words: one row per word, without stop words, the retweet
# marker "rt" and the link-shortener token "t.co"
btc_tweet_clean <- btc_tweets %>%
  dplyr::select(tweet_text) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% c("rt", "t.co"))

# plot the top 15 words -- notice any issues?
# `x` is the word and `y` its count; coord_flip() only changes where they are
# drawn, so the labels follow the aesthetics.
btc_tweet_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets"
  )

# join sentiment classification to the tweet words and count each
# word / sentiment combination (count() already returns ungrouped data)
bing_word_counts <- btc_tweet_clean %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Total number of matched words per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  summarise(sum(n))

# Ten most frequent words per sentiment, one facet each.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10, wt = n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    title = "Sentiment",
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
### By day ###
# cleanup: same steps as for btc_tweets, plus a calendar `day` column.
# regex() is namespace-qualified because stringr is never attached in this
# script; an unqualified call fails with "could not find function".
btc_tweets_day <- tweet_data %>%
  mutate(
    date_time = as.POSIXct(date_time, format = "%a %b %d %H:%M:%S +0000 %Y"),
    tweet_text = gsub(
      "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
      "",
      tweet_text
    ),
    tweet_text = stringr::str_replace_all(
      tweet_text,
      stringr::regex("btc|bitcoin", ignore_case = TRUE),
      ""
    ),
    coord = paste0(long, ",", lat),
    day = lubridate::as_date(date_time)
  )

# get a list of words, keeping the day each word was tweeted on
btc_tweet_clean_day <- btc_tweets_day %>%
  dplyr::select(tweet_text, day) %>%
  unnest_tokens(word, tweet_text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% c("rt", "t.co"))

# plot the top 15 words -- notice any issues?
# `x` is the word and `y` its count; coord_flip() only changes where they are
# drawn, so the labels follow the aesthetics.
btc_tweet_clean_day %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in a day's worth of tweets"
  )
# join sentiment classification to the tweet words, count every
# word / sentiment / day combination and keep the five most frequent words
# for each day and sentiment
bing_sentiment_day <- btc_tweet_clean_day %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, day, sort = TRUE) %>%
  mutate(word = reorder(word, n)) %>%
  group_by(day, sentiment) %>%
  top_n(n = 5, wt = n) %>%
  # create a date / sentiment column for sorting
  mutate(sent_date = paste0(day, " - ", sentiment)) %>%
  arrange(day, sentiment, n)

# Turn the label into a factor whose levels follow the sorted row order, so
# the facets below appear in that order.
sent_date_levels <- unique(bing_sentiment_day$sent_date)
bing_sentiment_day$sent_date <- factor(
  bing_sentiment_day$sent_date,
  levels = sent_date_levels
)

### Counts separated on days and sentiment ###
btc_tweet_clean_day %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, day, sort = TRUE) %>%
  group_by(day, sentiment) %>%
  summarise(sum(n))

# plot the top 5 words for each day / sentiment combination
ggplot(bing_sentiment_day, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sent_date, scales = "free_y", ncol = 2) +
  labs(
    title = "Sentiment during the day ",
    y = "Number of Times Word Appeared in Tweets",
    x = NULL
  ) +
  coord_flip()
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.