```{r}
# Convert days to dates
start_date <- as_date(params$start_day)
end_date <- as_date(params$end_day)
days <- seq(start_date, end_date, by = 1)

# Set random seed
set.seed(params$seed)

# Set ggplot2 theme
theme_set(match.fun(params$theme)())
theme_update(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.position = "bottom",
    strip.background = element_rect(fill = params$accent)
)

# Tweet type scales
scale_fill_tweet_type <- scale_fill_manual(
    name = "Tweet type",
    limits = c(TRUE, FALSE),
    labels = c("Original", "Retweet"),
    values = c(params$accent, params$accent2)
)
scale_colour_tweet_type <- scale_colour_manual(
    name = "Tweet type",
    limits = c(TRUE, FALSE),
    labels = c("Original", "Retweet"),
    values = c(params$accent, params$accent2)
)
```
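This chunk assumes an earlier setup chunk has loaded the required packages (the code below uses, at minimum, the tidyverse packages, lubridate, knitr, kableExtra, rtweet, igraph, ggraph, ggrepel, tidytext, topicmodels, wordcloud, and viridis) and that the document's YAML header defines the `params` values. As a minimal sketch, with entirely hypothetical values, `params` can be mocked like this when running the chunks interactively:

```r
# Sketch (hypothetical values): mock the knitr params object when
# running these chunks outside of the rendered report
params <- list(
    start_day = "2019-06-24",   # first day of the event
    end_day = "2019-06-28",     # last day of the event
    timezone = "UTC",           # timezone used for date/hour columns
    seed = 1,                   # random seed
    theme = "theme_light",      # name of a ggplot2 theme function
    accent = "#1B9E77",         # colour for original tweets
    accent2 = "#D95F02",        # colour for retweets
    kcore = 2,                  # k-core cut-off for the mentions network
    bigram_filter = 3,          # minimum count for bigram edges
    topics_k = 6                # number of LDA topics
)
```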
```{r}
asis_output("## Query hashtags\n")

plot_data <- map_dfr(hashtags_vec, function(.hashtag) {
    tweets %>%
        filter(str_detect(tolower(text), tolower(.hashtag))) %>%
        group_by(is_retweet) %>%
        summarise(Count = n()) %>%
        mutate(Hashtag = .hashtag)
})

ggplot(plot_data, aes(x = Hashtag, y = Count, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(title = "Number of tweets for each query hashtag")
```
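The `tweets` data frame and the `hashtags_vec` and `hashtags_query` objects are assumed to be created earlier in the report. A minimal sketch of how they might be built with rtweet (the hashtags here are hypothetical):

```r
# Sketch: collect tweets matching the query hashtags with rtweet
library(rtweet)

hashtags_vec <- c("#useR2019", "#rstats")  # hypothetical query hashtags
hashtags_query <- paste(hashtags_vec, collapse = " OR ")

tweets <- search_tweets(hashtags_query, n = 18000, include_rts = TRUE)
```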
```{r}
tweets %>%
    group_by(date, is_retweet) %>%
    summarise(count = n()) %>%
    ggplot(aes(date, count, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(x = "Date", y = "Tweets",
         title = paste(hashtags_query, "tweets per day"))
```
Filtered for dates `r params$start_day` - `r params$end_day` in the `r params$timezone` timezone.
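The `date`, `hour`, and `datetime` columns used below are assumed to be derived from rtweet's `created_at` timestamps earlier in the report. A sketch of that derivation with lubridate might be:

```r
# Sketch: convert created_at (UTC) to the report timezone and derive
# the date and hour columns used in the plots below
tweets <- tweets %>%
    mutate(
        datetime = with_tz(created_at, params$timezone),
        date = as_date(datetime),
        hour = hour(datetime)
    )
```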
```{r}
tweets_days <- tweets %>%
    filter(date >= start_date, date <= end_date) %>%
    group_by(date, hour)

tweets_days %>%
    group_by(date, hour, is_retweet) %>%
    summarise(count = n()) %>%
    ggplot(aes(hour, count, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    facet_grid(strftime(date, "%b %d") ~ .) +
    labs(x = "Hour", y = "Tweets",
         title = paste(hashtags_query, "tweets by time of day"),
         subtitle = paste("tweets between", params$start_day, "and",
                          params$end_day))
```
```{r}
tweets %>%
    group_by(screen_name, is_retweet) %>%
    summarise(count = n()) %>%
    group_by(screen_name) %>%
    mutate(total = sum(count)) %>%
    ungroup() %>%
    arrange(desc(total)) %>%
    # Keep all rows for the top 20 users (each user has up to two rows,
    # so slicing rows directly would cut some users' stacks short)
    filter(screen_name %in% head(unique(screen_name), 20)) %>%
    ggplot(aes(reorder(screen_name, total), count, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    coord_flip() +
    labs(x = "Screen Name", y = "Tweets",
         title = paste(hashtags_query, "tweets by user"),
         subtitle = "top 20 users")
```
```{r}
tweets %>%
    filter(is_retweet == FALSE) %>%
    count(screen_name) %>%
    arrange(desc(n)) %>%
    slice(1:20) %>%
    ggplot(aes(reorder(screen_name, n), n)) +
    geom_col(fill = params$accent) +
    coord_flip() +
    labs(x = "Screen Name", y = "Tweets",
         title = paste(hashtags_query, "tweets by user"),
         subtitle = "top 20 users (original)")
```
```{r}
tweets %>%
    filter(is_retweet == TRUE) %>%
    count(screen_name) %>%
    arrange(desc(n)) %>%
    slice(1:20) %>%
    ggplot(aes(reorder(screen_name, n), n)) +
    geom_col(fill = params$accent2) +
    coord_flip() +
    labs(x = "Screen Name", y = "Tweets",
         title = paste(hashtags_query, "tweets by user"),
         subtitle = "top 20 users (retweets)")
```
```{r}
plot_data <- tweets %>%
    group_by(screen_name) %>%
    summarise(orig = sum(!is_retweet),
              retweet = sum(is_retweet)) %>%
    mutate(total = orig + retweet)

ggplot(plot_data, aes(total, (orig / total) - (retweet / total))) +
    geom_hline(yintercept = 0, colour = "red") +
    geom_point(colour = params$accent) +
    geom_text_repel(data = top_n(plot_data, 10, total),
                    aes(label = screen_name)) +
    labs(x = "Total tweets",
         y = "<<< more retweets : more original tweets >>>",
         title = paste(hashtags_query, "original tweets compared to retweets"),
         subtitle = "top 10 users labelled")
```
```{r}
top_users <- tweets %>%
    count(screen_name) %>%
    arrange(desc(n)) %>%
    slice(1:5) %>%
    pull(screen_name) %>%
    fct_inorder()

tweets %>%
    filter(screen_name %in% top_users) %>%
    mutate(screen_name = factor(screen_name, levels = levels(top_users))) %>%
    ggplot(aes(datetime, 1, colour = !is_retweet)) +
    geom_jitter(width = 0, height = 1) +
    scale_colour_tweet_type +
    facet_wrap(~ screen_name, ncol = 1) +
    labs(x = "Datetime",
         title = paste(hashtags_query, "top users timeline"),
         subtitle = "when the top 5 users tweeted") +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          legend.position = "bottom")
```
```{r}
plot_list <- lapply(seq_along(days), function(idx) {
    tweets_days %>%
        filter(date == days[idx]) %>%
        group_by(screen_name, is_retweet) %>%
        summarise(count = n()) %>%
        group_by(screen_name) %>%
        mutate(total = sum(count)) %>%
        ungroup() %>%
        arrange(desc(total)) %>%
        # Keep all rows for the top 20 users (each user has up to two rows)
        filter(screen_name %in% head(unique(screen_name), 20)) %>%
        ggplot(aes(reorder(screen_name, total), count, fill = !is_retweet)) +
        geom_col() +
        scale_fill_tweet_type +
        coord_flip() +
        labs(x = "Screen Name", y = "Tweets",
             title = paste(hashtags_query, "tweets by user, Day", idx),
             subtitle = "top 20 users")
})

src_list <- lapply(seq_along(plot_list), function(idx) {
    src <- c("#### Day <<idx>> {.unnumbered}",
             "```{r}",
             "plot_list[[<<idx>>]]",
             "```",
             "")
    knit_expand(text = src, delim = c("<<", ">>"))
})
out <- knit_child(text = unlist(src_list))
```
`r if (Sys.Date() >= start_date) out`
```{r}
plot_list <- lapply(seq_along(days), function(idx) {
    tweets_days %>%
        filter(date == days[idx]) %>%
        ungroup() %>%
        filter(is_retweet == FALSE) %>%
        count(screen_name) %>%
        arrange(desc(n)) %>%
        slice(1:20) %>%
        ggplot(aes(reorder(screen_name, n), n)) +
        geom_col(fill = params$accent) +
        coord_flip() +
        labs(x = "Screen Name", y = "Tweets",
             title = paste(hashtags_query, "tweets by user, Day", idx),
             subtitle = "top 20 users (original)")
})

src_list <- lapply(seq_along(plot_list), function(idx) {
    src <- c("#### Day <<idx>> {.unnumbered}",
             "```{r}",
             "plot_list[[<<idx>>]]",
             "```",
             "")
    knit_expand(text = src, delim = c("<<", ">>"))
})
out <- knit_child(text = unlist(src_list))
```
`r if (Sys.Date() >= start_date) out`
```{r}
plot_list <- lapply(seq_along(days), function(idx) {
    tweets_days %>%
        filter(date == days[idx]) %>%
        ungroup() %>%
        filter(is_retweet == TRUE) %>%
        count(screen_name) %>%
        arrange(desc(n)) %>%
        slice(1:20) %>%
        ggplot(aes(reorder(screen_name, n), n)) +
        geom_col(fill = params$accent2) +
        coord_flip() +
        labs(x = "Screen Name", y = "Tweets",
             title = paste(hashtags_query, "tweets by user, Day", idx),
             subtitle = "top 20 users (retweets)")
})

src_list <- lapply(seq_along(plot_list), function(idx) {
    src <- c("#### Day <<idx>> {.unnumbered}",
             "```{r}",
             "plot_list[[<<idx>>]]",
             "```",
             "")
    knit_expand(text = src, delim = c("<<", ">>"))
})
out <- knit_child(text = unlist(src_list))
```
`r if (Sys.Date() >= start_date) out`
```{r}
tweets %>%
    distinct(screen_name, source) %>%
    count(source) %>%
    filter(n >= 5) %>%
    ggplot(aes(reorder(source, n), n)) +
    geom_col(fill = params$accent) +
    coord_flip() +
    labs(x = "Source", y = "Users",
         title = paste(hashtags_query, "users by source"),
         subtitle = "sources with at least 5 distinct users")
```
```{r}
tweets %>%
    count(source, is_retweet) %>%
    group_by(source) %>%
    mutate(total_n = sum(n)) %>%
    ungroup() %>%
    filter(total_n >= 5) %>%
    ggplot(aes(reorder(source, total_n), n, fill = !is_retweet)) +
    geom_col() +
    coord_flip() +
    scale_fill_tweet_type +
    labs(x = "Source", y = "Tweets",
         title = paste(hashtags_query, "tweets by source"),
         subtitle = "sources with at least 5 tweets")
```
The "replies network", composed from users who reply directly to one another, coloured by PageRank.
```{r}
tweets_replies <- tweets %>%
    filter(!is.na(reply_to_screen_name)) %>%
    select(screen_name, reply_to_screen_name) %>%
    graph_from_data_frame(directed = TRUE)

V(tweets_replies)$label <- V(tweets_replies)$name
V(tweets_replies)$id <- V(tweets_replies)$name
V(tweets_replies)$pr <- page_rank(tweets_replies)$vector

ggraph(tweets_replies, layout = "fr") +
    geom_edge_link(arrow = arrow(length = unit(4, 'mm')),
                   end_cap = circle(1, 'mm'),
                   colour = "darkgrey") +
    geom_node_point(aes(colour = pr)) +
    geom_node_text(aes(label = label), colour = params$accent,
                   repel = FALSE) +
    scale_colour_viridis_c() +
    theme_graph() +
    theme(legend.position = "none")
```
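The PageRank scores are stored on the vertices, so they can also be inspected numerically alongside the plot. A small sketch (not part of the original report):

```r
# Sketch: the ten highest-PageRank users in the replies network
head(sort(page_rank(tweets_replies)$vector, decreasing = TRUE), 10)
```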
The "mentions network", where users mention other users in their tweets.
Filtered for a k-core of r params$kcore
. Node colour and size adjusted
according to PageRank score.
```{r}
tweets_mentions <- tweets %>%
    filter(!is.na(mentions_screen_name)) %>%
    select(screen_name, mentions_screen_name) %>%
    unnest(mentions_screen_name) %>%
    mutate(mentions_screen_name = strsplit(mentions_screen_name, " ")) %>%
    unnest(mentions_screen_name) %>%
    graph_from_data_frame()

V(tweets_mentions)$label <- V(tweets_mentions)$name
V(tweets_mentions)$id <- V(tweets_mentions)$name
V(tweets_mentions)$pr <- page_rank(tweets_mentions)$vector
V(tweets_mentions)$kcore <- coreness(tweets_mentions)

# Remove vertices below the k-core threshold
lo_kcore <- V(tweets_mentions)$kcore < params$kcore
tweets_mentions <- delete_vertices(tweets_mentions,
                                   V(tweets_mentions)[lo_kcore])

ggraph(tweets_mentions, layout = "fr") +
    geom_edge_link(arrow = arrow(length = unit(2, 'mm')),
                   end_cap = circle(1, 'mm'),
                   width = 0.1, colour = "darkgrey") +
    geom_node_point(aes(colour = pr)) +
    geom_node_text(aes(label = label, size = pr), colour = params$accent,
                   repel = FALSE) +
    scale_colour_viridis_c() +
    theme_graph() +
    theme(legend.position = "none")
```
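If the plot comes out too dense or too sparse, the coreness distribution can guide the choice of `params$kcore`. A quick sketch (not in the original report), run just after the `coreness()` call above and before `delete_vertices()` removes the low-core vertices:

```r
# Sketch: how many vertices sit at each coreness level in the full
# mentions network; a larger cut-off keeps fewer, more connected users
table(V(tweets_mentions)$kcore)
```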
```{r}
tweets %>%
    count(is_retweet) %>%
    ggplot(aes(is_retweet, n, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(x = "Is retweet", y = "Tweets",
         title = paste(hashtags_query, "tweets by retweet status"))
```
```{r}
tweets %>%
    ggplot(aes(retweet_count)) +
    geom_histogram(bins = max(tweets$retweet_count), fill = params$accent) +
    labs(x = "Retweet count", y = "Tweets",
         title = paste(hashtags_query, "distribution of retweets per tweet"))
```
```{r}
top <- tweets %>%
    filter(is.na(retweet_status_id)) %>%
    filter(retweet_count > 0) %>%
    select(status_id, screen_name, text, retweet_count) %>%
    arrange(desc(retweet_count)) %>%
    distinct() %>%
    slice(1:10)

top %>%
    select(-status_id) %>%
    kable(format = "html")
```
```{r}
tryCatch({
    tweet_shot(top$status_id[1])
}, error = function(msg) {
    if (grepl("Twitter status not found", conditionMessage(msg))) {
        "Tweet not found; it may have been deleted"
    } else {
        stop(msg)
    }
})
```
```{r}
tweets %>%
    mutate(has_favorite = favorite_count > 0) %>%
    count(has_favorite, is_retweet) %>%
    ggplot(aes(has_favorite, n, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(x = "Has like", y = "Tweets",
         title = paste(hashtags_query, "tweets by liked status"))
```
```{r}
tweets %>%
    ggplot(aes(favorite_count)) +
    geom_histogram(bins = max(tweets$favorite_count), fill = params$accent) +
    labs(x = "Like count", y = "Tweets",
         title = paste(hashtags_query, "distribution of likes per tweet"))
```
```{r}
top <- tweets %>%
    filter(favorite_count > 0) %>%
    select(status_id, screen_name, text, favorite_count) %>%
    arrange(desc(favorite_count)) %>%
    distinct() %>%
    slice(1:10)

top %>%
    select(-status_id) %>%
    kable(format = "html")
```
```{r}
tryCatch({
    tweet_shot(top$status_id[1])
}, error = function(msg) {
    if (grepl("Twitter status not found", conditionMessage(msg))) {
        "Tweet not found; it may have been deleted"
    } else {
        stop(msg)
    }
})
```
```{r}
tweets %>%
    count(is_quote, is_retweet) %>%
    # fill uses !is_retweet so the scale labels match the other plots
    ggplot(aes(is_quote, n, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(x = "Is quote", y = "Tweets",
         title = paste(hashtags_query, "tweets by quote status"))
```
```{r}
tweets %>%
    filter(!is.na(quoted_status_id)) %>%
    count(quoted_status_id) %>%
    ggplot(aes(n)) +
    geom_histogram(bins = 10, fill = params$accent) +
    labs(x = "Quote count", y = "Tweets",
         title = paste(hashtags_query, "distribution of quotes per tweet")) +
    scale_x_continuous(limits = c(0, 10), breaks = seq(0, 10, 2))
```
```{r}
top <- tweets %>%
    filter(!is.na(quoted_status_id)) %>%
    count(quoted_status_id) %>%
    filter(n > 0) %>%
    arrange(desc(n)) %>%
    inner_join(
        select(tweets, status_id, screen_name, quoted_status_id,
               is_retweet, text),
        by = "quoted_status_id"
    ) %>%
    filter(is_retweet == FALSE) %>%
    select(status_id, screen_name, text, quote_count = n) %>%
    distinct() %>%
    slice(1:10)

top %>%
    select(-status_id) %>%
    kable(format = "html")
```
```{r}
tryCatch({
    tweet_shot(top$status_id[1])
}, error = function(msg) {
    if (grepl("Twitter status not found", conditionMessage(msg))) {
        "Tweet not found; it may have been deleted"
    } else {
        stop(msg)
    }
})
```
```{r}
tweets %>%
    mutate(has_media = !is.na(media_url)) %>%
    count(has_media, is_retweet) %>%
    ggplot(aes(has_media, n, fill = !is_retweet)) +
    geom_col() +
    scale_fill_tweet_type +
    labs(x = "Has media", y = "Tweets",
         title = paste(hashtags_query, "tweets by media status"))
```
```{r}
tweets_media <- tweets %>%
    filter(!is.na(media_url)) %>%
    arrange(desc(favorite_count)) %>%
    filter(favorite_count > 0)

tweets_media %>%
    slice(1:10) %>%
    select(screen_name, text, favorite_count) %>%
    kable(format = "html")
```
The top 100 words used 3 or more times.
data("stop_words") tweets %>% filter(is_retweet == FALSE) %>% unnest_tokens(word, text) %>% select(word) %>% filter(!word %in% c(gsub("#", "", tolower(hashtags_vec)), "https", "t.co", "amp"), !word %in% tolower(tweets$screen_name), !grepl("^\\d+$", word)) %>% anti_join(stop_words, by = "word") %>% count(word) %>% with(wordcloud(word, n, max.words = 100, min.freq = 3, colors = brewer.pal(6, "Spectral")))
Other hashtags used 5 or more times.
```{r}
tweets %>%
    filter(is_retweet == FALSE) %>%
    select(text) %>%
    mutate(text = str_split(text, "[:space:]")) %>%
    unnest(text) %>%
    filter(str_detect(text, "^#")) %>%
    mutate(
        text = str_remove_all(text, "[:punct:]"),
        text = str_to_lower(text),
        text = stringi::stri_trans_general(text, "latin-ascii")
    ) %>%
    count(text) %>%
    filter(
        !(text %in% str_remove(tolower(hashtags_vec), "#")),
        n >= 5
    ) %>%
    slice_max(n, n = 30) %>%
    ggplot(aes(reorder(text, n), n)) +
    geom_col(fill = params$accent) +
    coord_flip() +
    labs(x = "Hashtag", y = "Tweets",
         title = paste("Other hashtags used with", hashtags_query),
         subtitle = paste("30 most common alternative hashtags used at",
                          "least 5 times in original tweets"))
```
Words that were tweeted next to each other at least `r params$bigram_filter` times.
```{r}
tweets %>%
    filter(is_retweet == FALSE) %>%
    select(text) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word1 %in% c(gsub("#", "", tolower(hashtags_vec)),
                         "https", "t.co", "amp"),
           !word1 %in% tolower(tweets$screen_name),
           !grepl("^\\d+$", word1)) %>%
    filter(!word2 %in% stop_words$word,
           !word2 %in% c(gsub("#", "", tolower(hashtags_vec)),
                         "https", "t.co", "amp"),
           !word2 %in% tolower(tweets$screen_name),
           !grepl("^\\d+$", word2)) %>%
    count(word1, word2, sort = TRUE) %>%
    filter(n >= params$bigram_filter) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_colour = n),
                   arrow = arrow(type = "closed", length = unit(2, 'mm')),
                   end_cap = circle(1, 'mm')) +
    geom_node_point(size = 2, colour = params$accent) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 0.5,
                   repel = TRUE, segment.colour = "pink") +
    scale_edge_color_gradientn(colours = viridis(100)) +
    theme_graph()
```
Top 10 words associated with `r params$topics_k` topics identified by LDA.
```{r}
topics <- tweets %>%
    filter(is_retweet == FALSE) %>%
    select(document = status_id, text) %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word,
           !word %in% c(gsub("#", "", tolower(hashtags_vec)),
                        "https", "t.co", "amp"),
           !word %in% tolower(tweets$screen_name),
           !grepl("^\\d+$", word)) %>%
    count(document, word, sort = TRUE) %>%
    ungroup() %>%
    cast_dtm(document, word, n) %>%
    LDA(k = params$topics_k, control = list(seed = 1))
```
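The number of topics is fixed by `params$topics_k`, and there is no single correct value. One common check (a sketch, not part of the original report; it assumes the document-term matrix from the chunk above has been saved as `dtm` before the `LDA()` call) is to compare model perplexity across a few candidate values of k:

```r
# Sketch: fit LDA for several values of k and compare perplexity
# (lower is better; look for where the improvement levels off)
ks <- c(2, 4, 6, 8, 10)
perp <- sapply(ks, function(k) {
    perplexity(LDA(dtm, k = k, control = list(seed = 1)))
})
plot(ks, perp, type = "b", xlab = "Number of topics (k)",
     ylab = "Perplexity")
```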
```{r}
topics %>%
    tidy(matrix = "beta") %>%
    group_by(topic) %>%
    top_n(10, beta) %>%
    ungroup() %>%
    arrange(topic, -beta) %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(x = term, y = beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE, fill = params$accent) +
    scale_x_reordered() +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    labs(y = "beta (occurrence in topics)",
         title = "Topic modelling",
         subtitle = paste("Top terms for", params$topics_k,
                          "LDA topics associated with", hashtags_query)) +
    theme(axis.title.y = element_blank())
```
Most representative tweets for each topic.
```{r}
topics_list <- topics %>%
    tidy(matrix = "gamma") %>%
    group_by(document) %>%
    top_n(1, gamma) %>%
    ungroup() %>%
    group_by(topic) %>%
    top_n(10, gamma) %>%
    arrange(-gamma) %>%
    left_join(tweets, by = c(document = "status_id")) %>%
    select(topic, screen_name, text, gamma) %>%
    split(.$topic)

src_list <- lapply(seq_along(topics_list), function(idx) {
    src <- c("#### Topic <<idx>> {.unnumbered}",
             "```{r}",
             "kable(topics_list[[<<idx>>]][, -1], format = 'html') %>%",
             "    kable_styling(bootstrap_options = c('striped', 'condensed'))",
             "```",
             "")
    knit_expand(text = src, delim = c("<<", ">>"))
})
out <- knit_child(text = unlist(src_list))
```
`r out`
Links to GitHub, GitLab, BitBucket, Bioconductor, or CRAN mentioned in tweets.
```{r}
url_counts <- tweets %>%
    unnest(urls_expanded_url) %>%
    group_by(urls_expanded_url) %>%
    summarise(
        Tweets = sum(!is_retweet),
        Retweets = sum(is_retweet),
        .groups = "drop"
    ) %>%
    mutate(Total = Tweets + Retweets)

urls <- tweets %>%
    pull(urls_expanded_url) %>%
    discard(~ all(is.na(.x))) %>%
    flatten_chr() %>%
    unique()

regexes <- list(
    c(Type = "GitHub",
      re = "http[s]?://github.com/[\\w-]+/([A-Za-z0-9_\\.-]+).*"),
    c(Type = "GitLab",
      re = "http[s]?://gitlab.com/[\\w-]+/([A-Za-z0-9_\\.-]+).*"),
    c(Type = "BitBucket",
      re = "http[s]?://bitbucket.org/[\\w-]+/([A-Za-z0-9_\\.-]+).*"),
    c(Type = "Bioc",
      re = "https://bioconductor.org/packages.*/([A-Za-z0-9_\\.-]+).*"),
    c(Type = "CRAN",
      re = "https://cran.*/packages/([A-Za-z0-9_\\.-]+).*")
)

software <- map_df(regexes, function(re) {
    urls %>%
        str_match(re["re"]) %>%
        as_tibble(.name_repair = "unique") %>%
        rename(URL = "...1", Name = "...2") %>%
        mutate(Type = re["Type"])
}) %>%
    drop_na() %>%
    mutate(
        LinkURL = case_when(
            Type == "GitHub"    ~ str_to_lower(str_replace(URL, "http:", "https:")),
            Type == "GitLab"    ~ str_to_lower(str_replace(URL, "http:", "https:")),
            Type == "BitBucket" ~ str_to_lower(str_replace(URL, "http:", "https:")),
            Type == "Bioc"      ~ paste0("https://bioconductor.org/packages/", Name),
            Type == "CRAN"      ~ paste0("https://CRAN.R-project.org/package=", Name),
            TRUE                ~ URL
        )
    ) %>%
    mutate(
        Link = str_remove(LinkURL, "https://"),
        Link = str_remove(Link, "github.com/"),
        Link = str_remove(Link, "gitlab.com/"),
        Link = str_remove(Link, "bitbucket.org/"),
        Link = paste0("[", str_trunc(Link, 70), "](", LinkURL, ")")
    ) %>%
    left_join(url_counts, by = c(URL = "urls_expanded_url")) %>%
    arrange(desc(Total), desc(Tweets), desc(Retweets), Name) %>%
    select(Name, Tweets, Retweets, Type, Link)
```
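To sanity-check what the regexes capture, they can be run against a single URL; the repository URL below is made up for illustration:

```r
# Sketch: str_match returns the full match in the first column and the
# captured repository/package name ("somepackage" here) in the second
str_match("https://github.com/someuser/somepackage/issues/1",
          regexes[[1]]["re"])
```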
`r if (nrow(software) >= 1) kable(software)`