knitr::opts_chunk$set(echo = FALSE) # do not echo code knitr::opts_chunk$set(warning = FALSE) knitr::opts_chunk$set(message = FALSE) knitr::opts_chunk$set(fig_caption = TRUE) knitr::opts_chunk$set(fig_height = 6) # default, make it bigger to stretch vertical axis knitr::opts_chunk$set(fig_width = 8) # full width knitr::opts_chunk$set(tidy = TRUE) # tidy up code in case echo = TRUE
# Set start time ---- startTime <- proc.time() # Libraries ---- library(myUtils) # additional libs required by this code reqLibs <- c("ggplot2", "plotly", "readr", "reshape2", "stringr", "knitr", "kableExtra", "hashTagR" ) print(paste0("Loading the following libraries: ", reqLibs)) # Use Luke's function to require/install/load myUtils::loadLibraries(reqLibs)
dPath <- "~/Data/twitter/boty2018/raw/" hashTags <- params$hashTags searchString <- params$searchString timeZone <- "Pacific/Auckland"
Just a bit of dataknut fun woven around the day job.
You'll be wanting Section \@ref(cumulative) for the trending hashtags...
CC-BY unless otherwise noted.
r lubridate::year(today())
) Twitter hashtag analysis: r searchString
. Downloaded from: r params$pubUrl
.See:
I've not been in NZ at this time of year before so r params$hashTags
is a whole new cultural experience.
The idea is to extract and visualise tweets and re-tweets of r params$hashTags
(see r params$explHashTag
and the Forest & Bird voting site).
Why? Err.... Just. Because.
Code borrows extensively from https://github.com/mkearney/rtweet
The analysis used rtweet to ask the free Standard
Twitter search API to extract 'all' tweets containing the r searchString
hashtags in the freely available recent (last 7 days) twitterVerse. When a search is repeated the same tweet can appear more than once if one of it's attributes (e.g. number of likes & re-tweets) has changed since the last search.
The search was repeated irregularly throughout the time that voting was open.
It is possible that not all relevant tweets have been extracted because the free (Standard
) search does not run over a complete archive of all tweets.
Future work should instead use the Twitter streaming API to set up a proper siphon of relevant tweets throughout the relevant period.
# load from pre-collected ---- message("Load from pre-collected data and check for duplicates") # this code uses the function to load them raw_twDT <- hashTagR::loadTweets(dPath,searchString) # process them twNoDupsDT <- hashTagR::processTweets(raw_twDT) # for testing # this data.table will have duplicate entries because: # 1. we may have run the search mutiple times # 2. the search results are dynamic - they can change if a tweet is liked, re-tweeted etc as this data is updated # we don't want duplicates, we just want the most recent tweet record by time of creation and screen_names rn <- nrow(twNoDupsDT) twDT <- unique(raw_twDT, fromLast = TRUE, by = c("created_at", "screen_name") ) # drop duplicates un <- nrow(twDT) twDT <- twDT[, created_at_local := lubridate::with_tz(created_at, tzone = timeZone)] # beware - running this outside NZ will lead to strange graphs twDT <- twDT[, ba_obsDate := lubridate::date(created_at_local)] twDT <- twDT[, ba_obsTime := hms::as.hms(created_at)] # this will auto-convert to local time twDT <- twDT[, ba_tweetType := "Tweet"] twDT <- twDT[is_retweet == TRUE, ba_tweetType := "Re-tweet"] twDT <- twDT[is_quote == TRUE, ba_tweetType := "Quote"] twDT <- twDT[, obsDateHour := lubridate::floor_date(created_at_local, "hour")]
r myUtils::tidyNum(rn - un)
duplicates of the <created_at
><screen_name
> tuple were removed from the original r myUtils::tidyNum(rn)
extracted tweets. Note that the duplicates exist in the raw data and may be useful for analysis of the dynamics of re-tweeting etc over time.
The cleaned data used in the rest of this report has:
r tidyNum(nrow(twDT))
tweets (including r tidyNum(nrow(twDT[is_quote == "TRUE"]))
quotes and r tidyNum(nrow(twDT[is_retweet == "TRUE"]))
re-tweets) r tidyNum(uniqueN(twDT$screen_name))
tweeters r min(twDT$created_at_local)
and r max(twDT$created_at_local)
(r timeZone
).Voting closed on Sunday 14th October with the results announced on Monday 15th.
dataCap <- paste0("Source: Data collected from Twitter's REST API via rtweet", "\nAll (re)tweets and quotes containing ", searchString, " from ", min(twDT$created_at_local), " to ", max(twDT$created_at_local), " (", timeZone, ")" )
plotDT <- twDT[, .( nTweets = .N, nTweeters = uniqueN(screen_name) ), keyby = .(obsDateHour, ba_tweetType) ] myPlot <- ggplot2::ggplot(plotDT, aes(x = obsDateHour)) + geom_line(aes(y = nTweets, colour = "N tweets")) + geom_line(aes(y = nTweeters, colour = "N tweeters")) + facet_grid(ba_tweetType ~ .) + scale_x_datetime(breaks ="1 day", date_labels = "%a %d %b %Y") + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 90)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + theme(legend.position = "bottom") + theme(legend.title = element_blank()) + labs(caption = dataCap, x = "Date", y = "Count" ) myPlot
Figure \@ref(fig:allDaysChart) shows the number of tweets and tweeters in the data extract by day. The quotes, tweets and re-tweets have been separated.
If you are in New Zealand and you are wondering why there are no tweets today
(r lubridate::today()
) the answer is that twitter data (and these plots) are working in UTC and (y)our today()
may not have started yet in UTC. Don't worry, all the tweets are here - it's just our old friend the timezone... :-)
Next we'll try by screen name.
plotDT <- twDT[, .( nTweets = .N ), by = .(screen_name, ba_obsDate)] tilePlot <- ggplot(plotDT, aes(x = ba_obsDate)) + geom_tile(aes(y = screen_name, fill = nTweets)) + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 0)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + #scale_x_reverse() + # fix reverse plotting of long scale_x_date(date_breaks = "1 day", date_labels = "%a %d %b %Y") + scale_fill_gradient(low="green", high = "red") + theme(legend.position = "bottom") + theme(legend.title = element_blank()) + labs(caption = dataCap, x = "Date", y = "Screen name" ) tilePlot
Figure \@ref(fig:screenNamesTile) is a really bad visualisation of all tweeters tweeting over time. Each row of pixels is a tweeter (the names are probably illegible) and a green dot indicates a few tweets in the given day while a red dot indicates a lot of tweets.
So let's re-do that for the top 50 tweeters so we can see their tweetStreaks (tm)...
Top tweeters:
allTweetersDT <- twDT[, .(nTweets = .N), by = screen_name][order(-nTweets)] kableExtra::kable(caption = "Top 15 tweeters (all days)", head(allTweetersDT, 15)) %>% kable_styling()
And their tweetStreaks are shown in Figure \@ref(fig:screenNameTop50)...
myDataCap <- paste0(dataCap, "\nScreen names in reverse alphabetical order" ) matchDT <- head(allTweetersDT,50) matchDT <- matchDT[, maxT := nTweets] setkey(matchDT, screen_name) setkey(twDT, screen_name) tempDT <- merge(twDT, matchDT) plotDT <- tempDT[matchDT, .( nTweets = .N ), keyby = .(maxT,screen_name,ba_obsDate)] plotDT <- plotDT[order(plotDT$maxT,plotDT$screen_name)] myPlot <- ggplot(plotDT, aes(x = ba_obsDate)) + geom_tile(aes(y = screen_name, fill = nTweets)) + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 0)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + scale_x_date(date_breaks = "1 day", date_labels = "%a %d %b %Y") + scale_fill_gradient(low="green", high = "red") + theme(legend.position = "bottom") + theme(legend.title = element_blank()) + labs(caption = myDataCap, x = "Date", y = "Screen name" ) myPlot
Any twitterBots...?
This is very quick and dirty but... to calculate this we have to do a bit of string processing first.
htDT <- twDT[!is.na(hashtags), .(hashtags, ba_obsDate, ba_tweetType)] # remove any tweets without hashtags. How can there be no hashtags when we searched on hashtags? # now string split them # https://stackoverflow.com/questions/33200179/dynamically-assign-number-of-splits-in-data-table-tstrsplit splits <- max(lengths(strsplit(htDT$hashtags, "|", , fixed=T))) htDT <- htDT[, paste0("ht", 1:splits) := tstrsplit(hashtags, "|", fixed=T)] # reshape the list htLongDT <- reshape2::melt(htDT, id=c("hashtags","ba_obsDate", "ba_tweetType")) # remove NA htLongDT <- htLongDT[!is.na(value)] message("We have ", nrow(htLongDT), " hashtags.") message("That's about ~ ", round(nrow(htLongDT)/nrow(twDT),2), " hashtags per tweet...") # process in steps to be clear htLongDT <- htLongDT[, htOrig := value]
This is how I have tidied the hashtags (make other suggestions here):
# First we make everything lower case htLongDT <- htLongDT[, htLower := tolower(htOrig)] # lower case # Next we remove the macrons just in case # h/t: https://twitter.com/Thoughtfulnz/status/1046685305569345536 htLongDT <- htLongDT[, htClean := stringr::str_replace_all(htLower,"[āēīōū]",myUtils::deMacron)] # Now remove 'team' from a string so that e.g. teamkaki == kaki htLongDT <- htLongDT[, htClean := gsub("team", "",htClean)] # Now remove variants on 'vote' htLongDT <- htLongDT[, htClean := gsub("vote4", "",htClean)] htLongDT <- htLongDT[, htClean := gsub("vote", "",htClean)]
Table \@ref(tab:tweetTable) shows the total count of each #hashtag by (re)tweet type. With thanks to David Hood for code to help make sure that kakī == kaki (etc).
kableExtra::kable(caption = "Top 20 hashtags", head(htLongDT[, .(count = .N), by = .(hashTag = htClean, type = ba_tweetType)][order(-count)],20)) %>% kable_styling()
threshold <- 10
Figure \@ref(fig:plotHashtags) plots the daily occurence of these hashtags after removing variants of r searchString
and selecting only those which have more than r threshold
mentions on any day. For clarity tweets and re-tweets are aggregated. See Section \@ref(problems) for the problems with this #hashTag counting approach.
# remove the hashtags we searched for testDT <- htLongDT for(n in 1:length(hashTags)){ testDT <- testDT[!(grepl(hashTags[n],htClean,ignore.case = TRUE))] } htTabDT <- testDT[, .(count = .N), keyby = .(ba_obsDate, htClean, ba_tweetType)] ggplot2::ggplot(htTabDT[count > threshold], aes(x = ba_obsDate, y = htClean, fill = count)) + geom_tile() + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 0)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + scale_x_date(date_breaks = "1 day", date_labels = "%a %d %b %Y") + scale_fill_gradient(low="green", high = "red") + # facet_grid(ba_tweetType ~ .) + theme(legend.position = "bottom") + theme(legend.title = element_blank()) + labs(caption = paste0(dataCap,"\nReverse alphabetical order"), x = "Date", y = "Unique hashtags" )
So, who's gonna win? No idea.
There are a lot of problems with this approach (see Section \@ref(problems)) but if the hashtags have any predictive value at all then Figure \@ref(fig:cumulativeHtCounts) should be an indicator of the direction of travel (watch for lines of apparently dis-similar hashtags where the macron fix has failed) and Figure \@ref(fig:htCountsTodate) shows the totals to date.
The official results show the Kererū as the winner and the Kakī third after the Kākāpō.
threshold <- 30
Figure \@ref(fig:cumulativeHtCounts) uses plotly to avoid having to render a large legend - just hover over the lines to see who is who...
plotDT <- htTabDT[, .(count = sum(count)), keyby = .(ba_obsDate,htClean)] plotDT <- plotDT[, cumCount := cumsum(count), by = htClean] plotDT <- plotDT[, maxCount := max(cumCount), by = htClean] # so we can filter plotDT <- plotDT[, hashtag := htClean] plotDT <- plotDT[, date := ba_obsDate] linePlot <- ggplot2::ggplot(plotDT[maxCount > threshold], aes(x = date, y = cumCount, colour = hashtag)) + geom_line() + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 0)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + scale_x_date(date_breaks = "1 day", date_labels = "%a %d %b %Y") + theme(legend.position = "none") + theme(legend.title = element_blank()) + labs(caption = dataCap, x = "Date", y = "Cumulative count per day" ) plotly::ggplotly(linePlot)
plotDT <- htTabDT[, .(count = sum(count)), keyby = .(htClean)] plotDT <- plotDT[, htCleanOr := reorder(htClean, -count)] ggplot2::ggplot(plotDT[count > threshold], aes(x = htCleanOr, y = count, fill = htCleanOr)) + geom_col() + theme(strip.text.y = element_text(size = 9, colour = "black", angle = 0)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) + theme(legend.position = "none") + theme(legend.title = element_blank()) + labs(caption = dataCap, x = "hashtag", y = "Count to date" )
Loads of them. But primarily:
#<birdName>
hashtag (like this rather popular one) will not show up. #YMMV;As ever, #YMMV.
t <- proc.time() - startTime elapsed <- t[[3]]
Analysis completed in r elapsed
seconds ( r round(elapsed/60,2)
minutes) using knitr in RStudio with r R.version.string
running on r R.version$platform
.
A special mention must go to rtweet [@rtweet-package] for the twitter API interaction functions.
Other R packages used:
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.