#' Top Hashtags in interval
#'
#' Retrieves the most used hashtags in a given time interval by running an
#' aggregation pipeline on the supplied collection. Hashtag texts are
#' lower-cased before they are counted.
#'
#' @param mongo a mongoDB connection to use (e.g. hometimeline); it has to
#'   provide an \code{aggregate()} method taking a JSON pipeline string
#' @param interval a time interval of the form \code{c(start, end)} in unix
#'   timestamp format, which can be generated by \link{getInterval}
#' @param amount limits the number of hashtags returned
#'
#' @return whatever \code{mongo$aggregate()} returns for the pipeline: one
#'   entry per hashtag (\code{_id}) with its \code{count}, most frequent first
#'
#' @examples
#' #topHashtagsInInterval(hometimeline, interval = getInterval(60*24), 10)
#' #or alternatively specify interval beforehand
#' #timeframe <- c("2016-01-04 18:37:43 CET","2016-01-05 18:37:43 CET")
#' #hashtags <- topHashtagsInInterval(hometimeline, interval =
#' #getInterval(inputinterval = timeframe), 10)
#'
#' @export
topHashtagsInInterval <- function(mongo, interval, amount = 10) {
  # interval[1] is the start of the interval (the earlier time point),
  # interval[2] is its end (the later time point).
  if (length(interval) < 2 || anyNA(interval[1:2])) {
    stop("interval must contain a start and an end timestamp.", call. = FALSE)
  }
  if (length(amount) != 1 || is.na(amount)) {
    stop("amount must be a single number.", call. = FALSE)
  }

  # Implicit numeric-to-character coercion switches to scientific notation for
  # round values (1452000000000 becomes "1.452e+12", 100000 becomes "1e+05"),
  # which would corrupt the query. Render numbers in fixed notation instead.
  asPlainText <- function(x) {
    if (is.numeric(x)) sprintf("%.0f", x) else as.character(x)
  }
  startTime <- asPlainText(interval[1])
  endTime <- asPlainText(interval[2])
  limit <- asPlainText(amount)

  # The time filter runs before $unwind: it only looks at timestamp_ms, which
  # $unwind does not touch, so the result is the same while fewer documents
  # have to be unwound.
  # NOTE(review): the bounds stay quoted as in the original query; presumably
  # timestamp_ms is stored as a string - verify against the collection.
  query <- sprintf(
    '[{"$match": {"timestamp_ms": {"$gte": "%s", "$lte": "%s"}}},
{"$unwind": "$entities.hashtags"},
{"$project": {"entities.hashtags": {"$toLower": "$entities.hashtags.text"}}},
{"$group": {"_id": "$entities.hashtags", "count": {"$sum": 1}}},
{"$sort": {"count": -1}},
{"$limit": %s}]',
    startTime, endTime, limit
  )

  mongo$aggregate(query)
}
#' Get Interval
#'
#' Calculates an interval from now-decrease until now. Both ends can be
#' shifted back by adjust minutes (default 0, i.e. no shift).
#' Alternatively an interval may be provided and the corresponding unix
#' timestamps will be returned. Set Sys.setlocale("LC_TIME", "English") in
#' order to be able to correctly process the twitter timestamp. Avoid
#' timestamps with the times 00:00:00 and 12:00:00 since they are a digit
#' short.
#'
#' Dates of the form YYYY-MM-DD HH:MM:SS are read in the local time zone of
#' the session. Twitter dates carry their own UTC offset, which is honoured,
#' so their result does not depend on the local time zone.
#'
#' @param decrease size of interval (in minutes)
#' @param adjust time to subtract from system time (in minutes)
#' @param inputinterval alternatively input a time interval, either as
#'   "YYYY-MM-DD HH:MM:SS" or in the Twitter date format
#'
#' @return interval as unix timestamps in milliseconds, rounded down to whole
#'   milliseconds
#'
#' @examples
#' #Get an interval covering the last 24 hours
#' getInterval(60*24)
#'
#' #Get an interval covering the last hour
#' getInterval()
#'
#' #Specify interval and get corresponding unix timestamps
#' #isodate <- c("2016-01-04 18:37:43","2016-01-05 18:37:43")
#' #getInterval(inputinterval = isodate)
#'
#' #twitter <- c("Tue Jan 05 12:51:00 +0000 2016","Tue Jan 05 16:51:00 +0000 2016")
#' #twitter <- getInterval(inputinterval = twitter)
#'
#' @export
getInterval <- function(decrease = 60, adjust = 0, inputinterval) {
  msPerMinute <- 60000
  isoPattern <- "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}"
  twitterPattern <-
    "^\\w{3} \\w{3} \\d{2} \\d{2}:\\d{2}:\\d{2} \\+\\d{4} \\d{4}"

  if (missing(inputinterval)) {
    # Sys.time() counts seconds; decrease and adjust are both given in
    # minutes, so both are converted to milliseconds before use.
    now <- as.numeric(Sys.time()) * 1000 - adjust * msPerMinute
    interval <- c(now - decrease * msPerMinute, now)
  } else if (all(grepl(pattern = isoPattern, x = inputinterval))) {
    interval <-
      as.numeric(as.POSIXct(inputinterval, format = "%Y-%m-%d %T")) * 1000
  } else if (all(grepl(pattern = twitterPattern, x = inputinterval))) {
    # %z already applies the offset contained in the timestamp, so no
    # further correction for the local time zone is needed.
    interval <- as.numeric(
      as.POSIXct(inputinterval, tz = "GMT", format = "%a %b %d %H:%M:%S %z %Y")
    ) * 1000
  } else {
    # Fail with the explanatory text instead of running into an undefined
    # result further down.
    stop(
      "Wrong interval format. Use YYYY-MM-DD HH:MM:SS or the Twitter date format.",
      call. = FALSE
    )
  }

  floor(interval)
}
# Internal helper: combines the top hashtags of two time intervals into a
# single frame, dropping rows that occur more than once.
# NOT exported; used by streamingLoop.
mergeHashtagFrames <- function(frame1, frame2) {
  # An empty side contributes nothing, so hand back the other one untouched.
  if (length(frame1) == 0) {
    return(frame2)
  }
  if (length(frame2) == 0) {
    return(frame1)
  }

  stacked <- rbind.data.frame(frame1, frame2)
  unique(stacked)
}
# Turns dates written as YYYY-mm-dd HH:MM:SS into unix timestamps in
# milliseconds. No time zone is passed, so the session's time zone applies.
userToTimestamp <- function(data) {
  parsed <- as.POSIXct(data, format = "%Y-%m-%d %T")
  seconds <- as.numeric(parsed)
  seconds * 1000
}
# Converts unix timestamps to a user date (POSIXct, printed as
# yyyy-mm-dd hh:mm:ss).
# Accepts either 13-digit timestamps (milliseconds) or 10-digit timestamps
# (seconds); all elements must have the same number of digits.
# Stops if the input is not numeric or the digit counts do not fit.
timestampToUserdate <- function(data) {
  if (!is.numeric(data)) {
    stop("Data is not numeric.")
  }

  # Count digits on a fixed-notation rendering. nchar() on the raw number
  # goes through as.character(), which turns round values such as
  # 1452000000000 into "1.452e+12" and would reject valid timestamps.
  digits <- nchar(sprintf("%.0f", data), type = "chars")

  if (all(digits == 13)) {
    # Milliseconds: scale down to seconds first.
    as.POSIXct(data / 1000, origin = "1970-01-01 00:00:00")
  } else if (all(digits == 10)) {
    as.POSIXct(data, origin = "1970-01-01 00:00:00")
  } else {
    stop("Timestamp is incorrect. Timestamps need equal lengths in order to be converted.")
  }
}
# Turns Twitter dates such as "Fri Nov 13 12:56:04 +0000 2015" into unix
# timestamps in milliseconds.
twitterToTimestamp <- function(data) {
  parsed <- as.POSIXct(data, format = "%a %b %d %H:%M:%S %z %Y")
  seconds <- as.numeric(parsed)
  seconds * 1000
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.