R/dateOperation.R

#' Top Hashtags in Interval
#'
#' Retrieves the most used hashtags in a given time interval.
#'
#' @param mongo a mongoDB connection object providing an \code{aggregate}
#'   method (e.g. the hometimeline collection); this argument has no default
#' @param interval a time interval of two unix timestamps (start first, end
#'   second), which can be generated by \link{getInterval}
#' @param amount limits the number of hashtags returned
#'
#' @return whatever \code{mongo$aggregate()} returns for the pipeline: the
#'   lower-cased hashtags with their counts, sorted by descending count and
#'   limited to \code{amount} entries
#'
#' @examples
#' #topHashtagsInInterval(hometimeline, interval = getInterval(60*24), 10)
#' #or alternatively specify interval beforehand
#' #timeframe <- c("2016-01-04 18:37:43 CET","2016-01-05 18:37:43 CET")
#' #hashtags <- topHashtagsInInterval(hometimeline, interval =
#' #getInterval(inputinterval = timeframe), 10)
#'
#' @export
topHashtagsInInterval <- function(mongo, interval, amount = 10){
  # interval -> first element is the starting point of the interval, i.e. the
  # earlier timepoint; second element is the end, i.e. the later timepoint.
  if (length(interval) < 2) {
    stop("interval needs a start and an end timestamp.")
  }

  # Render numbers in plain fixed notation. Implicit coercion to character
  # switches to scientific notation for round values
  # (1452000000000 -> "1.452e+12", 100000 -> "1e+05"), which would corrupt
  # the $match bounds and the $limit stage.
  asPlainText <- function(value) {
    if (is.numeric(value)) {
      format(value, scientific = FALSE, trim = TRUE, digits = 15)
    } else {
      as.character(value)
    }
  }

  # The bounds stay quoted in the query, so they are sent as strings.
  # $limit is the final stage of the pipeline.
  basequery <- '[{"$unwind": "$entities.hashtags"},
                 {"$match": {"timestamp_ms": {"$gte": "starttime" , "$lte": "endtime" }}},
                 {"$project": {"entities.hashtags": {"$toLower": "$entities.hashtags.text"}}},
                 {"$group": {"_id":"$entities.hashtags", "count":{"$sum":1}}},
                 {"$sort": {"count":-1}},
                 {"$limit": amount}]'

  replacements <- c("starttime" = asPlainText(interval[1]),
                    "endtime" = asPlainText(interval[2]),
                    "amount" = asPlainText(amount))

  # Substitute the placeholders one after another, as literal text.
  query <- basequery
  for (placeholder in names(replacements)) {
    query <- gsub(placeholder, replacements[[placeholder]], query, fixed = TRUE)
  }

  # An empty result is handed back unchanged, just like a non-empty one.
  mongo$aggregate(query)
}

#' Get Interval
#'
#' Calculates an interval from now-decrease until now. The current time can be
#' shifted by \code{adjust} minutes, e.g. to compensate for an offset to GMT.
#' Alternatively an interval may be provided and the corresponding unix timestamps will
#' be returned. Set Sys.setlocale("LC_TIME", "English") in order
#' to be able to correctly process the twitter timestamp. Avoid timestamps with the
#' times 00:00:00 and 12:00:00 since they are a digit short.
#'
#' @param decrease size of interval (in minutes)
#' @param adjust time to subtract from system time to get GMT (in minutes)
#' @param inputinterval alternatively input a time interval, either as
#'   "YYYY-MM-DD HH:MM:SS" strings or in the Twitter date format
#'
#' @return interval as unix timestamps in milliseconds, rounded down; stops
#'   with an error if \code{inputinterval} matches neither supported format
#'
#' @examples
#' #Get an interval covering the last 24 hours
#' getInterval(60*24)
#'
#' #Get an interval covering the last hour
#' getInterval()
#'
#' #Specify interval and get corresponding unix timestamps
#' #isodate <- c("2016-01-04 18:37:43","2016-01-05 18:37:43")
#' #getInterval(inputinterval = isodate)
#'
#' #twitter <- c("Tue Jan 05 12:51:00 +0000 2016","Tue Jan 05 16:51:00 +0000 2016")
#' #twitter <- getInterval(inputinterval = twitter)
#'
#' @export
getInterval <- function(decrease = 60, adjust = 0, inputinterval) {
  msPerMinute <- 60000

  if (missing(inputinterval)) {
    # Both decrease and adjust are given in minutes, so both are scaled to
    # milliseconds before they are applied to the current time.
    now <- as.numeric(Sys.time()) * 1000 - adjust * msPerMinute
    return(floor(c(now - decrease * msPerMinute, now)))
  }

  isoPattern <- "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}"
  twitterPattern <- "^\\w{3} \\w{3} \\d{2} \\d{2}:\\d{2}:\\d{2} \\+\\d{4} \\d{4}"

  if (all(grepl(pattern = isoPattern, x = inputinterval))) {
    # No time zone is passed here, so the session's time zone applies.
    interval <-
      as.numeric(as.POSIXct(inputinterval, format = "%Y-%m-%d %T")) * 1000
  } else if (all(grepl(pattern = twitterPattern, x = inputinterval))) {
    # NOTE(review): the subtracted term appears to evaluate to the session's
    # offset to GMT in whole seconds, so the result is shifted by that offset
    # although %z already accounts for the offset in the input. Kept as is;
    # confirm whether this shift is intended.
    interval <-
      as.numeric(as.POSIXct(inputinterval, tz = "GMT", format = "%a %b %d %H:%M:%S %z %Y")) *
      1000 - ceiling(as.numeric(difftime(
        as.POSIXlt(Sys.time(), Sys.timezone()),as.POSIXlt(Sys.time(), tz = "GMT"),tz = "", units = "secs"
      ))) * 1000
  } else {
    # Fail with the explanatory text instead of running into an undefined
    # result further down.
    stop("Wrong interval format. Use YYYY-MM-DD HH:MM:SS or the Twitter date format.")
  }

  floor(interval)
}

# Helper that combines the top hashtags of two time intervals into one frame.
# Will NOT be exported, used by streamingLoop.
# An empty argument (length 0) is skipped and the other one is returned
# untouched; otherwise both are row-bound and duplicated rows are dropped.
mergeHashtagFrames <- function(frame1, frame2) {
  if (length(frame1) == 0) {
    return(frame2)
  }
  if (length(frame2) == 0) {
    return(frame1)
  }
  stacked <- rbind.data.frame(frame1, frame2)
  unique(stacked)
}

# Turns dates written as YYYY-mm-dd HH:MM:SS into unix timestamps in ms.
# No time zone is passed to as.POSIXct, so the session's time zone applies.
userToTimestamp <- function(data) {
  parsed <- as.POSIXct(data, format = "%Y-%m-%d %T")
  epochSeconds <- as.numeric(parsed)
  epochSeconds * 1000
}

# Converts unixtimestamps to userdate of the form yyyy-mm-dd hh:mm:ss
# Accepts numeric timestamps that all have 13 integer digits (milliseconds)
# or all have 10 integer digits (seconds) and returns them as POSIXct.
# Stops with an error for non-numeric input or any other digit count.
timestampToUserdate <- function(data) {

  if (!is.numeric(data)) {
    stop("Data is not numeric.")
  }

  # Count the integer digits from a fixed-notation rendering. Calling nchar()
  # on the number itself goes through as.character(), which writes round
  # values in scientific notation (1452000000000 -> "1.452e+12") and would
  # reject valid timestamps.
  digitCount <- nchar(sprintf("%.0f", abs(trunc(data))), type = "chars")

  if (all(digitCount == 13)) {
    # milliseconds -> seconds
    as.POSIXct(data / 1000, origin = "1970-01-01 00:00:00")
  } else if (all(digitCount == 10)) {
    as.POSIXct(data, origin = "1970-01-01 00:00:00")
  } else {
    stop("Timestamp is incorrect. Timestamps need equal lengths in order to be converted.")
  }
}

# Turns Twitter dates such as "Fri Nov 13 12:56:04 +0000 2015" into unix
# timestamps in ms. Day and month names are parsed with %a and %b, and the
# UTC offset is read from the string via %z.
twitterToTimestamp <- function(data) {
  parsed <- as.POSIXct(data, format = "%a %b %d %H:%M:%S %z %Y")
  epochSeconds <- as.numeric(parsed)
  epochSeconds * 1000
}
ProjectTw/TwitteR2Mongo documentation built on May 8, 2019, 3:44 a.m.