R/twitter.R

Defines functions create_twitter_credentials_template fetchAllTweets identifyTweetLanguage getTopics twitterLookup addMeta getSentiment export_tweet_data

#' @export
create_twitter_credentials_template <- function(path = "~/Desktop/twittercred.json"){
  if(!file.exists(path)){
    cred_file <- jsonlite::toJSON(list(consumer_key = "consumer_key",
                                       consumer_secret = "consumer_secret",
                                       access_token = "access_token",
                                       access_secret = "access_secret")
                                  , auto_unbox = TRUE)
    cat(cred_file, file = path)
  }
}
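
# Usage sketch (not run): writes a placeholder credential file that must then be
# edited by hand before any of the API calls below will work; the path is only a
# default and can point anywhere.
#   create_twitter_credentials_template("~/Desktop/twittercred.json")
#   # then replace the placeholder values with your Twitter app's keys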

#' @import data.table
#' @importFrom magrittr %>% %<>%
#' @export
fetchAllTweets <- function(loadCache = TRUE, cred_path = "~/Desktop/twittercred.json",
                           username = "NUMBER26", query_type = "timeline"){
  backup_path <- sprintf("inst/tweetBackup_%s_%s.rda", username, query_type)
  if(loadCache){
    if(file.exists(backup_path)){
      return(readRDS(backup_path))
    } else {
      warning("No cached data available.")
    }
  }
  n26db <- data.table()
  cred <- jsonlite::fromJSON(cred_path)
  twitteR::setup_twitter_oauth(cred$consumer_key, cred$consumer_secret,
                               cred$access_token, cred$access_secret)
  userTimelineSafe <- dplyr::failwith(NULL, twitteR::userTimeline)
  searchSafe <- dplyr::failwith(NULL, twitteR::searchTwitter)
  twListToDFSafe <- dplyr::failwith(NULL, twitteR::twListToDF)

  for(i in 1:50){
    if(nrow(n26db) > 0){
      maxID = n26db[created == min(created), id]
    } else { maxID = NULL }
    if(query_type == "timeline"){
      n26tweets <- userTimelineSafe(username, n = 200, maxID = maxID) %>%
                    twListToDFSafe() %>% data.table
    } else if(query_type == "search"){
      n26tweets <- searchSafe(paste0("@", username), n = 200, maxID = maxID) %>%
                      twListToDFSafe() %>% data.table
    }
    if(is.null(n26tweets) || nrow(n26tweets) <= 1) { break } else {
      n26db <- rbind(n26db, n26tweets)
      n26db %<>% unique()
      saveRDS(n26db, file = backup_path)
    }
  }

  return(n26db)
}
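
# Usage sketch (not run): assumes a filled-in credential file and that the
# working directory is the package source tree, so "inst/" is writable for the
# backup files.
#   timeline <- fetchAllTweets(loadCache = FALSE, username = "NUMBER26",
#                              query_type = "timeline")
#   mentions <- fetchAllTweets(loadCache = FALSE, username = "NUMBER26",
#                              query_type = "search")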

#' @export
identifyTweetLanguage <- function(text_vect){
  text_vect %<>% iconv("UTF-8", "latin1")

  data(TC_char_profiles, package = "textcat")
  lang_1 <- text_vect %>% textcat::textcat() %>% tolower
  lang_1[is.na(lang_1)] <- "NA_1"
  lang_2 <- cldr::detectLanguage(text_vect)$detectedLanguage %>% tolower
  lang_2[is.na(lang_2)] <- "NA_2"

  lang <- lang_1
  lang[lang_1 != lang_2] <- NA
  lang[!lang %in% c("english", "german")] <- NA #, "spanish", "italian", "greek", "slovakian"
  return(lang)
}
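
# Usage sketch (not run): returns "english" or "german" where textcat and cldr
# agree, NA otherwise; the example strings are illustrative only.
#   identifyTweetLanguage(c("Hello, how do I open an account?",
#                           "Hallo, wie eroeffne ich ein Konto?"))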

#' @export
getTopics <- function(text_vect, lang = "german"){
  text_vect %<>% iconv("UTF-8", "latin1")
  text_vect %<>% gsub("(RT|via)((?:\\b\\W*@\\w+)+)", '', .)
  text_vect %<>% gsub("http[^[:blank:]]+", '', .)
  text_vect %<>% gsub("@\\w+", '', .)
  text_vect %<>% gsub("[ \t]{2,}", '', .)
  text_vect %<>% gsub("^\\s+|\\s+$", '', .)
  text_vect %<>% gsub('\\d+', '', .)
  text_vect %<>% tm::removeWords(tm::stopwords(lang))
  text_vect %<>% tm::removeWords(c("hey", "gibt", "number26"))
  text_vect %<>% tm::removePunctuation()
  text_vect %<>% tm::stripWhitespace()

  tweetCorpus <- text_vect %>% tm::VectorSource() %>% tm::VCorpus()

  # Text processing...
  corpus_clean <- tm::tm_map(tweetCorpus, tm::content_transformer(tolower))
  # corpus_clean <- tm_map(corpus_clean, removeNumbers)
  # stemDocument
  # filter out very frequent words...
  corpus_clean <- tm::tm_map(corpus_clean, tm::removeWords, tm::stopwords(lang))
  corpus_clean <- tm::tm_map(corpus_clean, tm::removeWords, c("hey", "gibt", "number26"))
  corpus_clean <- tm::tm_map(corpus_clean, tm::removePunctuation)
  corpus_clean <- tm::tm_map(corpus_clean, tm::stripWhitespace)

  dtm <- tm::DocumentTermMatrix(corpus_clean) #, control = list(weighting = tm::weightTfIdf)
  return(dtm)
}
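
# Usage sketch (not run): `tweets` is assumed to be the data.table returned by
# addMeta(); builds a document-term matrix from the German tweets and inspects
# its frequent terms (the lowfreq threshold is arbitrary).
#   dtm <- getTopics(tweets[lang == "german", text], lang = "german")
#   tm::findFreqTerms(dtm, lowfreq = 20)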

#' @export
twitterLookup <- function(twitterID, resource = "/statuses/lookup", cred_path = "~/Desktop/twittercred.json"){

  cred <- jsonlite::fromJSON(cred_path)
  twitteR::setup_twitter_oauth(cred$consumer_key, cred$consumer_secret,
                               cred$access_token, cred$access_secret)

  twitterData <- twitterID[!is.na(twitterID)]
  twitterData <- split(twitterData, ceiling(seq_along(twitterData) / 100))
  ratelimit <- twitteR::getCurRateLimitInfo()
  ratelimit <- as.numeric(ratelimit[ratelimit$resource == resource, ]$remaining)
  for(i in seq_len(min(length(twitterData), ratelimit))){
    if(resource == "/statuses/lookup"){
      twitterData[[i]] %<>% twitteR::lookup_statuses()
    } else if(resource == "/users/lookup"){
      twitterData[[i]] %<>% twitteR::lookupUsers()
    }
   sprintf("Pulling next 100 objects, %s round(s) completed.", i) %>% print
  }

  twitterData %<>% unlist(recursive = FALSE)
  return(twitterData)
}
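
# Usage sketch (not run): `timeline` is assumed to be the data.table returned by
# fetchAllTweets(); looks up the tweets it replied to and their authors in
# batches of 100, without exceeding the remaining rate limit.
#   replied_to <- twitterLookup(timeline$replyToSID, resource = "/statuses/lookup")
#   authors    <- twitterLookup(timeline$replyToUID, resource = "/users/lookup")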


#' @export
addMeta <- function(tweets){
  tweets %<>% data.table() %>% unique()
  tweets$lang <- identifyTweetLanguage(text_vect = tweets$text)
  tweets$tw_length <- nchar(gsub(" ?@[^[:space:]]+ ?", "", tweets$text))
  return(tweets)
}


getSentiment <- function(tweets){
  tweets <- copy(tweets)
  a <- httr::POST("http://www.sentiment140.com/api/bulkClassifyJson?app",
                body = list(data = tweets[, .(text, id)]), encode = "json")
  a_text <- httr::content(a, "text")

  a_results <- a_text %>% jsonlite::fromJSON()
  a_results <- a_results$data[, c("id", "polarity")] %>% data.table()
  a_results[, polarity := factor(polarity, levels = c(0, 2, 4),
                                 labels = c("negative", "neutral", "positive"),
                                 ordered = TRUE)]
  tweets_mod <- merge(tweets, a_results, by = "id")
  return(tweets_mod)
}
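
# Usage sketch (not run): expects a data.table with `id` and `text` columns
# (here the English subset from addMeta(), since sentiment140 scores English
# text) and returns it with an ordered `polarity` factor appended.
#   scored <- getSentiment(tweets[lang == "english"])
#   scored[, .N, by = polarity]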

#' @export
export_tweet_data <- function(loadCache = TRUE){
  timelineTweets <- fetchAllTweets(loadCache)
  timelineTweets %<>% addMeta()

  timelineConvTweets <- twitterLookup(timelineTweets$replyToSID)
  timelineConvDf <- timelineConvTweets %>% twitteR::twListToDF() %>% data.table()
  usersDf <- twitterLookup(timelineTweets[, unique(replyToUID)], resource = "/users/lookup")
  usersDf %<>% twitteR::twListToDF() %>% data.table()
  timelineConvDf <- merge(timelineConvDf, usersDf, by = "screenName", suffixes = c("", "_user"))
  timelineConvDf %<>% addMeta()

  convDF <- merge(timelineConvDf, timelineTweets, by.x = "id", by.y = "replyToSID",
                  suffixes = c("_usertweet", "_n26tweet"))
  convDF[, created_usertweet := created_usertweet %>% as.POSIXct()]
  convDF[, created_n26tweet := created_n26tweet %>% as.POSIXct()]
  convDF[, response_time := difftime(created_n26tweet, created_usertweet, units = "mins") %>% as.numeric()]
  convDF[, Weekday := lubridate::wday(created_usertweet, label = TRUE)]
  convDF[, hour := lubridate::hour(created_usertweet)]

  write.csv(convDF[, .(response_time, hour, Weekday, created_usertweet)],
            file = "~/Desktop/datascience/inst/waittimeapp/twitterDF.csv")
  return(convDF)
}
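
# Usage sketch (not run): runs the full pipeline (fetch, enrich, join replies,
# compute response times) and writes the CSV consumed by the waittimeapp.
#   convDF <- export_tweet_data(loadCache = TRUE)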