R/TextMining_Twitter_API.R

Defines functions TwitterWordCould

Documented in TwitterWordCould

#' @title
#' TwitterWordCloud
#' @description Creates a data frame of the terms found in the searched tweets, including the frequency of each term.
#' @name
#' TwitterWordCould
#'
#' @param word The search term used to query tweets (a character string).
#' @param min_freq The minimum term frequency; must be a number of at least 10 and no larger than \code{max_number}.
#' @param max_number The maximum number of tweets to search for; must be a number of at most 1500.
#'
#' @return A data frame with the columns \code{term} and \code{freq}.
#' @export


# Setup ####
# Install only the dependencies that are not available yet, so that sourcing
# this file does not re-download every package from CRAN on each run.
# Wrapped in local() so the helper variables do not leak into the caller's
# environment.
local({
  required_packages <- c("twitteR", "RCurl", "SnowballC", "tm", "wordcloud")
  is_installed <- vapply(
    required_packages,
    requireNamespace,
    logical(1),
    quietly = TRUE
  )
  missing_packages <- required_packages[!is_installed]
  if (length(missing_packages) > 0) {
    install.packages(missing_packages, repos = "http://cran.us.r-project.org")
  }
})


#Load library
library(twitteR)
library(RCurl)
library(SnowballC)
library(tm)
library(wordcloud)

#Require
#require(twitteR)
#require(RCurl)
#require(SnowballC)
#require(tm)
#require(wordcloud)

# the function ####
# Searches Twitter for `word` and returns the stemmed terms of the matching
# English tweets together with how often each term occurs.
#
# Arguments:
#   word        single character string, the search term
#   min_freq    single number (integer or double), >= 10 and <= max_number;
#               only terms occurring at least this often are returned
#   max_number  single number (integer or double), <= 1500; the maximum
#               number of tweets to request
#
# Returns a data frame with the columns `term` and `freq`. It has zero rows
# when the search yields no tweets or no term reaches `min_freq`.
#
# Stops with an error when an argument has the wrong type or is out of range.
#
# Side effect: sets the global option `httr_oauth_cache` to TRUE.
TwitterWordCould <- function(word, min_freq, max_number) {

  # Accept integers as well as doubles, but insist on one non-missing value so
  # the range checks below never see a vector or NA.
  is_single_number <- function(x) {
    is.numeric(x) && length(x) == 1 && !is.na(x)
  }

  # Illegal input
  if (!is.character(word) || length(word) != 1 || is.na(word)) {
    stop("Illegal input word!")
  }
  if (!is_single_number(min_freq)) {
    stop("Illegal input frequent!")
  }
  if (!is_single_number(max_number)) {
    stop("Illegal input max number!")
  }

  # Maximum number of tweets (based on the limitation of the API)
  if (max_number > 1500) {
    stop("Can not handle more than 1500 Twitts")
  }
  if (min_freq < 10) {
    stop("Min frequent shoud not be too small (<10)")
  }
  if (min_freq > max_number) {
    stop("min freq should not be larger than maximum number of Twitt")
  }

  # Authorisation setup ####
  # NOTE(review): these credentials are committed to source control. They
  # should be revoked and supplied through the environment variables below;
  # the literals are kept only as a fallback so existing callers keep working.
  consumer_key <- Sys.getenv(
    "TWITTER_CONSUMER_KEY",
    unset = "uYqdnmEfQdo8UCihFdYAKMST0"
  )
  consumer_secret <- Sys.getenv(
    "TWITTER_CONSUMER_SECRET",
    unset = "DzJ7xkxx0XBbj0uuW8tWsqqTdgdm8r8L4JrbrKch6pP41mwhuK"
  )
  access_token <- Sys.getenv(
    "TWITTER_ACCESS_TOKEN",
    unset = "864101367014711296-kzlQkr0oPj0793pAX3c4mPYtlvlX7Qw"
  )
  access_secret <- Sys.getenv(
    "TWITTER_ACCESS_SECRET",
    unset = "vPLJXprEMI3MlBUAXBpVvbOaeoPFVDxZwqUT0wRNF4fjQ"
  )

  # The cache option has to be set before authenticating, otherwise it cannot
  # influence the authentication call that follows.
  options(httr_oauth_cache = TRUE)
  setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

  # Search Twitter, English tweets only, at most max_number tweets
  tweets <- searchTwitter(word, lang = "en", n = max_number)

  # Nothing found: return an empty result instead of failing further down.
  if (length(tweets) == 0) {
    return(data.frame(term = character(0), freq = numeric(0)))
  }

  # Convert to a character vector; vapply guarantees the result type.
  tweets_text <- vapply(tweets, function(tweet) tweet$getText(), character(1))

  # Create a corpus from the vector of tweets
  tweets_corpus <- Corpus(VectorSource(tweets_text))

  # Remove punctuation, lower-case, drop stop words and numbers, collapse
  # whitespace and stem the remaining words
  tweets_clean <- tm_map(tweets_corpus, removePunctuation)
  tweets_clean <- tm_map(tweets_clean, content_transformer(tolower))
  tweets_clean <- tm_map(tweets_clean, removeWords, stopwords("english"))
  tweets_clean <- tm_map(tweets_clean, removeNumbers)
  tweets_clean <- tm_map(tweets_clean, stripWhitespace)
  tweets_clean <- tm_map(tweets_clean, stemDocument)

  # Term document matrix
  tdm_tweets <- TermDocumentMatrix(tweets_clean)

  # Total occurrences per term, keeping only terms that reach min_freq
  term_freq_tweets <- rowSums(as.matrix(tdm_tweets))
  term_freq_tweets <- term_freq_tweets[term_freq_tweets >= min_freq]

  # as.character() keeps the `term` column present even when no term is left.
  data.frame(
    term = as.character(names(term_freq_tweets)),
    freq = term_freq_tweets
  )
}

#TEST
#data <- TwitterWordCould(word = "Audi", min_freq = 10, max_number = 1000)
#wordcloud(words = data$term, freq = data$freq, min.freq = 20,
#           random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
# plot(data)

#data <- TwitterWordCould("Tesla", 10,100)
#data_test <- data
Philhoels/AdvRLab5 documentation built on May 22, 2019, 5:08 p.m.