Nothing
#' @rdname parseTweets
#' @export
#'
#' @title
#' Converts tweets in JSON format to data frame.
#'
#' @description
#' This function parses tweets downloaded using \code{filterStream},
#' \code{sampleStream} or \code{userStream} and returns a data frame. If tweet contains
#' 280-character text it will return the complete text and not only 140 characters.
#'
#' @author
#' Pablo Barbera \email{pablo.barbera@@nyu.edu}
#'
#' @param tweets A character string naming the file where tweets are stored or the
#' name of the object in memory where the tweets were saved as strings.
#'
#' @param simplify If \code{TRUE} it will return a data frame with only tweet and user
#' fields (i.e., no geographic information or url entities).
#'
#' @param verbose logical, default is \code{TRUE}, which will print in the console
#' the number of tweets that have been parsed.
#'
#' @param legacy logical, default is \code{FALSE}. Read tweets using old method (reading lines into memory and parsing
#' line by line). Try using \code{legacy=TRUE} if getting errors with default options. Note that legacy mode will
#' only return up to 140 characters per tweet. Legacy mode is selected automatically when the first
#' element of \code{tweets} is neither an existing file nor the name of an existing object.
#'
#' @return
#' A data frame with one row per parsed tweet. It always contains the tweet and user
#' fields (\code{text}, \code{retweet_count}, \code{favorite_count}, \code{id_str},
#' \code{screen_name}, etc.). When \code{simplify=FALSE} it also contains the place, geographic
#' and url columns (\code{country_code}, \code{country}, \code{place_type}, \code{full_name},
#' \code{place_name}, \code{place_id}, \code{place_lat}, \code{place_lon}, \code{lat},
#' \code{lon}, \code{expanded_url}, \code{url}).
#'
#' @details
#' \code{parseTweets} parses tweets downloaded using the \code{\link{filterStream}},
#' \code{\link{sampleStream}} or \code{\link{userStream}} functions
#' and returns a data frame where each row corresponds to one tweet and each column
#' represents a different field for each tweet (id, text, created_at, etc.).
#'
#' The total number of tweets that are parsed might be lower than the number of lines
#' in the file or object that contains the tweets because blank lines, deletion notices,
#' and incomplete tweets are ignored.
#'
#' To parse json to a twitter list, see \code{\link{readTweets}}. That function can be significantly
#' faster for large files, when only a few fields are required.
#'
#' Note also that the \code{retweet_count} field contains the number of times a given tweet
#' was retweeted at the time it was captured from the API, or for automatic retweets the number
#' of times the original tweet was retweeted.
#'
#' @seealso \code{\link{filterStream}}, \code{\link{sampleStream}}, \code{\link{userStream}}
#'
#' @examples
#' ## The dataset example_tweets contains 10 public statuses published
#' ## by @@twitterapi in plain text format. The code below converts the object
#' ## into a data frame that can be manipulated by other functions.
#'
#' data(example_tweets)
#' tweets.df <- parseTweets(example_tweets, simplify=TRUE, legacy=TRUE)
#'
#' \dontrun{
#' ## A more complete example, that shows how to capture a user's home timeline
#' ## for one hour using authentication via OAuth, and then parsing the tweets
#' ## into a data frame.
#'
#' library(ROAuth)
#' reqURL <- "https://api.twitter.com/oauth/request_token"
#' accessURL <- "https://api.twitter.com/oauth/access_token"
#' authURL <- "https://api.twitter.com/oauth/authorize"
#' consumerKey <- "xxxxxyyyyyzzzzzz"
#' consumerSecret <- "xxxxxxyyyyyzzzzzzz111111222222"
#' my_oauth <- OAuthFactory$new(consumerKey=consumerKey,
#'                              consumerSecret=consumerSecret,
#'                              requestURL=reqURL,
#'                              accessURL=accessURL,
#'                              authURL=authURL)
#' my_oauth$handshake()
#' userStream( file="my_timeline.json", with="followings",
#'             timeout=3600, oauth=my_oauth )
#' tweets.df <- parseTweets("my_timeline.json")
#' }
#'
#'
parseTweets <- function(tweets, simplify=FALSE, verbose=TRUE, legacy=FALSE){
  # Returns `column` unchanged, or an all-NA vector of length `n` when the
  # column is absent (NULL). Keeps later ifelse()/assignment calls from
  # receiving NULL and keeps zero-row results assignable.
  or_na <- function(column, n) {
    if (is.null(column)) rep(NA, n) else column
  }
  # Element-wise mean of two coordinate vectors, ignoring NAs. When both
  # values are NA the result is NaN, as with mean(c(NA, NA), na.rm=TRUE).
  pair_mean <- function(a, b, n) {
    vapply(seq_len(n), function(i) mean(c(a[i], b[i]), na.rm=TRUE), numeric(1))
  }

  # If tweets is not a file or an object, try legacy mode. exists() can raise
  # an error for strings that cannot be used as a symbol name (for example an
  # empty string or a very long JSON line); treat that as "no such object".
  if (!file.exists(tweets[1]) &&
      !tryCatch(exists(tweets[1]), error=function(e) FALSE)) {
    legacy <- TRUE
  }

  if (!legacy) {
    results <- stream_in(tweets)
    n_results <- nrow(results)

    # RT count: top-level value, replaced by the count from the embedded
    # retweeted status when that is available. The fallback is the
    # already-defaulted vector, so a missing top-level column yields NA.
    retweet_count <- or_na(results$retweet_count, n_results)
    if (!is.null(results$retweeted_status.retweet_count)) {
      retweet_count <- ifelse(!is.na(results$retweeted_status.retweet_count),
                              results$retweeted_status.retweet_count,
                              retweet_count)
    }

    # same for favorite counts
    favorite_count <- or_na(results$favorite_count, n_results)
    if (!is.null(results$retweeted_status.favorite_count)) {
      favorite_count <- ifelse(!is.na(results$retweeted_status.favorite_count),
                               results$retweeted_status.favorite_count,
                               favorite_count)
    }

    # extracting the full text when tweet is >140 characters
    # 1) tweets from REST API with tweet_mode = 'extended'
    text <- tryCatch(results$full_text, error=function(e) NULL)
    if (is.null(text)) {
      # 3) default: tweets from Streaming API without 'extended_tweet' field
      text <- results$text
      # 2) tweets from Streaming API that included 'extended_tweet' field;
      #    fall back to the plain text where the extended text is missing
      extended <- results$extended_tweet.full_text
      if (!is.null(extended)) {
        text <- ifelse(!is.na(extended), extended, or_na(text, n_results))
      }
      # for RTs, prefer the full text of the embedded original tweet
      rt_extended <- results$retweeted_status.extended_tweet.full_text
      if (!is.null(rt_extended)) {
        text <- ifelse(!is.na(rt_extended),
                       # full text from RT
                       paste0('RT @', results$retweeted_status.user.screen_name, ':',
                              rt_extended),
                       # text resolved so far for the top-level tweet
                       or_na(text, n_results))
      }
    }

    # Prefer the column named exactly "retweeted"; only when it is absent
    # fall back to the first column whose name contains "retweeted".
    retweeted <- results[["retweeted"]]
    if (is.null(retweeted)) {
      retweeted_cols <- grep("retweeted", names(results))
      if (length(retweeted_cols) > 0) {
        retweeted <- results[[retweeted_cols[1]]]
      }
    }

    # NULL (absent) columns are silently dropped by data.frame()
    df <- data.frame(
      text = text,
      retweet_count = retweet_count,
      favorite_count = favorite_count,
      favorited = results$favorited,
      truncated = results$truncated,
      id_str = results$id_str,
      in_reply_to_screen_name = results$in_reply_to_screen_name,
      source = results$source,
      retweeted = retweeted,
      created_at = results$created_at,
      in_reply_to_status_id_str = results$in_reply_to_status_id_str,
      in_reply_to_user_id_str = results$in_reply_to_user_id_str,
      lang = results$lang,
      listed_count = results$user.listed_count,
      verified = results$user.verified,
      location = results$user.location,
      user_id_str = results$user.id_str,
      description = results$user.description,
      geo_enabled = results$user.geo_enabled,
      user_created_at = results$user.created_at,
      statuses_count = results$user.statuses_count,
      followers_count = results$user.followers_count,
      favourites_count = results$user.favourites_count,
      protected = results$user.protected,
      user_url = results$user.url,
      name = results$user.name,
      time_zone = results$user.time_zone,
      user_lang = results$user.lang,
      utc_offset = results$user.utc_offset,
      friends_count = results$user.friends_count,
      screen_name = results$user.screen_name,
      stringsAsFactors=FALSE)

    # adding geographic variables and url entities
    if (simplify==FALSE) {
      n_out <- nrow(df)
      df$country_code <- or_na(results$place.country_code, n_out)
      df$country <- or_na(results$place.country, n_out)
      # The legacy branch reads the 'place_type' field of the place object;
      # look for the matching flattened name first and keep 'place.type' as a
      # fallback for inputs that use that name.
      place_type <- results$place.place_type
      if (is.null(place_type)) place_type <- results$place.type
      df$place_type <- or_na(place_type, n_out)
      df$full_name <- or_na(results$place.full_name, n_out)
      df$place_name <- or_na(results$place.name, n_out)
      df$place_id <- or_na(results$place.id, n_out)
      # place latitude / longitude: mean of two bounding-box corner values
      place_lat_1 <- or_na(results$place.bounding_box.coordinates.0.0.1, n_out)
      place_lat_2 <- or_na(results$place.bounding_box.coordinates.0.1.1, n_out)
      df$place_lat <- pair_mean(place_lat_1, place_lat_2, n_out)
      place_lon_1 <- or_na(results$place.bounding_box.coordinates.0.0.0, n_out)
      place_lon_2 <- or_na(results$place.bounding_box.coordinates.0.2.0, n_out)
      df$place_lon <- pair_mean(place_lon_1, place_lon_2, n_out)
      df$lat <- or_na(results$geo.coordinates.0, n_out)
      df$lon <- or_na(results$geo.coordinates.1, n_out)
      df$expanded_url <- or_na(results$entities.urls.0.expanded_url, n_out)
      df$url <- or_na(results$entities.urls.0.url, n_out)
    }
    # removing empty rows
    df <- df[!is.na(df$id_str),]
  } else {
    ## from json to list
    results.list <- readTweets(tweets, verbose=FALSE)
    # nothing to build a data frame from: fail with an explicit message
    if (length(results.list)==0){
      stop(deparse(substitute(tweets)), " did not contain any tweets. ",
           "See ?parseTweets for more details.")
    }
    # constructing data frame with tweet and user variable
    df <- data.frame(
      text = unlistWithNA(results.list, 'text'),
      retweet_count = unlistWithNA(results.list, 'retweet_count'),
      favorite_count = unlistWithNA(results.list, 'favorite_count'),
      favorited = unlistWithNA(results.list, 'favorited'),
      truncated = unlistWithNA(results.list, 'truncated'),
      id_str = unlistWithNA(results.list, 'id_str'),
      in_reply_to_screen_name = unlistWithNA(results.list, 'in_reply_to_screen_name'),
      source = unlistWithNA(results.list, 'source'),
      retweeted = unlistWithNA(results.list, 'retweeted'),
      created_at = unlistWithNA(results.list, 'created_at'),
      in_reply_to_status_id_str = unlistWithNA(results.list, 'in_reply_to_status_id_str'),
      in_reply_to_user_id_str = unlistWithNA(results.list, 'in_reply_to_user_id_str'),
      lang = unlistWithNA(results.list, 'lang'),
      listed_count = unlistWithNA(results.list, c('user', 'listed_count')),
      verified = unlistWithNA(results.list, c('user', 'verified')),
      location = unlistWithNA(results.list, c('user', 'location')),
      user_id_str = unlistWithNA(results.list, c('user', 'id_str')),
      description = unlistWithNA(results.list, c('user', 'description')),
      geo_enabled = unlistWithNA(results.list, c('user', 'geo_enabled')),
      user_created_at = unlistWithNA(results.list, c('user', 'created_at')),
      statuses_count = unlistWithNA(results.list, c('user', 'statuses_count')),
      followers_count = unlistWithNA(results.list, c('user', 'followers_count')),
      favourites_count = unlistWithNA(results.list, c('user', 'favourites_count')),
      protected = unlistWithNA(results.list, c('user', 'protected')),
      user_url = unlistWithNA(results.list, c('user', 'url')),
      name = unlistWithNA(results.list, c('user', 'name')),
      time_zone = unlistWithNA(results.list, c('user', 'time_zone')),
      user_lang = unlistWithNA(results.list, c('user', 'lang')),
      utc_offset = unlistWithNA(results.list, c('user', 'utc_offset')),
      friends_count = unlistWithNA(results.list, c('user', 'friends_count')),
      screen_name = unlistWithNA(results.list, c('user', 'screen_name')),
      stringsAsFactors=FALSE)
    # adding geographic variables and url entities
    if (simplify==FALSE){
      n_tweets <- length(results.list)
      df$country_code <- unlistWithNA(results.list, c('place', 'country_code'))
      df$country <- unlistWithNA(results.list, c('place', 'country'))
      df$place_type <- unlistWithNA(results.list, c('place', 'place_type'))
      df$full_name <- unlistWithNA(results.list, c('place', 'full_name'))
      df$place_name <- unlistWithNA(results.list, c('place', 'name'))
      df$place_id <- unlistWithNA(results.list, c('place', 'id'))
      # place latitude / longitude: mean of two bounding-box corner values
      place_lat_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
      place_lat_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 2, 2))
      df$place_lat <- pair_mean(place_lat_1, place_lat_2, n_tweets)
      place_lon_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
      place_lon_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
      df$place_lon <- pair_mean(place_lon_1, place_lon_2, n_tweets)
      df$lat <- unlistWithNA(results.list, c('geo', 'coordinates', 1))
      df$lon <- unlistWithNA(results.list, c('geo', 'coordinates', 2))
      df$expanded_url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'expanded_url'))
      df$url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'url'))
    }
  }
  # information message
  if (verbose==TRUE) message(length(df$text), " tweets have been parsed.")
  return(df)
}
# Extracts one (possibly nested) field from every element of a list of parsed
# tweets and returns a vector aligned with `lst`, using NA wherever the field
# is absent, NULL or empty.
#
# lst:   list with one element per tweet; each element is a nested list.
# field: path to the value. Components are element names, except for three
#        path shapes in which some components are 1-based positions:
#          c('geo', <name>, <pos>)                              (length 3)
#          c(<name>, 'urls', <pos>, <name>)                     (length 4)
#          c(<name>, 'bounding_box', <name>, <pos>, <pos>, <pos>) (length 6)
#        Any other path of length 1 to 4 is followed by name only.
#
# Returns a vector of length(lst). If a located value has more than one
# element, only its first element is used, so the result always stays aligned
# with `lst`. Stops with an error for unsupported path lengths.
unlistWithNA <- function(lst, field){
  n_levels <- length(field)

  # Which components of `field` are positions rather than names. isTRUE()
  # keeps the comparisons safe when `field` is empty or contains NA.
  index_pos <- integer(0)
  if (n_levels == 3L && isTRUE(field[1] == "geo")) {
    index_pos <- 3L
  } else if (n_levels == 4L && isTRUE(field[2] == "urls")) {
    index_pos <- 3L
  } else if (n_levels == 6L && isTRUE(field[2] == "bounding_box")) {
    index_pos <- 4:6
  } else if (!(n_levels %in% 1:4)) {
    stop("unsupported 'field' specification of length ", n_levels)
  }

  # Walks down the path for a single tweet; returns NULL as soon as a level
  # is missing.
  follow_path <- function(x) {
    for (i in seq_len(n_levels)) {
      if (is.null(x)) {
        return(NULL)
      }
      key <- if (i %in% index_pos) as.numeric(field[i]) else field[[i]]
      x <- x[[key]]
    }
    x
  }

  # A position beyond the end of a container (e.g. an empty 'urls' list) or a
  # name missing from an atomic vector raises an error in `[[`; both mean
  # "value not available", so they are mapped to NULL and reported as NA.
  values <- lapply(lst, function(x) {
    found <- tryCatch(follow_path(x), error = function(e) NULL)
    unlist(found, use.names = FALSE)
  })

  # Presence is decided on the extracted value itself, so the positions that
  # get filled always match the values that were found.
  has_value <- lengths(values) > 0L
  vect <- rep(NA, length(lst))
  if (any(has_value)) {
    vect[has_value] <- unlist(lapply(values[has_value], `[`, 1L), use.names = FALSE)
  }
  vect
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.