### parse_function
#' Turn a JSONL file of tweets into a filtered CSV file
#'
#' \code{parse_function} reads a newline-delimited JSON file of tweets with
#' \code{ndjson::stream_in()}, keeps the tweets whose \code{lang} equals
#' \code{language}, drops rows with a duplicated \code{id_str}, and selects a
#' fixed set of tweet, user, quoted-status and retweeted-status columns. It
#' then adds a \code{text_real} column holding the extended full text when one
#' is present (the tweet's own, else the retweeted status's, else the plain
#' \code{text}), keeps only the rows whose \code{text_real} matches
#' \code{pattern} (case-insensitively), and writes the result to a CSV file
#' next to the input (same path with the \code{.jsonl} / \code{.jsonl.gz}
#' suffix replaced by \code{.csv}).
#'
#' Some flattened columns (coordinates, \code{coordinates.type},
#' \code{withheld_in_countries.0} and the \code{*extended_tweet.full_text}
#' columns) are treated as optional: when the input does not contain them they
#' are added as all-\code{NA} columns. Any other selected column that is
#' missing from the input still raises an error from \code{dplyr::select()}.
#'
#' @param x A single character string: path to a file ending in \code{.jsonl}
#'   or \code{.jsonl.gz}.
#' @param pattern Regular expression matched case-insensitively against
#'   \code{text_real}; only matching tweets are kept. Defaults to
#'   \code{"muslim|islam"}.
#' @param language Value of the tweet \code{lang} field to keep. Defaults to
#'   \code{"en"}.
#'
#' @return Invisibly, the filtered data frame that was written to disk. When
#'   \code{x} is not a JSONL path a message is emitted and \code{NULL} is
#'   returned invisibly; nothing is written.
#'
#' @examples
#' \dontrun{
#' parse_function("tweets.jsonl")
#' parse_function("tweets.jsonl", pattern = "election", language = "fr")
#' }
#'
#' @import dplyr
#' @import stringr
#' @import ndjson
#' @import readr
#'
#' @export
parse_function <- function(x, pattern = "muslim|islam", language = "en") {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))

  # Anchor on the extension: a bare "jsonl" substring anywhere in the path is
  # not evidence of a JSONL file, and the output name is derived from the
  # suffix matched here.
  jsonl_ext <- "\\.jsonl(\\.gz)?$"
  if (!grepl(jsonl_ext, x)) {
    message("Not a jsonl file this ", x)
    return(invisible(NULL))
  }

  # Columns that only appear in the flattened data when at least one tweet
  # carries them. Typed NAs keep the column types consistent with real data
  # (character text must combine with `text` in case_when() below).
  optional_cols <- list(
    quoted_status.coordinates.coordinates.0 = NA_real_,
    quoted_status.coordinates.coordinates.1 = NA_real_,
    retweeted_status.coordinates.coordinates.0 = NA_real_,
    retweeted_status.coordinates.coordinates.1 = NA_real_,
    coordinates.coordinates.0 = NA_real_,
    coordinates.coordinates.1 = NA_real_,
    coordinates.type = NA_character_,
    withheld_in_countries.0 = NA_character_,
    quoted_status.extended_tweet.full_text = NA_character_,
    retweeted_status.extended_tweet.full_text = NA_character_,
    extended_tweet.full_text = NA_character_
  )

  tweets <- ndjson::stream_in(x) %>%
    dplyr::filter(lang == !!language) %>%
    dplyr::distinct(id_str, .keep_all = TRUE)

  # Only add the optional columns that are really absent. Existing columns are
  # left untouched: wrapping them in ifelse(is.null(col), NA, col) would
  # collapse each one to its first value, because ifelse() returns a result
  # shaped like its (length-one) test.
  absent <- setdiff(names(optional_cols), names(tweets))

  tweets <- tweets %>%
    dplyr::mutate(!!!optional_cols[absent]) %>%
    dplyr::select(
      text,
      created_at,
      id,
      id_str,
      lang,
      possibly_sensitive,
      source,
      timestamp_ms,
      coord_longitude = coordinates.coordinates.0,
      coord_latitude = coordinates.coordinates.1,
      coordinates.type,
      place.name,
      place.country,
      place.country_code,
      entities.urls.0.expanded_url,
      withheld_in_countries.0,
      user.created_at,
      user.description,
      user.followers_count,
      user.friends_count,
      user.id,
      user.id_str,
      user.location,
      user.name,
      user.protected,
      user.screen_name,
      user.statuses_count,
      user.verified,
      user.time_zone,
      is_quote_status,
      quoted_status.created_at,
      quoted_status.id,
      quoted_status.id_str,
      quoted_status.text,
      quoted_status.lang,
      quoted_status.possibly_sensitive,
      quoted_status.coord_longitude = quoted_status.coordinates.coordinates.0,
      quoted_status.coord_latitude = quoted_status.coordinates.coordinates.1,
      quoted_status.place.name,
      quoted_status.place.country,
      quoted_status.place.country_code,
      quoted_status.entities.urls.0.expanded_url,
      quoted_status.user.created_at,
      quoted_status.user.description,
      quoted_status.user.friends_count,
      quoted_status.user.followers_count,
      quoted_status.user.id,
      quoted_status.user.id_str,
      quoted_status.user.screen_name,
      quoted_status.user.name,
      quoted_status.user.statuses_count,
      quoted_status.user.verified,
      quoted_status.user.protected,
      quoted_status.user.location,
      quoted_status.user.time_zone,
      retweeted_status.created_at,
      retweeted_status.id,
      retweeted_status.id_str,
      retweeted_status.text,
      retweeted_status.lang,
      retweeted_status.possibly_sensitive,
      retweeted_status.coord_longitude =
        retweeted_status.coordinates.coordinates.0,
      retweeted_status.coord_latitude =
        retweeted_status.coordinates.coordinates.1,
      retweeted_status.place.name,
      retweeted_status.place.country,
      retweeted_status.place.country_code,
      retweeted_status.entities.urls.0.expanded_url,
      retweeted_status.user.created_at,
      retweeted_status.user.description,
      retweeted_status.user.followers_count,
      retweeted_status.user.friends_count,
      retweeted_status.user.id,
      retweeted_status.user.id_str,
      retweeted_status.user.screen_name,
      retweeted_status.user.name,
      retweeted_status.user.statuses_count,
      retweeted_status.user.verified,
      retweeted_status.user.location,
      retweeted_status.user.protected,
      retweeted_status.user.time_zone,
      in_reply_to_screen_name,
      in_reply_to_status_id,
      in_reply_to_status_id_str,
      in_reply_to_user_id,
      in_reply_to_user_id_str,
      quoted_status.extended_tweet.full_text,
      retweeted_status.extended_tweet.full_text,
      extended_tweet.full_text
    ) %>%
    # Prefer the untruncated text when available. The quoted status's full
    # text is carried along as a column but is not used as a fallback here
    # (that branch was disabled in the original code).
    dplyr::mutate(
      text_real = dplyr::case_when(
        !is.na(extended_tweet.full_text) ~ extended_tweet.full_text,
        !is.na(retweeted_status.extended_tweet.full_text) ~
          retweeted_status.extended_tweet.full_text,
        TRUE ~ text
      )
    ) %>%
    dplyr::select(text_real, id_str, dplyr::everything()) %>%
    dplyr::filter(
      stringr::str_detect(
        string = text_real,
        pattern = stringr::regex(pattern, ignore_case = TRUE)
      )
    )

  # The output path is passed positionally: the argument is named `path` in
  # older readr releases and `file` in newer ones.
  readr::write_csv(tweets, sub(jsonl_ext, ".csv", x))

  invisible(tweets)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.