# R/get_tweets.R
#
# Defines functions qf_get_tweets_from_list qf_get_tweets_by_user
#
# Documented in qf_get_tweets_by_user qf_get_tweets_from_list

#' Extracts and stores tweets locally
#'
#' Downloads the timeline of each given user and stores it as one `.rds` file
#' per user in the `tweets_by_user` folder of the current working directory.
#' For users whose file already exists, only tweets more recent than the ones
#' already stored are requested, and they are merged into the existing file.
#'
#' @param users A character vector of handles or id of Twitter users. Twitter users should ideally be provided in the form of user ids. If screen names are given, they will be converted to Twitter ids. In the rare case when a user screen name is made exclusively by numbers, provide users as lists using rtweet::as_screenname. See `?rtweet::as_screenname` for details.
#' @param wait An integer, defaults to 1. Seconds to wait between requests to Twitter.
#' @param n An integer, number of new tweets to request per users, defaults to 100.
#' @param cache_id Logical, defaults to TRUE. If TRUE, the correspondence between screen names and user ids is stored locally in `users_id/users_id.rds` and reused in later calls, so that only unknown screen names are looked up.
#' @param twitter_token A twitter token generated by rtweet.
#' @return A tibble with one row per processed user and two columns: `users` and `new_tweets`, the number of tweets stored by this call (`NA` if nothing was stored for that user).
#' @examples
#' \dontrun{
#' qf_get_tweets_by_user(users = c("EU_Commission", "Europarl_EN"), n = 200)
#' }
#' @export
qf_get_tweets_by_user <- function(users,
                                  wait = 1,
                                  n = 100,
                                  cache_id = TRUE,
                                  twitter_token = NULL) {

  # Status ids are 64-bit integers stored as strings: converting them to
  # double loses precision, so the most recent id is found by comparing the
  # strings themselves (longer means bigger, same length compares digit by
  # digit).
  latest_status_id <- function(status_id) {
    ids <- as.character(status_id)
    ids <- ids[!is.na(ids)]
    if (length(ids) == 0) {
      return(NULL)
    }
    ids[order(nchar(ids), ids, method = "radix")][length(ids)]
  }

  # Resolves screen names to a two-column data frame (user_id, screen_name).
  lookup_ids <- function(screen_names) {
    rtweet::lookup_users(users = screen_names,
                         parse = TRUE,
                         token = twitter_token) %>%
      dplyr::select(user_id, screen_name) %>%
      dplyr::distinct()
  }

  # Anything that cannot be read as a number is taken to be a screen name,
  # in which case `users` is converted to user ids.
  if (anyNA(suppressWarnings(as.numeric(users)))) {
    if (cache_id == TRUE) {
      fs::dir_create(path = "users_id")
      users_id_location <- fs::path("users_id", "users_id.rds")
      if (fs::file_exists(users_id_location)) {
        # File arguments are passed by position: their name changed from
        # `path` to `file` across readr versions.
        users_df <- readr::read_rds(users_id_location)
        users_available_l <- is.element(el = users, set = users_df$screen_name)
        if (!all(users_available_l)) {
          users_df <- dplyr::bind_rows(users_df,
                                       lookup_ids(users[!users_available_l])) %>%
            dplyr::select(user_id, screen_name) %>%
            dplyr::distinct()
          readr::write_rds(users_df, users_id_location)
        }
        users <- users_df$user_id[is.element(el = users_df$screen_name,
                                             set = users)]
      } else {
        users_df <- lookup_ids(users)
        readr::write_rds(users_df, users_id_location)
        users <- users_df$user_id
      }
    } else {
      users <- rtweet::lookup_users(users = users,
                                    parse = TRUE,
                                    token = twitter_token) %>%
        dplyr::pull(user_id)
    }
  }

  fs::dir_create(path = "tweets_by_user")

  local_tweet_locations <- fs::dir_ls(path = "tweets_by_user",
                                      recurse = FALSE,
                                      type = "file",
                                      glob = "*.rds")

  # Users are split between those with a local file and those without one.
  existing_users <- stringr::str_remove(
    string = fs::path_file(path = local_tweet_locations),
    pattern = stringr::fixed(".rds")
  )
  already_stored <- is.element(el = users, set = existing_users)
  new_users <- users[!already_stored]
  preexisting_users <- users[already_stored]

  new_tweets <- tibble::tibble(users = new_users, new_tweets = NA_integer_)

  for (i in seq_along(new_users)) {
    # if there's an error, print it but go ahead with the next user
    temp <- tryCatch(
      expr = rtweet::get_timeline(user = new_users[i],
                                  n = n,
                                  token = twitter_token),
      error = function(e) {
        message("Could not retrieve tweets for ", new_users[i], ": ",
                conditionMessage(e))
        NULL
      }
    )
    # if Twitter throws back anything looking real, add it to stored file
    if (is.null(temp) == FALSE) {
      if (nrow(temp) > 0) {
        if (is.na(temp$screen_name[1]) == FALSE) {
          saveRDS(object = temp,
                  file = fs::path("tweets_by_user",
                                  paste0(new_users[i], ".rds")))
          # store how many new tweets in data frame for reference
          new_tweets$new_tweets[i] <- nrow(temp)
          message(paste(new_tweets[i, ], collapse = " - "))
          Sys.sleep(time = wait)
        }
      }
    }
  }

  # now process pre-existing users
  if (length(preexisting_users) > 0) {
    update_tweets <- tibble::tibble(users = preexisting_users,
                                    new_tweets = NA_integer_)

    for (i in seq_along(preexisting_users)) {
      user_file <- fs::path("tweets_by_user",
                            paste0(preexisting_users[i], ".rds"))
      stored <- readRDS(file = user_file)
      if (is.null(stored) == FALSE) {
        # if there's an error, print it but go ahead with the next user
        temp <- tryCatch(
          expr = rtweet::get_timeline(
            user = preexisting_users[i],
            n = n,
            # only tweets more recent than this id are requested
            since_id = latest_status_id(stored$status_id),
            token = twitter_token
          ),
          error = function(e) {
            message("Could not retrieve tweets for ", preexisting_users[i],
                    ": ", conditionMessage(e))
            NULL
          }
        )
        # if Twitter throws back anything looking real, add it to stored file
        if (is.null(temp) == FALSE) {
          if (nrow(temp) > 0) {
            if (is.na(temp$screen_name[1]) == FALSE) {
              pre_save <- dplyr::bind_rows(temp, stored) %>%
                dplyr::distinct(status_id, .keep_all = TRUE) %>%
                dplyr::arrange(created_at)
              saveRDS(object = pre_save, file = user_file)
              # store how many new tweets in data frame for reference
              update_tweets$new_tweets[i] <- nrow(pre_save) - nrow(stored)
              message(paste(paste(update_tweets[i, ], collapse = " - "),
                            "new tweets"))
              Sys.sleep(time = wait)
            }
          }
        }
      }
    }
    new_tweets <- dplyr::bind_rows(new_tweets, update_tweets)
  }
  # report back what has been downloaded
  new_tweets
}

#' Extracts and stores tweets locally based on Twitter lists
#'
#' Downloads the most recent tweets posted by members of a Twitter list and
#' stores them as a new `.rds` file under
#' `tweets_from_list/<list_id>/<date>/` in the current working directory.
#' Unless `since_id` is given, only tweets more recent than the ones found in
#' the latest non-empty local file for that list are requested.
#'
#' @param list_id Id of the Twitter list. If NULL, it is looked up among the lists of `owner_user` based on `slug`.
#' @param slug Slug of the Twitter list, used together with `owner_user` when `list_id` is not given.
#' @param owner_user Screen name or user id of the user whose lists are searched for `slug`, used when `list_id` is not given.
#' @param since_id Only tweets with an id more recent than this one are requested. Defaults to NULL, in which case the most recent status id among locally stored tweets of this list is used, if available.
#' @param max_id Passed to `rtweet::lists_statuses()`. Defaults to NULL.
#' @param n An integer, number of new tweets to request per users, defaults to 200.
#' @param include_rts Logical, defaults to TRUE. Passed to `rtweet::lists_statuses()`.
#' @param parse Logical, defaults to TRUE. Passed to `rtweet::lists_statuses()`.
#' @param twitter_token A twitter token generated by rtweet.
#' @param cache_lists Logical, defaults to TRUE. If TRUE, stores locally the lists owned by a given user, in a subfolder `lists_by_user`. If list has already been downloaded, it just loads it. To overwrite, set `overwrite_lists` to TRUE.
#' @param overwrite_lists Logical, defaults to FALSE. If TRUE, it overwrites previously downloaded lists.
#' @return The tweets downloaded by this call, as returned by `rtweet::lists_statuses()`; if a data frame, it is sorted by `created_at`.
#' @examples
#' \dontrun{
#' qf_get_tweets_from_list(slug = "example-list", owner_user = "example_user")
#' }
#' @export
#'
qf_get_tweets_from_list <- function(list_id = NULL,
                                    slug = NULL,
                                    owner_user = NULL,
                                    since_id = NULL,
                                    max_id = NULL,
                                    n = 200,
                                    include_rts = TRUE,
                                    parse = TRUE,
                                    twitter_token = NULL,
                                    cache_lists = TRUE,
                                    overwrite_lists = FALSE) {

  # Status ids are 64-bit integers stored as strings: converting them to
  # double loses precision, so the most recent id is found by comparing the
  # strings themselves (longer means bigger, same length compares digit by
  # digit).
  latest_status_id <- function(status_id) {
    ids <- as.character(status_id)
    ids <- ids[!is.na(ids)]
    if (length(ids) == 0) {
      return(NULL)
    }
    ids[order(nchar(ids), ids, method = "radix")][length(ids)]
  }

  if (is.null(list_id)) {
    if (is.null(slug) || is.null(owner_user)) {
      stop("Provide either `list_id`, or both `slug` and `owner_user`.",
           call. = FALSE)
    }

    download_lists <- function() {
      rtweet::lists_users(user = owner_user,
                          reverse = TRUE,
                          token = twitter_token,
                          parse = TRUE)
    }

    if (cache_lists == TRUE) {
      fs::dir_create(path = "lists_by_user")
      cached_list_location <- fs::path("lists_by_user",
                                       paste0(owner_user, ".rds"))
      if (fs::file_exists(cached_list_location) && overwrite_lists == FALSE) {
        list_users <- readRDS(file = cached_list_location)
      } else {
        list_users <- download_lists()
        saveRDS(object = list_users, file = cached_list_location)
      }
    } else {
      list_users <- download_lists()
    }

    # which() drops rows where the slug is missing
    list_id <- list_users$list_id[which(list_users$slug == slug)]
    if (length(list_id) != 1L) {
      stop("Found ", length(list_id), " lists with slug `", slug,
           "` for user `", owner_user, "`; exactly one is needed.",
           call. = FALSE)
    }
  }

  today_folder <- fs::path("tweets_from_list", list_id, Sys.Date())
  fs::dir_create(path = today_folder, recurse = TRUE)
  local_tweets_location <- fs::dir_ls(path = fs::path("tweets_from_list",
                                                      list_id),
                                      recurse = TRUE,
                                      type = "file",
                                      glob = "*.rds")
  has_previous_files <- length(local_tweets_location) > 0

  # An explicit `since_id` takes precedence. Otherwise, go back through local
  # files starting from the last one listed until one that actually has
  # tweets is found (files with zero rows are stored when nothing new was
  # available).
  if (is.null(since_id)) {
    for (previous_file in rev(local_tweets_location)) {
      previous_tweets <- readRDS(file = previous_file)
      if (is.data.frame(previous_tweets) && nrow(previous_tweets) > 0) {
        since_id <- latest_status_id(previous_tweets$status_id)
        break
      }
    }
  }

  tweets_from_list <- rtweet::lists_statuses(list_id = list_id,
                                             since_id = since_id,
                                             max_id = max_id,
                                             n = n,
                                             include_rts = include_rts,
                                             parse = parse,
                                             token = twitter_token)
  if (is.data.frame(tweets_from_list)) {
    tweets_from_list <- dplyr::arrange(tweets_from_list, created_at)
  }

  # File naming is kept as it has always been: the first download of a list
  # has no "list_id-" infix, later ones do.
  file_label <- if (has_previous_files) {
    paste0("list_id-", list_id)
  } else {
    list_id
  }
  saveRDS(object = tweets_from_list,
          file = fs::path(today_folder,
                          paste0(Sys.time(), "-", file_label, ".rds")))

  tweets_from_list
}
# giocomai/edjnetquotefinder documentation built on Feb. 11, 2022, 12:51 p.m.