R/methods.R

# custom internal function to mimic stringr::str_squish() with no external deps
base_str_squish <- function(x){
  # remove leading and trailing whitespace with base functions for no new deps
  x <- gsub(x = x, pattern = "^\\s+|\\s+$", replacement = "")
  # collapse any internal runs of whitespace to a single space
  x <- gsub(x = x, pattern = "\\s+", replacement = " ")
  x
}
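
# For reference, an illustrative example of what this helper does (not run):
#   base_str_squish("  too   many   spaces  ")   # -> "too many spaces"
#   base_str_squish(NA)                          # -> NA, handled downstream by is.na() checks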


# columns referenced via non-standard evaluation in dplyr calls below;
# declaring them as NULL avoids "no visible binding" notes from R CMD check
created_utc <- public_description <- subscribers <- description <- NULL

# https://github.com/reddit-archive/reddit/wiki/API
# <platform>:<app ID>:<version string> (by /u/<reddit username>)
ua_string <- "pushshiftR:v0.0.1 (by /u/belangeranalytics)"


# Methods for calling the pushshift.io Reddit search APIs

# https://github.com/pushshift/api

# | Parameter | Description | Default | Accepted Values |
#   | ------ | ------ | ------- | ------ |
#   | q | Search term. | N/A | String / Quoted String for phrases |
#   | ids | Get specific comments via their ids | N/A | Comma-delimited base36 ids |
#   | size | Number of results to return | 25 | Integer <= 500 |
#   | fields | Only return specific fields (comma delimited) | All Fields Returned | string or comma-delimited string |
#   | sort | Sort results in a specific order | "desc" | "asc", "desc" |
#   | sort_type | Sort by a specific attribute | "created_utc" | "score", "num_comments", "created_utc" |
#   | aggs | Return aggregation summary | N/A | ["author", "link_id", "created_utc", "subreddit"] |
#   | author | Restrict to a specific author | N/A | String |
#   | subreddit | Restrict to a specific subreddit | N/A | String |
#   | after | Return results after this date | N/A | Epoch value or Integer + "s,m,h,d" (i.e. 30d for 30 days) |
#   | before | Return results before this date | N/A | Epoch value or Integer + "s,m,h,d" (i.e. 30d for 30 days) |
#   | frequency | Used with the aggs parameter when set to created_utc | N/A | "second", "minute", "hour", "day" |
#   | metadata | display metadata about the query | false | "true", "false" |
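
# For illustration only, a request assembled from these parameters might look like
# (example values; not guaranteed to be a currently-valid query against the live API):
#   https://api.pushshift.io/reddit/search/comment?q=coffee&subreddit=espresso&size=100&before=30d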

#' Search Reddit comments using the PushShift API
#'
#' Query the PushShift API to search Reddit comments. Does some minimal input
#' validation and massaging for quality-of-life. Incorporates polite batching,
#' since the API will only return 100 results at a time.
#'
#' @param q Search term. String / Double-quoted String for phrases.
#' @param ids Get specific comments via their ids. Comma-delimited base36 ids.
#' @param size Number of results to return. Default is 25; values > 100 handled through batching.
#' @param fields Return specific fields, either comma-delimited string or character vector. Default is all fields returned. Date/time created is always returned.
#' @param sort Sort results by a specific attribute: one of "created_utc" (the default), "score", or "num_comments".
#' @param aggs Return aggregation summary. *DISABLED BY PUSHSHIFT DUE TO SERVER LOAD*
#' @param author Restrict to a specific author.
#' @param subreddit Restrict to a specific subreddit.
#' @param after Return results after this date. Epoch value or Integer + "s,m,h,d" (i.e. 30d for 30 days)
#' @param before Return results before this date. Epoch value or Integer + "s,m,h,d" (i.e. 30d for 30 days)
#' @param frequency Used with the aggs parameter when set to created_utc. *DISABLED BY PUSHSHIFT DUE TO SERVER LOAD*
#' @param metadata Display metadata about the query. Default FALSE.
#' @param batch_pause Pause between batches in seconds. Default is 1s.
#' @param parse_utc Boolean flag: parse UTC timestamps into human-readable date-times? Default TRUE.
#' @param verbose Debug boolean flag to enable/disable message logging to the console.
#'
#' @return A data frame with class `tbl_df` with one row for each comment returned by the API.
#' @export
#'
#' @examples
#' \dontrun{test <- get_reddit_comments(q = "coffee maker", size = 250)}
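#'
#' \dontrun{
#' # illustrative call only; the parameter values here are made up
#' nba_comments <- get_reddit_comments(q = "playoffs", subreddit = "nba",
#'                                     fields = c("author", "body", "created_utc"),
#'                                     size = 150, batch_pause = 2)
#' }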
get_reddit_comments <- function(q = NA, ids = NA, size = 25, fields = NA, sort = c("created_utc", "score", "num_comments"), aggs = NA, author = NA, subreddit = NA, after = NA, before = NA, frequency = NA, metadata = FALSE, batch_pause = 1, parse_utc = TRUE, verbose = TRUE){

  # basic input fixing
  sort <- match.arg(sort)
  #sort_type <- match.arg(sort_type)
  q <- base_str_squish(q)
  ids <- base_str_squish(ids)
  fields <- base_str_squish(fields)
  author <- base_str_squish(author)
  subreddit <- base_str_squish(subreddit)

  # if we got fields as a character vector, flatten it to a comma-delimited string
  if (length(fields) > 1) fields <- paste0(fields, collapse = ",")

  # created_utc is needed for batching (and is always returned, per the docs),
  # so make sure it's requested whenever specific fields are given
  if (!is.na(fields) && !grepl(x = fields, pattern = "created_utc")) {
    fields <- paste0(fields, ",created_utc")
  }

  # the search endpoint
  api_endpoint <- "https://api.pushshift.io/reddit/search/comment?"

  # the API returns at most 100 results per call, so work out how many batches we need
  batches <- ((size-1) %/% 100) + 1
  all_results <- dplyr::tibble()

  # set up initial before time: if not provided, it's this moment in Unix epoch
  if (is.na(before)) before <- round(unclass(as.POSIXct(Sys.time())))

  for (batch in 1:batches){
    if (verbose) message(sprintf("Batch %d/%d", batch, batches))

    # fetch the right number in this batch: 100 until the last batch, then
    # get the extras
    if ((batch < batches) & size > 100)  size_to_fetch <- 100
    if ((batch == batches) & size > 100) size_to_fetch <- size - (batches-1) * 100 #size %% 100
    if (size <= 100) size_to_fetch <- size

    # note: an earlier version built the query with httr::parse_url() and
    # httr::build_url(), but build_url() messes up commas in the query
    # parameters, so the URL is assembled manually instead
    final_url <- api_endpoint

    if (!is.na(q)) final_url <- paste0(final_url, sprintf("q=%s&", q))
    if (!is.na(ids)) final_url <- paste0(final_url, sprintf("ids=%s&", ids))
    if (!is.na(size)) final_url <- paste0(final_url, sprintf("size=%s&", size_to_fetch)) # note differs in each batch
    if (!is.na(fields)) final_url <- paste0(final_url, sprintf("fields=%s&", fields))
    if (!is.na(sort)) final_url <- paste0(final_url, sprintf("sort=%s&", sort))
    #if (!is.na(sort_type))  final_url <- paste0(final_url, sprintf("sort_type=%s&", sort_type))
    if (!is.na(author)) final_url <- paste0(final_url, sprintf("author=%s&", author))
    if (!is.na(subreddit)) final_url <- paste0(final_url, sprintf("subreddit=%s&", subreddit))
    if (!is.na(after)) final_url <- paste0(final_url, sprintf("after=%s&", after))
    if (!is.na(before)) final_url <- paste0(final_url, sprintf("before=%s&", before))
    if (!is.na(metadata)) final_url <- paste0(final_url, sprintf("metadata=%s&", tolower(metadata)))


    # get response in a loop to check status codes.
    # status 429 means we hit the rate limit: pause for increasing numbers of seconds and try 5 times
    # if we get a good result, or after the fifth try, proceed.
    # if the result is good, add it to the results.
    # if the result is not good, return whatever we have so that we don't lose results to date.
    times <- 0
    while (times < 5){
      times <- times + 1
      if (verbose) message(sprintf("Calling API endpoint: %s", final_url))
      resp <- httr::GET(final_url)

      # if we didn't get a rate limit warning, exit the loop
      if (httr::status_code(resp) != 429) break;

      # otherwise optionally alert the user and pause
      if (verbose) message(sprintf("Try %s: Got error code 429. Waiting for %s seconds...", times, times))
      Sys.sleep(times)
    }

    # If the status code is anything other than 200, warn with the API response for debugging
    # and stop requesting further batches; any results collected so far are still returned
    if (httr::status_code(resp) != 200){
      warning(sprintf("Pushshift API returned error code %s in batch %s of %s. Returning any results already collected.\nFirst few lines of response:\n%s",
                      httr::status_code(resp), batch, batches,
                      httr::content(resp, type = "text/json", encoding = "UTF-8")))
      break
    } else {

      # Only do this if we got a good response, status code 200

      # parse response
      if (verbose) message("API response good, parsing results..")
      response <- httr::content(resp, type = "text/json", encoding = "UTF-8") %>%
        jsonlite::fromJSON()

      response_data <- response$data %>%
        dplyr::as_tibble() %>%
        dplyr::mutate(dplyr::across(where(is.list), function(x) paste0(as.character(unlist(x)), collapse = ", " ) ))
      #dplyr::mutate(dplyr::across(where(is.list), function(x) stringr::str_flatten(as.character(unlist(x)), collapse = ", " ) ))

      # if we got nothing back, there are no earlier results to fetch: stop batching now
      if (nrow(response_data) == 0) {
        if (verbose) message("No results found. Stopping now.")
        break
      }

      # add what we found
      all_results <- dplyr::bind_rows(all_results, response_data)

      # update our "before" parameter to only look for earlier results
      before <- min(response_data$created_utc)

    } # end if good response status 200

    # do a pause if we're not at the last batch
    if (batch < batches) {
      if (verbose) message(sprintf("Pausing for %f seconds...", batch_pause))
      Sys.sleep(batch_pause)
    }

  } # end of batch loop

  # remove any duplicates across batches
  all_results <- dplyr::distinct(all_results)

  # if nothing at all was found, return the empty tibble as-is
  if (nrow(all_results) == 0) return(all_results)

  # fix some common encoding artifacts in the comment body: < and > come back as "&lt;" and "&gt;"
  if ("body" %in% names(all_results)) {
    all_results <- all_results %>%
      dplyr::mutate(body = gsub(x = body, pattern = "&gt;", replacement = ">"),
                    body = gsub(x = body, pattern = "&lt;", replacement = "<"))
  }

  # optionally, parse the Unix timestamps into human-readable date-times
  if (parse_utc) {
    all_results <- all_results %>%
      dplyr::mutate(created_datetime = as.POSIXct(created_utc, origin = "1970-01-01"))
  }

  return(all_results)

}
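
# Illustrative usage sketch (not run; results depend on the live Pushshift index,
# and the parameter values below are made up):
#   dat <- get_reddit_comments(q = "espresso", subreddit = "coffee",
#                              size = 250, batch_pause = 2)
#   # size = 250 is fetched in 3 batches of 100, 100 and 50 results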



# https://www.reddit.com/r/redditdev/comments/ldje2i/too_many_requests_error_even_though_im_sending/

# Fetch basic information about a subreddit (name, url, public description,
# subscriber count, and description) from Reddit's own about.json endpoint.
get_subreddit_info <- function(subreddit, verbose = TRUE){
  # TODO: input validation on subreddit
  # subreddit should be a single subreddit name as a character string, without the leading "r/"

  warning("Reddit's API allows 60 requests/minute for registered apps, and this app is not registered yet, so call sparingly.")

  url <- paste0("https://www.reddit.com/r/",subreddit,"/about.json")

  resp <- httr::GET(url, httr::user_agent(ua_string))


  # on error, send warning, return API response for debugging
  if (httr::status_code(resp) != 200){
    stop (sprintf("Error code %s.\nFirst few lines of response:\n%s",
                  httr::status_code(resp),
                  httr::content(resp, type = "text/json", encoding = "UTF-8")))
  }

  # parse response
  if (verbose) message("API response good, parsing results..")
  response <- httr::content(resp, type = "text/json", encoding = "UTF-8") %>%
    jsonlite::fromJSON()

  response_data <- response$data %>%
    purrr::map(unlist) %>%
    purrr::map(function(x) if (is.null(x)) {""} else {x}) %>%
    #purrr::map(function(x) if(length(x) > 1) {stringr::str_flatten(x, collapse = ",")} else {x}) %>%
    purrr::map(function(x) if(length(x) > 1) {paste0(x, collapse = ",")} else {x}) %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(subreddit = subreddit) %>%
    dplyr::select(subreddit, url, public_description, subscribers, description)


  return(response_data)

}
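
# Illustrative usage sketch (not run; field values come from Reddit's about.json):
#   get_subreddit_info("rstats")
#   # -> a one-row tibble with columns subreddit, url, public_description,
#   #    subscribers, and description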


# https://bookdown.org/Maxine/tidy-text-mining/tokenizing-by-n-gram.html