R/SearchSubreddit.R

#' Search Subreddit Function
#'
#' This function searches Reddit for a search term within a specific
#' subreddit and returns a data frame where each row is a thread and
#' each column is an attribute of that thread.
#' @param searchTerm What you're searching for
#' @param subreddit The subreddit in which you're searching (defaults to "all")
#' @keywords reddit subreddit search API
#' @export
#' @examples
#' SearchSubreddit("cats with dogs", "CatsStandingUp")

SearchSubreddit <- function(searchTerm, subreddit = "all") {
  library(tidyverse)
  library(httr)
  ### Accessory function to remove list fields from thread data
  CleanUpLists <- function(x) {
    types <- map_chr(x, typeof)
    ### Replace NULL fields with NA so they aren't dropped during coercion
    x[which(types == "NULL")] <- NA
    ### Drop list fields, guarding against the case where there are none
    listFields <- which(types == "list")
    if (length(listFields) > 0) x <- x[-listFields]
    output <- lapply(x, as.character)
    return(output)
  }
  ### The initial API call to get search results
  initialURL <- paste0("https://oauth.reddit.com/r/", subreddit,
                       "/search.json?q=", utils::URLencode(searchTerm, reserved = TRUE),
                       "&sort=new&type=link&restrict_sr=TRUE&t=all&raw_json=1&limit=100")

  ### Perform the search API call
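  ### NOTE: `token` is assumed to be an OAuth2 token object created during
  ### authentication elsewhere in the package (see the sketch at the bottom
  ### of this file); it is not defined within this function.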
  response <- GET(initialURL,
                  user_agent("Reddit Comment Scraper"),
                  config(token = token)) %>% content()

  threads <- map_dfr(response$data$children, ~ (CleanUpLists(.$data)))

  ### Print status update
  searchPage <- 1
  print(paste("Finished page", searchPage, "of results for", searchTerm))

  ### Init newThreads for the while loop.
  newThreads <- tibble()

  ### Keep paging until a page returns fewer than the maximum number of
  ### search results (100), up to a limit of 10 pages
  while ((nrow(newThreads) == 100 || nrow(threads) == 100) && searchPage < 10) {
    ### Count how many pages deep we're going
    searchPage <- searchPage + 1
    print(paste("Getting page", searchPage, "of results for", searchTerm))
    # Pause to respect API rules.
    Sys.sleep(1.1)

    ### Get the fullname of the last thread in the search results
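    ### (a "fullname" is Reddit's prefixed ID for an object, e.g. "t3_abc123"
    ### for a link; the API uses it as a pagination cursor via `after`)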
    lastThread <- tail(threads$name, n = 1L)

    ### Append the last thread fullname to the search URL
    nextURL <- paste0(initialURL, "&after=", lastThread)

    ### Get the next set of search results
    response <- GET(nextURL,
                    user_agent("Reddit Comment Scraper"),
                    config(token = token)) %>% content()

    ### Extract the data
    newThreads <- map_dfr(response$data$children, ~ (CleanUpLists(.$data)))

    ### Combine the old data with the new
    threads <- bind_rows(threads, newThreads)
  }
  print(paste("Done searching for", searchTerm, "after", searchPage, "pages"))

  return(suppressMessages(type_convert(threads, col_types = RedditColTypes("thread"))))
}
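
### The API calls above rely on a global `token` object that this file never
### defines; it is presumably created during package authentication. Below is
### a minimal sketch of how such a token could be obtained with httr's OAuth2
### helpers. The app name, client ID, and client secret are placeholders, not
### values from this package.
# reddit_endpoint <- httr::oauth_endpoint(
#   authorize = "https://www.reddit.com/api/v1/authorize",
#   access = "https://www.reddit.com/api/v1/access_token"
# )
# reddit_app <- httr::oauth_app("screddr", key = "YOUR_CLIENT_ID",
#                               secret = "YOUR_CLIENT_SECRET")
# token <- httr::oauth2.0_token(reddit_endpoint, reddit_app,
#                               scope = "read", use_basic_auth = TRUE)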