R/scrape-index2df.R

#' Extract data from multiple index pages of a PTT board.
#'
#' \code{index2df} scrapes the index pages of a board
#' ("\emph{\href{https://liao961120.github.io/pttR/articles/translation}{kan ban}}")
#' and extracts the information into a data frame.
#'
#' @param board Character. Either a \strong{URL} or a
#'   \strong{board name}, such as "\emph{Gossiping}",
#'   "\emph{Baseball}", "\emph{LoL}".
#'   \strong{board name} is case-insensitive. See
#'   \strong{Examples} for details.
#'   \code{board} has different requirements when used
#'   with the argument \code{search_term} (see below).
#' @param newest Integer. Number of pages, starting from
#'   the most recent page, to scrape.
#'   Defaults to \code{1}, which scrapes only the
#'   newest page. If set to \code{2}, the newest and the
#'   second-newest pages are scraped, and so forth.
#' @param pages Integer vector. A vector of index page number(s).
#'   This parameter lets you scrape index pages by providing
#'   their page numbers. Be careful not to provide
#'   numbers exceeding the range of existing index pages.
#'   Defaults to \code{NA}.
#' @param search_term Character. A term to search for in the index,
#'   such as
#'   "\emph{\href{https://liao961120.github.io/pttR/articles/translation}{lu she}}".
#'   There are also some advanced search methods:
#'   \describe{
#'   \item{Post thread}{Prepend "\emph{thread:}" to the
#'   search term (post title): "\emph{thread:<post-title>}".}
#'   \item{Posts of an author}{Prepend "\emph{author:}"
#'   to the author's ID, e.g., "\emph{author:Plumage}".}
#'   }
#' @param search_page Integer vector. A vector of index page
#'   number(s). With argument \code{search_term} set,
#'   \code{search_page} lets you scrape index pages related
#'   to a specific term. Defaults to \code{1}, which scrapes only
#'   the first page of the search results.
#'
#' @return A data frame with one post's information per row.
#'
#' @examples
#' # Get data from 'Gossiping'
#' index_df <- index2df("Gossiping")
#' head(index_df)
#'
#' \dontrun{
#' # Or use URL directly
#' link <- "https://www.ptt.cc/bbs/Gossiping/index.html"
#'
#' index_df <- index2df(link)
#' }
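#'
#' \dontrun{
#' # Further usage sketches (not run; board and page values are illustrative)
#' # Scrape the three newest index pages
#' index_df <- index2df("Gossiping", newest = 3)
#'
#' # Scrape specific index pages by their page numbers
#' index_df <- index2df("Gossiping", pages = 100:102)
#'
#' # Search the index, e.g., for posts by a given author
#' index_df <- index2df("Gossiping", search_term = "author:Plumage",
#'                      search_page = 1:2)
#' }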
#' @section Warning:
#' \strong{Do not request too many pages at a time}.
#' Doing so places a heavy load on the server.
#'
#' @seealso \code{\link{get_index_info}}, which extracts data from
#'   \emph{one} index page, while \code{index2df} deals with
#'   \emph{several}. In addition, \code{index2df} has more
#'   functionality for multi-page extraction.
#'
#' @export
index2df <- function(board, newest = 1, pages = NA,
                     search_term = NA, search_page = 1) {
  stopifnot(is.character(board),
            length(newest) == 1,
            is.numeric(search_page))

  board <- parse_board(board)
  search_case <- chk_idx_mode(board, newest, pages,
                              search_term, search_page)

  # Return df with idx and URLs
  if (search_case == "custom_pages") {
    df0 <- custom_idx_url(board, pages)
  } else if (search_case == "term_search") {
    df0 <- term_idx_url(board, search_term, search_page)
  } else if (search_case == "newest_pages") {
    df0 <- newest_idx_url(board, newest)
  } else stop("No URLs found")


  # IMPORTANT: coerce factor to chr,
  # or as.integer(factor) changes the value
  idx_n <- as.integer(as.character(df0$idx_n))
  urls <- df0$url

  # Combine individual dfs to one df
  df <- vector("list", length = length(urls))

  for (i in seq_along(urls)) {
    temp <- get_index_info(urls[i])
    temp$idx_n <- idx_n[i]
    df[[i]] <- temp
  }
  df <- dplyr::bind_rows(df)

  return(df)
}


# Helper Functions of index2df ------
# Input Checking --------------

## Determine the mode of index2df
## One of: "custom_pages", "term_search", "newest_pages", stop
chk_idx_mode <- function(board, newest, pages,
                         search_term, search_page) {
  if (!anyNA(pages)) {
    cat("Searching custom range of index pages ...\n")
    case <- "custom_pages"
  } else if (!anyNA(search_term)) {
    cat("Searching ", board, " for '",
        search_term, "' ...\n", sep = "")
    case <- "term_search"
  } else if (is.numeric(newest)) {
    if (newest <= 0) stop("Argument 'newest' must be a positive integer")
    cat("Searching the newest", newest, "page(s) of",
        board, "...\n")
    case <- "newest_pages"
  } else stop("Input error.\n")

  return(case)
}
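
## Illustrative (not evaluated) calls showing the mode precedence
## (pages > search_term > newest); the board name is just an example value:
##   chk_idx_mode("Gossiping", newest = 1, pages = 10:12,
##                search_term = NA, search_page = 1)                 # "custom_pages"
##   chk_idx_mode("Gossiping", newest = 1, pages = NA,
##                search_term = "author:Plumage", search_page = 1)   # "term_search"
##   chk_idx_mode("Gossiping", newest = 3, pages = NA,
##                search_term = NA, search_page = 1)                 # "newest_pages"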

## Parse board input
#' @importFrom stringr str_detect str_remove
#' @keywords internal
parse_board <- function(board) {
  # Strip the scheme, host, and trailing path so only the board name remains
  if (str_detect(board, "^http")) {
    board <- str_remove(board, "^https?://www\\.ptt\\.cc/bbs/") %>%
      str_remove("/.+$")
  }
  if (str_detect(board, "\\.html$")) stop("Not a board URL")

  return(board)
}
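
## Illustrative (not evaluated) calls:
##   parse_board("https://www.ptt.cc/bbs/Gossiping/index.html")  # "Gossiping"
##   parse_board("Gossiping")                                    # "Gossiping"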

## Page Not Found error
#' @importFrom magrittr %>%
#' @importFrom rvest html_node html_text
board_search_error <- function(url) {
  not_found <- read_html2(url) %>%
    html_node("head") %>%
    html_node("title") %>%
    html_text() %>%
    str_detect("^404")
  error_message <- c("Page Not Found:\n",
                     "invalid board name, or\n",
                     "search range exceeds the available index pages\n")
  if (not_found) stop(error_message)
}

# Constructing a data frame of URLs and index_page_nums -----
## Search by custom pages
custom_idx_url <- function(board, pages) {
  idx_n <- pages

  urls <- paste0("https://www.ptt.cc/bbs/", board,
                 "/index", idx_n, ".html")
  url_limit <- paste0("https://www.ptt.cc/bbs/", board,
                      "/index", max(idx_n), ".html")
  board_search_error(url_limit)

  df <- tibble::data_frame(idx_n, url = urls)
  return(df)
}
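
## Illustrative (not evaluated): custom_idx_url("Gossiping", 100:102) returns a
## two-column tibble (idx_n, url) whose 'url' column holds
##   https://www.ptt.cc/bbs/Gossiping/index100.html ... index102.html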

## Search by term
#' @importFrom utils URLencode head
term_idx_url <- function(board, search_term, search_page) {
  idx_n <- search_page
  term <- search_term

  urls <- paste0("https://www.ptt.cc/bbs/", board,
                 "/search?page=", idx_n,
                 "&q=", URLencode(term))
  url_limit <- paste0("https://www.ptt.cc/bbs/", board,
                      "/search?page=", max(idx_n),
                      "&q=", URLencode(term))
  board_search_error(url_limit)

  df <- tibble::data_frame(idx_n, url = urls)
  return(df)
}
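
## Illustrative (not evaluated): term_idx_url("Gossiping", "pttR", 1:2) builds
##   https://www.ptt.cc/bbs/Gossiping/search?page=1&q=pttR
##   https://www.ptt.cc/bbs/Gossiping/search?page=2&q=pttR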

## Search newest index
newest_idx_url <- function(board, newest) {
  idx_url <- get_index_url(paste0("https://www.ptt.cc/bbs/",
                                  board, "/index.html"))
  cat("The newest index page of '", board, "' is ", idx_url[1],
      ".\n", idx_url[2], "\n", sep = "")

  idx_n <- seq(from = as.integer(idx_url[1]),
               to = as.integer(idx_url[1]) - newest + 1)
  if (min(idx_n) <= 0) stop("Requested range exceeds the number of existing index pages.")

  urls <- paste0("https://www.ptt.cc/bbs/", board,
                 "/index", idx_n, ".html")

  df <- tibble::data_frame(idx_n, url = urls)
  return(df)
}
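
## Illustrative (not evaluated): if the newest index page of the board is 39000
## and newest = 3, idx_n is 39000, 38999, 38998 and the returned tibble holds
## the corresponding indexNNNNN.html URLs in the same (descending) order.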