R/scrape.R

Defines functions m_scrape

#' @importFrom httr http_error content modify_url parse_url
m_scrape <- function(bow, query=NULL, params=NULL, accept="html", content=NULL, verbose=FALSE) { # nolint

  if(!inherits(bow, "polite"))
    stop("Please be polite: bow then scrape!")

  stopifnot(is.list(query) || is.null(query))

  if(!is.null(query))
    bow$url <- httr::modify_url(bow$url, query=query)
  else{
  if(!is.null(params)){
    warning("Argument `params` is deprecated. Please, use `query` passing parameters as a named list.", call.=FALSE)
    params <- gsub("^.*\\?", "", params) # this will delete url prefix if any
    query <- httr::parse_url(paste0("http://www.example.com/q?", params))$query
    bow$url <- httr::modify_url(bow$url, query=query)
   }
  }

  if(!is_scrapable(bow)){
    warning("No scraping allowed here!", call. = FALSE)
    return(NULL)
  }

  if(substr(accept,1,1)!="." && !grepl("/", accept)){
    accept <- paste0(".", accept)
  }

  accept_type <- httr::accept(accept)
  bow$config <- c(bow$config, accept_type)

  response <- httr_get_ltd(bow$url, bow$config, bow$handle, bow$times, verbose)

  if(httr::http_error(response)){
    warning(httr::http_status(response)$message, " ", bow$url, call. = FALSE)
    return(NULL)
  }

  content <- content %otherwise% response$headers$`content-type`

  res <- tryCatch(
    {
      httr::content(response, type = content)
    },
    error=function(cond){
      warning(paste0("<polite session> Encountered an error while parsing content.\n",
              "There seems to be a mismatch of content type or encoding or both.\n",
              "The server says it is serving:", response$headers$`content-type`, "\n",
              "Here's the text of the error generated by httr::content(): \n", cond, "\n",
              "Returning raw vector which can be parsed with rawToChar(). Good luck!\n"), call. = FALSE)
      return(httr::content(response, as = "raw"))
    }
  )
  res
}


#' Scrape the content of authorized page/API
#'
#' @param bow host introduction object of class `polite`, `session` created by `bow()` or `nod()`
#' @param query named list of parameters to be appended to URL in the format `list(param1=valA, param2=valB)`
#' @param params deprecated. Use `query` argument above.
#' @param accept character value of expected data type to be returned by host (e.g. `html`, `json`, `xml`, `csv`, `txt`, etc.)
#' @param content MIME type (aka internet media type) used to override the content type returned by the server.
#' See http://en.wikipedia.org/wiki/Internet_media_type for a list of common types. You can add the `charset` parameter to override the server's default encoding
#' @param verbose extra feedback from the function. Defaults to `FALSE`
#'
#' @return Object of class `httr::response` which can be further processed by functions in `rvest` package
#'
#' @examples
#' \donttest{
#'  library(rvest)
#'   bow("https://en.wikipedia.org/wiki/List_of_cognitive_biases") %>%
#'    scrape(content="text/html; charset=UTF-8") %>%
#'    html_nodes(".wikitable") %>%
#'    html_table()
#'}
#'
#' @export
scrape <- memoise::memoise(m_scrape)

Try the polite package in your browser

Any scripts or data that you put into this service are public.

polite documentation built on July 9, 2023, 5:21 p.m.