R/xml_parse.R

Defines functions download_xml read_xml.response read_xml.connection read_xml.raw read_xml.character read_html.response read_html.default read_html read_xml

Documented in download_xml read_html read_xml read_xml.character read_xml.connection read_xml.raw

#' Read HTML or XML.
#'
#' @section Setting the "user agent" header:
#'
#' When performing web scraping tasks it is both good practice --- and often required ---
#' to set the [user agent](https://en.wikipedia.org/wiki/User_agent) request header
#' to a specific value. Sometimes this value is assigned to emulate a browser in order
#' to have content render in a certain way (e.g. `Mozilla/5.0 (Windows NT 5.1; rv:52.0)
#' Gecko/20100101 Firefox/52.0` to emulate more recent Windows browsers). Most often,
#' this value should be set to provide the web resource owner information on who you are
#' and the intent of your actions like this Google scraping bot user agent identifier:
#' `Googlebot/2.1 (+http://www.google.com/bot.html)`.
#'
#' You can set the HTTP user agent for URL-based requests using [httr::set_config()] and [httr::user_agent()]:
#'
#' `httr::set_config(httr::user_agent("me@@example.com; +https://example.com/info.html"))`
#'
#' [httr::set_config()] changes the configuration globally,
#' [httr::with_config()] can be used to change configuration temporarily.
#'
#' @param x A string, a connection, or a raw vector.
#'
#'   A string can be either a path, a url or literal xml. Urls will
#'   be converted into connections either using `base::url` or, if
#'   installed, `curl::curl`. Local paths ending in `.gz`,
#'   `.bz2`, `.xz`, `.zip` will be automatically uncompressed.
#'
#'   If a connection, the complete connection is read into a raw vector before
#'   being parsed.
#' @param encoding Specify a default encoding for the document. Unless
#'   otherwise specified XML documents are assumed to be in UTF-8 or
#'   UTF-16. If the document is not UTF-8/16, and lacks an explicit
#'   encoding directive, this allows you to supply a default.
#' @param ... Additional arguments passed on to methods.
#' @param as_html Optionally parse an xml file as if it's html.
#' @param base_url When loading from a connection, raw vector or literal
#'   html/xml, this allows you to specify a base url for the document. Base
#'   urls are used to turn relative urls into absolute urls.
#' @param n If `file` is a connection, the number of bytes to read per
#'    iteration. Defaults to 64kb.
#' @param verbose When reading from a slow connection, this prints some
#'    output on every iteration so you know its working.
#' @param options Set parsing options for the libxml2 parser. Zero or more of
#' \Sexpr[results=rd]{xml2:::describe_options(xml2:::xml_parse_options())}
#' @return An XML document. HTML is normalised to valid XML - this may not
#'   be exactly the same transformation performed by the browser, but it's
#'   a reasonable approximation.
#' @export
#' @examples
#' # Literal xml/html is useful for small examples
#' read_xml("<foo><bar /></foo>")
#' read_html("<html><title>Hi<title></html>")
#' read_html("<html><title>Hi")
#'
#' # From a local path
#' read_html(system.file("extdata", "r-project.html", package = "xml2"))
#'
#' \dontrun{
#' # From a url
#' cd <- read_xml(xml2_example("cd_catalog.xml"))
#' me <- read_html("http://had.co.nz")
#' }
read_xml <- function(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") {
  UseMethod("read_xml")
}

#' @export
#' @rdname read_xml
read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
  UseMethod("read_html")
}

#' @export
read_html.default <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
  options <- parse_options(options, xml_parse_options())

  suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
}

#' @export
read_html.response <- function(x, encoding = "", options = c("RECOVER",
    "NOERROR", "NOBLANKS"), ...) {
  need_package("httr")

  options <- parse_options(options, xml_parse_options())

  httr::stop_for_status(x)
  content <- httr::content(x, as = "raw")
  xml2::read_html(content, encoding = encoding, options = options, ...)
}

#' @export
#' @rdname read_xml
read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
                               options = "NOBLANKS") {

  if (length(x) == 0) {
    stop("Document is empty", call. = FALSE)
  }

  if (length(x) > 1) {
    stop("`x` must be a string of length 1", call. = FALSE)
  }

  options <- parse_options(options, xml_parse_options())
  if (grepl("<|>", x)) {
    read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html, options = options)
  } else {
    con <- path_to_connection(x)
    if (inherits(con, "connection")) {
      read_xml.connection(con, encoding = encoding, ..., as_html = as_html,
        base_url = x, options = options)
    } else {
      doc <- .Call(doc_parse_file, con, encoding = encoding, as_html = as_html,
        options = options)
      xml_document(doc)
    }
  }
}

#' @export
#' @rdname read_xml
read_xml.raw <- function(x, encoding = "", base_url = "", ...,
                         as_html = FALSE, options = "NOBLANKS") {
  options <- parse_options(options, xml_parse_options())

  doc <- .Call(doc_parse_raw, x, encoding, base_url, as_html, options)
  xml_document(doc)
}

#' @export
#' @rdname read_xml
read_xml.connection <- function(x, encoding = "", n = 64 * 1024,
                                verbose = FALSE, ..., base_url = "",
                                as_html = FALSE, options = "NOBLANKS") {
  options <- parse_options(options, xml_parse_options())

  if (!isOpen(x)) {
    open(x, "rb")
    on.exit(close(x))
  }

  raw <- .Call(read_connection_, x, n)
  read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html =
    as_html, options = options)
}

#' @export
read_xml.response <- function(x, encoding = "", base_url = "", ...,
                              as_html = FALSE, options = "NOBLANKS") {
  need_package("httr")

  options <- parse_options(options, xml_parse_options())
  httr::stop_for_status(x)
  content <- httr::content(x, as = "raw")
  xml2::read_xml(content, encoding = encoding, base_url = if (nzchar(base_url)) base_url else x$url,
    as_html = as_html, option = options, ...)
}

#' Download a HTML or XML file
#'
#' Libcurl implementation of `C_download` (the "internal" download method)
#' with added support for https, ftps, gzip, etc. Default behavior is identical
#' to [download.file()], but request can be fully configured by passing
#' a custom [curl::handle()].
#' @inherit curl::curl_download
#' @param file A character string with the name where the downloaded file is
#'   saved.
#' @seealso [curl_download][curl::curl_download]
#' @export
#' @examples
#' \dontrun{
#' download_html("http://tidyverse.org/index.html")
#' }
download_xml <- function(url, file = basename(url), quiet = TRUE, mode = "wb",
  handle = curl::new_handle()) {

  if (!requireNamespace("curl", quietly = TRUE)) {
    stop("`curl` must be installed to use `download_xml()`", call. = FALSE)
  }

  curl::curl_download(url, file, quiet = quiet, mode = mode, handle = handle)

  invisible(file)
}

#' @export
#' @rdname download_xml
download_html <- download_xml

Try the xml2 package in your browser

Any scripts or data that you put into this service are public.

xml2 documentation built on July 9, 2023, 6:44 p.m.