R/hu-read-html.R

Defines functions hu_read_html

Documented in hu_read_html

#' Read HTML from a URL with Browser Emulation & in a JavaScript Context
#'
#' Use a JavaScript-enabled browser context to read and render HTML from a URL.
#'
#' For the code in the examples, this is the site that is being scraped:
#'
#' \if{html}{
#' \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"}
#' }
#'
#' \if{latex}{
#' \figure{test-url-table.png}{options: width=10cm}
#' }
#'
#' Note that it has a table of values but it is rendered via JavaScript.
#'
#' @param url URL to retrieve
#' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`",
#'        "`ie`", "`edge`"
#' @param ret what to return; if `html_document` (the default) then the HTML created
#'        by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()]
#'        and an `xml2` `html_document`/`xml_document` is returned. Note that this causes
#'        further HTML processing by `xml2`/`libxml2` so is not _exactly_ what
#'        `HtmlUnit` generated. If you want the HTML code (text) without any further
#'        processing then use `text` as the value.
#' @param js_delay time (ms) to allow JavaScript loaded by the page to execute
#'        once the page has been fetched; default is 2 seconds (2000 ms)
#' @param timeout overall timeout (ms); `0` == infinite wait (not recommended); note: the
#'        timeout is used twice: first in making the socket connection,
#'        second for data retrieval. If the time is critical you must
#'        allow for twice the time specified here. Default 30s (30000 ms)
#' @param ignore_ssl_errors Should SSL/TLS errors be ignored. The default (`TRUE`) is
#'        a current hack due to how `HtmlUnit` seems to handle virtual hosted sites
#'        with multiple vhosts and multiple certificates. You can try it with `FALSE`
#'        initially and revert back to `TRUE` if you encounter issues.
#' @param enable_dnt Enable the "Do Not Track" header. Default: `FALSE`.
#' @param download_images Download images as the page is loaded? Since this
#'        function is a high-level wrapper designed to do a read of HTML,
#'        it is recommended that you leave this the default `FALSE` to save
#'        time/bandwidth.
#' @param options options to pass to [xml2::read_html()] if `ret` == `html_document`.
#' @return an `xml2` `html_document`/`xml_document` if `ret` == `html_document` else
#'         the HTML document text generated by `HtmlUnit`.
#' @export
#' @examples \dontrun{
#' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
#' hu_read_html(test_url)
#' }
hu_read_html <- function(url,
                         emulate = c("best", "chrome", "firefox", "ie", "edge"),
                         ret = c("html_document", "text"),
                         js_delay = 2000L,
                         timeout = 30000L,
                         ignore_ssl_errors = TRUE,
                         enable_dnt = FALSE,
                         download_images = FALSE,
                         options = c("RECOVER", "NOERROR", "NOBLANKS")) {

  # Choices are taken from the defaults in the signature above.
  emulate <- match.arg(emulate)
  ret <- match.arg(ret)

  available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")

  use_browser <- switch(
    emulate,
    best = available_browsers$BEST_SUPPORTED,
    chrome = available_browsers$CHROME,
    firefox = available_browsers$FIREFOX,
    edge = available_browsers$EDGE,
    ie = available_browsers$INTERNET_EXPLORER
  )

  wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)

  # The client is private to this call, so always close it on the way out
  # (normal return or error) instead of leaving it for the JVM to clean up.
  # The return value is fully materialised in R before this handler runs.
  on.exit(wc$close(), add = TRUE)

  # Install the package's own CSS-error and incorrectness handlers.
  wc$setCssErrorHandler(.jnew("is.rud.htmlunit.RDefaultCssErrorHandler"))
  wc$setIncorrectnessListener(.jnew("is.rud.htmlunit.RIncorrectnessListener"))

  # Do not raise on non-2xx responses or page script errors; return whatever
  # the emulated browser ended up with.
  wc_opts <- wc$getOptions()
  wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
  wc_opts$setThrowExceptionOnScriptError(FALSE)
  wc_opts$setTimeout(as.integer(timeout))

  if (ignore_ssl_errors) wc_opts$setUseInsecureSSL(TRUE)
  if (enable_dnt) wc_opts$setDoNotTrackEnabled(TRUE)
  if (download_images) wc_opts$setDownloadImages(TRUE)

  pg <- wc$getPage(url)

  # Wait for background JavaScript only AFTER the page is loaded: before the
  # fetch there are no scheduled jobs, so waiting earlier returns immediately
  # and `js_delay` would have no effect.
  wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))

  if (ret == "html_document") {
    return(xml2::read_html(pg$asXml(), options = options))
  }

  # NOTE(review): the docs describe this result as HTML text, but the value
  # comes from `asText()` rather than `asXml()` -- confirm which is intended.
  pg$asText()

}
hrbrmstr/htmlunit documentation built on Aug. 19, 2020, 3:05 p.m.