Nothing
#' Read in .html Content
#'
#' Read in the content from a .html file. This is generalized, reading in all
#' body text. For finer control the user should utilize the \pkg{xml2} and
#' \pkg{rvest} packages.
#'
#' @param file The path to the .html file.
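#' @param skip The number of leading text elements to skip. When
#' `remove.empty = TRUE` the empty elements are dropped first, so only the
#' remaining elements are counted.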
#' @param remove.empty logical. If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical. If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... Other arguments passed to [xml2::read_html()][xml2::read_xml].
#' @return Returns a character vector.
#' @keywords html
#' @rdname read_html
#' @export
#' @references The xpath is taken from Tony Breyal's response on StackOverflow:
#' <https://stackoverflow.com/questions/3195522/is-there-a-simple-way-in-r-to-extract-only-the-text-elements-of-an-html-page/3195926#3195926>
#' @examples
#' html_dat <- read_html(
#' system.file("docs/textreadr_creed.html", package = "textreadr")
#' )
#'
#' \dontrun{
#' url <- "http://www.talkstats.com/index.php"
#' file <- download(url)
#' (txt <- read_html(url))
#' (txt <- read_html(file))
#' }
read_html <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

    ## Parse the document; everything in `...` is handed to xml2 unchanged.
    parsed <- xml2::read_html(file, ...)

    ## Collect every text node below <body> that is not nested inside a
    ## <script>, <style>, or <noscript> element, then pull out its text.
    text_xpath <- "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]"
    text_nodes <- rvest::html_nodes(parsed, xpath = text_xpath)
    out <- rvest::html_text(text_nodes)

    ## Whitespace-only elements are dropped before skipping, so `skip` counts
    ## only the elements that survive this filter.
    if (isTRUE(remove.empty)) {
        is_blank <- grepl("^\\s*$", out)
        out <- out[!is_blank]
    }

    ## Discard the first `skip` elements (no-op when `skip` is not positive).
    if (skip > 0) {
        drop_idx <- seq(skip)
        out <- out[-drop_idx]
    }

    ## Strip leading/trailing white space from what is left.
    if (isTRUE(trim)) {
        out <- trimws(out)
    }

    ## Never hand back a zero-length vector; use a single empty string instead.
    if (length(out) == 0) {
        return('')
    }

    out
}
# Alias: `read_xml` is bound to the same function object as `read_html`, so it
# takes the same arguments and behaves identically; `@rdname` places it on the
# `read_html` help page.
#' @rdname read_html
#' @export
read_xml <- read_html
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.