#' @export htmlToText
htmlToText <- function(input, ...) {
# code from Tony Breyal (original available at https://github.com/tonybreyal/Blog-Reference-Functions/tree/master/R)
###---PACKAGES ---###
require(RCurl)
require(XML)
###--- LOCAL FUNCTIONS ---###
# Determine how to grab html for a single input element
evaluate_input <- function(input) {
# if input is a .html file
if(file.exists(input)) {
char.vec <- readLines(input, warn = FALSE)
return(paste(char.vec, collapse = ""))
}
# if input is html text
if(grepl("</html>", input, fixed = TRUE)) return(input)
# if input is a URL, probably should use a regex here instead?
if(!grepl(" ", input)) {
# downolad SSL certificate in case of https problem
if(!file.exists("cacert.perm")) download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.perm")
return(getURL(input, followlocation = TRUE, cainfo = "cacert.perm"))
}
# return NULL if none of the conditions above apply
return(NULL)
}
# convert HTML to plain text
convert_html_to_text <- function(html) {
doc <- htmlParse(html, asText = TRUE)
text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)
return(text)
}
# format text vector into one character string
collapse_text <- function(txt) {
return(paste(txt, collapse = " "))
}
###--- MAIN ---###
# STEP 1: Evaluate input
html.list <- lapply(input, evaluate_input)
# STEP 2: Extract text from HTML
text.list <- lapply(html.list, convert_html_to_text)
# STEP 3: Return text
text.vector <- sapply(text.list, collapse_text)
return(text.vector)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.