#' Facilitates extraction of contents from an html file
#'
#' @param html_document An html document parsed with `xml2::read_html()` or
#' `rvest::read_html()`.
#' @param container Defaults to NULL. Type of html container from where links
#' are to be extracted, such as "div", "ul", and others. Either
#' `container_class` or `container_id` must also be provided.
#' @param container_class Defaults to NULL. If provided, also `container` must
#' be given (and `container_id` must be NULL). Only text found inside the
#' provided combination of container/class will be extracted.
#' @param container_id Defaults to NULL. If provided, also `container` must be
#' given (and `container_id` must be NULL). Only text found inside the
#' provided combination of container/class will be extracted.
#' @param container_itemprop Defaults to NULL. If provided, also `container`
#' must be given (and `container_id` and `container_class` must be NULL or
#' will be silently ignored). Only text found inside the provided combination
#' of container/itemprop will be extracted.
#' @param container_instance Defaults to NULL. If given, it must be an integer.
#' If a given combination is found more than once in the same page, the
#' relevant occurrence is kept. Use with caution, as not all pages always
#' include the same number of elements of the same class/with the same id.
#' @param sub_element Defaults to NULL. If provided, also `container` must be
#' given. Only text within elements of given type under the chosen combination
#' of container/containerClass will be extracted. When given, it will
#' tipically be "p", to extract all p elements inside the selected div.
#' @param no_children Defaults to FALSE, i.e. by default all subelements of the
#' selected combination (e.g. div with given class) are extracted. If TRUE,
#' only text found under the given combination (but not its subelements) will
#' be extracted. Corresponds to the xpath string `/node()[not(self::div)]`.
#' @param attribute Defaults to NULL. If given, type of attribute to extract.
#' Typically used in combination with container, as in
#' `cas_extract_html(container = "time", attribute = "datetime")`.
#' @param exclude_css_path Defaults to NULL. To remove script, for example, use
#' `script`, which is transformed to `:not(script)`. May cause issues, use
#' with caution.
#' @param exclude_xpath Defaults to NULL. A common pattern when extracting text
#' would be `//script|//iframe|//img|//style`, as it is assumed that these
#' containers (javascript contents, iframes, css blocks, and images) are most
#' likely undesirable when extracting text. Customise as needed. For example,
#' if besides the above you also want to remove a `div` of class
#' `related-articles`, you may use
#' `//script|//iframe|//img|//div[@class='related-articles']`Be careful when
#' using `exclude_xpath` as the relevant Xpath is removed from the original
#' objext passed to `cas_extract_html()`. To be clear, the input object is
#' changed, and, for example, if used once in one of the extractors these
#' containers won't be available to other extractors.
#' @param custom_xpath Defaults to NULL. If given, all other parameters are
#' ignored and given Xpath used instead.
#' @param custom_css_path Defaults to NULL. If given, all other parameters are
#' ignored and given CSSpath used instead.
#' @param keep_everything Defaults to FALSE. If TRUE, all text included in the
#' page is returned as a single string.
#' @param trim Defaults to TRUE. If TRUE, applies `stringr::str_trim()` to
#' output, removing whitespace from start and end of string.
#' @param squish Defaults to FALSE. If TRUE, applies `stringr::str_squish()` to
#' output, removing whitespace from start and end of string, and replacing
#' any whitespace (including new lines) with a single space.
#' @param no_match Defaults to "". A common alternative would be NA. Value to
#' return when the given container, selector or element is not found.
#' @param extract_text Defaults to TRUE. If TRUE, text is extracted.
#' @param as_character Defaults to TRUE. If FALSE, and if `extract_text` is set
#' to FALSE, then an `xml_nodeset` object is returned.
#'
#' @return A character vector of length one.
#' @export
#'
#' @examples
#' \dontrun{
#' if (interactive()) {
#' url <- "https://example.com"
#' html_document <- rvest::read_html(x = url)
#'
#' # example for a tag that looks like:
#' # <meta name="twitter:title" content="Example title" />
#'
#' cas_extract_html(
#' html_document = html_document,
#' container = "meta",
#' container_name = "twitter:title",
#' attribute = "content"
#' )
#'
#'
#' # example for a tag that looks like:
#' # <meta name="keywords" content="various;keywords;">
#' cas_extract_html(
#' html_document = html_document,
#' container = "meta",
#' container_name = "keywords",
#' attribute = "content"
#' )
#'
#' # example for a tag that looks like:
#' # <meta property="article:published_time" content="2016-10-29T13:09+03:00"/>
#' cas_extract_html(
#' html_document = html_document,
#' container = "meta",
#' container_property = "article:published_time",
#' attribute = "content"
#' )
#' }
#' }
cas_extract_html <- function(html_document,
container = NULL,
container_class = NULL,
container_id = NULL,
container_name = NULL,
container_property = NULL,
container_itemprop = NULL,
container_instance = NULL,
attribute = NULL,
sub_element = NULL,
no_children = NULL,
trim = TRUE,
squish = FALSE,
no_match = "",
exclude_css_path = NULL,
exclude_xpath = NULL,
custom_xpath = NULL,
custom_css_path = NULL,
keep_everything = FALSE,
extract_text = TRUE,
as_character = TRUE) {
if (keep_everything == TRUE) {
output <- html_document
} else if (is.null(custom_xpath) == FALSE) {
output <- html_document %>%
rvest::html_elements(xpath = custom_xpath)
} else if (is.null(custom_css_path) == FALSE) {
output <- html_document %>%
rvest::html_elements(css = custom_css_path)
} else if (is.null(container_itemprop) == FALSE) {
if (is.null(attribute)) {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@itemprop='",
container_itemprop, "']"
))
} else {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@itemprop='",
container_itemprop, "']"
))
}
} else if (is.null(container_class) == TRUE & is.null(container_id) == TRUE) {
if (is.null(container_name) == TRUE) {
if (is.null(container_property)) {
output <- html_document %>%
rvest::html_elements(container)
} else {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@property='",
container_property, "']"
))
}
} else {
if (is.null(attribute)) {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@name='",
container_name, "']"
))
} else {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@name='",
container_name, "']"
))
}
}
} else if (is.null(container_class) == FALSE & is.null(attribute) == FALSE) {
if (is.null(container_name)) {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container
))
} else {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@name='",
container_name, "']"
))
}
} else if (is.null(container_class) == FALSE & is.null(container_id) == TRUE) {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c(
"//",
container,
"[@class='",
container_class, "']"
))
} else if (is.null(container_class) == TRUE & is.null(container_id) == FALSE) {
output <- html_document %>%
rvest::html_elements(xpath = stringr::str_c("//", container, "[@id='", container_id, "']"))
}
if (is.null(exclude_xpath) == FALSE) {
xml2::xml_remove(output %>% xml2::xml_find_all(xpath = exclude_xpath))
}
if (is.null(attribute) == FALSE) {
output <- output %>%
rvest::html_attr(name = attribute)
} else {
if (is.null(exclude_css_path) == FALSE) {
output <- output %>%
rvest::html_elements(css = stringr::str_c(
":not(",
exclude_css_path,
")"
))
}
if (is.null(sub_element) == FALSE) {
output <- output %>%
rvest::html_elements(sub_element)
}
if (extract_text == TRUE) {
output <- output %>%
rvest::html_text2()
} else {
if (as_character == TRUE) {
output <- output %>%
as.character()
} else {
return(output)
}
}
}
if (length(output) > 1) {
if (is.null(container_instance) == FALSE) {
output <- output[container_instance]
} else {
output <- stringr::str_c(output, collapse = "\n")
}
} else if (length(output) == 0) {
output <- as.character(no_match)
}
if (trim == TRUE) {
output <- stringr::str_trim(string = output)
}
if (squish == TRUE) {
output <- stringr::str_squish(string = output)
}
output
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.