View source: R/cas_find_extractor.R
cas_find_extractor | R Documentation |
cas_extract_html()
This may or may not work, but it may be worth giving this a quick try before looking for alternatives. The parameters returned first should work best.
cas_find_extractor(
html_document,
pattern,
containers = c("h1", "h2", "h3", "h4", "span", "td", "p", "div"),
exclude_css_path = NULL
)
html_document |
An HTML document parsed with, for example, `rvest::read_html()` (as in the examples below). |
pattern |
A text string to be matched. |
containers |
Containers to be parsed for best matches. By default: `c("h1", "h2", "h3", "h4", "span", "td", "p", "div")`.
|
exclude_css_path |
Defaults to NULL. To remove script, for example, use `exclude_css_path = "script"`.
|
A data frame listing the container and the class or id values that
should work if passed to cas_extract_html().
## Not run:
# Examples for cas_find_extractor(): search a parsed page for a known piece of
# text, then reuse the reported container/class in cas_extract_html().
# They download live pages, so they only run in an interactive session.
# NOTE: `interactive` is a function and must be called; a bare `interactive`
# is not a logical value and makes `if ()` fail.
if (interactive()) {
  # not ideal example, but you'll get the gist, see additional example below
  library("castarter")

  url <- "https://www.nasa.gov/news-release/nasa-sets-coverage-for-roscosmos-spacewalk-outside-space-station/"
  html_page <- rvest::read_html(url)

  # Title
  cas_find_extractor(
    html_document = html_page,
    pattern = "NASA Sets Coverage for Roscosmos Spacewalk Outside Space Station"
  )

  # Date
  cas_find_extractor(
    html_document = html_page,
    pattern = "Oct 23, 2023"
  )

  # A person's name appearing on the page
  cas_find_extractor(
    html_document = html_page,
    pattern = "Roxana Bardan"
  )

  # A short label appearing on the page
  cas_find_extractor(
    html_document = html_page,
    pattern = "RELEASE"
  )

  ## Use this information to extract contents
  # Each cas_find_extractor() call below is followed by cas_extract_html()
  # call(s) using a container and class for the same piece of content
  # (presumably taken from the rows returned by the search).

  library("castarter")
  url <- "https://www.state.gov/designating-russian-virtual-currency-money-launderer/"
  html_page <- rvest::read_html(url)

  # Title: two candidate extractors are tried
  cas_find_extractor(
    html_document = html_page,
    pattern = "Designating Russian Virtual Currency Money Launderer"
  )

  cas_extract_html(
    html_document = html_page,
    container = "span",
    container_class = "bc_current collapse"
  )

  cas_extract_html(
    html_document = html_page,
    container = "h1",
    container_class = "featured-content__headline stars-above"
  )

  # Document type
  cas_find_extractor(
    html_document = html_page,
    pattern = "Press Statement"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta doctype-meta"
  )

  # Author
  cas_find_extractor(
    html_document = html_page,
    pattern = "Matthew Miller, Department Spokesperson"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta__author-bureau"
  )

  # Publication date
  cas_find_extractor(
    html_document = html_page,
    pattern = "November 3, 2023"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta__publish-date"
  )

  # Main text: `exclude_css_path = "script"` removes script elements, both
  # when searching and when extracting
  cas_find_extractor(
    html_document = html_page,
    pattern = "The United States is sanctioning Ekaterina Zhdanova",
    exclude_css_path = "script"
  )

  cas_extract_html(
    html_document = html_page,
    container = "div",
    container_class = "entry-content",
    exclude_css_path = "script"
  )
}
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.