# Chunk defaults for this vignette: collapse each chunk's source and output
# into one block and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Inspecting XML nodes

library(rvest)
library(dplyr)
library(xml2)
# Download and parse the page once; every query below reuses `rv_doc`.
rv_doc <- rvest::read_html(
  "https://www.churchofjesuschrist.org/study/liahona/2020/11/15cook?lang=eng"
)
# Print the tag structure of the `.body-block` matches.
xml2::html_structure(rvest::html_elements(rv_doc, ".body-block"))

Explore node 1:

# Child at position 1 of the `.body-block` matches.
xml2::xml_child(rvest::html_elements(rv_doc, ".body-block"), 1)

Explore node 2:

# Child at position 2 of the `.body-block` matches.
xml2::xml_child(rvest::html_elements(rv_doc, ".body-block"), 2)
# Contents of the `.body-block` matches, as returned by xml_contents().
xml2::xml_contents(rvest::html_elements(rv_doc, ".body-block"))

# <p> elements selected by the descendant selector `.body-block p`.
rvest::html_elements(rv_doc, ".body-block p")

# Children of the `.body-block` matches, as returned by html_children().
rvest::html_children(rvest::html_elements(rv_doc, ".body-block"))
# All <header> elements in the document.
rvest::html_elements(rv_doc, "header")
# Text of the <header> elements found under the `.body` matches.
rvest::html_text2(
  rvest::html_elements(rvest::html_elements(rv_doc, ".body"), "header")
)

Get specific paragraph by id:

# Select by id: `#p5` matches the element whose id attribute is "p5".
rvest::html_elements(rv_doc, "#p5")

Get multiple things at the same time (headers and paragraphs):

# A comma-separated selector list returns the <h2> and <p> matches in a
# single query.
rvest::html_elements(rv_doc, ".body-block h2, .body-block p")

Scratch code

# `id` attribute of every <h2> selected by `.body-block h2`.
header_ids <- rv_doc %>%
  html_elements(".body-block h2") %>%
  html_attr("id")
# `id` attribute of every <p> selected by `.body-block p`. html_attr("id") is
# used here, as for `header_ids`, so this holds the ids themselves rather than
# the result of searching each paragraph for an element with id "p1".
p_ids <- rv_doc %>%
  html_elements(".body-block p") %>%
  html_attr("id")
# Contents of the `.body-block` matches, reused for the text extraction below.
xm_contents <- rv_doc %>%
  html_elements(".body-block") %>%
  xml_contents()
# <p> elements found under the contents of the child at position 1.
rv_doc %>%
  html_elements(".body-block") %>%
  # html_children() %>%
  xml_child(1) %>%
  xml_contents() %>%
  html_elements("p")
# Text of the child at position 1 of `xm_contents`.
xm_contents %>%
  xml_child(1) %>%
  html_text()

Scrape metadata for the URL

# <meta> elements found under the document's <head>.
rvest::html_elements(rvest::html_elements(rv_doc, "head"), "meta")


bryanwhiting/generalconference documentation built on Dec. 19, 2021, 11:51 a.m.