#' @title
#' Scrape the "Links to Resources" Section at a Registry Number URL
#'
#' @inherit chemidplus_scraping_functions description return
#'
#' @inheritSection chemidplus_scraping_functions Links to Resources
#'
#' @inheritParams chemidplus_scraping_functions
#'
#' @details
#' When `conn` is supplied, the `schema` is created if it does not exist and
#' the `LINKS_TO_RESOURCES` table (if present) is queried for `rn_url`. If
#' rows are already stored for that URL nothing is scraped and nothing is
#' downloaded.
#'
#' Otherwise every anchor matching the CSS selector `#locators a` is read
#' from `response`. If `response` is missing, the page at `rn_url` is
#' downloaded first and the function then pauses for `sleep_time` seconds.
#' The `data-name` and `href` attributes of each anchor become the
#' `resource_agency` and `resource_link` columns; anchors lacking an
#' attribute yield `NA` for it. Rows whose `resource_link` is missing or is
#' 255 characters or longer are dropped.
#'
#' With `conn`, a non-empty result is appended to (or written as) the
#' `LINKS_TO_RESOURCES` table and `NULL` is returned invisibly. Without
#' `conn`, the scraped tibble is returned.
#'
#' @seealso
#'  \code{\link[xml2]{read_xml}}
#'  \code{\link[pg13]{lsSchema}},\code{\link[pg13]{createSchema}},\code{\link[pg13]{lsTables}},\code{\link[pg13]{query}},\code{\link[pg13]{buildQuery}},\code{\link[pg13]{appendTable}},\code{\link[pg13]{writeTable}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_attr}}
#'  \code{\link[tibble]{tibble}}
#'  \code{\link[dplyr]{filter}}
#'
#' @rdname get_links_to_resources
#'
#' @family chemidplus scraping
#'
#' @export
#'
#' @importFrom xml2 read_html
#' @importFrom pg13 lsSchema createSchema lsTables query buildQuery appendTable writeTable
#' @importFrom rvest html_nodes html_attrs html_attr
#' @importFrom purrr map
#' @importFrom tibble as_tibble_row tibble
#' @importFrom dplyr bind_rows transmute filter_at filter
#' @importFrom magrittr %>%
get_links_to_resources <-
  function(conn,
           rn_url,
           response,
           schema = "chemidplus",
           sleep_time = 3) {

    # Example page: https://chem.nlm.nih.gov/chemidplus/rn/12674-15-6

    has_conn <- !missing(conn)
    table_exists <- FALSE

    # Scrape unless a connection was provided AND the LINKS_TO_RESOURCES
    # table already holds at least one row for this rn_url.
    proceed <- TRUE

    if (has_conn) {
      if (!(schema %in% pg13::lsSchema(conn = conn))) {
        pg13::createSchema(conn = conn,
                           schema = schema)
      }

      table_exists <-
        "LINKS_TO_RESOURCES" %in% pg13::lsTables(conn = conn,
                                                 schema = schema)

      if (table_exists) {
        links_to_resources <-
          pg13::query(conn = conn,
                      sql_statement = pg13::buildQuery(distinct = TRUE,
                                                       schema = schema,
                                                       tableName = "LINKS_TO_RESOURCES",
                                                       whereInField = "rn_url",
                                                       whereInVector = rn_url))
        proceed <- nrow(links_to_resources) == 0
      }
    }

    if (proceed) {
      # Download lazily: the request (and the politeness pause after it) is
      # only needed when there is actually something to scrape.
      if (missing(response)) {
        # Close only the connections that this download leaves behind.
        # closeAllConnections() would also destroy connections and sink
        # diversions owned by the caller.
        conns_before <- rownames(showConnections())
        on.exit({
          leaked <- setdiff(rownames(showConnections()), conns_before)
          for (conn_id in leaked) {
            # Best-effort cleanup: a connection that is already gone is fine.
            tryCatch(close(getConnection(as.integer(conn_id))),
                     error = function(e) NULL)
          }
        }, add = TRUE)

        response <- xml2::read_html(rn_url,
                                    options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE"))
        Sys.sleep(sleep_time)
      }

      anchors <- rvest::html_nodes(response, "#locators a")

      # html_attr() returns NA for a missing attribute and a zero-length
      # vector for zero anchors, so a page without a "Links to Resources"
      # section produces a well-formed zero-row tibble instead of an error.
      links_to_resources <-
        tibble::tibble(ltr_datetime = Sys.time(),
                       resource_agency = rvest::html_attr(anchors, "data-name"),
                       resource_link = rvest::html_attr(anchors, "href"),
                       rn_url = rn_url)

      # Keep links shorter than 255 characters (presumably to fit a
      # 255-character database column -- TODO confirm). NA links are dropped.
      links_to_resources <-
        dplyr::filter(links_to_resources,
                      nchar(resource_link) < 255)

      # Nothing is written when no links were found.
      if (has_conn && nrow(links_to_resources) > 0) {
        if (table_exists) {
          pg13::appendTable(conn = conn,
                            schema = schema,
                            tableName = "LINKS_TO_RESOURCES",
                            links_to_resources)
        } else {
          pg13::writeTable(conn = conn,
                           schema = schema,
                           tableName = "LINKS_TO_RESOURCES",
                           links_to_resources)
        }
      }
    }

    # The scraped data is only handed back when it was not sent to a database.
    if (has_conn) {
      invisible(NULL)
    } else {
      links_to_resources
    }
  }
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.