R/scrape_publication.R

#' Scrape metadata for one publication
#' @importFrom magrittr "%>%"

scrape_publication <- function(url = NULL,
                               topic_list = c("Arts, culture and sport", "Brexit", "Building, planning and design",
                                              "Business, industry and innovation", "Children and families",
                                              "Communities and third sector", "Constitution and democracy",
                                              "Economy", "Education", "Energy", "Environment and climate change",
                                              "Equality and rights", "Farming and rural", "Health and social care",
                                              "Housing", "International", "Law and order", "Marine and fisheries",
                                              "Money and tax", "Public safety and emergencies", "Public sector",
                                              "Research", "Statistics", "Transport", "Work and skills"),
                               pattern = "National Records of Scotland") {

  force(topic_list)

  publication_html <- xml2::read_html(url)

  supporting_document_urls <- publication_html %>%
    rvest::html_nodes(".no-icon") %>%
    rvest::html_attr(name = "href")

  metadata <- data.frame(date_published = publication_html %>%
                           rvest::html_node(xpath = "//span[contains(text(),'Published:')]/following-sibling::span[1]") %>%
                           rvest::html_text(trim = TRUE) %>%
                           ifelse(length(.) == 0, NA, .) %>%
                           lubridate::dmy(),
                         title = publication_html %>%
                           rvest::html_node(".article-header__title") %>%
                           rvest::html_text(trim = TRUE),
                         summary = publication_html %>%
                           rvest::html_node(".leader") %>%
                           rvest::html_text(trim = TRUE),
                         type = publication_html %>%
                           rvest::html_node(".article-header__label") %>%
                           rvest::html_text() %>%
                           ifelse(length(.) == 0, NA_character_, .),
                         directorate = publication_html %>%
                           rvest::html_nodes(xpath = "//dt[contains(text(),'Directorate:')]/following-sibling::dd[1]") %>%
                           rvest::html_text(trim = TRUE) %>%
                           ifelse(length(.) == 0, NA_character_, .),
                         isbn = publication_html %>%
                           rvest::html_nodes(xpath = "//dt[contains(text(),'ISBN:')]/following-sibling::dd[1]") %>%
                           rvest::html_text(trim = TRUE) %>%
                           ifelse(length(.) == 0, NA_character_, .),
                         date_of_meeting = publication_html %>%
                           rvest::html_nodes(xpath = "//span[contains(text(),'Date of meeting:')]/following-sibling::span[1]") %>%
                           rvest::html_text(trim = TRUE) %>%
                           ifelse(length(.) == 0, NA, .),
                         contact = publication_html %>%
                           rvest::html_nodes(".publication-info__contact a") %>%
                           rvest::html_text(trim = TRUE) %>%
                           ifelse(length(.) == 0, NA_character_, .),
                         mentions_web_page = publication_html %>%
                           rvest::html_text(trim = TRUE) %>%
                           stringr::str_count(pattern = pattern) %>%
                           ifelse(length(.) == 0, NA_character_, .),
                         number_of_supporting_documents = length(supporting_document_urls),
                         stringsAsFactors = FALSE)

  if (length(supporting_document_urls) == 0) {

    metadata[["number_of_supporting_documents_with_mentions"]] <- 0
    metadata[["mentions_supporting_documents"]] <- 0

  } else {

    mentions <- supporting_document_urls %>%
      paste0("https://www.gov.scot", .) %>%
      sapply(count_mentions, pattern = pattern)

    metadata[["number_of_supporting_documents_with_mentions"]] <-
      mentions[mentions != 0] %>%
      length()

    metadata[["mentions_supporting_documents"]] <- sum(mentions)
  }

  topics <- publication_html %>%
    rvest::html_nodes(xpath = "//dt[contains(text(),'Part of:')]/following-sibling::dd/a") %>%
    rvest::html_text(trim = TRUE) %>%
    if (length(.) == 0) NA_character_ else .

  topics <- topic_list %in% topics %>%
    as.list() %>%
    as.data.frame(col.names = topic_list)

  metadata <- cbind(metadata, topics)

  Sys.sleep(1)

  return(metadata)

}
jsphdms/govscot documentation built on May 3, 2019, 7:40 p.m.