R/update_providers.R

Defines functions oai_base update_providers

Documented in update_providers

#' @title Update the locally stored OAI-PMH data providers table.
#'
#' @description Data comes from
#' <http://www.openarchives.org/Register/BrowseSites>. For each data
#' provider it includes the repository name, the base URL, and the
#' oai-identifier (if they have one).
#'
#' @export
#' @details This table is scraped from
#' <http://www.openarchives.org/Register/BrowseSites>.
#' I would get it from <http://www.openarchives.org/pmh/registry/ListFriends>,
#' but it does not include repository names.
#'
#' This function updates the table for you. Does take a while though, so
#' go get a coffee.
#'
#' The table is saved as a data.frame named `providers`, with columns
#' `repo_name`, `base_url` and `oai_identifier`, in the file
#' `<path>/<current date>-providers.rda`.
#' @param path Path to the directory to put data in. Defaults to the
#' current working directory.
#' @param ... Curl options passed on to [httr::GET()]
#' @return Called for its side effect of writing the `.rda` file. An error
#' is raised if the HTTP request fails or if no provider rows can be found
#' in the downloaded page.
#' @seealso [load_providers()]
#' @examples \dontrun{
#' update_providers()
#' load_providers()
#' }

update_providers <- function(path = ".", ...) {
  # Fetch the registry page and fail early on an HTTP error status.
  tt <- GET(oai_base(), ...)
  stop_for_status(tt)
  temp <- content(tt, "text", encoding = "UTF-8")
  prov <- xml2::read_html(temp)

  # Locate the registration table and drop its first child
  # (presumably the header row -- confirm against the live page).
  tab <- xml2::xml_find_all(prov, "//table[@class=\"registration-table\"]")
  rows <- xml2::xml_children(tab)[-1]

  # Without this guard an absent or empty table surfaces later as a cryptic
  # "attempt to set an attribute on NULL" when the columns are named.
  if (length(rows) == 0) {
    stop(
      "No provider rows found at ", oai_base(),
      "; the page layout may have changed.",
      call. = FALSE
    )
  }

  providers <- rbind.fill(lapply(rows, function(z) {
    # Cells 3 to 5 of each row become the three columns named below;
    # newlines and runs of two or more whitespace characters are removed.
    data.frame(
      t(gsub("\n|\\s\\s+", "", xml2::xml_text(xml2::xml_children(z)[3:5]))),
      stringsAsFactors = FALSE)
  }))
  names(providers) <- c("repo_name", "base_url", "oai_identifier")

  # The file name is prefixed with today's date.
  save(providers, file = file.path(path, paste0(Sys.Date(), "-providers.rda")))
}

# URL of the Open Archives registry page that update_providers() scrapes.
oai_base <- function() {
  "http://www.openarchives.org/Register/BrowseSites"
}
ropensci/oai documentation built on Nov. 18, 2022, 5:33 p.m.