R/esa_sme_scrape.R

Defines functions enframe_description enframe_info details_url esa_sme_base_url esa_sme_details esa_sme_column_css esa_sme_column esa_sme_details_url esa_sme_enframe esa_sme_json2html esa_sme_req_write pastax_agent esa_sme_params dissable_ssl_verifypeer esa_base_url esa_sme_details_request esa_sme_request esa_sme_last_page_json esa_sme_json esa_sme_scrape

Documented in esa_sme_scrape

#' Scrape and export the dataset `esa_sme`
#'
#' @param pages_n How many pages do you want to scrape? For tests you may scrape
#'   only a few pages, say 1 or 2. To update the exported dataset use "all".
#'
#' @export
#' @keywords internal
#'
#' @examples
#' \dontrun{
#' esa_sme_scrape(1)
#' }
esa_sme_scrape <- function(pages_n = "all") {
  render(
    data_raw("esa_sme.Rmd"),
    output_format = github_document(html_preview = FALSE),
    params = list(request = TRUE, pages_n = pages_n)
  )
}

esa_sme_json <- function(page) {
  resp <- page %>%
    esa_sme_request() %>%
    req_perform()

  json <- resp %>%
    resp_body_json()
}

#' @examples
#' json <- esa_sme_json(page = 1)
#' esa_sme_last_page_json(json)
#' @noRd
esa_sme_last_page_json <- function(json) {
  pages_chr <- json$html %>%
    read_html() %>%
    html_elements(".pagination") %>%
    html_elements("li") %>%
    html_text2()

  pages_int <- suppressWarnings(as.integer(pages_chr))
  max(pages_int, na.rm = TRUE)
}

#' Request one page of esa SME data
#'
#' @examples
#' req <- esa_sme_request(page = 2)
#' req
#' @noRd
esa_sme_request <- function(page) {
  request(esa_base_url(), options = dissable_ssl_verifypeer()) %>%
    req_url_path_append("PublicEntityDir") %>%
    req_url_path_append("PublicEntityDirGridSme") %>%
    req_url_query(!!!esa_sme_params(page)) %>%
    req_user_agent(pastax_agent())
}

esa_sme_details_request <- function(details_id) {
  request(esa_base_url(), options = dissable_ssl_verifypeer()) %>%
    req_url_path_append("PublicEntityDir") %>%
    req_url_path_append("PublicEntityDirPopupDetailSME") %>%
    req_url_path_append(details_id) %>%
    req_user_agent(pastax_agent())
}

esa_base_url <- function() {
  "https://esastar-emr.sso.esa.int/"
}

dissable_ssl_verifypeer <- function() {
  list(ssl_verifypeer = 0)
}

esa_sme_params <- function(page) {
  list(
    term = "",
    isForRegister = "False",
    isForEmits = "True",
    "grid-page" = page
  )
}

pastax_agent <- function() {
  "pastax (https://github.com/2DegreesInvesting/pastax)"
}

#' Take an httr2 request object and write the response content to a json file
#' @examples
#' req <- esa_sme_request(1)
#' req %>% esa_sme_req_write(path = tempfile())
#' @noRd
esa_sme_req_write <- function(req, path) {
  req %>%
    req_perform() %>%
    resp_body_json() %>%
    write_json(path)

  invisible(req)
}

esa_sme_json2html <- function(path) {
  read_html(fromJSON(path)$html)
}

esa_sme_enframe <- function(html) {
  tibble(
    details_url = esa_sme_details_url(html),
    name = esa_sme_column(html, 1),
    country_of_registration = esa_sme_column(html, 2),
    entity_type = esa_sme_column(html, 3),
    entity_size = esa_sme_column(html, 4),
    esastar_status = esa_sme_column(html, 5),
  )
}

esa_sme_details_url <- function(html) {
  details <- html %>%
    html_elements(".gridDetails") %>%
    html_attr("href")

  path(esa_base_url(), details)
}

esa_sme_column <- function(html, id) {
  html %>%
    html_elements(esa_sme_column_css(id)) %>%
    html_text2()
}

esa_sme_column_css <- function(id) {
  glue(".grid-cell:nth-child({id})")
}

esa_sme_details <- function(html) {
  dplyr::bind_cols(
    tibble::tibble(description = enframe_description(html)),
    details = enframe_info(html)
  )
}

esa_sme_base_url <- function() {
  esa_base_url()
}

details_url <- function(html) {
  details <- html %>%
    html_elements(".gridDetails") %>%
    html_attr("href")

  fs::path(esa_sme_base_url(), details)
}

enframe_info <- function(html) {
  ids <- html %>%
    html_elements("input") %>%
    html_attr("id")

  values <- html %>%
    html_elements("input") %>%
    html_attr("value")

  values %>%
    purrr::set_names(ids) %>%
    tibble::enframe() %>%
    dplyr::filter(!.data$name == "") %>%
    tidyr::pivot_wider(names_from = .data$name)
}

enframe_description <- function(html) {
  html %>%
    html_elements(".formEditInput") %>%
    html_text2() %>%
    trimws() %>%
    purrr::keep(nzchar)
}
2DegreesInvesting/pastax documentation built on Feb. 12, 2022, 7:46 a.m.