In 2DegreesInvesting/pastax: Data for the 'PASTAX' Project

# Chunk defaults for the whole document.
knitr::opts_chunk$set(
  # Print output directly under the code that produced it, prefixed by "#>".
  collapse = TRUE,
  comment = "#>",
  # Reuse cached chunk results; set to FALSE to force re-computation.
  cache = TRUE
)

Create the esa_sme dataset avoiding needless computation and requests:

To re-compute, set the cache option to FALSE in the setup chunk.
To re-request, set the request parameter to TRUE in the YAML header.

library(dplyr, warn.conflicts = FALSE)
library(janitor, warn.conflicts = FALSE)
library(fs)
library(purrr)
library(usethis)
library(here)
devtools::load_all()

Summary

How many pages do we need to scrape?

# Fetch a single, arbitrary page; its JSON is then handed to
# `esa_sme_last_page_json()` (presumably every page reports the index of the
# last page -- confirm against the helper).
any_page <- 2
json <- esa_sme_json(page = any_page)
last_page <- json %>% esa_sme_last_page_json()
last_page

Ensure we have a directory in which to save the response of each request.

# Directory that receives one "<page>.json" file per summary page.
# NOTE(review): `create_data_raw_dir()` is a project helper; presumably it
# creates the directory if it does not exist yet -- confirm.
pages_dir <- create_data_raw_dir("esa_sme", "page")

Write each response to a JSON file.

# Cover every page, unless `params$pages_n` limits the run to the first few.
pages <- seq_len(last_page)
if (params$pages_n != "all") {
  pages <- head(pages, params$pages_n)
}

# One request per page; each result is written to "<page>.json" in `pages_dir`.
requests <- map(pages, esa_sme_request)
paths <- path(pages_dir, paste0(pages, ".json"))
walk2(requests, paths, esa_sme_req_write)

Transform each .json file into rows of a data frame, recording the path of the file each row came from.

# Every file currently in `pages_dir` is used, not only those written by the
# current run.
jsons <- dir_ls(pages_dir)

# Convert each file with `esa_sme_json2html()`, then row-bind the data frames
# returned by `esa_sme_enframe()`; `page_path` holds the source file path.
esa_sme_summary <- map_df(
  map(jsons, esa_sme_json2html),
  esa_sme_enframe,
  .id = "page_path"
)

Tidy.

# Derive short ids from the file path and the details URL: keep only the last
# path component and strip its extension.
esa_sme_summary <- mutate(
  esa_sme_summary,
  page_id = path_ext_remove(path_file(page_path)),
  details_id = path_ext_remove(path_file(details_url))
)

esa_sme_summary

Details

Ensure we have a directory in which to save the response of each request.

# Directory that receives one "<details_id>.json" file per details page.
# NOTE(review): `create_data_raw_dir()` is a project helper; presumably it
# creates the directory if it does not exist yet -- confirm.
details_dir <- create_data_raw_dir("esa_sme", "details")

Write each response to a JSON file.

# Identify each details page by the last component of its URL; when
# `params$pages_n` is not "all", keep only that many ids.
details_id <- path_file(esa_sme_summary$details_url)
if (params$pages_n != "all") {
  details_id <- head(details_id, params$pages_n)
}

# One request per id; each result is written to "<details_id>.json" in
# `details_dir`.
requests <- map(details_id, esa_sme_details_request)
paths <- path(details_dir, paste0(details_id, ".json"))
walk2(requests, paths, esa_sme_req_write)

Transform each .json file into a row of a data frame.

# Every file currently in `details_dir` is used, not only those written by the
# current run.
jsons <- dir_ls(details_dir)

# Keep the call form `esa_sme_details(.x)` rather than passing the bare name:
# this statement assigns a data frame that is also called `esa_sme_details`,
# and in call position R skips non-function objects when looking a name up,
# so the helper is still found if this code is re-run in the same session.
esa_sme_details <- map_df(
  map(jsons, esa_sme_json2html),
  ~esa_sme_details(.x),
  .id = "details_id"
)

Tidy.

# Reduce `details_id` from a file path to a bare id (last path component,
# extension stripped) so it can be matched against the summary's `details_id`,
# then normalise the column names.
esa_sme_details <- esa_sme_details %>%
  mutate(details_id = path_ext_remove(path_file(.data$details_id))) %>%
  clean_names() %>%
  # Select by string: `.data$name` inside `select()` is deprecated since
  # tidyselect 1.2.0. `name` is dropped here, presumably because the summary
  # table already provides it -- confirm.
  select(-"name")

esa_sme_details

Full dataset

Combine the summary and details datasets and tidy.

# Attach the details to each summary row, order the result, move the id
# columns to the front and drop the helper columns they were derived from.
esa_sme <- esa_sme_summary %>%
  left_join(esa_sme_details, by = "details_id") %>%
  # `page_id` is a character column built from file names such as "10.json";
  # sort on its integer value so that page 2 comes before page 10.
  arrange(as.integer(.data$page_id), .data$name) %>%
  relocate(page_id, details_id) %>%
  # Select by string: `.data$col` inside `select()` is deprecated since
  # tidyselect 1.2.0.
  select(-"page_path", -"details_url")

esa_sme