Create the esa_sme dataset avoiding needless computation and requests:

- cache parameter to FALSE in the setup chunk.
- request parameter to TRUE in the yaml header.

library(dplyr, warn.conflicts = FALSE)
library(janitor, warn.conflicts = FALSE)
library(fs)
library(purrr)
library(usethis)
library(here)
#> here() starts at /home/mauro/git/pastax
devtools::load_all()
#> ℹ Loading pastax
How many pages do we need to scrape?
# Request the JSON of a single page and read the last page number from it.
# NOTE(review): the variable name suggests the page number is arbitrary, i.e.
# that every page's response reports the same last page -- confirm.
any_page <- 2
json <- esa_sme_json(page = any_page)
last_page <- esa_sme_last_page_json(json)
# Print the value so it shows in the rendered output.
last_page
#> [1] 147
Ensure we have a directory in which to save the response of each request.
# `pages_dir` is the directory that holds one .json file per page.
# NOTE(review): presumably `create_data_raw_dir()` creates the directory when
# it is missing and returns its path -- confirm against the helper.
pages_dir <- create_data_raw_dir("esa_sme", "page")
Write each response to a json file.
# Use every page, or only the first `params$pages_n` pages when that
# parameter is anything other than "all".
pages <- seq_len(last_page)
if (params$pages_n != "all") {
  pages <- head(pages, params$pages_n)
}

# Call `esa_sme_request()` once per page, then write each result to
# "<page>.json" inside `pages_dir`.
requests <- map(pages, esa_sme_request)
paths <- path(pages_dir, paste0(pages, ".json"))
walk2(requests, paths, esa_sme_req_write)
Transform each .json file into rows of a data frame.
# List every file in `pages_dir`, convert each one with
# `esa_sme_json2html()`, and row-bind the results of `esa_sme_enframe()`.
# `.id` stores the name of each input element (the file path) in `page_path`.
jsons <- dir_ls(pages_dir)
esa_sme_summary <- map_df(
  map(jsons, ~ esa_sme_json2html(.x)),
  ~ esa_sme_enframe(.x),
  .id = "page_path"
)
Tidy.
# Derive both ids by dropping the directory part and the file extension
# from the corresponding path/URL column.
esa_sme_summary <- mutate(
  esa_sme_summary,
  page_id = path_ext_remove(path_file(page_path)),
  details_id = path_ext_remove(path_file(details_url))
)
esa_sme_summary
#> # A tibble: 2,917 × 9
#> page_path details_url name country_of_regi… entity_type entity_size
#> <chr> <fs::path> <chr> <chr> <chr> <chr>
#> 1 /home/mauro… …etailSME/63731 Genera… CH-Switzerland Company TBD
#> 2 /home/mauro… …tailSME/129405 Instit… SK-Slovakia Research o… TBD
#> 3 /home/mauro… …etailSME/86371 LAIXER… NL-The Netherla… Company TBD
#> 4 /home/mauro… …etailSME/59569 (GovEd… GB-United Kingd… Company TBD
#> 5 /home/mauro… …etailSME/86159 10 10 … IT-Italy Company TBD
#> 6 /home/mauro… …tailSME/128365 11tens… GR-Greece Company TBD
#> 7 /home/mauro… …etailSME/68356 12G Fl… SE-Sweden Company TBD
#> 8 /home/mauro… …tailSME/128211 1D wor… DE-Germany Company TBD
#> 9 /home/mauro… …etailSME/86542 1POINT… BE-Belgium Company TBD
#> 10 /home/mauro… …tailSME/129014 1st-Re… AT-Austria Company TBD
#> # … with 2,907 more rows, and 3 more variables: esastar_status <chr>,
#> # page_id <chr>, details_id <chr>
Ensure we have a directory in which to save the response of each request.
# `details_dir` is the directory that holds one .json file per details id.
# NOTE(review): presumably `create_data_raw_dir()` creates the directory when
# it is missing and returns its path -- confirm against the helper.
details_dir <- create_data_raw_dir("esa_sme", "details")
Write each response to a json file.
# The id of each details page is the last component of its URL.
details_id <- path_file(esa_sme_summary[["details_url"]])
# The same `pages_n` parameter that limits pages also limits how many
# details are requested.
if (params$pages_n != "all") {
  details_id <- head(details_id, params$pages_n)
}

# Call `esa_sme_details_request()` once per id, then write each result to
# "<details_id>.json" inside `details_dir`.
requests <- map(details_id, esa_sme_details_request)
paths <- path(details_dir, paste0(details_id, ".json"))
walk2(requests, paths, esa_sme_req_write)
Transform each .json file into a row of a data frame.
# List every file in `details_dir`, convert each one with
# `esa_sme_json2html()`, and row-bind the results of `esa_sme_details()`.
# `.id` stores the name of each input element (the file path) in `details_id`.
jsons <- dir_ls(details_dir)
# `esa_sme_details` is both the helper function and the name assigned here.
# It is invoked with call syntax (not passed as a bare object) because a call
# looks up a function and skips a data frame of the same name.
esa_sme_details <- map_df(
  map(jsons, esa_sme_json2html),
  function(html) esa_sme_details(html),
  .id = "details_id"
)
Tidy.
# Tidy the details data:
# * reduce `details_id` (a file path at this point) to the bare id;
# * standardise column names with `janitor::clean_names()`;
# * drop `name`.
#   NOTE(review): presumably dropped because the summary data already has a
#   `name` column, which would otherwise be duplicated by the join -- confirm.
esa_sme_details <- esa_sme_details %>%
  mutate(details_id = path_ext_remove(path_file(.data$details_id))) %>%
  clean_names() %>%
  # Select by string: `.data$col` inside `select()` is deprecated since
  # tidyselect 1.2.0 and emits a warning.
  select(-"name")
esa_sme_details
#> # A tibble: 2,917 × 17
#> details_id description entity_id nationality_desc entity_code
#> <chr> <chr> <chr> <chr> <chr>
#> 1 107486 "<p>We are a dynamic, youn… 0 IT-Italy 1000037016
#> 2 107487 "HFC specializes in design… 0 DE-Germany 1000037020
#> 3 107488 "<span style=\"font-family… 0 GB-United Kingd… 1000037023
#> 4 107496 "<p>We develop and build u… 0 DE-Germany 1000037025
#> 5 107497 "<p>Serendipity helps ecos… 60381 NL-The Netherla… 1000037120
#> 6 107501 "<p>know.space is a specia… 60385 IE-Ireland 1000037191
#> 7 127527 "<p>ATL has been active si… 70406 HU-Hungary 1000037172
#> 8 127542 "<p>Cyblix provides soluti… 0 PT-Portugal 1000037061
#> 9 127554 "Control Survey SRL is a R… 0 RO-Romania 1000037057
#> 10 127555 "<p>HUGETECH is experience… 0 PL-Poland 1000037078
#> # … with 2,907 more rows, and 12 more variables: entity_vat_number <chr>,
#> # entity_sme_status <chr>, entity_type_desc <chr>, entity_size_desc <chr>,
#> # address <chr>, number <chr>, city <chr>, postal_code <chr>, phone <chr>,
#> # entity_fax <chr>, entity_mail <chr>, entity_web_site <chr>
Combine the summary and details datasets and tidy.
# Combine summary and details on `details_id`, sort, move the id columns to
# the front, and drop the two columns the ids were derived from.
esa_sme <- esa_sme_summary %>%
  left_join(esa_sme_details, by = "details_id") %>%
  # NOTE(review): `page_id` is character, so this sorts it lexicographically
  # ("1", "10", "100", ...), not numerically -- confirm this is intended.
  arrange(.data$page_id, .data$name) %>%
  relocate(page_id, details_id) %>%
  # Select by string: `.data$col` inside `select()` is deprecated since
  # tidyselect 1.2.0 and emits a warning.
  select(-"page_path", -"details_url")
esa_sme
#> # A tibble: 2,917 × 23
#> page_id details_id name country_of_regist… entity_type entity_size
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 59569 (GovEd Ltd) Im… GB-United Kingdom Company TBD
#> 2 1 86159 10 10 ONE OBJE… IT-Italy Company TBD
#> 3 1 128365 11tensors GR-Greece Company TBD
#> 4 1 68356 12G Flight Sys… SE-Sweden Company TBD
#> 5 1 128211 1D works UG DE-Germany Company TBD
#> 6 1 86542 1POINT61 BE-Belgium Company TBD
#> 7 1 129014 1st-Relief GmbH AT-Austria Company TBD
#> 8 1 70734 21 Robots EE-Estonia Company TBD
#> 9 1 83933 221e IT-Italy Company TBD
#> 10 1 85134 27G-Technology… HU-Hungary Company TBD
#> # … with 2,907 more rows, and 17 more variables: esastar_status <chr>,
#> # description <chr>, entity_id <chr>, nationality_desc <chr>,
#> # entity_code <chr>, entity_vat_number <chr>, entity_sme_status <chr>,
#> # entity_type_desc <chr>, entity_size_desc <chr>, address <chr>,
#> # number <chr>, city <chr>, postal_code <chr>, phone <chr>, entity_fax <chr>,
#> # entity_mail <chr>, entity_web_site <chr>
Export.
# Save `esa_sme` as package data (data/esa_sme.rda); `overwrite = TRUE`
# replaces the file if it already exists.
use_data(esa_sme, overwrite = TRUE)
#> ✓ Setting active project to '/home/mauro/git/pastax'
#> ✓ Saving 'esa_sme' to 'data/esa_sme.rda'
#> • Document your data (see 'https://r-pkgs.org/data.html')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.