data-raw/esa_sme.md

esa_sme

Create the esa_sme dataset avoiding needless computation and requests:

library(dplyr, warn.conflicts = FALSE)
library(janitor, warn.conflicts = FALSE)
library(fs)
library(purrr)
library(usethis)
library(here)
#> here() starts at /home/mauro/git/pastax
devtools::load_all()
#> ℹ Loading pastax

Summary

How many pages do we need to scrape?

# Request an arbitrary page; its response is passed on to look up the index
# of the last page.
any_page <- 2
json <- esa_sme_json(page = any_page)
last_page <- json %>% esa_sme_last_page_json()
last_page
#> [1] 147

Ensure we have a directory in which to save the response of each request.

# The returned path is reused below to build the output file paths and to list
# the saved files.
pages_dir <- create_data_raw_dir("esa_sme", "page")

Write each response to a json file.

# Use every page unless `params$pages_n` caps how many to take.
pages <- seq_len(last_page)
if (params$pages_n != "all") {
  pages <- head(pages, params$pages_n)
}

# One request and one output file per page, matched by position.
paths <- path(pages_dir, paste0(pages, ".json"))
requests <- map(pages, esa_sme_request)
walk2(requests, paths, esa_sme_req_write)

Transform the .json files into a data frame; each file contributes several rows.

# The elements are named by file path, so `.id` records in `page_path` which
# file each row came from.
jsons <- dir_ls(pages_dir)
htmls <- map(jsons, esa_sme_json2html)
esa_sme_summary <- map_df(htmls, esa_sme_enframe, .id = "page_path")

Tidy.

# Derive short ids: the file name (without extension) of the saved page and
# the last component of the details URL.
esa_sme_summary <- mutate(
  esa_sme_summary,
  page_id = path_ext_remove(path_file(page_path)),
  details_id = path_ext_remove(path_file(details_url))
)

esa_sme_summary
#> # A tibble: 2,917 × 9
#>    page_path    details_url     name    country_of_regi… entity_type entity_size
#>    <chr>        <fs::path>      <chr>   <chr>            <chr>       <chr>      
#>  1 /home/mauro… …etailSME/63731 Genera… CH-Switzerland   Company     TBD        
#>  2 /home/mauro… …tailSME/129405 Instit… SK-Slovakia      Research o… TBD        
#>  3 /home/mauro… …etailSME/86371 LAIXER… NL-The Netherla… Company     TBD        
#>  4 /home/mauro… …etailSME/59569 (GovEd… GB-United Kingd… Company     TBD        
#>  5 /home/mauro… …etailSME/86159 10 10 … IT-Italy         Company     TBD        
#>  6 /home/mauro… …tailSME/128365 11tens… GR-Greece        Company     TBD        
#>  7 /home/mauro… …etailSME/68356 12G Fl… SE-Sweden        Company     TBD        
#>  8 /home/mauro… …tailSME/128211 1D wor… DE-Germany       Company     TBD        
#>  9 /home/mauro… …etailSME/86542 1POINT… BE-Belgium       Company     TBD        
#> 10 /home/mauro… …tailSME/129014 1st-Re… AT-Austria       Company     TBD        
#> # … with 2,907 more rows, and 3 more variables: esastar_status <chr>,
#> #   page_id <chr>, details_id <chr>

Details

Ensure we have a directory in which to save the response of each request.

# The returned path is reused below to build the output file paths and to list
# the saved files.
details_dir <- create_data_raw_dir("esa_sme", "details")

Write each response to a json file.

# The last component of each details URL identifies the entity to request.
details_id <- path_file(esa_sme_summary$details_url)
# NOTE(review): the cap on detail requests reuses `params$pages_n` -- confirm
# this is intended.
if (params$pages_n != "all") {
  details_id <- head(details_id, params$pages_n)
}

# One request and one output file per id, matched by position.
paths <- path(details_dir, paste0(details_id, ".json"))
requests <- map(details_id, esa_sme_details_request)
walk2(requests, paths, esa_sme_req_write)

Transform each .json file into a row of a data frame.

jsons <- dir_ls(details_dir)
htmls <- map(jsons, esa_sme_json2html)

# Keep the lambda: the result is assigned to `esa_sme_details`, so on a re-run
# in the same session a bare `esa_sme_details` would refer to the data frame,
# whereas a call such as `esa_sme_details(.x)` still finds the function.
esa_sme_details <- map_df(htmls, ~esa_sme_details(.x), .id = "details_id")

Tidy.

esa_sme_details <- esa_sme_details %>%
  # `.id` stored the path of each source file; keep only the bare file name
  # (without extension) so it can serve as the join key.
  mutate(details_id = path_ext_remove(path_file(.data$details_id))) %>%
  clean_names() %>%
  # Select by name as a string: `.data` is deprecated in tidyselect contexts.
  # `name` is dropped here; the summary table joined later has its own `name`.
  select(-"name")

esa_sme_details
#> # A tibble: 2,917 × 17
#>    details_id description                 entity_id nationality_desc entity_code
#>    <chr>      <chr>                       <chr>     <chr>            <chr>      
#>  1 107486     "<p>We are a dynamic, youn… 0         IT-Italy         1000037016 
#>  2 107487     "HFC specializes in design… 0         DE-Germany       1000037020 
#>  3 107488     "<span style=\"font-family… 0         GB-United Kingd… 1000037023 
#>  4 107496     "<p>We develop and build u… 0         DE-Germany       1000037025 
#>  5 107497     "<p>Serendipity helps ecos… 60381     NL-The Netherla… 1000037120 
#>  6 107501     "<p>know.space is a specia… 60385     IE-Ireland       1000037191 
#>  7 127527     "<p>ATL has been active si… 70406     HU-Hungary       1000037172 
#>  8 127542     "<p>Cyblix provides soluti… 0         PT-Portugal      1000037061 
#>  9 127554     "Control Survey SRL is a R… 0         RO-Romania       1000037057 
#> 10 127555     "<p>HUGETECH is experience… 0         PL-Poland        1000037078 
#> # … with 2,907 more rows, and 12 more variables: entity_vat_number <chr>,
#> #   entity_sme_status <chr>, entity_type_desc <chr>, entity_size_desc <chr>,
#> #   address <chr>, number <chr>, city <chr>, postal_code <chr>, phone <chr>,
#> #   entity_fax <chr>, entity_mail <chr>, entity_web_site <chr>

Full dataset

Combine the summary and details datasets and tidy.

esa_sme <- esa_sme_summary %>%
  left_join(esa_sme_details, by = "details_id") %>%
  # NOTE(review): `page_id` is character, so pages sort as text ("10" before
  # "2") -- confirm this ordering is intended.
  arrange(.data$page_id, .data$name) %>%
  relocate(page_id, details_id) %>%
  # Select by name as strings: `.data` is deprecated in tidyselect contexts.
  select(-"page_path", -"details_url")

esa_sme
#> # A tibble: 2,917 × 23
#>    page_id details_id name            country_of_regist… entity_type entity_size
#>    <chr>   <chr>      <chr>           <chr>              <chr>       <chr>      
#>  1 1       59569      (GovEd Ltd) Im… GB-United Kingdom  Company     TBD        
#>  2 1       86159      10 10 ONE OBJE… IT-Italy           Company     TBD        
#>  3 1       128365     11tensors       GR-Greece          Company     TBD        
#>  4 1       68356      12G Flight Sys… SE-Sweden          Company     TBD        
#>  5 1       128211     1D works UG     DE-Germany         Company     TBD        
#>  6 1       86542      1POINT61        BE-Belgium         Company     TBD        
#>  7 1       129014     1st-Relief GmbH AT-Austria         Company     TBD        
#>  8 1       70734      21 Robots       EE-Estonia         Company     TBD        
#>  9 1       83933      221e            IT-Italy           Company     TBD        
#> 10 1       85134      27G-Technology… HU-Hungary         Company     TBD        
#> # … with 2,907 more rows, and 17 more variables: esastar_status <chr>,
#> #   description <chr>, entity_id <chr>, nationality_desc <chr>,
#> #   entity_code <chr>, entity_vat_number <chr>, entity_sme_status <chr>,
#> #   entity_type_desc <chr>, entity_size_desc <chr>, address <chr>,
#> #   number <chr>, city <chr>, postal_code <chr>, phone <chr>, entity_fax <chr>,
#> #   entity_mail <chr>, entity_web_site <chr>

Export.

# `overwrite = TRUE` replaces any previously saved copy of the dataset.
use_data(esa_sme, overwrite = TRUE)
#> ✓ Setting active project to '/home/mauro/git/pastax'
#> ✓ Saving 'esa_sme' to 'data/esa_sme.rda'
#> • Document your data (see 'https://r-pkgs.org/data.html')


2DegreesInvesting/pastax documentation built on Feb. 12, 2022, 7:46 a.m.