# Global knitr chunk options for this vignette:
# - collapse = TRUE: source code and its output are rendered in one block.
# - comment = "#>": every line of printed output is prefixed with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(studentenstatistikNRW)

The Research Organization Registry (ROR)

The Research Organization Registry (ROR) is a global, community-led registry of open persistent identifiers for research organizations. It includes IDs and metadata for more than 107,000 organizations and counting. Registry data is CC0 and openly available via a search interface, REST API, and data dump. Registry updates are curated through a community process and released at least once a month.

ROR REST API v2

Version 2 of the ROR schema and API was released on April 15, 2024 (official API documentation). Let us use the REST API to retrieve data on all German research organizations in education.

# Endpoint of the ROR REST API v2 that lists organizations.
ror_base_url <- "https://api.ror.org/v2/organizations"

# Build the query. Both filter clauses are joined with a comma into a single
# `filter` query parameter (`.multi = "comma"`).
req <- httr2::req_url_query(
  httr2::request(ror_base_url),
  filter = c("types:education", "country.country_code:DE"),
  .multi = "comma"
)

# Send the request and parse the JSON body into a nested list.
resp <- httr2::req_perform(req)
resp_body <- httr2::resp_body_json(resp)

# Total hit count reported by the API. Divided by 20 (the divisor used for
# paging throughout this vignette) and rounded up, it gives the page count.
no_of_results <- resp_body$number_of_results
no_of_pages <- ceiling(no_of_results / 20)

To determine how many pages you need to retrieve in order to obtain the entire result set, take the number_of_results field of the response (resp_body$number_of_results above), divide it by 20 (the number of results per page) and round up. Regardless of which page you are on, number_of_results indicates the total number of results matched by your request.

A query filtering on types:education and country.country_code:DE returns `r no_of_results` results. This means we have to request `r no_of_pages` pages. We can implement this with httr2::req_perform_iterative() together with httr2::iterate_with_offset().

# Endpoint of the ROR REST API v2 that lists organizations.
ror_base_url <- "https://api.ror.org/v2/organizations"

# Filtered query (education organizations located in Germany), throttled.
# The ROR API documents a limit of 2000 requests per 5-minute window, i.e.
# 2000 / 300 (about 6.7) requests per second. A rate of 10 per second would
# allow 3000 requests in 5 minutes and could exceed that limit.
req <- httr2::request(ror_base_url) |>
  httr2::req_url_query(
    filter = c("types:education", "country.country_code:DE"),
    .multi = "comma"
  ) |>
  httr2::req_throttle(rate = 2000 / 300)

# Fetch every result page. After the initial request, `iterate_with_offset()`
# sets the `page` query parameter for each follow-up request. `resp_pages`
# derives the total page count from `number_of_results` (20 results per page),
# which lets the iteration stop on its own; `max_reqs = Inf` therefore only
# lifts the default cap on the number of requests.
resps <- httr2::req_perform_iterative(
  req,
  next_req = httr2::iterate_with_offset(
    param_name = "page",
    resp_pages = \(resp) {
      ceiling(httr2::resp_body_json(resp)$number_of_results / 20)
    }
  ),
  max_reqs = Inf
)

# Turn the list of page responses into one tibble with a row per organization:
# 1. parse each response body and keep only its `items` element,
# 2. flatten the per-page lists into a single list of organizations,
# 3. wrap that list in a one-column tibble and spread each organization's
#    fields into their own columns.
df_ror_germany_education <- resps |>
  purrr::map(\(resp) purrr::pluck(httr2::resp_body_json(resp), "items")) |>
  purrr::list_flatten() |>
  tibble::tibble() |>
  tidyr::unnest_wider(col = 1)

df_ror_germany_education

Extracting the data in the list columns for each German research organization in education

Because df_ror_germany_education contains many list columns, extracting data can be quite cumbersome. Here are workflows for the most important list columns.

Names

For illustrative purposes, this is one way of retrieving all content from df_ror_germany_education[["names"]] for each organization.

# One row per name entry of each organization.
# `unnest_longer()` creates a row per element of the `names` list column and,
# because of `indices_include = TRUE`, an index column `names_id`.
# The two `unnest_wider()` calls then spread the fields of each name entry and
# the elements of its `types` field (as `types_*` columns).
df_names <- df_ror_germany_education[c("id", "names")] |>
  tidyr::unnest_longer(col = "names", indices_include = TRUE) |>
  tidyr::unnest_wider(col = "names") |>
  tidyr::unnest_wider(col = "types", names_sep = "_") |>
  dplyr::select(
    id,
    names_id,
    name = value,
    lang,
    dplyr::starts_with("types")
  )

df_names

Links

This is one way of retrieving all content from df_ror_germany_education[["links"]] for each organization.

# One row per link entry of each organization: lengthen the `links` list
# column (keeping the element index as `links_id`), then spread the fields of
# each link entry into columns.
df_links <- df_ror_germany_education[c("id", "links")] |>
  tidyr::unnest_longer(col = "links", indices_include = TRUE) |>
  tidyr::unnest_wider(col = "links")

df_links

Locations

This is one way of retrieving all content from df_ror_germany_education[["locations"]] for each organization.

# One row per location entry of each organization: lengthen the `locations`
# list column, spread the fields of each location entry, then spread the
# nested `geonames_details` field into its own columns.
df_locations <- df_ror_germany_education[c("id", "locations")] |>
  tidyr::unnest_longer(col = "locations") |>
  tidyr::unnest_wider(col = "locations") |>
  tidyr::unnest_wider(col = "geonames_details")

df_locations

External IDs

This is one way of retrieving all content from df_ror_germany_education[["external_ids"]] for each organization.

# One row per external-ID entry of each organization.
# `external_ids_id` is the element index added by `indices_include = TRUE`.
# The elements of each entry's `all` field are spread into `all_*` columns.
df_external_ids <- df_ror_germany_education[c("id", "external_ids")] |>
  tidyr::unnest_longer(col = "external_ids", indices_include = TRUE) |>
  tidyr::unnest_wider(col = "external_ids") |>
  tidyr::unnest_wider(col = "all", names_sep = "_") |>
  dplyr::select(
    id,
    external_ids_id,
    type,
    preferred,
    dplyr::starts_with("all")
  )

df_external_ids

# Keep only the external-ID rows whose type is Wikidata.
dplyr::filter(df_external_ids, type == "wikidata")


RichardMeyer-Eppler/studentenstatistikNRW documentation built on July 27, 2024, 3:14 a.m.