From dataset To RDF
In dataset: Create Data Frames that are Easier to Exchange and Reuse

library(knitr)
# Default knitr chunk options for every chunk that follows in this vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  message = FALSE,
  comment = "#>",
  out.width = "90%"
)
library(here)
library(readxl)
library(kableExtra)

library(dataset)

Our datasets are defined in a way that their dimensions can be easily and unambiguously reduced to triples for RDF applications; they can be easily serialized to, or synchronized with semantic web applications with the rdflib package^[Carl Boettiger: A tidyverse lover’s intro to RDF].

Read more about how we adopted the datacube model of SDMX and the RDF Data Cube Vocabulary for datasets as R objects in the vignette article "The dataset S3 Class".

Because our datasets conform to the tidy data concept, they can be reduced into long-form triples.

|             |         |             |             |
|-------------|---------|-------------|-------------|
| RDF         | subject | predicate   | object      |
| JSON        | object  | property    | value       |
| spreadsheet | row id  | column name | cell        |
| data.frame  | key     | variable    | measurement |
| data.frame  | key     | attribute   | value       |

Table source: rdflib

Dimension reductions of the dataset

Our datasets are tidy.

# Read the "dataset-wide" sheet of the example workbook that is looked up
# inside the installed dataset package.
example_df <- readxl::read_excel(
  path = system.file("extdata", "rdf_example.xlsx", package = "dataset"),
  sheet = "dataset-wide"
)

# Declare the role of each column: three dimensions, one measure and
# two attributes.
example_dataset <- dataset(
  x = example_df,
  Dimensions = c("time", "geo", "sex"),
  Measures = "value",
  Attributes = c("freq", "status")
)

# Store "rowid" in the local_id attribute of the dataset.
attr(example_dataset, "local_id") <- "rowid"

You can start to reduce the dimensions, for example, by uniting them. In this case, the row identifier becomes more and more like a uniform resource identifier, i.e. a URI.

# Embed the first chart from the vignettes folder.
knitr::include_graphics(here::here("vignettes", "RDF_chart_1.png"))

Eventually you can reduce the entire dataset into triples. The URI uniquely identifies each observation, the component preserves the main structural element of the W3C/SDMX datacube model, and the value field holds the value of the dimension, measurement or attribute.

# Pass the example dataset through dataset_uri() and show the result.
example_ds <- dataset_uri(ds = example_dataset)
example_ds

# Show only the URI and value columns.
subset(example_ds, select = c("URI", "value"))

# Path of the N-Quads file, placed in the session's temporary directory.
nq_file <- file.path(tempdir(), "triple_file.nq")

library(rdflib)

# Create a new rdf object to collect the statements.
rdf <- rdf()

# Add one statement per row of example_ds: the subject is the empty string,
# the URI column supplies the predicate and the value column the object.
# The return value of rdf_add() is not kept; `rdf` itself is serialized below.
for (i in seq_len(nrow(example_ds))) {
  rdf_add(
    rdf,
    subject = "",
    predicate = example_ds$URI[i],
    object = example_ds$value[i]
  )
}

# Write the statements to the temporary file ...
rdf_serialize(rdf, doc = nq_file)

# ... and parse that file again to show its contents.
rdf_parse(nq_file)

library(dplyr)

# Reshape the dataset into a long (URI, predicate, object) table.
# All columns are cast to character first, so that dimensions, attributes and
# the measured value can share the single `object` column.
long_dataset <- example_dataset %>%
  # Part 1: every column except the measured value.
  select(-all_of("value")) %>%
  mutate(across(everything(), as.character)) %>%
  tidyr::pivot_longer(
    cols = any_of(c("geo", "sex", "time", "unit", "freq", "status")),
    names_to = "predicate",
    values_to = "object"
  ) %>%
  # Part 2: the measured value, keyed by the same row identifier.
  bind_rows(
    example_dataset %>%
      select(all_of(c("rowid", "value"))) %>%
      mutate(across(everything(), as.character)) %>%
      tidyr::pivot_longer(
        cols = any_of("value"),
        names_to = "predicate",
        values_to = "object"
      )
  ) %>%
  # The row identifier becomes the subject URI of each statement.
  rename(URI = "rowid") %>%
  mutate(URI = paste0("https://example.org/my_data/", .data$URI))

long_dataset %>% head()

# Embed the second chart from the vignettes folder.
knitr::include_graphics(here::here("vignettes", "RDF_chart_2.png"))

# Second rdf object, filled from the long-form table.
rdf2 <- rdf()

# One statement per row of long_dataset, using its URI, predicate and object
# columns. The return value of rdf_add() is not kept; `rdf2` is printed below.
for (i in seq_len(nrow(long_dataset))) {
  rdf_add(
    rdf2,
    subject = long_dataset$URI[i],
    predicate = long_dataset$predicate[i],
    object = long_dataset$object[i]
  )
}

rdf2

# Replace both rdf objects with NULL now that they are no longer used.
rdf <- rdf2 <- NULL

The benefit of standard codelists

In this example, except for the measurement of the observation, we used only SDMX-attribute-conformant variable names and codes. The advantage of this approach is that it is very easy to increase the dimensions of the dataset, and to add human-readable labels, potentially in many natural languages.

set.seed(2022)
library(statcodelists)

# NOTE(review): `example_long` is not created in the visible part of this
# file; the code below needs it to have `component` and `value` columns.
# Confirm where it is defined.

# Keep the rows of one component and attach the matching codelist entries.
# The codelist's `id` column is renamed to `value` so that it can serve as
# the join key.
label_component <- function(long_data, component_name, codelist) {
  long_data %>%
    filter(.data$component == component_name) %>%
    left_join(rename(codelist, value = "id"), by = "value")
}

# Label the sex, freq and status components, stack them in that order, and
# show a 30% random sample of each component as a formatted table.
bind_rows(
  label_component(example_long, "sex", statcodelists::CL_SEX),
  label_component(example_long, "freq", statcodelists::CL_FREQ),
  label_component(example_long, "status", statcodelists::CL_OBS_STATUS)
) %>%
  group_by(.data$component) %>%
  sample_frac(size = 0.3) %>%
  ungroup() %>%
  kableExtra::kbl() %>%
  kableExtra::kable_paper()