Motivation.R
In dataset: Create Data Frames that are Easier to Exchange and Reuse

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
if (!requireNamespace("dplyr", quietly = TRUE) || !requireNamespace("tidyr", quietly = TRUE)) {
  stop("Please install 'dplyr' and 'tidyr' to run this vignette.")
}

library(dplyr)
library(tidyr)

## ----setup--------------------------------------------------------------------
library(dataset)

## ----example-ambiguity--------------------------------------------------------
data.frame(
  geo = c("LI", "SM"),
  CPI = c("0.8", "0.9"),
  GNI = c("8976", "9672")
)

## ----smallcountrydataset------------------------------------------------------
options(sciphen = 4)
small_country_dataset <- dataset_df(
  country_name = defined(
    c("AD", "LI"),
    concept = "http://data.europa.eu/bna/c_6c2bb82d",
    namespace = "https://www.geonames.org/countries/$1/"
  ),
  gdp = defined(
    c(3897, 7365),
    label = "Gross Domestic Product",
    unit = "million dollars",
    concept = "http://data.europa.eu/83i/aa/GDP"
  ),
  population = defined(
    c(77543, 40015),
    label = "Population",
    concept = "http://data.europa.eu/bna/c_f2b50efd"
  ),
  dataset_bibentry = dublincore(
    creator = person(given = "Jane", family = "Doe"),
    title = "Small Country Dataset",
    publisher = "Reprex"
  )
)

## ----percapita----------------------------------------------------------------
small_country_dataset$gdp_capita <- defined(
  small_country_dataset$gdp * 1e6 / small_country_dataset$population,
  unit = "dollar",
  label = "GDP Per Capita"
)

## ----gdpexmampel, warning=FALSE, message=FALSE--------------------------------
library(tibble)

# Dataset D (GDP in billions of USD)
D <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "CAN", "USA", "CAN", "USA", "CAN"),
  country_name = c("United States", "Canada", "United States", "Canada", "United States", "Canada"),
  GDP = c(21000, 2000, 22000, 2100, 23000, 2200) # GDP in billions of USD
)

# Dataset E (GDP in billions of EUR)
E <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "FRA", "USA", "FRA", "USA", "FRA"),
  country_name = c("United States", "France", "United States", "France", "United States", "France"),
  GDP = c(18000, 2500, 19000, 2600, 20000, 2700) # GDP in billions of EUR
)

## ----ambigousjoin-------------------------------------------------------------
full_join(D, E)

## ----strintc, eval=FALSE------------------------------------------------------
# GDP_D <- defined(c(21000, 2000, 22000, 2100, 23000, 2200), unit = "M_USD")
# GDP_E <- defined(c(18000, 2500, 19000, 2600, 20000, 2700), unit = "M_EUR")
# c(GDP_D, GDP_E)

## ----errormessage-------------------------------------------------------------
message("Error in c.haven_labelled_defined(GDP_D, GDP_E) :
  c.haven_labelled_defined(x,y): x,y must have no unit or the same unit")

## ----matchigunits-------------------------------------------------------------
GDP_F <- defined(c(21000, 2000, 22000, 2100, 23000, 2200), unit = "M_EUR")
GDP_E <- defined(c(18000, 2500, 19000, 2600, 20000, 2700), unit = "M_EUR")
c(GDP_F, GDP_E)

## ----d-enriched---------------------------------------------------------------
D_enriched <- dataset_df(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "CAN", "USA", "CAN", "USA", "CAN"),
  country_name = c("United States", "Canada", "United States", "Canada", "United States", "Canada"),
  GDP = c(21000, 2000, 22000, 2100, 23000, 2200),
  dataset_bibentry = dataset::dublincore(
    title = "North American GDP Dataset",
    description = "Dataset containing GDP data for North American countries.",
    creator = person("Daniel", "Antal"), # Replace with the actual creator
    publisher = "Reprex",
    dataset_date = Sys.Date(),
    subject = "GDP"
  )
)

D_enriched

## ----ambigousjoinwithdplyr----------------------------------------------------
library(tibble)

# Dataset D (GDP in billions of USD)
D <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "CAN", "USA", "CAN", "USA", "CAN"),
  country_name = c(
    "United States", "Canada", "United States",
    "Canada", "United States", "Canada"
  ),
  GDP = c(21000, 2000, 22000, 2100, 23000, 2200) # GDP in billions of USD
)

# Dataset E (GDP in billions of EUR)
E <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "FRA", "USA", "FRA", "USA", "FRA"),
  country_name = c("United States", "France", "United States", "France", "United States", "France"),
  GDP = c(18000, 2500, 19000, 2600, 20000, 2700) # GDP in billions of EUR
)

joined_data <- dplyr::full_join(D, E)
joined_data

## ----two-gdp-datasets---------------------------------------------------------
library(tibble)

# Dataset D (GDP in billions of USD)
D <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "CAN", "USA", "CAN", "USA", "CAN"),
  country_name = c(
    "United States", "Canada", "United States",
    "Canada", "United States", "Canada"
  ),
  GDP = c(21000, 2000, 22000, 2100, 23000, 2200) # GDP in billions of USD
)

# Dataset E (GDP in billions of EUR)
E <- tibble(
  year = c(2020, 2020, 2021, 2021, 2022, 2022),
  geocode = c("USA", "FRA", "USA", "FRA", "USA", "FRA"),
  country_name = c(
    "United States", "France", "United States",
    "France", "United States", "France"
  ),
  GDP = c(18000, 2500, 19000, 2600, 20000, 2700) # GDP in billions of EUR
)

# Combine datasets
combined <- bind_rows(D, E)

# Convert all to character *before* pivoting
combined_char <- combined %>%
  mutate(across(everything(), as.character))

# Rename geocode to subject
combined_subject <- combined_char %>%
  rename(subject = geocode)

# Pivot longer, excluding the subject (triples are named, but predicates are not renamed yet)
named_triples <- combined_subject %>%
  pivot_longer(
    cols = c(year, country_name, GDP),
    names_to = "predicate",
    values_to = "object"
  )

# Replace geocodes with Geonames IDs and add datatype annotations
geonames_mapping <- tibble(
  subject = c("USA", "CAN", "FRA"),
  geonames_id = c("6252001", "6251999", "2988317")
)

rdf_triples <- named_triples %>%
  left_join(geonames_mapping, by = "subject") %>%
  mutate(
    subject = paste0("geonames:", geonames_id),
    predicate = case_when(
      predicate == "year" ~ "sdmx:TIME_PERIOD",
      predicate == "country_name" ~ "schema:name",
      predicate == "GDP" ~ "sdmx:OBS_VALUE",
      TRUE ~ predicate
    ),
    object = case_when(
      predicate == "sdmx:TIME_PERIOD" ~ paste0("\"", object, "\"^^<http://www.w3.org/2001/XMLSchema#gYear>"),
      predicate == "sdmx:OBS_VALUE" ~ paste0("\"", object, "\"^^<http://www.w3.org/2001/XMLSchema#double>"),
      TRUE ~ object
    )
  ) %>%
  select(subject, predicate, object)

print("RDF Triples (subject, predicate, object):")
print(rdf_triples)

## ----smallmusiciandataset1----------------------------------------------------
small_country_musicians <- data.frame(
  qid = c("Q275912", "Q116196078"),
  artist_name = defined(
    c("Marta Roure", "wavvyboi"),
    concept = "https://www.wikidata.org/wiki/Property:P2093"
  ),
  location = defined(
    c("Andorra", "Lichtenstein"),
    concept = "https://www.wikidata.org/wiki/Property:P276"
  ),
  date_of_birth = defined(
    c(as.Date("1981-01-16"), as.Date("1998-04-28")),
    concept = "https://www.wikidata.org/wiki/Property:P569"
  )
)

small_country_musicians$age <- defined(
  2024 - as.integer(
    substr(
      as.character(small_country_musicians$date_of_birth),
      1, 4
    )
  ),
  label = "Age in 2024"
)

Any scripts or data that you put into this service are public.

dataset documentation built on June 8, 2025, 10:15 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dataset
Create Data Frames that are Easier to Exchange and Reuse

inst/doc/Motivation.R
In dataset: Create Data Frames that are Easier to Exchange and Reuse

Try the dataset package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataset Create Data Frames that are Easier to Exchange and Reuse

inst/doc/Motivation.R In dataset: Create Data Frames that are Easier to Exchange and Reuse

Try the dataset package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataset
Create Data Frames that are Easier to Exchange and Reuse

inst/doc/Motivation.R
In dataset: Create Data Frames that are Easier to Exchange and Reuse