NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
CONFIGURED <- !is.null(getOption("dw.auth_token"))
knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "##",
  purl = NOT_CRAN && CONFIGURED,
  eval = NOT_CRAN && CONFIGURED
)

Querying

Datasets on data.world can be queried using SQL and SPARQL. A specific dataset is the starting point for a query, but its not the end. The data in that dataset can be joined with data from any other dataset on the platform.

If you are unfamiliar with or interested in learning more about SQL and SPARQL, check out our tutorials:

SQL queries

Inspecting the schema

When using SQL, it's a good idea to start by understanding the schema inferred by data.world from the data that has been uploaded. That is done by querying the Tables table.

intro_ds <- "https://data.world/jonloyens/an-intro-to-dataworld-dataset"
tables_df <- data.world::query(
  data.world::qry_sql("SELECT * FROM Tables"),
  dataset = intro_ds
)
tables_df

It is also possible to obtain information about columns. For that, use the TableColumns table instead. Note that this time, we use the alternate means of calling the qry_sql constructor, by separating the dataset owner and dataset id.

Try:

intro_ds_owner <- "jonloyens"
intro_ds <- "an-intro-to-dataworld-dataset"
columns_df <- data.world::query(
  data.world::qry_sql("SELECT * FROM TableColumns"),
  owner_id = intro_ds_owner,
  dataset_id = intro_ds
)
columns_df

SELECT queries

Just like the queries above, any SQL query can be created using qry_sql.
For example:

assists_vs_height <- data.world::qry_sql(paste0(
  "SELECT t.Name, t.Height, s.AssistsPerGame ",
  "FROM DataDotWorldBBallTeam as t ",
  "JOIN DataDotWorldBBallStats as s ON t.Name = s.Name ",
  "ORDER BY s.AssistsPerGame DESC"))

data.world::query(assists_vs_height, owner_id = intro_ds_owner,
  dataset_id = intro_ds)

Parameterized queries

SQL queries can be parameterized for maximum flexibility. Parameters are always positional and can be provided via the qry_sql object.
For example:

assists_greater_than <- data.world::qry_sql(paste0(
  "SELECT t.Name, t.Height, s.AssistsPerGame ",
  "FROM DataDotWorldBBallTeam as t ",
  "JOIN DataDotWorldBBallStats as s ON t.Name = s.Name ",
  "WHERE s.AssistsPerGame > ? ",
  "ORDER BY s.AssistsPerGame DESC"))

assists_greater_than$params <- c(10)
data.world::query(assists_greater_than, intro_ds_owner, intro_ds)

SPARQL

Running SPARQL queries is very similar to running SQL queries, except that the qry_sparql constructor is used instead.

Additionally, instead of a tabular schema, with SPARQL one queries a data graph.

SELECT queries

The previous SELECT query from the SQL example, translated to SPARQL, looks like this:

assists_vs_height <- data.world::qry_sparql(paste(
  "BASE <http://data.world/jonloyens/an-intro-to-dataworld-dataset/> ",
  "PREFIX t: <DataDotWorldBBallTeam.csv/DataDotWorldBBallTeam#> ",
  "PREFIX s: <DataDotWorldBBallStats.csv/DataDotWorldBBallStats#> ",
  "SELECT ?name ?height ?assists WHERE { ",
  "  ?pt t:Name ?name . ",
  "  ?ps s:Name ?name . ", # the join column 
  "  ?pt t:Height ?height . ",
  "  ?ps s:AssistsPerGame ?assists . ",
  "} ",
  "ORDER BY DESC (?assists)", sep = "\n"
))

data.world::query(assists_vs_height, intro_ds_owner, intro_ds)

Parameterized SPARQL queries

Similarly, the previous parameterized query from the SQL example can be translated to SPARQL as follows:

assists_greater_than <- data.world::qry_sparql(paste(
  "BASE <http://data.world/jonloyens/an-intro-to-dataworld-dataset/> ",
  "PREFIX t: <DataDotWorldBBallTeam.csv/DataDotWorldBBallTeam#> ",
  "PREFIX s: <DataDotWorldBBallStats.csv/DataDotWorldBBallStats#> ",
  "SELECT ?name ?height ?assists WHERE { ",
  "  ?pt t:Name ?name . ",
  "  ?ps s:Name ?name . ", # the join column 
  "  ?pt t:Height ?height . ",
  "  ?ps s:AssistsPerGame ?assists . ",
  "  FILTER(?assists > $v1) ",
  "} ",
  "ORDER BY DESC (?assists)", sep = "\n"
))
assists_greater_than$params <- c("$v1" = 10)
data.world::query(assists_greater_than, intro_ds_owner, intro_ds)

Notice that in the SPARQL case, parameters are always named.

Next

Queries are a powerful feature and are a great tool for pulling and wrangling data.
For additional examples, check out our tutorials and published queries here and here.



datadotworld/data.world-r documentation built on July 2, 2021, 2:17 a.m.