prov: A Semantic Data Provenance Index

Documented in prov write_prov

#' Write a provenance trace into JSON-LD
#'
#' @param data_in path or URI for input data
#' @param code path or URI for code
#' @param data_out path or URI to output data
#' @param meta path or URI to metadata describing the workflow
#' @param creator URI, list node, or text for creator
#' @param title Dataset title, character string
#' @param description Dataset description, character string
#' @param issued publication date, as Date or character object
#' @param license URL to a copyright license
#' @param provdb path to output JSON file, default "prov.json"
#' @param append Should we append to existing json or overwrite it?
#' @param schema Use schema.org or DCAT2 schema? See details.
#' @param embed_actions should we include schema:Action to create?
#' @param ... additional named elements passed to Dataset
#' @details 
#' 
#' If creator, title, and description are all empty, will serialize
#' only a graph of distribution (data download) elements, not a 
#' Dataset. 
#' 
#' Additional elements passed through `...` must be explicitly namespaced,
#' e.g. `dcat:version`, when using DCAT2 schema. When using schema.org,
#' elements must be in schema.org namespace.
#' 
#' Provenance can be expressed in (purely) schema.org or as DCAT2 
#' (includes terms from DCTERMS, PROV, DCAT2, CITO ontologies). 
#' The latter is more expressive in terms of provenance.
#' Also note DCAT2 but not schema.org can explicitly encode compression and
#' metadata file relationships.
#' @export
#'
#' @examples
#'  
#' ## Use temp files for illustration only
#' provdb <- tempfile(fileext = ".json")
#' input_data <- tempfile(fileext = ".csv")
#' output_data <- tempfile(fileext = ".csv")
#' code <- tempfile(fileext = ".R")
#' 
#' ## A minimal workflow: 
#' write.csv(mtcars, input_data)
#' out <- lm(mpg ~ disp, data = mtcars)
#' write.csv(out$coefficients, output_data)
#' 
#' # really this would already exist...
#' writeLines("out <- lm(mpg ~ disp, data = mtcars)", code)
#' 
#' ## And here we go: 
#' write_prov(input_data, code, output_data, provdb = provdb,  
#'            append= FALSE)
#'  
#' ## Include a title to group these into a Dataset:
#' write_prov(input_data, code, output_data, provdb = provdb,
#'            title = "example dataset with provenance",  append= FALSE)
#'            
write_prov <- function(
  data_in = NULL,
  code = NULL,
  data_out = NULL,
  meta = NULL,
  creator = NULL,
  title = NULL,
  description = NULL,
  issued = as.character(Sys.Date()),
  license = "https://creativecommons.org/publicdomain/zero/1.0/legalcode",
  provdb = "prov.json",
  append = TRUE,
  schema = c("http://schema.org", "http://www.w3.org/ns/dcat"),
  embed_actions = is.null(code),
  ...
) {
  # Reduce the vector of allowed vocabularies to the single one selected
  # (the first entry when the caller did not supply `schema`).
  schema <- match.arg(schema)

  # Step 1: assemble the provenance record in memory. Every descriptive
  # argument, plus anything supplied through `...`, is forwarded unchanged.
  trace <- prov(
    data_in = data_in,
    code = code,
    data_out = data_out,
    meta = meta,
    creator = creator,
    title = title,
    description = description,
    issued = issued,
    license = license,
    schema = schema,
    embed_actions = embed_actions,
    ...
  )

  # Step 2: hand the record to the JSON-LD writer, which receives the target
  # path and the `append` flag. Its result is this function's return value.
  write_jsonld(trace, provdb, append, schema = schema)
}

#' Generate provenance information
#'
#' Builds the provenance record that [write_prov()] serializes, using either
#' the schema.org or the DCAT2 vocabulary.
#'
#' @inheritParams write_prov
#' @return If `creator`, `title` and `description` are all `NULL`, a list
#'   with a single `"@graph"` element holding the file-level provenance
#'   elements. Otherwise the dataset object built around those elements; when
#'   `embed_actions` is `TRUE` under schema.org and `Action` elements were
#'   found, the dataset and the actions are returned together in a list with
#'   a single `"@graph"` element.
#' @export
prov <- function(
  data_in = NULL,
  code = NULL,
  data_out = NULL,
  meta = NULL,
  creator = NULL,
  title = NULL,
  description = NULL,
  issued = as.character(Sys.Date()),
  license = "https://creativecommons.org/publicdomain/zero/1.0/legalcode",
  schema = c("http://schema.org", "http://www.w3.org/ns/dcat"),
  embed_actions = FALSE,
  ...
) {
  # `match.arg()` guarantees `schema` is exactly one of the two vocabularies,
  # so a single flag is enough to dispatch on from here on.
  schema <- match.arg(schema)
  is_schema_org <- schema == "http://schema.org"

  # File-level provenance elements. Only the DCAT2 builder is given `meta`.
  files <- if (is_schema_org) {
    schema_provenance(
      data_in = data_in,
      code = code,
      data_out = data_out
    )
  } else {
    dcat_provenance(
      data_in = data_in,
      code = code,
      data_out = data_out,
      meta = meta
    )
  }

  # With no creator, title or description there is nothing to describe a
  # dataset with, so return the bare graph of file elements.
  if (is.null(c(creator, title, description))) {
    return(list("@graph" = files))
  }

  actions <- list()

  if (embed_actions) {
    if (is_schema_org) {
      # When packaging as a dataset, elements typed "Action" must not sit
      # inside the distribution: pull them out and keep them to one side.
      node_type <- lookup(files, "type")
      actions <- files[node_type == "Action"]
      files <- files[node_type != "Action"]
    }
  }

  # Wrap the remaining file elements as the dataset's distribution.
  dataset <- if (is_schema_org) {
    schema_dataset(
      distribution = files,
      creator = creator,
      title = title,
      description = description,
      issued = issued,
      license = license,
      ...
    )
  } else {
    dcat_dataset(
      distribution = files,
      creator = creator,
      title = title,
      description = description,
      issued = issued,
      license = license,
      ...
    )
  }

  # No separated actions: the dataset alone is the result.
  if (length(actions) == 0) {
    return(dataset)
  }

  # In schema.org the actions are listed separately, alongside the dataset.
  list("@graph" = list(dataset, actions))
}