R/csvwr.R

Defines functions transform_datetime_format base_uri

Documented in base_uri transform_datetime_format

#' csvwr: Read and write CSV on the Web (CSVW)
#'
#' Read and write csv tables annotated with
#' \href{https://w3c.github.io/csvw/metadata/}{csvw metadata}. This helps to
#' ensure consistent processing and reduce the amount of manual work needed to
#' parse and prepare data before it can be used in analysis.
#'
#' @section Getting started:
#'
#' The best place to start is the
#' \href{../doc/read-write-csvw.html}{Reading and Writing CSVW} vignette.
#'
#' @section Reading annotated tables:
#' \itemize{
#'   \item \code{\link{read_csvw}} Parse a table group
#'   \item \code{\link{read_csvw_dataframe}} Parse a table group and extract the first
#'     data frame
#' }
#'
#' @section Writing table annotations:
#' \itemize{
#'   \item \code{\link{derive_table_schema}} Derive table schema from a data frame
#'   \item \code{\link{create_metadata}} Create a table group annotation
#'   \item \code{\link{derive_metadata}} Derive an annotation from a csv file
#' }
#'
#' @docType package
#' @name csvwr
#' @importFrom magrittr %>%
#' @importFrom rlang %||%
#' @keywords internal
"_PACKAGE" # roxygen needs to document something! https://r-pkgs.org/man.html#man-packages

# https://github.com/tidyverse/magrittr/issues/29
# Otherwise R Check complains:
# > no visible binding for global variable ‘.’
globalVariables(".")

#' Retrieve the base URI from configuration
#'
#' @return returns the value of `csvwr_base_uri` option, defaulting to `example.net`
#' @examples
#' \dontrun{
#' base_uri() # returns default
#'
#' options(csvwr_base_uri="http://www.w3.org/2013/csvw/tests/")
#' base_uri()
#' }
#' @export
base_uri <- function() {
  getOption("csvwr_base_uri", "http://example.net/")
}

#' Transform date/time format string from Unicode TR35 to POSIX 1003.1
#'
#' As per the [csvw specification for date and time
#' formats](https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#h-formats-for-dates-and-times)
#' we accept format strings using the [date field symbols defined in unicode
#' TR35](https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table).
#' These are converted to POSIX 1003.1 date format strings for use in
#' [base::strptime()] or [readr::parse_date()]/[readr::parse_datetime()].
#'
#' @param format_string a UAX35 date format string
#' @return a POSIX date format string
#' @examples
#' \dontrun{
#' fmt <- transform_datetime_format("dd.MM.yyyy")
#' strptime("01.01.2001", format=fmt)
#' }
#' @md
#' @export
transform_datetime_format <- function(format_string) {
  format_string %>%
    gsub("yyyy", "%Y", .) %>%
    gsub("dd", "%d", .) %>%
    gsub("MM", "%m", .) %>%
    gsub("(?<!%)d", "%d", ., perl=T) %>%
    gsub("(?<!%)M", "%m", ., perl=T) %>%
    gsub("HH", "%H", .) %>%
    gsub("mm", "%M", .)
}


#' Map csvw datatypes to R types
#'
#' Translate [csvw datatypes](https://www.w3.org/TR/tabular-metadata/#datatypes) to R types.
#' This implementation currently targets [readr::cols] column specifications.
#'
#' @param datatypes a list of csvw datatypes
#' @return a `readr::cols` specification - a list of collectors
#' @examples
#' \dontrun{
#' cspec <- datatype_to_type(list("double", list(base="date", format="yyyy-MM-dd")))
#' readr::read_csv(readr::readr_example("challenge.csv"), col_types=cspec)
#' }
#' @md
#' @export
datatype_to_type <- function(datatypes) {
  datatypes %>% purrr::map(function(datatype) {
    if(is.list(datatype)) {
      # complex types (specified with a list)
      switch(datatype$base %||% "string",
             boolean = readr::col_character(), # TODO: parse as logical
             date = readr::col_date(format=transform_datetime_format(datatype$format)),
             datetime = readr::col_datetime(format=transform_datetime_format(datatype$format)),
             decimal = readr::col_double(), # TODO: validate max/min
             string = readr::col_character(), # TODO: use format regex in validation
             stop("unrecognised complex datatype: ", datatype))
    } else {
      # simple types (specified with a string)
      switch(datatype %||% "string",
             integer = readr::col_integer(), # TODO: more variants
             double = readr::col_double(),
             float = readr::col_double(),
             number = readr::col_double(),
             decimal = readr::col_double(),
             string = readr::col_character(), # TODO: more variants
             boolean = readr::col_logical(),
             date = readr::col_date(),
             datetime = readr::col_datetime(),
             time = readr::col_time(),
             duration = readr::col_character(),
             gDay = readr::col_character(), # TODO: xsd datatypes
             gMonth = readr::col_character(),
             gMonthDay = readr::col_character(),
             gYear = readr::col_character(),
             gYearMonth = readr::col_character(),
             xml = readr::col_character(),
             html = readr::col_character(),
             json = readr::col_character(),
             binary = readr::col_character(), # Base 64
             hexBinary = readr::col_character(),
             QName = readr::col_character(),
             anyURI = readr::col_character(),
             any = readr::col_character(),
             normalizedString = readr::col_character(),
             stop("unrecognised simple datatype: ", datatype))
    }
    # TODO: value and length constraints
  })
}


#' Map R types to csvw datatype
#'
#' Translate R types to [csvw datatypes](https://www.w3.org/TR/tabular-metadata/#datatypes).
#' Acts as an inverse of `datatype_to_type` but doesn't provide a 1:1 correspondence.
#'
#' @param types a list of R types
#' @return a list of csvw datatypes
#' @md
#' @export
type_to_datatype <- function(types) {
  types %>% purrr::map(function(type) {
    switch(type[[1]],
           numeric = "number",
           integer = "integer",
           double = "double",
           character = "string",
           factor = "string",
           logical = "boolean",
           Date = "date",
           "string")
  })
}

#' Read CSV on the Web
#'
#' If the argument to `filename` is a json metadata document, this will be used to find csv files for
#' each table using the value of `csvw:url`.
#'
#' If the argument to `filename` is a csv file, and no `metadata` is provided, an attempt is made to
#' derive metadata.
#'
#' If the argument to `filename` is a csv file, and the `metadata` is provided, then the given csv will
#' override the value of `csvw:url`.
#'
#' The csvw metadata is returned as a list. In each table in the table group, an element named
#' `dataframe` is added which provides the contents of the csv table parsed into a data frame using
#' the table schema.
#'
#' @param filename a path for a csv table or a json metadata document
#' @param metadata optional user metadata
#' @return csvw metadata list, with a `dataframe` property added to each table
#' @examples
#' \dontrun{
#' read_csvw("metadata.json")
#' read_csvw("table.csv", "metadata.json")
#' }
#' @export
#' @md
read_csvw <- function(filename, metadata=NULL) {
  metadata <- locate_metadata(filename, metadata)

  metadata$tables <- lapply(metadata$tables,
                            try_add_dataframe,
                            filename=filename,
                            group=metadata)

  metadata <- normalise_metadata(metadata, location=filename)

  return(metadata)
}

#' Add data frame to csvw table annotation
#'
#' @param table a `csvw:Table` annotation
#' @param filename a filename/ URL for the csv table
#' @param group a list of metadata for the table group to use as a fallback
#' @return a table annotation with a `dataframe` attribute added with data frame
#' holding the contents of the table
#' @md
#' @keywords internal
add_dataframe <- function(table, filename, group) {
  schema <- table$tableSchema %||% group$tableSchema
  dialect <- override_defaults(table$dialect, group$dialect, default_dialect)
  if(is.null(schema)) {
    # if we need to derive a default schema, then set this on the table itself
    table$tableSchema <- schema <- default_schema(filename, dialect)
  }
  csv_url <- locate_table(filename, table$url)

  table_columns <- schema$columns[!coalesce_truth(schema$columns[["virtual"]]), ]
  column_names <- table_columns$name
  column_types <- datatype_to_type(table_columns$datatype)

  dtf <- readr::read_csv(csv_url,
                         trim_ws=T,
                         skip=dialect$headerRowCount,
                         col_names=column_names,
                         col_types=column_types)
  table$dataframe <- dtf

  table
}

#' Try to add a dataframe to the table
#'
#' If this fails, a list describing the error is added instead
#'
#' @param table a `csvw:Table` annotation
#' @param ... arguments to `add_dataframe`
#' @return A table annotation with a `dataframe` attribute added with data frame
#' holding the contents of the table or an error.
#' @md
#' @keywords internal
try_add_dataframe <- function(table, ...) {
  tryCatch(add_dataframe(table, ...),
           error=function(e) {
             cli::cli_alert_danger("Could not read data frame from {.path {table$url}} because of {e$message}")
             table$dataframe <- list(error=e,  ...)
             table
            })
}

#' Read a data frame from the first table in a csvw
#'
#' Wrapper around `read_csvw` convenient when you're only interested in the data and there's only one table
#' @param filename a path for a csv table or a json metadata document
#' @param metadata optional user metadata
#' @return A data frame parsed using the table schema
#' @export
read_csvw_dataframe <- function(filename, metadata=NULL) {
  read_csvw(filename, metadata)$tables[[1]]$dataframe
}

#' Locate metadata for a table
#'
#' Follows the procedure defined in the [csvw model](https://w3c.github.io/csvw/syntax/#locating-metadata):
#'
#' 1. Metadata supplied by the user
#' 1. Metadata referenced by a link header
#' 1. Metadata located through default paths
#' 1. Metadata embedded in the file
#'
#' We extend this to use the [derive_metadata] function to inspect the table itself.
#'
#' @param filename a path for a csv table or a json metadata document
#' @param metadata optional user metadata
#' @return csvw metadata list
#' @md
#' @keywords internal
locate_metadata <- function(filename, metadata) {
  if(!is.null(metadata)) {
    # overriding metadata
    metadata <- read_metadata(metadata)
  } else if(stringr::str_ends(filename,"\\.json")) {
    # file itself is the metadata
    metadata <- read_metadata(filename)
  } else {
    # retrieve file references by link header (TODO)

    # attempt to locate via defaults/ configuration
    location <- find_metadata(filename)
    if(!is.null(location)) {
      metadata <- read_metadata(location)
    } else {
      # use embedded metadata from the tabular file (TODO)

      # derive metadata from the table itself
      metadata <- derive_metadata(filename)
    }
  }
  return(metadata)
}

#' Locate csv data table
#'
#' @param filename the file passed to `read_csvw` in the first place (could be the csv or json annotations)
#' @param url the location of the the table as defined in the metadata
#' @return The location of the table
#' @keywords internal
locate_table <- function(filename, url) {
  # append table's url to location of filename
  url_relative_to_filename <- paste(dirname(filename), url, sep="/")

  # if argument to read_csvw was a csv use that, otherwise use the table's `csvw:url`
  ifelse(stringr::str_ends(filename, "\\.csv"), filename, url_relative_to_filename)
} # TODO: unit test me!

#' Set the base of a URI template
#'
#' @param t a character vector of URI templates
#' @param url a filename url being used as a context (string)
#' @return a character vector of templates with base paths/ domains set appropriately
#' @keywords internal
set_uri_base <- function(t, url) {
  # prefix current domain if rendered template starts with a `/`
  if(strtrim(t,1)=="/") {
    domain <- paste(strsplit(url, "/")[[1]][c(1,3)], collapse="//")
    t <- paste0(domain, t)
  } else {
    # prefix current path if template doesn't start with `{+url}`
    if(strtrim(t,6)!="{+url}") {
      path <- dirname(url)
      t <- paste(path, t, sep="/")
    }
  }
  t
}

#' Render URI templates
#'
#' Interpolate variable bindings into a URI template.
#'
#' This doesn't yet implement the whole of RFC 6570, just enough to make the tests pass
#'
#' You can bind variables by passing a list to the explicit `bindings` argument,
#' or variadically with `...` by naming arguments according to the variable name you wish to bind.
#'
#' @param templates a character vector with URI templates
#' @param bindings a list of variable bindings to be interpolated into templates
#' @param ... further bindings specified as named function arguments
#' @return a character vector with the expanded URI
#' @examples
#' render_uri_templates("{+url}/resource?query=value", list(url="http://example.net"))
#' render_uri_templates("{+url}", url="http://example.net")
#' @importFrom stringr str_replace
#' @importFrom stringr str_glue_data
#' @export
#' @md
render_uri_templates <- function(templates, bindings=NULL, ...) {
  bindings <- c(bindings, list(...))
  templates %>%
    purrr::map(set_uri_base, url=bindings$url) %>%
    stringr::str_replace("\\+", "") %>%
    # https://datatracker.ietf.org/doc/html/rfc6570#section-3.2.4
    stringr::str_replace("\\{#", "#{") %>%
    purrr::map_chr(function(t) stringr::str_glue_data(bindings, t))
}

#' Identify metadata location configurations for a tabular file
#'
#' Returns default locations. Will ultimately retrieve remote configuration
#'
#' @param filename a csv file
#' @return a character vector of URI templates
#' @keywords internal
location_configuration <- function(filename) {
  default_locations <- c("{+url}-metadata.json","csv-metadata.json")
  # TODO retrieve site-wide configuration from `/.well-known/csvm`
  # https://w3c.github.io/csvw/syntax/#default-locations-and-site-wide-location-configuration
  default_locations
}

#' Find the first existing file from a set of candidates
#'
#' @param filenames a vector of candidates
#' @return
#' If one of the filenames passed is found, then the first is returned.
#' If none of the filenames exist, `NULL` is returned
#' @keywords internal
find_existing_file <- function(filenames) {
  for(candidate in filenames) {
    if(file.exists(candidate)) {
      found <- candidate
      break
    }
  }

  if(exists("found")) {
    return(found)
  } else {
    return(NULL)
  }
}


#' Find metadata for a tabular file
#'
#' Searches through the default locations attempting to locate metadata.
#'
#' @param filename a csv file
#' @return a uri for the metadata, or null if none were found
#' @keywords internal
find_metadata <- function(filename) {
  candidates <- render_uri_templates(location_configuration(filename), url=filename)

  find_existing_file(candidates)
}

#' Read and parse CSVW Metadata
#'
#' Reads in a json document as a list, transforming columns specifications into a dataframe.
#'
#' @param filename a path for a json metadata document
#' @return csvw metadata list
#' @export
read_metadata <- function(filename) {
  metadata <- jsonlite::read_json(filename)

  metadata <- parse_metadata(metadata, location=filename)

  metadata$tables <- lapply(metadata$tables, function(t) {
    if(!is.null(t$tableSchema)) {
      t$tableSchema$columns <- parse_columns(t$tableSchema$columns)
    }
    t
  })
  if(!is.null(metadata$tableSchema)) {
    metadata$tableSchema$columns <- parse_columns(metadata$tableSchema$columns)
  }
  metadata
}

#' Identify the type of an annotation property
#'
#' This can be done from the name of the property. The
#' [metadata descriptor diagram](https://w3c.github.io/csvw/metadata/properties.svg)
#' provides a succinct summary of the types of each property and how they fit together.
#'
#' @param property a named list element
#' @return a string defining the property type
#' @examples
#' property_type(list(url="http://example.net"))
#' @md
#' @noRd
property_type <- function(property) {
  stopifnot(length(property)==1)
  if(is.null(names(property))) {
    return("atomic") # can't determine type of unnamed property, must be atomic
  }
  switch(names(property),
         "tables"="array",
         "transformations"="array",
         "notes"="array",
         "@context"="array",
         "foreignKeys"="array",
         "columns"="array",
         "lineTerminators"="array",
         "url"="link",
         "@id"="link",
         "resource"="link",
         "schemaReference"="link",
         "targetFormat"="link",
         "scriptFormat"="link",
         "aboutUrl"="uri_template",
         "propertyUrl"="uri_template",
         "valueUrl"="uri_template",
         "primaryKey"="column_reference",
         "rowTitles"="column_reference",
         "columnReference"="column_reference",
         "titles"="natural_language",
         "tableSchema"="object",
         "dialect"="object",
         "reference"="object",
         "atomic")
}

#' Determine the base URL for CSVW metadata
#'
#' @param metadata the csvw metadata
#' @param location where the metadata was originally located
#' @return A string containing the base URL
#' @keywords internal
base_url <- function(metadata, location) {
  context <- metadata$`@context`
  base <- if(is.list(context)) {
    context[2]$`@base`
  }
  if(is.null(base)) {
    base <- base_uri()
  }
  url <- normalise_url(location, base)
  url
}

#' Resolve one URL against another
#'
#' @param url1 the base url
#' @param url2 a relative url
#' @return A single absolute url
#' @keywords internal
resolve_url <- function(url1, url2) {
  sep <- ifelse(stringr::str_ends(url1,"/"),"","/")
  combined <- paste(url1, url2, sep=sep)
  combined %>% stringr::str_replace("/./","/")
}

#' Does the string provide an absolute URL
#'
#' @param string the url, path or template
#' @return true if the string is an absolute url
#' @keywords internal
is_absolute_url <- function(string) {
  #grepl("://", string) # contains a URL scheme
  grepl(":", string) # works with cURIes
}

#' Normalise a URL
#'
#' Ensures that a url is specified absolutely with reference to a base
#'
#' @param url a string
#' @param base the base to use for normalisation
#' @return A string containing a normalised URL
#' @keywords internal
normalise_url <- function(url, base) {
  # do nothing if the url is already absolute
  if(is_absolute_url(url)) {
    url
  } else {
    resolve_url(base, url)
  }
}

#' Normalise an annotation property
#'
#' This follows the [normalisation](https://w3c.github.io/csvw/metadata/#normalization)
#' process set out in the csvw specification.
#' @param property an annotation property (a list)
#' @param base_url the base URL for normalisation
#' @return a property (list) a
#' @keywords internal
normalise_property <- function(property, base_url) {
  normalised_property <- switch(property_type(property),
    link = normalise_url(property, base_url),
    property)
  if(is.list(normalised_property)) {
    normalised_property
  } else {
    # ensure we return a list named as per the input
    stats::setNames(list(normalised_property), nm=names(property))
  }
}

#' Parse metadata
#'
#' Coerces the metadata to ensure it describes a table group.
#' Retrieves any linked tableSchema.
#'
#' @param metadata a csvw metadata list
#' @param location the location of the metadata
#' @return metadata coerced into a
#'   [table group description](https://www.w3.org/TR/tabular-metadata/#dfn-table-group-description)
#' @md
#' @keywords internal
parse_metadata <- function(metadata, location) {
  # coerce to table group description
  if(is.null(metadata$tables)) {
    if(is.null(metadata$url)) {
      stop("Metadata doesn't define any tables, or a url to build a table definition from")
      # Could still be valid with a top-level schema, dialect or transformation
      # https://w3c.github.io/csvw/metadata/#top-level-properties
    } else {
      # put context at top level and everything else into a table group description
      metadata <- list(`@context`=metadata$`@context`,
                       tables=list(metadata[names(metadata) != "@context"]))
    }
  }

  # retrieve linked tableSchema
  metadata$tables <- lapply(metadata$tables, function(table) {
    # read list in place if is a link (not a list itself)
    if(isa(table$tableSchema, "character")) {
      table$tableSchema <- jsonlite::read_json(normalise_url(table$tableSchema,dirname(location)))
    }
    table
  })

  metadata
}

#' Normalise metadata
#'
#' The spec defines a [normalisation process](https://w3c.github.io/csvw/metadata/#normalization).
#'
#' @param metadata a csvw metadata list
#' @param location the location of the metadata
#' @return metadata with normalised properties
#' @md
normalise_metadata <- function(metadata, location) {
  # normalise annotations
  metadata <- rlmap(metadata, normalise_property, base_url=base_url(metadata, dirname(location)))

  metadata
}

#' Parse columns schema
#'
#' @param columns a list of lists specification of columns
#' @return a data frame with a row per column specification
#' @keywords internal
parse_columns <- function(columns) {
  d <- list_of_lists_to_df(columns)

  # required defaults to false
  if(is.null(d$required)) {
    d$required <- F
  } else {
    d$required <- sapply(d$required, function(x) ifelse(is.null(x) || is.na(x),F,x))
  }

  if(is.null(d$datatype)) {
    d$datatype <- "string"
  } else {
    d$datatype <- lapply(d$datatype, function(x) if(any(is.null(x) | is.na(x))) { "string" } else { x })
  }

  d
}

#' Coalesce value to truthiness
#'
#' Determine whether the input is true, with missing values being interpreted as false.
#'
#' @param x logical, `NA` or `NULL`
#' @return `FALSE` if x is anything but `TRUE`
coalesce_truth <- function(x) {
  if(is.null(x)) {
    F
  } else {
    ifelse(is.na(x), F, x)
  }
}

#' Unlist unless the list-elements are themselves lists
#'
#' Convert a list of elements to a vector. Unlike `base::unlist` this doesn't
#' convert the elements of inner lists to vector elements. Thus only a list a single
#' layer deep is flattened to a vector.
#'
#' @param l a list
#' @return A list of lists or a vector
#' @keywords internal
unlist1 <- function(l) {
  is_list <- vapply(l, is.list, logical(1))
  if(!any(is_list)) {
    unlist(l)
  } else {
    l
  }
}

#' Parse list of lists specification into a data frame
#'
#' @param ll a list of lists
#' @return a data frame with a row per list
#' @keywords internal
list_of_lists_to_df <- function(ll) {
  # need to get all names, not just use those from first list per transpose
  nms <- ll %>% purrr::map(names) %>% purrr::reduce(union)

  purrr::transpose(ll, .names=nms) %>%
    # coalesce missing specifications to NA
    purrr::map(function(l) { lapply(l, function(x) { if(is.null(x)) { NA } else { x }}) }) %>%
    # convert single-depth lists into vectors
    purrr::map(unlist1) %>%
    # prevents lists being split into columns (allows cells to be lists)
    purrr::map_if(is.list, I) %>%
    as.data.frame(stringsAsFactors=F, col.names=nms)
}




#' Derive csvw metadata from a csv file
#'
#' @param filename a csv file
#' @return a list of csvw metadata
#' @examples
#' derive_metadata(csvwr_example("computer-scientists.csv"))
#' @export
derive_metadata <- function(filename) {
  schema <- default_schema(filename, list(header=T))
  create_metadata(tables=list(
    list(url = paste0(base_uri(), filename), tableSchema=schema)
  ))
}

#' Derive csvw table schema from a data frame
#'
#' @param d a data frame
#' @return a list describing a `csvw:tableSchema`
#' @examples
#' derive_table_schema(data.frame(a=1,b=2))
#' @export
derive_table_schema <- function(d) {
  cols <- colnames(d) # TODO: derive snake-case names instead of make.names dots

  types <- if(coalesce_truth(getOption("csvwr_compatibility_mode"))) {
    "string"
  } else {
    type_to_datatype(unname(lapply(d,class))) %>% purrr::simplify()
  }

  columns <- data.frame(name=cols, titles=cols, stringsAsFactors = F)
  columns$datatype <- types # list in constructor would've led to a variable per list element

  list(columns=columns)
}


#' Create a default table schema given a csv file and dialect
#'
#' If neither the table nor the group have a `tableSchema` annotation,
#' then this default schema will used.
#'
#' @param filename a csv file
#' @param dialect specification of the csv's dialect (default: `default_dialect`)
#' @return a table schema
#' @md
default_schema <- function(filename, dialect=default_dialect) {
  data_sample <- readr::read_csv(filename, n_max=10, col_names=dialect$header, col_types=readr::cols())
  if(!dialect$header) {
    names(data_sample) <- paste0("_col.", 1:ncol(data_sample))
  }
  derive_table_schema(data_sample)
}

#' CSVW default dialect
#'
#' The [CSVW Default Dialect specification](https://w3c.github.io/csvw/metadata/#dialect-descriptions)
#' described in [CSV Dialect Description Format](http://dataprotocols.org/csv-dialect/).
#'
#' @return a list specifying a default csv dialect
default_dialect <- list(
  encoding="utf-8",
  lineTerminators=c("\r\n","\n"),
  quoteChar="\"",
  doubleQuote=T,
  skipRows=0,
  commentPrefix="#",
  header=T,
  headerRowCount=1,
  delimiter=",",
  skipColumns=0,
  skipBlankRows=F,
  skipInitialSpace=F,
  trim=F
)

#' Override defaults
#'
#' Merges two lists applying `override` values on top of the `default` values.
#'
#' @param ... any number of lists with configuration values
#'
#' @return a list with the values from the first list replacing those in the second and so on
#' @keywords internal
override_defaults <- function(...) {
  dialect <- list()

  set_value <- function(x) { dialect[names(x)] <<- x }

  purrr::walk(rev(list(...)), function(l) {
    purrr::lmap(l, set_value)
  })

  dialect
}

#' Create tabular metadata from a list of tables
#'
#' The table annotations should each be a list with keys for `url` and `tableSchema`.
#' You can use `derive_table_schema` to derive a schema from a data frame.
#'
#' @param tables a list of `csvw:table` annotations
#' @return a list describing a tabular metadata annotation
#' @examples
#' d <- data.frame(foo="bar")
#' table <- list(url="filename.csv", tableSchema=derive_table_schema(d))
#' create_metadata(tables=list(table))
#' @export
create_metadata <- function(tables) {
  list("@context"="http://www.w3.org/ns/csvw", tables = tables)
}


#' Check for blank values
#'
#' Is the value NA or an empty string `""`?
#'
#' @param value any vector
#' @return a logical indicating if the value is blank
#' @examples
#' \dontrun{
#' is_blank(c(NA, "", 1, TRUE, "a"))
#' }
#' @noRd
is_blank <- function(value) {
  is.na(value) | (value=="")
}

#' Determine if an annotation is non-core
#'
#' Checks if the annotation is non-core, and should thus be treated as a json-ld note.
#'
#' @param property a list element
#' @return `TRUE` the annotation is core, `FALSE` otherwise
#' @keywords internal
is_non_core_annotation <- function(property) {
  !any(names(property) %in% c("tables","tableScheme","url","@context","columns"))
}

#' Convert json-ld annotation to json
#'
#' Follows the [rules for JSON-LD to JSON conversion set out in the csv2json
#' standard](https://w3c.github.io/csvw/csv2json/#json-ld-to-json).
#'
#' @param property a json-ld annotation (single list element)
#' @return A compacted list element
#' @md
#' @keywords internal
json_ld_to_json <- function(property) {
  stopifnot(length(property)==1)
  value <- property[[1]]
  if(is.list(value)) {
    value <- compact_json_ld(value)
  }
  stats::setNames(list(value), nm=names(property))
}

#' Compact objects to values
#'
#' Follows the [rules for JSON-LD to JSON conversion set out in the csv2json
#' standard](https://w3c.github.io/csvw/csv2json/#json-ld-to-json).
#'
#' @param value an element from a list (could be a vector or another list)
#' @return A compacted value.
#' @keywords internal
#' @md
compact_json_ld <- function(value) {
  if(is.list(value)) {
    if(any(c("@id","@value") %in% names(value))) {
      value <- value[[1]]
    } else {
      lapply(value, compact_json_ld)
    }
  } else {
    value
  }
}


#' Serialise cell values for JSON representation
#'
#' @param cell a typed value
#' @return a representation comparable with the JSON representation (typically a string)
#' @keywords internal
render_cell <- function(cell) {
  switch(class(cell)[1],
         Date = strftime(cell),
         POSIXct = strftime(cell),
         POSIXlt = strftime(cell),
         cell)
}

#' Convert a table to a list
#'
#' Follows the pattern for csv2json
#' @param table the csvw table
#' @param group list of metadata for the group used for a fallback schema and dialect
#' @return a list representation of the table's contents
#' @importFrom magrittr %>%
#' @keywords internal
table_to_list <- function(table, group) {
  table <- purrr::lmap_if(table, is_non_core_annotation, json_ld_to_json, .else=identity)
  schema <- table$tableSchema %||% group$tableSchema # TODO: check that normalize doesn't obviate this
  dialect <- override_defaults(table$dialect, group$dialect, default_dialect)
  row_num <- 0
  table$row <- purrr::pmap(table$dataframe, list) %>%
    purrr::map(function(r) {
      row_num <<- row_num + 1
      url <- paste0(table$url, "#row=", row_num + dialect$headerRowCount)
      suppressWarnings(
        names(r) <- schema$columns[!suppressWarnings(coalesce_truth(schema$columns$virtual)), ]$name
      )
      r <- lapply(r, render_cell)
      if(!is.null(schema$aboutUrl)) {
        template <- paste0("{+url}",schema$aboutUrl)
        r <- c(list("@id"=render_uri_templates(template,bindings=c(url=table$url,r))),r)
      }
      row_description <- list(purrr::discard(r, .p=is_blank))
      list(url=url, rownum=row_num, describes=row_description)
    })
  table[!(names(table) %in% c("tableSchema","dataframe"))]
}

#' Convert a csvw metadata to a list (csv2json)
#'
#' @param csvw a csvw metadata list
#' @return a list following the csv2json translation rules
#' @examples
#' \dontrun{
#' csvw_to_list(read_csvw("example.csv"))
#' }
#' @export
csvw_to_list <- function(csvw) {
  list(tables=lapply(csvw$tables, table_to_list, group=csvw))
}

#csvw_triples <- function(csvw) # returns vector of triples/ s-p-o data.frame

#' Get path to csvwr example
#'
#' The csvwr package includes some example csvw files in it's `inst/extdata` directory.
#' You can use this function to find them.
#'
#' Inspired by [readr::readr_example()]
#'
#' @param path The filename. If `NULL`, the example files will be listed.
#' @return either a file path or a vector of filenames
#' @examples
#' csvwr_example()
#' csvwr_example("computer-scientists.csv")
#' @export
#' @md
csvwr_example <- function(path=NULL) {
  if (is.null(path)) {
    dir(system.file("extdata", package = "csvwr"))
  }
  else {
    system.file("extdata", path, package = "csvwr", mustWork = TRUE)
  }
}

Try the csvwr package in your browser

Any scripts or data that you put into this service are public.

csvwr documentation built on Nov. 21, 2022, 5:05 p.m.