R/csvwr_overrides.R

Defines functions datatype_to_type add_dataframe coalesce_truth default_schema override_defaults

Documented in coalesce_truth default_schema override_defaults

# borrowed from csvwr
#' CSVW default dialect
#'
#' The [CSVW Default Dialect specification](https://w3c.github.io/csvw/metadata/#dialect-descriptions)
#' described in [CSV Dialect Description Format](http://dataprotocols.org/csv-dialect/).
#'
#' @return a list specifying a default csv dialect
default_dialect <- list(
    encoding="utf-8",
    lineTerminators=c("\r\n","\n"),
    quoteChar="\"",
    doubleQuote=TRUE,
    skipRows=0,
    commentPrefix="#",
    header=TRUE,
    headerRowCount=1,
    delimiter=",",
    skipColumns=0,
    skipBlankRows=FALSE,
    skipInitialSpace=FALSE,
    trim=FALSE
)

#' Override defaults
#'
#' Merges two lists applying `override` values on top of the `default` values.
#'
#' @param ... any number of lists with configuration values
#'
#' @return a list with the values from the first list replacing those in the second and so on
#' @keywords internal
#' @importFrom purrr walk
#' @importFrom purrr lmap
override_defaults <- function(...) {
    dialect <- list()

    set_value <- function(x) { dialect[names(x)] <<- x }

    purrr::walk(rev(list(...)), function(l) {
        purrr::lmap(l, set_value)
    })

    dialect
}


#' Create a default table schema given a csv file and dialect
#'
#' If neither the table nor the group have a `tableSchema` annotation,
#' then this default schema will used.
#'
#' @param filename a csv file
#' @param dialect specification of the csv's dialect (default: `default_dialect`)
#' @return a table schema
#' @md
default_schema <- function(filename, dialect=default_dialect) {
  data_sample <- readr::read_csv(filename, n_max=10, col_names=dialect$header, col_types=readr::cols())
  if(!dialect$header) {
    names(data_sample) <- paste0("_col.", 1:ncol(data_sample))
  }
  derive_table_schema(data_sample)
}

#' Coalesce value to truthiness
#'
#' Determine whether the input is true, with missing values being interpreted as false.
#'
#' @param x logical, `NA` or `NULL`
#' @return `FALSE` if x is anything but `TRUE`
coalesce_truth <- function(x) {
    if(is.null(x)) {
        FALSE
    } else {
        ifelse(is.na(x), FALSE, x)
    }
}

#' @importFrom magrittr %>%
#' @importFrom rlang %||%
#' @importFrom readr read_delim
add_dataframe <- function(table, filename, group) {
    schema <- table$tableSchema %||% group$tableSchema
    dialect <- override_defaults(table$dialect, group$dialect, default_dialect)
    if(is.null(schema)) {
        # if we need to derive a default schema, then set this on the table itself
        table$tableSchema <- schema <- default_schema(filename, dialect)
    }
    table_columns <- schema$columns[!coalesce_truth(schema$columns[["virtual"]]), ]
    column_names <- table_columns$name
    column_types <- datatype_to_type(table_columns$datatype)
    readr::read_delim(
        filename,
        trim_ws=TRUE,
        skip=dialect$headerRowCount,
        col_names=column_names,
        col_types=column_types)
}





#' Map csvw datatypes to R types
#'
#' Translate [csvw datatypes](https://www.w3.org/TR/tabular-metadata/#datatypes) to R types.
#' This implementation currently targets [readr::cols] column specifications.
#'
#' rcldf adds some overrides here to add e.g. anyURI etc.
datatype_to_type <- function(datatypes) {

  datatypes %>% purrr::map(function(datatype) {
    if(is.list(datatype)) {
      # complex types (specified with a list)
      switch(datatype$base %||% "string",
             integer = readr::col_integer(),
             anyURI = readr::col_character(),
             boolean = readr::col_character(),
             date = readr::col_date(format=transform_datetime_format(datatype$format)),
             datetime = readr::col_datetime(format=transform_datetime_format(datatype$format)),
             decimal = readr::col_double(),
             string = readr::col_character(),
             stop("unrecognised complex datatype: ", datatype))
    } else {
      # simple types (specified with a string)
      switch(datatype %||% "string",
             integer = readr::col_integer(),
             anyURI = readr::col_character(),
             double = readr::col_double(),
             float = readr::col_double(),
             number = readr::col_double(),
             decimal = readr::col_double(),
             string = readr::col_character(),
             boolean = readr::col_logical(),
             date = readr::col_date(),
             datetime = readr::col_datetime(),
             time = readr::col_time(),
             duration = readr::col_character(),
             gDay = readr::col_character(),
             gMonth = readr::col_character(),
             gMonthDay = readr::col_character(),
             gYear = readr::col_character(),
             gYearMonth = readr::col_character(),
             xml = readr::col_character(),
             html = readr::col_character(),
             json = readr::col_character(),
             binary = readr::col_character(), # Base 64
             hexBinary = readr::col_character(),
             QName = readr::col_character(),
             anyURI = readr::col_character(),
             any = readr::col_character(),
             normalizedString = readr::col_character(),
             stop("unrecognised simple datatype: ", datatype))
    }
    # TODO: value and length constraints
  })
}
SimonGreenhill/rcldf documentation built on April 25, 2024, 8:27 a.m.