#' Adds documentation of a dataset to a datapackage
#'
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file.
#'
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}.
#' @param meta Information about each data file. A list with a list entry for each entry in
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all.
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to
#' \code{dir}, or a list with the read-in version.
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}.
#' @param write Logical; if \code{FALSE}, returns the \code{paths} metadata without reading or rewriting
#' \code{packagename}.
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information.
#' @param sha A number specifying the Secure Hash Algorithm function,
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}).
#' @param clean Logical; if \code{TRUE}, strips special characters before saving.
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving.
#' @details
#' \code{meta} should be a list with unnamed entries for entry in \code{filename},
#' and each entry can include a named entry for any of these:
#' \describe{
#' \item{source}{
#' A list or list of lists with entries for at least \code{name}, and ideally for \code{url}.
#' }
#' \item{ids}{
#' A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset).
#' Might also include \code{map} with a list or path to a JSON file resulting in a list with an
#' entry for each ID, and additional information about that entity, to be read in a its features.
#' All files will be loaded to help with aggregation, but local files will be included in the datapackage,
#' whereas hosted files will be loaded client-side.
#' }
#' \item{time}{
#' A string giving the name of a variable in the dataset representing a repeated observation of the same entity.
#' }
#' \item{variables}{
#' A list with named entries providing more information about the variables in the dataset.
#' See \code{\link{data_measure_info}}.
#' }
#' }
#' @examples
#' \dontrun{
#' # write example data
#' write.csv(mtcars, "mtcars.csv")
#'
#' # add it to an existing datapackage.json file in the current working directory
#' data_add("mtcars.csv")
#' }
#' @return An invisible version of the updated datapackage, which is also written to
#' \code{datapackage.json} if \code{write = TRUE}.
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{init_data}}.
#' @export
data_add <- function(filename, meta = list(), packagename = "datapackage.json", dir = ".", write = TRUE,
refresh = TRUE, sha = "512", clean = FALSE, open_after = FALSE) {
if (missing(filename)) cli_abort("{.arg filename} must be specified")
setnames <- names(filename)
if (file.exists(filename[[1]])) {
if (dir == ".") dir <- dirname(filename[[1]])
filename <- basename(filename)
}
if (check_template("site", dir = dir)$status[["strict"]] &&
all(file.exists(paste0(dir, "/docs/data/", filename)))) {
dir <- paste0(dir, "/docs/data")
}
if (any(!file.exists(paste0(dir, "/", filename)))) {
filename <- filename[!file.exists(filename)]
cli_abort("{?a file/files} did not exist: {filename}")
}
package <- if (is.character(packagename) && file.exists(paste0(dir, "/", packagename))) {
paste0(dir, "/", packagename)
} else {
packagename
}
if (write) {
if (is.character(package)) {
package <- paste0(dir, "/", packagename)
package <- if (file.exists(package)) {
packagename <- package
jsonlite::read_json(package)
} else {
init_data(if (!is.null(setnames)) setnames[[1]] else filename[[1]], dir = dir)
}
}
if (!is.list(package)) {
cli_abort(c(
"{.arg package} does not appear to be in the right format",
i = "this should be (or be read in from JSON as) a list with a {.code resource} entry"
))
}
}
if (!is.list(package)) package <- list()
collect_metadata <- function(file) {
f <- paste0(dir, "/", filename[[file]])
m <- if (single_meta) meta else metas[[file]]
format <- if (grepl(".csv", f, fixed = TRUE)) "csv" else if (grepl(".rds", f, fixed = TRUE)) "rds" else "tsv"
if (is.na(format)) format <- "rds"
info <- file.info(f)
metas <- list()
unpack_meta <- function(n) {
if (!length(m[[n]])) list() else if (is.list(m[[n]][[1]])) m[[n]] else list(m[[n]])
}
ids <- unpack_meta("ids")
idvars <- NULL
for (i in seq_along(ids)) {
if (is.list(ids[[i]])) {
if (length(ids[[i]]$map) == 1 && is.character(ids[[i]]$map) && file.exists(ids[[i]]$map)) {
ids[[i]]$map_content <- paste(readLines(ids[[i]]$map, warn = FALSE), collapse = "")
}
} else {
ids[[i]] <- list(variable = ids[[i]])
}
if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable)
}
data <- if (format == "rds") {
tryCatch(readRDS(f), error = function(e) NULL)
} else {
attempt_read(f, if (length(idvars)) idvars[1] else "")
}
if (is.null(data)) {
cli_abort(c(
paste0("failed to read in the data file ({.file {f}})"),
i = "check that it is in a compatible format"
))
}
if (!all(rownames(data) == seq_len(nrow(data)))) data <- cbind(`_row` = rownames(data), data)
timevar <- unlist(unpack_meta("time"))
times <- if (is.null(timevar)) rep(1, nrow(data)) else data[[timevar]]
times_unique <- unique(times)
if (!single_meta) {
varinf <- unpack_meta("variables")
if (length(varinf) == 1 && is.character(varinf[[1]])) {
if (!file.exists(varinf[[1]])) varinf[[1]] <- paste0(dir, "/", varinf[[1]])
if (file.exists(varinf[[1]])) {
if (varinf[[1]] %in% names(metas)) {
varinf <- metas[[varinf[[1]]]]
} else {
varinf <- metas[[varinf[[1]]]] <- data_measure_info(varinf[[1]], write = FALSE, render = TRUE)
}
varinf <- varinf[varinf != ""]
}
}
varinf_full <- names(varinf)
varinf_suf <- sub("^[^:]+:", "", varinf_full)
}
res <- list(
bytes = as.integer(info$size),
encoding = stri_enc_detect(f)[[1]][1, 1],
md5 = md5sum(f)[[1]],
format = format,
name = if (!is.null(setnames)) {
setnames[file]
} else if (!is.null(m$name)) {
m$name
} else {
sub("\\.[^.]*$", "", basename(filename[[file]]))
},
filename = filename[[file]],
source = unpack_meta("source"),
ids = ids,
id_length = if (length(idvars)) {
id_lengths <- nchar(data[[idvars[1]]])
if (all(id_lengths == id_lengths[1])) id_lengths[1] else 0
} else {
0
},
time = timevar,
profile = "data-resource",
created = as.character(info$mtime),
last_modified = as.character(info$ctime),
row_count = nrow(data),
entity_count = if (length(idvars)) length(unique(data[[idvars[1]]])) else nrow(data),
schema = list(
fields = lapply(colnames(data)[!colnames(data) %in% idvars], function(cn) {
v <- data[[cn]]
invalid <- !is.finite(v)
r <- list(name = cn, duplicates = sum(duplicated(v)))
if (!single_meta) {
if (cn %in% varinf_full) {
r$info <- varinf[[cn]]
} else if (cn %in% varinf_suf) {
r$info <- varinf[[which(varinf_suf == cn)]]
}
r$info <- r$info[r$info != ""]
}
su <- !is.na(v)
if (any(su)) {
r$time_range <- which(times_unique %in% range(times[su])) - 1
r$time_range <- if (length(r$time_range)) r$time_range[c(1, length(r$time_range))] else c(-1, -1)
} else {
r$time_range <- c(-1, -1)
}
if (!is.character(v) && all(invalid)) {
r$type <- "unknown"
r$missing <- length(v)
} else if (is.numeric(v)) {
r$type <- if (all(invalid | as.integer(v) == v)) "integer" else "float"
r$missing <- sum(invalid)
r$mean <- round(mean(v, na.rm = TRUE), 6)
r$sd <- round(sd(v, na.rm = TRUE), 6)
r$min <- round(min(v, na.rm = TRUE), 6)
r$max <- round(max(v, na.rm = TRUE), 6)
} else {
r$type <- "string"
if (!is.factor(v)) v <- as.factor(as.character(v))
r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v))
r$table <- structure(as.list(tabulate(v)), names = levels(v))
}
r
})
)
)
if (!single_meta && "_references" %in% names(varinf)) res[["_references"]] <- varinf[["_references"]]
if (Sys.which("openssl") != "") {
res[[paste0("sha", sha)]] <- calculate_sha(f, sha)
}
res
}
single_meta <- FALSE
metas <- if (!is.null(names(meta))) {
if (!is.null(setnames) && all(setnames %in% names(meta))) {
meta[setnames]
} else {
single_meta <- TRUE
if (length(meta$variables) == 1 && is.character(meta$variables)) {
if (!file.exists(meta$variables)) meta$variables <- paste0(dir, "/", meta$variables)
if (file.exists(meta$variables)) meta$variables <- jsonlite::read_json(meta$variables)
}
meta$variables <- replace_equations(meta$variables)
meta
}
} else {
meta[seq_along(filename)]
}
if (!single_meta) {
metas <- lapply(metas, function(m) {
m$variables <- replace_equations(m$variables)
m
})
}
metadata <- lapply(seq_along(filename), collect_metadata)
if (single_meta) package$measure_info <- lapply(meta$variables, function(e) e[e != ""])
package$resources <- c(metadata, if (!refresh) package$resources)
names <- vapply(package$resources, "[[", "", "filename")
if (anyDuplicated(names)) {
package$resources <- package$resources[!duplicated(names)]
}
if (clean) {
cf <- lma_dict("special", perl = TRUE, as.function = gsub)
package <- jsonlite::fromJSON(cf(jsonlite::toJSON(package, auto_unbox = TRUE)))
}
if (write) {
packagename <- if (is.character(packagename)) packagename else "datapackage.json"
jsonlite::write_json(
package, if (file.exists(packagename)) packagename else paste0(dir, "/", packagename),
auto_unbox = TRUE, digits = 6
)
if (interactive()) {
cli_bullets(c(v = paste(
if (refresh) "updated resource in" else "added resource to", "datapackage.json:"
), "*" = paste0("{.path ", packagename, "}")))
if (open_after) navigateToFile(packagename)
}
}
invisible(package)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.