R/data_add.R

Defines functions data_add

Documented in data_add

#' Adds documentation of a dataset to a datapackage
#'
#' Add information about variables in a dataset to a \code{datapackage.json} metadata file.
#'
#' @param filename A character vector of paths to plain-text tabular data files, relative to \code{dir}.
#' @param meta Information about each data file. A list with a list entry for each entry in
#' \code{filename}; see details. If a single list is provided for multiple data files, it will apply to all.
#' @param packagename Package to add the metadata to; path to the \code{.json} file relative to
#' \code{dir}, or a list with the read-in version.
#' @param dir Directory in which to look for \code{filename}, and write \code{packagename}.
#' @param write Logical; if \code{FALSE}, returns the \code{paths} metadata without reading or rewriting
#' \code{packagename}.
#' @param refresh Logical; if \code{FALSE}, will retain any existing dataset information.
#' @param sha A number specifying the Secure Hash Algorithm function,
#' if \code{openssl} is available (checked with \code{Sys.which('openssl')}).
#' @param clean Logical; if \code{TRUE}, strips special characters before saving.
#' @param open_after Logical; if \code{TRUE}, opens the written datapackage after saving.
#' @details
#' \code{meta} should be a list with unnamed entries for entry in \code{filename},
#' and each entry can include a named entry for any of these:
#' \describe{
#'   \item{source}{
#'   A list or list of lists with entries for at least \code{name}, and ideally for \code{url}.
#'   }
#'   \item{ids}{
#'   A list or list of lists with entries for at least \code{variable} (the name of a variable in the dataset).
#'   Might also include \code{map} with a list or path to a JSON file resulting in a list with an
#'   entry for each ID, and additional information about that entity, to be read in a its features.
#'   All files will be loaded to help with aggregation, but local files will be included in the datapackage,
#'   whereas hosted files will be loaded client-side.
#'   }
#'   \item{time}{
#'   A string giving the name of a variable in the dataset representing a repeated observation of the same entity.
#'   }
#'   \item{variables}{
#'   A list with named entries providing more information about the variables in the dataset.
#'   See \code{\link{data_measure_info}}.
#'   }
#' }
#' @examples
#' \dontrun{
#' # write example data
#' write.csv(mtcars, "mtcars.csv")
#'
#' # add it to an existing datapackage.json file in the current working directory
#' data_add("mtcars.csv")
#' }
#' @return An invisible version of the updated datapackage, which is also written to
#' \code{datapackage.json} if \code{write = TRUE}.
#' @seealso Initialize the \code{datapackage.json} file with \code{\link{init_data}}.
#' @export

data_add <- function(filename, meta = list(), packagename = "datapackage.json", dir = ".", write = TRUE,
                     refresh = TRUE, sha = "512", clean = FALSE, open_after = FALSE) {
  if (missing(filename)) cli_abort("{.arg filename} must be specified")
  setnames <- names(filename)
  if (file.exists(filename[[1]])) {
    if (dir == ".") dir <- dirname(filename[[1]])
    filename <- basename(filename)
  }
  if (check_template("site", dir = dir)$status[["strict"]] &&
    all(file.exists(paste0(dir, "/docs/data/", filename)))) {
    dir <- paste0(dir, "/docs/data")
  }
  if (any(!file.exists(paste0(dir, "/", filename)))) {
    filename <- filename[!file.exists(filename)]
    cli_abort("{?a file/files} did not exist: {filename}")
  }
  package <- if (is.character(packagename) && file.exists(paste0(dir, "/", packagename))) {
    paste0(dir, "/", packagename)
  } else {
    packagename
  }
  if (write) {
    if (is.character(package)) {
      package <- paste0(dir, "/", packagename)
      package <- if (file.exists(package)) {
        packagename <- package
        jsonlite::read_json(package)
      } else {
        init_data(if (!is.null(setnames)) setnames[[1]] else filename[[1]], dir = dir)
      }
    }
    if (!is.list(package)) {
      cli_abort(c(
        "{.arg package} does not appear to be in the right format",
        i = "this should be (or be read in from JSON as) a list with a {.code resource} entry"
      ))
    }
  }
  if (!is.list(package)) package <- list()
  collect_metadata <- function(file) {
    f <- paste0(dir, "/", filename[[file]])
    m <- if (single_meta) meta else metas[[file]]
    format <- if (grepl(".csv", f, fixed = TRUE)) "csv" else if (grepl(".rds", f, fixed = TRUE)) "rds" else "tsv"
    if (is.na(format)) format <- "rds"
    info <- file.info(f)
    metas <- list()
    unpack_meta <- function(n) {
      if (!length(m[[n]])) list() else if (is.list(m[[n]][[1]])) m[[n]] else list(m[[n]])
    }
    ids <- unpack_meta("ids")
    idvars <- NULL
    for (i in seq_along(ids)) {
      if (is.list(ids[[i]])) {
        if (length(ids[[i]]$map) == 1 && is.character(ids[[i]]$map) && file.exists(ids[[i]]$map)) {
          ids[[i]]$map_content <- paste(readLines(ids[[i]]$map, warn = FALSE), collapse = "")
        }
      } else {
        ids[[i]] <- list(variable = ids[[i]])
      }
      if (!ids[[i]]$variable %in% idvars) idvars <- c(idvars, ids[[i]]$variable)
    }
    data <- if (format == "rds") {
      tryCatch(readRDS(f), error = function(e) NULL)
    } else {
      attempt_read(f, if (length(idvars)) idvars[1] else "")
    }
    if (is.null(data)) {
      cli_abort(c(
        paste0("failed to read in the data file ({.file {f}})"),
        i = "check that it is in a compatible format"
      ))
    }
    if (!all(rownames(data) == seq_len(nrow(data)))) data <- cbind(`_row` = rownames(data), data)
    timevar <- unlist(unpack_meta("time"))
    times <- if (is.null(timevar)) rep(1, nrow(data)) else data[[timevar]]
    times_unique <- unique(times)
    if (!single_meta) {
      varinf <- unpack_meta("variables")
      if (length(varinf) == 1 && is.character(varinf[[1]])) {
        if (!file.exists(varinf[[1]])) varinf[[1]] <- paste0(dir, "/", varinf[[1]])
        if (file.exists(varinf[[1]])) {
          if (varinf[[1]] %in% names(metas)) {
            varinf <- metas[[varinf[[1]]]]
          } else {
            varinf <- metas[[varinf[[1]]]] <- data_measure_info(varinf[[1]], write = FALSE, render = TRUE)
          }
          varinf <- varinf[varinf != ""]
        }
      }
      varinf_full <- names(varinf)
      varinf_suf <- sub("^[^:]+:", "", varinf_full)
    }
    res <- list(
      bytes = as.integer(info$size),
      encoding = stri_enc_detect(f)[[1]][1, 1],
      md5 = md5sum(f)[[1]],
      format = format,
      name = if (!is.null(setnames)) {
        setnames[file]
      } else if (!is.null(m$name)) {
        m$name
      } else {
        sub("\\.[^.]*$", "", basename(filename[[file]]))
      },
      filename = filename[[file]],
      source = unpack_meta("source"),
      ids = ids,
      id_length = if (length(idvars)) {
        id_lengths <- nchar(data[[idvars[1]]])
        if (all(id_lengths == id_lengths[1])) id_lengths[1] else 0
      } else {
        0
      },
      time = timevar,
      profile = "data-resource",
      created = as.character(info$mtime),
      last_modified = as.character(info$ctime),
      row_count = nrow(data),
      entity_count = if (length(idvars)) length(unique(data[[idvars[1]]])) else nrow(data),
      schema = list(
        fields = lapply(colnames(data)[!colnames(data) %in% idvars], function(cn) {
          v <- data[[cn]]
          invalid <- !is.finite(v)
          r <- list(name = cn, duplicates = sum(duplicated(v)))
          if (!single_meta) {
            if (cn %in% varinf_full) {
              r$info <- varinf[[cn]]
            } else if (cn %in% varinf_suf) {
              r$info <- varinf[[which(varinf_suf == cn)]]
            }
            r$info <- r$info[r$info != ""]
          }
          su <- !is.na(v)
          if (any(su)) {
            r$time_range <- which(times_unique %in% range(times[su])) - 1
            r$time_range <- if (length(r$time_range)) r$time_range[c(1, length(r$time_range))] else c(-1, -1)
          } else {
            r$time_range <- c(-1, -1)
          }
          if (!is.character(v) && all(invalid)) {
            r$type <- "unknown"
            r$missing <- length(v)
          } else if (is.numeric(v)) {
            r$type <- if (all(invalid | as.integer(v) == v)) "integer" else "float"
            r$missing <- sum(invalid)
            r$mean <- round(mean(v, na.rm = TRUE), 6)
            r$sd <- round(sd(v, na.rm = TRUE), 6)
            r$min <- round(min(v, na.rm = TRUE), 6)
            r$max <- round(max(v, na.rm = TRUE), 6)
          } else {
            r$type <- "string"
            if (!is.factor(v)) v <- as.factor(as.character(v))
            r$missing <- sum(is.na(v) | is.nan(v) | grepl("^[\\s.-]$", v))
            r$table <- structure(as.list(tabulate(v)), names = levels(v))
          }
          r
        })
      )
    )
    if (!single_meta && "_references" %in% names(varinf)) res[["_references"]] <- varinf[["_references"]]
    if (Sys.which("openssl") != "") {
      res[[paste0("sha", sha)]] <- calculate_sha(f, sha)
    }
    res
  }
  single_meta <- FALSE
  metas <- if (!is.null(names(meta))) {
    if (!is.null(setnames) && all(setnames %in% names(meta))) {
      meta[setnames]
    } else {
      single_meta <- TRUE
      if (length(meta$variables) == 1 && is.character(meta$variables)) {
        if (!file.exists(meta$variables)) meta$variables <- paste0(dir, "/", meta$variables)
        if (file.exists(meta$variables)) meta$variables <- jsonlite::read_json(meta$variables)
      }
      meta$variables <- replace_equations(meta$variables)
      meta
    }
  } else {
    meta[seq_along(filename)]
  }
  if (!single_meta) {
    metas <- lapply(metas, function(m) {
      m$variables <- replace_equations(m$variables)
      m
    })
  }
  metadata <- lapply(seq_along(filename), collect_metadata)
  if (single_meta) package$measure_info <- lapply(meta$variables, function(e) e[e != ""])
  package$resources <- c(metadata, if (!refresh) package$resources)
  names <- vapply(package$resources, "[[", "", "filename")
  if (anyDuplicated(names)) {
    package$resources <- package$resources[!duplicated(names)]
  }
  if (clean) {
    cf <- lma_dict("special", perl = TRUE, as.function = gsub)
    package <- jsonlite::fromJSON(cf(jsonlite::toJSON(package, auto_unbox = TRUE)))
  }
  if (write) {
    packagename <- if (is.character(packagename)) packagename else "datapackage.json"
    jsonlite::write_json(
      package, if (file.exists(packagename)) packagename else paste0(dir, "/", packagename),
      auto_unbox = TRUE, digits = 6
    )
    if (interactive()) {
      cli_bullets(c(v = paste(
        if (refresh) "updated resource in" else "added resource to", "datapackage.json:"
      ), "*" = paste0("{.path ", packagename, "}")))
      if (open_after) navigateToFile(packagename)
    }
  }
  invisible(package)
}
uva-bi-sdad/community documentation built on Oct. 12, 2023, 1:18 p.m.