Nothing
#' Store a Data.Frame as a Git2rdata Object on Disk
#'
#' A git2rdata object consists of two files.
#' The `".tsv"` file contains the raw data as a plain text tab separated file.
#' The `".yml"` contains the metadata on the columns in plain text YAML format.
#' See `vignette("plain text", package = "git2rdata")` for more details on the
#' implementation.
#' @param x the `data.frame`.
#' @param file the name of the git2rdata object. Git2rdata objects cannot
#' have dots in their name. The name may include a relative path. `file` is a
#' path relative to the `root`.
#' Note that `file` must point to a location within `root`.
#' @param root The root of a project. Can be a file path or a `git-repository`.
#' Defaults to the current working directory (`"."`).
#' @param sorting an optional vector of column names defining which columns to
#' use for sorting `x` and in what order to use them.
#' The default empty `sorting` yields a warning.
#' Add `sorting` to avoid this warning.
#' Strongly recommended in combination with version control.
#' See `vignette("efficiency", package = "git2rdata")` for an illustration of
#' the importance of sorting.
#' @param strict What to do when the metadata changes. `strict = FALSE`
#' overwrites the data and the metadata with a warning listing the changes,
#' `strict = TRUE` returns an error and leaves the data and metadata as is.
#' Defaults to `TRUE`.
#' @param ... parameters used in some methods
#' @inheritParams meta
#' @inheritParams utils::write.table
#' @return a named vector with the file paths relative to `root`. The names
#' contain the hashes of the files.
#' @export
#' @family storage
#' @template example_io
#' @note `..generic` is a reserved name for the metadata and is a forbidden
#' column name in a `data.frame`.
write_vc <- function(
x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ...,
split_by
) {
UseMethod("write_vc", root)
}
#' @export
write_vc.default <- function(
x, file, root, sorting, strict = TRUE, optimize = TRUE, na = "NA", ...
) {
stop("a 'root' of class ", class(root), " is not supported", call. = FALSE)
}
#' @rdname write_vc
#' @param split_by An optional vector of variables name to split the text files.
#' This creates a separate file for every combination.
#' We prepend these variables to the vector of `sorting` variables.
#' @export
#' @importFrom assertthat assert_that is.string is.flag
#' @importFrom yaml read_yaml write_yaml
#' @importFrom utils write.table
#' @importFrom git2r hash
write_vc.character <- function(
x, file, root = ".", sorting, strict = TRUE, optimize = TRUE,
na = "NA", ..., split_by = character(0)
) {
assert_that(
inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na),
noNA(na), no_whitespace(na), is.flag(strict), is.flag(optimize)
)
root <- normalizePath(root, winslash = "/", mustWork = TRUE)
file <- clean_data_path(root = root, file = file)
if (!file.exists(dirname(file["raw_file"]))) {
dir.create(dirname(file["raw_file"]), recursive = TRUE)
}
if (!file.exists(file["meta_file"])) {
raw_data <- meta(
x, optimize = optimize, na = na, sorting = sorting, split_by = split_by
)
} else {
tryCatch(
is_git2rmeta(file = remove_root(file = file["meta_file"], root = root),
root = root, message = "error"),
error = function(e) {
stop(paste("Existing metadata file is invalid.", e$message, sep = "\n"),
call. = FALSE)
}
)
old <- read_yaml(file["meta_file"])
class(old) <- "meta_list"
raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting,
old = old, strict = strict, split_by = split_by)
problems <- compare_meta(attr(raw_data, "meta"), old)
if (length(problems)) {
problems <- c(
"See vignette('version_control', package = 'git2rdata') for more information.",
"", problems)
if (strict) {
problems <- c(
"The data was not overwritten because of the issues below.", problems)
stop(paste(problems, collapse = "\n"), call. = FALSE)
}
problems <- c(
"Changes in the metadata may lead to unnecessarily large diffs.",
problems)
warning(paste(problems, collapse = "\n"), call. = FALSE)
if (missing(sorting) && !is.null(old[["..generic"]][["sorting"]])) {
sorting <- old[["..generic"]][["sorting"]]
}
}
}
file["raw_file"] <- ifelse(
attr(raw_data, "meta")[["..generic"]][["optimize"]],
file["raw_file"],
gsub("\\.tsv$", ".csv", file["raw_file"])
)
assert_that(
unlink(file["raw_file"], recursive = TRUE) == 0,
msg = "Failed to remove existing files."
)
if (length(split_by) == 0) {
write.table(
x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE,
sep = ifelse(
attr(raw_data, "meta")[["..generic"]][["optimize"]], "\t", ","
),
eol = "\n", na = na, dec = ".", row.names = FALSE,
col.names = TRUE, fileEncoding = "UTF-8"
)
} else {
index <- unique(raw_data[split_by])
index[["..hash"]] <- hash(apply(index, 1, paste, collapse = "\t"))
dir.create(file["raw_file"], showWarnings = FALSE, recursive = TRUE)
write.table(
x = index, file = file.path(file["raw_file"], "index.tsv"),
append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".",
row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
)
detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by]
vapply(
seq_len(nrow(index)),
function(i) {
matching <- vapply(
split_by,
function(split) {
raw_data[[split]] == index[[split]][i]
},
logical(nrow(raw_data))
)
rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
write.table(
x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE],
file = rf,
append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na,
dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
)
return(TRUE)
},
logical(1)
)
}
meta_data <- attr(raw_data, "meta")
meta_data[["..generic"]][["git2rdata"]] <- as.character(
packageVersion("git2rdata")
)
meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"])
write_yaml(meta_data, file["meta_file"],
fileEncoding = "UTF-8")
hashes <- remove_root(file = file, root = root)
names(hashes) <-
c(
meta_data[["..generic"]][["data_hash"]],
meta_data[["..generic"]][["hash"]]
)
return(hashes)
}
#' @importFrom methods setOldClass
setOldClass("git_repository")
#' @rdname write_vc
#' @param stage Logical value indicating whether to stage the changes after
#' writing the data. Defaults to `FALSE`.
#' @inheritParams git2r::add
#' @export
#' @importFrom git2r workdir add
#' @importFrom assertthat assert_that is.flag
write_vc.git_repository <- function(
x, file, root, sorting, strict = TRUE, optimize = TRUE, na = "NA", ...,
stage = FALSE, force = FALSE
) {
assert_that(is.flag(stage), is.flag(force))
hashes <- write_vc(
x = x, file = file, root = workdir(root), sorting = sorting,
strict = strict, optimize = optimize, na = na, ...
)
if (!stage) {
return(hashes)
}
add(root, path = hashes, force = force)
return(hashes)
}
compare_meta <- function(new, old) {
new[["..generic"]][["data_hash"]] <- NULL
old[["..generic"]][["data_hash"]] <- NULL
problems <- character(0)
if (isTRUE(all.equal(new, old))) {
return(problems)
}
new_optimize <- new[["..generic"]][["optimize"]]
old_optimize <- old[["..generic"]][["optimize"]]
if (new_optimize != old_optimize) {
problems <- c(
problems,
sprintf(
"- New data is %s, whereas old data was %s.
Check the 'optimized' argument.",
ifelse(new_optimize, "optimized", "verbose"),
ifelse(old_optimize, "optimized", "verbose")
)
)
}
if (new[["..generic"]][["NA string"]] != old[["..generic"]][["NA string"]]) {
problems <- c(
problems,
sprintf(
"- New data uses '%s' as NA string, whereas old data used '%s'.
Check the 'NA' argument.",
new[["..generic"]][["NA string"]], old[["..generic"]][["NA string"]]
)
)
}
new_sorting <- new[["..generic"]][["sorting"]]
old_sorting <- old[["..generic"]][["sorting"]]
if (!isTRUE(all.equal(new_sorting, old_sorting))) {
sprintf(
"- The sorting variables changed.
- Sorting for the new data: %s.
- Sorting for the old data: %s.",
paste(sprintf("'%s'", new_sorting), collapse = ", "),
paste(sprintf("'%s'", old_sorting), collapse = ", ")
) -> extra
problems <- c(problems, extra)
}
new_split_by <- new[["..generic"]][["split_by"]]
old_split_by <- old[["..generic"]][["split_by"]]
if (!isTRUE(all.equal(new_split_by, old_split_by))) {
sprintf(
"- The split_by variables changed.
- Split_by for the new data: %s.
- Split_by for the old data: %s.",
paste(sprintf("'%s'", new_split_by), collapse = ", "),
paste(sprintf("'%s'", old_split_by), collapse = ", ")
) -> extra
problems <- c(problems, extra)
}
new <- new[names(new) != "..generic"]
old <- old[names(old) != "..generic"]
if (length(new) != length(old)) {
problems <- c(problems, "- New data has a different number of variables.")
}
if (!all(names(new) %in% names(old))) {
problems <- c(problems,
sprintf(
"- New variables: %s.",
paste(names(new)[!names(new) %in% names(old)], collapse = ", ")
)
)
}
if (!all(names(old) %in% names(new))) {
problems <- c(problems,
sprintf(
"- Deleted variables: %s.",
paste(names(old)[!names(old) %in% names(new)], collapse = ", ")
)
)
}
common_variables <- names(old)[names(old) %in% names(new)]
old_class <- vapply(old[common_variables], "[[", character(1), "class")
new_class <- vapply(new[common_variables], "[[", character(1), "class")
delta <- which(old_class != new_class)
if (length(delta)) {
problems <- c(problems,
sprintf("- Change in class: '%s' from %s to %s.", common_variables[delta],
old_class[delta], new_class[delta])
)
}
problems <- compare_factors(
problems = problems,
common_variables = common_variables[old_class == new_class],
old_class = old_class[old_class == new_class],
old = old,
new = new
)
return(problems)
}
compare_factors <- function(problems, common_variables, old_class, old, new) {
for (id in common_variables[old_class == "factor"]) {
if (old[[id]]$ordered != new[[id]]$ordered) {
problems <- c(
problems,
sprintf(
"- '%s' changes from %s to %s.", id,
ifelse(old[[id]]$ordered, "ordinal", "nominal"),
ifelse(new[[id]]$ordered, "ordinal", "nominal")
)
)
}
if (!isTRUE(all.equal(old[[id]][["labels"]], new[[id]][["labels"]]))) {
problems <- c(problems, sprintf("- New factor labels for '%s'.", id))
}
if (!isTRUE(all.equal(old[[id]][["index"]], new[[id]][["index"]]))) {
problems <- c(problems, sprintf("- New indices for '%s'.", id))
}
}
return(problems)
}
#' @noRd
#' @param file the file including the path
#' @param root the path of the root
remove_root <- function(file, root) {
n_root <- nchar(root) + 1
has_root <- substr(file, 1, n_root) == paste0(root, "/")
file[has_root] <- substr(file[has_root], n_root + 1, nchar(file[has_root]))
return(file)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.