#' Refresh Data Commons Repositories
#'
#' Clone and/or pull repositories that are part of a data commons project.
#'
#' @param dir Directory of the data commons project, as created by \code{\link{init_datacommons}}.
#' @param clone_method Means of cloning new repositories; either \code{"http"} (default) or \code{"ssh"}.
#' @param include_distributions Logical; if \code{TRUE}, will attempt to locate and cache copies of datasets
#' pointed to from the data repositories (so far just from Dataverse, implicitly from DOI files).
#' @param refresh_distributions Logical; if \code{TRUE}, will download fresh copies of the distribution metadata.
#' @param only_new Logical; if \code{TRUE}, only repositories that do not yet exist will be processed.
#' @param reset_repos Logical; if \code{TRUE}, will fetch and hard reset the repositories to remove any local changes.
#' @param reset_on_fail Logical; if \code{TRUE}, will reset only if a regular pull fails.
#' @param rescan_only Logical; if \code{TRUE}, will only read the files that are already in place, without checking for
#' updates from the remote repository.
#' @param run_checks Logical; if \code{FALSE}, will not run \code{\link{check_repository}} on each repository.
#' @param dataset_map A named vector of ID to dataset mappings to pass to \code{\link{check_repository}}
#' if \code{run_checks} is \code{TRUE}.
#' @param force_value_check Logical; if \code{TRUE}, will always intensively check values, even on large files.
#' @param verbose Logical; if \code{FALSE}, will not display status messages (such as which repositories were updated).
#' @examples
#' \dontrun{
#' # refresh from a data commons working directory
#' datacommons_refresh(".")
#' }
#' @return An invisible character vector of updated repositories.
#' @export
datacommons_refresh <- function(dir, clone_method = "http", include_distributions = FALSE, refresh_distributions = FALSE,
only_new = FALSE, reset_repos = FALSE, reset_on_fail = FALSE, rescan_only = FALSE,
run_checks = TRUE, dataset_map = "region_type", force_value_check = FALSE, verbose = TRUE) {
if (missing(dir)) cli_abort('{.arg dir} must be specified (e.g., as ".")')
if (Sys.which("git") == "") {
cli_abort(c(
x = "the {.emph git} command could not be located",
i = "you might need to install git: {.url https://git-scm.com/downloads}"
))
}
check <- check_template("datacommons", dir = dir)
if (!check$exists) {
cli_abort(c(
x = "{.arg dir} does not appear to point to a data commons project",
i = paste0('initialize it with {.code init_datacommons("', dir, '")}')
))
}
dir <- normalizePath(dir, "/", FALSE)
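# read the commons configuration and combine repository listings from commons.json and scripts/repos.txt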
commons <- jsonlite::read_json(paste0(dir, "/commons.json"))
repos <- sort(unique(unlist(Filter(length, c(
commons$repositories, readLines(paste0(dir, "/scripts/repos.txt"))
)))))
if (!length(repos)) cli_abort("no repositories are listed in {.file commons.json} or {.file scripts/repos.txt}")
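# normalize entries to "owner/name" form, then sync the cleaned list back to commons.json and repos.txt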
repos <- gsub("^[\"']+|['\"]+$|^.*github\\.com/", "", repos)
su <- !grepl("/", repos, fixed = TRUE)
if (any(su)) {
repos <- repos[su]
cli_abort("repo{?s are/ is} missing a username prefix: {.files {repos}}")
}
repos <- sub("^([^/]+/[^/#@]+)[^/]*$", "\\1", repos)
if (!identical(unlist(commons$repositories, use.names = FALSE), repos)) {
commons$repositories <- repos
jsonlite::write_json(commons, paste0(dir, "/commons.json"), auto_unbox = TRUE, pretty = TRUE)
}
writeLines(repos, paste0(dir, "/scripts/repos.txt"))
if (only_new) {
repos <- repos[!file.exists(paste0(dir, "/repos/", sub("^.*/", "", repos)))]
if (!length(repos)) {
if (verbose) cli_alert_success("no new repositories")
return(invisible(repos))
}
}
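# track which repositories are updated, have updated distribution files, or fail to retrieve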
updated <- dist_updated <- failed <- logical(length(repos))
wd <- getwd()
on.exit(setwd(wd))
repo_dir <- paste0(normalizePath(paste0(dir, "/repos/"), "/", FALSE), "/")
dir.create(repo_dir, FALSE, TRUE)
setwd(repo_dir)
method <- if (clone_method == "ssh") "git@github.com:" else "https://github.com/"
if (include_distributions) dir.create(paste0(dir, "/cache"), FALSE)
manifest_file <- paste0(dir, "/manifest/repos.json")
repo_manifest <- if (file.exists(manifest_file)) jsonlite::read_json(manifest_file) else list()
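# for each repository: clone or pull, inventory its data files, optionally cache distributions, and run checks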
for (i in seq_along(repos)) {
r <- repos[[i]]
rn <- sub("^.*/", "", r)
cr <- paste0(repo_dir, rn, "/")
if (!rescan_only) {
change_dir <- dir.exists(rn)
if (verbose) cli_alert_info(paste(if (change_dir) "pulling" else "cloning", rn))
if (change_dir) setwd(cr)
s <- tryCatch(if (change_dir) {
if (reset_repos || reset_on_fail) {
attempt <- if (reset_on_fail) system2("git", "pull", stdout = TRUE) else NULL
if (is.null(attempt) || !is.null(attr(attempt, "status"))) {
system2("git", "clean --f", stdout = TRUE)
system2("git", "fetch", stdout = TRUE)
system2("git", "reset --hard FETCH_HEAD", stdout = TRUE)
} else {
attempt
}
} else {
system2("git", "pull", stdout = TRUE)
}
} else {
system2("git", c("clone", paste0(method, r, ".git")), stdout = TRUE)
}, error = function(e) e$message)
if (change_dir) setwd(repo_dir)
if (length(s) != 1 || s != "Already up to date.") {
if (!is.null(attr(s, "status"))) {
failed[i] <- TRUE
cli_alert_warning(paste0("failed to retrieve ", r, ": ", paste(s, collapse = " ")))
} else {
updated[i] <- TRUE
}
} else if (!length(list.files(rn))) unlink(rn, recursive = TRUE, force = TRUE)
}
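# record the repository's remote URL and the size, git object hash, and md5 of each data file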
repo_manifest[[r]]$base_url <- get_git_remote(paste0(cr, ".git/config"))
files <- sort(list.files(
cr, "\\.(?:csv|tsv|txt|dat|rda|rdata)(?:\\.[gbx]z2?)?$",
full.names = TRUE, recursive = TRUE, ignore.case = TRUE
))
files <- normalizePath(files, "/")
for (f in files) {
repo_manifest[[r]]$files[[sub("^.*/repos/[^/]+/", "", f)]] <- list(
size = file.size(f),
sha = system2("git", c("hash-object", shQuote(f)), stdout = TRUE),
md5 = md5sum(f)[[1]]
)
}
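# if a Dataverse DOI is recorded for this repository and include_distributions is TRUE,
# cache the dataset's metadata and distributed files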
doi <- repo_manifest[[r]]$distributions$dataverse$doi
if (include_distributions && !is.null(doi)) {
if (verbose) {
ul <- cli_ul()
iul <- cli_ul()
cli_li("including Dataverse distribution for {.emph {doi}}")
}
meta_file <- paste0(dir, "/cache/", rn, "/dataverse_metadata.json")
meta <- if (!refresh_distributions && file.exists(meta_file)) {
jsonlite::read_json(meta_file, simplifyVector = TRUE)
} else {
tryCatch(download_dataverse_info(doi, refresh = refresh_distributions), error = function(e) NULL)
}
if (is.null(meta)) {
if (verbose) {
cli_li(col_red("failed to download Dataverse metadata for {.emph {doi}}"))
cli_end(iul)
cli_end(ul)
}
} else {
if (is.null(meta$latestVersion)) meta$latestVersion <- list(files = meta$files)
dir.create(paste0(dir, "/cache/", rn), FALSE)
jsonlite::write_json(meta, meta_file, auto_unbox = TRUE)
repo_manifest[[r]]$distributions$dataverse$id <- meta$datasetId
repo_manifest[[r]]$distributions$dataverse$server <- meta$server
repo_manifest[[r]]$distributions$dataverse$files <- list()
if (length(meta$latestVersion$files)) {
for (f in meta$latestVersion$files) {
existing <- paste0(dir, "/cache/", rn, "/", f$dataFile$filename)
if (file.exists(existing)) {
if (verbose) cli_li("checking existing version of {.file {f$dataFile$filename}}")
if (md5sum(existing) != f$dataFile$md5) unlink(existing)
}
if (!file.exists(existing)) {
if (verbose) cli_li("downloading {.file {f$dataFile$filename}}")
res <- tryCatch(download_dataverse_data(
doi, paste0(dir, "/cache/", rn),
files = f$label, load = FALSE, decompress = FALSE
), error = function(e) NULL)
if (is.null(res)) {
if (verbose) cli_li(col_red("failed to download {.file {f$dataFile$filename}}"))
} else {
dist_updated[i] <- TRUE
}
}
if (file.exists(existing)) {
repo_manifest[[r]]$distributions$dataverse$files[[sub("^.*/cache/[^/]+/", "", existing)]] <- list(
id = f$dataFile$id,
size = file.size(existing),
md5 = md5sum(existing)[[1]]
)
}
}
}
}
if (verbose) {
cli_end(iul)
cli_end(ul)
}
}
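# run repository checks and store a trimmed version of the results in the manifest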
if (run_checks) {
if (verbose) cli_progress_step("running checks...", msg_done = "ran checks:")
repo_manifest[[r]]$repo_checks <- tryCatch(
check_repository(cr, dataset = dataset_map, verbose = FALSE),
error = function(e) NULL
)
repo_manifest[[r]]$repo_checks <- lapply(repo_manifest[[r]]$repo_checks[
grep("^summary|^(?:info|warn|fail)_", names(repo_manifest[[r]]$repo_checks))
], function(l) {
if (is.character(l)) l <- sub("^.*/repos/[^/]+/", "", l)
if (!is.null(names(l))) names(l) <- sub("^.*/repos/[^/]+/", "", names(l))
l
})
if (verbose) {
cli_progress_done()
if (length(repo_manifest[[r]]$repo_checks$summary)) {
print(repo_manifest[[r]]$repo_checks$summary)
cat("\n")
}
}
}
}
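# summarize which repositories were updated, had distribution files updated, or failed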
if (verbose) {
if (any(updated)) {
updated_repos <- repos[updated]
cli_alert_success("updated data repositor{?ies/y}: {.file {updated_repos}}")
}
if (any(dist_updated)) {
updated_distributions <- repos[dist_updated]
cli_alert_success("updated distributed file{?s} in: {.file {updated_distributions}}")
}
if (any(failed)) {
failed_repos <- repos[failed]
cli_alert_danger("failed to retrieve repositor{?ies/y}: {.file {failed_repos}}")
} else if (!any(updated | dist_updated)) {
cli_alert_success("all data repositories are up to date")
}
}
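# write manifest entries for repositories that are still listed in the commons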
if (length(repo_manifest)) {
su <- names(repo_manifest) %in% repos
if (any(su)) {
jsonlite::write_json(repo_manifest[su], manifest_file, auto_unbox = TRUE)
} else {
cli_warn("no repos were found in the existing repo manifest")
}
}
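# re-initialize the project files (without triggering another refresh), then return the updated repositories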
init_datacommons(dir, refresh_after = FALSE, verbose = FALSE)
invisible(repos[updated | dist_updated])
}