# Chunk options and output hooks for the vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# for debugging
# knitr::opts_chunk$set(
#   echo = TRUE,
#   message = TRUE,
#   warning = TRUE,
#   error = TRUE
# )

# Chunk options
# Widen console output so wide tables are not wrapped; restored at the end.
.opt_width <- options(width = 450)

# save the built-in output hook so the custom hook below can delegate to it
hook_output <- knitr::knit_hooks$get("output")

# flags to determine output
# Evaluate chunks only where symlink creation is possible:
# on unix-likes, or on Windows when running with admin rights.
flag_eval_chunk <- vmTools:::is_windows_admin() ||
  .Platform$OS.type %in% c("unix", "linux")

# set a new output hook to truncate text output
# - set a chunk option as e.g. : `{r chunk_name, out.lines = 15}`
# if the output is too long, it will be truncated like:
#
# top output
# ....
# bottom output
knitr::knit_hooks$set(output = function(x, options) {
  if (!is.null(n <- options$out.lines)) {
    x <- vmTools:::split_line_breaks(x)
    if (length(x) > n) {
      # truncate the output: keep head and tail halves around a '....' marker
      # floor() makes the implicit truncation of odd `n` explicit
      n_half <- floor(n / 2)
      x <- c(head(x, n_half), "....", tail(x, n_half + 1))
    }
    x <- paste(x, collapse = "\n")
  }
  hook_output(x, options)
})
# Emit a verbatim markdown admonition when chunks are NOT evaluated,
# i.e. when the system lacks symlink-creation privileges (see flag above).
if(!flag_eval_chunk){ knitr::asis_output(" > **Note:** This vignette demonstrates symbolic link creation, which requires administrator privileges on Windows. > > On systems without these privileges, code chunks are not evaluated, but all code is shown. > > To fully run this vignette, use a Unix-based system or Windows with administrator rights. ") }
# Defining a couple vignette utilities

# Print a directory tree (thin wrapper around an internal vmTools helper).
print_tree <- function(x) {
  vmTools:::dir_tree(x)
}

# Make a directory with desired defaults without cluttering the vignette
make_directory <- function(path) {
  dir.create(path, recursive = TRUE, showWarnings = FALSE)
}

# print a symlink's target from the file system
# (shells out to `ls -alt`; unix-like systems only)
print_symlink <- function(symlink_type, root_input) {
  print(grep(
    symlink_type,
    system(paste("ls -alt", root_input), intern = TRUE),
    value = TRUE
  ))
}

#' Get output directory for results to save in.
#'
#' Returns a path to save results in of the form "YYYY_MM_DD.VV".
#'
#' @param root path to root of output results
#' @param date character date in form of "YYYY_MM_DD" or "today". "today" will
#'   be interpreted as today's date.
get_output_dir <- function(root, date) {
  if (date == "today") {
    date <- format(Sys.Date(), "%Y_%m_%d")
  }
  cur.version <- get_latest_output_date_index(root, date = date)
  # two-digit, zero-padded version suffix, e.g. "2024_02_01.04"
  dir.name <- sprintf("%s.%02i", date, cur.version + 1)
  return(dir.name)
}

#' get the latest index for given an output dir and a date
#'
#' directories are assumed to be named in YYYY_MM_DD.VV format with sane
#' year/month/date/version values.
#'
#' @param root path to directory with versioned dirs
#' @param date character in YYYY_MM_DD format
#'
#' @return largest version in directory tree or 0 if there are no versions OR
#'   the directory tree does not exist
get_latest_output_date_index <- function(root, date) {
  currentfolders <- list.files(root)

  # subset to folders for this date, e.g. "2024_02_01.03"
  pat <- sprintf("^%s[.]\\d{2}$", date)
  date_dirs <- grep(pat, currentfolders, value = TRUE)

  if (length(date_dirs) == 0) {
    return(0)
  }

  # get the index after the date: take the text after the "."
  date_list <- strsplit(date_dirs, "[.]")
  inds <- unlist(lapply(date_list, function(x) x[2]))

  # drop anything non-numeric before taking the max
  # (the regex guarantees digits, so this is defensive only)
  inds <- suppressWarnings(as.numeric(inds))
  inds <- inds[!is.na(inds)]
  if (length(inds) == 0) {
    return(0)
  }
  return(max(inds))
}

# Resolve a symlink to its canonical target path.
# Messages (and returns NULL invisibly) when the target does not exist.
resolve_symlink <- function(path) {
  path_resolved <- vmTools:::clean_path(path)
  if (file.exists(path_resolved)) {
    return(path_resolved)
  } else {
    message("Could not resolve symlink: ", path)
  }
}

# Log columns displayed throughout the vignette
# ("version_path" is omitted - it causes output to be too wide)
show_vars <- c("log_id", "timestamp", "user", "version_name", "action", "comment")

# Print only the public methods of an R6 class generator, hiding the
# Private section and `clone`, and renaming `initialize` to `new`.
print_public_methods <- function(class) {
  output <- capture.output(print(class))
  idx_private <- which(grepl("Private", output))
  idx_clone <- which(grepl("clone", output))
  idx_custom <- which(grepl("startup guidance messages", output))
  # `initialize` is not the name of the method, may confuse new users
  idx_initialize <- which(grepl("initialize", output))
  output[idx_initialize] <- sub("initialize", "new", output[idx_initialize])
  # keep everything above the "Private" section, plus the custom line
  # NOTE: `1:(idx_private - 1)` — the earlier `1:idx_private - 1` parsed as
  # `(1:idx_private) - 1`, silently producing a leading 0 subscript
  idx_keep <- c(1:(idx_private - 1), idx_custom)
  idx_keep <- setdiff(idx_keep, idx_clone)
  cat(paste0(output[idx_keep], collapse = "\n"))
}
The SymLink Tool is a way for researchers to manage multiple data pipeline output runs.
It's designed with user flexibility and project officers in mind, and doesn't require using anything like a database.
This tool assumes you have a large set of output folders for runs of your pipeline, and you store them on the file system.
The Symlink Tool will:
H:
or J:
drive, i.e. on the File System.

OK, that's nice, but symlinks are pretty easy to make. What else do I (the researcher) get for free if I use this tool?
The most important thing you get are some simple logs that automatically keep track of which folders have been 'best.'
Hmm, is that all? I feel like I could keep an excel document or HUB page that does the same thing?
That's very true, but this doesn't require you to remember, or do any typing yourself.
Also, maybe there are pipeline runs you want to track for different reasons.
Do you want to keep track of all those by hand?
Mark folders as 'keep', with a symlink and log entry that tells you it's important.
And your Project Officer gets things too!
That sounds pretty nice, but didn't you also say something about deleting folders? Why do I need help doing that?
You get some additional benefits - The SymLink Tool will also:
When you're ready to delete, you'll get: 1. Safety - The Symlink Tool will only delete folders that are marked 'remove'. 2. Provenance - You'll get a record in the central log telling you which pipeline runs were deleted, when, and why they were deleted (the user gets to add a comment).
I'm still reading, and curious to see how this works.
What this demonstration is.
We'll showcase the life-cycle of a typical pipeline output folder.
best_
keep_
remove_
What this demonstration is not.
This won't be an exhaustive demonstration of all the available options, this is a vignette of an average use-case.
symlink_tool_vignette_technical.Rmd
file for more detailed technical explanations.

My team uses an output_root
folder for all inputs we submit to ST-GPR.
This way we can prepare the data, then submit various ST-GPR models with
different parameters without needing to re-prep the inputs. The results of the
ST-GPR models go into an output folder, which we'll ignore for simplicity.
# Load the packages used throughout the vignette
library(vmTools)
library(data.table)

# Make the root folder (under tempdir so the vignette leaves no residue)
output_root <- file.path(tempdir(), "slt", "output_root")
dir.create(output_root, recursive = TRUE, showWarnings = FALSE)
Call on SLT bare to print class information and methods (functions linked with the tool).
# For this Intro Vignette, we're only showing public methods
# - for all methods, see the Technical Vignette
SLT
# For the intro vignette, hide private methods
print_public_methods(SLT)
When you make a new tool, this tool is tied to a specific output folder. You can't change the output folder once you've made the tool.
symlink_tool_vignette_technical.Rmd
file.

Note: You can define the root for results outputs and logs separately, but we're using the same root for simplicity.
# Instantiate (create) a new Symlink Tool object
# - one tool instance is tied permanently to this output root
# - the same root serves results and the central log here, for simplicity
slt_prep <- SLT$new( user_root_list = list("output_root" = output_root), user_central_log_root = output_root )
# multithreading can cause github actions issues
# (saved so the prior option value can be restored at the end)
.opt_mccores <- options(mc.cores = 1)
Note: SLT
is an R6 class included with the vmTools package that manages the symlink tool.
# Look at the directory tree
print_tree(output_root)
Use the Symlink Tool to create a new folder in your output root.
YYYY_MM_DD.VV
naming scheme.

date_vers1 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers1)
Capture some paths, using the Symlink Tool to help. We'll use these in a minute.
# Capture key paths from the tool's dictionaries / dynamic fields:
# the central log, the per-version log filename, and the first
# run-version folder (plus its log path).
path_log_central <- slt_prep$return_dictionaries()[["LOG_CENTRAL"]][["path"]]
fname_dv_log <- slt_prep$return_dictionaries()[["log_path"]]
root_dv1 <- slt_prep$return_dynamic_fields()[["VERS_PATHS"]][["output_root"]]
path_log_dv1 <- file.path(root_dv1, fname_dv_log)
Show the file tree.
# Show the file tree after creating the first run-version folder
print_tree(output_root)
Show central log.
# Read and display the central log (selected columns only)
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
Show new run version folder log.
# Read and display the first run-version folder's own log
(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
Now let's make some files representing models in this folder.
# Make some dummy files standing in for model outputs
fnames_my_models <- paste0("my_model_", 1:5, ".csv")
invisible(file.create(file.path(root_dv1, fnames_my_models)))
# Show the tree with the dummy model files in place
print_tree(output_root)
We like the models! We want to elevate this run version folder to best_
status.
Note: All mark_xxxx
operations require a user entry as a named list.
comment
field is currently supported (a future version will expand this).

# Mark best, and take note of messaging
slt_prep$mark_best(version_name = date_vers1, user_entry = list(comment = "Best model GBD2023"))
Inspect both the central log and ...
# Central log now records the 'mark best' action
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
...the run version folder log.
# The run-version folder log records the same action
(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
We now have a 'best' symlink that points to our 'best' run version, 2024_02_01.01
# Show the tree and resolve where the 'best' symlink points
print_tree(output_root)
resolve_symlink(file.path(output_root, "best"))
Since we run our pipelines many times, we want to track those runs.
Run the pipeline two more times on the same day, inspect the models, and make a human decision about the result quality.
# Second run
date_vers2 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers2)
# note - the dynamic fields update when you make new folders, so we won't see the dv1 path anymore
root_dv2 <- slt_prep$return_dynamic_fields()$VERS_PATHS
invisible(file.create(file.path(root_dv2, fnames_my_models)))

# Third run
date_vers3 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers3)
root_dv3 <- slt_prep$return_dynamic_fields()$VERS_PATHS
invisible(file.create(file.path(root_dv3, fnames_my_models)))
Now let's look at our file output structure, and central log.
# File tree now shows three run-version folders
print_tree(output_root)
# Central log shows an entry per created folder
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
After inspecting our results, we decide the third run is actually the best_
.
best_
status.

# Mark best, and take note of messaging
slt_prep$mark_best(version_name = date_vers3, user_entry = list(comment = "New best model GBD2023"))
Inspect the central log - The third version is now bested.
# Central log: the first version was demoted, the third promoted
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
Note: Multiple "marks" on the same folder will produce no results (but reports will still run)
# Re-marking the same folder 'best' is a no-op (reports still run)
slt_prep$mark_best(version_name = date_vers3, user_entry = list(comment = "New best model GBD2023"))
Let's also take a look inside each of the run version folder logs.
best
automatically, and the third version was marked as best
. best
symlink points to the third pipeline run.Looking at all three run-version logs we see:
# Read each run-version folder's log in turn
(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
(log_dv2 <- data.table::fread(file.path(root_dv2, fname_dv_log))[, ..show_vars])
(log_dv3 <- data.table::fread(file.path(root_dv3, fname_dv_log))[, ..show_vars])
# The 'best' symlink now resolves to the third run-version folder
print_tree(output_root)
resolve_symlink(file.path(output_root, "best"))
We want to keep the first run, even though it's not the best anymore.
best
runs. We can mark this version with a keep_
symlink.

# Mark keep, and take note of messaging
slt_prep$mark_keep(
  version_name = date_vers1,
  user_entry = list(comment = "Previous best")
)
The first version is now marked as keep
.
# Central log records the 'mark keep' action on the first version
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
Note: Marking a folder keep_
does not make it unique, like best_
. Many folders can be marked keep_
.
# The first run-version's own log records the 'keep' mark as well
(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
# Show the tree and resolve the 'keep_' symlink's target
print_tree(output_root)
resolve_symlink(file.path(output_root, "keep_2024_02_01.01"))
We want to remove the second run, because the model was experimental, or performed poorly.
remove_
symlink. From here we could use the Symlink Tool to delete the folders, or round them up for ST-GPR model deletion, etc. Either way, we now have a record of which folders are no longer needed, and why.

# Mark remove, and take note of messaging
slt_prep$mark_remove(
  version_name = date_vers2,
  user_entry = list(comment = "Obsolete dev folder")
)
Inspect the central log - The second version is now marked as remove_
.
remove_
does not make it unique, like best_
. Many folders can be marked remove
_`.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])
# Show the tree and resolve the 'remove_' symlink's target
print_tree(output_root)
resolve_symlink(file.path(output_root, "remove_2024_02_01.02"))
Now that we have marked the second run as remove_
, we can use the Symlink Tool to delete the folders.
First, we'll find (roundup
) all our remove_
folders.
# Round up all folders currently marked 'remove'
(dt_to_remove <- slt_prep$roundup_remove())
Next, we can handle them any way we choose. For this demonstration, we'll delete them.
_remove
-marked runs to free quota space, for example.for(dir_dv_remove in dt_to_remove$output_root$version_name){ slt_prep$delete_version_folders( version_name = dir_dv_remove, user_entry = list(comment = "Deleting dev folder"), require_user_input = FALSE ) } # The default setting prompts user input, but the process can be automated, as for this vignette. # # Do you want to delete the following folders? # /tmp/RtmpRmKCTu/slt/output_root/2024_02_01.02 # /tmp/RtmpRmKCTu/slt/output_root/remove_2024_02_01.02 # # 1: No # 2: Yes
Check the central log - since the folder is gone, this will maintain a record of when this folder was deleted.
# Central log retains a record of the deletion after the folder is gone
(log_central <- data.table::fread(path_log_central)[, ..show_vars])
Note: As soon as we marked a folder, there was a report ready in our folder. The report_key_versions.csv
file will scan every run-version with a Tool-created Symlink for a log, and show its last row (current status).
# Read the auto-generated report of key (symlink-marked) versions
(data.table::fread(file.path(output_root, "report_key_versions.csv")))
We can generate more reports of the pipeline runs, and the status of the folders based on different needs. These reports are useful for tracking the status of the pipeline runs, and for making decisions about which folders to keep, delete, or promote.
REPORT_DISCREPANCIES.csv
that will show issues with the run-version logs, in case some were edited by hand in ways that could cause problems.

# Generate reports
slt_prep$make_reports()
# Tree now includes the generated report files
print_tree(output_root)
The report_all_logs.csv
file will scan every run-version for a log, and show its last row (current status).
# Read the report covering every run-version log
(data.table::fread(file.path(output_root, "report_all_logs.csv")))
Two other reports sometimes diagnostically helpful are:
report_all_logs_symlink.csv
file will scan run-version folders for log of any other symlink type (in case the user hand-creates symlinks). report_all_logs_non_symlink.csv
file will scan run-version folders that are not currently marked, and show their current status.

# Clean up: remove the vignette's temporary output and restore saved options
system(paste("rm -rf", output_root))
options(.opt_width)
options(.opt_mccores)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.