knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) # for debugging # knitr::opts_chunk$set( # echo = TRUE, # message = TRUE, # warning = TRUE, # error = TRUE # ) # Chunk options .opt_width <- options(width = 450) # save the built-in output hook hook_output <- knitr::knit_hooks$get("output") # flags to determine output flag_eval_chunk <- if (vmTools:::is_windows_admin() | .Platform$OS.type %in% c("unix", "linux")) TRUE else FALSE # set a new output hook to truncate text output # - set a chunk option as e.g. : `{r chunk_name, out.lines = 15}` # if the output is too long, it will be truncated like: # # top output # ... # bottom output knitr::knit_hooks$set(output = function(x, options) { if (!is.null(n <- options$out.lines)) { x <- vmTools:::split_line_breaks(x) if (length(x) > n) { # truncate the output # x <- c(head(x, n), "....\n") x <- c(head(x, n/2), '....', tail(x, n/2 + 1)) } x <- paste(x, collapse = "\n") } hook_output(x, options) })
if(!flag_eval_chunk){ knitr::asis_output(" > **Note:** This vignette demonstrates symbolic link creation, which requires administrator privileges on Windows. > > On systems without these privileges, code chunks are not evaluated, but all code is shown. > > To fully run this vignette, use a Unix-based system or Windows with administrator rights. ") }
# Defining a couple vignette utilities print_tree <- function(x) {vmTools:::dir_tree(x)} # print a symlink's target from the file system print_symlink <- function(symlink_type){ print(grep(symlink_type, system(paste("ls -alt", root_input), intern = TRUE), value = TRUE)) } #' Get output directory for results to save in. #' #' Returns a path to save results in of the form "YYYY_MM_DD.VV". #' #' @param root path to root of output results #' @param date character date in form of "YYYY_MM_DD" or "today". "today" will be interpreted as today's date. get_output_dir <- function(root, date) { if (date == "today") { date <- format(Sys.Date(), "%Y_%m_%d") } cur.version <- get_latest_output_date_index(root, date = date) dir.name <- sprintf("%s.%02i", date, cur.version + 1) return(dir.name) } #' get the latest index for given an output dir and a date #' #' directories are assumed to be named in YYYY_MM_DD.VV format with sane #' year/month/date/version values. #' #' @param dir path to directory with versioned dirs #' @param date character in be YYYY_MM_DD format #' #' @return largest version in directory tree or 0 if there are no version OR #' the directory tree does not exist get_latest_output_date_index <- function(root, date) { currentfolders <- list.files(root) # subset to date pat <- sprintf("^%s[.]\\d{2}$", date) date_dirs <- grep(pat, currentfolders, value = T) if (length(date_dirs) == 0) { return(0) } # get the index after day date_list <- strsplit(date_dirs, "[.]") inds <- unlist(lapply(date_list, function(x) x[2])) if (is.na(max(inds, na.rm = T))) inds <- 0 return(max(as.numeric(inds))) } print_public_methods <- function(SLT){ output <- capture.output(print(SLT)) idx_private <- which(output %like% "Private") idx_clone <- which(output %like% "clone") idx_custom <- which(output %like% "startup guidance messages") # can't get this to print - frustrating # idx_custom <- which(output %like% "foo") idx_keep <- c(1:idx_private - 1, idx_custom) idx_keep <- setdiff(idx_keep, idx_clone) cat(paste0(output[idx_keep], collapse = "\n")) }
The SymLink Tool is an R6 object-oriented tool that helps a researcher manage pipeline outputs in a standard way. It falls under the category of 'standard tooling'. It will:
This assumes you have a large set of versioned output folders for your pipeline.
If you're already using a database to manage your output versions, you probably don't need this.
If you have a big mess of folders you're having difficulty tracking, this tool may help you out!
SLT
(short for SymLink Tool) is an R6 object generator, or "R6ClassGenerator".
When you want to make a new 'instance' of a tool, call the new
method on the tool's class.
Let's start by calling $new()
with no arguments.
library(vmTools) library(data.table) slt <- try(SLT$new())
OK, now that we know what the tool expects, let's feed it this information and try using it in earnest.
In my pipeline, I divert outputs to two folders:
I want to have the same version_name
of my pipeline outputs in both roots so I can correlate pre and post modeled data.
If you need to handle roots independently, then you should instantiate different versions of the tool to handle each independent root, giving each instance of the tool a unique name e.g. slt_input
and slt_output
.
version_name
is simply a string like "2024_02_02_new_covariates" that's important to you, the modeler, to tell you when and why the pipeline was run. There is no requirement for this to include a date, but it's good practice.In addition, the tool needs a location for a central log. I'll set that one level above both my output folders, since the central log will be shared between them.
version_name
level, not the folder level, so one 'best' promotion affects folders in both my output roots.# a safe temporary directory every user has access to, that we'll clean up later root_base <- file.path(tempdir(), "slt") root_input <- file.path(root_base, "to_model") root_output <- file.path(root_base, "modeled") PATHS <- list( log_cent = file.path(root_base, "log_symlinks_central.csv"), log_2024_02_02 = file.path(root_input, "2024_02_02", "logs/log_version_history.csv"), log_2024_02_10 = file.path(root_input, "2024_02_10", "logs/log_version_history.csv") )
Try to make the tool naively.
slt <- try(SLT$new( user_root_list = list( root_input = root_input, root_output = root_output ) , user_central_log_root = root_base ))
# We need to ensure all output folders exist first dir.create(root_input, recursive = TRUE, showWarnings = FALSE) dir.create(root_output, recursive = TRUE, showWarnings = FALSE) # Now everything should work suppressWarnings({ # idiosyncratic and benign cluster message slt <- SLT$new( user_root_list = list( root_input = root_input, root_output = root_output ) , user_central_log_root = root_base ) })
# multithreading can cause github actions issues .opt_mccores <- options(mc.cores = 1)
What do we have in our root_base folder?
print_tree(root_base)
We should now have a central log, and two output folder.
You can mark any output folder as 'best', and give it a 'best' symlink in each output root.
version_name
can be 'best', and the SLT will demote the current 'best' version if you promote a new version_name
.version_name
folder, and an entry in the central log.NOTE:
version_name
log record records both 'demote' and 'promote' actions.NOTE:
First we'll create two version_name
folders to play with in each root.
dir.create(file.path(root_input, "2024_02_02"), recursive = TRUE, showWarnings = FALSE) dir.create(file.path(root_output, "2024_02_02"), recursive = TRUE, showWarnings = FALSE) dir.create(file.path(root_input, "2024_02_10"), recursive = TRUE, showWarnings = FALSE) dir.create(file.path(root_output, "2024_02_10"), recursive = TRUE, showWarnings = FALSE) print_tree(root_base)
Then we'll mark one as best.
slt$mark_best(version_name = "2024_02_02", user_entry = list(comment = "testing mark_best"))
Look at the folder structure again.
print_tree(root_base)
Where does the 'best' folder point to?
print_symlink("best")
Now let's mark the other one as best, and see what happens to the symlinks.
# The tool is chatty by default at the console, but it's easy to make it quite if it's part of a pipeline. suppressMessages({ slt$mark_best(version_name = "2024_02_10", user_entry = list(comment = "testing mark_best")) })
print_tree(root_base)
print_symlink("best")
In this trio, we see the central log, and each versioned folder's log of best promotion events.
data.table::fread(PATHS$log_cent)
data.table::fread(PATHS$log_2024_02_02)
data.table::fread(PATHS$log_2024_02_10)
You've probably also noticed a report. This also shows the current state of all tool-created symlinks, built from the version_name
folder logs (not the central log).
mark
a folder.data.table::fread(file.path(root_input, "report_key_versions.csv"))
data.table::fread(file.path(root_output, "report_key_versions.csv"))
This tool was designed to allow the researcher to use it 'mid-stream' during a modeling round. I.e. you may mark existing output folders as best, and all the logging takes care of itself.
In addition, the researcher may choose to have this tool manage folder creation. This is useful if you want to ensure that all your output folders are managed by the same tool, and that the tool is aware of all the version_name
folders that exist. Further, there are more reports you can run against your version_name
logs that are more informative if you create all your folders with the tool. (See 'Other Features')
# Let's use a programmatic example to build a new `version_name.` version_name_input <- get_output_dir(root_input, "today") version_name_output <- get_output_dir(root_output, "today") if(!version_name_input == version_name_output) { stop("version_name_input and version_name_output must be the same") } version_name_today <- intersect(version_name_input, version_name_output) # This creates folders safely, and will not overwrite existing folders if called twice. slt$make_new_version_folder(version_name = version_name_today) slt$make_new_version_folder(version_name = version_name_today)
Now we can see the new folders, with their logs and creation date-time stamps.
YYYY_MM_DD.VV
if reruns on the same day are necessary.2024_02_10.01
, 2024_02_10.02
, 2024_02_10.03
, etc.print_tree(root_base)
data.table::fread(file.path(root_input, version_name_today, "logs/log_version_history.csv"))
This folder can be marked best just as the others, and prior best will be demoted.
suppressMessages({ slt$mark_best(version_name = version_name_today, user_entry = list(comment = "testing mark_best")) }) data.table::fread(PATHS$log_cent)
data.table::fread(PATHS$log_2024_02_10)
Now let's say you review your results and no version of outputs should be 'best'. You can run unmark()
to remove the 'best' status.
version_name
log shows demotion.version_name
log are the acid test of what's current.suppressMessages({ slt$unmark(version_name = version_name_today, user_entry = list(comment = "testing unmark_best")) }) print_tree(root_base)
data.table::fread(PATHS$log_cent)
data.table::fread(file.path(root_input, version_name_today, "logs/log_version_history.csv"))
data.table::fread(file.path(root_input, "report_key_versions.csv"))
We love our 'best' version of outputs as long as it's best, but time passes and we get new 'best' versions.
When it's time to remove those old folders, we can use this tool to do that safely, in two stages.
Think of the two-step process a bit like git add
and git commit
.
mark_remove
, which puts it the the 'deletion staging area'.delete_version_folders()
to actually delete the folder and update the central log.Let's demonstrate on the folder we made programmatically with today's date.
suppressMessages({ slt$mark_remove(version_name = version_name_today, user_entry = list(comment = "testing mark_remove")) })
And let's look at the folder structure.
remove_
, we combine 'remove_' with the version_name
to make the folder name unique.print_tree(root_base)
print_symlink("remove")
And finally look at the log.
data.table::fread(file.path(root_input, version_name_today, "logs/log_version_history.csv"))
The report should also update to show the new symlink.
data.table::fread(file.path(root_input, "report_key_versions.csv"))
First, let's naively try to delete a folder we haven't marked as ready for removal.
Note:
root
s.slt$delete_version_folders( version_name = "2024_02_02", user_entry = list(comment = "testing delete_version_folders") )
Now let's delete the folder we have marked as ready for removal.
root
, it will ask if you're sure you want to delete the folder.slt$delete_version_folders( version_name = version_name_today, user_entry = list(comment = "testing delete_version_folders"), require_user_input = FALSE )
Let's look at the folder structure.
remove_
symlink should be gone.print_tree(root_base)
Since we no longer have a version_name
log, we can't look at it. But we can look at the central log.
data.table::fread(PATHS$log_cent)
Let's look at the report.
version_name
in the report, since it's been deleted.data.table::fread(file.path(root_input, "report_key_versions.csv"))
Other available features will be covered briefly, and will assume the reader has already read the Symlink Tool Intro section.
It's likely you'll have other output versions you want to keep, but not as 'best'. You can mark these as 'keep'.
keep_<version_name>
suppressMessages( slt$mark_keep(version_name = "2024_02_10", user_entry = list(comment = "testing mark_keep")) )
print_tree(root_base)
print_symlink("keep")
In addition to the report_key_versions.csv
file, there are other reports available.
These will show the status of the last log row for each version_name
folder in each root
folder.
You can view things like:
NOTE: This includes a discrepancy report that shows if logs do not conform to expected standards.
# Show the types of reports currently available slt$make_reports
# Run the reports suppressMessages({ slt$make_reports() }) print_tree(root_base)
# View an example report - logs for folders with no active symlink # - you can see this folder was previously marked 'best' data.table::fread(file.path(root_input, "report_all_logs_non_symlink.csv"))
# Expect this to be absent for the vignette try(data.table::fread(file.path(root_input, "REPORT_DISCREPANCIES.csv")))
Let's say you have a set of folders you want to keep or remove, and you want to do it all at once.
We'll demonstrate by:
remove_
remove_
folders for deletionkeep_
# Make a set of dummy folders dv1 <- get_output_dir(root_input, "today") slt$make_new_version_folder(dv1) dv2 <- get_output_dir(root_input, "today") slt$make_new_version_folder(dv2) dv3 <- get_output_dir(root_input, "today") slt$make_new_version_folder(dv3) dv4 <- get_output_dir(root_input, "today") slt$make_new_version_folder(dv4) print_tree(root_base)
# Mark some as 'remove_' suppressMessages({ for(dv in c(dv1, dv2)){ slt$mark_remove(dv, user_entry = list(comment = "mark_remove for roundup")) } })
# Round up and delete roundup_remove_list <- slt$roundup_remove()
if(! identical(roundup_remove_list$root_input$version_name, roundup_remove_list$root_output$version_name)){ stop("roundup_remove found different version_names in each `root` folder") }
suppressMessages({ for(dv in roundup_remove_list$root_input$version_name){ slt$delete_version_folders( version_name = dv, user_entry = list(comment = "roundup_remove"), require_user_input = FALSE ) } }) print_tree(root_base)
Use the log creation date (first row) to round up folders created on, before, or after that date.
my_date <- format(Sys.Date(), "%Y_%m_%d") roundup_date_list <- slt$roundup_by_date( user_date = my_date, date_selector = "lte" # less than or equal to today's date )
# mark all our dummy folders (with the ".VV" pattern) as keepers dv_keep <- grep( pattern = "\\.\\d\\d" , x = roundup_date_list$root_input$version_name , value = TRUE ) suppressMessages({ for(dv in dv_keep){ slt$mark_keep(dv, user_entry = list(comment = "roundup_by_date")) } }) print_tree(root_base)
The date roundup relies on the log creation date (recall, the Linux filesystem does not record folder creation / birth dates). If you've made your own folders without the symlink tool, you can make a blank log easily. You can hand-edit the creation date if you know when the folder was made.
Note:
root
independently. So even though we're not creating a folder in both our root
s, the tool will create as many logs as it can.# Make a naive folder without a log dir.create(file.path(root_output, "2024_02_10_naive")) try(slt$make_new_log(version_name = "2024_02_10_naive"))
print_tree(root_base)
You can audit the internal state of the tool with the print_
functions.
# Print all static fields (output truncated) slt$return_dictionaries()
# ROOTS are likely most interesting to the user. slt$return_dictionaries(item_names = "ROOTS")
# Show the last 'action' the tool performed # - these fields are set as part of each 'marking' new action. slt$return_dynamic_fields()
# Finally, clean up all our temporary folders system(paste("rm -rf", root_base))
options(.opt_width) options(.opt_mccores)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.