Symlink Tool Intro Vignette"

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# for debugging
# knitr::opts_chunk$set(
#   echo = TRUE,
#   message = TRUE,
#   warning = TRUE,
#   error = TRUE
# )

# Chunk options
.opt_width <- options(width = 450)

# save the built-in output hook
hook_output <- knitr::knit_hooks$get("output")
# flags to determine output
flag_eval_chunk <- vmTools:::is_windows_admin() || .Platform$OS.type == "unix"

# set a new output hook to truncate text output
# - set a chunk option as e.g. : `{r chunk_name, out.lines = 15}`
# if the output is too long, it will be truncated like:
# 
# top output
# ...
# bottom output
knitr::knit_hooks$set(output = function(x, options) {
  if (!is.null(n <- options$out.lines)) {
    x <- vmTools:::split_line_breaks(x)
    if (length(x) > n) {
      # truncate the output
      # x <- c(head(x, n), "....\n")
      x <- c(head(x, n/2), '....', tail(x, n/2 + 1))
    }
    x <- paste(x, collapse = "\n")
  }
  hook_output(x, options)
})
if(!flag_eval_chunk){

 knitr::asis_output("
> **Note:** This vignette demonstrates symbolic link creation, which requires administrator privileges on Windows.
> 
> On systems without these privileges, code chunks are not evaluated, but all code is shown.
> 
> To fully run this vignette, use a Unix-based system or Windows with administrator rights.
")

} 
# Defining a couple vignette utilities
print_tree <- function(x) {vmTools:::dir_tree(x)}

# Make a directory with desired defaults without cluttering the vignette
make_directory <- function(path){
  dir.create(path, recursive = TRUE, showWarnings = FALSE)
}

# print a symlink's target from the file system 
print_symlink <- function(symlink_type, root_input){
  print(grep(symlink_type, system(paste("ls -alt", root_input), intern = TRUE), value = TRUE))
}

#' Get output directory for results to save in.
#'
#' Returns a versioned directory name of the form "YYYY_MM_DD.VV" (not joined to root).
#'
#' @param root path to root of output results
#' @param date character date in form of "YYYY_MM_DD" or "today". "today" will be interpreted as today's date.
get_output_dir <- function(root, date) {
  if (date == "today") {
    date <- format(Sys.Date(), "%Y_%m_%d")
  }
  cur.version <- get_latest_output_date_index(root, date = date)

  dir.name <- sprintf("%s.%02i", date, cur.version + 1)
  return(dir.name)
}
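# e.g., with no existing folders for that date, get_output_dir(root, "2024_02_01")
# returns "2024_02_01.01"; once that folder exists, the next call returns "2024_02_01.02".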

#' Get the latest version index for a given output dir and date.
#'
#' Directories are assumed to be named in YYYY_MM_DD.VV format with sane
#' year/month/day/version values.
#'
#' @param root path to directory with versioned dirs
#' @param date character date in YYYY_MM_DD format
#'
#' @return largest version in the directory tree, or 0 if there are no versions OR
#' the directory tree does not exist
get_latest_output_date_index <- function(root, date) {
  currentfolders <- list.files(root)

  # subset to date
  pat <- sprintf("^%s[.]\\d{2}$", date)
  date_dirs <- grep(pat, currentfolders, value = TRUE)

  if (length(date_dirs) == 0) {
    return(0)
  }

  # get the index after day
  date_list <- strsplit(date_dirs, "[.]")

  inds <- unlist(lapply(date_list, function(x) x[2]))
  if (is.na(max(inds, na.rm = TRUE))) inds <- 0

  return(max(as.numeric(inds)))
}

# Vignette helper: resolve a symlink path to its target on disk
# (returns NULL with a message if it cannot be resolved)
resolve_symlink <- function(path){
   path_resolved <- vmTools:::clean_path(path)
   if(file.exists(path_resolved)) {
      return(path_resolved)
   } else {
      message("Could not resolve symlink: ", path)
   }
}

show_vars <- c("log_id",
               "timestamp",
               "user",
               "version_name",
               # "version_path", # causes ouput to be too wide
               "action",
               "comment")

print_public_methods <- function(class){
  output <- capture.output(print(class))
  idx_private <- which(grepl("Private", output))
  idx_clone <- which(grepl("clone", output))
  idx_custom <- which(grepl("startup guidance messages", output))
  # R6 displays `initialize`, but users call `$new()` - relabel to avoid confusing new users
  idx_initialize <- which(grepl("initialize", output))
  output[idx_initialize] <- sub("initialize", "new", output[idx_initialize])
  # keep everything above the "Private" section, plus the custom guidance-message line
  idx_keep <- c(1:(idx_private - 1), idx_custom)
  idx_keep <- setdiff(idx_keep, idx_clone)
  cat(paste0(output[idx_keep], collapse = "\n"))
}

What is the SymLink Tool?

Give Me a High-Level Overview

The SymLink Tool is a way for researchers to manage multiple data pipeline output runs.

It's designed with user flexibility and project officers in mind, and it doesn't require anything like a database.

Is This for Me and My Team?

This tool assumes you have a large set of output folders for runs of your pipeline, and you store them on the file system.

Give Me a Little More Detail

The Symlink Tool will:

  1. Create a 'best' symlink to the 'best' pipeline run of your outputs (see the sketch after this list).
    1. This is just a shortcut link on the file system, e.g. in the H: or J: drive.
    2. It gives you a stable file path to results that you and others never need to update.
  2. Safely update the 'best' symlink when you, the researcher, want to 'promote' a new pipeline run to being 'best'.
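For readers who haven't worked with symlinks before, here is a minimal base-R sketch of the kind of bookkeeping the tool automates. The paths are illustrative only, and (as noted at the top of this vignette) creating symlinks on Windows requires administrator privileges.

# Illustrative only - the Symlink Tool wraps this kind of bookkeeping for you.
demo_run1 <- file.path(tempdir(), "demo_2024_01_01.01")
demo_run2 <- file.path(tempdir(), "demo_2024_01_01.02")
demo_best <- file.path(tempdir(), "demo_best")
invisible(lapply(c(demo_run1, demo_run2), dir.create, showWarnings = FALSE))

# Point a stable 'best' path at a specific run folder.
file.symlink(from = demo_run1, to = demo_best)

# 'Promoting' another run means removing the old link and re-pointing it.
unlink(demo_best)
file.symlink(from = demo_run2, to = demo_best)

# Clean up the illustration.
unlink(c(demo_best, demo_run1, demo_run2), recursive = TRUE)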

OK, that's nice, but symlinks are pretty easy to make. What else do I (the researcher) get for free if I use this tool?

The most important thing you get is a set of simple logs that automatically keep track of which folders have been 'best' (see the sketch after the list below).

  1. Maintain a central log.
    1. Which pipeline run is currently 'best'?
    2. Which older runs were ever 'best', in case you need to 'roll back' what you consider 'best'?
    3. Track deletion of old pipeline runs.
  2. Maintain run-specific logs.
    1. When was this run started? (the file system doesn't track this!)
    2. When was it marked 'best'?
    3. If it stopped being 'best', when did that happen?
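Both the central log and the run-specific logs are flat, delimited text files (read with data.table::fread in the demonstration below). As a rough sketch, the columns displayed throughout this vignette are the show_vars selection defined in the setup chunk:

# Log columns selected for display in this vignette (see `show_vars` above):
# log_id, timestamp, user, version_name, version_path (not displayed - too wide), action, comment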

Hmm, is that all? I feel like I could keep an Excel document or HUB page that does the same thing.

That's true, but this way you don't have to remember to update anything or do any typing yourself.

Also, there may be pipeline runs you want to track for reasons other than being 'best' - the tool handles those too (see 'keep' and 'remove' below).


And your Project Officer gets something too!

  1. Reports, built from all these logs, on the status of all your important pipeline outputs.

That sounds pretty nice, but didn't you also say something about deleting folders? Why do I need help doing that?

You get some additional benefits - The SymLink Tool will also:

  1. Let you mark folders you want to 'remove' before you actually delete them.
    1. This acts as a staging area, giving you time to decide before anything is actually deleted.
    2. You can also 'unmark' them for removal if you change your mind.

When you're ready to delete, you'll get:

  1. Safety - the Symlink Tool will only delete folders that are marked 'remove'.
  2. Provenance - the central log records which pipeline runs were deleted, when, and why (you get to add a comment). See the Delete Folders section of the demonstration below.

I'm still reading, and curious to see how this works.



Demonstration

What this demonstration is.

We'll showcase the life-cycle of a typical pipeline output folder.

What this demonstration is not.

This won't be an exhaustive demonstration of all the available options; it's a vignette of an average use-case.

Set Your Output Folder

My team uses an output_root folder for all inputs we submit to ST-GPR.
This way we can prepare the data once, then submit various ST-GPR models with different parameters without needing to re-prep the inputs. The results of the ST-GPR models go into a separate output folder, which we'll ignore for simplicity.

library(vmTools)
library(data.table)
# Make the root folder
output_root <- file.path(tempdir(), "slt", "output_root")
dir.create(output_root, 
           recursive    = TRUE, 
           showWarnings = FALSE)

Make the Symlink Tool

Print SLT on its own (without calling any of its methods) to see class information and the methods (functions) linked with the tool.

# For this Intro Vignette, we're only showing public methods 
# - for all methods, see the Technical Vignette
SLT
# For the intro vignette, hide private methods
print_public_methods(SLT)

When you make a new tool, this tool is tied to a specific output folder. You can't change the output folder once you've made the tool.

Note: You can define the root for results outputs and logs separately, but we're using the same root for simplicity.

# Instantiate (create) a new Symlink Tool object
slt_prep <- SLT$new(
      user_root_list        = list("output_root" = output_root),
      user_central_log_root = output_root
   )
# multithreading can cause GitHub Actions issues
.opt_mccores <- options(mc.cores = 1)

Note: SLT is an R6 class included with the vmTools package; it implements the Symlink Tool.
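As noted above, the results root and the central-log root can be different locations. A minimal sketch of that pattern, with illustrative tempdir() paths (the "slt_alt" directory names here are hypothetical):

# Hedged sketch - separate roots for results and for the central log (paths are illustrative).
results_root <- file.path(tempdir(), "slt_alt", "results")
logs_root    <- file.path(tempdir(), "slt_alt", "logs")
invisible(lapply(c(results_root, logs_root), dir.create, recursive = TRUE, showWarnings = FALSE))

slt_alt <- SLT$new(
      user_root_list        = list("output_root" = results_root),
      user_central_log_root = logs_root
   )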

# Look at the directory tree
print_tree(output_root)

New Folder

Use the Symlink Tool to create a new folder in your output root.

date_vers1 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers1)

Capture some paths, using the Symlink Tool to help. We'll use these in a minute.

path_log_central <- slt_prep$return_dictionaries()[["LOG_CENTRAL"]][["path"]]
fname_dv_log     <- slt_prep$return_dictionaries()[["log_path"]]
root_dv1         <- slt_prep$return_dynamic_fields()[["VERS_PATHS"]][["output_root"]]
path_log_dv1     <- file.path(root_dv1, fname_dv_log)

Show the file tree.

print_tree(output_root)

Show central log.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])

Show new run version folder log.

(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])

Produce Model Results

Now let's make some files representing models in this folder.

# Make some dummy files
fnames_my_models <- paste0("my_model_", 1:5, ".csv")
invisible(file.create(file.path(root_dv1, fnames_my_models)))
print_tree(output_root)

Mark Best

We like the models! We want to elevate this run version folder to best_ status.

Note: All mark_xxxx operations require a user entry as a named list.

# Mark best, and take note of messaging
slt_prep$mark_best(version_name = date_vers1,
                   user_entry   = list(comment = "Best model GBD2023"))

Inspect both the central log and ...

(log_central <- data.table::fread(path_log_central)[, ..show_vars])

...the run version folder log.

(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])

We now have a 'best' symlink that points to our 'best' run version, 2024_02_01.01.

print_tree(output_root)

resolve_symlink(file.path(output_root, "best"))

New Pipeline Runs

Since we run our pipelines many times, we want to track those runs.

Run the pipeline two more times on the same day, inspect the models, and make a human decision about the result quality.

# Second run
date_vers2 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers2)

# note - the dynamic fields update when you make new folders, so we won't see the dv1 path anymore
root_dv2   <- slt_prep$return_dynamic_fields()[["VERS_PATHS"]][["output_root"]]
invisible(file.create(file.path(root_dv2, fnames_my_models)))

# Third run
date_vers3 <- get_output_dir(output_root, "2024_02_01")
slt_prep$make_new_version_folder(version_name = date_vers3)
root_dv3   <- slt_prep$return_dynamic_fields()[["VERS_PATHS"]][["output_root"]]
invisible(file.create(file.path(root_dv3, fnames_my_models)))

Now let's look at our output file structure and the central log.

print_tree(output_root)
(log_central <- data.table::fread(path_log_central)[, ..show_vars])

Mark New Best

After inspecting our results, we decide the third run is actually the best_.

# Mark best, and take note of messaging
slt_prep$mark_best(version_name = date_vers3,
                   user_entry   = list(comment = "New best model GBD2023"))

Inspect the central log - The third version is now marked as best_.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])

Note: Multiple "marks" on the same folder will produce no results (but reports will still run)

slt_prep$mark_best(version_name = date_vers3,
                   user_entry   = list(comment = "New best model GBD2023"))

Let's also take a look inside each of the three run-version folder logs:

(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
(log_dv2 <- data.table::fread(file.path(root_dv2, fname_dv_log))[, ..show_vars])
(log_dv3 <- data.table::fread(file.path(root_dv3, fname_dv_log))[, ..show_vars])
print_tree(output_root)

resolve_symlink(file.path(output_root, "best"))

Mark Keep

We want to keep the first run, even though it's not the best anymore.

# Mark keep, and take note of messaging
slt_prep$mark_keep(
   version_name = date_vers1,
   user_entry   = list(comment = "Previous best")
)

The first version is now marked as keep_.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])

Note: Marking a folder keep_ does not make it unique, unlike best_. Many folders can be marked keep_ at the same time.
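For example (not run here, so the demonstration's state stays unchanged), any other run could carry a keep_ mark alongside the first; the version name below is a placeholder.

# Not run - 'keep' marks are not exclusive, so several runs can carry one at once:
# slt_prep$mark_keep(version_name = "<another_version_name>",
#                    user_entry   = list(comment = "Keep for comparison"))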

(log_dv1 <- data.table::fread(path_log_dv1)[, ..show_vars])
print_tree(output_root)

resolve_symlink(file.path(output_root, "keep_2024_02_01.01"))

Mark Remove

We want to remove the second run because the model was experimental or performed poorly.

# Mark remove, and take note of messaging
slt_prep$mark_remove(
   version_name = date_vers2,
   user_entry   = list(comment = "Obsolete dev folder"))

Inspect the central log - The second version is now marked as remove_.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])
print_tree(output_root)

resolve_symlink(file.path(output_root, "remove_2024_02_01.02"))

Delete Folders

Now that we have marked the second run as remove_, we can use the Symlink Tool to delete the folders.

First, we'll round up all our remove_ folders using roundup_remove().

(dt_to_remove <- slt_prep$roundup_remove())

Next, we can handle them any way we choose. For this demonstration, we'll delete them.

for(dir_dv_remove in dt_to_remove$output_root$version_name){
   slt_prep$delete_version_folders(
      version_name       = dir_dv_remove,
      user_entry         = list(comment = "Deleting dev folder"),
      require_user_input = FALSE
   )
}

# The default setting prompts for user input, but the process can be automated, as it is for this vignette.
# 
# Do you want to delete the following folders?
#   /tmp/RtmpRmKCTu/slt/output_root/2024_02_01.02
#   /tmp/RtmpRmKCTu/slt/output_root/remove_2024_02_01.02 
# 
# 1: No
# 2: Yes

Check the central log - even though the folder is gone, the log keeps a record of when it was deleted.

(log_central <- data.table::fread(path_log_central)[, ..show_vars])

Reports

Note: As soon as we marked a folder, a report was ready in our output root. The report_key_versions.csv file scans every run version that has a Tool-created symlink for a log, and shows that log's last row (current status).

(data.table::fread(file.path(output_root, "report_key_versions.csv")))

We can generate further reports on the pipeline runs and the status of their folders, depending on our needs. These reports are useful for tracking the status of pipeline runs and for deciding which folders to keep, delete, or promote.

# Generate reports
slt_prep$make_reports()
print_tree(output_root)

The report_all_logs.csv file scans every run version for a log and shows that log's last row (current status).

(data.table::fread(file.path(output_root, "report_all_logs.csv")))

Two other reports are sometimes diagnostically helpful; they are not covered in this introduction.

Finally, clean up the temporary output folder and restore the original options.

system(paste("rm -rf", output_root))
options(.opt_width)
options(.opt_mccores)

