In arendsee/pied: functions for compositional, bifurcating pipelines

pied is designed to cleanly separate the elements of a workflow.

Elements of analysis

A computational pipeline is a composition of these primary elements:

a directed graph of data transformations
descriptive functions of the final and possibly intermediate data
configuration handling
data validation
error handling and logging
caching results, determining if caches are out-of-date, refreshing them
parallelization and other performance issues
visualization of the pipeline

Also the code should be readable, reusable, and flexible.

Test case

pied's real power is in highly-branched workflows with complex data and many visualization and logging requirements.

In this test case I will make a simple hierarchical cluster of the iris data set.

library(pied)
library(magrittr)

# Load a builtin R dataset and return
f_load_data <- function() {
  data(iris, envir=environment()) 
  iris$Species <- NULL
  iris
}

# Normalize the numeric columns
f_normalize <- function(x) {
  as.data.frame(lapply(x, function(x) (x - mean(x)) / sd(x))) 
}

# Build a distance matrix
f_dist <- function(x, ...) {
  dist(x, ...)
}

# Build a hierarchical cluster
f_hclust <- function(x, ...) {
  hclust(x, ...)
}

With a bit of magrittr magic these can be composed into a simple pipeline that produces a single plot.

f_load_data() %>% f_normalize %>% f_dist %>% f_hclust %>% plot

This is beautiful, but what if we want to validate the inputs of each.

I can add validation code to each function, calling an error on a negative result:

# Normalize the numeric columns
f_normalize_2 <- function(x) {
  expected_names = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  if(!
    is.data.frame(x)                           &&
    length(expected_names) == length(names(x)) &&
    all(names(x) == expected_names)            &&
    all(unlist(lapply(x, is.numeric)))
  ){
    stop("Bad input")
  }
  as.data.frame(lapply(x, function(x) (x - mean(x)) / sd(x))) 
}

# Build a distance matrix
f_dist_2 <- function(x, ...) {
  if(!
    is.data.frame(x)                                        &&
    all(unlist(lapply(x, is.numeric)))                      &&
    all(unlist(lapply(x, function(x) abs(mean(x)) < 1e-5))) &&
    all(unlist(lapply(x, function(x) abs((sd(x) - 1)) < 1e-5)))
  ){
    stop("Bad input")
  }
  dist(x, ...)
}

# Build a hierarchical cluster
f_hclust_2 <- function(x, ...) {
  if(! 'dist' %in% class(x)){
    stop("Bad input")
  }
  hclust(x, ...)
}

Now what if we want to print intermediate results? We can further modify our functions.

f_dist_3 <- function(x, ...) {
  if(!
    is.data.frame(x)                                        &&
    all(unlist(lapply(x, is.numeric)))                      &&
    all(unlist(lapply(x, function(x) abs(mean(x)) < 1e-5))) &&
    all(unlist(lapply(x, function(x) abs((sd(x) - 1)) < 1e-5)))
  ){
    stop("Bad input")
  }
  result <- dist(x, ...)
  print(summary(result))
  result
}

f_load_data() %>% f_normalize_2 %>% f_dist_3 %>% f_hclust_2 %>% nothing

Note nothing is just a function that prevents anything from being printed to output (like /dev/null), just to avoid cluttering the vignette. Now what if we decide we don't like always printing summaries, but only if verbose is set.

f_dist_4 <- function(x, verbose=FALSE, ...) {
  if(!
    is.data.frame(x)                                        &&
    all(unlist(lapply(x, is.numeric)))                      &&
    all(unlist(lapply(x, function(x) abs(mean(x)) < 1e-5))) &&
    all(unlist(lapply(x, function(x) abs((sd(x) - 1)) < 1e-5)))
  ){
    stop("Bad input")
  }
  result <- dist(x, ...)
  if(verbose){
    print(summary(result))
  }
  result
}

f_load_data() %>% f_normalize_2 %>% f_dist_4(verbose=TRUE) %>% f_hclust_2 %>% nothing

Now what if we also want to optionally plot the intermediate data and optionally cache the results?

f_dist_5 <- function(x, verbose=FALSE, doplot=FALSE, cache="", ...) {
  if(!
    is.data.frame(x)                                        &&
    all(unlist(lapply(x, is.numeric)))                      &&
    all(unlist(lapply(x, function(x) abs(mean(x)) < 1e-5))) &&
    all(unlist(lapply(x, function(x) abs((sd(x) - 1)) < 1e-5)))
  ){
    stop("Bad input")
  }
  if(file.exists(cache)){
    load(cache)
  } else {
    result <- dist(x, ...)
    save(result, file=cache)
  }
  if(verbose){
    print(summary(result))
  }
  if(doplot){
    plot(result)
  }
  result
}

f_load_data()                                       %>%
    f_normalize_2                                   %>%
    f_dist_5(verbose=TRUE, cache="load_data.Rdata") %>%
    f_hclust_2                                      %>%
    nothing

This is still a very minimal example of how bloated a function can become. The original trivial function function(x,...){dist(x,...)} has grown into a 20-line unreadable mess. Even worse, this mess may need to be duplicated into the other functions.

Redoing with pied

pied's solution is to partition each function in the pipeline into the following elements: a pure function, effect functions, a cache function, a validator function, and a pass and a fail function. This composite function can be easily created with the hnode command.

Here is the minimal pied example

h_load_data <- hwell('a')
h_normalize <- hpipe('a -> b')
h_dist      <- hpipe('b -> c')
h_hclust    <- hpipe('c -> d')

h_fun(h_load_data) <- f_load_data
h_fun(h_normalize) <- f_normalize
h_fun(h_dist)      <- f_dist
h_fun(h_hclust)    <- f_hclust

connect('h_load_data --> h_normalize --> h_dist --> h_hclust')

h_hclust()

hnode returns a function that should normally be called with no arguments. Each call recursively calls the input functions until a source function is reached.

Now we can begin expanding upon these basic functions. First I will add validators. Rather than writing validation code into the pure functions, I will write them as independent functions.

v_iris <- function(x) {
  expected_names = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  is.data.frame(x)                           &&
  length(expected_names) == length(names(x)) &&
  all(names(x) == expected_names)            &&
  all(unlist(lapply(x, is.numeric)))
}

v_normed_df <- function(x, ...) {
  is.data.frame(x)                                        &&
  all(unlist(lapply(x, is.numeric)))                      &&
  all(unlist(lapply(x, function(x) abs(mean(x)) < 1e-5))) &&
  all(unlist(lapply(x, function(x) abs((sd(x) - 1)) < 1e-5)))
}

# Build a hierarchical cluster
v_dist <- function(x, ...) {
  'dist' %in% class(x)
}

h_val(h_normalize) <- v_iris
h_val(h_dist)      <- v_normed_df
h_val(h_hclust)    <- v_dist
h_hclust()

Now lets add output summary functions to each

summary_and_print <- compose(summary, print)
h_effect(h_load_data) <- summary_and_print
h_effect(h_normalize) <- summary_and_print
h_effect(h_dist)      <- summary_and_print
h_effect(h_hclust)    <- summary_and_print

Now we can run this

h_hclust()

Appendix

Full executable code

library(pied)

# Load a builtin R dataset and return
f_load_data <- function() {
  data(iris, envir=environment()) 
  iris$Species <- NULL
  iris
}

# Normalize the numeric columns
f_normalize <- function(x) {
  as.data.frame(lapply(x, function(x) (x - mean(x)) / sd(x))) 
}

# Build a distance matrix
f_dist <- function(x, ...) {
  dist(x, ...)
}

# Build a hierarchical cluster
f_hclust <- function(x, ...) {
  hclust(x, ...)
}

h_load_data <- hwell('a')
h_normalize <- hpipe('a -> b')
h_dist      <- hpipe('b -> c')
h_hclust    <- hpipe('c -> d')

h_fun(h_load_data) <- f_load_data
h_fun(h_normalize) <- f_normalize
h_fun(h_dist)      <- f_dist
h_fun(h_hclust)    <- f_hclust

connect('h_load_data --> h_normalize --> h_dist --> h_hclust')

h_hclust()