# dtrackr.R
# In dtrackr: Track your Data Pipelines

## ----include = FALSE----------------------------------------------------------
# Chunk options applied to every chunk of this document: `collapse = TRUE`
# and an output-line prefix of "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
# devtools::load_all()
library(dplyr)
library(dtrackr)


## -----------------------------------------------------------------------------
# devtools::load_all()
# Pretend input path. Nothing is read from it; it is only interpolated into
# the first flowchart label below via "{filename}".
filename <- "~/tmp/iris.csv"
# this is just a pretend example
# iris = read.csv(filename)

# Start tracking the built-in iris data with a headline and two messages,
# add a tagged comment while grouped by Species, add a second tagged comment
# after ungrouping, and finally draw the tracked history as a flowchart.
iris %>%
  track(
    .headline = "Iris data:",
    .messages = c("loaded from \"{filename}\"", "starts with {.count} items")
  ) %>%
  group_by(Species) %>%
  comment(
    .headline = "{.strata}",
    .messages = c(
      "In {Species}",
      "there are {.count} items",
      "that is {sprintf('%1.0f',.count/.total*100)}% of the total"
    ),
    .tag = "note1"
  ) %>%
  ungroup() %>%
  comment("Final data has {.total} items", .tag = "note2") %>%
  flowchart()


## -----------------------------------------------------------------------------

# Per-Species summary stage: `petalMean` and `petalSd` are computed inside
# status() (mean and sd of Petal.Width, formatted to one decimal place) and
# then referenced by name from the glue templates in `.messages`.
iris %>%
  track("starts with {.count} items") %>%
  group_by(Species) %>%
  status(
    petalMean = sprintf("%1.1f", mean(Petal.Width)),
    petalSd = sprintf("%1.1f", sd(Petal.Width)),
    .messages = c(
      "In {Species} the petals are",
      "on average {petalMean} \u00B1 {petalSd} cms wide"
    )
  ) %>%
  # the ungroup step carries its own message
  ungroup(.messages = "ends with {.total} items") %>%
  flowchart()


## -----------------------------------------------------------------------------
# Track the diamonds data, group it by `cut`, and for each value of `color`
# emit one message line showing `.count` as a percentage of `.subtotal`
# (formatted to one decimal place), then draw the flowchart.
ggplot2::diamonds %>%
  track() %>%
  group_by(cut) %>%
  count_subgroup(
    color,
    # Everything between one pair of glue braces is plain R code, so .count
    # and .subtotal are used as ordinary variables inside the sprintf() call.
    .messages = "colour {.name}: {sprintf('%1.1f%%', .count / .subtotal * 100)}"
  ) %>%
  ungroup() %>%
  flowchart()

## -----------------------------------------------------------------------------

# Tracked filter: within each Species keep rows whose Petal.Width is below
# that group's mean plus one standard deviation, then ungroup and draw the
# flowchart. No messages are supplied, so the step uses the defaults.
iris %>%
  track() %>%
  group_by(Species) %>%
  filter(Petal.Width < mean(Petal.Width) + sd(Petal.Width)) %>%
  ungroup() %>%
  flowchart()


## -----------------------------------------------------------------------------

# Exclusion-criteria example. Each `condition ~ "label"` formula pairs a
# row-level test with the glue template used to report it.
#   1. Ungrouped: one criterion on Species == "versicolor".
#   2. Grouped by Species: four criteria, each testing a measurement against
#      that group's mean plus one standard deviation.
# The tracked result is kept in `dataset1` and then rendered.
dataset1 <- iris %>%
  track() %>%
  comment("starts with {.count} items") %>%
  exclude_all(
    Species == "versicolor" ~ "removing {.excluded} versicolor"
  ) %>%
  group_by(Species) %>%
  comment("{Species} has {.count} items") %>%
  exclude_all(
    Petal.Width > mean(Petal.Width) + sd(Petal.Width) ~
      "{.excluded} with petals > 1 SD wider than the mean",
    Petal.Length > mean(Petal.Length) + sd(Petal.Length) ~
      "{.excluded} with petals > 1 SD longer than the mean",
    Sepal.Width > mean(Sepal.Width) + sd(Sepal.Width) ~
      "{.excluded} with sepals > 1 SD wider than the mean",
    Sepal.Length > mean(Sepal.Length) + sd(Sepal.Length) ~
      "{.excluded} with sepals > 1 SD longer than the mean"
  ) %>%
  comment("{Species} now has {.count} items") %>%
  ungroup() %>%
  comment("ends with {.total} items")

flowchart(dataset1)


## -----------------------------------------------------------------------------

# Inclusion-criteria example, the counterpart of the exclusion pipeline.
# Each `condition ~ "label"` formula pairs a row-level test with the glue
# template used to report it.
#   1. Ungrouped: criteria on Species being "versicolor" or "setosa".
#   2. Grouped by Species: four criteria, each testing a measurement against
#      that group's mean plus one standard deviation.
# The grouped predicates use `<=` so that they agree with their "<= 1 SD"
# labels and are the exact complement of a `>` exclusion test; a row lying
# exactly on the threshold is therefore counted as the label says.
# The tracked result is kept in `dataset2` and then rendered.
dataset2 <- iris %>%
  track() %>%
  comment("starts with {.count} items") %>%
  include_any(
    Species == "versicolor" ~ "{.included} versicolor",
    Species == "setosa" ~ "{.included} setosa"
  ) %>%
  #mutate(Species = forcats::fct_drop(Species)) %>%
  group_by(Species) %>%
  comment("{Species} has {.count} items") %>%
  include_any(
    Petal.Width <= mean(Petal.Width) + sd(Petal.Width) ~
      "{.included} with petals <= 1 SD wider than the mean",
    Petal.Length <= mean(Petal.Length) + sd(Petal.Length) ~
      "{.included} with petals <= 1 SD longer than the mean",
    Sepal.Width <= mean(Sepal.Width) + sd(Sepal.Width) ~
      "{.included} with sepals <= 1 SD wider than the mean",
    Sepal.Length <= mean(Sepal.Length) + sd(Sepal.Length) ~
      "{.included} with sepals <= 1 SD longer than the mean"
  ) %>%
  comment("{Species} now has {.count} items") %>%
  ungroup() %>%
  comment("ends with {.total} items")

flowchart(dataset2)


## -----------------------------------------------------------------------------

# Pipeline with capture_exclusions() switched on right after track().
# Stages, in order:
#   - an ungrouped exclude_all() on Petal.Length, labelled with a `.stage`;
#   - a grouped filter() on the 5% quantile of Sepal.Length, reported as a
#     "comment"-type step with a custom message and its own `.stage`;
#   - a grouped exclude_all() on Petal.Width.
# A "leaving {.count}" comment follows every stage. The result is left
# grouped by Species and stored in `tmp`, then rendered.
tmp <- iris %>%
  track() %>%
  capture_exclusions() %>%
  exclude_all(
    Petal.Length > 5.8 ~ "{.excluded} long ones",
    Petal.Length < 1.3 ~ "{.excluded} short ones",
    .stage = "petal length exclusion"
  ) %>%
  comment("leaving {.count}") %>%
  group_by(Species) %>%
  filter(
    Sepal.Length >= quantile(Sepal.Length, 0.05),
    .messages = "removing {.count.in-.count.out} with sepals < q 0.05",
    .type = "comment",
    .stage = "sepal length exclusion"
  ) %>%
  comment("leaving {.count}") %>%
  exclude_all(
    Petal.Width < 0.2 ~ "{.excluded} narrow ones",
    Petal.Width > 2.1 ~ "{.excluded} wide ones"
  ) %>%
  comment("leaving {.count}")

flowchart(tmp)


## -----------------------------------------------------------------------------
# Print what excluded() returns for the tracked pipeline stored in `tmp`.
excluded(tmp)


## -----------------------------------------------------------------------------
# Pull the tracked history out of `tmp` so its parts can be inspected.
tmp2 <- p_get(tmp)

# the nodes, .id is a graph unique identifier
glimpse(tmp2$nodes)

# the edges, .to and .from are foreign keys for .id
glimpse(tmp2$edges)

## -----------------------------------------------------------------------------
# Write the output of p_get_as_dot() for `tmp` to the console as raw text.
cat(p_get_as_dot(tmp))