First we create the set of consecutive step pairs and tabulate how often each transition occurs:

library(predpipe)
library(dplyr)
# Pair every step with the step that immediately follows it: `from` drops the
# last element of `step`, `to` drops the first.
# NOTE(review): pairing is done per group only if `pipes` is already a grouped
# data frame; its grouping is not visible here -- confirm against predpipe.
pairs <- pipes %>%
  do({
    steps <- .$step
    data.frame(from = head(steps, -1), to = tail(steps, -1))
  })

# For every (from, to) transition: how often it occurs (`n`), in how many
# distinct packages (`packages`), and its share of all transitions leaving the
# same `from` step (`normalized`). Most frequent transitions come first.
# NOTE(review): relies on a `package` column being present in `pairs`.
transitions <- pairs %>%
  group_by(from, to) %>%
  summarise(
    n = n(),
    packages = n_distinct(package)
  ) %>%
  group_by(from) %>%
  mutate(normalized = n / sum(n)) %>%
  ungroup() %>%
  arrange(desc(n))

transitions

The "normalized" column is dominated by 1-to-1 transitions, so we decide to include only steps (to or from) that happen at least 5 times across at least two packages. (This filtering should be examined.)

# Keep only transitions whose `from` step, and then whose `to` step, totals at
# least 5 occurrences and has a transition seen in at least 2 packages.
# The `to` filter is applied to the rows that survived the `from` filter.
# NOTE(review): `max(packages)` is the largest per-transition package count for
# the step, not the number of distinct packages the step appears in.
transitions_filtered <- transitions %>%
  group_by(from) %>%
  filter(sum(n) >= 5 & max(packages) >= 2) %>%
  group_by(to) %>%
  filter(sum(n) >= 5 & max(packages) >= 2) %>%
  ungroup() %>%
  arrange(desc(normalized))

transitions_filtered

We can then graph the transition matrix:

library(ggplot2)

# Relabel the values of `x` as "<value> (<total weight>)" and return them as a
# factor whose levels are ordered by increasing total weight.
#
# x:   vector of category labels (character or factor).
# wts: numeric weights, one per element of `x`. Defaults to 1 for every
#      element, so the total is the number of times each value occurs.
#
# Returns the factor produced by `reorder()`, the same length as `x`.
add_freq <- function(x, wts = rep(1, length(x))) {
  # The previous default, rep(1, x), used the labels themselves as the `times`
  # argument and failed for character input.
  stopifnot(length(wts) == length(x))

  # Total weight of each distinct value, repeated for every element of `x`.
  totals <- ave(wts, x, FUN = sum)

  labelled <- factor(paste0(x, " (", totals, ")"))
  reorder(labelled, wts, sum)
}

# Draw the transition matrix as a heatmap. Each step label is suffixed with its
# total count via add_freq(), which also orders the axis levels by that total;
# the tile fill is the normalized transition frequency.
transitions_filtered %>%
  ungroup() %>%
  mutate(from = add_freq(from, n)) %>%
  mutate(to = add_freq(to, n)) %>%
  ggplot(mapping = aes(x = to, y = from, fill = normalized)) +
  geom_tile() +
  theme_bw() +
  # Rotate the x-axis labels so long step names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


jimhester/predpipe documentation built on May 19, 2019, 10:33 a.m.