In davidsjoberg/ggsankey: Sankey, Alluvial and Sankey Bump Plots

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%",
  dpi = 300,
  comment = FALSE,
  error = FALSE,
  warning = FALSE,
  fig.height = 4,
  fig.width = 8
)

library(ggsankey)
library(dplyr)
library(ggplot2)
library(tidyr)
library(titanic)
library(gapminder)

ggsankey

The goal of ggsankey is to make beautiful sankey, alluvial and sankey bump plots in ggplot2

Installation

You can install the development version of ggsankey from github with:

# install.packages("devtools")
devtools::install_github("davidsjoberg/ggsankey")

How does it work

Google defines a sankey as:

A sankey diagram is a visualization used to depict a flow from one set of values to another. The things being connected are called nodes and the connections are called links. Sankeys are best used when you want to show a many-to-many mapping between two domains or multiple paths through a set of stages.

To plot a sankey diagram with ggsankey each observation has a stage (called a discrete x-value in ggplot) and be part of a node. Furthermore, each observation needs to have instructions of which node it will belong to in the next stage. See the image below for some clarification.

# Ellips 1
xmiddle <- 2
data <- tibble(x0 = xmiddle, y0 = 180, a = .2, b = 280, angle = 0)
data$m1 <- ifelse(is.null(data$m1), 2, data$m1)
data$m2 <- ifelse(is.null(data$m2), data$m1, data$m2)
n_ellipses <- nrow(data)
n <- 360

data <- data[rep(seq_len(n_ellipses), each = n), ]
points <- rep(seq(0, 2 * pi, length.out = n + 1)[seq_len(n)],
              n_ellipses)
cos_p <- cos(points)
sin_p <- sin(points)
x_tmp <- abs(cos_p)^(2 / data$m1) * data$a * sign(cos_p)
y_tmp <- abs(sin_p)^(2 / data$m2) * data$b * sign(sin_p)
data$x <- data$x0 + x_tmp * cos(data$angle) - y_tmp * sin(data$angle)
data$y <- data$y0 + x_tmp * sin(data$angle) + y_tmp * cos(data$angle)
data1 <- data %>% 
  filter(x >= xmiddle) %>% 
  arrange(y)
data2 <- data %>% 
  filter(x <= xmiddle) %>% 
  arrange(y)

# Ellips 2
xmiddle <- 3.5
data <- tibble(x0 = xmiddle, y0 = 385, a = .07, b = 60, angle = 0)
data$m1 <- ifelse(is.null(data$m1), 2, data$m1)
data$m2 <- ifelse(is.null(data$m2), data$m1, data$m2)
n_ellipses <- nrow(data)
n <- 360

data <- data[rep(seq_len(n_ellipses), each = n), ]
points <- rep(seq(0, 2 * pi, length.out = n + 1)[seq_len(n)],
              n_ellipses)
cos_p <- cos(points)
sin_p <- sin(points)
x_tmp <- abs(cos_p)^(2 / data$m1) * data$a * sign(cos_p)
y_tmp <- abs(sin_p)^(2 / data$m2) * data$b * sign(sin_p)
data$x <- data$x0 + x_tmp * cos(data$angle) - y_tmp * sin(data$angle)
data$y <- data$y0 + x_tmp * sin(data$angle) + y_tmp * cos(data$angle)
datat1 <- data %>% 
  filter(x >= xmiddle) %>% 
  arrange(y)
datat2 <- data %>% 
  filter(x <= xmiddle) %>% 
  arrange(y)

# Ellips 3
xmiddle <- 3
data <- tibble(x0 = xmiddle, y0 = 0, a = .15, b = 600, angle = 0)
data$m1 <- ifelse(is.null(data$m1), 2, data$m1)
data$m2 <- ifelse(is.null(data$m2), data$m1, data$m2)
n_ellipses <- nrow(data)
n <- 360

data <- data[rep(seq_len(n_ellipses), each = n), ]
points <- rep(seq(0, 2 * pi, length.out = n + 1)[seq_len(n)],
              n_ellipses)
cos_p <- cos(points)
sin_p <- sin(points)
x_tmp <- abs(cos_p)^(2 / data$m1) * data$a * sign(cos_p)
y_tmp <- abs(sin_p)^(2 / data$m2) * data$b * sign(sin_p)
data$x <- data$x0 + x_tmp * cos(data$angle) - y_tmp * sin(data$angle)
data$y <- data$y0 + x_tmp * sin(data$angle) + y_tmp * cos(data$angle)
datatt1 <- data %>% 
  filter(x >= xmiddle) %>% 
  arrange(y)
datatt2 <- data %>% 
  filter(x <= xmiddle) %>% 
  arrange(y)

# PLOT
df <- titanic::titanic_train %>%
  as_tibble() %>%
  drop_na() %>%
  make_long(Embarked, Sex, Pclass, Survived)

df <- df %>%
  dplyr::mutate(
    shift = case_when(
      x == "Embarked" & node == "S" ~ 300,
      T ~ 0
    )
  )

ggplot(df, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node, shift = shift)) +
  geom_sankey(color = "transparent", fill = "transparent") +
  geom_path(data = data1, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_path(data = datat1, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_path(data = datatt1, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_sankey(node.color = "black", flow.color = "black") +
  # geom_sankey_label(size = 3, color = "black", fill = "white") +
  geom_path(data = data2, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_path(data = datat2, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_path(data = datatt2, aes(x, y), inherit.aes = F, color = "red", linewidth = 1.5) +
  geom_text(data = tibble(x = c(3.5, 2, 3.5), y = c(510, 520, -570), label = c("Flow", "Node", "Stage (x)")), aes(x, y, label = label), inherit.aes = F, color = "red", size = 8) +
  scale_fill_viridis_d(drop = FALSE) +
  scale_x_discrete(expand = scales::expand_range(.2)) +
  theme_void(base_size = 18) +
  labs(x = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = .5)) +
  labs(y = NULL,
       title = "Principal aesthetics")
# ggsave("sankey_aes.png", dpi = 800, height = 4, width = 8)

Hence, to use geom_sankey the aesthetics x, next_x, node and next_node are required. The last stage should point to NA. The aesthetics fill and color will affect both nodes and flows.

ggplot(df, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node, shift = shift)) +
  geom_sankey(color = "transparent", fill = "transparent") +
  geom_sankey(node.color = "black", flow.color = "black") +
    # node fill

  # aes fill
  geom_text(aes(1.8, 600, label = "fill"), color = "black", inherit.aes = F, size = 8, hjust = 0) +
  geom_curve(aes(1.85, 550, xend = 2, yend = 300), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = .3) +
  geom_curve(aes(2, 550, xend = 2.2, yend = 300), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = -.3) +

  # aes color
  geom_text(aes(2.8, 600, label = "color"), color = "black", inherit.aes = F, size = 8, hjust = 0) +
  geom_curve(aes(3, 550, xend = 3, yend = 455), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = .3) +
  geom_curve(aes(3.07, 550, xend = 3.2, yend = 455), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = -.3) +

  # aes shift
  geom_text(aes(.55, 100, label = "shift"), color = "black", inherit.aes = F, size = 8, hjust = 0) +
  geom_segment(aes(x = 1, xend = 1, y = -100, yend = 200), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches"))) +

  scale_fill_viridis_d(option = "A", drop = FALSE) +
  scale_x_discrete(expand = scales::expand_range(.2)) +
  theme_void(base_size = 18) +
  labs(x = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = .5)) +
  labs(y = NULL,
       title = "Additional aesthetics")
# ggsave("sankey_aes.png", dpi = 800, height = 4, width = 8)

To control geometries (not changed by data) like fill, color, size, alpha etc for nodes and flows you can either choose to set a global value that affect both, or you can specify which one you want to alter. For example node.color = 'black' will only draw a black line around the nodes, but not the flows (links).

ggplot(df, aes(x = x, next_x = next_x, node = node, next_node = next_node, label = node, shift = shift)) +
  geom_sankey(color = "transparent", fill = "transparent") +
  geom_sankey(node.color = "black",
              node.fill = "#e44436ff",
              node.size = .5, 
              flow.alpha = .7,
              flow.color = "#c07a3eff",
              flow.fill = "#3b345dff") +
  # geom_sankey_label(size = 3, color = "black", fill = "white") +

  # space
  geom_text(data = tibble(x = c(4.4), y = c(70), label = c("space")), aes(x, y, label = label), inherit.aes = F, color = "black", size = 6) +
  geom_errorbar(aes(x = 4, ymin = 35, ymax = 98), inherit.aes = F, color = "black", linewidth = .9, width = .06) +

  # width
  geom_text(data = tibble(x = 2, y = -545, label = c("width")), aes(x, y, label = label), inherit.aes = F, color = "black", size = 6) +
  geom_errorbarh(aes(xmin = 2-.05, xmax = 2+.05, y = -470), inherit.aes = F, color = "black", linewidth = .9, height = 30) +

  # node color
  geom_text(aes(2.22, 500, label = "node.color"), color = "black", inherit.aes = F, size = 5, hjust = 0) +
  geom_curve(aes(2.2, 500, xend = 2, yend = 415), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches"))) +

  # node fill
  geom_text(aes(1.6, 600, label = "node.fill"), color = "#e44436ff", inherit.aes = F, size = 5, hjust = 0) +
  geom_curve(aes(1.7, 560, xend = 2, yend = 300), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = .3) +

  # flow color
  geom_text(aes(3.92, 500, label = "flow.color"), color = "#c07a3eff", inherit.aes = F, size = 5, hjust = 0) +
  geom_curve(aes(3.9, 500, xend = 3.65, yend = 415), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches"))) +

  # flow fill
  geom_text(aes(3.05, 620, label = "flow.fill"), color = "#3b345dff", inherit.aes = F, size = 5, hjust = 0) +
  geom_curve(aes(3.1, 580, xend = 3.2, yend = 400), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = .1) +

  # flow alpha
  geom_text(aes(3.69, -500, label = "flow.alpha"), color = "black", inherit.aes = F, size = 5, hjust = 0) +
  geom_text(aes(3.69, -555, label = "(Transparency)"), color = "black", inherit.aes = F, size = 3, hjust = 0) +
  geom_curve(aes(3.65, -500, xend = 3.42, yend = -220), color = "black", inherit.aes = F, arrow = arrow(length = unit(0.1, "inches")), curvature = -.5) +

  scale_fill_viridis_d(drop = FALSE) +
  scale_x_discrete(expand = scales::expand_range(.2)) +
  theme_void(base_size = 18) +
  labs(x = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = .5)) +
  labs(y = NULL,
       title = "Control the geometries")

# ggsave("sankey_geom.png", dpi = 800, height = 4, width = 8)

Basic usage

geom_sankey

A basic sankey plot that shows how dimensions are linked.

df <- mtcars %>%
  make_long(cyl, vs, am, gear, carb)

ggplot(df, aes(x = x, 
               next_x = next_x, 
               node = node, 
               next_node = next_node,
               fill = factor(node))) +
  geom_sankey() +
  scale_fill_discrete(drop=FALSE)

And by adding a little pimp.

Labels with geom_sankey_label which places labels in the center of nodes if given the same aesthetics.
ggsankey also comes with custom minimalistic themes that can be used. Here I use theme_sankey.

ggplot(df, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node)) +
  geom_sankey(flow.alpha = .6,
              node.color = "gray30") +
  geom_sankey_label(size = 3, color = "white", fill = "gray40") +
  scale_fill_viridis_d(drop = FALSE) +
  theme_sankey(base_size = 18) +
  labs(x = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = .5)) +
  ggtitle("Car features")

geom_alluvial

Alluvial plots are very similiar to sankey plots but have no spaces between nodes and start at y = 0 instead being centered around the x-axis.

ggplot(df, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node)) +
  geom_alluvial(flow.alpha = .6) +
  geom_alluvial_text(size = 3, color = "white") +
  scale_fill_viridis_d(drop = FALSE) +
  theme_alluvial(base_size = 18) +
  labs(x = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = .5)) +
  ggtitle("Car features")

geom_sankey_bump

Sankey bump plots is mix between bump plots and sankey and mostly useful for time series. When a group becomes larger than another it bumps above it.

df <- gapminder %>%
  group_by(continent, year) %>%
  summarise(gdp = (sum(pop * gdpPercap)/1e9) %>% round(0), .groups = "keep") %>%
  ungroup()

ggplot(df, aes(x = year,
               node = continent,
               fill = continent,
               value = gdp)) +
  geom_sankey_bump(space = 0, type = "alluvial", color = "transparent", smooth = 6) +
  scale_fill_viridis_d(option = "A", alpha = .8) +
  theme_sankey_bump(base_size = 16) +
  labs(x = NULL,
       y = "GDP ($ bn)",
       fill = NULL,
       color = NULL) +
  theme(legend.position = "bottom") +
  labs(title = "GDP development per continent")