knitr::opts_chunk$set(
  collapse = TRUE
  , eval = T
  , comment = "#>"
  , out.width = '100%'
  , message = F
  , warning = F
  , echo = T
)
suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(tidyverse) )

Introduction

Alluvial Plots can be a powerfull tool to visualise categorical data. It will group observations that have similar values across a set of dimensions and visualise them as flows. The individual flows can be emphasised through different colouring methods.

Alluvial Plots

Visualize Data in a tidy data format

For this dataformat the f_plot_alluvial function is suitable. Also see the help documentation and the examples of that function.

Data

data_ls = mtcars %>%
  f_clean_data()

data_tidy = data_ls$data
max_variables = 5
variables = c( data_ls$categoricals[1:3], data_ls$numericals[1:3] )

head(data_tidy, 10) %>%
  knitr::kable()

Colouring

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' )

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'last_variable' )

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'all_flows' )

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'values' )

Ordering

The order of the variables on the x axis is determined by the parameter variables. The order of any y values can be changed using the oder_levels argument. Simply pass the values you want to reorder as a character vector.

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'values'
                , order_levels = c('1', '0') )

Visualize data in the 'gathered' format

Data

Here we have more than one row for each observation and measurements that belong to the same group such as mean arrival delay is gathered in one column, which is indexed by the quarter column. In an alluvial Plot we might want to add another independent variable for coloring like in this case carrier.

monthly_flights = nycflights13::flights %>%
  group_by(month, tailnum, origin, dest, carrier) %>%
  summarise() %>%
  group_by( tailnum, origin, dest, carrier) %>%
  count() %>%
  filter( n == 12 ) %>%
  select( - n ) %>%
  left_join( nycflights13::flights ) %>%
  .[complete.cases(.), ] %>%
  ungroup() %>%
  mutate( tailnum = pmap_chr(list(tailnum, origin, dest, carrier), paste )
          , qu = cut(month, 4)) %>%
  group_by(tailnum, carrier, origin, dest, qu ) %>%
  summarise( mean_arr_delay = mean(arr_delay) ) %>%
  ungroup() %>%
  mutate( mean_arr_delay = ifelse( mean_arr_delay < 10, 'on_time', 'late' ) )

levels(monthly_flights$qu) = c('Q1', 'Q2', 'Q3', 'Q4')

data_gath = monthly_flights

head(data_gath, 10) %>%
knitr::kable()

col_x = 'qu'
col_y = 'mean_arr_delay'
col_fill = 'carrier'
col_id = 'tailnum'

Coloring

color by independent variable carrier

f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill )
Move colored independent variable to the left
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill, fill_right = F )

Other coloring options

f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'last_variable' )
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'first_variable' )
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'all_flows' )
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'value' )

Ordering

Y levels of dependent variable

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
                     , order_levels_y = c('on_time', 'late') )

Y levels of independent color variable carrier

order_by_carrier_size = data_gath %>%
  group_by(carrier) %>%
  count() %>%
  arrange( desc(n) ) %>%
  .[['carrier']]

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill
                     , order_levels_fill = order_by_carrier_size )

X levels

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
                     , order_levels_x = c('Q4', 'Q3', 'Q2', 'Q1') )

General

Change the colors

Any color palette can be passed to both functions.

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'last_variable'
                     , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Purples') )
                     , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Oranges') ) )

f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' 
                , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Reds') )
                , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Greens') )
                )

Missing Data

Missing Data will automatically be labeled as NA. The label can be changed and it can be ordered as usual

Gathered Data

data = data_gath %>%
  select(tailnum, qu, mean_arr_delay) %>%
  sample_frac(0.9)

f_plot_alluvial_1v1( data, col_x, col_y, col_id, fill_by = 'last_variable'
                         , NA_label = 'none'
                         , order_levels_y = 'none')

Tidy Data

data = data_tidy
data$cyl[1:4] = NA

f_plot_alluvial( data = data
                     , variables = variables
                     , max_variables = max_variables
                     , fill_by = 'first_variable'
                     , NA_label = 'none'
                     , order_levels = 'none' )

Connect Flows to observations in original data

link = file.path( system.file(package = 'oetteR')
                  , 'Rmd vignettes'
                  , 'vignette_visualising_regression_models.html' )

The plot objects returned by both functions have an attribute called data_key which is an x-y table arranged like the alluvial plot one column containing the original ID. See vignette for visualising regression models for an example on how this is effectively used.

tidy data

p = f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' )

p$data_key %>%
  head(10) %>%
  knitr::kable()

gathered data

p = f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill = 'carrier' )


p$data_key %>%
  head(10) %>%
  knitr::kable()

The Plot objects can be manipulated like any other ggplot object

Flip

p = p +
  coord_flip()

p

Title

p = p +
  ggtitle('Look at my flip')

p

Repel Labels

Unfortunately does not work yet

p = p %>%
  ggrepel::geom_text_repel()


erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.