knitr::opts_chunk$set( collapse = TRUE , eval = T , comment = "#>" , out.width = '100%' , message = F , warning = F , echo = T )
suppressPackageStartupMessages( require(oetteR) ) suppressPackageStartupMessages( require(tidyverse) )
Alluvial Plots can be a powerfull tool to visualise categorical data. It will group observations that have similar values across a set of dimensions and visualise them as flows. The individual flows can be emphasised through different colouring methods.
For this dataformat the f_plot_alluvial
function is suitable. Also see the help documentation and the examples of that function.
data_ls = mtcars %>% f_clean_data() data_tidy = data_ls$data max_variables = 5 variables = c( data_ls$categoricals[1:3], data_ls$numericals[1:3] ) head(data_tidy, 10) %>% knitr::kable()
f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'first_variable' ) f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'last_variable' ) f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'all_flows' ) f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'values' )
The order of the variables on the x axis is determined by the parameter variables. The order of any y values can be changed using the oder_levels argument. Simply pass the values you want to reorder as a character vector.
f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'values' , order_levels = c('1', '0') )
Here we have more than one row for each observation and measurements that belong to the same group such as mean arrival delay is gathered in one column, which is indexed by the quarter column. In an alluvial Plot we might want to add another independent variable for coloring like in this case carrier
.
monthly_flights = nycflights13::flights %>% group_by(month, tailnum, origin, dest, carrier) %>% summarise() %>% group_by( tailnum, origin, dest, carrier) %>% count() %>% filter( n == 12 ) %>% select( - n ) %>% left_join( nycflights13::flights ) %>% .[complete.cases(.), ] %>% ungroup() %>% mutate( tailnum = pmap_chr(list(tailnum, origin, dest, carrier), paste ) , qu = cut(month, 4)) %>% group_by(tailnum, carrier, origin, dest, qu ) %>% summarise( mean_arr_delay = mean(arr_delay) ) %>% ungroup() %>% mutate( mean_arr_delay = ifelse( mean_arr_delay < 10, 'on_time', 'late' ) ) levels(monthly_flights$qu) = c('Q1', 'Q2', 'Q3', 'Q4') data_gath = monthly_flights head(data_gath, 10) %>% knitr::kable() col_x = 'qu' col_y = 'mean_arr_delay' col_fill = 'carrier' col_id = 'tailnum'
carrier
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill )
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill, fill_right = F )
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'last_variable' ) f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'first_variable' ) f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'all_flows' ) f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'value' )
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable' , order_levels_y = c('on_time', 'late') )
carrier
order_by_carrier_size = data_gath %>% group_by(carrier) %>% count() %>% arrange( desc(n) ) %>% .[['carrier']] f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill , order_levels_fill = order_by_carrier_size )
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable' , order_levels_x = c('Q4', 'Q3', 'Q2', 'Q1') )
Any color palette can be passed to both functions.
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'last_variable' , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Purples') ) , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Oranges') ) ) f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'first_variable' , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Reds') ) , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Greens') ) )
Missing Data will automatically be labeled as NA
. The label can be changed and it can be ordered as usual
data = data_gath %>% select(tailnum, qu, mean_arr_delay) %>% sample_frac(0.9) f_plot_alluvial_1v1( data, col_x, col_y, col_id, fill_by = 'last_variable' , NA_label = 'none' , order_levels_y = 'none')
data = data_tidy data$cyl[1:4] = NA f_plot_alluvial( data = data , variables = variables , max_variables = max_variables , fill_by = 'first_variable' , NA_label = 'none' , order_levels = 'none' )
link = file.path( system.file(package = 'oetteR') , 'Rmd vignettes' , 'vignette_visualising_regression_models.html' )
The plot objects returned by both functions have an attribute called data_key
which is an x-y table arranged like the alluvial plot one column containing the original ID. See vignette for visualising regression models for an example on how this is effectively used.
p = f_plot_alluvial( data = data_tidy , variables = variables , max_variables = max_variables , fill_by = 'first_variable' ) p$data_key %>% head(10) %>% knitr::kable()
p = f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill = 'carrier' ) p$data_key %>% head(10) %>% knitr::kable()
p = p + coord_flip() p
p = p + ggtitle('Look at my flip') p
Unfortunately does not work yet
p = p %>% ggrepel::geom_text_repel()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.