```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#> ")
knitr::opts_chunk$set(fig.width = 10)

# prevent press() noise and autoplot() noise
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
```
Investigation of `vec_order()` performance compared with `base::order()` using various data types and distributions of data (total size, number of groups, etc.).
```r
library(vctrs)
library(rlang)
library(stringr)
library(ggplot2)
library(dplyr)
library(forcats)
```
```r
# Wrapper around `order()` that is also meaningful for data frames and
# always chooses radix ordering
base_order <- function(x, na.last = TRUE, decreasing = FALSE) {
  if (is.data.frame(x)) {
    x <- unname(x)
  } else {
    x <- list(x)
  }

  args <- list(na.last = na.last, decreasing = decreasing, method = "radix")
  args <- c(x, args)

  exec("order", !!!args)
}
```
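As a quick sanity check of the wrapper (the data frame here is just an illustration):

```r
# Ordering a two column data frame: ties in `x` are broken by `y`
base_order(data.frame(x = c(2, 1, 2), y = c(3, 1, 2)))
#> [1] 2 3 1
```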
```r
# Generate `size` random words of varying string sizes
new_dictionary <- function(size, min_length, max_length) {
  lengths <- rlang::seq2(min_length, max_length)

  stringi::stri_rand_strings(
    size,
    sample(lengths, size = size, replace = TRUE)
  )
}
```
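For example (the output is random):

```r
# Five random words between 3 and 8 characters long
new_dictionary(5, min_length = 3, max_length = 8)
```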
```r
# Work around bench_expr bug where vectorized attribute isn't being sliced
# https://github.com/r-lib/bench/pull/90
filter_bench <- function(.data, ...) {
  out <- dplyr::mutate(.data, rn = row_number()) %>%
    dplyr::filter(...)

  # patch up bench_expr
  which <- out$rn
  desc <- attr(.data$expression, "description")
  attr(out$expression, "description") <- desc[which]

  out$rn <- NULL

  out
}
```
```r
plot_bench <- function(df, title = waiver()) {
  df %>%
    ggplot(aes(x = n_groups, y = as.numeric(median))) +
    geom_point(aes(color = as.character(expression))) +
    facet_wrap(~ size, labeller = label_both, nrow = 1) +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      x = "Number of groups (log10)",
      y = "Time (log10 seconds)",
      color = "Type",
      title = title
    )
}
```
The performance of integer sorting is generally the same as `order()`. The one exception is with nearly sorted integer vectors, where `order()` does better for some unknown reason. See the "Nearly sorted sequence" section.
```r
set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 50)
  }
)
```
We seem to have a small edge when ordering very small vectors, but in practice this won't make much of a difference.
```r
plot_bench(df, title = "Integers (small)")
```
```r
set.seed(123)

size <- 10 ^ (5:7)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 10)
  }
)
```
Performance seems to be generally about the same no matter the size or number of groups.
```r
plot_bench(df, title = "Integers (large)")
```
Investigating the performance of switching from the counting sort to the radix sort. This happens at `INT_ORDER_COUNTING_RANGE_BOUNDARY`, which is 100,000.
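As a rough illustration, here is a minimal R sketch of that dispatch rule. The real check lives in the C implementation; `choose_integer_sort()` and its `boundary` argument are hypothetical stand-ins for the internal logic.

```r
# Hypothetical sketch: counting sort is used when the range of the values
# is small, radix sort otherwise (the real check is in C inside vctrs)
choose_integer_sort <- function(x, boundary = 100000L) {
  rng <- range(x, na.rm = TRUE)
  range_size <- rng[[2]] - rng[[1]] + 1

  if (range_size <= boundary) "counting" else "radix"
}

choose_integer_sort(sample(90000, 100))  # "counting"
choose_integer_sort(c(0L, 2000000L))     # "radix"
```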
```r
set.seed(123)

n_groups <- c(80000, 90000, 99999, 100001, 110000, 120000)
size <- 1e7

df <- bench::press(
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 20)
  }
)
```
There is a definite jump in performance at the boundary, where the sort switches from counting to radix. Perhaps this boundary isn't optimal, but performance seems to scale well after it.
```r
df
```
```r
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(
    color = as.character(expression),
    shape = fct_reorder(as.factor(n_groups), n_groups)
  )) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (log10 seconds)",
    color = "Type",
    shape = "Number of groups",
    title = "Integers - Counting sort jump"
  )
```
Double performance is overall similar to integer performance when compared with `order()`. Ordering doubles is generally slower than ordering integers because the radix sort processes one byte per pass, and a double requires a maximum of 8 passes (8 bytes vs 4 bytes in integers).
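A quick way to see where those pass counts come from (this just inspects the byte width of each type; the per-byte passes happen inside the radix sort):

```r
# One radix pass consumes one byte, so the maximum number of passes
# equals the byte width of the underlying type
length(writeBin(1L, raw()))  # 4 bytes -> up to 4 passes for an integer
length(writeBin(1, raw()))   # 8 bytes -> up to 8 passes for a double
```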
```r
set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE) + 0
    bench::mark(vec_order(x), base_order(x), iterations = 50)
  }
)
```
Performance is about the same.
```r
plot_bench(df, title = "Doubles (small)")
```
```r
set.seed(123)

size <- 10 ^ (5:7)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE) + 0
    bench::mark(
      vec_order(x),
      base_order(x),
      iterations = 10
    )
  }
)
```
I imagine the increase in gc's for large sizes comes from the fact that `vec_order()` uses `Rf_allocVector()` to generate its working memory, and `base_order()` uses `malloc()`, which won't trigger a gc.
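One hedged way to check that guess is to compare the `n_gc` column that `bench::mark()` already records (illustrative; exact counts vary by session):

```r
# Compare garbage collection counts directly; `n_gc` is the number of
# gc's observed across all iterations of each expression
set.seed(123)
x <- sample(1e6, 1e7, replace = TRUE) + 0

res <- bench::mark(vec_order(x), base_order(x), iterations = 10)
res[c("expression", "n_gc", "mem_alloc")]
```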
```r
autoplot(df) +
  guides(x = guide_axis(n.dodge = 2)) +
  ggtitle("Doubles (large size)")
```
```r
plot_bench(df, title = "Doubles (large)")
```
I expect to be slightly slower with character vectors, as base R has access to macros for `TRUELENGTH()` and `LEVELS()` (used to determine encoding), but we have to call their function equivalents.
There is also an additional factor here: the maximum string length. The longest string in the vector determines how many "passes" are required in the radix sort. Longer strings mean more passes, which generally take more time. However, keep in mind that we only radix sort the unique strings, so this often doesn't hurt much in practice (as in dplyr, where we often group on a character column with just a few groups).
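To make that concrete, here is a small sketch (using the `new_dictionary()` helper from above) where the strings are long but there are only a few unique values, so only those few get radix sorted:

```r
# Long strings, but only 10 unique values: the radix sort only has to
# order the 10 dictionary entries, not all 1e6 elements
set.seed(123)
long_dict <- new_dictionary(10, min_length = 100, max_length = 100)
x <- sample(long_dict, 1e6, replace = TRUE)

bench::mark(vec_order(x), iterations = 10)
```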
```r
set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    dict <- new_dictionary(n_groups, min_length = 5, max_length = 20)
    x <- sample(dict, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 50)
  }
)
```
```r
plot_bench(df, title = "Characters (small)")
```
As expected, for small-ish sizes we are somewhat slower. This difference seems to go away as the number of groups increases.
```r
df %>%
  mutate(expression = as.character(expression)) %>%
  filter_bench(size == 10000)
```
```r
set.seed(123)

size <- 10 ^ (5:7)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    dict <- new_dictionary(n_groups, min_length = 5, max_length = 20)
    x <- sample(dict, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 10)
  }
)
```
Generally about the same once the size gets larger.
```r
plot_bench(df, title = "Characters (large)")
```
Zoom into the size 1e7 section and change to normal seconds (not log seconds). As the number of unique strings increases, we have to radix order more strings, which generally takes more time.
```r
df %>%
  filter_bench(size == 1e7) %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  geom_line(aes(color = as.character(expression))) +
  scale_x_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (seconds)",
    color = "Type",
    title = "Characters - 1e7 size"
  )
```
A very large set of completely random strings:
```r
set.seed(123)

n_groups <- 1e7
x <- new_dictionary(n_groups, min_length = 5, max_length = 20)

bench::mark(vec_order(x), base_order(x), iterations = 10)
```
What is the effect of string size on total time?
```r
set.seed(123)

size <- 1e6
n_groups <- 10 ^ (1:6)
string_size <- c(10, 20, 40, 60, 80, 100)

df <- bench::press(
  string_size = string_size,
  n_groups = n_groups,
  {
    dict <- new_dictionary(n_groups, string_size, string_size)
    x <- sample(dict, size, replace = TRUE)
    bench::mark(vec_order(x), base_order(x), iterations = 10)
  }
)
```
The string size doesn't seem to add too much more time.
```r
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  facet_wrap(~ string_size, labeller = label_both, nrow = 1) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (log10 seconds)",
    color = "Type",
    title = "Varying string size"
  ) +
  guides(x = guide_axis(n.dodge = 2))
```
Zoom in to 1e6 groups and varying string size. Here you can see some effect on total time: a larger maximum string size requires more time, but the effect isn't that bad.
We also seem to consistently do better than `order()` when most of the strings are unique. My guess is that this is a consequence of the fact that I store the result of `Rf_length()` for each string (this queries the number of characters in the string), so I don't have to call that function repeatedly in the recursive section of the algorithm; `order()` doesn't do this. It adds a little memory overhead, but makes up for that in speed.
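In R terms, the idea is roughly the following (a hypothetical sketch; the real caching happens in C on the underlying string objects):

```r
# Compute every string's size once up front and reuse it, instead of
# re-querying the length at each level of the recursion
x <- c("apple", "banana", "fig")
sizes <- nchar(x, type = "bytes")  # analogous to caching Rf_length()
max(sizes)                         # deepest radix pass required
```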
```r
df %>%
  filter_bench(n_groups == 1e6) %>%
  select(-n_groups) %>%
  ggplot(aes(x = string_size, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  scale_y_log10() +
  labs(
    x = "String size",
    y = "Time (seconds)",
    color = "Type",
    title = "Varying string size (n_groups = 1e6)"
  )
```
Both base and vctrs use a fast check for sortedness up front. For the very specific case of integer vectors with default options, base has an even faster sortedness check that beats us, but I'm not too worried about it. It can also return an ALTREP sequence, so it doesn't get hit with memory overhead.
```r
x <- 1:1e7 + 0L

# Base is faster (simpler check + ALTREP result)
bench::mark(
  vec_order(x),
  base_order(x),
  iterations = 50
)

# But tweak some options and it becomes closer
bench::mark(
  vec_order(x, direction = "desc", na_value = "smallest"),
  base_order(x, decreasing = TRUE, na.last = FALSE),
  iterations = 50
)
```
Nearly sorted integer vectors are one case where base does a little better than we do, for some reason I can't figure out. But the difference never seems dramatic.
```r
x <- c(1:1e7, 1:20) + 0L

bench::mark(
  vec_order(x),
  base_order(x),
  iterations = 20
)
```
The performance difference goes away (for the most part) with doubles:
```r
x <- c(1:1e7, 1:20) + 0

bench::mark(
  vec_order(x),
  base_order(x),
  iterations = 20
)
```
It is also worth comparing with multiple columns, since this is what a group by would typically do.
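For a quick sense of what ordering a multi-column data frame means here (a minimal sketch; the data is arbitrary):

```r
# Data frames are ordered by the first column, with ties broken by
# subsequent columns
mini <- data.frame(x = c(2, 1, 2, 1), y = c("b", "a", "a", "a"))
mini[vec_order(mini), ]  # sorted by x, then y
```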
The first benchmark uses a counting sort for both implementations, since the ranges are small.
Performance is around the same.
```r
set.seed(123)

size <- 1e7
n_groups1 <- 20
n_groups2 <- 100

df <- data.frame(
  x = sample(n_groups1, size, replace = TRUE),
  y = sample(n_groups2, size, replace = TRUE)
)

bench::mark(vec_order(df), base_order(df), iterations = 10)
```
This uses a radix sort for both since there is no counting sort for doubles.
Performance is around the same.
```r
set.seed(123)

size <- 1e7
n_groups1 <- 20
n_groups2 <- 100

df <- data.frame(
  x = sample(n_groups1, size, replace = TRUE) + 0,
  y = sample(n_groups2, size, replace = TRUE) + 0
)

bench::mark(vec_order(df), base_order(df), iterations = 10)
```
Like test 2a, but the double groups are very spread out.
We tend to do much better here, but I am not entirely sure why.
```r
set.seed(123)

size <- 1e7
n_groups1 <- 20
n_groups2 <- 100

dict1 <- sample(1e15, n_groups1)
dict2 <- sample(1e15, n_groups2)

df <- data.frame(
  x = sample(dict1, size, replace = TRUE) + 0,
  y = sample(dict2, size, replace = TRUE) + 0
)

bench::mark(vec_order(df), base_order(df), iterations = 10)
```
20 integer columns, each with 2 groups, at 1e6 total size. With 1e6 rows drawn from the 2^20 ≈ 1.05 million possible combinations, around 640,000 of the rows end up unique. Performance is about the same.
```r
set.seed(123)

size <- 1e6L
n_groups <- 2
n_cols <- 20

cols <- replicate(n_cols, sample(n_groups, size, TRUE), simplify = FALSE)
names(cols) <- seq_along(cols)

df <- vctrs::new_data_frame(cols, size)

bench::mark(vec_order(df), base_order(df), iterations = 10)
```
Same as before but with character columns. We do slightly worse here.
```r
set.seed(123)

size <- 1e6L
n_groups <- 2
n_cols <- 20

cols <- replicate(
  n_cols,
  {
    dict <- new_dictionary(n_groups, 5, 10)
    sample(dict, size, TRUE)
  },
  simplify = FALSE
)
names(cols) <- seq_along(cols)

df <- vctrs::new_data_frame(cols, size)

bench::mark(vec_order(df), base_order(df), iterations = 10)
```