```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#> ")
knitr::opts_chunk$set(fig.width = 10)

# prevent press() noise and autoplot() noise
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
```
This document investigates the performance of `vec_order()` vs. the current implementation of `vec_unique()`, which is based on hashing and a dictionary. It might be worth switching to the sort-based approach of `vec_order()`.
```r
library(vctrs)
library(rlang)
library(stringr)
library(ggplot2)
library(dplyr)
```
```r
# Generate `size` random words of varying string sizes
new_dictionary <- function(size, min_length, max_length) {
  lengths <- rlang::seq2(min_length, max_length)

  stringi::stri_rand_strings(
    size,
    sample(lengths, size = size, replace = TRUE)
  )
}
```
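For reference, here is the kind of output the helper produces. A quick illustrative call (it assumes stringi is installed; the exact words depend on the RNG state):

```r
# Illustrative only: three random words between 5 and 8 characters long
set.seed(1)
new_dictionary(3, min_length = 5, max_length = 8)
```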
```r
# Work around bench_expr bug where vectorized attribute isn't being sliced
# https://github.com/r-lib/bench/pull/90
filter_bench <- function(.data, ...) {
  out <- dplyr::mutate(.data, rn = row_number()) %>%
    dplyr::filter(...)

  # patch up bench_expr
  which <- out$rn
  desc <- attr(.data$expression, "description")
  attr(out$expression, "description") <- desc[which]

  out$rn <- NULL

  out
}
```
```r
plot_bench <- function(df, title = waiver()) {
  df %>%
    ggplot(aes(x = n_groups, y = as.numeric(median))) +
    geom_point(aes(color = as.character(expression))) +
    facet_wrap(~ size, labeller = label_both, nrow = 1) +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      x = "Number of groups (log10)",
      y = "Time (log10 seconds)",
      color = "Type",
      title = title
    )
}
```
vec_unique_loc()
It is worth comparing against `vec_unique_loc()`, the most bare-bones of the uniqueness functions, to test whether uniqueness-by-sorting can be faster than uniqueness-by-hashing.
In a branch, I hacked together an implementation of `vec_unique_loc()` based on `vec_order()`. It takes approximately the same amount of time as `vec_order()` itself, so I will just use `vec_order()` as a proxy for the sorting approach.
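The branch itself isn't shown here, but as a rough R-level sketch of the idea (my own illustration, not the C implementation from that branch), unique locations can be recovered from a stable sort order like this:

```r
# Sketch only: recover unique locations from a stable sort order.
# `vec_order()` is stable, so the first element of each run of equal values
# in the sorted vector is also the first appearance in the original vector.
# Assumes at least two elements.
unique_loc_via_order <- function(x) {
  n <- vec_size(x)
  o <- vec_order(x)
  sorted <- vec_slice(x, o)

  # TRUE wherever a sorted element differs from its predecessor, i.e. where
  # a new run of equal values begins
  new_run <- c(
    TRUE,
    !vec_equal(vec_slice(sorted, 2:n), vec_slice(sorted, 1:(n - 1)), na_equal = TRUE)
  )

  # first-appearance locations, returned in order of appearance
  sort(o[new_run])
}

# Should agree with the hashing-based implementation
x <- sample(10, 100, replace = TRUE)
identical(unique_loc_via_order(x), vec_unique_loc(x))
```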
```r
set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x),
      vec_unique_loc(x),
      iterations = 100,
      check = FALSE
    )
  }
)
```
Performance is generally the same for small sizes.
plot_bench(df, "Integers (small)")
However, `size = 10000` already seems to show `vec_order()` being faster.
```r
df[-1] %>% filter(size == 10000)
```
```r
set.seed(123)

size <- 10 ^ (5:7)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x),
      vec_unique_loc(x),
      iterations = 20,
      check = FALSE
    )
  }
)
```
As the total size increases, `vec_order()` starts to do better.
plot_bench(df, "Integers (large)")
This benchmark shows how much better `vec_order()` scales with a large size and a large number of groups. For integers it is always faster, and it scales extremely well.
```r
set.seed(123)

size <- 10 ^ 8
n_groups <- 10 ^ (1:7)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x),
      vec_unique_loc(x),
      iterations = 20,
      check = FALSE
    )
  }
)
```
```r
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  geom_line(aes(color = as.character(expression))) +
  scale_x_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (seconds)",
    color = "Type",
    title = "Integers - Size=1e8, Varying n_groups"
  )
```
```r
set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    # `+ 0` converts the integer sample to double
    x <- sample(n_groups, size, replace = TRUE) + 0
    bench::mark(
      vec_order(x),
      vec_unique_loc(x),
      iterations = 100,
      check = FALSE
    )
  }
)
```
`vec_order()` is generally a bit slower at these smaller sizes, but it scales much better with large sizes and larger numbers of groups. See the next test.
plot_bench(df, "Doubles (small size)")
This benchmark shows how much better `vec_order()` scales with a large size and a large number of groups. For doubles it is slower up front, but it scales much better.
```r
set.seed(123)

size <- 10 ^ 8
n_groups <- 10 ^ (1:7)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    # `+ 0` converts the integer sample to double
    x <- sample(n_groups, size, replace = TRUE) + 0
    bench::mark(
      vec_order(x),
      vec_unique_loc(x),
      iterations = 20,
      check = FALSE
    )
  }
)
```
```r
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  geom_line(aes(color = as.character(expression))) +
  scale_x_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (seconds)",
    color = "Type",
    title = "Doubles - Size=1e8, Varying n_groups"
  )
```
Currently, string ordering is much slower than `vec_unique_loc()` (especially when most strings are unique) because of all the allocations that are required, and because it performs a radix ordering of the unique strings followed by an integer radix ordering after that.
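At the R level, that two-pass approach looks roughly like the following sketch (my own approximation of the steps described above, not the actual C code; plain `sort()` stands in for the C-level radix ordering of the strings):

```r
# Approximate shape of the current character path in `vec_order()`:
# order the unique strings, then radix-order the integer codes that map
# each element back to its rank among them.
order_character_two_pass <- function(x) {
  uniques <- unique(x)

  # pass 1: order the unique strings (a radix ordering at the C level)
  sorted_uniques <- sort(uniques)

  # pass 2: integer radix ordering of each element's rank
  codes <- match(x, sorted_uniques)
  order(codes, method = "radix")
}
```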
I am confident that the C-level part of `vec_order()` could gain a `sort_character = false` option that would do a much faster counting sort in order-of-first-appearance, utilizing the truelengths in a different way. It wouldn't sort strings at all, so it should be very fast. This is what `cgroup()` does in `base::order()`; it is not currently implemented in `vec_order()` because I didn't have a use for it until now. See https://github.com/wch/r-source/blob/8d7ac4699fba640d030703fa010b66bf26054cbd/src/main/radixsort.c#L1051
Very large set of strings with 10 groups
```r
set.seed(123)

size <- 1e7
n_groups <- 10

dict <- new_dictionary(n_groups, min_length = 5, max_length = 20)
x <- sample(dict, size, TRUE)

bench::mark(vec_order(x), vec_unique_loc(x), iterations = 10, check = FALSE)
```
Very large set of completely random strings
`vec_order()` is doing far too much work here, actually sorting the strings.

```r
set.seed(123)

n_groups <- 1e7

x <- new_dictionary(n_groups, min_length = 5, max_length = 20)

bench::mark(vec_order(x), vec_unique_loc(x), iterations = 10, check = FALSE)
```
3 integer columns, each with 20 groups. 1e7 total size.
```r
set.seed(123)

size <- 1e7L
n_groups <- 20
n_cols <- 3

cols <- replicate(n_cols, sample(n_groups, size, TRUE), simplify = FALSE)
names(cols) <- seq_along(cols)

df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df),
  vec_unique_loc(df),
  iterations = 10,
  check = FALSE
)
```
Same as before, but with character columns. We do worse here because, as mentioned above, `vec_order()` currently does too much work with character vectors.
```r
set.seed(123)

size <- 1e7L
n_groups <- 20
n_cols <- 3

cols <- replicate(
  n_cols,
  {
    dict <- new_dictionary(n_groups, 5, 20)
    sample(dict, size, TRUE)
  },
  simplify = FALSE
)
names(cols) <- seq_along(cols)

df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df),
  vec_unique_loc(df),
  iterations = 5,
  check = FALSE
)
```
20 integer columns, each with 2 groups. 1e7 total size.
```r
set.seed(123)

size <- 1e7L
n_groups <- 2
n_cols <- 20

cols <- replicate(n_cols, sample(n_groups, size, TRUE), simplify = FALSE)
names(cols) <- seq_along(cols)

df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df),
  vec_unique_loc(df),
  iterations = 5,
  check = FALSE
)
```