knitr::opts_chunk$set(collapse = TRUE, comment = "#> ")
knitr::opts_chunk$set(fig.width = 10)

# prevent press() noise and autoplot() noise
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

This document investigates the performance of vec_order() against the current implementation of vec_unique(), which is based on hashing and a dictionary. It might be worth switching vec_unique() to the sort-based approach of vec_order().

Setup

library(vctrs)
library(rlang)
library(stringr)
library(ggplot2)
library(dplyr)
# Generate `size` random words of varying string sizes
new_dictionary <- function(size, min_length, max_length) {
  lengths <- rlang::seq2(min_length, max_length)

  stringi::stri_rand_strings(
    size,
    sample(lengths, size = size, replace = TRUE)
  )
}
# Work around bench_expr bug where vectorized attribute isn't being sliced
# https://github.com/r-lib/bench/pull/90

filter_bench <- function(.data, ...) {
  out <- dplyr::mutate(.data, rn = row_number()) %>%
    dplyr::filter(...)

  # patch up bench_expr
  which <- out$rn
  desc <- attr(.data$expression, "description")
  attr(out$expression, "description") <- desc[which]

  out$rn <- NULL

  out
}
plot_bench <- function(df, title = waiver()) {
  df %>%
    ggplot(aes(x = n_groups, y = as.numeric(median))) +
    geom_point(aes(color = as.character(expression))) +
    facet_wrap(~ size, labeller = label_both, nrow = 1) +
    scale_x_log10() +
    scale_y_log10() + 
    labs(
      x = "Number of groups (log10)",
      y = "Time (log10 seconds)",
      color = "Type",
      title = title
    )
}

Compare with vec_unique_loc()

It is worth comparing against vec_unique_loc(), the most bare-bones of the uniqueness functions, to test whether uniqueness-by-sorting can be faster than uniqueness-by-hashing.

In a branch, I hacked together an implementation of vec_unique_loc() based on vec_order(). It takes approximately the same amount of time as vec_order() itself, so I will just use vec_order() as a proxy for the sorting approach.
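
At the R level the idea is roughly the following (a minimal sketch, not the branch implementation; it assumes a non-empty input and leans on vec_slice(), vec_equal(), and vec_size() purely for illustration):

# Sketch of uniqueness-by-sorting: order the input, keep the first location of
# each run of identical values, then return those locations in appearance order.
vec_unique_loc_by_sorting <- function(x) {
  o <- vec_order(x)
  sorted <- vec_slice(x, o)

  # Compare each sorted element with the previous one; a new run starts
  # wherever they differ (the first element always starts a run).
  prev <- vec_slice(sorted, c(1L, seq_len(vec_size(x) - 1L)))
  starts <- !vec_equal(sorted, prev, na_equal = TRUE)
  starts[[1]] <- TRUE

  # vec_order() is stable, so the first element of each run is also the first
  # appearance of that value in `x`. Sorting the locations restores
  # first-appearance order, matching vec_unique_loc().
  sort(o[starts])
}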

Integers

Test 1

set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x), vec_unique_loc(x), 
      iterations = 100, check = FALSE
    )
  }
)

Performance is generally the same at small sizes.

plot_bench(df, "Integers (small)")

However, at size = 10000, vec_order() already appears to be faster.

df[-1] %>%
  filter(size == 10000)

Test 2

set.seed(123)

size <- 10 ^ (5:7)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x), vec_unique_loc(x), 
      iterations = 20, check = FALSE
    )
  }
)

As the total size increases, vec_order() starts to do better.

plot_bench(df, "Integers (large)")

Test 3

This benchmark shows how much better vec_order() scales with a large size and a large number of groups. For integers it is always faster, and it scales extremely well.

set.seed(123)

size <- 10 ^ 8
n_groups <- 10 ^ (1:7)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE)
    bench::mark(
      vec_order(x), vec_unique_loc(x), 
      iterations = 20, check = FALSE
    )
  }
)
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  geom_line(aes(color = as.character(expression))) +
  scale_x_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (seconds)",
    color = "Type",
    title = "Integers - Size=1e8, Varying n_groups"
  )

Doubles

Test 1

set.seed(123)

size <- 10 ^ (1:4)
n_groups <- 10 ^ (1:6)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE) + 0 # `+ 0` coerces to double
    bench::mark(
      vec_order(x), vec_unique_loc(x), 
      iterations = 100, check = FALSE
    )
  }
)

vec_order() is generally a bit slower at these smaller sizes, but it scales much better with large sizes and larger numbers of groups. See the next test.

plot_bench(df, "Doubles (small size)")

Test 2

This benchmark shows how much better vec_order() scales for large size and large number of groups. For doubles it is slower up front, but scales much better.

set.seed(123)

size <- 10 ^ 8
n_groups <- 10 ^ (1:7)

df <- bench::press(
  size = size,
  n_groups = n_groups,
  {
    x <- sample(n_groups, size, replace = TRUE) + 0 # `+ 0` coerces to double
    bench::mark(
      vec_order(x), vec_unique_loc(x), 
      iterations = 20, check = FALSE
    )
  }
)
df %>%
  ggplot(aes(x = n_groups, y = as.numeric(median))) +
  geom_point(aes(color = as.character(expression))) +
  geom_line(aes(color = as.character(expression))) +
  scale_x_log10() +
  labs(
    x = "Number of groups (log10)",
    y = "Time (seconds)",
    color = "Type",
    title = "Doubles - Size=1e8, Varying n_groups"
  )

Characters

Test 1

Currently, string ordering is much slower than vec_unique_loc() (especially when most strings are unique) because of all of the allocations that are required, and because it does a radix ordering of the unique strings followed by an integer radix ordering.

I am confident that the C-level part of vec_order() could gain a sort_character = false option that would do a much faster counting sort in order of first appearance, utilizing the truelengths in a different way. It wouldn't sort strings at all, so it should be very fast. This is what cgroup() does in base::order(); it is not currently implemented in vec_order() because I didn't have a use for it until now. https://github.com/wch/r-source/blob/8d7ac4699fba640d030703fa010b66bf26054cbd/src/main/radixsort.c#L1051
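
As a rough illustration of the order-of-first-appearance idea (purely a sketch; the real version would work at the C level, marking CHARSXPs via their truelength rather than hashing through match()):

# Assign each string an integer group id in order of first appearance, without
# ever sorting the strings themselves. The resulting integer vector can then be
# handled by the fast integer paths.
appearance_proxy <- function(x) {
  match(x, unique(x))
}

appearance_proxy(c("b", "a", "b", "c"))
#> [1] 1 2 1 3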

Very large set of strings with 10 groups

set.seed(123)

size <- 1e7
n_groups <- 10

dict <- new_dictionary(n_groups, min_length = 5, max_length = 20)
x <- sample(dict, size, TRUE)

bench::mark(vec_order(x), vec_unique_loc(x), iterations = 10, check = FALSE)

Very large set of completely random strings

set.seed(123)

n_groups <- 1e7

x <- new_dictionary(n_groups, min_length = 5, max_length = 20)

bench::mark(vec_order(x), vec_unique_loc(x), iterations = 10, check = FALSE)

Multiple columns

Test 1

3 integer columns, each with 20 groups. 1e7 total size.

set.seed(123)

size <- 1e7L
n_groups <- 20
n_cols <- 3

cols <- replicate(n_cols, sample(n_groups, size, TRUE), simplify = FALSE)
names(cols) <- seq_along(cols)
df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df), 
  vec_unique_loc(df), 
  iterations = 10,
  check = FALSE
)

Test 2

Same as before, but with character columns. We do worse here because, as mentioned above, vec_order() currently does too much work with character vectors.

set.seed(123)

size <- 1e7L
n_groups <- 20
n_cols <- 3

cols <- replicate(
  n_cols, 
  {
    dict <- new_dictionary(n_groups, 5, 20)
    sample(dict, size, TRUE)
  }, 
  simplify = FALSE
)

names(cols) <- seq_along(cols)
df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df), 
  vec_unique_loc(df), 
  iterations = 5,
  check = FALSE
)

Test 3

20 integer columns, each with 2 groups. 1e7 total size.

set.seed(123)

size <- 1e7L
n_groups <- 2
n_cols <- 20

cols <- replicate(n_cols, sample(n_groups, size, TRUE), simplify = FALSE)
names(cols) <- seq_along(cols)
df <- vctrs::new_data_frame(cols, size)

bench::mark(
  vec_order(df), 
  vec_unique_loc(df), 
  iterations = 5,
  check = FALSE
)

