In ufo-org/ufo-r-operators: Operator overloads for UFO vectors

# Show the source of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

# Install the benchmarking packages from GitHub, in this order.
invisible(lapply(
  c("olafmersmann/microbenchmarkCore", "olafmersmann/microbenchmark"),
  function(repo) devtools::install_github(repo)
))

library(ufovectors)
library(ufoaltrep)
library(microbenchmark)
library(ggplot2)
library(scales)

# Benchmark session setup.
# NOTE(review): this hard-coded working directory ties the script to one
# machine; the relative "*.bin" paths generated below are resolved against it.
setwd("~/Workspace/ufo_workspace/UFOs/ufovectors/benchmark")
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
ufo_set_debug_mode(FALSE)
options(scipen = 999) # effectively turns off scientific notation

Generating files

# Write `repeats` back-to-back copies of `range` to a binary file and return
# summary statistics describing what was written.
#
# Arguments:
#   path    - file to create (opened in "wb" mode, so existing content is
#             replaced).
#   range   - vector written with writeBin() on every repetition.
#   repeats - number of copies of `range` to write.
#
# Returns a list with:
#   path - the `path` argument,
#   sum  - sum of all written elements (NAs ignored), as a double,
#   size - number of elements written, length(range) * repeats,
#   min  - minimum of the written elements (integer(0) if nothing was written),
#   max  - maximum of the written elements (integer(0) if nothing was written).
generate_binary_file <- function(path, range, repeats) {
  con <- file(path, "wb")
  # Close the connection even if writeBin() fails part-way through.
  on.exit(close(con), add = TRUE)

  # seq_len() yields exactly `repeats` iterations (and none for 0), so the
  # file content agrees with the reported `size`.
  for (i in seq_len(repeats)) {
    writeBin(range, con)
  }

  # `range` is the same on every repetition, so its statistics are computed
  # once. Summing as double avoids the NA that integer sum() produces when the
  # total exceeds the integer range.
  range_sum <- sum(as.numeric(range), na.rm = TRUE)
  wrote_anything <- repeats > 0

  list(
    path = path,
    sum = repeats * range_sum,
    size = length(range) * repeats,
    min = if (wrote_anything) min(range, na.rm = TRUE) else integer(0),
    max = if (wrote_anything) max(range, na.rm = TRUE) else integer(0)
  )
}

# Generate the three benchmark input files (constant 1s, the sequence
# 1..1000000, and random draws from 1..1000000), each requested as 32
# repetitions of a 1000000-element integer vector, and keep the statistics
# returned for each file.
stats <- list(
  stats_32mln_1s = generate_binary_file(
    "32mln_1s.bin", rep(1L, 1000000L), 32
  ), # 126MB
  stats_32mln_seq_ints = generate_binary_file(
    "32mln_seq_ints.bin", 1:1000000, 32
  ),
  stats_32mln_rand_ints = generate_binary_file(
    "32mln_rand_ints.bin", sample(1000000, replace = TRUE), 32
  )
)

print(stats)

File-backed matrices

Two nigh-identical implementations of file-backed vectors: one done with UFOs, one done with ALTREP vectors.

Creation: 32 million 1s

# Value passed as `min_load_count` to the UFO constructors below.
min_load_count <- 1024 * 1024 / 4

# Time only the construction of each file-backed vector over the all-1s file:
# writable UFO, read-only UFO, and the ALTREP implementation.
result <- microbenchmark(
  "UFO" = {
    ufo_integer_bin(
      stats$stats_32mln_1s$path,
      read_only = FALSE,
      min_load_count = min_load_count
    )
  },
  "UFO/RO" = {
    ufo_integer_bin(
      stats$stats_32mln_1s$path,
      read_only = TRUE,
      min_load_count = min_load_count
    )
  },
  "ALTREP" = {
    altrep_ufo_integer_bin(stats$stats_32mln_1s$path)
  },
  times = 50L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

Sum: 32 million 1s

# Value passed as `min_load_count` to the UFO constructors below.
min_load_count <- 1024 * 1024 / 4

# Build the three vectors once, outside the timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_1s$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_1s$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_1s$path)

# Time sum() over each vector; `check` requires every result to equal the sum
# recorded when the file was generated.
result <- microbenchmark(
  "UFO" = { sum(ufo) },
  "UFO/RO" = { sum(ufo.ro) },
  "ALTREP" = { sum(altrep) },
  check = function(values) {
    expected <- stats$stats_32mln_1s$sum
    all(vapply(values, function(value) value == expected, logical(1)))
  },
  times = 50L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

Sum: 32 million random integers

# Value passed as `min_load_count` to the UFO constructors below.
min_load_count <- 1024 * 1024 / 4
# NOTE(review): this flag is not read anywhere in the code visible here;
# kept in case a later part of the file uses it.
actually_check <- TRUE

# Build the three vectors over the random-integer file once, outside the
# timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_rand_ints$path)

# Time sum() over each vector; `check` requires every result to equal the sum
# recorded when the file was generated.
result <- microbenchmark(
  "UFO" = { sum(ufo) },
  "UFO/RO" = { sum(ufo.ro) },
  "ALTREP" = { sum(altrep) },
  check = function(values) {
    expected <- stats$stats_32mln_rand_ints$sum
    # vapply() guarantees one logical per benchmarked expression.
    all(vapply(values, function(value) value == expected, logical(1)))
  },
  times = 50L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

For loop: 32 million random integers

# Value passed as `min_load_count` to the UFO constructors below.
min_load_count <- 1024 * 1024 / 4
# Identity function called once per element inside the timed loops.
some_function <- function(x) x

# Build the three vectors over the random-integer file once, outside the
# timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_rand_ints$path)

# Time an element-by-element for loop over each vector.
result <- microbenchmark(
  "UFO" = {
    for (e in ufo) some_function(e)
  },
  "UFO/RO" = {
    # Iterate over the read-only vector; this case previously looped over
    # `ufo`, so it measured the writable UFO a second time.
    for (e in ufo.ro) some_function(e)
  },
  "ALTREP" = {
    for (e in altrep) some_function(e)
  },
  times = 10L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

ALTREP is really slow because:

- it has to do dispatch on each element
- it has to read from disk on each element

TODO: add an ALTREP implementation that buffers elements.

Sequences

Here I'm comparing:

- our implementation of sequences, vs.
- seq.int, which just returns an ordinary R vector, vs.
- a compact vector (the result of n:m), which, I believe, is implemented with ALTREP

Creation

# Upper bound (and therefore length) of the 1..size sequences built below.
size <- 32000000

# Time only the construction of each kind of sequence.
result <- microbenchmark(
  "UFO" = {
    ufo_integer_seq(1, size, 1, read_only = FALSE)
  },
  "UFO/RO" = {
    ufo_integer_seq(1, size, 1, read_only = TRUE)
  },
  "ALTREP" = {
    altrep_integer_seq(1, size, 1)
  },
  "standard vector" = {
    as.integer(c(1:size))
  },
  #"compact vector" = { 1:size },
  "seq.int" = {
    seq.int(1, size, 1)
  },
  times = 50L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

Sum: 32 mln int sequence

# Upper bound (and therefore length) of the 1..size sequences built below.
size <- 32000000

# Build every sequence once, outside the timed expressions.
ufo <- ufo_integer_seq(1, size, 1, read_only = FALSE)
ufo.ro <- ufo_integer_seq(1, size, 1, read_only = TRUE)
altrep <- altrep_integer_seq(1, size, 1)
vec.std <- as.integer(c(1:size))
vec.comp <- 1:size
vec.seq.int <- seq.int(1, size, 1)

# Time sum() over each sequence; `check` requires every result to equal
# sum(1:size).
result <- microbenchmark(
  "UFO" = { sum(ufo) },
  "UFO/RO" = { sum(ufo.ro) },
  "ALTREP" = { sum(altrep) },
  "standard vector" = { sum(vec.std) },
  #"compact vector" = { sum(vec.comp) },
  "seq.int" = { sum(vec.seq.int) },
  check = function(values) {
    all(vapply(values, function(value) value == sum(1:size), logical(1)))
  },
  times = 50L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())

The compact vectors implement their own sum, which doesn't go through all the elements; instead, it just calculates the sum from the boundaries of the sequence. I removed them from the benchmark in the end.

(size / 2.0) * (n1 + n1 + inc * (size - 1))

For loop: 32 mln int sequence

# Upper bound (and therefore length) of the 1..size sequences built below.
size <- 32000000
# Identity function called once per element inside the timed loops.
some_function <- function(x) x

# Build every sequence once, outside the timed expressions.
ufo <- ufo_integer_seq(1, size, 1, read_only = FALSE)
ufo.ro <- ufo_integer_seq(1, size, 1, read_only = TRUE)
altrep <- altrep_integer_seq(1, size, 1)
vec.std <- as.integer(c(1:size))
vec.comp <- 1:size
vec.seq.int <- seq.int(1, size, 1)

# Time an element-by-element for loop over each sequence.
result <- microbenchmark(
  "UFO" = { for (e in ufo) some_function(e) },
  "UFO/RO" = { for (e in ufo.ro) some_function(e) },
  "seq.int" = { for (e in vec.seq.int) some_function(e) },
  "compact vector" = { for (e in vec.comp) some_function(e) },
  "standard vector" = { for (e in vec.std) some_function(e) },
  times = 10L
)

# Plot the timings with SI-suffixed labels on the y axis.
autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())