# Show the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)

# Benchmarking packages, installed from GitHub.
devtools::install_github("olafmersmann/microbenchmarkCore")
devtools::install_github("olafmersmann/microbenchmark")

library(ufovectors)
library(ufoaltrep)
library(microbenchmark)
library(ggplot2)
library(scales)

# Relative paths used after this point resolve against this directory.
setwd("~/Workspace/ufo_workspace/UFOs/ufovectors/benchmark")
ufo_set_debug_mode(FALSE)
options(scipen = 999) # effectively turns off scientific notation
#' Write a vector to a binary file several times and describe the result.
#'
#' `range` is written to `path` with `writeBin()` exactly `repeats` times, so
#' the file holds `length(range) * repeats` elements.
#'
#' @param path Path of the file to create; it is opened in "wb" mode.
#' @param range Vector that is written on every repetition.
#' @param repeats Number of times `range` is written.
#' @return A list with the `path` and the `sum`, `size` (element count),
#'   `min` and `max` of the data written to the file.
generate_binary_file <- function(path, range, repeats) {
  wrote_data <- repeats > 0

  con <- file(path, "wb")
  # Close the connection even if writeBin() fails part-way through.
  on.exit(close(con), add = TRUE)

  # seq_len() writes exactly `repeats` copies (0:repeats wrote one too many)
  # and writes nothing at all when `repeats` is 0.
  for (i in seq_len(repeats)) {
    writeBin(range, con)
  }

  # The per-copy statistics do not change between repetitions, so they are
  # computed once. The sum is taken over doubles so that a large integer
  # `range` cannot overflow the integer range.
  list(
    path = path,
    sum = repeats * sum(as.numeric(range), na.rm = TRUE),
    size = length(range) * repeats,
    min = if (wrote_data) min(range, na.rm = TRUE) else integer(0),
    max = if (wrote_data) max(range, na.rm = TRUE) else integer(0)
  )
}

# Each file holds 32 * 1,000,000 four-byte integers (~122 MiB).
stats <- list(
  stats_32mln_1s = generate_binary_file(
    "32mln_1s.bin", as.integer(rep(1, 1000000)), 32
  ),
  stats_32mln_seq_ints = generate_binary_file(
    "32mln_seq_ints.bin", 1:1000000, 32
  ),
  stats_32mln_rand_ints = generate_binary_file(
    "32mln_rand_ints.bin", sample(1000000, replace = TRUE), 32
  )
)
print(stats)
Two nigh-identical implementations of file-backed vectors: one implemented with UFOs, the other with ALTREP.
# Benchmark: time needed to create a file-backed vector over the all-ones
# file with each implementation.
min_load_count <- 1024 * 1024 / 4

result <- microbenchmark(
  "UFO" = {
    ufo_integer_bin(
      stats$stats_32mln_1s$path,
      read_only = FALSE,
      min_load_count = min_load_count
    )
  },
  "UFO/RO" = {
    ufo_integer_bin(
      stats$stats_32mln_1s$path,
      read_only = TRUE,
      min_load_count = min_load_count
    )
  },
  "ALTREP" = {
    altrep_ufo_integer_bin(stats$stats_32mln_1s$path)
  },
  times = 50L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
# Benchmark: sum() over vectors backed by the all-ones file.
min_load_count <- 1024 * 1024 / 4

# The vectors are created once, outside the timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_1s$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_1s$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_1s$path)

result <- microbenchmark(
  "UFO" = {
    sum(ufo)
  },
  "UFO/RO" = {
    sum(ufo.ro)
  },
  "ALTREP" = {
    sum(altrep)
  },
  # Every implementation must reproduce the sum recorded when the file was
  # generated.
  check = function(values) {
    expected <- stats$stats_32mln_1s$sum
    all(vapply(values, function(value) value == expected, logical(1)))
  },
  times = 50L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
# Benchmark: sum() over vectors backed by the random-integer file.
min_load_count <- 1024 * 1024 / 4
actually_check <- TRUE # not referenced anywhere else in this chunk

# The vectors are created once, outside the timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_rand_ints$path)

result <- microbenchmark(
  "UFO" = {
    sum(ufo)
  },
  "UFO/RO" = {
    sum(ufo.ro)
  },
  "ALTREP" = {
    sum(altrep)
  },
  # Every implementation must reproduce the sum recorded when the file was
  # generated.
  check = function(values) {
    expected <- stats$stats_32mln_rand_ints$sum
    all(vapply(values, function(value) value == expected, logical(1)))
  },
  times = 50L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
# Benchmark: element-by-element iteration over vectors backed by the
# random-integer file.
min_load_count <- 1024 * 1024 / 4

# Identity function, called once per element inside the timed loops.
some_function <- function(x) x

# The vectors are created once, outside the timed expressions.
ufo <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = FALSE,
  min_load_count = min_load_count
)
ufo.ro <- ufo_integer_bin(
  stats$stats_32mln_rand_ints$path,
  read_only = TRUE,
  min_load_count = min_load_count
)
altrep <- altrep_ufo_integer_bin(stats$stats_32mln_rand_ints$path)

result <- microbenchmark(
  "UFO" = {
    for (e in ufo) some_function(e)
  },
  # Iterate over the read-only vector here; this entry previously looped
  # over `ufo`, so the read-only variant was never measured.
  "UFO/RO" = {
    for (e in ufo.ro) some_function(e)
  },
  "ALTREP" = {
    for (e in altrep) some_function(e)
  },
  times = 10L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
ALTREP is really slow because:

- it has to do dispatch on each element
- it has to read from disk on each element
TODO: add an ALTREP implementation that buffers elements.
# Here I'm comparing `seq.int`, which just returns an ordinary R vector, vs.
# `n:m`, which, I believe, is implemented with ALTREP.

# Benchmark: time needed to create a 32-million-element integer sequence.
size <- 32000000

result <- microbenchmark(
  "UFO" = {
    ufo_integer_seq(1, size, 1, read_only = FALSE)
  },
  "UFO/RO" = {
    ufo_integer_seq(1, size, 1, read_only = TRUE)
  },
  "ALTREP" = {
    altrep_integer_seq(1, size, 1)
  },
  "standard vector" = {
    as.integer(c(1:size))
  },
  # "compact vector" = { 1:size },
  "seq.int" = {
    seq.int(1, size, 1)
  },
  times = 50L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
# Benchmark: sum() over 32-million-element integer sequences.
size <- 32000000

# The vectors are created once, outside the timed expressions.
ufo <- ufo_integer_seq(1, size, 1, read_only = FALSE)
ufo.ro <- ufo_integer_seq(1, size, 1, read_only = TRUE)
altrep <- altrep_integer_seq(1, size, 1)
vec.std <- as.integer(c(1:size))
vec.comp <- 1:size
vec.seq.int <- seq.int(1, size, 1)

result <- microbenchmark(
  "UFO" = {
    sum(ufo)
  },
  "UFO/RO" = {
    sum(ufo.ro)
  },
  "ALTREP" = {
    sum(altrep)
  },
  "standard vector" = {
    sum(vec.std)
  },
  # "compact vector" = { sum(vec.comp) },
  "seq.int" = {
    sum(vec.seq.int)
  },
  # Every implementation must agree with the sum of 1:size.
  check = function(values) {
    all(vapply(values, function(value) value == sum(1:size), logical(1)))
  },
  times = 50L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
The compact vectors implement their own sum, which doesn't go through all the elements; instead, it just calculates the sum from the boundaries of the sequence (see the formula below). I removed them from the benchmark in the end.
(size / 2.0) * (n1 + n1 + inc * (size - 1))
# Benchmark: element-by-element iteration over 32-million-element integer
# sequences.
size <- 32000000

# Identity function, called once per element inside the timed loops.
some_function <- function(x) x

# The vectors are created once, outside the timed expressions.
# `altrep` is created here but has no entry in the benchmark below.
ufo <- ufo_integer_seq(1, size, 1, read_only = FALSE)
ufo.ro <- ufo_integer_seq(1, size, 1, read_only = TRUE)
altrep <- altrep_integer_seq(1, size, 1)
vec.std <- as.integer(c(1:size))
vec.comp <- 1:size
vec.seq.int <- seq.int(1, size, 1)

result <- microbenchmark(
  "UFO" = {
    for (e in ufo) some_function(e)
  },
  "UFO/RO" = {
    for (e in ufo.ro) some_function(e)
  },
  "seq.int" = {
    for (e in vec.seq.int) some_function(e)
  },
  "compact vector" = {
    for (e in vec.comp) some_function(e)
  },
  "standard vector" = {
    for (e in vec.std) some_function(e)
  },
  times = 10L
)

autoplot(result) +
  scale_y_continuous(labels = scales::label_number_si())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.