In randy3k/collections: High Performance Container Data Types

The goal of this document is to compare the performances of some of the existing packages.

dequer r packageDescription("dequer")$Versionimplements double ended queues and it supports arbitrary R objects. However, it uses R_PreserveObject and R_ReleaseObject heavily which could be an issue for long queues.
datastructures r packageDescription("datastructures")$Version uses ‘Boost’ and ‘STL’ data types to implement queues and hashmaps. For some reasons, it is often slow as shown in the benchmark.
liqueueR r packageDescription("liqueueR")$Version implements queues in pure R code which explains its slowness.
hash r packageDescription("hash")$Version uses R environments to create hash tables. It is known that regular R enviroments leak memory.
fastmap r packageDescription("fastmap")$Version. The current implementation of collection::dict is actually inspired by it. However, a more efficient hash table library tommy is used.
hashtab of R 4.2.
We also compare to serveral base R implementations based on lists or environments, see here for details.

requireNamespace("bench")
library(tidyverse)
options(digits = 3)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

Queue

queue <- function() {
    self <- environment()
    q <- NULL
    n <- NULL
    push <- function(item) {
        if (is.null(item)) {
            q[n + 1] <<- list(item)
        } else {
            q[[n + 1]] <<- item
        }
        n <<- n + 1
        invisible(self)
    }
    pop <- function() {
        if (n == 0) stop("queue is empty")
        v <- q[[1]]
        q <<- q[-1]
        n <<- n - 1
        v
    }
    peek <- function() {
        if (n == 0) stop("queue is empty")
        q[[1]]
    }
    clear <- function() {
        q <<- list()
        n <<- 0
        self
    }
    clear()
    self
}

bench_queue <- bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `dequer::queue` = {
            q <- dequer::queue()
            pushback <- dequer::pushback
            pop <- dequer::pop
            x <- rnorm(n)
            for (i in 1:n) pushback(q, x[i])
            for (i in 1:n) pop(q)
        },
        `datastructures::queue` = {
            q <- datastructures::queue()
            insert <- datastructures::insert
            pop <- datastructures::pop
            x <- rnorm(n)
            for (i in 1:n) insert(q, x[i])
            for (i in 1:n) pop(q)
        },
        `liqueueR::Queue` = {
            q <- liqueueR::Queue$new()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        `base::list` = {
            q <- queue()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        `collections::queue` = {
            q <- collections::queue()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE))

bench_queue %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("push and pop n times") + ylab("time")
bench_queue %>%
    filter(expression %in% c("base::list", "dequer::queue", "collections::queue")) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("(zoom in)") + ylab("time")

Stack

stack <- function() {
    self <- environment()
    q <- NULL
    n <- NULL

    push <- function(item) {
        if (is.null(item)) {
            q[n + 1] <<- list(item)
        } else {
            q[[n + 1]] <<- item
        }
        n <<- n + 1
        invisible(self)
    }
    pop <- function() {
        if (n == 0) stop("stack is empty")
        v <- q[[n]]
        q <<- q[-n]
        n <<- n - 1
        v
    }
    clear <- function() {
        q <<- list()
        n <<- 0
        invisible(self)
    }
    clear()
    self
}

bench_stack <- bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `dequer::stack` = {
            q <- dequer::stack()
            push <- dequer::push
            pop <- dequer::pop
            x <- rnorm(n)
            for (i in 1:n) push(q, x[i])
            for (i in 1:n) pop(q)
        },
        `datastructures::stack` = {
            q <- datastructures::stack()
            insert <- datastructures::insert
            pop <- datastructures::pop
            x <- rnorm(n)
            for (i in 1:n) insert(q, x[i])
            for (i in 1:n) pop(q)
        },
        `liqueueR::Stack` = {
            q <- liqueueR::Stack$new()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        `base::list` = {
            q <- stack()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        `collections::stack` = {
            q <- collections::stack()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:n) q$pop()
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE))

bench_stack %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("push and pop n times") + ylab("time")
bench_stack %>%
    filter(expression %in% c("base::list", "dequer::stack", "collections::stack")) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("(zoom in)") + ylab("time")

Deque

deque <- function() {
    self <- environment()
    q <- NULL
    n <- NULL

    push <- function(item) {
        if (is.null(item)) {
            q[n + 1] <<- list(item)
        } else {
            q[[n + 1]] <<- item
        }
        n <<- self$n + 1
        invisible(self)
    }
    pushleft <- function(item) {
        q <<- c(list(item), q)
        n <<- n + 1
        invisible(self)
    }
    pop <- function() {
        if (n == 0) stop("deque is empty")
        v <- q[[n]]
        q <<- q[-n]
        n <<- n - 1
        v
    }
    popleft <- function() {
        if (n == 0) stop("deque is empty")
        v <- q[[1]]
        q <<- q[-1]
        n <<- n - 1
        v
    }
    clear <- function() {
        q <<- list()
        n <<- 0
        invisible(self)
    }
    clear()
    self
}

bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `dequer::deque` = {
            q <- dequer::deque()
            push <- dequer::push
            pop <- dequer::pop
            popback <- dequer::popback
            x <- rnorm(n)
            for (i in 1:n) push(q, x[i])
            for (i in 1:floor(n / 2)) {
                pop(q)
                popback(q)
            }
        },
        `base::list` = {
            q <- deque()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:floor(n / 2)) {
                q$pop()
                q$popleft()
            }
        },
        `collections::deque` = {
            q <- collections::deque()
            x <- rnorm(n)
            for (i in 1:n) q$push(x[i])
            for (i in 1:floor(n / 2)) {
                q$pop()
                q$popleft()
            }
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE)) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("push and pop n times") + ylab("time")

Priority Queue

bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `liqueueR::PriorityQueue` = {
            q <- liqueueR::PriorityQueue$new()
            x <- rnorm(n)
            p <- sample.int(n, n, replace = TRUE)
            for (i in 1:n) q$push(x[i], p[i])
            for (i in 1:n) q$pop()
        },
        `datastructures::binomial_heap` = {
            q <- datastructures::binomial_heap(key.class = "integer")
            insert <- datastructures::insert
            pop <- datastructures::pop
            x <- rnorm(n)
            p <- sample.int(n, n, replace = TRUE)
            for (i in 1:n) insert(q, p[i], x[i])
            for (i in 1:n) pop(q)
        },
        `collections::priority_queue` = {
            q <- collections::priority_queue()
            x <- rnorm(n)
            p <- sample.int(n, n, replace = TRUE)
            for (i in 1:n) q$push(x[i], p[i])
            for (i in 1:n) q$pop()
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE)) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("push and pop n times") + ylab("time")

Dictionary

bench_dict <- bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `datastructures::hashmap` = {
            h <- datastructures::hashmap()
            insert <- datastructures::insert
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) insert(h, k[i], x[i])
            for (i in sample(1:n)) h[k[i]]
        },
        `hash::hash` = {
            h <- hash::hash()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h[[k[i]]] <- x[i]
            for (i in sample(1:n)) h[[k[i]]]
        },
        `fastmap::fastmap` = {
            h <- fastmap::fastmap()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h$set(k[i], x[i])
            for (i in sample(1:n)) h$get(k[i])
        },
        `base::new.env` = {
            h <- new.env(parent = emptyenv())
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h[[k[i]]] <- x[i]
            for (i in sample(1:n)) get0(k[i], envir = x, inherits = FALSE)
        },
        `utils::hashtab` = {
            h <- hashtab()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h[[k[i]]] <- x[i]
            for (i in sample(1:n)) h[[k[i]]]
        },    
        `collections::dict` = {
            h <- collections::dict()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h$set(k[i], x[i])
            for (i in sample(1:n)) h$get(k[i])
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE))

bench_dict %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("push and pop n times") + ylab("time")
bench_dict %>%
    filter(expression %in% 
           c("hash::hash", "fastmap::fastmap", "collections::dict", 
             "utils::hashtab", "base::new.env")) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("(zoom in)") + ylab("time")

Note that base::new.env suffers from memory leak issue. The performance gap between R environments and collections::dict is mainly attributed to the overhead in accessing the $get and $set methods from the dict object.

Ordered dict

bench::press(
    n = c(10, 50, 100, 200, 500),
    bench::mark(
        `base::list` = {
            h <- list()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h[[k[i]]] <- x[i]
            for (i in sample(1:n)) h[[k[i]]]
        },
        `collections::ordered_dict` = {
            h <- collections::ordered_dict()
            x <- rnorm(n)
            k <- as.character(seq_len(n))
            for (i in 1:n) h$set(k[i], x[i])
            for (i in sample(1:n)) h$get(k[i])
        },
        check = FALSE
    )) %>%
    mutate(expression = fct_reorder(
            as.character(expression), median, .fun = mean, .desc = TRUE)) %>%
    ggplot(aes(x = n, y = median)) +
    geom_line(aes(color = expression)) +
    scale_colour_brewer(palette = "Set2", direction = -1) +
    ggtitle("set and get n times") + ylab("time")