Performance of the bit package
In bit: Classes and Methods for Fast Memory-Efficient Boolean Selections

knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(bit)
library(microbenchmark)
# rmarkdown::render("vignettes/bit-performance.Rmd")
# these are the real settings for the performance vignette
times <- 5
Domain <- c(small=1e3, big=1e6)
Sample <- c(small=1e3, big=1e6)
# these are the settings to keep the cost of CRAN low
#times <- 5
#Domain <- c(small=1e1, big=1e3)
#Sample <- c(small=1e1, big=1e3)

pagebreak <- function() {
  if (knitr::is_latex_output())
    "\\newpage"
  else
    '<div style="page-break-before: always;" />'
}

A performance example

Before we measure performance of the main functionality of the package, note that something simple as '(a:b)[-i]' can and has been accelerated in this package:

a <- 1L
b <- 1e7L
i <- sample(a:b, 1e3)
x <- c(
  R = median(microbenchmark((a:b)[-i], times=times)$time),
  bit = median(microbenchmark(bit_rangediff(c(a, b), i), times=times)$time),
  merge = median(microbenchmark(merge_rangediff(c(a, b), bit_sort(i)), times=times)$time)
)
knitr::kable(as.data.frame(as.list(x / x["R"] * 100)), caption="% of time relative to R", digits=1)

The vignette is compiled with the following performance settings: r times replications with domain size small r Domain["small"] and big r Domain["big"], sample size small r Sample["small"] and big r Sample["big"].

Boolean data types

"A designer knows he has achieved perfection not when there is nothing left to add, but when there is nothing left to take away." "Il semble que la perfection soit atteinte non quand il n'y a plus rien à ajouter, mais quand il n'y a plus rien à retrancher" (Antoine de St. Exupery, Terre des Hommes (Gallimard, 1939), p. 60.)

We compare memory consumption (n=r format(Sample[["big"]])) and runtime (median of r times replications) of the different booltypes for the following filter scenarios:

# TODO(r-lib/lintr#773): nolint as a chunk option.
# nolint start: strings_as_factors_linter.
knitr::kable(
  data.frame(
    coin="random 50%",
    often="random 99%",
    rare="random 1%",
    chunk="contiguous chunk of 5%"
  ),
  caption="selection characteristic"
)
# nolint end: strings_as_factors_linter.

B <- booltypes[c("logical", "bit", "bitwhich", "which", "ri")]
M <-
  c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary")
G <- list(
  coin = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5, 0.5)),
  often = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01, 0.99)),
  rare = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99, 0.01)),
  chunk = function(n) ri(n %/% 20, 2L * n %/% 20, n)
)
X <- vector("list", length(B) * length(G))
dim(X) <- c(booltype=length(B), data=length(G))
dimnames(X) <- list(booltype=names(B), data=names(G))
tim <- array(NA,
  dim=c(booltype=length(B), metric=length(M), data=length(G)),
  dimnames=list(booltype=names(B), metric=M, data=names(G))
)
for (g in names(G)) {
  x <- G[[g]](Sample[["big"]])
  if (g %in% c("coin", "often", "rare"))
    w <- as.which(as.logical(x))
  for (b in B) {
    if (booltypes[[b]] < 'ri' || (b == 'ri' && g == 'chunk')) {
      X[[b, g]] <- as.booltype(x, b)
      if (g %in% c("coin", "often", "rare") && b %in% c("logical", "bit", "bitwhich")) {
        l <- as.booltype(logical(Sample[["big"]]), b)
        tim[b, "[which]", g] <- median(microbenchmark(l[w], times=times)$time)
        tim[b, "[which]<-TRUE", g] <- median(microbenchmark(l[w] <- TRUE, times=times)$time)
        tim[b, "[]", g] <- median(microbenchmark(l[], times=times)$time)
        tim[b, "[]<-logical", g] <- median(microbenchmark(l[] <- x, times=times)$time)
      }
      tim[b, "size", g] <- object.size(X[[b, g]])
    }
  }
}
for (g in names(G)) {
  for (b in c("logical", "bit", "bitwhich")) {
    x <- X[[b, g]]
    if (!is.null(x)) {
      tim[b, "!", g] <- median(microbenchmark(!x, times=times)$time)
      tim[b, "&", g] <- median(microbenchmark(x & x, times=times)$time)
      tim[b, "|", g] <- median(microbenchmark(x | x, times=times)$time)
      tim[b, "==", g] <- median(microbenchmark(x == x, times=times)$time)
      tim[b, "!=", g] <- median(microbenchmark(x != x, times=times)$time)
      tim[b, "summary", g] <- median(microbenchmark(summary.booltype(x), times=times)$time)
    }
  }
}
i <- match("size", M)
for (b in rev(names(B))) { # logical was in first position, so we do this last!
  tim[b, i, ] <- 100 * tim[b, i, ] / tim["logical", i, ]
  tim[b, -i, ] <- 100 * tim[b, -i, ] / max(tim["logical", -i, ], na.rm=TRUE)
}
#rm(X)

There are substantial savings in skewed filter situations:

x <- tim[1:3, , "rare"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x,
  xlim=c(0, max(100, max(x))),
  labels=m,
  pch=c("R", "b", "w"),
  col=c("black", "blue", "red"),
  main="% size and timings in 'rare' scenario",
  sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios"
)

x <- tim[1:3, , "often"]
dotchart(x,
  xlim=c(0, max(100, max(x))),
  labels=m,
  pch=c("R", "b", "w"),
  col=c("black", "blue", "red"),
  main="% size and timings in 'often' scenario",
  sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios"
)

Even in non-skewed situations the new booltypes are competitive:

x <- tim[1:3, , "coin"]
dotchart(x,
  xlim=c(0, max(100, max(x))),
  labels=m,
  pch=c("R", "b", "w"),
  col=c("black", "blue", "red"),
  main="% size and timings in 'coin' scenario",
  sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios"
)

Detailed tables follow.

r pagebreak()

% memory consumption of filter

knitr::kable(round(tim[, "size", ], 1), caption="% bytes of logical")

% time extracting

knitr::kable(round(tim[, "[]", ], 1), caption="% time of logical")

% time assigning

knitr::kable(round(tim[, "[]<-logical", ], 1), caption="% time of logical")

% time subscripting with 'which'

knitr::kable(round(tim[, "[which]", ], 1), caption="% time of logical")

% time assigning with 'which'

knitr::kable(round(tim[, "[which]<-TRUE", ], 1), caption="% time of logical")

% time Boolean NOT

knitr::kable(round(tim[, "!", ], 1), caption="% time for Boolean NOT")

% time Boolean AND

knitr::kable(round(tim[, "&", ], 1), caption="% time for Boolean &")

% time Boolean OR

knitr::kable(round(tim[, "|", ], 1), caption="% time for Boolean |")

% time Boolean EQUALITY

knitr::kable(round(tim[, "==", ], 1), caption="% time for Boolean ==")

% time Boolean XOR

knitr::kable(round(tim[, "!=", ], 1), caption="% time for Boolean !=")

% time Boolean SUMMARY

knitr::kable(round(tim[, "summary", ][1:2, 1:2], 1), caption="% time for Boolean summary")

Fast methods for `integer` set operations

"The space-efficient structure of bitmaps dramatically reduced the run time of sorting" (Jon Bently, Programming Pearls, Cracking the oyster, p. 7)

binaryDomain <- list(
  smallsmall = rep(Domain["small"], 2L),
  smallbig=Domain,
  bigsmall=rev(Domain),
  bigbig=rep(Domain["big"], 2L)
)
binarySample <- list(
  smallsmall = rep(Sample["small"], 2L),
  smallbig=Sample,
  bigsmall=rev(Sample),
  bigbig=rep(Sample["big"], 2L)
)

M <- c("R", "bit", "merge")
G <- c("sort", "sortunique")
D <- c("unsorted", "sorted")

sortM <- vector("list", length(M) * length(G))
dim(sortM) <- c(method=length(M), goal=length(G))
dimnames(sortM) <- list(method=M, goal=G)
sortM[["R", "sort"]] <- sort
sortM[["R", "sortunique"]] <- function(x) sort(unique(x))
sortM[["bit", "sort"]] <- bit_sort
sortM[["bit", "sortunique"]] <- bit_sort_unique

timsort <- array(NA_integer_,
  dim=c(M=2, G=length(G), D=length(D), N=length(Domain)),
  dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain))
)
for (n in names(Domain)) {
  x <- sample(Domain[[n]], Sample[[n]], replace = TRUE)
  d <- "unsorted"
  for (m in c("R", "bit")) {
    for (g in G) {
      timsort[m, g, d, n] <- median(microbenchmark(sortM[[m, g]](x), times=times)$time)
    }
  }
  x <- bit_sort(x)
  d <- "sorted"
  for (m in 1:2) {
    for (g in G) {
      timsort[m, g, d, n] <- median(microbenchmark(sortM[[m, g]](x), times=times)$time)
    }
  }
}


binaryU <-
  c("match", "in", "notin", "union", "intersect", "setdiff", "symdiff", "setequal", "setearly")
binaryM <- vector("list", length(M) * length(binaryU))
dim(binaryM) <- c(method=length(M), task=length(binaryU))
dimnames(binaryM) <- list(method=M, task=binaryU)
binaryM[["R", "match"]] <- match
binaryM[["merge", "match"]] <- merge_match

binaryM[["R", "in"]] <- get("%in%")
binaryM[["bit", "in"]] <- bit_in
binaryM[["merge", "in"]] <- merge_in

binaryM[["R", "notin"]] <- function(x, y) !(x %in% y)
binaryM[["bit", "notin"]] <- function(x, y) !bit_in(x, y)
binaryM[["merge", "notin"]] <- merge_notin

binaryM[["R", "union"]] <- union
binaryM[["bit", "union"]] <- bit_union
binaryM[["merge", "union"]] <- merge_union

binaryM[["R", "intersect"]] <- intersect
binaryM[["bit", "intersect"]] <- bit_intersect
binaryM[["merge", "intersect"]] <- merge_intersect

binaryM[["R", "setdiff"]] <- setdiff
binaryM[["bit", "setdiff"]] <- bit_setdiff
binaryM[["merge", "setdiff"]] <- merge_setdiff

binaryM[["R", "symdiff"]] <- function(x, y) union(setdiff(x, y), setdiff(y, x))
binaryM[["bit", "symdiff"]] <- bit_symdiff
binaryM[["merge", "symdiff"]] <- merge_symdiff

binaryM[["R", "setequal"]] <-
  function(x, y) setequal(x, x) # we compare x to x which avoids early termination
binaryM[["bit", "setequal"]] <- function(x, y) bit_setequal(x, x)
binaryM[["merge", "setequal"]] <- function(x, y) merge_setequal(x, x)

binaryM[["R", "setearly"]] <-
  function(x, y) setequal(x, y) # we compare x to x which avoids early termination
binaryM[["bit", "setearly"]] <- function(x, y) bit_setequal(x, y)
binaryM[["merge", "setearly"]] <- function(x, y) merge_setequal(x, y)

unaryU <- c("unique", "duplicated", "anyDuplicated", "sumDuplicated")
unaryM <- vector("list", length(M) * length(unaryU))
dim(unaryM) <- c(method=length(M), task=length(unaryU))
dimnames(unaryM) <- list(method=M, task=unaryU)
unaryM[["R", "unique"]] <- unique
unaryM[["bit", "unique"]] <- bit_unique
unaryM[["merge", "unique"]] <- merge_unique
unaryM[["R", "duplicated"]] <- duplicated
unaryM[["bit", "duplicated"]] <- bit_duplicated
unaryM[["merge", "duplicated"]] <- merge_duplicated
unaryM[["R", "anyDuplicated"]] <- anyDuplicated
unaryM[["bit", "anyDuplicated"]] <- bit_anyDuplicated
unaryM[["merge", "anyDuplicated"]] <- merge_anyDuplicated
unaryM[["R", "sumDuplicated"]] <- function(x) sum(duplicated(x))
unaryM[["bit", "sumDuplicated"]] <- bit_sumDuplicated
unaryM[["merge", "sumDuplicated"]] <- merge_sumDuplicated

tim <- array(NA_integer_,
  dim=c(M=length(M), U=length(unaryU) + length(binaryU), N=length(binaryDomain), D=length(D)),
  dimnames=list(M=M, U=c(unaryU, binaryU), N=names(binaryDomain), D=D)
)

for (n in names(binaryDomain)) {
  xnam <- names(binaryDomain[[n]])[1]
  ynam <- names(binaryDomain[[n]])[2]
  x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE)
  y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE)
  d <- "unsorted"
  if (length(x) == length(y)) {
    for (u in unaryU) {
      for (m in setdiff(M, "merge")) {
        f <- unaryM[[m, u]]
        if (!is.null(f))
          tim[m, u, n, d] <- median(microbenchmark(f(x), times=times)$time)
      }
    }
  }
  for (u in binaryU) {
    for (m in setdiff(M, "merge")) {
      f <- binaryM[[m, u]]
      if (!is.null(f))
        tim[m, u, n, d] <- median(microbenchmark(f(x, y), times=times)$time)
    }
  }
  x <- bit_sort(x)
  y <- bit_sort(y)
  d <- "sorted"
  if (length(x) == length(y)) {
    for (u in unaryU) {
      for (m in M) {
        f <- unaryM[[m, u]]
        if (!is.null(f)) {
          tim[m, u, n, d] <- median(microbenchmark(f(x), times=times)$time)
          # now plug-in measures for unsorted merge
          if (m == "merge") {
            tim["merge", u, n, "unsorted"] <-
              timsort["bit", "sort", "unsorted", xnam] +  tim["merge", u, n, "sorted"]
          }
        }
      }
    }
  }
  for (u in binaryU) {
    for (m in M) {
      f <- binaryM[[m, u]]
      if (!is.null(f)) {
        tim[m, u, n, d] <- median(microbenchmark(f(x, y), times=times)$time)
        # now plug-in measures for unsorted merge
        if (m == "merge") {
          tim["merge", u, n, "unsorted"] =
            timsort["bit", "sort", "unsorted", xnam] +
            timsort["bit", "sort", "unsorted", ynam] +
            tim["merge", u, n, "sorted"]
        }
      }
    }
  }
}

y <- timsort[, , , "big"]
y <- 100 * y / max(y["R", , ], na.rm=TRUE)
oldpar <- par(mfrow=c(2, 1), mar=c(5, 8, 2, 1))
x <- y[, , "unsorted"]
dotchart(x,
  xlim=c(0, max(100, max(y))),
  labels="",
  pch=c("R", "b"),
  xlab="execution time",
  main="unsorted",
  col=c("red", "blue")
)
x <- y[, , "sorted"]
dotchart(x,
  xlim=c(0, max(100, max(y))),
  labels="",
  pch=c("R", "b"),
  xlab="execution time",
  main="sorted",
  col=c("red", "blue")
)
par(oldpar)

tim2 <- tim
for (n in names(binaryDomain)) {
  for (d in D) {
    tim2[, , n, d] <- 100 * tim[, , n, d] / max(tim["R", , n, d], na.rm=TRUE)
  }
}

r pagebreak()

y <- tim2[, , "bigbig", ]
y <- 100 * y / max(y["R", , ], na.rm=TRUE)
x <- y[, , "unsorted"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x,
  xlim=c(0, max(100, max(y, na.rm=TRUE))),
  labels=m,
  pch=c("R", "b", "m"),
  col=c("red", "blue", "black"),
  main="Timings in 'unsorted bigbig' scenario",
  sub="R='hash'   b='bit'   m='merge'"
)

x <- y[, , "sorted"]
dotchart(x,
  xlim=c(0, max(y, na.rm=TRUE)),
  labels=m,
  pch=c("R", "b", "m"),
  col=c("red", "blue", "black"),
  main="Timings in 'sorted bigbig' scenario",
  sub="R='hash'   b='bit'   m='merge'"
)

r pagebreak()

% time for sorting

x <- 100 * timsort["bit", , , ] / timsort["R", , , ]
s <- "sorted"
knitr::kable(x[, s, ], caption=paste(s, "data relative to R's sort"), digits=1)

s <- "unsorted"
knitr::kable(x[, s, ], caption=paste(s, "data relative to R's sort"), digits=1)

% time for unique

f <- function(u) {
  n <- c("smallsmall", "bigbig")
  x <- tim[c("bit", "merge", "merge"), u, n, ]
  dimnames(x)$M[3] <- "sort"
  dimnames(x)$N <- c("small", "big")
  x["sort", , "unsorted"] <- timsort["bit", "sort", "unsorted", ]
  x["sort", , "sorted"] <- 0
  for (m in dimnames(x)$M) {
    x[m, , ] <- x[m, , ] / tim["R", u, n, ] * 100
  }
  x
}
x <- f("unique")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for duplicated

x <- f("duplicated")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for anyDuplicated

x <- f("anyDuplicated")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for sumDuplicated

x <- f("sumDuplicated")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for match

f <- function(u) {
  x <- tim[c("bit", "merge", "merge"), u, , ]
  dimnames(x)$M[3] <- "sort"
  s <- timsort["bit", "sort", "unsorted", ]
  x["sort", , "unsorted"] <- rep(s, c(2, 2)) + c(s, s)
  x["sort", , "sorted"] <- 0
  for (m in dimnames(x)$M) {
    x[m, , ] <- x[m, , ] / tim["R", u, , ] * 100
  }
  x
}
x <- f("match")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for in

x <- f("in")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for notin

x <- f("notin")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for union

x <- f("union")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for intersect

x <- f("intersect")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for setdiff

x <- f("setdiff")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for symdiff

x <- f("symdiff")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for setequal

x <- f("setequal")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

% time for setearly

x <- f("setearly")
s <- "sorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

s <- "unsorted"
knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1)

Any scripts or data that you put into this service are public.

bit documentation built on April 4, 2025, 3:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

bit
Classes and Methods for Fast Memory-Efficient Boolean Selections

Performance of the bit package
In bit: Classes and Methods for Fast Memory-Efficient Boolean Selections

A performance example

Boolean data types

% memory consumption of filter

% time extracting

% time assigning

% time subscripting with 'which'

% time assigning with 'which'

% time Boolean NOT

% time Boolean AND

% time Boolean OR

% time Boolean EQUALITY

% time Boolean XOR

% time Boolean SUMMARY

Fast methods for `integer` set operations

% time for sorting

% time for unique

% time for duplicated

% time for anyDuplicated

% time for sumDuplicated

% time for match

% time for in

% time for notin

% time for union

% time for intersect

% time for setdiff

% time for symdiff

% time for setequal

% time for setearly

Try the bit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bit Classes and Methods for Fast Memory-Efficient Boolean Selections

Performance of the bit package In bit: Classes and Methods for Fast Memory-Efficient Boolean Selections

A performance example

Boolean data types

% memory consumption of filter

% time extracting

% time assigning

% time subscripting with 'which'

% time assigning with 'which'

% time Boolean NOT

% time Boolean AND

% time Boolean OR

% time Boolean EQUALITY

% time Boolean XOR

% time Boolean SUMMARY

Fast methods for integer set operations

% time for sorting

% time for unique

% time for duplicated

% time for anyDuplicated

% time for sumDuplicated

% time for match

% time for in

% time for notin

% time for union

% time for intersect

% time for setdiff

% time for symdiff

% time for setequal

% time for setearly

Try the bit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bit
Classes and Methods for Fast Memory-Efficient Boolean Selections

Performance of the bit package
In bit: Classes and Methods for Fast Memory-Efficient Boolean Selections

Fast methods for `integer` set operations