Performance of the bit package

knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
require(bit)
require(microbenchmark)
# rmarkdown::render("vignettes/bit-performance.Rmd")
# these are the real settings for the performance vignette
times <- 5
Domain <- c(small=1e3, big=1e6)
Sample <- c(small=1e3, big=1e6)
# these are the settings to keep the cost of CRAN low
#times <- 5
#Domain <- c(small=1e1, big=1e3)
#Sample <- c(small=1e1, big=1e3)

pagebreak <- function() {
  if(knitr::is_latex_output())
    return("\\newpage")
  else
    return('<div style="page-break-before: always;" />')
}

A performance example

Before we measure performance of the main functionality of the package, note that something simple as '(a:b)[-i]' can and has been accelerated in this package:

a <- 1L
b <- 1e7L
i <- sample(a:b,1e3)
x <- c(
  R = median(microbenchmark((a:b)[-i], times=times)$time)
, bit = median(microbenchmark(bit_rangediff(c(a,b), i), times=times)$time)
, merge = median(microbenchmark(merge_rangediff(c(a,b), bit_sort(i)), times=times)$time)
)
knitr::kable(as.data.frame(as.list(x/x["R"]*100)), caption="% of time relative to R", digits=1)

The vignette is compiled with the following performance settings: r times replications with domain size small r Domain["small"] and big r Domain["big"], sample size small r Sample["small"] and big r Sample["big"].

Boolean data types

"A designer knows he has achieved perfection not when there is nothing left to add, but when there is nothing left to take away."
"Il semble que la perfection soit atteinte non quand il n'y a plus rien à ajouter, mais quand il n'y a plus rien à retrancher"
(Antoine de St. Exupery, Terre des Hommes (Gallimard, 1939), p. 60.)

We compare memory consumption (n=r format(Sample[["big"]])) and runtime (median of r times replications) of the different booltypes for the following filter scenarios:

knitr::kable(
  data.frame(coin="random 50%", often="random 99%", rare="random 1%", chunk="contiguous chunk of 5%")
  , caption="selection characteristic")
B <- booltypes[c("logical","bit","bitwhich","which","ri")]
M <- c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary")
G <- list(
  coin = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5,0.5))
, often = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01,0.99))
, rare = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99,0.01))
, chunk = function(n)ri(n%/%20,2L*n%/%20,n)
)
X <- vector("list", length(B)*length(G))
dim(X) <- c(booltype=length(B), data=length(G))
dimnames(X) <- list(booltype=names(B), data=names(G))
tim <- array(NA
             , dim=c(booltype=length(B), metric=length(M), data=length(G))
             , dimnames=list(booltype=names(B), metric=M, data=names(G))
             )
for (g in names(G)){
  x <- G[[g]](Sample[["big"]])
  if (g %in% c("coin","often","rare"))
    w <- as.which(as.logical(x))
  for (b in B){
    if (booltypes[[b]] < 'ri' || (b == 'ri' && g=='chunk')){
      X[[b,g]] <- as.booltype(x, b)
      if (g %in% c("coin","often","rare") && b %in% c("logical","bit","bitwhich")){
        l <- as.booltype(logical(Sample[["big"]]), b)
        tim[b,"[which]",g] <- median(microbenchmark(l[w], times=times)$time)
        tim[b,"[which]<-TRUE",g] <- median(microbenchmark(l[w]<-TRUE, times=times)$time)
        tim[b,"[]",g] <- median(microbenchmark(l[], times=times)$time)
        tim[b,"[]<-logical",g] <- median(microbenchmark(l[]<-x, times=times)$time)
      }
      tim[b,"size",g] <- object.size(X[[b,g]])
    }
  }
}
for (g in names(G)){
  for (b in c("logical","bit","bitwhich")){
    x <- X[[b,g]]
    if (!is.null(x)){
      tim[b,"!",g] <- median(microbenchmark(!x, times=times)$time)
      tim[b,"&",g] <- median(microbenchmark(x & x, times=times)$time)
      tim[b,"|",g] <- median(microbenchmark(x | x, times=times)$time)
      tim[b,"==",g] <- median(microbenchmark(x == x, times=times)$time)
      tim[b,"!=",g] <- median(microbenchmark(x != x, times=times)$time)
      tim[b,"summary",g] <- median(microbenchmark(summary.booltype(x), times=times)$time)
    }
  }
}
i <- match("size", M)
for(b in rev(names(B)))  # logical was in first position, so we do this last!
{
  tim[b,i,] <- 100 * tim[b,i,] / tim["logical",i,]
  tim[b,-i,] <- 100 * tim[b,-i,] / max(tim["logical",-i,], na.rm=TRUE)
}
#rm(X)

There are substantial savings in skewed filter situations:

x <- tim[1:3,,"rare"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'rare' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")
x <- tim[1:3,,"often"]
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'often' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")

Even in non-skewed situations the new booltypes are competitive:

x <- tim[1:3,,"coin"]
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'coin' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")

Detailed tables follow.

r pagebreak()

% memory consumption of filter

knitr::kable(round(tim[,"size",], 1), caption="% bytes of logical")

% time extracting

knitr::kable(round(tim[,"[]",], 1), caption="% time of logical")

% time assigning

knitr::kable(round(tim[,"[]<-logical",], 1), caption="% time of logical")

% time subscripting with 'which'

knitr::kable(round(tim[,"[which]",], 1), caption="% time of logical")

% time assigning with 'which'

knitr::kable(round(tim[,"[which]<-TRUE",], 1), caption="% time of logical")

% time Boolean NOT

knitr::kable(round(tim[,"!",], 1), caption="% time for Boolean NOT")

% time Boolean AND

knitr::kable(round(tim[,"&",], 1), caption="% time for Boolean &")

% time Boolean OR

knitr::kable(round(tim[,"|",], 1), caption="% time for Boolean |")

% time Boolean EQUALITY

knitr::kable(round(tim[,"==",], 1), caption="% time for Boolean ==")

% time Boolean XOR

knitr::kable(round(tim[,"!=",], 1), caption="% time for Boolean !=")

% time Boolean SUMMARY

knitr::kable(round(tim[,"summary",][1:2,1:2], 1), caption="% time for Boolean summary")

Fast methods for integer set operations

"The space-efficient structure of bitmaps dramatically reduced the run time of sorting"
(Jon Bently, Programming Pearls, Cracking the oyster, p. 7)

binaryDomain <- list(
    smallsmall = rep(Domain["small"], 2)
  , smallbig=Domain
  , bigsmall=rev(Domain)
  , bigbig=rep(Domain["big"], 2)
)
binarySample <- list(
    smallsmall = rep(Sample["small"], 2)
  , smallbig=Sample
  , bigsmall=rev(Sample)
  , bigbig=rep(Sample["big"], 2)
)

M <- c("R","bit","merge")
G <- c("sort","sortunique")
D <- c("unsorted","sorted")

sortM <- vector("list", length(M)*length(G))
dim(sortM) <- c(method=length(M), goal=length(G))
dimnames(sortM) <- list(method=M, goal=G)
sortM[["R","sort"]] <- sort
sortM[["R", "sortunique"]] <- function(x)sort(unique(x))
sortM[["bit","sort"]] <- bit_sort
sortM[["bit","sortunique"]] <- bit_sort_unique

timsort <- array(NA_integer_
             , dim=c(M=2, G=length(G), D=length(D), N=length(Domain)) 
             , dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain)) 
             )
for(n in names(Domain)){
  x <- sample(Domain[[n]], Sample[[n]], replace = TRUE)
  d <- "unsorted"
  for (m in c("R","bit")){
    for (g in G){
      timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time)
    }
  }
  x <- bit_sort(x)
  d <- "sorted"
  for (m in 1:2){
    for (g in G){
      timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time)
    }
  }
}


binaryU <- c("match","in","notin","union","intersect","setdiff","symdiff","setequal","setearly")
binaryM <- vector("list", length(M)*length(binaryU))
dim(binaryM) <- c(method=length(M), task=length(binaryU))
dimnames(binaryM) <- list(method=M, task=binaryU)
binaryM[["R","match"]] <- match
binaryM[["merge","match"]] <- merge_match

binaryM[["R","in"]] <- get("%in%")
binaryM[["bit","in"]] <- bit_in
binaryM[["merge","in"]] <- merge_in

binaryM[["R","notin"]] <- function(x, y)!(x %in% y)
binaryM[["bit","notin"]] <- function(x, y)!bit_in(x,y)
binaryM[["merge","notin"]] <- merge_notin

binaryM[["R","union"]] <- union
binaryM[["bit","union"]] <- bit_union
binaryM[["merge","union"]] <- merge_union

binaryM[["R","intersect"]] <- intersect
binaryM[["bit","intersect"]] <- bit_intersect
binaryM[["merge","intersect"]] <- merge_intersect

binaryM[["R","setdiff"]] <- setdiff
binaryM[["bit","setdiff"]] <- bit_setdiff
binaryM[["merge","setdiff"]] <- merge_setdiff

binaryM[["R","symdiff"]] <- function(x,y)union(setdiff(x,y), setdiff(y,x))
binaryM[["bit","symdiff"]] <- bit_symdiff
binaryM[["merge","symdiff"]] <- merge_symdiff

binaryM[["R","setequal"]] <- function(x,y)setequal(x,x)  # we compare x to x which avoids early termination and hence 
binaryM[["bit","setequal"]] <- function(x,y)bit_setequal(x,x)
binaryM[["merge","setequal"]] <- function(x,y)merge_setequal(x,x)

binaryM[["R","setearly"]] <- function(x,y)setequal(x,y)  # we compare x to x which avoids early termination and hence 
binaryM[["bit","setearly"]] <- function(x,y)bit_setequal(x,y)
binaryM[["merge","setearly"]] <- function(x,y)merge_setequal(x,y)

unaryU <- c("unique","duplicated","anyDuplicated","sumDuplicated")
unaryM <- vector("list", length(M)*length(unaryU))
dim(unaryM) <- c(method=length(M), task=length(unaryU))
dimnames(unaryM) <- list(method=M, task=unaryU)
unaryM[["R","unique"]] <- unique
unaryM[["bit","unique"]] <- bit_unique
unaryM[["merge","unique"]] <- merge_unique
unaryM[["R","duplicated"]] <- duplicated
unaryM[["bit","duplicated"]] <- bit_duplicated
unaryM[["merge","duplicated"]] <- merge_duplicated
unaryM[["R","anyDuplicated"]] <- anyDuplicated
unaryM[["bit","anyDuplicated"]] <- bit_anyDuplicated
unaryM[["merge","anyDuplicated"]] <- merge_anyDuplicated
unaryM[["R","sumDuplicated"]] <- function(x)sum(duplicated(x))
unaryM[["bit","sumDuplicated"]] <- bit_sumDuplicated
unaryM[["merge","sumDuplicated"]] <- merge_sumDuplicated

tim <- array(NA_integer_
             , dim=c(M=length(M), U=length(unaryU)+length(binaryU), N=length(binaryDomain), D=length(D))
             , dimnames=list(M=M, U=c(unaryU,binaryU), N=names(binaryDomain), D=D)
             )

for(n in names(binaryDomain)){
  xnam <- names(binaryDomain[[n]])[1]
  ynam <- names(binaryDomain[[n]])[2]
  x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE)
  y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE)
  d <- "unsorted"
  if (length(x)==length(y))
  for (u in unaryU){
    for (m in setdiff(M,"merge")){
      f <- unaryM[[m,u]]
      if (!is.null(f))
        tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time)
    }
  }
  for (u in binaryU){
    for (m in setdiff(M,"merge")){
      f <- binaryM[[m,u]]
      if (!is.null(f))
        tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time)
    }
  }
  x <- bit_sort(x)
  y <- bit_sort(y)
  d <- "sorted"
  if (length(x)==length(y))
  for (u in unaryU){
    for (m in M){
      f <- unaryM[[m,u]]
      if (!is.null(f)){
        tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time)
        # now plug-in measures for unsorted merge
        if (m == "merge") 
          tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] +  tim["merge",u,n,"sorted"]
      }
    }
  }
  for (u in binaryU){
    for (m in M){
      f <- binaryM[[m,u]]
      if (!is.null(f)){
        tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time)
        # now plug-in measures for unsorted merge
        if (m == "merge")
          tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + timsort["bit","sort","unsorted",ynam] + tim["merge",u,n,"sorted"]
      }
    }
  }
}
y <- timsort[,,,"big"]
y <- 100 * y / max(y["R",,], na.rm=TRUE)
oldpar <- par(mfrow=c(2,1), mar=c(5,8,2,1))
x <- y[,,"unsorted"]
dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="unsorted", col=c("red","blue"))
x <- y[,,"sorted"]
dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="sorted", col=c("red","blue"))
par(oldpar)
tim2 <- tim
for (n in names(binaryDomain))
  for (d in D)
    tim2[,,n,d] <- 100*tim[,,n,d]/max(tim["R",,n,d], na.rm=TRUE)

r pagebreak()

y <- tim2[,,"bigbig",]
y <- 100 * y / max(y["R",,], na.rm=TRUE)
x <- y[,,"unsorted"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x, xlim=c(0, max(100,max(y, na.rm=TRUE))), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'unsorted bigbig' scenario", sub="R='hash'   b='bit'   m='merge'")
x <- y[,,"sorted"]
dotchart(x, xlim=c(0, max(y, na.rm=TRUE)), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'sorted bigbig' scenario", sub="R='hash'   b='bit'   m='merge'")

r pagebreak()

% time for sorting

x <- 100*timsort["bit",,,]/timsort["R",,,]
s <- "sorted"
knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)
s <- "unsorted"
knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)

% time for unique

f <- function(u){
  n <- c("smallsmall","bigbig")
  x <- tim[c("bit","merge","merge"),u,n,]
  dimnames(x)$M[3] <- "sort"
  dimnames(x)$N <- c("small","big")
  x["sort",,"unsorted"] <- timsort["bit","sort","unsorted",]
  x["sort",,"sorted"] <- 0
  for (m in dimnames(x)$M)
    x[m,,] <- x[m,,] / tim["R",u,n,] * 100
  x
}
x <- f("unique")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for duplicated

x <- f("duplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for anyDuplicated

x <- f("anyDuplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for sumDuplicated

x <- f("sumDuplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for match

f <- function(u){
  x <- tim[c("bit","merge","merge"),u,,]
  dimnames(x)$M[3] <- "sort"
  s <- timsort["bit","sort","unsorted",]
  x["sort",,"unsorted"] <- rep(s, c(2,2)) + c(s,s)
  x["sort",,"sorted"] <- 0
  for (m in dimnames(x)$M)
    x[m,,] <- x[m,,] / tim["R",u,,] * 100
  x
}
x <- f("match")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for in

x <- f("in")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for notin

x <- f("notin")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for union

x <- f("union")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for intersect

x <- f("intersect")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for setdiff

x <- f("setdiff")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for symdiff

x <- f("symdiff")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for setequal

x <- f("setequal")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

% time for setearly

x <- f("setearly")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)


Try the bit package in your browser

Any scripts or data that you put into this service are public.

bit documentation built on Aug. 4, 2020, 9:06 a.m.