inst/doc/bit-performance.R

## ---- echo = FALSE, results = "hide", message = FALSE-------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
require(bit)
require(microbenchmark)
# rmarkdown::render("vignettes/bit-performance.Rmd")
# these are the real settings for the performance vignette
times <- 5
Domain <- c(small=1e3, big=1e6)
Sample <- c(small=1e3, big=1e6)
# these are the settings to keep the cost of CRAN low
#times <- 5
#Domain <- c(small=1e1, big=1e3)
#Sample <- c(small=1e1, big=1e3)

pagebreak <- function() {
  if(knitr::is_latex_output())
    return("\\newpage")
  else
    return('<div style="page-break-before: always;" />')
}


## ---- echo=TRUE, results='asis'-----------------------------------------------
a <- 1L
b <- 1e7L
i <- sample(a:b,1e3)
x <- c(
  R = median(microbenchmark((a:b)[-i], times=times)$time)
, bit = median(microbenchmark(bit_rangediff(c(a,b), i), times=times)$time)
, merge = median(microbenchmark(merge_rangediff(c(a,b), bit_sort(i)), times=times)$time)
)
knitr::kable(as.data.frame(as.list(x/x["R"]*100)), caption="% of time relative to R", digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(
  data.frame(coin="random 50%", often="random 99%", rare="random 1%", chunk="contiguous chunk of 5%")
  , caption="selection characteristic")

## ---- echo=FALSE, results='asis'----------------------------------------------
B <- booltypes[c("logical","bit","bitwhich","which","ri")]
M <- c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary")
G <- list(
  coin = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5,0.5))
, often = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01,0.99))
, rare = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99,0.01))
, chunk = function(n)ri(n%/%20,2L*n%/%20,n)
)
X <- vector("list", length(B)*length(G))
dim(X) <- c(booltype=length(B), data=length(G))
dimnames(X) <- list(booltype=names(B), data=names(G))
tim <- array(NA
             , dim=c(booltype=length(B), metric=length(M), data=length(G))
             , dimnames=list(booltype=names(B), metric=M, data=names(G))
             )
for (g in names(G)){
  x <- G[[g]](Sample[["big"]])
  if (g %in% c("coin","often","rare"))
    w <- as.which(as.logical(x))
  for (b in B){
    if (booltypes[[b]] < 'ri' || (b == 'ri' && g=='chunk')){
      X[[b,g]] <- as.booltype(x, b)
      if (g %in% c("coin","often","rare") && b %in% c("logical","bit","bitwhich")){
        l <- as.booltype(logical(Sample[["big"]]), b)
        tim[b,"[which]",g] <- median(microbenchmark(l[w], times=times)$time)
        tim[b,"[which]<-TRUE",g] <- median(microbenchmark(l[w]<-TRUE, times=times)$time)
        tim[b,"[]",g] <- median(microbenchmark(l[], times=times)$time)
        tim[b,"[]<-logical",g] <- median(microbenchmark(l[]<-x, times=times)$time)
      }
      tim[b,"size",g] <- object.size(X[[b,g]])
    }
  }
}
for (g in names(G)){
  for (b in c("logical","bit","bitwhich")){
    x <- X[[b,g]]
    if (!is.null(x)){
      tim[b,"!",g] <- median(microbenchmark(!x, times=times)$time)
      tim[b,"&",g] <- median(microbenchmark(x & x, times=times)$time)
      tim[b,"|",g] <- median(microbenchmark(x | x, times=times)$time)
      tim[b,"==",g] <- median(microbenchmark(x == x, times=times)$time)
      tim[b,"!=",g] <- median(microbenchmark(x != x, times=times)$time)
      tim[b,"summary",g] <- median(microbenchmark(summary.booltype(x), times=times)$time)
    }
  }
}
i <- match("size", M)
for(b in rev(names(B)))  # logical was in first position, so we do this last!
{
  tim[b,i,] <- 100 * tim[b,i,] / tim["logical",i,]
  tim[b,-i,] <- 100 * tim[b,-i,] / max(tim["logical",-i,], na.rm=TRUE)
}
#rm(X)

## ---- echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'rare' scenario"----
x <- tim[1:3,,"rare"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'rare' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")

## ---- echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'often' scenario"----
x <- tim[1:3,,"often"]
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'often' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")

## ---- echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'coin' scenario"----
x <- tim[1:3,,"coin"]
dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'coin' scenario", sub="l='logical'  b='bit'  w='bitwhich'           % of max(R) in all scenarios")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"size",], 1), caption="% bytes of logical")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"[]",], 1), caption="% time of logical")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"[]<-logical",], 1), caption="% time of logical")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"[which]",], 1), caption="% time of logical")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"[which]<-TRUE",], 1), caption="% time of logical")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"!",], 1), caption="% time for Boolean NOT")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"&",], 1), caption="% time for Boolean &")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"|",], 1), caption="% time for Boolean |")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"==",], 1), caption="% time for Boolean ==")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"!=",], 1), caption="% time for Boolean !=")

## ---- echo=FALSE, results='asis'----------------------------------------------
knitr::kable(round(tim[,"summary",][1:2,1:2], 1), caption="% time for Boolean summary")

## ---- echo=FALSE, results='asis'----------------------------------------------
binaryDomain <- list(
    smallsmall = rep(Domain["small"], 2)
  , smallbig=Domain
  , bigsmall=rev(Domain)
  , bigbig=rep(Domain["big"], 2)
)
binarySample <- list(
    smallsmall = rep(Sample["small"], 2)
  , smallbig=Sample
  , bigsmall=rev(Sample)
  , bigbig=rep(Sample["big"], 2)
)

M <- c("R","bit","merge")
G <- c("sort","sortunique")
D <- c("unsorted","sorted")

sortM <- vector("list", length(M)*length(G))
dim(sortM) <- c(method=length(M), goal=length(G))
dimnames(sortM) <- list(method=M, goal=G)
sortM[["R","sort"]] <- sort
sortM[["R", "sortunique"]] <- function(x)sort(unique(x))
sortM[["bit","sort"]] <- bit_sort
sortM[["bit","sortunique"]] <- bit_sort_unique

timsort <- array(NA_integer_
             , dim=c(M=2, G=length(G), D=length(D), N=length(Domain)) 
             , dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain)) 
             )
for(n in names(Domain)){
  x <- sample(Domain[[n]], Sample[[n]], replace = TRUE)
  d <- "unsorted"
  for (m in c("R","bit")){
    for (g in G){
      timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time)
    }
  }
  x <- bit_sort(x)
  d <- "sorted"
  for (m in 1:2){
    for (g in G){
      timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time)
    }
  }
}


binaryU <- c("match","in","notin","union","intersect","setdiff","symdiff","setequal","setearly")
binaryM <- vector("list", length(M)*length(binaryU))
dim(binaryM) <- c(method=length(M), task=length(binaryU))
dimnames(binaryM) <- list(method=M, task=binaryU)
binaryM[["R","match"]] <- match
binaryM[["merge","match"]] <- merge_match

binaryM[["R","in"]] <- get("%in%")
binaryM[["bit","in"]] <- bit_in
binaryM[["merge","in"]] <- merge_in

binaryM[["R","notin"]] <- function(x, y)!(x %in% y)
binaryM[["bit","notin"]] <- function(x, y)!bit_in(x,y)
binaryM[["merge","notin"]] <- merge_notin

binaryM[["R","union"]] <- union
binaryM[["bit","union"]] <- bit_union
binaryM[["merge","union"]] <- merge_union

binaryM[["R","intersect"]] <- intersect
binaryM[["bit","intersect"]] <- bit_intersect
binaryM[["merge","intersect"]] <- merge_intersect

binaryM[["R","setdiff"]] <- setdiff
binaryM[["bit","setdiff"]] <- bit_setdiff
binaryM[["merge","setdiff"]] <- merge_setdiff

binaryM[["R","symdiff"]] <- function(x,y)union(setdiff(x,y), setdiff(y,x))
binaryM[["bit","symdiff"]] <- bit_symdiff
binaryM[["merge","symdiff"]] <- merge_symdiff

binaryM[["R","setequal"]] <- function(x,y)setequal(x,x)  # we compare x to x which avoids early termination and hence 
binaryM[["bit","setequal"]] <- function(x,y)bit_setequal(x,x)
binaryM[["merge","setequal"]] <- function(x,y)merge_setequal(x,x)
  
binaryM[["R","setearly"]] <- function(x,y)setequal(x,y)  # we compare x to x which avoids early termination and hence 
binaryM[["bit","setearly"]] <- function(x,y)bit_setequal(x,y)
binaryM[["merge","setearly"]] <- function(x,y)merge_setequal(x,y)

unaryU <- c("unique","duplicated","anyDuplicated","sumDuplicated")
unaryM <- vector("list", length(M)*length(unaryU))
dim(unaryM) <- c(method=length(M), task=length(unaryU))
dimnames(unaryM) <- list(method=M, task=unaryU)
unaryM[["R","unique"]] <- unique
unaryM[["bit","unique"]] <- bit_unique
unaryM[["merge","unique"]] <- merge_unique
unaryM[["R","duplicated"]] <- duplicated
unaryM[["bit","duplicated"]] <- bit_duplicated
unaryM[["merge","duplicated"]] <- merge_duplicated
unaryM[["R","anyDuplicated"]] <- anyDuplicated
unaryM[["bit","anyDuplicated"]] <- bit_anyDuplicated
unaryM[["merge","anyDuplicated"]] <- merge_anyDuplicated
unaryM[["R","sumDuplicated"]] <- function(x)sum(duplicated(x))
unaryM[["bit","sumDuplicated"]] <- bit_sumDuplicated
unaryM[["merge","sumDuplicated"]] <- merge_sumDuplicated

tim <- array(NA_integer_
             , dim=c(M=length(M), U=length(unaryU)+length(binaryU), N=length(binaryDomain), D=length(D))
             , dimnames=list(M=M, U=c(unaryU,binaryU), N=names(binaryDomain), D=D)
             )

for(n in names(binaryDomain)){
  xnam <- names(binaryDomain[[n]])[1]
  ynam <- names(binaryDomain[[n]])[2]
  x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE)
  y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE)
  d <- "unsorted"
  if (length(x)==length(y))
  for (u in unaryU){
    for (m in setdiff(M,"merge")){
      f <- unaryM[[m,u]]
      if (!is.null(f))
        tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time)
    }
  }
  for (u in binaryU){
    for (m in setdiff(M,"merge")){
      f <- binaryM[[m,u]]
      if (!is.null(f))
        tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time)
    }
  }
  x <- bit_sort(x)
  y <- bit_sort(y)
  d <- "sorted"
  if (length(x)==length(y))
  for (u in unaryU){
    for (m in M){
      f <- unaryM[[m,u]]
      if (!is.null(f)){
        tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time)
        # now plug-in measures for unsorted merge
        if (m == "merge") 
          tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] +  tim["merge",u,n,"sorted"]
      }
    }
  }
  for (u in binaryU){
    for (m in M){
      f <- binaryM[[m,u]]
      if (!is.null(f)){
        tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time)
        # now plug-in measures for unsorted merge
        if (m == "merge")
          tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + timsort["bit","sort","unsorted",ynam] + tim["merge",u,n,"sorted"]
      }
    }
  }
}

## ---- echo=FALSE, fig.cap = "Execution time for R (R) and bit (b)"------------
y <- timsort[,,,"big"]
y <- 100 * y / max(y["R",,], na.rm=TRUE)
oldpar <- par(mfrow=c(2,1), mar=c(5,8,2,1))
x <- y[,,"unsorted"]
dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="unsorted", col=c("red","blue"))
x <- y[,,"sorted"]
dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="sorted", col=c("red","blue"))
par(oldpar)

## ---- echo=FALSE, results='hide'----------------------------------------------
tim2 <- tim
for (n in names(binaryDomain))
  for (d in D)
    tim2[,,n,d] <- 100*tim[,,n,d]/max(tim["R",,n,d], na.rm=TRUE)

## ---- echo=FALSE, fig.cap = "Execution time for R, bit and merge relative to most expensive R in 'unsorted bigbig' scenario"----
y <- tim2[,,"bigbig",]
y <- 100 * y / max(y["R",,], na.rm=TRUE)
x <- y[,,"unsorted"]
m <- rep("", ncol(x))
m <- as.vector(rbind(m, colnames(x), m))
dotchart(x, xlim=c(0, max(100,max(y, na.rm=TRUE))), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'unsorted bigbig' scenario", sub="R='hash'   b='bit'   m='merge'")

## ---- echo=FALSE, fig.cap = "Execution time for R, bit and merge in 'sorted bigbig' scenario"----
x <- y[,,"sorted"]
dotchart(x, xlim=c(0, max(y, na.rm=TRUE)), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'sorted bigbig' scenario", sub="R='hash'   b='bit'   m='merge'")

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- 100*timsort["bit",,,]/timsort["R",,,]
s <- "sorted"
knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
f <- function(u){
  n <- c("smallsmall","bigbig")
  x <- tim[c("bit","merge","merge"),u,n,]
  dimnames(x)$M[3] <- "sort"
  dimnames(x)$N <- c("small","big")
  x["sort",,"unsorted"] <- timsort["bit","sort","unsorted",]
  x["sort",,"sorted"] <- 0
  for (m in dimnames(x)$M)
    x[m,,] <- x[m,,] / tim["R",u,n,] * 100
  x
}
x <- f("unique")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("duplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("anyDuplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("sumDuplicated")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
f <- function(u){
  x <- tim[c("bit","merge","merge"),u,,]
  dimnames(x)$M[3] <- "sort"
  s <- timsort["bit","sort","unsorted",]
  x["sort",,"unsorted"] <- rep(s, c(2,2)) + c(s,s)
  x["sort",,"sorted"] <- 0
  for (m in dimnames(x)$M)
    x[m,,] <- x[m,,] / tim["R",u,,] * 100
  x
}
x <- f("match")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("in")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("notin")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("union")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("intersect")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("setdiff")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("symdiff")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("setequal")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
x <- f("setearly")
s <- "sorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

## ---- echo=FALSE, results='asis'----------------------------------------------
s <- "unsorted"
knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)

Try the bit package in your browser

Any scripts or data that you put into this service are public.

bit documentation built on Nov. 16, 2022, 1:12 a.m.