knitr::opts_chunk$set(collapse = TRUE, comment = "#>") require(bit) require(microbenchmark) # rmarkdown::render("vignettes/bit-performance.Rmd") # these are the real settings for the performance vignette times <- 5 Domain <- c(small=1e3, big=1e6) Sample <- c(small=1e3, big=1e6) # these are the settings to keep the cost of CRAN low #times <- 5 #Domain <- c(small=1e1, big=1e3) #Sample <- c(small=1e1, big=1e3) pagebreak <- function() { if(knitr::is_latex_output()) return("\\newpage") else return('<div style="page-break-before: always;" />') }
Before we measure performance of the main functionality of the package, note that something simple as '(a:b)[-i]' can and has been accelerated in this package:
a <- 1L b <- 1e7L i <- sample(a:b,1e3) x <- c( R = median(microbenchmark((a:b)[-i], times=times)$time) , bit = median(microbenchmark(bit_rangediff(c(a,b), i), times=times)$time) , merge = median(microbenchmark(merge_rangediff(c(a,b), bit_sort(i)), times=times)$time) ) knitr::kable(as.data.frame(as.list(x/x["R"]*100)), caption="% of time relative to R", digits=1)
The vignette is compiled with the following performance settings: r times
replications with domain size small r Domain["small"]
and big r Domain["big"]
, sample size small r Sample["small"]
and big r Sample["big"]
.
"A designer knows he has achieved perfection not when there is nothing left to add, but when there is nothing left to take away."
"Il semble que la perfection soit atteinte non quand il n'y a plus rien à ajouter, mais quand il n'y a plus rien à retrancher"
(Antoine de St. Exupery, Terre des Hommes (Gallimard, 1939), p. 60.)
We compare memory consumption (n=r format(Sample[["big"]])
) and runtime (median of r times
replications) of the different booltype
s for the following filter scenarios:
knitr::kable( data.frame(coin="random 50%", often="random 99%", rare="random 1%", chunk="contiguous chunk of 5%") , caption="selection characteristic")
B <- booltypes[c("logical","bit","bitwhich","which","ri")] M <- c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary") G <- list( coin = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5,0.5)) , often = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01,0.99)) , rare = function(n)sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99,0.01)) , chunk = function(n)ri(n%/%20,2L*n%/%20,n) ) X <- vector("list", length(B)*length(G)) dim(X) <- c(booltype=length(B), data=length(G)) dimnames(X) <- list(booltype=names(B), data=names(G)) tim <- array(NA , dim=c(booltype=length(B), metric=length(M), data=length(G)) , dimnames=list(booltype=names(B), metric=M, data=names(G)) ) for (g in names(G)){ x <- G[[g]](Sample[["big"]]) if (g %in% c("coin","often","rare")) w <- as.which(as.logical(x)) for (b in B){ if (booltypes[[b]] < 'ri' || (b == 'ri' && g=='chunk')){ X[[b,g]] <- as.booltype(x, b) if (g %in% c("coin","often","rare") && b %in% c("logical","bit","bitwhich")){ l <- as.booltype(logical(Sample[["big"]]), b) tim[b,"[which]",g] <- median(microbenchmark(l[w], times=times)$time) tim[b,"[which]<-TRUE",g] <- median(microbenchmark(l[w]<-TRUE, times=times)$time) tim[b,"[]",g] <- median(microbenchmark(l[], times=times)$time) tim[b,"[]<-logical",g] <- median(microbenchmark(l[]<-x, times=times)$time) } tim[b,"size",g] <- object.size(X[[b,g]]) } } } for (g in names(G)){ for (b in c("logical","bit","bitwhich")){ x <- X[[b,g]] if (!is.null(x)){ tim[b,"!",g] <- median(microbenchmark(!x, times=times)$time) tim[b,"&",g] <- median(microbenchmark(x & x, times=times)$time) tim[b,"|",g] <- median(microbenchmark(x | x, times=times)$time) tim[b,"==",g] <- median(microbenchmark(x == x, times=times)$time) tim[b,"!=",g] <- median(microbenchmark(x != x, times=times)$time) tim[b,"summary",g] <- median(microbenchmark(summary.booltype(x), times=times)$time) } } } i <- match("size", M) for(b in rev(names(B))) # logical was in first position, so we do this last! { tim[b,i,] <- 100 * tim[b,i,] / tim["logical",i,] tim[b,-i,] <- 100 * tim[b,-i,] / max(tim["logical",-i,], na.rm=TRUE) } #rm(X)
There are substantial savings in skewed filter situations:
x <- tim[1:3,,"rare"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'rare' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios")
x <- tim[1:3,,"often"] dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'often' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios")
Even in non-skewed situations the new booltypes are competitive:
x <- tim[1:3,,"coin"] dotchart(x, xlim=c(0,max(100, max(x))), labels=m, pch=c("R","b","w"), col=c("black","blue","red"), main="% size and timings in 'coin' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios")
Detailed tables follow.
r pagebreak()
knitr::kable(round(tim[,"size",], 1), caption="% bytes of logical")
knitr::kable(round(tim[,"[]",], 1), caption="% time of logical")
knitr::kable(round(tim[,"[]<-logical",], 1), caption="% time of logical")
knitr::kable(round(tim[,"[which]",], 1), caption="% time of logical")
knitr::kable(round(tim[,"[which]<-TRUE",], 1), caption="% time of logical")
knitr::kable(round(tim[,"!",], 1), caption="% time for Boolean NOT")
knitr::kable(round(tim[,"&",], 1), caption="% time for Boolean &")
knitr::kable(round(tim[,"|",], 1), caption="% time for Boolean |")
knitr::kable(round(tim[,"==",], 1), caption="% time for Boolean ==")
knitr::kable(round(tim[,"!=",], 1), caption="% time for Boolean !=")
knitr::kable(round(tim[,"summary",][1:2,1:2], 1), caption="% time for Boolean summary")
integer
set operations"The space-efficient structure of bitmaps dramatically reduced the run time of sorting"
(Jon Bently, Programming Pearls, Cracking the oyster, p. 7)
binaryDomain <- list( smallsmall = rep(Domain["small"], 2) , smallbig=Domain , bigsmall=rev(Domain) , bigbig=rep(Domain["big"], 2) ) binarySample <- list( smallsmall = rep(Sample["small"], 2) , smallbig=Sample , bigsmall=rev(Sample) , bigbig=rep(Sample["big"], 2) ) M <- c("R","bit","merge") G <- c("sort","sortunique") D <- c("unsorted","sorted") sortM <- vector("list", length(M)*length(G)) dim(sortM) <- c(method=length(M), goal=length(G)) dimnames(sortM) <- list(method=M, goal=G) sortM[["R","sort"]] <- sort sortM[["R", "sortunique"]] <- function(x)sort(unique(x)) sortM[["bit","sort"]] <- bit_sort sortM[["bit","sortunique"]] <- bit_sort_unique timsort <- array(NA_integer_ , dim=c(M=2, G=length(G), D=length(D), N=length(Domain)) , dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain)) ) for(n in names(Domain)){ x <- sample(Domain[[n]], Sample[[n]], replace = TRUE) d <- "unsorted" for (m in c("R","bit")){ for (g in G){ timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time) } } x <- bit_sort(x) d <- "sorted" for (m in 1:2){ for (g in G){ timsort[m,g,d,n] <- median(microbenchmark(sortM[[m,g]](x), times=times)$time) } } } binaryU <- c("match","in","notin","union","intersect","setdiff","symdiff","setequal","setearly") binaryM <- vector("list", length(M)*length(binaryU)) dim(binaryM) <- c(method=length(M), task=length(binaryU)) dimnames(binaryM) <- list(method=M, task=binaryU) binaryM[["R","match"]] <- match binaryM[["merge","match"]] <- merge_match binaryM[["R","in"]] <- get("%in%") binaryM[["bit","in"]] <- bit_in binaryM[["merge","in"]] <- merge_in binaryM[["R","notin"]] <- function(x, y)!(x %in% y) binaryM[["bit","notin"]] <- function(x, y)!bit_in(x,y) binaryM[["merge","notin"]] <- merge_notin binaryM[["R","union"]] <- union binaryM[["bit","union"]] <- bit_union binaryM[["merge","union"]] <- merge_union binaryM[["R","intersect"]] <- intersect binaryM[["bit","intersect"]] <- bit_intersect binaryM[["merge","intersect"]] <- merge_intersect binaryM[["R","setdiff"]] <- setdiff binaryM[["bit","setdiff"]] <- bit_setdiff binaryM[["merge","setdiff"]] <- merge_setdiff binaryM[["R","symdiff"]] <- function(x,y)union(setdiff(x,y), setdiff(y,x)) binaryM[["bit","symdiff"]] <- bit_symdiff binaryM[["merge","symdiff"]] <- merge_symdiff binaryM[["R","setequal"]] <- function(x,y)setequal(x,x) # we compare x to x which avoids early termination and hence binaryM[["bit","setequal"]] <- function(x,y)bit_setequal(x,x) binaryM[["merge","setequal"]] <- function(x,y)merge_setequal(x,x) binaryM[["R","setearly"]] <- function(x,y)setequal(x,y) # we compare x to x which avoids early termination and hence binaryM[["bit","setearly"]] <- function(x,y)bit_setequal(x,y) binaryM[["merge","setearly"]] <- function(x,y)merge_setequal(x,y) unaryU <- c("unique","duplicated","anyDuplicated","sumDuplicated") unaryM <- vector("list", length(M)*length(unaryU)) dim(unaryM) <- c(method=length(M), task=length(unaryU)) dimnames(unaryM) <- list(method=M, task=unaryU) unaryM[["R","unique"]] <- unique unaryM[["bit","unique"]] <- bit_unique unaryM[["merge","unique"]] <- merge_unique unaryM[["R","duplicated"]] <- duplicated unaryM[["bit","duplicated"]] <- bit_duplicated unaryM[["merge","duplicated"]] <- merge_duplicated unaryM[["R","anyDuplicated"]] <- anyDuplicated unaryM[["bit","anyDuplicated"]] <- bit_anyDuplicated unaryM[["merge","anyDuplicated"]] <- merge_anyDuplicated unaryM[["R","sumDuplicated"]] <- function(x)sum(duplicated(x)) unaryM[["bit","sumDuplicated"]] <- bit_sumDuplicated unaryM[["merge","sumDuplicated"]] <- merge_sumDuplicated tim <- array(NA_integer_ , dim=c(M=length(M), U=length(unaryU)+length(binaryU), N=length(binaryDomain), D=length(D)) , dimnames=list(M=M, U=c(unaryU,binaryU), N=names(binaryDomain), D=D) ) for(n in names(binaryDomain)){ xnam <- names(binaryDomain[[n]])[1] ynam <- names(binaryDomain[[n]])[2] x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE) y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE) d <- "unsorted" if (length(x)==length(y)) for (u in unaryU){ for (m in setdiff(M,"merge")){ f <- unaryM[[m,u]] if (!is.null(f)) tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time) } } for (u in binaryU){ for (m in setdiff(M,"merge")){ f <- binaryM[[m,u]] if (!is.null(f)) tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time) } } x <- bit_sort(x) y <- bit_sort(y) d <- "sorted" if (length(x)==length(y)) for (u in unaryU){ for (m in M){ f <- unaryM[[m,u]] if (!is.null(f)){ tim[m,u,n,d] <- median(microbenchmark(f(x), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + tim["merge",u,n,"sorted"] } } } for (u in binaryU){ for (m in M){ f <- binaryM[[m,u]] if (!is.null(f)){ tim[m,u,n,d] <- median(microbenchmark(f(x,y), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") tim["merge",u,n,"unsorted"] <- timsort["bit","sort","unsorted",xnam] + timsort["bit","sort","unsorted",ynam] + tim["merge",u,n,"sorted"] } } } }
y <- timsort[,,,"big"] y <- 100 * y / max(y["R",,], na.rm=TRUE) oldpar <- par(mfrow=c(2,1), mar=c(5,8,2,1)) x <- y[,,"unsorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="unsorted", col=c("red","blue")) x <- y[,,"sorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R","b"), xlab="execution time", main="sorted", col=c("red","blue")) par(oldpar)
tim2 <- tim for (n in names(binaryDomain)) for (d in D) tim2[,,n,d] <- 100*tim[,,n,d]/max(tim["R",,n,d], na.rm=TRUE)
r pagebreak()
y <- tim2[,,"bigbig",] y <- 100 * y / max(y["R",,], na.rm=TRUE) x <- y[,,"unsorted"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0, max(100,max(y, na.rm=TRUE))), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'unsorted bigbig' scenario", sub="R='hash' b='bit' m='merge'")
x <- y[,,"sorted"] dotchart(x, xlim=c(0, max(y, na.rm=TRUE)), labels=m, pch=c("R","b","m"), col=c("red","blue","black"), main="Timings in 'sorted bigbig' scenario", sub="R='hash' b='bit' m='merge'")
r pagebreak()
x <- 100*timsort["bit",,,]/timsort["R",,,] s <- "sorted" knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)
s <- "unsorted" knitr::kable(x[,s,], caption=paste(s,"data relative to R's sort"), digits=1)
f <- function(u){ n <- c("smallsmall","bigbig") x <- tim[c("bit","merge","merge"),u,n,] dimnames(x)$M[3] <- "sort" dimnames(x)$N <- c("small","big") x["sort",,"unsorted"] <- timsort["bit","sort","unsorted",] x["sort",,"sorted"] <- 0 for (m in dimnames(x)$M) x[m,,] <- x[m,,] / tim["R",u,n,] * 100 x } x <- f("unique") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("duplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("anyDuplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("sumDuplicated") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
f <- function(u){ x <- tim[c("bit","merge","merge"),u,,] dimnames(x)$M[3] <- "sort" s <- timsort["bit","sort","unsorted",] x["sort",,"unsorted"] <- rep(s, c(2,2)) + c(s,s) x["sort",,"sorted"] <- 0 for (m in dimnames(x)$M) x[m,,] <- x[m,,] / tim["R",u,,] * 100 x } x <- f("match") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("in") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("notin") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("union") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("intersect") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("setdiff") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("symdiff") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("setequal") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
x <- f("setearly") s <- "sorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
s <- "unsorted" knitr::kable(x[,,s], caption=paste(s,"data relative to R"), digits=1)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.