R/highlevel64.R

Defines functions mean.integer64 quantile.integer64 qtile.integer64 qtile prank.integer64 prank rank.integer64 tiepos.integer64 tiepos keypos.integer64 keypos as.integer64.factor as.ordered.integer64 as.factor.integer64 table.integer64 unipos.integer64 unipos unique.integer64 duplicated.integer64 match.integer64 optimizer64 benchmark64

Documented in as.factor.integer64 as.integer64.factor as.ordered.integer64 benchmark64 duplicated.integer64 keypos keypos.integer64 match.integer64 mean.integer64 optimizer64 prank prank.integer64 qtile qtile.integer64 quantile.integer64 rank.integer64 table.integer64 tiepos tiepos.integer64 unipos unipos.integer64 unique.integer64

# /*
# R-Code for matching and other functions based on hashing
# S3 atomic 64bit integers for R
# (c) 2012 Jens Oehlschägel
# Licence: GPL2
# Provided 'as is', use at your own risk
# Created: 2011-12-11
# Last changed:  2011-12-11
# */

#! \name{benchmark64}
#! \alias{benchmark64}
#! \alias{optimizer64}
#! \title{
#!  Function for measuring algorithmic performance \cr 
#!  of high-level and low-level integer64 functions
#! }
#! \description{
#!  \code{benchmark64} compares high-level integer64 functions against the integer functions from Base R \cr
#!  \code{optimizer64} compares for each high-level integer64 function the Base R integer function with several low-level integer64 functions with and without caching \cr
#! }
#! \usage{
#! benchmark64(nsmall = 2^16, nbig = 2^25, timefun = repeat.time
#! )
#! optimizer64(nsmall = 2^16, nbig = 2^25, timefun = repeat.time
#! , what = c("match", "\%in\%", "duplicated", "unique", "unipos", "table", "rank", "quantile")
#! , uniorder = c("original", "values", "any")
#! , taborder = c("values", "counts")
#! , plot = TRUE
#! )
#! }
#! \arguments{
#!   \item{nsmall}{ size of smaller vector }
#!   \item{nbig}{ size of larger bigger vector }
#!   \item{timefun}{ a function for timing such as \code{\link[bit]{repeat.time}} or \code{\link{system.time}} }
#!   \item{what}{
#!  a vector of names of high-level functions
#! }
#!   \item{uniorder}{
#!  one of the order parameters that are allowed in \code{\link{unique.integer64}} and \code{\link{unipos.integer64}}
#! }
#!   \item{taborder}{
#!  one of the order parameters that are allowed in \code{\link{table.integer64}} 
#! }
#!   \item{plot}{
#!  set to FALSE to suppress plotting 
#! }
#! }
#! \details{
#!  \code{benchmark64} compares the following scenarios for the following use cases: 
#!  \tabular{rl}{
#!   \bold{scenario name} \tab \bold{explanation} \cr
#!   32-bit  \tab applying Base R function to 32-bit integer data \cr
#!   64-bit \tab applying bit64 function to 64-bit integer data (with no cache) \cr
#!   hashcache \tab dito when cache contains \code{\link{hashmap}}, see \code{\link{hashcache}} \cr
#!   sortordercache \tab dito when cache contains sorting and ordering, see \code{\link{sortordercache}} \cr
#!   ordercache \tab dito when cache contains ordering only, see \code{\link{ordercache}} \cr
#!   allcache \tab dito when cache contains sorting, ordering and hashing \cr
#!  }
#!  \tabular{rl}{
#!   \bold{use case name} \tab \bold{explanation} \cr
#!   cache         \tab filling the cache according to scenario \cr
#!   match(s,b)    \tab match small in big vector \cr
#!   s \%in\% b      \tab small \%in\% big vector \cr
#!   match(b,s)    \tab match big in small vector \cr
#!   b \%in\% s      \tab big \%in\% small vector \cr
#!   match(b,b)    \tab match big in (different) big vector \cr
#!   b \%in\% b      \tab big \%in\% (different) big vector \cr
#!   duplicated(b) \tab duplicated of big vector \cr
#!   unique(b)     \tab unique of big vector \cr
#!   table(b)      \tab table of big vector \cr
#!   sort(b)       \tab sorting of big vector \cr
#!   order(b)      \tab ordering of big vector \cr
#!   rank(b)       \tab ranking of big vector \cr
#!   quantile(b)   \tab quantiles of big vector \cr
#!   summary(b)    \tab summary of of big vector \cr
#!   SESSION       \tab exemplary session involving multiple calls (including cache filling costs) \cr
#!  }
#!  Note that the timings for the cached variants do \emph{not} contain the time costs of building the cache, except for the timing of the exemplary user session, where the cache costs are included in order to evaluate amortization. 
#! }
#! \value{
#!  \code{benchmark64} returns a matrix with elapsed seconds, different high-level tasks in rows and different scenarios to solve the task in columns. The last row named 'SESSION' contains the elapsed seconds of the exemplary sesssion.
#!  \cr
#!  \code{optimizer64} returns a dimensioned list with one row for each high-level function timed and two columns named after the values of the \code{nsmall} and \code{nbig} sample sizes. Each list cell contains a matrix with timings, low-level-methods in rows and three measurements \code{c("prep","both","use")} in columns. If it can be measured separately, \code{prep} contains the timing of preparatory work such as sorting and hashing, and \code{use} contains the timing of using the prepared work. If the function timed does both, preparation and use, the timing is in \code{both}.  
#! }
#! \author{
#!  Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!  \code{\link{integer64}}
#! }
#! \examples{
#! message("this small example using system.time does not give serious timings\n
#! this we do this only to run regression tests")
#! benchmark64(nsmall=2^7, nbig=2^13, timefun=function(expr)system.time(expr, gcFirst=FALSE))
#! optimizer64(nsmall=2^7, nbig=2^13, timefun=function(expr)system.time(expr, gcFirst=FALSE)
#! , plot=FALSE
#! )
#!\dontrun{
#! message("for real measurement of sufficiently large datasets run this on your machine")
#! benchmark64()
#! optimizer64()
#!}
#! message("let's look at the performance results on Core i7 Lenovo T410 with 8 GB RAM")
#! data(benchmark64.data)
#! print(benchmark64.data)
#! 
#! matplot(log2(benchmark64.data[-1,1]/benchmark64.data[-1,])
#! , pch=c("3", "6", "h", "s", "o", "a") 
#! , xlab="tasks [last=session]"
#! , ylab="log2(relative speed) [bigger is better]"
#! )
#! matplot(t(log2(benchmark64.data[-1,1]/benchmark64.data[-1,]))
#! , type="b", axes=FALSE 
#! , lwd=c(rep(1, 14), 3)
#! , xlab="context"
#! , ylab="log2(relative speed) [bigger is better]"
#! )
#! axis(1
#! , labels=c("32-bit", "64-bit", "hash", "sortorder", "order", "hash+sortorder")
#! , at=1:6
#! )
#! axis(2)
#! data(optimizer64.data)
#! print(optimizer64.data)
#! oldpar <- par(no.readonly = TRUE)
#! par(mfrow=c(2,1))
#! par(cex=0.7)
#! for (i in 1:nrow(optimizer64.data)){
#!  for (j in 1:2){
#!    tim <- optimizer64.data[[i,j]]
#!   barplot(t(tim))
#!   if (rownames(optimizer64.data)[i]=="match")
#!    title(paste("match", colnames(optimizer64.data)[j], "in", colnames(optimizer64.data)[3-j]))
#!   else if (rownames(optimizer64.data)[i]=="\%in\%")
#!    title(paste(colnames(optimizer64.data)[j], "\%in\%", colnames(optimizer64.data)[3-j]))
#!   else
#!    title(paste(rownames(optimizer64.data)[i], colnames(optimizer64.data)[j]))
#!  }
#! }
#! par(mfrow=c(1,1))
#!}
#! \keyword{ misc }

#! \name{benchmark64.data}
#! \alias{benchmark64.data}
#! \docType{data}
#! \title{
#!  Results of performance measurement on a Core i7 Lenovo T410 8 GB RAM under Windows 7 64bit
#! }
#! \description{
#!   These are the results of calling \code{\link{benchmark64}}
#! }
#! \usage{data(benchmark64.data)}
#! \format{
#!   The format is:
#!  num [1:16, 1:6] 2.55e-05 2.37 2.39 1.28 1.39 ...
#!  - attr(*, "dimnames")=List of 2
#!   ..$ : chr [1:16] "cache" "match(s,b)" "s \%in\% b" "match(b,s)" ...
#!   ..$ : chr [1:6] "32-bit" "64-bit" "hashcache" "sortordercache" ...
#! }
#! \examples{
#! data(benchmark64.data)
#! print(benchmark64.data)
#! matplot(log2(benchmark64.data[-1,1]/benchmark64.data[-1,])
#! , pch=c("3", "6", "h", "s", "o", "a")
#! , xlab="tasks [last=session]"
#! , ylab="log2(relative speed) [bigger is better]"
#! )
#! matplot(t(log2(benchmark64.data[-1,1]/benchmark64.data[-1,]))
#! , axes=FALSE
#! , type="b"
#! , lwd=c(rep(1, 14), 3)
#! , xlab="context"
#! , ylab="log2(relative speed) [bigger is better]"
#! )
#! axis(1
#! , labels=c("32-bit", "64-bit", "hash", "sortorder", "order", "hash+sortorder")
#! , at=1:6
#! )
#! axis(2)
#! }
#! \keyword{datasets}


#! \name{optimizer64.data}
#! \alias{optimizer64.data}
#! \docType{data}
#! \title{
#!  Results of performance measurement on a Core i7 Lenovo T410 8 GB RAM under Windows 7 64bit
#! }
#! \description{
#!   These are the results of calling \code{\link{optimizer64}}
#! }
#! \usage{data(optimizer64.data)}
#! \format{
#!   The format is:
#! List of 16
#!  $ : num [1:9, 1:3] 0 0 1.63 0.00114 2.44 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:9] "match" "match.64" "hashpos" "hashrev" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:10, 1:3] 0 0 0 1.62 0.00114 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:10] "\%in\%" "match.64" "\%in\%.64" "hashfin" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:10, 1:3] 0 0 0.00105 0.00313 0.00313 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:10] "duplicated" "duplicated.64" "hashdup" "sortorderdup1" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:15, 1:3] 0 0 0 0.00104 0.00104 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:15] "unique" "unique.64" "hashmapuni" "hashuni" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:14, 1:3] 0 0 0 0.000992 0.000992 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:14] "unique" "unipos.64" "hashmapupo" "hashupo" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:13, 1:3] 0 0 0 0 0.000419 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:13] "tabulate" "table" "table.64" "hashmaptab" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:7, 1:3] 0 0 0 0.00236 0.00714 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:7] "rank" "rank.keep" "rank.64" "sortorderrnk" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:6, 1:3] 0 0 0.00189 0.00714 0 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:6] "quantile" "quantile.64" "sortqtl" "orderqtl" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:9, 1:3] 0 0 0.00105 1.17 0 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:9] "match" "match.64" "hashpos" "hashrev" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:10, 1:3] 0 0 0 0.00104 1.18 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:10] "\%in\%" "match.64" "\%in\%.64" "hashfin" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:10, 1:3] 0 0 1.64 2.48 2.48 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:10] "duplicated" "duplicated.64" "hashdup" "sortorderdup1" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:15, 1:3] 0 0 0 1.64 1.64 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:15] "unique" "unique.64" "hashmapuni" "hashuni" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:14, 1:3] 0 0 0 1.62 1.62 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:14] "unique" "unipos.64" "hashmapupo" "hashupo" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:13, 1:3] 0 0 0 0 0.32 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:13] "tabulate" "table" "table.64" "hashmaptab" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:7, 1:3] 0 0 0 2.96 10.69 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:7] "rank" "rank.keep" "rank.64" "sortorderrnk" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  $ : num [1:6, 1:3] 0 0 1.62 10.61 0 ...
#!   ..- attr(*, "dimnames")=List of 2
#!   .. ..$ : chr [1:6] "quantile" "quantile.64" "sortqtl" "orderqtl" ...
#!   .. ..$ : chr [1:3] "prep" "both" "use"
#!  - attr(*, "dim")= int [1:2] 8 2
#!  - attr(*, "dimnames")=List of 2
#!   ..$ : chr [1:8] "match" "\%in\%" "duplicated" "unique" ...
#!   ..$ : chr [1:2] "65536" "33554432"
#! }
#! \examples{
#! data(optimizer64.data)
#! print(optimizer64.data)
#! oldpar <- par(no.readonly = TRUE)
#! par(mfrow=c(2,1))
#! par(cex=0.7)
#! for (i in 1:nrow(optimizer64.data)){
#!  for (j in 1:2){
#!    tim <- optimizer64.data[[i,j]]
#!   barplot(t(tim))
#!   if (rownames(optimizer64.data)[i]=="match")
#!    title(paste("match", colnames(optimizer64.data)[j], "in", colnames(optimizer64.data)[3-j]))
#!   else if (rownames(optimizer64.data)[i]=="\%in\%")
#!    title(paste(colnames(optimizer64.data)[j], "\%in\%", colnames(optimizer64.data)[3-j]))
#!   else
#!    title(paste(rownames(optimizer64.data)[i], colnames(optimizer64.data)[j]))
#!  }
#! }
#! par(mfrow=c(1,1))
#! }
#! \keyword{datasets}


benchmark64 <- function(nsmall=2^16, nbig=2^25, timefun=repeat.time)
{
 
 message('\ncompare performance for a complete sessions of calls')
 s <- sample(nbig, nsmall, TRUE)
 b <- sample(nbig, nbig, TRUE)
 b2 <- sample(nbig, nbig, TRUE)
 
 tim1 <- double(6)
 names(tim1) <- c("32-bit","64-bit","hashcache","sortordercache","ordercache","allcache")

 s <- as.integer(s)
 b <- as.integer(b)
 b2 <- as.integer(b2)
 
 i <- 1
 for (i in 1:6){
  message("\n=== ", names(tim1)[i], " ===")
  
  if (i==2){
   s <- as.integer64(s)
   b <- as.integer64(b)
   b2 <- as.integer64(b2)
  }

  tim1[i] <- 0
 
  tim1[i] <- tim1[i] + timefun({
   switch(as.character(i)
   , "3" = {hashcache(s); hashcache(b); hashcache(b2)}
   , "4" = {sortordercache(s); sortordercache(b); sortordercache(b2)}
   , "5" = {ordercache(s); ordercache(b); ordercache(b2)}
   , "6" = {hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2)}
   )
  })[3]
 
  message('check data range, mean etc.')
  tim1[i] <- tim1[i] + timefun({
   summary(b)
  })[3]
  message('get all percentiles for plotting distribution shape')
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=seq(0, 1, 0.01))
  })[3]
  message('list the upper and lower permille of values')
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=c(0.001, 0.999))
   sort(b, na.last=NA)
  })[3]
  message('OK, for some of these values I want to see the complete ROW, so I need their positions in the data.frame')
  tim1[i] <- tim1[i] + timefun({
   if(i==1)order(b) else order.integer64(b)
  })[3]
  message('check if any values are duplicated')
  tim1[i] <- tim1[i] + timefun({
   any(duplicated(b))
  })[3]
  message('since not unique, then check distribution of frequencies')
  tim1[i] <- tim1[i] + timefun({
   if(i==1)tabulate(table(b, exclude=NULL)) else tabulate(table.integer64(b, return='list')$counts)
  })[3]
  message("OK, let's plot the percentiles of unique values versus the percentiles allowing for duplicates")
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=seq(0, 1, 0.01))
   quantile(unique(b), probs=seq(0, 1, 0.01))
  })[3]
  message('check whether we find a match for each fact in the dimension table')
  tim1[i] <- tim1[i] + timefun({
   all(if(i==1) b %in% s else "%in%.integer64"(b, s))
  })[3]
  message('check whether there are any dimension table entries not in the fact table')
  tim1[i] <- tim1[i] + timefun({
   all(if(i==1) s %in% b else "%in%.integer64"(s, b))
  })[3]
  message('check whether we find a match for each fact in a parallel fact table')
  tim1[i] <- tim1[i] + timefun({
   all(if(i==1) b %in% b2 else "%in%.integer64"(b, b2))
  })[3]
  message('find positions of facts in dimension table for joining')
  tim1[i] <- tim1[i] + timefun({
   if(i==1) match(b, s) else match.integer64(b, s)
  })[3]
  message('find positions of facts in parallel fact table for joining')
  tim1[i] <- tim1[i] + timefun({
   if(i==1) match(b, b2) else match.integer64(b, b2)
  })[3]
  message('out of curiosity: how well rank-correlated are fact and parallel fact table?')
  tim1[i] <- tim1[i] + timefun({
   if (i==1){
    cor(rank(b, na.last="keep"), rank(b2, na.last="keep"), use="na.or.complete")
   }else{
    cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete")
   }
  })[3]
  
  remcache(s)
  remcache(b)
  remcache(b2)
  
  print(round(rbind(seconds=tim1, factor=tim1[1]/tim1), 3))
 
 }

        # 32-bit         64-bit      hashcache sortordercache     ordercache       allcache 
       # 196.510          8.963          8.242          5.183         12.325          6.043 
        # 32-bit         64-bit      hashcache sortordercache     ordercache       allcache 
         # 1.000         21.924         23.842         37.913         15.944         32.519 

   
 message("\nnow let's look more systematically at the components involved")
 s <- sample(nbig, nsmall, TRUE)
 b <- sample(nbig, nbig, TRUE)
 b2 <- sample(nbig, nbig, TRUE)
 
 tim2 <- matrix(0, 15, 6)
 dimnames(tim2) <- list(c("cache", "match(s,b)", "s %in% b", "match(b,s)", "b %in% s", "match(b,b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)")
 , c("32-bit","64-bit","hashcache","sortordercache","ordercache","allcache"))

 s <- as.integer(s)
 b <- as.integer(b)
 b2 <- as.integer(b2)
 
 i <- 1
 for (i in 1:6){
  if (i==2){
   s <- as.integer64(s)
   b <- as.integer64(b)
   b2 <- as.integer64(b2)
  }
 
  if (i>2)message(colnames(tim2)[i], " cache")
  tim2["cache",i] <- timefun({
   switch(as.character(i)
   , "3" = {hashcache(s); hashcache(b); hashcache(b2)}
   , "4" = {sortordercache(s); sortordercache(b); sortordercache(b2)}
   , "5" = {ordercache(s); ordercache(b); ordercache(b2)}
   , "6" = {hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2)}
   )
  })[3]
 
  message(colnames(tim2)[i], " match(s,b)")
  tim2["match(s,b)",i] <- timefun({
   if (i==1) match(s, b) else match.integer64(s, b)
  })[3]
 
  message(colnames(tim2)[i], " s %in% b")
  tim2["s %in% b",i] <- timefun({
   if (i==1) s %in% b else "%in%.integer64"(s,b)
  })[3]
 
  message(colnames(tim2)[i], " match(b,s)")
  tim2["match(b,s)",i] <- timefun({
   if (i==1) match(b, s) else match.integer64(b, s)
  })[3]
 
  message(colnames(tim2)[i], " b %in% s")
  tim2["b %in% s",i] <- timefun({
   if (i==1) b %in% s else "%in%.integer64"(b,s)
  })[3]
 
  message(colnames(tim2)[i], " match(b,b)")
  tim2["match(b,b)",i] <- timefun({
   if (i==1) match(b, b2) else match.integer64(b, b2)
  })[3]
 
  message(colnames(tim2)[i], " b %in% b")
  tim2["b %in% b",i] <- timefun({
   if (i==1) b %in% b2 else "%in%.integer64"(b,b2)
  })[3]
 
  message(colnames(tim2)[i], " duplicated(b)")
  tim2["duplicated(b)",i] <- timefun({
   duplicated(b)
  })[3]
 
  message(colnames(tim2)[i], " unique(b)")
  tim2["unique(b)",i] <- timefun({
   unique(b)
  })[3]
 
  message(colnames(tim2)[i], " table(b)")
  tim2["table(b)",i] <- timefun({
   if(i==1) table(b) else table.integer64(b, return='list')
  })[3]
 
  message(colnames(tim2)[i], " sort(b)")
  tim2["sort(b)",i] <- timefun({
   sort(b)
  })[3]
 
  message(colnames(tim2)[i], " order(b)")
  tim2["order(b)",i] <- timefun({
   if(i==1) order(b) else order.integer64(b)
  })[3]
 
  message(colnames(tim2)[i], " rank(b)")
  tim2["rank(b)",i] <- timefun({
   if(i==1) rank(b) else rank.integer64(b)
  })[3]
 
  message(colnames(tim2)[i], " quantile(b)")
  tim2["quantile(b)",i] <- timefun({
   quantile(b)
  })[3]
 
  message(colnames(tim2)[i], " summary(b)")
  tim2["summary(b)",i] <- timefun({
   summary(b)
  })[3]
  
  remcache(s)
  remcache(b)
  remcache(b2)
  
  tim3 <- rbind(tim2, SESSION=tim1)
  #tim2 <- tim2[,1]/tim2
  
  cat("seconds")
  print(round(tim3, 3))
  cat("factor")
  print(round(tim3[,1]/tim3, 3))
 
 }


 
               # 32-bit 64-bit hashcache sortordercache ordercache allcache
# cache           0.000  0.000     0.775          1.330      6.500    2.660
# match(s,b)      0.820  0.218     0.004          0.025      0.093    0.004
# s %in% b        0.810  0.234     0.003          0.022      0.093    0.003
# match(b,s)      0.450  0.228     0.232          0.224      0.224    0.226
# b %in% s        0.510  0.226     0.224          0.222      0.218    0.222
# match(b,b)      2.370  0.870     0.505          0.890      0.880    0.505
# b %in% b        2.350  0.850     0.480          0.865      0.870    0.483
# duplicated(b)   0.875  0.510     0.141          0.116      0.383    0.117
# unique(b)       0.930  0.555     0.447          0.156      0.427    0.450
# table(b)      110.340  0.725     0.680          0.234      0.575    0.202
# sort(b)         2.440  0.400     0.433          0.072      0.460    0.069
# order(b)       12.780  0.680     0.615          0.036      0.036    0.035
# rank(b)        13.480  0.860     0.915          0.240      0.545    0.246
# quantile(b)     0.373  0.400     0.410          0.000      0.000    0.000
# summary(b)      0.645  0.423     0.427          0.016      0.016    0.016
# TOTAL         149.173  7.179     6.291          4.448     11.320    5.239
              # 32-bit  64-bit hashcache sortordercache ordercache allcache
# cache              1   1.062     0.000          0.000      0.000    0.000
# match(s,b)         1   3.761   230.420         32.475      8.843  217.300
# s %in% b           1   3.462   234.090         36.450      8.735  237.386
# match(b,s)         1   1.974     1.940          2.009      2.009    1.991
# b %in% s           1   2.257     2.277          2.297      2.339    2.297
# match(b,b)         1   2.724     4.693          2.663      2.693    4.693
# b %in% b           1   2.765     4.896          2.717      2.701    4.862
# duplicated(b)      1   1.716     6.195          7.572      2.283    7.500
# unique(b)          1   1.676     2.082          5.972      2.180    2.067
# table(b)           1 152.193   162.265        471.538    191.896  546.238
# sort(b)            1   6.100     5.631         33.822      5.304   35.534
# order(b)           1  18.794    20.780        357.840    354.297  366.950
# rank(b)            1  15.674    14.732         56.167     24.734   54.797
# quantile(b)        1   0.933     0.911        804.907    806.027  810.133
# summary(b)         1   1.524     1.512         39.345     39.345   39.345
# TOTAL              1  20.778    23.712         33.534     13.177   28.476 

  tim3
}


optimizer64 <- function(nsmall=2^16, nbig=2^25, timefun=repeat.time
, what=c("match","%in%","duplicated","unique","unipos","table","rank","quantile")
, uniorder = c("original", "values", "any")
, taborder = c("values", "counts")
, plot = TRUE
)
{
 uniorder <- match.arg(uniorder)
 taborder <- match.arg(taborder)
 ret <- vector("list", 2*length(what))
 dim(ret) <- c(length(what), 2L)
 dimnames(ret) <- list(what, c(nsmall, nbig))
 
 if (plot){
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))
  par(mfrow=c(2,1))
 }
 
 if ("match" %in% what){
  message("match: timings of different methods")
  N1 <- c(nsmall, nbig)
  N2 <- c(nbig, nsmall)
  for (i in seq_along(N1)){
   n1 <- N1[i]
   n2 <- N2[i]
   x1 <- c(sample(n2, n1-1, TRUE), NA)
   x2 <- c(sample(n2, n2-1, TRUE), NA)
   tim <- matrix(0, 9, 3)
   dimnames(tim) <- list(c("match","match.64","hashpos","hashrev","sortorderpos","orderpos","hashcache","sortorder.cache","order.cache"), c("prep","both","use"))

   tim["match","both"] <- timefun({
    p <- match(x1, x2)
   })[3]
   x1 <- as.integer64(x1)
   x2 <- as.integer64(x2)

   tim["match.64","both"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3]
   stopifnot(identical(p2, p))

   tim["hashpos","prep"] <- timefun({
    h2 <- hashmap(x2)
   })[3]
   tim["hashpos","use"] <- timefun({
    p2 <- hashpos(h2, x1)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["hashrev","prep"] <- timefun({
    h1 <- hashmap(x1)
   })[3]
   tim["hashrev","use"] <- timefun({
    p1 <- hashrev(h1, x2)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["sortorderpos","prep"] <- system.time({
    s2 <- clone(x2)
    o2 <- seq_along(x2)
    ramsortorder(s2, o2, na.last=FALSE)
   })[3]
   tim["sortorderpos","use"] <- timefun({
    p2 <- sortorderpos(s2, o2, x1)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["orderpos","prep"] <- timefun({
    o2 <- seq_along(x2)
    ramorder(x2, o2, na.last=FALSE)
   })[3]
   tim["orderpos","use"] <- timefun({
    p2 <- orderpos(x2, o2, x1, method=2)
   })[3]
   stopifnot(identical(p2, p))
   
   hashcache(x2)
   tim["hashcache","use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)
   
   sortordercache(x2)
   tim["sortorder.cache","use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)
   
   ordercache(x2)
   tim["order.cache","use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)

   if (plot){
    barplot(t(tim))
    n <- format(c(n1, n2))
    title(paste("match", n[1], "in", n[2]))
   }
   
   ret[["match",as.character(n1)]] <- tim
  }
 }

 if ("%in%" %in% what){
  message("%in%: timings of different methods")
  N1 <- c(nsmall, nbig)
  N2 <- c(nbig, nsmall)
  for (i in seq_along(N1)){
   n1 <- N1[i]
   n2 <- N2[i]
   x1 <- c(sample(n2, n1-1, TRUE), NA)
   x2 <- c(sample(n2, n2-1, TRUE), NA)
   tim <- matrix(0, 10, 3)
   dimnames(tim) <- list(c("%in%","match.64","%in%.64","hashfin","hashrin","sortfin","orderfin","hash.cache","sortorder.cache","order.cache"), c("prep","both","use"))

   tim["%in%","both"] <- timefun({
    p <- x1 %in% x2
   })[3]
   x1 <- as.integer64(x1)
   x2 <- as.integer64(x2)

   tim["match.64","both"] <- timefun({
    p2 <- match.integer64(x1,x2, nomatch = 0L) > 0L
   })[3]
   stopifnot(identical(p2, p))

   tim["%in%.64","both"] <- timefun({
    p2 <- "%in%.integer64"(x1,x2) # this is using the custom version
   })[3]
   stopifnot(identical(p2, p))

   tim["hashfin","prep"] <- timefun({
    h2 <- hashmap(x2)
   })[3]
   tim["hashfin","use"] <- timefun({
    p2 <- hashfin(h2, x1)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["hashrin","prep"] <- timefun({
    h1 <- hashmap(x1)
   })[3]
   tim["hashrin","use"] <- timefun({
    p1 <- hashrin(h1, x2)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["sortfin","prep"] <- timefun({
    s2 <- clone(x2)
    ramsort(s2, na.last=FALSE)
   })[3]
   tim["sortfin","use"] <- timefun({
    p2 <- sortfin(s2, x1)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["orderfin","prep"] <- timefun({
    o2 <- seq_along(x2)
    ramorder(x2, o2, na.last=FALSE)
   })[3]
   tim["orderfin","use"] <- timefun({
    p2 <- orderfin(x2, o2, x1)
   })[3]
   stopifnot(identical(p2, p))
   
   hashcache(x2)
   tim["hash.cache","use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)
   
   sortordercache(x2)
   tim["sortorder.cache","use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)
   
   ordercache(x2)
   tim["order.cache","use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x2)
   
   if (plot){
    barplot(t(tim))
    n <- format(c(n1, n2))
    title(paste(n[1], "%in%", n[2]))
   }
    
   ret[["%in%",as.character(n1)]] <- tim
  }
 }
 if ("duplicated" %in% what){
  message("duplicated: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)){
   n <- N[i]
   x <- c(sample(n, n-1, TRUE), NA)
   tim <- matrix(0, 10, 3)
   dimnames(tim) <- list(c("duplicated","duplicated.64","hashdup","sortorderdup1","sortorderdup2","orderdup1","orderdup2"
    ,"hash.cache","sortorder.cache","order.cache")
   , c("prep","both","use"))

   tim["duplicated","both"] <- timefun({
    p <- duplicated(x)
   })[3]
   x <- as.integer64(x)

   tim["duplicated.64","both"] <- timefun({
    p2 <- duplicated(x)
   })[3]
   stopifnot(identical(p2, p))

   tim["hashdup","prep"] <- timefun({
    h <- hashmap(x)
   })[3]
   tim["hashdup","use"] <- timefun({
    p2 <- hashdup(h)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["sortorderdup1","prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sortorderdup1","use"] <- timefun({
    p2 <- sortorderdup(s, o, method=1)
   })[3]
   stopifnot(identical(p2, p))
    
   tim["sortorderdup2","prep"] <- tim["sortorderdup1","prep"]
   tim["sortorderdup2","use"] <- timefun({
    p2 <- sortorderdup(s, o, method=2)
   })[3]
   stopifnot(identical(p2, p))
    
   tim["orderdup1","prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x,o)[1]
   })[3]
   tim["orderdup1","use"] <- timefun({
    p2 <- orderdup(x, o, method=1)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["orderdup2","prep"] <- tim["orderdup1","prep"]
   tim["orderdup2","use"] <- timefun({
    p2 <- orderdup(x, o, method=2)
   })[3]
   stopifnot(identical(p2, p))
   
   hashcache(x)
   tim["hash.cache","use"] <- timefun({
    p2 <- duplicated(x)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache","use"] <- timefun({
    p2 <- duplicated(x)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache","use"] <- timefun({
    p2 <- duplicated(x)
   })[3]
   stopifnot(identical(p2, p))
   remcache(x)
   
   if (plot){
    barplot(t(tim), cex.names=0.7)
    title(paste("duplicated(",n,")", sep=""))
   }
   
   ret[["duplicated",as.character(n)]] <- tim
  }
 }
 if ("unique" %in% what){
  message("unique: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)){
   n <- N[i]
   x <- c(sample(n, n-1, TRUE), NA)
   tim <- matrix(0, 15, 3)
   dimnames(tim) <- list(
   c("unique","unique.64","hashmapuni","hashuni","hashunikeep","sortuni","sortunikeep","orderuni","orderunikeep","hashdup","sortorderdup"
    ,"hash.cache","sort.cache","sortorder.cache","order.cache")
   , c("prep","both","use"))

   tim["unique","both"] <- timefun({
    p <- unique(x)
   })[3]
   x <- as.integer64(x)
   p <- as.integer64(p)
   if (uniorder=="values")
    ramsort(p, na.last=FALSE)

   tim["unique.64","both"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))

   tim["hashmapuni","both"] <- timefun({
    p2 <- hashmapuni(x)
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))
   
   tim["hashuni","prep"] <- timefun({
    h <- hashmap(x)
    # for(r in 1:r)h <- hashmap(x, nunique=h$nunique)
   })[3]
   tim["hashuni","use"] <- timefun({
    p2 <- hashuni(h)
   })[3]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))
   
   tim["hashunikeep","prep"] <- tim["hashuni","prep"] 
   tim["hashunikeep","use"] <- timefun({
    p2 <- hashuni(h, keep.order=TRUE)
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["sortuni","prep"] <- timefun({
    s <- clone(x)
    ramsort(s, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sortuni","use"] <- timefun({
    p2 <- sortuni(s, nunique)
   })[3]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))
   
   tim["sortunikeep","prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sortunikeep","use"] <- timefun({
    p2 <- sortorderuni(x, s, o, nunique)
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))
    
   tim["orderuni","prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x,o)[1]
   })[3]
   tim["orderuni","use"] <- timefun({
    p2 <- orderuni(x, o, nunique)
   })[3]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))
   
   tim["orderunikeep","prep"] <- tim["orderuni","prep"]
   tim["orderunikeep","use"] <- timefun({
    p2 <- orderuni(x, o, nunique, keep.order=TRUE)
    nunique <- ordernut(x,o)[1]
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["hashdup","prep"] <- tim["hashuni","prep"]
   tim["hashdup","use"] <- timefun({
    p2 <- x[!hashdup(h)]
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["sortorderdup","prep"] <- tim["sortunikeep","prep"]
   tim["sortorderdup","use"] <- timefun({
    p2 <- x[!sortorderdup(s, o)]
   })[3]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   
   hashcache(x)
   tim["hash.cache","use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   sortcache(x)
   tim["sort.cache","use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache","use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache","use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)
   
   if (plot){
    barplot(t(tim), cex.names=0.7)
    title(paste("unique(",n,", order=",uniorder,")", sep=""))
   }
   
   ret[["unique",as.character(n)]] <- tim
  }
 }
 if ("unipos" %in% what){
  message("unipos: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)){
   n <- N[i]
   x <- c(sample(n, n-1, TRUE), NA)
   tim <- matrix(0, 14, 3)
   dimnames(tim) <- list(
   c("unique","unipos.64","hashmapupo","hashupo","hashupokeep","sortorderupo","sortorderupokeep","orderupo","orderupokeep","hashdup","sortorderdup"
    ,"hash.cache","sortorder.cache","order.cache")
   , c("prep","both","use"))

   tim["unique","both"] <- timefun({
    unique(x)
   })[3]
   x <- as.integer64(x)

   tim["unipos.64","both"] <- timefun({
    p <- unipos(x, order=uniorder)
   })[3]

   tim["hashmapupo","both"] <- timefun({
    p2 <- hashmapupo(x)
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))
   
   tim["hashupo","prep"] <- timefun({
    h <- hashmap(x)
    # if nunique is small we could re-build the hashmap at a smaller size
    # h <- hashmap(x, nunique=h$nunique)
   })[3]
   tim["hashupo","use"] <- timefun({
    p2 <- hashupo(h)
   })[3]
   if (uniorder=="values")
    stopifnot(identical(sort(p2, na.last=FALSE), sort(p, na.last=FALSE)))
   
   tim["hashupokeep","prep"] <- tim["hashupo","prep"] 
   tim["hashupokeep","use"] <- timefun({
    p2 <- hashupo(h, keep.order=TRUE)
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   
   tim["sortorderupo","prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sortorderupo","use"] <- timefun({
    p2 <- sortorderupo(s, o, nunique)
   })[3]
   if (uniorder=="values")
    stopifnot(identical(p2, p))
    
   tim["sortorderupokeep","prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sortorderupokeep","use"] <- timefun({
    p2 <- sortorderupo(s, o, nunique, keep.order=TRUE)
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))
    
   tim["orderupo","prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x,o)[1]
   })[3]
   tim["orderupo","use"] <- timefun({
    p2 <- orderupo(x, o, nunique)
   })[3]
   if (uniorder=="values")
    stopifnot(identical(p2, p))
   
   tim["orderupokeep","prep"] <- tim["orderupo","prep"]
   tim["orderupokeep","use"] <- timefun({
    p2 <- orderupo(x, o, nunique, keep.order=TRUE)
    nunique <- ordernut(x,o)[1]
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["hashdup","prep"] <- tim["hashupo","prep"]
   tim["hashdup","use"] <- timefun({
    p2 <- (1:n)[!hashdup(h)]
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["sortorderdup","prep"] <- tim["sortorderupokeep","prep"]
   tim["sortorderdup","use"] <- timefun({
    p2 <- (1:n)[!sortorderdup(s, o)]
   })[3]
   if (uniorder=="original")
    stopifnot(identical(p2, p))
   
   hashcache(x)
   tim["hash.cache","use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache","use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache","use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)
   
   if (plot){
    barplot(t(tim), cex.names=0.7)
    title(paste("unipos(",n,", order=",uniorder,")", sep=""))
   }
   
   ret[["unipos",as.character(n)]] <- tim
  }
 }
 if ("table" %in% what){
  message("table: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)){
   n <- N[i]
   x <- c(sample(1024, n-1, TRUE), NA)
   tim <- matrix(0, 13, 3)
   dimnames(tim) <- list(c("tabulate","table","table.64","hashmaptab","hashtab","hashtab2","sorttab","sortordertab","ordertab","ordertabkeep"
    ,"hash.cache","sort.cache","order.cache")
   , c("prep","both","use"))

   tim["tabulate","both"] <- timefun({
    tabulate(x)
   })[3]
   
   tim["table","both"] <- timefun({
    p <- table(x, exclude=NULL)
   })[3]
   p <- p[-length(p)]
   
   x <- as.integer64(x)

   tim["table.64","both"] <- timefun({
    p2 <- table.integer64(x, order=taborder)
   })[3]
   p2 <- p2[-1]
   stopifnot(identical(p2, p))

   tim["hashmaptab","both"] <- timefun({
    p <- hashmaptab(x)
   })[3]
   
   tim["hashtab","prep"] <- timefun({
    h <- hashmap(x)
   })[3]
   tim["hashtab","use"] <- timefun({
    p2 <- hashtab(h)
   })[3]
   stopifnot(identical(p2, p))
   
   tim["hashtab2","prep"] <- tim["hashtab","prep"] + timefun({
    h <- hashmap(x, nunique=h$nunique)
   })[3]
   tim["hashtab2","use"] <- timefun({
    p2 <- hashtab(h)
   })[3]
   
   sortp <- function(p){
    s <- p$values
    o <- seq_along(s)
    ramsortorder(s,o, na.last=FALSE)
    list(values=s, counts=p$counts[o])
   }
   p <- sortp(p)
   p2 <- sortp(p2)
   stopifnot(identical(p2, p))
   
   tim["sorttab","prep"] <- timefun({
    s <- clone(x)
    ramsort(s, na.last=FALSE)
    nunique <- sortnut(s)[1]
   })[3]
   tim["sorttab","use"] <- timefun({
    p2 <- list(values=sortuni(s, nunique), counts=sorttab(s, nunique))
   })[3]
   stopifnot(identical(p2, p))
    
   tim["sortordertab","prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1]
  	})[3]
			tim["sortordertab","use"] <- timefun({
				p2 <- list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o))
			})[3]
			p2 <- sortp(p2)
			stopifnot(identical(p2, p))
				
			tim["ordertab","prep"] <- timefun({
				o <- seq_along(x)
				ramorder(x, o, na.last=FALSE)
				nunique <- ordernut(x, o)[1]
			})[3]
			tim["ordertab","use"] <- timefun({
				p2 <- list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique))
			})[3]
			stopifnot(identical(p2, p))
				
			tim["ordertabkeep","prep"] <- tim["ordertab","prep"] 
			tim["ordertabkeep","use"] <- timefun({
				p2 <- list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE))
			})[3]
			p2 <- sortp(p2)
			stopifnot(identical(p2, p))
			
			hashcache(x)
			tim["hash.cache","use"] <- timefun({
				p <- table.integer64(x, order=taborder)
			})[3]
			remcache(x)

			sortordercache(x)
			tim["sort.cache","use"] <- timefun({
				p2 <- table.integer64(x, order=taborder)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)

			ordercache(x)
			tim["order.cache","use"] <- timefun({
				p2 <- table.integer64(x, order=taborder)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)
			
			if (plot){
				barplot(t(tim), cex.names=0.7)
				title(paste("table.integer64(",n,", order=",taborder,")", sep=""))
			}
			
			ret[["table",as.character(n)]] <- tim
		}
	}
	if ("rank" %in% what){
		message("rank: timings of different methods")
		N <- c(nsmall, nbig)
		for (i in seq_along(N)){
			n <- N[i]
			x <- c(sample(n, n-1, TRUE), NA)
			tim <- matrix(0, 7, 3)
			dimnames(tim) <- list(c("rank","rank.keep","rank.64","sortorderrnk","orderrnk"
				,"sort.cache","order.cache")
			, c("prep","both","use"))

			tim["rank","both"] <- timefun({
				rank(x)
			})[3]
			
			tim["rank.keep","both"] <- timefun({
				p <- rank(x, na.last="keep")
			})[3]
			
			x <- as.integer64(x)

			tim["rank.64","both"] <- timefun({
				p2 <- rank.integer64(x)
			})[3]
			stopifnot(identical(p2, p))
				
			tim["sortorderrnk","prep"] <- timefun({
				s <- clone(x)
				o <- seq_along(x)
				na.count <- ramsortorder(s, o, na.last=FALSE)
			})[3]
			tim["sortorderrnk","use"] <- timefun({
				p2 <- sortorderrnk(s, o, na.count)
			})[3]
			stopifnot(identical(p2, p))
				
			tim["orderrnk","prep"] <- timefun({
				o <- seq_along(x)
				na.count <- ramorder(x, o, na.last=FALSE)
			})[3]
			tim["orderrnk","use"] <- timefun({
				p2 <- orderrnk(x, o, na.count)
			})[3]
			stopifnot(identical(p2, p))
				
			sortordercache(x)
			tim["sort.cache","use"] <- timefun({
				p2 <- rank.integer64(x)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)

			ordercache(x)
			tim["order.cache","use"] <- timefun({
				p2 <- rank.integer64(x)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)
			
			if (plot){
				barplot(t(tim), cex.names=0.7)
				title(paste("rank.integer64(",n,")", sep=""))
			}
			
			ret[["rank",as.character(n)]] <- tim
		}
	}
	if ("quantile" %in% what){
		message("quantile: timings of different methods")
		N <- c(nsmall, nbig)
		for (i in seq_along(N)){
			n <- N[i]
			x <- c(sample(n, n-1, TRUE), NA)
			tim <- matrix(0, 6, 3)
			dimnames(tim) <- list(c("quantile","quantile.64","sortqtl","orderqtl"
				,"sort.cache","order.cache")
			, c("prep","both","use"))

			tim["quantile","both"] <- timefun({
				p <- quantile(x, type=1, na.rm=TRUE)
			})[3]
			p2 <- p
			p <- as.integer64(p2)
			names(p) <- names(p2)
			
			x <- as.integer64(x)

			tim["quantile.64","both"] <- timefun({
				p2 <- quantile(x, na.rm=TRUE)
			})[3]
			stopifnot(identical(p2, p))
				
			tim["sortqtl","prep"] <- timefun({
				s <- clone(x)
				na.count <- ramsort(s, na.last=FALSE)
			})[3]
			tim["sortqtl","use"] <- timefun({
				p2 <- sortqtl(s, na.count, seq(0, 1, 0.25))
			})[3]
			stopifnot(identical(unname(p2), unname(p)))
				
			tim["orderqtl","prep"] <- timefun({
				o <- seq_along(x)
				na.count <- ramorder(x, o, na.last=FALSE)
			})[3]
			tim["orderqtl","use"] <- timefun({
				p2 <- orderqtl(x, o, na.count, seq(0, 1, 0.25))
			})[3]
			stopifnot(identical(unname(p2), unname(p)))
				
			sortordercache(x)
			tim["sort.cache","use"] <- timefun({
				p2 <- quantile(x, na.rm=TRUE)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)

			ordercache(x)
			tim["order.cache","use"] <- timefun({
				p2 <- quantile(x, na.rm=TRUE)
			})[3]
			stopifnot(identical(p2, p))
			remcache(x)
			
			if (plot){
				barplot(t(tim), cex.names=0.7)
				title(paste("quantile(",n,")", sep=""))
			}
			
			ret[["quantile",as.character(n)]] <- tim
		}
	}

	ret
	
}


#! \name{match.integer64}
#! \alias{match.integer64}
#! \alias{\%in\%.integer64}
#! \title{
#! 64-bit integer matching
#! }
#! \description{
#! \code{match} returns a vector of the positions of (first) matches of its first argument in its second. 
#! 
#! \code{\%in\%} is a more intuitive interface as a binary operator, which returns a logical vector indicating if there is a match or not for its left operand. 
#! 
#! }
#! \usage{
#! \method{match}{integer64}(x, table, nomatch = NA_integer_, nunique = NULL, method = NULL, ...)
#! \method{\%in\%}{integer64}(x, table, ...)
#! }
#! \arguments{
#!   \item{x}{
#! 	integer64 vector: the values to be matched, optionally carrying a cache created with \code{\link{hashcache}}
#! }
#!   \item{table}{
#! 	integer64 vector: the values to be matched against, optionally carrying a cache created with \code{\link{hashcache}} or \code{\link{sortordercache}}
#! }
#!   \item{nomatch}{
#!   the value to be returned in the case when no match is found. Note that it is coerced to integer.
#! }
#!   \item{nunique}{
#! 	NULL or the number of unique values of table (including NA). Providing \code{nunique} can speed-up matching when \code{table} has no cache. Note that a wrong nunique can cause undefined behaviour up to a crash.
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{
#! ignored
#! }
#! }
#! \details{
#!   These functions automatically choose from several low-level functions considering the size of \code{x} and \code{table} and the availability of caches. 
#! 
#! 
#!   Suitable methods for \code{\%in\%.integer64} are \code{\link{hashpos}} (hash table lookup), \code{\link{hashrev}} (reverse lookup), \code{\link{sortorderpos}} (fast ordering) and \code{\link{orderpos}} (memory saving ordering).
#!   Suitable methods for \code{match.integer64} are \code{\link{hashfin}} (hash table lookup), \code{\link{hashrin}} (reverse lookup), \code{\link{sortfin}} (fast sorting) and \code{\link{orderfin}} (memory saving ordering).
#! }
#! \value{
#!   A vector of the same length as \code{x}.
#! 
#!   \code{match}: An integer vector giving the position in \code{table} of
#!   the first match if there is a match, otherwise \code{nomatch}.
#! 
#!   If \code{x[i]} is found to equal \code{table[j]} then the value
#!   returned in the \code{i}-th position of the return value is \code{j},
#!   for the smallest possible \code{j}.  If no match is found, the value
#!   is \code{nomatch}.
#! 
#!   \code{\%in\%}: A logical vector, indicating if a match was located for
#!   each element of \code{x}: thus the values are \code{TRUE} or
#!   \code{FALSE} and never \code{NA}.
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#! 	\code{\link{match}}
#! }
#! \examples{
#! x <- as.integer64(c(NA, 0:9), 32)
#! table <- as.integer64(c(1:9, NA))
#! match.integer64(x, table)
#! "\%in\%.integer64"(x, table)
#!
#! x <- as.integer64(sample(c(rep(NA, 9), 0:9), 32, TRUE))
#! table <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! stopifnot(identical(match.integer64(x, table), match(as.integer(x), as.integer(table))))
#! stopifnot(identical("\%in\%.integer64"(x, table), as.integer(x) \%in\% as.integer(table)))
#! 
#! \dontrun{
#! 	message("check when reverse hash-lookup beats standard hash-lookup")
#! 	e <- 4:24
#! 	timx <- timy <- matrix(NA, length(e), length(e), dimnames=list(e,e))
#! 	for (iy in seq_along(e))
#! 	for (ix in 1:iy){
#! 		nx <- 2^e[ix]
#! 		ny <- 2^e[iy]
#! 		x <- as.integer64(sample(ny, nx, FALSE))
#! 		y <- as.integer64(sample(ny, ny, FALSE))
#! 		#hashfun(x, bits=as.integer(5))
#! 		timx[ix,iy] <- repeat.time({
#! 		hx <- hashmap(x)
#! 		py <- hashrev(hx, y)
#! 		})[3]
#! 		timy[ix,iy] <- repeat.time({
#! 		hy <- hashmap(y)
#! 		px <- hashpos(hy, x)
#! 		})[3]
#! 		#identical(px, py)
#! 		print(round(timx[1:iy,1:iy]/timy[1:iy,1:iy], 2), na.print="")
#! 	}
#!
#! 	message("explore best low-level method given size of x and table")
#! 	B1 <- 1:27
#! 	B2 <- 1:27
#! 	tim <- array(NA, dim=c(length(B1), length(B2), 5)
#!  , dimnames=list(B1, B2, c("hashpos","hashrev","sortpos1","sortpos2","sortpos3")))
#! 	for (i1 in B1)
#! 	for (i2 in B2)
#! 	{
#! 	  b1 <- B1[i1]
#! 	  b2 <- B1[i2]
#! 	  n1 <- 2^b1
#! 	  n2 <- 2^b2
#! 	  x1 <- as.integer64(c(sample(n2, n1-1, TRUE), NA))
#! 	  x2 <- as.integer64(c(sample(n2, n2-1, TRUE), NA))
#! 	  tim[i1,i2,1] <- repeat.time({h <- hashmap(x2);hashpos(h, x1);rm(h)})[3]
#! 	  tim[i1,i2,2] <- repeat.time({h <- hashmap(x1);hashrev(h, x2);rm(h)})[3]
#! 	  s <- clone(x2); o <- seq_along(s); ramsortorder(s, o)
#! 	  tim[i1,i2,3] <- repeat.time(sortorderpos(s, o, x1, method=1))[3]
#! 	  tim[i1,i2,4] <- repeat.time(sortorderpos(s, o, x1, method=2))[3]
#! 	  tim[i1,i2,5] <- repeat.time(sortorderpos(s, o, x1, method=3))[3]
#! 	  rm(s,o)
#! 	  print(apply(tim, 1:2, function(ti)if(any(is.na(ti)))NA else which.min(ti)))
#! 	}
#! }
#! }
#! \keyword{manip}
#! \keyword{logic}


match.integer64 <- function(x, table, nomatch = NA_integer_, nunique=NULL, method=NULL, ...){
  stopifnot(is.integer64(x))
  table <- as.integer64(table)
  c <- cache(table)
  if (is.null(method)){
    if (is.null(c)){
			nx <- length(x)
			if (is.null(nunique))
				nunique <- length(table)
			btable <- as.integer(ceiling(log2(nunique*1.5)))
			bx <- as.integer(ceiling(log2(nx*1.5)))
			if (bx<=17 && btable>=16){
				method <- "hashrev"
			}else{
				method <- "hashpos"
			}
	}else{
		if (exists("hashmap", envir=c, inherits=FALSE)){
			method <- "hashpos"
		}else if (exists("sort", envir=c, inherits=FALSE) && exists("order", envir=c, inherits=FALSE) && (length(table)>length(x) || length(x)<4096)){
			method <- "sortorderpos"
		}else if (exists("order", envir=c, inherits=FALSE) && (length(table)>length(x) || length(x)<4096)){
			method <- "orderpos"
		}else{
			nx <- length(x)
			if (is.null(nunique)){
			  if (exists("nunique", envir=c, inherits=FALSE))
				nunique <- c$nunique
			  else
				nunique <- length(table)
			}
			btable <- as.integer(ceiling(log2(nunique*1.5)))
			bx <- as.integer(ceiling(log2(nx*1.5)))
			if (bx<=17 && btable>=16){
				method <- "hashrev"
			}else{
				method <- "hashpos"
			}
		}
	}
  }
  switch(method
  , hashpos={
			if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE)){
				if (exists("btable", inherits=FALSE))
					h <- hashmap(table, hashbits=btable)
				else{
					if (is.null(nunique))
						nunique <- c$nunique
					h <- hashmap(table, nunique=nunique)
				}
			}else
				h <- c
			p <- hashpos(h, x, nomatch=nomatch)
    }
  , hashrev={
		c <- cache(x)
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE)){
				if (exists("bx", inherits=FALSE))
					h <- hashmap(x, bits=bx)
				else{
					if (is.null(nunique))
						nunique <- c$nunique
					h <- hashmap(x, nunique=nunique)
				}
			}else
				h <- c
		p <- hashrev(h, table, nomatch=nomatch)
    }
  , sortorderpos={
		if (is.null(c) || !exists("sort", c) || !exists("order", c)){
			s <- clone(table)
			o <- seq_along(s)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c)
			o <- get("order", c)
		}
		p <- sortorderpos(s, o, x, nomatch=nomatch)
    }
  , orderpos={
		if (is.null(c) || !exists("order", c)){
			o <- seq_along(s)
			ramorder(table, o, na.last=FALSE)
		}else{
			o <- get("order", c)
		}
		p <- orderpos(table, o, x, nomatch=nomatch)
    }
  , stop("unknown method")
  )
  p
}


"%in%.integer64" <- function(x, table, ...){
  stopifnot(is.integer64(x))
  table <- as.integer64(table)
	nunique <- NULL
	method <- NULL
  c <- cache(table)
  if (is.null(method)){
    if (is.null(c)){
			nx <- length(x)
			if (is.null(nunique))
				nunique <- length(table)
			btable <- as.integer(ceiling(log2(nunique*1.5)))
			bx <- as.integer(ceiling(log2(nx*1.5)))
			if (bx<=17 && btable>=16){
				method <- "hashrin"
			}else{
				method <- "hashfin"
			}
	}else{
		if (exists("hashmap", envir=c, inherits=FALSE)){
			method <- "hashfin"
		}else if (exists("sort", envir=c, inherits=FALSE) && (length(table)>length(x) || length(x)<4096)){
			method <- "sortfin"
		}else if (exists("order", envir=c, inherits=FALSE) && (length(table)>length(x) || length(x)<4096)){
			method <- "orderfin"
		}else{
			nx <- length(x)
			if (is.null(nunique)){
			  if (exists("nunique", envir=c, inherits=FALSE))
				nunique <- c$nunique
			  else
				nunique <- length(table)
			}
			btable <- as.integer(ceiling(log2(nunique*1.5)))
			bx <- as.integer(ceiling(log2(nx*1.5)))
			if (bx<=17 && btable>=16){
				method <- "hashrin"
			}else{
				method <- "hashfin"
			}
		}
	}
  }
  switch(method
  , hashfin={
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE)){
			if (exists("btable", inherits=FALSE))
				h <- hashmap(table, hashbits=btable)
			else{
				if (is.null(nunique))
					nunique <- c$nunique
				h <- hashmap(table, nunique=nunique)
			}
		}else
			h <- c
		p <- hashfin(h, x)
    }
  , hashrin={
		c <- cache(x)
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE)){
				if (exists("bx", inherits=FALSE))
					h <- hashmap(x, bits=bx)
				else{
					if (is.null(nunique))
						nunique <- c$nunique
					h <- hashmap(x, nunique=nunique)
				}
		}else
			h <- c
		p <- hashrin(h, table)
    }
  , sortfin={
		if (is.null(c) || !exists("sort", c)){
			s <- clone(table)
			ramsort(s, na.last=FALSE)
		}else{
			s <- get("sort", c)
		}
		p <- sortfin(s, x)
    }
  , orderfin={
		if (is.null(c) || !exists("order", c)){
			o <- seq_along(s)
			ramorder(table, o, na.last=FALSE)
		}else{
			o <- get("order", c)
		}
		p <- orderfin(table, o, x)
    }
  , stop("unknown method")
  )
  p
}

#! \name{duplicated.integer64}
#! \alias{duplicated.integer64}
#! \title{Determine Duplicate Elements of integer64}
#! \description{
#!   \code{duplicated()} determines which elements of a vector or data frame are duplicates
#!   of elements with smaller subscripts, and returns a logical vector
#!   indicating which elements (rows) are duplicates.
#! }
#! \usage{
#! \method{duplicated}{integer64}(x, incomparables = FALSE, nunique = NULL, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a vector or a data frame or an array or \code{NULL}.}
#!   \item{incomparables}{ignored}
#!   \item{nunique}{
#! 	NULL or the number of unique values (including NA). Providing \code{nunique} can speed-up matching when \code{x} has no cache. Note that a wrong nunique can cause undefined behaviour up to a crash.
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#! 
#!   Suitable methods are \code{\link{hashdup}} (hashing), \code{\link{sortorderdup}} (fast ordering) and \code{\link{orderdup}} (memory saving ordering).
#! }
#! \value{
#!     \code{duplicated()}: a logical vector of the same length as \code{x}.  
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{ \code{\link{duplicated}}, \code{\link{unique.integer64}}  }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! duplicated(x)
#! 
#! stopifnot(identical(duplicated(x),  duplicated(as.integer(x))))
#! }
#! \keyword{logic}
#! \keyword{manip}
#! 

duplicated.integer64 <- function(x
, incomparables = FALSE  # dummy parameter
, nunique = NULL
, method = NULL
, ...
){
  stopifnot(identical(incomparables, FALSE))
  c <- cache(x)
  if (is.null(nunique) && !is.null(c))
	nunique <- c$nunique
  if (is.null(method)){
    if (is.null(c)){
		if (length(x)>5e7)
			method <- "sortorderdup"
		else
			method <- "hashdup"
	}else{
		if (exists("sort", envir=c, inherits=FALSE) && exists("order", envir=c, inherits=FALSE))
			method <- "sortorderdup"
		else if (exists("hashmap", envir=c, inherits=FALSE))
			method <- "hashdup"
		else if (exists("order", envir=c, inherits=FALSE))
			method <- "orderdup"
		else if (length(x)>5e7)
			method <- "sortorderdup"
		else
			method <- "hashdup"
	}
  }
  switch(method
  , hashdup={
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE))
			h <- hashmap(x, nunique=nunique)
		else
			h <- c
		p <- hashdup(h)
    }
  , sortorderdup={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(s)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
		}
		p <- sortorderdup(s, o)
    }
  , orderdup={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(s)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		p <- orderdup(x, o)
    }
  , stop("unknown method", method)
  )
  p
}


#! \name{unique.integer64}
#! \alias{unique.integer64}
#! \title{Extract Unique Elements from integer64}
#! \description{
#!   \code{unique} returns a vector like \code{x} but with duplicate elements/rows removed.
#! }
#! \usage{
#! \method{unique}{integer64}(x, incomparables = FALSE, order = c("original","values","any")
#! , nunique = NULL, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a vector or a data frame or an array or \code{NULL}.}
#!   \item{incomparables}{ignored}
#!   \item{order}{The order in which unique values will be returned, see details}
#!   \item{nunique}{
#! 	NULL or the number of unique values (including NA). Providing \code{nunique} can speed-up matching when \code{x} has no cache. Note that a wrong nunique can cause undefined behaviour up to a crash.
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{hashmapuni}} (simultaneously creating and using a hashmap)
#! , \code{\link{hashuni}} (first creating a hashmap then using it)
#! , \code{\link{sortuni}} (fast sorting for sorted order only)
#! , \code{\link{sortorderuni}} (fast ordering for original order only) 
#! and \code{\link{orderuni}} (memory saving ordering).
#! \cr
#! The default \code{order="original"} returns unique values in the order of the first appearance in \code{x} like in \code{\link{unique}}, this costs extra processing. 
#! \code{order="values"} returns unique values in sorted order like in \code{\link{table}}, this costs extra processing with the hash methods but comes for free. 
#! \code{order="any"} returns unique values in undefined order, possibly faster. For hash methods this will be a quasi random order, for sort methods this will be sorted order.
#! }
#! \value{
#!   For a vector, an object of the same type of \code{x}, but with only
#!   one copy of each duplicated element.  No attributes are copied (so
#!   the result has no names).
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{unique}} for the generic, \code{\link{unipos}} which gives the indices of the unique
#!   elements and \code{\link{table.integer64}} which gives frequencies of the unique elements.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! unique(x)
#! unique(x, order="values")
#! 
#! stopifnot(identical(unique(x),  x[!duplicated(x)]))
#! stopifnot(identical(unique(x),  as.integer64(unique(as.integer(x)))))
#! stopifnot(identical(unique(x, order="values")
#! ,  as.integer64(sort(unique(as.integer(x)), na.last=FALSE))))
#! }
#! \keyword{manip}
#! \keyword{logic}


unique.integer64 <- function(x
, incomparables = FALSE  # dummy parameter
, order = c("original","values","any")
, nunique = NULL
, method = NULL
, ...
){
  stopifnot(identical(incomparables, FALSE))
  order <- match.arg(order)
  c <- cache(x)
  keep.order <- order == "original"
  if (is.null(nunique) && !is.null(c))
	nunique <- c$nunique
  if (is.null(method)){
    if (is.null(c)){
		if (order=="values")
			method <- "sortuni"
		else
			method <- "hashmapuni"
	}else{
		switch(order
		, "original" = {
			if (exists("hashmap", envir=c, inherits=FALSE))
				method <- "hashuni"
			else if (exists("order", envir=c, inherits=FALSE)){
				if (exists("sort", envir=c, inherits=FALSE))
					method <- "sortorderuni"
				else
					method <- "orderuni"
			}else
				method <- "hashmapuni"
		}
		, "values" = {
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sortuni"
			else if (exists("order", envir=c, inherits=FALSE))
				method <- "orderuni"
			else if (exists("hashmap", envir=c, inherits=FALSE) && c$nunique<length(x)/2)
				method <- "hashuni"
			else
				method <- "sortuni"
		}
		, "any" = {
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sortuni"
			else if (exists("hashmap", envir=c, inherits=FALSE))
				method <- "hashuni"
			else if (exists("order", envir=c, inherits=FALSE))
				method <- "orderuni"
			else
				method <- "sortuni"
		}
		)
	}
  }
  switch(method
  , hashmapuni={
		p <- hashmapuni(x, nunique=nunique)
    }
  , hashuni={
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE))
			h <- hashmap(x, nunique=nunique)
		else
			h <- c
		p <- hashuni(h, keep.order=keep.order)
		if (order=="values")
			ramsort(p, na.last=FALSE)
    }
  , sortuni={
		if (is.null(c) || !exists("sort", c, inherits=FALSE)){
			s <- clone(x)
			ramsort(s, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- sortnut(s)[1]
		p <- sortuni(s, nunique)
    }
  , sortorderuni={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(x)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- sortnut(s)[1]
		p <- sortorderuni(x, s, o, nunique)
    }
  , orderuni={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- ordernut(x, o)[1]
		p <- orderuni(x, o, nunique, keep.order=keep.order)
    }
  , stop("unknown method", method)
  )
  p
}


#! \name{unipos}
#! \alias{unipos}
#! \alias{unipos.integer64}
#! \title{Extract Positions of Unique Elements}
#! \description{
#!   \code{unipos} returns the positions of those elements returned by \code{\link{unique}}.
#! }
#! \usage{
#! unipos(x, incomparables = FALSE, order = c("original","values","any"), \dots)
#! \method{unipos}{integer64}(x, incomparables = FALSE, order = c("original","values","any")
#! , nunique = NULL, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a vector or a data frame or an array or \code{NULL}.}
#!   \item{incomparables}{ignored}
#!   \item{order}{The order in which positions of unique values will be returned, see details}
#!   \item{nunique}{
#! 	NULL or the number of unique values (including NA). Providing \code{nunique} can speed-up when \code{x} has no cache. Note that a wrong nunique can cause undefined behaviour up to a crash.
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{hashmapupo}} (simultaneously creating and using a hashmap)
#! , \code{\link{hashupo}} (first creating a hashmap then using it)
#! , \code{\link{sortorderupo}} (fast ordering) 
#! and \code{\link{orderupo}} (memory saving ordering).
#! \cr
#! The default \code{order="original"} collects unique values in the order of the first appearance in \code{x} like in \code{\link{unique}}, this costs extra processing. 
#! \code{order="values"} collects unique values in sorted order like in \code{\link{table}}, this costs extra processing with the hash methods but comes for free. 
#! \code{order="any"} collects unique values in undefined order, possibly faster. For hash methods this will be a quasi random order, for sort methods this will be sorted order.
#! }
#! \value{
#!   an integer vector of positions
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{unique.integer64}} for unique values and \code{\link{match.integer64}} for general matching.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! unipos(x)
#! unipos(x, order="values")
#! 
#! stopifnot(identical(unipos(x),  (1:length(x))[!duplicated(x)]))
#! stopifnot(identical(unipos(x),  match.integer64(unique(x), x)))
#! stopifnot(identical(unipos(x, order="values"),  match.integer64(unique(x, order="values"), x)))
#! stopifnot(identical(unique(x),  x[unipos(x)]))
#! stopifnot(identical(unique(x, order="values"),  x[unipos(x, order="values")]))
#! }
#! \keyword{manip}
#! \keyword{logic}


unipos <- function(x, incomparables = FALSE, order = c("original","values","any"), ...)UseMethod("unipos")
unipos.integer64 <- function(x
, incomparables = FALSE  # dummy parameter
, order = c("original","values","any")
, nunique = NULL
, method = NULL
, ...
){
  stopifnot(identical(incomparables, FALSE))
  order <- match.arg(order)
  c <- cache(x)
  keep.order <- order == "original"
  if (is.null(nunique) && !is.null(c))
	nunique <- c$nunique
  if (is.null(method)){
    if (is.null(c)){
		if (order=="values")
			method <- "sortorderupo"
		else
			method <- "hashmapupo"
	}else{
		switch(order
		, "original" = {
			if (exists("hashmap", envir=c, inherits=FALSE))
				method <- "hashupo"
			else if (exists("order", envir=c, inherits=FALSE)){
				if (exists("sort", envir=c, inherits=FALSE))
					method <- "sortorderupo"
				else
					method <- "orderupo"
			}else
				method <- "hashmapupo"
		}
		, "values" = {
			if (exists("order", envir=c, inherits=FALSE)){
				if (exists("sort", envir=c, inherits=FALSE))
					method <- "sortorderupo"
				else
					method <- "orderupo"
			}else if (exists("hashmap", envir=c, inherits=FALSE) && c$nunique<length(x)/2)
				method <- "hashupo"
			else
				method <- "sortorderupo"
		}
		, "any" = {
			if (exists("sort", envir=c, inherits=FALSE) && exists("order", envir=c, inherits=FALSE))
				method <- "sortorderupo"
			else if (exists("hashmap", envir=c, inherits=FALSE))
				method <- "hashupo"
			else if (exists("order", envir=c, inherits=FALSE))
				method <- "orderupo"
			else
				method <- "sortorderupo"
		}
		)
	}
  }
  switch(method
  , hashmapupo={
		p <- hashmapupo(x, nunique=nunique)
    }
  , hashupo={
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE))
			h <- hashmap(x, nunique=nunique)
		else
			h <- c
		p <- hashupo(h, keep.order=keep.order)
		if (order=="values"){
			s <- x[p]
			ramsortorder(s, p, na.last=FALSE)
		}
    }
  , sortorderupo={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(x)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- sortnut(s)[1]
		p <- sortorderupo(s, o, nunique, keep.order=keep.order)
    }
  , orderupo={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- ordernut(x, o)[1]
		p <- orderupo(x, o, nunique, keep.order=keep.order)
    }
  , stop("unknown method", method)
  )
  p
}



#! \name{table.integer64}
#! \title{Cross Tabulation and Table Creation for integer64}
#! \alias{table.integer64}
#! 
#! \concept{counts}
#! \concept{frequencies}
#! \concept{occurrences}
#! \concept{contingency table}
#! 
#! \description{
#!   \code{table.integer64} uses the cross-classifying integer64 vectors to build a contingency
#!   table of the counts at each combination of vector values.
#! }
#! \usage{
#! table.integer64(\dots
#! , return = c("table","data.frame","list")
#! , order = c("values","counts")
#! , nunique = NULL
#! , method = NULL
#! , dnn = list.names(...), deparse.level = 1
#! ) 
#! }
#! \arguments{
#!   \item{\dots}{one or more objects which can be interpreted as factors
#!     (including character strings), or a list (or data frame) whose
#!     components can be so interpreted.  (For \code{as.table} and
#!     \code{as.data.frame}, arguments passed to specific methods.)}
#!   \item{nunique}{
#! 	NULL or the number of unique values of table (including NA). Providing \code{nunique} can speed-up matching when \code{table} has no cache. Note that a wrong nunique can cause undefined behaviour up to a crash.
#! }
#!   \item{order}{
#! 	By default results are created sorted by "values", or by "counts"
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{return}{
#!      choose the return format, see details
#! }
#!   \item{dnn}{the names to be given to the dimensions in the result (the
#!     \emph{dimnames names}).}
#!   \item{deparse.level}{controls how the default \code{dnn} is
#!     constructed.  See \sQuote{Details}.}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{hashmaptab}} (simultaneously creating and using a hashmap)
#! , \code{\link{hashtab}} (first creating a hashmap then using it)
#! , \code{\link{sortordertab}} (fast ordering) 
#! and \code{\link{ordertab}} (memory saving ordering).
#! \cr
#!   If the argument \code{dnn} is not supplied, the internal function
#!   \code{list.names} is called to compute the \sQuote{dimname names}.  If the
#!   arguments in \code{\dots} are named, those names are used.  For the
#!   remaining arguments, \code{deparse.level = 0} gives an empty name,
#!   \code{deparse.level = 1} uses the supplied argument if it is a symbol,
#!   and \code{deparse.level = 2} will deparse the argument.
#! 
#!   Arguments \code{exclude}, \code{useNA}, are not supported, i.e. \code{NA}s are always tabulated, and, different from \code{\link{table}} they are sorted first if \code{order="values"}. 
#! }
#! \value{
#!   By default (with \code{return="table"}) \code{\link{table}} returns a \emph{contingency table}, an object of
#!   class \code{"table"}, an array of integer values. Note that unlike S the result is always an array, a 1D array if one factor is given. Note also that for multidimensional arrays this is a \emph{dense} return structure which can dramatically increase RAM requirements (for large arrays with high mutual information, i.e. many possible input combinations of which only few occur) and that \code{\link{table}} is limited to \code{2^31} possible combinations (e.g. two input vectors with 46340 unique values only). Finally note that the tabulated values or value-combinations are represented as \code{dimnames} and that the implied conversion of values to strings can cause \emph{severe} performance problems since each string needs to be integrated into R's global string cache. 
#!   \cr
#!   You can use the other \code{return=} options to cope with these problems, the potential combination limit is increased from \code{2^31} to \code{2^63} with these options, RAM is only rewquired for observed combinations and string conversion is avoided. 
#!   \cr
#!   With \code{return="data.frame"} you get a \emph{dense} representation as a \code{\link{data.frame}} (like that resulting from \code{as.data.frame(table(...))}) where only observed combinations are listed (each as a data.frame row) with the corresponding frequency counts (the latter as component
#!   named by \code{responseName}).  This is the inverse of \code{\link{xtabs}}..
#!   \cr
#!   With \code{return="list"} you also get a \emph{dense} representation as a simple \code{\link{list}} with components 
#!   \item{values }{a integer64 vector of the technically tabulated values, for 1D this is the tabulated values themselves, for kD these are the values representing the potential combinations of input values}
#!   \item{counts}{the frequency counts}
#!   \item{dims}{only for kD: a list with the vectors of the unique values of the input dimensions}
#! }
#! \note{
#!   Note that by using \code{\link{as.integer64.factor}} we can also input 
#!   factors into \code{table.integer64} -- only the \code{\link{levels}} get lost.
#!  \cr
#!   Note that because of the existence of \code{\link{as.factor.integer64}} 
#! the standard \code{\link{table}} function -- within its limits -- can also be used 
#! for \code{\link{integer64}}, and especially for combining \code{\link{integer64}} input 
#! with other data types.
#! }
#! \seealso{
#!   \code{\link{table}} for more info on the standard version coping with Base R's data types, \code{\link{tabulate}} which can faster tabulate \code{\link{integer}s} with a limited range \code{[1L .. nL not too big]}, \code{\link{unique.integer64}} for the unique values without counting them and \code{\link{unipos.integer64}} for the positions of the unique values. 
#! }
#! \examples{
#! message("pure integer64 examples")
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! y <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! z <- sample(c(rep(NA, 9), letters), 32, TRUE)
#! table.integer64(x)
#! table.integer64(x, order="counts")
#! table.integer64(x, y)
#! table.integer64(x, y, return="data.frame")
#!
#! message("via as.integer64.factor we can use 'table.integer64' also for factors")
#! table.integer64(x, as.integer64(as.factor(z)))
#! 
#! message("via as.factor.integer64 we can also use 'table' for integer64")
#! table(x)
#! table(x, exclude=NULL)
#! table(x, z, exclude=NULL)
#!
#! \dontshow{
#!  stopifnot(identical(table.integer64(as.integer64(c(1,1,2))), table(c(1,1,2))))
#!  stopifnot(identical(table.integer64(as.integer64(c(1,1,2)),as.integer64(c(3,4,4))), table(c(1,1,2),c(3,4,4))))
#!  message("the following works with three warnings due to coercion")
#!  stopifnot(identical(table.integer64(c(1,1,2)), table(c(1,1,2))))
#!  stopifnot(identical(table.integer64(as.integer64(c(1,1,2)),c(3,4,4)), table(c(1,1,2),c(3,4,4))))
#!  stopifnot(identical(table.integer64(c(1,1,2),as.integer64(c(3,4,4))), table(c(1,1,2),c(3,4,4))))
#!  message("the following works because of as.factor.integer64")
#!  stopifnot(identical(table(as.integer64(c(1,1,2))), table(c(1,1,2))))  
#!  stopifnot(identical(table(as.integer64(c(1,1,2)),as.integer64(c(3,4,4))), table(c(1,1,2),c(3,4,4))))
#!  stopifnot(identical(table(as.integer64(c(1,1,2)),c(3,4,4)), table(c(1,1,2),c(3,4,4))))
#!  stopifnot(identical(table(c(1,1,2),as.integer64(c(3,4,4))), table(c(1,1,2),c(3,4,4))))
#! }
#!
#! }
#! \keyword{category}


table.integer64 <- function(
  ...
, return = c("table","data.frame","list")
, order = c("values","counts")
, nunique = NULL
, method = NULL
, dnn = list.names(...), deparse.level = 1
){
  order <- match.arg(order)
  return <- match.arg(return)
  # this is taken from 'table'
	list.names <- function(...){
        l <- as.list(substitute(list(...)))[-1L]
        nm <- names(l)
        fixup <- if (is.null(nm)) 
            seq_along(l)
        else nm == ""
        dep <- vapply(l[fixup], function(x) switch(deparse.level + 
            1, "", if (is.symbol(x)) as.character(x) else "", 
            deparse(x, nlines = 1)[1L]), "")
        if (is.null(nm)) 
            dep
        else {
            nm[fixup] <- dep
            nm
        }
    }  

	# COPY ON MODIFY is broken for reading from list(...)
	# because list(...) creates a copy of all ... and this invalidates our caches
	# therefore we go this sick workaround
	argsymbols <- as.list(substitute(list(...)))[-1L]
	argframe <- parent.frame()
	A <- function(i)eval(argsymbols[[i]], argframe)
	N <- length(argsymbols)
	if (!N) 
		stop("nothing to tabulate")
	if (N == 1L && is.list(A(1L))){
		args <- A(1L)
		if (length(dnn) != length(args)) 
            dnn <- if (!is.null(argn <- names(args))) argn
				else paste(dnn[1L], seq_along(args), sep = ".")		
		N <- length(args)
		A <- function(i)args[[i]]
	}
	force(dnn)
		
	if (N==1L){
		x <- A(1L)
			if (!is.integer64(x)){
				warning("coercing first argument to integer64")
				x <- as.integer64(x)
			}
	}else{
		a <- A(1L)
		n <- length(a)
		nu <- integer(N)
		d <- integer64(N+1); d[[1]] <- 1L
		dims <- vector("list", N)
		names(dims) <- dnn
		for (i in 1:N){
			a <- A(i)
			if (length(a) != n) 
				stop("all input vectors must have the same length")
			if (!is.integer64(a)){
				warning("coercing argument ", i, " to integer64")
				a <- as.integer64(a)
			}
			c <- cache(a)
			if (is.null(c$order)){
				s <- clone(a)
				o <- seq_along(s)
				ramsortorder(s,o)
				nu[[i]] <- sortnut(s)[["nunique"]]
			}else if (is.null(c$sort)){
				o <- c$order
				s <- a[o]
				nu[[i]] <- c$nunique
			}else{
				o <- c$order
				s <- c$sort
				nu[[i]] <- c$nunique
			}
			d[[i+1]] <- d[[i]] * nu[[i]]
			if (is.na(d[[i+1]]))
				stop("attempt to make a table from more than >= 2^63 hypothetical combinations")
			dims[[i]] <- sortuni(s, nu[[i]])
			if (i==1L)
				x <- sortorderkey(s,o) - 1L
			else
				x <- x + d[[i]] * (sortorderkey(s,o) - 1L)
		}
	}
  c <- cache(x)
  if (is.null(nunique) && !is.null(c))
	nunique <- c$nunique
  if (is.null(method)){
    if (is.null(c)){
		if (order=="values" && (is.null(nunique) || nunique>65536))
			method <- "sorttab"
		else
			method <- "hashmaptab"
	}else{
		if (order=="values"){
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sorttab"
			else if (exists("hashmap", envir=c, inherits=FALSE) && c$nunique<sqrt(length(x)))
				method <- "hashtab"
			else if (exists("order", envir=c, inherits=FALSE))
				method <- "ordertab"
			else
				method <- "sorttab"
		}else{ # order = "counts"
			if (exists("hashmap", envir=c, inherits=FALSE))
				method <- "hashtab"
			else if (exists("sort", envir=c, inherits=FALSE))
				method <- "sorttab"
			else if (exists("order", envir=c, inherits=FALSE))
				method <- "ordertab"
			else
				method <- "hashmaptab"
		}
	}
  }
  switch(method
  , hashmaptab={
		tmp <- hashmaptab(x, nunique=nunique)
		cnt <- tmp$counts
		val <- tmp$values
		rm(tmp)
    }
  , hashtab={
		if (is.null(c) || !exists("hashmap", envir=c, inherits=FALSE))
			h <- hashmap(x, nunique=nunique)
		else
			h <- c 
		tmp <- hashtab(h, keep.order=FALSE)
		cnt <- tmp$counts
		val <- tmp$values
		rm(tmp)
    }
  , sorttab={
		if (is.null(c) || !exists("sort", c, inherits=FALSE)){
			s <- clone(x)
			ramsort(s, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- sortnut(s)[1]
		val <- sortuni(s, nunique)
		cnt <- sorttab(s, nunique)
    }
  , ordertab={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nunique))
			nunique <- ordernut(x, o)[1]
		val <- orderuni(x, o, nunique, keep.order=FALSE)
		cnt <- ordertab(x, o, nunique, keep.order=FALSE)
		rm(o)
    }
  , stop("unknown method", method)
  )
	if (order=="values"){
		if (substr(method, 1, 4)=="hash"){
			o <- seq_along(val)
			ramsortorder(val, o, na.last=FALSE)
			cnt <- cnt[o]
		}
	}else{
		# xx workaround until we have implemented ramsort.integer
		o <- sort.list(cnt, na.last=NA, method="quick")
		cnt <- cnt[o]
		# o <- seq_along(cnt)
		# ramsortorder(cnt, o, na.last=FALSE)
		val <- val[o]
	}
  ## attaching names is extremely expensive with many unique values, doing this only for compatibility with 'table' here
  switch(return
  ,  "table" = {
  		if (N==1){
			attr(cnt, "dim") <-length(cnt)
			dn <- list(as.character(val))
			names(dn) <- dnn[1]
			attr(cnt, "dimnames") <- dn
		}else{
			a <- array(0L, dim=nu, dimnames=lapply(dims, as.character))
			a[as.integer(val)+1L] <- as.integer(cnt)
			cnt <- a
		}
		oldClass(cnt) <- "table"
	}
  ,  "data.frame" = {
		if (N==1){
			cnt <- data.frame(values=val, Freq=cnt)
			names(cnt)[[1]] <- dnn[1]
		}else{
			for (i in N:1){
				w <- val %/% d[[i]]
				val <- val - d[[i]]*w
				dims[[i]] <- dims[[i]][as.integer(w)+1L]
			}
			cnt <- data.frame(dims, Freq=cnt)
		}
	}
  , "list" = {
		if (N==1)
			cnt <- list(values=val, counts=cnt)
		else
			cnt <- list(values=val, counts=cnt, dims=dims)
    }
  )
  cnt
}


as.factor.integer64 <- function(x){

	c <- cache(x)
	if (is.null(c$order)){
		s <- clone(x)
		o <- seq_along(s)
		na.count <- ramsortorder(s,o)
		nu <- sortnut(s)[["nunique"]]
	}else if (is.null(c$sort)){
		o <- c$order
		s <- x[o]
		na.count <- c$na.count
		nu <- c$nunique
	}else{
		o <- c$order
		s <- c$sort
		na.count <- c$na.count
		nu <- c$nunique
	}
	dimtab <- sortuni(s, nu)
	dimpos <- sortorderkey(s,o,na.skip.num=na.count) - 1L
	attr(dimpos, "levels") <- dimtab
	oldClass(dimpos) <- "factor"
	dimpos
}

as.ordered.integer64 <- function(x){

	c <- cache(x)
	if (is.null(c$order)){
		s <- clone(x)
		o <- seq_along(s)
		na.count <- ramsortorder(s,o)
		nu <- sortnut(s)[["nunique"]]
	}else if (is.null(c$sort)){
		o <- c$order
		s <- x[o]
		na.count <- c$na.count
		nu <- c$nunique
	}else{
		o <- c$order
		s <- c$sort
		na.count <- c$na.count
		nu <- c$nunique
	}
	dimtab <- sortuni(s, nu)
	dimpos <- sortorderkey(s,o,na.skip.num=na.count) - 1L
	attr(dimpos, "levels") <- dimtab
	oldClass(dimpos) <- c("ordered", "factor")
	dimpos
}

as.integer64.factor <- function(x, ...)as.integer64(unclass(x))



#! \name{keypos}
#! \alias{keypos}
#! \alias{keypos.integer64}
#! \title{Extract Positions in redundant dimension table}
#! \description{
#!   \code{keypos} returns the positions of the (fact table) elements that participate in their sorted unique subset (dimension table)
#! }
#! \usage{
#! keypos(x, \dots)
#! \method{keypos}{integer64}(x, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a vector or a data frame or an array or \code{NULL}.}
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   NAs are sorted first in the dimension table, see \code{\link{ramorder.integer64}}.
#!   \cr
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{sortorderkey}} (fast ordering) 
#! and \code{\link{orderkey}} (memory saving ordering).
#! }
#! \value{
#!   an integer vector of the same length as code{x} containing positions relativ to code{sort(unique(x), na.last=FALSE)}
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{unique.integer64}} for the unique subset and \code{\link{match.integer64}} for finding positions in a different vector.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! keypos(x)
#! 
#! stopifnot(identical(keypos(x),  match.integer64(x, sort(unique(x), na.last=FALSE))))
#! }
#! \keyword{manip}
#! \keyword{univar}



keypos <- function(x, ...)UseMethod("keypos")
keypos.integer64 <- function(x
, method = NULL
, ...
){
  c <- cache(x)
  if (is.null(method)){
    if (is.null(c)){
		method <- "sortorderkey"
	}else{
		if (exists("order", envir=c, inherits=FALSE)){
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sortorderkey"
			else
				method <- "orderkey"
		}else
			method <- "sortorderkey"
	}
  }
  switch(method
  , sortorderkey={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(x)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
		}
		p <- sortorderkey(s, o)
    }
  , orderkey={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		p <- orderkey(x, o)
    }
  , stop("unknown method", method)
  )
  p
}

#! \name{tiepos}
#! \alias{tiepos}
#! \alias{tiepos.integer64}
#! \title{Extract Positions of Tied Elements}
#! \description{
#!   \code{tiepos} returns the positions of those elements that participate in ties.
#! }
#! \usage{
#! tiepos(x, \dots)
#! \method{tiepos}{integer64}(x, nties = NULL, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a vector or a data frame or an array or \code{NULL}.}
#!   \item{nties}{
#! 	NULL or the number of tied values (including NA). Providing \code{nties} can speed-up when \code{x} has no cache. Note that a wrong nties can cause undefined behaviour up to a crash.
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{sortordertie}} (fast ordering) 
#! and \code{\link{ordertie}} (memory saving ordering).
#! }
#! \value{
#!   an integer vector of positions
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{rank.integer64}} for possibly tied ranks and \code{\link{unipos.integer64}} for positions of unique values.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! tiepos(x)
#! 
#! stopifnot(identical(tiepos(x),  (1:length(x))[duplicated(x) | rev(duplicated(rev(x)))]))
#! }
#! \keyword{manip}
#! \keyword{univar}


tiepos <- function(x, ...)UseMethod("tiepos")
tiepos.integer64 <- function(x
, nties = NULL
, method = NULL
, ...
){
  c <- cache(x)
  if (is.null(nties) && !is.null(c))
	nties <- c$nties
  if (is.null(method)){
    if (is.null(c)){
		method <- "sortordertie"
	}else{
		if (exists("order", envir=c, inherits=FALSE)){
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sortordertie"
			else
				method <- "ordertie"
		}else
			method <- "sortordertie"
	}
  }
  switch(method
  , sortordertie={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(x)
			ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nties))
			nties <- sortnut(s)[2]
		p <- sortordertie(s, o, nties)
    }
  , ordertie={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
		}
		if (is.null(nties))
			nties <- ordernut(x, o)[2]
		p <- ordertie(x, o, nties)
    }
  , stop("unknown method", method)
  )
  p
}


#! \name{rank.integer64}
#! \alias{rank.integer64}
#! \title{Sample Ranks from integer64}
#! \description{
#!   Returns the sample ranks of the values in a vector.  Ties (i.e., equal
#!   values) are averaged and missing values propagated.
#! }
#! \usage{
#! 	\method{rank}{integer64}(x, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a integer64 vector}
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{sortorderrnk}} (fast ordering) 
#! and \code{\link{orderrnk}} (memory saving ordering).
#! }
#! \value{
#!   A numeric vector of the same length as \code{x}.
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{order.integer64}}, \code{\link{rank}} and \code{\link{prank}} for percent rank.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! rank.integer64(x)
#! 
#! stopifnot(identical(rank.integer64(x),  rank(as.integer(x)
#! , na.last="keep", ties.method = "average")))
#! }
#! \keyword{univar}

rank.integer64 <- function(x
, method = NULL
, ...
){
  c <- cache(x)
  if (is.null(method)){
    if (is.null(c)){
		method <- "sortorderrnk"
	}else{
		if (exists("order", envir=c, inherits=FALSE)){
			if (exists("sort", envir=c, inherits=FALSE))
				method <- "sortorderrnk"
			else
				method <- "orderrnk"
		}else
			method <- "sortorderrnk"
	}
  }
  switch(method
  , sortorderrnk={
		if (is.null(c) || !exists("sort", c, inherits=FALSE) || !exists("order", c, inherits=FALSE)){
			s <- clone(x)
			o <- seq_along(x)
			na.count <- ramsortorder(s, o, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			o <- get("order", c, inherits=FALSE)
			na.count <- get("na.count", c, inherits=FALSE)
		}
		p <- sortorderrnk(s, o, na.count)
    }
  , orderrnk={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			na.count <- ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
			na.count <- get("na.count", c, inherits=FALSE)
		}
		p <- orderrnk(x, o, na.count)
    }
  , stop("unknown method", method)
  )
  p
}

#! \name{prank}
#! \alias{prank}
#! \alias{prank.integer64}
#! \title{(P)ercent (Rank)s}
#! \description{
#! 	Function \code{prank.integer64}  projects the values [min..max] via ranks [1..n] to [0..1]. 
#! 	\code{\link{qtile.integer64}} is the inverse function of 'prank.integer64' and projects [0..1] to [min..max].
#! }
#! \usage{
#! 	prank(x, \dots)
#! 	\method{prank}{integer64}(x, method = NULL, \dots)
#! }
#! \arguments{
#!   \item{x}{a integer64 vector}
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#! 	Function \code{prank.integer64} is based on \code{\link{rank.integer64}}.
#! }
#! \value{
#!   \code{prank} returns a numeric vector of the same length as \code{x}.
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{rank.integer64}} for simple ranks and \code{\link{qtile}} for the inverse function quantiles.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! prank(x)
#! 
#! x <- x[!is.na(x)]
#! stopifnot(identical(x,  unname(qtile(x, probs=prank(x)))))
#! }
#! \keyword{univar}

prank <- function(x, ...)UseMethod("prank")
prank.integer64 <- function(x
, method = NULL
, ...
)
{	
	n <- nvalid(x)
	if (n<2L)
		return(rep(as.integer64(NA), length(x)))
	(rank.integer64(x, method=method, ...)-1L)/(n-1L)
}

#! \name{qtile}
#! \alias{qtile}
#! \alias{qtile.integer64}
#! \alias{quantile.integer64}
#! \alias{median.integer64}
#! \alias{mean.integer64}
#! \alias{summary.integer64}
#! \title{(Q)uan(Tile)s }
#! \description{
#! 	Function \code{\link{prank.integer64}}  projects the values [min..max] via ranks [1..n] to [0..1]. 
#! 	\code{qtile.ineger64} is the inverse function of 'prank.integer64' and projects [0..1] to [min..max].
#! }
#! \usage{
#! 	qtile(x, probs=seq(0, 1, 0.25), \dots)
#! 	\method{qtile}{integer64}(x, probs = seq(0, 1, 0.25), names = TRUE, method = NULL, \dots)
#! 	\method{quantile}{integer64}(x, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE, type=0L, \dots)
#! 	\method{median}{integer64}(x, na.rm = FALSE, \dots)
#!  \method{mean}{integer64}(x, na.rm = FALSE, \dots)
#! 	\method{summary}{integer64}(object, \dots)
#!  ## mean(x, na.rm = FALSE, ...)
#!  ## or
#!  ## mean(x, na.rm = FALSE)
#! }
#! \arguments{
#!   \item{x}{a integer64 vector}
#!   \item{object}{a integer64 vector}
#!   \item{probs}{
#! 		numeric vector of probabilities with values in [0,1] - possibly containing \code{NA}s
#! }
#!   \item{names}{
#! 	logical; if \code{TRUE}, the result has a \code{names} attribute. Set to \code{FALSE} for speedup with many probs.
#! }
#!   \item{type}{
#! 	an integer selecting the quantile algorithm, currently only 0 is supported, see details
#! }
#!   \item{method}{
#! 	NULL for automatic method selection or a suitable low-level method, see details
#! }
#!   \item{na.rm}{
#! 	logical; if \code{TRUE}, any \code{NA} and \code{NaN}'s are removed from \code{x} before the quantiles are computed.
#! }
#!   \item{\dots}{ignored}
#! }
#! \details{
#!  Functions \code{quantile.integer64} with \code{type=0} and \code{median.integer64} are convenience wrappers to \code{qtile}.
#!  \cr
#!	Function \code{qtile} behaves very similar to \code{quantile.default} with \code{type=1} 
#!  in that it only returns existing values, it is mostly symetric 
#!  but it is using 'round' rather than 'floor'. 
#!  \cr
#!  Note that this implies that \code{median.integer64} does not interpolate for even number of values 
#! (interpolation would create values that could not be represented as 64-bit integers).
#!  \cr
#!   This function automatically chooses from several low-level functions considering the size of \code{x} and the availability of a cache. 
#!   Suitable methods are \code{\link{sortqtl}} (fast sorting) 
#! and \code{\link{orderqtl}} (memory saving ordering).
#! }
#! \value{
#!   \code{prank} returns a numeric vector of the same length as \code{x}.
#!   \cr
#!   \code{qtile} returns a vector with elements from \code{x} 
#!   at the relative positions specified by \code{probs}.
#! }
#! \author{
#! 	Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>
#! }
#! \seealso{
#!   \code{\link{rank.integer64}} for simple ranks and \code{\link{quantile}} for quantiles.
#! }
#! \examples{
#! x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#! qtile(x, probs=seq(0, 1, 0.25))
#! quantile(x, probs=seq(0, 1, 0.25), na.rm=TRUE)
#! median(x, na.rm=TRUE)
#! summary(x)
#! 
#! x <- x[!is.na(x)]
#! stopifnot(identical(x,  unname(qtile(x, probs=prank(x)))))
#! }
#! \keyword{univar}

qtile <- function(x, probs = seq(0, 1, 0.25), ...)UseMethod("qtile")
qtile.integer64 <- function(x, probs = seq(0, 1, 0.25), names = TRUE, method = NULL, ...){
	if (any(is.na(probs) | probs<0 | probs>1))
		stop("p outside [0,1]")
  c <- cache(x)
  if (is.null(method)){
    if (is.null(c)){
		method <- "sortqtl"
	}else{
		if (exists("sort", envir=c, inherits=FALSE))
			method <- "sortqtl"
		else if (exists("order", envir=c, inherits=FALSE))
			method <- "orderqtl"
		else
			method <- "sortqtl"
	}
  }
  switch(method
  , sortqtl={
		if (is.null(c) || !exists("sort", c, inherits=FALSE)){
			s <- clone(x)
			na.count <- ramsort(s, na.last=FALSE)
		}else{
			s <- get("sort", c, inherits=FALSE)
			na.count <- get("na.count", c, inherits=FALSE)
		}
		qs <- sortqtl(s, na.count, probs)
    }
  , orderqtl={
		if (is.null(c) || !exists("order", c, inherits=FALSE)){
			o <- seq_along(x)
			na.count <- ramorder(x, o, na.last=FALSE)
		}else{
			o <- get("order", c, inherits=FALSE)
			na.count <- get("na.count", c, inherits=FALSE)
		}
		qs <- orderqtl(x, o, na.count, probs)
    }
  , stop("unknown method", method)
  )
  if (names){
	np <- length(probs)
	dig <- max(2L, getOption("digits"))
	names(qs) <- paste(if (np < 100) 
		formatC(100 * probs, format = "fg", width = 1, digits = dig)
	else format(100 * probs, trim = TRUE, digits = dig), 
		"%", sep = "")
  }
  qs
}


quantile.integer64 <- function(x, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE, type=0L, ...){
	if (type[[1]]!=0L)
		stop("only type==0 ('qtile') supported")
	if (!na.rm && na.count(x)>0)
		stop("missing values not allowed with 'na.rm='==FALSE")
	qtile.integer64(x, probs = probs, na.rm = na.rm, names = names, ...)
}


# adding ... (wish of Kurt Hornik 23.3.2017)
if (is.na(match("...", names(formals(median))))){
	median.integer64 <- function(x, na.rm=FALSE){
		if (!na.rm && na.count(x)>0)
			stop("missing values not allowed with 'na.rm='==FALSE")
		qtile.integer64(x, probs = 0.5, na.rm = na.rm, names = FALSE)
	}
}else{
	median.integer64 <- function(x, na.rm=FALSE, ...){
		if (!na.rm && na.count(x)>0)
			stop("missing values not allowed with 'na.rm='==FALSE")
		qtile.integer64(x, probs = 0.5, na.rm = na.rm, names = FALSE)
	}
}
	
# mean.integer64 <- function(x, na.rm=FALSE){
	# s <- sum(x, na.rm=na.rm)
	# if (!is.na(s)){
		# if (na.rm)
			# s <- s%/%(length(x)-na.count(x))
		# else
			# s <- s%/%length(x)
	# }
	# s
# }
mean.integer64 <- function(x, na.rm=FALSE, ...){
	ret <- .Call(C_mean_integer64, x, as.logical(na.rm), double(1))
	oldClass(ret) <- "integer64"
	ret
}

summary.integer64 <- function (object, ...){
	nas <- na.count(object)
	qq <- quantile(object, na.rm=TRUE)
	qq <- c(qq[1L:3L], mean(object, na.rm=TRUE), qq[4L:5L])
    names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
	if (any(nas)) 
		c(qq, "NA's" = nas)
	else qq
}
truecluster/bit64 documentation built on Sept. 4, 2020, 10:13 a.m.