R/highlevel64.R

Defines functions summary.integer64 mean.integer64 quantile.integer64 qtile.integer64 qtile prank.integer64 prank rank.integer64 tiepos.integer64 tiepos keypos.integer64 keypos as.integer64.factor table.integer64 unipos.integer64 unipos unique.integer64 duplicated.integer64 `%in%.integer64` match.integer64 optimizer64 benchmark64

Documented in as.integer64.factor benchmark64 duplicated.integer64 keypos keypos.integer64 match.integer64 mean.integer64 optimizer64 prank prank.integer64 qtile qtile.integer64 quantile.integer64 rank.integer64 summary.integer64 table.integer64 tiepos tiepos.integer64 unipos unipos.integer64 unique.integer64

# /*
# R-Code for matching and other functions based on hashing
# S3 atomic 64bit integers for R
# (c) 2012 Jens Oehlschägel
# Licence: GPL2
# Provided 'as is', use at your own risk
# Created: 2011-12-11
# Last changed:  2011-12-11
# */

#' Function for measuring algorithmic performance of high-level and low-level integer64 functions
#'
#' @param nsmall size of smaller vector
#' @param nbig size of larger bigger vector
#' @param timefun a function for timing such as [bit::repeat.time()] or [system.time()]
#' @param what a vector of names of high-level functions
#' @param uniorder one of the order parameters that are allowed in [unique.integer64()] and [unipos.integer64()]
#' @param taborder one of the order parameters that are allowed in [table.integer64()]
#' @param plot set to FALSE to suppress plotting
#'
#' @details
#' `benchmark64` compares the following scenarios for the following use cases:
#'
#' | **scenario name** | **explanation**                                 |
#' |------------------:|:------------------------------------------------|
#' |            32-bit | applying Base R function to 32-bit integer data |
#' |            64-bit | applying bit64 function to 64-bit integer data (with no cache) |
#' |         hashcache | ditto when cache contains [hashmap()], see [hashcache()] |
#' |    sortordercache | ditto when cache contains sorting and ordering, see [sortordercache()] |
#' |        ordercache | ditto when cache contains ordering only, see [ordercache()] |
#' |          allcache | ditto when cache contains sorting, ordering and hashing |
#'
#' | **use case name** | **explanation**                         |
#' |------------------:|:----------------------------------------|
#' |             cache | filling the cache according to scenario |
#' |       match(s, b) | match small in big vector               |
#' |          s %in% b | small %in% big vector                   |
#' |       match(b, s) | match big in small vector               |
#' |          b %in% s | big %in% small vector                   |
#' |       match(b, b) | match big in (different) big vector     |
#' |          b %in% b | big %in% (different) big vector         |
#' |     duplicated(b) | duplicated of big vector                |
#' |         unique(b) | unique of big vector                    |
#' |          table(b) | table of big vector                     |
#' |           sort(b) | sorting of big vector                   |
#' |          order(b) | ordering of big vector                  |
#' |           rank(b) | ranking of big vector                   |
#' |       quantile(b) | quantiles of big vector                 |
#' |        summary(b) | summary of of big vector                |
#' |           SESSION | exemplary session involving multiple calls (including cache filling costs) |
#'
#' Note that the timings for the cached variants do _not_ contain the
#'   time costs of building the cache, except for the timing of the exemplary
#'   user session, where the cache costs are included in order to evaluate amortization.
#'
#' @return
#' `benchmark64` returns a matrix with elapsed seconds, different high-level tasks
#'   in rows and different scenarios to solve the task in columns. The last row
#'   named 'SESSION' contains the elapsed seconds of the exemplary sesssion.
#'
#' `optimizer64` returns a dimensioned list with one row for each high-level
#'   function timed and two columns named after the values of the `nsmall` and
#'   `nbig` sample sizes. Each list cell contains a matrix with timings,
#'   low-level-methods in rows and three measurements `c("prep", "both", "use")`
#'   in columns. If it can be measured separately, `prep` contains the timing
#'   of preparatory work such as sorting and hashing, and `use` contains the
#'   timing of using the prepared work. If the function timed does both,
#'   preparation and use, the timing is in `both`.
#'
#' @seealso [integer64()]
#' @examples
#' message("this small example using system.time does not give serious timings\n
#' this we do this only to run regression tests")
#' benchmark64(nsmall=2^7, nbig=2^13, timefun=function(expr)system.time(expr, gcFirst=FALSE))
#' optimizer64(nsmall=2^7, nbig=2^13, timefun=function(expr)system.time(expr, gcFirst=FALSE)
#' , plot=FALSE
#' )
#'\dontrun{
#' message("for real measurement of sufficiently large datasets run this on your machine")
#' benchmark64()
#' optimizer64()
#'}
#' message("let's look at the performance results on Core i7 Lenovo T410 with 8 GB RAM")
#' data(benchmark64.data)
#' print(benchmark64.data)
#'
#' matplot(log2(benchmark64.data[-1, 1]/benchmark64.data[-1, ])
#' , pch=c("3", "6", "h", "s", "o", "a")
#' , xlab="tasks [last=session]"
#' , ylab="log2(relative speed) [bigger is better]"
#' )
#' matplot(t(log2(benchmark64.data[-1, 1]/benchmark64.data[-1, ]))
#' , type="b", axes=FALSE
#' , lwd=c(rep(1, 14), 3)
#' , xlab="context"
#' , ylab="log2(relative speed) [bigger is better]"
#' )
#' axis(1
#' , labels=c("32-bit", "64-bit", "hash", "sortorder", "order", "hash+sortorder")
#' , at=1:6
#' )
#' axis(2)
#' data(optimizer64.data)
#' print(optimizer64.data)
#' oldpar <- par(no.readonly = TRUE)
#' par(mfrow=c(2, 1))
#' par(cex=0.7)
#' for (i in 1:nrow(optimizer64.data)) {
#'  for (j in 1:2) {
#'    tim <- optimizer64.data[[i, j]]
#'   barplot(t(tim))
#'   if (rownames(optimizer64.data)[i]=="match")
#'    title(paste("match", colnames(optimizer64.data)[j], "in", colnames(optimizer64.data)[3-j]))
#'   else if (rownames(optimizer64.data)[i]=="%in%")
#'    title(paste(colnames(optimizer64.data)[j], "%in%", colnames(optimizer64.data)[3-j]))
#'   else
#'    title(paste(rownames(optimizer64.data)[i], colnames(optimizer64.data)[j]))
#'  }
#' }
#' par(mfrow=c(1, 1))
#' @keywords misc
#' @name benchmark64
NULL

#' @describeIn benchmark64 compares high-level integer64 functions against the
#'   integer functions from Base R
#' @export
# nocov start
# nolint start: brace_linter, line_length_linter.
benchmark64 <- function(nsmall=2L^16L, nbig=2L^25L, timefun=repeat.time) {

 message('\ncompare performance for a complete sessions of calls')
 s <- sample(nbig, nsmall, TRUE)
 b <- sample(nbig, nbig, TRUE)
 b2 <- sample(nbig, nbig, TRUE)

 tim1 <- double(6L)
 names(tim1) <- c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache")

 s <- as.integer(s)
 b <- as.integer(b)
 b2 <- as.integer(b2)

 for (i in 1:6) {
  message("\n=== ", names(tim1)[i], " ===")

  if (i==2L) {
   s <- as.integer64(s)
   b <- as.integer64(b)
   b2 <- as.integer64(b2)
  }

  tim1[i] <- 0L

  tim1[i] <- tim1[i] + timefun({
   switch(i,
     NULL, # i=1
     NULL, # i=2
     { hashcache(s); hashcache(b); hashcache(b2) },
     { sortordercache(s); sortordercache(b); sortordercache(b2) },
     { ordercache(s); ordercache(b); ordercache(b2) },
     { hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2) }
   )
  })[3L]

  message('check data range, mean etc.')
  tim1[i] <- tim1[i] + timefun({
   summary(b)
  })[3L]
  message('get all percentiles for plotting distribution shape')
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=seq(0.0, 1.0, 0.01))
  })[3L]
  message('list the upper and lower permille of values')
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=c(0.001, 0.999))
   sort(b, na.last=NA)
  })[3L]
  message('OK, for some of these values I want to see the complete ROW, so I need their positions in the data.frame')
  tim1[i] <- tim1[i] + timefun({
   if (i==1L) order(b) else order.integer64(b)
  })[3L]
  message('check if any values are duplicated')
  tim1[i] <- tim1[i] + timefun({
   anyDuplicated(b)
  })[3L]
  message('since not unique, then check distribution of frequencies')
  tim1[i] <- tim1[i] + timefun({
   if (i==1L) tabulate(table(b, exclude=NULL)) else tabulate(table.integer64(b, return='list')$counts)
  })[3L]
  message("OK, let's plot the percentiles of unique values versus the percentiles allowing for duplicates")
  tim1[i] <- tim1[i] + timefun({
   quantile(b, probs=seq(0.0, 1.0, 0.01))
   quantile(unique(b), probs=seq(0.0, 1.0, 0.01))
  })[3L]
  message('check whether we find a match for each fact in the dimension table')
  tim1[i] <- tim1[i] + timefun({
   all(if (i==1L) b %in% s else "%in%.integer64"(b, s))
  })[3L]
  message('check whether there are any dimension table entries not in the fact table')
  tim1[i] <- tim1[i] + timefun({
   all(if (i==1L) s %in% b else "%in%.integer64"(s, b))
  })[3L]
  message('check whether we find a match for each fact in a parallel fact table')
  tim1[i] <- tim1[i] + timefun({
   all(if (i==1L) b %in% b2 else "%in%.integer64"(b, b2))
  })[3L]
  message('find positions of facts in dimension table for joining')
  tim1[i] <- tim1[i] + timefun({
   if (i==1L) match(b, s) else match.integer64(b, s)
  })[3L]
  message('find positions of facts in parallel fact table for joining')
  tim1[i] <- tim1[i] + timefun({
   if (i==1L) match(b, b2) else match.integer64(b, b2)
  })[3L]
  message('out of curiosity: how well rank-correlated are fact and parallel fact table?')
  tim1[i] <- tim1[i] + timefun({
   if (i==1L) {
    cor(rank(b, na.last="keep"), rank(b2, na.last="keep"), use="na.or.complete")
   } else {
    cor(rank.integer64(b), rank.integer64(b2), use="na.or.complete")
   }
  })[3L]

  remcache(s)
  remcache(b)
  remcache(b2)

  print(round(rbind(seconds=tim1, factor=tim1[1L]/tim1), 3L))

 }

        # 32-bit         64-bit      hashcache sortordercache     ordercache       allcache
       # 196.510          8.963          8.242          5.183         12.325          6.043
        # 32-bit         64-bit      hashcache sortordercache     ordercache       allcache
         # 1.000         21.924         23.842         37.913         15.944         32.519


 message("\nnow let's look more systematically at the components involved")
 s <- sample(nbig, nsmall, TRUE)
 b <- sample(nbig, nbig, TRUE)
 b2 <- sample(nbig, nbig, TRUE)

  tim2 <- matrix(0.0, 15L, 6L)
  dimnames(tim2) <- list(
    c("cache", "match(s, b)", "s %in% b", "match(b, s)", "b %in% s", "match(b, b)", "b %in% b", "duplicated(b)", "unique(b)", "table(b)", "sort(b)", "order(b)", "rank(b)", "quantile(b)", "summary(b)"), # nolint: line_length_linter.
    c("32-bit", "64-bit", "hashcache", "sortordercache", "ordercache", "allcache")
  )

 s <- as.integer(s)
 b <- as.integer(b)
 b2 <- as.integer(b2)

 i <- 1L
 for (i in 1:6) {
  if (i==2L) {
   s <- as.integer64(s)
   b <- as.integer64(b)
   b2 <- as.integer64(b2)
  }

  if (i>2L) message(colnames(tim2)[i], " cache")
  tim2["cache", i] <- timefun({
   switch(i,
     NULL, # i=1
     NULL, # i=2
     { hashcache(s); hashcache(b); hashcache(b2) },
     { sortordercache(s); sortordercache(b); sortordercache(b2) },
     { ordercache(s); ordercache(b); ordercache(b2) },
     { hashcache(s); hashcache(b); hashcache(b2);sortordercache(s); sortordercache(b); sortordercache(b2) }
   )
  })[3L]

  message(colnames(tim2)[i], " match(s, b)")
  tim2["match(s, b)", i] <- timefun({
   if (i==1L) match(s, b) else match.integer64(s, b)
  })[3L]

  message(colnames(tim2)[i], " s %in% b")
  tim2["s %in% b", i] <- timefun({
   if (i==1L) s %in% b else "%in%.integer64"(s, b)
  })[3L]

  message(colnames(tim2)[i], " match(b, s)")
  tim2["match(b, s)", i] <- timefun({
   if (i==1L) match(b, s) else match.integer64(b, s)
  })[3L]

  message(colnames(tim2)[i], " b %in% s")
  tim2["b %in% s", i] <- timefun({
   if (i==1L) b %in% s else "%in%.integer64"(b, s)
  })[3L]

  message(colnames(tim2)[i], " match(b, b)")
  tim2["match(b, b)", i] <- timefun({
   if (i==1L) match(b, b2) else match.integer64(b, b2)
  })[3L]

  message(colnames(tim2)[i], " b %in% b")
  tim2["b %in% b", i] <- timefun({
   if (i==1L) b %in% b2 else "%in%.integer64"(b, b2)
  })[3L]

  message(colnames(tim2)[i], " duplicated(b)")
  tim2["duplicated(b)", i] <- timefun({
   duplicated(b)
  })[3L]

  message(colnames(tim2)[i], " unique(b)")
  tim2["unique(b)", i] <- timefun({
   unique(b)
  })[3L]

  message(colnames(tim2)[i], " table(b)")
  tim2["table(b)", i] <- timefun({
   if (i==1L) table(b) else table.integer64(b, return='list')
  })[3L]

  message(colnames(tim2)[i], " sort(b)")
  tim2["sort(b)", i] <- timefun({
   sort(b)
  })[3L]

  message(colnames(tim2)[i], " order(b)")
  tim2["order(b)", i] <- timefun({
   if (i==1L) order(b) else order.integer64(b)
  })[3L]

  message(colnames(tim2)[i], " rank(b)")
  tim2["rank(b)", i] <- timefun({
   if (i==1L) rank(b) else rank.integer64(b)
  })[3L]

  message(colnames(tim2)[i], " quantile(b)")
  tim2["quantile(b)", i] <- timefun({
   quantile(b)
  })[3L]

  message(colnames(tim2)[i], " summary(b)")
  tim2["summary(b)", i] <- timefun({
   summary(b)
  })[3L]

  remcache(s)
  remcache(b)
  remcache(b2)

  tim3 <- rbind(tim2, SESSION=tim1)
  #tim2 <- tim2[, 1]/tim2

  cat("seconds")
  print(round(tim3, 3L))
  cat("factor")
  print(round(tim3[, 1L]/tim3, 3L))

 }



               # 32-bit 64-bit hashcache sortordercache ordercache allcache
# cache           0.000  0.000     0.775          1.330      6.500    2.660
# match(s, b)     0.820  0.218     0.004          0.025      0.093    0.004
# s %in% b        0.810  0.234     0.003          0.022      0.093    0.003
# match(b, s)     0.450  0.228     0.232          0.224      0.224    0.226
# b %in% s        0.510  0.226     0.224          0.222      0.218    0.222
# match(b, b)     2.370  0.870     0.505          0.890      0.880    0.505
# b %in% b        2.350  0.850     0.480          0.865      0.870    0.483
# duplicated(b)   0.875  0.510     0.141          0.116      0.383    0.117
# unique(b)       0.930  0.555     0.447          0.156      0.427    0.450
# table(b)      110.340  0.725     0.680          0.234      0.575    0.202
# sort(b)         2.440  0.400     0.433          0.072      0.460    0.069
# order(b)       12.780  0.680     0.615          0.036      0.036    0.035
# rank(b)        13.480  0.860     0.915          0.240      0.545    0.246
# quantile(b)     0.373  0.400     0.410          0.000      0.000    0.000
# summary(b)      0.645  0.423     0.427          0.016      0.016    0.016
# TOTAL         149.173  7.179     6.291          4.448     11.320    5.239
              # 32-bit  64-bit hashcache sortordercache ordercache allcache
# cache              1   1.062     0.000          0.000      0.000    0.000
# match(s, b)        1   3.761   230.420         32.475      8.843  217.300
# s %in% b           1   3.462   234.090         36.450      8.735  237.386
# match(b, s)        1   1.974     1.940          2.009      2.009    1.991
# b %in% s           1   2.257     2.277          2.297      2.339    2.297
# match(b, b)        1   2.724     4.693          2.663      2.693    4.693
# b %in% b           1   2.765     4.896          2.717      2.701    4.862
# duplicated(b)      1   1.716     6.195          7.572      2.283    7.500
# unique(b)          1   1.676     2.082          5.972      2.180    2.067
# table(b)           1 152.193   162.265        471.538    191.896  546.238
# sort(b)            1   6.100     5.631         33.822      5.304   35.534
# order(b)           1  18.794    20.780        357.840    354.297  366.950
# rank(b)            1  15.674    14.732         56.167     24.734   54.797
# quantile(b)        1   0.933     0.911        804.907    806.027  810.133
# summary(b)         1   1.524     1.512         39.345     39.345   39.345
# TOTAL              1  20.778    23.712         33.534     13.177   28.476

  tim3
}

#' @describeIn benchmark64 compares for each high-level integer64 function the Base
#'   R integer function with several low-level integer64 functions with and
#'   without caching
#' @export
optimizer64 <- function(nsmall=2L^16L,
                        nbig=2L^25L,
                        timefun=repeat.time,
                        what=c("match", "%in%", "duplicated", "unique", "unipos", "table", "rank", "quantile"),
                        uniorder=c("original", "values", "any"),
                        taborder=c("values", "counts"),
                        plot=TRUE) {
 uniorder <- match.arg(uniorder)
 taborder <- match.arg(taborder)
 ret <- vector("list", 2L*length(what))
 dim(ret) <- c(length(what), 2L)
 dimnames(ret) <- list(what, c(nsmall, nbig))

 if (plot) {
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))
  par(mfrow=c(2L, 1L))
 }

 if ("match" %in% what) {
  message("match: timings of different methods")
  N1 <- c(nsmall, nbig)
  N2 <- c(nbig, nsmall)
  for (i in seq_along(N1)) {
   n1 <- N1[i]
   n2 <- N2[i]
   x1 <- c(sample(n2, n1-1L, TRUE), NA)
   x2 <- c(sample(n2, n2-1L, TRUE), NA)
   tim <- matrix(0.0, 9L, 3L)
   dimnames(tim) <- list(
    c("match", "match.64", "hashpos", "hashrev", "sortorderpos", "orderpos", "hashcache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
    c("prep", "both", "use")
  )

   tim["match", "both"] <- timefun({
    p <- match(x1, x2)
   })[3L]
   x1 <- as.integer64(x1)
   x2 <- as.integer64(x2)

   tim["match.64", "both"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashpos", "prep"] <- timefun({
    h2 <- hashmap(x2)
   })[3L]
   tim["hashpos", "use"] <- timefun({
    p2 <- hashpos(h2, x1)
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashrev", "prep"] <- timefun({
    h1 <- hashmap(x1)
   })[3L]
   tim["hashrev", "use"] <- timefun({
    p1 <- hashrev(h1, x2)
   })[3L]
   stopifnot(identical(p1, p))

   tim["sortorderpos", "prep"] <- system.time({
    s2 <- clone(x2)
    o2 <- seq_along(x2)
    ramsortorder(s2, o2, na.last=FALSE)
   })[3L]
   tim["sortorderpos", "use"] <- timefun({
    p2 <- sortorderpos(s2, o2, x1)
   })[3L]
   stopifnot(identical(p2, p))

   tim["orderpos", "prep"] <- timefun({
    o2 <- seq_along(x2)
    ramorder(x2, o2, na.last=FALSE)
   })[3L]
   tim["orderpos", "use"] <- timefun({
    p2 <- orderpos(x2, o2, x1, method=2L)
   })[3L]
   stopifnot(identical(p2, p))

   hashcache(x2)
   tim["hashcache", "use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   sortordercache(x2)
   tim["sortorder.cache", "use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   ordercache(x2)
   tim["order.cache", "use"] <- timefun({
    p2 <- match.integer64(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   if (plot) {
    barplot(t(tim))
    n <- format(c(n1, n2))
    title(paste("match", n[1L], "in", n[2L]))
   }

   ret[["match", as.character(n1)]] <- tim
  }
 }

 if ("%in%" %in% what) {
  message("%in%: timings of different methods")
  N1 <- c(nsmall, nbig)
  N2 <- c(nbig, nsmall)
  for (i in seq_along(N1)) {
   n1 <- N1[i]
   n2 <- N2[i]
   x1 <- c(sample(n2, n1-1L, TRUE), NA)
   x2 <- c(sample(n2, n2-1L, TRUE), NA)
   tim <- matrix(0.0, 10L, 3L)
   dimnames(tim) <- list(
    c("%in%", "match.64", "%in%.64", "hashfin", "hashrin", "sortfin", "orderfin", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
    c("prep", "both", "use")
  )

   tim["%in%", "both"] <- timefun({
    p <- x1 %in% x2
   })[3L]
   x1 <- as.integer64(x1)
   x2 <- as.integer64(x2)

   tim["match.64", "both"] <- timefun({
    p2 <- match.integer64(x1, x2, nomatch = 0L) > 0L
   })[3L]
   stopifnot(identical(p2, p))

   tim["%in%.64", "both"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2) # this is using the custom version
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashfin", "prep"] <- timefun({
    h2 <- hashmap(x2)
   })[3L]
   tim["hashfin", "use"] <- timefun({
    p2 <- hashfin(h2, x1)
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashrin", "prep"] <- timefun({
    h1 <- hashmap(x1)
   })[3L]
   tim["hashrin", "use"] <- timefun({
    p1 <- hashrin(h1, x2)
   })[3L]
   stopifnot(identical(p2, p))

   tim["sortfin", "prep"] <- timefun({
    s2 <- clone(x2)
    ramsort(s2, na.last=FALSE)
   })[3L]
   tim["sortfin", "use"] <- timefun({
    p2 <- sortfin(s2, x1)
   })[3L]
   stopifnot(identical(p2, p))

   tim["orderfin", "prep"] <- timefun({
    o2 <- seq_along(x2)
    ramorder(x2, o2, na.last=FALSE)
   })[3L]
   tim["orderfin", "use"] <- timefun({
    p2 <- orderfin(x2, o2, x1)
   })[3L]
   stopifnot(identical(p2, p))

   hashcache(x2)
   tim["hash.cache", "use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   sortordercache(x2)
   tim["sortorder.cache", "use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   ordercache(x2)
   tim["order.cache", "use"] <- timefun({
    p2 <- "%in%.integer64"(x1, x2)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x2)

   if (plot) {
    barplot(t(tim))
    n <- format(c(n1, n2))
    title(paste(n[1L], "%in%", n[2L]))
   }

   ret[["%in%", as.character(n1)]] <- tim
  }
 }
 if ("duplicated" %in% what) {
  message("duplicated: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)) {
   n <- N[i]
   x <- c(sample(n, n-1L, TRUE), NA)
   tim <- matrix(0.0, 10L, 3L)
   dimnames(tim) <- list(
    c("duplicated", "duplicated.64", "hashdup", "sortorderdup1", "sortorderdup2", "orderdup1", "orderdup2", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
    c("prep", "both", "use")
  )

   tim["duplicated", "both"] <- timefun({
    p <- duplicated(x)
   })[3L]
   x <- as.integer64(x)

   tim["duplicated.64", "both"] <- timefun({
    p2 <- duplicated(x)
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashdup", "prep"] <- timefun({
    h <- hashmap(x)
   })[3L]
   tim["hashdup", "use"] <- timefun({
    p2 <- hashdup(h)
   })[3L]
   stopifnot(identical(p2, p))

   tim["sortorderdup1", "prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sortorderdup1", "use"] <- timefun({
    p2 <- sortorderdup(s, o, method=1L)
   })[3L]
   stopifnot(identical(p2, p))

   tim["sortorderdup2", "prep"] <- tim["sortorderdup1", "prep"]
   tim["sortorderdup2", "use"] <- timefun({
    p2 <- sortorderdup(s, o, method=2L)
   })[3L]
   stopifnot(identical(p2, p))

   tim["orderdup1", "prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x, o)[1L]
   })[3L]
   tim["orderdup1", "use"] <- timefun({
    p2 <- orderdup(x, o, method=1L)
   })[3L]
   stopifnot(identical(p2, p))

   tim["orderdup2", "prep"] <- tim["orderdup1", "prep"]
   tim["orderdup2", "use"] <- timefun({
    p2 <- orderdup(x, o, method=2L)
   })[3L]
   stopifnot(identical(p2, p))

   hashcache(x)
   tim["hash.cache", "use"] <- timefun({
    p2 <- duplicated(x)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache", "use"] <- timefun({
    p2 <- duplicated(x)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache", "use"] <- timefun({
    p2 <- duplicated(x)
   })[3L]
   stopifnot(identical(p2, p))
   remcache(x)

   if (plot) {
    barplot(t(tim), cex.names=0.7)
    title(paste0("duplicated(", n, ")"))
   }

   ret[["duplicated", as.character(n)]] <- tim
  }
 }
 if ("unique" %in% what) {
  message("unique: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)) {
    n <- N[i]
    x <- c(sample(n, n-1L, TRUE), NA)
    tim <- matrix(0.0, 15L, 3L)
    dimnames(tim) <- list(
      c("unique", "unique.64", "hashmapuni", "hashuni", "hashunikeep", "sortuni", "sortunikeep", "orderuni", "orderunikeep", "hashdup", "sortorderdup", "hash.cache", "sort.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
      c("prep", "both", "use")
    )

   tim["unique", "both"] <- timefun({
    p <- unique(x)
   })[3L]
   x <- as.integer64(x)
   p <- as.integer64(p)
   if (uniorder=="values")
    ramsort(p, na.last=FALSE)

   tim["unique.64", "both"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))

   tim["hashmapuni", "both"] <- timefun({
    p2 <- hashmapuni(x)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["hashuni", "prep"] <- timefun({
    h <- hashmap(x)
    # for(r in 1:r)h <- hashmap(x, nunique=h$nunique)
   })[3L]
   tim["hashuni", "use"] <- timefun({
    p2 <- hashuni(h)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))

   tim["hashunikeep", "prep"] <- tim["hashuni", "prep"]
   tim["hashunikeep", "use"] <- timefun({
    p2 <- hashuni(h, keep.order=TRUE)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["sortuni", "prep"] <- timefun({
    s <- clone(x)
    ramsort(s, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sortuni", "use"] <- timefun({
    p2 <- sortuni(s, nunique)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))

   tim["sortunikeep", "prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sortunikeep", "use"] <- timefun({
    p2 <- sortorderuni(x, s, o, nunique)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["orderuni", "prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x, o)[1L]
   })[3L]
   tim["orderuni", "use"] <- timefun({
    p2 <- orderuni(x, o, nunique)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical.integer64(sort(p2, na.last=FALSE), p))

   tim["orderunikeep", "prep"] <- tim["orderuni", "prep"]
   tim["orderunikeep", "use"] <- timefun({
    p2 <- orderuni(x, o, nunique, keep.order=TRUE)
    nunique <- ordernut(x, o)[1L]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["hashdup", "prep"] <- tim["hashuni", "prep"]
   tim["hashdup", "use"] <- timefun({
    p2 <- x[!hashdup(h)]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))

   tim["sortorderdup", "prep"] <- tim["sortunikeep", "prep"]
   tim["sortorderdup", "use"] <- timefun({
    p2 <- x[!sortorderdup(s, o)]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical.integer64(p2, p))


   hashcache(x)
   tim["hash.cache", "use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   sortcache(x)
   tim["sort.cache", "use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache", "use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache", "use"] <- timefun({
    p2 <- unique(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical.integer64(p2, p))
   remcache(x)

   if (plot) {
    barplot(t(tim), cex.names=0.7)
    title(paste0("unique(", n, ", order=", uniorder, ")"))
   }

   ret[["unique", as.character(n)]] <- tim
  }
 }
 if ("unipos" %in% what) {
  message("unipos: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)) {
    n <- N[i]
    x <- c(sample(n, n-1L, TRUE), NA)
    tim <- matrix(0.0, 14L, 3L)
    dimnames(tim) <- list(
      c("unique", "unipos.64", "hashmapupo", "hashupo", "hashupokeep", "sortorderupo", "sortorderupokeep", "orderupo", "orderupokeep", "hashdup", "sortorderdup", "hash.cache", "sortorder.cache", "order.cache"), # nolint: line_length_linter.
      c("prep", "both", "use")
    )

   tim["unique", "both"] <- timefun({
    unique(x)
   })[3L]
   x <- as.integer64(x)

   tim["unipos.64", "both"] <- timefun({
    p <- unipos(x, order=uniorder)
   })[3L]

   tim["hashmapupo", "both"] <- timefun({
    p2 <- hashmapupo(x)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["hashupo", "prep"] <- timefun({
    h <- hashmap(x)
    # if nunique is small we could re-build the hashmap at a smaller size
    # h <- hashmap(x, nunique=h$nunique)
   })[3L]
   tim["hashupo", "use"] <- timefun({
    p2 <- hashupo(h)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical(sort(p2, na.last=FALSE), sort(p, na.last=FALSE)))

   tim["hashupokeep", "prep"] <- tim["hashupo", "prep"]
   tim["hashupokeep", "use"] <- timefun({
    p2 <- hashupo(h, keep.order=TRUE)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))


   tim["sortorderupo", "prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sortorderupo", "use"] <- timefun({
    p2 <- sortorderupo(s, o, nunique)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical(p2, p))

   tim["sortorderupokeep", "prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sortorderupokeep", "use"] <- timefun({
    p2 <- sortorderupo(s, o, nunique, keep.order=TRUE)
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["orderupo", "prep"] <- timefun({
    o <- seq_along(x)
    ramorder(x, o, na.last=FALSE)
    nunique <- ordernut(x, o)[1L]
   })[3L]
   tim["orderupo", "use"] <- timefun({
    p2 <- orderupo(x, o, nunique)
   })[3L]
   if (uniorder=="values")
    stopifnot(identical(p2, p))

   tim["orderupokeep", "prep"] <- tim["orderupo", "prep"]
   tim["orderupokeep", "use"] <- timefun({
    p2 <- orderupo(x, o, nunique, keep.order=TRUE)
    nunique <- ordernut(x, o)[1L]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["hashdup", "prep"] <- tim["hashupo", "prep"]
   tim["hashdup", "use"] <- timefun({
    p2 <- (1:n)[!hashdup(h)]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   tim["sortorderdup", "prep"] <- tim["sortorderupokeep", "prep"]
   tim["sortorderdup", "use"] <- timefun({
    p2 <- (1:n)[!sortorderdup(s, o)]
   })[3L]
   if (uniorder=="original")
    stopifnot(identical(p2, p))

   hashcache(x)
   tim["hash.cache", "use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)

   sortordercache(x)
   tim["sortorder.cache", "use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)

   ordercache(x)
   tim["order.cache", "use"] <- timefun({
    p2 <- unipos(x, order=uniorder)
   })[3L]
   if (uniorder!="any")
    stopifnot(identical(p2, p))
   remcache(x)

   if (plot) {
    barplot(t(tim), cex.names=0.7)
    title(paste0("unipos(", n, ", order=", uniorder, ")"))
   }

   ret[["unipos", as.character(n)]] <- tim
  }
 }
 if ("table" %in% what) {
  message("table: timings of different methods")
  N <- c(nsmall, nbig)
  for (i in seq_along(N)) {
   n <- N[i]
   x <- c(sample.int(1024L, n-1L, replace=TRUE), NA)
   tim <- matrix(0.0, 13L, 3L)
   dimnames(tim) <- list(c("tabulate", "table", "table.64", "hashmaptab", "hashtab", "hashtab2", "sorttab", "sortordertab", "ordertab", "ordertabkeep"
    , "hash.cache", "sort.cache", "order.cache")
   , c("prep", "both", "use"))

   tim["tabulate", "both"] <- timefun({
    tabulate(x)
   })[3L]

   tim["table", "both"] <- timefun({
    p <- table(x, exclude=NULL)
   })[3L]
   p <- p[-length(p)]

   x <- as.integer64(x)

   tim["table.64", "both"] <- timefun({
    p2 <- table.integer64(x, order=taborder)
   })[3L]
   p2 <- p2[-1L]
   stopifnot(identical(p2, p))

   tim["hashmaptab", "both"] <- timefun({
    p <- hashmaptab(x)
   })[3L]

   tim["hashtab", "prep"] <- timefun({
    h <- hashmap(x)
   })[3L]
   tim["hashtab", "use"] <- timefun({
    p2 <- hashtab(h)
   })[3L]
   stopifnot(identical(p2, p))

   tim["hashtab2", "prep"] <- tim["hashtab", "prep"] + timefun({
    h <- hashmap(x, nunique=h$nunique)
   })[3L]
   tim["hashtab2", "use"] <- timefun({
    p2 <- hashtab(h)
   })[3L]

   sortp <- function(p) {
    s <- p$values
    o <- seq_along(s)
    ramsortorder(s, o, na.last=FALSE)
    list(values=s, counts=p$counts[o])
   }
   p <- sortp(p)
   p2 <- sortp(p2)
   stopifnot(identical(p2, p))

   tim["sorttab", "prep"] <- timefun({
    s <- clone(x)
    ramsort(s, na.last=FALSE)
    nunique <- sortnut(s)[1L]
   })[3L]
   tim["sorttab", "use"] <- timefun({
    p2 <- list(values=sortuni(s, nunique), counts=sorttab(s, nunique))
   })[3L]
   stopifnot(identical(p2, p))

   tim["sortordertab", "prep"] <- timefun({
    s <- clone(x)
    o <- seq_along(x)
    ramsortorder(s, o, na.last=FALSE)
    nunique <- sortnut(s)[1L]
      })[3L]
            tim["sortordertab", "use"] <- timefun({
                p2 <- list(values=sortorderuni(x, s, o, nunique), counts=sortordertab(s, o))
            })[3L]
            p2 <- sortp(p2)
            stopifnot(identical(p2, p))

            tim["ordertab", "prep"] <- timefun({
                o <- seq_along(x)
                ramorder(x, o, na.last=FALSE)
                nunique <- ordernut(x, o)[1L]
            })[3L]
            tim["ordertab", "use"] <- timefun({
                p2 <- list(values=orderuni(x, o, nunique), counts=ordertab(x, o, nunique))
            })[3L]
            stopifnot(identical(p2, p))

            tim["ordertabkeep", "prep"] <- tim["ordertab", "prep"]
            tim["ordertabkeep", "use"] <- timefun({
                p2 <- list(values=orderuni(x, o, nunique, keep.order=TRUE), counts=ordertab(x, o, nunique, keep.order=TRUE))
            })[3L]
            p2 <- sortp(p2)
            stopifnot(identical(p2, p))

            hashcache(x)
            tim["hash.cache", "use"] <- timefun({
                p <- table.integer64(x, order=taborder)
            })[3L]
            remcache(x)

            sortordercache(x)
            tim["sort.cache", "use"] <- timefun({
                p2 <- table.integer64(x, order=taborder)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            ordercache(x)
            tim["order.cache", "use"] <- timefun({
                p2 <- table.integer64(x, order=taborder)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            if (plot) {
                barplot(t(tim), cex.names=0.7)
                title(paste0("table.integer64(", n, ", order=", taborder, ")"))
            }

            ret[["table", as.character(n)]] <- tim
        }
    }
    if ("rank" %in% what) {
        message("rank: timings of different methods")
        N <- c(nsmall, nbig)
        for (i in seq_along(N)) {
            n <- N[i]
            x <- c(sample(n, n-1L, TRUE), NA)
            tim <- matrix(0.0, 7L, 3L)
            dimnames(tim) <- list(c("rank", "rank.keep", "rank.64", "sortorderrnk", "orderrnk"
                , "sort.cache", "order.cache")
            , c("prep", "both", "use"))

            tim["rank", "both"] <- timefun({
                rank(x)
            })[3L]

            tim["rank.keep", "both"] <- timefun({
                p <- rank(x, na.last="keep")
            })[3L]

            x <- as.integer64(x)

            tim["rank.64", "both"] <- timefun({
                p2 <- rank.integer64(x)
            })[3L]
            stopifnot(identical(p2, p))

            tim["sortorderrnk", "prep"] <- timefun({
                s <- clone(x)
                o <- seq_along(x)
                na.count <- ramsortorder(s, o, na.last=FALSE)
            })[3L]
            tim["sortorderrnk", "use"] <- timefun({
                p2 <- sortorderrnk(s, o, na.count)
            })[3L]
            stopifnot(identical(p2, p))

            tim["orderrnk", "prep"] <- timefun({
                o <- seq_along(x)
                na.count <- ramorder(x, o, na.last=FALSE)
            })[3L]
            tim["orderrnk", "use"] <- timefun({
                p2 <- orderrnk(x, o, na.count)
            })[3L]
            stopifnot(identical(p2, p))

            sortordercache(x)
            tim["sort.cache", "use"] <- timefun({
                p2 <- rank.integer64(x)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            ordercache(x)
            tim["order.cache", "use"] <- timefun({
                p2 <- rank.integer64(x)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            if (plot) {
                barplot(t(tim), cex.names=0.7)
                title(paste0("rank.integer64(", n, ")"))
            }

            ret[["rank", as.character(n)]] <- tim
        }
    }
    if ("quantile" %in% what) {
        message("quantile: timings of different methods")
        N <- c(nsmall, nbig)
        for (i in seq_along(N)) {
            n <- N[i]
            x <- c(sample(n, n-1L, TRUE), NA)
            tim <- matrix(0.0, 6L, 3L)
            dimnames(tim) <- list(c("quantile", "quantile.64", "sortqtl", "orderqtl"
                , "sort.cache", "order.cache")
            , c("prep", "both", "use"))

            tim["quantile", "both"] <- timefun({
                p <- quantile(x, type=1L, na.rm=TRUE)
            })[3L]
            p2 <- p
            p <- as.integer64(p2)
            names(p) <- names(p2)

            x <- as.integer64(x)

            tim["quantile.64", "both"] <- timefun({
                p2 <- quantile(x, na.rm=TRUE)
            })[3L]
            stopifnot(identical(p2, p))

            tim["sortqtl", "prep"] <- timefun({
                s <- clone(x)
                na.count <- ramsort(s, na.last=FALSE)
            })[3L]
            tim["sortqtl", "use"] <- timefun({
                p2 <- sortqtl(s, na.count, seq(0.0, 1.0, 0.25))
            })[3L]
            stopifnot(identical(unname(p2), unname(p)))

            tim["orderqtl", "prep"] <- timefun({
                o <- seq_along(x)
                na.count <- ramorder(x, o, na.last=FALSE)
            })[3L]
            tim["orderqtl", "use"] <- timefun({
                p2 <- orderqtl(x, o, na.count, seq(0.0, 1.0, 0.25))
            })[3L]
            stopifnot(identical(unname(p2), unname(p)))

            sortordercache(x)
            tim["sort.cache", "use"] <- timefun({
                p2 <- quantile(x, na.rm=TRUE)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            ordercache(x)
            tim["order.cache", "use"] <- timefun({
                p2 <- quantile(x, na.rm=TRUE)
            })[3L]
            stopifnot(identical(p2, p))
            remcache(x)

            if (plot) {
                barplot(t(tim), cex.names=0.7)
                title(paste0("quantile(", n, ")"))
            }

            ret[["quantile", as.character(n)]] <- tim
        }
    }

    ret

}
# nolint end: brace_linter, line_length_linter.
# nocov end

#' 64-bit integer matching
#'
#' `match` returns a vector of the positions of (first) matches of its first
#'   argument in its second.
#' `%in%` is a more intuitive interface as a binary operator, which returns a
#'   logical vector indicating if there is a match or not for its left operand.
#'
#' @param x integer64 vector: the values to be matched, optionally carrying a
#'   cache created with [hashcache()]
#' @param table integer64 vector: the values to be matched against, optionally
#'   carrying a cache created with [hashcache()] or [sortordercache()]
#' @param nomatch the value to be returned in the case when no match is found.
#'   Note that it is coerced to integer.
#' @param nunique NULL or the number of unique values of table (including NA).
#'   Providing `nunique` can speed-up matching when `table` has no cache. Note
#'   that a wrong nunique can cause undefined behaviour up to a crash.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' These functions automatically choose from several low-level functions
#'   considering the size of `x` and `table` and the availability of caches.
#'
#' Suitable methods for `%in%.integer64` are
#'  - [`hashpos`] (hash table lookup)
#'  - [`hashrev`] (reverse lookup)
#'  - [`sortorderpos`] (fast ordering)
#'  - [`orderpos`] (memory saving ordering).
#'
#' Suitable methods for `match.integer64` are
#'  - [`hashfin`] (hash table lookup)
#'  - [`hashrin`] (reverse lookup)
#'  - [`sortfin`] (fast sorting)
#'  - [`orderfin`] (memory saving ordering).
#'
#' @return
#' A vector of the same length as `x`.
#'
#' `match`: An integer vector giving the position in `table` of
#'   the first match if there is a match, otherwise `nomatch`.
#'
#' If `x[i]` is found to equal `table[j]` then the value
#'   returned in the `i`-th position of the return value is `j`,
#'   for the smallest possible `j`.  If no match is found, the value
#'   is `nomatch`.
#'
#' `%in%`: A logical vector, indicating if a match was located for
#'   each element of `x`: thus the values are `TRUE` or
#'   `FALSE` and never `NA`.
#'
#' @seealso [match()]
#' @examples
#' x <- as.integer64(c(NA, 0:9), 32)
#' table <- as.integer64(c(1:9, NA))
#' match.integer64(x, table)
#' "%in%.integer64"(x, table)
#'
#' x <- as.integer64(sample(c(rep(NA, 9), 0:9), 32, TRUE))
#' table <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' stopifnot(identical(match.integer64(x, table), match(as.integer(x), as.integer(table))))
#' stopifnot(identical("%in%.integer64"(x, table), as.integer(x) %in% as.integer(table)))
#'
#' \dontrun{
#'     library(bit)
#'     message("check when reverse hash-lookup beats standard hash-lookup")
#'     e <- 4:24
#'     timx <- timy <- matrix(NA, length(e), length(e), dimnames=list(e, e))
#'     for (iy in seq_along(e))
#'     for (ix in 1:iy) {
#'         nx <- 2^e[ix]
#'         ny <- 2^e[iy]
#'         x <- as.integer64(sample(ny, nx, FALSE))
#'         y <- as.integer64(sample(ny, ny, FALSE))
#'         #hashfun(x, bits=as.integer(5))
#'         timx[ix, iy] <- repeat.time({
#'         hx <- hashmap(x)
#'         py <- hashrev(hx, y)
#'         })[3]
#'         timy[ix, iy] <- repeat.time({
#'         hy <- hashmap(y)
#'         px <- hashpos(hy, x)
#'         })[3]
#'         #identical(px, py)
#'         print(round(timx[1:iy, 1:iy]/timy[1:iy, 1:iy], 2), na.print="")
#'     }
#'
#'     message("explore best low-level method given size of x and table")
#'     B1 <- 1:27
#'     B2 <- 1:27
#'     tim <- array(NA, dim=c(length(B1), length(B2), 5)
#'  , dimnames=list(B1, B2, c("hashpos", "hashrev", "sortpos1", "sortpos2", "sortpos3")))
#'     for (i1 in B1)
#'     for (i2 in B2)
#'     {
#'       b1 <- B1[i1]
#'       b2 <- B1[i2]
#'       n1 <- 2^b1
#'       n2 <- 2^b2
#'       x1 <- as.integer64(c(sample(n2, n1-1, TRUE), NA))
#'       x2 <- as.integer64(c(sample(n2, n2-1, TRUE), NA))
#'       tim[i1, i2, 1] <- repeat.time({h <- hashmap(x2);hashpos(h, x1);rm(h)})[3]
#'       tim[i1, i2, 2] <- repeat.time({h <- hashmap(x1);hashrev(h, x2);rm(h)})[3]
#'       s <- clone(x2); o <- seq_along(s); ramsortorder(s, o)
#'       tim[i1, i2, 3] <- repeat.time(sortorderpos(s, o, x1, method=1))[3]
#'       tim[i1, i2, 4] <- repeat.time(sortorderpos(s, o, x1, method=2))[3]
#'       tim[i1, i2, 5] <- repeat.time(sortorderpos(s, o, x1, method=3))[3]
#'       rm(s, o)
#'       print(apply(tim, 1:2, function(ti)if(any(is.na(ti)))NA else which.min(ti)))
#'     }
#' }
#' @keywords manip logic
#' @export
match.integer64 <- function(x, table, nomatch = NA_integer_, nunique=NULL, method=NULL, ...) {
  stopifnot(is.integer64(x))
  table <- as.integer64(table)
  cache_env <- cache(table)
  if (is.null(method)) {
    if (is.null(cache_env)) {
            nx <- length(x)
            if (is.null(nunique))
                nunique <- length(table)
            btable <- as.integer(ceiling(log2(nunique*1.5)))
            bx <- as.integer(ceiling(log2(nx*1.5)))
            if (bx<=17L && btable>=16L) {
                method <- "hashrev"
            } else {
                method <- "hashpos"
            }
    } else if (!is.null(cache_env$hashmap)) {
        method <- "hashpos"
    } else if (!is.null(cache_env$sort) && !is.null(cache_env$order) && (length(table)>length(x) || length(x)<4096L)) {
        method <- "sortorderpos"
    } else if (!is.null(cache_env$order) && (length(table)>length(x) || length(x)<4096L)) {
        method <- "orderpos"
    } else {
        nx <- length(x)
        if (is.null(nunique)) {
            if (!is.null(cache_env$nunique))
                nunique <- cache_env$nunique
            else
                nunique <- length(table)
        }
        btable <- as.integer(ceiling(log2(nunique*1.5)))
        bx <- as.integer(ceiling(log2(nx*1.5)))
        if (bx<=17L && btable>=16L) {
            method <- "hashrev"
        } else {
            method <- "hashpos"
        }
    }
  }
  method <- match.arg(method, c("hashpos", "hashrev", "sortorderpos", "orderpos"))
  switch(method,
    hashpos={
      if (is.null(cache_env) || is.null(cache_env$hashmap)) {
        if (exists("btable", inherits=FALSE)) {
          h <- hashmap(table, hashbits=btable)
        } else {
          if (is.null(nunique))
            nunique <- cache_env$nunique
          h <- hashmap(table, nunique=nunique)
        }
      } else {
        h <- cache_env
      }
      p <- hashpos(h, x, nomatch=nomatch)
    },
    hashrev={
      cache_env <- cache(x)
      if (is.null(cache_env) || is.null(cache_env$hashmap)) {
        if (exists("bx", inherits=FALSE)) {
          h <- hashmap(x, bits=bx)
        } else {
          if (is.null(nunique))
            nunique <- cache_env$nunique
          h <- hashmap(x, nunique=nunique)
        }
      } else {
        h <- cache_env
      }
      p <- hashrev(h, table, nomatch=nomatch)
    },
    sortorderpos={
      if (is.null(cache_env) || !exists("sort", cache_env) || !exists("order", cache_env)) {
        s <- clone(table)
        o <- seq_along(s)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env)
        o <- get("order", cache_env)
      }
      p <- sortorderpos(s, o, x, nomatch=nomatch)
    },
    orderpos={
      if (is.null(cache_env) || !exists("order", cache_env)) {
        o <- seq_along(s)
        ramorder(table, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env)
      }
      p <- orderpos(table, o, x, nomatch=nomatch)
    }
  )
  p
}

#' @rdname match.integer64
#' @export
`%in%.integer64` <- function(x, table, ...) {
  stopifnot(is.integer64(x))
  table <- as.integer64(table)
  nunique <- NULL
  cache_env <- cache(table)
  if (is.null(cache_env)) {
    nx <- length(x)
    if (is.null(nunique))
      nunique <- length(table)
    btable <- as.integer(ceiling(log2(nunique*1.5)))
    bx <- as.integer(ceiling(log2(nx*1.5)))
    if (bx<=17L && btable>=16L) {
      method <- "hashrin"
    } else {
      method <- "hashfin"
    }
  } else if (!is.null(cache_env$hashmap)) {
    method <- "hashfin"
  } else if (!is.null(cache_env$sort) && (length(table)>length(x) || length(x)<4096L)) {
    method <- "sortfin"
  } else if (!is.null(cache_env$order) && (length(table)>length(x) || length(x)<4096L)) {
    method <- "orderfin"
  } else {
    nx <- length(x)
    if (is.null(nunique)) {
      if (!is.null(cache_env$nunique))
        nunique <- cache_env$nunique
      else
        nunique <- length(table)
    }
    btable <- as.integer(ceiling(log2(nunique*1.5)))
    bx <- as.integer(ceiling(log2(nx*1.5)))
    if (bx<=17L && btable>=16L) {
      method <- "hashrin"
    } else {
      method <- "hashfin"
    }
  }
  method <- match.arg(method, c("hashfin", "hashrin", "sortfin", "orderfin"))
  switch(method,
    hashfin={
      if (is.null(cache_env) || is.null(cache_env$hashmap)) {
        if (exists("btable", inherits=FALSE)) {
          h <- hashmap(table, hashbits=btable)
        } else {
          if (is.null(nunique))
            nunique <- cache_env$nunique
          h <- hashmap(table, nunique=nunique)
        }
      } else {
        h <- cache_env
      }
      p <- hashfin(h, x)
    },
    hashrin={
      cache_env <- cache(x)
      if (is.null(cache_env) || is.null(cache_env$hashmap)) {
        if (exists("bx", inherits=FALSE)) {
          h <- hashmap(x, bits=bx)
        } else {
          if (is.null(nunique))
            nunique <- cache_env$nunique
          h <- hashmap(x, nunique=nunique)
        }
      } else {
        h <- cache_env
      }
      p <- hashrin(h, table)
    },
    sortfin={
      if (is.null(cache_env) || !exists("sort", cache_env)) {
        s <- clone(table)
        ramsort(s, na.last=FALSE)
      } else {
        s <- get("sort", cache_env)
      }
      p <- sortfin(s, x)
    },
    orderfin={
      if (is.null(cache_env) || !exists("order", cache_env)) {
        o <- seq_along(s)
        ramorder(table, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env)
      }
      p <- orderfin(table, o, x)
    }
  )
  p
}

#' Determine Duplicate Elements of integer64
#'
#' `duplicated()` determines which elements of a vector or data frame are duplicates
#'   of elements with smaller subscripts, and returns a logical vector
#'   indicating which elements (rows) are duplicates.
#'
#' @param x a vector or a data frame or an array or `NULL`.
#' @param incomparables ignored
#' @param nunique NULL or the number of unique values (including NA). Providing
#'   `nunique` can speed-up matching when `x` has no cache. Note that a wrong
#'   `nunique` can cause undefined behaviour up to a crash.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`hashdup`] (hashing)
#'  - [`sortorderdup`] (fast ordering)
#'  - [`orderdup`] (memory saving ordering).
#'
#' @return `duplicated()`: a logical vector of the same length as `x`.
#' @seealso [duplicated()], [unique.integer64()]
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' duplicated(x)
#'
#' stopifnot(identical(duplicated(x),  duplicated(as.integer(x))))
#' @keywords logic manip
#' @export
duplicated.integer64 <- function(x, incomparables = FALSE, nunique = NULL, method = NULL, ...) {
  stopifnot(identical(incomparables, FALSE))
  cache_env <- cache(x)
  if (is.null(nunique) && !is.null(cache_env))
    nunique <- cache_env$nunique
  if (is.null(method)) {
    if (is.null(cache_env)) {
      if (length(x)>50000000L)
        method <- "sortorderdup" # nocov. Too large for practical unit tests.
      else
        method <- "hashdup"
    } else if (!is.null(cache_env$sort) && !is.null(cache_env$order)) {
      method <- "sortorderdup"
    } else if (!is.null(cache_env$hashmap)) {
      method <- "hashdup"
    } else if (!is.null(cache_env$order)) {
      method <- "orderdup"
    } else if (length(x) > 50000000L) {
      method <- "sortorderdup"
    } else {
      method <- "hashdup"
    }
  }
  method <- match.arg(method, c("hashdup", "sortorderdup", "orderdup"))
  switch(method,
    hashdup={
      if (is.null(cache_env) || is.null(cache_env$hashmap))
        h <- hashmap(x, nunique=nunique)
      else
        h <- cache_env
      p <- hashdup(h)
    },
    sortorderdup={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(s)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
      }
      p <- sortorderdup(s, o)
    },
    orderdup={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(s)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      p <- orderdup(x, o)
    }
  )
  p
}

#' Extract Unique Elements from integer64
#'
#' `unique` returns a vector like `x` but with duplicate elements/rows removed.
#'
#' @param x a vector or a data frame or an array or `NULL`.
#' @param incomparables ignored
#' @param order The order in which unique values will be returned, see details
#' @param nunique NULL or the number of unique values (including NA). Providing
#'   `nunique` can speed-up matching when `x` has no cache. Note that a wrong
#'   `nunique`` can cause undefined behaviour up to a crash.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`hashmapuni`] (simultaneously creating and using a hashmap)
#'  - [`hashuni`] (first creating a hashmap then using it)
#'  - [`sortuni`] (fast sorting for sorted order only)
#'  - [`sortorderuni`] (fast ordering for original order only)
#'  - [`orderuni`] (memory saving ordering).
#'
#' The default `order="original"` returns unique values in the order of the
#'   first appearance in `x` like in [unique()], this costs extra processing.
#'   `order="values"` returns unique values in sorted order like in [table()],
#'   this costs extra processing with the hash methods but comes for free.
#'   `order="any"` returns unique values in undefined order, possibly faster.
#'   For hash methods this will be a quasi random order, for sort methods this
#'   will be sorted order.
#'
#' @return For a vector, an object of the same type of `x`, but with only
#'   one copy of each duplicated element.  No attributes are copied (so
#'   the result has no names).
#'
#' @seealso [unique()] for the generic, [unipos()] which gives the indices
#'   of the unique elements and [table.integer64()] which gives frequencies
#'   of the unique elements.
#'
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' unique(x)
#' unique(x, order="values")
#'
#' stopifnot(identical(unique(x),  x[!duplicated(x)]))
#' stopifnot(identical(unique(x),  as.integer64(unique(as.integer(x)))))
#' stopifnot(identical(unique(x, order="values")
#' ,  as.integer64(sort(unique(as.integer(x)), na.last=FALSE))))
#'
#' @keywords manip logic
#' @export
unique.integer64 <- function(x,
                             incomparables=FALSE,
                             order=c("original", "values", "any"),
                             nunique=NULL,
                             method=NULL,
                             ...) {
  stopifnot(identical(incomparables, FALSE))
  order <- match.arg(order)
  cache_env <- cache(x)
  keep.order <- order == "original"
  if (is.null(nunique) && !is.null(cache_env))
    nunique <- cache_env$nunique
  if (is.null(method)) {
    if (is.null(cache_env)) {
        if (order=="values")
            method <- "sortuni"
        else
            method <- "hashmapuni"
    } else {
      switch(order,
        original = {
          if (!is.null(cache_env$hashmap))
            method <- "hashuni"
          else if (!is.null(cache_env$order)) {
            if (!is.null(cache_env$sort))
              method <- "sortorderuni"
            else
              method <- "orderuni"
          } else {
            method <- "hashmapuni"
          }
        },
        values = {
          if (!is.null(cache_env$sort))
            method <- "sortuni"
          else if (!is.null(cache_env$order))
            method <- "orderuni"
          else if (!is.null(cache_env$hashmap) && cache_env$nunique<length(x)/2L)
            method <- "hashuni"
          else
            method <- "sortuni"
        },
        any = {
          if (!is.null(cache_env$sort))
            method <- "sortuni"
          else if (!is.null(cache_env$hashmap))
            method <- "hashuni"
          else if (!is.null(cache_env$order))
            method <- "orderuni"
          else
            method <- "sortuni"
        }
      )
    }
  }
  method <- match.arg(method, c("hashmapuni", "hashuni", "sortuni", "sortorderuni", "orderuni"))
  switch(method,
    hashmapuni={
      p <- hashmapuni(x, nunique=nunique)
    },
    hashuni={
      if (is.null(cache_env) || is.null(cache_env$hashmap))
        h <- hashmap(x, nunique=nunique)
      else
        h <- cache_env
      p <- hashuni(h, keep.order=keep.order)
      if (order=="values")
        ramsort(p, na.last=FALSE)
    },
    sortuni={
      if (is.null(cache_env) || is.null(cache_env$sort)) {
        s <- clone(x)
        ramsort(s, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- sortnut(s)[1L]
      p <- sortuni(s, nunique)
    },
    sortorderuni={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(x)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- sortnut(s)[1L]
      p <- sortorderuni(x, s, o, nunique)
    },
    orderuni={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- ordernut(x, o)[1L]
      p <- orderuni(x, o, nunique, keep.order=keep.order)
    }
  )
  p
}

#' Extract Positions of Unique Elements
#'
#' `unipos` returns the positions of those elements returned by [unique()].
#'
#' @param x a vector or a data frame or an array or `NULL`.
#' @param incomparables ignored
#' @param order The order in which positions of unique values will be returned,
#'   see details
#' @param nunique NULL or the number of unique values (including NA). Providing
#'   `nunique` can speed-up when `x` has no cache. Note that a wrong `nunique`
#'   can cause undefined behaviour up to a crash.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`hashmapupo`] (simultaneously creating and using a hashmap)
#'  - [`hashupo`] (first creating a hashmap then using it)
#'  - [`sortorderupo`] (fast ordering)
#'  - [`orderupo`] (memory saving ordering).
#'
#' The default `order="original"` collects unique values in the order of
#'   the first appearance in `x` like in [unique()], this costs extra processing.
#'   `order="values"` collects unique values in sorted order like in [table()],
#'   this costs extra processing with the hash methods but comes for free.
#'   `order="any"` collects unique values in undefined order, possibly faster.
#'   For hash methods this will be a quasi random order, for sort methods this
#'   will be sorted order.
#'
#' @return an integer vector of positions
#' @seealso [unique.integer64()] for unique values and [match.integer64()]
#'   for general matching.
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' unipos(x)
#' unipos(x, order="values")
#'
#' stopifnot(identical(unipos(x),  (1:length(x))[!duplicated(x)]))
#' stopifnot(identical(unipos(x),  match.integer64(unique(x), x)))
#' stopifnot(identical(unipos(x, order="values"),  match.integer64(unique(x, order="values"), x)))
#' stopifnot(identical(unique(x),  x[unipos(x)]))
#' stopifnot(identical(unique(x, order="values"),  x[unipos(x, order="values")]))
#'
#' @keywords manip logic
#' @export
unipos <- function(x, incomparables = FALSE, order = c("original", "values", "any"), ...) UseMethod("unipos")

#' @rdname unipos
#' @export
unipos.integer64 <- function(x,
                             incomparables=FALSE,
                             order=c("original", "values", "any"),
                             nunique=NULL,
                             method=NULL,
                             ...) {
  stopifnot(identical(incomparables, FALSE))
  order <- match.arg(order)
  cache_env <- cache(x)
  keep.order <- order == "original"
  if (is.null(nunique) && !is.null(cache_env))
    nunique <- cache_env$nunique
  if (is.null(method)) {
    if (is.null(cache_env)) {
      if (order=="values")
        method <- "sortorderupo"
      else
        method <- "hashmapupo"
    } else {
      switch(order,
        original = {
          if (!is.null(cache_env$hashmap))
            method <- "hashupo"
          else if (!is.null(cache_env$order)) {
            if (!is.null(cache_env$sort))
              method <- "sortorderupo"
            else
              method <- "orderupo"
          } else {
            method <- "hashmapupo"
          }
        },
        values = {
          if (!is.null(cache_env$order)) {
            if (!is.null(cache_env$sort))
              method <- "sortorderupo"
            else
              method <- "orderupo"
          } else if (!is.null(cache_env$hashmap) && cache_env$nunique<length(x)/2L) {
            method <- "hashupo"
          } else {
            method <- "sortorderupo"
          }
        },
        any = {
          if (!is.null(cache_env$sort) && !is.null(cache_env$order))
            method <- "sortorderupo"
          else if (!is.null(cache_env$hashmap))
            method <- "hashupo"
          else if (!is.null(cache_env$order))
            method <- "orderupo"
          else
            method <- "sortorderupo"
        }
      )
    }
  }
  method <- match.arg(method, c("hashmapupo", "hashupo", "sortorderupo", "orderupo"))
  switch(method,
    hashmapupo={
      p <- hashmapupo(x, nunique=nunique)
    },
    hashupo={
      if (is.null(cache_env) || is.null(cache_env$hashmap))
        h <- hashmap(x, nunique=nunique)
      else
        h <- cache_env
      p <- hashupo(h, keep.order=keep.order)
      if (order == "values") {
        s <- x[p]
        ramsortorder(s, p, na.last=FALSE)
      }
    },
    sortorderupo={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(x)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- sortnut(s)[1L]
      p <- sortorderupo(s, o, nunique, keep.order=keep.order)
    },
    orderupo={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- ordernut(x, o)[1L]
      p <- orderupo(x, o, nunique, keep.order=keep.order)
    }
  )
  p
}

#' Cross Tabulation and Table Creation for integer64
#'
#' `table.integer64` uses the cross-classifying integer64 vectors to build a
#'   contingency table of the counts at each combination of vector values.
#'
#' @param ... one or more objects which can be interpreted as factors
#'   (including character strings), or a list (or data frame) whose
#'   components can be so interpreted.  (For `as.table` and `as.data.frame`,
#'   arguments passed to specific methods.)
#' @param nunique NULL or the number of unique values of table (including NA).
#'   Providing `nunique` can speed-up matching when `table` has no cache. Note
#'   that a wrong `nunique` can cause undefined behaviour up to a crash.
#' @param order By default results are created sorted by "values", or by "counts"
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param return choose the return format, see details
#' @param dnn the names to be given to the dimensions in the result
#'   (the _dimnames names_).
#' @param deparse.level controls how the default `dnn` is constructed. See Details.
#'
#' @details
#' This function automatically chooses from several low-level functions considering
#'   the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`hashmaptab`] (simultaneously creating and using a hashmap)
#'  - [`hashtab`] (first creating a hashmap then using it)
#'  - [`sortordertab`] (fast ordering)
#'  - [`ordertab`] (memory saving ordering).
#'
#' If the argument `dnn` is not supplied, the internal function
#'   `list.names` is called to compute the 'dimname names'.  If the
#'   arguments in `...` are named, those names are used.  For the
#'   remaining arguments, `deparse.level = 0` gives an empty name,
#'   `deparse.level = 1` uses the supplied argument if it is a symbol,
#'   and `deparse.level = 2` will deparse the argument.
#'
#' Arguments `exclude`, `useNA`, are not supported, i.e. `NA`s are always tabulated,
#'   and, different from [table()] they are sorted first if `order="values"`.
#'
#' @return By default (with `return="table"`) [table()] returns a
#'   _contingency table_, an object of class `"table"`, an array of integer values.
#'   Note that unlike S the result is always an array, a 1D array if one factor is
#'   given. Note also that for multidimensional arrays this is a _dense_ return
#'   structure which can dramatically increase RAM requirements (for large arrays
#'   with high mutual information, i.e. many possible input combinations of which
#'   only few occur) and that [table()] is limited to `2^31` possible combinations
#'   (e.g. two input vectors with 46340 unique values only). Finally note that the
#'   tabulated values or value-combinations are represented as `dimnames` and that
#'   the implied conversion of values to strings can cause _severe_ performance
#'   problems since each string needs to be integrated into R's global string cache.
#'
#' You can use the other `return=` options to cope with these problems, the potential
#'   combination limit is increased from `2^31` to `2^63` with these options, RAM is
#'   only required for observed combinations and string conversion is avoided.
#'
#' With `return="data.frame"` you get a _dense_ representation as a [data.frame()]
#'   (like that resulting from `as.data.frame(table(...))`) where only observed
#'   combinations are listed (each as a data.frame row) with the corresponding
#'   frequency counts (the latter as component named by `responseName`). This is
#'   the inverse of [xtabs()].
#'
#' With `return="list"` you also get a _dense_ representation as a simple
#'   [list()] with components
#'  - `values` a integer64 vector of the technically tabulated values, for 1D this
#'    is the tabulated values themselves, for kD these are the values representing
#'    the potential combinations of input values
#'  - `counts` the frequency counts
#'  - `dims` only for kD: a list with the vectors of the unique values of the
#'    input dimensions
#'
#' @note Note that by using [as.integer64.factor()] we can also input
#'   factors into `table.integer64` -- only the [levels()] get lost.
#'
#' @seealso [table()] for more info on the standard version coping with Base R's
#'   data types, [tabulate()] which can faster tabulate [`integer`]s with a limited
#'   range `[1L .. nL not too big]`, [unique.integer64()] for the unique values
#'   without counting them and [unipos.integer64()] for the positions of the unique values.
#'
#' @examples
#' message("pure integer64 examples")
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' y <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' z <- sample(c(rep(NA, 9), letters), 32, TRUE)
#' table.integer64(x)
#' table.integer64(x, order="counts")
#' table.integer64(x, y)
#' table.integer64(x, y, return="data.frame")
#'
#' message("via as.integer64.factor we can use 'table.integer64' also for factors")
#' table.integer64(x, as.integer64(as.factor(z)))
#' @keywords category
#' @concept counts
#' @concept frequencies
#' @concept occurrences
#' @concept contingency table
#' @export
table.integer64 <- function(...,
                            return = c("table", "data.frame", "list"),
                            order = c("values", "counts"),
                            nunique = NULL,
                            method = NULL,
                            dnn = list.names(...),
                            deparse.level = 1L) {
  order <- match.arg(order)
  return <- match.arg(return)
  # this is taken from 'table'
  list.names <- function(...) {
    l <- as.list(substitute(list(...)))[-1L]
    nm <- names(l)
    fixup <- if (is.null(nm))
      seq_along(l)
    else nm == ""
    dep <- vapply(
      l[fixup],
      function(x) switch(deparse.level + 1L, "", if (is.symbol(x)) as.character(x) else "", deparse(x, nlines=1L)[1L]),
      ""
    )
    if (is.null(nm)) {
      dep
    } else {
      nm[fixup] <- dep
      nm
    }
  }

  # COPY ON MODIFY is broken for reading from list(...)
  # because list(...) creates a copy of all ... and this invalidates our caches
  # therefore we go this sick workaround
  argsymbols <- as.list(substitute(list(...)))[-1L]
  argframe <- parent.frame()
  A <- function(i) eval(argsymbols[[i]], argframe)
  N <- length(argsymbols)
  if (!N)
      stop("nothing to tabulate")
  if (N == 1L && is.list(A(1L))) {
    args <- A(1L) # nolint: object_overwrite_linter. This code should probably be refactored anyway.
    if (length(dnn) != length(args))
      # TODO(R>=4.4.0): names(args) %||% paste(dnn[1L], seq_along(args), sep=".")
      dnn <- if (!is.null(argn <- names(args))) argn else paste(dnn[1L], seq_along(args), sep=".")
    N <- length(args)
    A <- function(i) args[[i]]
  }
  force(dnn)

  if (N==1L) {
    x <- A(1L)
    if (!is.integer64(x)) {
      warning("coercing first argument to integer64")
      x <- as.integer64(x)
    }
  } else {
    a <- A(1L)
    n <- length(a)
    nu <- integer(N)
    d <- integer64(N+1L); d[[1L]] <- 1L
    dims <- vector("list", N)
    names(dims) <- dnn
    for (i in 1:N) {
      a <- A(i)
      if (length(a) != n)
        stop("all input vectors must have the same length")
      if (!is.integer64(a)) {
        warning("coercing argument ", i, " to integer64")
        a <- as.integer64(a)
      }
      cache_env <- cache(a)
      if (is.null(cache_env$order)) {
        s <- clone(a)
        o <- seq_along(s)
        ramsortorder(s, o)
        nu[[i]] <- sortnut(s)[["nunique"]]
      } else if (is.null(cache_env$sort)) {
        o <- cache_env$order
        s <- a[o]
        nu[[i]] <- cache_env$nunique
      } else {
        o <- cache_env$order
        s <- cache_env$sort
        nu[[i]] <- cache_env$nunique
      }
      d[[i+1L]] <- d[[i]] * nu[[i]]
      if (is.na(d[[i+1L]]))
        stop("attempt to make a table from more than >= 2^63 hypothetical combinations")
      dims[[i]] <- sortuni(s, nu[[i]])
      if (i==1L)
        x <- sortorderkey(s, o) - 1L
      else
        x <- x + d[[i]] * (sortorderkey(s, o) - 1L)
    }
  }
  cache_env <- cache(x)
  if (is.null(nunique) && !is.null(cache_env))
    nunique <- cache_env$nunique
  if (is.null(method)) {
    if (is.null(cache_env)) {
      if (order=="values" && (is.null(nunique) || nunique>65536L))
        method <- "sorttab"
      else
        method <- "hashmaptab"
    } else {
      # nolint next: unnecessary_nesting_linter. Good parallelism.
      if (order=="values") {
        if (!is.null(cache_env$sort))
          method <- "sorttab"
        else if (!is.null(cache_env$hashmap) && cache_env$nunique<sqrt(length(x)))
          method <- "hashtab"
        else if (!is.null(cache_env$order))
          method <- "ordertab"
        else
          method <- "sorttab"
      } else { # order = "counts"
        # nolint next: unnecessary_nesting_linter. Good parallelism.
        if (!is.null(cache_env$hashmap))
          method <- "hashtab"
        else if (!is.null(cache_env$sort))
          method <- "sorttab"
        else if (!is.null(cache_env$order))
          method <- "ordertab"
        else
          method <- "hashmaptab"
      }
    }
  }
  method <- match.arg(method, c("hashmaptab", "hashtab", "sorttab", "ordertab"))
  switch(method,
    hashmaptab={
      tmp <- hashmaptab(x, nunique=nunique)
      cnt <- tmp$counts
      val <- tmp$values
      rm(tmp)
    },
    hashtab={
      if (is.null(cache_env) || is.null(cache_env$hashmap))
        h <- hashmap(x, nunique=nunique)
      else
        h <- cache_env
      tmp <- hashtab(h, keep.order=FALSE)
      cnt <- tmp$counts
      val <- tmp$values
      rm(tmp)
    },
    sorttab={
      if (is.null(cache_env) || is.null(cache_env$sort)) {
        s <- clone(x)
        ramsort(s, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- sortnut(s)[1L]
      val <- sortuni(s, nunique)
      cnt <- sorttab(s, nunique)
    },
    ordertab={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nunique))
        nunique <- ordernut(x, o)[1L]
      val <- orderuni(x, o, nunique, keep.order=FALSE)
      cnt <- ordertab(x, o, nunique, keep.order=FALSE)
      rm(o)
    }
  )
  if (order=="values") {
    if (startsWith(method, "hash")) {
      o <- seq_along(val)
      ramsortorder(val, o, na.last=FALSE)
      cnt <- cnt[o]
    }
  } else {
    # xx workaround until we have implemented ramsort.integer
    o <- sort.list(cnt, na.last=NA, method="quick")
    cnt <- cnt[o]
    # o <- seq_along(cnt)
    # ramsortorder(cnt, o, na.last=FALSE)
    val <- val[o]
  }

  ## attaching names is extremely expensive with many unique values, doing this only for compatibility with 'table' here
  switch(return,
    table = {
      if (N == 1L) {
        attr(cnt, "dim") <- length(cnt)
        dn <- list(as.character(val))
        names(dn) <- dnn[1L]
        attr(cnt, "dimnames") <- dn
      } else {
        a <- array(0L, dim=nu, dimnames=lapply(dims, as.character))
        a[as.integer(val)+1L] <- as.integer(cnt)
        cnt <- a
      }
      oldClass(cnt) <- "table"
    },
    data.frame = {
      if (N==1L) {
        cnt <- data.frame(values=val, Freq=cnt)
        names(cnt)[[1L]] <- dnn[1L]
      } else {
        for (i in N:1) {
          w <- val %/% d[[i]]
          val <- val - d[[i]]*w
          dims[[i]] <- dims[[i]][as.integer(w)+1L]
        }
        cnt <- data.frame(dims, Freq=cnt)
      }
    },
    list = {
      if (N == 1L)
        cnt <- list(values=val, counts=cnt)
      else
        cnt <- list(values=val, counts=cnt, dims=dims)
    }
  )
  cnt
}

as.integer64.factor <- function(x, ...) as.integer64(unclass(x))

#' Extract Positions in redundant dimension table
#'
#' `keypos` returns the positions of the (fact table) elements that participate
#'   in their sorted unique subset (dimension table)
#'
#' @param x a vector or a data frame or an array or `NULL`.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' NAs are sorted first in the dimension table, see [ramorder.integer64()].
#'
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`sortorderkey`] (fast ordering)
#'  - [`orderkey`] (memory saving ordering).
#'
#' @return an integer vector of the same length as `x` containing positions
#'   relative to `sort(unique(x), na.last=FALSE)`
#' @seealso [unique.integer64()] for the unique subset and [match.integer64()]
#'   for finding positions in a different vector.
#'
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' keypos(x)
#'
#' stopifnot(identical(keypos(x),  match.integer64(x, sort(unique(x), na.last=FALSE))))
#' @keywords manip univar
#' @export
keypos <- function(x, ...) UseMethod("keypos")

#' @rdname keypos
#' @export
keypos.integer64 <- function(x, method = NULL, ...) {
  cache_env <- cache(x)
  if (is.null(method)) {
    if (is.null(cache_env)) {
      method <- "sortorderkey"
    } else if (!is.null(cache_env$order)) {
      if (!is.null(cache_env$sort))
        method <- "sortorderkey"
      else
        method <- "orderkey"
    } else {
      method <- "sortorderkey"
    }
  }
  method <- match.arg(method, c("sortorderkey", "orderkey"))
  switch(method,
    sortorderkey={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(x)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
      }
      p <- sortorderkey(s, o)
    },
    orderkey={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      p <- orderkey(x, o)
    }
  )
  p
}

#' Extract Positions of Tied Elements
#'
#' `tiepos` returns the positions of those elements that participate in ties.
#'
#' @param x a vector or a data frame or an array or `NULL`.
#' @param nties NULL or the number of tied values (including NA). Providing
#'   `nties` can speed-up when `x` has no cache. Note that a wrong nties can
#'   cause undefined behaviour up to a crash.
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`sortordertie`] (fast ordering)
#'  - [`ordertie`] (memory saving ordering).
#'
#' @return an integer vector of positions
#' @seealso [rank.integer64()] for possibly tied ranks and [unipos.integer64()]
#'   for positions of unique values.
#'
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' tiepos(x)
#'
#' stopifnot(identical(tiepos(x),  (1:length(x))[duplicated(x) | rev(duplicated(rev(x)))]))
#' @keywords manip univar
#' @export
tiepos <- function(x, ...) UseMethod("tiepos")

#' @rdname tiepos
#' @export
tiepos.integer64 <- function(x, nties = NULL, method = NULL, ...) {
  cache_env <- cache(x)
  if (is.null(nties) && !is.null(cache_env))
    nties <- cache_env$nties
  if (is.null(method)) {
    if (is.null(cache_env)) {
      method <- "sortordertie"
    } else if (!is.null(cache_env$order)) {
      if (!is.null(cache_env$sort))
        method <- "sortordertie"
      else
        method <- "ordertie"
    } else {
      method <- "sortordertie"
    }
  }
  method <- match.arg(method, c("sortordertie", "ordertie"))
  switch(method,
    sortordertie={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(x)
        ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nties))
        nties <- sortnut(s)[2L]
      p <- sortordertie(s, o, nties)
    },
    ordertie={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
      }
      if (is.null(nties))
        nties <- ordernut(x, o)[2L]
      p <- ordertie(x, o, nties)
    }
  )
  p
}

#' Sample Ranks from integer64
#'
#' Returns the sample ranks of the values in a vector.  Ties (i.e., equal
#'   values) are averaged and missing values propagated.
#'
#' @param x a integer64 vector
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#' Suitable methods are
#'  - [sortorderrnk()] (fast ordering)
#'  - [orderrnk()] (memory saving ordering).
#'
#' @return A numeric vector of the same length as `x`.
#' @seealso [order.integer64()], [rank()] and [prank()] for percent rank.
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' rank.integer64(x)
#'
#' stopifnot(identical(rank.integer64(x),  rank(as.integer(x)
#' , na.last="keep", ties.method = "average")))
#'
#' @keywords univar
#' @export
rank.integer64 <- function(x, method = NULL, ...) {
  cache_env <- cache(x)
  if (is.null(method)) {
    if (is.null(cache_env)) {
      method <- "sortorderrnk"
    } else if (!is.null(cache_env$order)) {
      if (!is.null(cache_env$sort))
        method <- "sortorderrnk"
      else
        method <- "orderrnk"
    } else {
      method <- "sortorderrnk"
    }
  }
  method <- match.arg(method, c("sortorderrnk", "orderrnk"))
  switch(method,
    sortorderrnk={
      if (is.null(cache_env) || is.null(cache_env$sort) || is.null(cache_env$order)) {
        s <- clone(x)
        o <- seq_along(x)
        na.count <- ramsortorder(s, o, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        o <- get("order", cache_env, inherits=FALSE)
        na.count <- get("na.count", cache_env, inherits=FALSE)
      }
      p <- sortorderrnk(s, o, na.count)
    },
    orderrnk={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        na.count <- ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
        na.count <- get("na.count", cache_env, inherits=FALSE)
      }
      p <- orderrnk(x, o, na.count)
    }
  )
  p
}

#' (P)ercent (Rank)s
#'
#' Function `prank.integer64`  projects the values `[min..max]` via ranks
#'   `[1..n]` to `[0..1]`.
#' [qtile.integer64()] is the inverse function of 'prank.integer64' and
#'   projects `[0..1]` to `[min..max]`.
#'
#' @param x a integer64 vector
#' @param method NULL for automatic method selection or a suitable low-level
#'   method, see details
#' @param ... ignored
#'
#' @details Function `prank.integer64` is based on [rank.integer64()].
#' @return `prank` returns a numeric vector of the same length as `x`.
#' @seealso [rank.integer64()] for simple ranks and [qtile()] for the
#'   inverse function quantiles.
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' prank(x)
#'
#' x <- x[!is.na(x)]
#' stopifnot(identical(x,  unname(qtile(x, probs=prank(x)))))
#' @keywords univar
#' @export
prank <- function(x, ...) UseMethod("prank")
#' @rdname prank
#' @export
prank.integer64 <- function(x, method = NULL, ...) {
  n <- nvalid(x)
  if (n<2L) return(rep(as.integer64(NA), length(x)))
  (rank.integer64(x, method=method, ...)-1L) / (n-1L)
}

#' (Q)uan(Tile)s
#'
#' Function [prank.integer64()]  projects the values `[min..max]` via ranks
#'   `[1..n]` to `[0..1]`.
#'
#' `qtile.integer64` is the inverse function of 'prank.integer64' and projects
#'   `[0..1]` to `[min..max]`.
#'
#' @param x a integer64 vector
#' @param probs numeric vector of probabilities with values in `[0, 1]` - possibly containing `NA`s
#' @param ... ignored
#'
#' @details
#'
#' Functions `quantile.integer64` with `type=0` and `median.integer64` are
#'   convenience wrappers to `qtile`.
#'
#' Function `qtile` behaves very similar to `quantile.default` with `type=1`
#'   in that it only returns existing values, it is mostly symmetric but it is
#'   using 'round' rather than 'floor'.
#'
#' Note that this implies that `median.integer64` does not interpolate for even
#'   number of values (interpolation would create values that could not be
#'   represented as 64-bit integers).
#'
#' This function automatically chooses from several low-level functions
#'   considering the size of `x` and the availability of a cache.
#'
#' Suitable methods are
#'  - [`sortqtl`] (fast sorting)
#'  - [`orderqtl`] (memory saving ordering).
#'
#' @return
#' `prank` returns a numeric vector of the same length as `x`.
#'
#' `qtile` returns a vector with elements from `x`
#'   at the relative positions specified by `probs`.
#' @seealso [rank.integer64()] for simple ranks and [quantile()] for quantiles.
#' @examples
#' x <- as.integer64(sample(c(rep(NA, 9), 1:9), 32, TRUE))
#' qtile(x, probs=seq(0, 1, 0.25))
#' quantile(x, probs=seq(0, 1, 0.25), na.rm=TRUE)
#' median(x, na.rm=TRUE)
#' summary(x)
#'
#' x <- x[!is.na(x)]
#' stopifnot(identical(x,  unname(qtile(x, probs=prank(x)))))
#' @keywords univar
#' @export
qtile <- function(x, probs = seq(0.0, 1.0, 0.25), ...) UseMethod("qtile")

#' @rdname qtile
#' @param names logical; if `TRUE`, the result has a `names` attribute. Set to `FALSE` for speedup with many probs.
#' @param method NULL for automatic method selection or a suitable low-level method, see details
#' @export
qtile.integer64 <- function(x, probs = seq(0.0, 1.0, 0.25), names = TRUE, method = NULL, ...) {
  if (any(is.na(probs) | probs<0.0 | probs>1.0))
    stop("p outside [0, 1]")
  cache_env <- cache(x)
  if (is.null(method)) {
    if (is.null(cache_env))
      method <- "sortqtl"
    else if (!is.null(cache_env$sort))
      method <- "sortqtl"
    else if (!is.null(cache_env$order))
      method <- "orderqtl"
    else
      method <- "sortqtl"
  }
  method <- match.arg(method, c("sortqtl", "orderqtl"))
  switch(method,
    sortqtl={
      if (is.null(cache_env) || is.null(cache_env$sort)) {
        s <- clone(x)
        na.count <- ramsort(s, na.last=FALSE)
      } else {
        s <- get("sort", cache_env, inherits=FALSE)
        na.count <- get("na.count", cache_env, inherits=FALSE)
      }
      qs <- sortqtl(s, na.count, probs)
    },
    orderqtl={
      if (is.null(cache_env) || is.null(cache_env$order)) {
        o <- seq_along(x)
        na.count <- ramorder(x, o, na.last=FALSE)
      } else {
        o <- get("order", cache_env, inherits=FALSE)
        na.count <- get("na.count", cache_env, inherits=FALSE)
      }
      qs <- orderqtl(x, o, na.count, probs)
    }
  )
  if (names) {
    np <- length(probs)
    dig <- max(2L, getOption("digits"))
    names(qs) <- paste0(
      if (np < 100L)
        formatC(100.0 * probs, format = "fg", width = 1L, digits = dig)
      else
        format(100.0 * probs, trim = TRUE, digits = dig),
      "%"
    )
  }
  qs
}

#' @rdname qtile
#' @param type an integer selecting the quantile algorithm, currently only
#'   0 is supported, see details
#' @param na.rm logical; if `TRUE`, any `NA` and `NaN`'s are removed from
#'   `x` before the quantiles are computed.
#' @export
quantile.integer64 <- function(x, probs = seq(0.0, 1.0, 0.25), na.rm = FALSE, names = TRUE, type=0L, ...) {
    if (type[[1L]]!=0L)
        stop("only type==0 ('qtile') supported")
    if (!na.rm && na.count(x)>0L)
        stop("missing values not allowed with 'na.rm='==FALSE")
    qtile.integer64(x, probs = probs, na.rm = na.rm, names = names, ...)
}

# TODO(R>=3.4.0): Drop this branch when median always gets '...'
# adding ... (wish of Kurt Hornik 23.3.2017)
if (is.na(match("...", names(formals(median))))) {
    # nocov start. Only run on old R.
    median_i64_impl_ <- function(x, na.rm=FALSE) {
        if (!na.rm && na.count(x)>0L)
            stop("missing values not allowed with 'na.rm='==FALSE")
        qtile.integer64(x, probs = 0.5, na.rm = na.rm, names = FALSE)
    }
    # nocov end.
} else {
    median_i64_impl_ <- function(x, na.rm=FALSE, ...) {
        if (!na.rm && na.count(x)>0L)
            stop("missing values not allowed with 'na.rm='==FALSE")
        qtile.integer64(x, probs = 0.5, na.rm = na.rm, names = FALSE)
    }
}

#' @rdname qtile
#' @export
median.integer64 <- median_i64_impl_

# mean.integer64 <- function(x, na.rm=FALSE) {
    # s <- sum(x, na.rm=na.rm)
    # if (!is.na(s)) {
        # if (na.rm)
            # s <- s%/%(length(x)-na.count(x))
        # else
            # s <- s%/%length(x)
    # }
    # s
# }
#' @rdname qtile
#' @export
mean.integer64 <- function(x, na.rm=FALSE, ...) {
    ret <- .Call(C_mean_integer64, x, as.logical(na.rm), double(1L))
    oldClass(ret) <- "integer64"
    ret
}

#' @rdname qtile
#' @param object a integer64 vector
#' @export
summary.integer64 <- function(object, ...) {
    nas <- na.count(object)
    qq <- quantile(object, na.rm=TRUE)
    qq <- c(qq[1L:3L], mean(object, na.rm=TRUE), qq[4L:5L])
    names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
    if (any(nas))
        c(qq, "NA's" = nas)
    else qq
}
truecluster/bit64 documentation built on April 12, 2025, 7:41 p.m.