R/bitsort.R

Defines functions bit_rangediff bit_setequal bit_symdiff bit_setdiff bit_intersect bit_union bit_sumDuplicated bit_anyDuplicated bit_duplicated bit_unique bit_in bit_sort_unique bit_sort bitsort countsort quicksort3 quicksort2 range_sortna range_nanozero range_na

Documented in bit_anyDuplicated bit_duplicated bit_in bit_intersect bit_rangediff bit_setdiff bit_setequal bitsort bit_sort bit_sort_unique bit_sumDuplicated bit_symdiff bit_union bit_unique countsort quicksort2 quicksort3 range_na range_nanozero range_sortna

# Fast methods for integers leveraging bit vectors
# (c) 2016-2017 Jens Oehlschägel
# Licence: GPL2
# Provided 'as is', use at your own risk

#' Get range and number of NAs
#'
#' @param x an integer vector
#'
#' @return an integer vector with three elements \describe{
#'   \item{ 1 }{ min integer }
#'   \item{ 2 }{ max integer }
#'   \item{ 3 }{ number of NAs }
#'   }
#' @seealso \code{\link{range_nanozero}} and \code{\link{range_sortna}}
#'
#' @examples
#' range_na(c(0L,1L,2L,NA))
#' @export
range_na <- function(x){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  .Call(C_R_range_na, x)
}

#' Remove zeros and get range and number of NAs 
#'
#' @param x an integer vector
#'
#' @return an integer vector without zeros and with an attribute \code{\link{range_na}}  with three elements \describe{
#'   \item{ 1 }{ min integer }
#'   \item{ 2 }{ max integer }
#'   \item{ 3 }{ number of NAs }
#'   }
#' @seealso \code{\link{range_na}} and \code{\link{range_sortna}}
#'
#' @examples
#' range_nanozero(c(0L,1L,2L,NA))
#' @export
range_nanozero <- function(x){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  .Call(C_R_range_nanozero, x)
}

#' Prepare for sorting and get range, number of NAs and unsortedness
#' 
#' In one pass over the vector \code{NA}s are treated according to parameter
#' \code{na.last} exactly like \code{\link{sort}} does, the \code{\link{range}},
#' number of \code{NA}s and unsortedness is determined.
#'
#' @param x an integer vector
#' @inheritParams bit_sort
#'
#' @return an integer vector with \code{NA}s are treated and an \code{attribute} \code{\link{range_na}}  with four elements \describe{
#'   \item{ 1 }{ min integer }
#'   \item{ 2 }{ max integer }
#'   \item{ 3 }{ number of NAs }
#'   \item{ 3 }{ 0 for sorted vector and 1 for \code{\link{is.unsorted}}  }
#'   }
#' @seealso \code{\link{range_na}} and \code{\link{range_nanozero}}
#'
#' @examples
#' range_sortna(c(0L,1L,NA,2L))
#' range_sortna(c(2L,NA,1L,0L))
#' range_sortna(c(0L,1L,NA,2L), na.last=TRUE)
#' range_sortna(c(2L,NA,1L,0L), na.last=TRUE)
#' range_sortna(c(0L,1L,NA,2L), na.last=FALSE)
#' range_sortna(c(2L,NA,1L,0L), na.last=FALSE)
#' @export
range_sortna <- function(x, decreasing = FALSE, na.last=NA){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  .Call(C_R_range_sortna, x, as.logical(decreasing), as.logical(na.last))
}


#' Low-level sorting: binary quicksort
#' 
#' In one pass over the vector \code{NA}s are handled according to parameter
#' \code{na.last} by \code{\link{range_sortna}}, then, if the vector is unsorted, 
#' binary quicksort is invoked.
#' 
#' @param x an integer vector
#' @inheritParams bit_sort
#'
#' @return a sorted vector
#'
#' @examples
#' quicksort2(c(2L,0L,1L,NA,2L))
#' quicksort2(c(2L,0L,1L,NA,2L), na.last=TRUE)
#' quicksort2(c(2L,0L,1L,NA,2L), na.last=FALSE)
#' @export
quicksort2 <- function(x, na.last=NA){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  x <- range_sortna(x, na.last=na.last)
  range_sortna <- getsetattr(x, "range_sortna", NULL)
  if (is.na(range_sortna[1]))  # no numbers to sort
    return(rep(NA_integer_, range_sortna[3]))  
  if (range_sortna[4])  # is.unsorted
    .Call(C_R_int_quicksort2, x, range_sortna, as.logical(na.last))
  else
    x 
}

#' Low-level sorting: threeway quicksort
#' 
#' In one pass over the vector \code{NA}s are handled according to parameter
#' \code{na.last} by \code{\link{range_sortna}}, then, if the vector is unsorted, 
#' threeway quicksort is invoked.
#' 
#' @param x an integer vector
#' @inheritParams bit_sort
#'
#' @return a sorted vector
#'
#' @examples
#' countsort(c(2L,0L,1L,NA,2L))
#' countsort(c(2L,0L,1L,NA,2L), na.last=TRUE)
#' countsort(c(2L,0L,1L,NA,2L), na.last=FALSE)
#' @export
quicksort3 <- function(x, na.last=NA){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  x <- range_sortna(x, na.last=na.last)
  range_sortna <- getsetattr(x, "range_sortna", NULL)
  if (is.na(range_sortna[1]))  # no numbers to sort
    return(rep(NA_integer_, range_sortna[3]))  
  if (range_sortna[4])  # is.unsorted
    .Call(C_R_int_quicksort3, x, range_sortna, as.logical(na.last))
  else
    x
}


#' Low-level sorting: counting sort
#' 
#' In one pass over the vector \code{NA}s are handled according to parameter
#' \code{na.last} by \code{\link{range_sortna}}, then, if the vector is unsorted, 
#' counting sort is invoked.
#' 
#' @param x an integer vector
#' @inheritParams bit_sort
#'
#' @return a sorted vector
#'
#' @examples
#' countsort(c(2L,0L,1L,NA,2L))
#' countsort(c(2L,0L,1L,NA,2L), na.last=TRUE)
#' countsort(c(2L,0L,1L,NA,2L), na.last=FALSE)
#' @export
countsort <- function(x, na.last=NA){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  x <- range_sortna(x, na.last=na.last)
  range_sortna <- getsetattr(x, "range_sortna", NULL)
  if (is.na(range_sortna[1]))  # no numbers to sort
    return(rep(NA_integer_, range_sortna[3]))  
  if (range_sortna[4])  # is.unsorted
    .Call(C_R_int_countsort, x, range_sortna, as.logical(na.last))
  else
    x
}

#' Low-level sorting: bit sort
#' 
#' In one pass over the vector \code{NA}s are handled according to parameter
#' \code{na.last} by \code{\link{range_sortna}}, then, if the vector is unsorted, 
#' bit sort is invoked.
#' 
#' @param x an integer vector
#' @param depth an integer scalar giving the number of bit-passed before switching to quicksort
#' @inheritParams bit_sort
#'
#' @return a sorted vector
#'
#' @examples
#' bitsort(c(2L,0L,1L,NA,2L))
#' bitsort(c(2L,0L,1L,NA,2L), na.last=TRUE)
#' bitsort(c(2L,0L,1L,NA,2L), na.last=FALSE)
#' @export
bitsort <- function(x, na.last=NA, depth=1){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  x <- range_sortna(x, na.last=na.last)
  range_sortna <- getsetattr(x, "range_sortna", NULL)
  if (is.na(range_sortna[1]))  # no numbers to sort
    return(rep(NA_integer_, range_sortna[3]))  
  if (!range_sortna[4])  # already sorted
    return(x)
  n <- as.double(range_sortna[2])-as.double(range_sortna[1])+1
  .Call(C_R_bit_sort, tmp=bit(n), x, range_sortna, as.logical(na.last), depth=as.integer(depth))
}



# if (FALSE){
#   # time measure relative speeds (radixsort is not yet integrated)
#   require(microbenchmark)
#   require(bit)
#   require(ff)
#   x <- sample(2,1e8,T)
#   system.time(bitsort(x))
#   x <- sample(1e8,1e8,T)
#   system.time(bitsort(x))
#   
#   D <- as.integer(2^(1:27))
#   N <- as.integer(2^(1:27))
#   M <- c("b","c","q") #,"r")
#   tim <- array(NA, dim=c(D=length(D), N=length(N), M=length(M)), dimnames=list(D=D, N=N, M=M))
#   for (ni in seq_along(N)){
#     n <- N[ni]
#     for (di in seq_along(D)){
#       d <- D[di]    
#       x <- sample(d,n,T)
#       x[1] <- d
#       if (di > ni-5 && di < ni+10){
#         tim[di,ni,"b"] <- microbenchmark(bitsort(x), times = 1)$time
#         tim[di,ni,"c"] <- microbenchmark(countsort(x), times = 1)$time
#         tim[di,ni,"q"] <- microbenchmark(quicksort3(x), times = 1)$time
#         #tim[di,ni,"r"] <- microbenchmark(radixsort(copy_vector(x)), times = 1)$time
#       }
#     }
#     print(apply(tim[,1:ni,,drop=FALSE], 1:2, function(x){i <- which.min(x); if (length(i))M[i] else "."}), quote=FALSE)
#   }
#   
#   round( tim[,,"b"] / tim[,,"c"], 1 )
#   round( tim[,,"b"] / tim[,,"q"], 1 )
#   round( tim[,,"c"] / tim[,,"q"], 1 )
#   round( tim[,,"c"] / tim[,,"r"], 1 )
#   round( tim[,,"q"] / tim[,,"r"], 1 )
#   
#   M <- dimnames(tim)[[3]]
#   r <- as.integer(rownames(tim[,,1]))[row(tim[,,1])]
#   n <- as.integer(colnames(tim[,,1]))[col(tim[,,1])]
#   d <- n/r
#   #x <- ifelse(d>=1, "c", "q")
#   x <- ifelse(d>=0.5,"c",ifelse(n<=1024,"q",ifelse(d<0.0625,"q","b")))
#   x <- ifelse(n<=1024, ifelse(d<0.5, "q", "c"), ifelse(d<1, ifelse(d<0.0625, "q", "b"), "c"))
#   dim(x) <- dim(tim[,,1])
#   dimnames(x) <- dimnames(tim[,,1])
#   y <- tim[cbind(as.vector(row(x)), as.vector(col(x)), match(as.vector(x), M))]
#   attributes(y) <- attributes(x)
#   print(apply(tim, 1:2, function(x){i <- which.min(x); if (length(i))M[i] else "."}), quote=FALSE)
#   print(x, quote=F)
#   round(y / apply(tim[,,-4], 1:2, min, na.rm=T), 1)
#   round(y / tim[,,1], 1)
#   
#   f <- function(n, d=1, k=1){
#     x <- dbinom(1:log2(n), n, 1/(d*n))*n
#     r <- rev(cumsum(rev(x)))
#     rr <- c(r[-1], 0)
#     c <- r*log2(r)
#     b <- n*k+r+rr*log2(rr)
#     w <- as.integer(!is.na(b) & b<c)
#     #print(round(data.frame(x=x, r=r, rr=rr, c=c, b=b, w=w)))
#     sum(w)
#   }
#   sapply(2^(1:30), f, d=8, k=1)
#   sapply(2^(1:30), f, k=2)  
#   sapply(2^(1:30), f, k=4)  
#   sapply(2^(1:30), f, k=8)  
#   sapply(2^(1:30), f, k=16)  
#   sapply(2^(1:30), f, k=32)  
#   
#   require(microbenchmark)
#   require(bit)
#   n <- as.integer(2^27)
#   x <- sample(n*4, n, T)
#   times <- 1
#   microbenchmark(countsort(x), times=times)
#   microbenchmark(quicksort3(x), times=times)
#   microbenchmark(bitsort(x, depth=1), times=times)
#   microbenchmark(bitsort(x, depth=2), times=times)
#   microbenchmark(bitsort(x, depth=3), times=times)
#   microbenchmark(bitsort(x, depth=4), times=times)
#   microbenchmark(bitsort(x, depth=5), times=times)
#   
#   microbenchmark(quicksort2(x), times=times)
#   microbenchmark(bit_sort_unique(x), times=times)
#   
#   # ergo: ab 2^16 zwischen 1/4 und 1/32
#   
# }


#' bit sort
#' 
#' fast sorting of integers
#' 
#' determines the range of the integers and checks if the density justifies use 
#' of a bit vector; if yes, sorts the first occurences of each integer in the
#' range using a bit vector, sorts the rest and merges; if no, falls back to quicksort.
#' 
#' @param x an integer vector
#' @param decreasing  (currently only \code{FALSE} is supported)
#' @param na.last \code{NA} removes NAs, \code{FALSE} puts NAs at the beginning,
#'   \code{TRUE} puts NAs at the end
#' @param has.dup TRUE (the default) assumes that \code{x} might have 
#'   duplicates, set to \code{FALSE} if duplicates are impossible
#'   
#' @return a sorted vector
#' @seealso \code{\link{sort}}, \code{\link{ramsort}}, 
#'   \code{\link{bit_sort_unique}}
#' 
#' @examples
#' bit_sort(c(2L,1L,NA,NA,1L,2L))
#' bit_sort(c(2L,1L,NA,NA,1L,2L), na.last=FALSE)
#' bit_sort(c(2L,1L,NA,NA,1L,2L), na.last=TRUE)
#' 
#' \dontrun{
#' x <- sample(1e7, replace=TRUE)
#' system.time(bit_sort(x))
#' system.time(sort(x))
#' }
#' @export
bit_sort <- function(x, decreasing = FALSE, na.last=NA, has.dup = TRUE){
  if (decreasing)
    stop("decreasing=TRUE not implemented")
  if (!has.dup)
    return(bit_sort_unique(x, decreasing = decreasing, na.last=na.last, has.dup=FALSE))
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  x <- range_sortna(x, decreasing = decreasing, na.last=na.last)
  range_sortna <- getsetattr(x, "range_sortna", NULL)
  if (is.na(range_sortna[1]))  # no numbers to sort
    return(rep(NA_integer_, range_sortna[3]))  
  if (!range_sortna[4])  # already sorted
    return(x)
  nr <- as.double(range_sortna[2])-as.double(range_sortna[1])+1
  d <- length(x) / nr
  if (nr<=65536){
    if (d<0.5)
      ret <- .Call(C_R_int_quicksort3, x, range_sortna, as.logical(na.last))
    else
      ret <- .Call(C_R_int_countsort, x, range_sortna, as.logical(na.last))
  }else{
    if (d < 0.03125 || nr > .Machine$integer.max ){
      ret <- .Call(C_R_int_quicksort3, x, range_sortna, as.logical(na.last))
    }else if (d <= 0.25){
      ret <- .Call(C_R_bit_sort, tmp=bit(nr), x, range_sortna, as.logical(na.last), depth=1L)
    }else{
      ret <- .Call(C_R_int_countsort, x, range_sortna, as.logical(na.last))
    }
  }
  ret
}


#' bit sort unique
#' 
#' fast combination of \code{\link{sort}} and \code{\link{unique}} for integers
#' 
#' determines the range of the integers and checks if the density justifies use 
#' of a bit vector; if yes, creates the result using a bit vector; if no, falls back to \code{sort(unique())}
#'
#' @param x an integer vector
#' @param decreasing \code{FALSE} (ascending) or \code{TRUE} (descending)
#' @param na.last \code{NA} removes NAs, \code{FALSE} puts NAs at the beginning, \code{TRUE} puts NAs at the end  
#' @param has.dup TRUE (the default) assumes that \code{x} might have duplicates, set to \code{FALSE} if duplicates are impossible
#' @param range_na \code{NULL} calls \code{\link{range_na}}, optionally the result of \code{\link{range_na}} can be given here to avoid calling it again
#' 
#' @return a sorted unique integer vector 
#' @seealso \code{\link{sort}}, \code{\link{unique}}, 
#'   \code{\link{bit_sort}}, \code{\link{bit_unique}}
#'
#' @examples
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L))
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L), na.last=FALSE)
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L), na.last=TRUE)
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L), decreasing = TRUE)
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L), decreasing = TRUE, na.last=FALSE)
#' bit_sort_unique(c(2L,1L,NA,NA,1L,2L), decreasing = TRUE, na.last=TRUE)
#' 
#' \dontrun{
#' x <- sample(1e7, replace=TRUE)
#' system.time(bit_sort_unique(x))
#' system.time(sort(unique(x)))
#' x <- sample(1e7)
#' system.time(bit_sort_unique(x))
#' system.time(sort(x))
#' } 
#' @export
bit_sort_unique <- function(x, decreasing = FALSE, na.last=NA, has.dup=TRUE, range_na=NULL){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (is.null(range_na))
    range_na <- range_na(x)
  else 
    range_na <- as.integer(range_na)
  if (is.na(range_na[1])){
    if (is.na(na.last) || length(range_na)<3L || range_na[3]==0)
      return(integer())
    else 
      return(NA_integer_)
  }
  nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- length(x) / nr
  if (nr > .Machine$integer.max || d < 0 ){
    if (has.dup){
      ret <- quicksort3(x, na.last = xor(na.last, decreasing))
      ret <- merge_unique(ret)
    }else{
      ret <- quicksort2(x, na.last = xor(na.last, decreasing))
    }
    if (decreasing)
      reverse_vector(ret)
    else
      ret
  }else
    .Call(C_R_bit_sort_unique, tmp=bit(nr), x, range_na, as.logical(na.last), as.logical(decreasing))
} 


#' bit \%in\%
#' 
#' fast \code{\link{\%in\%}} for integers
#' 
#' determines the range of the integers and checks if the density justifies use 
#' of a bit vector; if yes, maps \code{x} or \code{table} -- whatever is smaller
#' -- into a bit vector and searches the other of \code{table} or \code{x} in
#' the it vector; if no, falls back to \code{\link{\%in\%}}
#' 
#' @param x an integer vector of values to be looked-up
#' @param table an integer vector used as lookup-table
#' @param retFUN a function that coerces \code{\link{bit}} and \code{\link{logical}} vectors
#'   
#' @return a boolean vector coerced to \code{retFUN}
#' @seealso \code{\link{\%in\%}}
#' @examples
#' bit_in(1:2, 2:3)
#' bit_in(1:2, 2:3, retFUN=as.logical)
#' @export
bit_in <- function(x, table, retFUN=as.bit){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(table) || is.factor(table)))
    stop("table must be integer (or factor)")
  nx <- length(x)
  nt <- length(table)
  reverse <- nx<nt
  if (reverse){
    range_na <- range_na(x)
    n <- nx
  }else{
    range_na <- range_na(table)
    n <- nt
  }
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+nt) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/64 ){
    ret <- !is.na(match(x, table))
  }else{
    if (reverse){
      ret <- .Call(C_R_bit_table_in, tmp=bit(nr), x, table, range_na, ret=bit(length(x)))
    }else{
      ret <- .Call(C_R_bit_in_table, tmp=bit(nr), x, table, range_na, ret=bit(length(x)))
    }
  }
  retFUN(ret)
}

#' bit unique and duplicated
#' 
#' Fast versions of \code{\link{unique}}, \code{\link{duplicated}} ,
#' \code{\link{anyDuplicated}} and \code{sum(duplicated(x))} for integers.
#' 
#' determines the range of the integers and checks if the density justifies use 
#' of a bit vector; if yes, uses a bit vector for finding duplicates; if no, 
#' falls back to \code{\link{unique}}, \code{\link{duplicated}}, \code{\link{anyDuplicated}} and \code{sum(duplicated(x))}
#' 
#' @name bit_unidup
#' @param x an integer vector
#' @param na.rm \code{NA} treats NAs like other integers, \code{TRUE} treats 
#'   \emph{all} NAs as duplicates, \code{FALSE} treats \emph{no} NAs as 
#'   duplicates
#' @inheritParams bit_in
#' @inheritParams bit_sort_unique
#'   
#' @return 
#'   \code{bit_unique} returns a vector of unique integers, \cr
#'   \code{bit_duplicated} returns a boolean vector coerced to \code{retFUN}, \cr
#'   \code{bit_anyDuplicated} returns the position of the first duplicate (or zero if no duplicates) \cr
#'   \code{bit_sumDuplicated} returns the number of duplicated values (as.integer)
#' @seealso \code{\link{bit_sort_unique}}
#' 
#' @examples
#' bit_unique(c(2L,1L,NA,NA,1L,2L))
#' bit_unique(c(2L,1L,NA,NA,1L,2L), na.rm=FALSE)
#' bit_unique(c(2L,1L,NA,NA,1L,2L), na.rm=TRUE)
#' 
#' bit_duplicated(c(2L,1L,NA,NA,1L,2L))
#' bit_duplicated(c(2L,1L,NA,NA,1L,2L), na.rm=FALSE)
#' bit_duplicated(c(2L,1L,NA,NA,1L,2L), na.rm=TRUE)
#'
#' bit_anyDuplicated(c(2L,1L,NA,NA,1L,2L))
#' bit_anyDuplicated(c(2L,1L,NA,NA,1L,2L), na.rm=FALSE)
#' bit_anyDuplicated(c(2L,1L,NA,NA,1L,2L), na.rm=TRUE)
#'
#' bit_sumDuplicated(c(2L,1L,NA,NA,1L,2L))
#' bit_sumDuplicated(c(2L,1L,NA,NA,1L,2L), na.rm=FALSE)
#' bit_sumDuplicated(c(2L,1L,NA,NA,1L,2L), na.rm=TRUE)
NULL

#' @describeIn bit_unidup extracts unique elements
#' @export
bit_unique <- function(x, na.rm = NA, range_na=NULL){
  na.rm <- as.logical(na.rm)
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(x)
  if (is.null(range_na))
    range_na <- range_na(x)
  else 
    range_na <- as.integer(range_na)
  if (is.na(range_na[1])){
    if (is.na(na.rm)){
      return(NA_integer_)
    }else if (na.rm){
      return(integer())
    }else{
      return(rep(NA_integer_, range_na[3]))
    }
  }
  nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- length(x) / nr
  if (nr > .Machine$integer.max || d < 1/64 ){
    if (is.na(na.rm)){
      unique(x, incomparables = FALSE)
    }else if (na.rm){
      x <- unique(x, incomparables = FALSE)
      x <- x[!is.na(x)]
      x
    }else{
      unique(x, incomparables = NA)
    }
  }else{
    .Call(C_R_bit_unique, tmp=bit(nr), x, range_na, na.rm)
  }
}

#' @describeIn bit_unidup determines duplicate elements
#' @export
bit_duplicated <- function(x, na.rm = NA, range_na=NULL, retFUN=as.bit){
  na.rm <- as.logical(na.rm)
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(retFUN(NULL))
  if (is.null(range_na))
    range_na <- range_na(x)
  else 
    range_na <- as.integer(range_na)
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- length(x) / nr
  if (nr > .Machine$integer.max || d < 1/64 ){
    if (is.na(na.rm)){
      ret <- duplicated(x, incomparables = FALSE)
    }else if (na.rm){
      ret <- duplicated(x, incomparables = FALSE) | is.na(x)
    }else{
      ret <- duplicated(x, incomparables = NA)
    }
  }else{
    ret <- .Call(C_R_bit_duplicated, tmp=bit(nr), x, range_na, ret=bit(length(x)), as.logical(na.rm))
  }
  retFUN(ret)
}

#' @describeIn bit_unidup checks for existence of duplicate elements
#' @export
bit_anyDuplicated <- function(x, na.rm = NA, range_na=NULL){
  na.rm <- as.logical(na.rm)
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(0L)
  if (is.null(range_na))
    range_na <- range_na(x)
  else 
    range_na <- as.integer(range_na)
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- length(x) / nr
  if (nr > .Machine$integer.max || d < 1/64 ){
    if (is.na(na.rm)){
      anyDuplicated(x, incomparables = FALSE)
    }else if (na.rm){
      max(0L, min(anyDuplicated(x, incomparables = NA), firstNA(x)))
    }else{
      anyDuplicated(x, incomparables = NA)
    }
  }else{
    .Call(C_R_bit_anyDuplicated, bit(nr), x, range_na=range_na, as.logical(na.rm))
  }
}

#' @describeIn bit_unidup counts duplicate elements
#' @export
bit_sumDuplicated <- function(x, na.rm = NA, range_na=NULL){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!length(x))
    return(0L)
  if (is.null(range_na))
    range_na <- range_na(x)
  else 
    range_na <- as.integer(range_na)
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- length(x) / nr
  if (nr > .Machine$integer.max || d < 1/64 ){
    if (is.na(na.rm)){
      sum(duplicated(x, incomparables = FALSE))
    }else if (na.rm){
      sum(duplicated(x, incomparables = NA)) + sum(is.na(x))
    }else{
      sum(duplicated(x, incomparables = NA))
    }
  }else{
    .Call(C_R_bit_sumDuplicated, bit(nr), x, range_na=range_na, as.logical(na.rm))
  }
}




#' bit set operations
#' 
#' Fast versions of \code{\link{union}}, \code{\link{intersect}}, 
#' \code{\link{setdiff}}, symmetric difference and \code{\link{setequal}}
#' for integers.
#' 
#' determines the range of the integers and checks if the density justifies use 
#' of a bit vector; if yes, uses a bit vector for finding duplicates; if no, 
#' falls back to \code{\link{union}}, \code{\link{intersect}},
#' \code{\link{setdiff}}, \code{union(setdiff(x,y),setdiff(y,x))} and \code{\link{setequal}}
#' 
#' @name bit_setops
#' @param x an integer vector
#' @param y an integer vector
#'   
#' @return an integer vector
#' @seealso \code{\link{bit_in}}, \code{\link{bit_rangediff}}
#' 
#' @examples
#' bit_union(1:2, 2:3)
#' bit_intersect(1:2, 2:3)
#' bit_setdiff(1:2, 2:3)
#' bit_symdiff(1:2, 2:3)
#' bit_setequal(1:2, 2:3)
#' bit_setequal(1:2, 2:1)
NULL

#' @describeIn bit_setops union
#' @export
bit_union <- function(x, y){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(y) || is.factor(y)))
    stop("y must be integer (or factor)")
  nx <- length(x)
  ny <- length(y)
  rx <- range_na(x)
  ry <- range_na(y)
  range_na <- c(min(rx[1],ry[1]), max(rx[2],ry[2]), rx[3]+ry[3])
  if (max(rx[1],ry[1])>min(rx[2],ry[2])){
    if ( range_na[3]>0 && (rx[3]>0) == (rx[3]>0) )
      return(bit_unique(c(x,y), range_na=range_na))
    else
      return(c(bit_unique(x, range_na=rx), bit_unique(y, range_na=ry)))
  }
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+ny) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/64 ){
    ret <- union(x,y)
  }else{
    ret <- .Call(C_R_bit_union, tmp=bit(nr), x, y, range_na)
  }
  ret
}

#' @describeIn bit_setops intersection
#' @export
bit_intersect <- function(x, y){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(y) || is.factor(y)))
    stop("y must be integer (or factor)")
  nx <- length(x)
  ny <- length(y)
  rx <- range_na(x)
  ry <- range_na(y)
  range_na <- c(max(rx[1],ry[1]), min(rx[2],ry[2]), rx[3]+ry[3])
  if (range_na[1]>range_na[2])
    return(integer())
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+ny) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/64 ){
    ret <- intersect(x,y)
  }else{
    ret <- .Call(C_R_bit_intersect, tmp=bit(nr), x, y, range_na)
  }
  ret
}

#' @describeIn bit_setops asymmetric difference
#' @export
bit_setdiff <- function(x, y){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(y) || is.factor(y)))
    stop("y must be integer (or factor)")
  nx <- length(x)
  ny <- length(y)
  range_na <- range_na(x)
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+ny) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/64 ){
    ret <- setdiff(x,y)
  }else{
    ret <- .Call(C_R_bit_setdiff, tmp=bit(nr), x, y, range_na)
  }
  ret
}

#' @describeIn bit_setops symmetricx difference
#' @export
bit_symdiff <- function(x, y){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(y) || is.factor(y)))
    stop("y must be integer (or factor)")
  nx <- length(x)
  ny <- length(y)
  rx <- range_na(x)
  ry <- range_na(y)
  range_na <- c(min(rx[1],ry[1]), max(rx[2],ry[2]), rx[3]+ry[3])
  if (max(rx[1],ry[1])>min(rx[2],ry[2])){
    if ( range_na[3]>0 && (rx[3]>0) == (rx[3]>0) )
      return(bit_unique(c(x,y), range_na=range_na))
    else
      return(c(bit_unique(x, range_na=rx), bit_unique(y, range_na=ry)))
  }
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+ny) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/64 ){
    ret <- union(setdiff(x,y),setdiff(y,x))
  }else{
    ret <- .Call(C_R_bit_symdiff, bit(nr), bit(nr), x, y, range_na, rx[3]>0, ry[3]>0)
  }
  ret
}

#' @describeIn bit_setops equality
#' @export
bit_setequal <- function(x, y){
  if (!(is.integer(x) || is.factor(x)))
    stop("x must be integer (or factor)")
  if (!(is.integer(y) || is.factor(y)))
    stop("y must be integer (or factor)")
  nx <- length(x)
  ny <- length(y)
  rx <- range_na(x)
  ry <- range_na(y)
  rx[3] <- as.logical(rx[3])  # map NA count to 0/1
  ry[3] <- as.logical(ry[3])  # map NA count to 0/1
  if (!identical(rx,ry))
    return(FALSE)
  range_na <- rx
  if (is.na(range_na[1]))
    nr <- 0L  # no numbers, no temp bit vector
  else
    nr <- as.double(range_na[2])-as.double(range_na[1])+1
  d <- (nx+ny) / nr
  if (nr==0L || nr > .Machine$integer.max || d < 1/256 ){
    ret <- setequal(x,y)
  }else{
    ret <- .Call(C_R_bit_setequal, bit(nr), bit(nr), x, y, range_na)
  }
  ret
}


#' bit range difference
#' 
#' Fast version of \code{setdiff(rx[1]:rx[2], y)}.
#' 
#' determines the range of the integers \code{y} and checks if the density justifies use 
#' of a bit vector; if yes, uses a bit vector for the set operation; if no, 
#' falls back to a quicksort and \code{\link{merge_rangediff}}
#' 
#' @param y an integer vector of elements to exclude
#' @param revx \code{FALSE} as is, \code{TRUE} to reverse the direction and sign of \code{rx}
#' @param revy \code{FALSE} as is, \code{TRUE} to reverse the direction and sign of \code{y}
#' @inheritParams merge_rev
#'
#' @return an integer vector
#' @seealso \code{\link{bit_setdiff}}, \code{\link{merge_rangediff}}
#'
#' @examples
#' bit_rangediff(c(1L,6L), c(3L,4L))
#' bit_rangediff(c(6L,1L), c(3L,4L))
#' bit_rangediff(c(6L,1L), c(3L,4L), revx=TRUE)
#' bit_rangediff(c(6L,1L), c(3L,4L), revx=TRUE, revy=TRUE)
#' @export
bit_rangediff <- function(rx, y, revx=FALSE, revy=FALSE){
  if (!is.integer(y))
    stop("y must be integer")
  if (!is.ri(rx)){
    stopifnot(length(rx)==2)
    rx <- as.integer(rx)
  }
  if (rx[1]>rx[2])
    ox <- as.double(rev(rx))
  else
    ox <- as.double(rx)
  n <- ox[2]-ox[1]+1L
  d <- length(y) / n
  if (n > .Machine$integer.max || d < 1/64 ){
    ox <- as.integer(ox)
    y <- .Call(C_R_int_quicksort2, copy_vector(y), c(ox, 0L), na.last=NA)
    ret <- merge_rangediff(ox, y, revx=revx, revy=revy)
    if (rx[1]>rx[2])
      ret <- rev(ret)
    ret
  }else{
    .Call(C_R_bit_rangediff, tmp=bit(n), as.integer(rx), y, as.logical(revx), as.logical(revy))
  }
}


# /* some experiments - just ignore
# bit_sample1 <- function (x, size, replace = FALSE, prob = NULL){
#   if (length(x)==1L){
#     x <- seq_len(x)
#   } else{
#     x <- as.integer(x)
#   }  
#   size <- as.integer(size)
#   if (replace)
#     stop("replace not implemented")
#   else{
#     stopifnot(size <= length(x))
#     .Call(C_R_bit_sample, x, bit(length(x)), size)
#   }
# }
# bit_sample <- function (x, size=length(x), replace = FALSE, prob = NULL){
#   if (length(x)==1L){
#     x <- seq_len(x)
#   } else{
#     x <- as.integer(x)
#   }  
#   size <- as.integer(size)
#   replace <- as.logical(replace)
#   if (!replace && size>length(x))
#     stop("size > length(x) without replace")
#   .Call(C_R_bit_sample, x, size, replace)
# }
# */
truecluster/bit documentation built on Nov. 20, 2022, 2:34 a.m.