R/integer-utils.R

Defines functions reverseIntegerInjection findIntervalAndStartFromWidth make_XYZxyz_to_XxYyZz_subscript fancy_mseq mseq sortedMerge intbitsOR intbitsAND intbitsNOT implodeIntBits explodeIntBits makePowersOfTwo tabulate2 duplicatedIntegerQuads selfmatchIntegerQuads .selfmatchIntegerQuads_hash .selfmatchIntegerQuads_quick matchIntegerQuads .matchIntegerQuads_hash .matchIntegerQuads_quick orderIntegerQuads sortedIntegerQuads runEndsOfIntegerPairs duplicatedIntegerPairs selfmatchIntegerPairs .selfmatchIntegerPairs_hash .selfmatchIntegerPairs_quick matchIntegerPairs .matchIntegerPairs_hash .matchIntegerPairs_quick orderIntegerPairs sortedIntegerPairs pcompareIntegerPairs .normargMethod .normargIntegerOrFactor toListOfIntegerVectors groupsum diffWithLast diffWithInitialZero anyMissingOrOutside isSequence

Documented in duplicatedIntegerPairs duplicatedIntegerQuads isSequence matchIntegerPairs matchIntegerQuads orderIntegerPairs orderIntegerQuads selfmatchIntegerPairs selfmatchIntegerQuads toListOfIntegerVectors

### =========================================================================
### Some low-level utility functions to operate on integer vectors
### -------------------------------------------------------------------------
###
### Unless stated otherwise, the functions defined in this file are not
### exported.
###

### Exported!
### TODO: Implment this in C so we won't need to create 'seq_len(of.length)'
### and we will be able to bail out early.
isSequence <- function(x, of.length=length(x))
{
    if (!is.integer(x))
        stop("'x' must be an integer vector")
    if (!isSingleNumber(of.length) || of.length < 0L)
        stop("'length' must be a single non-negative integer")
    length(x) == of.length && identical(x, seq_len(of.length))
}

anyMissingOrOutside <- function(x, lower = -.Machine$integer.max,
                                   upper = .Machine$integer.max)
{
    if (!is.integer(x))
        stop("'x' must be an integer vector")
    if (!is.integer(lower))
        lower <- as.integer(lower)
    if (!is.integer(upper))
        upper <- as.integer(upper)
    .Call2("Integer_any_missing_or_outside",
           x, lower, upper,
           PACKAGE="S4Vectors")
}

### Equivalent to (but much faster than):
###
###   diff(c(0L, x))
###
### except that NAs are not supported.
diffWithInitialZero <- function(x)
{
    if (!is.integer(x))
        stop("'x' must be an integer vector")
    .Call2("Integer_diff_with_0", x, PACKAGE="S4Vectors")
}

### Equivalent to (but much faster than):
###
###   diff(c(x, last))
###
### except that NAs are not supported.
diffWithLast <- function(x, last)
{
  if (!is.integer(x))
    stop("'x' must be an integer vector")
  if (!isSingleInteger(last))
    stop("'last' must be a single, non-NA integer")
  .Call2("Integer_diff_with_last", x, last, PACKAGE="S4Vectors")
}

### x: integer vector.
### breakpoints: vector of positions on 'x' in increasing order.
### Equivalent to (but 10x faster than):
###     sum(relist(x, PartitioningByEnd(breakpoints)))
### Also equivalent to (but 200x faster than):
###     f <- rep(factor(seq_along(breakpoints)), diff(c(0L, breakpoints)))
###     vapply(split(x, f, drop=FALSE), sum, integer(1), USE.NAMES=FALSE)
groupsum <- function(x, breakpoints)
{   
    if (last_or(breakpoints, 0L) != length(x))
        stop("invalid 'breakpoints' argument")
    diffWithInitialZero(cumsum(x)[breakpoints])
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### toListOfIntegerVectors()
###
### On a character vector toListOfIntegerVectors(x) is an alternative to:
###   lapply(strsplit(x, ",", fixed=TRUE), as.integer)
### except that:
###  - strsplit() accepts NAs but we don't (we raise an error);
###  - as.integer() introduces NAs by coercion (with a warning) but we don't
###    (we raise an error);
###  - as.integer() supports "inaccurate integer conversion in coercion"
###    when the value to coerce is > INT_MAX (then it's coerced to INT_MAX)
###    but we don't (we raise an error);
###  - as.integer() will coerce non-integer values (e.g. 10.3) to an int
###    by truncating them but we don't (we raise an error).
### Also when it fails, toListOfIntegerVectors() prints a detailed parse
### error message.
### Finally it's faster and uses much less memory. E.g. it's 8x faster and
### uses < 1 Mb versus > 60 Mb on the 'biginput' character vector below:
###   library(rtracklayer)
###   session <- browserSession()
###   genome(session) <- "hg19"
###   query <- ucscTableQuery(session, "UCSC Genes")
###   tx <- getTable(query)
###   ## 165920 strings in 'biginput' as of Jan 31, 2018.
###   biginput <- c(as.character(tx$exonStarts), as.character(tx$exonEnds))

### Exported!
toListOfIntegerVectors <- function(x, sep=",")
{
    if (!isSingleString(sep) || nchar(sep) != 1L)
        stop("'sep' must be a single-letter string")
    ans <- .Call2("to_list_of_ints", x, sep, PACKAGE="S4Vectors")
    names(ans) <- names(x)
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Fast ordering/comparing of integer pairs.
###

.normargIntegerOrFactor <- function(arg, argname)
{
    if (is.factor(arg))
        arg <- as.integer(arg)
    else if (is(arg, "Rle") &&
             (is(runValue(arg), "integer") || is(runValue(arg), "factor")))
        arg <- as.integer(arg)
    else if (!is.integer(arg))
        stop("'", argname, "' must be an integer vector or factor")
    arg
}

.normargMethod <- function(method=c("auto", "quick", "hash"), a_len)
{
    method <- match.arg(method)
    if (method == "auto") {
        if (a_len <= 2^29)
            method <- "hash"
        else
            method <- "quick"
    }
    method
}

pcompareIntegerPairs <- function(a1, b1, a2, b2)
{
    a1 <- .normargIntegerOrFactor(a1, "a1")
    b1 <- .normargIntegerOrFactor(b1, "b1")
    if (length(a1) != length(b1))
        stop("'a1' and 'b1' must have the same length")
    a2 <- .normargIntegerOrFactor(a2, "a2")
    b2 <- .normargIntegerOrFactor(b2, "b2")
    if (length(a2) != length(b2))
        stop("'a2' and 'b2' must have the same length")
    .Call2("Integer_pcompare2", a1, b1, a2, b2, PACKAGE="S4Vectors")
}

sortedIntegerPairs <- function(a, b, decreasing=FALSE, strictly=FALSE)
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    .Call2("Integer_sorted2", a, b, decreasing, strictly, PACKAGE="S4Vectors")
}

### Exported!
orderIntegerPairs <- function(a, b, decreasing=FALSE)
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    #.Call2("Integer_order2", a, b, decreasing, PACKAGE="S4Vectors")
    base::order(a, b, decreasing=decreasing)
}

.matchIntegerPairs_quick <- function(a1, b1, a2, b2, nomatch=NA_integer_)
{
    .Call2("Integer_match2_quick",
           a1, b1, a2, b2, nomatch,
           PACKAGE="S4Vectors")
}

.matchIntegerPairs_hash <- function(a1, b1, a2, b2, nomatch=NA_integer_)
{
    .Call2("Integer_match2_hash",
           a1, b1, a2, b2, nomatch,
           PACKAGE="S4Vectors")
}

### Exported!
matchIntegerPairs <- function(a1, b1, a2, b2, nomatch=NA_integer_,
                              method=c("auto", "quick", "hash"))
{
    a1 <- .normargIntegerOrFactor(a1, "a1")
    b1 <- .normargIntegerOrFactor(b1, "b1")
    if (length(a1) != length(b1))
        stop("'a1' and 'b1' must have the same length")
    a2 <- .normargIntegerOrFactor(a2, "a2")
    b2 <- .normargIntegerOrFactor(b2, "b2")
    if (length(a2) != length(b2))
        stop("'a2' and 'b2' must have the same length")
    if (!isSingleNumberOrNA(nomatch))
        stop("'nomatch' must be a single number or NA")
    if (!is.integer(nomatch))
        nomatch <- as.integer(nomatch)
    method <- .normargMethod(method, length(a2))
    if (method == "quick") {
        ans <- .matchIntegerPairs_quick(a1, b1, a2, b2, nomatch=nomatch)
    } else {
        ans <- .matchIntegerPairs_hash(a1, b1, a2, b2, nomatch=nomatch)
    }
    ans
}

.selfmatchIntegerPairs_quick <- function(a, b)
{
    .Call2("Integer_selfmatch2_quick", a, b, PACKAGE="S4Vectors")
}

### Author: Martin Morgan
.selfmatchIntegerPairs_hash <- function(a, b)
{
    .Call2("Integer_selfmatch2_hash", a, b, PACKAGE="S4Vectors")
}

### Exported!
selfmatchIntegerPairs <- function(a, b, method=c("auto", "quick", "hash"))
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    if (length(a) != length(b))
        stop("'a' and 'b' must have the same length")
    method <- .normargMethod(method, length(a))
    if (method == "quick") {
        ans <- .selfmatchIntegerPairs_quick(a, b)
    } else {
        ans <- .selfmatchIntegerPairs_hash(a, b)
    }
    ans
}

### Exported!
###
### For 'a' and 'b' integer vectors of equal length with no NAs,
### 'duplicatedIntegerPairs(a, b)' is equivalent to (but much faster than):
###
###   duplicated(cbind(a, b))
###
### For efficiency reasons, we don't support (and don't even check) for NAs.
duplicatedIntegerPairs <- function(a, b,
                                   fromLast=FALSE,
                                   method=c("auto", "quick", "hash"))
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    if (length(a) != length(b))
        stop("'a' and 'b' must have the same length")
    if (!isTRUEorFALSE(fromLast))
        stop("'fromLast' must be TRUE or FALSE")
    if (length(a) == 0L)
        return(logical(0L))
    if (length(a) == 1L)
        return(FALSE)
    ## This is a temporary (and inefficient) workaround until "quick"
    ## and "hash" methods can natively support fromLast=TRUE.
    ## TODO: Add support for fromLast=TRUE to "quick" and "hash" methods.
    if (fromLast)
        return(rev(duplicatedIntegerPairs(rev(a), rev(b), method=method)))
    sm <- selfmatchIntegerPairs(a, b, method=method)
    sm != seq_len(length(sm))
}

### For 'a' and 'b' integer vectors of equal length with no NAs,
### 'runEndsOfIntegerPairs(a, b)' finds the runs of identical rows in
### 'cbind(a, b)' and returns the indices of the last row in each run.
### In other words, it's equivalent to (but much faster than):
###
###   cumsum(runLength(Rle(paste(a, b, sep="|"))))
###
### Note that, if the rows in 'cbind(a, b)' are already sorted, then
### 'runEndsOfIntegerPairs(a, b)' returns the indices of the unique rows.
### In other words, 'runEndsOfIntegerPairs()' could be used to efficiently
### extract the unique pairs of integers from a presorted set of pairs.
### However, at the moment (April 2011) using 'duplicatedIntegerPairs()'
### is still faster than using 'runEndsOfIntegerPairs()' for finding the
### duplicated or unique pairs of integers in a presorted set of pairs.
### But this only because 'runEndsOfIntegerPairs()' is not as fast as it
### could/should be (an all-in-C implementation would probably solve this).
###
### For efficiency reasons, we don't support (and don't even check) for NAs.
### TODO: What happens if 'a' and 'b' don't have the same length? Shouldn't
### we check for that?
runEndsOfIntegerPairs <- function(a, b)
{
    not_same_as_prev <- diffWithInitialZero(a) != 0L |
                        diffWithInitialZero(b) != 0L
    if (length(not_same_as_prev) == 0L)
        return(integer())
    which(c(not_same_as_prev[-1L], TRUE))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Fast ordering/comparing of integer quadruplets.
###

sortedIntegerQuads <- function(a, b, c, d, decreasing=FALSE, strictly=FALSE)
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    c <- .normargIntegerOrFactor(c, "c")
    d <- .normargIntegerOrFactor(d, "d")
    .Call2("Integer_sorted4", a, b, c, d, decreasing, strictly,
           PACKAGE="S4Vectors")
}

### Exported!
orderIntegerQuads <- function(a, b, c, d, decreasing=FALSE)
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    c <- .normargIntegerOrFactor(c, "c")
    d <- .normargIntegerOrFactor(d, "d")
    #.Call2("Integer_order4", a, b, c, d, decreasing, PACKAGE="S4Vectors")
    base::order(a, b, c, d, decreasing=decreasing)
}

.matchIntegerQuads_quick <- function(a1, b1, c1, d1, a2, b2, c2, d2,
                                     nomatch=NA_integer_)
{
    .Call2("Integer_match4_quick",
           a1, b1, c1, d1, a2, b2, c2, d2, nomatch,
           PACKAGE="S4Vectors")
}

.matchIntegerQuads_hash <- function(a1, b1, c1, d1, a2, b2, c2, d2,
                                    nomatch=NA_integer_)
{
    .Call2("Integer_match4_hash",
           a1, b1, c1, d1, a2, b2, c2, d2, nomatch,
           PACKAGE="S4Vectors")
}

### Exported!
matchIntegerQuads <- function(a1, b1, c1, d1, a2, b2, c2, d2,
                              nomatch=NA_integer_,
                              method=c("auto", "quick", "hash"))
{
    a1 <- .normargIntegerOrFactor(a1, "a1")
    b1 <- .normargIntegerOrFactor(b1, "b1")
    c1 <- .normargIntegerOrFactor(c1, "c1")
    d1 <- .normargIntegerOrFactor(d1, "d1")
    if (length(a1) != length(b1) ||
        length(b1) != length(c1) ||
        length(c1) != length(d1))
        stop("'a1', 'b1', 'c1' and 'd1' must have the same length")
    a2 <- .normargIntegerOrFactor(a2, "a2")
    b2 <- .normargIntegerOrFactor(b2, "b2")
    c2 <- .normargIntegerOrFactor(c2, "c2")
    d2 <- .normargIntegerOrFactor(d2, "d2")
    if (length(a2) != length(b2) ||
        length(b2) != length(c2) ||
        length(c2) != length(d2))
        stop("'a2', 'b2', 'c2' and 'd2' must have the same length")
    if (!isSingleNumberOrNA(nomatch))
        stop("'nomatch' must be a single number or NA")
    if (!is.integer(nomatch))
        nomatch <- as.integer(nomatch)
    method <- .normargMethod(method, length(a2))
    if (method == "quick") {
        ans <- .matchIntegerQuads_quick(a1, b1, c1, d1, a2, b2, c2, d2,
                                        nomatch=nomatch)
    } else {
        ans <- .matchIntegerQuads_hash(a1, b1, c1, d1, a2, b2, c2, d2,
                                       nomatch=nomatch)
    }
    ans
}

.selfmatchIntegerQuads_quick <- function(a, b, c, d)
{
    .Call2("Integer_selfmatch4_quick", a, b, c, d, PACKAGE="S4Vectors")
}

.selfmatchIntegerQuads_hash <- function(a, b, c, d)
{
    .Call2("Integer_selfmatch4_hash", a, b, c, d, PACKAGE="S4Vectors")
}

### Exported!
selfmatchIntegerQuads <- function(a, b, c, d,
                                  method=c("auto", "quick", "hash"))
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    c <- .normargIntegerOrFactor(c, "c")
    d <- .normargIntegerOrFactor(d, "d")
    if (length(a) != length(b) ||
        length(b) != length(c) ||
        length(c) != length(d))
        stop("'a', 'b', 'c' and 'd' must have the same length")
    method <- .normargMethod(method, length(a))
    if (method == "quick") {
        ans <- .selfmatchIntegerQuads_quick(a, b, c, d)
    } else {
        ans <- .selfmatchIntegerQuads_hash(a, b, c, d)
    }
    ans
}

### Exported!
duplicatedIntegerQuads <- function(a, b, c, d,
                                   fromLast=FALSE,
                                   method=c("auto", "quick", "hash"))
{
    a <- .normargIntegerOrFactor(a, "a")
    b <- .normargIntegerOrFactor(b, "b")
    c <- .normargIntegerOrFactor(c, "c")
    d <- .normargIntegerOrFactor(d, "d")
    if (length(a) != length(b) ||
        length(b) != length(c) ||
        length(c) != length(d))
        stop("'a', 'b', 'c' and 'd' must have the same length")
    if (!isTRUEorFALSE(fromLast))
        stop("'fromLast' must be TRUE or FALSE")
    if (length(a) == 0L)
        return(logical(0L))
    if (length(a) == 1L)
        return(FALSE)
    ## This is a temporary (and inefficient) workaround until "quick"
    ## and "hash" methods can natively support fromLast=TRUE.
    ## TODO: Add support for fromLast=TRUE to "quick" and "hash" methods.
    if (fromLast)
        return(rev(duplicatedIntegerQuads(rev(a), rev(b), rev(c), rev(d),
                                          method=method)))
    sm <- selfmatchIntegerQuads(a, b, c, d, method=method)
    sm != seq_len(length(sm))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### tabulate2()
###
### An enhanced version of base::tabulate() that: (1) handles integer weights
### (NA and negative weights are OK), and (2) throws an error if 'strict' is
### TRUE and if 'x' contains NAs or values not in the [1, 'nbins'] interval.
### Unlike with base::tabulate(), 'nbins' needs to be specified (no default
### value). Also for now, it only works if 'x' is an integer vector.
###

tabulate2 <- function(x, nbins, weight=1L, strict=FALSE)
{
    if (!is.integer(x))
        stop("'x' must be an integer vector")
    if (!isSingleNumber(nbins))
        stop("'nbins' must be a single integer")
    if (!is.integer(nbins))
        nbins <- as.integer(nbins)
    if (!is.integer(weight))
        stop("'weight' must be an integer vector")
    if (!isTRUEorFALSE(strict))
        stop("'strict' must be TRUE or FALSE")
    .Call2("Integer_tabulate2", x, nbins, weight, strict, PACKAGE="S4Vectors")
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Bitwise operations.
###
### The bitwise operations in this section don't treat the integer NA (aka
### NA_integer_) in any particular way: at the C level an NA_integer_ is
### just a 32-bit pattern like any other int in C.
###

makePowersOfTwo <- function(nbit)
{
    if (!isSingleInteger(nbit) || nbit < 0L)
        stop("'nbit' must be a single non-negative integer")
    if (nbit == 0L)
        return(integer(0))
    as.integer(cumprod(c(1L, rep.int(2L, nbit-1L))))
}

### Returns an integer matrix with 'length(x)' rows and 'length(bitpos)' cols.
explodeIntBits <- function(x, bitpos=1:32)
{
    if (!is.integer(x))
        stop("'x' must be an integer vector")
    if (!is.integer(bitpos))
        stop("'bitpos' must be an integer vector")
    ## Old implementation: not very efficient and also broken on NAs and
    ## negative integers!
    #if (length(bitpos) == 0L)
    #    return(matrix(nrow=length(x), ncol=0L))
    #nbit <- max(bitpos)
    #if (is.na(nbit) || min(bitpos) <= 0L)
    #    stop("'bitpos' must contain potive values only")
    #ans <- matrix(nrow=length(x), ncol=nbit)
    #for (i in seq_len(ncol(ans))) {
    #    ans[ , i] <- x %% 2L
    #    x <- x %/% 2L
    #}
    #ans[ , bitpos, drop=FALSE]
    .Call2("Integer_explode_bits", x, bitpos, PACKAGE="S4Vectors")
}

### FIXME: Broken if ncol(x) = 32.
implodeIntBits <- function(x)
{
    if (!is.matrix(x))
        stop("'x' must be a matrix")
    tx <- t(x)
    data <- tx * makePowersOfTwo(nrow(tx))
    ## In some circumstances (e.g. if 'tx' has 0 col), the "dim" attribute
    ## gets lost during the above multiplication.
    if (is.null(dim(data)))
        dim(data) <- dim(tx)
    as.integer(colSums(data))
}

intbitsNOT <- function(x)
{
    stop("not yet implemented")  # fix implodeIntBits() first!
    xbits <- explodeIntBits(x)
    implodeIntBits(!xbits)
}

intbitsAND <- function(x, y)
{
    stop("not yet implemented")  # fix implodeIntBits() first!
    xbits <- explodeIntBits(x)
    ybits <- explodeIntBits(y)
    implodeIntBits(xbits & ybits)
}

intbitsOR <- function(x, y)
{
    stop("not yet implemented")  # fix implodeIntBits() first!
    xbits <- explodeIntBits(x)
    ybits <- explodeIntBits(y)
    implodeIntBits(xbits | ybits)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Others.
###

sortedMerge <- function(x, y)
    .Call2("Integer_sorted_merge", x, y, PACKAGE="S4Vectors")

mseq <- function(from, to)
{
    if (!is.integer(from))
        from <- as.integer(from)
    if (!is.integer(to))
        to <- as.integer(to)
    .Call2("Integer_mseq", from, to, PACKAGE="S4Vectors")
}

fancy_mseq <- function(lengths, offset=0L, rev=FALSE)
{
    if (!is.integer(lengths))
        lengths <- as.integer(lengths)
    if (!is.integer(offset))
        offset <- as.integer(offset)
    if (!is.logical(rev))
        stop("'rev' must be a logical vector")
    #unlist(lapply(seq_len(length(lengths)),
    #              function(i) {
    #                  tmp <- seq_len(lengths[i]) + offset[i]
    #                  if (rev[i])
    #                      tmp <- rev(tmp)
    #                  tmp
    #              }))
    .Call2("Integer_fancy_mseq", lengths, offset, rev, PACKAGE="S4Vectors")
}

make_XYZxyz_to_XxYyZz_subscript <- function(N)
    as.vector(matrix(seq_len(2L * N), nrow=2L, byrow=TRUE))

findIntervalAndStartFromWidth <- function(x, width)
    .Call2("findIntervalAndStartFromWidth", x, width, PACKAGE="S4Vectors")

### Reverse an injection from 1:M to 1:N.
### The injection is represented by an integer vector of length M (eventually
### with NAs). Fundamental property:
###
###   reverseIntegerInjection(reverseIntegerInjection(injection, N), M)
###
### is the identity function.
### Can be used to efficiently reverse the result of a call to 'order()'.
reverseIntegerInjection <- function(injection, N)
{
    M <- length(injection)
    ans <- rep.int(NA_integer_, N)
    is_not_na <- !is.na(injection)
    ans[injection[is_not_na]] <- seq_len(M)[is_not_na]
    ans
}

Try the S4Vectors package in your browser

Any scripts or data that you put into this service are public.

S4Vectors documentation built on Dec. 11, 2020, 2:02 a.m.