R/factor256.R

Defines functions setkeyv256 tabulate256_levels unique256 order256 rank256 tabulate256 factor256_enotin factor256_ein factor256_notin factor256_in ensure_common256 no_intertwine as_factor isntSorted256 anyNA.factor256 format.factor256 print.factor256 is.factor256 levels.factor256 relevel256 StackMatch as.integer0 character2factor256 integer2factor256 raw2factor256 logical2factor256 .orig_lgl recompose256 factor256

Documented in as_factor factor256 factor256_ein factor256_enotin factor256_in factor256_notin is.factor256 isntSorted256 levels.factor256 order256 rank256 recompose256 relevel256 setkeyv256 tabulate256 tabulate256_levels unique256

#' Factors of fewer than 256 elements
#' @importFrom utils packageName
#' @importFrom utils hasName
#' @importFrom utils head
#' @importFrom utils globalVariables
#' @importFrom utils tail

#' @useDynLib factor256, .registration=TRUE
#'
#' @description Whereas base R's factors are based on 32-bit integer vectors,
#' \code{factor256} uses 8-bit raw vectors to minimize its memory footprint.
#'
#' @param x An atomic vector with fewer than 256 unique elements.
#' @param levels An optional character vector of or representing the unique values of \code{x}.
#'
#'
#' @param f A raw vector of class \code{factor256}.
#' @param tbl The table of values to lookup in \code{f}. May be a \code{factor256}
#' class but will be implicitly converted based on the levels of \code{f}.
#'
#' @param strictly If \code{TRUE} then if \code{x[i] == x[j]} and \code{i != j}
#' then \code{x} is not sorted.
#'
#' @param nmax,dotInterval (\code{tabulate256_levels} only). Every \code{dotInterval}
#' iterations through \code{x} check number of unique elements detected so far. If any count exceeds
#' \code{nmax} the rest of the vector is ignored.
#'
#' @return
#' \code{factor256} is a class based on raw vectors.
#' Values in \code{x} absent from \code{levels} are mapped to \code{00}.
#'
#' In the following list, \code{o} is the result.
#'
#' \describe{
#' \item{\code{factor256}}{A raw vector of class \code{factor256}.}
#' \item{\code{recompose256}}{is the inverse operation.}
#' \item{\code{factor256_e?(not)?in}}{A logical vector the same length of \code{f}, \code{o[i] = TRUE} if
#' \code{f[i]} is among the values of \code{tbl} when converted to \code{factor256}.
#' \code{_notin} is the negation. The \code{factor256_e} variants will error if
#' none of the values of \code{tbl} are present in \code{f}.}
#' \item{\code{tabulate256}}{Takes a raw vector and counts the number of times
#' each element occurs within it. It is always length-256; if an element is absent
#' it will have value zero in the output.}
#' \item{\code{tabulate256_levels}}{Similar to \code{tabulate256} but with optional arguments \code{nmax},
#' \code{dotInterval}.}
#' \item{\code{as_factor}}{Converts from \code{factor256} to \code{factor}.}
#' \item{\code{order256}}{Same as \code{order} but supports raw vectors. \code{order256(x)}}
#' \item{\code{rank256}}{Same as \code{rank} with \code{ties.method = "first"} but supports raw vectors.}
#' \item{\code{unique256}}{Unique elements of.}
#'
#' }
#'
#' @examples
#' f10 <- factor256(1:10)
#'
#' fletters <- factor256(rep(letters, 1:26))
#' head(factor256_in(fletters, "g"))
#' head(tabulate256(fletters))
#' head(recompose256(fletters))
#'
#' gletters <- factor256(rep(letters, 1:26), levels = letters[1:25])
#' tail(tabulate256(gletters))
#' tabulate256_levels(gletters, nmax = 5L, dotInterval = 1L)

#'
#'
#' @export

factor256 <- function(x, levels = NULL) {
  if (inherits(x, "factor256")) {
    if (is.null(levels) || identical(levels(x), levels)) {
      return(x)
    }
    attr(x, "factor256_levels") <- union(levels(x), levels)
    return(x)
  }
  stopifnot(is.atomic(x))
  if (is.null(levels)) {
    levels <- unique(x)
    if (length(levels) >= 256L) {
      stop("`length(unique(x)) = ", length(levels), "` but must be less than 256.")
    }
  } else {
    if (length(levels) >= 256L) {
      stop("length(levels) = ", length(levels), " but must be less than 256.")
    }
    if (anyDuplicated(levels)) {
      dup_ele <- levels[anyDuplicated(levels)]
      stop("`levels` had duplicated elements: ", dup_ele,
           # don't worry abouy efficiency of levels == dup_ele since length() < 256
           " appeared at positions ", toString(head(which(levels == dup_ele))), ". ",
           "All elements of `levels` must be unique.")
    }
    if (is.character(levels) && !is.character(x) && !is.logical(x)) {
      ux <- unique(x)
      ans <- integer2factor256(x, ux)
      class(ans) <- "factor256"
      attr(ans, "factor256_levels") <- as.character(levels)
      return(ans)
    }
  }

  ans <-
    switch(typeof(x),
           "raw" = raw2factor256(x, levels),
           "logical" = logical2factor256(x, levels),
           "integer" = integer2factor256(x, levels),
           "character" = character2factor256(x, levels),
           "double" = integer2factor256(x, levels))
  class(ans) <- "factor256"
  attr(ans, "factor256_levels") <- levels
  ans
}

#' @rdname factor256
#' @export
recompose256 <- function(f) {
  if (!is.factor256(f)) {
    stop("`f` was of class ", class(f), " but must be a 'factor256'.")
  }
  if (!is.raw(f)) {
    stop("Internal error: f was type ", typeof(f), " but type raw was expected.") # nocov
  }
  lf <- levels(f)
  switch(as.character(attr(f, "orig_type")),
         "logical" = .Call("Cfactor2562logical", f, PACKAGE = packageName()),
         "integer" = (lf[as.integer0(f)]),
         "double" = (lf[as.integer0(f)]),
         "raw" = as.raw(lf)[as.integer0(f)],
         # including character
         lf[as.integer0(f)])
}

.orig_lgl <- function(x) {
  attr(x, "orig_type") == "logical"
}

logical2factor256 <- function(x, levels = NULL) {
  ans <- .Call("Clogical2factor256", x, PACKAGE = packageName())
  attr(ans, "orig_type") <- "logical"
  ans
}

raw2factor256 <- function(x, levels) {
  ans <- .Call("C_raw2factor256", x, levels, PACKAGE = packageName())
  attr(ans, "orig_type") <- "raw"
  ans
}

integer2factor256 <- function(x, levels) {
  mx <- match(x, levels, nomatch = 0L)
  ans <- .Call("Cint2factor256", mx, PACKAGE = packageName())
  attr(ans, "orig_type") <- typeof(x)
  ans
}

character2factor256 <- function(x, ux, anyNAx) {
  mx <- match(x, ux, nomatch = 0L)
  ans <- .Call("Cint2factor256", mx, PACKAGE = packageName())
  attr(ans, "orig_type") <- "character"
  ans
}

as.integer0 <- function(x) {
  # same as integer but converts 0 to NA
  .Call("C_asInteger0", x, PACKAGE = packageName())
}


StackMatch <- function(x, ux = NULL) {
  # as.raw(match(x, ux, nomatch = 0L))
  # but only when
  x <- as.integer(x)
  if (!is.integer(ux)) {
    ux <- unique(x)
  }
  ans <- .Call("CStackMatch", x, as.integer(ux), PACKAGE = packageName())
  if (is.null(ans)) {
    message("Falling back in StackMatch")
    return(as.raw(match(x, ux, nomatch = 0L)))
  }
  ans
}

#' @rdname factor256
#' @export
relevel256 <- function(x, levels) {
  lx <- levels(x)
  if (length(levels) == 1) {
    if (levels %notin% lx) {
      stop("`levels = ", levels, "` but must be an existing level.")
    }
    nl <- c(levels, setdiff(lx, levels))
    mnl <- match(nl, lx)
    ans <- raw2factor256(x, as.raw(mnl))
    levels(ans) <- nl
    return(ans)
  }
  if (length(levels) != length(levels(x))) {
    stop("length(levels) = ", length(levels), ", yet length(levels(x)) = ", length(levels(x)), ". ",
         "levels must have the same length as the levels of `x`, or length 1.")
  }
  nl <- levels
  mnl <- match(nl, lx)
  ans <- raw2factor256(x, as.raw(mnl))
  levels(ans) <- nl
  return(ans)
  x

}

#' @rdname factor256
#' @export
levels.factor256 <- function(x) {
  attr(x, "factor256_levels")
}

#' @export
"levels<-.factor256" <- function(x, value) {
  attr(x, "factor256_levels") <- value
}

#' @rdname factor256
#' @export
is.factor256 <- function(x) {
  inherits(x, "factor256")
}


#' @export
"[.factor256" <- function(x, i, ...) {
  lvls <- levels(x)
  o <- as.raw(x)[i]
  attr(o, "factor256_levels") <- lvls
  class(o) <- oldClass(x)
  attr(o, "orig_type") <- attr(x, "orig_type")
  o
}

#' @export
print.factor256 <- function(x, ...) {
  cat("factor256 ")
  print(levels(x)[as.integer0(x)], max = 20)
  cat("Levels[", length(levels(x)), "]: ", sep = "")
  cat(head(levels(x)))
}

#' @export
format.factor256 <- function(x, ...) {
  format(levels(x)[as.integer(x)], ...)
}

## #' @method is.unsorted factor256
## #' @export
## is.unsorted.factor256 <- function(x, na.rm = FALSE, strictly = FALSE) {
##   o <- .Call("CisntSorted256", x, strictly, PACKAGE = packageName())
##   as.logical(o)
## }


#' @export
anyNA.factor256 <- function(x, ...) {
  FALSE
}

#' @rdname factor256
#' @export
isntSorted256 <- function(x, strictly = FALSE) {
  .Call("CisntSorted256", x, strictly, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
as_factor <- function(x) {
  if (is.factor(x)) {
    return(x)
  }
  lvls <- levels(x)
  f <- as.integer0(x)
  class(f) <- "factor"
  levels(f) <- lvls
  f
}


no_intertwine <- function(x, y) {
  # x and y are levels, we don't want the order to be different
  # and can only be different if no overlap
  if (identical(x, y)) {
    return(TRUE)
  }
  i <- 1L
  mx <- match(x, y)
  if (anyNA(mx)) {
    if (all(is.na(mx))) {
      return(TRUE)
    }

  }
}

ensure_common256 <- function(x, to) {
  stopifnot(is.factor256(x), is.factor256(to))
  if (identical(levels(x), levels(to))) {
    return(x)
  }
  factor256(x, levels = union(levels(x), levels(to)))
}


#' @rdname factor256
#' @export
factor256_in <- function(x, tbl) {
  x <- factor256(x)
  tbl <- factor256(as.character(tbl), levels(x))
  .Call("Cfactor256_in", x, tbl, FALSE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
factor256_notin <- function(x, tbl) {
  x <- factor256(x)
  tbl <- factor256(as.character(tbl), levels(x))
  .Call("Cfactor256_in", x, tbl, TRUE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
factor256_ein <- function(x, tbl) {
  x <- factor256(x)
  if (anyDuplicated(tbl)) {
    stop("`anyDuplicated(tbl) = ", anyDuplicated(tbl), "`. Remove duplicate elements from tbl.")
  }

  tbl256 <- factor256(as.character(tbl), levels(x))
  ux <- unique256(x)
  for (j in seq_along(tbl)) {
    if (!(as.raw(tbl256[j]) %in% ux)) {
      stop("`tbl` contained ", tbl[j], ", but this value was not found in x. ",
           "All values of `tbl` must be in `x`. Ensure you have specified `tbl` correctly.")
    }
  }
  .Call("Cfactor256_in", x, tbl256, FALSE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
factor256_enotin <- function(x, tbl) {
  x <- factor256(x)
  if (anyDuplicated(tbl)) {
    stop("`anyDuplicated(tbl) = ", anyDuplicated(tbl), "`. Remove duplicate elements from tbl.")
  }
  tbl256 <- factor256(tbl, levels(x))
  ux <- unique256(x)
  for (j in seq_along(tbl)) {
    if (!(as.raw(tbl256[j]) %in% ux)) {
      stop("`tbl` contained ", tbl[j], ", but this value was not found in x. ",
           "All values of `tbl` must be in `x`. Ensure you have specified `tbl` correctly.")
    }
  }
  .Call("Cfactor256_in", x, tbl256, TRUE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
tabulate256 <- function(f) {
  stopifnot(is.raw(f))
  .Call("Ctabulate256", f, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
rank256 <- function(x) {
  if (!is.raw(x)) {
    return(rank(x, ties.method = "first"))
  }
  .Call("C_rank256", x, FALSE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
order256 <- function(x) {
  if (!is.raw(x)) {
    return(order(x))
  }
  .Call("C_rank256", x, TRUE, PACKAGE = packageName())
}

#' @rdname factor256
#' @export
unique256 <- function(x) {
  stopifnot(is.raw(x))
  tx <- tabulate256(x)
  as.raw(0:255)[tx > 0]
}

#' @rdname factor256
#' @export
tabulate256_levels <- function(x, nmax = NULL, dotInterval = 65535L) {
  if (!is.integer(nmax)) {
    nmax <- 256L
  }
  .Call("Ctabulate256_levels", x, nmax = nmax, dotInterval = dotInterval, PACKAGE = packageName())
}


"%notin%" <- function(x, tbl) is.na(match(x, tbl))


#' @name setkeyv256
#' @title \code{setkey} for raw columns
#' @param DT A \code{data.table}.
#' @param cols Column names as in \code{data.table::setkeyv}
#' @return
#' Same as \code{data.table::setkeyv} except that raw \code{cols} will be
#' converted to factors (as \code{data.table} does not allow raw keys).
#' @export
setkeyv256 <- function(DT, cols) {
  if (requireNamespace("data.table", quietly = TRUE)) {
    stopifnot(data.table::is.data.table(DT), is.character(cols))
    for (j in cols) {
      if (is.factor256(v <- .subset2(DT, j))) {
        data.table::set(DT, j = j, value = as_factor(v))
      }
    }
    data.table::setkeyv(DT, cols)
  }
}

Try the factor256 package in your browser

Any scripts or data that you put into this service are public.

factor256 documentation built on Nov. 17, 2023, 9:06 a.m.