R/factor_nosort.R

Defines functions factor_sorted_levels refactor factor_nosort_rcpp factor_nosort

#' Fast Factor Generation
#'
#' This function generates factors more quickly, without leveraging
#' \pkg{fastmatch}. The speed increase with \pkg{fastmatch} for ICD-9 codes
#' was about 33% reduction for 10 million codes. SOMEDAY could be faster still
#' using \pkg{Rcpp}, and a hashed matching algorithm.
#'
#' \code{NaN}s are converted to \code{NA} when used on numeric values. Extracted
#' from https://github.com/kevinushey/Kmisc.git
#'
#' These feature from base R are missing: \code{exclude = NA, ordered =
#' is.ordered(x), nmax = NA}
#' @param x An object of atomic type \code{integer}, \code{numeric},
#'   \code{character} or \code{logical}.
#' @param levels An optional character vector of levels. Is coerced to the same
#'   type as \code{x}. By default, we compute the levels as
#'   \code{sort(unique.default(x))}.
#' @param labels A set of labels used to rename the levels, if desired.
#' @examples
#' x <- c("z", "a", "123")
#' icd:::factor_nosort(x)
#' # should return a factor without modification
#' x <- as.factor(x)
#' identical(icd:::factor_nosort(x), x)
#' # unless the levels change:
#' icd:::factor_nosort(x, levels = c("a", "z"))
#'
#' # existing factor levels aren't re-ordered without also moving elements
#' f <- factor(c("a", "b", "b", "c"))
#' g <- icd:::factor_nosort(f, levels = c("a", "c", "b"))
#' stopifnot(g[4] == "c")
#' @details I don't think there is any requirement for factor levels to be
#'   sorted in advance, especially not for ICD-9 codes where a simple
#'   alphanumeric sorting will likely be completely wrong.
#' @keywords internal
#' @noRd
factor_nosort <- function(x, levels) {
  if (missing(levels)) {
    if (is.factor(x)) {
      return(x)
    } else {
      levels <- unique.default(x)
    }
  }
  suppressWarnings(f <- match(x, levels))
  levels(f) <- as.character(levels)
  class(f) <- "factor"
  f
}

#' R wrapper to the \CRANpkg{Rcpp} function. Will re-factor a factor with new
#' levels without converting to string vector.
#' @param na.rm Logical, if \code{TRUE}, simple drop all NA values, i.e., values
#'   with no corresponding level.
#' @keywords internal
#' @noRd
factor_nosort_rcpp <- function(x, levels, na.rm = FALSE) {
  # TODO: if re-factoring, use my refactor code
  if (missing(levels)) {
    if (is.factor(x)) {
      return(x)
    } else {
      levels <- unique.default(x)
    }
  }
  if (na.rm) {
    levels <- levels[!is.na(levels)]
  }
  factor_nosort_rcpp_worker(as.character(x), levels, na_rm = na.rm)
  # TODO SLOW - if re-factoring, there is a faster way
}

#' Refactor by integer matching levels in C++
#'
#' Slightly slower for small factors, three times faster for one hundred million
#' elements with two million new levels. Three times faster for any \code{n >
#' 1e6}. With \code{NA} values, margin is smaller, but still beats base
#' \code{factor}.
#' @param levels character vector of new levels
#' @param exclude_na Simpler equivalent to \code{base::factor} exclude. By
#'   default, \code{refactor} will not count \code{NA} as a factor level if
#'   there are \code{NA} elements in the input data. As with
#'   \code{base::factor}, if \code{exclude_na} is \code{TRUE}, an \code{NA}
#'   level explicitly requested in the \code{levels} argument is still dropped.
#' @param validate Single logical value, if \code{TRUE}, the input factor is
#'   checked for consistency. This should almost never be a problem (i.e., for
#'   any factors generated by R, or those carefully generated by the user).
#'   However, invalid factor input has the possibility of breaking assumptions
#'   in the C++ code. The default is \code{FALSE}. For huge data sets, and known
#'   valid input, skipping this check saves significant time. E.g., if the
#'   levels are not unique, the factor would be invalid. It is possible to
#'   construct such a level using R code, but it is never possible by using
#'   \code{factor}. \code{icd::factor_is_valid} can be used independently.
#' @examples
#' \dontrun{
#' f <- factor(c(1, 2, 3))
#' icd:::refactor(f, c("2", "3"))
#' f <- factor(c(1, 2, NA))
#' icd:::refactor(f, c("2", "3", NA))
#' }
#' @keywords internal manip
#' @noRd
refactor <- function(x,
                     levels,
                     na.rm = FALSE,
                     exclude_na = TRUE,
                     validate = FALSE) {
  stopifnot(is.factor(x))
  if (na.rm) {
    refactor_narm_worker(x = x, new_levels = levels, validate = validate)
  } else {
    refactor_worker(
      x = x, new_levels = levels, exclude_na = exclude_na,
      validate = validate
    )
  }
}

#' Factor with levels sorted according to class, not alphabetic
#' @examples
#' (f <- factor(as.icd9cm(c("E100", "100", "V90"))))
#' (g <- factor_sorted_levels(as.icd9cm(c("E100", "100", "V90"))))
#' factor_sorted_levels(f)
#' stopifnot(identical(g, factor_sorted_levels(as.icd9cm(f))))
#' str(g)
#' class(g)
#' @keywords internal
#' @noRd
factor_sorted_levels <- function(x, levels = unique(sort(x)), ...) {
  cl <- class(x)
  # unique drops the class, sort uses it and keeps it.
  out <- factor(x = x, levels = levels, ...)
  class(out) <- if ("factor" %in% cl) {
    cl
  } else if ("character" %in% cl) {
    sub("character", "factor", cl)
  } else {
    c(cl, "factor")
  }
  out
}
jackwasey/icd documentation built on Nov. 23, 2021, 9:56 a.m.