Nothing
#' Fast Factor Generation
#'
#' This function generates factors more quickly, without leveraging
#' \pkg{fastmatch}. The speed increase with \pkg{fastmatch} for ICD-9 codes
#' was about 33% reduction for 10 million codes. SOMEDAY could be faster still
#' using \pkg{Rcpp}, and a hashed matching algorithm.
#'
#' \code{NaN}s are converted to \code{NA} when used on numeric values. Extracted
#' from https://github.com/kevinushey/Kmisc.git
#'
#' These feature from base R are missing: \code{exclude = NA, ordered =
#' is.ordered(x), nmax = NA}
#' @param x An object of atomic type \code{integer}, \code{numeric},
#' \code{character} or \code{logical}.
#' @param levels An optional character vector of levels. Is coerced to the same
#' type as \code{x}. By default, we compute the levels as
#' \code{sort(unique.default(x))}.
#' @param labels A set of labels used to rename the levels, if desired.
#' @examples
#' x <- c("z", "a", "123")
#' icd:::factor_nosort(x)
#' # should return a factor without modification
#' x <- as.factor(x)
#' identical(icd:::factor_nosort(x), x)
#' # unless the levels change:
#' icd:::factor_nosort(x, levels = c("a", "z"))
#'
#' # existing factor levels aren't re-ordered without also moving elements
#' f <- factor(c("a", "b", "b", "c"))
#' g <- icd:::factor_nosort(f, levels = c("a", "c", "b"))
#' stopifnot(g[4] == "c")
#' @details I don't think there is any requirement for factor levels to be
#' sorted in advance, especially not for ICD-9 codes where a simple
#' alphanumeric sorting will likely be completely wrong.
#' @keywords internal
#' @noRd
factor_nosort <- function(x, levels) {
if (missing(levels)) {
if (is.factor(x)) {
return(x)
} else {
levels <- unique.default(x)
}
}
suppressWarnings(f <- match(x, levels))
levels(f) <- as.character(levels)
class(f) <- "factor"
f
}
#' R wrapper to the \CRANpkg{Rcpp} function. Will re-factor a factor with new
#' levels without converting to string vector.
#' @param na.rm Logical, if \code{TRUE}, simple drop all NA values, i.e., values
#' with no corresponding level.
#' @keywords internal
#' @noRd
factor_nosort_rcpp <- function(x, levels, na.rm = FALSE) {
# TODO: if re-factoring, use my refactor code
if (missing(levels)) {
if (is.factor(x)) {
return(x)
} else {
levels <- unique.default(x)
}
}
if (na.rm) {
levels <- levels[!is.na(levels)]
}
factor_nosort_rcpp_worker(as.character(x), levels, na_rm = na.rm)
# TODO SLOW - if re-factoring, there is a faster way
}
#' Refactor by integer matching levels in C++
#'
#' Slightly slower for small factors, three times faster for one hundred million
#' elements with two million new levels. Three times faster for any \code{n >
#' 1e6}. With \code{NA} values, margin is smaller, but still beats base
#' \code{factor}.
#' @param levels character vector of new levels
#' @param exclude_na Simpler equivalent to \code{base::factor} exclude. By
#' default, \code{refactor} will not count \code{NA} as a factor level if
#' there are \code{NA} elements in the input data. As with
#' \code{base::factor}, if \code{exclude_na} is \code{TRUE}, an \code{NA}
#' level explicitly requested in the \code{levels} argument is still dropped.
#' @param validate Single logical value, if \code{TRUE}, the input factor is
#' checked for consistency. This should almost never be a problem (i.e., for
#' any factors generated by R, or those carefully generated by the user).
#' However, invalid factor input has the possibility of breaking assumptions
#' in the C++ code. The default is \code{FALSE}. For huge data sets, and known
#' valid input, skipping this check saves significant time. E.g., if the
#' levels are not unique, the factor would be invalid. It is possible to
#' construct such a level using R code, but it is never possible by using
#' \code{factor}. \code{icd::factor_is_valid} can be used independently.
#' @examples
#' \dontrun{
#' f <- factor(c(1, 2, 3))
#' icd:::refactor(f, c("2", "3"))
#' f <- factor(c(1, 2, NA))
#' icd:::refactor(f, c("2", "3", NA))
#' }
#' @keywords internal manip
#' @noRd
refactor <- function(x,
levels,
na.rm = FALSE,
exclude_na = TRUE,
validate = FALSE) {
stopifnot(is.factor(x))
if (na.rm) {
refactor_narm_worker(x = x, new_levels = levels, validate = validate)
} else {
refactor_worker(
x = x, new_levels = levels, exclude_na = exclude_na,
validate = validate
)
}
}
#' Factor with levels sorted according to class, not alphabetic
#' @examples
#' (f <- factor(as.icd9cm(c("E100", "100", "V90"))))
#' (g <- factor_sorted_levels(as.icd9cm(c("E100", "100", "V90"))))
#' factor_sorted_levels(f)
#' stopifnot(identical(g, factor_sorted_levels(as.icd9cm(f))))
#' str(g)
#' class(g)
#' @keywords internal
#' @noRd
factor_sorted_levels <- function(x, levels = unique(sort(x)), ...) {
cl <- class(x)
# unique drops the class, sort uses it and keeps it.
out <- factor(x = x, levels = levels, ...)
class(out) <- if ("factor" %in% cl) {
cl
} else if ("character" %in% cl) {
sub("character", "factor", cl)
} else {
c(cl, "factor")
}
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.