R/trim.R

Defines functions trim_character trim_datetime trim_date trim_integer trim_numeric

Documented in trim_character trim_date trim_datetime trim_integer trim_numeric

#' @name trim
#' @aliases trim_numeric trim_integer trim_date trim_datetime trim_character
#' @title Trim extreme values
#'
#' @description Trim extreme values from an atomic vector, and replace with a specific value (typically `NA_*`).
#'
#' @usage
#' trim_numeric(x, bounds=c(-Inf, Inf), replacement=NA_real_)
#' trim_integer(x, bounds=c(-2147483647L, 2147483647L), replacement=NA_integer_)
#' trim_date(
#'   x,
#'   bounds      = as.Date(c("1940-01-01", "2029-12-31")),
#'   replacement = as.Date(NA_character_)
#' )
#' trim_datetime(
#'   x,
#'   bounds      = as.POSIXct(c("1940-01-01 00:00", "2029-12-31 23:59")),
#'   replacement = as.POSIXct(NA_character_)
#' )
#' trim_character(
#'   x,
#'   pattern = "^.*$",
#'   replacement = NA_character_
#' )
#'
#' @param x The input vector to be trimmed.  Required
#' @param bounds A two-element vector that establishes the lower and upper *inclusive* bounds of `x`.
#' @param pattern A perl-style regular expression passed to [base::grepl()].
#' Vector elements that match the pattern are returned.
#' Vector elements that do not match the pattern are replaced with [NA_character_].
#' @param replacement A scalar that will replace all instances of `x`
#' that fall outside of `bounds` or `pattern`.
#'
#' @return An atomic vector with the same number of elements as `x`.
#'
#' @note
#' The data type of `x`, `bounds`, and `replacement` must match the atomic data type of the function.
#' In other words, `trim_numeric()` accepts only parameters of type 'numeric' (otherwise known as
#' 'double-precision floating point').  Likewise, `trim_date()` accepts only parameters of type `Date`.
#'
#' The lower bound must be less than or equal the upper bound.
#'
#' The default bounds for numerics and integers are at the extremes of the data type.
#' The default bounds for [dates](https://stat.ethz.ch/R-manual/R-devel/library/base/html/Dates.html) are
#' arbitrary, because the origin is slippery.
#'
#' @author Will Beasley
#'
#' @examples
#' library(OuhscMunge)
#'
#' trim_numeric(runif(10, -1, 10), bounds=c(4, 8))
#'
#' trim_integer(c(NA, 1:10), bounds=c(4L, 8L))
#'
#' trim_date(
#'   x      = as.Date(c("1902-02-02", "1999-09-09", "2020-02-22", "1930-01-01", "1930-01-02")),
#'   bounds = as.Date(c("1990-01-01", "2030-01-01"))
#' )
#'
#' trim_datetime(
#'   x      = as.POSIXct(c("1902-02-02", "1999-09-09", "2020-02-22", "1930-01-01", "1930-01-02")),
#'   bounds = as.POSIXct(c("1990-01-01", "2030-01-01"))
#' )
#'
#' zip_codes <- c("12345", "a2345", "54321-6789", "54321-67890")
#' trim_character(zip_codes, "^\\d{5}(-\\d{4})?$")
#' trim_character(zip_codes)                                      # Everything passes.

#' @export
trim_numeric <- function(x, bounds = c(-Inf, Inf), replacement = NA_real_) {
  checkmate::assert_numeric(x, any.missing = TRUE)
  checkmate::assert_numeric(bounds, len = 2, any.missing = FALSE)
  checkmate::assert_numeric(replacement, len = 1)

  if (!(bounds[1] <= bounds[2]))
    stop("The lower element of `bounds` must be equal or less than the upper element of `bounds`.")

  # Set values that are outside the thresholds to replacement.
  dplyr::if_else(
    condition = dplyr::between(x, bounds[1], bounds[2]),
    true      = x,
    false     = replacement
    # missing   = replacement # A future option is to set missing values to the replacement value also
  )
}

#' @export
trim_integer <- function(x, bounds = c(-2147483647L, 2147483647L), replacement = NA_integer_) {
  checkmate::assert_integer(x, any.missing = TRUE)
  checkmate::assert_integer(bounds, len = 2, any.missing = FALSE)
  checkmate::assert_integer(replacement, len = 1)

  if (!(bounds[1] <= bounds[2]))
    stop("The lower element of `bounds` must be equal or less than the upper element of `bounds`.")

  # Set values that are outside the thresholds to replacement.
  dplyr::if_else(
    condition = dplyr::between(x, bounds[1], bounds[2]),
    true      = x,
    false     = replacement
  )
}

#' @export
trim_date <- function(x, bounds = as.Date(c("1940-01-01", "2029-12-31")), replacement = as.Date(NA_character_)) {
  checkmate::assert_date(x, any.missing=TRUE)
  checkmate::assert_date(bounds, min.len=2, max.len=2, any.missing=FALSE)
  checkmate::assert_date(replacement, min.len=1, max.len=1)

  if (!(bounds[1] <= bounds[2]))
    stop("The lower element of `bounds` must be equal or less than the upper element of `bounds`.")

  # Set values that are outside the thresholds to replacement.
  dplyr::if_else(
    condition = dplyr::between(x, bounds[1], bounds[2]),
    true      = x,
    false     = replacement
  )
}

#' @export
trim_datetime <- function(x, bounds = as.POSIXct(c("1940-01-01 00:00", "2029-12-31 23:59")), replacement = as.POSIXct(NA_character_)) {
  checkmate::assert_posixct(x, any.missing=TRUE)
  checkmate::assert_posixct(bounds, min.len=2, max.len=2, any.missing=FALSE)
  checkmate::assert_posixct(replacement, min.len=1, max.len=1)

  if (!(bounds[1] <= bounds[2]))
    stop("The lower element of `bounds` must be equal or less than the upper element of `bounds`.")

  # Set values that are outside the thresholds to replacement.
  dplyr::if_else(
    condition = dplyr::between(x, bounds[1], bounds[2]),
    true      = x,
    false     = replacement
  )
}

#' @export
trim_character <- function(x, pattern = "^.*$", replacement = NA_character_) {
  checkmate::assert_character(x, any.missing = TRUE)
  checkmate::assert_character(pattern, len = 1, any.missing = FALSE)
  checkmate::assert_character(replacement, len = 1)

  # Set values that fail the pattern to replacement.
  dplyr::if_else(
    condition = grepl(pattern, x, perl = TRUE),
    true      = x,
    false     = replacement
  )
}
OuhscBbmc/OuhscMunge documentation built on March 2, 2024, 11:44 a.m.