eflMunchr: Data Pre-processing Tools Used by FL

Documented in prune_fence prune_fence_slopes

#' Identify the Rows Outside the Fence.
#'
#' Identify the rows outside the fence.
#'
#' Compute the fence to eliminate values that are clearly out-of-bound.
#' Normally all values should be non-negative. In case they are not, and
#' offset is used. Also, sometimes the data is nowhere near zero and is such
#' cases the fence is not useful, again in that case the offset solves that
#' problem. The algorithm will generate an error when \code{-Inf, Inf} values
#' are in the input. The \code{NA} are treated as being out-of-bound.
#'
#' @section info:
#'
#' if the argument \code{info} is set to \code{TRUE} then a list with the following elements is given.
#' \describe{
#'   \item{is_outside}{Logical vector, TRUE is when the row is outside the
#'   fence limits, FALSE otherwise.}
#'   \item{slopes}{The slopes for the small and big fences.}
#'   \item{offsets}{The offset used to scale the x and y values.}
#'   \item{fences}{Dataframe with x = original x values; y = original y values,
#'   small = y value of the small fence on the scaled coordinates;
#'   big = y value of the big fence on the scaled coordinates;
#'   small_inv = y value of the small fence on the original scale
#'   (useful for plotting);
#'   big_inv = y value of the big fence on the original scale
#'   (useful for plotting.)}
#' }
#'
#' @inheritParams prune
#' @param is_offset If TRUE (default) the offset number will be
#' \code{offset = min(min(x), min(y))}, otherwise there will be not offset,
#' that is \code{offset = 0}.
#' @param info If FALSE (default) a logical vector with the is returned. If TRUE
#' a list with the logical vector is returned as well as the slopes. This is
#' used usually to help plot the fences.
#'
#' @source \emph{Statistical Data Cleaning}, Mark van der Loo and
#' Edwin de Jonge, 2018. Section 7.5.2, p. 176-179.
#'
#' @return If \code{info = FALSE} (default), logical vector where TRUE indicates
#' values outside the fence. If \code{info = TRUE}, a list with
#' the logical vector called \code{is_outside}, the list of slopes called
#' \code{slopes}, the list of offsets called \code{offsets}, and the data.frame
#' of fences data \code{fences}.
#' @export
prune_fence <- function(data, cols, is_offset = TRUE, info = FALSE) {
  checkmate::assertDataFrame(data, min.rows = 2, min.cols = 2)
  checkmate::assertCharacter(cols, min.chars = 1, len = 2, unique = TRUE,
                             any.missing = FALSE)
  checkmate::assertNames(cols, subset.of = names(data))
  checkmate::assertNumeric(data[, cols[1]], finite = TRUE)
  checkmate::assertNumeric(data[, cols[2]], finite = TRUE)
  checkmate::assertFlag(is_offset)

  x <- data[, cols[1]]
  y <- data[, cols[2]]

  # the data is scaled to the minimum of all data.
  # This normally increases the chances of the algorithm to work.
  # Some cases don't work when not using an offset.
  # See the test when is_offset = FALSE
  if (is_offset) {
    # the_offset = min(min(x, na.rm = TRUE), min(y, na.rm = TRUE))
    offsets <- list("x" = min(x, na.rm = TRUE), "y" = min(y, na.rm = TRUE))
  } else {
    # the_offset <- 0
    offsets <- list("x" = 0, "y" = 0)
  }
  # cat("\n", "offset", "\n")
  # print(the_offset)
  # cat("\n")
  # assertthat::assert_that(is.finite(the_offset))
  assertthat::assert_that(is.finite(offsets$x), is.finite(offsets$y))

  # scale x and y with offset
  x_scaled <- x - offsets$x
  y_scaled <- y - offsets$y
  # cat("\n", "x_scaled", "\n")
  # print(x_scaled)
  # cat("\n", "y_scaled", "\n")
  # print(y_scaled)
  # all values must be >= 0 and finite
  checkmate::assertNumeric(x_scaled, finite = TRUE, lower = 0)
  checkmate::assertNumeric(y_scaled, finite = TRUE, lower = 0)

  # get the slopes
  slopes <- prune_fence_slopes(x_scaled, y_scaled)

  # compute te coordinates of the fences
  fences <- list("small" = x_scaled * slopes$small,
                 "big" = x_scaled * slopes$big)

  # all rows with NA are considered outside the fences
  is_outside <- is.na(x_scaled) | is.na(y_scaled)

  # flag values outside the fences
  is_outside <- is_outside | y_scaled < fences$small
  is_outside <- is_outside | y_scaled > fences$big

  # the fence values
  results <- data.frame("x" = x,
                       "y" = y,
                       "small" = fences$small,
                       "big" = fences$big,
                       "small_inv" = fences$small + offsets$y,
                       "big_inv" = fences$big + offsets$y,
                       "oob" = is_outside)
  # must have same nb of rows as data
  assertthat::assert_that(nrow(results) == nrow(data))

  if(!info) {
    out <- is_outside
  } else {
    out <- list("oob" = is_outside,
                "slopes" = slopes,
                "offsets" = offsets,
                "results" = results)
  }
  out
}


#' Compute the Slopes Used for Fencing
#'
#' Compute the slopes used for fencing.
#'
#' The function compute the slopes and also applies several tests to the result.
#' Problem with the data will always usually affect the ratio.
#'
#' @param x Numeric vector. Must have finite values.
#' @param y Numeric vector. Must have finite values.
#' @param tol Number, tolerance when testing the ratios.
#'
#' @return List with \code{small_slope} and \code{big_slope}.
#' @export
prune_fence_slopes <- function(x, y, tol = .Machine$double.eps^0.5) {
  checkmate::assertNumeric(x, finite = TRUE, min.len = 3)
  checkmate::assertNumeric(y, finite = TRUE, len = length(x))
  checkmate::assertNumber(tol, lower = 0, finite = TRUE)

  # must only used values positive values to compute the slope
  # and make sure all NA are excluded
  x_pos <- x[x > 0 & !is.na(x)]
  y_pos <- y[y > 0 & !is.na(y)]
  msg <- sprintf("There are only %d positive values in scaled x.", length(x_pos))
  assertthat::assert_that(length(x_pos) >= 2, msg = msg)
  msg <- sprintf("There are only %d positive values in scaled y.", length(y_pos))
  assertthat::assert_that(length(y_pos) >= 2, msg = msg)
  # cat("\n", "inside: x_pos", "\n")
  # msg <- sprintf("Range of x_pos: %s", toString(range(x_pos)))
  # print(msg)
  # cat("inside: y_pos", "\n")
  # msg <- sprintf("Range of y_pos: %s", toString(range(y_pos)))
  # print(msg)
  # cat("\n")

  # the slope used with a given x to determine the limit above which y must be
  small_slope = min(x_pos, na.rm = TRUE) / max(y_pos, na.rm = TRUE)
  # the slope used with a given x to determine the limit below which y must be
  big_slope = max(x_pos, na.rm = TRUE) / min(y_pos, na.rm = TRUE)
  assertthat::assert_that(small_slope > 0, big_slope > 0)
  # cat("\n", "inside: the slopes", "\n")
  # print(sprintf("small slope = %f, big slope = %f", small_slope, big_slope))
  # cat("\n")


  x_rng <- range(x_pos)
  y_rng <- range(y_pos)
  if (diff(x_rng) < tol | diff(y_rng) < tol) {
    # Fencing does not work when at least one of the 2 data vectors
    # has constantv values
    msg_head <- "All pos values in x or y are about the same. Fencing won't work."
    msg_head <- cli::col_yellow(msg_head)
    msg_body <- "This happen if at least one of the 2 columns has constant data."
    msg_body <- c("X" = msg_body,
                  "i" = sprintf("x pos value range: %s", toString(x_rng)),
                  "i" = sprintf("y pos value range: %s", toString(y_rng)))
    msg <- paste(msg_head, rlang::format_error_bullets(msg_body), sep = "\n")
    rlang::abort(
      message = msg,
      class = "prune_fence_slopes_error1")
   }

  list("small" = small_slope, "big" = big_slope)
}