R/capLargeValues.R

#' @title Convert large/infinite numeric values in a data.frame or task.
#'
#' @description
#' Convert numeric entries which large/infinite (absolute) values
#' in a data.frame or task.
#' Only numeric/integer columns are affected.
#'
#' @template arg_taskdf
#' @param target [\code{character}]\cr
#'   Name of the column(s) specifying the response.
#'   Target columns will not be capped.
#'   Default is \code{character(0)}.
#' @param cols [\code{character}]\cr
#'   Which columns to convert.
#'   Default is all numeric columns.
#' @param threshold [\code{numeric(1)}]\cr
#'   Threshold for capping.
#'   Every entry whose absolute value is equal or larger is converted.
#'   Default is \code{Inf}.
#' @param impute [\code{numeric(1)}]\cr
#'   Replacement value for large entries.
#'   Large negative entries are converted to \code{-impute}.
#'   Default is \code{threshold}.
#' @param what [\code{character(1)}]\cr
#'   What kind of entries are affected?
#'   \dQuote{abs} means \code{abs(x) > threshold},
#'   \dQuote{pos} means \code{abs(x) > threshold && x > 0},
#'   \dQuote{neg} means \code{abs(x) > threshold && x < 0}.
#'   Default is \dQuote{abs}.
#' @return [\code{data.frame}]
#' @export
#' @family eda_and_preprocess
#' @examples
#' capLargeValues(iris, threshold = 5, impute = 5)
capLargeValues = function(obj, target = character(0L), cols = NULL,
  threshold = Inf, impute = threshold, what = "abs") {
  checkTargetPreproc(obj, target, cols)
  assertNumber(threshold, lower = 0)
  assertNumber(impute, lower = 0)
  assertChoice(what, c("abs", "pos", "neg"))
  UseMethod("capLargeValues")
}

#' @export
capLargeValues.Task = function(obj, target = character(0L), cols = NULL,
  threshold = Inf, impute = threshold, what = "abs") {
  d = getTaskData(obj)
  d = capLargeValues.data.frame(d, target = character(0L), cols = cols,
    threshold = threshold, impute = impute)
  changeData(obj, data = d)
}

#' @export
capLargeValues.data.frame = function(obj, target = character(0L), cols = NULL,
  threshold = Inf, impute = threshold, what = "abs") {
  allnumfeats = colnames(obj)[vlapply(obj, is.numeric)]
  allnumfeats = setdiff(allnumfeats, target)

  # check that user requested cols are only numeric cols with the target
  if (!is.null(cols))
    assertSubset(cols, allnumfeats)
  else
    cols = allnumfeats

  fun = switch(what,
    abs = function(x) abs(x) > threshold,
    pos = function(x) abs(x) > threshold & x > 0,
    neg = function(x) abs(x) > threshold & x < 0
  )

  for (cn in cols) {
    x = obj[[cn]]
    ind = which(fun(x))
    if (length(ind) > 0L)
      obj[ind, cn] = ifelse(x[ind] > threshold, impute, -impute)
  }
  return(obj)
}
Najah-lshanableh/R-data-mining2 documentation built on May 6, 2019, 10:11 a.m.