twextras: twextras

Documented in var.windsorized windsorize

#' Windsorize data
#' 
#' @param x      a numeric vector.
#' @param method possible choices are "trim" for trimming
#'               arbitrary range of values or "mad" for
#'               removing values that are outlying from the
#'               median by \code{distance * MAD}. Values
#'               outside that range are replaced with the
#'               nearest endpoint.
#' @param trim   the fraction (0 to 0.5) of observations to be
#'               trimmed from each end of x. Values
#'               outside that range are replaced with the
#'               nearest endpoint.
#' @param dist   distance from the median when using
#'               \code{method = "mad"}.
#' 
#' @references 
#' Keselman, H. J., Algina, J., Lix, L. M., Wilcox, R. R.,
#' & Deering, K. (2008). A generally robust approach for
#' testing hypotheses and setting confidence intervals for
#' effect sizes. Psychological Methods, 13, 110–129.
#' 
#' @importFrom stats quantile mad
#' @export

windsorize <- function(x, method = c("trim", "outliers"), trim = 0.2, dist = 3) {
  method <- match.arg(method)
  
  if (method == "trim") {
    
    trim <- trim/2
    stopifnot(trim >= 0 && trim < 0.5)
    lim <- quantile(x, probs=c(trim, 1-trim))
    n <- sum(x >= lim[1] & x <= lim[2], na.rm = TRUE)
    x[ x < lim[1] ] <- lim[1]
    x[ x > lim[2] ] <- lim[2]
    
  } else {
    
    stopifnot(dist > 0)
    med <- median(x)
    y <- x - med
    sc <- mad(y, center = 0) * dist
    n <- sum(y > -sc & y < sc, na.rm = TRUE)
    y[y > sc] <- sc
    y[y < -sc] <- -sc
    x <- y + med
    lim <- c(max(min(x), med-sc), min(max(x), med+sc))
    
  }

  structure(x, class = "windsorized",
            neff = n, limits = lim,
            method = method)
}


#' @export

print.windsorized <- function(x, ...) {
  print(as.numeric(x), ...)
  cat("n = ", length(x))
  cat(", neff = ", attr(x, "neff"))
  cat(sprintf(" (%0.1f%%)", attr(x, "neff")/length(x)*100))
  cat(", limits: ", attr(x, "limits"))
}


#' @rdname windsorize
#' @export

var.windsorized <- function(x, ...) {
  (var(x, ...)*(length(x)-1)) / (attr(x, "samplesize")-1) 
}