#' Windsorize data
#'
#' @param x a numeric vector.
#' @param method possible choices are "trim" for trimming
#' arbitrary range of values or "mad" for
#' removing values that are outlying from the
#' median by \code{distance * MAD}. Values
#' outside that range are replaced with the
#' nearest endpoint.
#' @param trim the fraction (0 to 0.5) of observations to be
#' trimmed from each end of x. Values
#' outside that range are replaced with the
#' nearest endpoint.
#' @param dist distance from the median when using
#' \code{method = "mad"}.
#'
#' @references
#' Keselman, H. J., Algina, J., Lix, L. M., Wilcox, R. R.,
#' & Deering, K. (2008). A generally robust approach for
#' testing hypotheses and setting confidence intervals for
#' effect sizes. Psychological Methods, 13, 110–129.
#'
#' @importFrom stats quantile mad
#' @export
windsorize <- function(x, method = c("trim", "outliers"), trim = 0.2, dist = 3) {
method <- match.arg(method)
if (method == "trim") {
trim <- trim/2
stopifnot(trim >= 0 && trim < 0.5)
lim <- quantile(x, probs=c(trim, 1-trim))
n <- sum(x >= lim[1] & x <= lim[2], na.rm = TRUE)
x[ x < lim[1] ] <- lim[1]
x[ x > lim[2] ] <- lim[2]
} else {
stopifnot(dist > 0)
med <- median(x)
y <- x - med
sc <- mad(y, center = 0) * dist
n <- sum(y > -sc & y < sc, na.rm = TRUE)
y[y > sc] <- sc
y[y < -sc] <- -sc
x <- y + med
lim <- c(max(min(x), med-sc), min(max(x), med+sc))
}
structure(x, class = "windsorized",
neff = n, limits = lim,
method = method)
}
#' @export
print.windsorized <- function(x, ...) {
print(as.numeric(x), ...)
cat("n = ", length(x))
cat(", neff = ", attr(x, "neff"))
cat(sprintf(" (%0.1f%%)", attr(x, "neff")/length(x)*100))
cat(", limits: ", attr(x, "limits"))
}
#' @rdname windsorize
#' @export
var.windsorized <- function(x, ...) {
(var(x, ...)*(length(x)-1)) / (attr(x, "samplesize")-1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.