# R/funop.R In Sielinski/vacuum: Tukey's Vacuum Cleaner

#### Documented in funop

#' Identifies outliers in a numeric vector
#' @description
#' FUNOP stands for FUll NOrmal Plot.
#'
#' The procedure identifies outliers by calculating their slope (\code{z}),
#' relative to the vector's median.
#'
#' The procedure ignores values in the middle third of the \emph{ordered}
#' vector. The remaining values are all candidates for consideration. The
#' slopes of candidate values are calculated, and the median of their slopes
#' is used as the primary basis for identifying outliers.
#'
#' Any value whose slope is \code{B} times larger than the median slope is
#' identified as an outlier. Additionally, any value whose \emph{magnitude}
#' is larger than that of the slope-based outliers is also identified as
#' an outlier.
#'
#' However, the procedure will \emph{not} identify as outliers any values
#' within \code{A} standard deviations of the vector's median (i.e., not
#' the median of candidate slopes).
#' @param x
#' Numeric vector to inspect for outliers (does not need to be ordered)
#' @param A
#' Number of standard deviations beyond the median of \code{x}
#' @param B
#' Multiples beyond the median slope of candidate values
#' @return
#' A data frame containing one row for every member of \code{x} (in the same
#' order as \code{x}) and the
#' following columns:
#' * \code{y}: Original values of vector \code{x}
#' * \code{i}: Ordinal position of value \code{y} in the sorted vector \code{x}
#' * \code{middle}: Boolean indicating whether ordinal position \code{i} is in the middle third of the vector
#' * \code{a}: Result of \code{a_qnorm(i, length(x))}
#' * \code{z}: Slope of \code{y} relative to \code{median(y)}
#' * \code{special}: Boolean indicating whether \code{y} is an outlier
#'
#' Additionally, the data frame will have the following attributes,
#' which FUNOP calculates as part of its procedure:
#' * \code{y_split}: Median of the vector
#' * \code{y_trimmed}: Mean of the top and bottom thirds of the ordered vector
#' * \code{z_split}: Median slope of candidate values
#' @seealso [vacuum::a_qnorm()]
#' @references
#' Tukey, John W. "The Future of Data Analysis."
#' \emph{The Annals of Mathematical Statistics},
#' \emph{33}(1), 1962, pp 1-67. \emph{JSTOR},
#' \url{https://www.jstor.org/stable/2237638}.
#' @examples
#' funop(c(1, 2, 3, 11))
#' funop(table_1)
#'
#' attr(funop(table_1), 'z_split')
#' @export

funop <- function(x, A = 0, B = 1.5) {
special <- orig_order <- y <- i <- middle <- a <- z <- NULL

if (!is.vector(x) | !is.numeric(x)) {
warning('argument "x" must be a numeric vector')
} else if (length(A) != 1 | !is.numeric(A)) {
warning('argument "A" must be a single numeric value')
} else if (length(B) != 1 | !is.numeric(B)) {
warning('argument "B" must be a single numeric value')
} else if (anyNA(c(A, B))) {
warning('arguments "A" and "B" must be single numeric values')
} else {

# (b1)
# Let a_{i|n} be a typical value for the ith ordered observation in
# a sample of n from a unit normal distribution.
n <- length(x)

# initialze dataframe to hold results
result <- data.frame(
y = x,
orig_order = 1:n,
a = NA,
z = NA,
middle = FALSE,
special = FALSE
)

# put array in order
result <- result %>%
dplyr::arrange(x) %>%
dplyr::mutate(i = dplyr::row_number())

# calculate a_{i|n}
result$a <- a_qnorm(result$i, n)

# (b2)
# Let y_1 ≤ y_2 ≤ … ≤ y_n be the ordered values to be examined.
# Let y_split be their median (or let y_trimmed be the mean of the y_i
# with  (1/3)n < i ≤ (2/3)n).
middle_third <- (floor(n / 3) + 1):ceiling(2 * n / 3)
outer_thirds <- (1:n)[-middle_third]

result$middle[middle_third] <- TRUE y_split <- stats::median(result$y)
y_trimmed <- mean(result$y[middle_third]) # (b3) # For i ≤ (1/3)n or > (2/3)n only, # let z_i = (y_i – y_split) / a_{i|n} # (or let z_i = (y_i – y_trimmed) / a_{i|n}). result$z[outer_thirds] <-
(result$y[outer_thirds] - y_split) / result$a[outer_thirds]

# (b4)
# Let z_split be the median of the z’s thus obtained.
z_split <- stats::median(result$z[outer_thirds]) # (b5) # Give special attention to z’s for which both # |y_i – y_split| ≥ A · z_split and z_i ≥ B · z_split # where A and B are prechosen. result$special <-
ifelse(result$z >= (B * z_split) & abs(result$y - y_split) >= (A * z_split), TRUE, FALSE)

# (b5*)
# Particularly for small n, z_j’s with j more extreme than an i
# for which (b5) selects z_i also deserve special attention.

# in the top third, look for values larger than ones already found
top_third <- outer_thirds[outer_thirds > max(middle_third)]

# take advantage of the fact that we've already indexed our result set
# and simply look for values of i larger than the smallest i in the
# top third (further to the right of our x-axis)
if (any(result$special[top_third])) { min_i <- result[top_third, ] %>% dplyr::filter(special == TRUE) min_i <- min(min_i$i)

result$special[which(result$i > min_i)] <- TRUE
}

# in the top third, look for values smaller than ones already found
bottom_third <- outer_thirds[outer_thirds < min(middle_third)]

# look for values of i smaller than the largest i in the bottom third
# (further to the left of our x-axis)
if (any(result$special[bottom_third])) { max_i <- result[bottom_third, ] %>% dplyr::filter(special == TRUE) max_i <- max(max_i$i)

result$special[which(result$i < max_i)] <- TRUE
}

result <- result %>%
dplyr::arrange(orig_order) %>%
dplyr::select(y, i, middle, a, z, special)

attr(result, 'y_split') <- y_split
attr(result, 'y_trimmed') <- y_trimmed
attr(result, 'z_split') <- z_split

result
}
}

Sielinski/vacuum documentation built on Sept. 15, 2020, 11:32 a.m.