R/TestPointIsAnomaly.R

Defines functions TestPointIsAnomaly

Documented in TestPointIsAnomaly

#' @title Check Whether Test Point is Anomaly
#'
#' @description Assume the \code{training} set was generated by a process that
#'   follows a certain distribution, i.e. "normal" or "t". This function checks
#'   whether to reject the null hypothesis that the \code{test} point was
#'   generated by the same process. The test point is standardised as
#'   \code{(test - median) / mad} using the training set, and that score is
#'   compared with the quantiles of the reference distribution. If the
#'   probability of obtaining a result equal to or more extreme than
#'   \code{test} is lower than \code{p}, this function returns \code{TRUE},
#'   meaning the null hypothesis is rejected and the \code{test} point is
#'   likely to be an anomaly. If argument \code{exclude} is specified, elements
#'   at those designated positions are removed from the training set.
#'
#' @param training A numeric vector containing the samples used to fit the
#'   distribution.
#' @param test A numeric value to be tested.
#' @param dist A string specifying the distribution to fit. Options are
#'   t-distribution (default: \code{dist = "t"}) and normal distribution
#'   (\code{dist = "normal"}). Any other value is an error.
#' @param exclude A logical vector with length equal to
#'   \code{length(training)}. It is used to remove elements at designated
#'   positions from fitting the distribution. By default, \code{exclude = NULL},
#'   which means no element is excluded.
#' @param p p-value threshold with values in \emph{[0, 1]}. For
#'   \code{direction = 'both'} it is split evenly between the two tails.
#' @param direction Directionality of the anomalies to be detected. Options
#'   are: \emph{'pos'}, \emph{'neg'} and \emph{'both'}. Defaults to
#'   \emph{'both'}. Any other value is an error.
#' @param ... Further arguments forwarded to the quantile and density functions
#'   of the chosen distribution (\code{qt}/\code{dt} for \code{dist = "t"},
#'   \code{qnorm}/\code{dnorm} for \code{dist = "normal"}). For
#'   \code{dist = "t"} this must include \code{df}.
#' @return \code{TRUE} if the test point is likely to be an anomaly and
#'   \code{FALSE} otherwise. For debugging purposes the result also carries the
#'   attributes \code{median}, \code{mad}, \code{score} and
#'   \code{hist}.\itemize{\item \code{median} sample median of training set
#'   \item \code{mad} sample median absolute deviation calculated from training
#'   set (see \code{\link[stats]{mad}}) \item \code{score} defined as
#'   \code{(test - median) / mad}; it is signed, so points below the median
#'   have a negative score \item \code{hist} histogram of training points
#'   overlaid with the density function used to fit}
#' @examples
#' set.seed(1)
#' training <- rt(1000, df = 10)
#' test <- 4
#' exclude <- sample(c(TRUE, FALSE), 1000, replace = TRUE, prob = c(0.005, 0.995))
#' r <- TestPointIsAnomaly(training, test, dist = "t", exclude = exclude, p = 0.1, df = 10)
#' attr(r, "hist")
#' training <- rnorm(1000, mean = 0, sd = 10)
#' r <- TestPointIsAnomaly(training, test, dist = "normal", p = 0.1)
#' attr(r, "hist")
#' @import ggplot2
#' @export
TestPointIsAnomaly <- function(training, test, dist = "t", exclude = NULL,
                               p = 0.01, direction = "both", ...) {
  # Fail fast on unsupported options instead of silently comparing against a
  # placeholder threshold (unknown dist) or a NULL result (unknown direction).
  dist <- match.arg(dist, c("t", "normal"))
  direction <- match.arg(direction, c("both", "pos", "neg"))

  if (!is.null(exclude)) {
    training <- training[!exclude]
  }

  center <- median(training)
  spread <- mad(training)
  # Signed, standardised distance of the test point from the training centre.
  # The sign must be kept so that 'neg' and 'pos' can tell the tails apart.
  score <- (test - center) / spread

  # A two-sided test splits p evenly between the lower and the upper tail.
  tail_prob <- if (direction == "both") p / 2 else p

  # The score is already standardised, so the thresholds are taken from the
  # reference distribution on the same (standardised) scale.
  quantile_fun <- switch(dist, t = qt, normal = qnorm)
  lower <- quantile_fun(tail_prob, ...)
  upper <- quantile_fun(tail_prob, lower.tail = FALSE, ...)

  # Density curve overlaid on the histogram of the raw training values: the
  # normal curve is located/scaled by the training median and mad, the t curve
  # only receives the arguments passed through `...`.
  if (dist == "t") {
    density_fun <- dt
    density_args <- list(...)
  } else {
    density_fun <- dnorm
    density_args <- list(mean = center, sd = spread, ...)
  }
  plot <- ggplot(data.frame(training = training), aes(x = training)) +
    geom_histogram(bins = 30, color = "gray", aes(y = ..density..)) +
    stat_function(fun = density_fun, color = "red", lwd = 2,
                  xlim = range(training), args = density_args)

  # Elementwise `|` keeps the two-sided branch consistent with the one-sided
  # branches, which are already elementwise comparisons.
  isAnomaly <- switch(direction,
                      pos = score >= upper,
                      neg = score <= lower,
                      both = score <= lower | score >= upper)

  attr(isAnomaly, "mad") <- spread
  attr(isAnomaly, "score") <- score
  attr(isAnomaly, "hist") <- plot
  attr(isAnomaly, "median") <- center
  isAnomaly
}
jingjin1018/anetimeseries documentation built on May 19, 2019, 10:35 a.m.