R/get_anomaly_score.R

Defines functions get_anomaly_score

Documented in get_anomaly_score

#' Estimate anomaly-score for each value of each Key of a data frame in three steps
#' 1. Identification of Changepoints and subsequent segmentation of the time series
#' 2. (Optional) Partly Normalization of Values of each Key
#' 3. Estimating anomaly-score based on residuals of generalized-arma and (normal) arima-model plus
#' remainder based on stl (time series decomposition)
#' The user can control which kind of anomalies she is interested in regarding three dimensions:
#'    1. speed: set normalize_weight and method_weights$garima to 0 in order to increase speed
#'    2. global vs local anomalies: stl finds global outliers, arima and garima local outliers conditional on last values
#'    3. balanced vs geometrical intuitive: garima-anomaly-score is generally symmetric distributed. Distribution of stl- and arima
#'       anomaly-scores depend on normalization beforehand. No normalization and right-skewed original data will lead to right-skewed
#'       anomaly-score-distribution here.
#'
#' @param df data frame containing at least Columns Key, Date and Value
#' @param method_weights List of Weights for calculation of overall anomaly-score. Setting garima = 0 significantly increases computational performance
#' @param normalize_weight Share of Normalization. Only affects arima and stl. Setting to 0 singificantly increases computational performance
#' @param x.matrix optional matrix of regressors, use to model seasonality
#' @return List containing extended inputted df possessing method-specific and global anomaly-score, and input-params
#' @export
#' @import tidyverse
#' @import lubridate
#' @import magrittr
#' @import assertthat
#' @import forecast
#' @import strucchange
#' @import gamlss
#' @import gamlss.util
#' @import anomalize
get_anomaly_score <- function(df, method_weights = list(garima = 0.5, arima = 0.5, stl = 0.5),
                              normalize_weight = 0, remove_seasonality = TRUE){

  assertthat::assert_that(is.data.frame(df), msg = "df must be of class 'data.frame'")
  assertthat::assert_that(all(c("Key", "Date", "Value") %in% colnames(df)), msg = "df must contain colums 'Key', 'Date' and 'Value'")
  assertthat::assert_that(is.numeric(df$Value), msg = "Value must be numeric")

  assertthat::assert_that(all(names(method_weights) == c("garima", "arima", "stl")), msg = "method_weights incorrect - must be in correct order")
  assertthat::assert_that(all(method_weights %>% unlist >= 0), msg = "all method_weights must be > 0 ")
  assertthat::assert_that((method_weights %>% unlist %>% sum) > 0, msg = "method_weights sum must be > 0")

  assertthat::assert_that(normalize_weight >= 0 & normalize_weight <= 1, msg = "normalize_weight must be between 0 and 1")

  if(method_weights$arima == 0 & method_weights$stl == 0 & normalize_weight > 0){
    warning("normalize_weight will only affects arima and stl method - will be ignored now")
    normalize_weight <- 0
  }

  if((method_weights %>% unlist %>% sum) != 1) warning("method_weights do not sum up to 1 - will be scaled proportionally")

  method_weights %<>% lapply(function(x) x / method_weights %>% unlist %>% sum)

  # get data granularity per Key
  df %<>% get_granularity()

  # get changepoints and segment
  df %<>% group_by(Key) %>% mutate(breaks_vector = get_changepoints(Value)) %>%
    mutate(segment = cumsum(breaks_vector))


  # if wanted, remove seasonality
  if(remove_seasonality){
    df %<>% group_by(Key) %>% mutate(Value.ds = remove_seasonality(dates = Date, values = Value, granularity = granularity))
  } else {
    df %<>% mutate(Value.ds = Value)
  }

  # get garima score
  if(method_weights$garima > 0) df %<>%  group_by(segment, Key) %>%
    mutate(anomaly_score_garima = get_garima_score(Value.ds)) #2.6 min je 25

  ### prepare arima and stl

  # get scaled unnormalized Values
  df %<>% mutate(Value.scaled = as.numeric(scale(Value.ds)))

  # if set, get normalized values and calculate average between scaled and normalized into Value.use
  if(normalize_weight > 0){
    df %<>% group_by(Key, segment) %>%
      mutate(Value.normalized = get_normalized_values(Value.ds)) %>%
      rowwise() %>%
      mutate(Value.use = (1-normalize_weight) * Value.scaled + normalize_weight * Value.normalized) %>%
      group_by(Key, segment)
  } else {
    df %<>% mutate(Value.use = Value.scaled)
  }


  # get arima score
  if(method_weights$arima > 0){
    df %<>%  group_by(segment, Key) %>%
      mutate(anomaly_score_arima = get_arima_score(x = Value.use))
  }

  # get stl score
  if(method_weights$stl > 0){
    df %<>% group_by(segment, Key) %>%
      mutate(anomaly_score_stl = get_stl_score(dates = Date, values = Value.use,
                                               granularity = granularity))
  }

  df %<>% ungroup() %>%
    mutate_at(vars(contains("anomaly_score")),
              function(x) as.numeric(scale(x)))

  # calculate overall anomaly score
  anomaly_score <- as.matrix(df %>% select_at(vars(contains("anomaly")))) %*%
    as.matrix(unlist(method_weights)[unlist(method_weights) != 0]) %>% c

  df %<>% mutate(anomaly_score = anomaly_score)

  output_list <- list(df = df, method_weights = method_weights, normalize_weight = normalize_weight)
  return(output_list)
}
td-berlin/anomalizer documentation built on Feb. 21, 2020, 2:03 a.m.