#' Estimate anomaly-score for each value of each Key of a data frame in three steps
#' 1. Identification of Changepoints and subsequent segmentation of the time series
#' 2. (Optional) Partly Normalization of Values of each Key
#' 3. Estimating anomaly-score based on residuals of generalized-arma and (normal) arima-model plus
#' remainder based on stl (time series decomposition)
#' The user can control which kind of anomalies she is interested in regarding three dimensions:
#' 1. speed: set normalize_weight and method_weights$garima to 0 in order to increase speed
#' 2. global vs local anomalies: stl finds global outliers, arima and garima local outliers conditional on last values
#' 3. balanced vs geometrical intuitive: garima-anomaly-score is generally symmetric distributed. Distribution of stl- and arima
#' anomaly-scores depend on normalization beforehand. No normalization and right-skewed original data will lead to right-skewed
#' anomaly-score-distribution here.
#'
#' @param df data frame containing at least Columns Key, Date and Value
#' @param method_weights List of Weights for calculation of overall anomaly-score. Setting garima = 0 significantly increases computational performance
#' @param normalize_weight Share of Normalization. Only affects arima and stl. Setting to 0 singificantly increases computational performance
#' @param x.matrix optional matrix of regressors, use to model seasonality
#' @return List containing extended inputted df possessing method-specific and global anomaly-score, and input-params
#' @export
#' @import tidyverse
#' @import lubridate
#' @import magrittr
#' @import assertthat
#' @import forecast
#' @import strucchange
#' @import gamlss
#' @import gamlss.util
#' @import anomalize
get_anomaly_score <- function(df, method_weights = list(garima = 0.5, arima = 0.5, stl = 0.5),
normalize_weight = 0, remove_seasonality = TRUE){
assertthat::assert_that(is.data.frame(df), msg = "df must be of class 'data.frame'")
assertthat::assert_that(all(c("Key", "Date", "Value") %in% colnames(df)), msg = "df must contain colums 'Key', 'Date' and 'Value'")
assertthat::assert_that(is.numeric(df$Value), msg = "Value must be numeric")
assertthat::assert_that(all(names(method_weights) == c("garima", "arima", "stl")), msg = "method_weights incorrect - must be in correct order")
assertthat::assert_that(all(method_weights %>% unlist >= 0), msg = "all method_weights must be > 0 ")
assertthat::assert_that((method_weights %>% unlist %>% sum) > 0, msg = "method_weights sum must be > 0")
assertthat::assert_that(normalize_weight >= 0 & normalize_weight <= 1, msg = "normalize_weight must be between 0 and 1")
if(method_weights$arima == 0 & method_weights$stl == 0 & normalize_weight > 0){
warning("normalize_weight will only affects arima and stl method - will be ignored now")
normalize_weight <- 0
}
if((method_weights %>% unlist %>% sum) != 1) warning("method_weights do not sum up to 1 - will be scaled proportionally")
method_weights %<>% lapply(function(x) x / method_weights %>% unlist %>% sum)
# get data granularity per Key
df %<>% get_granularity()
# get changepoints and segment
df %<>% group_by(Key) %>% mutate(breaks_vector = get_changepoints(Value)) %>%
mutate(segment = cumsum(breaks_vector))
# if wanted, remove seasonality
if(remove_seasonality){
df %<>% group_by(Key) %>% mutate(Value.ds = remove_seasonality(dates = Date, values = Value, granularity = granularity))
} else {
df %<>% mutate(Value.ds = Value)
}
# get garima score
if(method_weights$garima > 0) df %<>% group_by(segment, Key) %>%
mutate(anomaly_score_garima = get_garima_score(Value.ds)) #2.6 min je 25
### prepare arima and stl
# get scaled unnormalized Values
df %<>% mutate(Value.scaled = as.numeric(scale(Value.ds)))
# if set, get normalized values and calculate average between scaled and normalized into Value.use
if(normalize_weight > 0){
df %<>% group_by(Key, segment) %>%
mutate(Value.normalized = get_normalized_values(Value.ds)) %>%
rowwise() %>%
mutate(Value.use = (1-normalize_weight) * Value.scaled + normalize_weight * Value.normalized) %>%
group_by(Key, segment)
} else {
df %<>% mutate(Value.use = Value.scaled)
}
# get arima score
if(method_weights$arima > 0){
df %<>% group_by(segment, Key) %>%
mutate(anomaly_score_arima = get_arima_score(x = Value.use))
}
# get stl score
if(method_weights$stl > 0){
df %<>% group_by(segment, Key) %>%
mutate(anomaly_score_stl = get_stl_score(dates = Date, values = Value.use,
granularity = granularity))
}
df %<>% ungroup() %>%
mutate_at(vars(contains("anomaly_score")),
function(x) as.numeric(scale(x)))
# calculate overall anomaly score
anomaly_score <- as.matrix(df %>% select_at(vars(contains("anomaly")))) %*%
as.matrix(unlist(method_weights)[unlist(method_weights) != 0]) %>% c
df %<>% mutate(anomaly_score = anomaly_score)
output_list <- list(df = df, method_weights = method_weights, normalize_weight = normalize_weight)
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.