R/get_changepoints.R

Defines functions get_changepoints

Documented in get_changepoints

#' Detect changepoints (mean shifts) in a numeric time series
#'
#' Fits a piecewise-constant model (`x ~ 1`) with
#' `strucchange::breakpoints()` and then chooses the number of breaks as the
#' largest `k` for which going from `k - 1` to `k` breaks still shrinks the
#' residual sum of squares (RSS) enough, i.e.
#' `RSS(k - 1) / RSS(k) > 1 + min_RSS_decrease`.
#'
#' @param x Numeric vector containing the time-series values.
#' @param min_segment_size Minimum number of data points between two
#'   changepoints. It is only used when `min_segment_size / length(x)` is at
#'   least `min_segment_share`; otherwise `min_segment_share` is used instead.
#'   If it is at least half of `length(x)`, no search is performed and no
#'   changepoints are reported.
#' @param min_segment_share Minimum share of data points between two
#'   changepoints. Decreasing this value makes the search considerably slower.
#' @param min_RSS_decrease Minimum relative RSS decrease a break has to
#'   achieve in order to be accepted. The lower the value, the more
#'   changepoints are found.
#' @return Logical vector of the same length as `x`; `TRUE` marks the
#'   positions that `strucchange` reports as breakpoints.
#' @export
get_changepoints <- function(x, min_segment_size = 50,
                             min_segment_share = 0.2, min_RSS_decrease = 0.2) {
  # No library() calls here: attaching packages from inside a function changes
  # the caller's search path. Everything external is reached via `pkg::`.
  assertthat::assert_that(is.numeric(x), msg = "x must be numeric")
  assertthat::assert_that(is.numeric(min_RSS_decrease),
                          msg = "min_RSS_decrease must be numeric")
  assertthat::assert_that(is.numeric(min_segment_size),
                          msg = "min_segment_size must be numeric")
  assertthat::assert_that(is.numeric(min_segment_share),
                          msg = "min_segment_share must be numeric")
  assertthat::assert_that(
    min_RSS_decrease >= 0,
    msg = "min_RSS_decrease must be greater than or equal to 0"
  )
  assertthat::assert_that(min_segment_size > 0,
                          msg = "min_segment_size must be greater than 0")

  n_obs <- length(x)
  no_changepoints <- rep(FALSE, n_obs)

  # Two segments of the requested size would not fit into the series.
  if (min_segment_size >= 0.5 * n_obs) {
    return(no_changepoints)
  }

  # `h` is handed to strucchange either as a share of the observations or as
  # an absolute count, whichever of the two constraints is the stricter one.
  if (min_segment_size / n_obs < min_segment_share) {
    h <- min_segment_share
  } else {
    h <- min_segment_size
  }

  # NOTE(review): the parameter documentation attributes the runtime cost to
  # `min_segment_share`, yet this check looks at `min_segment_size` - confirm
  # which one was intended before changing the condition.
  if (min_segment_size < .1) {
    warning("Small min_segment_size effects runtime of the function exponentially")
  }
  if (min_RSS_decrease > 1) {
    warning(paste0(
      "A min_RSS_decrease-Value greater than 1 means, that\n",
      "            only changepoints will be returned, that at least halv the RSS"
    ))
  }

  # NOTE(review): strucchange documents the parallelisation argument as `hpc`;
  # `hcp` is presumably swallowed by `...` and ignored - confirm before
  # renaming, because enabling it requires the foreach package.
  fit <- strucchange::breakpoints(x ~ 1, hcp = "foreach", het.err = TRUE, h = h)
  fit_summary <- summary(fit)

  # Drop the second row of the RSS table (presumably the BIC row - verify
  # against the strucchange docs) so only the RSS per number of breaks is left.
  rss <- as.numeric(fit_summary$RSS[-2, ])

  # Element i compares the model with i - 1 breaks against the one with i
  # breaks. which() silently drops NA/NaN ratios (e.g. 0 / 0 when the series
  # is fitted perfectly), which would otherwise break the `if` below.
  rss_ratio <- rss[-length(rss)] / rss[-1]
  accepted <- which(rss_ratio > 1 + min_RSS_decrease)
  if (length(accepted) == 0L) {
    return(no_changepoints)
  }
  n_breaks <- max(accepted)

  # Row `n_breaks` lists the break positions of the model with that many
  # breaks; unused slots in the row are NA.
  positions <- fit_summary$breakpoints[n_breaks, ]
  positions <- as.numeric(positions[!is.na(positions)])

  is_changepoint <- no_changepoints
  is_changepoint[positions] <- TRUE
  is_changepoint
}
td-berlin/anomalizer documentation built on Feb. 21, 2020, 2:03 a.m.