#' Apply outlier detection method
#
#'
#' @param time_series ts object. The signal were to applied the outlier method.
#' @param outlier_method String. Defines the type of outlier method to be applied.
#' Options: *Standard Deviation*, *Iglewicz Hoaglin Method*, *Winsorizing* and *IQR*.
#' @param data_rule String. Defines the classification of the data according to the number of observations.
#' Options: *Historical*, *Moving Average Error* and *Rstl Error*.
#' @param mov_avg_n Integer. Defines how many periods to consider to smooth the series.
#' @param threshold_val Double. Defines the interval cut-off the extreme values. Between $0$ and $1$.
#' @param causal_factor Logical. Defines if the series have regressors or not.
#' @param lag Double. Defines the number of lags to be used for moving average
#' @import pracma
#' @importFrom DescTools Winsorize
#' @importFrom stats sd ts quantile IQR median mad stl
#' @importFrom stlplus stlplus
#' @import dvmisc
#' @author Sze Gee
#'
#' @return ts object.
#' @export
#'
#' @examples
#' \dontrun{
#' outlier_cleansing()
#' }
outlier_cleansing = function(time_series , outlier_method, data_rule , mov_avg_n, threshold_val, causal_factor = T, lag = 4){
if(class(time_series)!="ts"){
time_series <- ts(time_series, start = 1, frequency = 12)
}
if(outlier_method == "Standard Deviation"){ # Same as KNX
if(data_rule == "Historical"){
new_ts <- time_series
avg <- mean(new_ts)
s_d <- sd(new_ts)
new_ts[new_ts < avg - threshold_val * s_d] = avg - threshold_val * s_d # lower threshold
new_ts[new_ts > avg + threshold_val * s_d] = avg + threshold_val * s_d # upper threshold
if(sum(new_ts) == 0){ new_ts = time_series} # if outlier method changes all values to zero, revert back to original ts
new_ts <- pmax(0, new_ts) # Coerce negative values to zero
return(
list(new_ts = new_ts
, method = "Standard Deviation"
, data_rule = "Historical"
, upper_threshold = replace(x = time_series, list = 1:length(time_series), values = avg + threshold_val*s_d)
, lower_threshold = replace(x = time_series, list = 1:length(time_series), values = avg - threshold_val*s_d))
)
}
if(data_rule == "Moving Average Error"){ # ok
new_ts = time_series
ts_ma = movavg(new_ts, n = mov_avg_n, type = "s")
resi <- new_ts - ts_ma
avg <- mean(resi)
s_d <- sd(resi)
resi_new <- resi
resi_new[resi_new < (avg-threshold_val * sd(resi))] = (avg - threshold_val * sd(resi)) # lower threshold
resi_new[resi_new > (avg+threshold_val * sd(resi))] = (avg + threshold_val * sd(resi)) # upper threshold
adjustment = resi_new - resi
new_ts = time_series + adjustment
if(sum(new_ts) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "SD",
data_rule = "Moving average",
upper_threshold = (time_series- resi + threshold_val * s_d),
lower_threshold = (time_series- resi - threshold_val * s_d))
)
}
if(data_rule == "Rstl Error" & causal_factor ){ # ok
rstl = stlplus(time_series,s.window = "periodic",n_p = 12)
resi = rstl$data[,"remainder"]
avg = mean(resi[!(is.na(resi))])
s_d = sd(resi[!(is.na(resi))])
resi_new = resi
resi_new[resi_new< (avg-threshold_val * sd( resi)) ] = (avg-threshold_val * s_d) # lower threshold
resi_new[resi_new> (avg+threshold_val * sd( resi)) ] = (avg+threshold_val * s_d) # upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts[!is.na(new_ts)]) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "Standard Deviation",
data_rule = "Rstl Error",
upper_threshold = (time_series- rstl$data[,"remainder"]+ threshold_val * s_d) ,
lower_threshold = (time_series- rstl$data[,"remainder"]- threshold_val * s_d))
)
}
if(data_rule == "Rstl Error" & causal_factor == F){ # use stl for ts with causal factor
rstl = stl(time_series,s.window = "periodic",robust = T)
resi = rstl[["time.series"]][, 3]
avg = mean(resi)
s_d = sd(resi)
resi_new = resi
resi_new[resi_new< (avg-threshold_val * sd( resi)) ] = (avg-threshold_val * s_d) # lower threshold
resi_new[resi_new> (avg+threshold_val * sd( resi)) ] = (avg+threshold_val * s_d) # upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "Standard Deviation",
data_rule = "Rstl Error",
upper_threshold = (time_series- rstl[["time.series"]][, 3]+ threshold_val * s_d) ,
lower_threshold = (time_series- rstl[["time.series"]][, 3]- threshold_val * s_d))
)
}
}
if(outlier_method == "Iglewicz Hoaglin Method"){
if(data_rule == "Historical"){ # ok
dist_lower = median(time_series) - threshold_val*(mad(time_series,constant = 1)/0.6745)
dist_upper = median(time_series) + threshold_val*(mad(time_series,constant = 1)/0.6745)
new_ts <- time_series
new_ts[time_series < dist_lower] <- dist_lower
new_ts[time_series > dist_upper] <- dist_upper
if(sum(new_ts) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0, new_ts) # Coerce negative values to zero
return(
list(new_ts = new_ts
, method = "Iglewicz Hoaglin Method"
, data_rule = "Historical"
, upper_threshold = replace(x =time_series,list = 1:length(time_series), values = dist_lower)
, lower_threshold = replace(x =time_series,list = 1:length(time_series), values = dist_upper))
)
}
if(data_rule == "Moving Average Error"){ # ok
ts_ma <- movavg(time_series, n = mov_avg_n, type = "s")
resi <- time_series - ts_ma
resi_new <- resi
dist_lower = - threshold_val * (mad(resi, constant = 1)/0.6745) # see "-"
dist_upper = threshold_val * (mad(resi, constant = 1)/0.6745)
resi_new[resi_new < dist_lower ] = dist_lower # lower threshold
resi_new[resi_new > dist_upper ] = dist_upper # upper threshold
adjustment = resi_new - resi
new_ts = time_series + adjustment
if(sum(new_ts) == 0){ new_ts = time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts <- pmax(0, new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "Iglewicz Hoaglin Method",
data_rule = "Moving Average Error",
upper_threshold = time_series - resi + dist_upper,
lower_threshold = time_series - resi + dist_lower)
)
}
if(data_rule == "Rstl Error" & causal_factor){ # ok
rstl = stlplus(time_series,s.window = "periodic",robust = T)
resi = rstl$data[,"remainder"]
resi_new = resi
dist_lower = - threshold_val*(mad(resi[!is.na(resi)],constant = 1)/0.6745)
dist_upper = threshold_val*(mad(resi[!is.na(resi)],constant = 1)/0.6745)
resi_new[resi_new< dist_lower ] = dist_lower #lower threshold
resi_new[resi_new> dist_upper ] = dist_upper #upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts[!is.na(new_ts)]) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
lower = time_series - rstl$data[,"remainder"] + dist_lower
upper = time_series - rstl$data[,"remainder"] + dist_upper
return(list(new_ts = new_ts,
method = "Iglewicz Hoaglin Method",
data_rule = "Rstl Error",
upper_threshold = upper,
lower_threshold = lower)
)
}
if(data_rule == "Rstl Error" & causal_factor == F){
rstl = stl(time_series,s.window = "periodic",robust = T)
resi = rstl[["time.series"]][, 3]
resi_new = resi
dist_lower = - threshold_val*(mad(resi,constant = 1)/0.6745)
dist_upper = threshold_val*(mad(resi,constant = 1)/0.6745)
resi_new[resi_new< dist_lower ] = dist_lower #lower threshold
resi_new[resi_new> dist_upper ] = dist_upper #upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts[!is.na(new_ts)]) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
lower = time_series - rstl[["time.series"]][, 3] + dist_lower
upper = time_series - rstl[["time.series"]][, 3] + dist_upper
return(list(new_ts = new_ts,
method = "Iglewicz Hoaglin Method",
data_rule = "Rstl Error",
upper_threshold = upper,
lower_threshold = lower)
)
}
}
if(outlier_method == "Winsorizing"){
if(data_rule == "Historical"){ # same as Kinaxis
new_ts = time_series
new_ts = Winsorize(new_ts, probs=c(threshold_val, 1 - threshold_val), na.rm = TRUE)
if(sum(new_ts, na.rm = T) == 0){new_ts = time_series} # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0, new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "Winzorize",
data_rule = "Hist",
upper_threshold = replace(x =time_series,list = 1:length(time_series), values = max(new_ts)),
lower_threshold = replace(x =time_series,list = 1:length(time_series), values = min(new_ts)))
)
}
if(data_rule == "Moving Average Error"){ # same as Kinaxis
ts_ma = movavg(time_series, n = mov_avg_n , type = "s")
resi = time_series - ts_ma
resi_new = resi
win_resi = Winsorize(resi, probs=c(threshold_val,1-threshold_val))
resi_new[resi_new< min(win_resi) ] = min(win_resi) #lower threshold
resi_new[resi_new> max(win_resi) ] = max(win_resi) #upper threshold
adjustment =resi_new - resi
new_ts = time_series + adjustment
if(sum(new_ts) == 0){new_ts = time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0, new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "Winsorizing",
data_rule = "Moving Average Error",
upper_threshold = time_series - resi + max(win_resi),
lower_threshold = time_series - resi + min(win_resi))
)
}
if(data_rule == "Rstl Error" & causal_factor == T){
#This is not correct_ normal STL should be applied if there's no regressor attached to that GMID_
stl_try <- try(stlplus(time_series, n.p = 12, s.window = "periodic"), silent = TRUE)
if(class(stl_try) != "try-error"){
rstl = stlplus(time_series,s.window = "periodic",n_p = 12)
resi = rstl$data[,"remainder"]
resi_new = resi
win_resi = Winsorize(resi, probs=c(threshold_val, 1-threshold_val), na.rm = T )
resi_new[resi_new < min(win_resi)] = min(win_resi) #lower threshold
resi_new[resi_new > max(win_resi)] = max(win_resi) #upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts[!(is.na(new_ts))]) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
lower = time_series - rstl$data[,"remainder"] + min(win_resi, na.rm = T)
upper = time_series - rstl$data[,"remainder"] + max(win_resi, na.rm = T)
return(list(new_ts = new_ts,
method = "Winsorizing",
data_rule = "Rstl Error",
upper_threshold = upper,
lower_threshold = lower)
)
} else {
new_ts = time_series
new_ts = Winsorize(new_ts, probs=c(threshold_val, 1-threshold_val), na.rm = TRUE)
if(sum(new_ts, na.rm = TRUE) == 0){new_ts = time_series} # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0, new_ts)
return(list(new_ts = new_ts,
method = "Winzorize",
data_rule = "Hist",
upper_threshold = replace(x =time_series,list = 1:length(time_series), values = max(new_ts, na.rm = T)),
lower_threshold = replace(x =time_series,list = 1:length(time_series), values = min(new_ts, na.rm = T))
)
)
}
}
if(data_rule == "Rstl Error" & causal_factor == F){
rstl = stl(time_series, s.window = "periodic", robust = T)
resi = rstl[["time.series"]][, 3]
resi_new = resi
win_resi = Winsorize(resi, probs=c(threshold_val, 1 - threshold_val),na.rm = T )
resi_new[resi_new< min(win_resi) ] = min(win_resi) #lower threshold
resi_new[resi_new> max(win_resi) ] = max(win_resi) #upper threshold
adjustment =resi_new -resi
new_ts = time_series + adjustment
if(sum(new_ts[!(is.na(new_ts))]) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
lower = time_series - rstl[["time.series"]][, 3] + min(win_resi,na.rm = T)
upper = time_series - rstl[["time.series"]][, 3] + max(win_resi,na.rm = T)
return(list(new_ts = new_ts,
method = "Winsorizing",
data_rule = "Rstl Error",
upper_threshold = upper,
lower_threshold = lower)
)
}
}
if(outlier_method == "IQR"){
new_ts = time_series
qnt = quantile(new_ts, probs=c(0.25, 0.75))
caps = quantile(new_ts, probs=c(0.05, 0.95))
H = 1.5*IQR(new_ts)
new_ts[new_ts<(qnt[1]-H)] = caps[1]
new_ts[new_ts>(qnt[2]+H)] = caps[2]
if(sum(new_ts) == 0){new_ts =time_series } # if outlier method changes all values to zero, revert back to original ts
new_ts = pmax(0,new_ts) # Coerce negative values to zero
return(list(new_ts = new_ts,
method = "IQR",
upper_threshold = qnt[2]+H,
lower_threshold = qnt[1]-H)
)
}
warning("No method applied, original time series is returned_")
return(list(new_ts = time_series))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.