Nothing
#' @title Outlier detection for single or grouped time series
#'
#' @description This function provides anomaly signals (even a graphical visualization) when there is a 'jump' in a single time series, or the 'jump' is too much different respect those ones of grouped similar time series.
#'
#' @param dati data frame (grouped time series: phenomenon+date+group+values) or vector (single time series)
#' @param graph logical value for graphical analysis (default=FALSE)
#' @param linear_analysis logical value for linear analysis (default=FALSE)
#' @param val_test_limit value for outlier detection sensitiveness (default=5 ; max=10)
#' @param save_out logical value for saving detected outliers (default=FALSE)
#' @param out_out a character file name for saving outliers in csv form, delimited with ";" and using ',' as decimal separator (default out.csv)
#' @param pdf_out a character file name for saving graphic analysis in pdf file (default=out.pdf)
#' @param r_out rows number of graphs (default=3)
#' @param c_out cols number of graphs (default=2)
#' @param first_line value for first dotted line in graphic analysis (default=1)
#' @param pace_line value for pace in dotted line in graphic analysis (default=6)
#' @return Data frame of possible outliers in a triad. Output record: rows/time.2/series/y1/y2/y3/test(AV)/AV/ n/median(AV)/mad(AV)/madindex(AV). Where time.2 is the center of the triad y1, y2, y3; test(AV) is the number to compare with 5 to detect outlier; n is the number of observations of the group ....
#' @export
#' @examples
#' ## we can start with data without outliers but structured with co-movement between groups
#'data("dati")
#'## first column for phenomenon
#'## 2° col for time written in ordered numbers or strings
#'## 3° col for group classification variable
#'## 4° col for values
#'str(dati)
#'#######################################
#'## a data frame without any outlier
#'#######################################
#'out=wash.out(dati)
#'out ## empity data frame
#'length(out[,1]) ## no row
#'## we can add two outliers
#'#### time=3 temperature value=0
#'dati[99,4]= 0
#'## ... and then for "rain" phenomenon!
#'#### time=3 rain value=37
#'dati[118,4]= 37
#'#######################################
#'## data.frame with 2 fresh outliers
#'#######################################
#'out=wash.out(dati)
#'## all "three terms" time series
#'## let's take a look at anomalous time series
#'out
#'## ... the same but we save results in a file....
#'## If we don't specify a name, out.csv is the default
#'out=wash.out(dati,save_out=TRUE,out_out="tabel_out.csv")
#'out
#'## we put the parameter from 5 to 10, using this upper one to capture
#'## only particularly anomalous outliers
#'out=wash.out(dati, val_test_limit = 10)
#'out
#'## save plots and outliers in a pdf file "out.pdf" as a default
#'out=wash.out(dati, val_test_limit = 10, graph=TRUE)
#'out
#'## we can make the usual analysis for groups but we can also use that one
#'## reserved for every single time series
#'## (linear_analysis): two files for saved outliers (out.csv and linout.csv)
#'## and for graph display in two pdf files (out.pdf and linout.pdf)
#'out=wash.out(dati,val_test_limit=5,save_out=TRUE,linear_analysis=TRUE,graph=TRUE)
#'out
#'## out return only the linear analysis...
#'## ... in this case we lose the co-movement information an we run the risk
#'## of finding too much variance in a single time series
#'## and detecting not too much likely outliers
#'##########################################################
#'## single time series analysis
#'##########################################################
#'data(ts)
#'str(ts)
#'sts= ts$dati
#'plot(sts,type="b",pch=20,col="red")
#'## a time series with a variability and an increasing trend
#'## sts is a vector and linear analysis is the default one
#'out=wash.out(sts)
#'out
#'## we find no outlier
#'out=wash.out(sts,val_test_limit=5,linear_analysis=TRUE,graph=TRUE)
#'out
#'## no outlier
#'## We can add an outlier with limited amount
#'sts[5]=sts[5]*2
#'plot(sts,type="b",pch=20,col="red")
#'out=wash.out(sts,val_test_limit=5)
#'out
#'## test is over 5 for a bit
#'out=wash.out(sts,val_test_limit=5,save_out=TRUE,graph=TRUE)
#'out
#'data(ts)
#'sts= ts$dati
#'sts[5]=sts[5]*3
#'## we can try a greater value to put an outlier of a certain importance
#'plot(sts,type="b",pch=20,col="blue")
#'out=wash.out(sts,val_test_limit=5,save_out=TRUE,graph=TRUE)
#'out
#'## washer procedure identify three triads of outliers values
#'system("rm *.csv *.pdf")
#'
wash.out = function( dati ,
# p t i y
# dati structure: phenom./date/series/values/... other
graph=FALSE ,
linear_analysis=FALSE ,
val_test_limit = 5 ,
save_out=FALSE ,
out_out="out.csv" ,
pdf_out="out.pdf" ,
r_out =3 ,
c_out=2 ,
first_line =1 ,
pace_line = 6
)
{
## start function code
## sub function recall
washer2.AV = function( dati ) # p t i y
{ # dati structure: phenom./date/series/values/... other
# example: Phenomenon Time Zone Value ...
# ----------- -------- -- ----- --------
# Temperature 20091231 A1 20.1 ...
# Temperature 20091231 A2 21.0 ...
# ...
# Rain 20081231 B1 123.0 ...
# ...
###############################################################################################
AV = function(y) { # y matrix 3 columns (y1 y2 y3) and n rows
AV=array(0,length(y[,1]))
100*(2*y[,2]-y[,1]-y[,3])/(stats::median(y[,1]+y[,2]+y[,3])+ y[,1]+y[,2]+y[,3]) }
# output array AV
###############################################################################################
test.AV = function(AV) { # AV array n rows
t(rbind(test.AV=abs(AV-stats::median(AV))/stats::mad(AV),AV=AV,n=length(AV),median.AV=stats::median(AV),mad.AV=stats::mad(AV) ,
madindex.AV=stats::mad(AV)*1000/150 )) }
# col 1 2 3 5 6 7
# output: test / AV / n / median(AV) / mad(AV) / madindex
################################################################################################
if (min(dati[,4])> 0) {
dati=dati[which(!is.na(dati[,4])),]
dati=dati[order(dati[,1],dati[,3],dati[,2]),]
fen=rownames( table(dati[,1]) )
nfen=length(fen)
out= NA
for ( fi in 1:nfen)
{ print(c("phenomenon:",fi) ,quote=FALSE)
time=rownames( table(dati[which(fen[fi]==dati[,1]),2]) )
n=length(time)
for ( i in 2:(n-1) )
{ datiy= dati[dati[,1] == fen[fi] & dati[,2] %in% c(time[i-1],time[i],time[i+1]),c(2,3,4)]
y1=stats::reshape(datiy,timevar=colnames(datiy)[1],idvar=colnames(datiy)[2],direction="wide" )
y1=y1[!is.na(apply(y1[,c(2:4)],1,sum)),]
y=y1[,c(2,3,4)]
colnames(y)=c("t.1","t.2","t.3")
out=rbind(out,data.frame(fen=fen[fi],t.2=time[i],
series=y1[,1],y=y,test.AV(AV(y))))
}
}
rownames(out)=(1:length(out[,1])-1)
washer2.AV=out[2:length(out[,1]),]
# col 1 2 3 4 5 6 7 8 9 10 11 12
# output: rows /time.2/series/y1/y2/y3/test(AV)/AV/ n /median(AV)/mad(AV)/madindex(AV)
# end function washer2.AV
} else print(" . . . zero or negative y: t r a n s l a t i o n r e q u i r e d !!!")
}
####################################################################################################
td_w= function(dati, phen="T")
{ ## serie storica a gruppi di tre
## ----------------------------------
time=rep(0,(length(dati)-2)*3)
zone=rep(0,(length(dati)-2)*3)
value=rep(0,(length(dati)-2)*3)
out = data.frame(phen,time,zone,value)
k=1
for (i in 1:(length(dati)-2))
{for( j in 1:3)
{
out$zone[k+j-1] = i
out$time[k+j-1] = j
out$value[k+j-1] = dati[i+j-1]
}
k=k+3
}
return(out)
}
translate = function( dati )
{
min_var=min(dati[,4],na.rm = TRUE)
if (min_var>0) min_var = 0.0001
dati[,4]= dati[,4] - min_var+0.0001
return(dati)
}
inv_translate = function (out,dati)
{
min_var=min(dati[,4],na.rm = TRUE)
if (min_var>0) min_var = 0.0001
out$y.t.1 = out$y.t.1 +min_var -0.0001
out$y.t.2 = out$y.t.2 +min_var -0.0001
out$y.t.3 = out$y.t.3 +min_var -0.0001
return(out)
}
graph_f = function(dati,out,pdf_out,r_out,c_out,first_line,pace_line,val_test_limit)
{
#requireNamespace(gplots)
grDevices::pdf(pdf_out)
graphics::par(mfrow=c(r_out,c_out),cex.main=.6)
a= out[out[,"test.AV"]> val_test_limit,]
a= a[order(paste(a$fen,a$series)),]
series_old = 0
fen_old = 0
for (i in 1:length(a[,1]))
{
#######################
series=a[i,"series"]
fen=a[i,"fen"]
#######################
if( !(series == series_old & fen == fen_old))
{
elenco=a[a$series==a[i,"series"] & a$fen==a[i,"fen"],c(1,3,2,4,5,6,7)]
elenco[4:7]=round(elenco[4:7],2)
# names(elenco)[1]= "settore"
# names(elenco)[2] = "area"
if (length(elenco[,1]) > 25) elenco = elenco[1:25,]
gplots::textplot(elenco,cex = .50 , show.rownames = FALSE)
graphics::title("Outlier(s) list \n (max 25)")
##
cond = ( dati[,3]== series & dati[,1]== fen )
ss <- dati[ cond , c(2,4)]
ss=ss[order(ss[,1]),]
#ss[,1]=as.integer(ss[,1])
#ss
x=1:length(ss[,1])
time=ss[,1]
y=ss[,2]
graphics::plot(x,y,xaxt="n",type="l",pch=20, cex=0.5,col="blue",
main=paste('Series:',a[i,"series"],' phen. ',fen),
lwd=0.4)
graphics::points(x,y,col="darkblue",pch=20, cex=0.5)
graphics::axis(1, at=seq(first_line, length(ss[,1]), by = pace_line),
labels=time[seq(first_line, length(ss[,1]), by = pace_line)],
las=2,cex.axis=0.8)
graphics::abline(h=0, col = "blue",lty = 3)
graphics::abline(v=seq(first_line-0.1, length(ss[,1]), by = pace_line), col = "blue",lty = 3)
series_old = series
fen_old = fen
}
out_t2=which(ss[,1]==a[i,"t.2"])
graphics::abline(v=out_t2-1.1,col="red", lwd=0.1)
graphics::abline(v=out_t2+1.1,col="red", lwd =0.1)
}
grDevices::dev.off() #
}
if(is.vector(dati))
{ linear_analysis=TRUE
if(length(dati)<10)
{
print("No linear analisys: too few data!")
return(0)
}
if (min(dati)>=0 ) min_var = 0.0001
else min_var = min(dati)
dati= dati - min_var+0.0001
out= washer2.AV(td_w(dati))
out=out[out[,7]>val_test_limit,]
if( length(out[,1])==0) print("NO outlier!!!")
dum=out[,2]
out[,2]=out[,3]+1
out[,3]=dum
out$y.t.1 = out$y.t.1 +min_var -0.0001
out$y.t.2 = out$y.t.2 +min_var -0.0001
out$y.t.3 = out$y.t.3 +min_var -0.0001
dati= dati +min_var-0.0001
if(save_out & length(out[,1])>0){
utils::write.csv2(out[out[,7]>val_test_limit,],file=out_out)
print(paste("File '",out_out,"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
if(graph & length(out[,1])>0)
{
dati=data.frame(phen=rep("T",length(dati)),data=1:length(dati),series=rep(2,length(dati)),dati=dati)
#print(dati)
#print(out)
graph_f(dati,out,paste("lin",pdf_out,sep=""),r_out=2,c_out=1,first_line,pace_line,val_test_limit)
print(paste("File'",paste("lin",pdf_out,sep=""),"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
return(out[out[,7]>val_test_limit,])
}
else if(is.data.frame(dati))
{
out= washer2.AV(translate(dati))
out=out[out[,7]>val_test_limit,]
if( length(out[,1])==0) print("NO outlier!!!")
else out=inv_translate(out,dati)
if(save_out & length(out[,1])>0){
utils::write.csv2(out[out[,7]>val_test_limit,],file=out_out)
print(paste("File '",out_out,"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
if(graph & length(out[,1])>0)
{
graph_f(dati,out,pdf_out,r_out,c_out,first_line,pace_line,val_test_limit)
print(paste("File '",pdf_out,"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
if(linear_analysis)
{
fen_series=dati[,c(1,3)]
datiw=dati[,c(1,2,3,4)]
datiw[,1] = paste(datiw[,1],datiw[,3], sep="")
fen_series[,3]=paste("p_",datiw[,1],sep="")
datiw[,3] <- NULL
names(datiw)[1]="col"
names(datiw)[2]="DATA"
names(datiw)[3]="p"
datiw=stats::reshape(datiw,v.names="p",idvar="DATA",timevar="col",direction="wide",sep="_")
n_col = length(datiw[1,])
print(paste ("cols:",n_col))
n_row = length(datiw[,1])
if(n_row<10)
{
print("No linear analisys: too few data!")
return(out[out[,7]>val_test_limit,])
}
print(paste ("rows:",n_row))
mesi = datiw$DATA[c(-1,-n_row)]
###############################################################
## first elaboration out of cycle
###############################################################
min_var= min(datiw[,2],na.rm = TRUE)
datiw[,2]= datiw[,2] - min_var +0.0001
out=washer2.AV(td_w(datiw[,2]))
out$fen= colnames(datiw)[2]
out$t.2=mesi
out$y.t.1 = out$y.t.1 +min_var -0.0001
out$y.t.2 = out$y.t.2 +min_var -0.0001
out$y.t.3 = out$y.t.3 +min_var -0.0001
datiw[,2] = datiw[,2] +min_var -0.0001
###############################################################
for (i in 3:n_col)
###############################################################
{
min_var= min(datiw[,i],na.rm = TRUE)
datiw[,i]= datiw[,i] - min_var +0.0001
out0=washer2.AV(td_w(datiw[,i]))
out0$fen= colnames(datiw)[i]
out0$t.2=mesi
out0$y.t.1 = out0$y.t.1 +min_var -0.0001
out0$y.t.2 = out0$y.t.2 +min_var -0.0001
out0$y.t.3 = out0$y.t.3 +min_var -0.0001
datiw[,i] = datiw[,i] +min_var -0.0001
out=rbind(out,out0)
print(paste("Time series: ",i))
}
###############################################################
for (i in 1:length(out[,1]))
{pos=match(out[i,1],fen_series[,3])
as.character(fen_series[pos,1]) -> out[i,1]
as.character(fen_series[pos,2]) -> out[i,3]
}
}
out=out[out[,7]>val_test_limit,]
if(linear_analysis & save_out & length(out[,1])>0){
utils::write.csv2(out[out[,7]>val_test_limit,],file=paste("lin",out_out,sep=""))
print(paste("File 'lin",out_out,"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
if(linear_analysis & graph & length(out[,1])>0)
{
graph_f(dati,out,paste("lin",pdf_out,sep=""),r_out,c_out,first_line,pace_line,val_test_limit)
print(paste("File '",paste("lin",pdf_out,sep=""),"' saved!",sep=""))
print(paste("Dir -> ",getwd()))
}
}
else print ("not compliant data ")
#dev.off() #
return(out)
## end function
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.