#' Check for outliers in SIR models.
#'
#' This function takes a matrix \eqn{X} defining the same \eqn{n} individuals and a vector \eqn{y} defining also those individuals.
#'
#' @param X A matrix nxp describinf n observations through p varaibles.
#' @param y A vector of \strong{n} observations corresponding to the n rows of \emph{X}.
#' @param H A positive real value. The number of slices in the SIR model. Default is 5.
#' @param method Which method to be performed. One of \emph{BOOT} for bootstrap, \emph{TTR} for train-test replication and \emph{MONO} for mono-model testing.
#' @param kernel The kernel to be used, one of "normal" or "box". Default is "normal". Can be abbreviated. See \code{\link{ksmooth}}.
#' @param bandwidth A real indicating the width of the kernel. If NULL, automatic kernel width tuning is used. Default is "NULL".
#' @param nb.replications An interger giving the number of repliccations for \emph{BOOT} and \emph{TTR} methods. Default is "1000".
#' @param pourcent A real between 0 and 1. The proportion of observations to keep in the test dataset. Default is "0.1".
#' @param plotBandwidthEst An integer. Wether or not to plot the EDR direction with the kernel estimation. Default is "FALSE".
#' @param NCORES An integer. Fiwing the number of cores to be used in the sampling computations. Only BOOT method has been parallelized due to its computation time.
#'
#' @return A list containing indices of outliers in the case of the 3 methods and also the extreme observations in the case of \emph{BOOT}.
#'
#' @export
#' @importFrom stats ksmooth median na.omit
#' @importFrom grDevices boxplot.stats
#' @importFrom graphics abline lines mtext par plot points text title legend
#' @importFrom changepoint cpt.var
#' @importFrom dr dr dr.basis
#' @import foreach
#' @importFrom parallel makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#'
#' @references{
#' \insertAllCited{}
#' }
#'
#' @examples
#' data(ozone)
#' # MONO method
#' output <- outlierSIR(ozone[,-c(1,12,13)],ozone[,1],method="MONO")
#' # TTR method
#' # output <- outlierSIR(ozone[,-c(1,12,13)],ozone[,1],method="TTR",
#' # plotBandwidthEst = FALSE,kernel="normal",nb.replications = 1000)
#' # BOOT method, using 5 cores of computation
#' # output <- outlierSIR(ozone[,-c(1,12,13)],ozone[,1],method="BOOT",
#' # plotBandwidthEst = TRUE,kernel="normal",nb.replications = 5000,NCORES=5)
outlierSIR <- function(X,y,method=c("BOOT","TTR","MONO"),
kernel="normal",bandwidth=NULL,H=5,
nb.replications=1000,
pourcent=0.1,plotBandwidthEst=F,NCORES=1){
## Reset personnal plot par() settings
opar <- par(no.readonly =TRUE)
on.exit(par(opar))
## ----------------------------------
##########
## TEST ##
##########
## Unitarian test
test_vec <- function(vec){
a <- boxplot.stats(vec)$out
tete <- which(a<median(vec))
if(length(tete)>0){
a <- a[-tete]
}
which(vec %in% a)
}
# BOOT Version
detectOutliers <- function(BOOTS){
#-------------------
n <- nrow(BOOTS[[1]])
er <- BOOTS[[3]]
nb <- BOOTS[[1]]
mean_moins_sd_1 <- rep(NA,n)
for(i in 1:n){
df <- data.frame(x=nb[i,],y=er[i,])
mean_moins_sd_1[i] <- mean(df$y[-which(df$x %in% 0)],na.rm = T)
}
## Find outliers
outli <- test_vec(log(mean_moins_sd_1))
## Find outliers and extremes
extre <- test_vec(mean_moins_sd_1)
if(length(outli)>0 & length(extre)>0){
extre <- extre[-which(extre %in% outli)]
}
outli <- list(outliers = outli,extreme = extre)
}
bd_edr_est <- function(DONNEES,kernel="normal",H=5,plotBandwidthEst=F){
CV.prog <- function(x,y,hmin=(max(x)-min(x))/20,hmax=(max(x)-min(x))/2,nbh=25,
kernel="normal",plotBandwidthEst=F){
n<-length(x)
vecth<-seq(from=hmin,to=hmax,length=nbh)
matCV<-cbind(vecth,rep(0,nbh))
for (h in 1:nbh){
ypred<-rep(0,n)
for (i in 1:n){
ypred[i]<-ksmooth(x[-i], y[-i], kernel=kernel, bandwidth=vecth[h],x.points=x[i])$y
}
matCV[h,2]<-sum((y-as.matrix(ypred,ncol=1))^2)
}
matCV <- na.omit(matCV)
hopt<-matCV[which(matCV[,2]==min(matCV[,2])),1]
if (plotBandwidthEst){
par(mfrow=c(1,3))
plot(matCV,type="l",xlab="h",ylab="CV(h)")
abline(v=hopt,col=2,lwd=2)
title(paste("hopt=",round(hopt,digits=3)," with kernel '",kernel,"'",sep=""))
plot(x,y,main=paste("Regression line with kernel '",kernel,"'",sep=""))
lines(ksmooth(x, y, bandwidth=hopt,n.points=50,kernel=kernel))
}
list(matCV=matCV,hopt=hopt)
}
## ----------
res.est <- dr(y~., method="sir", nslices = H, data=DONNEES)
B.est <- dr.basis(res.est)[,1,drop=FALSE]
indice.est <- as.vector(as.matrix(DONNEES[,-1])%*%B.est)
list(bwd=CV.prog(indice.est,
DONNEES$y,
plotBandwidthEst=plotBandwidthEst,kernel = kernel)$hopt,B.est=B.est)
}
###########
### TTR ###
###########
method_TTR <- function(DONNEES,nb.replications=1000,
pourcent=0.35,H=5,bandwidth=NULL,
plotBandwidthEst=F){
# Bandwith estimation
res <- bd_edr_est(DONNEES,H=H,plotBandwidthEst=plotBandwidthEst,kernel=kernel)
B.est <- res$B.est
if(is.null(bandwidth)){
bandwidth <- res$bwd
}
matrice.distances <- matrix(NA,nrow=nrow(DONNEES),ncol=nb.replications)
for (r in 1 :nb.replications){
indice.train <- sample(1:nrow(DONNEES),replace=FALSE,size=round(nrow(DONNEES)*(1-pourcent),digits = 0))
donnees.train <- DONNEES[indice.train,]
donnees.test <- as.matrix(DONNEES[-indice.train,])
res.r <- dr(y~., method="sir", nslices = H, data=donnees.train)
B.r <- dr.basis(res.r)[,1,drop=FALSE]
donnees.train <-as.matrix(donnees.train)
indice.test <- c(1:nrow(DONNEES))[-indice.train]
for (i in 1:length(indice.test)){
matrice.distances[indice.test[i],r] <- abs(
ksmooth(x = as.vector(donnees.train[,-1]%*%B.r),y=donnees.train[,1],
bandwidth = bandwidth,kernel="normal",x.points = sum(donnees.test[i,-1]*B.r))$y
- donnees.test[i,1])
}
}
vecteur.distances <- apply(matrice.distances,1,function(l){mean(na.omit(l))})
# Automatic determination of the number of outliers
#--------------------------------------------------
indice.outlier.NaN <- which(is.na(vecteur.distances))
if (length(indice.outlier.NaN)>0){
err_mean <- apply(matrice.distances[-indice.outlier.NaN,],1,function(rr){mean(na.omit(rr))})
}
if (length(indice.outlier.NaN)==0){
err_mean <- apply(matrice.distances,1,function(rr){mean(na.omit(rr))})
}
ord_mean <- rev(order(err_mean))
seg1=cpt.var(err_mean[ord_mean],method="BinSeg",Q=1)
nbre.choisi.outliers <- seg1@cpts[1]
indice.outlier <- rev(order(vecteur.distances))[1:(nbre.choisi.outliers+length(indice.outlier.NaN))] #
nb_outliers_ttr <- length(indice.outlier)
indice.outlier
}
############
### BOOT ###
############
method_BOOT <- function(DONNEES,#zerosAndOnes=F,
nb.replications=1000,
H=5,bandwidth=NULL,
kernel="normal",
plotBandwidthEst=F){
matrice.distances=matrice.realizations=
matrice.nb_input <- matrix(0,nrow=nrow(DONNEES),ncol=nb.replications)
# Bandwith estimation
res <- bd_edr_est(DONNEES,H=H,plotBandwidthEst=plotBandwidthEst,kernel=kernel)
B.est <- res$B.est
if(is.null(bandwidth)){
bandwidth <- res$bwd
}
`%my_do%` <- ifelse(NCORES!=1,
{
out<-`%dopar%`
cl <- makeCluster(NCORES)
registerDoParallel(cl)
out},
{
out <- `%do%`
out})
MATRICES <- foreach(r=1:nb.replications,
.combine = rbind,.packages = "dr") %my_do% {
# for (r in 1:nb.replications){
# if(!zerosAndOnes){
indice.train <- sample(1:nrow(DONNEES),replace=TRUE,size=nrow(DONNEES))
# }else{
# indice.train_01 <- sample(c(0,1),replace=TRUE,size=nrow(DONNEES))
# indice.train <- (1:nrow(DONNEES))[which(indice.train_01==1)]
# }
matrice.distances=matrice.realizations=matrice.nb_input <- rep(0,nrow(DONNEES))
id_train <- as.numeric(names(table(indice.train)))
matrice.nb_input[id_train] <- table(indice.train)
donnees.train <- DONNEES[indice.train,]
donnees.test <- as.matrix(DONNEES)
res.r <- dr(y~., method="sir", nslices = H, data=donnees.train)
B.r <- dr.basis(res.r)[,1,drop=FALSE]
donnees.train <-as.matrix(donnees.train)
indice.test <- 1:nrow(DONNEES)
x.points <- as.vector(donnees.test[indice.test,-1,drop=F]%*%B.r)
realisations <- ksmooth(x = as.vector(donnees.train[,-1]%*%B.r),
y=donnees.train[,1],
bandwidth = bandwidth,
x.points = x.points,
kernel=kernel)
index_reals <- unlist(lapply(x.points,function(xo){which(realisations$x==xo)}))
matrice.realizations <- realisations$y[index_reals]
matrice.distances <-
abs(
matrice.realizations
- donnees.test[,1]
)
c(matrice.nb_input,matrice.realizations,matrice.distances)
}
if(NCORES!=1){
stopCluster(cl)
}
pp <- ncol(MATRICES)
matrice.nb_input <- t(MATRICES[,1:(pp/3),drop=F])
matrice.realizations <- t(MATRICES[,pp/3+1:(pp/3),drop=F])
matrice.distances <- t(MATRICES[,2*pp/3+1:(pp/3),drop=F])
list(
matrice.nb_input,
matrice.realizations,
matrice.distances,
B.est)
}
############
### MONO ###
############
method_MONO <- function(DONNEES,
H=5,bandwidth=NULL,
kernel="normal",plotBandwidthEst=F){
nb.replications <- nrow(DONNEES)
matrice.distances=
matrice.realizations=
matrice.nb_input <- matrix(0,nrow=nrow(DONNEES),ncol=1)
# Bandwith estimation
res <- bd_edr_est(DONNEES,H=H,plotBandwidthEst=plotBandwidthEst,kernel=kernel)
B.est <- res$B.est
if(is.null(bandwidth)){
bandwidth <- res$bwd
}
r <- 1
indice.train <- 1:nrow(DONNEES)
matrice.nb_input[as.numeric(names(table(indice.train))),r] <-
table(indice.train)
donnees.train <- DONNEES[indice.train,]
donnees.test <- as.matrix(DONNEES)
res.r <- dr(y~., method="sir", nslices = H, data=donnees.train)
B.r <- dr.basis(res.r)[,1,drop=FALSE]
donnees.train <-as.matrix(donnees.train)
indice.test <- 1:nrow(DONNEES)
for(j in indice.test){
matrice.realizations[j,r] <-
ksmooth(x = as.vector(donnees.train[,-1]%*%B.r),
y=donnees.train[,1],
bandwidth = bandwidth,
x.points = as.vector(donnees.test[j,-1,drop=F]%*%B.r),
kernel=kernel
)$y
}
matrice.distances[,r] <-
abs(
matrice.realizations[,r]
- donnees.test[,1]
)
list(
matrice.nb_input,
matrice.realizations,
matrice.distances,
B.est)
}
############
### PLOT ###
############
plot_solution_no_TRUE <- function(DONNEES,id_in,H=5){
if(is.list(id_in)){
indice.outlier <- id_in$outliers
extr <- id_in$extreme
}else{
indice.outlier <- id_in
}
# Estimation de la direction EDR sur les donnees completes
#---------------------------------------------------------
ids <- if(length(indice.outlier)>0)-indice.outlier else 1:nrow(DONNEES)
res.est <- dr(y~., method="sir", nslices = H, data=DONNEES[ids,])
B.est <- dr.basis(res.est)[,1,drop=FALSE]
plot(as.matrix(DONNEES[,-1])%*%B.est,DONNEES$y, pch=16,ylab="y",
xlab="",main=method)
mtext(text = "Estimated index",side = 1,line = 2)
if(length(indice.outlier)>0){
yy <- DONNEES$y[indice.outlier]
id_left <- which(yy<median(yy))
id_right <- which(yy>=median(yy))
points(as.matrix(DONNEES[indice.outlier,-1])%*%B.est, DONNEES$y[indice.outlier],
col="blue",pch=1,cex=2)
text(as.matrix(DONNEES[indice.outlier[id_left],-1])%*%B.est,
DONNEES$y[indice.outlier][id_left],
as.character(indice.outlier)[id_left],cex=1.5,col="blue",pos = 3)
text(as.matrix(DONNEES[indice.outlier[id_right],-1])%*%B.est,
DONNEES$y[indice.outlier][id_right],
as.character(indice.outlier)[id_right],cex=1.5,col="blue",pos = 1)
}
if(is.list(id_in)){
legend("topleft",legend = c("Normal observation","Extreme observation", "Outlier"),
fill=c("black","darkorange","blue"),bty="n")
if(length(extr)>0){
yy <- DONNEES$y[extr]
id_left <- which(yy<median(yy))
id_right <- which(yy>=median(yy))
points(as.matrix(DONNEES[extr,-1])%*%B.est, DONNEES$y[extr],
col="darkorange",pch=1,cex=2)
text(as.matrix(DONNEES[extr[id_left],-1])%*%B.est,
DONNEES$y[extr][id_left],
as.character(extr)[id_left],cex=1.5,col="darkorange",pos = 3)
text(as.matrix(DONNEES[extr[id_right],-1])%*%B.est,
DONNEES$y[extr][id_right],
as.character(extr)[id_right],cex=1.5,col="darkorange",pos = 1)
}
}else{
legend("topleft",legend = c("Normal observation", "Outlier"),
fill= c("black","blue"),bty="n")
}
}
## ----------- ##
#################
### MAIN PART ###
#################
## ----------- ##
DONNEES <- data.frame(list(y=y,X))
time <- system.time({
if(method=="MONO"){
MONOS <- method_MONO(DONNEES,H = H, kernel=kernel,bandwidth=bandwidth,
plotBandwidthEst=plotBandwidthEst)
output <- test_vec(MONOS[[3]])
}else if(method=="TTR"){
output <- method_TTR(DONNEES,nb.replications=nb.replications,
pourcent=pourcent,H=H,bandwidth=bandwidth,
plotBandwidthEst=plotBandwidthEst)
}else{
BOOTS <- method_BOOT(DONNEES,
nb.replications = nb.replications,
H = H, kernel=kernel,bandwidth=bandwidth,
plotBandwidthEst=plotBandwidthEst)
output <- detectOutliers(BOOTS)
}
})[3]
cat(paste("Time elapsed",round(time,3),"\n"))
plot_solution_no_TRUE(DONNEES,output)
output
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.