# R/lof.R In rushkin/outlier: Outlier detection in multidimensional data

#### Documented in lof

```#' Local Outlier Factor (LOF) values
#'
#' Calculates the local outlier factor (LOF) for multidimensional data
#'
#' @param x a matrix or data frame of data points. Each row is a data point, each column is a dimension.
#' @param k number of nearest neighbors in the LOF calculation (default 5).
#' @param metric distance metric to use. This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski". Any unambiguous substring can be given, case-insensitive
#' @param q the power of the Minkowski distance.
#' @return a vector of LOF values, in the same order as the data points in \code{x}. NA values will be returned for those data points which contained at least one NA coordinate.
#' @export
lof=function(x, k=5, metric='euclidean', q=3){

metrics=c('euclidean','maximum','manhattan','canberra','binary','minkowski')
metric=pmatch(tolower(metric),metrics)
if(is.na(metric)) stop('invalid distance metric name')
if(metric==-1) stop('ambiguous distance metric name')
metric=metrics[metric]

x=as.matrix(x)

indices=1:nrow(x)
indna=which(apply(x,1,function(x){any(is.na(x))}))
N=length(indices)

distmat=as.matrix(dist(x,method=metric,p=q))
diag(distmat)=NA

#Indices of k nearest neighbors. A point may have more than k values in case there are distance-ties. So the nrows of this matrix is from the longest series of indices, and missing values are filled with NAs.
nk=apply(distmat,1,function(y){

ind=which(rank(y,ties.method = 'min',na.last=TRUE)<=k)

ind=c(ind,rep(NA,N-length(ind)))
return(ind)

})
nk=nk[rowSums(nk,na.rm=TRUE)>0,]

#Add to it another row, which simply contains the indices. This is convenient later. The nrows will become l.
l=nrow(nk)+1
nkaugmented=rbind(nk,indices)

#k-distances of points. returns a vector (named)
kdist=apply(nkaugmented,2,function(y){

temp=distmat[y[l],y[-l]]
temp=temp[!is.na(temp)]
if(length(temp)==0) return(NA)

return(max(temp))

})

#convert kdist to a matrix by replicating the same row:
kdist=matrix(kdist,nrow = 1)
kdist=kdist[rep(1,N),]

#Calculate reachability distance matrix
reachdist=pmax(kdist,distmat)

#Local reachability density. Returns a vector (named)
lrd=1/apply(nkaugmented,2,function(y){
temp=reachdist[y[l],y[-l]]
temp=temp[!is.na(temp)]
if(length(temp)==0) return(NA)
return(mean(temp))
})

#Local outlier factor. Returns a vector
lof=apply(nk,2,function(y){
mean(lrd[y],na.rm=TRUE)
})/lrd
names(lof)=NULL

lof[indna]=NA
return(lof)
}
```
rushkin/outlier documentation built on Oct. 13, 2018, 10:48 a.m.