Nothing
#' Outlier detection for compositional data
#'
#' Outlier detection for compositional data using standard and robust
#' statistical methods.
#'
#' The outlier detection procedure is based on (robust) Mahalanobis distances
#' in isometric logratio coordinates. Observations with
#' squared Mahalanobis distance greater equal a certain quantile of the
#' chi-squared distribution are marked as outliers.
#'
#' If method \dQuote{robust} is chosen, the outlier detection is based on the
#' homogeneous majority of the compositional data set. If method
#' \dQuote{standard} is used, standard measures of location and scatter are
#' applied during the outlier detection procedure. Method \dQuote{robust}
#' can be used if the number of variables is greater than the number
#' of observations. Here the OGK estimator is chosen.
#'
#' plot method: the Mahalanobis distance are plotted against the index.
#' The dashed line indicates the (1 - alpha) quantile of the chi-squared
#' distribution. Observations with Mahalanobis distance greater than this
#' quantile could be considered as compositional outliers.
#'
#' @aliases outCoDa print.outCoDa plot.outCoDa
#' @param x compositional data
#' @param quantile quantile, corresponding to a significance level, is used as
#' a cut-off value for outlier identification: observations with larger
#' (squared) robust Mahalanobis distance are considered as potential outliers.
#' @param method either \dQuote{robust} (default) or \dQuote{standard}
#' @param alpha the size of the subsets for the robust covariance estimation
#' according the MCD-estimator for which the determinant is minimized, see \code{\link[robustbase]{covMcd}}.
#' @param coda if TRUE, data transformed to coordinate representation before outlier detection.
#' @param y unused second plot argument for the plot method
#' @param ... additional parameters for print and plot method passed through
#' @return \item{mahalDist }{resulting Mahalanobis distance} \item{limit
#' }{quantile of the Chi-squared distribution} \item{outlierIndex }{logical
#' vector indicating outliers and non-outliers} \item{method }{method used}
#' @note It is highly recommended to use the robust version of the procedure.
#' @author Matthias Templ, Karel Hron
#' @seealso \code{\link{pivotCoord}}
#' @references Egozcue J.J., Pawlowsky-Glahn, V., Mateu-Figueras, G.,
#' Barcelo-Vidal, C. (2003) Isometric logratio transformations for compositional
#' data analysis. \emph{Mathematical Geology}, 35 (3) 279-300.
#'
#' Filzmoser, P., and Hron, K. (2008) Outlier detection for compositional data
#' using robust methods. \emph{Math. Geosciences}, 40, 233-248.
#'
#' Rousseeuw, P.J., Van Driessen, K. (1999) A fast algorithm for the minimum
#' covariance determinant estimator. \emph{Technometrics}, 41, 212-223.
#' @keywords multivariate
#' @export
#' @examples
#'
#' data(expenditures)
#' oD <- outCoDa(expenditures)
#' oD
#' ## providing a function:
#' oD <- outCoDa(expenditures, coda = log)
#' ## for high-dimensional data:
#' oD <- outCoDa(expenditures, method = "robustHD")
outCoDa <- function(x, quantile=0.975, method="robust",
alpha = 0.5, coda = TRUE){
if(dim(x)[2] < 2) stop("need data with at least 2 variables")
if(alpha < 0.5 | alpha > 1) stop("allowed values for h are between 0.5 and 1")
covEst <- function(x, type) {
standard <- function(x){
list(mean=colMeans(x, na.rm=TRUE), varmat=cov(x))
}
robust <- function(x){
v <- robustbase::covMcd(x, alpha = alpha)
list(mean=v$center, varmat=v$cov)
}
robustHD <- function(x){
v <- robustbase::covOGK(x, sigmamu = robustbase::s_Qn)
list(mean=v$center, varmat=v$cov)
}
switch(type,
standard = standard(x),
robust = robust(x),
robustHD = robustHD(x))
}
if(!is.logical(coda) & !is.function(coda)){
stop("coda must be logical or function")
}
if(!is.logical(coda)){
x <- coda(x)
} else if (coda){
x <- pivotCoord(x)
}
cv <- covEst(x, "robust")
cvc <- covEst(x, "standard")
dM <- sqrt(mahalanobis(x, center=cv$mean, cov=cv$varmat))
dMc <- sqrt(mahalanobis(x, center=cvc$mean, cov=cvc$varmat))
# limit <- sqrt(qchisq(p=quantile, df=ncol(x)-1))
limit <- sqrt(qchisq(p=quantile, df=ncol(x)))
res <- list(mahalDist = dM, limit = limit,
outlierIndex = dM > limit, method=method,
om2 = dMc > limit,
m2=dMc, coda = coda)
class(res) <- "outCoDa"
invisible(res)
}
#' @rdname outCoDa
#' @export
#' @method print outCoDa
print.outCoDa <- function(x, ...){
cat("\n --------------------\n")
print(paste(length(which(x$outlierIndex == TRUE)), "out of", length(x$outlierIndex), "observations are detected as outliers."))
cat("\n --------------------\n\n")
}
#' @rdname outCoDa
#' @export
#' @method plot outCoDa
#' @param which 1 ... MD against index
#' 2 ... distance-distance plot
plot.outCoDa <- function(x, y, ..., which = 1){
# plot(1:length(x$mahalDist), x$mahalDist, ylab="Mahalanobis distance", xlab="Index", type="n", ylim=c(0, max(x$mahalDist)) )
# points(1:length(x$mahalDist[x$outlierIndex]), x$mahalDist[x$outlierIndex], pch=3, col="red")
# points(1:length(x$mahalDist[!x$outlierIndex]), x$mahalDist[!x$outlierIndex])
# abline(h=x$limit, lty=2)
# #print(x$mahalDist)
if(which == 1){
if(x$method =="standard") yl <- "Mahalanobis distance" else yl <- "Robust Mahalanobis distance"
plot(1:length(x$mahalDist), x$mahalDist, ylab = yl,
xlab = "Index", ylim = c(0, max(x$mahalDist)),
pch = as.numeric(x$outlierIndex)*2+1)
abline(h=x$limit, lty=2)
}
if(which == 2){
if(x$method == "standard"){
return(cat("apply method robust in outCoDa to make a distance-distance plot available"))
}
n <- length(x$mahalDist)
outlier <- ifelse(x$outlierIndex & x$om2, "both",
ifelse(x$outlierIndex & !x$om2, "robust only",
ifelse(x$om2 & !x$outlierIndex, "standard only", "no outlier")))
MD <- NULL
RMD <- NULL
limit <- NULL
name <- NULL
w <- NULL
df <- data.frame("RMD" = x$mahalDist,
"MD" = x$m2,
"limit" = rep(x$limit, n),
"outlier" = outlier,
"name" = 1:n)
gg <- ggplot(df, aes(x = MD, y = RMD, colour = outlier ))
gg <- gg + geom_point(data=subset(df, RMD <= limit & MD <= limit))
gg <- gg + geom_abline(intercept = 0, slope = 1, linetype = "dashed", colour="grey")
gg <- gg + geom_text(data=subset(df, RMD > limit | MD > limit), aes(MD, RMD, label=name))
gg <- gg + geom_vline(xintercept = x$limit, size=0.25) + geom_hline(yintercept = x$limit, size=0.25)
gg <- gg + theme_bw()
print(gg)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.