#' Distance-Distance Plot
#'
#' @description This function allows you to input a data matrix, along with a robust estimate
#' of the multivariate center and robust estimate of the covariance. The standard mean and covariance
#' estimates are calculated from the data matrix, and a scatterplot of the mahalanobis distances for
#' each row based on the standard estimate and robust estimates returned.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param interact if set to TRUE, allows you to label the outlying points by clicking on them. defaults to FALSE.
#'
#' @return a plot
#' @export
#'
#' @examples
#' out <- cov.ogk(x, all = T)
#' ddPlot(x, out$center, out$cov)
#'
ddPlot <- function(x, rmu, rcov, interact = F){
x <- as.matrix(x); n <- nrow(x); p <- ncol(x)
distrob <- sqrt(mahalanobis_dist(x, center=rmu, cov=rcov))
cn <- outlier.critval(x, rmu, rcov, alpha = 0.975)$cn
distcla <- sqrt(mahalanobis_dist(x, center=colMeans(x), cov=cov(x) * ((n-1)/n)))
plot(distcla, distrob, main="Distance-Distance Plot", xlab="MD", ylab="RD", type="n")
if(cn != Inf) { alpha <- sqrt(c(cn, qchisq(c(0.75,0.5,0.25),ncol(x)))) }
else { alpha <- sqrt(qchisq(c(0.975,0.75,0.5,0.25),ncol(x))) }
abline(h=alpha[1])
abline(v=alpha[1])
abline(a=0, b=1)
lpch <- c(17,17,18,19,16)
lcex <- rev(c(1,1,1,1,1))
lalpha <- length(alpha)
xs <- scale(x) - min(scale(x))
eucl <- sqrt(apply(xs^2, 1, sum))
colors <- colorRampPalette(colors = c("#06cf3f", "#066acf", "#09e274", "#0a0af5", "#5C31CE",
"#f609ab", "#F6094F", "#f60934", "#EC1600", "#ff1800"),
space = "Lab", bias = 1.25, interpolate = "spline")
rbcol <- paste0(colors(nrow(x)), "A1")[as.integer(cut(eucl,nrow(x),labels=1:nrow(x)))]
rd <- distrob
for(j in 1:lalpha) {
if(j==1) {
points(distcla[rd>=alpha[j]], distrob[rd>=alpha[j]], pch=lpch[j],cex=lcex[j],col=rbcol[rd>=alpha[j]])
}
if (j>1 & j<lalpha) points(distcla[rd<alpha[j-1] & rd>=alpha[j]], distrob[rd<alpha[j-1] & rd>=alpha[j]], cex=lcex[j],pch=lpch[j], col=rbcol[rd<alpha[j-1] & rd>=alpha[j]])
if (j==lalpha){
points(distcla[rd<alpha[j-1] & rd>=alpha[j]], distrob[rd<alpha[j-1] & rd>=alpha[j]], cex=lcex[j],pch=lpch[j], col=rbcol[rd<alpha[j-1] & rd>=alpha[j]])
points(distcla[rd<alpha[j]], distrob[rd<alpha[j]], pch=lpch[j+1],cex=lcex[j+1], col=rbcol[rd<alpha[j]])
}
}
o <- ( rd > min(sqrt(cn), sqrt(qchisq(0.975, dim(x)[2]) ) ) )
l <- list(outliers = o, md.cla = distcla, md.rob=distrob)
if (interact) identify(distcla, distrob)
}
#' Chi-Squared Plot for Outlier Detection
#'
#' @description This plots the robust Mahalanobis distances from a robust estimate
#' of multivariate location and covariance against the theoretical Chi-squared quantiles
#' for a matrix with p dimensions. This functions analagously to the univariate qqplot.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param alpha quantile of chi-squared distribution for declaring outliers. defaults to 0.975
#'
#' @return a plot
#' @export
#' @examples
#' out <- cov.ogk(x, all = T)
#' chisqPlot(x, out$center, out$cov)
#'
chisqPlot <- function(x, rmu, rcov, alpha=0.975) {
dist <- mahalanobis_dist(x, rmu, rcov)
s <- sort(dist, index=TRUE)
q <- (0.5:length(dist))/length(dist)
qchi <- qchisq(q, df=ncol(x))
cutoff <- qchisq(alpha, df = ncol(x))
good <- which(s$x <= cutoff)
bad <- which(s$x >= cutoff)
plot(s$x[good], qchi[good], xlab="RD",
ylab=paste0("Chi-Squared (df=", ncol(x), " quantiles)"),
main="Chi-Squared Plot", col="#0644ccCC",
xlim=c(0,max(s$x)), ylim=c(0,max(qchi))
)
points(s$x[bad], qchi[bad], col = "#f3054eCC")
}
#' Panel of plots displaying multivariate outliers among the univariate variables.
#'
#' @description Multivariate outliers identify rows of data that contain unusual combinations
#' of values. As such, the definition differs from a univariate outlier as being an extreme value.
#' The value of the ith entry in the jth column may belong to a multivariate outlier, but not be a
#' univariate outlier, and vice versa. This visualizes the distinction. In the displayed plot
#' all variables are centered by their respective medians and scaled by their respective average
#' absolute deviations. This standardization is done only after all calculations are done so that
#' all variables are properly aligned with the y-axis of the plot. The x-axis consists of random
#' coordinates, so the plot may differ if created multiple times. No information is lost or changed.
#' The position along the x-axis is randomized so that the values do not fall on a straight line.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param plot if TRUE (the default), the results are plotted using ggplot2. If you desire to make
#' your own plot, plot can be changed to FALSE to return the information used by ggplot2.
#' @param nrow the number of rows for ggplot2's facet_wrap. defaults to 1.
#' @param ncol optional argument to specify the number of columns for ggplot2's facet wrap.
#'
#' @return a plot or a data frame.
#' @export
#'
#' @examples
#' out <- cov.ogk(x, all = T)
#' outlierPlot(x, out$center, out$cov)
#'
outlierPlot <- function (x, rmu = NULL, rcov = NULL, plot = T, nrow = 1, ncol = NULL) {
if (!is.matrix(x) && !is.data.frame(x))
stop("x must be matrix or data.frame")
if (ncol(x) < 2)
stop("x must be at least two-dimensional")
dist <- mahalanobis_dist(x, center = rmu, cov = rcov)
xarw <- outlier.critval(x, rmu, rcov, alpha = 0.975)
if (xarw$cn != Inf) {
alpha <- sqrt(c(xarw$cn, qchisq(c(0.75, 0.5, 0.25), ncol(x))))
}
else {
alpha <- sqrt(qchisq(c(0.975, 0.75, 0.5, 0.25), ncol(x)))
}
sx <- matrix(NA, nrow = nrow(x), ncol = ncol(x))
for (i in 1:ncol(x)) sx[, i] <- (x[, i] - xarw$m[i])/sqrt(xarw$c[i, i])
r <- range(sx)
xs <- scale(x) - min(scale(x))
eucl <- sqrt(apply(xs^2, 1, sum))
o <- (sqrt(dist) > sqrt(xarw$cn))
o <- sapply(o, function(i) ifelse(isTRUE(i), "outlier", "typical"))
l <- list(outliers = o, md = sqrt(dist))
xlong <- tidyr:::pivot_longer(cbind.data.frame(Scale2(x, hdmedian, aad), observation = 1:nrow(x), outlier = o, md = l$md), cols = colnames(x))
xlong <- cbind.data.frame(xlong, randval = runif(nrow(xlong), -1, 1))
colnames(xlong) <- c("observation","outlier", "MD", "variable", "value", "randval")
xlong$outlier <- factor(xlong$outlier,levels = c("typical", "outlier"))
if (plot){
ggplot(xlong, aes(x = randval, y = value, color = outlier, shape = outlier, alpha = MD)) +
facet_wrap(~ variable, nrow = nrow, ncol = ncol) +
geom_point() +
scale_colour_manual(values = c(typical = "#0000ff", outlier = "#ff1800")) +
scale_alpha_continuous(range = c(0.25, 1), guide = F) +
scale_shape_manual(values = c(16, 17), guide = F) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
}
else return(xlong)
}
outlier.critval <-function(x, center, cov, alpha=0.975){
n <- nrow(x); p <- ncol(x)
if (p<=10){pcrit <- (0.241-0.0029*p)/sqrt(n)}
if (p>=11){pcrit <- (0.252-0.0018*p)/sqrt(n)}
if (missing(alpha)) {delta<-qchisq(0.975, p)} else {delta <- qchisq(alpha, p)}
d2<-mahalanobis_dist(x, center, cov);d2ord <- sort(d2)
dif <- pchisq(d2ord,p) - (0.5:n)/n;i <- (d2ord>=delta) & (dif>0)
if (sum(i)==0) alfan<-0 else alfan<-max(dif[i])
if (alfan<pcrit) alfan<-0
if (alfan>0) cn<-max(d2ord[n-ceiling(n*alfan)],delta) else cn<-Inf
w <- d2<cn
if(sum(w)==0) {m <- center; c <- cov}
else {
m <- colMeans(x[w,])
c1 <- as.matrix(x - rep(1, n) %*% t(m))
c <- crossprod(c1*w, c1)/sum(w)
}
list(m = m, c = c, cn = cn, w = w)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.