cvreg: Cross Validation and Robust Estimation Utilities

Documented in chisqPlot ddPlot

#' Distance-Distance Plot
#'
#' @description This function allows you to input a data matrix, along with a robust estimate
#' of the multivariate center and robust estimate of the covariance. The standard mean and covariance
#' estimates are calculated from the data matrix, and a scatterplot of the mahalanobis distances for
#' each row based on the standard estimate and robust estimates returned.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param interact if set to TRUE, allows you to label the outlying points by clicking on them. defaults to FALSE.
#'
#' @return a plot
#' @export
#'
#' @examples
#' out <- cov.ogk(x, all = T)
#' ddPlot(x, out$center, out$cov)
#'
ddPlot <- function(x, rmu, rcov, interact = F){
    x <- as.matrix(x); n <- nrow(x); p <- ncol(x)
    distrob <- sqrt(mahalanobis_dist(x, center=rmu, cov=rcov))
    cn <- outlier.critval(x, rmu, rcov, alpha = 0.975)$cn
    distcla <- sqrt(mahalanobis_dist(x, center=colMeans(x), cov=cov(x) * ((n-1)/n)))
    plot(distcla, distrob, main="Distance-Distance Plot", xlab="MD", ylab="RD", type="n")
    if(cn != Inf) { alpha <- sqrt(c(cn, qchisq(c(0.75,0.5,0.25),ncol(x)))) }
    else { alpha <- sqrt(qchisq(c(0.975,0.75,0.5,0.25),ncol(x))) }
    abline(h=alpha[1])
    abline(v=alpha[1])
    abline(a=0, b=1)
    lpch <- c(17,17,18,19,16)
    lcex <- rev(c(1,1,1,1,1))
    lalpha <- length(alpha)
    xs <- scale(x) - min(scale(x))
    eucl <- sqrt(apply(xs^2, 1, sum))
    colors <- colorRampPalette(colors = c("#06cf3f", "#066acf", "#09e274", "#0a0af5", "#5C31CE",
                                          "#f609ab", "#F6094F", "#f60934", "#EC1600", "#ff1800"),
                              space = "Lab", bias = 1.25, interpolate = "spline")
    rbcol <- paste0(colors(nrow(x)), "A1")[as.integer(cut(eucl,nrow(x),labels=1:nrow(x)))]
    rd <- distrob

    for(j in 1:lalpha) {
        if(j==1) {
            points(distcla[rd>=alpha[j]], distrob[rd>=alpha[j]], pch=lpch[j],cex=lcex[j],col=rbcol[rd>=alpha[j]])
        }
        if (j>1 & j<lalpha) points(distcla[rd<alpha[j-1] & rd>=alpha[j]], distrob[rd<alpha[j-1] & rd>=alpha[j]], cex=lcex[j],pch=lpch[j], col=rbcol[rd<alpha[j-1] & rd>=alpha[j]])
        if (j==lalpha){
            points(distcla[rd<alpha[j-1] & rd>=alpha[j]], distrob[rd<alpha[j-1] & rd>=alpha[j]], cex=lcex[j],pch=lpch[j], col=rbcol[rd<alpha[j-1] & rd>=alpha[j]])
            points(distcla[rd<alpha[j]], distrob[rd<alpha[j]], pch=lpch[j+1],cex=lcex[j+1], col=rbcol[rd<alpha[j]])
        }
    }
    o <- ( rd > min(sqrt(cn), sqrt(qchisq(0.975, dim(x)[2]) ) ) )
    l <- list(outliers = o, md.cla = distcla, md.rob=distrob)

    if (interact) identify(distcla, distrob)

}


#' Chi-Squared Plot for Outlier Detection
#'
#' @description This plots the robust Mahalanobis distances from a robust estimate
#' of multivariate location and covariance against the theoretical Chi-squared quantiles
#' for a matrix with p dimensions. This functions analagously to the univariate qqplot.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param alpha quantile of chi-squared distribution for declaring outliers. defaults to 0.975
#'
#' @return a plot
#' @export
#' @examples
#' out <- cov.ogk(x, all = T)
#' chisqPlot(x, out$center, out$cov)
#'
chisqPlot <- function(x, rmu, rcov, alpha=0.975) {
        dist <- mahalanobis_dist(x, rmu, rcov)
        s <- sort(dist, index=TRUE)
        q <- (0.5:length(dist))/length(dist)
        qchi <- qchisq(q, df=ncol(x))
        cutoff <- qchisq(alpha, df = ncol(x))
        good <- which(s$x <= cutoff)
        bad <- which(s$x >= cutoff)
        plot(s$x[good], qchi[good], xlab="RD",
             ylab=paste0("Chi-Squared (df=", ncol(x), " quantiles)"),
             main="Chi-Squared Plot", col="#0644ccCC",
             xlim=c(0,max(s$x)), ylim=c(0,max(qchi))
           )
        points(s$x[bad], qchi[bad], col = "#f3054eCC")

}


#' Panel of plots displaying multivariate outliers among the univariate variables.
#'
#' @description Multivariate outliers identify rows of data that contain unusual combinations
#' of values. As such, the definition differs from a univariate outlier as being an extreme value.
#' The value of the ith entry in the jth column may belong to a multivariate outlier, but not be a
#' univariate outlier, and vice versa. This visualizes the distinction. In the displayed plot
#' all variables are centered by their respective medians and scaled by their respective average
#' absolute deviations. This standardization is done only after all calculations are done so that
#' all variables are properly aligned with the y-axis of the plot. The x-axis consists of random
#' coordinates, so the plot may differ if created multiple times. No information is lost or changed.
#' The position along the x-axis is randomized so that the values do not fall on a straight line.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param rmu robust multivariate center
#' @param rcov robust covariance matrix
#' @param plot if TRUE (the default), the results are plotted using ggplot2. If you desire to make
#' your own plot, plot can be changed to FALSE to return the information used by ggplot2.
#' @param nrow the number of rows for ggplot2's facet_wrap. defaults to 1.
#' @param ncol optional argument to specify the number of columns for ggplot2's facet wrap.
#'
#' @return a plot or a data frame.
#' @export
#'
#' @examples
#' out <- cov.ogk(x, all = T)
#' outlierPlot(x, out$center, out$cov)
#'
outlierPlot <- function (x, rmu = NULL, rcov = NULL, plot = T, nrow = 1, ncol = NULL) {

    if (!is.matrix(x) && !is.data.frame(x))
        stop("x must be matrix or data.frame")
    if (ncol(x) < 2)
        stop("x must be at least two-dimensional")

    dist <- mahalanobis_dist(x, center = rmu, cov = rcov)

    xarw <- outlier.critval(x, rmu, rcov, alpha = 0.975)
    if (xarw$cn != Inf) {
        alpha <- sqrt(c(xarw$cn, qchisq(c(0.75, 0.5, 0.25), ncol(x))))
    }
    else {
        alpha <- sqrt(qchisq(c(0.975, 0.75, 0.5, 0.25), ncol(x)))
    }
    sx <- matrix(NA, nrow = nrow(x), ncol = ncol(x))
    for (i in 1:ncol(x)) sx[, i] <- (x[, i] - xarw$m[i])/sqrt(xarw$c[i, i])
    r <- range(sx)
    xs <- scale(x) - min(scale(x))
    eucl <- sqrt(apply(xs^2, 1, sum))
    o <- (sqrt(dist) > sqrt(xarw$cn))
    o <- sapply(o, function(i) ifelse(isTRUE(i), "outlier", "typical"))
    l <- list(outliers = o, md = sqrt(dist))
    xlong <- tidyr:::pivot_longer(cbind.data.frame(Scale2(x, hdmedian, aad), observation = 1:nrow(x), outlier = o, md = l$md), cols = colnames(x))

     xlong <- cbind.data.frame(xlong, randval = runif(nrow(xlong), -1, 1))
    colnames(xlong) <- c("observation","outlier", "MD", "variable", "value", "randval")
    xlong$outlier <- factor(xlong$outlier,levels = c("typical", "outlier"))
    if (plot){
        ggplot(xlong, aes(x = randval, y = value, color = outlier, shape = outlier, alpha = MD)) +
            facet_wrap(~ variable, nrow = nrow, ncol = ncol) +
            geom_point() +
            scale_colour_manual(values = c(typical = "#0000ff", outlier = "#ff1800")) +
            scale_alpha_continuous(range = c(0.25, 1), guide = F) +
            scale_shape_manual(values = c(16, 17), guide = F) +
            theme(axis.title.x=element_blank(),
                  axis.text.x=element_blank(),
                  axis.ticks.x=element_blank())
    }
    else return(xlong)
}

outlier.critval <-function(x, center, cov, alpha=0.975){
    n <- nrow(x); p <- ncol(x)
    if (p<=10){pcrit <- (0.241-0.0029*p)/sqrt(n)}
    if (p>=11){pcrit <- (0.252-0.0018*p)/sqrt(n)}
    if (missing(alpha)) {delta<-qchisq(0.975, p)} else {delta <- qchisq(alpha, p)}
    d2<-mahalanobis_dist(x, center, cov);d2ord <- sort(d2)
    dif <- pchisq(d2ord,p) - (0.5:n)/n;i <- (d2ord>=delta) & (dif>0)
    if (sum(i)==0) alfan<-0 else alfan<-max(dif[i])
    if (alfan<pcrit) alfan<-0
    if (alfan>0) cn<-max(d2ord[n-ceiling(n*alfan)],delta) else cn<-Inf
    w <- d2<cn
    if(sum(w)==0) {m <- center; c <- cov}
    else {
        m  <- colMeans(x[w,])
        c1 <- as.matrix(x - rep(1, n) %*% t(m))
        c  <- crossprod(c1*w, c1)/sum(w)
    }
    list(m = m, c = c, cn = cn, w = w)
}