R/mvout.R

Defines functions print.mvout mvout

Documented in mvout print.mvout

mvout <- 
  function(x, method = c("none", "princomp", "factanal"), standardize = TRUE,
           robust = TRUE, direction = rep("two.sided", ncol(x)), thresh = 0.01, 
           keepx = TRUE, factors = 2, scores = c("regression", "Bartlett"), 
           rotation = c("none", "varimax", "promax"), ...){
    # robust detection of (directional) multivariate outliers
    # using minimum covariance determinant (MCD) estimator
    # Nathaniel E. Helwig
    # updated: 2025 Mar 27
    
    
    #########***#########   INITIAL CHECKS   #########***#########
    
    # check 'x'
    x <- as.matrix(x)
    n <- nrow(x)
    p <- ncol(x)
    if(n <= p) stop("Input 'x' needs to satisfy:  nrow(x) > ncol(x)")
    
    # check 'method'
    method <- as.character(method[1])
    method <- pmatch(method, c("none", "princomp", "factanal"))
    if(is.na(method)) stop("Invalid 'method' input. See help files.")
    method <- c("none", "princomp", "factanal")[method]
    if(method == "factanal" && p < 3L) stop("Need ncol(x) >= 3 when 'method' is 'factanal'")
    
    # check 'standardize'
    if(method == "princomp"){
      standardize <- as.logical(standardize[1])
      if(!any(standardize == c(TRUE, FALSE))) stop("Input 'standardize' must be logical.")
    } else {
      standardize <- TRUE
    }
    
    # check 'robust'
    robust <- as.logical(robust[1])
    if(!any(robust == c(TRUE, FALSE))) stop("Input 'robust' must be TRUE or FALSE.")
    
    # check 'direction'
    direction <- as.character(direction)
    if(length(direction) != p) {
      if(length(direction) != 1L){
        warning("The length of 'direction' does not match the number of columns of 'x',\nso the first element of 'direction' is being used for all variables.")
      }
      direction <- pmatch(direction[1], c("two.sided", "less", "greater"))
      if(is.na(direction)) stop("Invalid 'direction' input. See help files.")
      direction <- c("two.sided", "less", "greater")[direction]
      direction <- rep(direction, p)
    } else {
      for(j in 1:p){
        directionj <- pmatch(direction[j], c("two.sided", "less", "greater"))
        if(is.na(directionj)) stop("Invalid 'direction' input. See help files.")
        direction[j] <- c("two.sided", "less", "greater")[directionj]
      }
    }
    
    # check 'thresh'
    thresh <- as.numeric(thresh[1])
    if(thresh <= 0 | thresh >= 1) stop("Input 'thresh' must satisfy: 0 < thresh < 1")
    
    # check 'keepx'
    keepx <- as.logical(keepx[1])
    if(!any(keepx == c(TRUE, FALSE))) stop("Input 'keepx' must be TRUE or FALSE.")
    xin <- if(keepx) x else NULL
    
    # check 'factors'
    if(method != "none"){
      factors <- as.integer(factors[1])
      if(factors < 1L | factors > p) stop("Input 'factors' must satisfy: 1 <= factors <= ncol(x)")
    }
    
    # check scores
    if(method == "factanal"){
      scores <- as.character(scores[1])
      scores <- pmatch(scores, c("regression", "Bartlett"))
      if(is.na(scores)) stop("Invalid 'scores' input.")
      scores <- c("regression", "Bartlett")[scores]
    }
    scores.orig <- scores
    
    # check rotation
    if(method == "none"){
      rotation <- "none"
    } else {
      rotation <- as.character(rotation[1])
      rotation <- pmatch(rotation, c("none", "varimax", "promax"))
      if(is.na(rotation)) stop("Invalid 'rotation' input.")
      rotation <- c("none", "varimax", "promax")[rotation]
    }
    
    
    #########***#########   ROBUST Z-SCORES   #########***#########
    
    # calculate (robust) mean and covariance
    if(robust){
      rcov <- robustbase::covMcd(x, cor = standardize, ...)
    } else {
      rcov <- list(center = colMeans(x),
                   cov = cov(x),
                   mcd.wt = rep(1, n))
      if(standardize)  rcov$cor <- cor(x)
    }
    
    # robust center (and scale?)
    if(standardize){
      x <- scale(x, center = rcov$center, scale = sqrt(diag(rcov$cov)))
      orig.cov <- rcov$cov
      rcov$cov <- rcov$cor
    } else {
      x <- scale(x, center = rcov$center, scale = FALSE)
      orig.cov <- NULL
    }
    
    
    #########***#########   MAHALANOBIS DISTANCE   #########***#########
    
    # directional distances?
    directional = FALSE
    if(any(direction != "two.sided")){
      directional <- TRUE
      xorig <- x
      indx <- which(direction != "two.sided")
      for(j in indx){
        sgn <- ifelse(direction[j] == "less", -1, 1)
        x[,j] <- sgn * pmax(sgn * x[,j], 0)
      }
    }
    
    # robust distance calculation
    if(method == "none"){
      
      rdis <- mahalanobis(x, center = FALSE, cov = rcov$cov)
      
    } else if(method[1] == "princomp"){
      
      reig <- eigen(rcov$cov, symmetric = TRUE)
      uniquenesses <- diag(rcov$cov) - rowSums((reig$vectors[,1:factors,drop=FALSE] %*% diag(sqrt(reig$values[1:factors]), nrow = factors))^2)
      loadings <- reig$vectors[,1:factors,drop=FALSE]
      lsigns <- sign(colSums(loadings^3))
      if(any(lsigns == 0)) lsigns[lsigns == 0] <- 1
      loadings <- loadings %*% diag(lsigns, nrow = factors)
      scores <- x %*% loadings
      rdis <- mahalanobis(scores, center = FALSE, cov = diag(reig$values[1:factors], nrow = factors))
      if(directional) x <- xorig
      scores <- x %*% loadings %*% diag(1/sqrt(reig$values[1:factors]), nrow = factors)
      loadings <- loadings %*% diag(sqrt(reig$values[1:factors]), nrow = factors)
      p <- factors
      
    } else {
      
      rfac <- factanal(factors = factors, covmat = rcov$cov, 
                       n.obs = sum(rcov$mcd.wt), rotation = "none")
      uniquenesses <- rfac$uniquenesses
      loadings <- rfac$loadings
      lsigns <- sign(colSums(loadings^3))
      if(any(lsigns == 0)) lsigns[lsigns == 0] <- 1
      loadings <- loadings %*% diag(lsigns, nrow = factors)
      covmat <- tcrossprod(rfac$loadings) + diag(rfac$uniquenesses)
      rdis <- mahalanobis(x, center = FALSE, cov = covmat)
      if(directional) x <- xorig
      PsiInvLo <- diag(1 / uniquenesses, nrow = p, ncol = p) %*% loadings
      if(scores == "regression"){
        scores <- x %*% PsiInvLo %*% solve(diag(factors) + crossprod(loadings, PsiInvLo))
      } else {
        scores <- x %*% PsiInvLo %*% solve(crossprod(loadings, PsiInvLo))
      }
      
    } # end if(method == "none")
    
    
    #########***#########   OUTLIER DETECTION   #########***#########
    
    # robust critical value
    cval.in <- qchisq(1 - thresh, df = p)
    cval.out <- qf(1 - thresh, df1 = p, df2 = n - p) * ((n - 1) * p / (n - p))
    
    # robust outlier detection
    outlier <- ifelse(rdis > ifelse(rcov$mcd.wt, cval.in, cval.out), TRUE, FALSE)
    
    
    #########***#########   POSTPROCESSING   #########***#########
    
    # rotate loadings?
    cormat <- irot <- diag(factors)
    if(method != "none" && rotation != "none"){
      if(rotation == "varimax"){
        rot <- varimax(loadings)
        irot <- rot$rotmat
        loadings <- rot$loadings
        scores <- scores %*% rot$rotmat
      } else {
        rot <- promax(loadings)
        loadings <- rot$loadings
        irot <- solve(t(rot$rotmat))
        scores <- scores %*% irot
        cormat <- crossprod(irot)
      }
    }
    
    # return results
    if(!is.null(orig.cov)) rcov$cov <- orig.cov
    args <- list(x = xin, method = method, standardize = standardize,
                 direction = direction, thresh = thresh,
                 robust = robust, factors = factors, 
                 scores = scores.orig, rotation = rotation)
    if(method == "none"){
      res <- list(distance = rdis, 
                  outlier = outlier, 
                  mcd = rcov, 
                  args = args)
    } else {
      rownames(loadings) <- colnames(x)
      colnames(loadings) <- paste0("Factor", 1:ncol(loadings))
      rownames(scores) <- rownames(x)
      colnames(scores) <- paste0("Factor", 1:ncol(scores))
      res <- list(distance = rdis, 
                  outlier = outlier, 
                  mcd = rcov,
                  args = args, 
                  scores = scores,
                  loadings = loadings,
                  uniquenesses = uniquenesses,
                  invrot = irot,
                  cormat = cormat)
    } 
    class(res) <- "mvout"
    return(res)
    
  } # mvout

print.mvout <- function(x, ...){
  #cat("Multivariate Outliers Detection\n\n")
  cat(paste0(" \n ", round(100 * mean(x$outlier), 2), "% flagged as outliers\n"))
  cat("\n Mahalanobis Distances:\n")
  dq <- quantile(x$distance, probs = c(0.8, 0.85, 0.9, 0.95, 0.99))
  print(round(dq, 4))
  cat("\n")
  cat("       method: ", x$args$method, "\n")
  cat("  standardize: ", x$args$standardize, "\n")
  cat("       robust: ", x$args$robust, "\n")
  if(length(unique(x$args$direction)) == 1) {
    cat("    direction: ", x$args$direction[1], "\n")
  } else {
    cat("    direction: ", head(x$args$direction), "\n\n")
  }
  cat("       thresh: ", x$args$thresh, "\n\n")
}

Try the mvout package in your browser

Any scripts or data that you put into this service are public.

mvout documentation built on June 8, 2025, 10:41 a.m.