```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```r
library(fastMCD)
library(MASS)
library(robustbase)
library(tictoc)
```
To demonstrate the usage of `fastMCD`, we will first generate a matrix of observations from a multivariate Gaussian distribution with known center ($\mu$) and scatter ($\Sigma$). In this case, we will choose $\mu = 0_3$ and $\Sigma = \mathbb{1}_3$.
```r
set.seed(54513)
mu <- rep(0, 3)
sigma <- diag(rep(1, 3))
X <- mvrnorm(n = 20000, mu = mu, Sigma = sigma)
```
We will then replace 5% of these observations (1,000 of the 20,000) with observations from a multivariate Gaussian distribution with different parameters $\mu_1$ and $\Sigma_1$.
```r
mu1 <- rep(5, 3)
sigma1 <- matrix(c(1, 0.2, 0.4,
                   0.2, 3, 0.5,
                   0.4, 0.5, 1), nrow = 3)
outlier_index <- seq(from = 1, to = 20000, length.out = 1000)
X[outlier_index, ] <- mvrnorm(n = 1000, mu = mu1, Sigma = sigma1)
```
We can observe how the naive sample mean and sample covariance fail when these outliers are introduced.
```r
colMeans(X)
cov(X)
```
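The size of the bias is predictable: with contamination fraction $\epsilon = 0.05$ and outlier mean $\mu_1 = 5$, the expected sample mean is $(1 - \epsilon)\,\mu + \epsilon\,\mu_1 = 0.25$ in each coordinate. A quick check of this back-of-the-envelope figure:

```r
# Expected pull of the naive mean toward the outlier center:
# (1 - 0.05) * 0 + 0.05 * 5 = 0.25 per coordinate
eps <- 1000 / 20000
(1 - eps) * mu + eps * mu1
colMeans(X)  # close to 0.25 in each coordinate
```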
To obtain a robust estimate of the mean and covariance, we use `fastMCD`.

```r
res <- fastMCD(X)
res$center
res$cov
```
Our new estimates from `fastMCD` are much closer to the true parameters than the naive estimates.
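As a quick numerical check (a sketch comparing worst-case entrywise errors, not part of the package output):

```r
# Worst-case entrywise error in the location estimates
max(abs(res$center - mu))   # fastMCD
max(abs(colMeans(X) - mu))  # naive sample mean

# Worst-case entrywise error in the scatter estimates
max(abs(res$cov - sigma))   # fastMCD
max(abs(cov(X) - sigma))    # naive sample covariance
```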
To assess the accuracy of `fastMCD`, we can simulate data under a variety of parameters and compare the results of `fastMCD` to results obtained using `robustbase::covMcd`, which also provides an implementation of the FAST-MCD algorithm. Although estimates are not deterministic, we can expect `fastMCD` and `robustbase::covMcd` to give approximately equal results.
```r
X <- mvrnorm(2000, mu = rep(1, 3), Sigma = diag(rep(1, 3)))
X[sample(1:2000, 100), ] <- mvrnorm(100, mu = rep(10, 3), Sigma = diag(rep(1, 3)))
alpha <- ((2000 + 3 + 1) / 2) / 2000
res <- fastMCD(X)
res_rob <- covMcd(X, raw.only = TRUE, alpha = alpha)
all.equal(res$center, unname(res_rob$center), tolerance = 0.1)
all.equal(as.vector(res$cov), as.vector(res_rob$cov), tolerance = 0.1)
```
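The `alpha` passed to `covMcd` above corresponds to the maximal-breakdown subset size $h = \lfloor (n + p + 1)/2 \rfloor$ used by the raw MCD; as a quick sketch (assuming `fastMCD` uses this default subset size as well):

```r
# alpha expresses the MCD subset size h as a fraction of n;
# covMcd() derives its own h from alpha internally, so this is
# a close (here exact) match rather than a formal guarantee.
n <- 2000
p <- 3
h <- floor((n + p + 1) / 2)
c(h = h, alpha = h / n)
```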
Because FAST-MCD randomly samples subsets, its estimates will differ from run to run. One way to test whether the accuracy of `fastMCD` differs from that of `covMcd` is to conduct a simulation study and perform a statistical test. We simulate 50 matrices of 700 observations each, replace 100 rows of each with outliers, and report the mean absolute error of the covariance matrix for each method. We can then use a paired t-test to determine whether there is a significant difference in the mean error of covariance terms for `fastMCD` and `covMcd`.
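Concretely, the error metric for each simulated matrix is the mean absolute error over all $p^2 = 9$ covariance entries,

$$\mathrm{MAE}(\hat{\Sigma}) = \frac{1}{p^2} \sum_{j=1}^{p} \sum_{k=1}^{p} \bigl| \hat{\Sigma}_{jk} - \Sigma_{jk} \bigr|,$$

which is what `mean(abs(res$cov - diag(rep(1, 3))))` computes in the chunk below.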
```r
n_sim <- 50
res_f <- rep(0, n_sim)
res_r <- rep(0, n_sim)
n <- 700
alpha <- ((n + 3 + 1) / 2) / n
for (i in 1:n_sim) {
  X <- mvrnorm(n, mu = rep(1, 3), Sigma = diag(rep(1, 3)))
  X[sample(1:n, 100), ] <- mvrnorm(100, mu = rep(3, 3), Sigma = diag(rep(1, 3)))
  res <- fastMCD(X)
  resrob <- covMcd(X, raw.only = TRUE, alpha = alpha)
  res_f[i] <- mean(abs(res$cov - diag(rep(1, 3))))
  res_r[i] <- mean(abs(resrob$cov - diag(rep(1, 3))))
}
t.test(x = res_f, y = res_r, paired = TRUE)
```
We may repeat this procedure under other parameter schemes. For example, we may use 2,000 observations, a random mean vector drawn uniformly from $[-10, 10]$, and a randomly generated covariance matrix, with outliers coming from the same distribution with a shifted mean vector.
```r
n_sim <- 50
res_f <- rep(0, n_sim)
res_r <- rep(0, n_sim)
n <- 2000
alpha <- ((n + 3 + 1) / 2) / n
mu <- runif(3, min = -10, max = 10)  # uniform on [-10, 10], matching the text
Sigma <- matrix(runif(9, 0, 3), 3)
Sigma <- t(Sigma) %*% Sigma  # crossproduct guarantees a valid covariance matrix
for (i in 1:n_sim) {
  X <- mvrnorm(n, mu = mu, Sigma = Sigma)
  X[sample(1:n, 100), ] <- mvrnorm(100, mu = mu + rnorm(3), Sigma = Sigma)
  res <- fastMCD(X)
  resrob <- covMcd(X, raw.only = TRUE, alpha = alpha)
  res_f[i] <- mean(abs(res$cov - Sigma))
  res_r[i] <- mean(abs(resrob$cov - Sigma))
}
t.test(x = res_f, y = res_r, paired = TRUE)
```
Once again, we fail to reject the null hypothesis of no difference in the mean absolute error of the estimated covariance terms between `fastMCD` and `covMcd`.
Using the estimated covariance and mean parameters, we may compute Mahalanobis distances for each observation and label observations with $d^2 > \chi^2_{p,\,1-\epsilon}$ as outliers, where $\epsilon$ is the expected outlier rate. We can then compare the classifications from the two methods.
```r
n <- 3000
alpha <- ((n + 3 + 1) / 2) / n
mu <- runif(3, min = -10, max = 10)  # random mean vector, as in the previous scheme
Sigma <- matrix(runif(9, 0, 3), 3)
Sigma <- t(Sigma) %*% Sigma
X <- mvrnorm(n, mu = mu, Sigma = Sigma)
outlier_index <- sample(1:n, 100)
X[outlier_index, ] <- mvrnorm(100, mu = mu + rnorm(3), Sigma = Sigma)
res <- fastMCD(X)
res_b <- covMcd(X, raw.only = TRUE, alpha = alpha)
```
```r
y <- rep(FALSE, n)
y[outlier_index] <- TRUE
d <- mahalanobis(X, center = res$center, cov = res$cov)
y_pred <- d > qchisq(p = 1 - 100/3000, df = 3)
table(y, y_pred)
```
```r
d_b <- mahalanobis(X, center = res_b$center, cov = res_b$cov)
y_pred_b <- d_b > qchisq(p = 1 - 100/3000, df = 3)
table(y, y_pred_b)
```
We observe that the contingency tables of `fastMCD` and `covMcd` are quite consistent.
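Beyond comparing the tables by eye, we can check agreement directly (a quick sketch, not part of either package):

```r
# Fraction of observations given the same outlier label by both methods
mean(y_pred == y_pred_b)

# Agreement of each method with the true labels, for reference
mean(y_pred == y)
mean(y_pred_b == y)
```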
We can get a sense of the efficiency of `fastMCD` by comparing it with `covMcd`.
```r
X <- mvrnorm(1e3, mu = mu, Sigma = Sigma)
tic()
res <- fastMCD(X)
toc()
```
```r
alpha <- ((1e3 + 3 + 1) / 2) / 1e3
tic()
res <- covMcd(X, raw.only = TRUE, alpha = alpha)
toc()
```
Let's try another comparison with a larger sample size.
```r
X <- mvrnorm(1e5, mu = mu, Sigma = Sigma)
tic()
res <- fastMCD(X)
toc()
```
```r
alpha <- ((1e5 + 3 + 1) / 2) / 1e5
tic()
res <- covMcd(X, raw.only = TRUE, alpha = alpha)
toc()
```
We see that `fastMCD` is not quite as efficient as `covMcd`, although it clearly improves on the combinatorial complexity of an exhaustive MCD approach.
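To see why an exhaustive search is hopeless, note that exact MCD would have to evaluate every size-$h$ subset. A quick illustration of that combinatorial growth:

```r
# Number of h-subsets an exhaustive MCD search would need to
# evaluate, with h = floor((n + p + 1) / 2) and p = 3:
choose(50, 27)    # n = 50:  on the order of 1e14 subsets
choose(100, 52)   # n = 100: roughly 9e28 subsets
```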