cellMCD examples"

knitr::opts_chunk$set(
 fig.width = 8 ,
 fig.height = 12,
 fig.align ='center'
)

Introduction

This file contains some examples of the use of the cellMCD function. It reproduces all the figures in the section on real data examples of the report "The cellwise MCD estimator" by J. Raymaekers and P.J. Rousseeuw.

library("cellWise")

Top Gear data

We chose this dataset because both its variables and its cases are fairly easy to interpret. We start by loading the data, select the numerical variables, and rename some of them for nicer visualizations.

library(robustHD)
data(TopGear)
dim(TopGear)
rownames(TopGear) = paste(TopGear[,1],TopGear[,2])
colnames(TopGear)
TopGear = TopGear[c(5,7,9:17)] # objective numerical variables
colnames(TopGear)
colnames(TopGear) = c("price", "displacement", "horsepower",
                      "torque", "acceleration", "top speed",
                      "miles/gallon", "weight", "length",
                      "width", "height")

Now we run the checkDataSet routine, to remove any observations with more than half of the variables missing. We additionally rescale the first variable to avoid huge numbers.

out = checkDataSet(TopGear, silent = TRUE)
Xorig = out$remX
dim(Xorig) 
Xorig[,1] = Xorig[,1]/1000
head(Xorig)

Finally, we transform some variables (price, displacement, horsepower, torque, top speed) to get roughly gaussianity in the center:

X = Xorig
X[,c(1:4,6)] = log(Xorig[,c(1:4,6)])

There are still some NAs left in the data, most in weight:

X = as.matrix(X)
colSums(is.na(X))

Now the preprocessing is over, and we can start with analyzing the data further. First we standardize X to Z for marginal outliers. There are indeed quite a few marginal outliers, especially in the MPG variable

out = estLocScale(X)
Z = scale(X,center=out$loc, scale=out$scale)
cutf = sqrt(qchisq(p=0.99,df=1))
boxplot(Z)

As a reference, we can use casewise MCD and inspect the resulting robust distances:

out <- robustbase::covMcd(X) 
plot(sqrt(out$mah))
abline(h = sqrt(qchisq(0.99, df = 11)))

Now we run cellMCD:

cellout <- cellMCD(X)

We immediately see 3 huge residuals in MPG, and one in acceleration.

Zres = cellout$Zres
boxplot(Zres) 
qqnorm(as.vector(Zres)); abline(0,1) 

How many cells were flagged by cellMCD? In total, 324 cells were flagged. We also see that 151 cars do not have a single flagged cell.

W = cellout$W
sum(as.vector(1-W)) 
rowS = rowSums(1-W)
sum(rowS == 0) # 151 cars do not have a single flagged cell.

Now look at some plots by variable. We start with price:

j = 1 # price
# Index plot of Zres:
# (ids = plot.cellMCD(cellout, j, type="index", identify=T)$ids)
ids = c(6,54,50,195,212,222,221)
Zres[ids,j,drop=F]
plot_cellMCD(cellout, j, type="index", ids=ids)

We saw that price has quite a few outliers on the right. Camaro and Proton cost less than would be expected based on their other characteristics.

# Plot Zres versus X:
plot_cellMCD(cellout, j, type="Zres/X", ids=ids)
# Plot Zres versus predicted cell:
plot_cellMCD(cellout, j, type="Zres/pred", ids=ids)
# Plot observed cell versus predicted cell:
# (ids = plot.cellMCD(cellout, j, type="X/pred", identify=T)$ids)
ids = c(179,3,212,218,54,6,222,221,50,195)
Xorig[ids,j,drop=F]
plot_cellMCD(cellout, j, type="X/pred", vband=T, hband=T, ids=ids)

The code below reproduces Figure 2 of the paper.

###########
## Figure 2
###########

# pdf(file="TopGear_fig2_test.pdf",width=5.2,height=10.4)
par(mfrow=c(2,1)) # (nrows,ncols)
rn = rownames(X) # for object labels
#
######### 1. HP
par(mar = c(3.6, 3.5, 2.8, 1.0))
j = 3
ids = c(52,59,70,218)
labs = rn[ids]; labs[1] = "Caterham"
xvals = c(52, 59, 70, 218) 
yvals = c(-6.20, -8.35, -4.55, -4.8)
plot_cellMCD(cellout, j, type="index",
             ylab="standardized residual of log(horsepower)",
             main="standardized residual of log(horsepower)")
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
#
######### 2. Length
par(mar = c(3.6, 3.5, 2.8, 1.0))
#
j = 9
ids = c(3,50,51,119,144,195,218,232,249)
labs = rn[ids]
labs[1] = "Aston Cygnet"; labs[3] = "Caterham"
xvals = c(2560, 5050, 3100, 5252, 4245, 4605, 2360, 2700, 3330) 
yvals = c(-4.46, -4.21, -3.91, 4.20, -4.67, -5.43,
          -5.46, -6.45, -5.26)
plot_cellMCD(cellout, j, type="Zres/X",
             main="standardized residual versus X for length",
             xlab="length (mm)", xlim=c(1970,6000), 
             ylim=c(-6.51,4.88)) #,ids=ids)
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
# dev.off()

Now we reproduce Figure 3.

# pdf(file="TopGear_fig3_test.pdf",width=5.2,height=10.4)
par(mfrow=c(2,1))
rn = rownames(X) # for object labels
#
######### 1. weight
par(mar = c(3.6, 3.5, 2.8, 1.0))
j = 8
ids = c(29,51,144,163,183,197,195)
pred = cellout$preds
labs = rn[ids]
labs[2] = "Caterham"
labs[4] = "Mercedes-Benz G"
xvals = c(1490, 1000, 1890, 1575, 1757,  810, 2250) 
yvals = c(5.63,-5.07,-5.10,4.52,-6.19,-6.60,-8.20)
plot_cellMCD(cellout, j, type="Zres/pred", # vband=F,
             xlab = "predicted weight (kg)") # , ids=ids)
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
#
######### 2. top speed
par(mar = c(3.6, 3.5, 2.8, 1.0))
j = 6
ids = c(50,42,52,79,195,218,219,232,258)
labs = rn[ids]
labs[3] = "Caterham"
xvals = c(5.145, 4.937, 5.083, 5.065, 5.076, 4.90, 4.75, 4.63, 5.038)
yvals = c(5.53, 4.53, 4.72, 5.345, 5.44, 3.97, 4.43, 4.30, 4.61)
plot_cellMCD(cellout, j, type="X/pred", vband=F, vlines=F,
             xlab="predicted log(top speed)",
             ylab="observed log(top speed)",
             main="log(top speed) versus its prediction") #, ids=ids)
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
# dev.off()

Finally, we create Figure 4.

############
### Figure 4
############

# pdf(file="TopGear_fig4_test.pdf",width=5.2,height=10.4)
par(mfrow=c(2,1)) # (nrows,ncols)
rn = rownames(X) # for object labels
#
######### 1. MPG versus torque
par(mar = c(3.6, 3.5, 2.8, 1.0))
ids=c(42,258,50) 
labs = rn[ids]
xvals = c(5.56, 6.28, 7.27) 
yvals = c(470, 235, -16)
plot_cellMCD(cellout, type = "bivariate", horizvar = 4, 
             vertivar = 7, ids = ids, 
             xlim=c(3.5,7.7), ylim = c(-27,480), 
             main = "miles/gallon versus log(torque)",
             xlab = "log(torque)",
             ylab = "miles/gallon",
             opacity=0.5, labelpoints=F)
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
#
######### 2. Width versus acceleration
par(mar = c(3.6, 3.5, 2.8, 1.0))
ids = c(218,144,233,51,136,179) 
labs = rn[ids]
labs[3] = "Ssangyong" # name was too long for plot 
labs[4] = "Caterham" 
labs[5] = "Land Rover"
xvals = c(0.0,  -3.0, -3.23,  0.5, 14.2, 15.9) 
yvals = c(1200, 1850, 1915, 1688, 2039, 1445)
plot_cellMCD(cellout,  type = "bivariate", horizvar=5, 
             vertivar=10, ids=ids,
             xlab = "acceleration (seconds to 62 mph)",
             ylab = "width (mm)", xlim = c(-6.1,20),
             ylim = c(1200, 2150),
             opacity=0.5, labelpoints=F)
text(x = xvals, y = yvals, labels = labs, cex = 0.9)
# dev.off()

par(mfrow=c(1,1))

Comparison of cellMCD with covMCD

In the following, we compare covMCD and cellMCD on benchmark datasets with casewise outliers. Since some datasets have skewed variables, we always run X = transfo(X)$Y before both estimators. We require n/d >= 5, and we will stop if some variables have 25% or more outliers. The easiest way to compare both fits is to plot(rd,cd) and report cor(rd,cd).

The robustbase library contains the datasets as well as the covMCD function.

library(robustbase)

Aircraft data without response (n=23, d=4)

In this dataset we obtain a very high correlation (over 95%) between the robust distances based on covMCD and cellMCD.

data(aircraft)
X      <- as.matrix(aircraft)[, 1:4]
pairs(X) 
X      <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd     <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975,ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)

cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
plot(rd, cd)
round(cor(rd, cd),3)

Aircraft with response (n=23, d=5)

When we add the response to the Aircraft dataset, we have n/d < 5. Therefore, the data is no longer suitable for cellMCD.

data(aircraft)
X <- as.matrix(aircraft)
nrow(X) / ncol(X)

There is also another issue. The response variable has more than 25% marginal outliers. This leads to an error

cellout <- cellMCD(X)

alcohol (n=44, d=7)

Here we obtain a correlation between rd and cd of close to 90%. Note that covMCD has a very small eigenvalue (of the order 1e-7). The cellMCD hits its lower bound on the eigenvalues and is thus regularized.

data(alcohol)
X <- as.matrix(alcohol)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
eigen(rowout$cov)$values
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)

cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd),3)  

Animals2 (n=65, d=2)

The original Animals dataset (n=28) is in package MASS and gives a very similar result.

data(Animals2)
X <- as.matrix(log(Animals2))
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
plot(rd, cd)
round(cor(rd, cd), 3) 

bushfire (n=38, d=5)

This dataset yields again a very high correlation between rd and cd of over 97%.

data(bushfire)
X <- as.matrix(bushfire)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X,center=rowout$center,cov=rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X) 
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3)

cloud (n = 19, d = 2)

This small dataset does not really contain outliers. No cells are flagged, and the results of covMCD and cellMCD are very similar

data(cloud)
X <- as.matrix(cloud)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cellout <- cellMCD(X) 

cd <- sqrt(mahalanobis(X,center=cellout$mu,cov=cellout$S))
round(cor(rd,cd),3) 

delivery without response n=25, d=2

Another example without real outliers. The results of cellMCD and covMCD are very close.

data(delivery)
X <- as.matrix(delivery)[, 1:2]
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

delivery with response n=25, d=3

Now there is a small difference between cellMCD and covMCD. The output of cellMCD is more informative, pointing to the likely source of outlyingness.

data(delivery)
X <- as.matrix(delivery)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X) 
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

exAM (n=12, d=2)

Very similar result between covMCD and cellMCD in terms of location and covariance estimates, and only one suspicious observation is identified. The cellMCD points to the most likely source (i.e. cell) causing the outlyingness of this observation.

data(exAM)
X <- as.matrix(exAM)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3)

hbk without response (n=75, d=3)

Here we find perfect correlation between rd and cd.

data(hbk)
X <- as.matrix(hbk)[, 1:3]
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

hbk with response (n=75, d=4)

Also here we find perfect correlation between rd and cd.

data(hbk)
X <- as.matrix(hbk)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3)

kootenay (n=13, d=2)

Perfect correlation between rd and cd.

data(kootenay)
X <- as.matrix(kootenay)
X <- transfo(X)$Y 
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)
cd = sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

lactic (n=13, d=2)

This data set is too discrete. It has only 5 unique values for the first variable. This runs into an error:

data(lactic)
X <- as.matrix(lactic)
X <- transfo(X)$Y

milk (n=68, d=8)

Despite substantial numbers of cells flagged in several variables, the correlation between cellMCD and covMCD is over large.

data(milk)
X <- as.matrix(milk)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

pension (n=13, d=2)

This data set has a skewed second variable.

data(pension)
X <- as.matrix(pension)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X,center=rowout$center,cov=rowout$cov))
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X,center=cellout$mu,cov=cellout$S))
round(cor(rd,cd),3) 

phosphor (n=18, d=3)

Another data set with very similar rd and cd.

data(phosphor)
X <- as.matrix(phosphor)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

pilot (n=20, d=2)

This dataset has a very strong correlation between the variables, but no outliers.

data(pilot)
X <- as.matrix(pilot)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

Note that when no cells are flagged, the raw output of cellMCD is the maximum likelihood estimate, that is, the arithmetic mean and (n-1)/n times the classical covariance matrix:

cellout$raw.mu - colMeans(X) 
n <- nrow(X)
cellout$raw.S - (n - 1) * cov(X) / n

radarImage n=1573, d=5

In this example, the data is not elliptical, so caution is warranted.

data(radarImage)
X <- as.matrix(radarImage)
X <- transfo(X)$Y
pairs(X) 
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975,ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X) 
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd,cd),3)

Salinity without response n=28, d=3

There are some differences between cellMCD and covMCD on this dataset. Visually, there do not seem to be very extreme outliers. Nevertheless, covMCD discards three observations. cellMCD on the other hand only discards a single cell.

data(salinity)
X <- as.matrix(salinity)[, 1:3]
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd,cd),3) 

Salinity with response (n=28, d=4)

Now the results between covMCD and cellMCD are a little closer. CellMCD flags two cells, and covMCD flags three rows.

data(salinity)
X <- as.matrix(salinity)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X) 
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd, cd), 3) 

starsCYG (n=47, d=2)

On this data the results of covMCD and cellMCD are very close.

data(starsCYG)
X <- as.matrix(starsCYG)
X <- transfo(X, nbsteps = 1)$Y
rowout <- robustbase::covMcd(X)
rd <- sqrt(mahalanobis(X, center = rowout$center, cov = rowout$cov))
cutoff <- sqrt(qchisq(0.975, ncol(X)))
sum(rd > cutoff)
cellout <- cellMCD(X)
cd <- sqrt(mahalanobis(X, center = cellout$mu, cov = cellout$S))
round(cor(rd,cd),3) 

toxicity (n=38, d=10)

This dataset has too many marginal outliers:

data(toxicity)
X <- as.matrix(toxicity)
X <- transfo(X)$Y
rowout <- robustbase::covMcd(X)
rd     <- sqrt(mahalanobis(X,center=rowout$center,cov=rowout$cov))
cutoff <- sqrt(qchisq(0.975,ncol(X)))
sum(rd > cutoff) 
cellout <- cellMCD(X)

wood without response n=20, d=5

This data does not satisfy n/d < 5. cellMCD throws a warning in this case, and caution is warranted.

data(wood)
X <- as.matrix(wood)
X <- transfo(X, nbsteps = 1)$Y
cellout <- cellMCD(X)


Try the cellWise package in your browser

Any scripts or data that you put into this service are public.

cellWise documentation built on Oct. 25, 2023, 5:07 p.m.