## tests/m-s-estimator.R
## In robustbase: Basic Robust Statistics

## Test implementation of M-S estimator
## Attach the package under test.  library() stops right here when the package
## is not installed, whereas require() would only warn and return FALSE.
library(robustbase)
## Helper scripts shipped with the package; mustWork = TRUE turns a missing
## file into an immediate, clearly worded error.
source(system.file("xtraR/m-s_fns.R", package = "robustbase", mustWork=TRUE))
source(system.file("xtraR/ex-funs.R", package = "robustbase", mustWork=TRUE))
source(system.file("xtraR/test-tools.R", package = "robustbase", mustWork=TRUE)) # assert.EQ

## Example data with both categorical and continuous predictors:
data(education)
## Region is to be handled as a factor
education$Region <- factor(education$Region)
## Testing variant: same data plus an artificial factor 'Group' cycling
## through the levels 1, 2, 3 along the rows
education2 <- education
education2$Group <- factor(rep(1:3, length.out = nrow(education2)))

## Test splitFrame (type fii is the only problematic type)
##
## Fits `formula` by lm() on education2 and passes the fitted model frame to
## splitFrame(*, type = "fii").
##   x1.idx missing : print the `x1.idx` component of the result and return
##                    the positions of its TRUE entries
##   x1.idx supplied: stop unless it is identical() to these positions
testFun <- function(formula, x1.idx) {
    fit <- lm(formula, education2)
    sf <- splitFrame(fit$model, type="fii")
    ## positions flagged in x1.idx, without the names
    flagged <- which(unname(sf$x1.idx))
    if (missing(x1.idx)) {
        print(sf$x1.idx)
        flagged
    } else {
        stopifnot(identical(x1.idx, flagged))
    }
}
## Table of checks: each entry pairs a model formula with the positions that
## splitFrame(*, type = "fii") is expected to flag in x1.idx.  The entries are
## run in order; testFun() stops at the first mismatch.
invisible(lapply(list(
    list(Y ~ 1,                                      integer(0)),
    list(Y ~ X1*X2*X3,                               integer(0)),
    list(Y ~ Region + X1 + X2 + X3,                  1:4),
    list(Y ~ 0 + Region + X1 + X2 + X3,              1:4),
    list(Y ~ Region*X1 + X2 + X3,                    c(1:5, 8:10)),
    list(Y ~ Region*X1 + X2 + X3 + Region*Group,     c(1:5, 8:18)),
    list(Y ~ Region*X1 + X2 + X3 + Region*Group*X2,  c(1:6, 8:29)),
    list(Y ~ Region*X1 + X2 + Region*Group*X2,       1:28),
    list(Y ~ Region*X1 + X2 + Region:Group:X2,       1:21),
    list(Y ~ Region*X1 + X2*X3 + Region:Group:X2,    c(1:6, 8:10, 12:23)),
    list(Y ~ (X1+X2+X3+Region)^2,                    c(1:7,10:12,14:19)),
    list(Y ~ (X1+X2+X3+Region)^3,                    c(1:19, 21:29)),
    list(Y ~ (X1+X2+X3+Region)^4,                    1:32),
    list(Y ~ Region:X1:X2 + X1*X2,                   c(1:1, 4:7))
), function(case) testFun(case[[1]], case[[2]])))


## Default control settings, and a copy with trace.lev = 1
control <- lmrob.control()
cntrlT1 <- lmrob.control(trace.lev = 1)
## Least-squares fit supplying the model frame that is split below
f.lm <- lm(Y ~ Region + X1 + X2 + X3, data = education)
splt <- splitFrame(f.lm[["model"]])
## x1.idx must carry the coefficient names ...
stopifnot(identical(names(splt$x1.idx), names(coef(f.lm))))
## ... and be TRUE for the first four entries, FALSE for the remaining three
stopifnot(unname(splt$x1.idx) == rep(c(TRUE, FALSE), times = c(4, 3)))
y <- education[["Y"]]

## test orthogonalizing
## Regress y and each column of x2 on x1 with lmrob.lar().  The residuals are
## kept as y.tilde / x2.tilde, the coefficients as t1 / the columns of T2;
## t1 and T2 are needed further down to map coefficients obtained on the
## orthogonalized data back:  t1 + b1 - T2 %*% b2.
x1 <- splt$x1
x2 <- splt$x2
tmp <- lmrob.lar(x1, y, control)
y.tilde <- tmp$resid
t1 <- tmp$coef
x2.tilde <- x2
T2 <- matrix(0, nrow=ncol(x1), ncol=ncol(x2))
## seq_len() instead of 1:ncol(x2): with zero columns the loop is skipped
## rather than run over c(1, 0)
for (i in seq_len(ncol(x2))) {
    tmp <- lmrob.lar(x1, x2[,i], control)
    x2.tilde[,i] <- tmp$resid
    T2[,i] <- tmp$coef
}
T2

## Subsampling on the data orthogonalized by hand (orth = FALSE) ...
set.seed(10)
mss1 <- m_s_subsample(x1, x2.tilde, y.tilde, cntrlT1, orth = FALSE)
## ... with b1 mapped back via the coefficients of the orthogonalization
mss1 <- within(mss1, b1 <- drop(t1 + b1 - T2 %*% b2))
## 'tolerance' spelled out in full (no reliance on partial argument matching)
stopifnot(all.equal(30.81835, mss1$scale, tolerance = 1e-7))
## Same seed on the raw data with orth = TRUE must give the same result
set.seed(10)
mss2 <- m_s_subsample(x1, x2,       y,       cntrlT1, orth = TRUE)
stopifnot(all.equal(mss1, mss2))

## Run the subsampling 100 times; the seed is set once, before the loop.
## Each result is stored as a flat named vector (via unlist), with b1 mapped
## back through t1 and T2 as above.  The loop body is evaluated in the calling
## environment, so `res` is filled in place while system.time() measures it.
res <- vector("list", 100)
set.seed(0)
time <- system.time(for (i in seq_along(res)) {
    tmp <- m_s_subsample(x1, x2.tilde, y.tilde, control, FALSE)
    res[[i]] <- unlist(within(tmp, b1 <- drop(t1 + b1 - T2 %*% b2)))
})
cat('Time elapsed in subsampling: ', time,'\n')
## show a summary of the results  {"FIXME": output is platform dependent}
## (res1: the 100 result vectors bound together, one row per run)
summary(res1 <- do.call(rbind, res))
## compare with fast S solution
fmS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init="S")
coef(fmS)
fmS$scale

###  Comparing m-s_descent implementations()  {our C and R} : -------------------

## Copy of the control list with k.max set to 1
## NOTE(review): k.max presumably limits the number of iterations -- confirm
## against the lmrob.control() documentation
ctrl <- control
#ctrl$trace.lev <- 5
ctrl$k.max <- 1
## Both implementations get identical input: the mss2 coefficients and the
## mss2 scale increased by 10
mC <- m_s_descent      (x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
mR <- m_s_descent_Ronly(x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
## components that must agree (attributes ignored) within the tolerance below
nm <- c("b1","b2", "scale", "res")
stopifnot(all.equal(mC[nm], mR[nm], check.attributes = FALSE, tolerance = 4e-14))
                                        # seen 5.567e-15 in OpenBLAS      ^^^^^

## control$k.m_s <- 100
## Start the descent procedure from every subsampling result stored in `res`.
## Elements 1:4, 5:7 and 8 of each result vector are passed in the argument
## positions used for b1, b2 and scale in the C / R comparison.
res3 <- vector("list", 100)
time <- system.time(for (i in seq_along(res3)) {
    ri <- res[[i]]
    res3[[i]] <- unlist(m_s_descent(x1, x2, y, control,
                                    ri[1:4], ri[5:7], ri[8]))
})
cat('Time elapsed in descent proc: ', time,'\n')

## show a summary of the results   {"FIXME": output is platform dependent}
res4 <- do.call(rbind, res3)
summary(res4[,1:8])

## 'test', not only plot: the scales after the descent must be close to the
## subsampling scales and never larger than them.
## 'tolerance' is spelled out in full (no reliance on partial argument matching).
stopifnot(all.equal(res1[, "scale"], res4[, "scale"], tolerance = 0.03),
          res1[, "scale"] >= res4[, "scale"] - 1e-7) # 1e-7 just in case
## plot() is a separate top-level statement, not part of stopifnot()
plot(res1[, "scale"], res4[, "scale"])
abline(0,1, col=adjustcolor("gray", 0.5))

## Test lmrob.M.S
## Called directly with the design matrix and model frame of the S-initialised
## fit `fmS`; from here on `control` carries trace.lev = 3.
x <- model.matrix(fmS)
control$trace.lev <- 3
##      ---------   --
set.seed(1003)
fMS <- lmrob.M.S(x, y, control, fmS$model)
## Residuals recomputed from the returned coefficients must agree with the
## residuals returned by lmrob.M.S (attributes ignored)
resid <- drop(y - x %*% fMS$coef)
assert.EQ(resid, fMS$resid, check.attributes=FALSE, tol = 1e-12)

## Test direct call to lmrob
## 1. trace_lev output:
set.seed(17)
fMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S", trace.lev=2)

## 2. init given as the string "M-S" vs. as the function lmrob.M.S,
##    both fitted from the same seed; the printed summaries are compared
set.seed(13)
fiMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S")
out2 <- capture.output(summary(fiMS))
writeLines(out2)

set.seed(13)
fiM.S <- lmrob(Y ~ Region + X1 + X2 + X3, education, init=lmrob.M.S)
out3 <- capture.output(summary(fiM.S))

## must be the same {apart from the "init=" in the call}:
## (i = index of the one captured output line that is excluded from the comparison)
i <- 3
stopifnot(identical(out2[-i], out3[-i]))
## the difference:
c(rbind(out2[i], out3[i]))


###  "Skipping design matrix equilibration" warning can arise for reasonable designs -----
## Simulated data: two numeric columns, a three-level grouping variable with
## ten observations per level, and a response
set.seed(1)
x2 <- matrix(rnorm(2*30), 30, 2)
data <- data.frame(y = rnorm(30), group = rep(letters[1:3], each=10), x2)

obj <- lmrob(y ~ ., data, init="M-S", trace.lev=1)

## illustration: the zero row is introduced during the orthogonalization of x2 wrt x1
## l1 regression always produces p zero residuals
## by chance, the zero residuals of multiple columns happen to be on the same row
sf <- splitFrame(obj$model)
x1 <- sf$x1
x2 <- sf$x2
control <- obj$control

## orthogonalize
x2.tilde <- x2

## seq_len() instead of 1:ncol(x2): with zero columns the loop is skipped
## rather than run over c(1, 0)
for(i in seq_len(ncol(x2))) {
    tmp <- lmrob.lar(x1, x2[,i], control)
    x2.tilde[,i] <- tmp$resid
}
## print which entries of the orthogonalized columns are exactly zero
x2.tilde == 0


## Specifying init="M-S" for a model without categorical variables
## used to cause a segfault; now uses "S"
lmrob(LNOx ~ LNOxEm, NOxEmissions[1:10,], init="M-S")

## Now an ANOVA model with *only* categorical variables
n <- 64 # multiple of 16
stopifnot(n %% 16 == 0)
## The response is drawn with length n, like the factors A, B and C, instead
## of a hard-coded 64, so that changing n keeps all columns the same length.
d.AOV <- data.frame(y = round(100*rnorm(n)),
                    A=gl(4,n/4), B=gl(2,8, n), C=gl(2,4,n))
fm <- lmrob(y ~ A*B*C, data = d.AOV, init = "M-S", trace.lev=2)

## Output recorded when this call failed:
## lmrob_M_S(n = 64, nRes = 500, (p1,p2)=(16,0), (orth,subs,desc)=(1,1,1))
##  Starting subsampling procedure.. Error in lmrob.M.S(x, y, control, mf) :
##   'Calloc' could not allocate memory (18446744073709551616 of 4 bytes)

## BTW: Can we compute an  M-estimate (instead of MM-*) as we
## ---  cannot have any x-outliers in such an ANOVA!