tests/m-s-estimator.R

## Test implementation of M-S estimator
require(robustbase)
source(system.file("xtraR/m-s_fns.R", package = "robustbase", mustWork=TRUE))
source(system.file("xtraR/ex-funs.R", package = "robustbase", mustWork=TRUE))
source(system.file("xtraR/test-tools.R", package = "robustbase")) # assert.EQ

## dataset with factors and continuous variables:
data(education)
education <- within(education, Region <- factor(Region))
## for testing purposes:
education2 <- within(education, Group <- factor(rep(1:3, length.out=length(Region))))

## Test splitFrame (type fii is the only problematic type)
testFun <- function(formula, x1.idx) {
    obj <- lm(formula, education2)
    mf <- obj$model
    ret <- splitFrame(mf, type="fii")
    if (missing(x1.idx)) {
        print(ret$x1.idx)
        return(which(unname(ret$x1.idx)))
    }
    stopifnot(identical(x1.idx, which(unname(ret$x1.idx))))
}
testFun(Y ~ 1, integer(0))
testFun(Y ~ X1*X2*X3, integer(0))
testFun(Y ~ Region + X1 + X2 + X3, 1:4)
testFun(Y ~ 0 + Region + X1 + X2 + X3, 1:4)
testFun(Y ~ Region*X1 + X2 + X3, c(1:5, 8:10))
testFun(Y ~ Region*X1 + X2 + X3 + Region*Group, c(1:5, 8:18))
testFun(Y ~ Region*X1 + X2 + X3 + Region*Group*X2, c(1:6, 8:29))
testFun(Y ~ Region*X1 + X2 + Region*Group*X2, 1:28)
testFun(Y ~ Region*X1 + X2 + Region:Group:X2, 1:21)
testFun(Y ~ Region*X1 + X2*X3 + Region:Group:X2, c(1:6, 8:10, 12:23))
testFun(Y ~ (X1+X2+X3+Region)^2, c(1:7,10:12,14:19))
testFun(Y ~ (X1+X2+X3+Region)^3, c(1:19, 21:29))
testFun(Y ~ (X1+X2+X3+Region)^4, 1:32)
testFun(Y ~ Region:X1:X2 + X1*X2, c(1:1, 4:7))


control <- lmrob.control()
cntrlT1 <- lmrob.control(trace.lev=1)
f.lm <- lm(Y ~ Region + X1 + X2 + X3, education)
splt <- splitFrame(f.lm$model)
stopifnot(identical(names(splt$x1.idx), names(coef(f.lm))),
          unname(splt$x1.idx) == c(rep(TRUE, 4), rep(FALSE, 3))
          )
y <- education$Y

## test orthogonalizing
x1 <- splt$x1
x2 <- splt$x2
tmp <- lmrob.lar(x1, y, control)
y.tilde <- tmp$resid
t1 <- tmp$coef
x2.tilde <- x2
T2 <- matrix(0, nrow=ncol(x1), ncol=ncol(x2))
for (i in 1:ncol(x2)) {
    tmp <- lmrob.lar(x1, x2[,i], control)
    x2.tilde[,i] <- tmp$resid
    T2[,i] <- tmp$coef
}
T2

set.seed(10)
mss1 <- m_s_subsample(x1, x2.tilde, y.tilde, cntrlT1, orth = FALSE)
mss1 <- within(mss1, b1 <- drop(t1 + b1 - T2 %*% b2))
stopifnot(all.equal(30.81835, mss1$scale, tol=1e-7))
set.seed(10)
mss2 <- m_s_subsample(x1, x2,       y,       cntrlT1, orth = TRUE)
stopifnot(all.equal(mss1, mss2))

res <- vector("list", 100)
set.seed(0)
time <- system.time(for (i in seq_along(res)) {
    tmp <- m_s_subsample(x1, x2.tilde, y.tilde, control, FALSE)
    res[[i]] <- unlist(within(tmp, b1 <- drop(t1 + b1 - T2 %*% b2)))
})
cat('Time elapsed in subsampling: ', time,'\n')
## show a summary of the results  {"FIXME": output is platform dependent}
summary(res1 <- do.call(rbind, res))
## compare with fast S solution
fmS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init="S")
coef(fmS)
fmS$scale

###  Comparing m-s_descent implementations()  {our C and R} : -------------------

ctrl <- control
#ctrl$trace.lev <- 5
ctrl$k.max <- 1
mC <- m_s_descent      (x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
mR <- m_s_descent_Ronly(x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
nm <- c("b1","b2", "scale", "res")
stopifnot(all.equal(mC[nm], mR[nm], check.attributes = FALSE, tolerance = 4e-14))
                                        # seen 5.567e-15 in OpenBLAS      ^^^^^

## control$k.m_s <- 100
res3 <- vector("list", 100)
time <- system.time(for (i in seq_along(res3)) {
    ri <- res[[i]]
    res3[[i]] <- unlist(m_s_descent(x1, x2, y, control,
				    ri[1:4], ri[5:7], ri[8]))
})
cat('Time elapsed in descent proc: ', time,'\n')

## show a summary of the results   {"FIXME": output is platform dependent}
res4 <- do.call(rbind, res3)
summary(res4[,1:8])

stopifnot(all.equal( # 'test', not only plot:
	  res1[, "scale"],   res4[,"scale"], tol = 0.03),
	  res1[, "scale"] >= res4[,"scale"] - 1e-7 ) # 1e-7 just in case
     plot(res1[, "scale"],   res4[,"scale"])
abline(0,1, col=adjustcolor("gray", 0.5))

## Test lmrob.M.S
x <- model.matrix(fmS)
control$trace.lev <- 3
##      ---------   --
set.seed(1003)
fMS <- lmrob.M.S(x, y, control, fmS$model)
resid <- drop(y - x %*% fMS$coef)
assert.EQ(resid, fMS$resid, check.attributes=FALSE, tol = 1e-12)

## Test direct call to lmrob
## 1. trace_lev output:
set.seed(17)
fMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S", trace.lev=2)

set.seed(13)
fiMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S")
out2 <- capture.output(summary(fiMS))
writeLines(out2)

set.seed(13)
fiM.S <- lmrob(Y ~ Region + X1 + X2 + X3, education, init=lmrob.M.S)
out3 <- capture.output(summary(fiM.S))

## must be the same {apart from the "init=" in the call}:
i <- 3
stopifnot(identical(out2[-i], out3[-i]))
## the difference:
c(rbind(out2[i], out3[i]))


###  "Skipping design matrix equilibration" warning can arise for reasonable designs -----
set.seed(1)
x2 <- matrix(rnorm(2*30), 30, 2)
data <- data.frame(y = rnorm(30), group = rep(letters[1:3], each=10), x2)

obj <- lmrob(y ~ ., data, init="M-S", trace.lev=1)

## illustration: the zero row is introduced during the orthogonalization of x2 wrt x1
## l1 regression always produces p zero residuals
## by chance, the zero residuals of multiple columns happen to be on the same row
sf <- splitFrame(obj$model)
x1 <- sf$x1
x2 <- sf$x2
control <- obj$control

## orthogonalize
x2.tilde <- x2

for(i in 1:ncol(x2)) {
    tmp <- lmrob.lar(x1, x2[,i], control)
    x2.tilde[,i] <- tmp$resid
}
x2.tilde == 0


## Specifying init="M-S" for a model without categorical variables
## used to cause a segfault; now uses "S"
lmrob(LNOx ~ LNOxEm, NOxEmissions[1:10,], init="M-S")

## Now an ANOVA model with *only* categorical variables
n <- 64 # multiple of 16
stopifnot(n %% 16 == 0)
d.AOV <- data.frame(y = round(100*rnorm(64)),
		    A=gl(4,n/4), B=gl(2,8, n), C=gl(2,4,n))
fm <- lmrob(y ~ A*B*C, data = d.AOV, init = "M-S", trace.lev=2)

## lmrob_M_S(n = 64, nRes = 500, (p1,p2)=(16,0), (orth,subs,desc)=(1,1,1))
##  Starting subsampling procedure.. Error in lmrob.M.S(x, y, control, mf) :
##   'Calloc' could not allocate memory (18446744073709551616 of 4 bytes)

## BTW: Can we compute an  M-estimate (instead of MM-*) as we
## ---  cannot have any x-outliers in such an ANOVA!

Try the robustbase package in your browser

Any scripts or data that you put into this service are public.

robustbase documentation built on Jan. 27, 2024, 3 p.m.