tests/tsest.R
In rrcov: Scalable Robust Estimators with High Breakdown Point

## VT::15.09.2013 - this will render the output independent
##  from the version of the package
suppressPackageStartupMessages(library(rrcov))

library(MASS)

dodata <- function(nrep = 1, time = FALSE, full = TRUE, method) {
    doest <- function(x, xname, nrep = 1, method=c("sfast", "surreal", "bisquare", "rocke", "suser", "MM", "sdet")) {

        method <- match.arg(method)

        lname <- 20
        n <- dim(x)[1]
        p <- dim(x)[2]

        mm <- if(method == "MM") CovMMest(x) else CovSest(x, method=method)

        crit <- log(mm@crit)

        xres <- sprintf("%3d %3d %12.6f\n", dim(x)[1], dim(x)[2], crit)
        lpad <- lname-nchar(xname)
        cat(pad.right(xname,lpad), xres)

        dist <- getDistance(mm)
        quantiel <- qchisq(0.975, p)
        ibad <- which(dist >= quantiel)
        names(ibad) <- NULL
        nbad <- length(ibad)
        cat("Outliers: ",nbad,"\n")
        if(nbad > 0)
            print(ibad)
        cat("-------------\n")
        show(mm)
        cat("--------------------------------------------------------\n")
    }

    options(digits = 5)
    set.seed(101) # <<-- sub-sampling algorithm now based on R's RNG and seed

    data(heart)
    data(starsCYG)
    data(phosphor)
    data(stackloss)
    data(coleman)
    data(salinity)
    data(wood)
    data(hbk)

    data(Animals, package = "MASS")
    brain <- Animals[c(1:24, 26:25, 27:28),]
    data(milk)
    data(bushfire)
###
    data(rice)
    data(hemophilia)
    data(fish)

    tmp <- sys.call()
    cat("\nCall: ", deparse(substitute(tmp)),"\n")

    cat("Data Set               n   p    LOG(det)       Time\n")
    cat("===================================================\n")
    doest(heart[, 1:2], data(heart), nrep, method=method)
    doest(starsCYG, data(starsCYG), nrep, method=method)
    doest(data.matrix(subset(phosphor, select = -plant)), data(phosphor), nrep, method=method)
    doest(stack.x, data(stackloss), nrep, method=method)
    doest(data.matrix(subset(coleman, select = -Y)), data(coleman), nrep, method=method)
    doest(data.matrix(subset(salinity, select = -Y)), data(salinity), nrep, method=method)
    doest(data.matrix(subset(wood, select = -y)), data(wood), nrep, method=method)
    doest(data.matrix(subset(hbk,  select = -Y)), data(hbk), nrep, method=method)


    doest(brain, "Animals", nrep, method=method)
    doest(milk, data(milk), nrep, method=method)
    doest(bushfire, data(bushfire), nrep, method=method)

    doest(data.matrix(subset(rice,  select = -Overall_evaluation)), data(rice), nrep, method=method)
    doest(data.matrix(subset(hemophilia,  select = -gr)), data(hemophilia), nrep, method=method)
    doest(data.matrix(subset(fish,  select = -Species)), data(fish), nrep, method=method)

    ## from package 'datasets'
    doest(airquality[,1:4], data(airquality), nrep, method=method)
    doest(attitude, data(attitude), nrep, method=method)
    doest(attenu, data(attenu), nrep, method=method)
    doest(USJudgeRatings, data(USJudgeRatings), nrep, method=method)
    doest(USArrests, data(USArrests), nrep, method=method)
    doest(longley, data(longley), nrep, method=method)
    doest(Loblolly, data(Loblolly), nrep, method=method)
    doest(quakes[,1:4], data(quakes), nrep, method=method)

    cat("===================================================\n")
}

#   generate contaminated data using the function gendata with different
#   number of outliers and check if the M-estimate breaks - i.e. the
#   largest eigenvalue is larger than e.g. 5.
#   For n=50 and p=10 and d=5 the M-estimate can break for number of
#   outliers grater than 20.
dogen <- function(){
    eig <- vector("numeric",26)
    for(i in 0:25) {
        gg <- gendata(eps=i)
        mm <- CovSRocke(gg$x, t0=gg$tgood, S0=gg$sgood)
        eig[i+1] <- ev <- getEvals(mm)[1]
        cat(i, ev, "\n")

##        stopifnot(ev < 5 || i > 20)
    }
    plot(0:25, eig, type="l", xlab="Number of outliers", ylab="Largest Eigenvalue")
}

#
# generate data 50x10 as multivariate normal N(0,I) and add
# eps % outliers by adding d=5.0 to each component.
#   - if eps <0 and eps <=0.5, the number of outliers is eps*n
#   - if eps >= 1, it is the number of outliers
# - use the center and cov of the good data as good start
# - use the center and the cov of all data as a bad start
#   If using a good  start, the M-estimate must iterate to
#   the good solution: the largest eigenvalue is less then e.g. 5
#
gendata <- function(n=50, p=10, eps=0, d=5.0){

    if(eps < 0 || eps > 0.5 && eps < 1.0 || eps > 0.5*n)
        stop("eps is out of range")

    library(MASS)

    x <- mvrnorm(n, rep(0,p), diag(p))
    bad <- vector("numeric")
    nbad = if(eps < 1) eps*n else eps
    if(nbad > 0){
        bad <- sample(n, nbad)
        x[bad,] <- x[bad,] + d
    }
    cov1 <- cov.wt(x)
    cov2 <- if(nbad <= 0) cov1 else cov.wt(x[-bad,])

    list(x=x, bad=sort(bad), tgood=cov2$center, sgood=cov2$cov, tbad=cov1$center, sbad=cov1$cov)
}

pad.right <- function(z, pads)
{
    ## Pads spaces to right of text
    padding <- paste(rep(" ", pads), collapse = "")
    paste(z, padding, sep = "")
}


## -- now do it:
dodata(method="sfast")
dodata(method="sdet")
##dodata(method="suser")
##dodata(method="surreal")
dodata(method="bisquare")
dodata(method="rocke")
dodata(method="MM")
##dogen()
##cat('Time elapsed: ', proc.time(),'\n') # for ``statistical reasons''