tests/tmest.R

library(rrcov)
library(MASS)

dodata <- function(nrep = 1, time = FALSE, full = TRUE) {
    ##@bdescr
    ## Test the function covMest() on the literature datasets:
    ##
    ## Call covMest() for all regression datasets available in rrcov and print:
    ##  - execution time (if time == TRUE)
    ##  - The constants c and M
    ##  - determinant of the covariance matrix
    ##  - estimated center and covarinance matrix if full == TRUE)
    ##
    ##@edescr
    ##
    ##@in  nrep              : [integer] number of repetitions to use for estimating the
    ##                                   (average) execution time
    ##@in  time              : [boolean] whether to evaluate the execution time
    ##@in  full              : [boolean] whether to print the estimated cente and covariance matrix

    domest <- function(x, xname, nrep = 1) {
        n <- dim(x)[1]
        p <- dim(x)[2]
        mm <- covMest(x)
        crit <- log(mm$crit)
        c1 <- mm$c1
        M <- mm$M

        if(time) {
            xtime <- system.time(dorep(x, nrep))[1]/nrep
            xres <- sprintf("%3d %3d %8.4f %8.4f %12.6f %10.3f\n",
                            dim(x)[1], dim(x)[2], c1, M, crit, xtime)
        }
        else {
            xres <- sprintf("%3d %3d %8.4f %8.4f %12.6f\n", dim(x)[1], dim(x)[2], c1, M, crit)
        }
        lpad <- lname-nchar(xname)
        cat(pad.right(xname,lpad), xres)

        if(full) {
            cat("-------------\n")
            print(mm)
            cat("----------------------------------------------------------------------\n")
        }
    }

    options(digits = 5)
    set.seed(101) # <<-- sub-sampling algorithm now based on R's RNG and seed

    lname <- 20
 
    data(heart)
    data(starsCYG)
    data(phosphor)
    data(stackloss)
    data(coleman)
    data(salinity)
    data(wood)
    data(hbk)

    data(Animals, package = "MASS")
    brain <- Animals[c(1:24, 26:25, 27:28),]
    data(milk)
    data(bushfire)

    tmp <- sys.call()
    cat("\nCall: ", deparse(substitute(tmp)),"\n")

    cat("Data Set               n   p       c1        M     LOG(det)       Time\n")
    cat("======================================================================\n")
    domest(heart[, 1:2], data(heart), nrep)
    domest(starsCYG, data(starsCYG), nrep)
    domest(data.matrix(subset(phosphor, select = -plant)), data(phosphor), nrep)
    domest(stack.x, data(stackloss), nrep)
    domest(data.matrix(subset(coleman, select = -Y)), data(coleman), nrep)
    domest(data.matrix(subset(salinity, select = -Y)), data(salinity), nrep)
    domest(data.matrix(subset(wood, select = -y)), data(wood), nrep)
    domest(data.matrix(subset(hbk,  select = -Y)), data(hbk), nrep)

    domest(brain, "Animals", nrep)
    domest(milk, data(milk), nrep)
    domest(bushfire, data(bushfire), nrep)
    cat("======================================================================\n")
}

#   generate contaminated data using the function gendata with different
#   number of outliers and check if the M-estimate breaks - i.e. the 
#   largest eigenvalue is larger than e.g. 5. 
#   For n=50 and p=10 and d=5 the M-estimate can break for number of 
#   outliers grater than 20.
dogen <- function(){
    eig <- vector("numeric",26)
    for(i in 0:25) {
        gg <- gendata(eps=i)
        mm <- covMest(gg$x, t0=gg$tgood, S0=gg$sgood, arp=0.001)
        eig[i+1] <- ev <- eigen(mm$cov)$values[1]
        # cat(i, ev, "\n")
        
        stopifnot(ev < 5 || i > 20)
    }
    # plot(0:25, eig, type="l", xlab="Number of outliers", ylab="Largest Eigenvalue")
}

#
# generate data 50x10 as multivariate normal N(0,I) and add
# eps % outliers by adding d=5.0 to each component.
#   - if eps <0 and eps <=0.5, the number of outliers is eps*n
#   - if eps >= 1, it is the number of outliers
# - use the center and cov of the good data as good start
# - use the center and the cov of all data as a bad start
#   If using a good  start, the M-estimate must iterate to 
#   the good solution: the largest eigenvalue is less then e.g. 5
#
gendata <- function(n=50, p=10, eps=0, d=5.0){

    if(eps < 0 || eps > 0.5 && eps < 1.0 || eps > 0.5*n)
        stop("eps is out of range")
        
    library(MASS)
    
    x <- mvrnorm(n, rep(0,p), diag(p))
    bad <- vector("numeric")
    nbad = if(eps < 1) eps*n else eps
    if(nbad > 0){
        bad <- sample(n, nbad)
        x[bad,] <- x[bad,] + d
    }
    cov1 <- cov.wt(x)
    cov2 <- if(nbad <= 0) cov1 else cov.wt(x[-bad,])
    
    list(x=x, bad=sort(bad), tgood=cov2$center, sgood=cov2$cov, tbad=cov1$center, sbad=cov1$cov)
}

pad.right <- function(z, pads)
{
    ## Pads spaces to right of text
    padding <- paste(rep(" ", pads), collapse = "")
    paste(z, padding, sep = "")
}


## -- now do it:
dodata()
dogen()
#cat('Time elapsed: ', proc.time(),'\n') # for ``statistical reasons''
armstrtw/rrcov documentation built on May 10, 2019, 1:43 p.m.