R/covNNC-orig.R In robustX: 'eXtra' / 'eXperimental' Functionality for Robust Statistics

Documented in cov.nnve

```## Original version (by Wang & Raftery, 2003;  originally cov.nnve())
## This is only to verify the improvements by MM, and hence *not* exported
##
covNNC1 <- function(datamat, k = 12, pnoise = 0.05, emconv = 0.001, bound = 1.5,
extension = TRUE, devsm = 0.01)
{
##
##
## Function to perform Nearest Neighbor Variance Estimation
## Wang and Raftery(2002), "Nearest neighbor variance estimation (NNVE):
##     Robust covariance estimation via nearest neighbor cleaning
##                 (with discussion)",
##     Journal of the American Statistical Association 97:994-1019
## Available as Technical Report 368 (2000) from
##    http://www.stat.washington.edu/www/research/reports
##
## The data need to be a matrix of points in the process.
## Each column contains one variable.
##
##
##  Returns a list with the following components:
##      cov            - robust covariance estimate
##      mu             - mean
##      postprob       - posterior probability
##      classification - classification (0=noise otherwise 1)
##                      (obtained by rounding postprob)
##      innc - list of initial nearest-neighbor results
##            (components are the same as above)
##
##
datamat <- as.matrix(datamat)
d <- dim(datamat)[2]
n <- dim(datamat)[1]
##
##  The Re-scale NNC function
##
nclean.sub <- function(datamat, k, distances = NULL, convergence = 0.001,
S.mean = NULL, S.sd = NULL)
{
##
## Function to perform the Nearest Neighbour cleaning of
##--------------------------------------------------------------
## find the density of D_k
##
dDk <- function(x, lambda, k, d, alpha.d)
{
(exp( - lambda * alpha.d * x^d + log(2) + k * log(
lambda * alpha.d) + log(x) * (d * k - 1) - log(
gamma(k))))
}
## end of dDk
##--------------------------------------------------------------
## nearest-neighbor in S-PLUS
##
splusNN <- function(datamat, k)
{
n <- nrow(datamat)
distances <- dist(datamat)
##
##  This next part sorts through the Splus distance object and forms kNNd,
##  kth nearest neighbour distance, for each point.
##
kNNd <- rep(0, n)
N <- (n - 1):0
I <- c(0, cumsum(N[-1]))
J <- c(0, I + n - 1)
a <- z <- NULL
for(j in 1:n) {
if(j > 1)
a <- i + I[1:i]
if(j < n)
z <- J[j] + 1:N[j]
kNNd[j] <- sort(distances[c(a, z)])[k]
i <- j
}
kNNd
}
## end of splusNN
##--------------------------------------------------------------
d <- dim(datamat)[2]
n <- dim(datamat)[1]
##
kthNND <- splusNN(t((t(datamat) - S.mean)/S.sd), k = k)
alpha.d <- (2 * pi^(d/2))/(d * gamma(d/2))
##
## Now use kthNND in E-M algorithm, first get starting guesses.
##
delta <- rep(0, n)
delta[kthNND > (min(kthNND) + diff(range(kthNND))/3)] <- 1
p <- 0.5
lambda1 <- k/(alpha.d * mean((kthNND[delta == 0])^d))
lambda2 <- k/(alpha.d * mean((kthNND[delta == 1])^d))
loglik.old <- 0
loglik.new <- 1
##
## Iterator starts here,
##
while(abs(loglik.new - loglik.old)/(1+abs(loglik.new)) > convergence)
{
## E - step
delta <- (p * dDk(kthNND, lambda1, k = k, d = d,
alpha.d = alpha.d))/(p * dDk(kthNND, lambda1,
k = k, d = d, alpha.d = alpha.d) + (1 - p) *
dDk(kthNND, lambda2, k = k, d = d, alpha.d =
alpha.d))
## M - step
p <- sum(delta)/n
lambda1 <- (k * sum(delta))/(alpha.d * sum((kthNND^
d) * delta))
lambda2 <- (k * sum((1 - delta)))/(alpha.d * sum((
kthNND^d) * (1 - delta)))
loglik.old <- loglik.new
loglik.new <- sum( - p * lambda1 * alpha.d * ((kthNND^
d) * delta) - (1 - p) * lambda2 * alpha.d *
((kthNND^d) * (1 - delta)) + delta * k * log(
lambda1 * alpha.d) + (1 - delta) * k * log(
lambda2 * alpha.d))
}
##
## z will be the classifications. 1= in cluster. 0= in noise.
##
probs <- dDk(kthNND, lambda1, k = k, d = d, alpha.d = alpha.d)/
(dDk(kthNND, lambda1, k = k, d = d, alpha.d = alpha.d) +
dDk(kthNND, lambda2, k = k, d = d, alpha.d = alpha.d))
mprob <- 1. - probs
mu1 <- apply((probs * datamat), 2, sum)/sum(probs)
mu2 <- apply((mprob * datamat), 2, sum)/sum(mprob)
tpsig1 <- t(datamat) - mu1
tpsig2 <- t(datamat) - mu2
Sig1 <- tpsig1 %*% (probs * t(tpsig1))/sum(probs)
Sig2 <- tpsig2 %*% (mprob * t(tpsig2))/sum(mprob)
sd1 <- sqrt(diag(Sig1))
sd2 <- sqrt(diag(Sig2))
ans <- rbind(mu1, sd1, mu2, sd2)
zz <- list(z = round(probs), kthNND = kthNND, probs = probs,
p = p, mu1 = mu1, mu2 = mu2, sd1 = sd1, sd2 = sd2,
lambda1 = lambda1, lambda2 = lambda2, Sig1 = Sig1,
Sig2 = Sig2, ans = ans)
return(zz)
}
##----- end of nclean.sub-----------------------------------------------
##
##	  Two procedures to link between a symmetric matrix and its vec(.)
##
##
Mtovec <- function(M)
{
n <- dim(M)[1]
d <- dim(M)[2]
if(abs(n - d) > 0.01) {
cat("The input has to be a square matrix")
}
else {
vec <- rep(0, 0)
for(i in (1:n)) {
for(j in (i:d)) {
vec <- c(vec, M[i, j])
}
}
vec
}
}
vectoM <- function(vec, d)
{
## unused, now warning: n <- length(vec)
M <- matrix(rep(0, d * d), d, d)
L <- 1
for(i in 1:d) {
for(j in i:d) {
M[i, j] <- vec[L]
L <- L + 1
M[j, i] <- M[i, j]
}
}
M
}
##
##----------------------------------------------------------------------
##
pd <- dim(datamat)[2]
S.mean <- apply(datamat, 2, median)
##
##  NNC based on original data
##
orgNNC <- nclean.sub(datamat, k, convergence = 0.001, S.mean = S.mean,
S.sd = S.sd)
nnoise <- min(c(sum(1 - orgNNC\$z), round(pnoise * n)))
knnd <- orgNNC\$kthNND
ord <- (n + 1) - rank(knnd)
muT <- orgNNC\$mu1
SigT <- orgNNC\$Sig1
SigT <- (SigT + t(SigT))/2.
SigTN <- diag(orgNNC\$sd1^2)
if(nnoise > 6) {
ncho <- nnoise
ncho1 <- floor(ncho/2)
ncho2 <- ncho - ncho1
cho <- (1:n)[ord <= ncho1]
xcho <- datamat[cho,  ]
ev <- eigen(SigT)
evv <- ev\$values
minv <- max((1:d)[evv > 9.9999999999999998e-13])
if(minv > 2) {
vv1 <- ev\$vectors[, (minv - 1)]
vv2 <- ev\$vectors[, minv]
}
else {
vv1 <- ev\$vectors[, 1]
vv2 <- ev\$vectors[, 2]
}
ot <- acos(sum(vv1 * vv2)/(sum(vv1^2) * sum(vv2^2))^0.5)
for(kk1 in 1:(ncho2)) {
pseg <- 1/(ncho2 + 1) * kk1 * ot
xcho <- rbind(xcho, (sin(pseg) * vv1 + cos(pseg) * vv2 +
muT))
}
}
else {
nnoise <- 3
cho <- (1:n)[ord <= nnoise]
xcho <- datamat[cho,  ]
}
n2 <- (dim(xcho))[1]
schox <- mahalanobis(xcho, muT, SigTN)
Nc <- matrix(rep(muT, n2), nrow = n2, byrow = TRUE)
Ndir <- (xcho - Nc)/(schox^0.5)
##
## initial set up
##
ch1 <- c(qchisq(0.99, pd), qchisq(1 - 10^(-4), pd))
Xa <- seq(ch1[1], ch1[2], length = 6)
gap <- Xa[2] - Xa[1]
initv <- diag(orgNNC\$Sig1)
xa <- Xa[1]
SaveM <- c(xa, orgNNC\$mu1, Mtovec(orgNNC\$Sig1))
OldP <- orgNNC\$probs
SaveP <- OldP
Np <- Nc - Ndir * (xa^0.5)
updNNC <- nclean.sub(rbind(datamat, Np), k, convergence = 0.001, S.mean = S.mean,
S.sd = S.sd)
SaveM <- rbind(SaveM, c(xa, updNNC\$mu1, Mtovec(updNNC\$Sig1)))
SaveP <- rbind(SaveP, (updNNC\$probs)[1:n])
##sda<-Mtovec(orgNNC\$Sig1)
##sda save the results corresponding to xa = qchisq(.99,pd)
stopv <- diag(updNNC\$Sig1)
time1 <- 2
##
##
while((time1 <= 6) && (all(stopv < (1 + bound) * initv))) {
xa <- Xa[time1]
Np <- Nc - Ndir * (xa^0.5)
updNNC <- nclean.sub(rbind(datamat, Np), k, convergence = 0.001, S.mean =
S.mean, S.sd = S.sd)
SaveM <- rbind(SaveM, c(xa, updNNC\$mu1, Mtovec(updNNC\$Sig1)))
SaveP <- rbind(SaveP[2,  ], (updNNC\$probs)[1:n])
time1 <- time1 + 1
stopv <- diag(updNNC\$Sig1)
NULL
}
##
##
##   Procedure stop if the added noise cause a "surge" within the range
##   sdb save the results within the given "range"
##
##
if(all(stopv < (1 + bound) * initv)) {
dSaveM <- dim(SaveM)[1]
ans <- SaveM[dSaveM,  ]
## sdb <- SaveM[dSaveM,  ]
NewP <- SaveP[2,  ]
##
##
if(extension) {
time2 <- 1
Fstop <- FALSE
tpv <- stopv
while((time2 < 2) && (all(stopv < (1 + bound) * initv))
) {
xa <- xa + gap
startv <- stopv
Np <- Nc - Ndir * (xa^0.5)
updNNC <- nclean.sub(rbind(datamat, Np), k, convergence =
0.001, S.mean = S.mean, S.sd = S.sd)
SaveM <- rbind(SaveM, c(xa, updNNC\$mu1, Mtovec(
updNNC\$Sig1)))
SaveP <- rbind(SaveP[2,  ], (updNNC\$probs)[
1:n])
stopv <- apply(rbind((startv * 2 - tpv), diag(
updNNC\$Sig1)), 2, mean)
tpv <- diag(updNNC\$Sig1)
Fstop <- all((abs(stopv - startv) <= ((1+abs(startv)) *
devsm)))
if(Fstop)
time2 <- time2 + 1
else time2 <- 1
NULL
}
##end-while
## checking the stop criterior at the end of extension
if(all(stopv < (1 + bound) * initv)) {
dSaveM <- dim(SaveM)[1]
ans <- SaveM[dSaveM,  ]
NewP <- SaveP[2,  ]
}
else {
dSaveM <- dim(SaveM)[1]
ans <- SaveM[dSaveM - 1,  ]
NewP <- SaveP[1,  ]
}
}
}
else {
dSaveM <- dim(SaveM)[1]
ans <- SaveM[dSaveM - 1,  ]
## sdb <- ans[-1]
NewP <- SaveP[1,  ]
}
nncvar <- vectoM(ans[ - (1:(1 + pd))], pd)
mu <- ans[2:(1 + pd)]
##
list(cov = nncvar, mu = mu, postprob = NewP, classification = round(NewP),
innc = list(cov = orgNNC\$Sig1, mu = orgNNC\$mu1, postprob = OldP,
classification = round(OldP)))
}

cov.nnve <- function(datamat, k = 12, ...)
{
.Deprecated("covNNC")
covNNC(datamat, k=k, ...)
}
```

Try the robustX package in your browser

Any scripts or data that you put into this service are public.

robustX documentation built on May 2, 2019, 4:06 a.m.