Nothing
.EA.dist <-
function(data, n,p,weights,reach,transmission.function, power, distance.type, maxl)
{
# Calculation of distances for EPIDEMIC Algorithm for multivariate outlier detection and imputation
# This is a utility for EAdet and EAimp
# Modular programming by Beat Hulliger
# 13.5.2009, 14.8.2009, 22.8.2014
#
# save distances, i.e. the later counterprobabilities
counterprobs <- as.single(dist(data, method = distance.type))
#
# The dist function handles missing values correctly except
# if there is no overlap (see counterprob)
#
min.di <- rep(0, n)
means.di <- rep(0, n) #
# Will be used for the sample spatial median and for d0
#
for(i in 1:n) {
di <- counterprobs[.ind.dijs(i, 1:n, n)]
min.di[i] <- .nz.min(di) #
# weighted mean of distances to account for missing distances
means.di[i] <- sum(di * weights[ - i], na.rm = TRUE)/sum(weights[ - i][!is.na(di)])
}
#
# Sample spatial median
# Restrict to usable observations (for sample spatial median)
#
usable.records <- apply(!is.na(data), 1, sum) >= p/2
means.di.complete <- means.di
means.di.complete[!usable.records] <- NA
sample.spatial.median.index <- which(means.di.complete == min(means.di.complete, na.rm = TRUE))[1] #
# Determine tuning distance d0
#
max.min.di <- max(min.di, na.rm = TRUE)
d0 <- switch(EXPR=as.character(reach),max=min(max.min.di,2*sqrt(p)),
quant=min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p)),
reach)
# if(reach=="max") {d0 <- min(max.min.di,2*sqrt(p))} else
# {d0 <- min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p))}
#
# Calculation of counterprobabilities
# counterprobabilities stocked in distances vector to save memory
# counterprobabilities are set to 1 if missing
#
if (n%%2==0) {
l.batch <- n-1
n.loops <- n/2
} else {
l.batch <- n
n.loops <- (n-1)/2
}
if(transmission.function == "step") {
for(i in 1:n.loops) {
dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
dij <- as.single(dij > d0)
dij[is.na(dij)] <- 1
counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
}
}
else {
if(transmission.function == "linear") {
for(i in 1:n.loops) {
dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
dij <- 1 - pmax(0, d0 - dij)/d0
dij[is.na(dij)] <- 1
counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
}
}
else {
if(transmission.function== "power") {
beta <- as.single((0.01^(-1/power) - 1)/d0)
for(i in 1:n.loops) {
dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
dij <- 1 - (beta * dij + 1)^(-power)
dij <- ifelse( dij>d0,1,dij)
dij[is.na(dij)] <- 1
counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
}
}
else { # default transmission function is the root function
for(i in 1:n.loops) {
dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
dij <- 1-(1-dij/d0)^(1/power)
dij <- ifelse( dij>d0,1,dij)
dij[is.na(dij)] <- 1
counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
}
}
}
}
return(invisible(list(output=c(sample.spatial.median.index,max.min.di,d0),
min.dist2nn=min.di,counterprobs=counterprobs)))
}
.nz.min <-
function(x) {
############ Non-zero non-missing minimum function ############
# B?guin,C. 2001, B. Hulliger 2015
if (sum(!is.na(x))==0) {
nz.min.temp <- NA
} else {
nz.min.temp <- min(x[x != 0], na.rm = TRUE)
}
return(nz.min.temp)
}
.ind.dij <-
function(i, j, n)
{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001
(i - 1) * n - ((i + 1) * i)/2 + j
}
.ind.dijs <-
function(i, js, n)
{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001
indices <- c(.ind.dij(js[js < i], i, n), .ind.dij(i, js[js > i], n))
return(indices[!is.na(indices)])
}
.sum.weights <-
function(observations,weights,value,lt=TRUE)
{
# sum of weights for observations < value (if lt=TRUE) or observations=value (if lt=FALSE)
if (lt) return(sum(weights*(observations<value),na.rm=TRUE)) else
return(sum(weights*(observations==value),na.rm=TRUE))
}
.sweep.operator <-
function(M,k,reverse=FALSE)
{
################## Sweep operator ##################
#
# Definition of the sweep and reverse-sweep operator (Schafer pp 159-160)
#
if (reverse) {
Gjk <- M[,k]
Hkk <- -1/M[k,k]
M <- M+(Gjk%*%t(Gjk))*Hkk
M[k,] <- M[,k] <- Gjk*Hkk
M[k,k] <- Hkk
return(M)
}
else {
Gjk <- M[,k]
Hkk <- 1/M[k,k]
M <- M-(Gjk%*%t(Gjk))*Hkk
M[k,] <- M[,k] <- Gjk*Hkk
M[k,k] <- -Hkk
return(M)
}
}
.psi.lismi <-
function(d,present,psi.par=c(2,1.25))
{
################## psi-function ##################
# Defined in Little and Smith for ER algorithm
#
#
a <- psi.par[1]
b <- psi.par[2]
d0<-sqrt(present)+a/2
return(ifelse(d-d0<=0,d,d0*exp(-(d-d0)^2/(2*b^2))))
}
.EM.normal <-
function(data, weights=rep(1,nrow(data)), n=sum(weights) ,p=ncol(data), s.counts, s.id, S,
T.obs, start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE)
{
################## ##################
################## EM for multivariate normal data ##################
################## ##################
#
# This version of EM does not contain the computation of the observed sufficient statistics,
# they will be computed in the main program of BEM and passed as parameters as well as the
# statistics on the missingness patterns.
#
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
if (Estep.output) cat("E-step ",boucle,"\n")
#
################## The E-step ##################
#
# Initializing T.tot to T.obs
#
T.tot <- T.obs
#
# Start loop on missing patterns s from 1 to S
#
for (s in 1:S)
{
#
# Identification of the indices of x.mis and x.obs
#
x.mis.id <- (1:p)[is.na(data[s.id[s],])]
x.obs.id <- (1:p)[-x.mis.id]
#
# Sweep of theta over the indices of x.obs
#
C.s <- theta
for (k in x.obs.id)
if (C.s[k+1,k+1]!=0)
{C.s <- .sweep.operator(C.s,k+1)}
#
# Start loop over all observations x having missing pattern s
#
for (i in 1:s.counts[s])
{
if (s==1) {
x <- data[i,]
weight <- weights[i]
}
else {
x <- data[s.id[s-1]+i,]
weight <- weights[s.id[s-1]+i]
}
#
# Computation of x.star=E(x.mis|x.obs)
#
x.star <- x
for (k in x.mis.id)
x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
#
# Updating T.tot
#
T.tot[1,] <- T.tot[,1] <- T.tot[1,]+weight*c(1,x.star)
T.tot[2:(p+1),2:(p+1)] <- T.tot[2:(p+1),2:(p+1)]+weight*(x.star%*%t(x.star))
T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+weight*C.s[x.mis.id+1,x.mis.id+1]
}
}
#
################## The M-step ##################
#
theta <- .sweep.operator(T.tot/n,1)
}
#
################## End of EM ##################
#
return(theta)
}
.ER.normal <-
function(data, weights=rep(1,nrow(data)), psi.par=c(2,1.25), np=sum(weights) ,p=ncol(data), s.counts, s.id, S, missing.items, nb.missing.items,
start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE,tolerance=1e-06)
{
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
if (Estep.output) cat("\n","theta: \n",theta)
break.flag <- FALSE
#
# Initialisation of robustness weights to 1
#
rob.weights <- rep(1,nrow(data))
np.hat <- np
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
if (Estep.output) cat("\n E-step ",boucle)
#
################## The E-step ##################
#
# Initializing T.tot and storing of old.theta
#
T.tot <- matrix(0,p+1,p+1)
old.theta <- theta
#
#
# Start loop on missing patterns s from 1 to S
#
for (s in 1:S)
{
#
# Identification of the indices of x.mis and x.obs
#
x.mis.id <- (1:p)[is.na(data[s.id[s],])]
x.obs.id <- (1:p)[-x.mis.id]
#
# Sweep of theta over the indices of x.obs
#
C.s <- theta
for (k in x.obs.id)
if (C.s[k+1,k+1]!=0)
{C.s <- .sweep.operator(C.s,k+1)}
#
# Start loop over all observations x having missing pattern s
#
for (i in 1:s.counts[s])
{
if (s==1) {
x <- data[i,]
weight <- weights[i]
rob.weight <- rob.weights[i]
}
else {
x <- data[s.id[s-1]+i,]
weight <- weights[s.id[s-1]+i]
rob.weight <- rob.weights[s.id[s-1]+i]
}
#
# Computation of x.star=E(x.mis|x.obs)
#
x.star <- x
for (k in x.mis.id)
x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
# robustification with robustness weights from last iteration
x.star <- x.star
#
# Updating T.tot
#
T.tot[1,] <- T.tot[,1] <- T.tot[1,]+rob.weight*weight*c(1,x.star)
T.tot[2:(p+1),2:(p+1)] <- T.tot[2:(p+1),2:(p+1)]+rob.weight*weight*(x.star%*%t(x.star))
T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+rob.weight*weight*C.s[x.mis.id+1,x.mis.id+1]
}
}
#
################## The M-step ##################
#
np.hat <- sum(weights*rob.weights)
theta <- .sweep.operator(T.tot/np.hat,1)
if (Estep.output) cat("\n","theta: \n",theta,"\n")
#
# Computation of Mahalanobis distances (marginal version)
#
ER.mean <- theta[1,2:(p+1)]
ER.var <- theta[2:(p+1),2:(p+1)]
indices <- (!missing.items[1,])
dist <- mahalanobis(data[1:s.id[1],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices])*p/(p-nb.missing.items[1])
if (S>1)
{
for (i in 2:S)
{
indices <- (!missing.items[i,])
dist <- c(dist,mahalanobis(data[(s.id[i-1]+1):s.id[i],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices,drop=FALSE])*p/(p-nb.missing.items[i]))
}
}
#
# new robustness weights
#
rob.weights <- ifelse(dist>0,.psi.lismi(sqrt(dist),apply(!is.na(data),1,sum),psi.par=psi.par)/sqrt(dist),1)
if (Estep.output) cat("\n Delta: ", signif(max(abs(theta-old.theta)),8),"\n")
if (max(abs(theta-old.theta))<tolerance) {break.flag<-T;break}
}
return(list(theta=theta,dist=dist,rob.weights=rob.weights,convergence=break.flag))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.