R/modi-internal.R

Defines functions .EA.dist .nz.min .ind.dij .ind.dijs .sum.weights .sweep.operator .psi.lismi .EM.normal .ER.normal

.EA.dist <-
function(data, n,p,weights,reach,transmission.function, power, distance.type, maxl)
{
# Calculation of distances for EPIDEMIC Algorithm for multivariate outlier detection and imputation
# This is a utility for EAdet and EAimp
# Modular programming by Beat Hulliger
# 13.5.2009, 14.8.2009, 22.8.2014
#
# save distances, i.e. the later counterprobabilities
	counterprobs <- as.single(dist(data, method = distance.type))
	#
# The dist function handles missing values correctly except 
# if there is no overlap (see counterprob)
# 
	min.di <- rep(0, n)
	means.di <- rep(0, n)	# 
# Will be used for the sample spatial median and for d0
#
	for(i in 1:n) {
		di <- counterprobs[.ind.dijs(i, 1:n, n)]
		min.di[i] <- .nz.min(di)	#
# weighted mean of distances to account for missing distances
		means.di[i] <- sum(di * weights[ - i], na.rm = TRUE)/sum(weights[ - i][!is.na(di)])
	}
#
# Sample spatial median
# Restrict to usable observations (for sample spatial median)
#
     usable.records <- apply(!is.na(data), 1, sum) >= p/2
	means.di.complete <- means.di
	means.di.complete[!usable.records] <- NA
	sample.spatial.median.index <- which(means.di.complete == min(means.di.complete, na.rm = TRUE))[1]	#
# Determine tuning distance d0 
#
	max.min.di <- max(min.di, na.rm = TRUE)
     d0 <- switch(EXPR=as.character(reach),max=min(max.min.di,2*sqrt(p)),
                      quant=min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p)),
                      reach)
#	if(reach=="max") {d0 <- min(max.min.di,2*sqrt(p))} else
#	{d0 <- min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p))}
# 
# Calculation of counterprobabilities
# counterprobabilities stocked in distances vector to save memory
# counterprobabilities are set to 1 if missing
#
	if (n%%2==0) {
		l.batch <- n-1
	    n.loops <- n/2
	} else {
		l.batch <- n
		n.loops <- (n-1)/2
	}
	if(transmission.function == "step") {
		for(i in 1:n.loops) {
	            dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        dij <- as.single(dij > d0)
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
		}
	}
	else {
		if(transmission.function == "linear") {
			for(i in 1:n.loops) {
	            dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        dij <- 1 - pmax(0, d0 - dij)/d0
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
			}
		}
		else {
			if(transmission.function== "power") {
			beta <- as.single((0.01^(-1/power) - 1)/d0)			
			for(i in 1:n.loops) {
	                    dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		            dij <- 1 - (beta * dij + 1)^(-power)
			    dij <- ifelse( dij>d0,1,dij)
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
			}
			}
			else { # default transmission function is the root function
				for(i in 1:n.loops) {
	           	        dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        	dij <- 1-(1-dij/d0)^(1/power)
			     	dij <- ifelse( dij>d0,1,dij)
			     	dij[is.na(dij)] <- 1
			     	counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
					}
				}
		}
	}
return(invisible(list(output=c(sample.spatial.median.index,max.min.di,d0),
                      min.dist2nn=min.di,counterprobs=counterprobs)))
}

.nz.min <-
function(x) {
############ Non-zero non-missing minimum function ############
# B?guin,C. 2001, B. Hulliger 2015
  if (sum(!is.na(x))==0) {
    nz.min.temp <- NA
  } else {
	  nz.min.temp <- min(x[x != 0], na.rm = TRUE)
  }
  return(nz.min.temp)  
}

.ind.dij <-
function(i, j, n)
	{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001

		(i - 1) * n - ((i + 1) * i)/2 + j
	}

.ind.dijs <-
function(i, js, n)
	{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001
		indices <- c(.ind.dij(js[js < i], i, n), .ind.dij(i, js[js > i], n))
		return(indices[!is.na(indices)])
	}

.sum.weights <-
function(observations,weights,value,lt=TRUE)
{
# sum of weights for observations < value (if lt=TRUE) or observations=value (if lt=FALSE)
if (lt) return(sum(weights*(observations<value),na.rm=TRUE)) else 
return(sum(weights*(observations==value),na.rm=TRUE))
}

.sweep.operator <-
function(M,k,reverse=FALSE) 
{   
################## Sweep operator ##################
#
# Definition of the sweep and reverse-sweep operator (Schafer pp 159-160)
#
if (reverse) {
    Gjk <- M[,k]
    Hkk <- -1/M[k,k]
    M <- M+(Gjk%*%t(Gjk))*Hkk
    M[k,] <- M[,k] <- Gjk*Hkk
    M[k,k] <- Hkk
    return(M)      
}
else {
    Gjk <- M[,k]
    Hkk <- 1/M[k,k]
    M <- M-(Gjk%*%t(Gjk))*Hkk
    M[k,] <- M[,k] <- Gjk*Hkk
    M[k,k] <- -Hkk
    return(M)        
}
}

.psi.lismi <-
function(d,present,psi.par=c(2,1.25))
{
################## psi-function ##################
# Defined in Little and Smith for ER algorithm
# 
#
    a <- psi.par[1]
    b <- psi.par[2]
    d0<-sqrt(present)+a/2
    return(ifelse(d-d0<=0,d,d0*exp(-(d-d0)^2/(2*b^2))))
}


.EM.normal <-
function(data, weights=rep(1,nrow(data)), n=sum(weights) ,p=ncol(data), s.counts, s.id, S, 
                            T.obs, start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE)
{
##################                                  ##################
##################  EM for multivariate normal data ##################
##################                                  ##################
#
# This version of EM does not contain the computation of the observed sufficient statistics,
# they will be computed in the main program of BEM and passed as parameters as well as the
# statistics on the missingness patterns.
#
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
    if (Estep.output) cat("E-step ",boucle,"\n")
    #
    ################## The E-step ##################
    #
    # Initializing T.tot to T.obs
    # 
    T.tot <- T.obs
    # 
    # Start loop on missing patterns s from 1 to S
    #
    for (s in 1:S)
    {
        #
        # Identification of the indices of x.mis and x.obs
        #
        x.mis.id <- (1:p)[is.na(data[s.id[s],])]
        x.obs.id <- (1:p)[-x.mis.id]
        #
        # Sweep of theta over the indices of x.obs
        #
        C.s <- theta
        for (k in x.obs.id)
            if (C.s[k+1,k+1]!=0) 
                {C.s <- .sweep.operator(C.s,k+1)}
        #
        # Start loop over all observations x having missing pattern s
        #
        for (i in 1:s.counts[s])
        {
            if (s==1) {
                            x <- data[i,]
                            weight <- weights[i]
                        }
            else        {
                            x <- data[s.id[s-1]+i,]
                            weight <- weights[s.id[s-1]+i]
                        }
            #
            # Computation of x.star=E(x.mis|x.obs)
            #
            x.star <- x
            for (k in x.mis.id)
                x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
            #
            # Updating T.tot
            #
            T.tot[1,] <- T.tot[,1] <- T.tot[1,]+weight*c(1,x.star)      
            T.tot[2:(p+1),2:(p+1)] <-   T.tot[2:(p+1),2:(p+1)]+weight*(x.star%*%t(x.star))
            T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+weight*C.s[x.mis.id+1,x.mis.id+1]
        }
    }
    #
    ################## The M-step ##################
    #
    theta <- .sweep.operator(T.tot/n,1)  
}
#
################## End of EM ##################
#
return(theta)
}


.ER.normal <-
function(data, weights=rep(1,nrow(data)), psi.par=c(2,1.25), np=sum(weights) ,p=ncol(data), s.counts, s.id, S, missing.items, nb.missing.items,
                            start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE,tolerance=1e-06)
{
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
if (Estep.output) cat("\n","theta: \n",theta)

break.flag <- FALSE
#
# Initialisation of robustness weights to 1
#
rob.weights <- rep(1,nrow(data))
np.hat <- np
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
    if (Estep.output) cat("\n E-step ",boucle)
    #
    ################## The E-step ##################
    #
    # Initializing T.tot and storing of old.theta
    # 
    T.tot <- matrix(0,p+1,p+1)
    old.theta <- theta
    # 
    # 
    # Start loop on missing patterns s from 1 to S
    #
    for (s in 1:S)
    {
        #
        # Identification of the indices of x.mis and x.obs
        #
        x.mis.id <- (1:p)[is.na(data[s.id[s],])]
        x.obs.id <- (1:p)[-x.mis.id]
        #
        # Sweep of theta over the indices of x.obs
        #
        C.s <- theta
        for (k in x.obs.id)
            if (C.s[k+1,k+1]!=0) 
                {C.s <- .sweep.operator(C.s,k+1)}
        #
        # Start loop over all observations x having missing pattern s
        #
        for (i in 1:s.counts[s])
        {
            if (s==1) {
                            x <- data[i,]
                            weight <- weights[i] 
                            rob.weight <- rob.weights[i]
                        }
            else        {
                            x <- data[s.id[s-1]+i,]
                            weight <- weights[s.id[s-1]+i] 
                            rob.weight <- rob.weights[s.id[s-1]+i]
                        }
            #
            # Computation of x.star=E(x.mis|x.obs)
            #
            x.star <- x
            for (k in x.mis.id)
                x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
            # robustification with robustness weights from last iteration
            x.star <- x.star 
            #
            # Updating T.tot
            #
            T.tot[1,] <- T.tot[,1] <- T.tot[1,]+rob.weight*weight*c(1,x.star)     
            T.tot[2:(p+1),2:(p+1)] <-   T.tot[2:(p+1),2:(p+1)]+rob.weight*weight*(x.star%*%t(x.star))
            T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+rob.weight*weight*C.s[x.mis.id+1,x.mis.id+1]
        }
    }
    #
    ################## The M-step ##################
    #
    np.hat <- sum(weights*rob.weights)
    theta <- .sweep.operator(T.tot/np.hat,1) 
    if (Estep.output) cat("\n","theta: \n",theta,"\n")
    #
    # Computation of Mahalanobis distances (marginal version)
    #
        ER.mean <- theta[1,2:(p+1)]
        ER.var <- theta[2:(p+1),2:(p+1)]
        indices <- (!missing.items[1,])
        dist <- mahalanobis(data[1:s.id[1],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices])*p/(p-nb.missing.items[1])
        if (S>1)
        {
            for (i in 2:S)
            {
                indices <- (!missing.items[i,])
                dist <- c(dist,mahalanobis(data[(s.id[i-1]+1):s.id[i],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices,drop=FALSE])*p/(p-nb.missing.items[i]))
            }
        }
    #
    # new robustness weights
    #
    rob.weights <- ifelse(dist>0,.psi.lismi(sqrt(dist),apply(!is.na(data),1,sum),psi.par=psi.par)/sqrt(dist),1)
    if (Estep.output) cat("\n Delta: ", signif(max(abs(theta-old.theta)),8),"\n")
    if (max(abs(theta-old.theta))<tolerance) {break.flag<-T;break}
}  
return(list(theta=theta,dist=dist,rob.weights=rob.weights,convergence=break.flag))
}

Try the modi package in your browser

Any scripts or data that you put into this service are public.

modi documentation built on May 2, 2019, 6:48 p.m.