R/modi-internal.R
In modi: Multivariate outlier detection and imputation for incomplete survey data

.EA.dist <-
function(data, n,p,weights,reach,transmission.function, power, distance.type, maxl)
{
# Calculation of distances for EPIDEMIC Algorithm for multivariate outlier detection and imputation
# This is a utility for EAdet and EAimp
# Modular programming by Beat Hulliger
# 13.5.2009, 14.8.2009, 22.8.2014
#
# save distances, i.e. the later counterprobabilities
	counterprobs <- as.single(dist(data, method = distance.type))
	#
# The dist function handles missing values correctly except 
# if there is no overlap (see counterprob)
# 
	min.di <- rep(0, n)
	means.di <- rep(0, n)	# 
# Will be used for the sample spatial median and for d0
#
	for(i in 1:n) {
		di <- counterprobs[.ind.dijs(i, 1:n, n)]
		min.di[i] <- .nz.min(di)	#
# weighted mean of distances to account for missing distances
		means.di[i] <- sum(di * weights[ - i], na.rm = TRUE)/sum(weights[ - i][!is.na(di)])
	}
#
# Sample spatial median
# Restrict to usable observations (for sample spatial median)
#
     usable.records <- apply(!is.na(data), 1, sum) >= p/2
	means.di.complete <- means.di
	means.di.complete[!usable.records] <- NA
	sample.spatial.median.index <- which(means.di.complete == min(means.di.complete, na.rm = TRUE))[1]	#
# Determine tuning distance d0 
#
	max.min.di <- max(min.di, na.rm = TRUE)
     d0 <- switch(EXPR=as.character(reach),max=min(max.min.di,2*sqrt(p)),
                      quant=min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p)),
                      reach)
#	if(reach=="max") {d0 <- min(max.min.di,2*sqrt(p))} else
#	{d0 <- min(weighted.quantile(min.di, w = weights, prob = 1-(p+1)/n), 2 * sqrt(p))}
# 
# Calculation of counterprobabilities
# counterprobabilities stocked in distances vector to save memory
# counterprobabilities are set to 1 if missing
#
	if (n%%2==0) {
		l.batch <- n-1
	    n.loops <- n/2
	} else {
		l.batch <- n
		n.loops <- (n-1)/2
	}
	if(transmission.function == "step") {
		for(i in 1:n.loops) {
	            dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        dij <- as.single(dij > d0)
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
		}
	}
	else {
		if(transmission.function == "linear") {
			for(i in 1:n.loops) {
	            dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        dij <- 1 - pmax(0, d0 - dij)/d0
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
			}
		}
		else {
			if(transmission.function== "power") {
			beta <- as.single((0.01^(-1/power) - 1)/d0)			
			for(i in 1:n.loops) {
	                    dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		            dij <- 1 - (beta * dij + 1)^(-power)
			    dij <- ifelse( dij>d0,1,dij)
			    dij[is.na(dij)] <- 1
			    counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
			}
			}
			else { # default transmission function is the root function
				for(i in 1:n.loops) {
	           	        dij <- counterprobs[(i-1)*l.batch+(1:l.batch)]
		        	dij <- 1-(1-dij/d0)^(1/power)
			     	dij <- ifelse( dij>d0,1,dij)
			     	dij[is.na(dij)] <- 1
			     	counterprobs[(i-1)*l.batch+(1:l.batch)] <- as.single(dij)
					}
				}
		}
	}
return(invisible(list(output=c(sample.spatial.median.index,max.min.di,d0),
                      min.dist2nn=min.di,counterprobs=counterprobs)))
}

.nz.min <-
function(x) {
############ Non-zero non-missing minimum function ############
# B?guin,C. 2001, B. Hulliger 2015
  if (sum(!is.na(x))==0) {
    nz.min.temp <- NA
  } else {
	  nz.min.temp <- min(x[x != 0], na.rm = TRUE)
  }
  return(nz.min.temp)  
}

.ind.dij <-
function(i, j, n)
	{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001

		(i - 1) * n - ((i + 1) * i)/2 + j
	}

.ind.dijs <-
function(i, js, n)
	{############ Addressing function for Epidemic Algorithm ############
# B?guin, C. 2001
		indices <- c(.ind.dij(js[js < i], i, n), .ind.dij(i, js[js > i], n))
		return(indices[!is.na(indices)])
	}

.sum.weights <-
function(observations,weights,value,lt=TRUE)
{
# sum of weights for observations < value (if lt=TRUE) or observations=value (if lt=FALSE)
if (lt) return(sum(weights*(observations<value),na.rm=TRUE)) else 
return(sum(weights*(observations==value),na.rm=TRUE))
}

.sweep.operator <-
function(M,k,reverse=FALSE) 
{   
################## Sweep operator ##################
#
# Definition of the sweep and reverse-sweep operator (Schafer pp 159-160)
#
if (reverse) {
    Gjk <- M[,k]
    Hkk <- -1/M[k,k]
    M <- M+(Gjk%*%t(Gjk))*Hkk
    M[k,] <- M[,k] <- Gjk*Hkk
    M[k,k] <- Hkk
    return(M)      
}
else {
    Gjk <- M[,k]
    Hkk <- 1/M[k,k]
    M <- M-(Gjk%*%t(Gjk))*Hkk
    M[k,] <- M[,k] <- Gjk*Hkk
    M[k,k] <- -Hkk
    return(M)        
}
}

.psi.lismi <-
function(d,present,psi.par=c(2,1.25))
{
################## psi-function ##################
# Defined in Little and Smith for ER algorithm
# 
#
    a <- psi.par[1]
    b <- psi.par[2]
    d0<-sqrt(present)+a/2
    return(ifelse(d-d0<=0,d,d0*exp(-(d-d0)^2/(2*b^2))))
}


.EM.normal <-
function(data, weights=rep(1,nrow(data)), n=sum(weights) ,p=ncol(data), s.counts, s.id, S, 
                            T.obs, start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE)
{
##################                                  ##################
##################  EM for multivariate normal data ##################
##################                                  ##################
#
# This version of EM does not contain the computation of the observed sufficient statistics,
# they will be computed in the main program of BEM and passed as parameters as well as the
# statistics on the missingness patterns.
#
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
    if (Estep.output) cat("E-step ",boucle,"\n")
    #
    ################## The E-step ##################
    #
    # Initializing T.tot to T.obs
    # 
    T.tot <- T.obs
    # 
    # Start loop on missing patterns s from 1 to S
    #
    for (s in 1:S)
    {
        #
        # Identification of the indices of x.mis and x.obs
        #
        x.mis.id <- (1:p)[is.na(data[s.id[s],])]
        x.obs.id <- (1:p)[-x.mis.id]
        #
        # Sweep of theta over the indices of x.obs
        #
        C.s <- theta
        for (k in x.obs.id)
            if (C.s[k+1,k+1]!=0) 
                {C.s <- .sweep.operator(C.s,k+1)}
        #
        # Start loop over all observations x having missing pattern s
        #
        for (i in 1:s.counts[s])
        {
            if (s==1) {
                            x <- data[i,]
                            weight <- weights[i]
                        }
            else        {
                            x <- data[s.id[s-1]+i,]
                            weight <- weights[s.id[s-1]+i]
                        }
            #
            # Computation of x.star=E(x.mis|x.obs)
            #
            x.star <- x
            for (k in x.mis.id)
                x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
            #
            # Updating T.tot
            #
            T.tot[1,] <- T.tot[,1] <- T.tot[1,]+weight*c(1,x.star)      
            T.tot[2:(p+1),2:(p+1)] <-   T.tot[2:(p+1),2:(p+1)]+weight*(x.star%*%t(x.star))
            T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+weight*C.s[x.mis.id+1,x.mis.id+1]
        }
    }
    #
    ################## The M-step ##################
    #
    theta <- .sweep.operator(T.tot/n,1)  
}
#
################## End of EM ##################
#
return(theta)
}


.ER.normal <-
function(data, weights=rep(1,nrow(data)), psi.par=c(2,1.25), np=sum(weights) ,p=ncol(data), s.counts, s.id, S, missing.items, nb.missing.items,
                            start.mean=rep(0,p),start.var=diag(1,p),numb.it=10,Estep.output=FALSE,tolerance=1e-06)
{
#
################## Initialization ##################
#
# Creates theta which is the matrix form of the initial parameter used by EM
#
theta <- matrix(0,p+1,p+1)
theta[1,1] <- -1
theta[1,2:(p+1)] <- theta[2:(p+1),1] <- start.mean
theta[2:(p+1),2:(p+1)] <- start.var
if (Estep.output) cat("\n","theta: \n",theta)

break.flag <- FALSE
#
# Initialisation of robustness weights to 1
#
rob.weights <- rep(1,nrow(data))
np.hat <- np
#
################## Iterations of EM ##################
#
for (boucle in 1:numb.it)
{
    if (Estep.output) cat("\n E-step ",boucle)
    #
    ################## The E-step ##################
    #
    # Initializing T.tot and storing of old.theta
    # 
    T.tot <- matrix(0,p+1,p+1)
    old.theta <- theta
    # 
    # 
    # Start loop on missing patterns s from 1 to S
    #
    for (s in 1:S)
    {
        #
        # Identification of the indices of x.mis and x.obs
        #
        x.mis.id <- (1:p)[is.na(data[s.id[s],])]
        x.obs.id <- (1:p)[-x.mis.id]
        #
        # Sweep of theta over the indices of x.obs
        #
        C.s <- theta
        for (k in x.obs.id)
            if (C.s[k+1,k+1]!=0) 
                {C.s <- .sweep.operator(C.s,k+1)}
        #
        # Start loop over all observations x having missing pattern s
        #
        for (i in 1:s.counts[s])
        {
            if (s==1) {
                            x <- data[i,]
                            weight <- weights[i] 
                            rob.weight <- rob.weights[i]
                        }
            else        {
                            x <- data[s.id[s-1]+i,]
                            weight <- weights[s.id[s-1]+i] 
                            rob.weight <- rob.weights[s.id[s-1]+i]
                        }
            #
            # Computation of x.star=E(x.mis|x.obs)
            #
            x.star <- x
            for (k in x.mis.id)
                x.star[k] <- C.s[1,k+1]+sum(C.s[x.obs.id+1,k+1]*x[x.obs.id])
            # robustification with robustness weights from last iteration
            x.star <- x.star 
            #
            # Updating T.tot
            #
            T.tot[1,] <- T.tot[,1] <- T.tot[1,]+rob.weight*weight*c(1,x.star)     
            T.tot[2:(p+1),2:(p+1)] <-   T.tot[2:(p+1),2:(p+1)]+rob.weight*weight*(x.star%*%t(x.star))
            T.tot[x.mis.id+1,x.mis.id+1] <- T.tot[x.mis.id+1,x.mis.id+1]+rob.weight*weight*C.s[x.mis.id+1,x.mis.id+1]
        }
    }
    #
    ################## The M-step ##################
    #
    np.hat <- sum(weights*rob.weights)
    theta <- .sweep.operator(T.tot/np.hat,1) 
    if (Estep.output) cat("\n","theta: \n",theta,"\n")
    #
    # Computation of Mahalanobis distances (marginal version)
    #
        ER.mean <- theta[1,2:(p+1)]
        ER.var <- theta[2:(p+1),2:(p+1)]
        indices <- (!missing.items[1,])
        dist <- mahalanobis(data[1:s.id[1],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices])*p/(p-nb.missing.items[1])
        if (S>1)
        {
            for (i in 2:S)
            {
                indices <- (!missing.items[i,])
                dist <- c(dist,mahalanobis(data[(s.id[i-1]+1):s.id[i],indices,drop=FALSE],ER.mean[indices],ER.var[indices,indices,drop=FALSE])*p/(p-nb.missing.items[i]))
            }
        }
    #
    # new robustness weights
    #
    rob.weights <- ifelse(dist>0,.psi.lismi(sqrt(dist),apply(!is.na(data),1,sum),psi.par=psi.par)/sqrt(dist),1)
    if (Estep.output) cat("\n Delta: ", signif(max(abs(theta-old.theta)),8),"\n")
    if (max(abs(theta-old.theta))<tolerance) {break.flag<-T;break}
}  
return(list(theta=theta,dist=dist,rob.weights=rob.weights,convergence=break.flag))
}

Any scripts or data that you put into this service are public.

modi documentation built on May 2, 2019, 6:48 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

modi
Multivariate outlier detection and imputation for incomplete survey data

R/modi-internal.R
In modi: Multivariate outlier detection and imputation for incomplete survey data

Defines functions .EA.dist .nz.min .ind.dij .ind.dijs .sum.weights .sweep.operator .psi.lismi .EM.normal .ER.normal

Try the modi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

modi Multivariate outlier detection and imputation for incomplete survey data

R/modi-internal.R In modi: Multivariate outlier detection and imputation for incomplete survey data

Defines functions .EA.dist .nz.min .ind.dij .ind.dijs .sum.weights .sweep.operator .psi.lismi .EM.normal .ER.normal

Try the modi package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

modi
Multivariate outlier detection and imputation for incomplete survey data

R/modi-internal.R
In modi: Multivariate outlier detection and imputation for incomplete survey data