R/modi-internal.R

Defines functions ER.normal EM.normal psi.lismi sweep.operator weightsum ind.dijs ind.dij nz.min EA.dist

Documented in EA.dist EM.normal ER.normal ind.dij ind.dijs nz.min psi.lismi sweep.operator weightsum

#' Utility function for EAdet and EAimp
#'
#' Calculation of distances for EPIDEMIC Algorithm for
#' multivariate outlier detection and imputation
#'
#' @param data a data frame or matrix with data.
#' @param n number of rows.
#' @param p number of columns.
#' @param weights a vector of positive sampling weights.
#' @param reach if \code{reach = "max"} the maximal nearest
#' neighbor distance is used as the basis for the transmission
#' function, otherwise the weighted
#' @param transmission.function form of the transmission function
#' of distance d: \code{"step"} is a heaviside function which jumps
#' to \code{1} at \code{d0}, \code{"linear"} is linear between
#' \code{0} and \code{d0}, \code{"power"} is \code{(beta*d+1)^(-p)}
#' for \code{p = ncol(data)} as default, \code{"root"} is the function
#' \code{1-(1-d/d0)^(1/maxl)}.
#' @param power sets \code{p = power}.
#' @param distance.type distance type in function \code{dist()}.
#' @param maxl maximum number of steps without infection.
#' @keywords internal
#' @author Beat Hulliger
#' @export
#' @importFrom stats dist
EA.dist <- function(data, n, p, weights, reach,
                    transmission.function, power,
                    distance.type, maxl) {

    # save distances, i.e. the later counterprobabilities
    counterprobs <- as.single(dist(data, method = distance.type))

	# The dist function handles missing values correctly except
	# if there is no overlap (see counterprob)
	min.di <- rep(0, n)
	means.di <- rep(0, n)

	# Will be used for the sample spatial median and for d0
	for (i in 1:n) {
	    di <- counterprobs[ind.dijs(i, 1:n, n)]
		  min.di[i] <- nz.min(di)

		  # weighted mean of distances to account for missing distances
		  means.di[i] <- sum(di * weights[-i], na.rm = TRUE) /
            sum(weights[-i][!is.na(di)])
	}

	# Sample spatial median
    # Restrict to usable observations (for sample spatial median)
	usable.records <- apply(!is.na(data), 1, sum) >= p / 2
	means.di.complete <- means.di
	means.di.complete[!usable.records] <- NA
	sample.spatial.median.index <- which(means.di.complete ==
        min(means.di.complete, na.rm = TRUE))[1]

	# Determine tuning distance d0
	max.min.di <- max(min.di, na.rm = TRUE)
    d0 <- if (is.numeric(reach)) {
        reach
    } else {
        switch(reach,
            "max" = min(max.min.di, 2 * sqrt(p)),
            "quant" = min(weighted.quantile(min.di, w = weights,
                prob = 1 - (p + 1) / n), 2 * sqrt(p)),
            stop("argument 'reach' is not defined\n"))
    }

	# Calculation of counterprobabilities
    # counterprobabilities stocked in distances vector to save memory
    # counterprobabilities are set to 1 if missing
	if (n%%2 == 0) {
	    l.batch <- n - 1
	    n.loops <- n / 2
	} else {
	    l.batch <- n
		n.loops <- (n - 1) / 2
	}

	if (transmission.function == "step") {
        for (i in 1:n.loops) {
	        dij <- counterprobs[(i - 1) * l.batch + (1:l.batch)]
	        dij <- as.single(dij > d0)
	        dij[is.na(dij)] <- 1
	        counterprobs[(i - 1) * l.batch + (1:l.batch)] <- as.single(dij)
	    }

    } else {
        if(transmission.function == "linear") {
			for(i in 1:n.loops) {
			  dij <- counterprobs[(i - 1) * l.batch + (1:l.batch)]
			  dij <- 1 - pmax(0, d0 - dij) / d0
			  dij[is.na(dij)] <- 1
			  counterprobs[(i - 1) * l.batch + (1:l.batch)] <- as.single(dij)
			}

	    } else {
            if (transmission.function == "power") {
                beta <- as.single((0.01^(-1 / power) - 1) / d0)

                for (i in 1:n.loops) {
                    dij <- counterprobs[(i - 1) * l.batch + (1:l.batch)]
                    dij <- 1 - (beta * dij + 1)^(-power)
                    dij <- ifelse(dij > d0, 1, dij)
                    dij[is.na(dij)] <- 1
                    counterprobs[(i - 1) * l.batch + (1:l.batch)] <- as.single(dij)
                  }

            } else { # default transmission function is the root function
                for (i in 1:n.loops) {
                    dij <- counterprobs[(i - 1) * l.batch + (1:l.batch)]
                    dij <- 1 - (1 - dij / d0)^(1 / power)
                    dij <- ifelse(dij > d0, 1, dij)
                    dij[is.na(dij)] <- 1
                    counterprobs[(i - 1) * l.batch + (1:l.batch)] <- as.single(dij)
                }
            }
        }
    }

	return(invisible(list(
	  sample.spatial.median.index = sample.spatial.median.index,
	  max.min.di = max.min.di,
	  transmission.distance = d0,
	  min.di = min.di,
	  counterprobs = counterprobs)))
}



#' Non-zero non-missing minimum function
#'
#' Returns the non-zero non-missing minimum.
#'
#' @param x vector of data.
#' @keywords internal
#' @author Beat Hulliger
#' @export
nz.min <- function(x) {
  if (sum(!is.na(x)) == 0) {
    nz.min.temp <- NA
  } else {
    nz.min.temp <- min(x[x != 0], na.rm = TRUE)
  }
  return(nz.min.temp)
}



#' Addressing function for Epidemic Algorithm
#'
#' Utility function for Epidemic Algorithm.
#'
#' @param i index i.
#' @param j index j.
#' @param n number of rows.
#' @keywords internal
#' @author Cédric Béguin
#' @export
ind.dij <- function(i, j, n) {
  return((i - 1) * n - ((i + 1) * i) / 2 + j)
}



#' Addressing function for Epidemic Algorithm
#'
#' Utility function for Epidemic Algorithm.
#'
#' @param i index i.
#' @param js indexes js.
#' @param n number of rows.
#' @keywords internal
#' @author Cédric Béguin
#' @export
ind.dijs <- function(i, js, n){
  indices <- c(ind.dij(js[js < i], i, n), ind.dij(i, js[js > i], n))
  return(indices[!is.na(indices)])
}



#' Utility function for TRC.R among others
#'
#' Sum of weights for observations < value (if lt = TRUE)
#' or observations=value (if lt = FALSE).
#'
#' @param observations vector of observations.
#' @param weights vector of weights.
#' @param value value.
#' @param lt either \code{TRUE} or \code{FALSE}.
#' @keywords internal
#' @author Beat Hulliger
#' @export
weightsum <- function(observations, weights, value, lt = TRUE) {
  if (lt) {
    return(sum(weights * (observations < value), na.rm = TRUE))
  } else {
    return(sum(weights * (observations == value), na.rm = TRUE))
  }
}



#' Sweep operator
#'
#' Definition of the sweep and reverse-sweep operator (Schafer pp 159-160)
#'
#' @param M a matrix.
#' @param k column.
#' @param reverse either \code{TRUE} or \code{FALSE}.
#' @keywords internal
#' @author Beat Hulliger
#' @export
sweep.operator <- function(M, k, reverse = FALSE) {
  if (reverse) {
    Gjk <- M[ ,k]
    Hkk <- -1 / M[k,k]
    M <- M + (Gjk %*% t(Gjk)) * Hkk
    M[k, ] <- M[ ,k] <- Gjk * Hkk
    M[k,k] <- Hkk
    return(M)
  } else {
    Gjk <- M[ ,k]
    Hkk <- 1 / M[k,k]
    M <- M - (Gjk %*% t(Gjk)) * Hkk
    M[k, ] <- M[ ,k] <- Gjk * Hkk
    M[k,k] <- -Hkk
    return(M)
  }
}



#' psi-function
#'
#' Defined in Little and Smith for ER algorithm
#'
#' @param d vector of distances.
#' @param present present.
#' @param psi.par parameters.
#' @keywords internal
#' @author Beat Hulliger
#' @export
psi.lismi <- function(d, present, psi.par = c(2, 1.25)) {
  a <- psi.par[1]
  b <- psi.par[2]
  d0 <- sqrt(present) + a / 2
  return(ifelse(d - d0 <= 0, d, d0 * exp(-(d - d0)^2 / (2 * b^2))))
}



#' EM for multivariate normal data
#'
#' This version of EM does not contain the computation of the
#' observed sufficient statistics, they will be computed in the
#' main program of BEM and passed as parameters as well as the
#' statistics on the missingness patterns.
#'
#' @param data matrix or dataframe with data.
#' @param weights vector of weights.
#' @param n number of rows.
#' @param p number of columns.
#' @param s.counts s.counts.
#' @param s.id s.id.
#' @param S S.
#' @param T.obs T.obs.
#' @param start.mean initial center.
#' @param start.var initial variance.
#' @param numb.it numb.it.
#' @param Estep.output Estep.output.
#' @keywords internal
#' @author Beat Hulliger
#' @export
EM.normal <- function(data, weights = rep(1, nrow(data)), n = sum(weights),
                       p = ncol(data), s.counts, s.id, S, T.obs,
                       start.mean = rep(0, p), start.var = diag(1, p),
                       numb.it = 10, Estep.output = FALSE) {

  # ------- Initialization -------

  # Creates theta which is the matrix form of the initial
  # parameter used by EM
  theta <- matrix(0, p + 1, p + 1)
  theta[1,1] <- -1
  theta[1,2:(p + 1)] <- theta[2:(p + 1), 1] <- start.mean
  theta[2:(p + 1),2:(p + 1)] <- start.var

  # ------- Iterations of EM -------
  for (boucle in 1:numb.it) {

    if (Estep.output) {cat("E-step ", boucle, "\n")}

    # ------- The E-step -------

    # Initializing T.tot to T.obs
    T.tot <- T.obs

    # Start loop on missing patterns s from 1 to S
    for (s in 1:S) {

      # Identification of the indices of x.mis and x.obs
      x.mis.id <- (1:p)[is.na(data[s.id[s], ])]
      x.obs.id <- (1:p)[-x.mis.id]

      # Sweep of theta over the indices of x.obs
      C.s <- theta

      for (k in x.obs.id) {
        if (C.s[k + 1, k + 1] != 0) {
          C.s <- sweep.operator(C.s, k + 1)
        }
      }

      # Start loop over all observations x having missing pattern s
      for (i in 1:s.counts[s]) {
        if (s == 1) {
          x <- data[i, ]
          weight <- weights[i]
        } else {
          x <- data[s.id[s - 1] + i, ]
          weight <- weights[s.id[s - 1] + i]
        }

        # Computation of x.star = E(x.mis|x.obs)
        x.star <- x

        for (k in x.mis.id) {
          x.star[k] <- C.s[1, k + 1] + sum(C.s[x.obs.id + 1, k + 1] * x[x.obs.id])
        }

        # Updating T.tot
        T.tot[1, ] <- T.tot[ ,1] <- T.tot[1, ] + weight * c(1, x.star)
        T.tot[2:(p + 1), 2:(p + 1)] <- T.tot[2:(p + 1), 2:(p + 1)] + weight * (x.star %*% t(x.star))
        T.tot[x.mis.id + 1, x.mis.id + 1] <- T.tot[x.mis.id + 1, x.mis.id + 1] + weight * C.s[x.mis.id + 1, x.mis.id + 1]

      }

    }

    # ------- The M-step -------
    theta <- sweep.operator(T.tot / n, 1)

  }

  # ------- End of EM -------
  return(theta)

}



#' Utility for ER function
#'
#' The \code{ER} function is an implementation of the
#' ER-algorithm of Little and Smith (1987).
#'
#' @param data matrix or dataframe with data.
#' @param weights vector of weights.
#' @param psi.par parameters for psi function.
#' @param np np.
#' @param p number of columns.
#' @param s.counts s.counts.
#' @param s.id s.id.
#' @param S S.
#' @param missing.items missing items.
#' @param nb.missing.items number of missing items.
#' @param start.mean initial center.
#' @param start.var initial variance.
#' @param numb.it number of iterations.
#' @param Estep.output Estep.output.
#' @param tolerance tolerance.
#' @keywords internal
#' @author Beat Hulliger
#' @export
ER.normal <- function(data, weights = rep(1, nrow(data)),
                       psi.par = c(2, 1.25), np = sum(weights),
                       p = ncol(data), s.counts, s.id, S, missing.items,
                       nb.missing.items, start.mean = rep(0, p),
                       start.var = diag(1, p), numb.it = 10,
                       Estep.output = FALSE, tolerance = 1e-06) {

  # ------- Initialization -------

  # Creates theta which is the matrix form of the
  # initial parameter used by EM
  theta <- matrix(0, p + 1, p + 1)
  theta[1,1] <- -1
  theta[1, 2:(p + 1)] <- theta[2:(p + 1), 1] <- start.mean
  theta[2:(p + 1), 2:(p + 1)] <- start.var

  if (Estep.output) {cat("\n", "theta: \n", theta)}

  break.flag <- FALSE

  # Initialisation of robustness weights to 1
  rob.weights <- rep(1, nrow(data))
  np.hat <- np

  # ------- Iterations of EM -------

  for (boucle in 1:numb.it) {

    if (Estep.output) {cat("\n E-step ", boucle)}

    # ------- The E-step -------

    # Initializing T.tot and storing of old.theta
    T.tot <- matrix(0, p + 1, p + 1)
    old.theta <- theta

    # Start loop on missing patterns s from 1 to S
    for (s in 1:S) {

      # Identification of the indices of x.mis and x.obs
      x.mis.id <- (1:p)[is.na(data[s.id[s], ])]
      x.obs.id <- (1:p)[-x.mis.id]

      # Sweep of theta over the indices of x.obs
      C.s <- theta

      for (k in x.obs.id) {
        if (C.s[k + 1, k + 1] != 0) {
          C.s <- sweep.operator(C.s, k + 1)
        }
      }

      # Start loop over all observations x having missing pattern s
      for (i in 1:s.counts[s]) {
        if (s == 1) {
          x <- data[i, ]
          weight <- weights[i]
          rob.weight <- rob.weights[i]
        } else {
          x <- data[s.id[s - 1] + i, ]
          weight <- weights[s.id[s - 1] + i]
          rob.weight <- rob.weights[s.id[s - 1] + i]
        }

        # Computation of x.star = E(x.mis|x.obs)
        x.star <- x

        for (k in x.mis.id) {
          x.star[k] <- C.s[1, k + 1] + sum(C.s[x.obs.id + 1, k + 1] * x[x.obs.id])
        }

        # robustification with robustness weights from last iteration
        x.star <- x.star

        # Updating T.tot
        T.tot[1, ] <- T.tot[ ,1] <- T.tot[1, ] + rob.weight * weight * c(1, x.star)
        T.tot[2:(p + 1), 2:(p + 1)] <- T.tot[2:(p + 1), 2:(p + 1)] + rob.weight * weight * (x.star %*% t(x.star))
        T.tot[x.mis.id + 1, x.mis.id + 1] <- T.tot[x.mis.id + 1, x.mis.id + 1] + rob.weight * weight * C.s[x.mis.id + 1, x.mis.id + 1]
      }
    }

    # ------- The M-step -------

    np.hat <- sum(weights * rob.weights)
    theta <- sweep.operator(T.tot / np.hat, 1)

    if (Estep.output) {cat("\n", "theta: \n", theta, "\n")}

    # Computation of Mahalanobis distances (marginal version)
    ER.mean <- theta[1, 2:(p + 1)]
    ER.var <- theta[2:(p + 1), 2:(p + 1)]
    indices <- (!missing.items[1, ])
    dist <- mahalanobis(data[1:s.id[1], indices, drop = FALSE],
                        ER.mean[indices],
                        ER.var[indices, indices]) * p / (p - nb.missing.items[1])

    if (S > 1) {

      for (i in 2:S) {
        indices <- (!missing.items[i, ])
        dist <-
          c(dist,
            mahalanobis(data[(s.id[i - 1] + 1):s.id[i], indices, drop = FALSE],
                        ER.mean[indices],
                        ER.var[indices, indices, drop = FALSE]) * p / (p - nb.missing.items[i]))
      }

    }

    # new robustness weights
    rob.weights <-
      ifelse(dist > 0, psi.lismi(sqrt(dist), apply(!is.na(data), 1, sum), psi.par = psi.par) / sqrt(dist), 1)

    if (Estep.output) {cat("\n Delta: ", signif(max(abs(theta-old.theta)),8),"\n")}

    if (max(abs(theta - old.theta)) < tolerance) {
      break.flag <- T; break
    }

  }

  return(list(theta = theta, dist = dist,
              rob.weights = rob.weights, convergence = break.flag))

}
martinSter/modi documentation built on March 14, 2023, 12:09 p.m.