R/ICPCA.R


ICPCA <- function(X, k, scale = FALSE, maxiter = 20, tol = 0.005,
                  tolProb = 0.99, distprob = 0.99) {
  #  
  #   This function is based on a Matlab function from
  #   Missing Data Imputation Toolbox v1.0
  #   A. Folch-Fortuny, F. Arteaga and A. Ferrer
  #   
  # Its inputs are:
  #
  # X : the input data, which must be a matrix or a data frame.
  #     It may contain NA's. It must always be provided.
  # k : the desired number of principal components.
  # scale     : a value indicating whether and how the original
  #             variables should be scaled. If scale=FALSE (default)
  #             or scale=NULL no scaling is performed (and a vector
  #             of 1s is returned in the $scaleX slot).
  #             If scale=TRUE the data are scaled to have a
  #             standard deviation of 1.
  #             Alternatively, scale can be a function like mad,
  #             or a vector of length equal to the number of columns
  #             of X.
  #             The resulting scale estimates are returned in the
  #             $scaleX slot of the ICPCA output.
  # maxiter   : maximum number of iterations. Default is 20.
  # tol       : tolerance for iterations. Default is 0.005.
  # tolProb   : tolerance probability for residuals. Defaults to 0.99.
  # distprob  : probability determining the cutoff values for
  #             orthogonal and score distances. Default is 0.99.
  #
  # The outputs are:
  #
  # scaleX      : the scales of the columns of X.
  # k           : the number of principal components.
  # loadings    : the columns are the k loading vectors.
  # eigenvalues : the k eigenvalues.
  # center      : vector with the fitted center.
  # covmatrix   : estimated covariance matrix.
  # n.obs       : number of cases.
  # It          : number of iteration steps.
  # diff        : convergence criterion.
  # X.NAimp     : data with all NA's imputed by ICPCA.
  # scores      : scores of X.NAimp
  # OD          : orthogonal distances of the rows of X.NAimp
  # cutoffOD    : cutoff value for the OD.
  # SD          : score distances of the rows of X.NAimp
  # cutoffSD    : cutoff value for the SD.
  # highOD      : row numbers of observations with OD > cutoffOD.
  # highSD      : row numbers of observations with SD > cutoffSD.
  # residScale  : scale of the residuals.
  # stdResid    : standardized residuals. Note that these are NA
  #               for all missing values of the data X.
  # indcells    : indices of cellwise outliers.
  
  X <- as.matrix(X)
  n <- dim(X)[1]
  p <- dim(X)[2]
  mis <- is.na(X)
  
  # Determine scale
  if (is.null(scale)) scale <- FALSE # NULL is documented above to mean no scaling
  if (is.logical(scale)) {
    if (scale) {
      scaleX <- apply(X, 2, FUN = sd, na.rm = TRUE)
    } else {
      scaleX <- rep(1, p) # no scaling: unit scales
    }
    scale <- sd # scale is set to the function sd
  } else if (is.function(scale)) {
    scaleX <- apply(X, 2, FUN = scale, na.rm = TRUE)
  } else { # a numeric vector of scales was given
    scaleX <- scale
    scale <- sd # scale is set to the function sd
  }
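  # e.g. scale = mad sets scaleX to apply(X, 2, mad, na.rm = TRUE) and reuses
  # mad() for the residual scale further down; when scale is FALSE/TRUE or a
  # numeric vector, the residual scale falls back to sd().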
  
  X <- sweep(X, 2, scaleX, "/") # standardization
  XO <- X # still has the NA's
  rm(X)  # to save space
  Xnai <- XO  # initialize the NA-imputed Xnai
  
  diff <- 100
  It <- 0
  if (any(mis)) { # if there are missings
    misrc <- which(is.na(XO), arr.ind = TRUE)
    r <- misrc[, 1]
    c <- misrc[, 2]
    meanc <- colMeans(XO, na.rm = TRUE) # Mean vector
    for (i in seq_along(r)) {
      Xnai[r[i], c[i]] <- meanc[c[i]] # impute each missing cell with its column mean
    }
    while (It < maxiter && diff > tol) { # iterate until the loadings stabilize
      It <- It + 1
      # Xmis <- Xnai[mis]        # current imputations
      mXnai <- colMeans(Xnai)       # mean vector
      Xnaic <- sweep(Xnai, 2, mXnai)  # centered Xnai
      if (n < p) {
        XnaicSVD <- svd(t(Xnaic))
        Pr <- as.matrix(XnaicSVD$u[, seq_len(k)]) 
        # reduced loadings matrix
      } else {
        XnaicSVD <- svd(Xnaic)
        Pr <- as.matrix(XnaicSVD$v[, seq_len(k)])
        # reduced loadings matrix
      }
      Tr <- Xnaic %*% Pr          # reduced scores matrix     
      Xnaihat <- Tr %*% t(Pr)     # fit to Xnaic
      Xnaihat <- sweep(Xnaihat, 2, mXnai, "+") # fit to Xnai
      Xnai[mis] <- Xnaihat[mis]         # impute missings
      # d <- (Xnai[mis]-Xmis)^2       
      if (It > 1) diff <- maxAngle(Pr, PrPrev) # largest angle between successive loading subspaces
      PrPrev <- Pr
    } 
  } else { # if there are no missing values
    mXnai <- colMeans(Xnai)      # mean vector
    Xnaic <- sweep(Xnai, 2, mXnai) # centered Xnai
    if (n < p) {
      XnaicSVD <- svd(t(Xnaic))
      Pr <- as.matrix(XnaicSVD$u[, seq_len(k)]) 
      # reduced loadings matrix
    } else {
      XnaicSVD <- svd(Xnaic)
      Pr <- as.matrix(XnaicSVD$v[, seq_len(k)]) 
      # reduced loadings matrix
    }
    Tr <- Xnaic %*% Pr        # reduced scores matrix     
    Xnaihat <- Tr %*% t(Pr)   # fit to Xnaic       
    Xnaihat <- sweep(Xnaihat, 2, mXnai, "+") # fit to Xnai
  }
  
  rank   <- rankMM(Xnaic, sv = XnaicSVD$d)
  S      <- cov(Xnai) # covariance of the NA-imputed data; this is not the EM covariance
  center <- colMeans(Xnai)
  
  res <- list(loadings = Pr, eigenvalues = (XnaicSVD$d ^ 2) / (n - 1),
              center = center, k = k, h = n, alpha = 1,  scores = Tr)
  NAimp <- pca.distances.classic(res, Xnai, rank, distprob)
  NAimp$highOD <- which(NAimp$OD > NAimp$cutoffOD)
  NAimp$highSD <- which(NAimp$SD > NAimp$cutoffSD)
  
  # Compute standardized residuals with NA's
  stdResid   <- XO - Xnaihat
  residScale <- apply(stdResid, 2, FUN = scale, na.rm = TRUE)
  stdResid   <- sweep(stdResid, 2, residScale, "/")
  indcells   <- which(abs(stdResid) > sqrt(qchisq(tolProb, 1)))
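  # For the default tolProb = 0.99 the cutoff above equals sqrt(qchisq(0.99, 1)),
  # about 2.58, so cells whose standardized residual exceeds that in absolute
  # value are flagged; which() on a matrix returns column-major cell indices.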
  
  Xnai <- sweep(Xnai, 2, scaleX, "*") # unstandardize
  center <- center * scaleX
  
  return(list(scaleX = scaleX,
              k = k,
              loadings = Pr,
              eigenvalues = res$eigenvalues[seq_len(k)],
              center = center,
              covmatrix = S,
              n.obs = n,
              It = It,
              diff = diff,
              X.NAimp = Xnai,
              scores = Tr,
              OD = NAimp$OD,
              cutoffOD = NAimp$cutoffOD,
              SD = NAimp$SD,
              cutoffSD = NAimp$cutoffSD,              
              highOD = NAimp$highOD,
              highSD = NAimp$highSD,
              residScale = residScale,
              stdResid = stdResid,
              indcells = indcells))
} # ends ICPCA
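
## A minimal usage sketch, assuming the cellWise package is installed and
## exports ICPCA, so that the internal helpers used above (maxAngle, rankMM,
## critOD, vecnorm) are available; the data and object names are illustrative.
if (requireNamespace("cellWise", quietly = TRUE)) {
  set.seed(123)
  Xdemo <- matrix(rnorm(100 * 5), nrow = 100, ncol = 5)
  Xdemo[sample(length(Xdemo), 25)] <- NA   # make 5% of the cells missing
  fitDemo <- cellWise::ICPCA(Xdemo, k = 2, scale = TRUE)
  fitDemo$It            # number of iterations until the loadings stabilized
  head(fitDemo$X.NAimp) # the data with all NA's imputed
  fitDemo$highOD        # rows with orthogonal distance above cutoffOD
  fitDemo$indcells      # column-major indices of flagged cellwise outliers
}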



pca.distances.classic <- function(obj, data, r, crit = 0.99) {   
  # based on rrcov:::.distances
  n <- nrow(data)
  q <- ncol(obj$scores)
  smat <- diag(obj$eigenvalues, ncol = q)
  nk <- min(q, rankMM(smat))
  if (nk < q) warning(paste0("The smallest eigenvalue is ",
                             obj$eigenvalues[q],
                             " so the diagonal matrix of the eigenvalues",
                             " cannot be inverted!"))
  mySd <- sqrt(mahalanobis(as.matrix(obj$scores[, seq_len(nk)]),
                           rep(0, nk), diag(obj$eigenvalues[seq_len(nk)], ncol = nk)))
  cutoffSD <- sqrt(qchisq(crit, obj$k))
  out <- list(SD = mySd, cutoffSD = cutoffSD)
  out$OD <- apply(data - matrix(rep(obj$center, times = n), nrow = n, 
                                byrow = TRUE) - obj$scores %*% t(obj$loadings),
                  1, vecnorm)
  if (is.list(dimnames(obj$scores))) {
    names(out$OD) <- dimnames(obj$scores)[[1]]
  }
  out$cutoffOD <- 0
  if (obj$k != r) {
    out$cutoffOD <- critOD(out$OD, crit = crit, classic = TRUE)
  }
  return(out)
}
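
## A small numeric sketch of the cutoffs above, assuming crit = 0.99 (the
## distprob default) and k = 2 retained components: the score-distance cutoff
## is sqrt(qchisq(0.99, df = 2)), about 3.03. The orthogonal-distance cutoff
## is computed by the internal helper critOD() only when k differs from the
## rank r of the imputed data; when k equals r the OD are essentially zero
## and cutoffOD stays at 0.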
