R/wpca.R

#########################################################################/**
# @RdocGeneric wpca
# @alias wpca.matrix
#
# @title "Light-weight Weighted Principal Component Analysis"
#
# \usage{
# @usage wpca,matrix
# }
#
# \description{
#   Calculates the (weighted) principal components of a matrix, that is,
#   finds a new coordinate system (not unique) for representing the given
#   multivariate data such that
#    i) all dimensions are orthogonal to each other, and
#   ii) all dimensions have maximal variances.
# }
#
# \arguments{
#   \item{x}{An NxK @matrix.}
#   \item{w}{An N @vector of weights for each row (observation) in
#     the data matrix. If @NULL, all observations get the same weight,
#     that is, standard PCA is used.}
#   \item{center}{If @TRUE, the (weighted) sample mean column @vector is
#     subtracted from each column in \code{mat}, first.
#     If data is not centered, the effect will be that a linear subspace
#     that goes through the origin is fitted.}
#   \item{scale}{If @TRUE, each column in \code{mat} is
#     divided by its (weighted) root-mean-square of the
#     centered column, first.}
#   \item{method}{If \code{"dgesdd"} LAPACK's divide-and-conquer
#     based SVD routine is used (faster [1]).
#     If \code{"dgesvd"}, LAPACK's QR-decomposition-based routine is used.
#   }
#   \item{swapDirections}{If @TRUE, the signs of eigenvectors
#     that have more negative than positive components are inverted.
#     The signs of corresponding principal components are also inverted.
#     This is only of interest when for instance visualizing or comparing
#     with other PCA estimates from other methods, because the
#     PCA (SVD) decomposition of a matrix is not unique.
#   }
#   \item{...}{Not used.}
# }
#
# \value{
#   Returns a @list with elements:
#   \item{pc}{An NxK @matrix where the column @vectors are the
#             principal components (a.k.a. loading vectors,
#             spectral loadings or factors etc).}
#   \item{d}{An K @vector containing the eigenvalues of the
#             principal components.}
#   \item{vt}{An KxK @matrix where the column @vectors are the
#             eigenvector of the principal components.}
#   \item{xMean}{The center coordinate.}
#
#   It holds that \code{x == t(t(fit$pc \%*\% fit$vt) + fit$xMean)}.
# }
#
# \section{Method}{
#   A singular value decomposition (SVD) is carried out.
#   Let X=\code{mat}, then the SVD of the matrix is \eqn{X = U D V'}, where
#   \eqn{U} and \eqn{V} are orthogonal, and \eqn{D} is a diagonal matrix
#   with singular values. The principal returned by this method are \eqn{U D}.
#
#   Internally \code{La.svd()} (or \code{svd()}) of the \pkg{base}
#   package is used.
#   For a popular and well written introduction to SVD see for instance [2].
# }
#
# \examples{
#   @include "../incl/wpca.matrix.Rex"
#
#   if (dev.cur() > 1) dev.off()
#
#   @include "../incl/wpca2.matrix.Rex"
# }
#
# @author
#
# \references{
#   [1] J. Demmel and  J. Dongarra, \emph{DOE2000 Progress Report}, 2004.
#       \url{https://people.eecs.berkeley.edu/~demmel/DOE2000/Report0100.html} \cr
#   [2] Todd Will, \emph{Introduction to the Singular Value Decomposition},
#       UW-La Crosse, 2004. \url{http://websites.uwlax.edu/twill/svd/} \cr
# }
#
# \seealso{
#   For a iterative re-weighted PCA method, see @see "iwpca".
#   For Singular Value Decomposition, see @see "base::svd".
#   For other implementations of Principal Component Analysis functions see
#   (if they are installed):
#   @see "stats::prcomp" in package \pkg{stats} and \code{pca()} in package
#   \pkg{pcurve}.
# }
#
# @keyword "algebra"
#*/#########################################################################
setMethodS3("wpca", "matrix", function(x, w=NULL, center=TRUE, scale=FALSE, method=c("dgesdd", "dgesvd"), swapDirections=FALSE, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 1. Verify the arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument: 'x'
  x <- as.matrix(x);

  # The dimensions of 'x'
  N <- nrow(x);
  K <- ncol(x);

  # Standardizes the weights to [0,1] such that they sum to 1.
  if (!is.null(w)) {
    w <- rep(w, length.out=N);
    w <- w/sum(w);
    if (anyMissing(w))
      stop("Argument 'w' has missing values.");
  }

  ## Argument 'method':
  method <- match.arg(method, choices = c("dgesdd", "dgesvd"))
  

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 2. Weighted or non-weighted centering and rescaling of the data
  #
  # Note: The following split of (center == TRUE) and (center == FALSE)
  # is to minimize memory usage. In other words, the codes is longer,
  # but more memory efficient.
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (center || scale) {
    if (is.null(w)) {
      # Calculates the standard column means
      xMean <- colMeans(x, na.rm=TRUE);           # a K vector
    } else {
      # Calculates the weighted column means (recall that sum(w) == 1)
      xMean <- as.vector(w %*% x);                # a K vector
    }

    if (center) {
      # Centers the data directly by subtracting the column means
      for (kk in 1:ncol(x))
        x[,kk] <- x[,kk] - xMean[kk];
    } else {
      # ...or calculates the centered data for rescaling
      xc <- x;
      for (kk in 1:ncol(x))
        xc[,kk] <- x[,kk] - xMean[kk];
    }

    if (scale) {
      if (is.null(w)) {
        # Non-weighted root-mean-squares
        rms <- function(v) {         # v - column vector of length N
          v <- v[!is.na(v)];
          sqrt(sum(v^2)/max(1, length(v)-1));
        }
      } else {
        # Weighted root-mean-squares
        rms <- function(v) {         # v - column vector of length N
          ok <- !is.na(v);
          v <- w[ok]*v[ok];
          sqrt(sum(v^2)/max(1, length(v)-1));
        }
      }

      if (center) {
        xRMS <- apply(x, MARGIN=2, FUN=rms);
      } else {
        xRMS <- apply(xc, MARGIN=2, FUN=rms);
        # Not needed anymore
        xc <- NULL;
      }

      for (kk in 1:ncol(x))
        x[,kk] <- x[,kk] / xRMS[kk];

      # Not needed anymore
      xRMS <- rms <- NULL;
    }
  } else {
    xMean <- rep(0, length=K);
  }

  # Weight the observations?
  if (!is.null(w)) {
    x <- sqrt(w)*x;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 3. Singular Value Decomposition, i.e. X = U D V'
  #
  # "We compare DGESVD, the original QR-based routines from
  #  LAPACK 1.0, with DGESDD, the new divide-and-conquer based
  #  routine from LAPACK 3.0. The table below shows the speeds
  #  on several machines. The new routine is 5.7 to 16.8 times
  #  faster than the old routine. Part of the speed results
  #  from doing many fewer operations, and the rest comes from
  #  doing them faster (a high Mflop rate)." [1]
  # [1] J. Demmel and  J. Dongarra, DOE2000 Progress Report,
  #     http://www.cs.berkeley.edu/~demmel/DOE2000/Report0100.html
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (method == "dgesdd" || method == "dgesvd") {
    duvt <- La.svd(x);
  } else {
    stop(sprintf("Unknown LAPACK or LINPACK routine to solve SVD: %s", method));
  }

  # 'duvt' is a list with the follwing components:
  #
  # u  - a NxK matrix whose columns contain the left singular
  #      vectors (eigenvectors) of 'x'.
  #      It holds that t(u) %*% u == I

  # d  - a K vector containing the singular value of
  #      each principal component on its diagonal.
  #      It holds that d[1] >= d[2] >= ... d[K] >= 0.

  # vt - a KxK transposed matrix whose columns contain the right
  #      singular vectors (eigenvector) of 'x'.
  #      It holds that t(v) %*% v == I

  # Not need anymore, in case a local copy has been created!
  x <- NULL;

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 4. The PCA principal components
  #
  # (a.k.a. loading vectors, spectral loadings or factors).
  #
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  d <- duvt$d;
  vt <- duvt$vt;
  pc <- duvt$u;
  # Not need anymore
  duvt <- NULL;

  # Note: D == diag(duvt$d) is memory expensive since the dimensions of D
  # is the same as the dimensions of 'x'. Thus, it unwise to do:
  # pc <- duvt$u %*% diag(duvt$d);
  for (kk in seq_len(N))
    pc[kk,] <- pc[kk,] * d;

  if (!is.null(w)) {
    # Rescale the principal components
    pc <- pc / sqrt(w);
    # Not need anymore
    w <- NULL;
  }

  if (swapDirections) {
    swap <- apply(vt, MARGIN=1, FUN=function(z) sum(sign(z)) < 0);

    # Which eigenvectors should swap signs?
    swap <- which(swap);
    for (kk in swap) {
      vt[kk,] <- -vt[kk,];
      pc[,kk] <- -pc[,kk];
    }
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 4. Return the parameter estimates
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  res <- list(pc=pc, d=d, vt=vt, xMean=xMean);
  class(res) <- "WPCAFit";
  res;
}) # wpca()


############################################################################
# HISTORY:
# 2015-05-24
# o Removed obsolete method="dsvdc"; only needed in R (< 1.7.0).
# 2013-09-26
# o Now utilizing anyMissing().
# 2006-06-26
# o Function would not work in R v2.4.0 devel, because argument 'method' was
#   removed from La.svd().
# 2006-04-25
# o Updated the URL to Todd Will's webpage.
# 2005-02-20
# o Added '...' to please R CMD check.
# 2005-02-20
# o Now using setMethodS3() and added '...' to please R CMD check.
# 2005-01-24
# o Added a check for missing values of argument 'w'.
# 2004-05-14
# o Made into a method of class matrix instead of a stand-alone function.
# 2003-03-09
# o Created! Verified that it gives similar results as acp().
############################################################################
HenrikBengtsson/aroma.light documentation built on July 3, 2023, 1:57 a.m.