R/rpca.R

Defines functions predict.rpca print.summary.rpca summary.rpca print.rpca rpca.default rpca

Documented in rpca

#' @title  Randomized principal component analysis (rpca).
#
#' @description Fast computation of the principal components analysis using the randomized singular value decomposition.
#
#' @details
#' Principal component analysis is an important linear dimension reduction technique.
#'
#' Randomized PCA is computed via the randomized SVD algorithm (\code{\link{rsvd}}).
#' The computational gain is substantial, if the desired number of principal components
#' is relatively small, i.e. \eqn{k << min(m,n)}.
#'
#' The print and summary method can be used to present the results in a nice format.
#' A scree plot can be produced with \code{\link{ggscreeplot}}. 
#' The individuals factor map can be produced with \code{\link{ggindplot}},
#' and a correlation plot with \code{\link{ggcorplot}}.
#'
#' The predict function can be used to compute the scores of new observations. The data
#' will automatically be centered (and scaled if requested). This is not fully supported for
#' complex input matrices.
#'
#'
#' @param A      array_like; \cr
#'               a numeric \eqn{(m, n)} input matrix (or data frame) to be analyzed. \cr
#'               If the data contain \eqn{NA}s na.omit is applied.
#'
#' @param k      integer; \cr
#'               number of dominant principle components to be computed. It is required that \eqn{k} is smaller or equal to
#'               \eqn{min(m,n)}, but it is recommended that \eqn{k << min(m,n)}.
#'
#' @param center  bool, optional; \cr
#'                logical value which indicates whether the variables should be
#'                shifted to be zero centered (\eqn{TRUE} by default).
#'
#' @param scale   bool, optional; \cr
#'                logical value which indicates whether the variables should
#'                be scaled to have unit variance (\eqn{TRUE} by default).
#'
#' @param retx    bool, optional; \cr
#'                logical value indicating whether the rotated variables / scores
#'                should be returned (\eqn{TRUE} by default).
#'
#' @param p       integer, optional; \cr
#'                oversampling parameter for \eqn{rsvd} (default \eqn{p=10}), see \code{\link{rsvd}}.
#'
#' @param q       integer, optional; \cr
#'                number of additional power iterations for \eqn{rsvd} (default \eqn{q=1}), see \code{\link{rsvd}}.
#'
#' @param rand   bool, optional; \cr
#'               if (\eqn{TRUE}), the \eqn{rsvd} routine is used, otherwise \eqn{svd} is used.
#'
#'
#' @return \code{rpca} returns a list with class \eqn{rpca} containing the following components:
#' \describe{
#'    \item{rotation}{  array_like; \cr
#'                      the rotation (eigenvectors); \eqn{(n, k)} dimensional array.
#'    }
#'
#'    \item{eigvals}{  array_like; \cr
#'                     eigenvalues; \eqn{k} dimensional vector.
#'    }
#'    \item{sdev}{     array_like; \cr
#'                     standard deviations of the principal components; \eqn{k} dimensional vector.
#'    }
#'    \item{x}{        array_like; \cr
#'                     the scores / rotated data; \eqn{(m, k)} dimensional array.
#'    }
#'    \item{center, scale}{  array_like; \cr
#'                     the centering and scaling used.
#'    }
#'}
#'
#'
#' @note  The principal components are not unique and only defined up to sign
#' (a constant of modulus one in the complex case) and so may differ between different
#'  PCA implementations.
#'
#' Similar to \code{\link{prcomp}} the variances are computed with the usual divisor N - 1.
#'
#'
#' @references
#' \itemize{
#'  \item [1] N. B. Erichson, S. Voronin, S. L. Brunton and J. N. Kutz. 2019.
#'        Randomized Matrix Decompositions Using {R}. 
#'        Journal of Statistical Software, 89(11), 1-48.
#'        \url{http://doi.org/10.18637/jss.v089.i11}.
#' 
#'   \item  [2] N. Halko, P. Martinsson, and J. Tropp.
#'          "Finding structure with randomness: probabilistic
#'          algorithms for constructing approximate matrix
#'          decompositions" (2009).
#'          (available at arXiv \url{http://arxiv.org/abs/0909.4061}).
#'}
#'
#'
#' @author N. Benjamin Erichson, \email{erichson@berkeley.edu}
#'
#' @seealso \code{\link{ggscreeplot}}, \code{\link{ggindplot}},
#'          \code{\link{ggcorplot}}, \code{\link{plot.rpca}},
#'          \code{\link{predict}},   \code{\link{rsvd}}
#'
#' @examples
#'
#'library('rsvd')
#'#
#'# Load Edgar Anderson's Iris Data
#'#
#'data('iris')
#'
#'#
#'# log transform
#'#
#'log.iris <- log( iris[ , 1:4] )
#'iris.species <- iris[ , 5]
#'
#'#
#'# Perform rPCA and compute only the first two PCs
#'#
#'iris.rpca <- rpca(log.iris, k=2)
#'summary(iris.rpca) # Summary
#'print(iris.rpca) # Prints the rotations
#'
#'#
#'# Use rPCA to compute all PCs, similar to \code{\link{prcomp}}
#'#
#'iris.rpca <- rpca(log.iris)
#'summary(iris.rpca) # Summary
#'print(iris.rpca) # Prints the rotations
#'plot(iris.rpca) # Produce screeplot, variable and individuls factor maps.
#'

#' @export
rpca <- function(A, k=NULL, center=TRUE, scale=TRUE, retx=TRUE, p=10, q=2, rand = TRUE) UseMethod("rpca")

#' @export
rpca.default <- function(A, k=NULL, center=TRUE, scale=TRUE, retx=TRUE, p=10, q=2, rand = TRUE) {
  
  A <- as.matrix(A)
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Checks
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (any(is.na(A))) {
    warning("Missing values are omitted: na.omit(A).")
    A <- stats::na.omit(A)
  }   
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Init rpca object
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  rpcaObj = list(rotation = NULL,
                 eigvals = NULL,
                 sdev = NULL,
                 var = NULL,
                 center = center,
                 scale = scale,
                 x=NULL)
  
  m <- nrow(A)
  n <- ncol(A)
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Set target rank
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(is.null(k)) rand <- FALSE
  if(is.null(k)) k <- min(n,m)
  if(k > min(n,m)) k <- min(n,m)
  if(k<1) stop("Target rank is not valid!")
  
  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Center/Scale data
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(center == TRUE) {
    rpcaObj$center <- colMeans(A)
    A <- sweep(A, MARGIN = 2, STATS = rpcaObj$center, FUN = "-", check.margin = TRUE)
    #A <- H(H(A) - rpcaObj$center)
  } else { rpcaObj$center <- FALSE }
  
  if(scale == TRUE) {
    rpcaObj$scale <- sqrt(colSums(A**2) / (m-1))
    if(is.complex(rpcaObj$scale)) { rpcaObj$scale[Re(rpcaObj$scale) < 1e-8 ] <- 1+0i  
    } else {rpcaObj$scale[rpcaObj$scale < 1e-8] <- 1}
    A <- sweep(A, MARGIN = 2, STATS = rpcaObj$scale, FUN = "/", check.margin = TRUE)
    #A <- H(H(A) / rpcaObj$scale)
    #A[is.nan(A)] <- 0
  } else { rpcaObj$scale <- FALSE }
  
  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Compute randomized svd / eigen decomposition
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(rand == TRUE) {
    svdalg = 'rsvd'
  }else { 
    svdalg = 'svd' 
  }
  
  out <- switch(svdalg,
                svd = svd(A, nu = k, nv = k),
                rsvd = rsvd(A, k = k, p = p, q = q),
                stop("Selected SVD algorithm is not supported!")
  )
  
  rpcaObj$eigvals <- switch(svdalg,
                            svd = out$d[1:k]**2 / (m-1),
                            rsvd = out$d**2 / (m-1)
  )
  
  rpcaObj$rotation <- switch(svdalg,
                             svd = out$v,
                             rsvd = out$v
  )
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Explained variance and explained variance ratio
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  rpcaObj$sdev <-  sqrt( rpcaObj$eigvals )
  rpcaObj$var <- sum( apply( Re(A) , 2, stats::var ) )
  if(is.complex(A)) rpcaObj$var <- Re(rpcaObj$var + sum( apply( Im(A) , 2, stats::var ) ))
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Add row and col names
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  rownames(rpcaObj$rotation) <- colnames(A)
  colnames(rpcaObj$rotation) <- paste(rep('PC', k), 1:k, sep = "")
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Compute rotated data
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(retx==TRUE) {
    #rpcaObj$x <- A %*% rpcaObj$rotation # slow
    #rpcaObj$x <- H(H(out$u[,1:k]) * out$d[1:k])
    rpcaObj$x <- sweep(out$u[, 1:k, drop=FALSE], MARGIN = 2, STATS = out$d[1:k], FUN = "*", check.margin = TRUE)
    rownames(rpcaObj$x) <- rownames(A)
  }
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Return
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  class(rpcaObj) <- "rpca"
  return( rpcaObj )
  
}#End rPCA


#' @export
print.rpca <- function(x , ...) {
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Print rpca
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  cat("Standard deviations:\n")
  print(round(x$sdev, 3))
  cat("\nEigenvalues:\n")
  print(round(x$eigvals, 3))
  cat("\nRotation:\n")
  print(round(x$rotation, 3))
}

#' @export
summary.rpca <- function( object , ... )
{
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Summary rpca
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  variance = object$sdev**2
  explained_variance_ratio = variance / object$var
  cum_explained_variance_ratio = cumsum( explained_variance_ratio )
  
  x <- t(data.frame( var = round(variance, 3),
                     sdev = round(object$sdev, 3),
                     prob = round(explained_variance_ratio, 3),
                     cum = round(cum_explained_variance_ratio, 3)))
  
  rownames( x ) <- c( 'Explained variance',
                      'Standard deviations',
                      'Proportion of variance',
                      'Cumulative proportion')
  
  colnames( x ) <- paste(rep('PC', length(object$sdev)), 1:length(object$sdev), sep = "")
  
  x <- as.matrix(x)
  
  return( x )
}

#' @export
print.summary.rpca <- function( x , ... )
{
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Print summary rpca
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  cat( "Importance of components:\n" )
  print( x )
  cat("\n")
}

#' @export
predict.rpca <- function( object, newdata, ...)
{
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Predict
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(!is.logical(object$center)) {
    #newdata <- H(H(newdata) - object$center)
    newdata <- sweep(newdata, MARGIN = 2, STATS = object$center, FUN = "-", check.margin = TRUE)
  }
  
  if(!is.logical(object$scale)) {
    #newdata <- H(H(newdata) / object$scale)
    newdata <- sweep(newdata, MARGIN = 2, STATS = object$scale, FUN = "/", check.margin = TRUE)
    newdata[is.nan(newdata)] <- 0
  }
  
  x <- as.matrix(newdata) %*% as.matrix(object$rotation)
  rownames(x) <- rownames(newdata)
  
  return( x )
}
Benli11/rSVD documentation built on April 20, 2021, 6:50 a.m.