R/mspca.R

Defines functions mspca

Documented in mspca

#' Rank-k Sparse Principal Components Analysis
#' 
#' Multifactor version of the rank-1 SPCA of \code{\link{spca}}.Cmponents are found by 
#' applying \code{\link{spca}} to the original matrix after deflation by deducting 
#' the components already found. It is possible to apply different penalties to 
#' different components.
#' 
#' @details This is similar to \code{\link[PMA]{SPC}}, but differs especially with respect to
#' the centring of the matrix.
#' 
#' @param Z Matrix to be decomposed
#' @param k Required rank of the result
#' @param c  L1-norm bound for V (greater than or equal to 1), either length-1, or 
#' with k entries (one for each component). Feasible solutions are available 
#' for values greater than or equal to 1. For values larger than \code{sqrt(ncow(Z)}, it has no effect.
#' @param start Starting values to use for the v vector in each iteration: Either \code{"independent"} or \code{"predetermined"}. The former uses the first right singular vector of the 
#' residual matrix after removing previous components in each step, while the former 
#' uses the right singular vectors of the SVD of \code{Z}
#' @param maxit Maximum number of iterations
#' @param eps Stopping criterion, and absolute error tolerance on the mean squared reconstruction error
#' @param center Logical indicating whether to column-centre the matrix Z 
#' @param scale Logical indicating whether to set the standard deviations of the columns of Z equal
#' to one before analysis
#' 
#' @return A list with the following components:
#' \describe{
#'  \item{scores}{The data matrix after projection to the principal component space}
#'  \item{loadings}{The matrix of component loadings}
#'  \item{sdev}{The standard deviations of each of the principal components}
#'  \item{pve}{Data frame giving the proportion of variance explained by the obtained components}
#' }
#' 
#' @examples 
#' 
#' ## Random matrix example, with n > p
#' set.seed(1)
#' Z <- matrix(rnorm(100), nrow = 20, ncol = 5)
#' mspca(Z, c = 1.25, k = 5)
#' 
#' ## Random matrix example with n < p
#' Z2 <- matrix(rnorm(100), nrow = 5, ncol = 20)
#' mspca(Z2, c = 2.5, k = 5)
#' 
#' ## Example with different c for components 1 and 2
#' mspca(Z, k = 2, c = c(1.5, 1.1))
#' 
#' ## Comparison to PCA
#' summary(prcomp(Z2))
#' mspca(Z2, k = 5, c = max(dim(Z2)))$pve
#' 
#' @export
#' 
mspca <- function(Z, k = 2, c = 1, start = c("independent", "predetermined"), 
                  maxit = 20, eps = sqrt(.Machine$double.eps), 
                  center = TRUE, scale = FALSE) {
  
  ## Check
  if (k > min(dim(Z))) {
    stop(paste("Argument 'k' should be at most", min(dim(Z))))
  }
  
  ## Mean-center and/or scale columns to standard devaition of 1
  if (center || scale) {
    Z <- scale(Z, center = center, scale = scale)
  }
  
  ## Construct c as a vector, and check
  if (length(c) == 1) {
    c <- rep(c, k)
  }
  if (length(c) != k) {
    stop("Argument 'c' should be either length 1 or length 'k'.")
  }

  ## Initialization of deflated matrix, iteration counter, SVD components
  R <- Z
  i <- 0
  d <- double(k)
  umat <- matrix(NA, nrow = nrow(Z), ncol = k)
  vmat <- matrix(NA, nrow = ncol(Z), ncol = k)
  
  ## Match on start
  start <- match.arg(start)
  
  ## V from SVD of Z for starting values
  if (start == "predetermined") {
    vmat <- svd(Z, nu = 0, nv = k)$v
  }
  
  ## Initialization of variance explained
  cum_var_expl <- double(k)
  
  ## Iterate
  for (i in seq_len(k)) {
    
    ## Determine starting value for v: Either v from R or from Z
    vstart <- switch(start, 
                     independent = svd(R, nu = 0, nv = 1)$v[, 1], 
                     predetermined = vmat[, i])
    
    ## Get rank 1 SPCA of R (never scale here)
    R_pmd <- spca(Z = R, c = c[i], vstart = vstart, maxit = maxit, eps = eps, 
                  center = FALSE, scale = FALSE)
    
    ## Update d, u, v
    d[i] <- R_pmd$d
    umat[, i] <- R_pmd$u
    vmat[, i] <- R_pmd$v
    
    # Deflate the current residual matrix
    R <- R - R_pmd$d[1] * tcrossprod(R_pmd$u, R_pmd$v)
    
    # Calculate the approximation Z_k and store variance explained
    vcurr <- vmat[, seq_len(i), drop = FALSE]
    Z_k <- Z %*% vcurr %*% solve(t(vcurr) %*% vcurr) %*% t(vcurr)
    cum_var_expl[i] <- sum(Z_k^2)
  }
  
  ## Calculate PC scores, standard deviations of PCs
  colnames(vmat) <- paste0("PC", seq_len(k))
  scores <- as.data.frame(Z %*% vmat %*% solve(t(vmat) %*% vmat) %*% t(vmat))
  colnames(scores) <- paste0("PC", seq_len(k))
  sdev <- d / sqrt((nrow(Z) - 1))
  
  ## Total variance to be explained, and proportion explained
  tot_var <- sum(svd(Z, nu = 0, nv = 0)$d^2)
  pve <- data.frame(PC = seq_len(k), 
                    std_deviation = sdev, 
                    prop_variance = diff(c(0, cum_var_expl)) / tot_var,
                    cum_prop = cum_var_expl / tot_var)
  
  ## Return
  rownames(vmat) <- colnames(Z)
  rownames(scores) <- rownames(Z)
  list(scores = scores, loadings = vmat, sdev = sdev, pve = pve, svd = list(d = d, u = umat, v = vmat))
}
schoonees/spca documentation built on Jan. 31, 2021, 6:21 p.m.