R/simuPCA.R

Defines functions simuPCA

Documented in simuPCA

#'@title  Simulation from a sparse PCA model
#'@description Simulation of a data matrix  drawn from a sparse PCA model. 
#'The data matrix A of dimension n times p is drawn from a centered multivariate normal 
#'distribution. The m < p leading eigenvectors of the covariance matrix of this normal distribution are sparse.
#'@param n the number of observations. 
#'@param Ztrue a numerical matrix of dimension p times m. The columns of Ztrue are m sparse and othogonal vectors. 
#'They will be normalized and used as the m leading eigenvectors of the covariance matrix.
#'@param valp a numerical vector of size p. The values are the eigenvalues of the covariance matrix.
#'@param seed a seed value to replicate the simulation. By default, seed=FALSE.
#'@return Returns a data matrix A of dimension n times p.
#'@details This function generates data with a covariance matrix C having sparse eigenvectors V. C is synthesized
#'through the eigenvalue decomposition C=VDV^T where D=diag(valp). The first m columns of V are the sparse and orthogonal 
#'columns of Ztrue.  The remaining p-m eigenvectors are not specified to be sparse and are generated as follows. A matrix U of 
#'dimension n times (p-m) is randomly dawn from U(0,1) and V=Q from the QR decomposition  [Ztrue,U]=QR. The data matrix A is 
#'then generated by drawing n samples from a zero-mean distribution with covariance matrix C.
#'
#'@export
#'
#'@references 
#'\itemize{
#'\item M. Chavent and G. Chavent, Group-sparse block PCA and explained variance, arXiv:1705.00461
#'\item M. Journee and al., Generalized Power Method for Sparse Principal Component Analysis, Journal of Machine Learning Research 11 (2010) 517-553.
#'\item H. Shen and J.Z. Huang, Journal of Multivariate Analysis 99 (2008) 1015-1034.
#'}
#'@examples
#' # Example from Journee & al. 2010
#'  v1 <- c(rep(1/sqrt(10),10),rep(0,490))
#'  v2 <- c(rep(0,10), rep(1/sqrt(10),10),rep(0,480))
#'  valp <- c(400,300,rep(1,498))
#'  n <- 100
#'  A <- simuPCA(n,cbind(v1,v2),valp,seed=1)
#'  svd(A)$d[1:3]^2 #eigenvalues of the empirical covariance matrix.
#'  
#' # Example from Shen & Huang 2008
#'  v1 <- c(1,1,1,1,0,0,0,0,0.9,0.9)
#'  v2 <- c(0,0,0,0,1,1,1,1,-0.3,0.3)
#'  valp <- c(200,100,50,50,6,5,4,3,2,1)
#'  n <- 100
#'  A <- simuPCA(n,cbind(v1,v2),valp,seed=1)
#'  svd(A)$d^2 #eigenvalues of the empirical covariance matrix (times n).
#'  
#' # Example from Chavent & Chavent 2017
#'  valp <- c(c(200,180,150,130),rep(1,16))
#'  data("Ztrue")
#'  n <- 100
#'  A <-simuPCA(n,Ztrue,valp,seed=1)
#'  svd(A)$d^2 #eigenvalues of the empirical covariance matrix.
#'  
#'@seealso \code{\link{sparsePCA}}, \code{\link{groupsparsePCA}}, \code{\link{pev}}
simuPCA <- function(n=30, Ztrue, valp, seed=FALSE)
{
  norm2 <- function(x) sqrt(sum(x^2))
  p <- nrow(Ztrue)
  m <- ncol(Ztrue)
  if (m > p) stop("the number of loadings (number of columns in Ztrue) 
                  must be smaller of equal to the number of variables (rowns)",call.=FALSE)
  for (j in 1:ncol(Ztrue))
        Ztrue[,j] <- Ztrue[,j]/norm2(Ztrue[,j])
  if (seed != FALSE) set.seed(seed)

  if (length(valp)!=p)
    stop("the length of valp must be equal to the number of rows in Ztrue",call. = FALSE)
  U <- matrix(stats::runif(p*(p-m)),p,(p-m))
  V <- qr.Q(qr(cbind(Ztrue,U)))
  C <- V%*%diag(valp)%*%t(V)
  if (seed != FALSE) set.seed(seed)
  A <- mvtnorm::rmvnorm(n,sigma=C) /sqrt(n)
  return(A)
}
chavent/sparsePCA documentation built on July 2, 2017, 1:14 a.m.