R/SPCAvRP.R

SPCAvRP <- function( data                         # either the data matrix (n x p) or the sample covariance matrix (p x p)
                   , cov = FALSE                  # TRUE if data is given as a sample covariance matrix
                   , l                            # the sparsity level of the principal component estimator (for multiple component estimation use "SPCAvRP_subspace" or "SPCAvRP_deflation")
                                                  # (if the true sparsity k is unknown, this can be an array of (at most p) different sparsity levels and then estimators of corrsponding sparsity levels are returned)
                   , d = 20                       # dimension of the random projections (if the true sparsity k is known, use d = k)
                   , A = 600                      # the number of groups of projections
                   , B = 200                      # the number of projections in a group
                   , center_data = TRUE           # TRUE if the data matrix should be centered
                   , parallel = FALSE             # TRUE if the selection step should be computed in parallel (uses package "parallel")
                   , cluster_type = "PSOCK"       # if parallel = TRUE, this should be be "PSOCK" or "FORK" (cf. package "parallel")
                   , cores =  1                   # if parallel = TRUE and cluster_type = "FORK", this corresponds to the number of cores to use
                   , machine_names = NULL         # if parallel = TRUE, this contains the names of computers on network to form cluster
)
  # output : a list of three elements
  #             output$vector : a matrix of dimension p x length(l) with its columns as estimated eigenvectors of sparsity level l
  #             output$value  : an array of dimesnion lenght(l) with the eigenvalues cooresponding to the estimated eigenvectors in output$vector
  #             output$importance_scores  : an array of length p with importance scores for each variable 1,...,p
{
  ##########################################
  ################## the required functions:
  
  ## Project the sample covariance matrix along given axis-aligned projections:
  project_covariance <- function(   data          # either data matrix or sample covariance matrix
                                  , cov           # TRUE if data is given as a sample covariance matrix 
                                  , rand_ind      # projections (indices of non-zero elements)
  ) # output: projections - a list of nrow(rand_ind) projected sample covariance matrices of dimension ncol(rand_ind) x ncol(rand_ind)
  {
    M <- nrow(rand_ind) 
    if( cov == TRUE ){
      projections <- sapply(1:M, function(i){
        proj_ind <- rand_ind[i,]
        cov_proj <- data[proj_ind,proj_ind]
        return(list(cov_proj))
      })
    }else{
      projections <- sapply(1:M, function(i){
        proj_ind <- rand_ind[i,]
        cov_proj <- 1/nrow(data)*crossprod(data[,proj_ind])
        return(list(cov_proj))
      })
    }
    return(projections)
  }
  
  ## Compute the importance scores w for each variable:
  SPCAvRP_importance_scores <- function(   cov_projections # a list of projected covariances P*Sigma*P (as generated by function 'project_covariance')
                                         , rand_ind        # corresponding projections P (indices of non-zero elements)
                                         , p               # original dimension
                                         , A               # number of projections to be aggregated 
  ) # output : w, which is an array of length p with importance scores for each variable 1,...,p  
  { 
    N <- nrow(rand_ind)
    B <- floor(N/A)
    d <- ncol(rand_ind)
    # Selection of good projections
    v_lambda_hat_stars <- sapply(1:A, function(a){
      group_decomposition <- sapply(1:B, function(b){
        eig <- eigen(cov_projections[[B*(a-1)+b]], TRUE)
        if(1 == d){eig$values<-append(eig$values,0)}
        return(list(eig$values[1:2],eig$vectors[,1]))})
      lambdas_hat <- matrix(unlist(group_decomposition[1,]),nrow = B, byrow = TRUE)
      v_hat <- t(matrix(unlist(group_decomposition[2,]), nrow = B, ncol = d, byrow = TRUE))
      b_star <- which.max(lambdas_hat[,1])
      v_hat_star <- rep(0,p)
      v_hat_star[unlist(rand_ind[B*(a-1)+b_star,])] <- v_hat[,b_star]
      return(append(v_hat_star,lambdas_hat[b_star,]))
    }) 
    v_hat_stars <- v_lambda_hat_stars[1:p,]
    lambda_hat_stars <- v_lambda_hat_stars[p+1:2,]
    # Aggregation of good projections 
    w <- rowMeans(matrix(rep(lambda_hat_stars[1,]-lambda_hat_stars[2,],p),nrow=p,byrow=TRUE)*v_hat_stars^2)
    return(w) 
  }
  
  ## Select the projection yielding the largest eigenvalue among one group of B different d-dimensional axis-aligned projections generated uniformly at random:
  select_projection <- function(    data       # either data matrix or sample covariance matrix
                                  , cov = TRUE # FALSE if data is given as a data matrix
                                  , p          # dimension of the samples
                                  , d          # dimension of the random projections
                                  , B          # number of random projections to generate and select from
  ) # output: v_hat_star, which is the entrywise-squared weighted eigenvector of P Sigma_hat P, where P is the projection yielding the largest eigenvalue among B different random projections and the weight is given as the eigenvalue gap of P Sigma_hat P      
  {
    rand_ind <- as.vector(replicate(B,sample.int(p,d))) 
    group_decomposition <- sapply(1:B, function(b){
      if(cov == TRUE){
        eig <- eigen(data[rand_ind[d*(b-1)+1:d],rand_ind[d*(b-1)+1:d]], TRUE)
      }else{
        eig <- eigen(1/nrow(data)*crossprod(data[,rand_ind[d*(b-1)+1:d]]), TRUE)
      }
      values <- eig$values
      if(1 == d){values<-append(values,0)}
      vector <- eig$vectors[,1]                                 
      return(list(values[1:2],vector))})
    lambdas_hat <- matrix(unlist(group_decomposition[1,]),nrow = B, byrow = TRUE)
    v_hat <- t(matrix(unlist(group_decomposition[2,]), nrow = B, ncol = d, byrow = TRUE))
    b_star <- which.max(lambdas_hat[,1])
    v_hat_star <- rep(0,p)
    v_hat_star[rand_ind[d*(b_star-1)+1:d]] <- matrix(rep(lambdas_hat[b_star,1]-lambdas_hat[b_star,2],p),nrow=p,byrow=TRUE)*v_hat[,b_star]^2
    return(v_hat_star)
  }
  
  ## Compute the leading eigenvector of the sample covariance matrix given the indices of variables ranked by their importance and desired sparsity level:
  final_estimator <- function(    data            # either data matrix or sample covariance matrix
                                , cov             # TRUE if data is given as a sample covariance matrix 
                                , l               # sparsity of the principal component estimator (if the true sparsity is unknown, this can be an array of (at most p) different sparsity levels and then estimators of corrsponding sparsity levels are returned)
                                , ranking         # p varaibles ranked by their importance
  )# output : a list of two elements, output$vector (a vector / matrix with its columns as estimated eigenvectors of different sparsity level l) and output$value (an array of length lenght(l) with estimated eigenvalues)
  { 
    p <- ncol(data)
    l_loop <-sapply(1:length(l), function(i){
      top_coord <- ranking[1:l[i]]
      if( cov == TRUE ){
        eig <- eigen(data[top_coord,top_coord], TRUE)
      }else{
        eig <- eigen(1/nrow(data)*crossprod(data[,top_coord]), TRUE)
      }
      lambda_hat <- eig$values[1]
      v_hat <- rep(0,p)  
      v_hat[top_coord] <- eig$vectors[,1] 
      return(list(v_hat,lambda_hat))
    })
    v_hat <- matrix(unlist(l_loop[1,]), nrow = p, ncol = length(l) , byrow = FALSE)
    if(length(l)==1){v_hat <- as.vector(v_hat)}
    lambda_hat <- unlist(l_loop[2,])
    output <- list("vector" = v_hat, "value" = lambda_hat)
    return(output) 
  }
  
  ######################################
  ################## the main procedure:
  
  if( !cov & center_data ){
    data <- scale(data, center_data, FALSE)
  }

  p <- ncol(data)
  p_cond <- (p <= 2*d*sqrt(A*B))
  if( p_cond & cov == FALSE){
    data <- 1/nrow(data)*crossprod(data)
    cov <- TRUE
  }else if( !p_cond & cov == FALSE){
    print('SPCAvRP: since p > 2*d*sqrt(A*B), computation of p x p sample covariance is avoided')
  }

  if(!parallel){

  # Generate random projections
  rand_ind <- matrix(replicate(A*B,sample.int(p,d)), nrow = A*B, byrow = TRUE)

  # Project sample covariance
  cov_projections <- project_covariance(data, cov, rand_ind)

  # Ranking of variables by their importance scores
  w <- SPCAvRP_importance_scores(cov_projections, rand_ind, p, A)
  ranking<-sort(w, TRUE, index.return = TRUE)$ix

  }else{
    if(cluster_type == "FORK"){
      cluster = makeCluster(cores, type="FORK")
      #clusterEvalQ(cl = cluster, tools::psnice(value = 19))
    }
    if(cluster_type == "PSOCK"){
      if(is.null(machine_names)){
        cluster <- makePSOCKcluster(1)
      }else{
        cluster <- makePSOCKcluster(names = machine_names)
      }
      clusterExport(cluster, c("select_projection"), envir = environment())
      #clusterEvalQ(cl = cluster, tools::psnice(value = 19))
    }

    # Selection of good projections in parallel
    v_hat_stars <- parSapply(cl = cluster, 1:A, function(a){return(select_projection(data, cov, p, d, B))})
    stopCluster(cluster)

    # Aggregation of good projections
    w <- rowSums(abs(v_hat_stars))/A
    ranking <- sort(w, TRUE, index.return=TRUE)$ix
  }

  output_eig <- final_estimator(data, cov, l, ranking)
  output <- list("vector" = output_eig$vector, "value" = output_eig$value, "importance_scores" = w)

  return(output)
}

Try the SPCAvRP package in your browser

Any scripts or data that you put into this service are public.

SPCAvRP documentation built on May 6, 2019, 1:04 a.m.