# R/PX-CAVI-Laplace.R
# Defines function: spca.cavi.Laplace (documented in spca.cavi.Laplace)

#' Function for the PX-CAVI algorithm using the Laplace slab
#'
#' This function employs the PX-CAVI algorithm proposed in Ning (2020).
#' The \eqn{g} in the slab density of the spike and slab prior is chosen to be the Laplace density
#' with tuning parameter \eqn{\lambda_1}. (Note: the \eqn{N(0, \sigma^2/\lambda_1 I_r)} form applies
#' to the multivariate-normal slab variant, not the Laplace slab used here.)
#' Details of the model and the prior can be found in the Details section in the description of the `VBsparsePCA()` function.
#' This function is not capable of handling the case when r > 1. In that case, we recommend to use the multivariate distribution instead.
#'
#'
#'
#'@param x Data an \eqn{n*p} matrix.
#'@param r Rank.
#'@param lambda Tuning parameter for the density \eqn{g}.
#'@param max.iter The maximum number of iterations for running the algorithm.
#'@param eps The convergence threshold; the default is \eqn{10^{-3}}.
#'@param sig2.true The default is false, \eqn{\sigma^2} will be estimated; if sig2 is known and its value is given, then \eqn{\sigma^2} will not be estimated.
#'@param threshold The threshold to determine whether \eqn{\gamma_j} is 0 or 1; the default value is 0.5.
#'@param theta.int The initial value of theta mean; if not provided, the algorithm will estimate it using PCA.
#'@param theta.var.int The initial value of theta.var; if not provided, the algorithm will set it to be 1e-3*diag(r).
#'@param kappa.para1 The value of \eqn{\alpha_1} of \eqn{\pi(\kappa)}; default is 1.
#'@param kappa.para2 The value of \eqn{\alpha_2} of \eqn{\pi(\kappa)}; default is \eqn{p+1}.
#'@param sigma.a The value of \eqn{\sigma_a} of \eqn{\pi(\sigma^2)}; default is 1.
#'@param sigma.b The value of \eqn{\sigma_b} of \eqn{\pi(\sigma^2)}; default is 2.
#'
#'
#'@return \item{iter}{The number of iterations to reach convergence.}
#' \item{selection}{A vector (if \eqn{r = 1} or with the jointly row-sparsity assumption) or a matrix (if otherwise) containing the estimated value for \eqn{\boldsymbol \gamma}.}
#' \item{theta.mean}{The loadings matrix.}
#' \item{theta.var}{The covariance of each non-zero rows in the loadings matrix.}
#' \item{sig2}{Variance of the noise.}
#' \item{obj.fn}{A vector contains the value of the objective function of each iteration. It can be used to check whether the algorithm converges}
#'
#' @examples
#' #In this example, the first 20 rows in the loadings matrix are nonzero, the rank is 1
#' set.seed(2021)
#' library(MASS)
#' library(pracma)
#' n <- 200
#' p <- 1000
#' s <- 20
#' r <- 1
#' sig2 <- 0.1
#' # generate eigenvectors
#' U.s <- randortho(s, type = c("orthonormal"))
#' U <- rep(0, p)
#' U[1:s] <- as.vector(U.s[, 1:r])
#' s.star <- rep(0, p)
#' s.star[1:s] <- 1
#' eigenvalue <- seq(20, 10, length.out = r)
#' # generate Sigma
#' theta.true <- U * sqrt(eigenvalue)
#' Sigma <- tcrossprod(theta.true) + sig2*diag(p)
#' # generate n*p dataset
#' X <- t(mvrnorm(n, mu = rep(0, p), Sigma = Sigma))
#' result <- spca.cavi.Laplace(x = X, r = 1)
#' loadings <- result$theta.mean
#' @export
#'
spca.cavi.Laplace <- function(
  x, r = 1, lambda = 1, max.iter = 100, eps = 1e-3,
  sig2.true = NA, threshold = 0.5,
  theta.int = NA, theta.var.int = NA,
  kappa.para1 = NA, kappa.para2 = NA, sigma.a = NA,
  sigma.b = NA
  ) {

  #------ Dimensions of the input matrix ------#
  n <- dim(x)[1]
  p <- dim(x)[2]

  # Initialize theta.mean: projection of x onto its first left singular
  # vector (a PCA estimate), unless an initial value is supplied.
  # all(is.na(.)) keeps the if() condition scalar when a matrix-valued
  # theta.int is supplied; a bare is.na() on a matrix yields a length > 1
  # condition, which is an error under R >= 4.2.
  svd.res <- svd(x)
  if (all(is.na(theta.int))) {
    theta.mean <- as.matrix((t(svd.res$u) %*% x)[1, ]) / sqrt(n-1)
  } else {
    theta.mean <- theta.int
  }

  # Initialize the per-row posterior covariance of theta (one r x r matrix
  # per row j = 1, ..., p).
  if (all(is.na(theta.var.int))) {
    theta.var <- lapply(1:p, FUN =function(j){1e-3*diag(r)})
  } else {
    theta.var <- lapply(1:p, FUN =function(j){theta.var.int})
  }

  # Initialize the inclusion indicators z: all rows start as selected.
  z <- rep(1,p)

  # Initialize sigma^2 from the second-smallest squared singular value of x
  # scaled to a variance, unless the true value is supplied.
  if (is.na(sig2.true)) {
    sig2 <- (svd.res$d[length(svd.res$d)-1])^2 / (n-1)
  } else {
    sig2 <- sig2.true
  }

  # Hyperparameters of the priors; NA means "use the default".
  if (is.na(kappa.para1)) kappa.para1 <- 1      # alpha_1 of pi(kappa)
  if (is.na(kappa.para2)) kappa.para2 <- p + 1  # alpha_2 of pi(kappa)
  if (is.na(sigma.a)) sigma.a <- 1              # first parameter of pi(sigma^2)
  if (is.na(sigma.b)) sigma.b <- 2              # second parameter of pi(sigma^2)

  # Pre-allocate per-iteration storage.
  theta.res <- array(NA, dim = c(p, r, max.iter))
  selection.res <- array(NA, dim = c(p,r, max.iter))
  obj.fn.res <- matrix(NA, max.iter)
  sig2.res <- rep(NA, max.iter)

  for (iter in 1:max.iter) {

    ########################## E-step: update q(w) #############################
    # On the first pass, seed the "old" values and the beta parameters from
    # the theta initialization.
    if (iter == 1) {
      theta.mean.old <- theta.mean
      z.hat.old <- z.theta <- z
      beta.mean <- theta.mean
      beta.var <- theta.var
    }

    z.theta.mean <- theta.mean*z.theta # theta.mean with unselected rows zeroed
    z.theta.var.sum <- sum(sapply(1:p, function(j) {
      z.theta[j] * theta.var[[j]]
    }))

    var.w <- solve(crossprod(z.theta.mean)/sig2 + z.theta.var.sum + diag(r)) # var of w

    # Posterior mean of each w_i.
    mean.w <- sapply(1:n, FUN = function(i) {
      x[i,] %*% ((z.theta.mean) %*% var.w) / sig2
    })

    # Second moment of each w_i; MSE.w is their sum over i = 1, ..., n.
    MSE.w.i <- lapply(1:n, FUN = function(i) {
      if (r == 1) {
        (mean.w[i]^2 + var.w)
      } else {
        (tcrossprod(mean.w[, i]) + var.w)
      }
    })
    MSE.w <- Reduce("+", MSE.w.i)

    ########################## update beta #############################
    # 1-D objective in beta.mean for one column j of x (MSE.w is read from
    # the enclosing scope).
    fn.beta.mean <- function(x, mean.w, var.w, beta.mean, beta.var, lambda, sig2) {

      mean.w <- t(mean.w)
      term1 <- - 1/sig2 * x %*% (t(mean.w) %*% beta.mean)
      term2 <- 1/(2*sig2) * beta.mean %*% MSE.w %*% beta.mean
      term3 <- lambda * sum(sapply(1:r, FUN = function(k) {
        foldednorm.mean(beta.mean[k], sig2 * beta.var[k,k])
      }))
      obj <- c(term1) + c(term2) + c(term3)
      return(obj)
    }

    # Update beta.mean row-wise by univariate optimization.
    beta.mean <- sapply(1:p, FUN = function(j) {
      optimize(fn.beta.mean, c(-1e2, 1e2), x = x[, j], mean.w = mean.w, var.w = var.w,
               beta.var = beta.var[[j]], lambda = lambda, sig2 = sig2)$minimum
    })
    beta.mean <- matrix(beta.mean, p, 1)

    # 1-D objective in beta.var for one row j.
    fn.beta.var <- function(mean.w, var.w, beta.mean, beta.var.vec, lambda, sig2, r) {

      beta.var <- beta.var.vec
      term1.1 <- c(beta.var) * MSE.w
      term1 <- 1/2 * term1.1
      term2 <- 1/2 * log(c(beta.var)+1e-6) # 1e-6 offset guards log(0)
      term3 <- lambda * foldednorm.mean(c(beta.mean), sig2 * c(beta.var))

      obj <- c(term1) + c(term2) + c(term3)
      return(obj)
    }

    # Update beta.var row-wise; the scalar optimum is stored as an r x r matrix.
    beta.var <- lapply(1:p, function(j) {

      value <- optimize(fn.beta.var, lower = 0, upper = 0.5, mean.w = mean.w, var.w = var.w,
                        beta.mean = beta.mean[j, ], lambda = lambda, sig2 = sig2,r=r)$minimum
      mtx <- matrix(value, r, r)

    })


    #### Update z ########
    # h.hat[j] is the log-odds that row j of the loadings is non-zero.
    h.hat <- sapply(1:p, FUN = function(j) {

      term1 <- - 1/(sig2) * sum((mean.w %*% x[, j]) * beta.mean[j, ])
      term2 <- 1/(2*sig2) * beta.mean[j, ]^2 * MSE.w
      term3.1 <- sig2 * beta.var[[j]] * MSE.w
      term3 <- term3.1 / (2*sig2)
      term4 <- lambda * foldednorm.mean(beta.mean[j, ], sig2 * beta.var[[j]])
      term5 <- r * log(sqrt(2/(pi * sig2 * lambda^2)))/2 - log(beta.var[[j]])/2 -
        1/2 + log(kappa.para2/kappa.para1)

      sum <- -(c(term1) + c(term2) + c(term3) + c(term4) + c(term5))
    })

    z.hat <- pracma::sigmoid(h.hat) # posterior inclusion probabilities
    z <- ifelse(z.hat < threshold, 0, 1) # hard-threshold to the estimated z
    z.beta.mean <- beta.mean * z

    ######################## Estimate sig2 ###########################
    if (is.na(sig2.true)) {

      # NOTE(review): sigma2.fn never uses its argument `sigma2`; the body
      # reads the enclosing `sig2` throughout, so the optimize() call below
      # minimizes a constant function. Preserved as-is to avoid changing
      # published numerical behavior — confirm the intended parameterization
      # (variance vs. precision) against Ning (2020) / spca.cavi.mvn.
      sigma2.fn <- function(sigma2) {
        sigma.foreach.j <- sapply(1:p, FUN = function(j) {
          sum(x[, j]^2 - 2*c(mean.w %*% x[, j] %*% z.beta.mean[j, ]))/(2*sig2) +
            lambda*sum(sapply(1:r, FUN = function(k) {
              foldednorm.mean(beta.mean[j, k], sig2 * beta.var[[j]][k,k])
            })) - r*log(sig2)/2 + n*log(sig2)/2
        })
        obj.fn <- sum(sigma.foreach.j) + (sigma.a+1) * log(sig2) + sigma.b/sig2
      }

      sig2 <- 1/optimize(sigma2.fn, lower = 0, upper = 10)$minimum
    }

    #################### Rotate beta back to theta ###################
    # First rotation: D = E[ww']/n via its (lower-triangular) Cholesky factor.
    D <- MSE.w/n
    chol.D <- t(chol(D))
    beta.mean.tilde <- beta.mean %*% chol.D
    theta.var <- lapply(1:p, function(j) {
      chol.D %*% beta.var[[j]] %*% t(chol.D)
    })

    # Second rotation: align to the left singular vectors of beta.mean.tilde.
    beta.mean.tilde.svd <- svd(beta.mean.tilde)
    A <- beta.mean.tilde.svd$u
    theta.mean <- A %*% beta.mean.tilde.svd$d

    ####################### obtain sparsity for theta #######################
    z.theta <- z

    ################# obtain the objective function #####################
    # Convergence criterion: the larger of the squared change in the
    # projection matrix and the total change in inclusion probabilities.
    obj.fn1 <- sum((tcrossprod(theta.mean) - tcrossprod(theta.mean.old))^2)
    obj.fn2 <- sum(abs(z.hat - z.hat.old))
    obj.fn <- max(obj.fn1, obj.fn2)

    ######################### collect results ############################
    selection.res[, 1,iter] <- z.theta
    theta.res[, , iter] <- theta.mean * z.theta
    obj.fn.res[iter] <- obj.fn
    sig2.res[iter] <- sig2

    if (obj.fn < eps) {
      break
    } else {
      theta.mean.old <- theta.mean
      z.hat.old <- z.hat
    }
  }

  return(list(iter = iter, selection = selection.res[,,iter],
              theta.mean = theta.res[,,iter], theta.var  = theta.var,
              sig2 = sig2, obj.fn = obj.fn.res[1:iter]))
}

# Try the VBsparsePCA package in your browser.
# Any scripts or data that you put into this service are public.
# VBsparsePCA documentation built on Feb. 12, 2021, 5:06 p.m.