R/gfpca_Bayes.R

#' gfpca_Bayes
#' 
#' Implements a Bayesian approach to generalized functional principal
#' components analysis for sparsely observed binary curves
#' 
#' 
#' @param data A dataframe containing observed data. Should have column names
#' \code{index} for observation times, \code{value} for observed responses,
#' and \code{id} for curve indicators.
#' @param npc Number of FPC basis functions to estimate; defaults to 3
#' @param output_index Grid on which estimates should be computed. Defaults to
#' \code{NULL}, in which case estimates are returned on the timepoints observed
#' in \code{data}
#' @param nbasis Number of basis functions used in spline expansions; defaults to 10
#' @param iter Number of sampler iterations; defaults to 1000
#' @param warmup Number of iterations discarded as warmup; defaults to 400
#' @param method Bayesian fitting method; \code{vb} (default) or \code{sampling}
#' 
#' @author Jeff Goldsmith \email{jeff.goldsmith@@columbia.edu} and John Muschelli
#' 
#' @references Gertheiss, J., Goldsmith, J., and Staicu, A.-M. (2016). A note
#' on modeling sparse exponential-family functional response curves.
#' \emph{Under Review}.
#' @examples
#' 
#' \dontrun{
#' library(mvtnorm)
#' library(boot)
#' library(refund.shiny)
#' 
#' ## set simulation design elements
#' 
#' bf = 10                           ## number of B-spline functions used in smoothing the covariance
#' D = 101                           ## size of grid for observations
#' Kp = 2                            ## number of true FPC basis functions
#' grid = seq(0, 1, length = D)
#' 
#' ## sample size and sparsity
#' I <- 300
#' mobs <- 7:10
#' 
#' ## mean structure
#' mu <- 8*(grid - 0.4)^2 - 3
#' 
#' ## Eigenfunctions / eigenvalues for the covariance:
#' psi.true = matrix(NA, 2, D)
#' psi.true[1,] = sqrt(2)*cos(2*pi*grid)
#' psi.true[2,] = sqrt(2)*sin(2*pi*grid)
#' 
#' lambda.true = c(1, 0.5)
#' 
#' ## generate data
#' 
#' set.seed(1)
#' 
#' ## FPC effects: xi_i1 psi_1(t) + xi_i2 psi_2(t)
#' c.true = rmvnorm(I, mean = rep(0, Kp), sigma = diag(lambda.true))
#' Zi = c.true %*% psi.true
#' 
#' Wi = matrix(rep(mu, I), nrow = I, byrow = TRUE) + Zi
#' pi.true = inv.logit(Wi)  # inverse logit is defined by g(x)=exp(x)/(1+exp(x))
#' Yi.obs = matrix(NA, I, D)
#' for(i in 1:I){
#'   for(j in 1:D){
#'     Yi.obs[i,j] = rbinom(1, 1, pi.true[i,j])
#'   }
#' }
#' 
#' ## "sparsify" data
#' for (i in 1:I) {
#'   mobsi <- sample(mobs, 1)
#'   obsi <- sample(1:D, mobsi)
#'   Yi.obs[i,-obsi] <- NA
#' }
#' 
#' Y.vec = as.vector(t(Yi.obs))
#' subject <- rep(1:I, rep(D,I))
#' t.vec = rep(grid, I)
#' 
#' data.sparse = data.frame(
#'   index = t.vec,
#'   value = Y.vec,
#'   id = subject
#' )
#' 
#' data.sparse = data.sparse[!is.na(data.sparse$value),]
#' 
#' ## fit Bayesian models
#' fit.Bayes = gfpca_Bayes(data = data.sparse)
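#' 
#' ## a full MCMC fit is a slower alternative (a sketch; same interface):
#' # fit.mcmc = gfpca_Bayes(data = data.sparse, method = "sampling")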
#' plot(grid, mu, type = "l")
#' lines(fit.Bayes$index, fit.Bayes$mu, col = 2)
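#' 
#' ## quick look at the estimated eigenfunctions (a sketch; efunctions is D x npc)
#' matplot(fit.Bayes$index, fit.Bayes$efunctions, type = "l", lty = 1)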
#' plot_shiny(fit.Bayes)
#' }
#' 
#' @export gfpca_Bayes
#' @import rstan
#' @importFrom splines bs
gfpca_Bayes <- function(data, npc = 3, output_index = NULL, nbasis = 10,
                        iter = 1000, warmup = 400,
                        method = c("vb", "sampling")) {

  method = match.arg(method)

  ## basic data checks: required columns must be present
  stopifnot(all(c("index", "value", "id") %in% names(data)))

  if (is.null(output_index)) { output_index = sort(unique(data$index)) }
  
  I = length(unique(data$id))
  D = length(output_index)
  
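  ## fixed effects design matrix: intercept-only, so the model has a single
  ## population-level mean function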
  X.des = matrix(1, nrow = I, ncol = 1)
  p = 1
  
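  ## cubic B-spline design matrices at the observed times and the output grid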
  BS_fit = bs(data$index, df = nbasis, intercept = TRUE, degree = 3)
  BS_output = bs(output_index, df = nbasis, intercept = TRUE, degree = 3)
  
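  ## penalty matrix for the spline coefficients: a convex combination of a
  ## zeroth-order (ridge) penalty P0 and a second-difference smoothness
  ## penalty P2, weighted by alpha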
  alpha = .1
  diff0 = diag(1, D, D)
  diff2 = matrix(rep(c(1, -2, 1, rep(0, D - 2)), D - 2)[1:((D - 2)*D)], D - 2, D, byrow = TRUE)
  P0 = t(BS_output) %*% t(diff0) %*% diff0 %*% BS_output
  P2 = t(BS_output) %*% t(diff2) %*% diff2 %*% BS_output
  P.mat = alpha * P0 + (1 - alpha) * P2
  
  
  ## format sparse data for stan fitting
  Y.vec.obs = data$value
  subject.obs = data$id
  n.total = length(Y.vec.obs)
  
  ## fit model using the precompiled Stan model shipped with the package
  stanfit = stanmodels$gfpca

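  ## data list matching the Stan model's data block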
  dat = list(Y = Y.vec.obs, X = X.des, BS = BS_fit,
             subjId = subject.obs,
             N = n.total, I = I, D = D, p = p, Kt = nbasis, Kp = npc, 
             PenMat = P.mat)
  
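  ## variational Bayes gives a fast approximate posterior; "sampling" runs
  ## full HMC (a single chain here)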
  if (method == "vb") {
    GenFPCA.fit = vb(stanfit, data = dat, iter = iter)
  } else if (method == "sampling") {
    GenFPCA.fit = sampling(stanfit,
                           data = dat, iter = iter,
                           warmup = warmup,
                           control = list(adapt_delta = .65),
                           chains = 1, verbose = FALSE)
  }
  
  ## post process to obtain point estimates
  beta.post = extract(GenFPCA.fit, "beta")$beta
  beta_psi.post = extract(GenFPCA.fit, "beta_psi")$beta_psi
  c.post = extract(GenFPCA.fit, "c")$c
  
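  ## reconstruct the population mean function on the output grid for each
  ## posterior draw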
  betaHat.post = array(NA, dim = c(p, D, dim(c.post)[1]))
  for (i in 1:dim(c.post)[1]) {
    betaHat.post[,,i] = (beta.post[i,,] %*% t(BS_output))
  }
  
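  ## per-draw fitted linear predictor (y.post) and latent FPC contribution
  ## (z.post) on the output grid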
  y.post = z.post = array(NA, dim = c(I, D, dim(c.post)[1]))
  for (i in 1:dim(c.post)[1]) {
    y.post[,,i] = X.des %*% (beta.post[i,,] %*% 
                               t(BS_output)) + c.post[i,,] %*% 
      (beta_psi.post[i,,] %*% t(BS_output))
    z.post[,,i] = c.post[i,,] %*% (beta_psi.post[i,,] %*% t(BS_output))
  }
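  ## posterior means of the latent process and of the fitted linear predictor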
  z_pm = apply(z.post, c(1,2), mean)
  Yhat_matrix = apply(y.post, c(1,2), mean)
  
  ## obtain orthogonal estimates of FPC quantities via an SVD of the posterior
  ## mean latent process; singular values/vectors serve as eigenvalue and
  ## eigenfunction estimates
  decomp = svd(z_pm)
  evalues = decomp$d[1:npc]
  efunctions = decomp$v[, 1:npc]
  scores = decomp$u[, 1:npc]
  
  ## format output
  mu = as.vector(apply(betaHat.post, c(1,2), mean))
  Y = data
  index = output_index
  family = "binomial"
  Yhat = data.frame(
    value = as.vector(t(Yhat_matrix)),
    index = rep(output_index, I),
    id = rep(1:I, each = D)
  )
  
  ret.objects = c("Yhat", "Y", "scores", "mu", "efunctions", "evalues", "npc", "index", "family")
  ret = lapply(seq_along(ret.objects), function(u) get(ret.objects[u]))
  names(ret) = ret.objects
  class(ret) = "fpca"
  return(ret)
  
}