R/mixedMemModel.R

#' Constructor for a Mixed Membership Model object which can be fit using 
#' variational inference
#' 
#' Constructor for a \code{mixedMemModelVI} object which can be used be fit using
#' variational inference in the \code{mixedMem} package.
#' 
#' The function returns an object of \code{mixedMemModelVI} class. This object
#' contains dimensions of the model, the observed data, and the model
#' parameters. Once a \code{mixedMemModelVI} object is created, the specified 
#' model can be fit for the data using the \code{mmVarFit} function. If the
#' inputs are inconsistent (ie if dimensions do not match,
#' or if observations and distribution types are not compatible, 
#' \code{mixedMemModelVI} will throw an error. 
#' For additional details on usage, and model assumptions, see the 
#' corresponding vignette "Fitting Mixed Membership Models 
#' Using \code{mixedMem}". Supported data types include Bernoulli, Multinomial,
#' and Rank (Plackett-Luce). Note that the variational methods of \code{mixeMem}
#' do not support the "Extended GoM" model which can be fit through the provided
#' MCMC methods.
#' 
#' @param Total the number of individuals in the sample.
#' @param J the number of variables observed on each individual.
#' @param Rj a vector of length J specifying the number of repeated measurements
#'  for each variable.
#' @param Nijr an array with dimension (\code{Total}, \code{J}, \code{max(Rj)}) indicating the number
#'  of ranking levels for each variable and each replication. For multinomial and Bernoulli
#'  variables, \code{Nijr}[i,j,r] = 1. For rank variables, \code{Nijr}[i,j,r] indicates the
#'  number of alternatives ranked.
#' @param K the number of sub-populations.
#' @param Vj a vector of length \code{J} specifying the number of possible responses
#'  for each variable. For a Bernoulli variable \code{Vj}[j] = 1. 
#' @param alpha a vector of length \code{K} which is the parameter for Dirichlet
#'  membership distribution.
#' @param theta an array with dimensions (\code{J},\code{K},\code{max(Vj)}) which governs the variable
#'  distributions. The parameter \code{theta}[j,k,] governs the distribution of variable J for a complete member of sub-population k. For instance, if variable j is a Bernoulli variable, theta[j,k,1] is the probability of success; if
#'  variable j is a multinomial variable, \code{theta}[j,k, 1:Vj[j]] is the probability for each of the \code{Vj}[j] categories; if variable j
#'  is a rank variable, \code{theta}[j,k, 1:Vj[j]] are the support parameters for each of the \code{Vj}[j] possible categories. Since the dimension of the relevant parameters
#'  can differ across variables, any unused elements of the array should be set to 0, while all other elements should be positive.
#' @param phi an array with dimensions (\code{Total},\code{K}) which specifies the variational
#'  parameters for the membership vectors, \eqn{\lambda}. The default group membership initialization is uniform across all groups (phi[i,k] = 1/K for all k).
#'  The default initialization is highly recommended.
#' @param delta an array with dimensions (\code{Total},\code{J},\code{max(Rj)}, \code{max(Nijr)}, \code{K}) which specifies
#'  the variational parameters for the context vectors Z. The default initialization is
#'   uniform across all sub-populations (\code{delta}[i, j, r, n, k] = 1/K for all k).
#' @param dist a vector of length \code{J} specifying variable types. Options are
#'  "bernoulli", "multinomial" or "rank" corresponing to the distributions
#'   of the observed variables.
#' @param obs an array with dimensions (\code{Total},\code{J},\code{max(Rj)}, \code{max(Nijr)}) corresponding to 
#' the observed data. For Bernoulli random variables, the data consist of 0/1's. 
#' For multinomial or rank data the data consist of integers 0,1,\ldots,(Vj[j]-1).
#' @return returns an object of class \code{mixedMemModelVI}
#' @examples
#' set.seed(123)
#' Total <- 50 # 50 Individuals
#' J <- 3 # 3 different variables
#' # distributions of each variable
#' dist <- c("multinomial", "bernoulli", "rank") 
#' # 100 repeated measures of the multinomial, 5 repeated measures of the
#' # Bernoulli, 1 repeated measure of the rank
#' Rj <- c(100, 5, 1) 
#' 
#' K <- 4 # 4 sub-populations
#' alpha <- rep(.5, K) #hyperparameter for dirichlet distribution
#' 
#' # Number of categories/alternatives for each variable. For the Bernoulli, Vj = 1
#' Vj <- c(10, 1, 4) 
#' 
#' 
#' theta <- array(0, dim = c(J, K, max(Vj)))
#' # Parameters governing multinomial
#' theta[1,,] <- gtools::rdirichlet(K, rep(.3, Vj[1]))
#' # parameters governing Bernoulli
#' theta[2,,] <- cbind(rbeta(K, 1,1), matrix(0, nrow = K, ncol = Vj[1]-1))
#' theta[3,,] <- cbind(gtools::rdirichlet(K, rep(.3, Vj[3])),
#'  matrix(0, nrow = K, ncol = Vj[1]-Vj[3]))
#' 
#' # Items selected for each observation. For Multinomial and Bernoulli, this is always 1
#' # For rank data, this will be the number of alternatives ranked
#' Nijr = array(0, dim = c(Total, J, max(Rj)))
#' Nijr[,1,c(1:Rj[1])] = 1 # N_ijr is 1 for multinomial variables
#' Nijr[,2,c(1:Rj[2])] = 1 # N_ijr is 1 for Bernoulli variables
#' Nijr[,3,c(1:Rj[3])] = sample.int(Vj[3], size = Total, replace = TRUE)
#' 
#' # generate random sample of observations
#' sampleMixedMem <- rmixedMem(Total, J, Rj, Nijr, K, Vj,
#' dist, theta, alpha)
#' 
#' ## Initialize a mixedMemModel object
#' test_model <- mixedMemModel(Total = Total, J = J,Rj = Rj,
#'  Nijr= Nijr, K = K, Vj = Vj,dist = dist, obs = sampleMixedMem$obs,
#'   alpha = alpha, theta = theta)
#' # Look at Summary of the initialized model
#' summary(test_model)
#' # Plot the current values for theta
#' plot(test_model) 
#' @export
mixedMemModelVarInf <- function(Total, J, Rj = rep(1, J),
                                Nijr = array(1, dim = c(Total, J, max(Rj))),
                                K, Vj, alpha, theta, phi = NULL,
                                delta = NULL, dist, obs) {
  
  # Checks if model defaults are used and fills in defaults
  if (is.null(delta))
  {
    delta = array(0, dim = c(Total,J, max(Rj), max(Nijr), K))
    for (i in 1:Total) {
      for (j in 1:J) {
        for (r in 1:Rj[j]) {
          for (n in 1:Nijr[i,j,r]) {
            delta[i, j, r, n, ] <- rep(1/K, K)
          }
        }
      }
    }
  }
  
  if (is.null(phi)) {
    phi <- array(1/K, dim = c(Total,K))
  }
  
  if (length(dim(obs)) == 2) {
    obs <- array(obs, dim = c(dim(obs), 1, 1))
  } else if (length(dim(obs)) == 3) {
    obs <- array(obs, dim = c(dim(obs), 1))
  }
  
  
  
  #put objects in a list
  model_obj <- list(Total, J, Rj, Nijr, K, Vj,
                     alpha, theta , phi , delta,
                     dist, obs)
  
  names(model_obj) = c("Total", "J", "Rj", "Nijr", "K", "Vj", "alpha","theta", "phi",
                         "delta", "dist" ,"obs")    
    
  class(model_obj) <- "mixedMemModelVarInf" 
  #check for valid model parameters
  checkModelVarInf(model_obj)
  
  
  # relabel all attributes
  dimnames(model_obj$theta) <- list(paste("Var", c(1:J), sep = ""),
                                    paste("Group", c(1:K), sep = ""),
                                    paste("Cat", c(1:(max(Vj))), sep = ""))
  
  names(model_obj$alpha) <- paste("Group", c(1:K))
  
  names(model_obj$Vj) <- paste("Var", c(1:J), sep = "")
  names(model_obj$Rj) <- paste("Var", c(1:J), sep = "")
  names(model_obj$dist) <- paste("Var", c(1:J), sep = "")
  
  dimnames(model_obj$phi) <- list(paste("Ind", c(1:Total), sep = ""),
                                  paste("Group", c(1:K), sep = ""))
  
  dimnames(model_obj$delta) <- list(paste("Ind", c(1:Total), sep = ""),
                                    paste("Var", c(1:J), sep = ""),
                                    paste("Rep", c(1:max(Rj)), sep = ""),
                                    paste("Rank", c(1:max(Nijr)), sep = ""),
                                    paste("Group", c(1:K), sep = ""))
  
  dimnames(model_obj$Nijr) <- list(paste("Ind", c(1:Total), sep = ""),
                                   paste("Var", c(1:J), sep = ""),
                                   paste("Rep", c(1:max(Rj)), sep = ""))
  
  dimnames(model_obj$obs) <- list(paste("Ind", c(1:Total), sep = ""),
                                  paste("Var", c(1:J), sep = ""),
                                  paste("Rep", c(1:max(Rj)), sep = ""),
                                  paste("Rank", c(1:max(Nijr)), sep = ""))
  
  return(model_obj)
}


#' Constructor for a Mixed Membership Model object which can be fit through MCMC
#' 
#' Constructor for a \code{mixedMemModelMCMC} object which can be fit using MCMC 
#' in the \code{mixedMem} package.
#' 
#' The function returns an object of \code{mixedMemModelMCMC} class. This object contains dimensions of the model,
#' the observed data, and the model parameters. Once a \code{mixedMemModel} object is created,
#' the specified model can be fit for the data using the \code{mmMCMCFit}
#' function. If the inputs are inconsistent (ie if dimensions do not match,
#' or if observations and distribution types are not compatible,
#' \code{mixedMemModelMCMC} will throw an error. Supported data types 
#' include Bernoulli and Multinomial (Note that rank data is not supported 
#' in the MCMC method). The MCMC method is capable of also estimating the 
#' extended GoM model which allows for a fixed number of "stayers". See 
#' Erosheva et al (2007) for a detailed description of the extended GoM Model.
#' For additional details on usage, and model
#' assumptions, see the corresponding vignette "Fitting Mixed 
#' Membership Models Using \code{mixedMem}".
#' 
#' @param Total the number of individuals in the sample.
#' @param J the number of variables observed on each individual.
#' @param dist a vector of length \code{J} specifying variable types. Options are
#' "bernoulli" or "multinomial" corresponing to the distributions
#'  of the observed variables.
#' @param Rj a vector of length J specifying the number of repeated measurements
#'  for each variable.
#' @param Vj a vector of length \code{J} specifying the number of possible responses
#'  for each variable. For a Bernoulli variable \code{Vj}[j] = 1. 
#' @param obs an array with dimensions (\code{Total},\code{J},\code{max(Rj)}) corresponding to 
#' the observed data. For Bernoulli random variables, the data consist of 0/1's. 
#' For multinomial the data consist of integers 0,1,\ldots,(Vj[j]-1).
#' @param K the number of sub-populations.
#' @param theta an array with dimensions (\code{J},\code{K},\code{max(Vj)}) which governs the variable
#'  distributions. The parameter \code{theta}[j,k,] governs the distribution of variable J for a complete member of sub-population k. For instance, if variable j is a Bernoulli variable, theta[j,k,1] is the probability of success; if
#'  variable j is a multinomial variable, \code{theta}[j,k, 1:Vj[j]] is the probability for each of the \code{Vj}[j] categories. Since the dimension of the relevant parameters
#'  can differ across variables, any unused elements of the array should be set to 0, while all other elements should be positive.
#' @param alpha a scalar representing the sum of the dirichlet membership parameters
#' @param ksi a vector which lies in the K-1 simplex which represents the relative frequency of each sub-population
#' @param lambda a matrix with dimensions (\code{Total}, \code{K}) which represent the partial membership for individual i in group k
#' @param Z an array with dimensions (\code{Total},\code{J},\code{max(Rj)}) which represents the sub-population according to which
#' individual i responded to the r repitition of variable j
#' @param tau an array of dimension (\code{J}, \code{K}, max(2, \code{max(Vj)})) which holds the prior parameters for each of the
#' theta parameters. Note that for Bernoulli variables, even though Vj = 1, both parameters of the beta must be specified so we still require
#' 2 values
#' @param beta shape parameter for prior for alpha 
#' @param gamma scale parameter for prior for alpha
#' @param extended 1 indicates the extended GoM model is being used; 0 indicates the
#' normal mixed membership model without fixed stayers
#' @param P vector of length S + 1 which lies in S dimensional simplex indicating the proportion of individuals in the fixed groups and GoM compartment.
#' The first S elements correspond to the stayer classes, while the S + 1 element indicates the proportion of individuals in GoM compartment
#' @param S integer indicating the number of fixed stayer classes if extended GoM is used.
#' @param fixedObs an array with dimensions (S, \code{J}, \code{max(Rj)}) corresonding to the observed
#' responses for a fixed group in the extended GoM model from Erosheva et al (2007).
#' @return returns an object of class \code{mixedMemModelMCMC}.
#' @export
mixedMemModelMCMC <- function(Total, J, dist, Rj = rep(1, J), Vj, obs, K, theta, alpha0,
                              ksi, lambda = NULL, Z = NULL, tau, beta, gamma,
                              extended = 0, P = NULL, S = NULL, fixedObs = NULL){
  
  if (length(dim(obs)) == 2) {
    obs <- array(obs, dim = c(dim(obs), 1))
  }
  
  if (is.null(lambda)) {
    lambda <- gtools::rdirichlet(Total, alpha0 * ksi)
  }
  
  if (is.null(Z)) {
    Z <- array(sample.int(K, size = Total * J * max(Rj),
                          replace = T, prob = alpha0 * ksi), dim = c(Total, J, max(Rj)))
  }
  
  model <- list(Total = Total, J = J, dist = dist, Rj = Rj, Vj = Vj, obs = obs,
                  K = K,  theta = theta, alpha = c(alpha0), ksi = ksi,
                  lambda = lambda, Z = Z, tau = tau, beta = beta, gamma  = gamma, 
                  extended = extended, P = P, S = S, fixedObs = fixedObs)
  
  checkModelMCMC(model)
  
  
  names(model$Vj) <- names(model$Rj) <- names(model$dist) <-
    paste("Var", c(1:J))
  
  dimnames(model$obs) <- list(paste("Ind", c(1:Total)),
                              paste("Var", c(1:J)),
                              paste("Rep", c(1:max(Rj))))
  
  
  dimnames(model$theta) <- list(paste("Var", c(1:J)),
                                paste("Group", c(1:K)),
                                paste("Cat", c(0:(max(Vj)-1))))
  
  names(model$ksi) <- paste("Group", c(1:K))
  
  dimnames(model$lambda) <- list(paste("Ind", c(1:Total)),
                                 paste("Group", c(1:K)))
  
  dimnames(model$Z) <- list(paste("Ind", c(1:Total)),
                            paste("Var", c(1:J)),
                            paste("Rep", c(1:max(Rj))))
  
  
  dimnames(model$tau) <- list(paste("Var", c(1:J)),
                              paste("Group", c(1:K)),
                              paste("Cat", c(0:(max(c(Vj, 2) -1)))))
  
  if (extended) {
    names(model$P) <- c(paste("Stayer", c(1:S)), "GoM")
  }
  
  class(model) <- "mixedMemModelMCMC"
  
  return(model)
  
}








#' Summary of a Mixed Membership Model
#' 
#' Generic S3 summary function for \code{mixedMemModelVarInf} class. 
#' 
#' \code{summary} provides a summary of the given \code{mixedMemModelVarInf} object. The function 
#' prints the ELBO, the dimensions of the model and each variable type.
#'  
#' @param object the \code{mixedMemModelVarInf} object to be summarized
#' @param ... additional parameters
#' @seealso mixedMemModelVI
#' @export
summary.mixedMemModelVarInf = function(object,...)
{
  cat("== Summary for Mixed Membership Model ==\n")
  cat(paste("Total: ", object$Total, "\t\t K: ",object$K, "\t\t ELBO: ",round(computeELBO(object),2),"\n\n" ,sep = ""))
  df = data.frame(paste("  ",c(1:object$J),sep = ""), object$dist, object$Rj,
                  object$Vj)
  colnames(df) = c("Variable  ", "Variable Type    ", "Replicates    ", "Categories  ")
  print(df, row.names = FALSE, right = FALSE)
}


#' Summary of a Mixed Membership Model
#' 
#' Generic S3 summary function for \code{mixedMemModelMCMC} class. 
#' 
#' \code{summary} provides a summary of the given \code{mixedMemModelMCMC} object. The function 
#' prints the ELBO, the dimensions of the model and each variable type.
#'  
#' @param object the \code{mixedMemModelMCMC} object to be summarized
#' @param ... additional parameters
#' @seealso mixedMemModelMCMC
#' @export
summary.mixedMemModelMCMC = function(object,...)
{
  cat("== Summary for Mixed Membership Model ==\n")
  cat(paste("Total: ", object$Total, "\t\t K: ",object$K, "\n\n" ,sep = ""))
  df <- data.frame(paste("  ",c(1:object$J),sep = ""), object$dist, object$Rj,
                  object$Vj)
  colnames(df) <- c("Variable  ", "Variable Type    ", "Replicates    ", "Categories  ")
  print(df, row.names = FALSE, right = FALSE)
}


#' Plot a Mixed Membership Model
#' 
#' Generic S3 function to produce visual representation of a \code{mixedMemModelVarInf} object. This function calls either the \code{vizTheta} or
#' the \code{vizMem} function. 
#' 
#' @param x the \code{mixedMemModelVarInf} object to be plotted.
#' @param type a string which indicates which estimated parameters to plot; valid options are "theta" or "membership".
#'  \code{vizTheta} is called when the type is "theta" and \code{vizMem} is called when the type is "membership". 
#' @param compare an array or matrix for comparison. When type = "theta", \code{compare} should be an array the same size as x$theta.
#' When type = "membership", \code{compare} should be a matrix the same size as x$phi.
#' @param main the main figure title.
#' @param varNames a vector of strings corresponding to names for each variable if plot type is theta.
#' @param groupNames a vector of strings corresponding to labels for each sub-population.
#' @param nrow the number of rows for the grid of plots.
#' @param ncol the number of columns for the grid of plots. If plot type is "theta", this
#' must be K, if plot type is  "membership", this must be a positive integer.
#' @param indices a vector of integers. If the plot type is  "membership", this indicates which individuals to plot.
#' When plot type is  "theta", this indicates which variables to plot. 
#' @param fitNames a vector of strings corresponding to labels for each fit.
#' @param ... additional parameters to be passed.
#' @seealso mixedMemModelVI, vizTheta, vizMem
#' @export
plot.mixedMemModelVarInf = function(x, type = "theta" , compare = NULL,
                              main = NULL,
                              varNames = NULL,
                              groupNames = NULL,
                              nrow = NULL, ncol = NULL, indices = NULL, fitNames = NULL,...){
  if(type =="theta") {
    if(is.null(main)){
      main = "Estimated Theta"
    }
    vizTheta(x, compare = compare, main = main, varNames = varNames,
             groupNames = groupNames, nrow = nrow, fitNames = fitNames, indices = indices)
  } else if (type == "membership") {
    if(is.null(main)){
      main = "Estimated Memberships"
    }
    vizMem(x,compare = compare, main = main, nrow = nrow, ncol = ncol,
           indices = indices, fitNames = fitNames, groupNames = groupNames)
  }
}

#' Plot a Mixed Membership Model
#' 
#' Generic S3 function to produce visual representation of a \code{mixedMemModelMCMC} object. This function calls either the \code{vizTheta} or
#' the \code{vizMem} function. 
#' 
#' @param x the \code{mixedMemModel} object to be plotted.
#' @param type a string which indicates which estimated parameters to plot; valid options are "theta" or "membership".
#'  \code{vizTheta} is called when the type is "theta" and \code{vizMem} is called when the type is "membership". 
#' @param compare an array or matrix for comparison. When type = "theta", \code{compare} should be an array the same size as x$theta.
#' When type = "membership", \code{compare} should be a matrix the same size as x$phi.
#' @param main the main figure title.
#' @param varNames a vector of strings corresponding to names for each variable if plot type is theta.
#' @param groupNames a vector of strings corresponding to labels for each sub-population.
#' @param nrow the number of rows for the grid of plots.
#' @param ncol the number of columns for the grid of plots. If plot type is "theta", this
#' must be K, if plot type is  "membership", this must be a positive integer.
#' @param indices a vector of integers. If the plot type is  "membership", this indicates which individuals to plot.
#' When plot type is  "theta", this indicates which variables to plot. 
#' @param fitNames a vector of strings corresponding to labels for each fit.
#' @param ... additional parameters to be passed.
#' @seealso mixedMemModelMCMC, vizTheta, vizMem
#' @export
plot.mixedMemModelMCMC = function(x, type = "theta" , compare = NULL,
                                main = NULL,
                                varNames = NULL,
                                groupNames = NULL,
                                nrow = NULL, ncol = NULL, indices = NULL, fitNames = NULL, ...){
  
  if (type == "theta") {
    if (is.null(main)) {
      main <- "Estimated Theta"
    }
    vizTheta(x, compare = compare, main = main, varNames = varNames,
             groupNames = groupNames, nrow = nrow, fitNames = fitNames, indices = indices)
  } else if (type == "membership") {
    if (is.null(main)) {
      main = "Estimated Memberships"
    }
    vizMem(x,compare = compare, main = main, nrow = nrow, ncol = ncol,
           indices = indices, fitNames = fitNames, groupNames = groupNames)
  }
}



#' Print the theta estimates for each sub-population
#' 
#' Print the theta estimates for each sub-population to the R console in a more
#' aesthetically pleasing manner
#' 
#' @param model the \code{mixedMemModel} who's theta will be printed.
#' @seealso mixedMemModel, vizTheta
#' @export
theta.table <- function(model, digits = 3, top.level = "group"){
  if (top.level == "group"){
    for (k in 1:model$K){
      cat(paste("Group ", k, ":", "(alpha = ", round(model$alpha[k], 3),
                "; relative freq = ", round(model$alpha[k] / sum(model$alpha), 2), ")", sep = ""))
      cat("\n")
      for (j in 1:model$J)
      {
        cat(paste("> Var", j,":", model$dist[j], "\n\t"))
        if (model$dist[j] == "bernoulli") {
          for (v in 0:1) {
            cat(paste("Cat", v,"\t"))
          }
          cat("\n\t")
          cat(paste(round(1 - model$theta[j,k,1], digits),"\t"))
          cat(paste(round(model$theta[j,k,1], digits),"\t"))
        } else{
          for (v in 1:model$Vj[j]) {
            cat(paste("Cat", v - 1,"\t"))
          }
          cat("\n\t")
          for (v in 1:model$Vj[j]) {
            cat(paste(round(model$theta[j, k, v],digits),"\t"))
          }
        }
        cat("\n")
      }
    }
  } else {
      for (j in 1:model$J) {
        cat(paste("Var", j,":", model$dist[j], "\n\t"))
        if (model$dist[j] == "bernoulli") {
          cat("\t\t")
          for (v in 0:1) {
            cat(paste("Cat", v,"\t"))
          }
          cat("\n")
          for (k in 1:model$K) {
            cat(paste("\tGroup", k, "\t"))
            cat(paste(round(1 - model$theta[j,k,1],digits), "\t"))
            cat(paste(round(model$theta[j,k,1],digits), "\t"))
            cat("\n")
          }
          cat("\n")
        } else {
          cat("\t\t")
          for (v in 1:model$Vj[j]) {
            cat(paste("Cat", v,"\t"))
          }
          cat("\n")
          for (k in 1:model$K) {
            cat(paste("\tGroup", k, "\t"))
            for (v in 1:model$Vj[j]) {
              cat(paste(round(model$theta[j,k,v],digits),"\t"))
            }
            cat("\n")
          }
        }
        cat("\n")
      }
  }
}
ysamwang/mixedMem documentation built on May 4, 2019, 5:33 p.m.