R/GEM.R
In CUB: A Class of Mixture Models for Ordinal Data

Documented in GEM

#' @title Main function for GEM models
#' @description Main function to estimate and validate GEneralized Mixture models with uncertainty. 
#' @aliases GEM
#' @usage GEM(Formula,family=c("cub","cube","ihg","cush"),data,...)
#' @param Formula Object of class Formula. Response variable is the vector of ordinal observations - see Details.
#' @param family Character string indicating which class of GEM models to fit.
#' @param data an optional data frame (or object coercible by \code{as.data.frame} to a data frame)
#'  containing the variables in the model. If missing, the variables are taken from \code{environment(Formula)}.
#' @param ... Additional arguments to be passed for the specification of the model. See details and examples.
#' @export GEM
#' @return An object of the class "GEM" is a list containing the following elements: 
#' \item{estimates}{Maximum likelihood estimates of parameters}
#' \item{loglik}{Log-likelihood function at the final estimates}
#' \item{varmat}{Variance-covariance matrix of final estimates}
#' \item{niter}{Number of executed iterations}
#' \item{BIC}{BIC index for the estimated model}
#' \item{ordinal}{Vector of ordinal responses on which the model has been fitted}
#' \item{time}{Processor time for execution}
#' \item{ellipsis}{Retrieve the arguments passed to the call and extra arguments generated via the call}
#' \item{family}{Character string indicating the sub-class of the fitted model}
#' \item{formula}{Returns the Formula of the call for the fitted model}
#' \item{call}{Returns the executed call}
#' @import methods Formula
#' @details It is the main function for GEM models estimation, calling for the corresponding function for
#'  the specified subclass. The number of categories \code{m} is internally retrieved but it is advisable to pass
#'  it as an argument to the call if some category has zero frequency.\cr
#'  If  \code{family="cub"}, then a CUB mixture model is fitted to the data to explain uncertainty, 
#'  feeling and possible shelter effect by further passing the extra argument \code{shelter} for the corresponding category.
#'  Subjects' covariates can be included by specifying covariates matrices in the 
#'  Formula as \code{ordinal~Y|W|X},  to explain uncertainty (Y), feeling (W) or shelter (X).  Notice that
#'  covariates for shelter effect can be included only if specified for both feeling and uncertaint (GeCUB models). \cr
#'  If \code{family="cube"}, then a CUBE mixture model (Combination of Uniform and Beta-Binomial) is fitted to the data
#'   to explain uncertainty, feeling and overdispersion.   Subjects' covariates can be also included to explain the
#'    feeling component or all the three components by  specifying covariates matrices in the Formula as 
#'    \code{ordinal~Y|W|Z} to explain uncertainty (Y), feeling (W) or 
#'  overdispersion (Z). An extra logical argument \code{expinform} indicates whether or not to use the expected or the 
#'  observed information matrix (default is FALSE). \cr
#'  If \code{family="ihg"}, then an IHG model is fitted to the data. IHG models (Inverse Hypergeometric) are nested into
#'   CUBE models (see the references below). The parameter \eqn{\theta} gives the probability of observing 
#'   the first category and is therefore a direct measure of preference, attraction, pleasantness toward the 
#'  investigated item. This is the reason why \eqn{\theta} is customarily referred to as the preference parameter of the 
#'  IHG model. Covariates for the preference parameter \eqn{\theta} have to be specified in matrix form in the Formula as 
#'  \code{ordinal~U}. \cr
#'  If \code{family="cush"}, then a CUSH model is fitted to the data (Combination of Uniform and SHelter effect).
#'   The category corresponding to the inflation should be
#'  passed via argument \code{shelter}. Covariates for the shelter parameter \eqn{\delta}
#'  are specified in matrix form Formula as \code{ordinal~X}. \cr
#'  Even if no covariate is included in the model for a given component, the corresponding model matrix needs always
#'  to be specified: in this case, it should be set to 0 (see examples below). Extra arguments include the maximum 
#'  number of iterations (\code{maxiter}, default: \code{maxiter}=500) for the optimization algorithm and 
#'  the required error tolerance (\code{toler}, default: \code{toler}=1e-6). \cr
#'  Standard methods: \code{logLik()}, \code{BIC()}, \code{vcov()}, \code{fitted()}, \code{coef()}, \code{print()}, \code{summary()}
#'  are implemented.\cr
#'  The optimization procedure is run via \code{optim()} when required. If the estimated variance-covariance matrix is not positive definite, the function returns a 
#'  warning message and produces a matrix with NA entries.
#' @references 
#' D'Elia A. (2003). Modelling ranks using the inverse hypergeometric distribution, 
#' \emph{Statistical Modelling: an International Journal}, \bold{3}, 65--78 \cr
#' D'Elia A. and Piccolo D. (2005). A mixture model for preferences data analysis, 
#'   \emph{Computational Statistics & Data Analysis},  \bold{49}, 917--937 \cr
#' Capecchi S. and Piccolo D. (2017). Dealing with heterogeneity in ordinal responses,
#'  \emph{Quality and Quantity}, \bold{51}(5), 2375--2393 \cr
#' Iannario M. (2014). Modelling Uncertainty and Overdispersion in Ordinal Data, 
#' \emph{Communications in Statistics - Theory and Methods}, \bold{43}, 771--786 \cr
#' Piccolo D. (2015). Inferential issues for CUBE models with covariates,
#'  \emph{Communications in Statistics. Theory and Methods}, \bold{44}(23), 771--786. \cr
#' Iannario M. (2015). Detecting latent components in ordinal data with overdispersion by means
#'  of a mixture distribution, \emph{Quality & Quantity}, \bold{49}, 977--987 \cr
#'  Iannario M. and Piccolo D. (2016a). A comprehensive framework for regression models of ordinal data.
#'  \emph{Metron}, \bold{74}(2), 233--252.\cr
#'  Iannario M. and Piccolo D. (2016b). A generalized framework for modelling ordinal data. 
#'  \emph{Statistical Methods and Applications}, \bold{25}, 163--189.\cr 
#' @seealso \code{\link{logLik}}, \code{\link{coef}}, \code{\link{BIC}},  \code{\link{makeplot}},
#'  \code{\link{summary}}, \code{\link{vcov}}, \code{\link{fitted}}, \code{\link{cormat}}
#' @keywords models
#' @examples 
#' 
#' library(CUB)
#' ## CUB models with no covariates
#' model<-GEM(Formula(Walking~0|0|0),family="cub",data=relgoods)
#' coef(model,digits=5)     # Estimated parameter vector (pai,csi)
#' logLik(model)            # Log-likelihood function at ML estimates
#' vcov(model,digits=4)     # Estimated Variance-Covariance matrix
#' cormat(model)            # Parameter Correlation matrix
#' fitted(model)            # Fitted probability distribution
#' makeplot(model)
#' ################
#' ## CUB model with shelter effect
#' model<-GEM(Formula(officeho~0|0|0),family="cub",shelter=7,data=univer)
#' BICshe<-BIC(model,digits=4)
#' ################
#' ## CUB model with covariate for uncertainty
#' modelcovpai<-GEM(Formula(Parents~Smoking|0|0),family="cub",data=relgoods)
#' fitted(modelcovpai)
#' makeplot(modelcovpai)
#' ################
#' ## CUB model with covariates for both uncertainty and feeling components
#' data(univer)
#' model<-GEM(Formula(global~gender|freqserv|0),family="cub",data=univer,maxiter=50,toler=1e-2)
#' param<-coef(model)
#' bet<-param[1:2]      # ML estimates of coefficients for uncertainty covariate: gender
#' gama<-param[3:4]     # ML estimates of coefficients for feeling covariate: lage
#' ##################
#' ## CUBE models with no covariates
#' model<-GEM(Formula(MeetRelatives~0|0|0),family="cube",starting=c(0.5,0.5,0.1),
#'   data=relgoods,expinform=TRUE,maxiter=50,toler=1e-2)
#' coef(model,digits=4)       # Final ML estimates
#' vcov(model)
#' fitted(model)
#' makeplot(model)
#' summary(model)
#' ##################
#' ## IHG with covariates
#' modelcov<-GEM(willingn~freqserv,family="ihg",data=univer)
#' omega<-coef(modelcov)      ## ML estimates 
#' maxlik<-logLik(modelcov)   ## 
#' makeplot(modelcov)
#' summary(modelcov)
#' ###################
#' ## CUSH models without covariate
#' model<-GEM(Dog~0,family="cush",shelter=1,data=relgoods)
#' delta<-coef(model)      # ML estimates of delta
#' maxlik<-logLik(model)   # Log-likelihood at ML estimates
#' summary(model)
#' makeplot(model)


GEM<-function(Formula,family=c("cub","cube","ihg","cush"),data,...){
  
  call <- match.call()
  
  if (missing(data)) data <- environment(Formula)
  
  mf<-model.frame(Formula,data,na.action=na.omit)
  ordinal<-as.numeric(model.response(mf))
  
  #if (missing(m)) m <- length(levels(factor(ordinal,ordered=TRUE)))
  
  ellipsis.arg<-list(...)

  ellipsis.arg[['data']]<-data
  
  if (is.null(ellipsis.arg[['m']])){
    ellipsis.arg[['m']] <- length(levels(factor(ordinal,ordered=TRUE)))
     }

 # m<-mget('m',ifnotfound=list(max(ordinal)))
  ### nc number of categories 
 
  #as.list(mapply(c, lista1, m, SIMPLIFY = TRUE))
  
  # # #cl<-match.call(expand.dots=TRUE)
  
  if (family == "cub"){
    if (is.null(ellipsis.arg[['maxiter']])){
      ellipsis.arg[['maxiter']] <- 500
    }
    if (is.null(ellipsis.arg$toler)){
      ellipsis.arg$toler <- 1e-4
    }
  
    mod<-CUB(Formula,data,ellipsis.arg)
    modelname<-"CUB"
    
  }
  if (family =="cube"){
    if (is.null(ellipsis.arg[['maxiter']])){
      ellipsis.arg[['maxiter']] <- 1000
    }
    if (is.null(ellipsis.arg$toler)){
      ellipsis.arg$toler <- 1e-6
    }
    maxiter<- ellipsis.arg[['maxiter']]
    toler<-ellipsis.arg$toler
    mod<-CUBE(Formula,data,ellipsis.arg)
    modelname<-"CUBE"
    
  }
  if (family == "ihg"){
   # mod<-IHG(Formula,data,...)
    modelname<-"IHG"
    ellipsis.arg[['maxiter']]<-1
    ellipsis.arg$niter<-1
    mod<-IHG(Formula,data,ellipsis.arg)
  }
  if (family == "cush"){
    ellipsis.arg[['maxiter']]<-1
    ellipsis.arg$niter<-1
    mod<-CUSH(Formula,data,ellipsis.arg)
    modelname<-"CUSH"
  
  }
  
  stime<-mod$estimates
  durata<-mod$time
  #maxiter<-mod$maxiter
  loglik<-as.numeric(mod$loglik)
  niter<-mod$niter
  varmat<-mod$varmat
  BIC<-as.numeric(mod$BIC)
  time<-mod$durata
  

  results<-list('estimates'=stime,'ordinal'=ordinal,'time'=durata,
                'loglik'=loglik,'niter'=niter,'varmat'=varmat,
                'BIC'=BIC,'ellipsis'=ellipsis.arg,'family'=modelname,
                'formula'=Formula,'call'=call)
  
  attr(results,"hidden")<-c("estimates","ordinal","loglik","varmat","BIC","ellipsis","family")
  
  class(results)<-"GEM"
  
  

  
  
  # print.GEM<-function(x,...){
  #   hid<-attr(x,"hidden")
  #   print(x[!names(x)%in%hid])
  # }
  # 
  

  if (family == "cub"){
    class(results)<-append("GEM","CUB")
    #class(results)<-c("CUB","GEM")
  }
  if (family =="cube"){
    class(results)<-append("GEM","CUBE")
  }
  if (family == "ihg"){
    class(results)<-append("GEM","IHG")
  }
  if (family == "cush"){
    class(results)<-append("GEM","CUSH")
  }
  
  invisible(results)
  #return(results)
}