R/fit.full.GMCM.R
In GMCM: Fast Estimation of Gaussian Mixture Copula Models

#' Estimate GMCM parameters of the general model
#'
#' Estimates the parameters of general Gaussian mixture copula models (GMCM).
#' The function finds the maximum likelihood estimate of a general
#' GMCM with various optimization procedures. Note, all but the PEM methods
#' provides the maximum likelihood estimate.
#'
#' @details The \code{"L-BFGS-B"} method does not perform a transformation of
#'   the parameters and uses box constraints as implemented in \code{optim}. \cr
#'   Note that the many parameter configurations are poorly estimable or
#'   directly unidentifiable.
#'
#'   \code{fit.general.GMCM} is simply an alias of \code{fit.full.gmcm}.
#'
#' @aliases fit.full.gmcm fit.general.GMCM fit.general.gmcm
#' @param u An \code{n} by \code{d} matrix of marginally uniform observations.
#'   Rows corresponds to observations and columns to the dimensions of the
#'   variables. I.e. these are often ranked and scaled test statistics or other
#'   observations.
#' @param m The number of components to be fitted.
#' @param theta A list of parameters as defined in \code{\link{rtheta}}. If
#'   \code{theta} is not provided, then heuristic starting values are chosen
#'   using the k-means algorithm.
#' @param method A character vector of length \eqn{1}{1}. The optimization
#'   method used. Should be either \code{"NM"}, \code{"SANN"}, \code{"L-BFGS"},
#'   \code{"L-BFGS-B"}, or \code{"PEM"} which are the Nelder-Mead, Simulated
#'   Annealing, limited-memory quasi-Newton method, limited-memory quasi-Newton
#'   method with box constraints, and the pseudo EM algorithm, respectively.
#'   Default is \code{"NM"}. See \code{\link{optim}} for further details.
#' @param max.ite The maximum number of iterations. If the \code{method} is
#'   \code{"SANN"} this is the number of iterations as there is no other
#'   stopping criterion. (See \code{\link{optim}})
#' @param verbose Logical. If \code{TRUE}, a trace of the parameter estimates
#'   is made.
#' @param \dots Arguments passed to the \code{control}-list in
#'   \code{\link{optim}} when \code{method} is not equal to \code{"PEM"}.
#'   If \code{method} equals \code{"PEM"}, the arguments are passed to
#'    \code{\link{PseudoEMAlgorithm}} if the \code{method}.
#'
#' @return A list of parameters formatted as described in \code{\link{rtheta}}.
#'
#' When \code{method} equals \code{"PEM"}, a list of extra information
#' (log-likelihood trace, the matrix of group probabilities, theta trace) is
#' added as an attribute called "extra".
#'
#' @note All the optimization procedures are strongly dependent on the initial
#'   values and other parameters (such as the cooling scheme for method SANN).
#'   Therefore it is advisable to apply multiple
#'   different initial parameters (and optimization routines) and select the
#'   best fit.
#'
#'   The \code{\link{choose.theta}} itself chooses random a initialization.
#'   Hence, the output when \code{theta} is not directly supplied can vary.
#'
#'   See \code{\link{optim}} for further details.
#' @author Anders Ellern Bilgrau <anders.ellern.bilgrau@@gmail.com>
#' @seealso \code{\link{optim}}, \code{\link{get.prob}}
#' @references
#'   Li, Q., Brown, J. B. J. B., Huang, H., & Bickel, P. J. (2011).
#'   Measuring reproducibility of high-throughput experiments. The Annals of
#'   Applied Statistics, 5(3), 1752-1779. doi:10.1214/11-AOAS466
#'
#'   Tewari, A., Giering, M. J., & Raghunathan, A. (2011). Parametric
#'   Characterization of Multimodal Distributions with Non-gaussian Modes. 2011
#'   IEEE 11th International Conference on Data Mining Workshops, 286-292.
#' doi:10.1109/ICDMW.2011.135
#' @examples
#' set.seed(17)
#' sim <- SimulateGMCMData(n = 1000, m = 3, d = 2)
#'
#' # Plotting simulated data
#' par(mfrow = c(1,2))
#' plot(sim$z, col = rainbow(3)[sim$K], main = "Latent process")
#' plot(sim$u, col = rainbow(3)[sim$K], main = "GMCM process")
#'
#' # Observed data
#' uhat <- Uhat(sim$u)
#'
#' # The model should be fitted multiple times using different starting estimates
#' start.theta <- choose.theta(uhat, m = 3)  # Random starting estimate
#' res <- fit.full.GMCM(u = uhat, theta = start.theta,
#'                      method = "NM", max.ite = 3000,
#'                      reltol = 1e-2, trace = TRUE)  # Note, 1e-2 is too big
#'
#' # Confusion matrix
#' Khat <- apply(get.prob(uhat, theta = res), 1, which.max)
#' table("Khat" = Khat, "K" = sim$K)  # Note, some components have been swapped
#'
#' # Simulation from GMCM with the fitted parameters
#' simfit <- SimulateGMCMData(n = 1000, theta = res)
#'
#' # As seen, the underlying latent process is hard to estimate.
#' # The clustering, however, is very good.
#' par(mfrow = c(2,2))
#' plot(simfit$z, col = simfit$K, main = "Model check 1\nSimulated GMM")
#' plot(simfit$u, col = simfit$K, main = "Model check 2\nSimulated GMCM")
#' plot(sim$u, col = Khat, main = "MAP clustering")
#' @export
fit.full.GMCM <- function (u,
                           m,
                           theta = choose.theta(u, m),
                           method = c("NM", "SANN", "L-BFGS", "L-BFGS-B", "PEM"),
                           max.ite = 1000,
                           verbose = TRUE,
                           ...) {
  # Note, Uhat is idempotent. Hence, already ranked data will not change
  u <- Uhat(u)

  method <- gsub("NM", "Nelder-Mead", match.arg(method))

  if (missing(m) & missing(theta)) {
    stop("m is not supplied.")
  }

  if (method != "PEM") {

    gmcm.loglik <- function (par, u, m) { # Defining objective function
      theta <- vector2theta(par, d = ncol(u), m = m)
      theta$pie <- theta$pie/sum(theta$pie)
      loglik <- dgmcm.loglik(theta = theta, u)
      return(loglik)
    }

    par <- theta2vector(theta)
    fit <- optim(par, gmcm.loglik, u = u, m = theta$m,
                 control = list(maxit = max.ite,
                                fnscale = -1, trace = verbose, ...),
                 method = method)
    theta <- vector2theta(fit$par, d = theta$d, m = theta$m)
    theta$pie <- theta$pie/sum(theta$pie)

  } else {

    fit <- PseudoEMAlgorithm(x = u,
                             theta = theta,
                             max.ite = max.ite,
                             verbose = verbose,
                             meta.special.case = FALSE,
                             ...)
    theta <- fit$theta

    attr(theta, "extra") <- fit[setdiff(names(fit), "theta")]

  }

  class(theta) <- "theta"
  return(theta)
}

#' @rdname fit.full.GMCM
#' @export
fit.general.GMCM <- fit.full.GMCM