Nothing
#' Estimate GMCM parameters of the general model
#'
#' Estimates the parameters of general Gaussian mixture copula models (GMCM).
#' The function finds the maximum likelihood estimate of a general
#' GMCM with various optimization procedures. Note, all but the PEM methods
#' provides the maximum likelihood estimate.
#'
#' @details The \code{"L-BFGS-B"} method does not perform a transformation of
#' the parameters and uses box constraints as implemented in \code{optim}. \cr
#' Note that the many parameter configurations are poorly estimable or
#' directly unidentifiable.
#'
#' \code{fit.general.GMCM} is simply an alias of \code{fit.full.gmcm}.
#'
#' @aliases fit.full.gmcm fit.general.GMCM fit.general.gmcm
#' @param u An \code{n} by \code{d} matrix of marginally uniform observations.
#' Rows corresponds to observations and columns to the dimensions of the
#' variables. I.e. these are often ranked and scaled test statistics or other
#' observations.
#' @param m The number of components to be fitted.
#' @param theta A list of parameters as defined in \code{\link{rtheta}}. If
#' \code{theta} is not provided, then heuristic starting values are chosen
#' using the k-means algorithm.
#' @param method A character vector of length \eqn{1}{1}. The optimization
#' method used. Should be either \code{"NM"}, \code{"SANN"}, \code{"L-BFGS"},
#' \code{"L-BFGS-B"}, or \code{"PEM"} which are the Nelder-Mead, Simulated
#' Annealing, limited-memory quasi-Newton method, limited-memory quasi-Newton
#' method with box constraints, and the pseudo EM algorithm, respectively.
#' Default is \code{"NM"}. See \code{\link{optim}} for further details.
#' @param max.ite The maximum number of iterations. If the \code{method} is
#' \code{"SANN"} this is the number of iterations as there is no other
#' stopping criterion. (See \code{\link{optim}})
#' @param verbose Logical. If \code{TRUE}, a trace of the parameter estimates
#' is made.
#' @param \dots Arguments passed to the \code{control}-list in
#' \code{\link{optim}} when \code{method} is not equal to \code{"PEM"}.
#' If \code{method} equals \code{"PEM"}, the arguments are passed to
#' \code{\link{PseudoEMAlgorithm}} if the \code{method}.
#'
#' @return A list of parameters formatted as described in \code{\link{rtheta}}.
#'
#' When \code{method} equals \code{"PEM"}, a list of extra information
#' (log-likelihood trace, the matrix of group probabilities, theta trace) is
#' added as an attribute called "extra".
#'
#' @note All the optimization procedures are strongly dependent on the initial
#' values and other parameters (such as the cooling scheme for method SANN).
#' Therefore it is advisable to apply multiple
#' different initial parameters (and optimization routines) and select the
#' best fit.
#'
#' The \code{\link{choose.theta}} itself chooses random a initialization.
#' Hence, the output when \code{theta} is not directly supplied can vary.
#'
#' See \code{\link{optim}} for further details.
#' @author Anders Ellern Bilgrau <anders.ellern.bilgrau@@gmail.com>
#' @seealso \code{\link{optim}}, \code{\link{get.prob}}
#' @references
#' Li, Q., Brown, J. B. J. B., Huang, H., & Bickel, P. J. (2011).
#' Measuring reproducibility of high-throughput experiments. The Annals of
#' Applied Statistics, 5(3), 1752-1779. doi:10.1214/11-AOAS466
#'
#' Tewari, A., Giering, M. J., & Raghunathan, A. (2011). Parametric
#' Characterization of Multimodal Distributions with Non-gaussian Modes. 2011
#' IEEE 11th International Conference on Data Mining Workshops, 286-292.
#' doi:10.1109/ICDMW.2011.135
#' @examples
#' set.seed(17)
#' sim <- SimulateGMCMData(n = 1000, m = 3, d = 2)
#'
#' # Plotting simulated data
#' par(mfrow = c(1,2))
#' plot(sim$z, col = rainbow(3)[sim$K], main = "Latent process")
#' plot(sim$u, col = rainbow(3)[sim$K], main = "GMCM process")
#'
#' # Observed data
#' uhat <- Uhat(sim$u)
#'
#' # The model should be fitted multiple times using different starting estimates
#' start.theta <- choose.theta(uhat, m = 3) # Random starting estimate
#' res <- fit.full.GMCM(u = uhat, theta = start.theta,
#' method = "NM", max.ite = 3000,
#' reltol = 1e-2, trace = TRUE) # Note, 1e-2 is too big
#'
#' # Confusion matrix
#' Khat <- apply(get.prob(uhat, theta = res), 1, which.max)
#' table("Khat" = Khat, "K" = sim$K) # Note, some components have been swapped
#'
#' # Simulation from GMCM with the fitted parameters
#' simfit <- SimulateGMCMData(n = 1000, theta = res)
#'
#' # As seen, the underlying latent process is hard to estimate.
#' # The clustering, however, is very good.
#' par(mfrow = c(2,2))
#' plot(simfit$z, col = simfit$K, main = "Model check 1\nSimulated GMM")
#' plot(simfit$u, col = simfit$K, main = "Model check 2\nSimulated GMCM")
#' plot(sim$u, col = Khat, main = "MAP clustering")
#' @export
fit.full.GMCM <- function (u,
m,
theta = choose.theta(u, m),
method = c("NM", "SANN", "L-BFGS", "L-BFGS-B", "PEM"),
max.ite = 1000,
verbose = TRUE,
...) {
# Note, Uhat is idempotent. Hence, already ranked data will not change
u <- Uhat(u)
method <- gsub("NM", "Nelder-Mead", match.arg(method))
if (missing(m) & missing(theta)) {
stop("m is not supplied.")
}
if (method != "PEM") {
gmcm.loglik <- function (par, u, m) { # Defining objective function
theta <- vector2theta(par, d = ncol(u), m = m)
theta$pie <- theta$pie/sum(theta$pie)
loglik <- dgmcm.loglik(theta = theta, u)
return(loglik)
}
par <- theta2vector(theta)
fit <- optim(par, gmcm.loglik, u = u, m = theta$m,
control = list(maxit = max.ite,
fnscale = -1, trace = verbose, ...),
method = method)
theta <- vector2theta(fit$par, d = theta$d, m = theta$m)
theta$pie <- theta$pie/sum(theta$pie)
} else {
fit <- PseudoEMAlgorithm(x = u,
theta = theta,
max.ite = max.ite,
verbose = verbose,
meta.special.case = FALSE,
...)
theta <- fit$theta
attr(theta, "extra") <- fit[setdiff(names(fit), "theta")]
}
class(theta) <- "theta"
return(theta)
}
#' @rdname fit.full.GMCM
#' @export
fit.general.GMCM <- fit.full.GMCM
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.