R/tclustIC.R
In fsdaR: Robust Data Analysis Through Monitoring and Dynamic Visualization

Documented in tclustIC

######
##  VT::17.12.2018
##
##
##  roxygen2::roxygenise("C:/users/valen/onedrive/myrepo/R/fsdaR", load_code=roxygen2:::load_installed)
##
#' Performs cluster analysis by calling \code{\link{tclustfsda}} for different
#'  number of groups \code{k} and restriction factors \code{c}
#'
#' @description  Computes the values of BIC (MIXMIX), ICL (MIXCLA) or CLA (CLACLA),
#'  for different values of \code{k} (number of groups) and different values of \code{c}
#'  (restriction factor), for a prespecified level of trimming (the last two letters in the name
#'  stand for 'Information Criterion'). In order to minimize
#'  randomness, given \code{k}, the same subsets are used for each value of \code{c}.
#'
#' @param x An n x p data matrix (n observations and p variables).
#'  Rows of x represent observations, and columns represent variables.
#'
#'  Missing values (NA's) and infinite values (Inf's) are allowed,
#'  since observations (rows) with missing or infinite values will
#'  automatically be excluded from the computations.
#'
#' @param kk an integer vector specifying the number of mixture components (clusters) for which the BIC is to be calculated. By default \code{kk=1:5}.
#' @param cc an  vector specifying the values of the restriction factor which have to be considered. By default \code{cc=c(1, 2, 4, 8, 16, 32, 64, 128)}.
#' @param whichIC A character value which specifies which information criteria must be computed
#'  for each \code{k} (number of groups) and each value of the restriction factor \code{c}. Possible values for \code{whichIC} are:
#'  \itemize{
#'   \item "MIXMIX": a mixture model is fitted and for computing the information criterion
#'      the mixture likelihood is used. This option corresponds to the use of the Bayesian
#'      Information criterion (BIC). In output just the matrix \code{MIXMIX} is given.
#'  \item "MIXCLA": a mixture model is fitted but to compute the information criterion
#'      the classification likelihood is used. This option corresponds to the use of the
#'      Integrated Complete Likelihood (ICL). In the output just the matrix \code{MIXCLA} is given.
#'  \item "CLACLA": everything is based on the classification likelihood. This information
#'      criterion will be called CLA. In the output just the matrix \code{CLACLA} is given.
#'  \item "ALL": both classification and mixture likelihood are used. In this case all
#'      three information criteria CLA, ICL and BIC are computed. In the output all
#'      three matrices \code{MIXMIX}, \code{MIXCLA} and \code{CLACLA} are given.
#'  }
#'
#' @param alpha Global trimming level. A scalar between 0 and 0.5 or an integer specifying the number of
#'  observations which have to be trimmed. If \code{alpha=0} all observations are considered. By default \code{alpha=0}.
#'
#'  More in detail, if \code{0 < alpha < 1} clustering is based on \code{h = fix(n * (1-alpha))}
#'  observations, else if alpha is an integer greater than 1 clustering is based on \code{h = n - floor(alpha)}.
#'
#' @param plot If \code{plot=TRUE}, a plot of the BIC (MIXMIX), ICL (MIXCLA) curve
#'  and CLACLA is shown on the screen. The plots which are shown depend on
#'  the input option \code{whichIC}.
#   If \code{plot=FALSE} (default) or \code{plot=0}  no plot is produced.
#' @param nsamp If a scalar, it contains the number of subsamples which will be extracted.
#'  If \code{nsamp = 0} all subsets will be extracted. Remark - if the number of all possible
#'  subset is greater than 300 the default is to extract all subsets, otherwise just 300.
#'  If \code{nsamp} is a matrix it contains in the rows the indexes of the subsets which
#'  have to be extracted. \code{nsamp} in this case can be conveniently generated by
#'  function \code{subsets()}. \code{nsamp} can have \code{k} columns or \code{k * (p + 1)}
#'  columns. If \code{nsamp} has \code{k} columns the \code{k} initial centroids each
#'  iteration i are given by \code{X[nsamp[i,] ,]} and the covariance matrices are equal
#'  to the identity.
#'
#'  If \code{nsamp} has \code{k * (p + 1)} columns, the initial centroids and covariance
#'  matrices in iteration \code{i} are computed as follows:
#'  \itemize{
#'  \item X1 <- X[nsamp[i ,] ,]
#'  \item mean(X1[1:p + 1, ]) contains the initial centroid for group 1
#'  \item cov(X1[1:p + 1, ]) contains the initial cov matrix for group 1
#'  \item mean(X1[(p + 2):(2*p + 2), ]) contains the initial centroid for group 2
#'  \item cov(X1[(p + 2):(2*p + 2), ]) contains the initial cov matrix for group 2
#'  \item ...
#'  \item mean(X1[(k-1)*p+1):(k*(p+1), ]) contains the initial centroids for group k
#'  \item cov(X1[(k-1)*p+1):(k*(p+1), ]) contains the initial cov matrix for group k.
#'  }
#'
#'  REMARK: If \code{nsamp} is not a scalar, the option \code{startv1} given below is ignored.
#'  More precisely, if \code{nsamp} has \code{k} columns \code{startv1 = 0} else if
#'  \code{nsamp} has \code{k*(p+1)} columns option \code{startv1=1}.
#'
#' @param refsteps Number of refining iterations in each subsample. Default is \code{refsteps=15}.
#'  \code{refsteps = 0} means "raw-subsampling" without iterations.
#'
#' @param reftol Tolerance of the refining steps. The default value is 1e-14
#'
#' @param equalweights A logical specifying wheather cluster weights in the concentration
#'  and assignment steps shall be considered. If \code{equalweights=TRUE} we are (ideally)
#'  assuming equally sized groups, else if \code{equalweights = false} (default) we allow for
#'  different group weights. Please, check in the given references which functions
#' are maximized in both cases.
#'
#' @param msg  Controls whether to display or not messages on the screen If \code{msg==TRUE} (default)
#'  messages are displayed on the screen. If \code{msg=2}, detailed messages are displayed,
#'  for example the information at iteration level.
#' @param nocheck Check input arguments. If \code{nocheck=TRUE} no check is performed
#'  on matrix \code{X}. The default \code{nocheck=FALSE}.
#'
#' @param startv1 How to initialize centroids and covariance matrices. Scalar.
#'  If \code{startv1=1} then initial centroids and covariance matrices are based
#'  on \code{(p+1)} observations randomly chosen, else each centroid is initialized
#'  taking a random row of input data matrix and covariance matrices are initialized
#'  with identity matrices. The default value is\code{startv1=1}.
#'
#'  Remark 1: in order to start with a routine which is in the required parameter space,
#'  eigenvalue restrictions are immediately applied.
#'
#' Remark 2 - option \code{startv1} is used just if \code{nsamp} is a scalar
#'  (see for more details the help associated with \code{nsamp}).
#'
#' @param restrtype Type of restriction to be applied on the cluster scatter matrices.
#'  Valid values are \code{'eigen'} (default), or \code{'deter'}.
#'  \code{"eigen"} implies restriction on the eigenvalues while \code{"deter"}
#'  implies restriction on the determinants.
#'
#' @param UnitsSameGroup List of the units which must (whenever possible) have
#'  a particular label. For example \code{UnitsSameGroup=c(20, 26)}, means that
#'  group which contains unit 20 is always labelled with number 1. Similarly,
#'  the group which contains unit 26 is always labelled with number 2, (unless
#'  it is found that unit 26 already belongs to group 1).
#'  In general, group which contains unit \code{UnitsSameGroup(r)} where \code{r=2, ...length(kk)-1}
#'  is labelled with number \code{r} (unless it is found that unit \code{UnitsSameGroup(r)}
#'  has already been assigned to groups \code{1, 2, ..., r-1}.
#'
#' @param numpool The number of parallel sessions to open. If numpool is not defined,
#'  then it is set equal to the number of physical cores in the computer.
#'
#' @param cleanpool Logical, indicating if the open pool must be closed or not.
#'  It is useful to leave it open if there are subsequent parallel sessions to execute,
#'  so that to save the time required to open a new pool.
#'
#' @param trace Whether to print intermediate results. Default is \code{trace=FALSE}.
#'
#' @param ... potential further arguments passed to lower level functions.

#' @return  An S3 object of class \code{\link{tclustic.object}}

#' @references
#'      Cerioli, A., Garcia-Escudero, L.A., Mayo-Iscar, A. and Riani M. (2017).
#'      Finding the Number of Groups in Model-Based Clustering via Constrained Likelihoods,
#'      \emph{Journal of Computational and Graphical Statistics}, pp. 404-416,
#'      https://doi.org/10.1080/10618600.2017.1390469.
#'
#'
#' @seealso \code{\link{tclustfsda}}, \code{\link{tclustICplot}}, \code{\link{tclustICsol}}, \code{\link{carbikeplot}}
#' @examples
#'  \dontrun{
#'  data(geyser2)
#'  (out <- tclustIC(geyser2, whichIC="MIXMIX", plot=FALSE, alpha=0.1))
#'  summary(out)
#'  }
#'
#'  \dontrun{
#'  data(flea)
#'  Y <- as.matrix(flea[, 1:(ncol(flea)-1)])    # select only the numeric variables
#'  rownames(Y) <- 1:nrow(Y)
#'  head(Y)
#'
#'  (out <- tclustIC(Y, whichIC="CLACLA", plot=FALSE, alpha=0.1, nsamp=100, numpool=1))
#'  summary(out)
#'  }
#'
#' @export
#' @author FSDA team, \email{valentin.todorov@@chello.at}

tclustIC <- function(x, kk=1:5, cc=c(1, 2, 4, 8, 16, 32, 64, 128), alpha=0,
        whichIC=c("ALL", "MIXMIX", "MIXCLA", "CLACLA"),
        nsamp, refsteps=15, reftol=1e-14, equalweights=FALSE, msg=TRUE, nocheck=FALSE,
        plot=FALSE,
        startv1=1, restrtype= c("eigen", "deter"),
        UnitsSameGroup, numpool, cleanpool,
        trace=FALSE, ...)
{
    whichIC <- match.arg(whichIC)

	if(is.data.frame(x))
	    x <- data.matrix(x)
	else if(!is.matrix(x))
	    x <- matrix(x, length(x), 1,
			dimnames = list(names(x), deparse(substitute(x))))
    if(!is.numeric(x)) stop("x is not a numeric")

    dx <- dim(x)
    xn <- (dnx <- dimnames(x))[[2]]
    xn <- if (!is.null(xn))
        xn
    else if (dx[2] > 1)
        paste("X", 1:dx[2], sep = "")
    else if(dx[2])
        "X"
    dimnames(x) <- list(dnx[[1]], xn)

    n <- nrow(x)
    p <- ncol(x)


    outclass <- "tclustic"
    control <- list()

    xplots <- 0
    if(is.logical(plot))
        xplots <- ifelse(plot, 1, 0)
    else  if(is.numeric(plot) && plot >= 0 && plot <= 1)
        xplots <- plot
    else
        stop("Invalid parameter 'plot'. Should be TRUE/FALSE or 0, 1.")
    control$plots <- xplots

    control$kk <- kk
    control$cc <- cc
    control$alpha <- alpha
    control$whichIC <- whichIC
    if(!missing(nsamp))
        control$nsamp <- nsamp
    control$refsteps <- refsteps
    control$reftol <- reftol
    control$equalweights <- equalweights

    if(!missing(UnitsSameGroup))
        control$UnitsSameGroup <- UnitsSameGroup
    if(!missing(numpool))
        control$numpool <- numpool
    if(!missing(cleanpool))
        control$cleanpool <- ifelse(cleanpool, 1, 0)

    xmsg <- 0
    if(is.logical(msg))
        xmsg <- ifelse(msg, 1, 0)
    else  if(is.numeric(msg) && msg >= 0 && msg <= 2)
        xmsg <- msg
    else
        stop("Invalid parameter 'msg'. Should be TRUE/FALSE or 0, 1, 2.")
    control$msg <- xmsg
    xtemp <- 0
    if(is.logical(nocheck))
        xtemp <- ifelse(nocheck, 1, 0)
    else  if(is.numeric(nocheck) && nocheck >= 0 && nocheck <= 1)
        xtemp <- nocheck
    else
        stop("Invalid parameter 'nocheck'. Should be TRUE/FALSE or 0, 1.")
    control$nocheck <- xtemp
    control$startv1 <- startv1
    control$restrtype <- match.arg(restrtype)

    ## ES 27.06.2018: parameters that are mandatory to the MATLAB function
    ## cannot be put into the MATLAB function because they have to be supplied
    ## to the function individually and not in (name, value) pairs.
    ## Mandatory parameters to the MATLAB FSDA function
    parlist = c(.jarray(x, dispatch=TRUE))

    if(trace)
        print(control)

    paramNames = names(control)
    if(length(paramNames) > 0)
    {
        for (i in 1:length(paramNames)) {
            paramName = paramNames[i]
            paramValue = control[[i]]

            matlabValue = rType2MatlabType(paramName, paramValue)
            parlist = c(parlist, .jnew("java/lang/String", paramName), matlabValue)
        }
    }

    out <- callFsdaFunction("tclustIC", "[Ljava/lang/Object;", 1, parlist)
    if(is.null(out))
        return(NULL)

    arr1 = .jcast(out[[1]], "com/mathworks/toolbox/javabuilder/MWStructArray")
    arr = .jnew("org/jrc/ipsc/globesec/sitaf/fsda/FsdaMWStructArray", arr1)

    if(trace)
    {
        cat("\nReturning from MATLAB tclustIC().  Fields returned by MATLAB: \n")
        print(arr$fieldNames())
    }

    kk_ret <- as.vector(as.matrix(.jevalArray(arr$get("kk", as.integer(1)), "[[D", simplify = TRUE)))
    cc_ret <- as.vector(as.matrix(.jevalArray(arr$get("cc", as.integer(1)), "[[D", simplify = TRUE)))
    alpha_ret = as.vector(as.matrix(.jevalArray(arr$get("alpha", as.integer(1)), "[[D", simplify = TRUE)))[1]

    CLACLA <- if(as.integer(arr$hasField("CLACLA", as.integer(1))) != 1) NULL
              else as.matrix(.jevalArray(arr$get("CLACLA", as.integer(1)), "[[D", simplify = TRUE))
    IDXCLA <- if(as.integer(arr$hasField("IDXCLA", as.integer(1))) != 1) NULL
              else unwrapComplexNumericCellArray(as.matrix(.jevalArray(arr$get("IDXCLA", as.integer(1)))))
    MIXMIX <- if(as.integer(arr$hasField("MIXMIX", as.integer(1))) != 1) NULL
              else as.matrix(.jevalArray(arr$get("MIXMIX", as.integer(1)), "[[D", simplify = TRUE))
    MIXCLA <- if(as.integer(arr$hasField("MIXCLA", as.integer(1))) != 1) NULL
              else as.matrix(.jevalArray(arr$get("MIXCLA", as.integer(1)), "[[D", simplify = TRUE))
    IDXMIX <- if(as.integer(arr$hasField("IDXMIX", as.integer(1))) != 1) NULL
              else unwrapComplexNumericCellArray(as.matrix(.jevalArray(arr$get("IDXMIX", as.integer(1)))))

    xkk <- paste0("k=", kk_ret)
    xcc <- paste0("c=", cc_ret)
    if(!is.null(MIXMIX))
        dimnames(MIXMIX) <- list(xkk, xcc)
    if(!is.null(MIXCLA))
        dimnames(MIXCLA) <- list(xkk, xcc)
    if(!is.null(CLACLA))
        dimnames(CLACLA) <- list(xkk, xcc)
    if(!is.null(IDXMIX))
        dimnames(IDXMIX) <- list(xkk, xcc)
    if(!is.null(IDXCLA))
        dimnames(IDXCLA) <- list(xkk, xcc)

    Y <- if(as.integer(arr$hasField("Y", as.integer(1))) != 1) NULL
                else as.matrix(.jevalArray(arr$get("Y", as.integer(1)), "[[D", simplify = TRUE))

    ans = list(call=match.call(), CLACLA=CLACLA, IDXCLA=IDXCLA, MIXMIX=MIXMIX, MIXCLA=MIXCLA, IDXMIX=IDXMIX,
            kk=kk_ret, cc=cc_ret, alpha=alpha_ret, whichIC=whichIC, X=Y)

    freeMatlabResources(out)

    ## Remove any NULL elements (this happens if whichIC != ALL)
    if(length(del <- which(unlist(lapply(ans, FUN=is.null)))) > 0)
        ans <- ans[-del]

    if(trace)
    {
        cat("\ntclustIC(): object 'out' after removing the NULL objects:")
        print(names(ans))
    }

    class(ans) <- outclass

    return (ans)
}