R/tclustregIC.R
In fsdaR: Robust Data Analysis Through Monitoring and Dynamic Visualization

Documented in tclustregIC

######
##  VT::22.11.2019
##
##
##  roxygen2::roxygenise("C:/users/valen/onedrive/myrepo/R/fsdaR", load_code=roxygen2:::load_installed)
##
#'  Computes \code{tclustreg} for different number of groups \code{k}
#'  and restriction factors \code{c}.
#'
#' @description (the last two letters stand for 'Information Criterion') computes
#'  the values of BIC (MIXMIX), ICL (MIXCLA) or CLA (CLACLA), for different values
#'  of \code{k} (number of groups) and different values of \code{c}
#'  (restriction factor for the variances of the residuals), for
#'  a prespecified level of trimming. In order to minimize randomness, given \code{k},
#'  the same subsets are used for each value of \code{c}.
#'
#' @param y Response variable. A vector with \code{n} elements that
#'  contains the response variable.
#'
#' @param x An n x p data matrix (n observations and p variables).
#'  Rows of x represent observations, and columns represent variables.
#'
#'  Missing values (NA's) and infinite values (Inf's) are allowed,
#'  since observations (rows) with missing or infinite values will
#'  automatically be excluded from the computations.
#'
#' @param alphaLik Trimming level, a scalar between 0 and 0.5 or an
#'  integer specifying the number of observations which have to be trimmed.
#'  If \code{alphaLik=0}, there is no trimming.  More in detail, if \code{0 < alphaLik < 1}
#'  clustering is based on \code{h = floor(n * (1 - alphaLik))} observations.
#'  If \code{alphaLik} is an integer greater than 1 clustering is
#'  based on \code{h = n - floor(alphaLik)}. More in detail, likelihood
#'  contributions are sorted and the units associated with the smallest \code{n - h}
#'  contributions are trimmed.
#'
#' @param alphaX Second-level trimming or constrained weighted model for \code{x}.
#
#' @param intercept wheather to use constant term (default is \code{intercept=TRUE}
#'
#' @param plot If \code{plot=FALSE} (default) or \code{plot=0}  no plot is produced.
#'  If \code{plot=TRUE} a plot with the final allocation is shown (using the spmplot function).
#'  If \code{X} is 2-dimensional, the lines associated to the groups are shown too.
#'
#' @param nsamp If a scalar, it contains the number of subsamples which will be extracted.
#'  If \code{nsamp = 0} all subsets will be extracted. Remark - if the number of all possible
#'  subset is greater than 300 the default is to extract all subsets, otherwise just 300.
#'  If \code{nsamp} is a matrix it contains in the rows the indexes of the subsets which
#'  have to be extracted. \code{nsamp} in this case can be conveniently generated by
#'  function \code{subsets()}. \code{nsamp} must have \code{k * p} columns. The first \code{p}
#'  columns are used to estimate the regression coefficient of group 1, ..., the last \code{p}
#'  columns are used to estimate the regression coefficient of group \code{k}.
#'
#' @param refsteps Number of refining iterations in each subsample. Default is \code{refsteps=10}.
#'  \code{refsteps = 0} means "raw-subsampling" without iterations.
#'
#' @param reftol Tolerance of the refining steps. The default value is 1e-14
#'
#' @param equalweights A logical specifying wheather cluster weights in the concentration
#'  and assignment steps shall be considered. If \code{equalweights=TRUE} we are (ideally)
#'  assuming equally sized groups, else if \code{equalweights = false} (default) we allow for
#'  different group weights. Please, check in the given references which functions
#' are maximized in both cases.
#'
#' @param wtrim How to apply the weights on the observations - a flag taking values in c(0, 1, 2, 3, 4).
#'  \itemize{
#'      \item If \code{wtrim==0} (no weights), the algorithm reduces to the standard \code{tclustreg} algorithm.
#'      \item If \code{wtrim==1}, trimming is done by weighting the observations using values specified in vector
#'          \code{we}. In this case, vector \code{we} must be supplied by the user.
#'      \item If \code{wtrim==2}, trimming is again done by weighting the observations
#'          using values specified in vector \code{we}. In this case, vector \code{we}
#'          is computed from the data as a function of the density estimate pdfe.
#'          Specifically, the weight of each observation is the probability of retaining
#'          the observation, computed as
#'          \deqn{pretain_{ig} = 1-pdfe_{ig}/max_{ig}(pdfe_{ig})}
#'      \item If \code{wtrim==3}, trimming is again done by weighting the observations using
#'          values specified in vector \code{we}. In this case, each element wei of vector
#'          \code{we} is a Bernoulli random variable with probability of success
#'            \eqn{pdfe_{ig}}.
#'          In the clustering framework this is done under the constraint that no group is empty.
#'      \item If \code{wtrim==4}, trimming is done with the tandem approach of Cerioli and Perrotta (2014).
#'  }
#'
#'
#' @param we Weights. A vector of size n-by-1 containing application-specific weights
#'    Default is a vector of ones.
#'
#' @param msg  Controls whether to display or not messages on the screen If \code{msg==TRUE} (default)
#'  messages are displayed on the screen. If \code{msg=2}, detailed messages are displayed,
#'  for example the information at iteration level.
#'
#' @param RandNumbForNini pre-extracted random numbers to initialize proportions.
#'  Matrix of size k-by-nrow(nsamp) containing the random numbers which
#'  are used to initialize the proportions of the groups. This option is effective only if
#'  \code{nsamp} is a matrix which contains pre-extracted subsamples. The purpose of this
#'  option is to enable the user to replicate the results when the function \code{tclustreg()}
#'  is called using a parfor instruction (as it happens for example in routine IC, where
#'  \code{tclustreg()} is called through a parfor for different values of the restriction factor).
#'  The default is that \code{RandNumbForNini} is empty - then uniform random numbers are used.
#'
#' @param trace Whether to print intermediate results. Default is \code{trace=FALSE}.
#'
#' @param ... potential further arguments passed to lower level functions.
#'
#' @return  An S3 object of class \code{\link{tclustreg.object}}
#'
#' @references
#'
#'      Torti F., Perrotta D., Riani, M. and Cerioli A. (2019). Assessing Robust Methodologies for Clustering Linear Regression Data,
#'      Advances in Data Analysis and Classification, Vol. 13, pp 227-257.
#'
#' @export
#' @author FSDA team, \email{valentin.todorov@@chello.at}

tclustregIC <- function(y, x, alphaLik, alphaX, intercept=TRUE, plot=FALSE,
        nsamp, refsteps=10, reftol=10e-14, equalweights=FALSE, wtrim=0, we, msg=TRUE, RandNumbForNini,
        trace=FALSE, ...)
{
    if(is.data.frame(x))
      x <- data.matrix(x)
    else if(!is.matrix(x))
      x <- matrix(x, length(x), 1,
                  dimnames = list(names(x), deparse(substitute(x))))
    if(!is.numeric(x)) stop("x is not a numeric")

    if(is.data.frame(y))
      y <- data.matrix(y)
    else if(!is.matrix(y))
      y <- matrix(y, length(y), 1,
                  dimnames = list(names(y), deparse(substitute(y))))
    if(!is.numeric(y)) stop("y is not a numeric")

    dx <- dim(x)
    xn <- (dnx <- dimnames(x))[[2]]
    xn <- if (!is.null(xn))
        xn
    else if (dx[2] > 1)
        paste("X", 1:dx[2], sep = "")
    else if(dx[2])
        "X"
    dimnames(x) <- list(dnx[[1]], xn)

    n <- nrow(x)
    p <- ncol(x)

    control <- list(...)
    control$intercept <- ifelse(intercept, 1, 0)
    control$plots <- ifelse(plot, 1, 0)

    if(!missing(nsamp)){
        control$nsamp <- nsamp
        if(is.matrix(nsamp) && !missing(RandNumbForNini))
            control$RandNumbForNini <- RandNumbForNini
    }
    control$refsteps <- refsteps
    control$reftol <- reftol
    control$equalweights <- equalweights
    if(is.null(wtrim) || is.na(wtrim) || !is.numeric(wtrim) || wtrim < 0 || wtrim > 4)
        stop("'wtrim' must be numeric, one of 0, 1, 2, 3 or 4!")
    control$wtrim <- wtrim
    if(!missing(we))
    {
        if(!is.numeric(we) || length(we) != n)
            stop("Parameter 'we' must be a numeric vector of length ", n)
        control$we <- we
    }


    xmsg <- 0
    if(is.logical(msg))
        xmsg <- ifelse(msg, 1, 0)
    else  if(is.numeric(msg) && msg >= 0 && msg <= 2)
        xmsg <- msg
    else
        stop("Invalid parameter 'msg'. Should be TRUE/FALSE or 0, 1, 2.")
    control$msg <- xmsg

    outclass <- "tclustregIC"

    ## ES 27.06.2018: parameters that are mandatory to the MATLAB function
    ## cannot be put into the MATLAB function because they have to be supplied
    ## to the function individually and not in (name, value) pairs

    parlist = c(.jarray(y, dispatch=TRUE), .jarray(x, dispatch=TRUE))
    if(length(alphaLik) == 1)
        parlist = c(parlist, .jnew("java/lang/Double", as.double(alphaLik)))        # alpha 1
    else
        parlist = c(parlist, .jarray(alphaLik, dispatch=TRUE))
    if(length(alphaX) == 1)
        parlist = c(parlist, .jnew("java/lang/Double", as.double(alphaX)))          # alpha 2
    else
        parlist = c(parlist, .jarray(alphaX, dispatch=TRUE))

    paramNames = names(control)
    if(trace)
        print(control)

    if(length(paramNames) > 0)
    {
        for (i in 1:length(paramNames)) {
            paramName = paramNames[i]
            paramValue = control[[i]]

            matlabValue = rType2MatlabType(paramName, paramValue)
            parlist = c(parlist, .jnew("java/lang/String", paramName), matlabValue)
        }
    }

    out <- callFsdaFunction("tclustregIC", "[Ljava/lang/Object;", 1, parlist)
    if(is.null(out))
        return(NULL)

    arr1 = .jcast(out[[1]], "com/mathworks/toolbox/javabuilder/MWStructArray")
    arr = .jnew("org/jrc/ipsc/globesec/sitaf/fsda/FsdaMWStructArray", arr1)

    if(trace)
    {
        cat("\nReturning from MATLAB tclust().  Fields returned by MATLAB: \n")
        print(arr$fieldNames())
    }

    bopt = as.matrix(.jevalArray(arr$get("bopt", as.integer(1)), "[[D", simplify = TRUE))

    sigma2opt = as.matrix(.jevalArray(arr$get("sigma2opt", as.integer(1)), "[[D", simplify = TRUE))
    sigma2opt_corr = as.matrix(.jevalArray(arr$get("sigma2opt_corr", as.integer(1)), "[[D", simplify = TRUE))


    sigmaXopt = if(as.integer(arr$hasField("sigmaXopt", as.integer(1))) != 1) NULL
                else as.matrix(.jevalArray(arr$get("sigmaXopt", as.integer(1)), "[[D", simplify = TRUE))


    idx = as.vector(as.matrix(.jevalArray(arr$get("idx", as.integer(1)), "[[D", simplify = TRUE)))

    size = as.matrix(.jevalArray(arr$get("siz", as.integer(1)), "[[D", simplify = TRUE))
    rows <- 1:nrow(size)
    cols <- c("Cluster", "Size", "Percent")
    dimnames(size) <- list(rows, cols)

    idx_before_tr = if(as.integer(arr$hasField("idx_before_tr", as.integer(1))) != 1) NULL
                    else as.vector(as.matrix(.jevalArray(arr$get("idx_before_tr", as.integer(1)), "[[D", simplify = TRUE)))

    we <- if(as.integer(arr$hasField("we", as.integer(1))) != 1) NULL
          else as.vector(as.matrix(.jevalArray(arr$get("we", as.integer(1)), "[[D", simplify = TRUE)))

    postprobopt <- if(as.integer(arr$hasField("postprobopt", as.integer(1))) != 1) NULL
                   else as.matrix(.jevalArray(arr$get("postprobopt", as.integer(1)), "[[D", simplify = TRUE))

    MIXMIX <- if(as.integer(arr$hasField("MIXMIX", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("MIXMIX", as.integer(1)), "[[D", simplify = TRUE)))
    MIXCLA <- if(as.integer(arr$hasField("MIXCLA", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("MIXCLA", as.integer(1)), "[[D", simplify = TRUE)))
    CLACLA <- if(as.integer(arr$hasField("CLACLA", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("CLACLA", as.integer(1)), "[[D", simplify = TRUE)))

    ans <- list(call=match.call(), alphaLik=alphaLik, alphaX=alphaX,
            MIXMIX=MIXMIX, MIXCLA=MIXCLA, CLACLA=CLACLA)

    freeMatlabResources(out)

    class(ans) <- outclass
    return (ans)
}