R/tclustreg.R
In fsdaR: Robust Data Analysis Through Monitoring and Dynamic Visualization

Documented in tclustreg

######
##  VT::25.06.2019
##
##
##  roxygen2::roxygenise("C:/users/valen/onedrive/myrepo/R/fsdaR", load_code=roxygen2:::load_installed)
##
#'  Computes robust linear grouping analysis
#'
#' @description Performs robust linear grouping analysis.
#'
#' @param y Response variable. A vector with \code{n} elements that
#'  contains the response variable.
#'
#' @param x An n x p data matrix (n observations and p variables).
#'  Rows of x represent observations, and columns represent variables.
#'
#'  Missing values (NA's) and infinite values (Inf's) are allowed,
#'  since observations (rows) with missing or infinite values will
#'  automatically be excluded from the computations.
#'
#' @param k Number of groups.
#'
#' @param restrfactor Restriction factor for regression residuals and
#'  covariance matrices of the explanatory variables. Scalar or vector
#'  with two elements. If \code{restrfactor} is a scalar it controls the differences
#'  among group scatters of the residuals. The value 1 is the strongest
#'  restriction. If \code{restrfactor} is a vector with two elements
#'  the first element controls the differences among group scatters of
#'  the residuals and the second the differences among covariance
#'  matrices of the explanatory variables. Note that \code{restrfactor[2]}
#'  is used just if \code{alphaX=1}, that is if constrained weighted model
#'  for \code{x} is assumed.
#'
#' @param alphaLik Trimming level, a scalar between 0 and 0.5 or an
#'  integer specifying the number of observations which have to be trimmed.
#'  If \code{alphaLik=0}, there is no trimming.  More in detail, if \code{0 < alphaLik < 1}
#'  clustering is based on \code{h = floor(n * (1 - alphaLik))} observations.
#'  If \code{alphaLik} is an integer greater than 1 clustering is
#'  based on \code{h = n - floor(alphaLik)}. More in detail, likelihood
#'  contributions are sorted and the units associated with the smallest \code{n - h}
#'  contributions are trimmed.
#'
#' @param alphaX Second-level trimming or constrained weighted model for \code{x}.
#
#' @param intercept wheather to use constant term (default is \code{intercept=TRUE}
#'
#' @param plot If \code{plot=FALSE} (default) or \code{plot=0}  no plot is produced.
#'  If \code{plot=TRUE} a plot with the final allocation is shown (using the spmplot function).
#'  If \code{X} is 2-dimensional, the lines associated to the groups are shown too.
#'
#' @param nsamp If a scalar, it contains the number of subsamples which will be extracted.
#'  If \code{nsamp = 0} all subsets will be extracted. Remark - if the number of all possible
#'  subset is greater than 300 the default is to extract all subsets, otherwise just 300.
#'  If \code{nsamp} is a matrix it contains in the rows the indexes of the subsets which
#'  have to be extracted. \code{nsamp} in this case can be conveniently generated by
#'  function \code{subsets()}. \code{nsamp} must have \code{k * p} columns. The first \code{p}
#'  columns are used to estimate the regression coefficient of group 1, ..., the last \code{p}
#'  columns are used to estimate the regression coefficient of group \code{k}.
#'
#' @param refsteps Number of refining iterations in each subsample. Default is \code{refsteps=10}.
#'  \code{refsteps = 0} means "raw-subsampling" without iterations.
#'
#' @param reftol Tolerance of the refining steps. The default value is 1e-14
#'
#' @param mixt Specifies whether mixture modelling or crisp assignment approach to model
#'  based clustering must be used. In the case of mixture modelling parameter mixt also
#'  controls which is the criterion to find the untrimmed units in each step of the maximization.
#'  If \code{mixt>=1} mixture modelling is assumed else crisp assignment.
#'  The default value is \code{mixt=0}, i.e. crisp assignment. Please see
#'  for details the provided references.
#'  The parameter \code{mixt} also controls the criterion to select the units to trim.
#'  If \code{mixt = 2} the \code{h} units are those which give the largest contribution
#'  to the likelihood, else if \code{mixt=1} the criterion to select the \code{h} units is
#'  exactly the same as the one which is used in crisp assignment.
#'
#' @param equalweights A logical specifying wheather cluster weights in the concentration
#'  and assignment steps shall be considered. If \code{equalweights=TRUE} we are (ideally)
#'  assuming equally sized groups, else if \code{equalweights = false} (default) we allow for
#'  different group weights. Please, check in the given references which functions
#' are maximized in both cases.
#'
#' @param wtrim How to apply the weights on the observations - a flag taking values in c(0, 1, 2, 3, 4).
#'  \itemize{
#'      \item If \code{wtrim==0} (no weights), the algorithm reduces to the standard \code{tclustreg} algorithm.
#'      \item If \code{wtrim==1}, trimming is done by weighting the observations using values specified in vector
#'          \code{we}. In this case, vector \code{we} must be supplied by the user.
#'      \item If \code{wtrim==2}, trimming is again done by weighting the observations
#'          using values specified in vector \code{we}. In this case, vector \code{we}
#'          is computed from the data as a function of the density estimate pdfe.
#'          Specifically, the weight of each observation is the probability of retaining
#'          the observation, computed as
#'          \deqn{pretain_{ig} = 1-pdfe_{ig}/max_{ig}(pdfe_{ig})}
#'      \item If \code{wtrim==3}, trimming is again done by weighting the observations using
#'          values specified in vector \code{we}. In this case, each element wei of vector
#'          \code{we} is a Bernoulli random variable with probability of success
#'            \eqn{pdfe_{ig}}.
#'          In the clustering framework this is done under the constraint that no group is empty.
#'      \item If \code{wtrim==4}, trimming is done with the tandem approach of Cerioli and Perrotta (2014).
#'  }
#'
#'
#' @param we Weights. A vector of size n-by-1 containing application-specific weights
#'    Default is a vector of ones.
#'
#' @param msg  Controls whether to display or not messages on the screen If \code{msg==TRUE} (default)
#'  messages are displayed on the screen. If \code{msg=2}, detailed messages are displayed,
#'  for example the information at iteration level.
#'
#' @param RandNumbForNini pre-extracted random numbers to initialize proportions.
#'  Matrix of size k-by-nrow(nsamp) containing the random numbers which
#'  are used to initialize the proportions of the groups. This option is effective only if
#'  \code{nsamp} is a matrix which contains pre-extracted subsamples. The purpose of this
#'  option is to enable the user to replicate the results when the function \code{tclustreg()}
#'  is called using a parfor instruction (as it happens for example in routine IC, where
#'  \code{tclustreg()} is called through a parfor for different values of the restriction factor).
#'  The default is that \code{RandNumbForNini} is empty - then uniform random numbers are used.
#'
#' @param trace Whether to print intermediate results. Default is \code{trace=FALSE}.
#'
#' @param ... potential further arguments passed to lower level functions.
#'
#' @return  An S3 object of class \code{\link{tclustreg.object}}
#'
#' @references
#'      Mayo-Iscar A. (2016). The joint role of trimming and constraints in robust
#'      estimation for mixtures of gaussian factor analyzers,
#'      Computational Statistics and Data Analysis", Vol. 99, pp. 131-147.
#'
#'      Garcia-Escudero, L.A., Gordaliza, A., Greselin, F., Ingrassia, S. and Mayo-Iscar, A. (2017),
#'      Robust estimation of mixtures of regressions with random covariates, via trimming and constraints,
#'      Statistics and Computing, Vol. 27, pp. 377-402.
#'
#'      Garcia-Escudero, L.A., Gordaliza A., Mayo-Iscar A., and San Martin R. (2010).
#'      Robust clusterwise linear regression through trimming,
#'      Computational Statistics and Data Analysis, Vol. 54, pp.3057-3069.
#'
#'      Cerioli, A. and Perrotta, D. (2014). Robust Clustering Around Regression Lines with High Density Regions.
#'      Advances in Data Analysis and Classification, Vol. 8, pp. 5-26.
#'
#'      Torti F., Perrotta D., Riani, M. and Cerioli A. (2019). Assessing Robust Methodologies for Clustering Linear Regression Data,
#'      Advances in Data Analysis and Classification, Vol. 13, pp 227-257.
#'
#' @examples
#'  \dontrun{

#'  ## The X data have been introduced by Gordaliza, Garcia-Escudero & Mayo-Iscar (2013).
#'  ## The dataset presents two parallel components without contamination.
#'
#'  data(X)
#'  y1 = X[, ncol(X)]
#'  X1 = X[,-ncol(X), drop=FALSE]
#'
#'  (out <- tclustreg(y1, X1, k=2, alphaLik=0.05, alphaX=0.01, restrfactor=5, plot=TRUE, trace=TRUE))
#'
#'  (out <- tclustreg(y1, X1, k=2, alphaLik=0.05, alphaX=0.01, restrfactor=2,
#'          mixt=2, plot=TRUE, trace=TRUE))
#'
#'  ##  Examples with fishery data
#'
#'  data(fishery)
#'  X <- fishery
#'
#'  ## some jittering is necessary because duplicated units are not treated:
#'  ## this needs to be addressed
#'  X <- X + 10^(-8) * abs(matrix(rnorm(nrow(X)*ncol(X)), ncol=2))
#'
#'  y1 <- X[, ncol(X)]
#'  X1 <- X[, -ncol(X), drop=FALSE]
#'
#'  (out <- tclustreg(y1, X1, k=3, restrfact=50, alphaLik=0.04, alphaX=0.01, trace=TRUE))

#'  ## Example 2:
#'
#'  ## Define some arbitrary weightssome arbitrary weights for the units
#'      we <- sqrt(X1)/sum(sqrt(X1))
#'
#'  ##  tclustreg required parameters
#'      k <- 2; restrfact <- 50; alpha1 <- 0.04; alpha2 <- 0.01
#'
#'  ##  Now tclust is run on each combination of mixt and wtrim options
#'
#'      cat("\nmixt=0; wtrim=0",
#'          "\nStandard tclustreg, with classification likelihood and without thinning\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=0, wtrim=0, trace=TRUE))
#'
#'      cat("\nmixt=2; wtrim=0",
#'          "\nMixture likelihood, no thinning\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=2, wtrim=0, trace=TRUE))
#'
#'      cat("\nmixt=0; wtrim=1",
#'          "\nClassification likelihood, thinning based on user weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=0, we=we, wtrim=1, trace=TRUE))
#'
#'      cat("\nmixt=2; wtrim=1",
#'          "\nMixture likelihood, thinning based on user weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=2, we=we, wtrim=1, trace=TRUE))
#'
#'      cat("\nmixt=0; wtrim=2",
#'          "\nClassification likelihood, thinning based on retention probabilities\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=0, wtrim=2, trace=TRUE))
#'
#'      cat("\nmixt=2; wtrim=2",
#'          "\nMixture likelihood, thinning based on retention probabilities\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=2, wtrim=2, trace=TRUE))
#'
#'      cat("\nmixt=0; wtrim=3",
#'          "\nClassification likelihood, thinning based on bernoulli weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=0, wtrim=3, trace=TRUE))
#'
#'      cat("\nmixt=2; wtrim=3",
#'          "\nMixture likelihood, thinning based on bernoulli weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=2, wtrim=3, trace=TRUE))
#'
#'      cat("\nmixt=0; wtrim=4",
#'          "\nClassification likelihood, tandem thinning based on bernoulli weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=0, wtrim=4, trace=TRUE))
#'
#'      cat("\nmixt=2; wtrim=4",
#'          "\nMixture likelihood, tandem thinning based on bernoulli weights\n")
#'      (out <- tclustreg(y1, X1, k=k, restrfact=restrfact, alphaLik=alpha1, alphaX=alpha2,
#'              mixt=2, wtrim=4, trace=TRUE))
#'
#'  }
#' @export
#' @author FSDA team, \email{valentin.todorov@@chello.at}

tclustreg <- function(y, x, k, alphaLik, alphaX, restrfactor=12, intercept=TRUE, plot=FALSE,
        nsamp, refsteps=10, reftol=10e-14, equalweights=FALSE, mixt=0, wtrim=0, we, msg=TRUE, RandNumbForNini,
        trace=FALSE, ...)
{
    if(is.data.frame(x))
      x <- data.matrix(x)
    else if(!is.matrix(x))
      x <- matrix(x, length(x), 1,
                  dimnames = list(names(x), deparse(substitute(x))))
    if(!is.numeric(x)) stop("x is not a numeric")

    if(is.data.frame(y))
      y <- data.matrix(y)
    else if(!is.matrix(y))
      y <- matrix(y, length(y), 1,
                  dimnames = list(names(y), deparse(substitute(y))))
    if(!is.numeric(y)) stop("y is not a numeric")

    dx <- dim(x)
    xn <- (dnx <- dimnames(x))[[2]]
    xn <- if (!is.null(xn))
        xn
    else if (dx[2] > 1)
        paste("X", 1:dx[2], sep = "")
    else if(dx[2])
        "X"
    dimnames(x) <- list(dnx[[1]], xn)

    n <- nrow(x)
    p <- ncol(x)

    control <- list(...)
    control$intercept <- ifelse(intercept, 1, 0)
    control$plots <- ifelse(plot, 1, 0)

    if(!missing(nsamp)){
        control$nsamp <- nsamp
        if(is.matrix(nsamp) && !missing(RandNumbForNini))
            control$RandNumbForNini <- RandNumbForNini
    }
    control$refsteps <- refsteps
    control$reftol <- reftol
    control$equalweights <- equalweights
    control$mixt <- mixt
    if(is.null(wtrim) || is.na(wtrim) || !is.numeric(wtrim) || wtrim < 0 || wtrim > 4)
        stop("'wtrim' must be numeric, one of 0, 1, 2, 3 or 4!")
    control$wtrim <- wtrim
    if(!missing(we))
    {
        if(!is.numeric(we) || length(we) != n)
            stop("Parameter 'we' must be a numeric vector of length ", n)
        control$we <- we
    }


    xmsg <- 0
    if(is.logical(msg))
        xmsg <- ifelse(msg, 1, 0)
    else  if(is.numeric(msg) && msg >= 0 && msg <= 2)
        xmsg <- msg
    else
        stop("Invalid parameter 'msg'. Should be TRUE/FALSE or 0, 1, 2.")
    control$msg <- xmsg

    outclass <- "tclustreg"

    ## ES 27.06.2018: parameters that are mandatory to the MATLAB function
    ## cannot be put into the MATLAB function because they have to be supplied
    ## to the function individually and not in (name, value) pairs

    parlist = c(.jarray(y, dispatch=TRUE), .jarray(x, dispatch=TRUE))
    parlist = c(parlist, .jnew("java/lang/Double", as.double(k)))           # k
    parlist = c(parlist, .jnew("java/lang/Double", as.double(restrfactor))) # restrfactor

    if(length(alphaLik) == 1)
        parlist = c(parlist, .jnew("java/lang/Double", as.double(alphaLik)))# alpha 1
    else
        parlist = c(parlist, .jarray(alphaLik, dispatch=TRUE))
    if(length(alphaX) == 1)
        parlist = c(parlist, .jnew("java/lang/Double", as.double(alphaX)))  # alpha 2
    else
        parlist = c(parlist, .jarray(alphaX, dispatch=TRUE))

    paramNames = names(control)
    if(trace)
        print(control)

    if(length(paramNames) > 0)
    {
        for (i in 1:length(paramNames)) {
            paramName = paramNames[i]
            paramValue = control[[i]]

            matlabValue = rType2MatlabType(paramName, paramValue)
            parlist = c(parlist, .jnew("java/lang/String", paramName), matlabValue)
        }
    }

    out <- callFsdaFunction("tclustreg", "[Ljava/lang/Object;", 1, parlist)
    if(is.null(out))
        return(NULL)

    arr1 = .jcast(out[[1]], "com/mathworks/toolbox/javabuilder/MWStructArray")
    arr = .jnew("org/jrc/ipsc/globesec/sitaf/fsda/FsdaMWStructArray", arr1)

    if(trace)
    {
        cat("\nReturning from MATLAB tclustreg().  Fields returned by MATLAB: \n")
        print(arr$fieldNames())
    }

    bopt = as.matrix(.jevalArray(arr$get("bopt", as.integer(1)), "[[D", simplify = TRUE))
##    dimnames(bopt) <- list(paste0("B", 1:k), if(is.null(dimnames(x)[2])) paste0("X", 1:ncol(x)) else dimnames(x)[[2]])
##    bopt <- t(bopt)

    sigma2opt = as.matrix(.jevalArray(arr$get("sigma2opt", as.integer(1)), "[[D", simplify = TRUE))
    sigma2opt_corr = as.matrix(.jevalArray(arr$get("sigma2opt_corr", as.integer(1)), "[[D", simplify = TRUE))

    muXopt = if(as.integer(arr$hasField("muXopt", as.integer(1))) != 1) NULL
            else as.matrix(.jevalArray(arr$get("muXopt", as.integer(1)), "[[D", simplify = TRUE))
    if(!is.null(muXopt))
    {
        dimnames(muXopt) <- list(paste0("C", 1:k), if(is.null(dimnames(x)[2])) paste0("X", 1:ncol(x)) else dimnames(x)[[2]])
        muXopt <- t(muXopt)
    }

    sigmaXopt = if(as.integer(arr$hasField("sigmaXopt", as.integer(1))) != 1) NULL
                else as.matrix(.jevalArray(arr$get("sigmaXopt", as.integer(1)), "[[D", simplify = TRUE))

    if(!is.null(muXopt))
    {
        rows <- paste0("C", 1:k)
        cols <- if(is.null(dimnames(x)[[2]])) paste0("X", 1:ncol(x)) else dimnames(x)[[2]]
        if(k == 1)
            dimnames(sigmaopt) <- list(cols, cols)
        else
            dimnames(sigmaopt) <- list(cols, cols, rows)
    }

    idx = as.vector(as.matrix(.jevalArray(arr$get("idx", as.integer(1)), "[[D", simplify = TRUE)))

    size = as.matrix(.jevalArray(arr$get("siz", as.integer(1)), "[[D", simplify = TRUE))
##    rows <- if(dim(size)[1] == k) paste0("C", 1:k) else paste0("C", 0:k)
    rows <- 1:nrow(size)
    cols <- c("Cluster", "Size", "Percent")
    dimnames(size) <- list(rows, cols)

    idx_before_tr = if(as.integer(arr$hasField("idx_before_tr", as.integer(1))) != 1) NULL
                    else as.vector(as.matrix(.jevalArray(arr$get("idx_before_tr", as.integer(1)), "[[D", simplify = TRUE)))

    post <- NULL
    if(as.integer(arr$hasField("post", as.integer(1))) == 1)
    {
        post <- as.matrix(.jevalArray(arr$get("post", as.integer(1)), "[[D", simplify = TRUE))
        rows <- if(is.null(dimnames(x)[[1]])) 1:nrow(x) else dimnames(x)[[1]]
        cols <- paste0("C", 1:k)
        dimnames(post) <- list(rows, cols)
    }

    vopt <- if(as.integer(arr$hasField("vopt", as.integer(1))) != 1) NULL
            else .jevalArray(arr$get("vopt", as.integer(1)), "[[D", simplify = TRUE)

    we <- if(as.integer(arr$hasField("we", as.integer(1))) != 1) NULL
          else as.vector(as.matrix(.jevalArray(arr$get("we", as.integer(1)), "[[D", simplify = TRUE)))

    postprobopt <- if(as.integer(arr$hasField("postprobopt", as.integer(1))) != 1) NULL
                   else as.matrix(.jevalArray(arr$get("postprobopt", as.integer(1)), "[[D", simplify = TRUE))

    MIXMIX <- if(as.integer(arr$hasField("MIXMIX", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("MIXMIX", as.integer(1)), "[[D", simplify = TRUE)))
    MIXCLA <- if(as.integer(arr$hasField("MIXCLA", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("MIXCLA", as.integer(1)), "[[D", simplify = TRUE)))
    CLACLA <- if(as.integer(arr$hasField("CLACLA", as.integer(1))) != 1) NULL
                 else as.vector(as.matrix(.jevalArray(arr$get("CLACLA", as.integer(1)), "[[D", simplify = TRUE)))

    ans <- list(call=match.call(), alphaLik=alphaLik, alphaX=alphaX, k=k, restrfactor=restrfactor,
            bopt=bopt, sigma2opt=sigma2opt, sigma2opt_corr=sigma2opt_corr,
            muXopt=muXopt, sigmaXopt=sigmaXopt, idx=idx, size=size, idx_before_tr=idx_before_tr,
            post=post, vopt=vopt, we=we, postprobopt=postprobopt,
            MIXMIX=MIXMIX, MIXCLA=MIXCLA, CLACLA=CLACLA)

    freeMatlabResources(out)

    class(ans) <- outclass
    return (ans)
}