R/lsbclust.R
In lsbclust: Least-Squares Bilinear Clustering for Three-Way Data

#' Least-squares Bilinear Clustering of Three-way Data
#' 
#' This function clusters along one way of a three-way array (as specified by \code{margin}) while
#' decomposing along the other two dimensions. Four types of clusterings are allowed based on the
#' respective two-way slices of the array: on the overall means, row margins, column margins and the 
#' interactions between rows and columns. Which clusterings can be fit is determined by the vector
#' \code{delta}, with four binary elements. All orthogonal models are fitted. 
#' The nonorthogonal case \code{delta = (1, 1, 0, 0)} returns an error. See the reference for further details.
#' 
#' @param data A three-way array representing the data.
#' @param margin An integer giving the single subscript of \code{data} over which the clustering 
#' will be applied. 
#' @param delta A four-element binary vector (logical or numeric) indicating which sum-to-zero 
#' constraints must be enforced.
#' @param nclust A vector of length four giving the number of clusters for the overall mean, the row
#' margins, the column margins and the interactions (in that order) respectively. Alternatively, a
#' vector of length one, in which case all components will have the same number of clusters.
#' @param ndim The required rank for the approximation of the interactions (a scalar).
#' @param fixed One of \code{"none"}, \code{"rows"} or \code{"columns"} indicating whether to fix neither
#' sets of coordinates, or whether to fix the row or column coordinates across clusters respectively.
#' If a vector is supplied, only the first element will be used (passed to \code{\link{int.lsbclust}}).
#' @param nstart The number of random starts to use for the interaction clustering.
#' @param nstart.kmeans The number of random starts to use in \code{\link{kmeans}}.
#' @param starts A list containing starting configurations for the cluster membership vector. If not
#' supplied, random initializations will be generated (passed to \code{\link{int.lsbclust}}).
#' @param alpha Numeric value in [0, 1] which determines how the singular values are distributed
#' between rows and columns (passed to \code{\link{int.lsbclust}}).
#' @param parallel Logical indicating whether to parallel over different starts or not 
#' (passed to \code{\link{int.lsbclust}}).
#' @param maxit The maximum number of iterations allowed in the interaction clustering.
#' @param verbose Integer controlling the amount of information printed: 0 = no information, 
#' 1 = Information on random starts and progress, and 2 = information is printed after
#' each iteration for the interaction clustering.
#' @param method The method for calculating cluster agreement across random starts, passed on
#' to \code{\link{cl_agreement}} (passed to \code{\link{int.lsbclust}}).
#' @param sep.nclust Logical indicating how nclust should be used across different \code{type}'s.
#' If \code{sep.nclust} is \code{TRUE}, \code{nclust} is recycled so that each \code{type} can
#' have a different number of clusters. If \code{sep.nclust} is \code{FALSE}, the same vector
#' \code{nclust} is used for all \code{type}'s.
#' @param type One of \code{"rows"}, \code{"columns"} or \code{"overall"} (or a unique abbreviation of 
#' one of these) indicating whether clustering should be done on row margins, column margins or
#' the overall means of the two-way slices respectively. If more than one opion are supplied, the
#' algorithm is run for all (unique) options supplied (passed to \code{\link{orc.lsbclust}}). This
#' is an optional argument.
#' @param \dots Additional arguments passed to \code{\link{kmeans}}.
#' @return Returns an object of S3 class \code{lsbclust} which has slots:
#'    \item{\code{overall}}{Object of class \code{ovl.kmeans} for the overall means clustering}
#'    \item{\code{rows}}{Object of class \code{row.kmeans} for the row means clustering}
#'    \item{\code{columns}}{Object of class \code{col.kmeans} for the column means clustering}
#'    \item{\code{interactions}}{Object of class \code{int.lsbclust} for the interaction clustering}
#'    \item{\code{call}}{The function call used to create the object}
#'    \item{\code{delta}}{The value of \code{delta} in the fit}
#'    \item{\code{df}}{Breakdown of the degrees-of-freedom across the different subproblems}
#'    \item{\code{loss}}{Breakdown of the loss across subproblems}
#'    \item{\code{time}}{Time taken in seconds to calculate the solution}
#'    \item{\code{cluster}}{Matrix of cluster membership per observation for all cluster types}
#' @seealso \code{\link{int.lsbclust}}, \code{\link{orc.lsbclust}}
#' @export
#' @references Schoonees, P.C., Groenen, P.J.F., Van de Velden, M. Least-squares Bilinear Clustering
#' of Three-way Data. Econometric Institute Report, EI2014-23.
#' @importFrom graphics plot
#' @importFrom methods is
#' @import stats
lsbclust <- function(data, margin = 3L, delta = c(1L, 1L, 1L, 1L), nclust, ndim = 2L,
                     fixed = c("none", "rows", "columns"), nstart = 20L, starts = NULL, 
                     nstart.kmeans = 500L, alpha = 0.5, parallel = FALSE, maxit = 100L, verbose = 1, 
                     method = "diag", type = NULL, sep.nclust = TRUE, ...) {
  
  ## Capture call, start time
  time0 <- proc.time()[3]
  cll <- match.call()
  
  ## Preliminaries
  J <- dim(data)[-margin][1]
  K <- dim(data)[-margin][2]
  
  ## Select correct option for fixed (only one)
  fixed <- match.arg(fixed)
  
  ## Sanity checks
  delta <- as.numeric(delta)
  if (length(delta) != 4 || !all(delta %in% 0:1))  stop("Argument 'delta' supplied in an unrecognized format.")
  if (all(delta == c(1, 1, 0, 0))) stop("This model is non-orthogonal")
  if (!is(data, "array") || length(dim(data)) != 3) stop("Data must be a three-way array.") 
  if (!all(margin %in% 1:3) || length(margin) != 1) stop("Argument 'margin' must be 1, 2 or 3.")
  
  ## Check also ... arguments against stats::kmeans()
  args.dots <- list(...)
  nms.dots <- names(args.dots)
  kmeans.args <- names(formals(stats::kmeans))
  check.dots <- nms.dots %in% kmeans.args
  if (any(!check.dots)) {
    stop("Unknown argument(s): ", nms.dots[!check.dots])    
  }
  
  ## Make sure there are dimnames for ways not clustered over
  dnms <- dimnames(data)
  if (is.null(dnms)) namenull <- rep(TRUE, 3L)
  else namenull <- sapply(dimnames(data), is.null)
  if (any(namenull[-margin])) {
    dims <- !(seq_len(3) == margin)
    dimnames(data)[dims & namenull] <- list(paste0("Row", seq_len(J)), paste0("Col", seq_len(K)))[namenull[-margin]]
  }
  
  ## If nclust is of length one, expand it to length 4
  if (length(nclust) == 1) nclust <- rep(nclust, 4)
  if (length(nclust) != 4) stop("nclust should be either of length 1 or 4.")
  
  ## Other components
  orc <- orc.lsbclust(data = data, margin = margin, delta = delta, nclust = nclust[-4L], 
                      sep.nclust = sep.nclust, type = type, nstart = nstart.kmeans, 
                      verbose = verbose, ...)
  if (all(delta == c(1, 0, 0, 0)) || all(delta == c(0, 1, 0, 0)) || 
        all(delta == c(0, 1, 1, 0)) || all(delta == c(1, 0, 0, 1))) {
    orc <- list(orc)
    names(orc) <- ifelse(delta[1], "columns", "rows")
  }
  ind.orc <- c("overall", "rows", "columns") %in% names(orc)
  
  ## Interactions
  if (verbose) {
    cat(paste0("Interaction clustering (", nstart, " starts)..."))
  }
  int <- int.lsbclust(data = data, margin = margin, delta = delta, nclust = nclust[4L], ndim = ndim,
                      fixed = fixed, nstart = nstart, starts = starts, alpha = alpha, 
                      parallel = parallel, maxit = maxit, verbose = verbose, method = method)
  if (verbose) {
    cat("\tDONE\n")
  }
  
  ## Total degrees-of-freedom
  df <- unlist(c(sapply(orc, '[[', 'df'), interactions = int$df))
  
  ## Total loss
  loss <- unlist(c(mapply(function(x, y) x$tot.withinss * y, orc, c(J*K, K, J)[ind.orc]), 
            interactions = int$minloss * int$maxloss))
  
  ## Process output
  out <- list(overall = orc$overall, rows = orc$rows, columns = orc$columns, interactions = int, 
              call = cll, delta = delta, df = df, loss = loss, time = proc.time()[3] - time0)
  
  ## Collect all clusterings
  clust <- do.call(cbind, lapply(out[1:4], "[[", "cluster"))
  out$cluster <- clust
  
  ## Compute fitted values
  fitted <- array(0, dim = dim(data))
  odelta <- delta[1] * delta[3] + delta[2] * delta[4] - delta[1] * delta[2]
  
  ## Permute so third way represents observations
  fitted <- aperm.default(fitted, perm = int$perm)
  
  ## Construct array of fitted values
  for (i in seq_len(dim(data)[margin])) {
    fitted[, , i] <- int$means[[int$cluster[i]]]
    if (odelta)
      fitted[, , i] <- fitted[, , i] + orc$overall$centers[orc$overall$cluster[i], ]
    if (delta[2L])
      fitted[, , i] <- fitted[, , i] + orc$rows$centers[orc$rows$cluster[i], ] %o% rep(1L, K) 
    if (delta[1L])
      fitted[, , i] <- fitted[, , i] + rep(1L, J) %o% orc$columns$centers[orc$columns$cluster[i], ]
  }
  dimnames(fitted) <- dimnames(data)
  
  ## Permute to original shape
  fitted <- aperm.default(fitted, perm = order(int$perm))
  
  ## Add fitted to out
  out$fitted <- fitted
  
  ## Update S3 class
  class(out) <- "lsbclust"
  return(out)
}