
#   mlmatrixt.R
#   MixMatrix: Classification with Matrix Variate Normal and t distributions
#   Copyright (C) 2018-9  GZ Thompson <gzthompson@gmail.com>
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#   along with this program; if not, a copy is available at
#   https://www.R-project.org/Licenses/

#' Maximum likelihood estimation for matrix variate t distributions
#'
#' For the matrix variate normal distribution, maximum likelihood estimates
#' exist for \eqn{N > \max(p/q, q/p) + 1} and are unique for
#' \eqn{N > \max(p, q)}. The sample size required for the matrix variate t
#' has not been worked out, but this serves as a lower bound. This function
#' implements an ECME algorithm to estimate the mean, covariance, and
#' degrees of freedom parameters. An AR(1), compound symmetry, or
#' independence restriction can be imposed on either or both variance
#' matrices; however, if a restriction is inappropriate for the data,
#' estimation may fail with a warning.
#'
#' @param data Either a list of matrices or a 3-D array with matrices in
#'    dimensions 1 and 2, indexed by dimension 3.
#' @param row.mean By default, `FALSE`. If `TRUE`, will fit a
#'    common mean within each row. If both this and `col.mean` are
#'    `TRUE`, there will be a common mean for the entire matrix.
#' @param col.mean By default, `FALSE`. If `TRUE`, will fit a
#'    common mean within each column. If both this and `row.mean` are
#'    `TRUE`, there will be a common mean for the entire matrix.
#' @param row.variance Imposes a variance structure on the rows. Either
#'     'none', 'AR(1)', 'CS' for 'compound symmetry', 'Correlation' for a
#'     correlation matrix, or 'Independence' for
#'     independent and identical variance across the rows.
#'     Only positive correlations are allowed for AR(1) and CS, and
#'     convergence is not guaranteed under these restrictions.
#'     Note that while maximum likelihood estimators are available (and used)
#'     for the unconstrained variance matrices, `optim` is used for any
#'     constraints, so constrained fits may be considerably slower.
#' @param col.variance  Imposes a variance structure on the columns.
#'     Either 'none', 'AR(1)', 'CS', 'Correlation', or 'Independence'.
#'     Only positive correlations are allowed for
#'     AR(1) and CS.
#' @param df Starting value for the degrees of freedom. If `fixed = TRUE`,
#'     then this is required and not updated. By default, set to 10.
#' @param fixed Whether `df` is fixed or estimated. If `TRUE` (the
#'     default), `df` is held at the supplied value; if `FALSE`, it is
#'     estimated by the ECME algorithm.
#' @param tol Convergence criterion, measured against the average squared
#'    change in the parameter estimates between iterations.
#' @param max.iter Maximum possible iterations of the algorithm.
#' @param U (optional) Can provide a starting point for the U matrix.
#'    By default, an identity matrix.
#' @param V (optional) Can provide a starting point for the V matrix.
#'    By default, an identity matrix.
#' @param ... (optional) additional arguments, passed to `optim` if using
#'    restrictions on the variance and to `uniroot` for the `df` update.
#'
#' @return Returns a list with the following elements:
#' \describe{
#'       \item{`mean`}{the mean matrix}
#'       \item{`U`}{the between-row covariance matrix}
#'       \item{`V`}{the between-column covariance matrix}
#'       \item{`var`}{the scalar variance parameter
#'            (the first entry of the row covariance is restricted to
#'            unity; this parameter carries the overall scale)}
#'       \item{`nu`}{the degrees of freedom parameter}
#'       \item{`iter`}{the number of iterations}
#'       \item{`tol`}{the squared difference between iterations of
#'            the variance matrices at the time of stopping}
#'       \item{`logLik`}{log likelihood of result.}
#'       \item{`convergence`}{a convergence flag,
#'       `TRUE` if converged.}
#'       \item{`call`}{The (matched) function call.}
#'    }
#'
#' @export
#' @seealso [rmatrixnorm()], [rmatrixt()],
#' [MLmatrixnorm()]
#'
#' @references
#'     Thompson, G Z.  R Maitra, W Q Meeker, A Bastawros (2019),
#'     "Classification with the matrix-variate-t distribution", arXiv
#'     e-prints arXiv:1907.09565 <https://arxiv.org/abs/1907.09565>
#'
#'     Dickey, James M. 1967. “Matricvariate Generalizations of the
#'     Multivariate t Distribution and the Inverted Multivariate t
#'     Distribution.” Ann. Math. Statist. 38 (2): 511–18.
#'     \doi{10.1214/aoms/1177698967}
#'
#'     Liu, Chuanhai, and Donald B. Rubin. 1994. “The ECME Algorithm:
#'     A Simple Extension of EM and ECM with Faster Monotone Convergence.”
#'     Biometrika 81 (4): 633–48.
#'           \doi{10.2307/2337067}
#'
#'    Meng, Xiao-Li, and Donald B. Rubin. 1993. “Maximum Likelihood Estimation
#'    via the ECM Algorithm: A General Framework.” Biometrika 80 (2): 267–78.
#'             \doi{10.1093/biomet/80.2.267}
#'
#'     Rubin, D.B. 1983. In Encyclopedia of Statistical Sciences, 4th ed.,
#'       272–75. John Wiley.
#'
#' @examples
#' set.seed(20180202)
#' # drawing from a distribution with specified mean and covariance
#' A <- rmatrixt(
#'   n = 100, mean = matrix(c(100, 0, -100, 0, 25, -1000), nrow = 2),
#'   L = matrix(c(2, 1, 0, .1), nrow = 2), list = TRUE, df = 5
#' )
#' # fitting maximum likelihood estimates
#' results <- MLmatrixt(A, tol = 1e-5, df = 5)
#' print(results)
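#'
#' # a sketch of further options: freeing the degrees of freedom and imposing
#' # an AR(1) structure on the rows; these fits can be considerably slower,
#' # so they are wrapped in \donttest
#' \donttest{
#' results_free <- MLmatrixt(A, tol = 1e-5, df = 5, fixed = FALSE)
#' results_ar1 <- MLmatrixt(A, tol = 1e-5, df = 5, row.variance = "AR(1)")
#' }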
MLmatrixt <- function(data, row.mean = FALSE, col.mean = FALSE,
                      row.variance = "none", col.variance = "none",
                      df = 10, fixed = TRUE,
                      tol = .Machine$double.eps^0.5, max.iter = 5000, U, V,
                      ...) {
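  # NULL, zero, or infinite df corresponds to the matrix variate normal;
  # delegate to MLmatrixnorm in that case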
  if (is.null(df) || df == 0 || is.infinite(df)) {
    return(MLmatrixnorm(
      data, row.mean, col.mean, row.variance,
      col.variance, tol, max.iter, U, V, ...
    ))
  }
  # coerce a list of matrices into a p x q x n array
  if (inherits(data, "list")) {
    data <- array(unlist(data),
      dim = c(
        nrow(data[[1]]),
        ncol(data[[1]]), length(data)
      )
    )
  }
  if (!all(
    is.numeric(data), is.numeric(tol),
    is.numeric(max.iter)
  )) {
    stop("Non-numeric input. ")
  }
  if (!(missing(U))) {
    if (!(is.numeric(U))) stop("Non-numeric input.")
  }
  if (!(missing(V))) {
    if (!(is.numeric(V))) stop("Non-numeric input.")
  }

  # parse the variance restriction options into a flag and a canonical name
  rowvarparse <- .varparse(row.variance)
  row_set_var <- rowvarparse$varflag
  row.variance <- rowvarparse$varopt

  colvarparse <- .varparse(col.variance)
  col_set_var <- colvarparse$varflag
  col.variance <- colvarparse$varopt


  dims <- dim(data)

  # require enough observations for the estimates to exist (see Details)
  if (max(dims[1] / dims[2], dims[2] / dims[1]) > (dims[3] - 1)) {
    stop("Need more observations to estimate parameters.")
  }
  # don't have initial starting point for U and V, start with diag.
  if (missing(U)) {
    est_u <- diag(dims[1])
  } else {
    est_u <- U
  }
  if (missing(V)) {
    est_v <- diag(dims[2])
  } else {
    est_v <- V
  }

  mu <- rowMeans(data, dims = 2)
  if (row.mean) {
    # make it so that the mean is constant within a row
    mu <- matrix(rowMeans(mu), nrow = dims[1], ncol = dims[2])
  }
  if (col.mean) {
    # make it so that the mean is constant within a column
    mu <- matrix(colMeans(mu), nrow = dims[1], ncol = dims[2], byrow = TRUE)
  }
  # if both are true, this makes it so the mean is constant all over
  swept_data <- sweep(data, c(1, 2), mu)
  iter <- 0
  error_term <- 1e+40

  # when a structure is imposed, initialize its correlation parameter from
  # the unstructured moment estimate, clamp it to [0, 0.9], and generate
  # the structured starting matrix
  if (col_set_var) {
    if (est_v[1, 2] > 0) {
      rho_col <- est_v[1, 2]
    } else {
      inter_v <- txax(swept_data, 0.5 * (est_u + t(est_u)))
      est_v <- rowSums(inter_v, dims = 2) / (dims[3] * dims[1])
      if (col.variance == "AR(1)") {
        est_v <- stats::cov2cor(est_v)
        rho_col <- est_v[1, 2]
      }
      if (col.variance == "CS") {
        est_v <- stats::cov2cor(est_v)
        rho_col <- mean(est_v[1, ] / est_v[1, 1])
      }
      if (col.variance == "I") rho_col <- 0
      if (rho_col > .9) rho_col <- .9
      if (rho_col < 0) rho_col <- 0
      est_v <- varmatgenerate(dims[2], rho_col, col.variance)
    }
  }

  if (row_set_var) {
    if (est_u[1, 2] > 0) {
      rho_row <- est_u[1, 2]
    } else {
      inter_u <- xatx(swept_data, 0.5 * (est_v + t(est_v)))
      est_u <- rowSums(inter_u, dims = 2) / (dims[3] * dims[2])
      if (row.variance == "AR(1)") {
        est_u <- stats::cov2cor(est_u)
        rho_row <- est_u[1, 2] # est_u[1, 1] is 1 after cov2cor
      }
      if (row.variance == "CS") {
        est_u <- stats::cov2cor(est_u)
        rho_row <- mean(est_u[1, ] / est_u[1, 1])
      }
      if (row.variance == "I") rho_row <- 0
      if (rho_row > .9) rho_row <- .9
      if (rho_row < 0) rho_row <- 0
      est_u <- varmatgenerate(dims[1], rho_row, row.variance)
    }
  }

  varflag <- FALSE

  p <- dims[1]
  q <- dims[2]
  n <- dims[3]


  while (iter < max.iter && error_term > tol && (!varflag)) {
    ### E step
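    # .sstep returns the weighted sufficient statistics for the E step:
    # ss, ssx, and ssxx feed the conditional-maximization updates of the
    # mean and the two covariance matrices, while ssd (a log-determinant
    # sum) feeds the degrees-of-freedom update below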
    s_list <- .sstep(data, mu, est_u, est_v, rep(1.0, n))
    ss <- s_list$ss
    ssx <- s_list$ssx
    ssxx <- s_list$ssxx
    ssd <- s_list$ssd


    ### CM STEP
    ### MEANS:
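    # CM step for the mean, honoring the row.mean / col.mean restrictions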
    new_mu <- .means_function(data,
      v = est_v, ss, ssx, rep(1.0, n),
      row.mean, col.mean, "t"
    )

    ### VARS:
    colvarlist <- .col_vars(
      data, new_mu, df, rep(1.0, n), ss, ssx, ssxx,
      col.variance, col_set_var, varflag
    )
    varflag <- colvarlist$varflag
    if (!varflag) new_v <- colvarlist$V

    rowvarlist <- .row_vars(
      data, new_mu, df, rep(1, n), ss, ssx, ssxx,
      row.variance, row_set_var, varflag
    )
    varflag <- rowvarlist$varflag
    if (!varflag) new_u <- rowvarlist$U

    ### IF NU UPDATE
    if (!fixed) {
      new_df <- df
      ## insert E step for NU and M step for NU
      ssdtmp <- ssd

      detss <- determinant(ss, logarithm = TRUE)$modulus[1]
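      # nu_ll is proportional to the derivative in nu of the expected
      # log-likelihood (mvdigamma is the derivative of the log of the
      # multivariate gamma function), so its root is the updated df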
      nu_ll <- function(nu) {
        (CholWishart::mvdigamma((nu + p - 1) / 2, p) -
          CholWishart::mvdigamma((nu + p + q - 1) / 2, p) -
          (ssdtmp / n - (detss - p * log(n * (nu + p - 1)) +
            p * log(nu + p + q - 1))))
        # this latest ECME-ish one gives SLIGHTLY different results but is faster
        # (ssdtmp/n +  determinant(new_u, logarithm = TRUE)$modulus[1]))
      }
      if (!isTRUE(sign(nu_ll(1e-6)) * sign(nu_ll(1000)) <= 0)) {
        warning("Endpoints of derivative of df likelihood do not have
opposite sign.
                 Check df specification.")
        varflag <- TRUE
      } else {
        fit0 <- stats::uniroot(nu_ll, c(1e-6, 1000), ...)
        new_df <- fit0$root
      }
    } else {
      new_df <- df
    }
    ### CHECK CONVERGENCE
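    # error_term averages the squared change in each parameter block;
    # iteration stops once it falls below tol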
    error_term <- sum((new_v - est_v)^2) / (q * q) +
      sum((new_u - est_u)^2) / (p * p) +
      sum((new_mu - mu)^2) / (p * q) + (df - new_df)^2 / (n * p * q)
    ### check, force symmetry
    if (any(abs(new_v - t(new_v)) > tol)) {
      warning("V matrix may not be symmetric")
    }

    if (any(abs(new_u - t(new_u)) > tol)) {
      warning("U matrix may not be symmetric")
    }
    est_v <- .5 * (new_v + t(new_v))
    est_u <- .5 * (new_u + t(new_u))
    mu <- new_mu
    df <- new_df

    iter <- iter + 1
  }
  if (iter >= max.iter || error_term > tol || varflag) {
    warning("Failed to converge")
  }
  log_lik <- sum(dmatrixt(data, mu, U = est_u, V = est_v, df = df, log = TRUE))
  converged <- !(iter >= max.iter || error_term > tol || varflag)

  return(list(
    mean = mu,
    U = est_u / est_u[1, 1],
    V = est_v,
    var = est_u[1, 1],
    nu = df,
    iter = iter,
    tol = error_term,
    logLik = log_lik,
    convergence = converged,
    call = match.call()
  ))
}
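
## A minimal standalone sketch (guarded so it never executes) of the idea
## behind the degrees-of-freedom CM step above: solve a one-dimensional
## score equation with stats::uniroot over a wide bracket. The `score`
## function is a hypothetical stand-in, not the expression used in MLmatrixt.
if (FALSE) {
  score <- function(nu) log(nu / 2) - digamma(nu / 2) - 0.05
  stats::uniroot(score, c(1e-6, 1000))$root # root near nu ~ 20
}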
