R/dist_t.R

Defines functions MtM_complete_data MtM_incomplete_data

############################################################################
###                                                                      ###
###              Multivariate t Mixture for Incomplete Data              ###
###                                                                      ###
############################################################################

MtM_incomplete_data <- function(
    X,
    G,
    model          = c('t', 'C'),
    max_iter       = 20,
    epsilon        = 0.01,
    init_method    = c("kmedoids", "kmeans", "hierarchical", "mclust", "manual"),
    clusters       = NULL,
    outlier_cutoff = 0.95,
    progress       = TRUE
) {

  #----------------------#
  #    Input Checking    #
  #----------------------#

  model <- match.arg(model)

  if (model == 't') {
    if (outlier_cutoff <= 0 | outlier_cutoff >= 1) {
      stop('outlier_cutoff must be in (0, 1)')
    }
  }

  #------------------------------------#
  #    Objects for the EM Algorithm    #
  #------------------------------------#

  n <- nrow(X)
  d <- ncol(X)

  do <- rowSums(!is.na(X))
  R  <- is.na(X)
  M  <- unique(R)
  np <- nrow(M)

  Im <- vector('list', np)    # which observations with missing pattern j

  for (j in 1:np) {
    Im[[j]] <- which( apply(R, 1, function(r) all(r == M[j, ]) ) )
  }

  z           <- matrix(NA_real_, nrow = n, ncol = G)
  z_tilde     <- matrix(NA_real_, nrow = n, ncol = G)
  w_tilde     <- matrix(NA_real_, nrow = n, ncol = G)
  zw_tilde    <- matrix(NA_real_, nrow = n, ncol = G)
  X_tilde     <- array(rep(X, G), dim = c(n, d, G))
  Sigma_tilde <- array(NA_real_, dim = c(d, d, n, G))

  py    <- rep(NA_real_, G)
  mu    <- matrix(NA_real_, nrow = G, ncol = d)
  Sigma <- array(NA_real_, dim = c(d, d, G))

  if (model == 't') {
    df <- rep(10, G)
  }

  if (model == 'C') {
    df <- rep(1, G)
  }

  log_dens <- matrix(NA_real_, nrow = n, ncol = G)
  iter     <- 0
  loglik   <- NULL

  #--------------------------------#
  #    Parameter Initialization    #
  #--------------------------------#

  init_method <- match.arg(init_method)

  X_imp <- mean_impute(X)

  if (G == 1) {

    max_iter <- 1

    pars <- cluster_pars(X = X_imp, clusters = rep(1, n))

    py    <- 1
    mu    <- pars$mu
    Sigma <- pars$Sigma

  } else {

    init <- initialize_clusters(
      X           = X_imp,
      G           = G,
      init_method = init_method,
      clusters    = clusters
    )

    py    <- init$pi
    mu    <- init$mu
    Sigma <- init$Sigma

  }

  #---------------------#
  #  The EM algorithm   #
  #---------------------#

  if (progress) {
    cat('\nModel Fitting:\n')
    pb <- txtProgressBar(min = 0, max = max_iter, style = 3, width = 75, char = "=")
  }

  while (iter < max_iter & getall(loglik) > epsilon) {

    #++++ E-step ++++#

    for (g in 1:G) {
      for (j in 1:np) {

        m    <- M[j, ]                         # missing pattern j
        o    <- !m                             # observed pattern j
        Xo_j <- X[Im[[j]], o, drop = FALSE]    # observations with missing pattern j

        mu_o     <- mu[g, o]
        Sigma_oo <- Sigma[o, o, g]

        z[Im[[j]], g]       <- mvtnorm::dmvt(Xo_j, delta = mu_o, sigma = as.matrix(Sigma_oo), df = df[g], log = FALSE)
        w_tilde[Im[[j]], g] <- (df[g] + sum(o)) / (df[g] + mahalanobis(Xo_j, mu_o, Sigma_oo, tol = 1e-20))

      }
    }

    z_tilde <- sweep(z, 2, py, '*')
    z_tilde <- z_tilde / rowSums(z_tilde)

    z_tilde[is.infinite(z_tilde) | is.nan(z_tilde)] <- 1/G

    for (g in 1:G) {

      zw_tilde[, g] <- z_tilde[, g] * w_tilde[, g]

      for (j in 1:np) {

        m <- M[j, ]    # missing pattern j

        if (any(m)) {

          o <- !m        # observed pattern j

          mu_m <- mu[g, m]
          mu_o <- mu[g, o]

          Sigma_oo     <- Sigma[o, o, g]
          Sigma_mo     <- Sigma[m, o, g]
          Sigma_oo_inv <- mnormt::pd.solve(Sigma_oo)

          for (i in Im[[j]]) {
            xi <- X[i, ]

            x_ig_tilde       <- mu_m + Sigma_mo %*% Sigma_oo_inv %*% (xi[o] - mu_o)
            X_tilde[i, m, g] <- x_ig_tilde
          }

        }

      }

    }

    #++++ M-step: pi ++++#

    N  <- colSums(z_tilde)
    py <- N / n

    #++++ M-step: mu ++++#

    for (g in 1:G) {
      mu_num  <- colSums(z_tilde[, g] * w_tilde[, g] * X_tilde[, , g])
      mu_den  <- sum(z_tilde[, g] * w_tilde[, g])
      mu[g, ] <- mu_num / mu_den
    }

    #++++ M-step: Prepare Sigma tilde ++++#

    for (g in 1:G) {
      for (j in 1:np) {

        m <- M[j, ]    # missing pattern j

        if (any(m)) {

          o <- !m        # observed pattern j

          mu_m <- mu[g, m]
          mu_o <- mu[g, o]

          Sigma_oo     <- Sigma[o, o, g]
          Sigma_om     <- Sigma[o, m, g]
          Sigma_mo     <- Sigma[m, o, g]
          Sigma_mm     <- Sigma[m, m, g]
          Sigma_oo_inv <- mnormt::pd.solve(Sigma_oo)

          S_mm <- Sigma_mm - Sigma_mo %*% Sigma_oo_inv %*% Sigma_om

          for (i in Im[[j]]) {
            xi <- X[i, ]

            Sigma_tilde[o, o, i, g] <- tcrossprod(xi[o] - mu_o)
            Sigma_tilde[o, m, i, g] <- tcrossprod(xi[o] - mu_o, X_tilde[i, m, g] - mu_m)
            Sigma_tilde[m, o, i, g] <- t(Sigma_tilde[o, m, i, g])
            Sigma_tilde[m, m, i, g] <- tcrossprod(X_tilde[i, m, g] - mu_m) + S_mm / w_tilde[i, g]
          }

        } else {

          X_centrd <- sweep(X[Im[[j]], ], 2, mu[g, ], '-')
          cr_prods <- apply(X_centrd, 1, tcrossprod)

          Sigma_tilde[, , Im[[j]], g] <- array(
            data = unlist(cr_prods),
            dim  = c(d, d, length(Im[[j]]))
          )

        }

      }
    }


    for (g in 1:G) {

      #++++ M-step: Sigma ++++#

      slc_ind      <- slice.index(Sigma_tilde[, , , g, drop = FALSE], 3)
      Sigma_num    <- rowSums(zw_tilde[slc_ind, g] * Sigma_tilde[, , ,g, drop = FALSE], dims = 2)
      Sigma[, , g] <- Sigma_num / N[g]

      if (max(abs(Sigma[, , g] - t(Sigma[, , g]))) > .Machine$double.eps) {
        matr <- Sigma[, , g]
        matr[lower.tri(matr)] <- t(matr)[lower.tri(t(matr))]
        Sigma[, , g] <- matr
      }

      #++++ M-step: Degrees of Freedom ++++#

      if (model == 't') {
        root <- tryCatch({

          uniroot(function(a) {

            A <- -digamma(a / 2) + log(a / 2) + 1
            B <-  sum( z_tilde[, g] * (log(w_tilde[, g]) - w_tilde[, g]) ) / N[g]
            C <-  sum( z_tilde[, g] * (digamma((df[g] + do) / 2) - log((df[g] + do) / 2)) ) / N[g]

            return(A + B + C)

          }, lower = 0.001, upper = 200)$root

        }, error = function(e) return(NULL))

        df[g] <- ifelse(!is.null(root), root, df[g])
      }

      if (model == 'C') {
        df[g] <- 1
      }

    }

    #++++ Observed Log-Likelihood ++++#

    for (g in 1:G) {
      for (j in 1:np) {

        m    <- M[j, ]                         # missing pattern j
        o    <- !m                             # observed pattern j
        Xo_j <- X[Im[[j]], o, drop = FALSE]    # observations with missing pattern j

        mu_o     <- mu[g, o]
        Sigma_oo <- Sigma[o, o, g]

        log_dens[Im[[j]], g] <- mvtnorm::dmvt(Xo_j, delta = mu_o, sigma = as.matrix(Sigma_oo), df = df[g], log = TRUE)

      }
    }

    log_py_dens  <- sweep(log_dens, 2, log(py), FUN = '+')
    final_loglik <- sum( apply(log_py_dens, 1, log_sum_exp) )
    loglik       <- c(loglik, final_loglik)

    #++++ Update Progress ++++#

    iter <- iter + 1

    if (progress) {
      setTxtProgressBar(pb, iter)
    }

  }

  if (progress) {
    close(pb)
    if (iter < max_iter) {
      cat('\nConvergence was reached before', max_iter, 'iterations\n')
    }
  }

  #---------------------------#
  #    Cluster Memberships    #
  #---------------------------#

  clusters <- apply(z_tilde, 1, which.max)

  #------------------#
  #    Imputation    #
  #------------------#

  X_imputed <- X
  complete  <- complete.cases(X)

  for (i in which(!complete)) {
    X_imputed[i, ] <- X_tilde[i, , clusters[i]]
  }

  #-------------------------#
  #    Outlier Detection    #
  #-------------------------#

  if (model == 't') {
    delta <- matrix(NA_real_, nrow = n, ncol = G)

    for (g in 1:G) {
      delta[, g] <- mahalanobis(X_imputed, center = mu[g, ], cov = Sigma[, , g], tol = 1e-20)
    }

    cluster_matr <- clusters_to_matrix(clusters, G)
    delta        <- rowSums(delta * cluster_matr)
    outliers     <- 1 - pchisq(delta, df = d) < 1 - outlier_cutoff
  }

  if (model == 'C') {
    outliers <- NULL
  }

  #------------------------#
  #  Number of parameters  #
  #------------------------#

  npar <- list(
    pi    = G - 1,
    mu    = G * d,
    Sigma = G * d * (d + 1) / 2,
    df    = G
  )

  if (model == 'C') {
    npar$df <- NULL
  }

  npar$total <- Reduce('+', npar)

  #----------------------------#
  #    Information Criteria    #
  #----------------------------#

  AIC <- -2 * final_loglik + 2 * npar$total
  BIC <- -2 * final_loglik + npar$total * log(n)

  KIC  <- -2 * final_loglik + 3 * (npar$total + 1)
  KICc <- -2 * final_loglik + 2 * (npar$total + 1) * n/(n-npar$total -2) - n * digamma((n-npar$total)/2) + n * log(n/2)

  AIC3 <- -2 * final_loglik + 3 * npar$total
  CAIC <- -2 * final_loglik + npar$total * (1 + log(n))
  AICc <- -2 * final_loglik + 2 * npar$total * n/(n - npar$total - 1)

  ent <- apply(z_tilde, 1, max)
  ICL <- BIC - sum(ent * log(ent))

  AWE <- -2 * (final_loglik + sum(ent * log(ent))) + 2 * npar$total * (3/2 + log(n))
  CLC <- -2 * final_loglik + 2 * sum(ent * log(ent))

  #----------------------#
  #    Prepare Output    #
  #----------------------#

  c_names <- paste('comp', 1:G, sep = '')
  v_names <- colnames(X)

  if (is.null(v_names)) {
    v_names <- 1:d
  }

  names(py)       <- c_names
  rownames(mu)    <- c_names
  colnames(mu)    <- v_names
  dimnames(Sigma) <- list(v_names, v_names, c_names)
  names(df)       <- c_names

  if (G == 1) {
    mu    <- mu[1, ]
    Sigma <- Sigma[, , 1]
  }

  output <- list(
    model         = paste(model, '_incomplete_data', sep = ''),
    pi            = py,
    mu            = mu,
    Sigma         = Sigma,
    df            = df,
    z_tilde       = z_tilde,
    clusters      = clusters,
    outliers      = outliers,
    data          = X_imputed,
    complete      = !is.na(X),
    npar          = npar,
    max_iter      = max_iter,
    iter_stop     = iter,
    final_loglik  = final_loglik,
    loglik        = loglik,
    AIC           = AIC,
    BIC           = BIC,
    KIC           = KIC,
    KICc          = KICc,
    AIC3          = AIC3,
    CAIC          = CAIC,
    AICc          = AICc,
    ent           = ent,
    ICL           = ICL,
    AWE           = AWE,
    CLC           = CLC,
    init_method   = init_method
  )

  if (model == 'C') {
    output$df       <- NULL
    output$outliers <- NULL
  }

  class(output) <- 'MixtureMissing'

  return(output)

}

####################################################################
###                                                              ###
###           Multivariate t Mixture for Complete Data           ###
###                                                              ###
####################################################################

MtM_complete_data <- function(
    X,
    G,
    model          = c('t', 'C'),
    max_iter       = 20,
    epsilon        = 0.01,
    init_method    = c("kmedoids", "kmeans", "hierarchical", "mclust", "manual"),
    clusters       = NULL,
    equal_prop     = FALSE,
    identity_cov   = FALSE,
    outlier_cutoff = 0.95,
    progress       = TRUE
) {

  #----------------------#
  #    Input Checking    #
  #----------------------#

  model <- match.arg(model)

  if (model == 't') {
    if (outlier_cutoff <= 0 | outlier_cutoff >= 1) {
      stop('outlier_cutoff must be in (0, 1)')
    }
  }

  #------------------------------------#
  #    Objects for the EM Algorithm    #
  #------------------------------------#

  n <- nrow(X)
  d <- ncol(X)

  z           <- matrix(NA_real_, nrow = n, ncol = G)
  z_tilde     <- matrix(NA_real_, nrow = n, ncol = G)
  w_tilde     <- matrix(NA_real_, nrow = n, ncol = G)
  zw_tilde    <- matrix(NA_real_, nrow = n, ncol = G)
  Sigma_tilde <- array(NA_real_, dim = c(d, d, n, G))

  py    <- rep(NA_real_, G)
  mu    <- matrix(NA_real_, nrow = G, ncol = d)
  Sigma <- array(NA_real_, dim = c(d, d, G))

  if (model == 't') {
    df <- rep(10, G)
  }

  if (model == 'C') {
    df <- rep(1, G)
  }

  log_dens <- matrix(NA_real_, nrow = n, ncol = G)
  iter     <- 0
  loglik   <- NULL

  #--------------------------------#
  #    Parameter Initialization    #
  #--------------------------------#

  init_method <- match.arg(init_method)

  if (G == 1) {

    max_iter <- 1

    pars <- cluster_pars(X = X, clusters = rep(1, n))

    py    <- 1
    mu    <- pars$mu
    Sigma <- pars$Sigma

  } else {

    init <- initialize_clusters(
      X           = X,
      G           = G,
      init_method = init_method,
      clusters    = clusters
    )

    py    <- init$pi
    mu    <- init$mu
    Sigma <- init$Sigma

  }

  #------------------------#
  #    The EM Algorithm    #
  #------------------------#

  if (progress) {
    cat('\nModel Fitting:\n')
    pb <- txtProgressBar(min = 0, max = max_iter, style = 3, width = 75, char = "=")
  }

  while (iter < max_iter & getall(loglik) > epsilon) {

    #++++ E-step ++++#

    for (g in 1:G) {
      z[, g]       <- mvtnorm::dmvt(X, delta = mu[g, ], sigma = as.matrix(Sigma[, , g]), df = df[g], log = FALSE)
      w_tilde[, g] <- (df[g] + d) / (df[g] + mahalanobis(X, mu[g, ], Sigma[, , g], tol = 1e-20))
    }

    z_tilde <- sweep(z, 2, py, '*')
    z_tilde <- z_tilde / rowSums(z_tilde)

    z_tilde[is.infinite(z_tilde) | is.nan(z_tilde)] <- 1/G

    for (g in 1:G) {
      zw_tilde[, g] <- z_tilde[, g] * w_tilde[, g]
    }

    #++++ M-Step: pi ++++#

    N  <- colSums(z_tilde)
    py <- N / n

    #++++ M-Step: mu ++++#

    for (g in 1:G) {
      mu_num  <- colSums(z_tilde[, g] * w_tilde[, g] * X)
      mu_den  <- sum(z_tilde[, g] * w_tilde[, g])
      mu[g, ] <- mu_num / mu_den
    }

    #++++ M-Step: Prepare Sigma tilde ++++#

    for (g in 1:G) {
      X_centered           <- sweep(X, 2, mu[g, ])
      X_centered_crossprod <- apply(X_centered, 1, tcrossprod)
      Sigma_tilde[, , , g] <- array(X_centered_crossprod, dim = c(d, d, n))
    }

    for (g in 1:G) {

      #++++ M-Step: Sigma ++++#

      slc_ind      <- slice.index(Sigma_tilde[, , , g, drop = FALSE], 3)
      Sigma_num    <- rowSums(zw_tilde[slc_ind, g] * Sigma_tilde[, , , g, drop = FALSE], dims = 2)
      Sigma[, , g] <- Sigma_num / N[g]

      if (max(abs(Sigma[, , g] - t(Sigma[, , g]))) > .Machine$double.eps) {
        matr <- Sigma[, , g]
        matr[lower.tri(matr)] <- t(matr)[lower.tri(t(matr))]
        Sigma[, , g] <- matr
      }

      #++++ M-step: Degree of Freedom ++++#

      if (model == 't') {
        root <- tryCatch({

          uniroot(function(a) {

            A <- -digamma(a / 2) + log(a / 2) + 1
            B <-  sum(z_tilde[, g] * (log(w_tilde[, g]) - w_tilde[, g])) / N[g]
            C <-  digamma((df[g] + d) / 2) - log((df[g] + d) / 2)

            return(A + B + C)

          }, lower = 2, upper = 200)$root

        }, error = function(e) return(NULL))

        df[g] <- ifelse(!is.null(root), root, df[g])
      }

      if (model == 'C') {
        df[g] <- 1
      }

    }

    #++++ Observed Log-likelihood ++++#

    for (g in 1:G) {
      log_dens[, g] <- mvtnorm::dmvt(X, delta = mu[g, ], sigma = as.matrix(Sigma[, , g]), df = df[g], log = TRUE)
    }

    log_py_dens  <- sweep(log_dens, 2, log(py), FUN = '+')
    final_loglik <- sum( apply(log_py_dens, 1, log_sum_exp) )
    loglik       <- c(loglik, final_loglik)

    #++++ Update progress ++++#

    iter <- iter + 1

    if (progress) {
      setTxtProgressBar(pb, iter)
    }

  }

  if (progress) {
    close(pb)
    if (iter < max_iter) {
      cat('\nConvergence was reached before', max_iter, 'iterations\n')
    }
  }

  #---------------------------#
  #    Cluster Memberships    #
  #---------------------------#

  clusters <- apply(z_tilde, 1, which.max)

  #-------------------------#
  #    Outlier Detection    #
  #-------------------------#

  if (model == 't') {
    delta <- matrix(NA_real_, nrow = n, ncol = G)

    for (g in 1:G) {
      delta[, g] <- mahalanobis(X, center = mu[g, ], cov = Sigma[, , g], tol = 1e-20)
    }

    cluster_matr <- clusters_to_matrix(clusters, G)
    delta        <- rowSums(delta * cluster_matr)
    outliers     <- 1 - pchisq(delta, df = d) < 1 - outlier_cutoff
  }

  if (model == 'C') {
    outliers <- NULL
  }

  #------------------------#
  #  Number of parameters  #
  #------------------------#

  npar <- list(
    pi    = G - 1,
    mu    = G * d,
    Sigma = G * d * (d + 1) / 2,
    df    = G
  )

  if (model == 'C') {
    npar$df <- NULL
  }

  npar$total <- Reduce('+', npar)

  #----------------------------#
  #    Information Criteria    #
  #----------------------------#

  AIC <- -2 * final_loglik + 2 * npar$total
  BIC <- -2 * final_loglik + npar$total * log(n)

  KIC  <- -2 * final_loglik + 3 * (npar$total + 1)
  KICc <- -2 * final_loglik + 2 * (npar$total + 1) * n/(n-npar$total -2) - n * digamma((n-npar$total)/2) + n * log(n/2)

  AIC3 <- -2 * final_loglik + 3 * npar$total
  CAIC <- -2 * final_loglik + npar$total * (1 + log(n))
  AICc <- -2 * final_loglik + 2 * npar$total * n/(n - npar$total - 1)

  ent <- apply(z_tilde, 1, max)
  ICL <- BIC - sum(ent * log(ent))

  AWE <- -2 * (final_loglik + sum(ent * log(ent))) + 2 * npar$total * (3/2 + log(n))
  CLC <- -2 * final_loglik + 2 * sum(ent * log(ent))

  #----------------------#
  #    Prepare Output    #
  #----------------------#

  c_names <- paste('comp', 1:G, sep = '')
  v_names <- colnames(X)

  if (is.null(v_names)) {
    v_names <- 1:d
  }

  names(py)       <- c_names
  rownames(mu)    <- c_names
  colnames(mu)    <- v_names
  dimnames(Sigma) <- list(v_names, v_names, c_names)
  names(df)       <- c_names

  if (G == 1) {
    mu    <- mu[1, ]
    Sigma <- Sigma[, , 1]
  }

  output <- list(
    model         = paste(model, '_complete_data', sep = ''),
    pi            = py,
    mu            = mu,
    Sigma         = Sigma,
    df            = df,
    z_tilde       = z_tilde,
    clusters      = clusters,
    outliers      = outliers,
    data          = X,
    complete      = !is.na(X),
    npar          = npar,
    max_iter      = max_iter,
    iter_stop     = iter,
    final_loglik  = final_loglik,
    loglik        = loglik,
    AIC           = AIC,
    BIC           = BIC,
    KIC           = KIC,
    KICc          = KICc,
    AIC3          = AIC3,
    CAIC          = CAIC,
    AICc          = AICc,
    ent           = ent,
    ICL           = ICL,
    AWE           = AWE,
    CLC           = CLC,
    init_method   = init_method
  )

  if (model == 'C') {
    output$df       <- NULL
    output$outliers <- NULL
  }

  class(output) <- 'MixtureMissing'

  return(output)

}

Try the MixtureMissing package in your browser

Any scripts or data that you put into this service are public.

MixtureMissing documentation built on Oct. 16, 2024, 1:09 a.m.