R/mixtureSEM.R
In Rankcluster: Model-Based Clustering for Multivariate Partial Ranking Data

# mixtureSEM
# @title algorithm SEM for
# @author Grimonprez Quentin
# @param X matrix where each row is a rank and the last column contains the frequencies
# @param g number of groups
# @param m a vector with the size of ranks for each dimension
# @param maxIt the maximum number of iteration of the algorithm

# @param Qsem the total number of iterations for the SEM algorithm (default value=40)
# @param Bsem burn-in period for SEM algorithm (default value=10)
# @param RjSE a vector containing the number of iteration for each dimension of the Gibbs algorithm in the SE step for
# generating partial ranks and orders of presentation(only for SEM algorithm, default value=m(m-1)/2)
# @param RjM a vector containing the number of iterations for each dimension for the Gibbs Sampler in the M step
# (only for SEM algorithm, default value=m(m-1)/2)
# @param Ql number of iterations of the Gibbs sampler for estimation of log-likelihood
# (only for SEM algorithm, default value=100)
# @param Bl burn-in period for estimation of log-likelihood (only for SEM algorithm, default value=50)
# @param detail boolean, if TRUE, time and others information will be print during the process (default value FALSE)

# @return an object containing the reference rank mu, the probability pi of a correct comparison, proportion,
# conditional probability of belonging to each cluster (tik), the loglikelihood, the partition, the BIC and the ICL
# @references "Model-based clustering for multivariate partial ranking data", J. Jacques, C. Biernacki
# @examples
# data(APA)#m=5
# mixtureSEM(APA,2)
# @export

mixtureSEM <- function(X, g, m, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail) {
  n <- nrow(X)
  d <- length(m)
  if (ncol(X) != sum(m)) {
    stop(paste0("the number of column of X (", ncol(X), ") does not match to the sum of vector m (", sum(m), ")."))
  }


  # Verification des donnees
  for (i in 1:d) {
    check <- apply(X[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkTiePartialRank, m[i])
    if (sum(check) != n) {
      indfalse <- which(check == 0)
      stop(paste("Data are not correct.\n", "For dimension", i, ", ranks at row", indfalse, "are not correct."))
    }
  }

  res <- .Call("semR", X, m, g, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail, PACKAGE = "Rankcluster")
  if (res$stock[1] == 2) {
    res$indexPb <- lapply(res$indexPb, unique)
    for (i in 1:d) {
      if (length(res$indexPb) != 0) {
        cat(paste0("For dimension ", i, ", rankings at the following index have format problem:\n"))
        cat(res$indexPb[[i]])
      }
    }
    stop(
      "Problem with your data.\n The ranks have to be given in the ranking notation (see convertRank function), with the following convention:
- missing positions are replaced by 0
- tied are replaced by the lowest position they share\n")
  }


  # recuperation des resultats
  if (res$stock[1] == 1) { # si convergence
      res$referenceRank <- tliste3d2mat(res$referenceRank)
      res$initMu <- tliste3d2mat(res$initMu)
      res$p <- liste2d2matgd(res$p)
      res$initPi <- liste2d2matgd(res$initPi)
      res$cluster <- res$cluster + 1

      res$entropy <- cbind(res$entropy, res$cluster)
      res$probability <- cbind(res$probability, res$cluster)
      colnames(res$entropy) <- c("entropy", "cluster")
      colnames(res$probability) <- c("probability", "cluster")

      res$distMu <- liste3d2listematgd(res$distMu)
      res$distP <- liste3d2listematgd(res$distP)

      ### rank conversion from ordering to ranking
      indM <- c(0, cumsum(m))

      for (i in seq_along(m)){
        # res$initMu
        res$initMu[, (indM[i] + 1):indM[i + 1]] <- t(
          apply(res$initMu[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)
        )

        # res$referenceRank
        res$referenceRank[, (indM[i] + 1):indM[i + 1]] <- t(
          apply(res$referenceRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)
        )
      }


      if (res$stock[2] == 1) { # si il y a des donnees partielles
          res$partialRank <- tliste3d2mat(res$partialRank) ## proba a rajoute

          rownames(res$partialRank) <- rep("", nrow(res$partialRank)) # enlever les cl1...
          # colnames(res$partialRank)[1]="Index"
          # colnames(res$partialRank)[ncol(res$rangPartial)]="Probability"
          res$initPartialRank <- tliste3d2mat(res$initPartialRank)
          res$scorePartial <- tliste3d2mat(res$scorePartial)
          # colnames(res$initPartialRank)[1]="Index"
          rownames(res$initPartialRank) <- rep("", nrow(res$initPartialRank))
          rownames(res$scorePartial) <- rep("", nrow(res$scorePartial))

          ### rank conversion from ordering to ranking
          for (i in seq_along(m)) {
            # res$initPartialRank
            res$initPartialRank[, (indM[i] + 1):indM[i + 1]] <- t(
              apply(res$initPartialRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)
            )

            # res$partialRank
            #res$partialRank[,(indM[i]+1):indM[i+1]]=t(apply(res$partialRank[,(indM[i]+1):indM[i+1],drop=FALSE],1,convertRank))
            for (j in 1:n) {
              ordtemp <- order(res$partialRank[j, (indM[i] + 1):indM[i + 1]])
              res$partialRank[j, (indM[i] + 1):indM[i + 1]] <- ordtemp
              res$scorePartial[j, (indM[i] + 1):indM[i + 1]] <- res$scorePartial[j, ((indM[i] + 1):indM[i + 1])][ordtemp]
            }
          }


          res$distPartialRank <- lapply(res$distPartialRank, FUN = function(x) {
            listedistPartiel(x)
          })

          result <- new(
            Class = "Output",
            bic = res$stock[4],
            icl = res$stock[5],
            ll = res$stock[3],
            proportion = res$proportion,
            pi = res$p,
            mu = res$referenceRank,
            tik = res$tik,
            partition = res$cluster,
            entropy = res$entropy,
            probability = res$probability,
            convergence = TRUE,
            partial = TRUE,
            partialRank = res$partialRank,
            distanceZ = res$distZ,
            distanceMu = res$distMu,
            distanceProp = res$distProp,
            distancePi = res$distP,
            distancePartialRank = res$distPartialRank,
            piInitial = res$initPi,
            muInitial = res$initMu,
            partialRankInitial = res$initPartialRank,
            proportionInitial = res$initProportion,
            partialRankScore = res$scorePartial
          )
        } else {
        result <- new(
          Class = "Output",
          bic = res$stock[4],
          icl = res$stock[5],
          ll = res$stock[3],
          proportion = res$proportion,
          pi = res$p,
          mu = res$referenceRank,
          tik = res$tik,
          partition = res$cluster,
          entropy = res$entropy,
          probability = res$probability,
          convergence = TRUE,
          partial = FALSE,
          distanceZ = res$distZ,
          distanceMu = res$distMu,
          distanceProp = res$distProp,
          distancePi = res$distP,
          piInitial = res$initPi,
          muInitial = res$initMu,
          proportionInitial = res$initProportion
        )
      }

      if (detail) {
        cat("RESULTS:\n")
        cat("NUMBER OF CLUSTERS: ", g)
        cat("\nLoglikelihood =", res$stock[3])
        cat("\nBIC=", res$stock[4])
        cat("\nICL=", res$stock[5])
        cat("\nProportion:", res$proportion)
        cat("\nProbabilities pi:\n")
        print(res$p)
        cat("\nReference ranks mu:\n")
        print(res$referenceRank)
      }
    } else {
    result <- new(Class = "Output", convergence = FALSE)
  }

  return(result)
}