R/test_indep_clust.R

Defines functions test_indep_clust

Documented in test_indep_clust

#' Pseudo likelihood ratio test for dependent clusterings
#'
#'
#' Implements the pseudo likelihood ratio test described in Section 3 of
#' Gao et. al. (2019) "Are Clusterings of Multiple Data Views Independent?"
#' for testing for dependence between clusterings of two data views.
#' Fits Gaussian mixture models in each view.
#'
#' @param x Multi-view data with two views; a list of two numeric vectors
#'  (in the case of univariate data) or matrices containing the two data views.
#'  In matrix format, rows correspond to observations and columns correspond to variables.
#' @param B An integer specifying the number of permutations to
#' use for the permutation procedure. The default number is 200.
#' @param model1 A character string indicating the model
#' to be fitted for Gaussian model-based clustering in view 1 using
#' the function \code{\link[mclust]{Mclust}}. The default is \code{"EII"}
#' (spherical, equal volume).
#' The help file for \code{\link[mclust]{mclustModelNames}} describes
#' the available model options.
#' @param model2 A character string indicating indicating the model
#' to be fitted for Gaussian model-based clustering in view 1 using
#' the function \code{\link[mclust]{Mclust}}. The default is \code{"EII"}
#' (spherical, equal volume). The help file for
#'  \code{\link[mclust]{mclustModelNames}} describes the available model options.
#' @param step A numeric value containing the fixed step size to be used in the optimization
#' algorithm for estimating Pi. The default step size is 0.001. See Supplement C of
#'  "Are Clusterings of Multiple Data Views Independent?" for details.
#' @param maxiter A numeric value containing the maximum number of iterations to run in
#'  the optimization algorithm. The default maximum is 1000.
#' @param init1 An optional argument containing the model to be fitted in the
#' hierarchical clustering initialization in Gaussian model-based clustering
#' in view 1. The default is \code{"VVV"} (ellipsoidal, varying volume,
#'  shape, and orientation).
#' The help file for \code{\link[mclust]{hc}} describes
#' the available model options.
#' @param init2 An optional argument containing the model to be fitted in the
#' hierarchical clustering initialization in Gaussian model-based clustering
#' in view 2. The default is \code{"VVV"} (ellipsoidal, varying volume,
#'  shape, and orientation).
#' The help file for \code{\link[mclust]{hc}} describes
#' the available model options.
#' @param K1 An optional argument containing the number of clusters in View 1.
#'  If left out, then the number of clusters is chosen with BIC as described in
#'  Section 2.3.3 of "Are Clusterings of Multiple Data Views Independent?"
#' @param K2 An optional argument containing the number of clusters in View 2.
#'  If left out, then the number of clusters is chosen with BIC as described in
#'  Section 2.3.3 of "Are Clusterings of Multiple Data Views Independent?"
#' @import mclust
#' @export
#' @return
#' A list containing the following output components:
#' \item{K1}{The number of clusters in view 1}
#' \item{K2}{The number of clusters in view 2}
#' \item{Pi.est}{The estimated Pi matrix}
#' \item{PLRstat}{The pseudo likelihood ratio test statistic}
#' \item{pval}{The p-value}
#' \item{modelfit1}{The object of class '\code{Mclust}' corresponding to the model-based clustering fitted
#' in View 1; contains eg. estimated parameters and cluster assignments. The help file for \code{\link[mclust]{Mclust}} describes the components of the
#' object.}
#' \item{modelfit2}{The object of class '\code{Mclust}' corresponding to the model-based clustering fitted
#' in View 2; contains eg. estimated parameters and cluster assignments.
#' The help file for \code{\link[mclust]{Mclust}} describes the components of the
#' object.}
#' @examples
#' set.seed(1)
#' n <- 50
#' sig <- 2
#' p <- 2
#' K <- 3
#' mu1 <- cbind(c(2, 0), c(0, 2),  c(2, -2), c(-2, 0), c(0, -2), c(-2, 2))
#' mu2 <- cbind(c(-2, 0), c(0, -2), c(-2, 2), c(2, 0), c(0, 2), c(2, -2))
#' # Generates two-view data where the clusters are independent.
#' x1 <- list(matrix(sig* rnorm(n*p), n, p) + t(mu1)[sample(1:K, n, replace=TRUE), ],
#'          matrix(sig * rnorm(n*p), n, p) + t(mu2)[sample(1:K, n, replace=TRUE), ])
#'
#' # Generate two-view data where the clusters are identical.
#' n <- 70
#' cl <- sample(1:K, n, replace=TRUE)
#' x2 <- list(matrix(sig* rnorm(n*p), n, p) + t(mu1)[cl, ],
#'          matrix(sig * rnorm(n*p), n, p) + t(mu2)[cl, ])
#'
#' # Run the function on independent data views; we do not reject the null hypothesis.
#'  # By default, not specifying K1 and K2 means the number of clusters
#'  # to use in the test in each view is chosen via BIC.
#'  # Covariance matrix model specified is shared sigma^2 I covariance matrix in view 1
#'  # and shared diagonal covariance matrix in view 2.
#'  # B specifies the number of permutations to do for the permutation test.
#'  # Covariance matrix model specified for initialization
#'  # is shared sigma^2 I covariance matrix in view 1
#' indep1 <- test_indep_clust(x1,model1="EII", model2="EEI",
#' init1="EII", B=52)
#' # The estimated cluster parameters in view 1
#' indep1$modelfit1$parameters
#' # The cluster assignments in view 2
#' indep1$modelfit2$classification
#'
#' # Run the function on identical data views; we reject the null hypothesis
#' # We specify the number of clusters in each view to use in the test.
#' # Covariance matrix model specified is shared covariance matrix in view 1
#' # and shared diagonal covariance matrix in view 2.
#' # See mclust documentation for more covariance model specification options.
#' identical2 <- test_indep_clust(x2,model1="EEE", model2="EEI", K1=2, K2=3, B=51)
#' # P-value
#' identical2$pval
#'
#' @references
#' Scrucca L., Fop M., Murphy T. B. and Raftery A. E. (2016)
#' mclust 5: clustering, classification and density estimation
#' using Gaussian finite mixture models, The R Journal, 8/1, pp. 205-233.
#'
#' Fraley C. and Raftery A. E. (2002) Model-based clustering,
#' discriminant analysis and density estimation,
#' Journal of the American Statistical Association, 97/458, pp. 611-631.
#'
#' Gao, L.L., Bien, J., Witten, D. (2019) Are Clusterings of Multiple Data Views Independent?
#' Biostatistics <DOI:10.1093/biostatistics/kxz001>.
#'
test_indep_clust <- function(x, model1="EII", model2="EII",
                             K1=NULL, K2=NULL,
                             init1=NULL, init2=NULL,
                             B=200, step=0.001, maxiter=1000) {

  if(typeof(x) != "list") {
    stop("x is not in the right format; should be a list of two matrices
         (or vectors, for univariate data)")
  } else {
    if(length(x) != 2) {
      stop("x is not in the right format; should be a list of two numeric matrices
         (or vectors, for univariate data)")
    }  else {
      if(!is.numeric(x[[1]]) || !is.numeric(x[[2]])) {
        stop("x is not in the right format; should be a list of two numeric matrices
         (or vectors, for univariate data)")
      } else {
        if(is.matrix(x[[1]]) != is.matrix(x[[2]])) {
          stop("x is not in the right format; should be a list of two numeric matrices
         (or vectors, for univariate data)")
        } else {
          val1 <- ifelse(is.matrix(x[[1]]), nrow(x[[1]]), length(x[[1]]))
          val2 <- ifelse(is.matrix(x[[2]]), nrow(x[[2]]), length(x[[2]]))
          if(val1 != val2){
            stop("view 1 should contain the same number of observations as view 2")
          }
        }
      }
    }
  }

  if(any(is.na(x[[1]])) || any(is.na(x[[2]])))
    stop(paste("views cannot contain any missing values"))

  if(!model1 %in% c("E", "V", "EII", "VII", "EEI", "VEI", "EVI", "VVI", "EEE",
                    "EVE", "VEE", "VVE", "EEV", "VEV", "EVV", "VVV"))

  if(model1 %in% c("E", "V") && is.matrix(x[[1]]))
    stop(paste(model1, "is a univariate model for Mclust, cannot be used for
         multivariate data") )

  if(model1 %in% c("EII", "VII", "EEI", "VEI", "EVI", "VVI", "EEE",
                   "EVE", "VEE", "VVE", "EEV", "VEV", "EVV", "VVV") && !is.matrix(x[[1]]))
    stop(paste(model1, "is a multivariate model for Mclust, cannot be used for
               univariate data"))

  if(!model2 %in% c("E", "V", mclust.options("emModelNames")))
    stop(paste("Did not pass a valid model for Mclust in View 2") )

  if(model2 %in% c("E", "V") && is.matrix(x[[2]]))
    stop(paste(model2, "is a univariate model for Mclust, cannot be used for
               multivariate data"))
  if(model2 %in% c("EII", "VII", "EEI", "VEI", "EVI", "VVI", "EEE",
                   "EVE", "VEE", "VVE", "EEV", "VEV", "EVV", "VVV") && !is.matrix(x[[2]]))
    stop(paste(model2, "is a multivariate model for Mclust, cannot be used for
               univariate data") )


  if(is.null(init1)) {
    init1 <- "VVV"
  } else {
    if(!init1 %in% c("E", "V", "VVV", "EEE", "VII", "EII"))
      stop(paste("Did not pass a valid initialization model for Mclust in View 1") )

    if(init1 %in% c("E", "V") && is.matrix(x[[1]]))
      stop(paste(init1, "is a univariate initialization model for Mclust,
                 cannot be used for multivariate data") )

    if(init1 %in% c("VVV", "EEE", "VII", "EII") && !is.matrix(x[[1]]))
      stop(paste(init1, "is a multivariate initialization model for Mclust,
                 cannot be used for univariate data"))

    agree1 <- ((init1 %in% c("E", "V")) == (model1 %in% c("E", "V")))
    if(!agree1) {
      stop("In view 1, initialization model is for univariate data,
          but model is for multivariate data, or
        initialization model is for multivariate data,
           but model is for univariate data")
    }
  }

  if(is.null(init2)) {
    init2 <- "VVV"
  } else {
    if(!init2 %in% c("E", "V", "VVV", "EEE", "VII", "EII"))
      stop(paste("Did not pass a valid initialization model for Mclust in View 2") )

    if(init2 %in% c("E", "V") && is.matrix(x[[2]]))
      stop(paste(init2, "is a univariate initialization model for Mclust,
                 cannot be used for multivariate data"))
    if(init2 %in% c("VVV", "EEE", "VII", "EII") && !is.matrix(x[[2]]))
      stop(paste(init2, "is a univariate initialization model for Mclust,
                 cannot be used for univariate data") )
    agree2 <- ((init2 %in% c("E", "V")) == (model2 %in% c("E", "V")))
    if(!agree2) {
      stop("In view 2, initialization model is for univariate data,
          but model is for multivariate data, or
           initialization model is for multivariate data,
           but model is for univariate data")
    }
  }


  if(step <=0 ) stop('Step size for optimization algorithm should be > 0')
  if(B < 1) stop('Number of permutations should be > 0')
  if(B <= 50) warning('Number of permutation iterations is specified to be small;
                     p-value approximation will likely be imprecise')
  if(maxiter %% 1 != 0|| maxiter <= 0) stop('Maximum number of iterations should be
                                                a positive integer')

  # Model-based clustering of each view
  if(is.null(K1)) {
    EM.View1 <- mclust::Mclust(x[[1]], G=2:9, modelNames=c(model1),
                               initialization=list(hcPairs=hc(x[[1]],
                                modelName=init1)))
    if(is.null(EM.View1)) stop("The model specified for model-based clustering
                               could not be fitted in view 1")
    K1 <- EM.View1$G
  } else {
    if(!is.numeric(K1)) stop('Number of clusters must be a positive integer in View 1')
    if(K1 %% 1 != 0 || K1 < 0) stop('Number of clusters must be a positive integer in View 1')
    if(K1 < 1) stop('Number of clusters must be greater than 1 in View 1')


    EM.View1 <- mclust::Mclust(x[[1]], K1, modelNames=c(model1),
                               initialization=list(hcPairs=hc(x[[1]],
                                          modelName=init1)))
    if(is.null(EM.View1)) stop("The model specified for model-based clustering
                               could not be fitted in view 1")
  }

  if(is.null(K2)) {
    EM.View2 <- mclust::Mclust(x[[2]], G=2:9, modelNames=c(model2),
                               initialization=list(hcPairs=hc(x[[2]],
                                       modelName=init2)))
    if(is.null(EM.View2)) stop("The model specified for model-based clustering
                               could not be fitted in view 1")
    K2 <- EM.View2$G
  } else {
    if(!is.numeric(K2)) stop('Number of clusters must be a positive integer in View 2')
    if(K2 %% 1 != 0 || K2 < 0) stop('Number of clusters must be a positive integer in View 2')
    if(K2 < 1) stop('Number of clusters must be greater than 1 in View 2')
    EM.View2 <- mclust::Mclust(x[[2]], K2, modelNames=c(model2),
                               initialization=list(hcPairs=hc(x[[2]],
                                        modelName=init2)))
    if(is.null(EM.View2)) stop("The model specified for model-based clustering
                               could not be fitted in view 2")
  }


  EM.View1.param <- EM.View1$parameters
  EM.View2.param <- EM.View2$parameters

  # Density matrices for each mixture component
  logphi1 <-  mclust::cdens(model1, x[[1]], logarithm=TRUE, EM.View1.param)
  logphi2 <- mclust::cdens(model2, x[[2]], logarithm=TRUE, EM.View2.param)

  n <- nrow(logphi1)
  pi1.est <- EM.View1.param$pro
  pi2.est <- EM.View2.param$pro
  # Optimization to estimate Pi
  Pi.est <- optimize_over_Pi(logphi1, logphi2, pi1.est, pi2.est,
                             max.iter=maxiter, stepsz=step)

  # Compute pseudo likelihood ratio test statistic
  nullLogLik <- EM.View1$loglik + EM.View2$loglik
  log.Lambda = -Pi.est$obj - nullLogLik
  PLRstat <- 2*log.Lambda

  cat("Computing p-value: ")
  # Permutation procedure
  PLRstat.perm <- rep(0, B)
  for(b in 1:B) {

    if(b %% 10^round(log10(B) - 1) == 0) {
      cat(round(b/B*100))
      cat("% of permutations done ... ")
    }

    if(b == 1001) {
      earlyPval <- mean(ifelse(PLRstat.perm[1:1000] >= PLRstat, 1, 0))
      if(earlyPval >= 0.1) {
        PLRstat.perm <- PLRstat.perm[1:1000]
        cat("stopped permuting at 1000 permutations (calculated permutation p-value >= 0.1)")
        cat("\n")
        cat("\n")
        break
      }
    }
    Pi.est.perm <- optimize_over_Pi(logphi1, logphi2[sample(1:n, replace=F), ],
                                    pi1.est, pi2.est, max.iter=maxiter, stepsz=step)
    log.Lambda.perm <- -Pi.est.perm$obj - nullLogLik
    PLRstat.perm[b] <- 2*log.Lambda.perm
  }
  cat("\n")
  cat("\n")

  pval.perm <- mean(ifelse(PLRstat.perm >= PLRstat, 1, 0))

  # Likelihood Ratio statistic
  return(list(PLRstat=PLRstat,
              Pi.est=Pi.est$Pi,
              K1=K1,
              K2=K2,
              pval=pval.perm,
              modelfit1=EM.View1,
              modelfit2=EM.View2))
}

Try the multiviewtest package in your browser

Any scripts or data that you put into this service are public.

multiviewtest documentation built on Feb. 28, 2019, 5:05 p.m.