R/MvBinaryEstim.R

##' MvBinary a package for Multivariate Binary data
##'
##' MvBinary is a tool for fitting the distribution of correlated multivariate binary data.
##'
##' \tabular{ll}{
##'   Package: \tab MvBinary\cr 
##'   Type: \tab Package\cr 
##'   Version: \tab 1.0.0\cr
##'   Date: \tab 2015-11-03\cr 
##'   License: \tab GPL-2\cr 
##'   LazyLoad: \tab yes\cr
##' }
##'
##'
##' @name MvBinary-package
##' @aliases MvBinary
##' @rdname MvBinary-package
##' @docType package
##' @keywords package
##' @import methods
##' @import mgcv
##' @import parallel
##' @import graphics
##' @export MvBinaryEstim
##' @export MvBinaryProbaPost
##' @export ComputeEmpiricCramer
##' @export ComputeMvBinaryCramer
##' @exportMethod print
##' @exportMethod summary
##' @exportClass MvBinaryResult
##' @importFrom stats as.dist cutree hclust optimize runif
##'
##' @author
##' Authors: Marbac M. and Sedki M.
##'
##' @references Matthieu Marbac, Mohammed Sedki (2015). A Family of Blockwise One-Factor Distributions for Modelling High-Dimensional Binary Data. arXiv:1511.01343
##'
##' @examples
##' # Package loading
##' rm(list=ls())
##' require(MvBinary)
##' 
##' # Data loading
##' data(MvBinaryExample)
##' 
##' # Parameter estimation by the HAC-based algorithm on 2 cores
##' # where the EM algorithms are initialized 10 times
##' res.CAH <- MvBinaryEstim(MvBinaryExample, 2, nbinit.EM = 10)
##' 
##' # Summary of the estimated model
##' summary(res.CAH) 
##' 
##' # Print the parameters of the estimated model
##' print(res.CAH) 
NULL

##' Real binary data: Plants
##' 
##' The file plants.rda describes 35583 plants by indicating whether they occur (1) or not (0) in 69 states of North America.
##'
##' This data set has been extracted from the USA plants database (July 29, 2015).
##'
##' @format A binary matrix with 35583 observations (plants) on 69 variables (states).
##'
##' 
##'
##'
##' @name plants
##' @docType data
##' @keywords datasets
##' 
##' @examples
##'   data(plants)
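##'
##'   # Illustrative extra step (not required): the empirical Cramer's V between
##'   # the first five states, computed with ComputeEmpiricCramer from this package
##'   ComputeEmpiricCramer(plants[, 1:5])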
NULL


##' Simulated binary data: MvBinaryExample
##' 
##' The file MvBinaryExample.rda describes 400 individuals by 6 binary variables.
##'
##' This data set has been simulated from the MvBinary model. The first three variables are mutually dependent, as are the last three variables.
##'
##' @format A binary matrix with 400 observations on 6 variables.
##'
##' 
##'
##'
##' @name MvBinaryExample
##' @docType data
##' @keywords datasets
##' 
##' @examples
##'   data(MvBinaryExample)
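##'
##'   # Illustrative extra step (not required): the empirical Cramer's V matrix,
##'   # computed with ComputeEmpiricCramer from this package, reflects the two
##'   # blocks of dependent variables
##'   ComputeEmpiricCramer(MvBinaryExample)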
NULL




###################################################################################
##' Create an instance of the [\code{\linkS4class{MvBinaryResult}}] class
##'
##' This function performs the model selection and the parameter inference.
##' 
##' @param x binary data matrix (one row per observation, one column per variable).
##' @param nbcores number of cores used for the model selection (parallelization is not available on Windows). Default is 1.
##' @param algorithm algorithm used for the model selection ("HAC": deterministic algorithm based on a hierarchical clustering of the variables; "MH": stochastic algorithm optimizing the BIC criterion; "List": comparison of the models provided by the user). Default is "HAC".
##' @param modelslist list of models provided by the user, each model being a vector of size d giving the partition of the variables (only used when algorithm="List"). Default is NULL.
##' @param tol.EM stopping criterion for the EM algorithm. Default is 0.01
##' @param nbinit.EM number of random initializations for the EM algorithm. Default is 40.
##' @param nbiter.MH number of successive iterations without improvement of the BIC criterion after which the Metropolis-Hastings algorithm stops (only used when algorithm="MH"). Default is 50.
##' @param nbchains.MH number of random initializations (chains) of the stochastic algorithm (only used when algorithm="MH"). Default is 10.
##'
##' @examples
##' # Data loading
##' data(MvBinaryExample)
##' 
##' # Parameter estimation by the HAC-based algorithm on 2 cores
##' # where the EM algorithms are initialized 10 times
##' res.CAH <- MvBinaryEstim(MvBinaryExample, 2, nbinit.EM = 10)
##' 
##' # Parameter estimation for two competing models
##' res.CAH <- MvBinaryEstim(MvBinaryExample, algorithm="List",
##'  modelslist=list(c(1,1,2,2,3,4), c(1,1,1,2,2,2)), nbinit.EM = 10)
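##' 
##' # Parameter estimation by the stochastic MH-based algorithm
##' # (small numbers of chains and iterations are chosen here only to keep the example quick)
##' res.MH <- MvBinaryEstim(MvBinaryExample, algorithm="MH",
##'  nbinit.EM = 10, nbiter.MH = 20, nbchains.MH = 2)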
##' 
##' # Summary of the estimated model
##' summary(res.CAH) 
##' 
##' # Print the parameters of the estimated model
##' print(res.CAH) 
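##' 
##' # The slots of the returned MvBinaryResult object can also be read directly,
##' # for instance the block memberships of the variables and the BIC value
##' res.CAH@blocks
##' res.CAH@bic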
##'
##' @return Returns an instance of the [\code{\linkS4class{MvBinaryResult}}] class. 
##' @export
##'
##'
MvBinaryEstim <- function(x, nbcores=1, algorithm="HAC", modelslist=NULL, tol.EM=0.01, nbinit.EM=40, nbiter.MH=50, nbchains.MH=10){
  if ( (is.matrix(x)==FALSE) || any((x==0) + (x==1) == 0) )    stop("The input parameter x must be a binary matrix")
  if (is.null(colnames(x)))     colnames(x) <- paste("x",1:ncol(x), sep="")  
  if ((is.numeric(nbcores)==FALSE) || (length(nbcores)!=1) || (nbcores!=ceiling(nbcores)) ) stop("The input parameter nbcores must be an integer")
  if (algorithm %in% c("HAC", "HAC2", "MH", "List") == FALSE)    stop("The input parameter algorithm must take one of these values: HAC, HAC2, MH, List")
  if (is.null(modelslist)==FALSE){
    if (is.list(modelslist)){  lapply(1:length(modelslist), function(j) if (length(modelslist[[j]])!=ncol(x))  stop("The input parameter modelslist must be a list where each element gives the partition of the variables by a vector of size d"))
    }else{
      stop("The input parameter modelslist must be a list where each element gives the partition of the variables by a vector of size d")
    }
  }
  if ((is.numeric(tol.EM)==FALSE) || (length(tol.EM)!=1))    stop("The input parameter tol.EM must be a numeric value")
  if ((is.numeric(nbinit.EM)==FALSE) || (length(nbinit.EM)!=1) || (nbinit.EM!=ceiling(nbinit.EM)) )    stop("The input parameter nbinit.EM must be an integer")
  if ((algorithm=="List") && is.null(modelslist))   stop("The input parameter modelslist must be provided when algorithm is List")
  output <- NULL
  if (algorithm=="MH")    output <- MvBinaryEstimMH(x, nbcores, tol.EM, nbinit.EM, nbiter.MH, nbchains.MH)
  if (algorithm=="HAC")   output <- MvBinaryEstimCAH(x, nbcores, tol.EM, nbinit.EM, modelslist)
  if (algorithm=="HAC2")  output <- MvBinaryEstimCAH2(x, nbcores, tol.EM, nbinit.EM, modelslist)
  # "List": the user-provided models are compared with the same EM-based routine
  if (algorithm=="List")  output <- MvBinaryEstimCAH(x, nbcores, tol.EM, nbinit.EM, modelslist)
  return(output)
}

# Model selection based on a hierarchical clustering (HAC) of the variables: the candidate
# models are the nested partitions obtained by cutting the dendrogram built on the
# dissimilarity 1 - empirical Cramer's V, and each candidate is fitted by EM
MvBinaryEstimCAH <- function(x, nbcores=1, tol=0.01, nbinit.EM=40, models=NULL){
  alpha <- colMeans(x)
  if (is.null(models)){
    tree <- hclust(as.dist(1-ComputeEmpiricCramer(x)), method="ward.D")
    models <- lapply(1:ncol(x), function(k) cutree(tree, k))
  }
  # Inference for the competing models
  if ((nbcores>1)&&(Sys.info()["sysname"] != "Windows")){
    reference <- mclapply(X = models,
                          FUN = XEMmodel,
                          dataset=x,
                          alpha=alpha,
                          tol=tol,
                          nbinit.EM=nbinit.EM,
                          mc.cores = min(detectCores(all.tests = FALSE, logical = FALSE), nbcores),
                          mc.preschedule = TRUE, mc.cleanup = TRUE)
  }else{
    reference <- lapply(1:length(models), function(loc) XEMmodel(x, alpha, tol, nbinit.EM, models[[loc]]))
  }
  # Design outputs: keep the model maximizing the BIC criterion
  allBIC <- unlist(lapply(reference, function(ref) ref$bic))
  Best <- reference[[which.max(allBIC)]]
  names(Best$epsilon) <- names(Best$delta) <- colnames(x)
  return( new("MvBinaryResult", 
              alpha=alpha, 
              epsilon=Best$epsilon, 
              delta=Best$delta, 
              blocks=Best$model,
              nbparam=Best$nbparam,
              loglike=Best$loglike,
              bic=Best$bic)
  )
}


# Number of blocks in the list 'mod' that are equal (as sets of variable indices) to the block 'cand'
is.inlistmodels <- function(cand, mod)  sum(unlist(lapply(1:length(mod), function(b) all(cand %in% mod[[b]]) * all(mod[[b]] %in% cand))))
# Locations in 'mod' of the blocks that are equal (as sets of variable indices) to the block 'cand'
locations.inlistmodels <- function(cand, mod) which(unlist(lapply(1:length(mod), function(b) all(cand %in% mod[[b]]) * all(mod[[b]] %in% cand))) ==1)

# Variant of the HAC-based selection: each distinct block of the hierarchy is fitted only
# once, and the best nested model is selected by summing the per-block BIC values
MvBinaryEstimCAH2 <- function(x, nbcores=1, tol=0.01, nbinit.EM=40, models=NULL){
  alpha <- colMeans(x)
  if (is.null(models)){
    tree <- hclust(as.dist(1-ComputeEmpiricCramer(x)), method="ward.D")
    models <- lapply(1:ncol(x), function(k) cutree(tree, k))
  }
  # Collect every distinct block appearing in the nested models: start from the finest
  # partition (singletons) and add, for each coarser model, the block not already listed
  mod <- as.list(models[[length(models)]])
  for (u in (length(mod)-1):1){
    add <- unlist(lapply(1:max(models[[u]]), function(b) (is.inlistmodels(which(models[[u]]==b), mod)==0)))
    mod <- c(list(which(models[[u]]==which(add==1))), mod)
  }
  # Fit each distinct block once, in parallel when possible
  if ((nbcores>1)&&(Sys.info()["sysname"] != "Windows")){
    essai <- mclapply(1:length(mod), function(b) XEMblock2(x, alpha, tol, nbinit.EM, mod[[b]]),
                      mc.cores = min(detectCores(all.tests = FALSE, logical = FALSE), nbcores),
                      mc.preschedule = TRUE, mc.cleanup = TRUE)
  }else{
    essai <- lapply(1:length(mod), function(b) XEMblock2(x, alpha, tol, nbinit.EM, mod[[b]]))
  }
  # Select the model whose blockwise BIC sum is maximal, then gather its blockwise results
  best <- which.max(unlist(lapply(1:length(models), function(u)
    sum(unlist(lapply(1:max(models[[u]]), function(b)
      essai[[locations.inlistmodels(which(models[[u]]==b), mod)]]$bic))))))
  blocks <- models[[best]]
  locbest <- sapply(1:max(blocks), function(b) locations.inlistmodels(which(blocks==b), mod))
  bic <- sum(unlist(lapply(locbest, function(loc) essai[[loc]]$bic)))
  loglike <- sum(unlist(lapply(locbest, function(loc) essai[[loc]]$loglike)))
  nbparam <- sum(unlist(lapply(locbest, function(loc) essai[[loc]]$nbparam)))
  epsilon <- delta <- rep(NA, ncol(x))
  epsilon[order(blocks)] <- unlist(lapply(locbest, function(loc) essai[[loc]]$epsilon))
  delta[order(blocks)] <- unlist(lapply(locbest, function(loc) essai[[loc]]$delta))
  return( new("MvBinaryResult", 
              alpha=alpha, 
              epsilon=epsilon, 
              delta=delta, 
              blocks=blocks,
              loglike=loglike,
              nbparam=nbparam,
              bic=bic)
  )
}

# Model selection by a Metropolis-Hastings algorithm: several chains are run and the best
# visited model (in terms of BIC) is re-estimated by EM
MvBinaryEstimMH <- function(x, nbcores=1, tol.EM=0.01, nbinit.EM=40, nbiter.MH=50, nbchains.MH=10){
  alpha <- colMeans(x)
  # Inference for the competing models
  nb.cpus <- min(detectCores(all.tests = FALSE, logical = FALSE), nbcores, nbchains.MH)
  if ((nbcores>1)&&(Sys.info()["sysname"] != "Windows")){
    reference <- mclapply(X = as.list(rep(nbiter.MH, nbchains.MH)),
                          FUN = OneMH,
                          x=x,
                          alpha=alpha,
                          tol=tol.EM,
                          nbinit=nbinit.EM,
                          mc.cores = nb.cpus,
                          mc.preschedule = TRUE,
                          mc.cleanup = TRUE)
  }else{
    reference <- list(); for (loc in 1:nbchains.MH) reference[[loc]] <- OneMH(x, alpha, tol.EM, nbinit.EM, nbiter.MH)
  }
  # Design outputs: re-estimate the best visited model and return it
  allBIC <- unlist(lapply(reference, function(ref) ref$bic))
  Best <- XEMmodel(x, alpha, tol.EM, nbinit.EM, reference[[which.max(allBIC)]]$blocks)
  names(Best$epsilon) <-  names(Best$delta) <- colnames(x)
  return( new("MvBinaryResult", 
              alpha=alpha, 
              epsilon=Best$epsilon, 
              delta=Best$delta, 
              blocks=Best$model,
              nbparam=Best$nbparam,
              loglike=Best$loglike,
              bic=Best$bic)
  )
}
