R/ClustMMDD.R
In ClustMMDD: Variable Selection in Clustering by Mixture Models for Discrete Data

#######################################
####	Main R functions interfacing R and cpp 
#######################################
## library("Rcpp")
## sourceCpp("ClustMMDD.cpp")

#########################################
###	Other useful functions
#########################################

#########################################
##	Transform a data.frame from R to Rcpp
#########################################

#'	Transform a data.frame for \code{\link{ClustMMDD}} main functions compatibility.
#'	@title	Transform a data.frame for \code{\link{ClustMMDD}} main functions compatibility
#'
#'	@description
#'	\code{dataR2C(x, ploidy)} returns a list. 
#'
#'	@param	x	A data.frame or a matrix with number of columns equal to \code{ncol(x)*ploidy}.
#'	@param	ploidy	The number of columns for each variable. For example, ploidy = 2 for genotypic data from diploid individual.

#'	@return	A list of elements needed for \code{\link{ClustMMDD}} main functions :
#'	\itemize{
#'		\item{$data}{A matrix compatible with \code{\link{ClustMMDD}} main functions.}
#'		\item{$ploidy}{The number of columns for each variable.}
#'		\item{$N}{The number of lines in x.}
#'		\item{$P}{The number of variables equal to ncol(x)/ploidy.}
#'		\item{$N_LEVELS}{The vector of the numbers of levels for the variables.}
#'		\item{$LEVELS}{The levels for the variables.}
#'		\item{$COUNT}{The observed counts of the levels.}
#'		\item{$FREQ}{The observed frequencies.}
#'	}
#'	
#'	@author	Wilson Toussile
#'
#'	@seealso \code{\link{cutEachCol}} in \code{\link{ClustMMDD}} package.
#'
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[, -11], 2) #ploidy = 2 for these genotype data
#'	head(genotype2)
#'	genotype3 = dataR2C(genotype2, ploidy = 2)
#'	head (genotype3$data)
#'	str(genotype3)
#'	}
#'	@export
dataR2C <- function(x, ploidy = 1)
{
  if(!is.element(ploidy, 1:5)) stop("ploidy not in 1:5")
  if(ncol(x)%%ploidy != 0) stop("Number of columns in data not corresponding to ploidy !")

  x = as.matrix(x)
  N = nrow(x)	
  P = ncol(x)/ploidy 
  
  if(ploidy == 1) 
  {
	x1 = data.frame(x)
  }
  else
  {
	x1 = sapply(1:P, function(j)
	{
	  j0 = (j-1)*ploidy + 1
	  return(x[, j0:(j0+ploidy-1)])
	})
  }
  
  COUNT = lapply(1:P, function(j) return(table(unlist(x1[, j]))))
  
  xC = matrix(match(x1[ ,1], names(COUNT[[1]])), ncol = ploidy)
  if(P>1)
  {
	for(j in 2:P)
	  xC = cbind(xC, matrix(match(x1[ ,j], names(COUNT[[j]])), ncol = ploidy))
  }
  
  Freq = lapply(1:P, function(j) return(COUNT[[j]]/(ploidy*N)))
  
  N_levels = sapply(1:P, function(j) return(length(COUNT[[j]])))
  
  
  LevelsInR = lapply(1:P, function(j) return(as.character(names(COUNT[[j]]))))
  LevelsR2C = unlist(LevelsInR)

  xC1 = xC - 1
  
  return(list(data = as.matrix(xC1), 
			  ploidy = as.integer(ploidy), 
			  N = as.integer(N), 
			  P = as.integer(P),
			  N_LEVELS = as.integer(N_levels), 
			  LEVELS = as.character(LevelsR2C), 
			  LEVELS_R = LevelsInR, 
			  COUNT = as.integer(unlist(COUNT)), 
			  FREQ = as.double(unlist(Freq))))
}

## cutEachCol
#'	Cut each column of a data.frame or matrix of strings to a given number of columns.
#' @description
#'	Each string in the data.frame represents a set of \code{\link{ploidy}} observations from the same set of levels. For example "101102" may represents {"101", "102"}, in which case ploidy = 2.
#'	@title	cutEachCol
#'
#'	@param	xdata	A data.frame or a matrix of strings.
#'	@param	ploidy	The number of columns for each column. For example, for genotypic data from diploid individual, \code{ploidy = 2} : for example, a data "ab" represents \code{\{"a", "b"\}} observed alleles.
#'
#'	@return	A list of elements needed for \code{\link{ClustMMDD}} main functions :
#'	\itemize{
#'		\item{data}{A matrix compatible with \code{\link{ClustMMDD}}  main functions.}
#'		\item{ploidy}{The number of columns for each column.}
#'		\item{N}{The number of lines in xdata.}
#'		\item{P}{The number of columns in xdata}
#'		\item{N_LEVELS}{The vector of the numbers of levels.}
#'		\item{LEVELS}{The levels for the variables.}
#'		\item{COUNT}{The observed counts of the levels.}
#'		\item{FREQ}{The observed frequencies.}
#'	}
#'
#'
#'	@author	Wilson Toussile
#'
#'	@seealso \code{\link{dataR2C}} for \code{\link{ClustMMDD}} data format.
#'
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[,-11], ploidy = 2) #ploidy = 2 for these genotype data
#'	head(genotype2)
#'	}
#'	@export
cutEachCol <- function(xdata, ploidy)
{
  if(!is.data.frame(xdata))
	stop("Argument xdata must be a data.frame")
	
  NberMax = 5
  if(!is.element(ploidy, 1:NberMax)) 
		stop(paste("ploidy not in ", 1:NberMax))
		
  n = nrow(xdata)
  
  xdata = matrix(as.character(unlist(xdata)), nrow = n)
  
	##Rcpp
  return(try(cutEachColInN(xdata, ploidy)))
}

## Set EM options
#'	Set Expectation and Maximization options.
#'
#'	@title	Set Expectation and Maximization options.
#'
#'	@param	emOptions	A list of options needed by the Expectation and maximization algorithm :
#'	\itemize{
#'		\item{epsi}{In [1e-5, 1e-20], it is upper bound of the relative increasing on log-likelihood.} 
#'		\item{typeSmallEM}{In c(0, 1, 2) : 0 = classic EM, 1 = SEM, 2 = CEM.}
#'		\item{typeEM}{In c(0, 1, 2) : 0 = classic EM, 1 = SEM, 2 = CEM.} 
#'		\item{nberSmallEM}{The number of random parameter point from which to perform a certain \code{nberIterations} of EM runs.} 
#'		\item{nberIterations}{The number of iterations for each small EM.} 
#'		\item{nberMaxIterations}{The maximum number of iterations if EM algorithm converge hardly.} 
#'		\item{putThreshold}{If \code{TRUE}, the probabilities of levels are assumed to be positive in all clusters.}
#'	}
#'
#'	@details
#'	Use \code{setEmOptions()} to set all options to default.
#'	
#'	@author	Wilson Toussile.
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	@seealso	\code{\link{EmOptions}} for getting the current EM options.
#'
#'	@keywords	EM
#'
#'	@examples
#'	\dontrun{
#'	EmOptions()
#'	setEmOptions(list(epsi = 1e-6))
#'	EmOptions()
#'	setEmOptions()	# To set default values
#'	EmOptions()
#'	}
#'	@export
setEmOptions <- function(emOptions = list(epsi = NULL, 
										typeSmallEM = NULL, 
										typeEM = NULL, 
										nberSmallEM = NULL, 
										nberIterations = NULL, 
										nberMaxIterations = NULL, 
										putThreshold = NULL))
{
	##Rcpp
	try(
  	EmSettings(xepsi = ifelse(is.null(emOptions[["epsi"]]), -1.0, emOptions[["epsi"]]),
  	xnberSmallEM = ifelse(is.null(emOptions[["nberSmallEM"]]), -1, emOptions[["nberSmallEM"]]),
  	xnberIterations = ifelse(is.null(emOptions[["nberIterations"]]), -1, emOptions[["nberIterations"]]),
  	xtypeSmallEM = ifelse(is.null(emOptions[["typeSmallEM"]]), 0, emOptions[["typeSmallEM"]]),
  	xtypeEM = ifelse(is.null(emOptions[["typeEM"]]), 0, emOptions[["typeEM"]]),
  	xnberIterLongEM = ifelse(is.null(emOptions[["nberMaxIterations"]]), -1, emOptions[["nberMaxIterations"]]),
  	xputThreshold = ifelse(is.null(emOptions[["putThreshold"]]), FALSE, emOptions[["putThreshold"]])))
}

##	Get the current EM options
#'	@title	Display current Expectation and Maximization options.
#'
#'	@description
#'	Display the Expectation and Maximization algorithm current options.
#'
#'	@return	A list of EM options :
#'	\itemize{
#'		\item{epsi : }{The upper bound of the relative increasing on log-likelihood.} 
#'		\item{nberSmallEM : }{The number of random parameters points from which to run small EM. The estimated parameter point associated to the 
#'		maximum log-likelihood is then used for a final EM run.}
#'		\item{nberIterations : }{The number of iterations in each small EM.}
#'		\item{typeSmallEM : }{0 = classic EM, 1 = SEM and 2 = CEM.}
#'		\item{typeEM : }{0 = classic EM, 1 = SEM and 2 = CEM.}
#'		\item{nberMaxIterations : }{The maximum number of iterations in the final EM if the convergence is slow.}
#'		\item{putThreshold : }{The indication whether all parameter estimates are positive.}
#'	}
#'
#'	@author	Wilson Toussile.
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	@keywords EM SEM CEM
#'
#'	@seealso \code{\link{setEmOptions}} for set EM options.
#'	
#'	@examples
#'	\dontrun{
#'	EmOptions()
#'	setEmOptions(list(epsi = 1e-6))
#'	EmOptions()
#'	setEmOptions()	# To set default values
#'	EmOptions()
#'	}
#'	@export
EmOptions <- function()
{
	##Rcpp
	return (getEmOptions_Rcpp())
}

##	A general function for the EM algorithm ##
#'	@title	Perform the Expectation and Maximization algorithm followed by the maximum a posteriori clustering.
#'
#'	@description
#'	Compute an approximation of the maximum likelihood parameter estimates using Expectation and Maximization (EM) algorithm.
#'
#'	@param	xdata	A matrix of strings with the number of columns equal to ploidy * (number of variables).
#'	@param	K	The number of clusters (or populations).
#'	@param	S	The subset of clustering variables on the form of vector of logicals indicating the selected variables of clustering.
#'	@param	emOptions	A list of EM options (see \code{\link{EmOptions}} and \code{\link{setEmOptions}}).
#'	@param	ploidy	The number of occurrences for each variable in the data. For example, ploidy = 2 for genotype data from diploid individual.
#'	@param	cte	A double used for the selection criterion named \code{CteDim} in which the penalty function is \code{pen(K,S)=cte*dim}, 
#'	where \code{dim} is the number of free parameters.
#'
#'	@return	A list of 
#'	\itemize{
#'		\item{$N}{The size (number of lines) of the dataset.}
#'		\item{K}{The number of clusters (populations).}
#'		\item{S}{A vector of logicals indicating the selected variables for clustering.}
#'		\item{dim}{The number of free parameters.}
#'		\item{pi_K}{The vector of mixing proportions.}
#'		\item{prob}{A list of matrices, each matrix being the probabilities of a variables.}
#'		\item{logLik}{The log-likelihood.}
#'		\item{entropy}{The entropy.}
#'		\item{criteria}{Criteria values c(BIC, AIC, ICL, CteDim).}
#'		\item{Tik}{A stochastic matrix given belonging probabilities of individuals in the sample.}
#'		\item{mapClassif}{Maximum a posteriori classification.}
#'		\item{NbersLevels}{The numbers of observed levels.}
#'		\item{levels}{The observed levels.}
#'	}
#'	
#'
#'	@author	Wilson Toussile.
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	@keywords	EM Classification Clustering
#'
#'	@seealso \code{\link{dataR2C}}.
#'
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[, -11], ploidy = 2) #ploidy = 2 for these genotype data
#'	head(genotype2)
#'	
#'	#See the EM options
#'	EmOptions() # Options can be set by \code{\link{setEmOptions()}}
#'	par5 = em.cluster.R (genotype2, K = 5, S = c(rep(TRUE, 8), rep(FALSE, 2)), ploidy = 2)
#'	str(par5)
#'	head(par5$Tik)
#'	}
#'	@export
em.cluster.R <- function(xdata, K, S,
				 ploidy = 1,
				 emOptions = list(
				 epsi = NULL,
				 typeSmallEM = NULL,
				 typeEM = NULL, 
				 nberSmallEM = NULL,
				 nberIterations = NULL,
				 nberMaxIterations = NULL,
				 putThreshold = NULL),
				 cte = 1.0)
{
  K = as.integer(K)
  S = as.logical(S)
  ploidy = as.integer(ploidy)
  
  ##	Transform data for Rcpp function
  data2C = dataR2C(xdata, ploidy = ploidy)
  
  n = as.integer(data2C$N)
  p = as.integer(ncol(xdata)/ploidy)
  
  ##	Compatibility of dataset dimensions
  if(p != length(S)) stop("Incompatibility between the number of variables and the length of S !")
  
  S = as.logical(S)
  n_levels <- data2C$N_LEVELS
  count <- data2C$COUNT
  levels <- data2C$LEVELS
  freq <- data2C$FREQ
  
  setEmOptions(emOptions)
  
  outPar = try(EM_Rcpp(t(data2C$data), ploidy , levels, n_levels, count, freq, K, S, cte))
    
  try(initialiseEmSettings())
  
  invisible(setModelKS(outPar))  
}

##	Selection of the number of components for a given S
#'	Perform a selection of the number K of clusters for a given subset S of clustering variables.
#'
#'	@title	Selection of the number K of clusters.
#'
#'	\code{selectK.R(xdata, S, Kmax)} returns a list of estimated paramaters for each selection criteria.
#'
#'	@param	xdata	A dataset.
#'	@param	S	A subset of clustering variables on the form of logical vector of the same length P as the number of variables in x.
#'	@param	Kmax	The maximum number of clusters to be explored.
#'	@param	Kmin	The minimum number of clusters to be explored. The default value is set to 1.
#'	@param	emOptions	A list of EM options (see \code{\link{EmOptions}} and \code{\link{setEmOptions}}).
#'	@param	ploidy	The number of occurrences for each variable in the data. For example, ploidy = 2 for genotype 
#'	@param	cte	A double used for the selection criterion named \code{CteDim} in which the penalty function is 
#'	\code{pen(K,S)=cte*dim}, where \code{dim} is the number of free parameters.
#'
#'	@return	A list of estimated paramaters for each selection criteria. 
#'
#'	@keywords	EM Classification	Clustering
#'
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	@seealso \code{\link{backward.explorer}} for more exploration of the competing models, \code{\link{dimJump.R}} 
#'	for data drven calibration of the penality function, and \code{\link{selection.model.R}} for model selection.
#'
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[, -11], ploidy = 2) #ploidy = 2 for these genotype data
#'	head(genotype2)
#'	S = c(rep(TRUE, 8), rep(FALSE, 2))
#'
#'	outPut = selectK.R(genotype2, S, Kmax = 6, ploidy = 2, Kmin=1)
#'	outPut[["BIC"]]
#'	}
#'	@export
selectK.R <- function(xdata, S, Kmax,
					  ploidy = 1, 
					  Kmin=1, 
					  emOptions = list(
					  epsi = 1e-5,
					  nberSmallEM = 20,
					  nberIterations = 15,
					  nberMaxIterations = 5000,
					  typeSmallEM = 0,
					  typeEM = 0,
					  putThreshold = FALSE),
					  cte = 1.0,
					  project = deparse(substitute(xdata)))
{
  p = as.integer(ncol(xdata)/ploidy)
  if(length(S) != p) stop("Incorrect dimensions !")
  if(Kmax<=Kmin) stop("Incorrect range of the number of components: choose Kmax >= Kmin !")
  
  data2C = dataR2C(xdata, ploidy = ploidy)
  ploidy = as.integer(ploidy)
  S = as.logical(S)
  n = data2C$N
  p = data2C$P
  count <- data2C$COUNT
  levels <- data2C$LEVELS
  n_levels <- data2C$N_LEVELS
  freq <- data2C$FREQ
  
  setEmOptions(emOptions)
	NBER_CRITERIA = getNberCriteria_Rcpp()
  
  criteria = matrix(ncol = NBER_CRITERIA)
  outPar = list()
  
  FileNameExploredModels = paste(project, "_", "ExploredModels.txt", sep="")
	if(!file.exists(FileNameExploredModels))
	{
  	file.create(FileNameExploredModels)
		cat(c("N", "P",  "K", paste("S", 1:p, sep=""), "logLik",  "dim", "entropy\n"), file = FileNameExploredModels)
	}
  
  #FIXME TFaire en sorte que seuls qui ne sont pas dans le fichier le soient.
  for(K in Kmin:Kmax)
  {
    cat("\n K = ", K ,"\n")

		K = as.integer(K)
	
		##Rcpp
		outParK = try(EM_Rcpp(t(data2C$data), ploidy , levels, n_levels, count, freq, K, S, cte))

		outPar[[K - Kmin + 1]] = outParK
		criteria = rbind(criteria, outParK[["criteria"]])

		##Rcpp
		xfound = isInFile_Rcpp(K, outParK[["S"]], FileNameExploredModels, header = TRUE) #list(TrueFalse, line, dim, logLik)

		if(!xfound[["TrueFalse"]])
		{
		  outParKx = list(N = n, P = p, K = K, S = as.logical(outParK[["S"]]),
		  logLik = as.double(outParK[["logLik"]]), dim = as.integer(outParK[["dim"]]),
		  entropy = as.double(outParK[["entropy"]]))

		  writeModelInFile_Rcpp(outParKx, FileNameExploredModels)
		}
	}
  
  criteria = criteria[-1, ]
  colnames(criteria) = getCriteriaNames_Rcpp()
  
  #Best model by criterion
  bestK = apply(criteria, 2, which.min)
  outPutList = list()
  for(i in 1:NBER_CRITERIA)
		outPutList[[i]] = setModelKS(outPar[[bestK[i]]])
	
  names(outPutList) = getCriteriaNames_Rcpp()
  
  setEmOptions()
  
  invisible(outPutList)
}


##	Find a model in a file
#'	Find a given model defined by (K, S) in a file.
#'	@title	Find a model in a file.
#'
#'	\code{isInFile.R(K, S, file, header)}
#'	@param	K	A positive integer which is the number of clusters.
#'	@param	S	A vector of logicals of the length equal to the number of variables indicating the clustering variables. 
#'	@param	file	A file where to find the model.
#'	@param	header	A logical indicating if the file containts header or not.
#'
#'	@return	0 A list : 
#'	\itemize{
#'		\item A logical indicating if the given model was found and the following if TRUE.
#'		\item The line where the given is in the file.
#'		\item The size of the dataset from which the model was estimated.
#'		\item The log-likelihood.
#'		\item The dimension of the model.
#'		\item The entropy associated to estimated parameters of the models.
#'	}
#'
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'	
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[, -11], ploidy = 2) #ploidy = 2 for these genotype data
#'	head(genotype2)
#'	S = c(rep(TRUE, 8), rep(FALSE, 2))
#'
#'	outPut = selectK.R(genotype2, S, Kmax = 6, ploidy = 2, Kmin=1)
#'	isInFile.R(K = 5, S, "genotype2_ExploredModels.txt", header = TRUE)
#'	isInFile(K = 5, rep(TRUE, 10), "genotype2_ExploredModels.txt", header = TRUE)
#'	}
#'	@export
isInFile.R <- function(K, S, file, header)
{
  ## isInFile_Rcpp() from Cpp returns list(TrueFalse, line, dim, logLik)
  if(!is.logical(S))
		stop("Argument S must be a vector of logicals")
		
	outList = isInFile_Rcpp(K, S, file, header)
	outList[["line"]] = outList[["line"]] + 1
  return(outList)
}


##	Read or compute the EM 
##	TODO : Write a cpp version of this function in the future
##	Not available for users
#'	Read a given model from a file or compute if not found. This function is not available for users.
#'	@title	Read a given model from a file or compute the estimates if not found.
#'
#'	@param	xdata	A list of dataset and several description paramaters seach as frequencies.
#'	@param	xK	The number of components (clusters or populations).
#'	@param	xS	The subset of relevant variables.
#'	@param	xReferenceModel	The indicator of if the model is a reference model in an exclusion step of the backward-stepwise explorer.
#'	@param	xReferenceModelsIndex	The vector indicating the models that have been at once reference at an exclusion step.
#'	@param	xNberExploredModels	The current number of explored models.
#'	@param	xFileNameExploredModels	The explored models.
#'	@param	cte	A constant real.
#'	@param	header	Indication of the presence of header in the file.
#'
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	@seealso \code{\link{dimJump.R}} for data drven calibration of the penality function, and 
#'	\code{\link{selection.model.R}} for model selection.
#'
read.or.compute <- function(xdata, 
							xK, 
							xS, 
							xReferenceModel, 
							xReferenceModelsIndex,
							xNberExploredModels,
							xFileNameExploredModels,
							cte = as.double(1.0),
							header = TRUE)
{
  LocalxReferenceModelsIndex <- deparse(substitute(xReferenceModelsIndex))
  LocalNberExploredModels <- deparse(substitute(xNberExploredModels))
  
  ##	Find the model in xFileNameExploredModels
  xfound = isInFile_Rcpp(xK, xS, xFileNameExploredModels, header) #list(TrueFalse, line - 1, dim, logLik) 
  
  ## debug
  #print(xfound)
  
  xN = as.integer(xdata$N)
	NBER_CRITERIA = getNberCriteria_Rcpp()
  criteria = double(NBER_CRITERIA)
  
  if(xfound[["TrueFalse"]])
  {
		numModel = xfound[["line"]] + 1
		if(xReferenceModel) xReferenceModelsIndex[numModel] = TRUE
	  
		##Rcpp::DoubleVector computeCriteria_Rcpp(double lv, int dim, int N, double entropy = 0, double cte = 1.0) from cpp.
		criteria = computeCriteria_Rcpp(xfound[["logLik"]], xfound[["dim"]], xN, xfound[["entropy"]], cte)
		return(criteria)
  }
  else
  {
		xP <- as.integer(xdata$P)
		xploidy <- as.integer(xdata$ploidy)
		xN_LEVELS = as.integer(xdata$N_LEVELS)
		xLEVELS <- as.integer(xdata$LEVELS)
		xCOUNT <- as.integer(xdata$COUNT)
		xFREQ <- as.double(xdata$FREQ)
		xK <- as.integer(xK)
		xS <- as.logical(xS)
	
		##Rcpp::List EM(Rcpp::IntegerMatrix tab, int n_occurrences, Rcpp::CharacterVector levels, Rcpp::IntegerVector n_levels, Rcpp::IntegerVector levels_count,
		#Rcpp::DoubleVector levels_freq, int K, Rcpp::LogicalVector S, double cte = 1.0)
		outPar <- try(EM_Rcpp(t(xdata$data), xploidy , xLEVELS, xN_LEVELS, xCOUNT, xFREQ, xK, xS, cte))
	
		## Now, the number of models = number of lines in FileNameExploredModels
		xNberExploredModels <- nberOfLines(xFileNameExploredModels) + 1
		xReferenceModelsIndex[xNberExploredModels] <- xReferenceModel
		
		outParX = list(N = xN, P = xP, K = xK, S = xS, logLik = outPar[["logLik"]], dim = outPar[["dim"]], entropy = outPar[["entropy"]])
	
		##Rcpp
		writeModelInFile_Rcpp(outParX, xFileNameExploredModels)
	
		assign(LocalxReferenceModelsIndex, xReferenceModelsIndex, pos = parent.frame())
		assign(LocalNberExploredModels, xNberExploredModels, pos = parent.frame())
	
		return(outPar[["criteria"]])
  }  
}

##	Backward explorer
#'	Gather a set of most competive models using a backward-stepwise strategy.
#'	@title	Gather a set most competive models.
#'
#'	@param	x	The data matrix.
#'	@param	Kmax	The maximum number of clusters to be explored.
#'	@param	Criterion	The model selection criterion in c("BIC", "AIC", "ICL", "CteDim") used for exploration (see details).
#'	@param	ForceExclusion	The indication of whether to force exclusion or not. The default value is set to FALSE.
#'	@param	Kmin	The minimum number of clusters. The default value is set to 1.
#'	@param	Smin	A logical vector that indicates variables to include in the set of selected variables. 
#'	The default value NULL: no variable is preselected.
#'	@param	ploidy	The number of columns for each variable in the data. For example, ploidy = 2 for genotypic data from diploid individual.
#'	@param	emOptions	A list of EM options (see \code{\link{EmOptions}} and \code{\link{setEmOptions}}).
#'	@param	project	The name of the project. The default value is the name of the dataset.
#'
#'	@return A data.frame of selected models for the proposed criteria.
#'	
#'	@details 1
#'	If the penalized log-likelihood criteria is \code{CteDim}, a sequence of penalty functions of the form
#'	\code{CteDim*dim} is used. In this shape of penalty function, \code{CteDim} is in \code{[0.5, log(N)]}, where 
#'	\code{N} is the number of individuals in the sample data. Thus, \code{AIC} and \code{BIC} penalties are i the sequence
#'	of penalties.
#'  
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'	
#'	@keywords	Backward-stepwise	Cluster	Classification
#'
#'	@seealso \code{\link{dimJump.R}} for data drven calibration of the penality function, and 
#'	\code{\link{selection.model.R}} for model selection.
#'
#'	@examples
#'	\dontrun{
#'	data(genotype1)
#'	head(genotype1) # Each data is a set of 2 observed alleles. For example "101102" = ("101", "102").
#'	genotype2 = cutEachCol(genotype1[, -11], ploidy = 2) #ploidy = 2 for these genotype data
#'	########
#'	# The following command create a file "genotype2_ExploredModels.txt" that containts the explored models.
#'	
#'	#output = backward.explorer(genotype2, Kmax = 10, ploidy = 2, Kmin = 1, Criterion = "CteDim")
#'	
#'	
#'	data(genotype2_ExploredModels)
#'	head(genotype2_ExploredModels)
#'	}
#'	@export
backward.explorer <- function(x,
									 Kmax, 
									 Criterion,
									 ploidy = 1,
									 ForceExclusion = FALSE,
									 emOptions = list(
									 epsi = NULL,
									 nberSmallEM = NULL,
									 nberIterations = NULL,
									 nberMaxIterations = NULL,
									 typeSmallEM = NULL,
									 typeEM = NULL,
									 putThreshold = NULL),
									 Kmin = 1, 
									 Smin = NULL,
									 project = deparse(substitute(x)))
{  
  # Kmax > Kmin
  if(Kmax == 1 | Kmax < Kmin) 
	stop("Incorrect range of the number of components: choose Kamx > 1 and Kmax >= Kmin !")
	
  ##	Number of criteria
  ##Rcpp
	NBER_CRITERIA = getNberCriteria_Rcpp()
  NAMES_CRITERIA = getCriteriaNames_Rcpp()
	
  if(!is.element(Criterion, NAMES_CRITERIA)) 
    stop("Choose the argument Criterion in c(BIC, AIC, ICL, CteDim) !")

  CRIT = match(Criterion, NAMES_CRITERIA)

  ##Set grid for exploration
  if(CRIT == 4) cte = as.double(seq(from = 0.5, to = log(1000), by = 0.5))
  if(CRIT != 4)   cte = as.double(1.0)
	
  ##	Output files
  #	For explored models
  FileNameExploredModels = paste(project, "_", "ExploredModels.txt", sep="")

  ##	Transform data
  data2C = dataR2C(x, ploidy = ploidy)
  
  P = data2C$P
  
  if(!file.exists(FileNameExploredModels))
	{
  	file.create(FileNameExploredModels)
		cat(c("N", "P",  "K", paste("S", 1:P, sep=""), "logLik",  "dim", "entropy\n"), file = FileNameExploredModels)
	}
  
  ##	Consider all variables if is.null(Smin)
  if(is.null(Smin)) Smin = rep(FALSE, P)
  else if(all(Smin)) stop("Incorrect Smin !")
  
  
  ##	
  ploidy = as.integer(ploidy)
  xdata = data2C$data
  N = data2C$N
  P = data2C$P
  COUNT <- data2C$COUNT
  LEVELS <- data2C$LEVELS
  N_LEVELS <- data2C$N_LEVELS
  FREQ <- data2C$FREQ
  
  ##	Explored models on matrix form 
  NberExploredModels = 0
  ReferenceModelsIndex = c()
  
  ##	If FileNameExploredModels is not empty, initialise ReferenceModelsIndex FIXME
  if(file.exists(FileNameExploredModels))
  {
		NberExploredModels = nberOfLines(FileNameExploredModels)
		ReferenceModelsIndex = rep(FALSE, NberExploredModels)
  }
  
  ##	Control parameters
  HasExcluded = FALSE
  HasIncluded = FALSE
    
  if(Kmin == 1)
  {
		S = rep(FALSE, P)
	
		xfound = isInFile_Rcpp(Kmin, S, FileNameExploredModels, header = TRUE) #list(TrueFalse, line, dim, logLik)
		if(!xfound[["TrueFalse"]])
		{
			##Rcpp
	  	outParK = try(EM_Rcpp(t(xdata), ploidy , LEVELS, N_LEVELS, COUNT, FREQ, Kmin, S, 1.0))
			
	  	outParKX = list(N = N, P = P, K = Kmin, S = S, logLik = outParK[["logLik"]], dim = outParK[["dim"]], entropy = outParK[["entropy"]])
	  	writeModelInFile_Rcpp(outParKX, FileNameExploredModels)
	  	NberExploredModels = NberExploredModels + 1
	  	ReferenceModelsIndex[NberExploredModels] <- FALSE
		}
		else
		{
	  	ReferenceModelsIndex[xfound[["line"]] + 1] <- FALSE
		}
	
		Kmin <- Kmin+1
  }

  setEmOptions(emOptions)
   
  for(K in Kmin:Kmax)
  {
   for(xcte in cte)
   {
      ##	Control parameters
      HasExcluded <- FALSE
      HasIncluded <- FALSE
	
	  	K = as.integer(K)
  
      ## Initialise selected set of variables
      S <- rep(TRUE, P)
	
	  	repeat
	  		{
					#####################################
					##	Exclusion step
					#####################################
	
					#	Reference model
					if(!all(S == Smin))
					{
		  			cat("\n>>> In Exclusion with K = ", K, "\n")
	  
		 	 			RefCriteria <- read.or.compute(data2C,
									   K, 
								 S, 
								 xReferenceModel=TRUE, 
								 ReferenceModelsIndex,
								 NberExploredModels,
								 FileNameExploredModels,
								 xcte,
								 header = TRUE)
		
		  			##	Criteria of the reference model
		  			cat("\n>>> K = ", K, "; S = ", c(1:P)[S], "\n")
		  			cat("\n >>> Criteria = ", RefCriteria, "\n")
	  
		  			##	Matrix of criteria of competing models
		  			CompetCriteria <- matrix(NA, ncol=NBER_CRITERIA)
		  			colnames(CompetCriteria) <- NAMES_CRITERIA
	  
		  			for(j in 1:P)
		  			{
							if(S[j] & !Smin[j])
							{
			  				xxS <- S
			  				xxS[j] <- FALSE
			
			  				xxCompetCriteria = read.or.compute(data2C,
											   K, 
									  xxS, 
									  xReferenceModel=FALSE, 
									  ReferenceModelsIndex,
									  NberExploredModels,
									  FileNameExploredModels,
									  xcte,
									  header = TRUE)
			
			  				CompetCriteria = rbind(CompetCriteria, xxCompetCriteria)
			
			  				##	Print xxS
			  				cat("\n>>> K = ", K, "; S[-", j, "] = ", c(1:P)[xxS], "; Criteria = ", xxCompetCriteria, "\n >>> Constant = ", xcte, "\n")
							}
							else CompetCriteria = rbind(CompetCriteria, rep(NA, NBER_CRITERIA))
		  			}
	  
		  			#	Warning: CompetCriteria has been initialized with NAs on the first line
		  			candidateEx = which.min(CompetCriteria[, CRIT]) - 1
		
		  			##	Print exclusion candidate
		  			cat("\n>>> Candidate for Exclusion = ", candidateEx, "\n")
	  
		  			if((ForceExclusion && !HasIncluded) | (RefCriteria[CRIT] >= CompetCriteria[candidateEx + 1, CRIT]))
		  				{
								##	Then exclusion
								S[candidateEx] <- FALSE
								HasExcluded <- TRUE
								RefCriteria <- CompetCriteria[candidateEx+1, ]
		  
								##	Print excluded variable
								cat("\n>>> Variable ", candidateEx, " excluded\n")
		  				}
		  				else HasExcluded = FALSE
					}
					else HasExcluded = FALSE
	  
					###########################################
					##	Inclusion step
					###########################################
	  
					#print("Before Inclusion")
	  
					##	Break if there is no variables to include
					if(!all(S))
					{
		  			#	reference model come from privious Exclusion step
		
		  			##	Debug
		  			cat("\n>>> In Inclusion step with K = ", K, "\n")
		
		  			##	Competing criteria
		  			CompetCriteria <- matrix(NA, ncol=NBER_CRITERIA)
		  			colnames(CompetCriteria) <- NAMES_CRITERIA
	  
		  			for(j in 1:P)
		  			{
							if(!S[j])
							{
			  				xxS <- S
			  				xxS[j] <- TRUE
			  				xxCompetCriteria <- read.or.compute(data2C,
												K, 
									   xxS, 
									   xReferenceModel=FALSE, 
									   ReferenceModelsIndex,
									   NberExploredModels,
									   FileNameExploredModels,
									   xcte,
									   header = TRUE)
			
			  				CompetCriteria = rbind(CompetCriteria, xxCompetCriteria)
			
			  				##	Print xxS
			  				cat("\n>>> K = ", K, "; S[+", j, "] = ", c(1:P)[xxS], "; Criteria = ", xxCompetCriteria, "\n >>> Constant = ", xcte, "\n")
							}
							else CompetCriteria = rbind(CompetCriteria, rep(NA, NBER_CRITERIA))
		  			}
	  
		  			#	Warning: CompetCriteria has been initialized with NAs on the first line
		  			candidateIn <- which.min(CompetCriteria[, CRIT]) - 1
	  
		  			candidateS <- as.logical(S)
		  			candidateS[candidateIn] <- TRUE
		
		  			##	Print inclusion candidate
		  			cat("\n>>> Candidate for Inclusion = ", candidateIn, "\n")
	  
		  			found = isInFile_Rcpp(K, candidateS, FileNameExploredModels, header = TRUE) #list(TrueFalse, line, dim, logLik)
		  			numModel = found[["line"]] + 1
		
		  			if((RefCriteria[CRIT] > CompetCriteria[candidateIn + 1, CRIT]) &  (!ReferenceModelsIndex[numModel]))
		  			{
							##	Then Inclusion
							S[candidateIn] <- TRUE
							HasIncluded <- TRUE
							RefCriteria <- CompetCriteria[candidateIn+1, ]
							ReferenceModelsIndex[numModel] <- TRUE
		  
							##	Print included variable
							cat("\n>>> Variable ", candidateIn, " included\n")
		  			}
		  			else HasIncluded <- FALSE
		
		  			##	Debug
		  			cat("Out Inclusion")
					}
					else HasIncluded <- FALSE
	  
	  
					##	Break repeat
					if(!ForceExclusion)
					{
		  			if(((!HasExcluded) & (!HasIncluded)) | sum(as.integer(S) - as.integer(Smin)) <= 1) break
					}
					if(sum(as.integer(S) - as.integer(Smin)) <= 1) break
				}
			}
  	}
  
    
  	cat("\n\n>>> The explored models are in the file named ", FileNameExploredModels, "\n")
  
  	## Best models indexes
  	##Rcpp
  	ListCriteria = computeCriteriaFromFile_Rcpp(FileNameExploredModels, 1.0, header = TRUE)# From cpp
  
  	bestModelsIndexes = ListCriteria[["bestModelsIndexes"]] + 1
  
  	##Rcpp
  	bestModels = sapply(bestModelsIndexes, function(i) return (readModelAt_Rcpp(FileNameExploredModels, i - 1, header = TRUE)))
  	colnames(bestModels) = getCriteriaNames_Rcpp()
  	rownames(bestModels) = c("N", "P", "K", "S", "logLik", "dim", "entropy")

  	try(initialiseEmSettings())
  
  	invisible(bestModels)
}

## Constant calibration
#'	Data driven calibration of the penalty function.
#'
#'	@title	Data driven calibration of the penalty function
#'
#'	\code{dimJump.R(fileOrData, h = integer(), N = integer(), header = logical())} returns a data driven penalty parameter.
#'	
#'	@param	fileOrData	A character string or a data.frame (see details).
#'	@param	h	An integer defining the sliding window used to find the biggest jump.
#'	@param	N	The size of the sample data.
#'	@param	header	A logical indicating if the file containts header or not.
#'
#'	@return	NULL
#'
#'	@details
#'	This function is a dimension jump version of the slope heuristics for calibration of penalty function.
#'
#'	@seealso \code{\link{model.selection.R}} for final selection.
#'
#'	@keywords Cluster Classification
#'
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'	
#'	@examples
#'	\dontrun{
#'	data(genotype2_ExploredModels)
#'	outDimJump = dimJump.R(genotype2_ExploredModels, N = 1000, h = 5)
#'	outDimJump[[1]]
#'	}
#'	@export
dimJump.R <- function(fileOrData, h = integer(), N = integer(), header = logical())
{
	## Compatibility of data
	if(!is.character(fileOrData) & (!is.data.frame(fileOrData)))
		stop(gettextf("Argument 'fileOrData' must be a string or a data.frame"))
	
	if(is.character(fileOrData))
	{
		if(!file.exists(fileOrData))
			stop(paste("File ", fileOrData, " does not exists"))
			
		exploredModels = read.table(fileOrData, header = header)
		xlogLik = exploredModels[, "logLik"]
		xdim = exploredModels[, "dim"]
	}
	else
	{
		xlogLik = as.double(fileOrData[, "logLik"])
		xdim = as.integer(fileOrData[, "dim"])
	}
	
	if(N <= 0) stop("The size N of dataset is incorrect")
			
	
	windowSize = 0.075
	maxConstant = log(N)
	minConstant = 0.25
	gridLength = round(log(N)/windowSize) + 1
	vectConstants = as.double(seq(from = windowSize, to = windowSize * gridLength, by = windowSize))
	vectLogLik = double(length(vectConstants))
	vectDim = integer(length(vectConstants))
	
	##Rcpp
	if(!try(selectDimFromData_Rcpp(xlogLik, xdim, vectConstants, vectLogLik, vectDim)))
		stop("Selection of dimension failed")
	
	BeginEnd1 = integer(2)
	BeginEnd2 = integer(2)
	
	##Rcpp
	dimJump_Rcpp(vectDim, h, BeginEnd1, BeginEnd2)
	
	BeginEnd1 = BeginEnd1 + 1
	BeginEnd2 = BeginEnd2 + 1
	
	constant1 = mean(vectConstants[BeginEnd1])
	constant2 = mean(vectConstants[BeginEnd2])
	
	which_max_dim = which.max(xdim)
	a1 = xlogLik[which_max_dim] - constant1*xdim[which_max_dim]
	a2 = xlogLik[which_max_dim] - constant2*xdim[which_max_dim]
	
	dev.new(title = "Penalty parameter", width = 12, height = 6)
	par(mfrow = c(1, 2))
	
	plot(vectConstants, vectDim, pch = 20, col = "grey40", xlab = "Parameter", ylab = "selected dimension")
	abline(v = vectConstants[BeginEnd1[1]], col = "blue", lty = 3)
	abline(v = constant1, col = "blue")
	abline(v = vectConstants[BeginEnd1[2]], col = "blue", lty = 3)
	abline(v = vectConstants[BeginEnd2[1]], col = "red", lty = 3)
	abline(v = constant2, col = "red")
	abline(v = vectConstants[BeginEnd2[2]], col = "red", lty = 3)
	text(constant1, mean(vectDim), paste("Biggest jump = ", constant1), col = "blue")
	text(constant2, 0.50*mean(vectDim), paste("Second biggest jump = ", constant2), col = "red")
	legend("topright", legend = c(paste("Biggest jump at = ", constant1), paste("Second biggest jump at = ", constant2)), lty = 1, col = c("blue", "red"))
	title(main = "Proposed data driven parameter")
	
	plot(xdim, xlogLik, xlab = "dim", ylab = "Log-likelihood", col = "grey40")
	abline(a1, constant1, col = "blue", lwd = 2, lty = 1)
	abline(a2, constant2, col = "red", lwd = 2, lty = 2)
	legend("topleft", legend = c(paste("Slope 1 = ", constant1), paste("Slope 2 = ", constant2)), col = c("blue", "red"), lty = c(1, 2), lwd = 2)
	title(main = "Associated slope")
	
	bounds = cbind(vectConstants[BeginEnd1], vectConstants[BeginEnd2])
	rownames(bounds) = c("lower", "uper")
	colnames(bounds) = c("constant1", "constant2")
	
	invisible(list(constants = c(constant1, constant2), bounds = bounds, dimLogLik = data.frame(dim = vectDim, logLik = vectLogLik)))
}


## Function for model selection
#'	Function for selection of both the number K of clusters and the subset S of clustering variables.
#'	
#'	@title	Function for selection of both the number K of clusters and the subset S of clustering variables.
#'	
#'	\code{model.selection.R(fileOrData, cte, header, lines)} returns a data.frame in which each line defines 
#'	the selected model for a criterion.
#'
#'	@param	fileOrData	A character string or a data.frame (see details). If \code{fileOrData} is a data.frame, 
#'	it must containts at a column named \code{logLik} and another named \code{dim} (see details).
#'	@param	cte	A penalty function parameter. The associated criterion is \code{-log(likelihood)+cte*dim}.
#'	@param	header	Indication of the presence of header in the file.
#'	@param	lines	A vector of integer. If not empty and \code{fileOrData} is the name of a file, only models defined 
#'		in lines in \code{lines} are compared.
#'	
#'	@return	A data.frame of selected models for the proposed penalized criteria.
#'
#'	@author	Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'	
#'	@examples
#'	\dontrun{
#'	data(genotype2_ExploredModels)
#'	outDimJump = dimJump.R(genotype2_ExploredModels, N = 1000, h = 5)
#'	cte1 = outDimJump[[1]][1]
#'	outSlection = model.selection.R(genotype2_ExploredModels, cte = cte1, header = TRUE)
#'	outSlection
#'	}
#'	@export
model.selection.R <- function(fileOrData,
								cte = as.double(1.0), 
								alpha = as.double(2.0),
								header = TRUE, 
								lines = integer())
{
	## Compatibility of data
	if(!is.character(fileOrData) & (!is.data.frame(fileOrData)))
		stop(gettextf("Argument 'fileOrData' must be a character string or a data frame"))
		
	if((alpha < 1.5) | (alpha > 2))
	  stop(gettextf("The slop coefficient 'alpha' must be in [1.5, 2]"))
	  
	cte = alpha * cte
	
	##Rcpp
	NBER_CRITERIA = getNberCriteria_Rcpp()
		
	if(is.character(fileOrData))
	{
		if(!file.exists(fileOrData))
			stop(gettextf("File '%s' does not exists in the working directory", fileOrData))
			
		parList = readModelAt_Rcpp(fileOrData, 0, header)
		P = parList[["P"]]
		#N = parList[["N"]]
			
		lines = as.integer(lines - 1)
			
		vectK = integer(NBER_CRITERIA)
		matS = matrix(as.integer(0), nrow = NBER_CRITERIA, ncol = P)
		vectLogLik = double(NBER_CRITERIA)
		vectDim = integer(NBER_CRITERIA)
		vectEntropy = double(NBER_CRITERIA)
		vectCriteria = double(NBER_CRITERIA)
		vectN = integer(1)
		
		##Rcpp
		try(selectModelFromFile_Rcpp(fileOrData, vectN, vectK, matS, vectLogLik, vectDim, vectEntropy, vectCriteria, cte, header, lines))
		
		outPut = data.frame(N = rep(vectN[1], NBER_CRITERIA), P = rep(P, NBER_CRITERIA), K = vectK, X = matS, logLik = vectLogLik, dim = vectDim, entropy = vectEntropy)
	}
	else
	{
	  ## Revoir
		N = fileOrData[1, 1]
		P = fileOrData[1, 2]
		vectIndexes = integer(NBER_CRITERIA)
		vectCriteria = double(NBER_CRITERIA)
		vectLogLik = fileOrData[, "logLik"]
		vectDim = fileOrData[, "dim"]
		vectEntropy = fileOrData[, "entropy"]
		
		##Rcpp
		try(selectModelFromData_Rcpp(vectLogLik, vectDim, vectEntropy, N, cte, vectIndexes, vectCriteria))
		vectIndexes = vectIndexes + 1
		
		outPut = data.frame(fileOrData[vectIndexes, ], criteria = vectCriteria)
	}
	
	##Rcpp
	rownames(outPut) = getCriteriaNames_Rcpp()
	
	return (outPut)
}


## Documentation of data
#' @title \code{genotype1} is a data frame of genotype data.
#'
#'	@name genotype1
#'
#'	@docType data
#'	
#'	@format
#'	The format is: chr [1:1000, 1:10] "109107" "105101" "106106" ... and the 11-th column containts integers representing 
#'	the prior classification in \code{5} sub-populations.
#'
#'	@description A simulated data frame of genotype data with \code{N = 1000} individuals gegnotyped at \code{P = 10} loci. 
#'	Each string represents two alleles. For example, "109107" represents "109" and "107". The last column 
#'
#'	@author Wilson Toussile
#'
#'	@references
#'	\itemize{
#'		\item  \href{http://projecteuclid.org/euclid.ejs/1379596773}{Dominique Bontemps and Wilson Toussile (2013)} : Clustering and #'	variable selection for categorical multivariate data. Electronic Journal of Statistics, Volume 7, 2344-2371, ISSN.
#'		\item 	\href{http://www.springerlink.com/content/r7k02q167120xl64/}{Wilson Toussile and Elisabeth Gassiat (2009)} : Variable #'	selection in model-based clustering using multilocus genotype data. Adv Data Anal Classif, Vol 3, number 2, 109-134.
#'	}
#'
#'	
#'	@keywords genotype
NULL
Any scripts or data that you put into this service are public.
ClustMMDD documentation built on May 2, 2019, 2:44 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ClustMMDD
Variable Selection in Clustering by Mixture Models for Discrete Data

R/ClustMMDD.R
In ClustMMDD: Variable Selection in Clustering by Mixture Models for Discrete Data

Try the ClustMMDD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ClustMMDD Variable Selection in Clustering by Mixture Models for Discrete Data

R/ClustMMDD.R In ClustMMDD: Variable Selection in Clustering by Mixture Models for Discrete Data

Try the ClustMMDD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ClustMMDD
Variable Selection in Clustering by Mixture Models for Discrete Data

R/ClustMMDD.R
In ClustMMDD: Variable Selection in Clustering by Mixture Models for Discrete Data