#' @title Choice of the best binary question
#' @description This function finds the best binary question do divide a cluster A
#' into to subclusters such that the bipartition (A_l, A_l_c) has maximum between-clusters inertia.
#' @details This function works for both categorical, numerical and mixed data.
#' This is the core function of the divclust algorithm.
#' We are seeking the binary question which gives the best
#' bipartition. A binary question is defined with a cutting variable (quantitative or qualitative), and
#' a cutting value. For quantitative variable, the cutting value is a real number. For qualitative, the
#' cutting value is
#' @param X the data matrix of dimension (nxp) where p is equal to the number of numerical variables
#' plus the number of categories. This matrix is used to construct the binary questions
#' @param Z the numerical data matrix of dimension (nxk) used to compute the inertia criterion
#' (the matrix of the principal components for instance)
#' @param indices vector of indices for the cluster A to divide.
#' @param vec_quali vector containing the number of categories for each modalities (according to the
#' categories observed in A_l)
#' @param w weights vector
#' @param D diagonal distance matrix coefficients
#' @param vec_order vector containing TRUE if the categories of the variable are ordered
#' @return \item{inert}{the between-clusters inertia of the bipartition (A_l, A_l_c)}
#' @return \item{A_l}{the vector of indices of the cluster A_l}
#' @return \item{A_l}{the vector of indices of the cluster A_l_c}
#' @return \item{cut_ind}{the index of the cutting variables}
#' @return \item{cut_val}{a list with : \itemize{
#' \item $type: the type of the cutting variable (quantitative or qualitative)
#' \item $value: a real value if the variable is quantitative and a biparition of categories
#' if the variable is qualitative.
#' The first cluster of this bipartition of categories is given by $value$bipart whereas the second
#' cluster is given by $value$bipart_c}}
#' @keywords internal
#' @export
choose_question <- function (X, Z, indices, vec_quali = c(), w = rep(1. / nrow(Z), nrow(Z)),
D = rep(1, ncol(Z)),vec_order) {
l = length(indices)
#if (l <= 1)
# stop(' choose_question : cannot divide singleton or empty sets')
p <- dim(X)[2]
inert_max <- 0
second_inert <- 0
c_max <- 0
j_max <- -1
A_l_max <- c()
A_l_c_max <- c()
nb_quanti <- p - sum(vec_quali)
nb_quali <- length(vec_quali)
if (l > 1) {
# first we list all the quantitative variables
for (j in seq_len(nb_quanti)) {
cut_vals <- compute_cut_values(X[indices, j])
inerts <- sapply(cut_vals, function(l) { between_cluster_inert(Z, indices[X[indices, j] <= l], indices[X[indices, j] > l], w, D) })
l_max <- which.max(inerts)
if ( length(l_max)== 1 && inerts[l_max] >=inert_max) { # beware, think to add a empty test
denom <- inert_1D(X, j, indices, w)
nume <- between_cluster_inert_1D(X, j , indices[X[indices, j] <= cut_vals[l_max]], indices[X[indices, j] > cut_vals[l_max]], w)
second_inert_candidat <- nume / denom
if (inerts[l_max] == inert_max) { # beware, may be a tolerance test would be better
if (second_inert_candidat >second_inert ) {
c_max <- list(type = "quanti", value = cut_vals[l_max])
j_max <- j
second_inert <- second_inert_candidat
inert_max <- inerts[l_max]
A_l_max = indices[X[indices, j_max] <= c_max$value]
A_l_c_max = indices[X[indices, j_max] > c_max$value]
}
} # equality case end
else {
second_inert <- second_inert_candidat
j_max <- j
c_max <- list(type = "quanti", value = cut_vals[l_max])
inert_max <- inerts[l_max]
A_l_max = indices[X[indices, j_max] <= c_max$value]
A_l_c_max = indices[X[indices, j_max] > c_max$value]
}
}
}
start_offsets = c(0, cumsum(vec_quali)) + nb_quanti
for(j in seq_len(nb_quali)) {
nb_mod = vec_quali[[j]]
to_keep = seq_len(nb_mod)[apply(X[indices, start_offsets[[j]] + seq_len(nb_mod)],2,sum) != 0]
if (vec_order[j]==TRUE)
{
biparts <- list()
if (length(to_keep) >1)
biparts = lapply(1:(length(to_keep)-1),function(x){to_keep[1:x]})
} else
biparts = bipart(length(to_keep))
nb_biparts = length(biparts)
select_quali <- function(bp_index) {
A_l <- indices[apply(as.matrix(X[indices, start_offsets[[j]] + to_keep[biparts[[bp_index]]]] == 1), 1, any)]
A_l_c <- indices[ apply(as.matrix(X[indices, start_offsets[[j]] + to_keep[biparts[[bp_index]]]] == 0), 1, all)]
return(between_cluster_inert(Z, A_l, A_l_c, w, D))}
inerts <- lapply(seq_len(nb_biparts), select_quali)
l_max <- which.max(inerts)
if (length(l_max) >=1 && inerts[[l_max]] > inert_max) {
inert_max <- inerts[[l_max]]
value <- to_keep[biparts[[l_max]]]
bipart_c <- setdiff(to_keep, value)
c_max <- list(type = "quali", value = list(bipart=value,bipart_c = bipart_c))
j_max <- j
A_l_max <- indices[apply(as.matrix(X[indices, start_offsets[[j]] + c_max$value$bipart] == 1), 1, any)]
A_l_c_max <- indices[apply(as.matrix(X[indices, start_offsets[[j]] + c_max$value$bipart] == 0), 1, all)]
}
}
}
return (list(inert = inert_max, A_l = A_l_max, A_l_c = A_l_c_max, cut_ind = j_max, cut_val = c_max))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.