#' @importFrom Rcpp evalCpp
#' @useDynLib divclust
#'
#' @import intervals
#' @title Monothetic divisive hierarchical clustering
#' @description
#' DIVCLUS-T is a divisive hierarchical clustering algorithm based on a monothetic bipartitional
#' approach allowing the dendrogram of the hierarchy to be read as a decision tree.
#' It is designed for numerical, categorical (ordered or not) or mixed data. Like the Ward agglomerative hierarchical
#' clustering algorithm and the k-means partitioning algorithm, it is based on the minimization of
#' the inertia criterion. However, it provides
#' a simple and natural monothetic interpretation of the clusters. Indeed, each cluster is decribed by set of
#' binary questions. The inertia criterion is calculated on all the principal components of PCAmix
#' (and then on standardized data in the numerical case).
#' @param data a data frame with numerical and/or categorical variables.
#' If the variable is ordinal, the column must be of class factor with the argument ordered=TRUE.
#' @param K the number of final clusters (leaves of the tree). By default, the complete dendrogram
#' is performed.
#' @return \item{tree}{an internal \strong{tree}}
#' @return \item{clusters}{the list of observations in each final cluster (the leaves of the tree)}
#' @return \item{description}{the monothetic description of each final cluster (the leaves of the tree)}
#' @return \item{which_cluster}{a vector of integers indicating the final cluster of each observation}
#' @return \item{height}{the height of the clusters in the dendrogram of the tree}
#' @return \item{B}{the proportion of inertia explained by the final partition (between-cluster inertia/total inertia)}
#' @return \item{data_quanti}{the quantitative data set}
#' @return \item{data_quali}{the qualitative data set}
#' @return \item{mod_quali}{the list of categories of qualitative variables}
#' @return \item{vec_quali}{number of categories of each qualitative variable}
#' @return \item{kmax}{the number of different observations i.e. the maximal number of leaves}
#' @return \item{T}{The total inertia}
#' @details The tree has K leaves corresponding to a partition in K clusters if K is specified
#' in input. Otherwise, each final cluster contains one observation and the tree is the
#' complete dendrogram. The between-cluster inertia of the final partition of the leaves is the sum of
#' the heights of the clusters in the tree. The total inertia for the quantitative dataset is equal
#' to p1 (the number of quantitative variables). The total inertia for the qualitative
#' dataset is m-p2 where m is the total number of categories and p2 is the number of qualitative
#' variables. For a mixture of quantitative and qualitative data, the total variance is
#' p1+m-p2. The quality of a partition is the proportion of inertia explained by the patition
#' which is the between-cluster inertia divided by the total inertia. The height of a cluster in the dendrogram of divclust
#' is the inertia variation which is also the aggregation criterion of Ward used in ascendant
#' hierarchical clustering. This can be used, to help in the choice of the number of clusters
#' as for Ward hierarchical clustering. For ordered qualitative variables (class factor with argument ordered
#' =TRUE), this order on the categories is used to reduce the number of possible binary questions.
#' @seealso \link{plot.divclust} \link{cutreediv}
#' @export
#' @examples
#' data(protein) # pure quantitatives data
#' tree <- divclust(protein) # full clustering
#' plot(tree)
#' plot(1:(tree$kmax-1),tree$height,xlab="number of cluster",ylab="height",main="Split levels")
#' c_5 <- divclust(protein, K=5) # stops clustering to 5 clusters
#' plot(c_5,nqbin=4)
#' c_5$B*100 #explained inertia
#' c_5$clusters # retrieve the list of observations in each cluster
#' c_5$description # and their monothetic description
#'
#' data(dogs) # pure qualitative data
#' tree <- divclust(dogs) # full clustering
#' plot(tree)
#' plot(1:(tree$kmax-1),tree$height,xlab="number of cluster",ylab="height",main="Split levels")
#' c_4 <- divclust(dogs, K=4) # stops clustering to 4 clusters
#' plot(c_4)
#' c_4$clusters # retrieve the list of observations in each cluster
#' c_4$description # and their monothetic description
#' c_4$which_cluster # return a vector indicating to which cluster belongs each individual
#' c_4$B*100 #explained variance
#'
#' dogs2 <- dogs # take the order of categories into account (to reduce the complexity)
#' levels(dogs$Size)
#' size2 <- factor(dogs$Size,c("small","large","medium")) #changes the order of the levels
#' levels(size2)
#' dogs2$Size <- ordered(size2) #specify argument ordered=TRUE in the class factor
#' tree <- divclust(dogs2) # full clustering with variable Size considered as ordered.
#' plot(tree) #the constraint on the order changes the clustering
#' data(wine) # mixed data
#' data <- wine[,1:29]
#' c_tot <- divclust(data) # full clustering
#' plot(c_tot)
#' c_4 <- divclust(data, 4) # stops clustering to 4 clusters
#' plot(c_4)
#' p2 <- length(c_4$vec_quali)
#' p1 <- ncol(data)-p2
#' sum(c_4$height)/(p1+sum(c_4$vec_quali)-p2)*100 #explained variance
#' c_tot$tree$v #internal contain of the root node
#' c_tot$tree$r$v #internal contain of the right node of the root node
divclust <- function (data, K = NULL)
{
cl <- match.call()
if (is.null(data)) {
stop("no problem to solve")
}
obj <- split_mix(data)
data_quanti <- obj$data_quanti
data_quali <- obj$data_quali
# quanti preprocessing
if (!is.null(data_quanti)) {
mu_quanti <- colMeans(data_quanti)
n <- nrow(data_quanti)
nb_quanti <- ncol(data_quanti)
rnames <- rownames(data_quanti)
red <- sqrt((n-1)/n)
sig_quanti <- apply(data_quanti,2, sd)*red
#X_quanti <- as.matrix(data_quanti)
X_quanti_init <- as.matrix(data_quanti)
# BEWARE X_quanti is both centered and standardized
X_quanti <- X_quanti_init - matrix(mu_quanti, n, nb_quanti, byrow=TRUE)
X_quanti <- X_quanti /matrix(sig_quanti, n, nb_quanti, byrow=TRUE)
} else {
n <- nrow(data_quali)
X_quanti <- X_quanti_init <-matrix(0,n,0)
nb_quanti <- 0
mu_quanti <- c()
sig_quanti <- c()
}
# quali preprocessing
if (!is.null(data_quali)) {
if (sum(unlist(lapply(data_quali,is.character))) !=0)
stop("All columns in data_quali must be of class factor")
res <- format_cat_data(data_quali)
rnames <- rownames(data_quali)
mod_quali <- res$mod_quali
vec_quali <- res$vec_quali
cnames_quali <- res$cnames_quali
X_quali <- as.matrix(res$Y)
mu_quali <- colMeans(X_quali)
nb_quali <- length(vec_quali)
vec_order <- unlist(lapply(data_quali,function(x){class(x)[1]=="ordered"}))
} else {
X_quali <-matrix(0,n,0)
mu_quali <- matrix(0,n,0)
cnames_quali <- list()
nb_quali <- 0
mod_quali <- list()
vec_quali <- c()
vec_order <- c()
}
# preprocessing
#X <- cbind(X_quanti, X_quali)
X <- cbind(X_quanti_init, X_quali)
p <- ncol(X)
X_quali <- X_quali - matrix(mu_quali, n, length(mu_quali), byrow=TRUE)
X_centre <- cbind(X_quanti, X_quali)
maxdim <- min(n,ncol(X)-nb_quali)
dec <- gsvd(X_centre, c(rep(1, nb_quanti), 1./mu_quali), rep(1./n, n))
Z <- dec$u[,1:maxdim] %*% diag(dec$d[1:maxdim])
w <- rep(1. / n, n)
D <- rep(1, ncol(Z))
# tree construction
# algorithm's sketch :
# start with one leaf consisting in best question chosen on all indices
# leaves <- {Tree(choose_qestion(Z, all_indices))}
# each iteration, compute "locally maximal partition"
# leaves <- (leaves\\{Fmax}) U {Tree(A_l(F_max), Tree(Al_c(F_max))}
dendro <-new.env()
dendro$v <- choose_question(X, Z, c(1:n), vec_quali, w, D,vec_order)
leaves <- list(dendro)
k <- 1
kmax <- nrow(unique(X))
if (is.null(K)) K <- kmax
if (!(K %in% 2:kmax)) K <- kmax
height <- rep(NA,K-1)
while (k < K) {
ind_max <- which.max(lapply(leaves, function(x){x$v$inert}))
height[k] <- leaves[[ind_max]]$v$inert
Fmin <- leaves[[ind_max]]
leaves[[ind_max]] <- NULL
cqg <- choose_question(X, Z, Fmin$v$A_l, vec_quali, w, D,vec_order)
Fmin$l <- new.env()
Fmin$l$v <- cqg
leaves <- append(leaves, Fmin$l)
cqd <- choose_question(X, Z, Fmin$v$A_l_c, vec_quali, w, D,vec_order)
Fmin$r <- new.env()
Fmin$r$v<- cqd
leaves <- append(leaves, Fmin$r)
k <- k+1
}
cluster <-list(tree=dendro)
cluster$X <- X
cluster$mod_quali <- mod_quali
cluster$vec_quali <- vec_quali
cluster$mu_quali <- mu_quali
cluster$mu_quanti <- mu_quanti
cluster$sig_quanti <- sig_quanti
cluster$nb_quanti <- nb_quanti
cluster$rnames <- rnames
cluster$cnames_quanti <- colnames(data_quanti)
cluster$cnames_quali <- cnames_quali
cluster$kmax <- kmax
cluster$height <- height
cluster$T <- ncol(X)-nb_quali
cluster$B <- sum(height)/cluster$T
ret_leaves <- list_leaves(cluster)
#cluster$description <-Reduce(append,lapply(ret_leaves$leaves, function(x) {x$path}))
# suppress the first character of a string
#cut_1 <- function(s) { Reduce(paste0,sapply(strsplit(s,""), identity)[-1])}
#cluster$description <- lapply(cluster$description, cut_1)
#cluster$description <- lapply(cluster$description, cut_1) # Yes we apply two times!
cluster$description <- make_description(ret_leaves$leaves,cluster)
names(cluster$description) <- paste("C", 1:K, sep = "")
cluster$clusters <- lapply(ret_leaves$leaves, function(x) {rnames[x$class]})
names(cluster$clusters) <- paste("C", 1:K, sep = "")
cluster$which_cluster <- ret_leaves$classes
cluster$data_quanti <- data_quanti
cluster$data_quali <- data_quali
cluster$call <- cl
class(cluster) <- "divclust"
return(cluster)
}
#' Wines of Val de Loire data
#' @format A data frame with 21 rows (the number of wines) and
#' 31 columns: the first column corresponds to the label of origin,
#' the second column corresponds to the soil,
#' and the others correspond to sensory descriptors.
#' @source Centre de recherche INRA d'Angers
#' @description data refering to 21 wines of Val de Loire.
#' @name wine
NULL
#' Protein data
#' @format A data frame with 25 rows (the European countries) and 9 columns (the food groups)
#' @source Originated by A. Weber and cited in Hand et al., A Handbook of Small Data Sets, (1994, p. 297).
#' @description The data measure the amount of protein consumed for nine food groups in
#' 25 European countries. The nine food groups are red meat (RedMeat), white meat (WhiteMeat),
#' eggs (Eggs), milk (Milk), fish (Fish), cereal (Cereal), starch (Starch), nuts (Nuts), and fruits and vegetables (FruitVeg).
#' @name protein
NULL
#' Breeds of Dogs data
#' @format A data frame with 27 rows (the breeds of dogs) and 7 columns: their size, weight and speed with 3 categories
#' (small, medium, large), their intelligence (low, medium, high), their affectivity and aggressiveness
#' with 3 categories (low, high), their function (utility, compagny, hunting).
#' @source Originated by A. Brefort (1982) and cited in Saporta G. (2011).
#' @description Data refering to 27 breeds of dogs.
#' @name dogs
NULL
#' Equality case data
#' @format A numerial data frame with 20 rows and 2 columns simulated from two gaussien distributions.
#' @description These data illstrate the case where two binary questions give the same bipartition and how
#' the binary question with the most discriminant variable (X1 here) is chosen.
#' @name equality_case
#' @examples
#' data(equality_case)
#' plot(equality_case) #X1 discriminates the bipartition better than X2
#' tree <- divclust(equality_case,K=3)
#' plot(tree,nqbin=1) # the binary question with X1 is chosen
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.