R/functions_metrics.R

Defines functions statistical_test EvaluateC dunn_indexC davies_bouldinC silhouetteC transformdata

Documented in davies_bouldinC dunn_indexC EvaluateC silhouetteC statistical_test transformdata

# ------------------------------------------------------------------------- #
# CLUSTCHECK
# Functions for clustering evalation metrics
# ------------------------------------------------------------------------- #
#' transformdata
#'
#'Data tranformation for categorical and mixed variables
#'
#' @param object An object of class ccdata
#'
#' @return A new dataset for the metrics function, created with the new coordinates based on a FAMD.
#' @export
#' @import FactoMineR
#' @import factoextra
#'
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' transformdata(obj)
transformdata <- function(object){
  if(object$vartype=="NUM"){
    stop("Error : the variables are numerical and don't need factorial transformation")
  }
  res.famd <- FactoMineR::FAMD(object$active_data, graph = FALSE)
  ind <- factoextra::get_famd_ind(res.famd)
  return(as.data.frame(ind$coord))
}
# ------------------------------------------------------------------------- #
# Silhouette Values
# ------------------------------------------------------------------------- #
#' Silhouette coefficient
#'
#' @param object An object of class ccdata
#' @param clusters A vector corresponding to the dataset clustering results (predicted clusters with ccdata class object by default)
#'
#' @return The silouhette value for all the cluster groups and a mean silouhette.
#' @export
#'
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' silhouetteC(obj)
silhouetteC <- function(object, clusters=object$pred_clusters) {
  if(object$vartype!= "NUM"){
    data <- transformdata(object)
  }else{
    data <- object$active_data
  }
  # a: The mean distance between a sample and all other points in the same class.
  # b: The mean distance between a sample and all other points in the next nearest cluster.
  d <- as.matrix(dist(data))
  n <- ncol(d)
  a <- NULL; b <- NULL
  for (col in 1:n){
    cluster <- clusters[col]
    # calculation for a
    same_class <- which(clusters==cluster) # identification of the class samples
    same_class_wo_sample <- same_class[which(same_class!=col)] # we remove the sample here
    a <- c(a,mean(d[same_class_wo_sample,col]))
    # calculation for b
    all_different_class <- which(clusters!=cluster) # identification of all the other samples
    w <- as.integer(names(which.min(d[all_different_class,col]))) # identification of the closest sample in an other sample
    nearest_cluster <- clusters[w] # identification of the next nearest cluster
    different_class <- which(clusters==nearest_cluster)
    b <- c(b,mean(d[different_class,col]))
  }
  s <- (b - a)/pmax(a,b) # silhouette formula

  # Cluster silhouette
  sk <- NULL
  for (k in unique(clusters)){
    ind = which(clusters == k)
    nbk <- sum(clusters == k)
    sk <- c(sk, 1/nbk * (sum(s[ind])))
  }
  return(list(cluster_silhouette=sk, mean_silhouette=mean(s)))
}
# ------------------------------------------------------------------------- #
# davies-Bouldin partition metric
# ------------------------------------------------------------------------- #
#' Davies-Bouldin Index
#'
#' @param object An object of class ccdata
#' @param clusters A vector corresponding to the dataset clustering results (predicted clusters with ccdata class object by default)
#'
#' @return The Davies-Bouldin index for all the cluster groups.
#' @export
#'
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' davies_bouldinC(obj)
davies_bouldinC <- function(object, clusters=object$pred_clusters) {
  # s : the average distance between each point of cluster and the centroid of that cluster – also know as cluster diameter
  # d : the distance between cluster centroids
  if(object$vartype!= "NUM"){
    data <- transformdata(object)
  }else{
    data <- object$active_data
  }
  k <- length(unique(clusters))
  p <- ncol(data)
  centroids <- matrix(nrow=k, ncol=p)
  d <- matrix(nrow=k, ncol=k)
  s <- matrix(nrow=1, ncol=k)
  R <- matrix(nrow=k, ncol=k)
  maxR <- matrix(nrow=1, ncol=k)
  # Centroids calculation
  i=1
  for (g in unique(clusters)){
    k_data <- data[which(clusters==g),]
    centroids[i,] <- sapply(k_data, mean, na.rm=T)
    s[i] <- sqrt(mean(rowSums(apply(k_data, 2, function(y) (y - mean(y))^2))))
    i = i+1
  }
  # R and d calculation
  for (i in 1:k){
    for (j in 1:k){
      d[i,j] <- sqrt(sum((centroids[i,] - centroids[j,])^2))
      R[i,j] <- (s[i]+s[j])/d[i,j]
    }
  }
  # Index calculation
  for (i in 1:k){
    maxR[i] <- max(R[i,][is.finite(R[i,])])
  }
  DB <- sum(maxR)/k
  return(DB)
}
# ------------------------------------------------------------------------- #
# Dunn partition metric
# ------------------------------------------------------------------------- #
#' Dunn Index
#'
#' @param object An object of class ccdata
#' @param clusters A vector corresponding to the dataset clustering results (predicted clusters with ccdata class object by default)
#'
#' @return The Dunn Index for all the cluster groups
#' @export
#'
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' dunn_indexC(obj)
dunn_indexC <- function(object, clusters=object$pred_clusters) {
  # Calculated using the following:
  # d1 : distance of samples to their centroids
  # d2 : distance betwewen centroids
  # Dunn Index is the ratio between the d2 min and the d1 max
  if(object$vartype!= "NUM"){
    data <- transformdata(object)
  }else{
    data <- object$active_data
  }
  k <- length(unique(clusters))
  p <- ncol(data)
  centroids <- matrix(nrow=k, ncol=p)
  d1 <- matrix(nrow=1, ncol=k)
  d2 <- matrix(nrow=k, ncol=k)
  # Centroids calculation
  i=1
  for (g in unique(clusters)){
    k_data <- data[which(clusters==g),]
    centroids[i,] <- sapply(k_data, mean, na.rm=T)
    i = i+1
  }
  # d2 calculation
  for (i in 1:k){
    for (j in 1:k){
      d2[i,j] <- sqrt(sum((centroids[i,] - centroids[j,])^2))
    }
  }
  # d1 calculation
  i=1
  for (g in unique(clusters)){
    k_data <- data[which(clusters==g),]
    d1[i] <- sqrt(mean(rowSums(apply(k_data, 2, function(y) (y - mean(y))^2))))
    i = i+1
  }
  DI <- min(d2[d2>0])/max(d1)
  return(DI)
}
# ------------------------------------------------------------------------- #
# Classification evalusation
# ------------------------------------------------------------------------- #
#' EvaluateC
#'
#' Evaluates the performance of the classifier by comparing predicted vs true clusters (true clusters required as input)
#'
#' @param object An object of class ccdata
#'
#' @param true_clusters Vector of the true clusters (true clusters with ccdata class object by default)
#'
#'
#' @return Confusion matrix, error rate, recall and precision. A matrix of recall and precision for all the cluster groups.
#' @export
#'
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' EvaluateC(obj, BankCustomer$Cluster)
EvaluateC <- function(object, true_clusters = object$true_clusters) {
  if (is.null(true_clusters) == FALSE) {
    if (is.data.frame(true_clusters) == TRUE) {
      if (ncol(true_clusters) != 1) {
        stop("Error : you didn't enter a true cluster")
      }
      true_clusters <- unlist(true_clusters)
    }
    table <- contingency(object, true_clusters)
    tab <- table[[1]]
    ConfMat <- table[[2]]
    nli <- table[[5]]
    nco <- table[[4]]
    n <- object$n
    if (nli != nco) {
      stop(
        "Error : you don't have the same numbers of clusters between predicted and true values"
      )
    } else{
      if (nli == 2) {
        Errorrate <- 1 - ((ConfMat[1, 2] + ConfMat[2, 1]) / n)
        Recall <- ConfMat[1, 1] / (ConfMat[1, 3])
        Precision <- ConfMat[1, 1] / ConfMat[3, 1]
        matrix = matrix(
          Errorrate,
          Recall,
          Precision,
          nrow = 3,
          ncol = 1,
          dimnames = list(c(
            "Errorrate", "Recall", "Precision"
          ), c("Valeurs"))
        )
        print(matrix)
      } else if (nli > 2) {
        print(ConfMat)
        Errorrate <- 1 - ((sum(diag(tab))) / n)
        cat("Error Rate :", Errorrate, "\n")
        for (i in 1:nli) {
          group <- object$cluster_names[i]
          cat("cluster:", group, "\n")
          Recall <- ConfMat[i, i] / ConfMat[i, nli + 1]
          Precision <- ConfMat[i, i] / ConfMat[nli + 1, i]
          matrix = matrix(
            c(Recall, Precision),
            nrow = 2,
            ncol = 1,
            dimnames = list(c("Recall", "Precision"), c("Valeurs"))
          )
          print(matrix)
        }
      }

    }
  } else{
    stop("Error : you didn't enter a true cluster vector")
  }
}
# ------------------------------------------------------------------------- #
# Student, Anova and khi2 tests
# ------------------------------------------------------------------------- #
#' Statistical Test
#'
#' @param object An object of class ccdata
#' @param var Vector of a variable present in the dataset
#'
#' @return A sentence indicating whether the two variables are significantly different
#' @export
#' @importFrom stats aov reorder t.test var
#' @examples
#' data(BankCustomer)
#' obj <- Dataset(BankCustomer, BankCustomer$Cluster)
#' statistical_test(obj,BankCustomer$profession) #the cluster have significantly the same job?
#' statistical_test(obj,BankCustomer$revenu) #The cluster have significantly the same salary?
statistical_test <- function(object, var){
  namevar <- deparse(substitute(var))
  varname <- strsplit(namevar,split='$', fixed = TRUE)[[1]][2]
  k <- length(object$cluster_names)
  if(is.numeric(object$all_data[[varname]])){
    if (k == 1){
      stop("Error : you have only one cluster group")
    }else if(k == 2){
      groupe <- unique(object$pred_clusters)
      cluster1 <- data[object$pred_clusters==groupe[1],]
      moy1 <- mean(cluster1[[varname]])
      cluster2 <- data[object$pred_clusters==groupe[2],]
      moy2 <- mean(cluster2[[varname]])
      if (moy1>moy2){
        test <- t.test(cluster1[[varname]], cluster2[[varname]], alternative = "greater")
        if (test$p.value < 0.05){
          cat("Mean of", varname, "is signifantly higher in", groupe[1], "than in", groupe[2])
        }else{
          test <- t.test(cluster1[[varname]], cluster2[[varname]])
          if (test$p.value < 0.05){
            cat("Mean of", varname, "is significantly different between the two cluster groups")
          }}
      }else{
        test <- t.test(cluster2[[varname]], cluster1[[varname]], alternative = "greater")
        if (test$p.value < 0.05){
          cat("Mean of", varname, "is signifantly higher in", groupe[2], "than in", groupe[])
        }else{
          test <- t.test(cluster1[[varname]], cluster2[[varname]])
          if (test$p.value < 0.05){
            cat("Mean of", varname, "is significantly different between the two cluster groups")
          }
        }
      }
    }else{
      boxplot(object$all_data[[varname]]~object$pred_clusters)
      mod=aov(object$all_data[[varname]]~object$pred_clusters)
      p_value <- (summary(mod)[[1]][[1,"Pr(>F)"]])
      if (p_value < 0.05){
        cat("The cluster group has a significant impact on", varname)
      }else{
        cat("There are not significant impact of the cluster on", varname)
      }
    }
  }else{
    khi2 <- chisq.test(table(object$pred_clusters, object$all_data[[varname]]))
    if (khi2$p.value < 0.05){
      cat("The cluster significantly doesn't have the same ", varname)
    }else
      cat("There are not significant impact of the cluster on", varname)
  }
}
adrienPAVOINE/ClustCheck documentation built on Dec. 31, 2020, 6:45 p.m.