R/optimal.partition.R

Defines functions Silhouette.coefficient Silhouette Adiam.Msplit split diam

Documented in Adiam.Msplit diam Silhouette Silhouette.coefficient split

#' Diameter of a class of variables based on different distance function
#'
#' @param X a data frame for a set of variables X
#' @param dist.func PD / MPD / kendall / footrule
#'
#' @return a value
#'
#' @keywords internal
diam <- function(X, dist.func = "PD") {
  Idx.Dist <- c("PD", "MPD", "kendall", "footrule")
  if (!dist.func %in% Idx.Dist) {
    stop("'dist.func' should be one of 'PD', 'MPD', 'kendall','footrule'.")
  }
  df_X <- as.data.frame(X)
  n_X <- length(df_X)
  if (n_X == 1) {
    diam <- 1
  }
  if (n_X != 1) {
    if (dist.func == "PD") {
      diam <- min(1 - dist.mat.T(df_X))
    }
    if (dist.func == "MPD") {
      diam <- min(1 - dist.mat.T(df_X, mutual = TRUE))
    }
    if (dist.func == "kendall") {
      diam <- min(1 - dist.mat.concor(df_X, method = "kendall"))
    }
    if (dist.func == "footrule") {
      diam <- min(1 - dist.mat.concor(df_X, method = "footrule"))
    }
  }
  return(diam)
}


#' Split of two classes of variables based on different distance function
#'
#' @param X a data frame for a set of variables X
#' @param Y a data frame for a set of variables Y
#' @param dist.func PD / MPD / kendall / footrule
#'
#' @return a value
#'
#' @keywords internal
split <- function(X, Y, dist.func = "PD") {
  Idx.Dist <- c("PD", "MPD", "kendall", "footrule")
  if (!dist.func %in% Idx.Dist) {
    stop("'dist.func' should be one of 'PD', 'MPD', 'kendall','footrule'.")
  }
  df_X <- as.data.frame(X)
  df_Y <- as.data.frame(Y)
  n_X <- length(df_X)
  n_Y <- length(df_Y)
  if (n_Y == 0) {
    sim <- NA
  }
  if (n_Y != 0) {
    sim <- c()
    for (i in 1L:n_X) {
      df <- as.data.frame(cbind(df_X[, i], df_Y))
      if (dist.func == "PD") {
        sim <- c(sim, max(1 - dist.mat.T(df)[1L:n_Y]))
      }
      if (dist.func == "MPD") {
        sim <- c(sim, max(1 - dist.mat.T(df, mutual = T)[1L:n_Y]))
      }
      if (dist.func == "kendall") {
        sim <- c(sim, max(1 - dist.mat.concor(df, method = "kendall")[1L:n_Y]))
      }
      if (dist.func == "footrule") {
        sim <- c(sim, max(1 - dist.mat.concor(df, method = "footrule")[1L:n_Y]))
      }
    }
  }
  return(max(sim))
}


#' Average diameter & Maximum split of every partition of a given dendrogram
#'
#' @param X a data frame for a set of variables X
#' @param dend a dendrogramm
#' @param dist.func PD / MPD / kendall / footrule
#'
#' @return a data frame
#'
#' @importFrom stats as.hclust
#' @import dendextend
#'
#' @keywords internal
Adiam.Msplit <- function(X, dend = dend, dist.func = "PD") {
  Idx.Dist <- c("PD", "MPD", "kendall", "footrule")
  if (!dist.func %in% Idx.Dist) {
    stop("'dist.func' should be one of 'PD', 'MPD', 'kendall','footrule'.")
  }
  if (!inherits(dend, "dendrogram")) {
    stop("'dend' should be a 'dendrogram'.")
  }
  df <- as.data.frame(X)
  var <- names(df)
  for (i in seq(var)) {
    if (grepl(" ", var[i])) {
      var[i] <- unlist(strsplit(var[i], " "))[1]
    }
  }
  names(df) <- var
  # all partitions of the given dendrogram
  all_partition <- list()
  for (i in 2L:length(df)) {
    part <- list()
    for (j in 1L:i) {
      part[j] <- list(which(var %in% names(which(cutree(as.hclust(dend), k = i) == j))))
    }
    all_partition[i - 1] <- list(part)
  }
  # Adiam and Msplit for every partition
  partition <- c()
  ad <- c()
  ms <- c()
  dis <- c()
  for (i in seq_along(all_partition)) {
    clust <- all_partition[i][[1]]
    nclust <- length(clust)
    d <- c()
    s <- c() # diamter and split for k. class in i. partition
    for (k in 1L:nclust) {
      c <- clust[[k]]
      cX <- df[, c]
      cY <- df[, -c]
      d <- c(d, diam(cX, dist.func = dist.func))
      s <- c(s, split(cX, cY, dist.func = dist.func))
    }
    ad_partition <- mean(d)
    ms_partition <- max(s)
    dis_partition <- sqrt((1 - ad_partition)^2 + ms_partition^2)
    partition <- c(partition, nclust)
    ad <- c(ad, ad_partition)
    ms <- c(ms, ms_partition)
    dis <- c(dis, dis_partition)
  }
  tradeoff <- data.frame(partition = partition, ad = ad, ms = ms, dis = dis)
  return(tradeoff)
}


#' Silhouette value for the i. variable given variable partition
#'
#' @param i the index of the variable
#' @param df a data frame for all variables
#' @param partition a partition
#' @param dist.func PD / MPD / kendall / footrule
#'
#' @return a value for Silhouette
#'
#' @keywords internal
Silhouette <- function(i, df, partition, dist.func = "PD") {
  Idx.Dist <- c("PD", "MPD", "kendall", "footrule")
  if (!dist.func %in% Idx.Dist) {
    stop("'dist.func' should be one of 'PD', 'MPD', 'kendall','footrule'.")
  }

  a_i <- function(obj, cls, dist.func = dist.func) {
    if (!identical(obj, cls[, 1])) {
      stop("Error")
    }
    if (dist.func == "PD") {
      a <- mean(dist.mat.T(cls)[1L:(length(cls) - 1)])
    }
    if (dist.func == "MPD") {
      a <- mean(dist.mat.T(cls, mutual = T)[1L:(length(cls) - 1)])
    }
    if (dist.func == "kendall") {
      a <- mean(dist.mat.concor(cls, method = "kendall")[1L:(length(cls) - 1)])
    }
    if (dist.func == "footrule") {
      a <- mean(dist.mat.concor(cls, method = "footrule")[1L:(length(cls) - 1)])
    }
    return(a)
  }

  b <- c()
  for (j in seq_along(partition)) {
    if (i %in% partition[[j]]) {
      if (length(partition[[j]]) == 1) {
        return(0)
      }
      ai <- a_i(df[, i], df[, c(i, partition[[j]][partition[[j]] != i])], dist.func = dist.func)
    } else {
      b <- c(b, a_i(df[, i], df[, c(i, partition[[j]])], dist.func = dist.func))
    }
  }
  bi <- min(b)
  return((bi - ai) / max(ai, bi))
}


#' Silhouette coefficients given a dendrogram
#'
#' @param X a data frame for a set of variables X
#' @param dend a dendrogramm
#' @param dist.func PD / MPD / kendall / footrule
#'
#' @return a data frame
#'
#' @importFrom stats as.hclust
#' @import dendextend
#'
#' @keywords internal
Silhouette.coefficient <- function(X, dend, dist.func = "PD") {
  Idx.Dist <- c("PD", "MPD", "kendall", "footrule")
  if (!dist.func %in% Idx.Dist) {
    stop("'dist.func' should be one of 'PD', 'MPD', 'kendall','footrule'.")
  }
  if (!inherits(dend, "dendrogram")) {
    stop("'dend' should be a 'dendrogram'.")
  }
  df <- as.data.frame(X)
  var <- names(df)
  for (i in seq(var)) {
    if (grepl(" ", var[i])) {
      var[i] <- unlist(strsplit(var[i], " "))[1]
    }
  }
  names(df) <- var
  # all partitions of the given dendrogram
  all_partition <- list()
  for (i in 2L:length(df)) {
    part <- list()
    for (j in 1L:i) {
      part[j] <- list(which(var %in% names(which(cutree(as.hclust(dend), k = i) == j))))
    }
    all_partition[i - 1] <- list(part)
  }
  # Silhouette coefficient for every partition
  num <- c()
  ASW <- c()
  for (i in seq_along(all_partition)) {
    partition <- all_partition[i][[1]]
    s <- c()
    for (j in seq_along(X)) {
      s <- c(s, Silhouette(j, X, partition, dist.func = dist.func))
    }
    num <- c(num, length(partition))
    ASW <- c(ASW, mean(s))
  }
  Silhouette_Index <- data.frame(partition = num, ASW = ASW)
  return(Silhouette_Index)
}

Try the didec package in your browser

Any scripts or data that you put into this service are public.

didec documentation built on Sept. 14, 2024, 9:07 a.m.