Categorical Data

Documented in clusterFunSel

#' Adapted k-modes algorithm
#'
#' K-modes function to accept any categorical distance based on
#' the function found in `klaR:kmodes`.
#'
#' @param data A matrix or data frame of categorical data. Objects have to be in rows, variables in columns.
#' @param modes The number of modes
#' @param distFun Pairwise categorical distance function. A function accepting two categorical vectors.
#' @param iter.max The maximum number of iterations allowed.
#' @return An object of class kmodes as found in `klaR` packages.
#' An additional component specifies the categorical distance function found in `distFun`.
#' @export

kmodesD <- function (data,
                    modes,
                    distFun,
                    iter.max = 10) {
   weighted = FALSE
   fast = TRUE

   if (!is.data.frame(data))
      data <- as.data.frame(data)
   isnumeric <- sapply(data, is.numeric)
   isfactor <- sapply(data, is.factor)
   if (any(isfactor)) {
      levs <- vector("list", ncol(data))
      for (j in which(isfactor)) {
         levsj <- levels(data[, j])
         data[, j] <- levsj[data[, j]]
         levs[[j]] <- levsj
      }
   }
   if (any(isnumeric)) {
      lengths <- sapply(data[, isnumeric], function(z) return(length(unique(z))))
      if (any(lengths > 30))
         warning("data has numeric coloumns with more than 30 different levels!")
   }
   update_mode <- function(num, num_var, data, cluster) {
      clust <- data[which(cluster == num), ]
      apply(clust, 2, function(cat) {
         cat <- table(cat)
         names(cat)[which.max(cat)]
      })
   }
   distance <- function(mode, obj, weights) {
      if (is.null(weights))
         return(distFun(mode, obj))
         ## return(sum(mode != obj))
      obj <- as.character(obj)
      mode <- as.character(mode)
      different <- which(mode != obj)
      n_mode <- n_obj <- numeric(length(different))
      for (i in seq(along = different)) {
         weight <- weights[[different[i]]]
         names <- names(weight)
         n_mode[i] <- weight[which(names == mode[different[i]])]
         n_obj[i] <- weight[which(names == obj[different[i]])]
      }
      dist <- sum((n_mode + n_obj)/(n_mode * n_obj))
      return(dist)
   }
   n <- nrow(data)
   num_var <- ncol(data)
   data <- as.data.frame(data)
   cluster <- numeric(n)
   names(cluster) <- 1:n
   if (missing(modes))
      stop("'modes' must be a number or a data frame.")
   if (iter.max < 1)
      stop("'iter.max' must be positive.")
   if (length(modes) == 1) {
      k <- modes
      modes <- unique(data)[sample(nrow(unique(data)))[1:k],
      ]
      for (i in 1:k) cluster[which(rownames(data) == rownames(modes)[i])] <- i
   }
   else {
      if (any(duplicated(modes)))
         stop("Initial modes are not distinct.")
      if (ncol(data) != ncol(modes))
         stop("'data' and 'modes' must have same number of columns")
      modes <- as.data.frame(modes)
      if (any(isfactor)) {
         if (!all(sapply(modes[, isfactor], is.factor)))
            stop("Types of modes do not match data!")
         for (j in which(isfactor)) modes[, j] <- levels(modes[,
                                                               j])[modes[, j]]
      }
      for (j in 1:num_var) if (weighted)
         if (!all(modes[, j] %in% unique(data[, j])))
            stop("For weighted call values of modes must exist in data!")
      k <- nrow(modes)
   }
   if (k > nrow(unique(data)))
      stop("More cluster modes than distinct data points.")
   if (weighted) {
      weights <- vector("list", num_var)
      for (i in 1:num_var) weights[[i]] <- table(data[, i])
   }
   else {
      weights <- NULL
   }
   if (!fast) {
      for (j in which(cluster == 0)) {
         dist <- apply(modes, 1, function(x) distance(x, data[j,
         ], weights))
         cluster[j] <- which.min(dist)
         modes[cluster[j], ] <- update_mode(cluster[j], num_var,
                                            data, cluster)
      }
      for (i in 1:iter.max) {
         continue <- FALSE
         for (j in 1:n) {
            dist <- apply(modes, 1, function(x) distance(x,
                                                         data[j, ], weights))
            clust_new <- which.min(dist)
            clust_old <- cluster[j]
            if (clust_new != clust_old) {
               cluster[j] <- clust_new
               modes[clust_new, ] <- update_mode(clust_new,
                                                 num_var, data, cluster)
               modes[clust_old, ] <- update_mode(clust_old,
                                                 num_var, data, cluster)
               continue <- TRUE
            }
         }
         if (!continue)
            break
      }
   }
   if (fast) {
      dists <- matrix(NA, nrow = n, ncol = k)
      if (!weighted) {
         for (i in 1:k) {
            di <- sapply(1:ncol(data), function(j) return(data[,
                                                               j] != rep(modes[i, j], n)))
            di <- rowSums(di)
            dists[, i] <- di
         }
      }
      if (weighted) {
         n_obj <- matrix(NA, nrow = n, ncol = ncol(data))
         for (j in 1:ncol(data)) n_obj[, j] <- weights[[j]][sapply(as.character(data[,
                                                                                     j]), function(z) return(which(names(weights[[j]]) ==
                                                                                                                      z)))]
         n_mode <- matrix(NA, nrow = nrow(modes), ncol = ncol(data))
         for (j in 1:ncol(data)) n_mode[, j] <- weights[[j]][sapply(as.character(modes[,
                                                                                       j]), function(z) return(which(names(weights[[j]]) ==
                                                                                                                        z)))]
         for (i in 1:k) {
            di <- sapply(1:ncol(data), function(j) return(data[,
                                                               j] != rep(modes[i, j], n)))
            wts <- (n_mode[rep(i, n), ] + n_obj)/(n_mode[rep(i,
                                                             n), ] * n_obj)
            di <- rowSums(di * wts)
            dists[, i] <- di
         }
      }
      cluster <- apply(dists, 1, function(z) {
         a <- which(z == min(z, na.rm = TRUE))
         if (length(a) > 1)
            a <- sample(a, 1)
         return(a)
      })
      for (j in 1:nrow(modes)) modes[j, ] <- update_mode(j,
                                                         num_var, data, cluster)
      for (i in 1:iter.max) {
         continue <- FALSE
         dists <- matrix(NA, nrow = n, ncol = k)
         if (!weighted) {
            for (i in 1:k) {
               di <- sapply(1:ncol(data), function(j) return(data[,
                                                                  j] != rep(modes[i, j], n)))
               di <- rowSums(di)
               dists[, i] <- di
            }
         }
         if (weighted) {
            n_mode <- matrix(NA, nrow = nrow(modes), ncol = ncol(data))
            for (j in 1:ncol(data)) n_mode[, j] <- weights[[j]][sapply(as.character(modes[,
                                                                                          j]), function(z) return(which(names(weights[[j]]) ==
                                                                                                                           z)))]
            for (i in 1:k) {
               di <- sapply(1:ncol(data), function(j) return(data[,
                                                                  j] != rep(modes[i, j], n)))
               wts <- (n_mode[rep(i, n), ] + n_obj)/(n_mode[rep(i,
                                                                n), ] * n_obj)
               di <- rowSums(di * wts)
               dists[, i] <- di
            }
         }
         old.cluster <- cluster
         cluster <- apply(dists, 1, function(z) {
            a <- which(z == min(z, na.rm = TRUE))
            if (length(a) > 1)
               a <- sample(a, 1)
            return(a)
         })
         for (j in 1:nrow(modes)) modes[j, ] <- update_mode(j,
                                                            num_var, data, cluster)
         if (any(old.cluster != cluster))
            continue <- TRUE
         if (!continue)
            break
      }
   }
   cluster.size <- table(cluster)
   if (length(cluster.size) < k)
      warning("One or more clusters are empty.")
   dists <- matrix(NA, nrow = n, ncol = k)
   if (weighted) {
      n_mode <- matrix(NA, nrow = nrow(modes), ncol = ncol(data))
      for (j in 1:ncol(data)) n_mode[, j] <- weights[[j]][sapply(as.character(modes[,
                                                                                    j]), function(z) return(which(names(weights[[j]]) ==
                                                                                                                     z)))]
   }
   for (i in 1:k) {
      di <- sapply(1:ncol(data), function(j) return(data[,
                                                         j] != rep(modes[i, j], n)))
      if (!weighted)
         di <- rowSums(di)
      if (weighted) {
         if (!fast) {
            n_obj <- matrix(NA, nrow = n, ncol = ncol(data))
            for (j in 1:ncol(data)) n_obj[, j] <- weights[[j]][sapply(as.character(data[,
                                                                                        j]), function(z) return(which(names(weights[[j]]) ==
                                                                                                                         z)))]
         }
         wts <- (n_mode[rep(i, n), ] + n_obj)/(n_mode[rep(i,
                                                          n), ] * n_obj)
         di <- rowSums(di * wts)
      }
      dists[, i] <- di
   }
   diffs <- numeric(k)
   for (i in seq_along(cluster.size)) diffs[i] <- sum(dists[cluster ==
                                                               i, i])
   rownames(modes) <- 1:k
   colnames(modes) <- colnames(data)
   if (any(isfactor))
      for (j in which(isfactor)) modes[, j] <- factor(modes[,
                                                            j], levels = levs[[j]])
   if (any(isnumeric))
      for (j in which(isnumeric)) modes[, j] <- as.numeric(modes[,
                                                                 j])

   result <- list(cluster = cluster,
                  size = cluster.size,
                  modes = modes,
                  withindiff = diffs,
                  iterations = i,
                  weighted = FALSE,
                  distFun = distFun)
   class(result) <- "kmodes"
   return(result)
}

#' Clustering generating function
#'
#' A function that generates formatted algorithmic functions that can be plugged
#' to enable run a wide variety of clustering algorithm for `clusGapDiscr` function.
#'
#' @param clustFun A character string with the following possible options:
#' 'pam' (default) from `cluster::pam`, 'diana' from `cluster::diana`, 'fanny' from `cluster::fanny`,
#' 'agnes-\{average, single, complete, ward, weighted\}' from `cluster::agnes`,
#' 'hclust-\{ward.D, ward.D2, single, complete, average, mcquitty, median, centroid\}' from `base::hclust`,
#' 'kmodes' from `klar::kmodes` (`iter.max = 10`, `weighted = FALSE` and `fast= TRUE`).
#' 'kmodes-N' enables to run the `kmodes` algorithm with a given number N of iterations where `iter.max = N`.
#' @return An object of class kmodes as found in `klaR` packages.
#' An additional component specifies the categorical distance function found in `distFun`.
#' @export

clusterFunSel <- function(clustFun ){
   if(clustFun  == 'pam'){
         return(cluster::pam)

   }else if(clustFun  == 'fanny'){
         return(cluster::fanny)

   }else if(clustFun  == 'diana'){
      dianaK <- function(x, k){
         out <- cluster::diana(x = x) %>%
            stats::cutree(k = k)
         list(cluster = out)
      }
      return(dianaK)

   }else if(grepl(pattern = '^agnes-.+', x = clustFun  )){
      cMeth <- sub('agnes-', '', clustFun  )
      agnesK <- function(x, k){
         out <- cluster::agnes(x = x,
                               diss = TRUE,
                               method = cMeth) %>%
            stats::cutree(k = k)
         list(cluster = out)
      }
      return(agnesK)

   }else if(grepl(pattern = '^hclust-.+', x = clustFun )){
      cMeth <- sub('hclust-', '', clustFun )
      hClustK <- function(x, k){
         out <- stats::hclust(d = x, method = cMeth) %>%
            stats::cutree(k = k)
         list(cluster = out)
      }
      return(hClustK)

   }else if(grepl(pattern = '^kmodes.*', x = clustFun )){

      cIter <- ifelse(grepl(pattern = 'kmodes-.+',  x = clustFun ),
                      as.numeric(sub('kmodes-', '', clustFun )),
                      10)
      kmodesD_ <- function(x, k, distFun){
         kmodesD(data = x, modes = k, distFun = distFun, iter.max = cIter)
      }
      return(kmodesD_)

   }
}
Any scripts or data that you put into this service are public.
DiscreteGapStatistic documentation built on April 3, 2025, 9:22 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DiscreteGapStatistic
An Extension of the Gap Statistic for Ordinal/Categorical Data

R/AuxFuns.R
In DiscreteGapStatistic: An Extension of the Gap Statistic for Ordinal/Categorical Data

Defines functions clusterFunSel

Documented in clusterFunSel

Try the DiscreteGapStatistic package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiscreteGapStatistic An Extension of the Gap Statistic for Ordinal/Categorical Data

R/AuxFuns.R In DiscreteGapStatistic: An Extension of the Gap Statistic for Ordinal/Categorical Data

Defines functions clusterFunSel

Documented in clusterFunSel

Try the DiscreteGapStatistic package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiscreteGapStatistic
An Extension of the Gap Statistic for Ordinal/Categorical Data

R/AuxFuns.R
In DiscreteGapStatistic: An Extension of the Gap Statistic for Ordinal/Categorical Data